[FFmpeg-devel] [PATCH 1/3] swscale: [LA] Optimize range convert for yuvj420p.
Shiyou Yin
yinshiyou-hf at loongson.cn
Sat Mar 16 05:03:31 EET 2024
---
libswscale/loongarch/swscale.S | 368 ++++++++++++++++++
libswscale/loongarch/swscale_init_loongarch.c | 33 ++
libswscale/loongarch/swscale_loongarch.h | 11 +
libswscale/swscale_internal.h | 1 +
libswscale/utils.c | 6 +-
5 files changed, 418 insertions(+), 1 deletion(-)
diff --git a/libswscale/loongarch/swscale.S b/libswscale/loongarch/swscale.S
index aa4c5cbe28..67b1bc834d 100644
--- a/libswscale/loongarch/swscale.S
+++ b/libswscale/loongarch/swscale.S
@@ -1866,3 +1866,371 @@ function ff_hscale_16_to_19_sub_lsx
ld.d s8, sp, 64
addi.d sp, sp, 72
endfunc
+
+function lumRangeFromJpeg_lsx // in place on int16 data: x = (x * 14071 + 33561947) >> 14
+ li.w t0, 14071 // t0 = multiplier
+ li.w t1, 33561947 // t1 = additive offset
+ vreplgr2vr.h vr0, t0 // broadcast the multiplier to every 16-bit lane
+ srli.w t2, a1, 3 // t2 = a1 / 8 = number of full 8-element vectors (a1 is width per the C prototype)
+ andi t3, a1, 7 // t3 = a1 % 8 = elements left for the scalar tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1:
+ vld vr1, a0, 0 // load 8 x int16 from the buffer at a0
+ vreplgr2vr.w vr2, t1 // 32-bit accumulators start at the offset
+ vreplgr2vr.w vr3, t1
+ vmaddwev.w.h vr2, vr0, vr1 // vr2 += multiplier * even-indexed elements, widened to 32 bits
+ vmaddwod.w.h vr3, vr0, vr1 // vr3 += multiplier * odd-indexed elements, widened to 32 bits
+ vsrai.w vr2, vr2, 14 // arithmetic shift right by 14
+ vsrai.w vr3, vr3, 14
+ vpackev.h vr1, vr3, vr2 // re-interleave the low halfwords: even slots from vr2, odd slots from vr3
+ vst vr1, a0, 0 // store the 8 results over the input
+ addi.d a0, a0, 16 // advance by 8 elements (16 bytes)
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3:
+ ld.h t4, a0, 0 // scalar tail: same formula, one element per iteration
+ mul.w t4, t4, t0
+ add.w t4, t4, t1
+ srai.w t4, t4, 14
+ st.h t4, a0, 0
+ addi.d a0, a0, 2 // next int16
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function lumRangeFromJpeg_lasx // in place on int16 data: x = (x * 14071 + 33561947) >> 14, 256-bit variant
+ li.w t0, 14071 // t0 = multiplier
+ li.w t1, 33561947 // t1 = additive offset
+ xvreplgr2vr.h xr0, t0 // broadcast the multiplier to every 16-bit lane
+ srli.w t2, a1, 4 // t2 = a1 / 16 = number of full 16-element vectors (a1 is width per the C prototype)
+ andi t3, a1, 15 // t3 = a1 % 16 = elements left for the scalar tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1:
+ xvld xr1, a0, 0 // load 16 x int16 from the buffer at a0
+ xvreplgr2vr.w xr2, t1 // 32-bit accumulators start at the offset
+ xvreplgr2vr.w xr3, t1
+ xvmaddwev.w.h xr2, xr0, xr1 // xr2 += multiplier * even-indexed elements, widened to 32 bits
+ xvmaddwod.w.h xr3, xr0, xr1 // xr3 += multiplier * odd-indexed elements, widened to 32 bits
+ xvsrai.w xr2, xr2, 14 // arithmetic shift right by 14
+ xvsrai.w xr3, xr3, 14
+ xvpackev.h xr1, xr3, xr2 // re-interleave the low halfwords: even slots from xr2, odd slots from xr3
+ xvst xr1, a0, 0 // store the 16 results over the input
+ addi.d a0, a0, 32 // advance by 16 elements (32 bytes)
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3:
+ ld.h t4, a0, 0 // scalar tail: same formula, one element per iteration (up to 15 iterations)
+ mul.w t4, t4, t0
+ add.w t4, t4, t1
+ srai.w t4, t4, 14
+ st.h t4, a0, 0
+ addi.d a0, a0, 2 // next int16
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function lumRangeToJpeg_lsx // in place on int16 data: x = (min(x, 30189) * 19077 - 39057361) >> 14
+ li.w t0, 19077 // t0 = multiplier
+ li.w t1, -39057361 // t1 = additive (negative) offset
+ li.w t2, 30189 // upper clamp applied to the input; t2 is reused as the loop counter below
+ vreplgr2vr.h vr0, t0 // broadcast the multiplier to every 16-bit lane
+ vreplgr2vr.h vr4, t2 // broadcast the clamp value to every 16-bit lane
+ srli.w t2, a1, 3 // t2 = a1 / 8 = number of full 8-element vectors (a1 is width per the C prototype)
+ andi t3, a1, 7 // t3 = a1 % 8 = elements left for the scalar tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1:
+ vld vr1, a0, 0 // load 8 x int16 from the buffer at a0
+ vreplgr2vr.w vr2, t1 // 32-bit accumulators start at the offset
+ vreplgr2vr.w vr3, t1
+ vmin.h vr1, vr1, vr4 // signed min: clamp the input to at most 30189
+ vmaddwev.w.h vr2, vr0, vr1 // vr2 += multiplier * even-indexed elements, widened to 32 bits
+ vmaddwod.w.h vr3, vr0, vr1 // vr3 += multiplier * odd-indexed elements, widened to 32 bits
+ vsrai.w vr2, vr2, 14 // arithmetic shift right by 14
+ vsrai.w vr3, vr3, 14
+ vpackev.h vr1, vr3, vr2 // re-interleave the low halfwords: even slots from vr2, odd slots from vr3
+ vst vr1, a0, 0 // store the 8 results over the input
+ addi.d a0, a0, 16 // advance by 8 elements (16 bytes)
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3:
+ ld.h t4, a0, 0 // scalar tail: one element per iteration
+ vreplgr2vr.h vr1, t4 // the clamp is done through the vector unit: broadcast the element,
+ vmin.h vr1, vr1, vr4 // take the signed min against the clamp vector,
+ vpickve2gr.h t4, vr1, 0 // and move lane 0 back to a general register
+ mul.w t4, t4, t0
+ add.w t4, t4, t1
+ srai.w t4, t4, 14
+ st.h t4, a0, 0
+ addi.d a0, a0, 2 // next int16
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function lumRangeToJpeg_lasx // in place on int16 data: x = (min(x, 30189) * 19077 - 39057361) >> 14, 256-bit variant
+ li.w t0, 19077 // t0 = multiplier
+ li.w t1, -39057361 // t1 = additive (negative) offset
+ li.w t2, 30189 // upper clamp applied to the input; t2 is reused as the loop counter below
+ xvreplgr2vr.h xr0, t0 // broadcast the multiplier to every 16-bit lane
+ xvreplgr2vr.h xr4, t2 // broadcast the clamp value to every 16-bit lane
+ srli.w t2, a1, 4 // t2 = a1 / 16 = number of full 16-element vectors (a1 is width per the C prototype)
+ andi t3, a1, 15 // t3 = a1 % 16 = elements left for the scalar tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1:
+ xvld xr1, a0, 0 // load 16 x int16 from the buffer at a0
+ xvreplgr2vr.w xr2, t1 // 32-bit accumulators start at the offset
+ xvreplgr2vr.w xr3, t1
+ xvmin.h xr1, xr1, xr4 // signed min: clamp the input to at most 30189
+ xvmaddwev.w.h xr2, xr0, xr1 // xr2 += multiplier * even-indexed elements, widened to 32 bits
+ xvmaddwod.w.h xr3, xr0, xr1 // xr3 += multiplier * odd-indexed elements, widened to 32 bits
+ xvsrai.w xr2, xr2, 14 // arithmetic shift right by 14
+ xvsrai.w xr3, xr3, 14
+ xvpackev.h xr1, xr3, xr2 // re-interleave the low halfwords: even slots from xr2, odd slots from xr3
+ xvst xr1, a0, 0 // store the 16 results over the input
+ addi.d a0, a0, 32 // advance by 16 elements (32 bytes)
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3:
+ ld.h t4, a0, 0 // scalar tail: one element per iteration (up to 15 iterations)
+ vreplgr2vr.h vr1, t4 // 128-bit ops suffice here: broadcast the element,
+ vmin.h vr1, vr1, vr4 // clamp against vr4 (NOTE(review): relies on vr4 being the low 128 bits of xr4 set above),
+ vpickve2gr.h t4, vr1, 0 // and move lane 0 back to a general register
+ mul.w t4, t4, t0
+ add.w t4, t4, t1
+ srai.w t4, t4, 14
+ st.h t4, a0, 0
+ addi.d a0, a0, 2 // next int16
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function chrRangeFromJpeg_lsx // in place on two int16 buffers (a0, a1): x = (x * 1799 + 4081085) >> 11
+ li.w t0, 1799 // t0 = multiplier
+ li.w t1, 4081085 // t1 = additive offset
+ vreplgr2vr.h vr0, t0 // broadcast the multiplier to every 16-bit lane
+ srli.w t2, a2, 3 // t2 = a2 / 8 = number of full 8-element vectors (a2 is width per the C prototype)
+ andi t3, a2, 7 // t3 = a2 % 8 = elements left for the scalar tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1:
+ vld vr1, a0, 0 // load 8 x int16 from the first buffer
+ vld vr2, a1, 0 // load 8 x int16 from the second buffer
+ vreplgr2vr.w vr3, t1 // four 32-bit accumulators (even/odd for each buffer) start at the offset
+ vreplgr2vr.w vr4, t1
+ vreplgr2vr.w vr5, t1
+ vreplgr2vr.w vr6, t1
+ vmaddwev.w.h vr3, vr0, vr1 // first buffer: += multiplier * even-indexed elements, widened to 32 bits
+ vmaddwod.w.h vr4, vr0, vr1 // first buffer: += multiplier * odd-indexed elements
+ vmaddwev.w.h vr5, vr0, vr2 // second buffer: even-indexed elements
+ vmaddwod.w.h vr6, vr0, vr2 // second buffer: odd-indexed elements
+ vsrai.w vr3, vr3, 11 // arithmetic shift right by 11
+ vsrai.w vr4, vr4, 11
+ vsrai.w vr5, vr5, 11
+ vsrai.w vr6, vr6, 11
+ vpackev.h vr1, vr4, vr3 // re-interleave the low halfwords back into element order
+ vpackev.h vr2, vr6, vr5
+ vst vr1, a0, 0 // store the results over the inputs
+ vst vr2, a1, 0
+ addi.d a0, a0, 16 // advance both buffers by 8 elements (16 bytes)
+ addi.d a1, a1, 16
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3:
+ ld.h t4, a0, 0 // scalar tail: same formula, one element of each buffer per iteration
+ ld.h t5, a1, 0
+ mul.w t4, t4, t0
+ mul.w t5, t5, t0
+ add.w t4, t4, t1
+ add.w t5, t5, t1
+ srai.w t4, t4, 11
+ srai.w t5, t5, 11
+ st.h t4, a0, 0
+ st.h t5, a1, 0
+ addi.d a0, a0, 2 // next int16 in each buffer
+ addi.d a1, a1, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function chrRangeFromJpeg_lasx // in place on two int16 buffers (a0, a1): x = (x * 1799 + 4081085) >> 11, 256-bit variant
+ li.w t0, 1799 // t0 = multiplier
+ li.w t1, 4081085 // t1 = additive offset
+ xvreplgr2vr.h xr0, t0 // broadcast the multiplier to every 16-bit lane
+ srli.w t2, a2, 4 // t2 = a2 / 16 = number of full 16-element vectors (a2 is width per the C prototype)
+ andi t3, a2, 15 // t3 = a2 % 16 = elements left for the scalar tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1:
+ xvld xr1, a0, 0 // load 16 x int16 from the first buffer
+ xvld xr2, a1, 0 // load 16 x int16 from the second buffer
+ xvreplgr2vr.w xr3, t1 // four 32-bit accumulators (even/odd for each buffer) start at the offset
+ xvreplgr2vr.w xr4, t1
+ xvreplgr2vr.w xr5, t1
+ xvreplgr2vr.w xr6, t1
+ xvmaddwev.w.h xr3, xr0, xr1 // first buffer: += multiplier * even-indexed elements, widened to 32 bits
+ xvmaddwod.w.h xr4, xr0, xr1 // first buffer: += multiplier * odd-indexed elements
+ xvmaddwev.w.h xr5, xr0, xr2 // second buffer: even-indexed elements
+ xvmaddwod.w.h xr6, xr0, xr2 // second buffer: odd-indexed elements
+ xvsrai.w xr3, xr3, 11 // arithmetic shift right by 11
+ xvsrai.w xr4, xr4, 11
+ xvsrai.w xr5, xr5, 11
+ xvsrai.w xr6, xr6, 11
+ xvpackev.h xr1, xr4, xr3 // re-interleave the low halfwords back into element order
+ xvpackev.h xr2, xr6, xr5
+ xvst xr1, a0, 0 // store the results over the inputs
+ xvst xr2, a1, 0
+ addi.d a0, a0, 32 // advance both buffers by 16 elements (32 bytes)
+ addi.d a1, a1, 32
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3:
+ ld.h t4, a0, 0 // scalar tail: same formula, one element of each buffer per iteration
+ ld.h t5, a1, 0
+ mul.w t4, t4, t0
+ mul.w t5, t5, t0
+ add.w t4, t4, t1
+ add.w t5, t5, t1
+ srai.w t4, t4, 11
+ srai.w t5, t5, 11
+ st.h t4, a0, 0
+ st.h t5, a1, 0
+ addi.d a0, a0, 2 // next int16 in each buffer
+ addi.d a1, a1, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function chrRangeToJpeg_lsx // in place on two int16 buffers (a0, a1): x = (min(x, 30775) * 4663 - 9289992) >> 12
+ li.w t0, 4663 // t0 = multiplier
+ li.w t1, -9289992 // t1 = additive (negative) offset
+ li.w t2, 30775 // upper clamp applied to the input; t2 is reused as the loop counter below
+ vreplgr2vr.h vr0, t0 // broadcast the multiplier to every 16-bit lane
+ vreplgr2vr.h vr7, t2 // broadcast the clamp value to every 16-bit lane
+ srli.w t2, a2, 3 // t2 = a2 / 8 = number of full 8-element vectors (a2 is width per the C prototype)
+ andi t3, a2, 7 // t3 = a2 % 8 = elements left for the scalar tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1:
+ vld vr1, a0, 0 // load 8 x int16 from the first buffer
+ vld vr2, a1, 0 // load 8 x int16 from the second buffer
+ vreplgr2vr.w vr3, t1 // four 32-bit accumulators (even/odd for each buffer) start at the offset
+ vreplgr2vr.w vr4, t1
+ vreplgr2vr.w vr5, t1
+ vreplgr2vr.w vr6, t1
+ vmin.h vr1, vr1, vr7 // signed min: clamp both inputs to at most 30775
+ vmin.h vr2, vr2, vr7
+ vmaddwev.w.h vr3, vr0, vr1 // first buffer: += multiplier * even-indexed elements, widened to 32 bits
+ vmaddwod.w.h vr4, vr0, vr1 // first buffer: += multiplier * odd-indexed elements
+ vmaddwev.w.h vr5, vr0, vr2 // second buffer: even-indexed elements
+ vmaddwod.w.h vr6, vr0, vr2 // second buffer: odd-indexed elements
+ vsrai.w vr3, vr3, 12 // arithmetic shift right by 12
+ vsrai.w vr4, vr4, 12
+ vsrai.w vr5, vr5, 12
+ vsrai.w vr6, vr6, 12
+ vpackev.h vr1, vr4, vr3 // re-interleave the low halfwords back into element order
+ vpackev.h vr2, vr6, vr5
+ vst vr1, a0, 0 // store the results over the inputs
+ vst vr2, a1, 0
+ addi.d a0, a0, 16 // advance both buffers by 8 elements (16 bytes)
+ addi.d a1, a1, 16
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3:
+ ld.h t4, a0, 0 // scalar tail: one element of each buffer per iteration
+ ld.h t5, a1, 0
+ vreplgr2vr.h vr1, t4 // the clamp is done through the vector unit: broadcast each element,
+ vreplgr2vr.h vr2, t5
+ vmin.h vr1, vr1, vr7 // take the signed min against the clamp vector,
+ vmin.h vr2, vr2, vr7
+ vpickve2gr.h t4, vr1, 0 // and move lane 0 back to a general register
+ vpickve2gr.h t5, vr2, 0
+ mul.w t4, t4, t0
+ mul.w t5, t5, t0
+ add.w t4, t4, t1
+ add.w t5, t5, t1
+ srai.w t4, t4, 12
+ srai.w t5, t5, 12
+ st.h t4, a0, 0
+ st.h t5, a1, 0
+ addi.d a0, a0, 2 // next int16 in each buffer
+ addi.d a1, a1, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
+
+function chrRangeToJpeg_lasx // in place on two int16 buffers (a0, a1): x = (min(x, 30775) * 4663 - 9289992) >> 12, 256-bit variant
+ li.w t0, 4663 // t0 = multiplier
+ li.w t1, -9289992 // t1 = additive (negative) offset
+ li.w t2, 30775 // upper clamp applied to the input; t2 is reused as the loop counter below
+ xvreplgr2vr.h xr0, t0 // broadcast the multiplier to every 16-bit lane
+ xvreplgr2vr.h xr7, t2 // broadcast the clamp value to every 16-bit lane
+ srli.w t2, a2, 4 // t2 = a2 / 16 = number of full 16-element vectors (a2 is width per the C prototype)
+ andi t3, a2, 15 // t3 = a2 % 16 = elements left for the scalar tail
+ beqz t2, 2f // no full vector: go straight to the tail
+1:
+ xvld xr1, a0, 0 // load 16 x int16 from the first buffer
+ xvld xr2, a1, 0 // load 16 x int16 from the second buffer
+ xvreplgr2vr.w xr3, t1 // four 32-bit accumulators (even/odd for each buffer) start at the offset
+ xvreplgr2vr.w xr4, t1
+ xvreplgr2vr.w xr5, t1
+ xvreplgr2vr.w xr6, t1
+ xvmin.h xr1, xr1, xr7 // signed min: clamp both inputs to at most 30775
+ xvmin.h xr2, xr2, xr7
+ xvmaddwev.w.h xr3, xr0, xr1 // first buffer: += multiplier * even-indexed elements, widened to 32 bits
+ xvmaddwod.w.h xr4, xr0, xr1 // first buffer: += multiplier * odd-indexed elements
+ xvmaddwev.w.h xr5, xr0, xr2 // second buffer: even-indexed elements
+ xvmaddwod.w.h xr6, xr0, xr2 // second buffer: odd-indexed elements
+ xvsrai.w xr3, xr3, 12 // arithmetic shift right by 12
+ xvsrai.w xr4, xr4, 12
+ xvsrai.w xr5, xr5, 12
+ xvsrai.w xr6, xr6, 12
+ xvpackev.h xr1, xr4, xr3 // re-interleave the low halfwords back into element order
+ xvpackev.h xr2, xr6, xr5
+ xvst xr1, a0, 0 // store the results over the inputs
+ xvst xr2, a1, 0
+ addi.d a0, a0, 32 // advance both buffers by 16 elements (32 bytes)
+ addi.d a1, a1, 32
+ addi.d t2, t2, -1
+ bnez t2, 1b
+2:
+ beqz t3, 4f // nothing left over
+3:
+ ld.h t4, a0, 0 // scalar tail: one element of each buffer per iteration (up to 15 iterations)
+ ld.h t5, a1, 0
+ vreplgr2vr.h vr1, t4 // 128-bit ops suffice here: broadcast each element,
+ vreplgr2vr.h vr2, t5
+ vmin.h vr1, vr1, vr7 // clamp against vr7 (NOTE(review): relies on vr7 being the low 128 bits of xr7 set above),
+ vmin.h vr2, vr2, vr7
+ vpickve2gr.h t4, vr1, 0 // and move lane 0 back to a general register
+ vpickve2gr.h t5, vr2, 0
+ mul.w t4, t4, t0
+ mul.w t5, t5, t0
+ add.w t4, t4, t1
+ add.w t5, t5, t1
+ srai.w t4, t4, 12
+ srai.w t5, t5, 12
+ st.h t4, a0, 0
+ st.h t5, a1, 0
+ addi.d a0, a0, 2 // next int16 in each buffer
+ addi.d a1, a1, 2
+ addi.d t3, t3, -1
+ bnez t3, 3b
+4:
+endfunc
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 53e4f970b6..6d2786c55f 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -24,6 +24,38 @@
#include "libswscale/rgb2rgb.h"
#include "libavutil/loongarch/cpu.h"
+/* Select LSX/LASX range-conversion routines; does nothing unless eligible. */
+av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    /* Eligible only if ranges differ, dst is not RGB and dstBpc <= 14. */
+    if (c->srcRange == c->dstRange || isAnyRGB(c->dstFormat) || c->dstBpc > 14)
+        return;
+
+    if (have_lsx(cpu_flags)) {
+        if (c->srcRange) {
+            c->lumConvertRange = lumRangeFromJpeg_lsx;
+            c->chrConvertRange = chrRangeFromJpeg_lsx;
+        } else {
+            c->lumConvertRange = lumRangeToJpeg_lsx;
+            c->chrConvertRange = chrRangeToJpeg_lsx;
+        }
+    }
+#if HAVE_LASX
+    /* The *_lasx prototypes are only declared under HAVE_LASX. */
+    if (have_lasx(cpu_flags)) {
+        if (c->srcRange) {
+            c->lumConvertRange = lumRangeFromJpeg_lasx;
+            c->chrConvertRange = chrRangeFromJpeg_lasx;
+        } else {
+            c->lumConvertRange = lumRangeToJpeg_lasx;
+            c->chrConvertRange = chrRangeToJpeg_lasx;
+        }
+    }
+#endif // #if HAVE_LASX
+}
+
av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
@@ -77,6 +109,7 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
c->yuv2planeX = ff_yuv2planeX_8_lasx;
}
#endif // #if HAVE_LASX
+ ff_sws_init_range_convert_loongarch(c);
}
av_cold void rgb2rgb_init_loongarch(void)
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index 0514abae21..c96b085982 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -50,6 +50,11 @@ void ff_hscale_16_to_19_sub_lsx(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize, int sh);
+void lumRangeFromJpeg_lsx(int16_t *dst, int width);
+void chrRangeFromJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
+void lumRangeToJpeg_lsx(int16_t *dst, int width);
+void chrRangeToJpeg_lsx(int16_t *dstU, int16_t *dstV, int width);
+
void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
int width, int32_t *rgb2yuv, void *opq);
@@ -97,6 +102,11 @@ void ff_hscale_16_to_15_lasx(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
+void lumRangeFromJpeg_lasx(int16_t *dst, int width);
+void chrRangeFromJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
+void lumRangeToJpeg_lasx(int16_t *dst, int width);
+void chrRangeToJpeg_lasx(int16_t *dstU, int16_t *dstV, int width);
+
void planar_rgb_to_uv_lasx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
int width, int32_t *rgb2yuv, void *opq);
@@ -130,6 +140,7 @@ void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
const uint8_t *dither, int offset);
av_cold void ff_sws_init_output_lasx(SwsContext *c);
+
#endif // #if HAVE_LASX
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index abeebbb002..0db581acf8 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -695,6 +695,7 @@ void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
void ff_updateMMXDitherTables(SwsContext *c, int dstY);
av_cold void ff_sws_init_range_convert(SwsContext *c);
+av_cold void ff_sws_init_range_convert_loongarch(SwsContext *c);
SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
diff --git a/libswscale/utils.c b/libswscale/utils.c
index ab8a68e241..47db65ef0e 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -1049,8 +1049,12 @@ int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
c->srcRange = srcRange;
c->dstRange = dstRange;
- if (need_reinit)
+ if (need_reinit) {
ff_sws_init_range_convert(c);
+#if ARCH_LOONGARCH64
+ ff_sws_init_range_convert_loongarch(c);
+#endif
+ }
c->dstFormatBpp = av_get_bits_per_pixel(desc_dst);
c->srcFormatBpp = av_get_bits_per_pixel(desc_src);
--
2.20.1
More information about the ffmpeg-devel
mailing list