[FFmpeg-devel] [PATCH v3 5/7] swscale/x86: add sse2, sse4, and avx2 {lum, chr}ConvertRange16
Ramiro Polla
ramiro.polla at gmail.com
Sat Nov 30 17:23:40 EET 2024
Benchmark timings (lower is better; speedup over the C version in parentheses):
chrRangeFromJpeg16_1920_c: 5809.1
chrRangeFromJpeg16_1920_sse2: 3909.5 ( 1.49x)
chrRangeFromJpeg16_1920_avx2: 1985.3 ( 2.93x)
chrRangeToJpeg16_1920_c: 9261.5
chrRangeToJpeg16_1920_sse2: 6053.2 ( 1.53x)
chrRangeToJpeg16_1920_sse4: 4493.6 ( 2.06x)
chrRangeToJpeg16_1920_avx2: 2405.2 ( 3.85x)
lumRangeFromJpeg16_1920_c: 4143.4
lumRangeFromJpeg16_1920_sse2: 1982.5 ( 2.09x)
lumRangeFromJpeg16_1920_avx2: 1040.8 ( 3.98x)
lumRangeToJpeg16_1920_c: 5139.5
lumRangeToJpeg16_1920_sse2: 3041.1 ( 1.69x)
lumRangeToJpeg16_1920_sse4: 2313.0 ( 2.22x)
lumRangeToJpeg16_1920_avx2: 1181.2 ( 4.35x)
---
libswscale/x86/range_convert.asm | 116 +++++++++++++++++++++++++------
libswscale/x86/swscale.c | 50 +++++++++----
2 files changed, 132 insertions(+), 34 deletions(-)
diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm
index 27be2a4b31..6c33e6b397 100644
--- a/libswscale/x86/range_convert.asm
+++ b/libswscale/x86/range_convert.asm
@@ -20,21 +20,24 @@
%include "libavutil/x86/x86util.asm"
+SECTION_RODATA
+pack19: times 4 dd (1 << 19) - 1
+
SECTION .text
;-----------------------------------------------------------------------------
; lumConvertRange
;
-; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width,
-; uint32_t coeff, int64_t offset);
-; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width,
-; uint32_t coeff, int64_t offset);
+; void ff_lumRangeToJpeg{8,16}_<opt>(int16_t *dst, int width,
+; uint32_t coeff, int64_t offset);
+; void ff_lumRangeFromJpeg{8,16}_<opt>(int16_t *dst, int width,
+; uint32_t coeff, int64_t offset);
;
;-----------------------------------------------------------------------------
-%macro LUMCONVERTRANGE 1
-cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
- shl widthd, 1
+%macro LUMCONVERTRANGE 2
+cglobal lumRange%1Jpeg%2, 4, 4, 5, dst, width, coeff, offset
+ shl widthd, %2 >> 3
movd xm2, coeffd
VBROADCASTSS m2, xm2
%if ARCH_X86_64
@@ -42,12 +45,34 @@ cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
%else
movq xm3, offsetm
%endif
+%if %2 == 16
+ VBROADCASTSD m3, xm3
+%ifidni %1,To
+ VBROADCASTI128 m4, [pack19]
+%endif
+%elif %2 == 8
VBROADCASTSS m3, xm3
pxor m4, m4
+%endif ; %2 == 8/16
add dstq, widthq
neg widthq
.loop:
movu m0, [dstq+widthq]
+%if %2 == 16
+ pshufd m1, m0, 0xb1
+ pmuludq m0, m2
+ pmuludq m1, m2
+ paddq m0, m3
+ paddq m1, m3
+ psrlq m0, 18
+ psrlq m1, 18
+ pshufd m0, m0, 0xd8
+ pshufd m1, m1, 0xd8
+ punpckldq m0, m1
+%ifidni %1,To
+ PMINSD m0, m4, m1
+%endif
+%elif %2 == 8
punpckhwd m1, m0, m4
punpcklwd m0, m4
pmaddwd m0, m2
@@ -57,6 +82,7 @@ cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
psrad m0, 14
psrad m1, 14
packssdw m0, m1
+%endif ; %2 == 8/16
movu [dstq+widthq], m0
add widthq, mmsize
jl .loop
@@ -66,16 +92,16 @@ cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
;-----------------------------------------------------------------------------
; chrConvertRange
;
-; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
-; uint32_t coeff, int64_t offset);
-; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
-; uint32_t coeff, int64_t offset);
+; void ff_chrRangeToJpeg{8,16}_<opt>(int16_t *dstU, int16_t *dstV, int width,
+; uint32_t coeff, int64_t offset);
+; void ff_chrRangeFromJpeg{8,16}_<opt>(int16_t *dstU, int16_t *dstV, int width,
+; uint32_t coeff, int64_t offset);
;
;-----------------------------------------------------------------------------
-%macro CHRCONVERTRANGE 1
-cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
- shl widthd, 1
+%macro CHRCONVERTRANGE 2
+cglobal chrRange%1Jpeg%2, 5, 5, 7, dstU, dstV, width, coeff, offset
+ shl widthd, %2 >> 3
movd xm4, coeffd
VBROADCASTSS m4, xm4
%if ARCH_X86_64
@@ -83,14 +109,47 @@ cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
%else
movq xm5, offsetm
%endif
+%if %2 == 16
+ VBROADCASTSD m5, xm5
+%ifidni %1,To
+ VBROADCASTI128 m6, [pack19]
+%endif
+%elif %2 == 8
VBROADCASTSS m5, xm5
pxor m6, m6
+%endif ; %2 == 8/16
add dstUq, widthq
add dstVq, widthq
neg widthq
.loop:
movu m0, [dstUq+widthq]
movu m2, [dstVq+widthq]
+%if %2 == 16
+ pshufd m1, m0, 0xb1
+ pshufd m3, m2, 0xb1
+ pmuludq m0, m4
+ pmuludq m1, m4
+ pmuludq m2, m4
+ pmuludq m3, m4
+ paddq m0, m5
+ paddq m1, m5
+ paddq m2, m5
+ paddq m3, m5
+ psrlq m0, 18
+ psrlq m1, 18
+ psrlq m2, 18
+ psrlq m3, 18
+ pshufd m0, m0, 0xd8
+ pshufd m1, m1, 0xd8
+ pshufd m2, m2, 0xd8
+ pshufd m3, m3, 0xd8
+ punpckldq m0, m1
+ punpckldq m2, m3
+%ifidni %1,To
+ PMINSD m0, m6, m1
+ PMINSD m2, m6, m3
+%endif
+%elif %2 == 8
punpckhwd m1, m0, m6
punpckhwd m3, m2, m6
punpcklwd m0, m6
@@ -109,6 +168,7 @@ cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
psrad m3, 14
packssdw m0, m1
packssdw m2, m3
+%endif ; %2 == 8/16
movu [dstUq+widthq], m0
movu [dstVq+widthq], m2
add widthq, mmsize
@@ -117,15 +177,27 @@ cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
%endmacro
INIT_XMM sse2
-LUMCONVERTRANGE To
-CHRCONVERTRANGE To
-LUMCONVERTRANGE From
-CHRCONVERTRANGE From
+LUMCONVERTRANGE To, 8
+LUMCONVERTRANGE To, 16
+CHRCONVERTRANGE To, 8
+CHRCONVERTRANGE To, 16
+LUMCONVERTRANGE From, 8
+LUMCONVERTRANGE From, 16
+CHRCONVERTRANGE From, 8
+CHRCONVERTRANGE From, 16
+
+INIT_XMM sse4
+LUMCONVERTRANGE To, 16
+CHRCONVERTRANGE To, 16
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
-LUMCONVERTRANGE To
-CHRCONVERTRANGE To
-LUMCONVERTRANGE From
-CHRCONVERTRANGE From
+LUMCONVERTRANGE To, 8
+LUMCONVERTRANGE To, 16
+CHRCONVERTRANGE To, 8
+CHRCONVERTRANGE To, 16
+LUMCONVERTRANGE From, 8
+LUMCONVERTRANGE From, 16
+CHRCONVERTRANGE From, 8
+CHRCONVERTRANGE From, 16
%endif
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 550ad99f3f..0bf6a13886 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -454,26 +454,46 @@ INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
#define RANGE_CONVERT_FUNCS(opt) do { \
if (c->dstBpc <= 14) { \
if (c->opts.src_range) { \
- c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt; \
- c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt; \
+ c->lumConvertRange = ff_lumRangeFromJpeg8_ ##opt; \
+ c->chrConvertRange = ff_chrRangeFromJpeg8_ ##opt; \
} else { \
- c->lumConvertRange = ff_lumRangeToJpeg_ ##opt; \
- c->chrConvertRange = ff_chrRangeToJpeg_ ##opt; \
+ c->lumConvertRange = ff_lumRangeToJpeg8_ ##opt; \
+ c->chrConvertRange = ff_chrRangeToJpeg8_ ##opt; \
+ } \
+ } else { \
+ if (c->opts.src_range) { \
+ c->lumConvertRange = ff_lumRangeFromJpeg16_ ##opt; \
+ c->chrConvertRange = ff_chrRangeFromJpeg16_ ##opt; \
+ } else { \
+ c->lumConvertRange = ff_lumRangeToJpeg16_ ##opt; \
+ c->chrConvertRange = ff_chrRangeToJpeg16_ ##opt; \
} \
} \
} while (0)
#define RANGE_CONVERT_FUNCS_DECL(opt) \
-void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width, \
+void ff_lumRangeFromJpeg8_ ##opt(int16_t *dst, int width, \
+ uint32_t coeff, int64_t offset); \
+void ff_chrRangeFromJpeg8_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
+ uint32_t coeff, int64_t offset); \
+void ff_lumRangeToJpeg8_ ##opt(int16_t *dst, int width, \
+ uint32_t coeff, int64_t offset); \
+void ff_chrRangeToJpeg8_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
+ uint32_t coeff, int64_t offset); \
+void ff_lumRangeFromJpeg16_ ##opt(int16_t *dst, int width, \
+ uint32_t coeff, int64_t offset); \
+void ff_chrRangeFromJpeg16_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
+ uint32_t coeff, int64_t offset); \
+void ff_lumRangeToJpeg16_ ##opt(int16_t *dst, int width, \
uint32_t coeff, int64_t offset); \
-void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
+void ff_chrRangeToJpeg16_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
uint32_t coeff, int64_t offset); \
-void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width, \
- uint32_t coeff, int64_t offset); \
-void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
- uint32_t coeff, int64_t offset); \
RANGE_CONVERT_FUNCS_DECL(sse2);
+void ff_lumRangeToJpeg16_sse4(int16_t *dst, int width,
+ uint32_t coeff, int64_t offset);
+void ff_chrRangeToJpeg16_sse4(int16_t *dstU, int16_t *dstV, int width,
+ uint32_t coeff, int64_t offset);
RANGE_CONVERT_FUNCS_DECL(avx2);
av_cold void ff_sws_init_range_convert_x86(SwsInternal *c)
@@ -481,8 +501,14 @@ av_cold void ff_sws_init_range_convert_x86(SwsInternal *c)
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
RANGE_CONVERT_FUNCS(avx2);
- } else if (EXTERNAL_SSE2(cpu_flags)) {
- RANGE_CONVERT_FUNCS(sse2);
+ } else {
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ RANGE_CONVERT_FUNCS(sse2);
+ }
+ if (EXTERNAL_SSE4(cpu_flags) && c->dstBpc > 14 && !c->opts.src_range) {
+ c->lumConvertRange = ff_lumRangeToJpeg16_sse4;
+ c->chrConvertRange = ff_chrRangeToJpeg16_sse4;
+ }
}
}
--
2.39.5
More information about the ffmpeg-devel mailing list