[FFmpeg-cvslog] swscale/x86: add sse4 and avx2 {lum,chr}ConvertRange16
Ramiro Polla
git at videolan.org
Thu Dec 5 22:16:58 EET 2024
ffmpeg | branch: master | Ramiro Polla <ramiro.polla at gmail.com> | Sun Sep 22 13:30:03 2024 +0200| [87052c09336e4dc001e98d618c83ebe2d3bbc970] | committer: Ramiro Polla
swscale/x86: add sse4 and avx2 {lum,chr}ConvertRange16
chrRangeFromJpeg16_1920_c: 3153.9
chrRangeFromJpeg16_1920_sse4: 1770.0 (1.78x)
chrRangeFromJpeg16_1920_avx2: 891.5 (3.54x)
chrRangeToJpeg16_1920_c: 3165.0
chrRangeToJpeg16_1920_sse4: 1953.2 (1.62x)
chrRangeToJpeg16_1920_avx2: 973.0 (3.25x)
lumRangeFromJpeg16_1920_c: 1298.5
lumRangeFromJpeg16_1920_sse4: 886.5 (1.46x)
lumRangeFromJpeg16_1920_avx2: 447.7 (2.90x)
lumRangeToJpeg16_1920_c: 1905.0
lumRangeToJpeg16_1920_sse4: 993.0 (1.92x)
lumRangeToJpeg16_1920_avx2: 498.9 (3.82x)
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=87052c09336e4dc001e98d618c83ebe2d3bbc970
---
libswscale/x86/range_convert.asm | 114 +++++++++++++++++++++++++++++++--------
libswscale/x86/swscale.c | 54 ++++++++++---------
2 files changed, 122 insertions(+), 46 deletions(-)
diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm
index 27be2a4b31..e5b8866a1f 100644
--- a/libswscale/x86/range_convert.asm
+++ b/libswscale/x86/range_convert.asm
@@ -20,21 +20,24 @@
%include "libavutil/x86/x86util.asm"
+SECTION_RODATA
+pack19: times 4 dd (1 << 19) - 1
+
SECTION .text
;-----------------------------------------------------------------------------
; lumConvertRange
;
-; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width,
-; uint32_t coeff, int64_t offset);
-; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width,
-; uint32_t coeff, int64_t offset);
+; void ff_lumRangeToJpeg{8,16}_<opt>(int16_t *dst, int width,
+; uint32_t coeff, int64_t offset);
+; void ff_lumRangeFromJpeg{8,16}_<opt>(int16_t *dst, int width,
+; uint32_t coeff, int64_t offset);
;
;-----------------------------------------------------------------------------
-%macro LUMCONVERTRANGE 1
-cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
- shl widthd, 1
+%macro LUMCONVERTRANGE 2
+cglobal lumRange%1Jpeg%2, 4, 4, 5, dst, width, coeff, offset
+ shl widthd, %2 >> 3
movd xm2, coeffd
VBROADCASTSS m2, xm2
%if ARCH_X86_64
@@ -42,12 +45,34 @@ cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
%else
movq xm3, offsetm
%endif
+%if %2 == 16
+ VBROADCASTSD m3, xm3
+%ifidni %1,To
+ VBROADCASTI128 m4, [pack19]
+%endif
+%elif %2 == 8
VBROADCASTSS m3, xm3
pxor m4, m4
+%endif ; %2 == 8/16
add dstq, widthq
neg widthq
.loop:
movu m0, [dstq+widthq]
+%if %2 == 16
+ pshufd m1, m0, 0xb1
+ pmuldq m0, m2
+ pmuldq m1, m2
+ paddq m0, m3
+ paddq m1, m3
+ psrlq m0, 18
+ psrlq m1, 18
+ pshufd m0, m0, 0xd8
+ pshufd m1, m1, 0xd8
+ punpckldq m0, m1
+%ifidni %1,To
+ PMINSD m0, m4, m1
+%endif
+%elif %2 == 8
punpckhwd m1, m0, m4
punpcklwd m0, m4
pmaddwd m0, m2
@@ -57,6 +82,7 @@ cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
psrad m0, 14
psrad m1, 14
packssdw m0, m1
+%endif ; %2 == 8/16
movu [dstq+widthq], m0
add widthq, mmsize
jl .loop
@@ -66,16 +92,16 @@ cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
;-----------------------------------------------------------------------------
; chrConvertRange
;
-; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
-; uint32_t coeff, int64_t offset);
-; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
-; uint32_t coeff, int64_t offset);
+; void ff_chrRangeToJpeg{8,16}_<opt>(int16_t *dstU, int16_t *dstV, int width,
+; uint32_t coeff, int64_t offset);
+; void ff_chrRangeFromJpeg{8,16}_<opt>(int16_t *dstU, int16_t *dstV, int width,
+; uint32_t coeff, int64_t offset);
;
;-----------------------------------------------------------------------------
-%macro CHRCONVERTRANGE 1
-cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
- shl widthd, 1
+%macro CHRCONVERTRANGE 2
+cglobal chrRange%1Jpeg%2, 5, 5, 7, dstU, dstV, width, coeff, offset
+ shl widthd, %2 >> 3
movd xm4, coeffd
VBROADCASTSS m4, xm4
%if ARCH_X86_64
@@ -83,14 +109,47 @@ cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
%else
movq xm5, offsetm
%endif
+%if %2 == 16
+ VBROADCASTSD m5, xm5
+%ifidni %1,To
+ VBROADCASTI128 m6, [pack19]
+%endif
+%elif %2 == 8
VBROADCASTSS m5, xm5
pxor m6, m6
+%endif ; %2 == 8/16
add dstUq, widthq
add dstVq, widthq
neg widthq
.loop:
movu m0, [dstUq+widthq]
movu m2, [dstVq+widthq]
+%if %2 == 16
+ pshufd m1, m0, 0xb1
+ pshufd m3, m2, 0xb1
+ pmuldq m0, m4
+ pmuldq m1, m4
+ pmuldq m2, m4
+ pmuldq m3, m4
+ paddq m0, m5
+ paddq m1, m5
+ paddq m2, m5
+ paddq m3, m5
+ psrlq m0, 18
+ psrlq m1, 18
+ psrlq m2, 18
+ psrlq m3, 18
+ pshufd m0, m0, 0xd8
+ pshufd m1, m1, 0xd8
+ pshufd m2, m2, 0xd8
+ pshufd m3, m3, 0xd8
+ punpckldq m0, m1
+ punpckldq m2, m3
+%ifidni %1,To
+ PMINSD m0, m6, m1
+ PMINSD m2, m6, m3
+%endif
+%elif %2 == 8
punpckhwd m1, m0, m6
punpckhwd m3, m2, m6
punpcklwd m0, m6
@@ -109,6 +168,7 @@ cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
psrad m3, 14
packssdw m0, m1
packssdw m2, m3
+%endif ; %2 == 8/16
movu [dstUq+widthq], m0
movu [dstVq+widthq], m2
add widthq, mmsize
@@ -117,15 +177,25 @@ cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
%endmacro
INIT_XMM sse2
-LUMCONVERTRANGE To
-CHRCONVERTRANGE To
-LUMCONVERTRANGE From
-CHRCONVERTRANGE From
+LUMCONVERTRANGE To, 8
+CHRCONVERTRANGE To, 8
+LUMCONVERTRANGE From, 8
+CHRCONVERTRANGE From, 8
+
+INIT_XMM sse4
+LUMCONVERTRANGE To, 16
+CHRCONVERTRANGE To, 16
+LUMCONVERTRANGE From, 16
+CHRCONVERTRANGE From, 16
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
-LUMCONVERTRANGE To
-CHRCONVERTRANGE To
-LUMCONVERTRANGE From
-CHRCONVERTRANGE From
+LUMCONVERTRANGE To, 8
+LUMCONVERTRANGE To, 16
+CHRCONVERTRANGE To, 8
+CHRCONVERTRANGE To, 16
+LUMCONVERTRANGE From, 8
+LUMCONVERTRANGE From, 16
+CHRCONVERTRANGE From, 8
+CHRCONVERTRANGE From, 16
%endif
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 550ad99f3f..a7985a3b01 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -451,38 +451,44 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2);
INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
#endif
-#define RANGE_CONVERT_FUNCS(opt) do { \
- if (c->dstBpc <= 14) { \
- if (c->opts.src_range) { \
- c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt; \
- c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt; \
- } else { \
- c->lumConvertRange = ff_lumRangeToJpeg_ ##opt; \
- c->chrConvertRange = ff_chrRangeToJpeg_ ##opt; \
- } \
+#define RANGE_CONVERT_FUNCS(opt, bpc) do { \
+ if (c->opts.src_range) { \
+ c->lumConvertRange = ff_lumRangeFromJpeg##bpc##_##opt; \
+ c->chrConvertRange = ff_chrRangeFromJpeg##bpc##_##opt; \
+ } else { \
+ c->lumConvertRange = ff_lumRangeToJpeg##bpc##_##opt; \
+ c->chrConvertRange = ff_chrRangeToJpeg##bpc##_##opt; \
} \
} while (0)
-#define RANGE_CONVERT_FUNCS_DECL(opt) \
-void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width, \
- uint32_t coeff, int64_t offset); \
-void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
- uint32_t coeff, int64_t offset); \
-void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width, \
- uint32_t coeff, int64_t offset); \
-void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
- uint32_t coeff, int64_t offset); \
-
-RANGE_CONVERT_FUNCS_DECL(sse2);
-RANGE_CONVERT_FUNCS_DECL(avx2);
+#define RANGE_CONVERT_FUNCS_DECL(opt, bpc) \
+void ff_lumRangeFromJpeg##bpc##_##opt(int16_t *dst, int width, \
+ uint32_t coeff, int64_t offset); \
+void ff_chrRangeFromJpeg##bpc##_##opt(int16_t *dstU, int16_t *dstV, int width, \
+ uint32_t coeff, int64_t offset); \
+void ff_lumRangeToJpeg##bpc##_##opt(int16_t *dst, int width, \
+ uint32_t coeff, int64_t offset); \
+void ff_chrRangeToJpeg##bpc##_##opt(int16_t *dstU, int16_t *dstV, int width, \
+ uint32_t coeff, int64_t offset); \
+
+RANGE_CONVERT_FUNCS_DECL(sse2, 8)
+RANGE_CONVERT_FUNCS_DECL(sse4, 16)
+RANGE_CONVERT_FUNCS_DECL(avx2, 8)
+RANGE_CONVERT_FUNCS_DECL(avx2, 16)
av_cold void ff_sws_init_range_convert_x86(SwsInternal *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
- RANGE_CONVERT_FUNCS(avx2);
- } else if (EXTERNAL_SSE2(cpu_flags)) {
- RANGE_CONVERT_FUNCS(sse2);
+ if (c->dstBpc <= 14) {
+ RANGE_CONVERT_FUNCS(avx2, 8);
+ } else {
+ RANGE_CONVERT_FUNCS(avx2, 16);
+ }
+ } else if (EXTERNAL_SSE2(cpu_flags) && c->dstBpc <= 14) {
+ RANGE_CONVERT_FUNCS(sse2, 8);
+ } else if (EXTERNAL_SSE4(cpu_flags) && c->dstBpc > 14) {
+ RANGE_CONVERT_FUNCS(sse4, 16);
}
}
More information about the ffmpeg-cvslog mailing list