[FFmpeg-devel] [PATCH v2 3/4] libswscale/x86/rgb2rgb: add uyvytoyuv422 avx2
Wu Jianhua
jianhua.wu at intel.com
Thu Sep 30 04:56:11 EEST 2021
With AVX2 acceleration, uyvytoyuv422 runs faster.
Performance data (lower is better):
uyvytoyuv422_sse2 0.50388
uyvytoyuv422_avx 0.46132
uyvytoyuv422_avx2 0.27309
Signed-off-by: Wu Jianhua <jianhua.wu at intel.com>
---
libswscale/x86/rgb2rgb.c | 6 ++++
libswscale/x86/rgb_2_rgb.asm | 60 ++++++++++++++++++++++++++++--------
2 files changed, 53 insertions(+), 13 deletions(-)
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index c9ff33ab77..a965a1755c 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -164,6 +164,9 @@ void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
#endif
av_cold void rgb2rgb_init_x86(void)
@@ -216,5 +219,8 @@ av_cold void rgb2rgb_init_x86(void)
if (EXTERNAL_AVX(cpu_flags)) {
uyvytoyuv422 = ff_uyvytoyuv422_avx;
}
+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+ uyvytoyuv422 = ff_uyvytoyuv422_avx2;
+ }
#endif
}
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 3380a1272c..41ac91a747 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -26,14 +26,22 @@
SECTION_RODATA
pb_mask_shuffle2103_mmx times 8 dw 255
+pb_shuffle_low: times 4 db 1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+pd_permd256_uv: dd 0, 4, 1, 5, 2, 6, 3, 7
SECTION .text
+%macro VPERM 5
+%if mmsize == %2
+ vperm%1 %3, %4, %5
+%endif
+%endmacro
+
%macro RSHIFT_COPY 3
; %1 dst ; %2 src ; %3 shift
%if cpuflag(avx)
@@ -198,11 +206,16 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
mov whalfq, wq
shr whalfq, 1 ; whalf = width / 2
- lea srcq, [srcq + wq * 2]
+ lea srcq, [srcq + wq * 2]
add ydstq, wq
add udstq, whalfq
add vdstq, whalfq
+%if mmsize > 16
+ movu m13, [pb_shuffle_low]
+ movu m15, [pd_permd256_uv]
+%endif
+
.loop_line:
mov xq, wq
mov wtwoq, wq
@@ -246,23 +259,35 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
movu m5, [srcq + wtwoq + mmsize * 3]
; extract y part 1
+%if mmsize < 32
RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
pand m6, m1; YxYx YxYx...
RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
pand m7, m1 ; YxYx YxYx...
-
packuswb m6, m7 ; YYYY YYYY...
+%else
+ pshufb m6, m2, m13
+ pshufb m7, m3, m13
+ punpcklqdq m6, m6, m7
+ VPERM q, 32, m6, m6, 0xd8
+%endif
movu [ydstq + wq], m6
; extract y part 2
+%if mmsize < 32
RSHIFT_COPY m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
pand m6, m1; YxYx YxYx...
RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
pand m7, m1 ; YxYx YxYx...
-
- packuswb m6, m7 ; YYYY YYYY...
+ packuswb m6, m7 ; YYYY YYYY...
+%else
+ pshufb m6, m4, m13
+ pshufb m7, m5, m13
+ punpcklqdq m6, m6, m7
+ VPERM q, 32, m6, m6, 0xd8
+%endif
movu [ydstq + wq + mmsize], m6
; extract uv
@@ -275,17 +300,21 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
packuswb m4, m5 ; UVUV...
; U
- pand m6, m2, m1 ; UxUx...
- pand m7, m4, m1 ; UxUx...
+ pand m6, m2, m1 ; UxUx...
+ pand m7, m4, m1 ; UxUx...
+ packuswb m6, m7 ; UUUU
- packuswb m6, m7 ; UUUU
- movu [udstq + whalfq], m6
+ VPERM d, 32, m6, m15, m6
+ movu [udstq + whalfq], m6
; V
- psrlw m2, 8 ; VxVx...
- psrlw m4, 8 ; VxVx...
- packuswb m2, m4 ; VVVV
+ psrlw m2, 8 ; VxVx...
+ psrlw m4, 8 ; VxVx...
+ packuswb m2, m4 ; VVVV
+
+ VPERM d, 32, m2, m15, m2
+
movu [vdstq + whalfq], m2
add whalfq, mmsize
@@ -294,13 +323,13 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
jl .loop_simd
.end_line:
- add srcq, src_strideq
+ add srcq, src_strideq
add ydstq, lum_strideq
add udstq, chrom_strideq
add vdstq, chrom_strideq
;restore initial state of line variable
- mov wq, back_wq
+ mov wq, back_wq
mov xq, wq
mov whalfq, wq
shr whalfq, 1 ; whalf = width / 2
@@ -316,4 +345,9 @@ UYVY_TO_YUV422
INIT_XMM avx
UYVY_TO_YUV422
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+UYVY_TO_YUV422
+%endif
%endif
--
2.17.1
More information about the ffmpeg-devel
mailing list