[FFmpeg-cvslog] swscale/x86/rgb2rgb : port shuffle 2103 mmxext to external asm and remove inline asm version
Martin Vignali
git at videolan.org
Sat Oct 13 15:13:37 EEST 2018
ffmpeg | branch: master | Martin Vignali <martin.vignali at gmail.com> | Thu Oct 11 21:35:05 2018 +0200| [296609f859a587575b91fe9e9691f2707d6e8136] | committer: Martin Vignali
swscale/x86/rgb2rgb : port shuffle 2103 mmxext to external asm and remove inline asm version
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=296609f859a587575b91fe9e9691f2707d6e8136
---
libswscale/x86/rgb2rgb.c | 4 +++
libswscale/x86/rgb2rgb_template.c | 48 -----------------------------
libswscale/x86/rgb_2_rgb.asm | 63 +++++++++++++++++++++++++++++++++++++++
3 files changed, 67 insertions(+), 48 deletions(-)
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 1191081440..2d6fc2ad26 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -144,6 +144,7 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
#endif /* HAVE_INLINE_ASM */
+void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
@@ -176,6 +177,9 @@ av_cold void rgb2rgb_init_x86(void)
rgb2rgb_init_avx();
#endif /* HAVE_INLINE_ASM */
+ if (EXTERNAL_MMXEXT(cpu_flags)) {
+ shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
+ }
if (EXTERNAL_SSE2(cpu_flags)) {
#if ARCH_X86_64
uyvytoyuv422 = ff_uyvytoyuv422_sse2;
diff --git a/libswscale/x86/rgb2rgb_template.c b/libswscale/x86/rgb2rgb_template.c
index 287e1d3501..ae2469e663 100644
--- a/libswscale/x86/rgb2rgb_template.c
+++ b/libswscale/x86/rgb2rgb_template.c
@@ -1034,51 +1034,6 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
}
}
-#if COMPILE_TEMPLATE_MMXEXT
-static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
-{
- x86_reg idx = 15 - src_size;
- const uint8_t *s = src-idx;
- uint8_t *d = dst-idx;
- __asm__ volatile(
- "test %0, %0 \n\t"
- "jns 2f \n\t"
- PREFETCH" (%1, %0) \n\t"
- "movq %3, %%mm7 \n\t"
- "pxor %4, %%mm7 \n\t"
- "movq %%mm7, %%mm6 \n\t"
- "pxor %5, %%mm7 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 32(%1, %0) \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- "pshufw $177, %%mm0, %%mm3 \n\t"
- "pshufw $177, %%mm1, %%mm5 \n\t"
- "pand %%mm7, %%mm0 \n\t"
- "pand %%mm6, %%mm3 \n\t"
- "pand %%mm7, %%mm1 \n\t"
- "pand %%mm6, %%mm5 \n\t"
- "por %%mm3, %%mm0 \n\t"
- "por %%mm5, %%mm1 \n\t"
- MOVNTQ" %%mm0, (%2, %0) \n\t"
- MOVNTQ" %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "js 1b \n\t"
- SFENCE" \n\t"
- EMMS" \n\t"
- "2: \n\t"
- : "+&r"(idx)
- : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
- : "memory");
- for (; idx<15; idx+=4) {
- register unsigned v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
- v &= 0xff00ff;
- *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
- }
-}
-#endif
-
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
unsigned i;
@@ -2555,9 +2510,6 @@ static av_cold void RENAME(rgb2rgb_init)(void)
rgb24to15 = RENAME(rgb24to15);
rgb24to16 = RENAME(rgb24to16);
rgb24tobgr24 = RENAME(rgb24tobgr24);
-#if COMPILE_TEMPLATE_MMXEXT
- shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
-#endif
rgb32tobgr16 = RENAME(rgb32tobgr16);
rgb32tobgr15 = RENAME(rgb32tobgr15);
yv12toyuy2 = RENAME(yv12toyuy2);
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 156b4d2c74..5fb5d2ee61 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -24,6 +24,7 @@
SECTION_RODATA
+pb_mask_shuffle2103_mmx times 8 dw 255
pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
@@ -43,6 +44,68 @@ SECTION .text
%endmacro
;------------------------------------------------------------------------------
+; shuffle_bytes_2103_mmxext (const uint8_t *src, uint8_t *dst, int src_size)
+;------------------------------------------------------------------------------
+INIT_MMX mmxext
+cglobal shuffle_bytes_2103, 3, 5, 8, src, dst, w, tmp, x
+ mova m6, [pb_mask_shuffle2103_mmx]
+ mova m7, m6
+ psllq m7, 8
+
+ movsxdifnidn wq, wd
+ mov xq, wq
+
+ add srcq, wq
+ add dstq, wq
+ neg wq
+
+; byte count for the scalar loop: src_size & (mmsize*2 - 4), handled before the SIMD loop
+ and xq, mmsize*2 -4
+ je .loop_simd
+
+.loop_scalar:
+ mov tmpb, [srcq + wq + 2]
+ mov [dstq+wq + 0], tmpb
+ mov tmpb, [srcq + wq + 1]
+ mov [dstq+wq + 1], tmpb
+ mov tmpb, [srcq + wq + 0]
+ mov [dstq+wq + 2], tmpb
+ mov tmpb, [srcq + wq + 3]
+ mov [dstq+wq + 3], tmpb
+ add wq, 4
+ sub xq, 4
+ jg .loop_scalar
+
+;check if src_size < mmsize * 2
+cmp wq, 0
+jge .end
+
+.loop_simd:
+ movu m0, [srcq+wq]
+ movu m1, [srcq+wq+8]
+
+ pshufw m3, m0, 177
+ pshufw m5, m1, 177
+
+ pand m0, m7
+ pand m3, m6
+
+ pand m1, m7
+ pand m5, m6
+
+ por m0, m3
+ por m1, m5
+
+ movu [dstq+wq], m0
+ movu [dstq+wq + 8], m1
+
+ add wq, mmsize*2
+ jl .loop_simd
+
+.end:
+ RET
+
+;------------------------------------------------------------------------------
; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
;------------------------------------------------------------------------------
; %1-4 index shuffle
More information about the ffmpeg-cvslog mailing list