[FFmpeg-devel] [PATCH 3/4] swscale/x86/rgb2rgb: remove rgb24toyv12_mmxext
Ramiro Polla
ramiro.polla at gmail.com
Wed Aug 28 23:43:02 EEST 2024
The mmxext implementation is no faster than the C version (between 0.85x and 1.00x of its speed in the benchmarks below).
rgb24toyv12_16_200_c: 14812.6 ( 1.00x)
rgb24toyv12_16_200_mmxext: 17400.4 ( 0.85x)
rgb24toyv12_128_60_c: 35616.9 ( 1.00x)
rgb24toyv12_128_60_mmxext: 39610.4 ( 0.90x)
rgb24toyv12_512_16_c: 37209.4 ( 1.00x)
rgb24toyv12_512_16_mmxext: 41136.2 ( 0.90x)
rgb24toyv12_1920_4_c: 34737.4 ( 1.00x)
rgb24toyv12_1920_4_mmxext: 34818.9 ( 1.00x)
rgb24toyv12_1920_4_negstride_c: 34855.2 ( 1.00x)
rgb24toyv12_1920_4_negstride_mmxext: 34773.7 ( 1.00x)
---
libswscale/x86/rgb2rgb.c | 207 ---------------------------------------
1 file changed, 207 deletions(-)
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 4d6ba9ff21..e27aea7b83 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -1473,210 +1473,6 @@ static inline void planar2x_mmxext(const uint8_t *src, uint8_t *dst, int srcWidt
:::"memory");
}
-/**
- * Height should be a multiple of 2 and width should be a multiple of 2.
- * (If this is a problem for anyone then tell me, and I will fix it.)
- * Chrominance data is only taken from every second line,
- * others are ignored in the C version.
- * FIXME: Write HQ version.
- */
-#if HAVE_7REGS
-static inline void rgb24toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
- int width, int height,
- int lumStride, int chromStride, int srcStride,
- int32_t *rgb2yuv)
-{
-#define BGR2Y_IDX "16*4+16*32"
-#define BGR2U_IDX "16*4+16*33"
-#define BGR2V_IDX "16*4+16*34"
- int y;
- const x86_reg chromWidth= width>>1;
-
- if (height > 2) {
- ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv);
- src += 2*srcStride;
- ydst += 2*lumStride;
- udst += chromStride;
- vdst += chromStride;
- height -= 2;
- }
-
- for (y = 0; y < height - 2; y += 2) {
- for (int i = 0; i < 2; i++) {
- __asm__ volatile(
- "mov %2, %%"FF_REG_a"\n\t"
- "movq "BGR2Y_IDX"(%3), %%mm6 \n\t"
- "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 64(%0, %%"FF_REG_d") \n\t"
- "movd (%0, %%"FF_REG_d"), %%mm0 \n\t"
- "movd 3(%0, %%"FF_REG_d"), %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "movd 6(%0, %%"FF_REG_d"), %%mm2 \n\t"
- "movd 9(%0, %%"FF_REG_d"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "pmaddwd %%mm6, %%mm0 \n\t"
- "pmaddwd %%mm6, %%mm1 \n\t"
- "pmaddwd %%mm6, %%mm2 \n\t"
- "pmaddwd %%mm6, %%mm3 \n\t"
- "psrad $8, %%mm0 \n\t"
- "psrad $8, %%mm1 \n\t"
- "psrad $8, %%mm2 \n\t"
- "psrad $8, %%mm3 \n\t"
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "pmaddwd %%mm5, %%mm0 \n\t"
- "pmaddwd %%mm5, %%mm2 \n\t"
- "packssdw %%mm2, %%mm0 \n\t"
- "psraw $7, %%mm0 \n\t"
-
- "movd 12(%0, %%"FF_REG_d"), %%mm4 \n\t"
- "movd 15(%0, %%"FF_REG_d"), %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "movd 18(%0, %%"FF_REG_d"), %%mm2 \n\t"
- "movd 21(%0, %%"FF_REG_d"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "pmaddwd %%mm6, %%mm4 \n\t"
- "pmaddwd %%mm6, %%mm1 \n\t"
- "pmaddwd %%mm6, %%mm2 \n\t"
- "pmaddwd %%mm6, %%mm3 \n\t"
- "psrad $8, %%mm4 \n\t"
- "psrad $8, %%mm1 \n\t"
- "psrad $8, %%mm2 \n\t"
- "psrad $8, %%mm3 \n\t"
- "packssdw %%mm1, %%mm4 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "pmaddwd %%mm5, %%mm4 \n\t"
- "pmaddwd %%mm5, %%mm2 \n\t"
- "add $24, %%"FF_REG_d"\n\t"
- "packssdw %%mm2, %%mm4 \n\t"
- "psraw $7, %%mm4 \n\t"
-
- "packuswb %%mm4, %%mm0 \n\t"
- "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
-
- MOVNTQ" %%mm0, (%1, %%"FF_REG_a") \n\t"
- "add $8, %%"FF_REG_a" \n\t"
- " js 1b \n\t"
- : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
- NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset)
- : "%"FF_REG_a, "%"FF_REG_d
- );
- ydst += lumStride;
- src += srcStride;
- }
- src -= srcStride*2;
- __asm__ volatile(
- "mov %4, %%"FF_REG_a"\n\t"
- "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
- "movq "BGR2U_IDX"(%5), %%mm6 \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
- "add %%"FF_REG_d", %%"FF_REG_d"\n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- PREFETCH" 64(%0, %%"FF_REG_d") \n\t"
- PREFETCH" 64(%1, %%"FF_REG_d") \n\t"
- "movq (%0, %%"FF_REG_d"), %%mm0 \n\t"
- "movq (%1, %%"FF_REG_d"), %%mm1 \n\t"
- "movq 6(%0, %%"FF_REG_d"), %%mm2 \n\t"
- "movq 6(%1, %%"FF_REG_d"), %%mm3 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "psrlq $24, %%mm0 \n\t"
- "psrlq $24, %%mm2 \n\t"
- PAVGB" %%mm1, %%mm0 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
- "movq "BGR2V_IDX"(%5), %%mm3 \n\t"
-
- "pmaddwd %%mm0, %%mm1 \n\t"
- "pmaddwd %%mm2, %%mm3 \n\t"
- "pmaddwd %%mm6, %%mm0 \n\t"
- "pmaddwd %%mm6, %%mm2 \n\t"
- "psrad $8, %%mm0 \n\t"
- "psrad $8, %%mm1 \n\t"
- "psrad $8, %%mm2 \n\t"
- "psrad $8, %%mm3 \n\t"
- "packssdw %%mm2, %%mm0 \n\t"
- "packssdw %%mm3, %%mm1 \n\t"
- "pmaddwd %%mm5, %%mm0 \n\t"
- "pmaddwd %%mm5, %%mm1 \n\t"
- "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
- "psraw $7, %%mm0 \n\t"
-
- "movq 12(%0, %%"FF_REG_d"), %%mm4 \n\t"
- "movq 12(%1, %%"FF_REG_d"), %%mm1 \n\t"
- "movq 18(%0, %%"FF_REG_d"), %%mm2 \n\t"
- "movq 18(%1, %%"FF_REG_d"), %%mm3 \n\t"
- PAVGB" %%mm1, %%mm4 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- "movq %%mm4, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "psrlq $24, %%mm4 \n\t"
- "psrlq $24, %%mm2 \n\t"
- PAVGB" %%mm1, %%mm4 \n\t"
- PAVGB" %%mm3, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
- "movq "BGR2V_IDX"(%5), %%mm3 \n\t"
-
- "pmaddwd %%mm4, %%mm1 \n\t"
- "pmaddwd %%mm2, %%mm3 \n\t"
- "pmaddwd %%mm6, %%mm4 \n\t"
- "pmaddwd %%mm6, %%mm2 \n\t"
- "psrad $8, %%mm4 \n\t"
- "psrad $8, %%mm1 \n\t"
- "psrad $8, %%mm2 \n\t"
- "psrad $8, %%mm3 \n\t"
- "packssdw %%mm2, %%mm4 \n\t"
- "packssdw %%mm3, %%mm1 \n\t"
- "pmaddwd %%mm5, %%mm4 \n\t"
- "pmaddwd %%mm5, %%mm1 \n\t"
- "add $24, %%"FF_REG_d"\n\t"
- "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
- "psraw $7, %%mm4 \n\t"
-
- "movq %%mm0, %%mm1 \n\t"
- "punpckldq %%mm4, %%mm0 \n\t"
- "punpckhdq %%mm4, %%mm1 \n\t"
- "packsswb %%mm1, %%mm0 \n\t"
- "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
- "movd %%mm0, (%2, %%"FF_REG_a") \n\t"
- "punpckhdq %%mm0, %%mm0 \n\t"
- "movd %%mm0, (%3, %%"FF_REG_a") \n\t"
- "add $4, %%"FF_REG_a" \n\t"
- " js 1b \n\t"
- : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
- NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset)
- : "%"FF_REG_a, "%"FF_REG_d
- );
-
- udst += chromStride;
- vdst += chromStride;
- src += srcStride*2;
- }
-
- __asm__ volatile(EMMS" \n\t"
- SFENCE" \n\t"
- :::"memory");
-
- ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
-}
-#endif /* HAVE_7REGS */
-
static inline void vu9_to_vu12_mmxext(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
@@ -2257,9 +2053,6 @@ static av_cold void rgb2rgb_init_mmxext(void)
yuyvtoyuv422 = yuyvtoyuv422_mmxext;
planar2x = planar2x_mmxext;
-#if HAVE_7REGS
- ff_rgb24toyv12 = rgb24toyv12_mmxext;
-#endif /* HAVE_7REGS */
yuyvtoyuv420 = yuyvtoyuv420_mmxext;
uyvytoyuv420 = uyvytoyuv420_mmxext;
--
2.30.2
More information about the ffmpeg-devel
mailing list