[FFmpeg-devel] [PATCH 3/4] swscale/x86/rgb2rgb: remove rgb24toyv12_mmxext

Ramiro Polla ramiro.polla at gmail.com
Wed Aug 28 23:43:02 EEST 2024


The mmxext implementation is slower than, or at best on par with, the C
version (0.85x to 1.00x in the benchmarks below):

rgb24toyv12_16_200_c:                                14812.6 ( 1.00x)
rgb24toyv12_16_200_mmxext:                           17400.4 ( 0.85x)
rgb24toyv12_128_60_c:                                35616.9 ( 1.00x)
rgb24toyv12_128_60_mmxext:                           39610.4 ( 0.90x)
rgb24toyv12_512_16_c:                                37209.4 ( 1.00x)
rgb24toyv12_512_16_mmxext:                           41136.2 ( 0.90x)
rgb24toyv12_1920_4_c:                                34737.4 ( 1.00x)
rgb24toyv12_1920_4_mmxext:                           34818.9 ( 1.00x)
rgb24toyv12_1920_4_negstride_c:                      34855.2 ( 1.00x)
rgb24toyv12_1920_4_negstride_mmxext:                 34773.7 ( 1.00x)
---
 libswscale/x86/rgb2rgb.c | 207 ---------------------------------------
 1 file changed, 207 deletions(-)

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 4d6ba9ff21..e27aea7b83 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -1473,210 +1473,6 @@ static inline void planar2x_mmxext(const uint8_t *src, uint8_t *dst, int srcWidt
                      :::"memory");
 }
 
-/**
- * Height should be a multiple of 2 and width should be a multiple of 2.
- * (If this is a problem for anyone then tell me, and I will fix it.)
- * Chrominance data is only taken from every second line,
- * others are ignored in the C version.
- * FIXME: Write HQ version.
- */
-#if HAVE_7REGS
-static inline void rgb24toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
-                                       int width, int height,
-                                       int lumStride, int chromStride, int srcStride,
-                                       int32_t *rgb2yuv)
-{
-#define BGR2Y_IDX "16*4+16*32"
-#define BGR2U_IDX "16*4+16*33"
-#define BGR2V_IDX "16*4+16*34"
-    int y;
-    const x86_reg chromWidth= width>>1;
-
-    if (height > 2) {
-        ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv);
-        src  += 2*srcStride;
-        ydst += 2*lumStride;
-        udst += chromStride;
-        vdst += chromStride;
-        height -= 2;
-    }
-
-    for (y = 0; y < height - 2; y += 2) {
-        for (int i = 0; i < 2; i++) {
-            __asm__ volatile(
-                "mov                        %2, %%"FF_REG_a"\n\t"
-                "movq          "BGR2Y_IDX"(%3), %%mm6       \n\t"
-                "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
-                "pxor                    %%mm7, %%mm7       \n\t"
-                "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
-                ".p2align                    4              \n\t"
-                "1:                                         \n\t"
-                PREFETCH" 64(%0, %%"FF_REG_d")              \n\t"
-                "movd       (%0, %%"FF_REG_d"), %%mm0       \n\t"
-                "movd      3(%0, %%"FF_REG_d"), %%mm1       \n\t"
-                "punpcklbw               %%mm7, %%mm0       \n\t"
-                "punpcklbw               %%mm7, %%mm1       \n\t"
-                "movd      6(%0, %%"FF_REG_d"), %%mm2       \n\t"
-                "movd      9(%0, %%"FF_REG_d"), %%mm3       \n\t"
-                "punpcklbw               %%mm7, %%mm2       \n\t"
-                "punpcklbw               %%mm7, %%mm3       \n\t"
-                "pmaddwd                 %%mm6, %%mm0       \n\t"
-                "pmaddwd                 %%mm6, %%mm1       \n\t"
-                "pmaddwd                 %%mm6, %%mm2       \n\t"
-                "pmaddwd                 %%mm6, %%mm3       \n\t"
-                "psrad                      $8, %%mm0       \n\t"
-                "psrad                      $8, %%mm1       \n\t"
-                "psrad                      $8, %%mm2       \n\t"
-                "psrad                      $8, %%mm3       \n\t"
-                "packssdw                %%mm1, %%mm0       \n\t"
-                "packssdw                %%mm3, %%mm2       \n\t"
-                "pmaddwd                 %%mm5, %%mm0       \n\t"
-                "pmaddwd                 %%mm5, %%mm2       \n\t"
-                "packssdw                %%mm2, %%mm0       \n\t"
-                "psraw                      $7, %%mm0       \n\t"
-
-                "movd     12(%0, %%"FF_REG_d"), %%mm4       \n\t"
-                "movd     15(%0, %%"FF_REG_d"), %%mm1       \n\t"
-                "punpcklbw               %%mm7, %%mm4       \n\t"
-                "punpcklbw               %%mm7, %%mm1       \n\t"
-                "movd     18(%0, %%"FF_REG_d"), %%mm2       \n\t"
-                "movd     21(%0, %%"FF_REG_d"), %%mm3       \n\t"
-                "punpcklbw               %%mm7, %%mm2       \n\t"
-                "punpcklbw               %%mm7, %%mm3       \n\t"
-                "pmaddwd                 %%mm6, %%mm4       \n\t"
-                "pmaddwd                 %%mm6, %%mm1       \n\t"
-                "pmaddwd                 %%mm6, %%mm2       \n\t"
-                "pmaddwd                 %%mm6, %%mm3       \n\t"
-                "psrad                      $8, %%mm4       \n\t"
-                "psrad                      $8, %%mm1       \n\t"
-                "psrad                      $8, %%mm2       \n\t"
-                "psrad                      $8, %%mm3       \n\t"
-                "packssdw                %%mm1, %%mm4       \n\t"
-                "packssdw                %%mm3, %%mm2       \n\t"
-                "pmaddwd                 %%mm5, %%mm4       \n\t"
-                "pmaddwd                 %%mm5, %%mm2       \n\t"
-                "add                       $24, %%"FF_REG_d"\n\t"
-                "packssdw                %%mm2, %%mm4       \n\t"
-                "psraw                      $7, %%mm4       \n\t"
-
-                "packuswb                %%mm4, %%mm0       \n\t"
-                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
-
-                MOVNTQ"                  %%mm0, (%1, %%"FF_REG_a") \n\t"
-                "add                        $8,      %%"FF_REG_a"  \n\t"
-                " js                        1b                     \n\t"
-                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
-                  NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset)
-                : "%"FF_REG_a, "%"FF_REG_d
-            );
-            ydst += lumStride;
-            src  += srcStride;
-        }
-        src -= srcStride*2;
-        __asm__ volatile(
-            "mov                        %4, %%"FF_REG_a"\n\t"
-            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
-            "movq          "BGR2U_IDX"(%5), %%mm6       \n\t"
-            "pxor                    %%mm7, %%mm7       \n\t"
-            "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
-            "add              %%"FF_REG_d", %%"FF_REG_d"\n\t"
-            ".p2align                    4              \n\t"
-            "1:                                         \n\t"
-            PREFETCH" 64(%0, %%"FF_REG_d")              \n\t"
-            PREFETCH" 64(%1, %%"FF_REG_d")              \n\t"
-            "movq       (%0, %%"FF_REG_d"), %%mm0       \n\t"
-            "movq       (%1, %%"FF_REG_d"), %%mm1       \n\t"
-            "movq      6(%0, %%"FF_REG_d"), %%mm2       \n\t"
-            "movq      6(%1, %%"FF_REG_d"), %%mm3       \n\t"
-            PAVGB"                   %%mm1, %%mm0       \n\t"
-            PAVGB"                   %%mm3, %%mm2       \n\t"
-            "movq                    %%mm0, %%mm1       \n\t"
-            "movq                    %%mm2, %%mm3       \n\t"
-            "psrlq                     $24, %%mm0       \n\t"
-            "psrlq                     $24, %%mm2       \n\t"
-            PAVGB"                   %%mm1, %%mm0       \n\t"
-            PAVGB"                   %%mm3, %%mm2       \n\t"
-            "punpcklbw               %%mm7, %%mm0       \n\t"
-            "punpcklbw               %%mm7, %%mm2       \n\t"
-            "movq          "BGR2V_IDX"(%5), %%mm1       \n\t"
-            "movq          "BGR2V_IDX"(%5), %%mm3       \n\t"
-
-            "pmaddwd                 %%mm0, %%mm1       \n\t"
-            "pmaddwd                 %%mm2, %%mm3       \n\t"
-            "pmaddwd                 %%mm6, %%mm0       \n\t"
-            "pmaddwd                 %%mm6, %%mm2       \n\t"
-            "psrad                      $8, %%mm0       \n\t"
-            "psrad                      $8, %%mm1       \n\t"
-            "psrad                      $8, %%mm2       \n\t"
-            "psrad                      $8, %%mm3       \n\t"
-            "packssdw                %%mm2, %%mm0       \n\t"
-            "packssdw                %%mm3, %%mm1       \n\t"
-            "pmaddwd                 %%mm5, %%mm0       \n\t"
-            "pmaddwd                 %%mm5, %%mm1       \n\t"
-            "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
-            "psraw                      $7, %%mm0       \n\t"
-
-            "movq     12(%0, %%"FF_REG_d"), %%mm4       \n\t"
-            "movq     12(%1, %%"FF_REG_d"), %%mm1       \n\t"
-            "movq     18(%0, %%"FF_REG_d"), %%mm2       \n\t"
-            "movq     18(%1, %%"FF_REG_d"), %%mm3       \n\t"
-            PAVGB"                   %%mm1, %%mm4       \n\t"
-            PAVGB"                   %%mm3, %%mm2       \n\t"
-            "movq                    %%mm4, %%mm1       \n\t"
-            "movq                    %%mm2, %%mm3       \n\t"
-            "psrlq                     $24, %%mm4       \n\t"
-            "psrlq                     $24, %%mm2       \n\t"
-            PAVGB"                   %%mm1, %%mm4       \n\t"
-            PAVGB"                   %%mm3, %%mm2       \n\t"
-            "punpcklbw               %%mm7, %%mm4       \n\t"
-            "punpcklbw               %%mm7, %%mm2       \n\t"
-            "movq          "BGR2V_IDX"(%5), %%mm1       \n\t"
-            "movq          "BGR2V_IDX"(%5), %%mm3       \n\t"
-
-            "pmaddwd                 %%mm4, %%mm1       \n\t"
-            "pmaddwd                 %%mm2, %%mm3       \n\t"
-            "pmaddwd                 %%mm6, %%mm4       \n\t"
-            "pmaddwd                 %%mm6, %%mm2       \n\t"
-            "psrad                      $8, %%mm4       \n\t"
-            "psrad                      $8, %%mm1       \n\t"
-            "psrad                      $8, %%mm2       \n\t"
-            "psrad                      $8, %%mm3       \n\t"
-            "packssdw                %%mm2, %%mm4       \n\t"
-            "packssdw                %%mm3, %%mm1       \n\t"
-            "pmaddwd                 %%mm5, %%mm4       \n\t"
-            "pmaddwd                 %%mm5, %%mm1       \n\t"
-            "add                       $24, %%"FF_REG_d"\n\t"
-            "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
-            "psraw                      $7, %%mm4       \n\t"
-
-            "movq                    %%mm0, %%mm1           \n\t"
-            "punpckldq               %%mm4, %%mm0           \n\t"
-            "punpckhdq               %%mm4, %%mm1           \n\t"
-            "packsswb                %%mm1, %%mm0           \n\t"
-            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
-            "movd                    %%mm0, (%2, %%"FF_REG_a") \n\t"
-            "punpckhdq               %%mm0, %%mm0              \n\t"
-            "movd                    %%mm0, (%3, %%"FF_REG_a") \n\t"
-            "add                        $4, %%"FF_REG_a"       \n\t"
-            " js                        1b              \n\t"
-            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
-              NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset)
-            : "%"FF_REG_a, "%"FF_REG_d
-        );
-
-        udst += chromStride;
-        vdst += chromStride;
-        src  += srcStride*2;
-    }
-
-    __asm__ volatile(EMMS"       \n\t"
-                     SFENCE"     \n\t"
-                     :::"memory");
-
-     ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
-}
-#endif /* HAVE_7REGS */
-
 static inline void vu9_to_vu12_mmxext(const uint8_t *src1, const uint8_t *src2,
                                        uint8_t *dst1, uint8_t *dst2,
                                        int width, int height,
@@ -2257,9 +2053,6 @@ static av_cold void rgb2rgb_init_mmxext(void)
     yuyvtoyuv422       = yuyvtoyuv422_mmxext;
 
     planar2x           = planar2x_mmxext;
-#if HAVE_7REGS
-    ff_rgb24toyv12     = rgb24toyv12_mmxext;
-#endif /* HAVE_7REGS */
 
     yuyvtoyuv420       = yuyvtoyuv420_mmxext;
     uyvytoyuv420       = uyvytoyuv420_mmxext;
-- 
2.30.2



More information about the ffmpeg-devel mailing list