[FFmpeg-devel] [PATCH 22/41] avcodec/x86/h264dsp_init: Disable overridden functions on x64

Andreas Rheinhardt andreas.rheinhardt at outlook.com
Fri Jun 10 02:55:04 EEST 2022


x64 always has MMX, MMXEXT, SSE and SSE2, which means that some
of the MMX, MMXEXT, SSE and 3dnow functions are always overridden
by other functions (unless one explicitly disables SSE2, for
example). This commit therefore disables such h264dsp functions
at compile time for x64.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt at outlook.com>
---
 libavcodec/x86/h264_deblock.asm | 24 +++-----------
 libavcodec/x86/h264_idct.asm    | 57 +++++++--------------------------
 libavcodec/x86/h264_weight.asm  |  8 +++++
 libavcodec/x86/h264dsp_init.c   | 21 ++++++++----
 4 files changed, 38 insertions(+), 72 deletions(-)

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index a2e745cd8e..9e671af45c 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -867,7 +867,6 @@ DEBLOCK_LUMA_INTRA v
 %if ARCH_X86_64 == 0
 INIT_MMX mmxext
 DEBLOCK_LUMA_INTRA v8
-%endif
 
 INIT_MMX mmxext
 
@@ -911,17 +910,8 @@ cglobal deblock_v_chroma_8, 5,6
 ;                          int8_t *tc0)
 ;-----------------------------------------------------------------------------
 cglobal deblock_h_chroma_8, 5,7
-%if ARCH_X86_64
-    ; This could use the red zone on 64 bit unix to avoid the stack pointer
-    ; readjustment, but valgrind assumes the red zone is clobbered on
-    ; function calls and returns.
-    sub   rsp, 16
-    %define buf0 [rsp]
-    %define buf1 [rsp+8]
-%else
     %define buf0 r0m
     %define buf1 r2m
-%endif
     CHROMA_H_START
     TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
     movq  buf0, m0
@@ -934,9 +924,6 @@ cglobal deblock_h_chroma_8, 5,7
     movq  m0, buf0
     movq  m3, buf1
     TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
-%if ARCH_X86_64
-    add   rsp, 16
-%endif
     RET
 
 ALIGN 16
@@ -953,13 +940,8 @@ ff_chroma_inter_body_mmxext:
 
 cglobal deblock_h_chroma422_8, 5, 6
     SUB rsp, (1+ARCH_X86_64*2)*mmsize
-    %if ARCH_X86_64
-        %define buf0 [rsp+16]
-        %define buf1 [rsp+8]
-    %else
-        %define buf0 r0m
-        %define buf1 r2m
-    %endif
+    %define buf0 r0m
+    %define buf1 r2m
 
     movd m6, [r4]
     punpcklbw m6, m6
@@ -1059,6 +1041,8 @@ ff_chroma_intra_body_mmxext:
     paddb  m2, m6
     ret
 
+%endif ; ARCH_X86_64 == 0
+
 %macro LOAD_8_ROWS 8
     movd m0, %1
     movd m1, %2
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index c54f9f1a68..17c7af388c 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -87,12 +87,14 @@ SECTION .text
     STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 ; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
 cglobal h264_idct_add_8, 3, 3, 0
     movsxdifnidn r2, r2d
     IDCT4_ADD    r0, r1, r2
     RET
+%endif
 
 %macro IDCT8_1D 2
     psraw        m0, m1, 1
@@ -207,6 +209,7 @@ cglobal h264_idct_add_8, 3, 3, 0
     STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 ; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
 cglobal h264_idct8_add_8, 3, 4, 0
@@ -223,6 +226,7 @@ cglobal h264_idct8_add_8, 3, 4, 0
 
     ADD         rsp, pad
     RET
+%endif
 
 ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
 %macro IDCT8_ADD_SSE 4
@@ -315,16 +319,7 @@ cglobal h264_idct8_add_8, 3, 4, 10
 %endmacro
 
 INIT_MMX mmxext
-; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
 %if ARCH_X86_64
-cglobal h264_idct_dc_add_8, 3, 4, 0
-    movsxd       r2, r2d
-    movsx        r3, word [r1]
-    mov  dword [r1], 0
-    DC_ADD_MMXEXT_INIT r3, r2
-    DC_ADD_MMXEXT_OP movh, r0, r2, r3
-    RET
-
 ; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
 cglobal h264_idct8_dc_add_8, 3, 4, 0
     movsxd       r2, r2d
@@ -358,6 +353,7 @@ cglobal h264_idct8_dc_add_8, 2, 3, 0
 %endif
 
 INIT_MMX mmx
+%if ARCH_X86_32
 ; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
 ;                               int16_t *block, int stride,
 ;                               const uint8_t nnzc[6 * 8])
@@ -438,16 +434,12 @@ cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
     jz .no_dc
     mov   word [r2], 0
     DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
 %define dst2q r1
 %define dst2d r1d
-%endif
     mov       dst2d, dword [r1+r5*4]
     lea       dst2q, [r0+dst2q]
     DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
-%if ARCH_X86_64 == 0
     mov          r1, r1m
-%endif
     inc          r5
     add          r2, 32
     cmp          r5, 16
@@ -519,16 +511,12 @@ cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, s
     jz .skipblock
     mov   word [r2], 0
     DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
 %define dst2q r1
 %define dst2d r1d
-%endif
     mov       dst2d, dword [r1+r5*4]
     add       dst2q, r0
     DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
-%if ARCH_X86_64 == 0
     mov          r1, r1m
-%endif
 .skipblock:
     inc          r5
     add          r2, 32
@@ -560,18 +548,14 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
     jz .no_dc
     mov   word [r2], 0
     DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
 %define dst2q r1
 %define dst2d r1d
-%endif
     mov       dst2d, dword [r1+r5*4]
     lea       dst2q, [r0+dst2q]
     DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
     lea       dst2q, [dst2q+r3*4]
     DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
-%if ARCH_X86_64 == 0
     mov          r1, r1m
-%endif
     add          r5, 4
     add          r2, 128
     cmp          r5, 16
@@ -597,6 +581,7 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
 
     ADD         rsp, pad
     RET
+%endif
 
 INIT_XMM sse2
 ; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
@@ -678,6 +663,7 @@ h264_idct_add8_mmx_plane:
     jnz .nextblock
     rep ret
 
+%if ARCH_X86_32
 ; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
 ;                              int16_t *block, int stride,
 ;                              const uint8_t nnzc[6 * 8])
@@ -687,20 +673,14 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
     add          r2, 512
 %ifdef PIC
     lea     picregq, [scan8_mem]
-%endif
-%if ARCH_X86_64
-    mov       dst2q, r0
 %endif
     call         h264_idct_add8_mmx_plane
     mov          r5, 32
     add          r2, 384
-%if ARCH_X86_64
-    add       dst2q, gprsize
-%else
     add        r0mp, gprsize
-%endif
     call         h264_idct_add8_mmx_plane
     RET ; TODO: check rep ret after a function call
+%endif
 
 cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
 ; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
@@ -734,6 +714,7 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, str
 
     RET ; TODO: check rep ret after a function call
 
+%if ARCH_X86_32
 h264_idct_add8_mmxext_plane:
     movsxdifnidn r3, r3d
 .nextblock:
@@ -741,14 +722,9 @@ h264_idct_add8_mmxext_plane:
     movzx        r6, byte [r4+r6]
     test         r6, r6
     jz .try_dc
-%if ARCH_X86_64
-    mov         r0d, dword [r1+r5*4]
-    add          r0, [dst2q]
-%else
     mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
     mov          r0, [r0]
     add          r0, dword [r1+r5*4]
-%endif
     IDCT4_ADD    r0, r2, r3
     inc          r5
     add          r2, 32
@@ -761,14 +737,9 @@ h264_idct_add8_mmxext_plane:
     jz .skipblock
     mov   word [r2], 0
     DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64
-    mov         r0d, dword [r1+r5*4]
-    add          r0, [dst2q]
-%else
     mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
     mov          r0, [r0]
     add          r0, dword [r1+r5*4]
-%endif
     DC_ADD_MMXEXT_OP movh, r0, r3, r6
 .skipblock:
     inc          r5
@@ -785,22 +756,16 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
     movsxdifnidn r3, r3d
     mov          r5, 16
     add          r2, 512
-%if ARCH_X86_64
-    mov       dst2q, r0
-%endif
 %ifdef PIC
     lea     picregq, [scan8_mem]
 %endif
     call h264_idct_add8_mmxext_plane
     mov          r5, 32
     add          r2, 384
-%if ARCH_X86_64
-    add       dst2q, gprsize
-%else
     add        r0mp, gprsize
-%endif
     call h264_idct_add8_mmxext_plane
     RET ; TODO: check rep ret after a function call
+%endif
 
 ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
 h264_idct_dc_add8_mmxext:
@@ -1139,8 +1104,10 @@ cglobal h264_luma_dc_dequant_idct, 3, 4, %1
     RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 IDCT_DC_DEQUANT 0
+%endif
 INIT_MMX sse2
 IDCT_DC_DEQUANT 7
 
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index 0975d74fcf..086616e633 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -70,6 +70,7 @@ SECTION .text
     packuswb      m0, m1
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmxext
 cglobal h264_weight_16, 6, 6, 0
     WEIGHT_SETUP
@@ -82,6 +83,7 @@ cglobal h264_weight_16, 6, 6, 0
     dec        r2d
     jnz .nextrow
     REP_RET
+%endif
 
 %macro WEIGHT_FUNC_MM 2
 cglobal h264_weight_%1, 6, 6, %2
@@ -95,8 +97,10 @@ cglobal h264_weight_%1, 6, 6, %2
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmxext
 WEIGHT_FUNC_MM  8, 0
+%endif
 INIT_XMM sse2
 WEIGHT_FUNC_MM 16, 8
 
@@ -198,6 +202,7 @@ WEIGHT_FUNC_HALF_MM 8, 8
     packuswb   m0, m1
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmxext
 cglobal h264_biweight_16, 7, 8, 0
     BIWEIGHT_SETUP
@@ -216,6 +221,7 @@ cglobal h264_biweight_16, 7, 8, 0
     dec        r3d
     jnz .nextrow
     REP_RET
+%endif
 
 %macro BIWEIGHT_FUNC_MM 2
 cglobal h264_biweight_%1, 7, 8, %2
@@ -233,8 +239,10 @@ cglobal h264_biweight_%1, 7, 8, %2
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmxext
 BIWEIGHT_FUNC_MM  8, 0
+%endif
 INIT_XMM sse2
 BIWEIGHT_FUNC_MM 16, 8
 
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index c9a96c7dca..9ef6c6bb53 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -236,6 +236,10 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 
     if (bit_depth == 8) {
         if (EXTERNAL_MMX(cpu_flags)) {
+#if ARCH_X86_32
+            if (cpu_flags & AV_CPU_FLAG_CMOV)
+                c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
+
             c->h264_idct_dc_add   =
             c->h264_idct_add      = ff_h264_idct_add_8_mmx;
             c->h264_idct8_dc_add  =
@@ -243,18 +247,21 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
 
             c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
             c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
+
+            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
+#endif
             if (chroma_format_idc <= 1) {
+#if ARCH_X86_32
                 c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
+#endif
             } else {
                 c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx;
             }
-            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
-            if (cpu_flags & AV_CPU_FLAG_CMOV)
-                c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
         }
         if (EXTERNAL_MMXEXT(cpu_flags)) {
-            c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmxext;
             c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
+#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
+            c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmxext;
             c->h264_idct_add16   = ff_h264_idct_add16_8_mmxext;
             c->h264_idct8_add4   = ff_h264_idct8_add4_8_mmxext;
             if (chroma_format_idc <= 1)
@@ -270,18 +277,18 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                 c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext;
                 c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext;
             }
-#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
             c->h264_v_loop_filter_luma       = deblock_v_luma_8_mmxext;
             c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmxext;
             c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
             c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
-#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
+
             c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
             c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
-            c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
 
             c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
             c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
+#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
+            c->weight_h264_pixels_tab[2]   = ff_h264_weight_4_mmxext;
             c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
         }
         if (EXTERNAL_SSE2(cpu_flags)) {
-- 
2.34.1



More information about the ffmpeg-devel mailing list