[FFmpeg-devel] [PATCH 33/41] avcodec/x86/h264_qpel: Disable overridden functions on x64

Fri Jun 10 02:55:15 EEST 2022

x64 always has MMX, MMXEXT, SSE and SSE2, which means
that some functions for MMX, MMXEXT, SSE and 3DNow! are always
overridden by other functions (unless one explicitly disables
e.g. SSE2). This commit therefore disables several MMXEXT
functions (those that are overridden by SSE2 functions)
at compile time for x64.

Notice that some 10-bit SSE2 functions are overridden by sse2_cache64
functions in the same code block. This is suboptimal: either the
overridden functions should be removed, or the sse2_cache64 functions
should be put behind suitable checks. This commit does neither.

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt at outlook.com>
---
I would love to get input on what to do with these sse2_cache64
functions. If no one says anything, I will send a patch that
retains the current behaviour and removes the functions
overridden by the sse2_cache64 functions.

 libavcodec/x86/h264_qpel.c        | 44 +++++++++++++++++++++----------
 libavcodec/x86/h264_qpel_8bit.asm |  4 +++
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index fd1070247b..cb5f8a126c 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -236,7 +236,11 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uin
 #define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
 #define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
 
-#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
+#define H264_MC_C_H(OPNAME, SIZE, MMX, ALIGN) \
+H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
+
+#define H264_MC_C_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \
 H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
 H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
 H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
@@ -372,13 +376,9 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, const uin
     ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
 }\
 
-#define H264_MC_4816(MMX)\
-H264_MC(put_, 4, MMX, 8)\
-H264_MC(put_, 8, MMX, 8)\
-H264_MC(put_, 16,MMX, 8)\
-H264_MC(avg_, 4, MMX, 8)\
-H264_MC(avg_, 8, MMX, 8)\
-H264_MC(avg_, 16,MMX, 8)\
+#define H264_MC(QPEL, SIZE, MMX, ALIGN)\
+QPEL(put_, SIZE, MMX, ALIGN) \
+QPEL(avg_, SIZE, MMX, ALIGN) \
 
 #define H264_MC_816(QPEL, XMM)\
 QPEL(put_, 8, XMM, 16)\
@@ -397,7 +397,14 @@ QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
 QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
 QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
 
-H264_MC_4816(mmxext)
+H264_MC(H264_MC_C_V_H_HV, 4, mmxext, 8)
+#if ARCH_X86_32
+H264_MC(H264_MC_C_V_H_HV, 8, mmxext, 8)
+H264_MC(H264_MC_C_V_H_HV, 16, mmxext, 8)
+#else
+H264_MC(H264_MC_C_H, 8, mmxext, 8)
+H264_MC(H264_MC_C_H, 16, mmxext, 8)
+#endif
 H264_MC_816(H264_MC_V, sse2)
 H264_MC_816(H264_MC_HV, sse2)
 H264_MC_816(H264_MC_H, ssse3)
@@ -499,12 +506,16 @@ QPEL16(mmxext)
 
 #endif /* HAVE_X86ASM */
 
-#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
+#define SET_QPEL_FUNCS0123(PFX, IDX, SIZE, CPU, PREFIX)                      \
     do {                                                                     \
     c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
+    } while (0)
+#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
+    do {                                                                     \
+    SET_QPEL_FUNCS0123(PFX, IDX, SIZE, CPU, PREFIX);                         \
     c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
@@ -543,11 +554,16 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         if (!high_bit_depth) {
-            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
-            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmxext, );
+#if ARCH_X86_32
+#define SET_MMXEXT_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)
+#else
+#define SET_MMXEXT_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) SET_QPEL_FUNCS0123(PFX, IDX, SIZE, CPU, PREFIX)
+#endif
+            SET_MMXEXT_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
+            SET_MMXEXT_QPEL_FUNCS(put_h264_qpel, 1,  8, mmxext, );
             SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmxext, );
-            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
-            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmxext, );
+            SET_MMXEXT_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
+            SET_MMXEXT_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmxext, );
             SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
         } else if (bit_depth == 10) {
 #if ARCH_X86_32
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index 03c7d88f8c..72e98248d8 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -461,9 +461,11 @@ cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride,
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmxext
 QPEL8OR16_V_LOWPASS_OP put
 QPEL8OR16_V_LOWPASS_OP avg
+%endif
 
 INIT_XMM sse2
 QPEL8OR16_V_LOWPASS_OP put
@@ -581,8 +583,10 @@ cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmxext
 QPEL8OR16_HV1_LOWPASS_OP put
+%endif
 
 INIT_XMM sse2
 QPEL8OR16_HV1_LOWPASS_OP put
-- 
2.34.1