[FFmpeg-devel] [PATCH 22/41] avcodec/x86/h264dsp_init: Disable overridden functions on x64
Andreas Rheinhardt
andreas.rheinhardt at outlook.com
Fri Jun 10 02:55:04 EEST 2022
x64 always has MMX, MMXEXT, SSE and SSE2, which means that some of the
MMX, MMXEXT, SSE and 3dnow functions are always overridden at runtime
by the SSE/SSE2 (or higher) versions initialized later (unless one e.g.
explicitly disables SSE2). This commit therefore disables such h264dsp
functions at compile-time for x64.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt at outlook.com>
---
libavcodec/x86/h264_deblock.asm | 24 +++-----------
libavcodec/x86/h264_idct.asm | 57 +++++++--------------------------
libavcodec/x86/h264_weight.asm | 8 +++++
libavcodec/x86/h264dsp_init.c | 21 ++++++++----
4 files changed, 38 insertions(+), 72 deletions(-)
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index a2e745cd8e..9e671af45c 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -867,7 +867,6 @@ DEBLOCK_LUMA_INTRA v
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
-%endif
INIT_MMX mmxext
@@ -911,17 +910,8 @@ cglobal deblock_v_chroma_8, 5,6
; int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_8, 5,7
-%if ARCH_X86_64
- ; This could use the red zone on 64 bit unix to avoid the stack pointer
- ; readjustment, but valgrind assumes the red zone is clobbered on
- ; function calls and returns.
- sub rsp, 16
- %define buf0 [rsp]
- %define buf1 [rsp+8]
-%else
%define buf0 r0m
%define buf1 r2m
-%endif
CHROMA_H_START
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
movq buf0, m0
@@ -934,9 +924,6 @@ cglobal deblock_h_chroma_8, 5,7
movq m0, buf0
movq m3, buf1
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
-%if ARCH_X86_64
- add rsp, 16
-%endif
RET
ALIGN 16
@@ -953,13 +940,8 @@ ff_chroma_inter_body_mmxext:
cglobal deblock_h_chroma422_8, 5, 6
SUB rsp, (1+ARCH_X86_64*2)*mmsize
- %if ARCH_X86_64
- %define buf0 [rsp+16]
- %define buf1 [rsp+8]
- %else
- %define buf0 r0m
- %define buf1 r2m
- %endif
+ %define buf0 r0m
+ %define buf1 r2m
movd m6, [r4]
punpcklbw m6, m6
@@ -1059,6 +1041,8 @@ ff_chroma_intra_body_mmxext:
paddb m2, m6
ret
+%endif ; ARCH_X86_64 == 0
+
%macro LOAD_8_ROWS 8
movd m0, %1
movd m1, %2
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index c54f9f1a68..17c7af388c 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -87,12 +87,14 @@ SECTION .text
STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
movsxdifnidn r2, r2d
IDCT4_ADD r0, r1, r2
RET
+%endif
%macro IDCT8_1D 2
psraw m0, m1, 1
@@ -207,6 +209,7 @@ cglobal h264_idct_add_8, 3, 3, 0
STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
@@ -223,6 +226,7 @@ cglobal h264_idct8_add_8, 3, 4, 0
ADD rsp, pad
RET
+%endif
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
@@ -315,16 +319,7 @@ cglobal h264_idct8_add_8, 3, 4, 10
%endmacro
INIT_MMX mmxext
-; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
-cglobal h264_idct_dc_add_8, 3, 4, 0
- movsxd r2, r2d
- movsx r3, word [r1]
- mov dword [r1], 0
- DC_ADD_MMXEXT_INIT r3, r2
- DC_ADD_MMXEXT_OP movh, r0, r2, r3
- RET
-
; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
movsxd r2, r2d
@@ -358,6 +353,7 @@ cglobal h264_idct8_dc_add_8, 2, 3, 0
%endif
INIT_MMX mmx
+%if ARCH_X86_32
; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
@@ -438,16 +434,12 @@ cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
jz .no_dc
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
-%endif
mov dst2d, dword [r1+r5*4]
lea dst2q, [r0+dst2q]
DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
-%if ARCH_X86_64 == 0
mov r1, r1m
-%endif
inc r5
add r2, 32
cmp r5, 16
@@ -519,16 +511,12 @@ cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, s
jz .skipblock
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
-%endif
mov dst2d, dword [r1+r5*4]
add dst2q, r0
DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
-%if ARCH_X86_64 == 0
mov r1, r1m
-%endif
.skipblock:
inc r5
add r2, 32
@@ -560,18 +548,14 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
jz .no_dc
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
-%endif
mov dst2d, dword [r1+r5*4]
lea dst2q, [r0+dst2q]
DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
lea dst2q, [dst2q+r3*4]
DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
-%if ARCH_X86_64 == 0
mov r1, r1m
-%endif
add r5, 4
add r2, 128
cmp r5, 16
@@ -597,6 +581,7 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
ADD rsp, pad
RET
+%endif
INIT_XMM sse2
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
@@ -678,6 +663,7 @@ h264_idct_add8_mmx_plane:
jnz .nextblock
rep ret
+%if ARCH_X86_32
; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
@@ -687,20 +673,14 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
add r2, 512
%ifdef PIC
lea picregq, [scan8_mem]
-%endif
-%if ARCH_X86_64
- mov dst2q, r0
%endif
call h264_idct_add8_mmx_plane
mov r5, 32
add r2, 384
-%if ARCH_X86_64
- add dst2q, gprsize
-%else
add r0mp, gprsize
-%endif
call h264_idct_add8_mmx_plane
RET ; TODO: check rep ret after a function call
+%endif
cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
@@ -734,6 +714,7 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, str
RET ; TODO: check rep ret after a function call
+%if ARCH_X86_32
h264_idct_add8_mmxext_plane:
movsxdifnidn r3, r3d
.nextblock:
@@ -741,14 +722,9 @@ h264_idct_add8_mmxext_plane:
movzx r6, byte [r4+r6]
test r6, r6
jz .try_dc
-%if ARCH_X86_64
- mov r0d, dword [r1+r5*4]
- add r0, [dst2q]
-%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
add r0, dword [r1+r5*4]
-%endif
IDCT4_ADD r0, r2, r3
inc r5
add r2, 32
@@ -761,14 +737,9 @@ h264_idct_add8_mmxext_plane:
jz .skipblock
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64
- mov r0d, dword [r1+r5*4]
- add r0, [dst2q]
-%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
add r0, dword [r1+r5*4]
-%endif
DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
inc r5
@@ -785,22 +756,16 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
movsxdifnidn r3, r3d
mov r5, 16
add r2, 512
-%if ARCH_X86_64
- mov dst2q, r0
-%endif
%ifdef PIC
lea picregq, [scan8_mem]
%endif
call h264_idct_add8_mmxext_plane
mov r5, 32
add r2, 384
-%if ARCH_X86_64
- add dst2q, gprsize
-%else
add r0mp, gprsize
-%endif
call h264_idct_add8_mmxext_plane
RET ; TODO: check rep ret after a function call
+%endif
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext:
@@ -1139,8 +1104,10 @@ cglobal h264_luma_dc_dequant_idct, 3, 4, %1
RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
IDCT_DC_DEQUANT 0
+%endif
INIT_MMX sse2
IDCT_DC_DEQUANT 7
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index 0975d74fcf..086616e633 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -70,6 +70,7 @@ SECTION .text
packuswb m0, m1
%endmacro
+%if ARCH_X86_32
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
WEIGHT_SETUP
@@ -82,6 +83,7 @@ cglobal h264_weight_16, 6, 6, 0
dec r2d
jnz .nextrow
REP_RET
+%endif
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
@@ -95,8 +97,10 @@ cglobal h264_weight_%1, 6, 6, %2
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmxext
WEIGHT_FUNC_MM 8, 0
+%endif
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8
@@ -198,6 +202,7 @@ WEIGHT_FUNC_HALF_MM 8, 8
packuswb m0, m1
%endmacro
+%if ARCH_X86_32
INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
BIWEIGHT_SETUP
@@ -216,6 +221,7 @@ cglobal h264_biweight_16, 7, 8, 0
dec r3d
jnz .nextrow
REP_RET
+%endif
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
@@ -233,8 +239,10 @@ cglobal h264_biweight_%1, 7, 8, %2
REP_RET
%endmacro
+%if ARCH_X86_32
INIT_MMX mmxext
BIWEIGHT_FUNC_MM 8, 0
+%endif
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index c9a96c7dca..9ef6c6bb53 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -236,6 +236,10 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
if (bit_depth == 8) {
if (EXTERNAL_MMX(cpu_flags)) {
+#if ARCH_X86_32
+ if (cpu_flags & AV_CPU_FLAG_CMOV)
+ c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
+
c->h264_idct_dc_add =
c->h264_idct_add = ff_h264_idct_add_8_mmx;
c->h264_idct8_dc_add =
@@ -243,18 +247,21 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
+
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
+#endif
if (chroma_format_idc <= 1) {
+#if ARCH_X86_32
c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
+#endif
} else {
c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx;
}
- c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
- if (cpu_flags & AV_CPU_FLAG_CMOV)
- c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
+#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
+ c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext;
c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext;
if (chroma_format_idc <= 1)
@@ -270,18 +277,18 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext;
}
-#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext;
c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
-#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
+
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
- c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
+#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
+ c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
--
2.34.1
More information about the ffmpeg-devel
mailing list