[FFmpeg-cvslog] avcodec/x86/h264dsp_init: Remove obsolete MMX(EXT) functions
Andreas Rheinhardt
git at videolan.org
Wed Jun 22 15:28:38 EEST 2022
ffmpeg | branch: master | Andreas Rheinhardt <andreas.rheinhardt at outlook.com> | Sat Jun 11 16:24:23 2022 +0200| [4618f36a2424a3a4d5760afabc2e9dd18d73f0a4] | committer: Andreas Rheinhardt
avcodec/x86/h264dsp_init: Remove obsolete MMX(EXT) functions
x64 always has MMX, MMXEXT, SSE and SSE2, which means
that on x64 some of the MMX, MMXEXT and 3dnow functions are
always overridden by other functions (unless one explicitly
disables e.g. SSE2). So, given that the only systems that
benefit from these functions are truly ancient 32-bit x86s,
they are removed.
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt at outlook.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=4618f36a2424a3a4d5760afabc2e9dd18d73f0a4
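[Editor's note: for readers unfamiliar with the dispatch pattern the commit
message refers to, ff_h264dsp_init_x86() assigns each function pointer once
per instruction-set tier, in ascending order, so a later SSE2 assignment
silently replaces an earlier MMX(EXT) one. Below is a minimal C sketch of
that pattern, not FFmpeg's actual code: the flag values, enum names and stub
functions are illustrative assumptions (the real flags are the AV_CPU_FLAG_*
constants from libavutil/cpu.h).]

#include <stddef.h>
#include <stdint.h>

/* Hypothetical CPU-feature bits standing in for AV_CPU_FLAG_*. */
enum {
    CPU_FLAG_MMX  = 1 << 0,
    CPU_FLAG_SSE2 = 1 << 1,
};

typedef void (*idct_add_fn)(uint8_t *dst, int16_t *block, int stride);

/* Stand-ins for the assembly implementations. */
static void idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
{ (void)dst; (void)block; (void)stride; }
static void idct_add_sse2(uint8_t *dst, int16_t *block, int stride)
{ (void)dst; (void)block; (void)stride; }

static idct_add_fn select_idct_add(int cpu_flags)
{
    idct_add_fn fn = NULL;

    /* Checks run in ascending ISA order, each overwriting the last,
     * so the MMX pointer only survives when SSE2 is absent. On
     * x86-64, SSE2 is part of the baseline ISA, so the first
     * assignment is always dead there. */
    if (cpu_flags & CPU_FLAG_MMX)
        fn = idct_add_mmx;
    if (cpu_flags & CPU_FLAG_SSE2)
        fn = idct_add_sse2;

    return fn;
}

[With that ordering, removing the MMX entries changes nothing on x64,
which is the commit's point; only SSE2-less 32-bit x86s lose coverage.]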
---
libavcodec/x86/h264_deblock.asm | 196 -----------------
libavcodec/x86/h264_deblock_10bit.asm | 42 +---
libavcodec/x86/h264_idct.asm | 382 ----------------------------------
libavcodec/x86/h264_weight.asm | 36 ----
libavcodec/x86/h264dsp_init.c | 95 +--------
5 files changed, 9 insertions(+), 742 deletions(-)
diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index a2e745cd8e..479e6c3460 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -581,8 +581,6 @@ cglobal deblock_h_luma_8, 0,5,8,0x60+12
RET
%endmacro ; DEBLOCK_LUMA
-INIT_MMX mmxext
-DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
%if HAVE_AVX_EXTERNAL
@@ -864,200 +862,6 @@ DEBLOCK_LUMA_INTRA v
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%endif
-%if ARCH_X86_64 == 0
-INIT_MMX mmxext
-DEBLOCK_LUMA_INTRA v8
-%endif
-
-INIT_MMX mmxext
-
-%macro CHROMA_V_START 0
- dec r2d ; alpha-1
- dec r3d ; beta-1
- mov t5, r0
- sub t5, r1
- sub t5, r1
-%endmacro
-
-%macro CHROMA_H_START 0
- dec r2d
- dec r3d
- sub r0, 2
- lea t6, [r1*3]
- mov t5, r0
- add r0, t6
-%endmacro
-
-%define t5 r5
-%define t6 r6
-
-;-----------------------------------------------------------------------------
-; void ff_deblock_v_chroma(uint8_t *pix, int stride, int alpha, int beta,
-; int8_t *tc0)
-;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_8, 5,6
- CHROMA_V_START
- movq m0, [t5]
- movq m1, [t5+r1]
- movq m2, [r0]
- movq m3, [r0+r1]
- call ff_chroma_inter_body_mmxext
- movq [t5+r1], m1
- movq [r0], m2
- RET
-
-;-----------------------------------------------------------------------------
-; void ff_deblock_h_chroma(uint8_t *pix, int stride, int alpha, int beta,
-; int8_t *tc0)
-;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_8, 5,7
-%if ARCH_X86_64
- ; This could use the red zone on 64 bit unix to avoid the stack pointer
- ; readjustment, but valgrind assumes the red zone is clobbered on
- ; function calls and returns.
- sub rsp, 16
- %define buf0 [rsp]
- %define buf1 [rsp+8]
-%else
- %define buf0 r0m
- %define buf1 r2m
-%endif
- CHROMA_H_START
- TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
- movq buf0, m0
- movq buf1, m3
- LOAD_MASK r2d, r3d
- movd m6, [r4] ; tc0
- punpcklbw m6, m6
- pand m7, m6
- DEBLOCK_P0_Q0
- movq m0, buf0
- movq m3, buf1
- TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
-%if ARCH_X86_64
- add rsp, 16
-%endif
- RET
-
-ALIGN 16
-ff_chroma_inter_body_mmxext:
- LOAD_MASK r2d, r3d
- movd m6, [r4] ; tc0
- punpcklbw m6, m6
- pand m7, m6
- DEBLOCK_P0_Q0
- ret
-
-%define t5 r4
-%define t6 r5
-
-cglobal deblock_h_chroma422_8, 5, 6
- SUB rsp, (1+ARCH_X86_64*2)*mmsize
- %if ARCH_X86_64
- %define buf0 [rsp+16]
- %define buf1 [rsp+8]
- %else
- %define buf0 r0m
- %define buf1 r2m
- %endif
-
- movd m6, [r4]
- punpcklbw m6, m6
- movq [rsp], m6
- CHROMA_H_START
-
- TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
- movq buf0, m0
- movq buf1, m3
- LOAD_MASK r2d, r3d
- movd m6, [rsp]
- punpcklwd m6, m6
- pand m7, m6
- DEBLOCK_P0_Q0
- movq m0, buf0
- movq m3, buf1
- TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
-
- lea r0, [r0+r1*8]
- lea t5, [t5+r1*8]
-
- TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
- movq buf0, m0
- movq buf1, m3
- LOAD_MASK r2d, r3d
- movd m6, [rsp+4]
- punpcklwd m6, m6
- pand m7, m6
- DEBLOCK_P0_Q0
- movq m0, buf0
- movq m3, buf1
- TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
- ADD rsp, (1+ARCH_X86_64*2)*mmsize
-RET
-
-; in: %1=p0 %2=p1 %3=q1
-; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
-%macro CHROMA_INTRA_P0 3
- movq m4, %1
- pxor m4, %3
- pand m4, [pb_1] ; m4 = (p0^q1)&1
- pavgb %1, %3
- psubusb %1, m4
- pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
-%endmacro
-
-;------------------------------------------------------------------------------
-; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
-;------------------------------------------------------------------------------
-cglobal deblock_v_chroma_intra_8, 4,5
- CHROMA_V_START
- movq m0, [t5]
- movq m1, [t5+r1]
- movq m2, [r0]
- movq m3, [r0+r1]
- call ff_chroma_intra_body_mmxext
- movq [t5+r1], m1
- movq [r0], m2
- RET
-
-;------------------------------------------------------------------------------
-; void ff_deblock_h_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
-;------------------------------------------------------------------------------
-cglobal deblock_h_chroma_intra_8, 4,6
- CHROMA_H_START
- TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
- call ff_chroma_intra_body_mmxext
- TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
- RET
-
-cglobal deblock_h_chroma422_intra_8, 4, 6
- CHROMA_H_START
- TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
- call ff_chroma_intra_body_mmxext
- TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
-
- lea r0, [r0+r1*8]
- lea t5, [t5+r1*8]
-
- TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
- call ff_chroma_intra_body_mmxext
- TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
-RET
-
-ALIGN 16
-ff_chroma_intra_body_mmxext:
- LOAD_MASK r2d, r3d
- movq m5, m1
- movq m6, m2
- CHROMA_INTRA_P0 m1, m0, m3
- CHROMA_INTRA_P0 m2, m3, m0
- psubb m1, m5
- psubb m2, m6
- pand m1, m7
- pand m2, m7
- paddb m1, m5
- paddb m2, m6
- ret
%macro LOAD_8_ROWS 8
movd m0, %1
diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm
index 1af3257a67..23971b5cb5 100644
--- a/libavcodec/x86/h264_deblock_10bit.asm
+++ b/libavcodec/x86/h264_deblock_10bit.asm
@@ -798,9 +798,11 @@ cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
%endmacro
%if ARCH_X86_64 == 0
+%if HAVE_ALIGNED_STACK == 0
INIT_MMX mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
+%endif
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
@@ -938,10 +940,6 @@ cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
sub r0, r1
shl r2d, 2
shl r3d, 2
-%if mmsize < 16
- mov r6, 16/mmsize
-.loop:
-%endif
CHROMA_V_LOAD r5
LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
@@ -952,16 +950,7 @@ cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
pand m7, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
CHROMA_V_STORE
-%if mmsize < 16
- add r0, mmsize
- add r5, mmsize
- add r4, mmsize/4
- dec r6
- jg .loop
- REP_RET
-%else
RET
-%endif
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra_10(uint16_t *pix, int stride, int alpha,
@@ -973,24 +962,12 @@ cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
sub r0, r1
shl r2d, 2
shl r3d, 2
-%if mmsize < 16
- mov r5, 16/mmsize
-.loop:
-%endif
CHROMA_V_LOAD r4
LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
CHROMA_V_STORE
-%if mmsize < 16
- add r0, mmsize
- add r4, mmsize
- dec r5
- jg .loop
- REP_RET
-%else
RET
-%endif
;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
@@ -1002,10 +979,6 @@ cglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_,
mov r5, pix_q
lea r6, [3*stride_q]
add r5, r6
-%if mmsize == 8
- mov r6d, 2
- .loop:
-%endif
CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize]
LOAD_AB m4, m5, alpha_d, beta_d
@@ -1018,13 +991,6 @@ cglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_,
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize]
-%if mmsize == 8
- lea pix_q, [pix_q + 4*stride_q]
- lea r5, [r5 + 4*stride_q]
- add tc0_q, 2
- dec r6d
- jg .loop
-%endif
RET
;-----------------------------------------------------------------------------
@@ -1068,10 +1034,6 @@ RET
%endmacro
-%if ARCH_X86_64 == 0
-INIT_MMX mmxext
-DEBLOCK_CHROMA
-%endif
INIT_XMM sse2
DEBLOCK_CHROMA
%if HAVE_AVX_EXTERNAL
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index c54f9f1a68..9b5920d3b0 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -87,13 +87,6 @@ SECTION .text
STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
-INIT_MMX mmx
-; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct_add_8, 3, 3, 0
- movsxdifnidn r2, r2d
- IDCT4_ADD r0, r1, r2
- RET
-
%macro IDCT8_1D 2
psraw m0, m1, 1
SWAP 0, 1
@@ -207,23 +200,6 @@ cglobal h264_idct_add_8, 3, 3, 0
STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro
-INIT_MMX mmx
-; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct8_add_8, 3, 4, 0
- movsxdifnidn r2, r2d
- %assign pad 128+4-(stack_offset&7)
- SUB rsp, pad
-
- add word [r1], 32
- IDCT8_ADD_MMX_START r1 , rsp
- IDCT8_ADD_MMX_START r1+8, rsp+64
- lea r3, [r0+4]
- IDCT8_ADD_MMX_END r0 , rsp, r2, r1
- IDCT8_ADD_MMX_END r3 , rsp+8, r2
-
- ADD rsp, pad
- RET
-
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
IDCT8_1D_FULL %2
@@ -315,16 +291,7 @@ cglobal h264_idct8_add_8, 3, 4, 10
%endmacro
INIT_MMX mmxext
-; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
-cglobal h264_idct_dc_add_8, 3, 4, 0
- movsxd r2, r2d
- movsx r3, word [r1]
- mov dword [r1], 0
- DC_ADD_MMXEXT_INIT r3, r2
- DC_ADD_MMXEXT_OP movh, r0, r2, r3
- RET
-
; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
movsxd r2, r2d
@@ -336,15 +303,6 @@ cglobal h264_idct8_dc_add_8, 3, 4, 0
DC_ADD_MMXEXT_OP mova, r0, r2, r3
RET
%else
-; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct_dc_add_8, 2, 3, 0
- movsx r2, word [r1]
- mov dword [r1], 0
- mov r1, r2m
- DC_ADD_MMXEXT_INIT r2, r1
- DC_ADD_MMXEXT_OP movh, r0, r1, r2
- RET
-
; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
movsx r2, word [r1]
@@ -357,247 +315,6 @@ cglobal h264_idct8_dc_add_8, 2, 3, 0
RET
%endif
-INIT_MMX mmx
-; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
- movsxdifnidn r3, r3d
- xor r5, r5
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- test r6, r6
- jz .skipblock
- mov r6d, dword [r1+r5*4]
- lea r6, [r0+r6]
- IDCT4_ADD r6, r2, r3
-.skipblock:
- inc r5
- add r2, 32
- cmp r5, 16
- jl .nextblock
- REP_RET
-
-; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
- movsxdifnidn r3, r3d
- %assign pad 128+4-(stack_offset&7)
- SUB rsp, pad
-
- xor r5, r5
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- test r6, r6
- jz .skipblock
- mov r6d, dword [r1+r5*4]
- add r6, r0
- add word [r2], 32
- IDCT8_ADD_MMX_START r2 , rsp
- IDCT8_ADD_MMX_START r2+8, rsp+64
- IDCT8_ADD_MMX_END r6 , rsp, r3, r2
- mov r6d, dword [r1+r5*4]
- lea r6, [r0+r6+4]
- IDCT8_ADD_MMX_END r6 , rsp+8, r3
-.skipblock:
- add r5, 4
- add r2, 128
- cmp r5, 16
- jl .nextblock
- ADD rsp, pad
- RET
-
-INIT_MMX mmxext
-; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
- movsxdifnidn r3, r3d
- xor r5, r5
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- test r6, r6
- jz .skipblock
- cmp r6, 1
- jnz .no_dc
- movsx r6, word [r2]
- test r6, r6
- jz .no_dc
- mov word [r2], 0
- DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
-%define dst2q r1
-%define dst2d r1d
-%endif
- mov dst2d, dword [r1+r5*4]
- lea dst2q, [r0+dst2q]
- DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
-%if ARCH_X86_64 == 0
- mov r1, r1m
-%endif
- inc r5
- add r2, 32
- cmp r5, 16
- jl .nextblock
- REP_RET
-.no_dc:
- mov r6d, dword [r1+r5*4]
- add r6, r0
- IDCT4_ADD r6, r2, r3
-.skipblock:
- inc r5
- add r2, 32
- cmp r5, 16
- jl .nextblock
- REP_RET
-
-INIT_MMX mmx
-; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
- movsxdifnidn r3, r3d
- xor r5, r5
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- or r6w, word [r2]
- test r6, r6
- jz .skipblock
- mov r6d, dword [r1+r5*4]
- add r6, r0
- IDCT4_ADD r6, r2, r3
-.skipblock:
- inc r5
- add r2, 32
- cmp r5, 16
- jl .nextblock
- REP_RET
-
-INIT_MMX mmxext
-; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
- movsxdifnidn r3, r3d
- xor r5, r5
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- test r6, r6
- jz .try_dc
- mov r6d, dword [r1+r5*4]
- lea r6, [r0+r6]
- IDCT4_ADD r6, r2, r3
- inc r5
- add r2, 32
- cmp r5, 16
- jl .nextblock
- REP_RET
-.try_dc:
- movsx r6, word [r2]
- test r6, r6
- jz .skipblock
- mov word [r2], 0
- DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
-%define dst2q r1
-%define dst2d r1d
-%endif
- mov dst2d, dword [r1+r5*4]
- add dst2q, r0
- DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
-%if ARCH_X86_64 == 0
- mov r1, r1m
-%endif
-.skipblock:
- inc r5
- add r2, 32
- cmp r5, 16
- jl .nextblock
- REP_RET
-
-; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
- movsxdifnidn r3, r3d
- %assign pad 128+4-(stack_offset&7)
- SUB rsp, pad
-
- xor r5, r5
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- test r6, r6
- jz .skipblock
- cmp r6, 1
- jnz .no_dc
- movsx r6, word [r2]
- test r6, r6
- jz .no_dc
- mov word [r2], 0
- DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
-%define dst2q r1
-%define dst2d r1d
-%endif
- mov dst2d, dword [r1+r5*4]
- lea dst2q, [r0+dst2q]
- DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
- lea dst2q, [dst2q+r3*4]
- DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
-%if ARCH_X86_64 == 0
- mov r1, r1m
-%endif
- add r5, 4
- add r2, 128
- cmp r5, 16
- jl .nextblock
-
- ADD rsp, pad
- RET
-.no_dc:
- mov r6d, dword [r1+r5*4]
- add r6, r0
- add word [r2], 32
- IDCT8_ADD_MMX_START r2 , rsp
- IDCT8_ADD_MMX_START r2+8, rsp+64
- IDCT8_ADD_MMX_END r6 , rsp, r3, r2
- mov r6d, dword [r1+r5*4]
- lea r6, [r0+r6+4]
- IDCT8_ADD_MMX_END r6 , rsp+8, r3
-.skipblock:
- add r5, 4
- add r2, 128
- cmp r5, 16
- jl .nextblock
-
- ADD rsp, pad
- RET
-
INIT_XMM sse2
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
@@ -678,30 +395,6 @@ h264_idct_add8_mmx_plane:
jnz .nextblock
rep ret
-; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
- movsxdifnidn r3, r3d
- mov r5, 16
- add r2, 512
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
-%if ARCH_X86_64
- mov dst2q, r0
-%endif
- call h264_idct_add8_mmx_plane
- mov r5, 32
- add r2, 384
-%if ARCH_X86_64
- add dst2q, gprsize
-%else
- add r0mp, gprsize
-%endif
- call h264_idct_add8_mmx_plane
- RET ; TODO: check rep ret after a function call
-
cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
movsxdifnidn r3, r3d
@@ -734,74 +427,6 @@ cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, str
RET ; TODO: check rep ret after a function call
-h264_idct_add8_mmxext_plane:
- movsxdifnidn r3, r3d
-.nextblock:
- movzx r6, byte [scan8+r5]
- movzx r6, byte [r4+r6]
- test r6, r6
- jz .try_dc
-%if ARCH_X86_64
- mov r0d, dword [r1+r5*4]
- add r0, [dst2q]
-%else
- mov r0, r1m ; XXX r1m here is actually r0m of the calling func
- mov r0, [r0]
- add r0, dword [r1+r5*4]
-%endif
- IDCT4_ADD r0, r2, r3
- inc r5
- add r2, 32
- test r5, 3
- jnz .nextblock
- rep ret
-.try_dc:
- movsx r6, word [r2]
- test r6, r6
- jz .skipblock
- mov word [r2], 0
- DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64
- mov r0d, dword [r1+r5*4]
- add r0, [dst2q]
-%else
- mov r0, r1m ; XXX r1m here is actually r0m of the calling func
- mov r0, [r0]
- add r0, dword [r1+r5*4]
-%endif
- DC_ADD_MMXEXT_OP movh, r0, r3, r6
-.skipblock:
- inc r5
- add r2, 32
- test r5, 3
- jnz .nextblock
- rep ret
-
-INIT_MMX mmxext
-; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
-; int16_t *block, int stride,
-; const uint8_t nnzc[6 * 8])
-cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
- movsxdifnidn r3, r3d
- mov r5, 16
- add r2, 512
-%if ARCH_X86_64
- mov dst2q, r0
-%endif
-%ifdef PIC
- lea picregq, [scan8_mem]
-%endif
- call h264_idct_add8_mmxext_plane
- mov r5, 32
- add r2, 384
-%if ARCH_X86_64
- add dst2q, gprsize
-%else
- add r0mp, gprsize
-%endif
- call h264_idct_add8_mmxext_plane
- RET ; TODO: check rep ret after a function call
-
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext:
movsxdifnidn r3, r3d
@@ -1129,18 +754,11 @@ cglobal h264_luma_dc_dequant_idct, 3, 4, %1
inc t1d
shr t3d, t0b
sub t1d, t0d
-%if cpuflag(sse2)
movd xmm6, t1d
DEQUANT_STORE xmm6
-%else
- movd m6, t1d
- DEQUANT_STORE m6
-%endif
RET
%endmacro
-INIT_MMX mmx
-IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7
diff --git a/libavcodec/x86/h264_weight.asm b/libavcodec/x86/h264_weight.asm
index 0975d74fcf..6076e64ae0 100644
--- a/libavcodec/x86/h264_weight.asm
+++ b/libavcodec/x86/h264_weight.asm
@@ -70,19 +70,6 @@ SECTION .text
packuswb m0, m1
%endmacro
-INIT_MMX mmxext
-cglobal h264_weight_16, 6, 6, 0
- WEIGHT_SETUP
-.nextrow:
- WEIGHT_OP 0, 4
- mova [r0 ], m0
- WEIGHT_OP 8, 12
- mova [r0+8], m0
- add r0, r1
- dec r2d
- jnz .nextrow
- REP_RET
-
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
WEIGHT_SETUP
@@ -95,8 +82,6 @@ cglobal h264_weight_%1, 6, 6, %2
REP_RET
%endmacro
-INIT_MMX mmxext
-WEIGHT_FUNC_MM 8, 0
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8
@@ -198,25 +183,6 @@ WEIGHT_FUNC_HALF_MM 8, 8
packuswb m0, m1
%endmacro
-INIT_MMX mmxext
-cglobal h264_biweight_16, 7, 8, 0
- BIWEIGHT_SETUP
- movifnidn r3d, r3m
-.nextrow:
- BIWEIGHT_STEPA 0, 1, 0
- BIWEIGHT_STEPA 1, 2, 4
- BIWEIGHT_STEPB
- mova [r0], m0
- BIWEIGHT_STEPA 0, 1, 8
- BIWEIGHT_STEPA 1, 2, 12
- BIWEIGHT_STEPB
- mova [r0+8], m0
- add r0, r2
- add r1, r2
- dec r3d
- jnz .nextrow
- REP_RET
-
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
BIWEIGHT_SETUP
@@ -233,8 +199,6 @@ cglobal h264_biweight_%1, 7, 8, %2
REP_RET
%endmacro
-INIT_MMX mmxext
-BIWEIGHT_FUNC_MM 8, 0
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index c9a96c7dca..dc8fc4f720 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -31,17 +31,14 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
int16_t *block, \
int stride);
-IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 8, sse2)
IDCT_ADD_FUNC(, 8, avx)
IDCT_ADD_FUNC(, 10, sse2)
-IDCT_ADD_FUNC(_dc, 8, mmxext)
IDCT_ADD_FUNC(_dc, 8, sse2)
IDCT_ADD_FUNC(_dc, 8, avx)
IDCT_ADD_FUNC(_dc, 10, mmxext)
IDCT_ADD_FUNC(8_dc, 8, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
-IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
IDCT_ADD_FUNC(, 10, avx)
@@ -54,17 +51,11 @@ void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, const int *block_offset, \
int16_t *block, int stride, const uint8_t nnzc[5 * 8]);
-IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
-IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
-IDCT_ADD_REP_FUNC(, 16, 8, mmx)
-IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
-IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
-IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, avx)
@@ -76,8 +67,6 @@ void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
(uint8_t **dst, const int *block_offset, \
int16_t *block, int stride, const uint8_t nnzc[15 * 8]);
-IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
-IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)
@@ -87,7 +76,6 @@ IDCT_ADD_REP_FUNC2(, 8_422, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8_422, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8_422, 10, avx)
-void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
/***********************************/
@@ -112,14 +100,6 @@ void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
int beta);
#define LF_FUNCS(type, depth) \
-LF_FUNC(h, chroma, depth, mmxext) \
-LF_IFUNC(h, chroma_intra, depth, mmxext) \
-LF_FUNC(h, chroma422, depth, mmxext) \
-LF_IFUNC(h, chroma422_intra, depth, mmxext) \
-LF_FUNC(v, chroma, depth, mmxext) \
-LF_IFUNC(v, chroma_intra, depth, mmxext) \
-LF_FUNC(h, luma, depth, mmxext) \
-LF_IFUNC(h, luma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, sse2) \
LF_IFUNC(h, luma_intra, depth, sse2) \
LF_FUNC(v, luma, depth, sse2) \
@@ -147,27 +127,10 @@ LF_FUNC(h, luma_mbaff, 8, avx)
LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)
-#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
-LF_FUNC(v8, luma, 8, mmxext)
-static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
- int beta, int8_t *tc0)
-{
- if ((tc0[0] & tc0[1]) >= 0)
- ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
- if ((tc0[2] & tc0[3]) >= 0)
- ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
-}
-LF_IFUNC(v8, luma_intra, 8, mmxext)
-static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride,
- int alpha, int beta)
-{
- ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
- ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
-}
-#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
-
LF_FUNC(v, luma, 10, mmxext)
+LF_FUNC(h, luma, 10, mmxext)
LF_IFUNC(v, luma_intra, 10, mmxext)
+LF_IFUNC(h, luma_intra, 10, mmxext)
/***********************************/
/* weighted prediction */
@@ -187,14 +150,13 @@ void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
H264_WEIGHT(W, mmxext) \
H264_BIWEIGHT(W, mmxext)
-#define H264_BIWEIGHT_MMX_SSE(W) \
- H264_BIWEIGHT_MMX(W) \
+#define H264_BIWEIGHT_SSE(W) \
H264_WEIGHT(W, sse2) \
H264_BIWEIGHT(W, sse2) \
H264_BIWEIGHT(W, ssse3)
-H264_BIWEIGHT_MMX_SSE(16)
-H264_BIWEIGHT_MMX_SSE(8)
+H264_BIWEIGHT_SSE(16)
+H264_BIWEIGHT_SSE(8)
H264_BIWEIGHT_MMX(4)
#define H264_WEIGHT_10(W, DEPTH, OPT) \
@@ -236,52 +198,16 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
if (bit_depth == 8) {
if (EXTERNAL_MMX(cpu_flags)) {
- c->h264_idct_dc_add =
- c->h264_idct_add = ff_h264_idct_add_8_mmx;
- c->h264_idct8_dc_add =
- c->h264_idct8_add = ff_h264_idct8_add_8_mmx;
-
- c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
- c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
if (chroma_format_idc <= 1) {
- c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
} else {
c->h264_idct_add8 = ff_h264_idct_add8_422_8_mmx;
}
- c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
- if (cpu_flags & AV_CPU_FLAG_CMOV)
- c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
- c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext;
- c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext;
- if (chroma_format_idc <= 1)
- c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
- c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;
- c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext;
- c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
- if (chroma_format_idc <= 1) {
- c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext;
- c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
- } else {
- c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_mmxext;
- c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_mmxext;
- }
-#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
- c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext;
- c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext;
- c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
- c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
-#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
- c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
- c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
- c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
- c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
@@ -350,19 +276,12 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
-#if ARCH_X86_32
- c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext;
- c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
- if (chroma_format_idc <= 1) {
- c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_10_mmxext;
- } else {
- c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_10_mmxext;
- }
+#if ARCH_X86_32 && !HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
-#endif /* ARCH_X86_32 */
+#endif /* ARCH_X86_32 && !HAVE_ALIGNED_STACK */
c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {