[FFmpeg-devel] [PATCH] vp9/x86: iwht4x4 (lossless) mmx.
Ronald S. Bultje
rsbultje at gmail.com
Tue Jan 21 02:05:13 CET 2014
---
libavcodec/x86/vp9dsp_init.c | 5 +++++
libavcodec/x86/vp9itxfm.asm | 43 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 48 insertions(+)
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 9c322c1..9e4bc93 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -173,6 +173,7 @@ itxfm_funcs(16, ssse3);
itxfm_funcs(16, avx);
itxfm_func(idct, idct, 32, ssse3);
itxfm_func(idct, idct, 32, avx);
+itxfm_func(iwht, iwht, 4, mmx);
#undef itxfm_func
#undef itxfm_funcs
@@ -223,6 +224,10 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
if (EXTERNAL_MMX(cpu_flags)) {
init_fpel(4, 0, 4, put, mmx);
init_fpel(3, 0, 8, put, mmx);
+ dsp->itxfm_add[4 /* lossless */][DCT_DCT] =
+ dsp->itxfm_add[4 /* lossless */][ADST_DCT] =
+ dsp->itxfm_add[4 /* lossless */][DCT_ADST] =
+ dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx;
}
if (EXTERNAL_SSE(cpu_flags)) {
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index fe9f99a..3279b53 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -152,6 +152,49 @@ SECTION .text
%endmacro
;-------------------------------------------------------------------------------------------
+; void vp9_iwht_iwht_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IWHT4_1D 0 ; one 1-D inverse-WHT pass on m0-m3; results end up named m0-m3, m4/m5 are scratch
+ SWAP 1, 2 ; exchange the register names m1 and m2 (assuming x86inc SWAP semantics: renaming only, no data moved)
+ SWAP 2, 3 ; together with the line above: m1, m2, m3 now name the former m2, m3, m1
+ paddw m0, m2 ; m0 += m2 (per 16-bit word, wrapping)
+ psubw m3, m1 ; m3 -= m1
+ psubw m4, m0, m3 ; m4 = m0 - m3 (x86inc 3-operand form)
+ psraw m4, 1 ; m4 >>= 1, arithmetic shift keeps the sign
+ psubw m5, m4, m1 ; m5 = m4 - m1
+ SWAP 5, 1 ; m1 now names that difference; the previous m1 value is no longer needed
+ psubw m4, m2 ; m4 -= m2
+ SWAP 4, 2 ; m2 now names that difference; the previous m2 value is no longer needed
+ psubw m0, m1 ; m0 -= updated m1
+ paddw m3, m2 ; m3 += updated m2
+ SWAP 2, 3 ; these two swaps are the opening pair applied in reverse order,
+ SWAP 1, 2 ; i.e. they undo the renaming done at the top of the macro
+%endmacro
+
+INIT_MMX mmx ; assemble what follows for MMX registers (loads below step 8 bytes per register)
+cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob ; 3 args loaded, 3 GPRs, 0 xmm regs; eob is named but never read here
+ mova m0, [blockq+0*8] ; m0 = int16 coefficients 0..3 of block
+ mova m1, [blockq+1*8] ; m1 = coefficients 4..7
+ mova m2, [blockq+2*8] ; m2 = coefficients 8..11
+ mova m3, [blockq+3*8] ; m3 = coefficients 12..15
+ psraw m0, 2 ; arithmetic >> 2 on every coefficient before the first pass
+ psraw m1, 2
+ psraw m2, 2
+ psraw m3, 2
+
+ VP9_IWHT4_1D ; first 1-D pass, element-wise across m0-m3
+ TRANSPOSE4x4W 0, 1, 2, 3, 4 ; presumably transposes the 4x4 word matrix in m0-m3 with m4 as scratch (macro defined elsewhere)
+ VP9_IWHT4_1D ; second 1-D pass, on the transposed data
+
+ pxor m4, m4 ; m4 = 0
+ VP9_STORE_2X 0, 1, 5, 6, 4 ; macro defined outside this hunk; presumably adds m0/m1 to two rows at dst (m5/m6 scratch, m4 zero) - confirm
+ lea dstq, [dstq+strideq*2] ; advance dst by two rows
+ VP9_STORE_2X 2, 3, 5, 6, 4 ; same store step for m2/m3 on the next two rows
+ ZERO_BLOCK blockq, 8, 4, m4 ; macro defined elsewhere; presumably clears the coefficient block using zero register m4 - confirm
+ RET
+
+;-------------------------------------------------------------------------------------------
; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
;-------------------------------------------------------------------------------------------
--
1.8.4
More information about the ffmpeg-devel
mailing list