[FFmpeg-cvslog] vp9/x86: idct_32x32_add_ssse3 sub-8x8-idct.

Ronald S. Bultje git at videolan.org
Wed Jan 8 13:08:57 CET 2014


ffmpeg | branch: master | Ronald S. Bultje <rsbultje at gmail.com> | Sun Jan  5 11:18:47 2014 -0500| [04a187fb2ae2c14645c34e0e678f797af3ebb7a9] | committer: Ronald S. Bultje

vp9/x86: idct_32x32_add_ssse3 sub-8x8-idct.

Runtime of the full 32x32 idct goes from 2446 to 2441 cycles (intra) or
from 1425 to 1306 cycles (inter). Overall runtime is not significantly
affected.
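
For orientation, the new path slots into the eob-based dispatch in vp9_idct_idct_32x32_add (the "cmp eobd, 135 / 34 / 1" chain in the diff below). A minimal C-style sketch of the resulting decision chain follows; the helper names are hypothetical, only the eob thresholds are taken from the patch:

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical helpers standing in for the asm code paths; only the
     * eob thresholds (135, 34, 1) come from the patch itself. */
    void idct32x32_full(uint8_t *dst, ptrdiff_t stride, int16_t *block);
    void idct32x32_sub16x16(uint8_t *dst, ptrdiff_t stride, int16_t *block);
    void idct32x32_sub8x8(uint8_t *dst, ptrdiff_t stride, int16_t *block);
    void idct32x32_dc_only(uint8_t *dst, ptrdiff_t stride, int16_t *block);

    static void idct32x32_add_dispatch(uint8_t *dst, ptrdiff_t stride,
                                       int16_t *block, int eob)
    {
        if (eob > 135)
            idct32x32_full(dst, stride, block);      /* all coefficient rows */
        else if (eob > 34)
            idct32x32_sub16x16(dst, stride, block);  /* only the top-left 16x16 is nonzero */
        else if (eob > 1)
            idct32x32_sub8x8(dst, stride, block);    /* new path: top-left 8x8 is nonzero */
        else
            idct32x32_dc_only(dst, stride, block);   /* single DC coefficient */
    }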

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=04a187fb2ae2c14645c34e0e678f797af3ebb7a9
---

 libavcodec/x86/vp9itxfm.asm |  109 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 107 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index b137df8..c5d52f6 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -74,13 +74,22 @@ SECTION .text
     psrad              m%2,  14
 %endmacro
 
-%macro VP9_UNPACK_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1, tmp2
+%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
+%if %0 == 7
     punpckhwd          m%6, m%2, m%1
     VP9_MULSUB_2W_2X    %7,  %6,  %6, %5, [pw_m%3_%4], [pw_%4_%3]
     punpcklwd          m%2, m%1
     VP9_MULSUB_2W_2X    %1,  %2,  %2, %5, [pw_m%3_%4], [pw_%4_%3]
     packssdw           m%1, m%7
     packssdw           m%2, m%6
+%else
+    punpckhwd          m%8, m%4, m%3
+    VP9_MULSUB_2W_2X    %9,  %8,  %8, %7, [pw_m%5_%6], [pw_%6_%5]
+    punpcklwd          m%2, m%4, m%3
+    VP9_MULSUB_2W_2X    %1,  %2,  %2, %7, [pw_m%5_%6], [pw_%6_%5]
+    packssdw           m%1, m%9
+    packssdw           m%2, m%8
+%endif
 %endmacro
 
 %macro VP9_STORE_2X 5-6 dstq ; reg1, reg2, tmp1, tmp2, zero, dst
@@ -381,6 +390,32 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
 ;    SUMSUB_BA            w,  6,  9, 15      ; t6, t9
 ;    SUMSUB_BA            w,  7,  8, 15      ; t7, t8
 %macro VP9_IDCT16_1D_START 4 ; src, nnzc, stride, stack_scratch
+%if %2 <= 4
+    mova                m3, [%1+ 1*%3]      ; IN(1)
+    mova               m12, [%1+ 2*%3]      ; IN(2)
+    mova                m0, [%1+ 3*%3]      ; IN(3)
+
+    pmulhrsw           m15, m12, [pw_16069x2]       ; t6-7
+    pmulhrsw           m12, [pw_3196x2]             ; t4-5
+    pmulhrsw            m4, m3,  [pw_16305x2]       ; t14-15
+    pmulhrsw            m3, [pw_1606x2]             ; t8-9
+    pmulhrsw            m7, m0,  [pw_m4756x2]       ; t10-11
+    pmulhrsw            m0, [pw_15679x2]            ; t12-13
+
+    ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m14=t5, m13=t6, m15=t7
+    ; m3=t8, m5=t9, m1=t10, m7=t11, m0=t12, m6=t13, m2=t14, m4=t15
+
+    paddw              m14, m15, m12
+    psubw              m13, m15, m12
+    pmulhrsw           m13, [pw_11585x2]            ; t5
+    pmulhrsw           m14, [pw_11585x2]            ; t6
+
+    VP9_UNPACK_MULSUB_2W_4X 2, 5, 4, 3, 15137,  6270, [pd_8192], 10, 11 ; t9,  t14
+    VP9_UNPACK_MULSUB_2W_4X 6, 1, 0, 7, 6270, m15137, [pd_8192], 10, 11 ; t10, t13
+
+    ; m15=t0, m14=t1, m13=t2, m12=t3, m11=t4, m10=t5, m9=t6, m8=t7
+    ; m7=t8, m6=t9, m2=t10, m3=t11, m4=t12, m5=t13, m1=t14, m0=t15
+%else
     mova                m5, [%1+ 1*%3]     ; IN(1)
     mova               m14, [%1+ 2*%3]     ; IN(2)
     mova                m6, [%1+ 3*%3]     ; IN(3)
@@ -442,6 +477,7 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
     pmulhrsw           m14, [pw_11585x2]                              ; t6
     VP9_UNPACK_MULSUB_2W_4X   2,   5, 15137,  6270, [pd_8192], 10, 11 ; t9,  t14
     VP9_UNPACK_MULSUB_2W_4X   6,   1, 6270, m15137, [pd_8192], 10, 11 ; t10, t13
+%endif
 
     ; m8=t0, m9=t1, m10=t2, m11=t3, m12=t4, m13=t5, m14=t6, m15=t7
     ; m3=t8, m2=t9, m6=t10, m7=t11, m0=t12, m1=t13, m5=t14, m4=t15
@@ -468,6 +504,17 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
     ; m7=t8, m6=t9, m5=t10, m4=t11, m3=t12, m2=t13, m1=t14, m0=t15
 
     ; from load/start
+%if %2 <= 4
+    mova               m11, [%1+ 0*%3]      ; IN(0)
+    pmulhrsw           m11, [pw_11585x2]    ; t0-t3
+
+    psubw               m8, m11, m15
+    paddw              m15, m11
+    psubw               m9, m11, m14
+    paddw              m14, m11
+    psubw              m10, m11, m13
+    paddw              m13, m11
+%else
     mova               m10, [%1+ 0*%3]      ; IN(0)
 %if %2 <= 8
     pmulhrsw           m10, [pw_11585x2]    ; t0 and t1
@@ -490,6 +537,7 @@ cglobal vp9_idct_idct_8x8_add, 4,4,13, dst, stride, block, eob
     SUMSUB_BA            w, 15,  8, 7       ; t0,  t7
     SUMSUB_BA            w, 14,  9, 7       ; t1,  t6
     SUMSUB_BA            w, 13, 10, 7       ; t2,  t5
+%endif
     SUMSUB_BA            w, 12, 11, 7       ; t3,  t4
 
     SUMSUB_BA            w,  0, 15, 7       ; t0, t15
@@ -716,6 +764,37 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
     mova     [rsp+26*%%str], m14    ; t14
 
     ; then, secondly, do t16-31
+%if %3 <= 8
+    mova                 m4, [%1+ 1*64]
+    mova                 m3, [%1+ 3*64]
+    mova                 m0, [%1+ 5*64]
+    mova                 m7, [%1+ 7*64]
+
+    pmulhrsw            m11,  m4, [pw_16364x2] ;t31
+    pmulhrsw             m4, [pw_804x2] ;t16
+    pmulhrsw             m8,  m7, [pw_m5520x2] ;t19
+    pmulhrsw             m7, [pw_15426x2] ;t28
+    pmulhrsw            m15,  m0, [pw_15893x2] ;t27
+    pmulhrsw             m0, [pw_3981x2] ;t20
+    pmulhrsw            m12,  m3, [pw_m2404x2] ;t23
+    pmulhrsw             m3, [pw_16207x2] ;t24
+
+    ; m4=t16/17, m8=t18/19, m0=t20/21, m12=t22/23,
+    ; m3=t24/25, m15=t26/27, m7=t28/29, m11=t30/31
+
+    VP9_UNPACK_MULSUB_2W_4X   5, 10, 11,  4, 16069,  3196, [pd_8192], 6,  9 ; t17, t30
+    VP9_UNPACK_MULSUB_2W_4X   9,  6,  7,  8, 3196, m16069, [pd_8192], 1, 14 ; t18, t29
+    ; from 1 stage forward
+    SUMSUB_BA                 w,  8,  4,  1
+    ; temporary storage
+    mova     [rsp+17*%%str], m8             ; t16
+    mova     [rsp+21*%%str], m4             ; t19
+    VP9_UNPACK_MULSUB_2W_4X   1, 14, 15,  0,  9102, 13623, [pd_8192], 4,  8 ; t21, t26
+    VP9_UNPACK_MULSUB_2W_4X  13,  2,  3, 12, 13623, m9102, [pd_8192], 4,  8 ; t22, t25
+
+    ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
+    ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
+%else
     mova                m10, [%1+ 1*64]
     mova                m13, [%1+ 3*64]
     mova                m14, [%1+ 5*64]
@@ -793,6 +872,7 @@ cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
     VP9_UNPACK_MULSUB_2W_4X   9,  6, 3196, m16069, [pd_8192], 4, 8 ; t18, t29
     VP9_UNPACK_MULSUB_2W_4X   1, 14,  9102, 13623, [pd_8192], 4, 8 ; t21, t26
     VP9_UNPACK_MULSUB_2W_4X  13,  2, 13623, m9102, [pd_8192], 4, 8 ; t22, t25
+%endif
 
     ; m4=t16, m5=t17, m9=t18, m8=t19, m0=t20, m1=t21, m13=t22, m12=t23,
     ; m3=t24, m2=t25, m14=t26, m15=t27, m7=t28, m6=t29, m10=t30, m11=t31
@@ -1029,8 +1109,10 @@ INIT_XMM ssse3
 cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
     cmp eobd, 135
     jg .idctfull
-    cmp eobd, 1
+    cmp eobd, 34
     jg .idct16x16
+    cmp eobd, 1
+    jg .idct8x8
 
     ; dc-only case
     movd                m0, [blockq]
@@ -1050,6 +1132,29 @@ cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
     RET
 
     DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2
+.idct8x8:
+    VP9_IDCT32_1D   blockq, 1, 8
+
+    mov          stride30q, strideq         ; stride
+    lea           stride2q, [strideq*2]     ; stride*2
+    shl          stride30q, 5               ; stride*32
+    mov               cntd, 4
+    sub          stride30q, stride2q        ; stride*30
+.loop2_8x8:
+    mov               dstq, dst_bakq
+    lea           dst_endq, [dst_bakq+stride30q]
+    VP9_IDCT32_1D      rsp, 2, 8
+    add           dst_bakq, 8
+    add                rsp, 16
+    dec               cntd
+    jg .loop2_8x8
+    sub                rsp, 64
+
+    ; at the end of the loop, m7 should still be zero
+    ; use that to zero out block coefficients
+    ZERO_BLOCK      blockq, 64,  8, m7
+    RET
+
 .idct16x16:
     mov               cntd, 2
 .loop1_16x16:
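
The new "%if %2 <= 4" branch in VP9_IDCT16_1D_START and the "%if %3 <= 8" branch in the 32x32 1D pass use the usual partial-IDCT shortcut: when the second input of a butterfly is zero, the cos/sin rotation collapses into two plain scalings, so each output pair is produced by a single pmulhrsw against a pre-doubled constant (the pw_*x2 tables). A hedged intrinsics sketch of that shortcut, using the t8/t9 and t14/t15 constants that appear in the patch (the function name is illustrative):

    #include <tmmintrin.h>  /* SSSE3: _mm_mulhrs_epi16 */

    /* Rotating (x, 0) by (cos, sin) leaves t8/t9 = x*1606/16384 and
     * t14/t15 = x*16305/16384.  pmulhrsw computes (a*b + 16384) >> 15,
     * so keeping the constants doubled (pw_1606x2, pw_16305x2) gives the
     * same round-to-nearest >>14 result as the full two-input path. */
    static inline void idct16_short_rotation(__m128i in1,
                                             __m128i *t8_9, __m128i *t14_15)
    {
        const __m128i c_lo = _mm_set1_epi16(2 * 1606);   /* pw_1606x2  */
        const __m128i c_hi = _mm_set1_epi16(2 * 16305);  /* pw_16305x2 */

        *t8_9   = _mm_mulhrs_epi16(in1, c_lo);
        *t14_15 = _mm_mulhrs_epi16(in1, c_hi);
    }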


