[FFmpeg-cvslog] x86/vp9lpf: simplify 2nd transpose in 44/48/88/84.

Clément Bœsch git at videolan.org
Sat Feb 8 11:11:37 CET 2014


ffmpeg | branch: master | Clément Bœsch <u at pkh.me> | Sat Feb  8 11:09:30 2014 +0100| [669d4f9053f931ceee513f76dba4ed131e4861a8] | committer: Clément Bœsch

x86/vp9lpf: simplify 2nd transpose in 44/48/88/84.

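For non-AVX optimizations, this saves 8 movs: each three-operand punpckhqdq
in the removed PUNPCKHQDQ_SWAP macro needed an extra register copy without AVX.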

before:
  1785 decicycles in ff_vp9_loop_filter_h_44_16_ssse3, 524129 runs, 159 skips
  3327 decicycles in ff_vp9_loop_filter_h_48_16_ssse3, 262116 runs, 28 skips
  2712 decicycles in ff_vp9_loop_filter_h_88_16_ssse3, 4193729 runs, 575 skips
  3237 decicycles in ff_vp9_loop_filter_h_84_16_ssse3, 524061 runs, 227 skips

after:
  1768 decicycles in ff_vp9_loop_filter_h_44_16_ssse3, 524062 runs, 226 skips
  3310 decicycles in ff_vp9_loop_filter_h_48_16_ssse3, 262107 runs, 37 skips
  2719 decicycles in ff_vp9_loop_filter_h_88_16_ssse3, 4193954 runs, 350 skips
  3184 decicycles in ff_vp9_loop_filter_h_84_16_ssse3, 524236 runs, 52 skips

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=669d4f9053f931ceee513f76dba4ed131e4861a8
---

 libavcodec/x86/vp9lpf.asm |  114 +++++++++++++++++++--------------------------
 1 file changed, 48 insertions(+), 66 deletions(-)

diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
index 58a6854..e41dd2c 100644
--- a/libavcodec/x86/vp9lpf.asm
+++ b/libavcodec/x86/vp9lpf.asm
@@ -286,12 +286,6 @@ SECTION .text
     SWAP %12, %15
 %endmacro
 
-; %2 = punpckhqdq(%1, %2)
-%macro PUNPCKHQDQ_SWAP 3 ; a, b, tmp
-    punpckhqdq m%3, m%1, m%2
-    SWAP %3, %2
-%endmacro
-
 %macro DEFINE_REAL_P7_TO_Q7 0-1 0
 %define P7 dst1q + 2*mstrideq  + %1
 %define P6 dst1q +   mstrideq  + %1
@@ -728,36 +722,30 @@ SECTION .text
     SBUTTERFLY  dq, 1, 5, 8
     SBUTTERFLY  dq, 2, 6, 8
     SBUTTERFLY  dq, 3, 7, 8
-    PUNPCKHQDQ_SWAP 0,  8, 15
-    movd  [P7],  m0
-    PUNPCKHQDQ_SWAP 1,  9,  0
-    PUNPCKHQDQ_SWAP 2,  10, 0
-    PUNPCKHQDQ_SWAP 3,  11, 0
-    PUNPCKHQDQ_SWAP 4,  12, 0
-    PUNPCKHQDQ_SWAP 5,  13, 0
-    PUNPCKHQDQ_SWAP 6,  14, 0
-    PUNPCKHQDQ_SWAP 7,  15, 0
-    SWAP 1,  8
-    SWAP 2,  4
-    SWAP 3,  12
-    SWAP 5,  10
-    SWAP 7,  14
-    SWAP 11, 13
-    movd  [P6],  m1
-    movd  [P5],  m2
-    movd  [P4],  m3
-    movd  [P3],  m4
-    movd  [P2],  m5
-    movd  [P1],  m6
-    movd  [P0],  m7
-    movd  [Q0],  m8
-    movd  [Q1],  m9
-    movd  [Q2], m10
-    movd  [Q3], m11
-    movd  [Q4], m12
-    movd  [Q5], m13
-    movd  [Q6], m14
-    movd  [Q7], m15
+    movd  [P7], m0
+    punpckhqdq m0, m8
+    movd  [P6], m0
+    movd  [Q0], m1
+    punpckhqdq  m1, m9
+    movd  [Q1], m1
+    movd  [P3], m2
+    punpckhqdq  m2, m10
+    movd  [P2], m2
+    movd  [Q4], m3
+    punpckhqdq m3, m11
+    movd  [Q5], m3
+    movd  [P5], m4
+    punpckhqdq m4, m12
+    movd  [P4], m4
+    movd  [Q2], m5
+    punpckhqdq m5, m13
+    movd  [Q3], m5
+    movd  [P1], m6
+    punpckhqdq m6, m14
+    movd  [P0], m6
+    movd  [Q6], m7
+    punpckhqdq m7, m8
+    movd  [Q7], m7
 %else
     ; the following code do a transpose of 8 full lines to 16 half
     ; lines (high part). It is inlined to avoid the need of a staging area
@@ -782,36 +770,30 @@ SECTION .text
     SBUTTERFLY  dq,  1,  5, 8
     SBUTTERFLY  dq,  2,  6, 8
     SBUTTERFLY  dq,  3,  7, 8
-    PUNPCKHQDQ_SWAP  0,  8, 15
-    movh  [P7],  m0
-    PUNPCKHQDQ_SWAP 1, 9,  0
-    PUNPCKHQDQ_SWAP 2, 10, 0
-    PUNPCKHQDQ_SWAP 3, 11, 0
-    PUNPCKHQDQ_SWAP 4, 12, 0
-    PUNPCKHQDQ_SWAP 5, 13, 0
-    PUNPCKHQDQ_SWAP 6, 14, 0
-    PUNPCKHQDQ_SWAP 7, 15, 0
-    SWAP  1, 8
-    SWAP  2, 4
-    SWAP  3, 12
-    SWAP  5, 10
-    SWAP  7, 14
-    SWAP 11, 13
-    movh  [P6],  m1
-    movh  [P5],  m2
-    movh  [P4],  m3
-    movh  [P3],  m4
-    movh  [P2],  m5
-    movh  [P1],  m6
-    movh  [P0],  m7
-    movh  [Q0],  m8
-    movh  [Q1],  m9
-    movh  [Q2], m10
-    movh  [Q3], m11
-    movh  [Q4], m12
-    movh  [Q5], m13
-    movh  [Q6], m14
-    movh  [Q7], m15
+    movh  [P7], m0
+    punpckhqdq m0, m8
+    movh  [P6], m0
+    movh  [Q0], m1
+    punpckhqdq  m1, m9
+    movh  [Q1], m1
+    movh  [P3], m2
+    punpckhqdq  m2, m10
+    movh  [P2], m2
+    movh  [Q4], m3
+    punpckhqdq m3, m11
+    movh  [Q5], m3
+    movh  [P5], m4
+    punpckhqdq m4, m12
+    movh  [P4], m4
+    movh  [Q2], m5
+    punpckhqdq m5, m13
+    movh  [Q3], m5
+    movh  [P1], m6
+    punpckhqdq m6, m14
+    movh  [P0], m6
+    movh  [Q6], m7
+    punpckhqdq m7, m8
+    movh  [Q7], m7
 %endif
 %endif
 



More information about the ffmpeg-cvslog mailing list