[FFmpeg-devel] [PATCH 2/2] x86/vp9: initial AVX2 intra_pred

James Almer jamrial at gmail.com
Mon May 19 02:59:58 CEST 2014


tos3k-vp9-b10000.webm on a Core i5-4200U @1.6GHz

1219 decicycles in ff_vp9_ipred_dc_32x32_ssse3, 131070 runs, 2 skips
439 decicycles in ff_vp9_ipred_dc_32x32_avx2, 131070 runs, 2 skips

3570 decicycles in ff_vp9_ipred_dc_top_32x32_ssse3, 4096 runs, 0 skips
2494 decicycles in ff_vp9_ipred_dc_top_32x32_avx2, 4096 runs, 0 skips

1419 decicycles in ff_vp9_ipred_dc_left_32x32_ssse3, 16384 runs, 0 skips
717 decicycles in ff_vp9_ipred_dc_left_32x32_avx2, 16384 runs, 0 skips

2737 decicycles in ff_vp9_ipred_tm_32x32_avx, 1024 runs, 0 skips
2088 decicycles in ff_vp9_ipred_tm_32x32_avx2, 1024 runs, 0 skips

3090 decicycles in ff_vp9_ipred_v_32x32_avx, 512 runs, 0 skips
2226 decicycles in ff_vp9_ipred_v_32x32_avx2, 512 runs, 0 skips

1565 decicycles in ff_vp9_ipred_h_32x32_avx, 1024 runs, 0 skips
922 decicycles in ff_vp9_ipred_h_32x32_avx2, 1024 runs, 0 skips

Signed-off-by: James Almer <jamrial at gmail.com>
---
 libavcodec/x86/constants.c      |   8 ++-
 libavcodec/x86/constants.h      |   4 +-
 libavcodec/x86/vp9dsp_init.c    |  16 +++++
 libavcodec/x86/vp9intrapred.asm | 147 ++++++++++++++++++++++++++++++++++++++--
 libavutil/x86/asm.h             |   1 +
 5 files changed, 167 insertions(+), 9 deletions(-)

diff --git a/libavcodec/x86/constants.c b/libavcodec/x86/constants.c
index 3bba80b..7608bb3 100644
--- a/libavcodec/x86/constants.c
+++ b/libavcodec/x86/constants.c
@@ -1,5 +1,5 @@
 /*
- * MMX/SSE constants used across x86 dsp optimizations.
+ * MMX/SSE/AVX constants used across x86 dsp optimizations.
  *
  * This file is part of FFmpeg.
  *
@@ -47,7 +47,9 @@ DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x020
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
 
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL, /* 32 bytes of 0x01, 32-byte aligned (was a 16-byte xmm_reg) */
+                                                    0x0101010101010101ULL, 0x0101010101010101ULL };
+DECLARE_ALIGNED(32, const ymm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL, /* 32 bytes of 0x03, 32-byte aligned (was a 16-byte xmm_reg) */
+                                                    0x0303030303030303ULL, 0x0303030303030303ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
diff --git a/libavcodec/x86/constants.h b/libavcodec/x86/constants.h
index 4bf74ff..3ebf171 100644
--- a/libavcodec/x86/constants.h
+++ b/libavcodec/x86/constants.h
@@ -44,8 +44,8 @@ extern const uint64_t ff_pw_96;
 extern const uint64_t ff_pw_128;
 extern const uint64_t ff_pw_255;
 
-extern const xmm_reg  ff_pb_1;
-extern const xmm_reg  ff_pb_3;
+extern const ymm_reg  ff_pb_1;
+extern const ymm_reg  ff_pb_3;
 extern const xmm_reg  ff_pb_80;
 extern const xmm_reg  ff_pb_F8;
 extern const uint64_t ff_pb_FC;
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 3fd274d..b04e678 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -241,6 +241,13 @@ ipred_funcs(hd, ssse3, avx);
 ipred_funcs(vl, ssse3, avx);
 ipred_funcs(vr, ssse3, avx);
 
+ipred_func(32, dc, avx2);
+ipred_func(32, dc_left, avx2);
+ipred_func(32, dc_top, avx2);
+ipred_func(32, v, avx2);
+ipred_func(32, h, avx2);
+ipred_func(32, tm, avx2);
+
 #undef ipred_funcs
 #undef ipred_func_set
 #undef ipred_func
@@ -388,6 +395,15 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
         init_ipred(TX_32X32, 32, avx);
     }
 
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        dsp->intra_pred[TX_32X32][DC_PRED] = ff_vp9_ipred_dc_32x32_avx2;
+        dsp->intra_pred[TX_32X32][LEFT_DC_PRED] = ff_vp9_ipred_dc_left_32x32_avx2;
+        dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_vp9_ipred_dc_top_32x32_avx2;
+        dsp->intra_pred[TX_32X32][VERT_PRED] = ff_vp9_ipred_v_32x32_avx2;
+        dsp->intra_pred[TX_32X32][HOR_PRED] = ff_vp9_ipred_h_32x32_avx2;
+        dsp->intra_pred[TX_32X32][TM_VP8_PRED] = ff_vp9_ipred_tm_32x32_avx2;
+    }
+
 #undef init_fpel
 #undef init_subpel1
 #undef init_subpel2
diff --git a/libavcodec/x86/vp9intrapred.asm b/libavcodec/x86/vp9intrapred.asm
index 3faf1c5..b2e7e5b 100644
--- a/libavcodec/x86/vp9intrapred.asm
+++ b/libavcodec/x86/vp9intrapred.asm
@@ -29,10 +29,10 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
-pw_m256: times 8 dw -256
-pw_m255: times 8 dw -255
+pw_m256: times 16 dw -256
+pw_m255: times 16 dw -255
 pw_512:  times 8 dw  512
 pw_1024: times 8 dw 1024
 pw_2048: times 8 dw 2048
@@ -72,7 +72,7 @@ pb_3to1_5x0:  db 3, 2, 1
               times 9 db 0
 pb_Fto0:      db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
-pb_2:  times 16 db 2
+pb_2:  times 32 db 2
 pb_15: times 16 db 15
 
 cextern pb_1
@@ -180,6 +180,38 @@ cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
     jg .loop
     RET
 
+INIT_YMM avx2  ; AVX2 32x32 DC predictor: fill the block with the rounded mean of 32 bytes from l and 32 bytes from a
+cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [lq]  ; m0 = 32 bytes read from l
+    mova                    m1, [aq]  ; m1 = 32 bytes read from a
+    DEFINE_ARGS dst, stride, stride3, cnt  ; l/a are dead from here on; presumably their registers are re-named stride3/cnt (x86inc macro, not visible here)
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2  ; SAD against zero: each qword now holds the sum of its 8 bytes
+    psadbw                  m1, m2
+    paddw                   m0, m1  ; add the l and a partial sums (one per qword)
+    vextracti128           xm1, m0, 1
+    paddw                  xm0, xm1  ; fold the upper 128-bit lane into the lower one
+    movhlps                xm1, xm0
+    paddw                  xm0, xm1  ; fold the high qword into the low one: word 0 = sum of all 64 bytes
+    pmulhrsw               xm0, [pw_512]  ; rounded multiply by 512/32768, i.e. (sum + 32) >> 6 = mean of 64 values
+    vpbroadcastb            m0, xm0  ; replicate the low byte (the DC value) into all 32 bytes
+    mov                   cntd, 4  ; 4 iterations x 8 rows = 32 rows
+.loop:
+    mova      [dstq+strideq*0], m0  ; one 32-byte row per store
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]  ; advance 4 rows
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop  ; repeat while cnt > 0
+    RET
+
 ; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)
 
 %macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l)
@@ -267,6 +299,35 @@ cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
     dec                   cntd
     jg .loop
     RET
+
+INIT_YMM avx2  ; AVX2 32x32 one-sided DC predictor: fill the block with the rounded mean of the 32 bytes at %2 (a or l)
+cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
+    mova                    m0, [%2q]  ; %2 is the macro's second argument: the edge (a or l) to average
+    DEFINE_ARGS dst, stride, stride3, cnt  ; l/a are dead from here on; presumably their registers are re-named stride3/cnt (x86inc macro, not visible here)
+    lea               stride3q, [strideq*3]
+    pxor                    m2, m2
+    psadbw                  m0, m2  ; SAD against zero: each qword now holds the sum of its 8 bytes
+    vextracti128           xm1, m0, 1
+    paddw                  xm0, xm1  ; fold the upper 128-bit lane into the lower one
+    movhlps                xm1, xm0
+    paddw                  xm0, xm1  ; fold the high qword into the low one: word 0 = sum of all 32 bytes
+    pmulhrsw               xm0, [pw_1024]  ; rounded multiply by 1024/32768, i.e. (sum + 16) >> 5 = mean of 32 values
+    vpbroadcastb            m0, xm0  ; replicate the low byte (the DC value) into all 32 bytes
+    mov                   cntd, 4  ; 4 iterations x 8 rows = 32 rows
+.loop:
+    mova      [dstq+strideq*0], m0  ; one 32-byte row per store
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]  ; advance 4 rows
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop  ; repeat while cnt > 0
+    RET
 %endmacro
 
 DC_1D_FUNCS top,  a
@@ -327,6 +388,27 @@ cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
     jg .loop
     RET
 
+INIT_YMM avx2  ; AVX2 32x32 vertical predictor: copy the 32 bytes at a into every one of the 32 rows
+cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
+    mova                    m0, [aq]  ; m0 = 32 bytes read from a; l is never read
+    DEFINE_ARGS dst, stride, stride3, cnt  ; presumably re-names the l/a registers to stride3/cnt (x86inc macro, not visible here)
+    lea               stride3q, [strideq*3]
+    mov                   cntd, 4  ; 4 iterations x 8 rows = 32 rows
+.loop:
+    mova      [dstq+strideq*0], m0  ; one 32-byte row per store
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]  ; advance 4 rows
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m0
+    mova      [dstq+strideq*2], m0
+    mova      [dstq+stride3q ], m0
+    lea                   dstq, [dstq+strideq*4]
+    dec                   cntd
+    jg .loop  ; repeat while cnt > 0
+    RET
+
 ; h
 
 INIT_XMM ssse3
@@ -417,6 +499,30 @@ cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
 H_XMM_FUNCS ssse3
 H_XMM_FUNCS avx
 
+INIT_YMM avx2  ; AVX2 32x32 horizontal predictor: each row is filled with one byte from l, reading l from its last byte down to its first
+cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
+    mova                    m5, [pb_1]  ; pshufb mask: every byte selects byte 1 of its lane
+    mova                    m6, [pb_2]  ; pshufb mask: every byte selects byte 2 of its lane
+    mova                    m7, [pb_3]  ; pshufb mask: every byte selects byte 3 of its lane
+    pxor                    m4, m4  ; all-zero mask: every byte selects byte 0 of its lane
+    lea               stride3q, [strideq*3]
+    mov                   cntq, 7  ; cnt runs 7..0: 8 iterations x 4 rows = 32 rows
+.loop:
+    movd                   xm3, [lq+cntq*4]  ; load 4 bytes l[4*cnt .. 4*cnt+3]
+    vinserti128             m3, m3, xm3, 1  ; duplicate into the upper lane, since pshufb shuffles each 128-bit lane separately
+    pshufb                  m0, m3, m7  ; 32 copies of l[4*cnt+3]
+    pshufb                  m1, m3, m6  ; 32 copies of l[4*cnt+2]
+    mova      [dstq+strideq*0], m0
+    mova      [dstq+strideq*1], m1
+    pshufb                  m2, m3, m5  ; 32 copies of l[4*cnt+1]
+    pshufb                  m3, m4  ; 32 copies of l[4*cnt+0]
+    mova      [dstq+strideq*2], m2
+    mova      [dstq+stride3q ], m3
+    lea                   dstq, [dstq+strideq*4]  ; advance 4 rows
+    dec                   cntq
+    jge .loop  ; repeat while cnt >= 0
+    RET
+
 ; tm
 
 INIT_MMX ssse3
@@ -554,6 +660,39 @@ cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a
 TM_XMM_FUNCS ssse3
 TM_XMM_FUNCS avx
 
+INIT_YMM avx2  ; AVX2 32x32 TM predictor: dst[y][x] = clip_u8(l_y + a[x] - a[-1]), two rows per iteration
+cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
+    pxor                    m3, m3  ; zero, used below to zero-extend bytes to words
+    pinsrw                 xm2, [aq-1], 0  ; word 0 = bytes a[-1], a[0]; only byte 0 (a[-1]) is used by the shuffle below
+    vinserti128             m2, m2, xm2, 1  ; duplicate into the upper lane, since pshufb shuffles each 128-bit lane separately
+    mova                    m0, [aq]  ; m0 = 32 bytes read from a
+    DEFINE_ARGS dst, stride, l, cnt  ; a is dead from here on; presumably its register is re-named cnt (x86inc macro, not visible here)
+    mova                    m4, [pw_m256]  ; pshufb mask 0xFF00 per word: low byte <- byte 0, high byte <- 0
+    mova                    m5, [pw_m255]  ; pshufb mask 0xFF01 per word: low byte <- byte 1, high byte <- 0
+    pshufb                  m2, m4  ; m2 = a[-1] zero-extended and replicated in every word
+    punpckhbw               m1, m0, m3  ; m1 = a[8..15] (low lane) and a[24..31] (high lane) as words
+    punpcklbw               m0, m3  ; m0 = a[0..7] (low lane) and a[16..23] (high lane) as words
+    psubw                   m1, m2  ; a[x] - a[-1]
+    psubw                   m0, m2
+    mov                   cntq, 15  ; cnt runs 15..0: 16 iterations x 2 rows = 32 rows
+.loop:
+    pinsrw                 xm7, [lq+cntq*2], 0  ; word 0 = bytes l[2*cnt], l[2*cnt+1]
+    vinserti128             m7, m7, xm7, 1
+    pshufb                  m3, m7, m5  ; m3 = l[2*cnt+1] replicated in every word
+    pshufb                  m7, m4  ; m7 = l[2*cnt] replicated in every word
+    paddw                   m2, m3, m0  ; first row of the pair, columns held in m0 (m2 is free again after the psubw above)
+    paddw                   m3, m1  ; first row of the pair, columns held in m1
+    paddw                   m6, m7, m0  ; second row of the pair
+    paddw                   m7, m1
+    packuswb                m2, m3  ; saturate to 0..255; the lane-wise pack undoes the lane-wise unpack, restoring column order 0..31
+    packuswb                m6, m7
+    mova      [dstq+strideq*0], m2  ; row built from l[2*cnt+1]
+    mova      [dstq+strideq*1], m6  ; row built from l[2*cnt]
+    lea                   dstq, [dstq+strideq*2]  ; advance 2 rows
+    dec                   cntq
+    jge .loop  ; repeat while cnt >= 0
+    RET
+
 ; dl
 
 %macro LOWPASS 4 ; left [dst], center, right, tmp
diff --git a/libavutil/x86/asm.h b/libavutil/x86/asm.h
index 2cecc98..616ad6c 100644
--- a/libavutil/x86/asm.h
+++ b/libavutil/x86/asm.h
@@ -25,6 +25,7 @@
 #include "config.h"
 
 typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
+typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg; /* four 64-bit words = 256 bits */
 
 #if ARCH_X86_64
 #    define OPSIZE "q"
-- 
1.8.5.5



More information about the ffmpeg-devel mailing list