[FFmpeg-devel] [PATCH 2/2] x86: hevc_mt: use proxy functions for WP

Christophe Gisquet christophe.gisquet at gmail.com
Thu Oct 2 20:52:45 CEST 2014


On Win64 (size in bytes, then timings):

Before: 155576 bytes
64765 decicycles in qpel_bi_w, 8185 runs, 7 skips
13676 decicycles in epel_bi_w, 16378 runs, 6 skips
54402 decicycles in qpel_uni_w, 1023 runs, 1 skips
12328 decicycles in epel_uni_w, 2048 runs, 0 skips

After: 94260 bytes
65037 decicycles in qpel_bi_w, 8185 runs, 7 skips
13752 decicycles in epel_bi_w, 16380 runs, 4 skips
54709 decicycles in qpel_uni_w, 1021 runs, 3 skips
12037 decicycles in epel_uni_w, 2047 runs, 1 skips
---
 libavcodec/x86/hevcdsp_init.c | 542 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 461 insertions(+), 81 deletions(-)

diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 4c536ac..a8284db 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -550,9 +550,23 @@ mc_rep_proxies(qpel_hv,12,  8, sse4);
 #define ff_hevc_put_hevc_bi_qpel_hv16_12_sse4  proxy_bi_qpel_hv8_12_sse4
 mc_rep_funcs(qpel_hv,12,  4, 12, sse4);
 
+#define mc_rep_uni_w_proxy(bitd, step, opt) \
+static void proxy_uni_w##step##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride, \
+                                               int height, int denom,  int _wx, int _ox, int width)                     \
+{                                                                                                                       \
+    int i;                                                                                                              \
+    int16_t *src;                                                                                                       \
+    uint8_t *dst;                                                                                                       \
+    for (i = 0; i < width; i += step) {                                                                                 \
+        src= _src + i;                                                                                                  \
+        dst= _dst + (i * ((bitd + 7) / 8));                                                                             \
+        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, _srcstride, height, denom, _wx, _ox);        \
+    }                                                                                                                   \
+}
+
 #define mc_rep_uni_w(bitd, step, W, opt) \
-void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride,\
-                                               int height, int denom,  int _wx, int _ox)                                \
+static void no_proxy_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride, \
+                                              int height, int denom,  int _wx, int _ox, int width)                      \
 {                                                                                                                       \
     int i;                                                                                                              \
     int16_t *src;                                                                                                       \
@@ -560,36 +574,84 @@ void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststri
     for (i = 0; i < W; i += step) {                                                                                     \
         src= _src + i;                                                                                                  \
         dst= _dst + (i * ((bitd + 7) / 8));                                                                             \
-        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, _srcstride,                                  \
-                                                     height, denom, _wx, _ox);                                          \
+        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, _srcstride, height, denom, _wx, _ox);        \
     }                                                                                                                   \
 }
 
+#define mc_rep_uni_w_unproxy(bitd, W, opt) \
+static void unproxy_uni_w##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *src, ptrdiff_t srcstride,     \
+                                              int height, int denom,  int _wx, int _ox, int width)                      \
+{                                                                                                                       \
+    ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(dst, dststride, src, srcstride, height, denom, _wx, _ox);                \
+}
+
 mc_rep_uni_w(8, 6, 12, sse4);
-mc_rep_uni_w(8, 8, 16, sse4);
-mc_rep_uni_w(8, 8, 24, sse4);
-mc_rep_uni_w(8, 8, 32, sse4);
-mc_rep_uni_w(8, 8, 48, sse4);
-mc_rep_uni_w(8, 8, 64, sse4);
+#define ff_hevc_put_hevc_uni_w12_8_sse4   no_proxy_uni_w12_8_sse4
+mc_rep_uni_w_proxy(8, 8, sse4);
+#define ff_hevc_put_hevc_uni_w64_8_sse4   proxy_uni_w8_8_sse4
+#define ff_hevc_put_hevc_uni_w48_8_sse4   proxy_uni_w8_8_sse4
+#define ff_hevc_put_hevc_uni_w32_8_sse4   proxy_uni_w8_8_sse4
+#define ff_hevc_put_hevc_uni_w24_8_sse4   proxy_uni_w8_8_sse4
+#define ff_hevc_put_hevc_uni_w16_8_sse4   proxy_uni_w8_8_sse4
+mc_rep_uni_w_unproxy(8, 4, sse4);
+mc_rep_uni_w_unproxy(8, 6, sse4);
+mc_rep_uni_w_unproxy(8, 8, sse4);
+#define ff_hevc_put_hevc_uni_w4_8_sse4    unproxy_uni_w4_8_sse4
+#define ff_hevc_put_hevc_uni_w6_8_sse4    unproxy_uni_w6_8_sse4
+#define ff_hevc_put_hevc_uni_w8_8_sse4    unproxy_uni_w8_8_sse4
 
 mc_rep_uni_w(10, 6, 12, sse4);
-mc_rep_uni_w(10, 8, 16, sse4);
-mc_rep_uni_w(10, 8, 24, sse4);
-mc_rep_uni_w(10, 8, 32, sse4);
-mc_rep_uni_w(10, 8, 48, sse4);
-mc_rep_uni_w(10, 8, 64, sse4);
+#define ff_hevc_put_hevc_uni_w12_10_sse4  no_proxy_uni_w12_10_sse4
+mc_rep_uni_w_proxy(10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w64_10_sse4  proxy_uni_w8_10_sse4
+#define ff_hevc_put_hevc_uni_w48_10_sse4  proxy_uni_w8_10_sse4
+#define ff_hevc_put_hevc_uni_w32_10_sse4  proxy_uni_w8_10_sse4
+#define ff_hevc_put_hevc_uni_w24_10_sse4  proxy_uni_w8_10_sse4
+#define ff_hevc_put_hevc_uni_w16_10_sse4  proxy_uni_w8_10_sse4
+mc_rep_uni_w_unproxy(10, 4, sse4);
+mc_rep_uni_w_unproxy(10, 6, sse4);
+mc_rep_uni_w_unproxy(10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w4_10_sse4   unproxy_uni_w4_10_sse4
+#define ff_hevc_put_hevc_uni_w6_10_sse4   unproxy_uni_w6_10_sse4
+#define ff_hevc_put_hevc_uni_w8_10_sse4   unproxy_uni_w8_10_sse4
 
 mc_rep_uni_w(12, 6, 12, sse4);
-mc_rep_uni_w(12, 8, 16, sse4);
-mc_rep_uni_w(12, 8, 24, sse4);
-mc_rep_uni_w(12, 8, 32, sse4);
-mc_rep_uni_w(12, 8, 48, sse4);
-mc_rep_uni_w(12, 8, 64, sse4);
+#define ff_hevc_put_hevc_uni_w12_12_sse4  no_proxy_uni_w12_12_sse4
+mc_rep_uni_w_proxy(12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w64_12_sse4  proxy_uni_w8_12_sse4
+#define ff_hevc_put_hevc_uni_w48_12_sse4  proxy_uni_w8_12_sse4
+#define ff_hevc_put_hevc_uni_w32_12_sse4  proxy_uni_w8_12_sse4
+#define ff_hevc_put_hevc_uni_w24_12_sse4  proxy_uni_w8_12_sse4
+#define ff_hevc_put_hevc_uni_w16_12_sse4  proxy_uni_w8_12_sse4
+mc_rep_uni_w_unproxy(12, 4, sse4);
+mc_rep_uni_w_unproxy(12, 6, sse4);
+mc_rep_uni_w_unproxy(12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w4_12_sse4   unproxy_uni_w4_12_sse4
+#define ff_hevc_put_hevc_uni_w6_12_sse4   unproxy_uni_w6_12_sse4
+#define ff_hevc_put_hevc_uni_w8_12_sse4   unproxy_uni_w8_12_sse4
+
+#define mc_rep_bi_w_proxy(bitd, step, opt) \
+static void proxy_bi_w##step##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride,  \
+                                              int16_t *_src2, int height,                                               \
+                                              int denom,  int _wx0,  int _wx1, int _ox0, int _ox1, int width)           \
+{                                                                                                                       \
+    int i;                                                                                                              \
+    int16_t *src;                                                                                                       \
+    int16_t *src2;                                                                                                      \
+    uint8_t *dst;                                                                                                       \
+    for (i = 0; i < width; i += step) {                                                                                 \
+        src  = _src  + i;                                                                                               \
+        src2 = _src2 + i;                                                                                               \
+        dst  = _dst  + (i * ((bitd + 7) / 8));                                                                          \
+        ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2,                             \
+                                                     height, denom, _wx0, _wx1, _ox0, _ox1);                            \
+    }                                                                                                                   \
+}
 
 #define mc_rep_bi_w(bitd, step, W, opt) \
-void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride, \
+static void no_proxy_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride,  \
                                               int16_t *_src2, int height,                                               \
-                                              int denom,  int _wx0,  int _wx1, int _ox0, int _ox1)                      \
+                                              int denom,  int _wx0,  int _wx1, int _ox0, int _ox1, int width)           \
 {                                                                                                                       \
     int i;                                                                                                              \
     int16_t *src;                                                                                                       \
@@ -604,26 +666,69 @@ void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststrid
     }                                                                                                                   \
 }
 
+#define mc_rep_bi_w_unproxy(bitd, W, opt) \
+static void unproxy_bi_w##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dstride, int16_t *src, ptrdiff_t sstride,          \
+                                             int16_t *src2, int h, int denom, int w0, int w1, int o0, int o1, int w)    \
+{                                                                                                                       \
+    ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(dst, dstride, src, sstride, src2, h, denom, w0, w1, o0, o1);              \
+}
+
 mc_rep_bi_w(8, 6, 12, sse4);
-mc_rep_bi_w(8, 8, 16, sse4);
-mc_rep_bi_w(8, 8, 24, sse4);
-mc_rep_bi_w(8, 8, 32, sse4);
-mc_rep_bi_w(8, 8, 48, sse4);
-mc_rep_bi_w(8, 8, 64, sse4);
+#define ff_hevc_put_hevc_bi_w12_8_sse4   no_proxy_bi_w12_8_sse4
+mc_rep_bi_w_proxy(8, 8, sse4);
+#define ff_hevc_put_hevc_bi_w64_8_sse4   proxy_bi_w8_8_sse4
+#define ff_hevc_put_hevc_bi_w48_8_sse4   proxy_bi_w8_8_sse4
+#define ff_hevc_put_hevc_bi_w32_8_sse4   proxy_bi_w8_8_sse4
+#define ff_hevc_put_hevc_bi_w24_8_sse4   proxy_bi_w8_8_sse4
+#define ff_hevc_put_hevc_bi_w16_8_sse4   proxy_bi_w8_8_sse4
+mc_rep_bi_w_unproxy(8, 4, sse4);
+mc_rep_bi_w_unproxy(8, 6, sse4);
+mc_rep_bi_w_unproxy(8, 8, sse4);
+#define ff_hevc_put_hevc_bi_w4_8_sse4    unproxy_bi_w4_8_sse4
+#define ff_hevc_put_hevc_bi_w6_8_sse4    unproxy_bi_w6_8_sse4
+#define ff_hevc_put_hevc_bi_w8_8_sse4    unproxy_bi_w8_8_sse4
 
 mc_rep_bi_w(10, 6, 12, sse4);
-mc_rep_bi_w(10, 8, 16, sse4);
-mc_rep_bi_w(10, 8, 24, sse4);
-mc_rep_bi_w(10, 8, 32, sse4);
-mc_rep_bi_w(10, 8, 48, sse4);
-mc_rep_bi_w(10, 8, 64, sse4);
+#define ff_hevc_put_hevc_bi_w12_10_sse4  no_proxy_bi_w12_10_sse4
+mc_rep_bi_w_proxy(10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w64_10_sse4  proxy_bi_w8_10_sse4
+#define ff_hevc_put_hevc_bi_w48_10_sse4  proxy_bi_w8_10_sse4
+#define ff_hevc_put_hevc_bi_w32_10_sse4  proxy_bi_w8_10_sse4
+#define ff_hevc_put_hevc_bi_w24_10_sse4  proxy_bi_w8_10_sse4
+#define ff_hevc_put_hevc_bi_w16_10_sse4  proxy_bi_w8_10_sse4
+mc_rep_bi_w_unproxy(10, 4, sse4);
+mc_rep_bi_w_unproxy(10, 6, sse4);
+mc_rep_bi_w_unproxy(10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w4_10_sse4   unproxy_bi_w4_10_sse4
+#define ff_hevc_put_hevc_bi_w6_10_sse4   unproxy_bi_w6_10_sse4
+#define ff_hevc_put_hevc_bi_w8_10_sse4   unproxy_bi_w8_10_sse4
 
 mc_rep_bi_w(12, 6, 12, sse4);
-mc_rep_bi_w(12, 8, 16, sse4);
-mc_rep_bi_w(12, 8, 24, sse4);
-mc_rep_bi_w(12, 8, 32, sse4);
-mc_rep_bi_w(12, 8, 48, sse4);
-mc_rep_bi_w(12, 8, 64, sse4);
+#define ff_hevc_put_hevc_bi_w12_12_sse4  no_proxy_bi_w12_12_sse4
+mc_rep_bi_w_proxy(12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w64_12_sse4  proxy_bi_w8_12_sse4
+#define ff_hevc_put_hevc_bi_w48_12_sse4  proxy_bi_w8_12_sse4
+#define ff_hevc_put_hevc_bi_w32_12_sse4  proxy_bi_w8_12_sse4
+#define ff_hevc_put_hevc_bi_w24_12_sse4  proxy_bi_w8_12_sse4
+#define ff_hevc_put_hevc_bi_w16_12_sse4  proxy_bi_w8_12_sse4
+mc_rep_bi_w_unproxy(12, 4, sse4);
+mc_rep_bi_w_unproxy(12, 6, sse4);
+mc_rep_bi_w_unproxy(12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w4_12_sse4   unproxy_bi_w4_12_sse4
+#define ff_hevc_put_hevc_bi_w6_12_sse4   unproxy_bi_w6_12_sse4
+#define ff_hevc_put_hevc_bi_w8_12_sse4   unproxy_bi_w8_12_sse4
+
+#define mc_uni_w_func_proxy(name, bitd, step, opt) \
+static void proxy_uni_w_##name##step##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride,           \
+                                                      uint8_t *src, ptrdiff_t srcstride,           \
+                                                      int height, int denom,                       \
+                                                      int wx, int ox,                              \
+                                                      intptr_t mx, intptr_t my, int width)         \
+{                                                                                                  \
+    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                                           \
+    proxy_##name##step##_##bitd##_##opt(temp, src, srcstride, height, mx, my, width);              \
+    proxy_uni_w8##_##bitd##_##opt(dst, dststride, temp, MAX_PB_SIZE, height, denom, wx, ox, width);\
+}
 
 #define mc_uni_w_func(name, bitd, W, opt) \
 void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,         \
@@ -634,54 +739,199 @@ void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t
 {                                                                                                   \
     LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                                            \
     ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width);     \
-    ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, MAX_PB_SIZE, height, denom, _wx, _ox);\
+    ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, MAX_PB_SIZE, height, denom, _wx, _ox, width);\
 }
 
 #define mc_uni_w_funcs(name, bitd, opt)       \
         mc_uni_w_func(name, bitd, 4, opt);    \
         mc_uni_w_func(name, bitd, 8, opt);    \
         mc_uni_w_func(name, bitd, 12, opt);   \
-        mc_uni_w_func(name, bitd, 16, opt);   \
         mc_uni_w_func(name, bitd, 24, opt);   \
+        mc_uni_w_func(name, bitd, 16, opt);   \
         mc_uni_w_func(name, bitd, 32, opt);   \
         mc_uni_w_func(name, bitd, 48, opt);   \
         mc_uni_w_func(name, bitd, 64, opt)
 
-mc_uni_w_funcs(pel_pixels, 8, sse4);
+#define mc_uni_w_proxy_funcs(name, bitd, step, opt) \
+        mc_uni_w_func(name, bitd, 4, opt);    \
+        mc_uni_w_func(name, bitd, 8, opt);    \
+        mc_uni_w_func(name, bitd, 12, opt);   \
+        mc_uni_w_func_proxy(name, bitd, step, opt)
+
+
+mc_uni_w_proxy_funcs(pel_pixels, 8, 16, sse4);
+#define ff_hevc_put_hevc_uni_w_pel_pixels16_8_sse4  proxy_uni_w_pel_pixels16_8_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels32_8_sse4  proxy_uni_w_pel_pixels16_8_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels48_8_sse4  proxy_uni_w_pel_pixels16_8_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels64_8_sse4  proxy_uni_w_pel_pixels16_8_sse4
+mc_uni_w_func(pel_pixels, 8, 24, sse4);
 mc_uni_w_func(pel_pixels, 8, 6, sse4);
-mc_uni_w_funcs(epel_h, 8, sse4);
+
+mc_uni_w_proxy_funcs(epel_h, 8, 16, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_h16_8_sse4      proxy_uni_w_epel_h16_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h32_8_sse4      proxy_uni_w_epel_h16_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h48_8_sse4      proxy_uni_w_epel_h16_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h64_8_sse4      proxy_uni_w_epel_h16_8_sse4
+mc_uni_w_func(epel_h, 8, 24, sse4);
 mc_uni_w_func(epel_h, 8, 6, sse4);
-mc_uni_w_funcs(epel_v, 8, sse4);
+
+mc_uni_w_proxy_funcs(epel_v, 8, 16, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_v16_8_sse4      proxy_uni_w_epel_v16_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v32_8_sse4      proxy_uni_w_epel_v16_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v48_8_sse4      proxy_uni_w_epel_v16_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v64_8_sse4      proxy_uni_w_epel_v16_8_sse4
+mc_uni_w_func(epel_v, 8, 24, sse4);
 mc_uni_w_func(epel_v, 8, 6, sse4);
-mc_uni_w_funcs(epel_hv, 8, sse4);
+
+mc_uni_w_proxy_funcs(epel_hv, 8, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_hv16_8_sse4     proxy_uni_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv24_8_sse4     proxy_uni_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv32_8_sse4     proxy_uni_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv48_8_sse4     proxy_uni_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv64_8_sse4     proxy_uni_w_epel_hv8_8_sse4
 mc_uni_w_func(epel_hv, 8, 6, sse4);
-mc_uni_w_funcs(qpel_h, 8, sse4);
-mc_uni_w_funcs(qpel_v, 8, sse4);
-mc_uni_w_funcs(qpel_hv, 8, sse4);
 
-mc_uni_w_funcs(pel_pixels, 10, sse4);
+mc_uni_w_proxy_funcs(qpel_h, 8, 16, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_h16_8_sse4      proxy_uni_w_qpel_h16_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h32_8_sse4      proxy_uni_w_qpel_h16_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h48_8_sse4      proxy_uni_w_qpel_h16_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h64_8_sse4      proxy_uni_w_qpel_h16_8_sse4
+mc_uni_w_func(qpel_h, 8, 24, sse4);
+
+mc_uni_w_proxy_funcs(qpel_v, 8, 16, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_v16_8_sse4      proxy_uni_w_qpel_v16_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v32_8_sse4      proxy_uni_w_qpel_v16_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v48_8_sse4      proxy_uni_w_qpel_v16_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v64_8_sse4      proxy_uni_w_qpel_v16_8_sse4
+mc_uni_w_func(qpel_v, 8, 24, sse4);
+
+mc_uni_w_proxy_funcs(qpel_hv, 8, 8, sse4);
+mc_uni_w_func(qpel_hv, 8, 16, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_hv24_8_sse4     proxy_uni_w_qpel_hv8_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv32_8_sse4     proxy_uni_w_qpel_hv8_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv48_8_sse4     proxy_uni_w_qpel_hv8_8_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv64_8_sse4     proxy_uni_w_qpel_hv8_8_sse4
+
+mc_uni_w_proxy_funcs(pel_pixels, 10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_pel_pixels16_10_sse4 proxy_uni_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels24_10_sse4 proxy_uni_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels32_10_sse4 proxy_uni_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels48_10_sse4 proxy_uni_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels64_10_sse4 proxy_uni_w_pel_pixels8_10_sse4
 mc_uni_w_func(pel_pixels, 10, 6, sse4);
-mc_uni_w_funcs(epel_h, 10, sse4);
+
+mc_uni_w_proxy_funcs(epel_h, 10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_h16_10_sse4     proxy_uni_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h24_10_sse4     proxy_uni_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h32_10_sse4     proxy_uni_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h48_10_sse4     proxy_uni_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h64_10_sse4     proxy_uni_w_epel_h8_10_sse4
 mc_uni_w_func(epel_h, 10, 6, sse4);
-mc_uni_w_funcs(epel_v, 10, sse4);
+
+mc_uni_w_proxy_funcs(epel_v, 10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_v16_10_sse4     proxy_uni_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v24_10_sse4     proxy_uni_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v32_10_sse4     proxy_uni_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v48_10_sse4     proxy_uni_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v64_10_sse4     proxy_uni_w_epel_v8_10_sse4
 mc_uni_w_func(epel_v, 10, 6, sse4);
-mc_uni_w_funcs(epel_hv, 10, sse4);
+
+mc_uni_w_proxy_funcs(epel_hv, 10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_hv16_10_sse4    proxy_uni_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv24_10_sse4    proxy_uni_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv32_10_sse4    proxy_uni_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv48_10_sse4    proxy_uni_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv64_10_sse4    proxy_uni_w_epel_hv8_10_sse4
 mc_uni_w_func(epel_hv, 10, 6, sse4);
-mc_uni_w_funcs(qpel_h, 10, sse4);
-mc_uni_w_funcs(qpel_v, 10, sse4);
-mc_uni_w_funcs(qpel_hv, 10, sse4);
 
-mc_uni_w_funcs(pel_pixels, 12, sse4);
+mc_uni_w_proxy_funcs(qpel_h, 10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_h16_10_sse4     proxy_uni_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h24_10_sse4     proxy_uni_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h32_10_sse4     proxy_uni_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h48_10_sse4     proxy_uni_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h64_10_sse4     proxy_uni_w_qpel_h8_10_sse4
+
+mc_uni_w_proxy_funcs(qpel_v, 10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_v16_10_sse4     proxy_uni_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v24_10_sse4     proxy_uni_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v32_10_sse4     proxy_uni_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v48_10_sse4     proxy_uni_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v64_10_sse4     proxy_uni_w_qpel_v8_10_sse4
+
+mc_uni_w_proxy_funcs(qpel_hv, 10, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_hv16_10_sse4    proxy_uni_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv24_10_sse4    proxy_uni_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv32_10_sse4    proxy_uni_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv48_10_sse4    proxy_uni_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv64_10_sse4    proxy_uni_w_qpel_hv8_10_sse4
+
+mc_uni_w_proxy_funcs(pel_pixels, 12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_pel_pixels16_12_sse4 proxy_uni_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels24_12_sse4 proxy_uni_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels32_12_sse4 proxy_uni_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels48_12_sse4 proxy_uni_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_uni_w_pel_pixels64_12_sse4 proxy_uni_w_pel_pixels8_12_sse4
 mc_uni_w_func(pel_pixels, 12, 6, sse4);
-mc_uni_w_funcs(epel_h, 12, sse4);
+
+mc_uni_w_proxy_funcs(epel_h, 12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_h16_12_sse4     proxy_uni_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h24_12_sse4     proxy_uni_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h32_12_sse4     proxy_uni_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h48_12_sse4     proxy_uni_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_h64_12_sse4     proxy_uni_w_epel_h8_12_sse4
 mc_uni_w_func(epel_h, 12, 6, sse4);
-mc_uni_w_funcs(epel_v, 12, sse4);
+
+mc_uni_w_proxy_funcs(epel_v, 12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_v16_12_sse4     proxy_uni_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v24_12_sse4     proxy_uni_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v32_12_sse4     proxy_uni_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v48_12_sse4     proxy_uni_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_v64_12_sse4     proxy_uni_w_epel_v8_12_sse4
 mc_uni_w_func(epel_v, 12, 6, sse4);
-mc_uni_w_funcs(epel_hv, 12, sse4);
+
+mc_uni_w_proxy_funcs(epel_hv, 12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_epel_hv16_12_sse4    proxy_uni_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv24_12_sse4    proxy_uni_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv32_12_sse4    proxy_uni_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv48_12_sse4    proxy_uni_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_epel_hv64_12_sse4    proxy_uni_w_epel_hv8_12_sse4
 mc_uni_w_func(epel_hv, 12, 6, sse4);
-mc_uni_w_funcs(qpel_h, 12, sse4);
-mc_uni_w_funcs(qpel_v, 12, sse4);
-mc_uni_w_funcs(qpel_hv, 12, sse4);
+
+mc_uni_w_proxy_funcs(qpel_h, 12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_h16_12_sse4     proxy_uni_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h24_12_sse4     proxy_uni_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h32_12_sse4     proxy_uni_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h48_12_sse4     proxy_uni_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_h64_12_sse4     proxy_uni_w_qpel_h8_12_sse4
+
+mc_uni_w_proxy_funcs(qpel_v, 12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_v16_12_sse4     proxy_uni_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v24_12_sse4     proxy_uni_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v32_12_sse4     proxy_uni_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v48_12_sse4     proxy_uni_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_v64_12_sse4     proxy_uni_w_qpel_v8_12_sse4
+
+mc_uni_w_proxy_funcs(qpel_hv, 12, 8, sse4);
+#define ff_hevc_put_hevc_uni_w_qpel_hv16_12_sse4    proxy_uni_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv24_12_sse4    proxy_uni_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv32_12_sse4    proxy_uni_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv48_12_sse4    proxy_uni_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_uni_w_qpel_hv64_12_sse4    proxy_uni_w_qpel_hv8_12_sse4
+
+// 'step' only selects the first (MC) proxy; the weighting proxy below always uses the step-8 variant
+#define mc_bi_w_func_proxy(name, bitd, step, opt) \
+static void proxy_bi_w_##name##step##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,           \
+                                                     uint8_t *_src, ptrdiff_t _srcstride,            \
+                                                     int16_t *_src2,                                 \
+                                                     int height, int denom,                          \
+                                                     int _wx0, int _wx1, int _ox0, int _ox1,         \
+                                                     intptr_t mx, intptr_t my, int width)            \
+{                                                                                                  \
+    LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                                           \
+    proxy_##name##step##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width);      \
+    proxy_bi_w8##_##bitd##_##opt(_dst, _dststride, temp, MAX_PB_SIZE, _src2,            \
+                                 height, denom, _wx0, _wx1, _ox0, _ox1, width);          \
+}
 
 #define mc_bi_w_func(name, bitd, W, opt) \
 void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride,           \
@@ -694,7 +944,7 @@ void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _
     LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]);                                             \
     ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width);      \
     ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, MAX_PB_SIZE, _src2,            \
-                                             height, denom, _wx0, _wx1, _ox0, _ox1);                 \
+                                             height, denom, _wx0, _wx1, _ox0, _ox1, width);          \
 }
 
 #define mc_bi_w_funcs(name, bitd, opt)       \
@@ -707,41 +957,171 @@ void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _
         mc_bi_w_func(name, bitd, 48, opt);   \
         mc_bi_w_func(name, bitd, 64, opt)
 
-mc_bi_w_funcs(pel_pixels, 8, sse4);
+#define mc_bi_w_proxy_funcs(name, bitd, step, opt) \
+        mc_bi_w_func(name, bitd, 4, opt);    \
+        mc_bi_w_func(name, bitd, 8, opt);    \
+        mc_bi_w_func(name, bitd, 12, opt);   \
+        mc_bi_w_func_proxy(name, bitd, step, opt)
+
+mc_bi_w_proxy_funcs(pel_pixels, 8, 16, sse4);
+#define ff_hevc_put_hevc_bi_w_pel_pixels16_8_sse4 proxy_bi_w_pel_pixels16_8_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels32_8_sse4 proxy_bi_w_pel_pixels16_8_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels48_8_sse4 proxy_bi_w_pel_pixels16_8_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels64_8_sse4 proxy_bi_w_pel_pixels16_8_sse4
+mc_bi_w_func(pel_pixels, 8, 24, sse4);
 mc_bi_w_func(pel_pixels, 8, 6, sse4);
-mc_bi_w_funcs(epel_h, 8, sse4);
+
+mc_bi_w_proxy_funcs(epel_h, 8, 16, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_h16_8_sse4     proxy_bi_w_epel_h16_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h32_8_sse4     proxy_bi_w_epel_h16_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h48_8_sse4     proxy_bi_w_epel_h16_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h64_8_sse4     proxy_bi_w_epel_h16_8_sse4
+mc_bi_w_func(epel_h, 8, 24, sse4);
 mc_bi_w_func(epel_h, 8, 6, sse4);
-mc_bi_w_funcs(epel_v, 8, sse4);
+
+mc_bi_w_proxy_funcs(epel_v, 8, 16, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_v16_8_sse4     proxy_bi_w_epel_v16_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v32_8_sse4     proxy_bi_w_epel_v16_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v48_8_sse4     proxy_bi_w_epel_v16_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v64_8_sse4     proxy_bi_w_epel_v16_8_sse4
+mc_bi_w_func(epel_v, 8, 24, sse4);
 mc_bi_w_func(epel_v, 8, 6, sse4);
-mc_bi_w_funcs(epel_hv, 8, sse4);
+
+mc_bi_w_proxy_funcs(epel_hv, 8, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_hv16_8_sse4     proxy_bi_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv24_8_sse4     proxy_bi_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv32_8_sse4     proxy_bi_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv48_8_sse4     proxy_bi_w_epel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv64_8_sse4     proxy_bi_w_epel_hv8_8_sse4
 mc_bi_w_func(epel_hv, 8, 6, sse4);
-mc_bi_w_funcs(qpel_h, 8, sse4);
-mc_bi_w_funcs(qpel_v, 8, sse4);
-mc_bi_w_funcs(qpel_hv, 8, sse4);
 
-mc_bi_w_funcs(pel_pixels, 10, sse4);
+mc_bi_w_proxy_funcs(qpel_h, 8, 16, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_h16_8_sse4     proxy_bi_w_qpel_h16_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h32_8_sse4     proxy_bi_w_qpel_h16_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h48_8_sse4     proxy_bi_w_qpel_h16_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h64_8_sse4     proxy_bi_w_qpel_h16_8_sse4
+mc_bi_w_func(qpel_h, 8, 24, sse4);
+
+mc_bi_w_proxy_funcs(qpel_v, 8, 16, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_v16_8_sse4     proxy_bi_w_qpel_v16_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v32_8_sse4     proxy_bi_w_qpel_v16_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v48_8_sse4     proxy_bi_w_qpel_v16_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v64_8_sse4     proxy_bi_w_qpel_v16_8_sse4
+mc_bi_w_func(qpel_v, 8, 24, sse4);
+
+mc_bi_w_proxy_funcs(qpel_hv, 8, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_hv16_8_sse4     proxy_bi_w_qpel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv24_8_sse4     proxy_bi_w_qpel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv32_8_sse4     proxy_bi_w_qpel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv48_8_sse4     proxy_bi_w_qpel_hv8_8_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv64_8_sse4     proxy_bi_w_qpel_hv8_8_sse4
+
+mc_bi_w_proxy_funcs(pel_pixels, 10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_pel_pixels16_10_sse4 proxy_bi_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels24_10_sse4 proxy_bi_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels32_10_sse4 proxy_bi_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels48_10_sse4 proxy_bi_w_pel_pixels8_10_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels64_10_sse4 proxy_bi_w_pel_pixels8_10_sse4
 mc_bi_w_func(pel_pixels, 10, 6, sse4);
-mc_bi_w_funcs(epel_h, 10, sse4);
+
+mc_bi_w_proxy_funcs(epel_h, 10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_h16_10_sse4     proxy_bi_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h24_10_sse4     proxy_bi_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h32_10_sse4     proxy_bi_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h48_10_sse4     proxy_bi_w_epel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h64_10_sse4     proxy_bi_w_epel_h8_10_sse4
 mc_bi_w_func(epel_h, 10, 6, sse4);
-mc_bi_w_funcs(epel_v, 10, sse4);
+
+mc_bi_w_proxy_funcs(epel_v, 10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_v16_10_sse4     proxy_bi_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v24_10_sse4     proxy_bi_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v32_10_sse4     proxy_bi_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v48_10_sse4     proxy_bi_w_epel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v64_10_sse4     proxy_bi_w_epel_v8_10_sse4
 mc_bi_w_func(epel_v, 10, 6, sse4);
-mc_bi_w_funcs(epel_hv, 10, sse4);
+
+mc_bi_w_proxy_funcs(epel_hv, 10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_hv16_10_sse4     proxy_bi_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv24_10_sse4     proxy_bi_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv32_10_sse4     proxy_bi_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv48_10_sse4     proxy_bi_w_epel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv64_10_sse4     proxy_bi_w_epel_hv8_10_sse4
 mc_bi_w_func(epel_hv, 10, 6, sse4);
-mc_bi_w_funcs(qpel_h, 10, sse4);
-mc_bi_w_funcs(qpel_v, 10, sse4);
-mc_bi_w_funcs(qpel_hv, 10, sse4);
 
-mc_bi_w_funcs(pel_pixels, 12, sse4);
+mc_bi_w_proxy_funcs(qpel_h, 10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_h16_10_sse4     proxy_bi_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h24_10_sse4     proxy_bi_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h32_10_sse4     proxy_bi_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h48_10_sse4     proxy_bi_w_qpel_h8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h64_10_sse4     proxy_bi_w_qpel_h8_10_sse4
+
+mc_bi_w_proxy_funcs(qpel_v, 10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_v16_10_sse4     proxy_bi_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v24_10_sse4     proxy_bi_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v32_10_sse4     proxy_bi_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v48_10_sse4     proxy_bi_w_qpel_v8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v64_10_sse4     proxy_bi_w_qpel_v8_10_sse4
+
+mc_bi_w_proxy_funcs(qpel_hv, 10, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_hv16_10_sse4     proxy_bi_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv24_10_sse4     proxy_bi_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv32_10_sse4     proxy_bi_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv48_10_sse4     proxy_bi_w_qpel_hv8_10_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv64_10_sse4     proxy_bi_w_qpel_hv8_10_sse4
+
+mc_bi_w_proxy_funcs(pel_pixels, 12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_pel_pixels16_12_sse4 proxy_bi_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels24_12_sse4 proxy_bi_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels32_12_sse4 proxy_bi_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels48_12_sse4 proxy_bi_w_pel_pixels8_12_sse4
+#define ff_hevc_put_hevc_bi_w_pel_pixels64_12_sse4 proxy_bi_w_pel_pixels8_12_sse4
 mc_bi_w_func(pel_pixels, 12, 6, sse4);
-mc_bi_w_funcs(epel_h, 12, sse4);
+
+mc_bi_w_proxy_funcs(epel_h, 12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_h16_12_sse4     proxy_bi_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h24_12_sse4     proxy_bi_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h32_12_sse4     proxy_bi_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h48_12_sse4     proxy_bi_w_epel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_h64_12_sse4     proxy_bi_w_epel_h8_12_sse4
 mc_bi_w_func(epel_h, 12, 6, sse4);
-mc_bi_w_funcs(epel_v, 12, sse4);
+
+mc_bi_w_proxy_funcs(epel_v, 12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_v16_12_sse4     proxy_bi_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v24_12_sse4     proxy_bi_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v32_12_sse4     proxy_bi_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v48_12_sse4     proxy_bi_w_epel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_v64_12_sse4     proxy_bi_w_epel_v8_12_sse4
 mc_bi_w_func(epel_v, 12, 6, sse4);
-mc_bi_w_funcs(epel_hv, 12, sse4);
+
+mc_bi_w_proxy_funcs(epel_hv, 12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_epel_hv16_12_sse4     proxy_bi_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv24_12_sse4     proxy_bi_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv32_12_sse4     proxy_bi_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv48_12_sse4     proxy_bi_w_epel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_epel_hv64_12_sse4     proxy_bi_w_epel_hv8_12_sse4
 mc_bi_w_func(epel_hv, 12, 6, sse4);
-mc_bi_w_funcs(qpel_h, 12, sse4);
-mc_bi_w_funcs(qpel_v, 12, sse4);
-mc_bi_w_funcs(qpel_hv, 12, sse4);
+
+mc_bi_w_proxy_funcs(qpel_h, 12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_h16_12_sse4     proxy_bi_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h24_12_sse4     proxy_bi_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h32_12_sse4     proxy_bi_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h48_12_sse4     proxy_bi_w_qpel_h8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_h64_12_sse4     proxy_bi_w_qpel_h8_12_sse4
+
+mc_bi_w_proxy_funcs(qpel_v, 12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_v16_12_sse4     proxy_bi_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v24_12_sse4     proxy_bi_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v32_12_sse4     proxy_bi_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v48_12_sse4     proxy_bi_w_qpel_v8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_v64_12_sse4     proxy_bi_w_qpel_v8_12_sse4
+
+mc_bi_w_proxy_funcs(qpel_hv, 12, 8, sse4);
+#define ff_hevc_put_hevc_bi_w_qpel_hv16_12_sse4     proxy_bi_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv24_12_sse4     proxy_bi_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv32_12_sse4     proxy_bi_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv48_12_sse4     proxy_bi_w_qpel_hv8_12_sse4
+#define ff_hevc_put_hevc_bi_w_qpel_hv64_12_sse4     proxy_bi_w_qpel_hv8_12_sse4
+
 #endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
 
 
-- 
1.9.2.msysgit.0



More information about the ffmpeg-devel mailing list