[FFmpeg-devel] [PATCH 24/33] Move x86 hpel assembly from dsputil to hpeldsp.

Ronald S. Bultje rsbultje at gmail.com
Wed Feb 6 04:27:37 CET 2013


From: "Ronald S. Bultje" <rsbultje at gmail.com>

---
 libavcodec/hpeldsp.c                  |   2 +-
 libavcodec/x86/Makefile               |   3 +-
 libavcodec/x86/dsputil_avg_template.c |  77 ------
 libavcodec/x86/dsputil_mmx.c          | 205 +---------------
 libavcodec/x86/dsputil_rnd_template.c | 373 -----------------------------
 libavcodec/x86/hpeldsp_avg_template.c |  77 ++++++
 libavcodec/x86/hpeldsp_init.c         | 417 +++++++++++++++++++++++++++++++++
 libavcodec/x86/hpeldsp_rnd_template.c | 428 ++++++++++++++++++++++++++++++++++
 8 files changed, 933 insertions(+), 649 deletions(-)
 delete mode 100644 libavcodec/x86/dsputil_avg_template.c
 create mode 100644 libavcodec/x86/hpeldsp_avg_template.c
 create mode 100644 libavcodec/x86/hpeldsp_init.c
 create mode 100644 libavcodec/x86/hpeldsp_rnd_template.c

diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c
index 3705d43..de3cd76 100644
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@@ -53,8 +53,8 @@ av_cold void ff_hpeldsp_init(HpelDSPContext* c, int flags)
     hpel_funcs(avg, [3],  2);
     hpel_funcs(avg_no_rnd,, 16);
 
-#if 0
     if (ARCH_X86)        ff_hpeldsp_init_x86   (c, flags);
+#if 0
     if (ARCH_ARM)        ff_hpeldsp_init_arm   (c, flags);
     if (HAVE_VIS)        ff_hpeldsp_init_vis   (c, flags);
     if (ARCH_ALPHA)      ff_hpeldsp_init_alpha (c, flags);
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 23348a6..3f037d8 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -9,6 +9,7 @@ OBJS-$(CONFIG_GPL)                     += x86/idct_mmx.o
 OBJS-$(CONFIG_H264DSP)                 += x86/h264dsp_init.o
 OBJS-$(CONFIG_H264PRED)                += x86/h264_intrapred_init.o
 OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
+OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
 OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp.o
 OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodec.o
@@ -63,6 +64,7 @@ YASM-OBJS-$(CONFIG_H264PRED)           += x86/h264_intrapred.o          \
                                           x86/h264_intrapred_10bit.o
 YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_8bit.o          \
                                           x86/h264_qpel_10bit.o
+YASM-OBJS-$(CONFIG_HPELDSP)            += x86/hpeldsp.o
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
@@ -81,5 +83,4 @@ YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o
 YASM-OBJS                              += x86/dsputil.o                 \
                                           x86/deinterlace.o             \
                                           x86/fmtconvert.o              \
-                                          x86/hpeldsp.o                 \
                                           x86/mpeg4qpel.o               \
diff --git a/libavcodec/x86/dsputil_avg_template.c b/libavcodec/x86/dsputil_avg_template.c
deleted file mode 100644
index 414667c..0000000
--- a/libavcodec/x86/dsputil_avg_template.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * DSP utils : average functions are compiled twice for 3dnow/mmxext
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer
- *
- * MMX optimization by Nick Kurshev <nickols_k at mail.ru>
- * mostly rewritten by Michael Niedermayer <michaelni at gmx.at>
- * and improved by Zdenek Kabelac <kabi at users.sf.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-//FIXME the following could be optimized too ...
-static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block,
-                                           const uint8_t *pixels,
-                                           int line_size, int h)
-{
-    DEF(ff_put_no_rnd_pixels8_x2)(block,     pixels,     line_size, h);
-    DEF(ff_put_no_rnd_pixels8_x2)(block + 8, pixels + 8, line_size, h);
-}
-
-static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels,
-                                    int line_size, int h)
-{
-    DEF(ff_put_pixels8_y2)(block,     pixels,     line_size, h);
-    DEF(ff_put_pixels8_y2)(block + 8, pixels + 8, line_size, h);
-}
-
-static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block,
-                                           const uint8_t *pixels,
-                                           int line_size, int h)
-{
-    DEF(ff_put_no_rnd_pixels8_y2)(block,     pixels,     line_size, h);
-    DEF(ff_put_no_rnd_pixels8_y2)(block + 8, pixels + 8, line_size, h);
-}
-
-static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels,
-                                 int line_size, int h)
-{
-    DEF(ff_avg_pixels8)(block,     pixels,     line_size, h);
-    DEF(ff_avg_pixels8)(block + 8, pixels + 8, line_size, h);
-}
-
-static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels,
-                                    int line_size, int h)
-{
-    DEF(ff_avg_pixels8_x2)(block,     pixels,     line_size, h);
-    DEF(ff_avg_pixels8_x2)(block + 8, pixels + 8, line_size, h);
-}
-
-static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels,
-                                    int line_size, int h)
-{
-    DEF(ff_avg_pixels8_y2)(block,     pixels,     line_size, h);
-    DEF(ff_avg_pixels8_y2)(block + 8, pixels + 8, line_size, h);
-}
-
-static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels,
-                                     int line_size, int h)
-{
-    DEF(ff_avg_pixels8_xy2)(block,     pixels,     line_size, h);
-    DEF(ff_avg_pixels8_xy2)(block + 8, pixels + 8, line_size, h);
-}
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 2e8300a..7f1ea30 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -83,10 +83,6 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
 
 
 #if HAVE_YASM
-void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
-                              int line_size, int h);
-void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
-                             int line_size, int h);
 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
@@ -94,56 +90,14 @@ void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
                                      int src1Stride, int h);
 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
-void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
-                               int line_size, int h);
-void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
-                              int line_size, int h);
 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                int dstStride, int src1Stride, int h);
 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                int dstStride, int src1Stride, int h);
 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                       int dstStride, int src1Stride, int h);
-void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
-                                     int line_size, int h);
-void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
-                                    int line_size, int h);
-void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
-                                           const uint8_t *pixels,
-                                           int line_size, int h);
-void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
-                                          const uint8_t *pixels,
-                                          int line_size, int h);
-void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
-                              int line_size, int h);
-void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
-                             int line_size, int h);
-void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
-                                     int line_size, int h);
-void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
-                                    int line_size, int h);
-void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
-                                           const uint8_t *pixels,
-                                           int line_size, int h);
-void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
-                                          const uint8_t *pixels,
-                                          int line_size, int h);
 void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h);
-void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
-                          int line_size, int h);
-void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
-                              int line_size, int h);
-void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
-                             int line_size, int h);
-void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
-                              int line_size, int h);
-void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
-                             int line_size, int h);
-void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
-                               int line_size, int h);
-void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
-                              int line_size, int h);
 
 void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
 static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
@@ -217,14 +171,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
 // using regr as temporary and for the output result
 // first argument is unmodifed and second is trashed
 // regfe is supposed to contain 0xfefefefefefefefe
-#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)                \
-    "movq   "#rega", "#regr"            \n\t"                    \
-    "pand   "#regb", "#regr"            \n\t"                    \
-    "pxor   "#rega", "#regb"            \n\t"                    \
-    "pand  "#regfe", "#regb"            \n\t"                    \
-    "psrlq       $1, "#regb"            \n\t"                    \
-    "paddb  "#regb", "#regr"            \n\t"
-
 #define PAVGB_MMX(rega, regb, regr, regfe)                       \
     "movq   "#rega", "#regr"            \n\t"                    \
     "por    "#regb", "#regr"            \n\t"                    \
@@ -234,20 +180,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
     "psubb  "#regb", "#regr"            \n\t"
 
 // mm6 is supposed to contain 0xfefefefefefefefe
-#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
-    "movq  "#rega", "#regr"             \n\t"                    \
-    "movq  "#regc", "#regp"             \n\t"                    \
-    "pand  "#regb", "#regr"             \n\t"                    \
-    "pand  "#regd", "#regp"             \n\t"                    \
-    "pxor  "#rega", "#regb"             \n\t"                    \
-    "pxor  "#regc", "#regd"             \n\t"                    \
-    "pand    %%mm6, "#regb"             \n\t"                    \
-    "pand    %%mm6, "#regd"             \n\t"                    \
-    "psrlq      $1, "#regb"             \n\t"                    \
-    "psrlq      $1, "#regd"             \n\t"                    \
-    "paddb "#regb", "#regr"             \n\t"                    \
-    "paddb "#regd", "#regp"             \n\t"
-
 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)           \
     "movq  "#rega", "#regr"             \n\t"                    \
     "movq  "#regc", "#regp"             \n\t"                    \
@@ -263,28 +195,13 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
     "psubb "#regd", "#regp"             \n\t"
 
 /***********************************/
-/* MMX no rounding */
-#define NO_RND 1
-#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
-#define SET_RND  MOVQ_WONE
-#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
-#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
-#define OP_AVG(a, b, c, e)              PAVGB_MMX(a, b, c, e)
-
-#include "dsputil_rnd_template.c"
-
-#undef DEF
-#undef SET_RND
-#undef PAVGBP
-#undef PAVGB
-#undef NO_RND
-/***********************************/
 /* MMX rounding */
 
 #define DEF(x, y) x ## _ ## y ## _mmx
 #define SET_RND  MOVQ_WTWO
 #define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
 #define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
+#define OP_AVG(a, b, c, e)              PAVGB_MMX(a, b, c, e)
 
 #include "dsputil_rnd_template.c"
 
@@ -301,35 +218,20 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
 #define ff_put_pixels8_mmx ff_put_pixels8_mmxext
 
 /***********************************/
-/* 3Dnow specific */
-
-#define DEF(x) x ## _3dnow
-
-#include "dsputil_avg_template.c"
-
-#undef DEF
-
-/***********************************/
 /* MMXEXT specific */
 
-#define DEF(x) x ## _mmxext
-
-#include "dsputil_avg_template.c"
-
-#undef DEF
+//FIXME the following could be optimized too ...
+static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+                                   int line_size, int h)
+{
+    ff_avg_pixels8_mmxext(block,     pixels,     line_size, h);
+    ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
+}
 
 #endif /* HAVE_YASM */
 
 
 #if HAVE_INLINE_ASM
-#define put_no_rnd_pixels16_mmx put_pixels16_mmx
-#define put_no_rnd_pixels8_mmx put_pixels8_mmx
-#define put_pixels16_mmxext put_pixels16_mmx
-#define put_pixels8_mmxext put_pixels8_mmx
-#define put_pixels4_mmxext put_pixels4_mmx
-#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
-#define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
-
 /***********************************/
 /* standard MMX */
 
@@ -1813,14 +1715,6 @@ void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
     c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
     } while (0)
 
-#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
-    do {                                                                        \
-        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
-        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
-        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
-        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
-    } while (0)
-
 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
 {
     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
@@ -1834,14 +1728,6 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
         c->clear_block  = clear_block_mmx;
         c->clear_blocks = clear_blocks_mmx;
         c->draw_edges   = draw_edges_mmx;
-
-        SET_HPEL_FUNCS(put,        [0], 16, mmx);
-        SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
-        SET_HPEL_FUNCS(avg,        [0], 16, mmx);
-        SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
-        SET_HPEL_FUNCS(put,        [1],  8, mmx);
-        SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
-        SET_HPEL_FUNCS(avg,        [1],  8, mmx);
     }
 
 #if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
@@ -1881,43 +1767,9 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
     SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
     SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
     SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
-
-    if (!high_bit_depth) {
-        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
-        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
-
-        c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
-        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
-        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
-
-        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
-        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
-
-        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
-        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
-        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
-    }
-
-    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
-        if (!high_bit_depth) {
-            c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
-            c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
-            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
-            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
-
-            c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
-            c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
-        }
-    }
 #endif /* HAVE_YASM */
 
 #if HAVE_MMXEXT_EXTERNAL
-    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
-                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
-        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
-        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
-    }
-
     if (!high_bit_depth && CONFIG_H264CHROMA) {
         c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
@@ -1952,38 +1804,6 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
 
 #if HAVE_YASM
-    if (!high_bit_depth) {
-        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
-        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
-
-        c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
-        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
-        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
-
-        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
-        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
-
-        c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
-        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
-        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
-
-        if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
-            c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
-            c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
-            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
-            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
-
-            c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
-            c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
-        }
-    }
-
-    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
-                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
-        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
-        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
-    }
-
     if (!high_bit_depth && CONFIG_H264CHROMA) {
         c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
@@ -2030,15 +1850,6 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
 #endif /* HAVE_SSE2_INLINE */
 
 #if HAVE_SSE2_EXTERNAL
-    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
-        // these functions are slower than mmx on AMD, but faster on Intel
-        if (!high_bit_depth) {
-            c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
-            c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
-            c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
-        }
-    }
-
     if (bit_depth == 10) {
         if (CONFIG_H264CHROMA) {
             c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
diff --git a/libavcodec/x86/dsputil_rnd_template.c b/libavcodec/x86/dsputil_rnd_template.c
index c4790af..56ee46a 100644
--- a/libavcodec/x86/dsputil_rnd_template.c
+++ b/libavcodec/x86/dsputil_rnd_template.c
@@ -25,212 +25,6 @@
  */
 
 // put_pixels
-static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "lea    (%3, %3), %%"REG_a"     \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm1            \n\t"
-        "movq   (%1, %3), %%mm2         \n\t"
-        "movq   1(%1, %3), %%mm3        \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm1            \n\t"
-        "movq   (%1, %3), %%mm2         \n\t"
-        "movq   1(%1, %3), %%mm3        \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r"((x86_reg)line_size)
-        :REG_a, "memory");
-}
-
-static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "testl $1, %0                   \n\t"
-        " jz 1f                         \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   (%2), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $8, %2                  \n\t"
-        PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
-        "movq   %%mm4, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "decl   %0                      \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   (%2), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%1), %%mm2             \n\t"
-        "movq   8(%2), %%mm3            \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   %%mm5, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   16(%2), %%mm1           \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%1), %%mm2             \n\t"
-        "movq   24(%2), %%mm3           \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $32, %2                 \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   %%mm5, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
-        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
-        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
-        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
-        :"memory");
-}
-
-static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "lea        (%3, %3), %%"REG_a" \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm1            \n\t"
-        "movq   (%1, %3), %%mm2         \n\t"
-        "movq   1(%1, %3), %%mm3        \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "movq   8(%1), %%mm0            \n\t"
-        "movq   9(%1), %%mm1            \n\t"
-        "movq   8(%1, %3), %%mm2        \n\t"
-        "movq   9(%1, %3), %%mm3        \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, 8(%2)            \n\t"
-        "movq   %%mm5, 8(%2, %3)        \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm1            \n\t"
-        "movq   (%1, %3), %%mm2         \n\t"
-        "movq   1(%1, %3), %%mm3        \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "movq   8(%1), %%mm0            \n\t"
-        "movq   9(%1), %%mm1            \n\t"
-        "movq   8(%1, %3), %%mm2        \n\t"
-        "movq   9(%1, %3), %%mm3        \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, 8(%2)            \n\t"
-        "movq   %%mm5, 8(%2, %3)        \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r"((x86_reg)line_size)
-        :REG_a, "memory");
-}
-
-static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "testl $1, %0                   \n\t"
-        " jz 1f                         \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   (%2), %%mm1             \n\t"
-        "movq   8(%1), %%mm2            \n\t"
-        "movq   8(%2), %%mm3            \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $16, %2                 \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%3)             \n\t"
-        "movq   %%mm5, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "decl   %0                      \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   (%2), %%mm1             \n\t"
-        "movq   8(%1), %%mm2            \n\t"
-        "movq   8(%2), %%mm3            \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%3)             \n\t"
-        "movq   %%mm5, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   16(%2), %%mm1           \n\t"
-        "movq   8(%1), %%mm2            \n\t"
-        "movq   24(%2), %%mm3           \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%3)             \n\t"
-        "movq   %%mm5, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "add    $32, %2                 \n\t"
-        "subl   $2, %0                  \n\t"
-        "jnz    1b                      \n\t"
-#if !HAVE_EBX_AVAILABLE  //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
-        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
-        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
-        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
-        :"memory");
-}
-
-static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "lea (%3, %3), %%"REG_a"        \n\t"
-        "movq (%1), %%mm0               \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %3), %%mm1         \n\t"
-        "movq   (%1, %%"REG_a"),%%mm2   \n\t"
-        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-        "movq   (%1, %3), %%mm1         \n\t"
-        "movq   (%1, %%"REG_a"),%%mm0   \n\t"
-        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r"((x86_reg)line_size)
-        :REG_a, "memory");
-}
-
 static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     MOVQ_ZERO(mm7);
@@ -297,27 +91,6 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
         :REG_a, "memory");
 }
 
-// avg_pixels
-static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-             "movd  %0, %%mm0           \n\t"
-             "movd  %1, %%mm1           \n\t"
-             OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
-             "movd  %%mm2, %0           \n\t"
-             :"+m"(*block)
-             :"m"(*pixels)
-             :"memory");
-        pixels += line_size;
-        block += line_size;
-    }
-    while (--h);
-}
-
-#ifndef NO_RND
 // in case more speed is needed - unroling would certainly help
 static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
@@ -337,7 +110,6 @@ static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_si
     }
     while (--h);
 }
-#endif // NO_RND
 
 static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
@@ -362,141 +134,6 @@ static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_s
     while (--h);
 }
 
-#ifndef NO_RND
-static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  1%1, %%mm1           \n\t"
-            "movq  %0, %%mm3            \n\t"
-            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
-            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            :"+m"(*block)
-            :"m"(*pixels)
-            :"memory");
-        pixels += line_size;
-        block += line_size;
-    } while (--h);
-}
-#endif // NO_RND
-
-static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  %2, %%mm1            \n\t"
-            "movq  %0, %%mm3            \n\t"
-            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
-            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            :"+m"(*dst)
-            :"m"(*src1), "m"(*src2)
-            :"memory");
-        dst += dstStride;
-        src1 += src1Stride;
-        src2 += 8;
-    } while (--h);
-}
-
-static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  1%1, %%mm1           \n\t"
-            "movq  %0, %%mm3            \n\t"
-            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
-            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            "movq  8%1, %%mm0           \n\t"
-            "movq  9%1, %%mm1           \n\t"
-            "movq  8%0, %%mm3           \n\t"
-            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
-            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, 8%0           \n\t"
-            :"+m"(*block)
-            :"m"(*pixels)
-            :"memory");
-        pixels += line_size;
-        block += line_size;
-    } while (--h);
-}
-
-static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    MOVQ_BFE(mm6);
-    JUMPALIGN();
-    do {
-        __asm__ volatile(
-            "movq  %1, %%mm0            \n\t"
-            "movq  %2, %%mm1            \n\t"
-            "movq  %0, %%mm3            \n\t"
-            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
-            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, %0            \n\t"
-            "movq  8%1, %%mm0           \n\t"
-            "movq  8%2, %%mm1           \n\t"
-            "movq  8%0, %%mm3           \n\t"
-            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
-            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, 8%0           \n\t"
-            :"+m"(*dst)
-            :"m"(*src1), "m"(*src2)
-            :"memory");
-        dst += dstStride;
-        src1 += src1Stride;
-        src2 += 16;
-    } while (--h);
-}
-
-static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "lea    (%3, %3), %%"REG_a"     \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %3), %%mm1         \n\t"
-        "movq   (%1, %%"REG_a"), %%mm2  \n\t"
-        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
-        "movq   (%2), %%mm3             \n\t"
-        OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6)
-        "movq   (%2, %3), %%mm3         \n\t"
-        OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
-        "movq   %%mm0, (%2)             \n\t"
-        "movq   %%mm1, (%2, %3)         \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-
-        "movq   (%1, %3), %%mm1         \n\t"
-        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
-        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
-        "movq   (%2), %%mm3             \n\t"
-        OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6)
-        "movq   (%2, %3), %%mm3         \n\t"
-        OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
-        "movq   %%mm2, (%2)             \n\t"
-        "movq   %%mm1, (%2, %3)         \n\t"
-        "add    %%"REG_a", %1           \n\t"
-        "add    %%"REG_a", %2           \n\t"
-
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r"((x86_reg)line_size)
-        :REG_a, "memory");
-}
-
 // this routine is 'slightly' suboptimal but mostly unused
 static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
@@ -573,21 +210,11 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
 }
 
 //FIXME optimize
-static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
-    DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
-    DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
-}
-
 static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
     DEF(put, pixels8_xy2)(block  , pixels  , line_size, h);
     DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
 }
 
-static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
-    DEF(avg, pixels8_y2)(block  , pixels  , line_size, h);
-    DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
-}
-
 static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
     DEF(avg, pixels8_xy2)(block  , pixels  , line_size, h);
     DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
diff --git a/libavcodec/x86/hpeldsp_avg_template.c b/libavcodec/x86/hpeldsp_avg_template.c
new file mode 100644
index 0000000..414667c
--- /dev/null
+++ b/libavcodec/x86/hpeldsp_avg_template.c
@@ -0,0 +1,77 @@
+/*
+ * DSP utils : average functions are compiled twice for 3dnow/mmxext
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer
+ *
+ * MMX optimization by Nick Kurshev <nickols_k at mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni at gmx.at>
+ * and improved by Zdenek Kabelac <kabi at users.sf.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+//FIXME the following could be optimized too ...
+static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block,
+                                           const uint8_t *pixels,
+                                           int line_size, int h)
+{
+    DEF(ff_put_no_rnd_pixels8_x2)(block,     pixels,     line_size, h);
+    DEF(ff_put_no_rnd_pixels8_x2)(block + 8, pixels + 8, line_size, h);
+}
+
+static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels,
+                                    int line_size, int h)
+{
+    DEF(ff_put_pixels8_y2)(block,     pixels,     line_size, h);
+    DEF(ff_put_pixels8_y2)(block + 8, pixels + 8, line_size, h);
+}
+
+static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block,
+                                           const uint8_t *pixels,
+                                           int line_size, int h)
+{
+    DEF(ff_put_no_rnd_pixels8_y2)(block,     pixels,     line_size, h);
+    DEF(ff_put_no_rnd_pixels8_y2)(block + 8, pixels + 8, line_size, h);
+}
+
+static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels,
+                                 int line_size, int h)
+{
+    DEF(ff_avg_pixels8)(block,     pixels,     line_size, h);
+    DEF(ff_avg_pixels8)(block + 8, pixels + 8, line_size, h);
+}
+
+static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels,
+                                    int line_size, int h)
+{
+    DEF(ff_avg_pixels8_x2)(block,     pixels,     line_size, h);
+    DEF(ff_avg_pixels8_x2)(block + 8, pixels + 8, line_size, h);
+}
+
+static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels,
+                                    int line_size, int h)
+{
+    DEF(ff_avg_pixels8_y2)(block,     pixels,     line_size, h);
+    DEF(ff_avg_pixels8_y2)(block + 8, pixels + 8, line_size, h);
+}
+
+static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels,
+                                     int line_size, int h)
+{
+    DEF(ff_avg_pixels8_xy2)(block,     pixels,     line_size, h);
+    DEF(ff_avg_pixels8_xy2)(block + 8, pixels + 8, line_size, h);
+}
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
new file mode 100644
index 0000000..a5a3ae3
--- /dev/null
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -0,0 +1,417 @@
+/*
+ * MMX optimized DSP utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni at gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k at mail.ru>
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavcodec/hpeldsp.h"
+#include "dsputil_mmx.h"
+
+//#undef NDEBUG
+//#include <assert.h>
+
+#if HAVE_YASM
+void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
+void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+                             int line_size, int h);
+void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+                               int line_size, int h);
+void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
+void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+                                     int line_size, int h);
+void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+                                    int line_size, int h);
+void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
+                                           const uint8_t *pixels,
+                                           int line_size, int h);
+void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
+                                          const uint8_t *pixels,
+                                          int line_size, int h);
+void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
+void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+                             int line_size, int h);
+void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+                                     int line_size, int h);
+void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+                                    int line_size, int h);
+void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
+                                           const uint8_t *pixels,
+                                           int line_size, int h);
+void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
+                                          const uint8_t *pixels,
+                                          int line_size, int h);
+void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
+                           int line_size, int h);
+void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
+                          int line_size, int h);
+void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
+void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+                             int line_size, int h);
+void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
+void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+                             int line_size, int h);
+void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+                               int line_size, int h);
+void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
+#endif /* HAVE_YASM */
+
+
+#if HAVE_INLINE_ASM
+
+#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
+#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
+
+#define MOVQ_BFE(regd)                                  \
+    __asm__ volatile (                                  \
+        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
+        "paddb   %%"#regd", %%"#regd"   \n\t" ::)
+
+#ifndef PIC
+#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
+#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
+#else
+// for shared library it's better to use this way for accessing constants
+// pcmpeqd -> -1
+#define MOVQ_BONE(regd)                                 \
+    __asm__ volatile (                                  \
+        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
+        "psrlw          $15, %%"#regd"  \n\t"           \
+        "packuswb %%"#regd", %%"#regd"  \n\t" ::)
+
+#define MOVQ_WTWO(regd)                                 \
+    __asm__ volatile (                                  \
+        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
+        "psrlw         $15, %%"#regd"   \n\t"           \
+        "psllw          $1, %%"#regd"   \n\t"::)
+
+#endif
+
+// using regr as temporary and for the output result
+// first argument is unmodifed and second is trashed
+// regfe is supposed to contain 0xfefefefefefefefe
+#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)                \
+    "movq   "#rega", "#regr"            \n\t"                    \
+    "pand   "#regb", "#regr"            \n\t"                    \
+    "pxor   "#rega", "#regb"            \n\t"                    \
+    "pand  "#regfe", "#regb"            \n\t"                    \
+    "psrlq       $1, "#regb"            \n\t"                    \
+    "paddb  "#regb", "#regr"            \n\t"
+
+#define PAVGB_MMX(rega, regb, regr, regfe)                       \
+    "movq   "#rega", "#regr"            \n\t"                    \
+    "por    "#regb", "#regr"            \n\t"                    \
+    "pxor   "#rega", "#regb"            \n\t"                    \
+    "pand  "#regfe", "#regb"            \n\t"                    \
+    "psrlq       $1, "#regb"            \n\t"                    \
+    "psubb  "#regb", "#regr"            \n\t"
+
+// mm6 is supposed to contain 0xfefefefefefefefe
+#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
+    "movq  "#rega", "#regr"             \n\t"                    \
+    "movq  "#regc", "#regp"             \n\t"                    \
+    "pand  "#regb", "#regr"             \n\t"                    \
+    "pand  "#regd", "#regp"             \n\t"                    \
+    "pxor  "#rega", "#regb"             \n\t"                    \
+    "pxor  "#regc", "#regd"             \n\t"                    \
+    "pand    %%mm6, "#regb"             \n\t"                    \
+    "pand    %%mm6, "#regd"             \n\t"                    \
+    "psrlq      $1, "#regb"             \n\t"                    \
+    "psrlq      $1, "#regd"             \n\t"                    \
+    "paddb "#regb", "#regr"             \n\t"                    \
+    "paddb "#regd", "#regp"             \n\t"
+
+#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)           \
+    "movq  "#rega", "#regr"             \n\t"                    \
+    "movq  "#regc", "#regp"             \n\t"                    \
+    "por   "#regb", "#regr"             \n\t"                    \
+    "por   "#regd", "#regp"             \n\t"                    \
+    "pxor  "#rega", "#regb"             \n\t"                    \
+    "pxor  "#regc", "#regd"             \n\t"                    \
+    "pand    %%mm6, "#regb"             \n\t"                    \
+    "pand    %%mm6, "#regd"             \n\t"                    \
+    "psrlq      $1, "#regd"             \n\t"                    \
+    "psrlq      $1, "#regb"             \n\t"                    \
+    "psubb "#regb", "#regr"             \n\t"                    \
+    "psubb "#regd", "#regp"             \n\t"
+
+/***********************************/
+/* MMX no rounding */
+#define NO_RND 1
+#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
+#define SET_RND  MOVQ_WONE
+#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
+#define OP_AVG(a, b, c, e)              PAVGB_MMX(a, b, c, e)
+
+#include "hpeldsp_rnd_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef PAVGBP
+#undef PAVGB
+#undef NO_RND
+/***********************************/
+/* MMX rounding */
+
+#define DEF(x, y) x ## _ ## y ## _mmx
+#define SET_RND  MOVQ_WTWO
+#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
+#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
+
+#include "hpeldsp_rnd_template.c"
+
+#undef DEF
+#undef SET_RND
+#undef PAVGBP
+#undef PAVGB
+#undef OP_AVG
+
+#endif /* HAVE_INLINE_ASM */
+
+
+#if HAVE_YASM
+#define ff_put_pixels8_mmx ff_put_pixels8_mmxext
+
+/***********************************/
+/* 3Dnow specific */
+
+#define DEF(x) x ## _3dnow
+
+#include "hpeldsp_avg_template.c"
+
+#undef DEF
+
+/***********************************/
+/* MMXEXT specific */
+
+#define DEF(x) x ## _mmxext
+
+#include "hpeldsp_avg_template.c"
+
+#undef DEF
+
+#endif /* HAVE_YASM */
+
+
+#if HAVE_INLINE_ASM
+#define put_no_rnd_pixels16_mmx put_pixels16_mmx
+#define put_no_rnd_pixels8_mmx put_pixels8_mmx
+#define put_pixels16_mmxext put_pixels16_mmx
+#define put_pixels8_mmxext put_pixels8_mmx
+#define put_pixels4_mmxext put_pixels4_mmx
+#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
+#define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
+
+static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
+                            int line_size, int h)
+{
+    __asm__ volatile (
+        "lea   (%3, %3), %%"REG_a"      \n\t"
+        ".p2align     3                 \n\t"
+        "1:                             \n\t"
+        "movq  (%1    ), %%mm0          \n\t"
+        "movq  (%1, %3), %%mm1          \n\t"
+        "movq     %%mm0, (%2)           \n\t"
+        "movq     %%mm1, (%2, %3)       \n\t"
+        "add  %%"REG_a", %1             \n\t"
+        "add  %%"REG_a", %2             \n\t"
+        "movq  (%1    ), %%mm0          \n\t"
+        "movq  (%1, %3), %%mm1          \n\t"
+        "movq     %%mm0, (%2)           \n\t"
+        "movq     %%mm1, (%2, %3)       \n\t"
+        "add  %%"REG_a", %1             \n\t"
+        "add  %%"REG_a", %2             \n\t"
+        "subl        $4, %0             \n\t"
+        "jnz         1b                 \n\t"
+        : "+g"(h), "+r"(pixels),  "+r"(block)
+        : "r"((x86_reg)line_size)
+        : "%"REG_a, "memory"
+        );
+}
+
+static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
+                             int line_size, int h)
+{
+    __asm__ volatile (
+        "lea   (%3, %3), %%"REG_a"      \n\t"
+        ".p2align     3                 \n\t"
+        "1:                             \n\t"
+        "movq  (%1    ), %%mm0          \n\t"
+        "movq 8(%1    ), %%mm4          \n\t"
+        "movq  (%1, %3), %%mm1          \n\t"
+        "movq 8(%1, %3), %%mm5          \n\t"
+        "movq     %%mm0,  (%2)          \n\t"
+        "movq     %%mm4, 8(%2)          \n\t"
+        "movq     %%mm1,  (%2, %3)      \n\t"
+        "movq     %%mm5, 8(%2, %3)      \n\t"
+        "add  %%"REG_a", %1             \n\t"
+        "add  %%"REG_a", %2             \n\t"
+        "movq  (%1    ), %%mm0          \n\t"
+        "movq 8(%1    ), %%mm4          \n\t"
+        "movq  (%1, %3), %%mm1          \n\t"
+        "movq 8(%1, %3), %%mm5          \n\t"
+        "movq     %%mm0,  (%2)          \n\t"
+        "movq     %%mm4, 8(%2)          \n\t"
+        "movq     %%mm1,  (%2, %3)      \n\t"
+        "movq     %%mm5, 8(%2, %3)      \n\t"
+        "add  %%"REG_a", %1             \n\t"
+        "add  %%"REG_a", %2             \n\t"
+        "subl        $4, %0             \n\t"
+        "jnz         1b                 \n\t"
+        : "+g"(h), "+r"(pixels),  "+r"(block)
+        : "r"((x86_reg)line_size)
+        : "%"REG_a, "memory"
+        );
+}
+#endif /* HAVE_INLINE_ASM */
+
+void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
+                          int line_size, int h);
+void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
+                          int line_size, int h);
+
+#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
+    do {                                                                        \
+        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
+        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
+        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
+        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
+    } while (0)
+
+static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int mm_flags)
+{
+#if HAVE_INLINE_ASM
+    SET_HPEL_FUNCS(put,        [0], 16, mmx);
+    SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
+    SET_HPEL_FUNCS(avg,        [0], 16, mmx);
+    SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
+    SET_HPEL_FUNCS(put,        [1],  8, mmx);
+    SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
+    SET_HPEL_FUNCS(avg,        [1],  8, mmx);
+#endif /* HAVE_INLINE_ASM */
+}
+
+static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int mm_flags)
+{
+#if HAVE_YASM
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
+
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
+    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
+    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
+
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
+
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
+    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
+    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
+
+    if (!(flags & CODEC_FLAG_BITEXACT)) {
+        c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
+        c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
+        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
+        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
+
+        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
+        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
+    }
+#endif /* HAVE_YASM */
+
+#if HAVE_MMXEXT_EXTERNAL
+    if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
+        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
+        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
+    }
+#endif /* HAVE_MMXEXT_EXTERNAL */
+}
+
+static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int mm_flags)
+{
+#if HAVE_YASM
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
+
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
+    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
+    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
+
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
+
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
+    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
+    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
+
+    if (!(flags & CODEC_FLAG_BITEXACT)){
+        c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
+        c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
+        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
+        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
+
+        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
+        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
+    }
+
+    if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
+        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
+        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
+    }
+#endif /* HAVE_YASM */
+}
+
+static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int mm_flags)
+{
+#if HAVE_SSE2_EXTERNAL
+    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
+        // these functions are slower than mmx on AMD, but faster on Intel
+        c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
+        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
+        c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
+    }
+#endif /* HAVE_SSE2_EXTERNAL */
+}
+
+void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
+{
+    int mm_flags = av_get_cpu_flags();
+
+    if (mm_flags & AV_CPU_FLAG_MMX)
+        hpeldsp_init_mmx(c, flags, mm_flags);
+
+    if (mm_flags & AV_CPU_FLAG_MMXEXT)
+        hpeldsp_init_mmxext(c, flags, mm_flags);
+
+    if (mm_flags & AV_CPU_FLAG_3DNOW)
+        hpeldsp_init_3dnow(c, flags, mm_flags);
+
+    if (mm_flags & AV_CPU_FLAG_SSE2)
+        hpeldsp_init_sse2(c, flags, mm_flags);
+}
diff --git a/libavcodec/x86/hpeldsp_rnd_template.c b/libavcodec/x86/hpeldsp_rnd_template.c
new file mode 100644
index 0000000..aedc9d4
--- /dev/null
+++ b/libavcodec/x86/hpeldsp_rnd_template.c
@@ -0,0 +1,428 @@
+/*
+ * DSP utils mmx functions are compiled twice for rnd/no_rnd
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2003-2004 Michael Niedermayer <michaelni at gmx.at>
+ *
+ * MMX optimization by Nick Kurshev <nickols_k at mail.ru>
+ * mostly rewritten by Michael Niedermayer <michaelni at gmx.at>
+ * and improved by Zdenek Kabelac <kabi at users.sf.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+// put_pixels
+static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    __asm__ volatile(
+        "lea    (%3, %3), %%"REG_a"     \n\t"
+        ".p2align 3                     \n\t"
+        "1:                             \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        "movq   1(%1), %%mm1            \n\t"
+        "movq   (%1, %3), %%mm2         \n\t"
+        "movq   1(%1, %3), %%mm3        \n\t"
+        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, (%2)             \n\t"
+        "movq   %%mm5, (%2, %3)         \n\t"
+        "add    %%"REG_a", %1           \n\t"
+        "add    %%"REG_a", %2           \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        "movq   1(%1), %%mm1            \n\t"
+        "movq   (%1, %3), %%mm2         \n\t"
+        "movq   1(%1, %3), %%mm3        \n\t"
+        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, (%2)             \n\t"
+        "movq   %%mm5, (%2, %3)         \n\t"
+        "add    %%"REG_a", %1           \n\t"
+        "add    %%"REG_a", %2           \n\t"
+        "subl   $4, %0                  \n\t"
+        "jnz    1b                      \n\t"
+        :"+g"(h), "+S"(pixels), "+D"(block)
+        :"r"((x86_reg)line_size)
+        :REG_a, "memory");
+}
+
+static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    __asm__ volatile(
+        "lea        (%3, %3), %%"REG_a" \n\t"
+        ".p2align 3                     \n\t"
+        "1:                             \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        "movq   1(%1), %%mm1            \n\t"
+        "movq   (%1, %3), %%mm2         \n\t"
+        "movq   1(%1, %3), %%mm3        \n\t"
+        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, (%2)             \n\t"
+        "movq   %%mm5, (%2, %3)         \n\t"
+        "movq   8(%1), %%mm0            \n\t"
+        "movq   9(%1), %%mm1            \n\t"
+        "movq   8(%1, %3), %%mm2        \n\t"
+        "movq   9(%1, %3), %%mm3        \n\t"
+        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, 8(%2)            \n\t"
+        "movq   %%mm5, 8(%2, %3)        \n\t"
+        "add    %%"REG_a", %1           \n\t"
+        "add    %%"REG_a", %2           \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        "movq   1(%1), %%mm1            \n\t"
+        "movq   (%1, %3), %%mm2         \n\t"
+        "movq   1(%1, %3), %%mm3        \n\t"
+        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, (%2)             \n\t"
+        "movq   %%mm5, (%2, %3)         \n\t"
+        "movq   8(%1), %%mm0            \n\t"
+        "movq   9(%1), %%mm1            \n\t"
+        "movq   8(%1, %3), %%mm2        \n\t"
+        "movq   9(%1, %3), %%mm3        \n\t"
+        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
+        "movq   %%mm4, 8(%2)            \n\t"
+        "movq   %%mm5, 8(%2, %3)        \n\t"
+        "add    %%"REG_a", %1           \n\t"
+        "add    %%"REG_a", %2           \n\t"
+        "subl   $4, %0                  \n\t"
+        "jnz    1b                      \n\t"
+        :"+g"(h), "+S"(pixels), "+D"(block)
+        :"r"((x86_reg)line_size)
+        :REG_a, "memory");
+}
+
+static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    __asm__ volatile(
+        "lea (%3, %3), %%"REG_a"        \n\t"
+        "movq (%1), %%mm0               \n\t"
+        ".p2align 3                     \n\t"
+        "1:                             \n\t"
+        "movq   (%1, %3), %%mm1         \n\t"
+        "movq   (%1, %%"REG_a"),%%mm2   \n\t"
+        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
+        "movq   %%mm4, (%2)             \n\t"
+        "movq   %%mm5, (%2, %3)         \n\t"
+        "add    %%"REG_a", %1           \n\t"
+        "add    %%"REG_a", %2           \n\t"
+        "movq   (%1, %3), %%mm1         \n\t"
+        "movq   (%1, %%"REG_a"),%%mm0   \n\t"
+        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
+        "movq   %%mm4, (%2)             \n\t"
+        "movq   %%mm5, (%2, %3)         \n\t"
+        "add    %%"REG_a", %1           \n\t"
+        "add    %%"REG_a", %2           \n\t"
+        "subl   $4, %0                  \n\t"
+        "jnz    1b                      \n\t"
+        :"+g"(h), "+S"(pixels), "+D"(block)
+        :"r"((x86_reg)line_size)
+        :REG_a, "memory");
+}
+
+static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    MOVQ_ZERO(mm7);
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
+    __asm__ volatile(
+        "movq   (%1), %%mm0             \n\t"
+        "movq   1(%1), %%mm4            \n\t"
+        "movq   %%mm0, %%mm1            \n\t"
+        "movq   %%mm4, %%mm5            \n\t"
+        "punpcklbw %%mm7, %%mm0         \n\t"
+        "punpcklbw %%mm7, %%mm4         \n\t"
+        "punpckhbw %%mm7, %%mm1         \n\t"
+        "punpckhbw %%mm7, %%mm5         \n\t"
+        "paddusw %%mm0, %%mm4           \n\t"
+        "paddusw %%mm1, %%mm5           \n\t"
+        "xor    %%"REG_a", %%"REG_a"    \n\t"
+        "add    %3, %1                  \n\t"
+        ".p2align 3                     \n\t"
+        "1:                             \n\t"
+        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
+        "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
+        "movq   %%mm0, %%mm1            \n\t"
+        "movq   %%mm2, %%mm3            \n\t"
+        "punpcklbw %%mm7, %%mm0         \n\t"
+        "punpcklbw %%mm7, %%mm2         \n\t"
+        "punpckhbw %%mm7, %%mm1         \n\t"
+        "punpckhbw %%mm7, %%mm3         \n\t"
+        "paddusw %%mm2, %%mm0           \n\t"
+        "paddusw %%mm3, %%mm1           \n\t"
+        "paddusw %%mm6, %%mm4           \n\t"
+        "paddusw %%mm6, %%mm5           \n\t"
+        "paddusw %%mm0, %%mm4           \n\t"
+        "paddusw %%mm1, %%mm5           \n\t"
+        "psrlw  $2, %%mm4               \n\t"
+        "psrlw  $2, %%mm5               \n\t"
+        "packuswb  %%mm5, %%mm4         \n\t"
+        "movq   %%mm4, (%2, %%"REG_a")  \n\t"
+        "add    %3, %%"REG_a"           \n\t"
+
+        "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
+        "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
+        "movq   %%mm2, %%mm3            \n\t"
+        "movq   %%mm4, %%mm5            \n\t"
+        "punpcklbw %%mm7, %%mm2         \n\t"
+        "punpcklbw %%mm7, %%mm4         \n\t"
+        "punpckhbw %%mm7, %%mm3         \n\t"
+        "punpckhbw %%mm7, %%mm5         \n\t"
+        "paddusw %%mm2, %%mm4           \n\t"
+        "paddusw %%mm3, %%mm5           \n\t"
+        "paddusw %%mm6, %%mm0           \n\t"
+        "paddusw %%mm6, %%mm1           \n\t"
+        "paddusw %%mm4, %%mm0           \n\t"
+        "paddusw %%mm5, %%mm1           \n\t"
+        "psrlw  $2, %%mm0               \n\t"
+        "psrlw  $2, %%mm1               \n\t"
+        "packuswb  %%mm1, %%mm0         \n\t"
+        "movq   %%mm0, (%2, %%"REG_a")  \n\t"
+        "add    %3, %%"REG_a"           \n\t"
+
+        "subl   $2, %0                  \n\t"
+        "jnz    1b                      \n\t"
+        :"+g"(h), "+S"(pixels)
+        :"D"(block), "r"((x86_reg)line_size)
+        :REG_a, "memory");
+}
+
+// avg_pixels
+#ifndef NO_RND
+// in case more speed is needed - unroling would certainly help
+static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    JUMPALIGN();
+    do {
+        __asm__ volatile(
+             "movq  %0, %%mm0           \n\t"
+             "movq  %1, %%mm1           \n\t"
+             OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
+             "movq  %%mm2, %0           \n\t"
+             :"+m"(*block)
+             :"m"(*pixels)
+             :"memory");
+        pixels += line_size;
+        block += line_size;
+    }
+    while (--h);
+}
+#endif // NO_RND
+
+static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    JUMPALIGN();
+    do {
+        __asm__ volatile(
+             "movq  %0, %%mm0           \n\t"
+             "movq  %1, %%mm1           \n\t"
+             OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
+             "movq  %%mm2, %0           \n\t"
+             "movq  8%0, %%mm0          \n\t"
+             "movq  8%1, %%mm1          \n\t"
+             OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
+             "movq  %%mm2, 8%0          \n\t"
+             :"+m"(*block)
+             :"m"(*pixels)
+             :"memory");
+        pixels += line_size;
+        block += line_size;
+    }
+    while (--h);
+}
+
+#ifndef NO_RND
+static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    JUMPALIGN();
+    do {
+        __asm__ volatile(
+            "movq  %1, %%mm0            \n\t"
+            "movq  1%1, %%mm1           \n\t"
+            "movq  %0, %%mm3            \n\t"
+            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
+            "movq  %%mm0, %0            \n\t"
+            :"+m"(*block)
+            :"m"(*pixels)
+            :"memory");
+        pixels += line_size;
+        block += line_size;
+    } while (--h);
+}
+#endif // NO_RND
+
+static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    JUMPALIGN();
+    do {
+        __asm__ volatile(
+            "movq  %1, %%mm0            \n\t"
+            "movq  1%1, %%mm1           \n\t"
+            "movq  %0, %%mm3            \n\t"
+            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
+            "movq  %%mm0, %0            \n\t"
+            "movq  8%1, %%mm0           \n\t"
+            "movq  9%1, %%mm1           \n\t"
+            "movq  8%0, %%mm3           \n\t"
+            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
+            OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
+            "movq  %%mm0, 8%0           \n\t"
+            :"+m"(*block)
+            :"m"(*pixels)
+            :"memory");
+        pixels += line_size;
+        block += line_size;
+    } while (--h);
+}
+
+static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    MOVQ_BFE(mm6);
+    __asm__ volatile(
+        "lea    (%3, %3), %%"REG_a"     \n\t"
+        "movq   (%1), %%mm0             \n\t"
+        ".p2align 3                     \n\t"
+        "1:                             \n\t"
+        "movq   (%1, %3), %%mm1         \n\t"
+        "movq   (%1, %%"REG_a"), %%mm2  \n\t"
+        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
+        "movq   (%2), %%mm3             \n\t"
+        OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6)
+        "movq   (%2, %3), %%mm3         \n\t"
+        OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
+        "movq   %%mm0, (%2)             \n\t"
+        "movq   %%mm1, (%2, %3)         \n\t"
+        "add    %%"REG_a", %1           \n\t"
+        "add    %%"REG_a", %2           \n\t"
+
+        "movq   (%1, %3), %%mm1         \n\t"
+        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
+        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
+        "movq   (%2), %%mm3             \n\t"
+        OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6)
+        "movq   (%2, %3), %%mm3         \n\t"
+        OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
+        "movq   %%mm2, (%2)             \n\t"
+        "movq   %%mm1, (%2, %3)         \n\t"
+        "add    %%"REG_a", %1           \n\t"
+        "add    %%"REG_a", %2           \n\t"
+
+        "subl   $4, %0                  \n\t"
+        "jnz    1b                      \n\t"
+        :"+g"(h), "+S"(pixels), "+D"(block)
+        :"r"((x86_reg)line_size)
+        :REG_a, "memory");
+}
+
+// this routine is 'slightly' suboptimal but mostly unused
+static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+{
+    MOVQ_ZERO(mm7);
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
+    __asm__ volatile(
+        "movq   (%1), %%mm0             \n\t"
+        "movq   1(%1), %%mm4            \n\t"
+        "movq   %%mm0, %%mm1            \n\t"
+        "movq   %%mm4, %%mm5            \n\t"
+        "punpcklbw %%mm7, %%mm0         \n\t"
+        "punpcklbw %%mm7, %%mm4         \n\t"
+        "punpckhbw %%mm7, %%mm1         \n\t"
+        "punpckhbw %%mm7, %%mm5         \n\t"
+        "paddusw %%mm0, %%mm4           \n\t"
+        "paddusw %%mm1, %%mm5           \n\t"
+        "xor    %%"REG_a", %%"REG_a"    \n\t"
+        "add    %3, %1                  \n\t"
+        ".p2align 3                     \n\t"
+        "1:                             \n\t"
+        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
+        "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
+        "movq   %%mm0, %%mm1            \n\t"
+        "movq   %%mm2, %%mm3            \n\t"
+        "punpcklbw %%mm7, %%mm0         \n\t"
+        "punpcklbw %%mm7, %%mm2         \n\t"
+        "punpckhbw %%mm7, %%mm1         \n\t"
+        "punpckhbw %%mm7, %%mm3         \n\t"
+        "paddusw %%mm2, %%mm0           \n\t"
+        "paddusw %%mm3, %%mm1           \n\t"
+        "paddusw %%mm6, %%mm4           \n\t"
+        "paddusw %%mm6, %%mm5           \n\t"
+        "paddusw %%mm0, %%mm4           \n\t"
+        "paddusw %%mm1, %%mm5           \n\t"
+        "psrlw  $2, %%mm4               \n\t"
+        "psrlw  $2, %%mm5               \n\t"
+                "movq   (%2, %%"REG_a"), %%mm3  \n\t"
+        "packuswb  %%mm5, %%mm4         \n\t"
+                "pcmpeqd %%mm2, %%mm2   \n\t"
+                "paddb %%mm2, %%mm2     \n\t"
+                OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2)
+                "movq   %%mm5, (%2, %%"REG_a")  \n\t"
+        "add    %3, %%"REG_a"                \n\t"
+
+        "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
+        "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
+        "movq   %%mm2, %%mm3            \n\t"
+        "movq   %%mm4, %%mm5            \n\t"
+        "punpcklbw %%mm7, %%mm2         \n\t"
+        "punpcklbw %%mm7, %%mm4         \n\t"
+        "punpckhbw %%mm7, %%mm3         \n\t"
+        "punpckhbw %%mm7, %%mm5         \n\t"
+        "paddusw %%mm2, %%mm4           \n\t"
+        "paddusw %%mm3, %%mm5           \n\t"
+        "paddusw %%mm6, %%mm0           \n\t"
+        "paddusw %%mm6, %%mm1           \n\t"
+        "paddusw %%mm4, %%mm0           \n\t"
+        "paddusw %%mm5, %%mm1           \n\t"
+        "psrlw  $2, %%mm0               \n\t"
+        "psrlw  $2, %%mm1               \n\t"
+                "movq   (%2, %%"REG_a"), %%mm3  \n\t"
+        "packuswb  %%mm1, %%mm0         \n\t"
+                "pcmpeqd %%mm2, %%mm2   \n\t"
+                "paddb %%mm2, %%mm2     \n\t"
+                OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2)
+                "movq   %%mm1, (%2, %%"REG_a")  \n\t"
+        "add    %3, %%"REG_a"           \n\t"
+
+        "subl   $2, %0                  \n\t"
+        "jnz    1b                      \n\t"
+        :"+g"(h), "+S"(pixels)
+        :"D"(block), "r"((x86_reg)line_size)
+        :REG_a, "memory");
+}
+
+//FIXME optimize
+static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+    DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
+    DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+
+static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+    DEF(put, pixels8_xy2)(block  , pixels  , line_size, h);
+    DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
+}
+
+static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+    DEF(avg, pixels8_y2)(block  , pixels  , line_size, h);
+    DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
+}
+
+static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+    DEF(avg, pixels8_xy2)(block  , pixels  , line_size, h);
+    DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
+}
-- 
1.7.11.3



More information about the ffmpeg-devel mailing list