[FFmpeg-cvslog] x86/vf_fspp: port inline asm to yasm

James Almer git at videolan.org
Fri Dec 26 19:45:33 CET 2014


ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Fri Dec 26 15:37:54 2014 -0300| [466e32bf25ac0a9abb216edc6670f747504685f1] | committer: James Almer

x86/vf_fspp: port inline asm to yasm

Reviewed-by: Michael Niedermayer <michaelni at gmx.at>
Signed-off-by: James Almer <jamrial at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=466e32bf25ac0a9abb216edc6670f747504685f1
---

 libavfilter/vf_fspp.c          |   12 +-
 libavfilter/vf_fspp.h          |    6 +-
 libavfilter/x86/Makefile       |    3 +-
 libavfilter/x86/vf_fspp.asm    |  727 +++++++++++++++++++++
 libavfilter/x86/vf_fspp.c      | 1409 ----------------------------------------
 libavfilter/x86/vf_fspp_init.c |   49 ++
 6 files changed, 787 insertions(+), 1419 deletions(-)

diff --git a/libavfilter/vf_fspp.c b/libavfilter/vf_fspp.c
index 2e73421..61d68ed 100644
--- a/libavfilter/vf_fspp.c
+++ b/libavfilter/vf_fspp.c
@@ -151,11 +151,11 @@ static void store_slice2_c(uint8_t *dst, int16_t *src,
     }
 }
 
-static void mul_thrmat_c(FSPPContext *p, int q)
+static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
 {
     int a;
     for (a = 0; a < 64; a++)
-        ((int16_t *)p->threshold_mtx)[a] = q * ((int16_t *)p->threshold_mtx_noq)[a];//ints faster in C
+        thr_adr[a] = q * thr_adr_noq[a];
 }
 
 static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
@@ -220,7 +220,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                     t = qp_store[qy + (t >> qpsh)];
                     t = norm_qscale(t, p->qscale_type);
 
-                    if (t != p->prev_q) p->prev_q = t, p->mul_thrmat(p, t);
+                    if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
                     p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
                 }
             p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
@@ -378,7 +378,7 @@ static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int
     }
 }
 
-static void row_idct_c(int16_t *workspace, int16_t *output_adr, int output_stride, int cnt)
+static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
 {
     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     int_simd16_t tmp10, tmp11, tmp12, tmp13;
@@ -440,7 +440,7 @@ static void row_idct_c(int16_t *workspace, int16_t *output_adr, int output_strid
     }
 }
 
-static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
+static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
 {
     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     int_simd16_t tmp10, tmp11, tmp12, tmp13;
@@ -582,7 +582,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     }
 
     if (fspp->qp)
-        fspp->prev_q = fspp->qp, fspp->mul_thrmat(fspp, fspp->qp);
+        fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
 
     /* if we are not in a constant user quantizer mode and we don't want to use
      * the quantizers from the B-frames (B-frames often have a higher QP), we
diff --git a/libavfilter/vf_fspp.h b/libavfilter/vf_fspp.h
index db860c6..237ffb1 100644
--- a/libavfilter/vf_fspp.h
+++ b/libavfilter/vf_fspp.h
@@ -79,16 +79,16 @@ typedef struct FSPPContext {
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
 
-    void (*mul_thrmat)(struct FSPPContext *fspp, int q);
+    void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
 
     void (*column_fidct)(int16_t *thr_adr, int16_t *data,
                          int16_t *output, int cnt);
 
     void (*row_idct)(int16_t *workspace, int16_t *output_adr,
-                     int output_stride, int cnt);
+                     ptrdiff_t output_stride, int cnt);
 
     void (*row_fdct)(int16_t *data, const uint8_t *pixels,
-                     int line_size, int cnt);
+                     ptrdiff_t line_size, int cnt);
 
 } FSPPContext;
 
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 4f9c83d..d9265c9 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -1,4 +1,4 @@
-OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp.o
+OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp_init.o
 OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun_init.o
 OBJS-$(CONFIG_HQDN3D_FILTER)                 += x86/vf_hqdn3d_init.o
 OBJS-$(CONFIG_IDET_FILTER)                   += x86/vf_idet_init.o
@@ -10,6 +10,7 @@ OBJS-$(CONFIG_TINTERLACE_FILTER)             += x86/vf_tinterlace_init.o
 OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
 OBJS-$(CONFIG_YADIF_FILTER)                  += x86/vf_yadif_init.o
 
+YASM-OBJS-$(CONFIG_FSPP_FILTER)              += x86/vf_fspp.o
 YASM-OBJS-$(CONFIG_GRADFUN_FILTER)           += x86/vf_gradfun.o
 YASM-OBJS-$(CONFIG_HQDN3D_FILTER)            += x86/vf_hqdn3d.o
 YASM-OBJS-$(CONFIG_IDET_FILTER)              += x86/vf_idet.o
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
new file mode 100644
index 0000000..5ad4275
--- /dev/null
+++ b/libavfilter/x86/vf_fspp.asm
@@ -0,0 +1,727 @@
+;*****************************************************************************
+;* x86-optimized functions for fspp filter
+;*
+;* Copyright (c) 2003 Michael Niedermayer <michaelni at gmx.at>
+;* Copyright (C) 2005 Nikolaj Poroshin <porosh3 at psu.ru>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_dither: db 0,  48,  12,  60,   3,  51,  15,  63, 32,  16,  44,  28,  35,  19,  47,  31, \
+              8,  56,   4,  52,  11,  59,   7,  55, 40,  24,  36,  20,  43,  27,  39,  23, \
+              2,  50,  14,  62,   1,  49,  13,  61, 34,  18,  46,  30,  33,  17,  45,  29, \
+             10,  58,   6,  54,   9,  57,   5,  53, 42,  26,  38,  22,  41,  25,  37,  21
+pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
+pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
+pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
+pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
+pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
+pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
+pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
+pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
+pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
+pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
+pw_4:    times 4 dw 4 ; rounding bias added before the >>3 in row_idct
+pw_2:    times 4 dw 2 ; rounding bias added before the >>2 in COLUMN_FDCT
+
+SECTION .text
+
+%define DCTSIZE 8
+
+INIT_MMX mmx
+
+;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
+;                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
+;                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+%if ARCH_X86_64
+cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
+%else
+cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+%define dst_strideq r2m
+%define src_strideq r3m
+    mov       widthq, r4m ; x86-32: the remaining arguments are fetched from the stack
+    mov       dither_heightq, r5m ; holds height for now, becomes the dither end pointer below
+    mov       ditherq, r6m ; log2_scale
+%endif
+    add       widthq, 7 ; with the "and" below: round width up to a multiple of 8
+    mov       tmpq, src_strideq ; tmp = src_stride, counted in int16 elements
+    and       widthq, ~7
+    sub       dst_strideq, widthq ; dst_stride = gap from the end of a written row to the next row
+    movd      m5, ditherq ; log2_scale: right shift applied to the dither values
+    xor       ditherq, -1 ; log2_scale
+    mov       tmp2q, tmpq
+    add       ditherq, 7 ; (~log2_scale) + 7 = 6 - log2_scale
+    neg       tmpq
+    sub       tmp2q, widthq
+    movd      m2, ditherq ; m2 = 6 - log2_scale: final right shift of the sums
+    add       tmp2q, tmp2q ; tmp2 = 2 * (src_stride - width): byte skip to the next src row
+    lea       ditherq, [pb_dither] ; from here on dither is a pointer into the dither table
+    mov       src_strideq, tmp2q
+    shl       tmpq, 4 ; tmp = -16 * src_stride bytes, i.e. 8 src rows back
+    lea       dither_heightq, [ditherq+dither_heightq*8] ; end pointer, one 8-byte table row per output row; NOTE(review): table is 64 bytes, so presumably height <= 8 - confirm with callers
+
+.loop_height:
+    movq      m3, [ditherq] ; 8 dither bytes for this output row
+    movq      m4, m3
+    pxor      m7, m7 ; m7 = 0, used for unpacking and for clearing src
+    punpcklbw m3, m7 ; dither bytes 0-3 widened to words
+    punpckhbw m4, m7 ; dither bytes 4-7 widened to words
+    mov       tmp2q, widthq ; tmp2 = pixels left in this row
+    psraw     m3, m5 ; dither >> log2_scale
+    psraw     m4, m5
+
+.loop_width:
+    movq      [srcq+tmpq], m7 ; clear the matching words 8 src rows back
+    movq      m0, [srcq]
+    movq      m1, [srcq+8]
+    movq      [srcq+tmpq+8], m7
+    paddw     m0, m3 ; add the scaled dither
+    paddw     m1, m4
+    movq      [srcq], m7 ; clear the words that were just read
+    psraw     m0, m2 ; >> (6 - log2_scale)
+    psraw     m1, m2
+    movq      [srcq+8], m7
+    packuswb  m0, m1 ; 8 words -> 8 bytes with unsigned saturation
+    add       srcq, 16
+    movq      [dstq], m0
+    add       dstq, 8
+    sub       tmp2q, 8 ; 8 pixels done
+    jg .loop_width
+
+    add       srcq, src_strideq ; skip to the start of the next src row
+    add       ditherq, 8 ; next row of the dither table
+    add       dstq, dst_strideq
+    cmp       ditherq, dither_heightq ; stop once height table rows were consumed
+    jl .loop_height
+    RET
+
+;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
+;                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
+;                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+%if ARCH_X86_64
+cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
+%else
+cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+%define dst_strideq r2m
+%define src_strideq r3m
+    mov       dstq, dstm ; x86-32: no arguments are auto-loaded here, fetch all of them from the stack
+    mov       srcq, srcm
+    mov       widthq, r4m
+    mov       dither_heightq, r5m ; holds height for now, becomes the dither end pointer below
+    mov       ditherq, r6m ; log2_scale
+%endif
+    add       widthq, 7 ; with the "and" below: round width up to a multiple of 8
+    mov       tmpq, src_strideq ; tmp = src_stride, counted in int16 elements
+    and       widthq, ~7
+    sub       dst_strideq, widthq ; dst_stride = gap from the end of a written row to the next row
+    movd      m5, ditherq ; log2_scale: right shift applied to the dither values
+    xor       ditherq, -1 ; log2_scale
+    mov       tmp2q, tmpq
+    add       ditherq, 7 ; (~log2_scale) + 7 = 6 - log2_scale
+    sub       tmp2q, widthq
+    movd      m2, ditherq ; m2 = 6 - log2_scale: final right shift of the sums
+    add       tmp2q, tmp2q ; tmp2 = 2 * (src_stride - width): byte skip to the next src row
+    lea       ditherq, [pb_dither] ; from here on dither is a pointer into the dither table
+    mov       src_strideq, tmp2q
+    shl       tmpq, 5 ; tmp = 32 * src_stride bytes, i.e. 16 src rows ahead
+    lea       dither_heightq, [ditherq+dither_heightq*8] ; end pointer, one 8-byte table row per output row; NOTE(review): table is 64 bytes, so presumably height <= 8 - confirm with callers
+
+.loop_height:
+    movq      m3, [ditherq] ; 8 dither bytes for this output row
+    movq      m4, m3
+    pxor      m7, m7 ; m7 = 0, used for unpacking and for clearing src
+    punpcklbw m3, m7 ; dither bytes 0-3 widened to words
+    punpckhbw m4, m7 ; dither bytes 4-7 widened to words
+    mov       tmp2q,widthq
+    psraw     m3, m5 ; dither >> log2_scale
+    psraw     m4, m5
+
+.loop_width:
+    movq      m0, [srcq]
+    movq      m1, [srcq+8]
+    paddw     m0, m3 ; add the scaled dither
+    paddw     m0, [srcq+tmpq] ; add the matching words 16 src rows ahead
+    paddw     m1, m4
+    movq      m6, [srcq+tmpq+8]
+    movq      [srcq+tmpq], m7 ; only the row 16 ahead is cleared, [srcq] itself is left intact
+    psraw     m0, m2 ; >> (6 - log2_scale)
+    paddw     m1, m6
+    movq      [srcq+tmpq+8], m7
+    psraw     m1, m2
+    packuswb  m0, m1 ; 8 words -> 8 bytes with unsigned saturation
+    movq      [dstq], m0
+    add       srcq, 16
+    add       dstq, 8
+    sub       tmp2q, 8 ; 8 pixels done
+    jg .loop_width
+
+    add       srcq, src_strideq ; skip to the start of the next src row
+    add       ditherq, 8 ; next row of the dither table
+    add       dstq, dst_strideq
+    cmp       ditherq, dither_heightq ; stop once height table rows were consumed
+    jl .loop_height
+    RET
+
+;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
+    movd      m7, qd ; m7 = q in the low dword
+    movq      m0, [thrnq]
+    punpcklwd m7, m7 ; with the punpckldq below: broadcast the low word of q to all 4 words
+    movq      m1, [thrnq+8]
+    punpckldq m7, m7
+    pmullw    m0, m7 ; low 16 bits of each product, 4 words at a time
+    movq      m2, [thrnq+8*2]
+    pmullw    m1, m7
+    movq      m3, [thrnq+8*3]
+    pmullw    m2, m7
+    movq      [thrq], m0 ; loads, multiplies and stores are interleaved across 16 qwords (64 words in total)
+    movq      m4, [thrnq+8*4]
+    pmullw    m3, m7
+    movq      [thrq+8], m1
+    movq      m5, [thrnq+8*5]
+    pmullw    m4, m7
+    movq      [thrq+8*2], m2
+    movq      m6, [thrnq+8*6]
+    pmullw    m5, m7
+    movq      [thrq+8*3], m3
+    movq      m0, [thrnq+8*7]
+    pmullw    m6, m7
+    movq      [thrq+8*4], m4
+    movq      m1, [thrnq+8*7+8]
+    pmullw    m0, m7
+    movq      [thrq+8*5], m5
+    movq      m2, [thrnq+8*7+8*2]
+    pmullw    m1, m7
+    movq      [thrq+8*6], m6
+    movq      m3, [thrnq+8*7+8*3]
+    pmullw    m2, m7
+    movq      [thrq+8*7], m0
+    movq      m4, [thrnq+8*7+8*4]
+    pmullw    m3, m7
+    movq      [thrq+8*7+8], m1
+    movq      m5, [thrnq+8*7+8*5]
+    pmullw    m4, m7
+    movq      [thrq+8*7+8*2], m2
+    movq      m6, [thrnq+8*7+8*6]
+    pmullw    m5, m7
+    movq      [thrq+8*7+8*3], m3
+    movq      m0, [thrnq+14*8]
+    pmullw    m6, m7
+    movq      [thrq+8*7+8*4], m4
+    movq      m1, [thrnq+14*8+8] ; last qword: bytes 120-127 of the 128-byte matrix
+    pmullw    m0, m7
+    movq      [thrq+8*7+8*5], m5
+    pmullw    m1, m7
+    movq      [thrq+8*7+8*6], m6
+    movq      [thrq+14*8], m0
+    movq      [thrq+14*8+8], m1
+    RET
+
+%macro COLUMN_FDCT 1-3 0, 0
+    movq      m1, [srcq+DCTSIZE*0*2] ; row 0 of 4 adjacent int16 columns (rows are DCTSIZE*2 bytes apart)
+    movq      m7, [srcq+DCTSIZE*3*2]
+    movq      m0, m1 ; keep row 0 for the row0 - row7 difference taken further down
+    paddw     m1, [srcq+DCTSIZE*7*2] ; row0 + row7
+    movq      m3, m7 ; keep row 3 for row3 - row4
+    paddw     m7, [srcq+DCTSIZE*4*2] ; row3 + row4
+    movq      m5, m1
+    movq      m6, [srcq+DCTSIZE*1*2]
+    psubw     m1, m7
+    movq      m2, [srcq+DCTSIZE*2*2]
+    movq      m4, m6 ; keep row 1 for row1 - row6
+    paddw     m6, [srcq+DCTSIZE*6*2] ; row1 + row6
+    paddw     m5, m7
+    paddw     m2, [srcq+DCTSIZE*5*2] ; row2 + row5
+    movq      m7, m6
+    paddw     m6, m2
+    psubw     m7, m2
+    movq      m2, m5
+    paddw     m5, m6
+    psubw     m2, m6
+    paddw     m7, m1
+    movq      m6, [thrq+4*16+%2] ; threshold row 4 (threshold rows are 16 bytes apart)
+    psllw     m7, 2
+    psubw     m5, [thrq+%2] ; NOTE(review): the psubw/paddusw/paddw/psubusw sequence against one threshold looks like a dead-zone clamp - confirm
+    psubw     m2, m6
+    paddusw   m5, [thrq+%2]
+    paddusw   m2, m6
+    pmulhw    m7, [pw_2D41]
+    paddw     m5, [thrq+%2]
+    paddw     m2, m6
+    psubusw   m5, [thrq+%2]
+    psubusw   m2, m6
+    paddw     m5, [pw_2] ; rounding bias for the >>2 shifts below
+    movq      m6, m2
+    paddw     m2, m5
+    psubw     m5, m6
+    movq      m6, m1
+    paddw     m1, m7
+    psubw     m1, [thrq+2*16+%2]
+    psubw     m6, m7
+    movq      m7, [thrq+6*16+%2]
+    psraw     m5, 2
+    paddusw   m1, [thrq+2*16+%2]
+    psubw     m6, m7
+    paddw     m1, [thrq+2*16+%2]
+    paddusw   m6, m7
+    psubusw   m1, [thrq+2*16+%2]
+    paddw     m6, m7
+    psubw     m3, [srcq+DCTSIZE*4*2] ; row3 - row4
+    psubusw   m6, m7
+    movq      m7, m1
+    psraw     m2, 2
+    psubw     m4, [srcq+DCTSIZE*6*2] ; row1 - row6
+    psubw     m1, m6
+    psubw     m0, [srcq+DCTSIZE*7*2] ; row0 - row7
+    paddw     m6, m7
+    psraw     m6, 2
+    movq      m7, m2
+    pmulhw    m1, [pw_5A82]
+    paddw     m2, m6
+    movq      [rsp], m2 ; spill into the stack scratch reserved by the enclosing cglobal
+    psubw     m7, m6
+    movq      m2, [srcq+DCTSIZE*2*2]
+    psubw     m1, m6
+    psubw     m2, [srcq+DCTSIZE*5*2] ; row2 - row5
+    movq      m6, m5
+    movq      [rsp+8*3], m7 ; second spill slot used: bytes 24-31
+    paddw     m3, m2
+    paddw     m2, m4
+    paddw     m4, m0
+    movq      m7, m3
+    psubw     m3, m4
+    psllw     m3, 2
+    psllw     m7, 2
+    pmulhw    m3, [pw_187E]
+    psllw     m4, 2
+    pmulhw    m7, [pw_22A3]
+    psllw     m2, 2
+    pmulhw    m4, [pw_539F]
+    paddw     m5, m1
+    pmulhw    m2, [pw_2D41]
+    psubw     m6, m1
+    paddw     m7, m3
+    movq      [rsp+8], m5 ; spill slot at bytes 8-15
+    paddw     m4, m3
+    movq      m3, [thrq+3*16+%2]
+    movq      m1, m0
+    movq      [rsp+8*2], m6 ; spill slot at bytes 16-23
+    psubw     m1, m2
+    paddw     m0, m2
+    movq      m5, m1
+    movq      m2, [thrq+5*16+%2]
+    psubw     m1, m7
+    paddw     m5, m7
+    psubw     m1, m3
+    movq      m7, [thrq+16+%2]
+    psubw     m5, m2
+    movq      m6, m0
+    paddw     m0, m4
+    paddusw   m1, m3
+    psubw     m6, m4
+    movq      m4, [thrq+7*16+%2]
+    psubw     m0, m7
+    psubw     m6, m4
+    paddusw   m5, m2
+    paddusw   m6, m4
+    paddw     m1, m3
+    paddw     m5, m2
+    paddw     m6, m4
+    psubusw   m1, m3
+    psubusw   m5, m2
+    psubusw   m6, m4
+    movq      m4, m1
+    por       m4, m5 ; m4 = OR of the three values thresholded against rows 3, 5 and 7
+    paddusw   m0, m7
+    por       m4, m6
+    paddw     m0, m7
+    packssdw  m4, m4 ; squeeze to 32 bits; a nonzero dword stays nonzero after saturation
+    psubusw   m0, m7
+    movd      tmpd, m4
+    or        tmpd, tmpd ; sets ZF when all three values are zero
+    jnz %1 ; something survived: leave for the label given as first macro argument
+    movq      m4, [rsp] ; short path: of the odd terms only m0 is used from here on
+    movq      m1, m0
+    pmulhw    m0, [pw_3642]
+    movq      m2, m1
+    movq      m5, [outq+DCTSIZE*0*2]
+    movq      m3, m2
+    pmulhw    m1, [pw_2441]
+    paddw     m5, m4
+    movq      m6, [rsp+8]
+    psraw     m3, 2
+    pmulhw    m2, [pw_0CBB]
+    psubw     m4, m3
+    movq      m7, [outq+DCTSIZE*1*2]
+    paddw     m5, m3
+    movq      [outq+DCTSIZE*7*2], m4 ; row 7 is stored without adding the previous out contents
+    paddw     m7, m6
+    movq      m3, [rsp+8*2]
+    psubw     m6, m0
+    movq      m4, [outq+DCTSIZE*2*2]
+    paddw     m7, m0
+    movq      [outq], m5 ; row 0: previous out contents plus the new value
+    paddw     m4, m3
+    movq      [outq+DCTSIZE*6*2], m6 ; row 6 is stored without adding the previous out contents
+    psubw     m3, m1
+    movq      m5, [outq+DCTSIZE*5*2]
+    paddw     m4, m1
+    movq      m6, [outq+DCTSIZE*3*2]
+    paddw     m5, m3
+    movq      m0, [rsp+8*3]
+    add       srcq, 8+%3 ; next 4 columns, plus the optional extra advance (third macro argument)
+    movq      [outq+DCTSIZE*1*2], m7
+    paddw     m6, m0
+    movq      [outq+DCTSIZE*2*2], m4
+    psubw     m0, m2
+    movq      m7, [outq+DCTSIZE*4*2]
+    paddw     m6, m2
+    movq      [outq+DCTSIZE*5*2], m5
+    paddw     m7, m0
+    movq      [outq+DCTSIZE*3*2], m6
+    movq      [outq+DCTSIZE*4*2], m7
+    add       outq, 8+%3 ; same advance for the output pointer
+%endmacro
+
+%macro COLUMN_IDCT 0-1 0
+    movq      m3, m5 ; entered with m0, m1, m5, m6 and the 32-byte stack scratch as left by COLUMN_FDCT
+    psubw     m5, m1
+    psllw     m5, 1
+    paddw     m3, m1
+    movq      m2, m0
+    psubw     m0, m6
+    movq      m1, m5
+    psllw     m0, 1
+    pmulhw    m1, [pw_AC62]
+    paddw     m5, m0
+    pmulhw    m5, [pw_3B21]
+    paddw     m2, m6
+    pmulhw    m0, [pw_22A3]
+    movq      m7, m2
+    movq      m4, [rsp] ; reload the first spilled value
+    psubw     m2, m3
+    psllw     m2, 1
+    paddw     m7, m3
+    pmulhw    m2, [pw_2D41]
+    movq      m6, m4
+    psraw     m7, 2
+    paddw     m4, [outq] ; row 0: add the previous out contents
+    psubw     m6, m7
+    movq      m3, [rsp+8]
+    paddw     m4, m7
+    movq      [outq+DCTSIZE*7*2], m6 ; row 7 is stored without adding the previous out contents
+    paddw     m1, m5
+    movq      [outq], m4
+    psubw     m1, m7
+    movq      m7, [rsp+8*2]
+    psubw     m0, m5
+    movq      m6, [rsp+8*3]
+    movq      m5, m3
+    paddw     m3, [outq+DCTSIZE*1*2]
+    psubw     m5, m1
+    psubw     m2, m1
+    paddw     m3, m1
+    movq      [outq+DCTSIZE*6*2], m5 ; row 6 is stored without adding the previous out contents
+    movq      m4, m7
+    paddw     m7, [outq+DCTSIZE*2*2]
+    psubw     m4, m2
+    paddw     m4, [outq+DCTSIZE*5*2]
+    paddw     m7, m2
+    movq      [outq+DCTSIZE*1*2], m3
+    paddw     m0, m2
+    movq      [outq+DCTSIZE*2*2], m7
+    movq      m1, m6
+    paddw     m6, [outq+DCTSIZE*4*2]
+    psubw     m1, m0
+    paddw     m1, [outq+DCTSIZE*3*2]
+    paddw     m6, m0
+    movq      [outq+DCTSIZE*5*2], m4
+    add       srcq, 8+%1 ; next 4 columns, plus the optional extra advance (macro argument)
+    movq      [outq+DCTSIZE*4*2], m6
+    movq      [outq+DCTSIZE*3*2], m1
+    add       outq, 8+%1 ; same advance for the output pointer
+%endmacro
+
+;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
+.fdct1: ; first pass: threshold offset 0, src/out advance by 8 bytes
+    COLUMN_FDCT .idct1
+    jmp .fdct2 ; short path of the first pass finished, skip its full inverse transform
+
+.idct1: ; reached only through the jnz inside the first COLUMN_FDCT
+    COLUMN_IDCT
+
+.fdct2: ; second pass: threshold offset 8 bytes, src/out advance by 8+16 bytes
+    COLUMN_FDCT .idct2, 8, 16
+    sub    cntd, 2 ; one trip through both passes consumes 2 from cnt
+    jnz .fdct1
+    RET
+
+.idct2: ; reached only through the jnz inside the second COLUMN_FDCT
+    COLUMN_IDCT 16
+    sub    cntd, 2 ; same loop tail as after the short path above
+    jnz .fdct1
+    RET
+
+;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
+cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
+    add       strideq, strideq ; stride: int16 elements -> bytes
+    lea       stride3q, [strideq+strideq*2] ; 3 output rows in bytes
+.loop:
+    movq      m0, [srcq+DCTSIZE*0*2] ; first 4 words of workspace rows 0-3, transposed by the unpacks below
+    movq      m1, [srcq+DCTSIZE*1*2]
+    movq      m4, m0
+    movq      m2, [srcq+DCTSIZE*2*2]
+    punpcklwd m0, m1
+    movq      m3, [srcq+DCTSIZE*3*2]
+    punpckhwd m4, m1
+    movq      m7, m2
+    punpcklwd m2, m3
+    movq      m6, m0
+    punpckldq m0, m2
+    punpckhdq m6, m2
+    movq      m5, m0
+    punpckhwd m7, m3
+    psubw     m0, m6
+    pmulhw    m0, [pw_5A82]
+    movq      m2, m4
+    punpckldq m4, m7
+    paddw     m5, m6
+    punpckhdq m2, m7
+    movq      m1, m4
+    psllw     m0, 2
+    paddw     m4, m2
+    movq      m3, [srcq+DCTSIZE*0*2+8] ; last 4 words of workspace rows 0-3
+    psubw     m1, m2
+    movq      m2, [srcq+DCTSIZE*1*2+8]
+    psubw     m0, m5
+    movq      m6, m4
+    paddw     m4, m5
+    psubw     m6, m5
+    movq      m7, m1
+    movq      m5, [srcq+DCTSIZE*2*2+8]
+    paddw     m1, m0
+    movq      [rsp], m4 ; spill into the 16-byte stack scratch
+    movq      m4, m3
+    movq      [rsp+8], m6 ; second spill slot
+    punpcklwd m3, m2
+    movq      m6, [srcq+DCTSIZE*3*2+8]
+    punpckhwd m4, m2
+    movq      m2, m5
+    punpcklwd m5, m6
+    psubw     m7, m0
+    punpckhwd m2, m6
+    movq      m0, m3
+    punpckldq m3, m5
+    punpckhdq m0, m5
+    movq      m5, m4
+    movq      m6, m3
+    punpckldq m4, m2
+    psubw     m3, m0
+    punpckhdq m5, m2
+    paddw     m6, m0
+    movq      m2, m4
+    movq      m0, m3
+    psubw     m4, m5
+    pmulhw    m0, [pw_AC62]
+    paddw     m3, m4
+    pmulhw    m3, [pw_3B21]
+    paddw     m2, m5
+    pmulhw    m4, [pw_22A3]
+    movq      m5, m2
+    psubw     m2, m6
+    paddw     m5, m6
+    pmulhw    m2, [pw_2D41]
+    paddw     m0, m3
+    psllw     m0, 3
+    psubw     m4, m3
+    movq      m6, [rsp]
+    movq      m3, m1
+    psllw     m4, 3
+    psubw     m0, m5
+    psllw     m2, 3
+    paddw     m1, m0
+    psubw     m2, m0
+    psubw     m3, m0
+    paddw     m4, m2
+    movq      m0, m7
+    paddw     m7, m2
+    psubw     m0, m2
+    movq      m2, [pw_4] ; rounding bias for the >>3 shifts below
+    psubw     m6, m5
+    paddw     m5, [rsp]
+    paddw     m1, m2
+    paddw     m5, m2
+    psraw     m1, 3
+    paddw     m7, m2
+    psraw     m5, 3
+    paddw     m5, [dstq] ; accumulate into output row 0
+    psraw     m7, 3
+    paddw     m1, [dstq+strideq*1] ; output row 1
+    paddw     m0, m2
+    paddw     m7, [dstq+strideq*2] ; output row 2
+    paddw     m3, m2
+    movq      [dstq], m5
+    paddw     m6, m2
+    movq      [dstq+strideq*1], m1
+    psraw     m0, 3
+    movq      [dstq+strideq*2], m7
+    add       dstq, stride3q ; dst now points at output row 3
+    movq      m5, [rsp+8]
+    psraw     m3, 3
+    paddw     m0, [dstq+strideq*2] ; output row 5
+    psubw     m5, m4
+    paddw     m3, [dstq+stride3q*1] ; output row 6
+    psraw     m6, 3
+    paddw     m4, [rsp+8]
+    paddw     m5, m2
+    paddw     m6, [dstq+strideq*4] ; output row 7
+    paddw     m4, m2
+    movq      [dstq+strideq*2], m0
+    psraw     m5, 3
+    paddw     m5, [dstq] ; output row 3
+    psraw     m4, 3
+    paddw     m4, [dstq+strideq*1] ; output row 4
+    add       srcq, DCTSIZE*2*4 ; next 4 workspace rows (64 bytes)
+    movq      [dstq+stride3q*1], m3
+    movq      [dstq+strideq*4], m6
+    movq      [dstq], m5
+    movq      [dstq+strideq*1], m4
+    sub       dstq, stride3q ; back to output row 0 ...
+    add       dstq, 8 ; ... and 4 int16 columns to the right
+    dec       r3d ; r3 is the cnt argument
+    jnz .loop
+    RET
+
+;void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
+cglobal row_fdct, 4, 5, 0, 16, src, pix, stride, cnt, stride3
+    lea       stride3q, [strideq+strideq*2] ; 3 pixel rows in bytes
+.loop:
+    movd      m0, [pixq] ; 4 pixels of row 0
+    pxor      m7, m7 ; m7 = 0 for widening bytes to words
+    movd      m1, [pixq+strideq*1] ; row 1
+    punpcklbw m0, m7
+    movd      m2, [pixq+strideq*2] ; row 2
+    punpcklbw m1, m7
+    punpcklbw m2, m7
+    add       pixq,stride3q
+    movq      m5, m0
+    movd      m3, [pixq+strideq*4] ; row 7 (pix points at row 3 now)
+    movq      m6, m1
+    movd      m4, [pixq+stride3q*1] ; row 6
+    punpcklbw m3, m7
+    psubw     m5, m3
+    punpcklbw m4, m7
+    paddw     m0, m3
+    psubw     m6, m4
+    movd      m3, [pixq+strideq*2] ; row 5
+    paddw     m1, m4
+    movq      [rsp], m5 ; spill row0 - row7 into the 16-byte stack scratch
+    punpcklbw m3, m7
+    movq      [rsp+8], m6 ; spill row1 - row6
+    movq      m4, m2
+    movd      m5, [pixq] ; row 3
+    paddw     m2, m3
+    movd      m6, [pixq+strideq*1] ; row 4
+    punpcklbw m5, m7
+    psubw     m4, m3
+    punpcklbw m6, m7
+    movq      m3, m5
+    paddw     m5, m6
+    psubw     m3, m6
+    movq      m6, m0
+    movq      m7, m1
+    psubw     m0, m5
+    psubw     m1, m2
+    paddw     m7, m2
+    paddw     m1, m0
+    movq      m2, m7
+    psllw     m1, 2
+    paddw     m6, m5
+    pmulhw    m1, [pw_2D41]
+    paddw     m7, m6
+    psubw     m6, m2
+    movq      m5, m0
+    movq      m2, m7
+    punpcklwd m7, m6
+    paddw     m0, m1
+    punpckhwd m2, m6
+    psubw     m5, m1
+    movq      m6, m0
+    movq      m1, [rsp+8]
+    punpcklwd m0, m5
+    punpckhwd m6, m5
+    movq      m5, m0
+    punpckldq m0, m7
+    paddw     m3, m4
+    punpckhdq m5, m7
+    movq      m7, m6
+    movq      [srcq+DCTSIZE*0*2], m0 ; the register named src is the int16 output (the data argument)
+    punpckldq m6, m2
+    movq      [srcq+DCTSIZE*1*2], m5
+    punpckhdq m7, m2
+    movq      [srcq+DCTSIZE*2*2], m6
+    paddw     m4, m1
+    movq      [srcq+DCTSIZE*3*2], m7
+    psllw     m3, 2
+    movq      m2, [rsp]
+    psllw     m4, 2
+    pmulhw    m4, [pw_2D41]
+    paddw     m1, m2
+    psllw     m1, 2
+    movq      m0, m3
+    pmulhw    m0, [pw_22A3]
+    psubw     m3, m1
+    pmulhw    m3, [pw_187E]
+    movq      m5, m2
+    pmulhw    m1, [pw_539F]
+    psubw     m2, m4
+    paddw     m5, m4
+    movq      m6, m2
+    paddw     m0, m3
+    movq      m7, m5
+    paddw     m2, m0
+    psubw     m6, m0
+    movq      m4, m2
+    paddw     m1, m3
+    punpcklwd m2, m6
+    paddw     m5, m1
+    punpckhwd m4, m6
+    psubw     m7, m1
+    movq      m6, m5
+    punpcklwd m5, m7
+    punpckhwd m6, m7
+    movq      m7, m2
+    punpckldq m2, m5
+    sub       pixq, stride3q ; back to pixel row 0
+    punpckhdq m7, m5
+    movq      m5, m4
+    movq      [srcq+DCTSIZE*0*2+8], m2 ; second half (words 4-7) of the 4 output rows
+    punpckldq m4, m6
+    movq      [srcq+DCTSIZE*1*2+8], m7
+    punpckhdq m5, m6
+    movq      [srcq+DCTSIZE*2*2+8], m4
+    add       pixq, 4 ; next 4 pixel columns
+    movq      [srcq+DCTSIZE*3*2+8], m5
+    add       srcq, DCTSIZE*4*2 ; next 4 output rows (64 bytes)
+    dec       cntd
+    jnz .loop
+    RET
diff --git a/libavfilter/x86/vf_fspp.c b/libavfilter/x86/vf_fspp.c
deleted file mode 100644
index ec24a1e..0000000
--- a/libavfilter/x86/vf_fspp.c
+++ /dev/null
@@ -1,1409 +0,0 @@
-/*
- * Copyright (c) 2003 Michael Niedermayer <michaelni at gmx.at>
- * Copyright (C) 2005 Nikolaj Poroshin <porosh3 at psu.ru>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/mem.h"
-#include "libavutil/x86/asm.h"
-#include "libavfilter/vf_fspp.h"
-
-#if HAVE_MMX_INLINE
-DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = { // 8x8 ordered-dither table containing each value 0..63 exactly once; the store_slice functions below read one 8-byte row per output row
-    {  0,  48,  12,  60,   3,  51,  15,  63, },
-    { 32,  16,  44,  28,  35,  19,  47,  31, },
-    {  8,  56,   4,  52,  11,  59,   7,  55, },
-    { 40,  24,  36,  20,  43,  27,  39,  23, },
-    {  2,  50,  14,  62,   1,  49,  13,  61, },
-    { 34,  18,  46,  30,  33,  17,  45,  29, },
-    { 10,  58,   6,  54,   9,  57,   5,  53, },
-    { 42,  26,  38,  22,  41,  25,  37,  21, },
-};
-
-//Adds dither to 16-bit samples from src, shifts them down and packs them to 8-bit dst; zeroes every src row it reads and the row 8 src rows before it (original note: reads from 1 slice, 1 and clears 0 & 1)
-static void store_slice_mmx(uint8_t *dst, int16_t *src, // dst: 8-bit output; src: 16-bit input, overwritten with zeros as it is consumed
-                            ptrdiff_t dst_stride, ptrdiff_t src_stride, // dst_stride is applied in bytes; src_stride is in int16_t units (the asm doubles it to bytes)
-                            ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) // one dither row is consumed per output row, for height rows
-{
-    const uint8_t *od = &dither[0][0]; // first dither row
-    const uint8_t *end = &dither[height][0]; // row loop stops here; NOTE(review): dither has 8 rows, so this presumably requires height <= 8 - confirm against callers
-    width = (width + 7) & ~7; // round up to whole groups of 8 pixels, so up to 7 pixels beyond the requested width are read and written per row
-    dst_stride -= width; // bytes from the end of one output row to the start of the next
-
-    __asm__ volatile(
-        "mov %5 , %%"REG_d"                \n\t" // REG_d = log2_scale
-        "mov %6 , %%"REG_S"                \n\t" // REG_S = src
-        "mov %7 , %%"REG_D"                \n\t" // REG_D = dst
-        "mov %1 , %%"REG_a"                \n\t" // REG_a = src_stride
-        "movd %%"REG_d" , %%mm5            \n\t" // mm5 = log2_scale, shift count applied to the dither values
-        "xor $-1 , %%"REG_d"               \n\t" // REG_d = ~log2_scale
-        "mov %%"REG_a" , %%"REG_c"         \n\t"
-        "add $7 , %%"REG_d"                \n\t" // REG_d = 6 - log2_scale
-        "neg %%"REG_a"                     \n\t"
-        "sub %0 , %%"REG_c"                \n\t" // REG_c = src_stride - width
-        "add %%"REG_c" , %%"REG_c"         \n\t" // ... converted to bytes (2 per sample)
-        "movd %%"REG_d" , %%mm2            \n\t" // mm2 = 6 - log2_scale, final right-shift count
-        "mov %%"REG_c" , %1                \n\t" // keep the src row step in the src_stride slot; NOTE(review): %1 is declared as an input operand but is written here
-        "mov %2 , %%"REG_d"                \n\t" // REG_d = current dither row pointer
-        "shl $4 , %%"REG_a"                \n\t" // REG_a = -16*src_stride bytes, i.e. 8 src rows back
-
-        "2:                                \n\t" // per-row loop
-        "movq (%%"REG_d") , %%mm3          \n\t" // load the 8 dither bytes for this row
-        "movq %%mm3 , %%mm4                \n\t"
-        "pxor %%mm7 , %%mm7                \n\t" // mm7 = 0, used for unpacking and for clearing src
-        "punpcklbw %%mm7 , %%mm3           \n\t" // dither bytes 0-3 widened to words
-        "punpckhbw %%mm7 , %%mm4           \n\t" // dither bytes 4-7 widened to words
-        "mov %0 , %%"REG_c"                \n\t" // REG_c = remaining width for this row
-        "psraw %%mm5 , %%mm3               \n\t" // dither >>= log2_scale
-        "psraw %%mm5 , %%mm4               \n\t"
-        "1:                                \n\t" // per-8-pixel loop
-        "movq %%mm7, (%%"REG_S",%%"REG_a") \n\t" // zero samples 0-3 of the row 8 rows back
-        "movq (%%"REG_S") , %%mm0          \n\t" // samples 0-3
-        "movq 8(%%"REG_S"), %%mm1          \n\t" // samples 4-7
-
-        "movq %%mm7, 8(%%"REG_S",%%"REG_a")\n\t" // zero samples 4-7 of the row 8 rows back
-        "paddw %%mm3, %%mm0                \n\t" // add dither
-        "paddw %%mm4, %%mm1                \n\t"
-
-        "movq %%mm7, (%%"REG_S")           \n\t" // zero the samples just read
-        "psraw %%mm2, %%mm0                \n\t" // >>= 6 - log2_scale
-        "psraw %%mm2, %%mm1                \n\t"
-
-        "movq %%mm7, 8(%%"REG_S")          \n\t"
-        "packuswb %%mm1, %%mm0             \n\t" // pack 8 words to bytes with unsigned saturation
-        "add $16, %%"REG_S"                \n\t"
-
-        "movq %%mm0, (%%"REG_D")           \n\t" // store 8 output pixels
-        "add $8, %%"REG_D"                 \n\t"
-        "sub $8, %%"REG_c"                 \n\t"
-        "jg 1b                             \n\t" // more pixels left in this row
-        "add %1, %%"REG_S"                 \n\t" // advance src to the next row (byte step stored in %1 above)
-        "add $8, %%"REG_d"                 \n\t" // next dither row
-        "add %3, %%"REG_D"                 \n\t" // advance dst to the next row
-        "cmp %4, %%"REG_d"                 \n\t"
-        "jl 2b                             \n\t" // loop until the dither pointer reaches end
-
-        :
-        : "m" (width),      "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
-          "m" (log2_scale), "m" (src),        "m" (dst)                                     //input
-        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D // NOTE(review): only general-purpose registers are listed; there is no memory clobber although the asm stores through REG_S and REG_D
-        );
-}
-
-//Sums each src row with the row 16 src rows after it, adds dither, shifts down and packs to 8-bit dst; zeroes only the later row (original note: reads from 2 slices, 0 & 2 and clears 2-nd)
-static void store_slice2_mmx(uint8_t *dst, int16_t *src, // dst: 8-bit output; src: 16-bit input, the rows 16 rows ahead are overwritten with zeros
-                             ptrdiff_t dst_stride, ptrdiff_t src_stride, // dst_stride is applied in bytes; src_stride is in int16_t units (the asm doubles it to bytes)
-                             ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) // one dither row is consumed per output row, for height rows
-{
-    const uint8_t *od = &dither[0][0]; // first dither row
-    const uint8_t *end = &dither[height][0]; // row loop stops here; NOTE(review): dither has 8 rows, so this presumably requires height <= 8 - confirm against callers
-    width = (width + 7) & ~7; // round up to whole groups of 8 pixels, so up to 7 pixels beyond the requested width are read and written per row
-    dst_stride -= width; // bytes from the end of one output row to the start of the next
-
-    __asm__ volatile(
-        "mov %5, %%"REG_d"                \n\t" // REG_d = log2_scale
-        "mov %6, %%"REG_S"                \n\t" // REG_S = src
-        "mov %7, %%"REG_D"                \n\t" // REG_D = dst
-        "mov %1, %%"REG_a"                \n\t" // REG_a = src_stride
-        "movd %%"REG_d", %%mm5            \n\t" // mm5 = log2_scale, shift count applied to the dither values
-        "xor $-1, %%"REG_d"               \n\t" // REG_d = ~log2_scale
-        "mov %%"REG_a", %%"REG_c"         \n\t"
-        "add $7, %%"REG_d"                \n\t" // REG_d = 6 - log2_scale
-        "sub %0, %%"REG_c"                \n\t" // REG_c = src_stride - width
-        "add %%"REG_c", %%"REG_c"         \n\t" // ... converted to bytes (2 per sample)
-        "movd %%"REG_d", %%mm2            \n\t" // mm2 = 6 - log2_scale, final right-shift count
-        "mov %%"REG_c", %1                \n\t" // keep the src row step in the src_stride slot; NOTE(review): %1 is declared as an input operand but is written here
-        "mov %2, %%"REG_d"                \n\t" // REG_d = current dither row pointer
-        "shl $5, %%"REG_a"                \n\t" // REG_a = 32*src_stride bytes, i.e. 16 src rows ahead
-
-        "2:                               \n\t" // per-row loop
-        "movq (%%"REG_d"), %%mm3          \n\t" // load the 8 dither bytes for this row
-        "movq %%mm3, %%mm4                \n\t"
-        "pxor %%mm7, %%mm7                \n\t" // mm7 = 0, used for unpacking and for clearing src
-        "punpcklbw %%mm7, %%mm3           \n\t" // dither bytes 0-3 widened to words
-        "punpckhbw %%mm7, %%mm4           \n\t" // dither bytes 4-7 widened to words
-        "mov %0, %%"REG_c"                \n\t" // REG_c = remaining width for this row
-        "psraw %%mm5, %%mm3               \n\t" // dither >>= log2_scale
-        "psraw %%mm5, %%mm4               \n\t"
-        "1:                               \n\t" // per-8-pixel loop
-        "movq (%%"REG_S"), %%mm0          \n\t" // samples 0-3 of the current row
-        "movq 8(%%"REG_S"), %%mm1         \n\t" // samples 4-7 of the current row
-        "paddw %%mm3, %%mm0               \n\t" // add dither
-
-        "paddw (%%"REG_S",%%"REG_a"),%%mm0\n\t" // add samples 0-3 of the row 16 rows ahead
-        "paddw %%mm4, %%mm1               \n\t"
-        "movq 8(%%"REG_S",%%"REG_a"),%%mm6\n\t" // samples 4-7 of the row 16 rows ahead
-
-        "movq %%mm7, (%%"REG_S",%%"REG_a")\n\t" // zero the row 16 rows ahead (the current row is left untouched)
-        "psraw %%mm2, %%mm0               \n\t" // >>= 6 - log2_scale
-        "paddw %%mm6, %%mm1               \n\t"
-
-        "movq %%mm7,8(%%"REG_S",%%"REG_a")\n\t"
-        "psraw %%mm2, %%mm1               \n\t"
-        "packuswb %%mm1, %%mm0            \n\t" // pack 8 words to bytes with unsigned saturation
-
-        "movq %%mm0, (%%"REG_D")          \n\t" // store 8 output pixels
-        "add $16, %%"REG_S"               \n\t"
-        "add $8, %%"REG_D"                \n\t"
-        "sub $8, %%"REG_c"                \n\t"
-        "jg 1b                            \n\t" // more pixels left in this row
-        "add %1, %%"REG_S"                \n\t" // advance src to the next row (byte step stored in %1 above)
-        "add $8, %%"REG_d"                \n\t" // next dither row
-        "add %3, %%"REG_D"                \n\t" // advance dst to the next row
-        "cmp %4, %%"REG_d"                \n\t"
-        "jl 2b                            \n\t" // loop until the dither pointer reaches end
-
-        :
-        : "m" (width),      "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
-          "m" (log2_scale), "m" (src),        "m" (dst)                                     //input
-        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S // NOTE(review): only general-purpose registers are listed; there is no memory clobber although the asm stores through REG_S and REG_D
-        );
-}
-
-static void mul_thrmat_mmx(FSPPContext *p, int q) // multiplies the 64 16-bit entries starting at p->threshold_mtx_noq by q and stores the products 128 bytes after that table
-{
-    uint64_t *adr = &p->threshold_mtx_noq[0]; // source table; the destination address is derived from the same pointer inside the asm
-
-    __asm__ volatile(
-        "movd %0, %%mm7                   \n\t" // mm7 = q
-        "add $8*8*2, %%"REG_D"            \n\t" // destination = adr + 128 bytes; presumably p->threshold_mtx directly follows threshold_mtx_noq in FSPPContext - TODO confirm
-        "movq 0*8(%%"REG_S"), %%mm0       \n\t"
-        "punpcklwd %%mm7, %%mm7           \n\t" // this and the punpckldq below copy the low 16 bits of q into all four words of mm7
-        "movq 1*8(%%"REG_S"), %%mm1       \n\t"
-        "punpckldq %%mm7, %%mm7           \n\t"
-        "pmullw %%mm7, %%mm0              \n\t" // pmullw keeps the low 16 bits of each word product
-
-        "movq 2*8(%%"REG_S"), %%mm2       \n\t"
-        "pmullw %%mm7, %%mm1              \n\t"
-
-        "movq 3*8(%%"REG_S"), %%mm3       \n\t"
-        "pmullw %%mm7, %%mm2              \n\t"
-
-        "movq %%mm0, 0*8(%%"REG_D")       \n\t" // loads, multiplies and stores are interleaved; quadwords 0-6 are handled first
-        "movq 4*8(%%"REG_S"), %%mm4       \n\t"
-        "pmullw %%mm7, %%mm3              \n\t"
-
-        "movq %%mm1, 1*8(%%"REG_D")       \n\t"
-        "movq 5*8(%%"REG_S"), %%mm5       \n\t"
-        "pmullw %%mm7, %%mm4              \n\t"
-
-        "movq %%mm2, 2*8(%%"REG_D")       \n\t"
-        "movq 6*8(%%"REG_S"), %%mm6       \n\t"
-        "pmullw %%mm7, %%mm5              \n\t"
-
-        "movq %%mm3, 3*8(%%"REG_D")       \n\t"
-        "movq 7*8+0*8(%%"REG_S"), %%mm0   \n\t" // quadwords 7-13 start here
-        "pmullw %%mm7, %%mm6              \n\t"
-
-        "movq %%mm4, 4*8(%%"REG_D")       \n\t"
-        "movq 7*8+1*8(%%"REG_S"), %%mm1   \n\t"
-        "pmullw %%mm7, %%mm0              \n\t"
-
-        "movq %%mm5, 5*8(%%"REG_D")       \n\t"
-        "movq 7*8+2*8(%%"REG_S"), %%mm2   \n\t"
-        "pmullw %%mm7, %%mm1              \n\t"
-
-        "movq %%mm6, 6*8(%%"REG_D")       \n\t"
-        "movq 7*8+3*8(%%"REG_S"), %%mm3   \n\t"
-        "pmullw %%mm7, %%mm2              \n\t"
-
-        "movq %%mm0, 7*8+0*8(%%"REG_D")   \n\t"
-        "movq 7*8+4*8(%%"REG_S"), %%mm4   \n\t"
-        "pmullw %%mm7, %%mm3              \n\t"
-
-        "movq %%mm1, 7*8+1*8(%%"REG_D")   \n\t"
-        "movq 7*8+5*8(%%"REG_S"), %%mm5   \n\t"
-        "pmullw %%mm7, %%mm4              \n\t"
-
-        "movq %%mm2, 7*8+2*8(%%"REG_D")   \n\t"
-        "movq 7*8+6*8(%%"REG_S"), %%mm6   \n\t"
-        "pmullw %%mm7, %%mm5              \n\t"
-
-        "movq %%mm3, 7*8+3*8(%%"REG_D")   \n\t"
-        "movq 14*8+0*8(%%"REG_S"), %%mm0  \n\t" // last two quadwords (14 and 15), 16 quadwords = 64 words in total
-        "pmullw %%mm7, %%mm6              \n\t"
-
-        "movq %%mm4, 7*8+4*8(%%"REG_D")   \n\t"
-        "movq 14*8+1*8(%%"REG_S"), %%mm1  \n\t"
-        "pmullw %%mm7, %%mm0              \n\t"
-
-        "movq %%mm5, 7*8+5*8(%%"REG_D")   \n\t"
-        "pmullw %%mm7, %%mm1              \n\t"
-
-        "movq %%mm6, 7*8+6*8(%%"REG_D")   \n\t"
-        "movq %%mm0, 14*8+0*8(%%"REG_D")  \n\t"
-        "movq %%mm1, 14*8+1*8(%%"REG_D")  \n\t"
-
-        : "+g" (q), "+S" (adr), "+D" (adr) // NOTE(review): S and D are both bound to adr as read-write operands; the asm stores through REG_D but no memory clobber is declared
-        :
-        );
-}
-
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433)   = FIX64(0.382683433, 14); // 64-bit multiplier constants used as pmulhw memory operands in column_fidct_mmx below; FIX64 is defined outside this file - its second argument is presumably the number of fractional bits, TODO confirm
-DECLARE_ALIGNED  (8, uint64_t, ff_MM_FIX_0_541196100)= FIX64(0.541196100, 14); // declared with DECLARE_ALIGNED and an ff_ prefix, unlike the DECLARE_ASM_CONST entries
-DECLARE_ALIGNED  (8, uint64_t, ff_MM_FIX_0_707106781)= FIX64(0.707106781, 14);
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965)   = FIX64(1.306562965, 14);
-
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A) = FIX64(1.414213562, 14); // same value as MM_FIX_1_414213562 below but with 14 instead of 13 as the second FIX64 argument
-
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065)   = FIX64(1.847759065, 13);
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930)   = FIX64(-2.613125930, 13); // note: the stored value is negative although the name carries no sign
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562)   = FIX64(1.414213562, 13);
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200)   = FIX64(1.082392200, 13);
-//for t3,t5,t7 == 0 shortcut
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065)   = FIX64(0.847759065, 14);
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497)   = FIX64(0.566454497, 14);
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367)   = FIX64(0.198912367, 14);
-
-DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND)       = C64(4);
-DECLARE_ASM_CONST(8, uint64_t, MM_2)                 = C64(2); // added to mm5 with paddw in column_fidct_mmx before values are shifted right by 2
-
-static void column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
-{
-    DECLARE_ALIGNED(8, uint64_t, temps)[4];
-
-    __asm__ volatile(
-
-        "1:                                       \n\t"
-        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1   \n\t"
-        //
-        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7   \n\t"
-        "movq %%mm1, %%mm0                        \n\t"
-
-        "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1  \n\t" //t0
-        "movq %%mm7, %%mm3                        \n\t"
-
-        "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7  \n\t" //t3
-        "movq %%mm1, %%mm5             \n\t"
-
-        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6   \n\t"
-        "psubw %%mm7, %%mm1                       \n\t" //t13
-
-        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2   \n\t"
-        "movq %%mm6, %%mm4                        \n\t"
-
-        "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6  \n\t" //t1
-        "paddw %%mm7, %%mm5                       \n\t" //t10
-
-        "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2  \n\t" //t2
-        "movq %%mm6, %%mm7                        \n\t"
-
-        "paddw %%mm2, %%mm6                       \n\t" //t11
-        "psubw %%mm2, %%mm7                       \n\t" //t12
-
-        "movq %%mm5, %%mm2                        \n\t"
-        "paddw %%mm6, %%mm5                       \n\t" //d0
-        // i0 t13 t12 i3 i1 d0 - d4
-        "psubw %%mm6, %%mm2                       \n\t" //d4
-        "paddw %%mm1, %%mm7                       \n\t"
-
-        "movq  4*16(%%"REG_d"), %%mm6             \n\t"
-        "psllw $2, %%mm7                          \n\t"
-
-        "psubw 0*16(%%"REG_d"), %%mm5             \n\t"
-        "psubw %%mm6, %%mm2                       \n\t"
-
-        "paddusw 0*16(%%"REG_d"), %%mm5           \n\t"
-        "paddusw %%mm6, %%mm2                     \n\t"
-
-        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t"
-        //
-        "paddw 0*16(%%"REG_d"), %%mm5             \n\t"
-        "paddw %%mm6, %%mm2                       \n\t"
-
-        "psubusw 0*16(%%"REG_d"), %%mm5           \n\t"
-        "psubusw %%mm6, %%mm2                     \n\t"
-
-//This func is totally compute-bound,  operates at huge speed. So,  DC shortcut
-// at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
-//However,  typical numbers: nondc - 29%%,  dc - 46%%,  zero - 25%%. All <> 0 case is very rare.
-        "paddw "MANGLE(MM_2)", %%mm5              \n\t"
-        "movq %%mm2, %%mm6                        \n\t"
-
-        "paddw %%mm5, %%mm2                       \n\t"
-        "psubw %%mm6, %%mm5                       \n\t"
-
-        "movq %%mm1, %%mm6                        \n\t"
-        "paddw %%mm7, %%mm1                       \n\t" //d2
-
-        "psubw 2*16(%%"REG_d"), %%mm1             \n\t"
-        "psubw %%mm7, %%mm6                       \n\t" //d6
-
-        "movq 6*16(%%"REG_d"), %%mm7              \n\t"
-        "psraw $2, %%mm5                          \n\t"
-
-        "paddusw 2*16(%%"REG_d"), %%mm1           \n\t"
-        "psubw %%mm7, %%mm6                       \n\t"
-        // t7 d2 /t11 t4 t6 - d6 /t10
-
-        "paddw 2*16(%%"REG_d"), %%mm1             \n\t"
-        "paddusw %%mm7, %%mm6                     \n\t"
-
-        "psubusw 2*16(%%"REG_d"), %%mm1           \n\t"
-        "paddw %%mm7, %%mm6                       \n\t"
-
-        "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3  \n\t"
-        "psubusw %%mm7, %%mm6                     \n\t"
-
-        //movq [edi+"DCTSIZE_S"*2*2], mm1
-        //movq [edi+"DCTSIZE_S"*6*2], mm6
-        "movq %%mm1, %%mm7                        \n\t"
-        "psraw $2, %%mm2                          \n\t"
-
-        "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4  \n\t"
-        "psubw %%mm6, %%mm1                       \n\t"
-
-        "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0  \n\t"
-        "paddw %%mm7, %%mm6                       \n\t" //'t13
-
-        "psraw $2, %%mm6                          \n\t" //paddw mm6, MM_2 !!    ---
-        "movq %%mm2, %%mm7                        \n\t"
-
-        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
-        "paddw %%mm6, %%mm2                       \n\t" //'t0
-
-        "movq %%mm2, 0*8+%3                       \n\t" //!
-        "psubw %%mm6, %%mm7                       \n\t" //'t3
-
-        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2   \n\t"
-        "psubw %%mm6, %%mm1                       \n\t" //'t12
-
-        "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2  \n\t" //t5
-        "movq %%mm5, %%mm6                        \n\t"
-
-        "movq %%mm7, 3*8+%3                       \n\t"
-        "paddw %%mm2, %%mm3                       \n\t" //t10
-
-        "paddw %%mm4, %%mm2                       \n\t" //t11
-        "paddw %%mm0, %%mm4                       \n\t" //t12
-
-        "movq %%mm3, %%mm7                        \n\t"
-        "psubw %%mm4, %%mm3                       \n\t"
-
-        "psllw $2, %%mm3                          \n\t"
-        "psllw $2, %%mm7                          \n\t" //opt for P6
-
-        "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
-        "psllw $2, %%mm4                          \n\t"
-
-        "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t"
-        "psllw $2, %%mm2                          \n\t"
-
-        "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
-        "paddw %%mm1, %%mm5                       \n\t" //'t1
-
-        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t"
-        "psubw %%mm1, %%mm6                       \n\t" //'t2
-        // t7 't12 't11 t4 t6 - 't13 't10   ---
-
-        "paddw %%mm3, %%mm7                       \n\t" //z2
-
-        "movq %%mm5, 1*8+%3                       \n\t"
-        "paddw %%mm3, %%mm4                       \n\t" //z4
-
-        "movq 3*16(%%"REG_d"), %%mm3              \n\t"
-        "movq %%mm0, %%mm1                        \n\t"
-
-        "movq %%mm6, 2*8+%3                       \n\t"
-        "psubw %%mm2, %%mm1                       \n\t" //z13
-
-//===
-        "paddw %%mm2, %%mm0                       \n\t" //z11
-        "movq %%mm1, %%mm5                        \n\t"
-
-        "movq 5*16(%%"REG_d"), %%mm2              \n\t"
-        "psubw %%mm7, %%mm1                       \n\t" //d3
-
-        "paddw %%mm7, %%mm5                       \n\t" //d5
-        "psubw %%mm3, %%mm1                       \n\t"
-
-        "movq 1*16(%%"REG_d"), %%mm7              \n\t"
-        "psubw %%mm2, %%mm5                       \n\t"
-
-        "movq %%mm0, %%mm6                        \n\t"
-        "paddw %%mm4, %%mm0                       \n\t" //d1
-
-        "paddusw %%mm3, %%mm1                     \n\t"
-        "psubw %%mm4, %%mm6                       \n\t" //d7
-
-        // d1 d3 - - - d5 d7 -
-        "movq 7*16(%%"REG_d"), %%mm4              \n\t"
-        "psubw %%mm7, %%mm0                       \n\t"
-
-        "psubw %%mm4, %%mm6                       \n\t"
-        "paddusw %%mm2, %%mm5                     \n\t"
-
-        "paddusw %%mm4, %%mm6                     \n\t"
-        "paddw %%mm3, %%mm1                       \n\t"
-
-        "paddw %%mm2, %%mm5                       \n\t"
-        "paddw %%mm4, %%mm6                       \n\t"
-
-        "psubusw %%mm3, %%mm1                     \n\t"
-        "psubusw %%mm2, %%mm5                     \n\t"
-
-        "psubusw %%mm4, %%mm6                     \n\t"
-        "movq %%mm1, %%mm4                        \n\t"
-
-        "por %%mm5, %%mm4                         \n\t"
-        "paddusw %%mm7, %%mm0                     \n\t"
-
-        "por %%mm6, %%mm4                         \n\t"
-        "paddw %%mm7, %%mm0                       \n\t"
-
-        "packssdw %%mm4, %%mm4                    \n\t"
-        "psubusw %%mm7, %%mm0                     \n\t"
-
-        "movd %%mm4, %%"REG_a"                    \n\t"
-        "or %%"REG_a", %%"REG_a"                  \n\t"
-        "jnz 2f                                   \n\t"
-        //movq [edi+"DCTSIZE_S"*3*2], mm1
-        //movq [edi+"DCTSIZE_S"*5*2], mm5
-        //movq [edi+"DCTSIZE_S"*1*2], mm0
-        //movq [edi+"DCTSIZE_S"*7*2], mm6
-        // t4 t5 - - - t6 t7 -
-        //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
-//Typical numbers: nondc - 19%%,  dc - 26%%,  zero - 55%%. zero case alone isn't worthwhile
-        "movq 0*8+%3, %%mm4                      \n\t"
-        "movq %%mm0, %%mm1                       \n\t"
-
-        "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
-        "movq %%mm1, %%mm2                       \n\t"
-
-        "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
-        "movq %%mm2, %%mm3                      \n\t"
-
-        "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
-        "paddw %%mm4, %%mm5                     \n\t"
-
-        "movq 1*8+%3, %%mm6                     \n\t"
-        //paddw mm3, MM_2
-        "psraw $2, %%mm3                        \n\t" //tmp7
-
-        "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
-        "psubw %%mm3, %%mm4                     \n\t"
-
-        "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
-        "paddw %%mm3, %%mm5                     \n\t"
-
-        "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
-        "paddw %%mm6, %%mm7                     \n\t"
-
-        "movq 2*8+%3, %%mm3                     \n\t"
-        "psubw %%mm0, %%mm6                     \n\t"
-
-        "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
-        "paddw %%mm0, %%mm7                     \n\t"
-
-        "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
-        "paddw %%mm3, %%mm4                     \n\t"
-
-        "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
-        "psubw %%mm1, %%mm3                     \n\t"
-
-        "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
-        "paddw %%mm1, %%mm4                     \n\t"
-
-        "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
-        "paddw %%mm3, %%mm5                     \n\t"
-
-        "movq 3*8+%3, %%mm0                     \n\t"
-        "add $8, %%"REG_S"                      \n\t"
-
-        "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
-        "paddw %%mm0, %%mm6                     \n\t"
-
-        "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
-        "psubw %%mm2, %%mm0                     \n\t"
-
-        "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
-        "paddw %%mm2, %%mm6                     \n\t"
-
-        "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
-        "paddw %%mm0, %%mm7                     \n\t"
-
-        "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
-
-        "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
-        "add $8, %%"REG_D"                      \n\t"
-        "jmp 4f                                 \n\t"
-
-        "2:                                     \n\t"
-        //--- non DC2
-        //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1  (actually thr1, thr1, thr1-1)
-        //psraw mm5, 2
-        //psraw mm0, 2
-        //psraw mm6, 2
-        "movq %%mm5, %%mm3                      \n\t"
-        "psubw %%mm1, %%mm5                     \n\t"
-
-        "psllw $1, %%mm5                        \n\t" //'z10
-        "paddw %%mm1, %%mm3                     \n\t" //'z13
-
-        "movq %%mm0, %%mm2                      \n\t"
-        "psubw %%mm6, %%mm0                     \n\t"
-
-        "movq %%mm5, %%mm1                      \n\t"
-        "psllw $1, %%mm0                        \n\t" //'z12
-
-        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
-        "paddw %%mm0, %%mm5                     \n\t"
-
-        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
-        "paddw %%mm6, %%mm2                     \n\t" //'z11
-
-        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
-        "movq %%mm2, %%mm7                      \n\t"
-
-        //---
-        "movq 0*8+%3, %%mm4                     \n\t"
-        "psubw %%mm3, %%mm2                     \n\t"
-
-        "psllw $1, %%mm2                        \n\t"
-        "paddw %%mm3, %%mm7                     \n\t" //'t7
-
-        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
-        "movq %%mm4, %%mm6                      \n\t"
-        //paddw mm7, MM_2
-        "psraw $2, %%mm7                        \n\t"
-
-        "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4\n\t"
-        "psubw %%mm7, %%mm6                     \n\t"
-
-        "movq 1*8+%3, %%mm3                     \n\t"
-        "paddw %%mm7, %%mm4                     \n\t"
-
-        "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
-        "paddw %%mm5, %%mm1                     \n\t" //'t12
-
-        "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
-        "psubw %%mm7, %%mm1                     \n\t" //'t6
-
-        "movq 2*8+%3, %%mm7                     \n\t"
-        "psubw %%mm5, %%mm0                     \n\t" //'t10
-
-        "movq 3*8+%3, %%mm6                     \n\t"
-        "movq %%mm3, %%mm5                      \n\t"
-
-        "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3\n\t"
-        "psubw %%mm1, %%mm5                     \n\t"
-
-        "psubw %%mm1, %%mm2                     \n\t" //'t5
-        "paddw %%mm1, %%mm3                     \n\t"
-
-        "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
-        "movq %%mm7, %%mm4                      \n\t"
-
-        "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7\n\t"
-        "psubw %%mm2, %%mm4                     \n\t"
-
-        "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4\n\t"
-        "paddw %%mm2, %%mm7                     \n\t"
-
-        "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
-        "paddw %%mm2, %%mm0                     \n\t" //'t4
-
-        // 't4 't6 't5 - - - - 't7
-        "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
-        "movq %%mm6, %%mm1                      \n\t"
-
-        "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6\n\t"
-        "psubw %%mm0, %%mm1                     \n\t"
-
-        "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1\n\t"
-        "paddw %%mm0, %%mm6                     \n\t"
-
-        "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
-        "add $8, %%"REG_S"                      \n\t"
-
-        "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
-
-        "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
-        "add $8, %%"REG_D"                      \n\t"
-
-        "4:                                     \n\t"
-        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
-        //
-        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
-        "movq %%mm1, %%mm0                      \n\t"
-
-        "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1\n\t" //t0
-        "movq %%mm7, %%mm3                      \n\t"
-
-        "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7\n\t" //t3
-        "movq %%mm1, %%mm5                      \n\t"
-
-        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
-        "psubw %%mm7, %%mm1                     \n\t" //t13
-
-        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
-        "movq %%mm6, %%mm4                      \n\t"
-
-        "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6\n\t" //t1
-        "paddw %%mm7, %%mm5                     \n\t" //t10
-
-        "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2\n\t" //t2
-        "movq %%mm6, %%mm7                      \n\t"
-
-        "paddw %%mm2, %%mm6                     \n\t" //t11
-        "psubw %%mm2, %%mm7                     \n\t" //t12
-
-        "movq %%mm5, %%mm2                      \n\t"
-        "paddw %%mm6, %%mm5                     \n\t" //d0
-        // i0 t13 t12 i3 i1 d0 - d4
-        "psubw %%mm6, %%mm2                     \n\t" //d4
-        "paddw %%mm1, %%mm7                     \n\t"
-
-        "movq  1*8+4*16(%%"REG_d"), %%mm6       \n\t"
-        "psllw $2, %%mm7                        \n\t"
-
-        "psubw 1*8+0*16(%%"REG_d"), %%mm5       \n\t"
-        "psubw %%mm6, %%mm2                     \n\t"
-
-        "paddusw 1*8+0*16(%%"REG_d"), %%mm5     \n\t"
-        "paddusw %%mm6, %%mm2                   \n\t"
-
-        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t"
-        //
-        "paddw 1*8+0*16(%%"REG_d"), %%mm5       \n\t"
-        "paddw %%mm6, %%mm2                     \n\t"
-
-        "psubusw 1*8+0*16(%%"REG_d"), %%mm5     \n\t"
-        "psubusw %%mm6, %%mm2                   \n\t"
-
-//This func is totally compute-bound,  operates at huge speed. So,  DC shortcut
-// at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
-//However,  typical numbers: nondc - 29%%,  dc - 46%%,  zero - 25%%. All <> 0 case is very rare.
-        "paddw "MANGLE(MM_2)", %%mm5            \n\t"
-        "movq %%mm2, %%mm6                      \n\t"
-
-        "paddw %%mm5, %%mm2                     \n\t"
-        "psubw %%mm6, %%mm5                     \n\t"
-
-        "movq %%mm1, %%mm6                      \n\t"
-        "paddw %%mm7, %%mm1                     \n\t" //d2
-
-        "psubw 1*8+2*16(%%"REG_d"), %%mm1       \n\t"
-        "psubw %%mm7, %%mm6                     \n\t" //d6
-
-        "movq 1*8+6*16(%%"REG_d"), %%mm7        \n\t"
-        "psraw $2, %%mm5                        \n\t"
-
-        "paddusw 1*8+2*16(%%"REG_d"), %%mm1     \n\t"
-        "psubw %%mm7, %%mm6                     \n\t"
-        // t7 d2 /t11 t4 t6 - d6 /t10
-
-        "paddw 1*8+2*16(%%"REG_d"), %%mm1       \n\t"
-        "paddusw %%mm7, %%mm6                   \n\t"
-
-        "psubusw 1*8+2*16(%%"REG_d"), %%mm1     \n\t"
-        "paddw %%mm7, %%mm6                     \n\t"
-
-        "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3\n\t"
-        "psubusw %%mm7, %%mm6                   \n\t"
-
-        //movq [edi+"DCTSIZE_S"*2*2], mm1
-        //movq [edi+"DCTSIZE_S"*6*2], mm6
-        "movq %%mm1, %%mm7                      \n\t"
-        "psraw $2, %%mm2                        \n\t"
-
-        "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4\n\t"
-        "psubw %%mm6, %%mm1                     \n\t"
-
-        "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0\n\t"
-        "paddw %%mm7, %%mm6                     \n\t" //'t13
-
-        "psraw $2, %%mm6                        \n\t" //paddw mm6, MM_2 !!    ---
-        "movq %%mm2, %%mm7                      \n\t"
-
-        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
-        "paddw %%mm6, %%mm2                     \n\t" //'t0
-
-        "movq %%mm2, 0*8+%3                     \n\t" //!
-        "psubw %%mm6, %%mm7                     \n\t" //'t3
-
-        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
-        "psubw %%mm6, %%mm1                     \n\t" //'t12
-
-        "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2\n\t" //t5
-        "movq %%mm5, %%mm6                      \n\t"
-
-        "movq %%mm7, 3*8+%3                     \n\t"
-        "paddw %%mm2, %%mm3                     \n\t" //t10
-
-        "paddw %%mm4, %%mm2                     \n\t" //t11
-        "paddw %%mm0, %%mm4                     \n\t" //t12
-
-        "movq %%mm3, %%mm7                      \n\t"
-        "psubw %%mm4, %%mm3                     \n\t"
-
-        "psllw $2, %%mm3                        \n\t"
-        "psllw $2, %%mm7                        \n\t" //opt for P6
-
-        "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
-        "psllw $2, %%mm4                        \n\t"
-
-        "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t"
-        "psllw $2, %%mm2                        \n\t"
-
-        "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
-        "paddw %%mm1, %%mm5                     \n\t" //'t1
-
-        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t"
-        "psubw %%mm1, %%mm6                     \n\t" //'t2
-        // t7 't12 't11 t4 t6 - 't13 't10   ---
-
-        "paddw %%mm3, %%mm7                     \n\t" //z2
-
-        "movq %%mm5, 1*8+%3                     \n\t"
-        "paddw %%mm3, %%mm4                     \n\t" //z4
-
-        "movq 1*8+3*16(%%"REG_d"), %%mm3        \n\t"
-        "movq %%mm0, %%mm1                      \n\t"
-
-        "movq %%mm6, 2*8+%3                     \n\t"
-        "psubw %%mm2, %%mm1                     \n\t" //z13
-
-//===
-        "paddw %%mm2, %%mm0                     \n\t" //z11
-        "movq %%mm1, %%mm5                      \n\t"
-
-        "movq 1*8+5*16(%%"REG_d"), %%mm2        \n\t"
-        "psubw %%mm7, %%mm1                     \n\t" //d3
-
-        "paddw %%mm7, %%mm5                     \n\t" //d5
-        "psubw %%mm3, %%mm1                     \n\t"
-
-        "movq 1*8+1*16(%%"REG_d"), %%mm7        \n\t"
-        "psubw %%mm2, %%mm5                     \n\t"
-
-        "movq %%mm0, %%mm6                      \n\t"
-        "paddw %%mm4, %%mm0                     \n\t" //d1
-
-        "paddusw %%mm3, %%mm1                   \n\t"
-        "psubw %%mm4, %%mm6                     \n\t" //d7
-
-        // d1 d3 - - - d5 d7 -
-        "movq 1*8+7*16(%%"REG_d"), %%mm4        \n\t"
-        "psubw %%mm7, %%mm0                     \n\t"
-
-        "psubw %%mm4, %%mm6                     \n\t"
-        "paddusw %%mm2, %%mm5                   \n\t"
-
-        "paddusw %%mm4, %%mm6                   \n\t"
-        "paddw %%mm3, %%mm1                     \n\t"
-
-        "paddw %%mm2, %%mm5                     \n\t"
-        "paddw %%mm4, %%mm6                     \n\t"
-
-        "psubusw %%mm3, %%mm1                   \n\t"
-        "psubusw %%mm2, %%mm5                   \n\t"
-
-        "psubusw %%mm4, %%mm6                   \n\t"
-        "movq %%mm1, %%mm4                      \n\t"
-
-        "por %%mm5, %%mm4                       \n\t"
-        "paddusw %%mm7, %%mm0                   \n\t"
-
-        "por %%mm6, %%mm4                       \n\t"
-        "paddw %%mm7, %%mm0                     \n\t"
-
-        "packssdw %%mm4, %%mm4                  \n\t"
-        "psubusw %%mm7, %%mm0                   \n\t"
-
-        "movd %%mm4, %%"REG_a"                  \n\t"
-        "or %%"REG_a", %%"REG_a"                \n\t"
-        "jnz 3f                                 \n\t"
-        //movq [edi+"DCTSIZE_S"*3*2], mm1
-        //movq [edi+"DCTSIZE_S"*5*2], mm5
-        //movq [edi+"DCTSIZE_S"*1*2], mm0
-        //movq [edi+"DCTSIZE_S"*7*2], mm6
-        // t4 t5 - - - t6 t7 -
-        //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
-//Typical numbers: nondc - 19%%,  dc - 26%%,  zero - 55%%. zero case alone isn't worthwhile
-        "movq 0*8+%3, %%mm4                    \n\t"
-        "movq %%mm0, %%mm1                     \n\t"
-
-        "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
-        "movq %%mm1, %%mm2                     \n\t"
-
-        "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5\n\t"
-        "movq %%mm2, %%mm3                     \n\t"
-
-        "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
-        "paddw %%mm4, %%mm5                    \n\t"
-
-        "movq 1*8+%3, %%mm6                    \n\t"
-        //paddw mm3, MM_2
-        "psraw $2, %%mm3                       \n\t" //tmp7
-
-        "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
-        "psubw %%mm3, %%mm4                    \n\t"
-
-        "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7\n\t"
-        "paddw %%mm3, %%mm5                    \n\t"
-
-        "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D")\n\t"
-        "paddw %%mm6, %%mm7                    \n\t"
-
-        "movq 2*8+%3, %%mm3                    \n\t"
-        "psubw %%mm0, %%mm6                    \n\t"
-
-        "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4\n\t"
-        "paddw %%mm0, %%mm7                    \n\t"
-
-        "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D")\n\t"
-        "paddw %%mm3, %%mm4                    \n\t"
-
-        "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D")\n\t"
-        "psubw %%mm1, %%mm3                    \n\t"
-
-        "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5\n\t"
-        "paddw %%mm1, %%mm4                    \n\t"
-
-        "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6\n\t"
-        "paddw %%mm3, %%mm5                    \n\t"
-
-        "movq 3*8+%3, %%mm0                    \n\t"
-        "add $24, %%"REG_S"                    \n\t"
-
-        "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D")\n\t"
-        "paddw %%mm0, %%mm6                    \n\t"
-
-        "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D")\n\t"
-        "psubw %%mm2, %%mm0                    \n\t"
-
-        "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7\n\t"
-        "paddw %%mm2, %%mm6                    \n\t"
-
-        "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D")\n\t"
-        "paddw %%mm0, %%mm7                    \n\t"
-
-        "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D")\n\t"
-
-        "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D")\n\t"
-        "add $24, %%"REG_D"                    \n\t"
-        "sub $2, %%"REG_c"                     \n\t"
-        "jnz 1b                                \n\t"
-        "jmp 5f                                \n\t"
-
-        "3:                                    \n\t"
-        //--- non DC2
-        //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1  (actually thr1, thr1, thr1-1)
-        //psraw mm5, 2
-        //psraw mm0, 2
-        //psraw mm6, 2
-        "movq %%mm5, %%mm3                    \n\t"
-        "psubw %%mm1, %%mm5                   \n\t"
-
-        "psllw $1, %%mm5                      \n\t" //'z10
-        "paddw %%mm1, %%mm3                   \n\t" //'z13
-
-        "movq %%mm0, %%mm2                    \n\t"
-        "psubw %%mm6, %%mm0                   \n\t"
-
-        "movq %%mm5, %%mm1                    \n\t"
-        "psllw $1, %%mm0                      \n\t" //'z12
-
-        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
-        "paddw %%mm0, %%mm5                   \n\t"
-
-        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
-        "paddw %%mm6, %%mm2                   \n\t" //'z11
-
-        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
-        "movq %%mm2, %%mm7                    \n\t"
-
-        //---
-        "movq 0*8+%3, %%mm4                   \n\t"
-        "psubw %%mm3, %%mm2                   \n\t"
-
-        "psllw $1, %%mm2                      \n\t"
-        "paddw %%mm3, %%mm7                   \n\t" //'t7
-
-        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
-        "movq %%mm4, %%mm6                    \n\t"
-        //paddw mm7, MM_2
-        "psraw $2, %%mm7                      \n\t"
-
-        "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
-        "psubw %%mm7, %%mm6                   \n\t"
-
-        "movq 1*8+%3, %%mm3                   \n\t"
-        "paddw %%mm7, %%mm4                   \n\t"
-
-        "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
-        "paddw %%mm5, %%mm1                   \n\t" //'t12
-
-        "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
-        "psubw %%mm7, %%mm1                   \n\t" //'t6
-
-        "movq 2*8+%3, %%mm7                   \n\t"
-        "psubw %%mm5, %%mm0                   \n\t" //'t10
-
-        "movq 3*8+%3, %%mm6                   \n\t"
-        "movq %%mm3, %%mm5                    \n\t"
-
-        "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
-        "psubw %%mm1, %%mm5                   \n\t"
-
-        "psubw %%mm1, %%mm2                   \n\t" //'t5
-        "paddw %%mm1, %%mm3                   \n\t"
-
-        "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
-        "movq %%mm7, %%mm4                    \n\t"
-
-        "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
-        "psubw %%mm2, %%mm4                   \n\t"
-
-        "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
-        "paddw %%mm2, %%mm7                   \n\t"
-
-        "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
-        "paddw %%mm2, %%mm0                    \n\t" //'t4
-
-        // 't4 't6 't5 - - - - 't7
-        "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
-        "movq %%mm6, %%mm1                     \n\t"
-
-        "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
-        "psubw %%mm0, %%mm1                    \n\t"
-
-        "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
-        "paddw %%mm0, %%mm6                    \n\t"
-
-        "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
-        "add $24, %%"REG_S"                    \n\t"
-
-        "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
-
-        "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
-        "add $24, %%"REG_D"                    \n\t"
-        "sub $2, %%"REG_c"                     \n\t"
-        "jnz 1b                                \n\t"
-        "5:                                    \n\t"
-
-        : "+S"(data), "+D"(output), "+c"(cnt), "=o"(temps)
-        : "d"(thr_adr)
-          NAMED_CONSTRAINTS_ADD(ff_MM_FIX_0_707106781, MM_2,MM_FIX_1_414213562_A, MM_FIX_1_414213562, MM_FIX_0_382683433,
-                                ff_MM_FIX_0_541196100, MM_FIX_1_306562965, MM_FIX_0_847759065)
-          NAMED_CONSTRAINTS_ADD(MM_FIX_0_566454497, MM_FIX_0_198912367, MM_FIX_2_613125930, MM_FIX_1_847759065,
-                                MM_FIX_1_082392200)
-        : "%"REG_a
-        );
-}
-
-static void row_idct_mmx (int16_t *workspace, int16_t *output_adr, int output_stride, int cnt)
-{
-    DECLARE_ALIGNED(8, uint64_t, temps)[4];
-
-    __asm__ volatile(
-        "lea (%%"REG_a",%%"REG_a",2), %%"REG_d"    \n\t"
-        "1:                     \n\t"
-        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0    \n\t"
-        //
-
-        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1    \n\t"
-        "movq %%mm0, %%mm4                         \n\t"
-
-        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2    \n\t"
-        "punpcklwd %%mm1, %%mm0                    \n\t"
-
-        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3    \n\t"
-        "punpckhwd %%mm1, %%mm4                    \n\t"
-
-        //transpose 4x4
-        "movq %%mm2, %%mm7                         \n\t"
-        "punpcklwd %%mm3, %%mm2                    \n\t"
-
-        "movq %%mm0, %%mm6                         \n\t"
-        "punpckldq %%mm2, %%mm0                    \n\t" //0
-
-        "punpckhdq %%mm2, %%mm6                    \n\t" //1
-        "movq %%mm0, %%mm5                         \n\t"
-
-        "punpckhwd %%mm3, %%mm7                    \n\t"
-        "psubw %%mm6, %%mm0                        \n\t"
-
-        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t"
-        "movq %%mm4, %%mm2                         \n\t"
-
-        "punpckldq %%mm7, %%mm4                    \n\t" //2
-        "paddw %%mm6, %%mm5                        \n\t"
-
-        "punpckhdq %%mm7, %%mm2                    \n\t" //3
-        "movq %%mm4, %%mm1                         \n\t"
-
-        "psllw $2, %%mm0                           \n\t"
-        "paddw %%mm2, %%mm4                        \n\t" //t10
-
-        "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t"
-        "psubw %%mm2, %%mm1                        \n\t" //t11
-
-        "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t"
-        "psubw %%mm5, %%mm0                        \n\t"
-
-        "movq %%mm4, %%mm6                         \n\t"
-        "paddw %%mm5, %%mm4                        \n\t" //t0
-
-        "psubw %%mm5, %%mm6                        \n\t" //t3
-        "movq %%mm1, %%mm7                         \n\t"
-
-        "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t"
-        "paddw %%mm0, %%mm1                        \n\t" //t1
-
-        "movq %%mm4, 0*8+%3                        \n\t" //t0
-        "movq %%mm3, %%mm4                         \n\t"
-
-        "movq %%mm6, 1*8+%3                        \n\t" //t3
-        "punpcklwd %%mm2, %%mm3                    \n\t"
-
-        //transpose 4x4
-        "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t"
-        "punpckhwd %%mm2, %%mm4                    \n\t"
-
-        "movq %%mm5, %%mm2                         \n\t"
-        "punpcklwd %%mm6, %%mm5                    \n\t"
-
-        "psubw %%mm0, %%mm7                        \n\t" //t2
-        "punpckhwd %%mm6, %%mm2                    \n\t"
-
-        "movq %%mm3, %%mm0                         \n\t"
-        "punpckldq %%mm5, %%mm3                    \n\t" //4
-
-        "punpckhdq %%mm5, %%mm0                    \n\t" //5
-        "movq %%mm4, %%mm5                         \n\t"
-
-        //
-        "movq %%mm3, %%mm6                         \n\t"
-        "punpckldq %%mm2, %%mm4                    \n\t" //6
-
-        "psubw %%mm0, %%mm3                        \n\t" //z10
-        "punpckhdq %%mm2, %%mm5                    \n\t" //7
-
-        "paddw %%mm0, %%mm6                        \n\t" //z13
-        "movq %%mm4, %%mm2                         \n\t"
-
-        "movq %%mm3, %%mm0                         \n\t"
-        "psubw %%mm5, %%mm4                        \n\t" //z12
-
-        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0\n\t" //-
-        "paddw %%mm4, %%mm3                        \n\t"
-
-        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3\n\t" //z5
-        "paddw %%mm5, %%mm2                        \n\t" //z11  >
-
-        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4\n\t"
-        "movq %%mm2, %%mm5                         \n\t"
-
-        "psubw %%mm6, %%mm2                        \n\t"
-        "paddw %%mm6, %%mm5                        \n\t" //t7
-
-        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2\n\t" //t11
-        "paddw %%mm3, %%mm0                        \n\t" //t12
-
-        "psllw $3, %%mm0                           \n\t"
-        "psubw %%mm3, %%mm4                        \n\t" //t10
-
-        "movq 0*8+%3, %%mm6                        \n\t"
-        "movq %%mm1, %%mm3                         \n\t"
-
-        "psllw $3, %%mm4                           \n\t"
-        "psubw %%mm5, %%mm0                        \n\t" //t6
-
-        "psllw $3, %%mm2                           \n\t"
-        "paddw %%mm0, %%mm1                        \n\t" //d1
-
-        "psubw %%mm0, %%mm2                        \n\t" //t5
-        "psubw %%mm0, %%mm3                        \n\t" //d6
-
-        "paddw %%mm2, %%mm4                        \n\t" //t4
-        "movq %%mm7, %%mm0                         \n\t"
-
-        "paddw %%mm2, %%mm7                        \n\t" //d2
-        "psubw %%mm2, %%mm0                        \n\t" //d5
-
-        "movq "MANGLE(MM_DESCALE_RND)", %%mm2      \n\t" //4
-        "psubw %%mm5, %%mm6                        \n\t" //d7
-
-        "paddw 0*8+%3, %%mm5                       \n\t" //d0
-        "paddw %%mm2, %%mm1                        \n\t"
-
-        "paddw %%mm2, %%mm5                        \n\t"
-        "psraw $3, %%mm1                           \n\t"
-
-        "paddw %%mm2, %%mm7                        \n\t"
-        "psraw $3, %%mm5                           \n\t"
-
-        "paddw (%%"REG_D"), %%mm5                  \n\t"
-        "psraw $3, %%mm7                           \n\t"
-
-        "paddw (%%"REG_D",%%"REG_a"), %%mm1        \n\t"
-        "paddw %%mm2, %%mm0                        \n\t"
-
-        "paddw (%%"REG_D",%%"REG_a",2), %%mm7      \n\t"
-        "paddw %%mm2, %%mm3                        \n\t"
-
-        "movq %%mm5, (%%"REG_D")                   \n\t"
-        "paddw %%mm2, %%mm6                        \n\t"
-
-        "movq %%mm1, (%%"REG_D",%%"REG_a")         \n\t"
-        "psraw $3, %%mm0                           \n\t"
-
-        "movq %%mm7, (%%"REG_D",%%"REG_a",2)       \n\t"
-        "add %%"REG_d", %%"REG_D"                  \n\t" //3*ls
-
-        "movq 1*8+%3, %%mm5                        \n\t" //t3
-        "psraw $3, %%mm3                           \n\t"
-
-        "paddw (%%"REG_D",%%"REG_a",2), %%mm0      \n\t"
-        "psubw %%mm4, %%mm5                        \n\t" //d3
-
-        "paddw (%%"REG_D",%%"REG_d"), %%mm3        \n\t"
-        "psraw $3, %%mm6                           \n\t"
-
-        "paddw 1*8+%3, %%mm4                       \n\t" //d4
-        "paddw %%mm2, %%mm5                        \n\t"
-
-        "paddw (%%"REG_D",%%"REG_a",4), %%mm6      \n\t"
-        "paddw %%mm2, %%mm4                        \n\t"
-
-        "movq %%mm0, (%%"REG_D",%%"REG_a",2)       \n\t"
-        "psraw $3, %%mm5                           \n\t"
-
-        "paddw (%%"REG_D"), %%mm5                  \n\t"
-        "psraw $3, %%mm4                           \n\t"
-
-        "paddw (%%"REG_D",%%"REG_a"), %%mm4        \n\t"
-        "add $"DCTSIZE_S"*2*4, %%"REG_S"           \n\t" //4 rows
-
-        "movq %%mm3, (%%"REG_D",%%"REG_d")         \n\t"
-        "movq %%mm6, (%%"REG_D",%%"REG_a",4)       \n\t"
-        "movq %%mm5, (%%"REG_D")                   \n\t"
-        "movq %%mm4, (%%"REG_D",%%"REG_a")         \n\t"
-
-        "sub %%"REG_d", %%"REG_D"                  \n\t"
-        "add $8, %%"REG_D"                         \n\t"
-        "dec %%"REG_c"                             \n\t"
-        "jnz 1b                                    \n\t"
-
-        : "+S"(workspace), "+D"(output_adr), "+c"(cnt), "=o"(temps)
-        : "a"(output_stride * sizeof(short))
-        NAMED_CONSTRAINTS_ADD(MM_FIX_1_414213562_A, MM_FIX_2_613125930, MM_FIX_1_847759065, MM_FIX_1_082392200,
-                              MM_FIX_1_414213562,MM_DESCALE_RND)
-        : "%"REG_d
-        );
-}
-
-static void row_fdct_mmx(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
-{
-    DECLARE_ALIGNED(8, uint64_t, temps)[4];
-
-    __asm__ volatile(
-        "lea (%%"REG_a",%%"REG_a",2), %%"REG_d"    \n\t"
-        "6:                                        \n\t"
-        "movd (%%"REG_S"), %%mm0                   \n\t"
-        "pxor %%mm7, %%mm7                         \n\t"
-
-        "movd (%%"REG_S",%%"REG_a"), %%mm1         \n\t"
-        "punpcklbw %%mm7, %%mm0                    \n\t"
-
-        "movd (%%"REG_S",%%"REG_a",2), %%mm2       \n\t"
-        "punpcklbw %%mm7, %%mm1                    \n\t"
-
-        "punpcklbw %%mm7, %%mm2                    \n\t"
-        "add %%"REG_d", %%"REG_S"                  \n\t"
-
-        "movq %%mm0, %%mm5                         \n\t"
-        //
-
-        "movd (%%"REG_S",%%"REG_a",4), %%mm3       \n\t" //7  ;prefetch!
-        "movq %%mm1, %%mm6                         \n\t"
-
-        "movd (%%"REG_S",%%"REG_d"), %%mm4         \n\t" //6
-        "punpcklbw %%mm7, %%mm3                    \n\t"
-
-        "psubw %%mm3, %%mm5                        \n\t"
-        "punpcklbw %%mm7, %%mm4                    \n\t"
-
-        "paddw %%mm3, %%mm0                        \n\t"
-        "psubw %%mm4, %%mm6                        \n\t"
-
-        "movd (%%"REG_S",%%"REG_a",2), %%mm3       \n\t" //5
-        "paddw %%mm4, %%mm1                        \n\t"
-
-        "movq %%mm5, %3                            \n\t" //t7
-        "punpcklbw %%mm7, %%mm3                    \n\t"
-
-        "movq %%mm6, %4                            \n\t" //t6
-        "movq %%mm2, %%mm4                         \n\t"
-
-        "movd (%%"REG_S"), %%mm5                   \n\t" //3
-        "paddw %%mm3, %%mm2                        \n\t"
-
-        "movd (%%"REG_S",%%"REG_a"), %%mm6         \n\t" //4
-        "punpcklbw %%mm7, %%mm5                    \n\t"
-
-        "psubw %%mm3, %%mm4                        \n\t"
-        "punpcklbw %%mm7, %%mm6                    \n\t"
-
-        "movq %%mm5, %%mm3                         \n\t"
-        "paddw %%mm6, %%mm5                        \n\t" //t3
-
-        "psubw %%mm6, %%mm3                        \n\t" //t4  ; t0 t1 t2 t4 t5 t3 - -
-        "movq %%mm0, %%mm6                         \n\t"
-
-        "movq %%mm1, %%mm7                         \n\t"
-        "psubw %%mm5, %%mm0                        \n\t" //t13
-
-        "psubw %%mm2, %%mm1                        \n\t"
-        "paddw %%mm2, %%mm7                        \n\t" //t11
-
-        "paddw %%mm0, %%mm1                        \n\t"
-        "movq %%mm7, %%mm2                         \n\t"
-
-        "psllw $2, %%mm1                           \n\t"
-        "paddw %%mm5, %%mm6                        \n\t" //t10
-
-        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm1 \n\t"
-        "paddw %%mm6, %%mm7                        \n\t" //d2
-
-        "psubw %%mm2, %%mm6                        \n\t" //d3
-        "movq %%mm0, %%mm5                         \n\t"
-
-        //transpose 4x4
-        "movq %%mm7, %%mm2                         \n\t"
-        "punpcklwd %%mm6, %%mm7                    \n\t"
-
-        "paddw %%mm1, %%mm0                        \n\t" //d0
-        "punpckhwd %%mm6, %%mm2                    \n\t"
-
-        "psubw %%mm1, %%mm5                        \n\t" //d1
-        "movq %%mm0, %%mm6                         \n\t"
-
-        "movq %4, %%mm1                            \n\t"
-        "punpcklwd %%mm5, %%mm0                    \n\t"
-
-        "punpckhwd %%mm5, %%mm6                    \n\t"
-        "movq %%mm0, %%mm5                         \n\t"
-
-        "punpckldq %%mm7, %%mm0                    \n\t" //0
-        "paddw %%mm4, %%mm3                        \n\t"
-
-        "punpckhdq %%mm7, %%mm5                    \n\t" //1
-        "movq %%mm6, %%mm7                         \n\t"
-
-        "movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D")    \n\t"
-        "punpckldq %%mm2, %%mm6                    \n\t" //2
-
-        "movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D")    \n\t"
-        "punpckhdq %%mm2, %%mm7                    \n\t" //3
-
-        "movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D")    \n\t"
-        "paddw %%mm1, %%mm4                        \n\t"
-
-        "movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D")    \n\t"
-        "psllw $2, %%mm3                           \n\t" //t10
-
-        "movq %3, %%mm2                            \n\t"
-        "psllw $2, %%mm4                           \n\t" //t11
-
-        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm4 \n\t" //z3
-        "paddw %%mm2, %%mm1                        \n\t"
-
-        "psllw $2, %%mm1                           \n\t" //t12
-        "movq %%mm3, %%mm0                         \n\t"
-
-        "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm0 \n\t"
-        "psubw %%mm1, %%mm3                        \n\t"
-
-        "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" //z5
-        "movq %%mm2, %%mm5                         \n\t"
-
-        "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t"
-        "psubw %%mm4, %%mm2                        \n\t" //z13
-
-        "paddw %%mm4, %%mm5                        \n\t" //z11
-        "movq %%mm2, %%mm6                         \n\t"
-
-        "paddw %%mm3, %%mm0                        \n\t" //z2
-        "movq %%mm5, %%mm7                         \n\t"
-
-        "paddw %%mm0, %%mm2                        \n\t" //d4
-        "psubw %%mm0, %%mm6                        \n\t" //d5
-
-        "movq %%mm2, %%mm4                         \n\t"
-        "paddw %%mm3, %%mm1                        \n\t" //z4
-
-        //transpose 4x4
-        "punpcklwd %%mm6, %%mm2                    \n\t"
-        "paddw %%mm1, %%mm5                        \n\t" //d6
-
-        "punpckhwd %%mm6, %%mm4                    \n\t"
-        "psubw %%mm1, %%mm7                        \n\t" //d7
-
-        "movq %%mm5, %%mm6                         \n\t"
-        "punpcklwd %%mm7, %%mm5                    \n\t"
-
-        "punpckhwd %%mm7, %%mm6                    \n\t"
-        "movq %%mm2, %%mm7                         \n\t"
-
-        "punpckldq %%mm5, %%mm2                    \n\t" //4
-        "sub %%"REG_d", %%"REG_S"                  \n\t"
-
-        "punpckhdq %%mm5, %%mm7                    \n\t" //5
-        "movq %%mm4, %%mm5                         \n\t"
-
-        "movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t"
-        "punpckldq %%mm6, %%mm4                    \n\t" //6
-
-        "movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t"
-        "punpckhdq %%mm6, %%mm5                    \n\t" //7
-
-        "movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t"
-        "add $4, %%"REG_S"                         \n\t"
-
-        "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t"
-        "add $"DCTSIZE_S"*2*4, %%"REG_D"           \n\t" //4 rows
-        "dec %%"REG_c"                             \n\t"
-        "jnz 6b                                    \n\t"
-
-        : "+S"(pixels), "+D"(data), "+c"(cnt), "=o"(temps), "=o"(temps[1])
-        : "a"(line_size)
-        NAMED_CONSTRAINTS_ADD(ff_MM_FIX_0_707106781, ff_MM_FIX_0_541196100, MM_FIX_0_382683433, MM_FIX_1_306562965)
-        : "%"REG_d);
-}
-#endif
-
-av_cold void ff_fspp_init_x86(FSPPContext *s)
-{
-#if HAVE_MMX_INLINE
-    int cpu_flags = av_get_cpu_flags();
-
-    if (HAVE_MMX_INLINE && cpu_flags & AV_CPU_FLAG_MMX) {
-        s->store_slice  = store_slice_mmx;
-        s->store_slice2 = store_slice2_mmx;
-        s->mul_thrmat   = mul_thrmat_mmx;
-        s->column_fidct = column_fidct_mmx;
-        s->row_idct     = row_idct_mmx;
-        s->row_fdct     = row_fdct_mmx;
-    }
-#endif
-}
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
new file mode 100644
index 0000000..8e00317
--- /dev/null
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <michaelni at gmx.at>
+ * Copyright (C) 2005 Nikolaj Poroshin <porosh3 at psu.ru>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_fspp.h"
+
+void ff_store_slice_mmx(uint8_t *dst, int16_t *src, /* prototypes only: no bodies appear in this file */
+                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
+                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
+                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q); /* same signature as mul_thrmat_c; presumably the same scaling of thr_adr_noq by q -- confirm */
+void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); /* stride is ptrdiff_t; the removed inline-asm row_idct_mmx took int */
+void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); /* line_size is ptrdiff_t; the removed inline-asm row_fdct_mmx took int */
+
+av_cold void ff_fspp_init_x86(FSPPContext *s) /* point the function pointers of s at the ff_*_mmx routines when the CPU check passes */
+{
+    int cpu_flags = av_get_cpu_flags(); /* CPU flags as reported by av_get_cpu_flags() at call time */
+
+    if (EXTERNAL_MMX(cpu_flags)) { /* when this is false, s is left exactly as the caller set it up */
+        s->store_slice  = ff_store_slice_mmx;
+        s->store_slice2 = ff_store_slice2_mmx;
+        s->mul_thrmat   = ff_mul_thrmat_mmx;
+        s->column_fidct = ff_column_fidct_mmx;
+        s->row_idct     = ff_row_idct_mmx;
+        s->row_fdct     = ff_row_fdct_mmx;
+    } /* all six pointers are set together; no other field of s is written here */
+}



More information about the ffmpeg-cvslog mailing list