[FFmpeg-devel] [PATCH] JPEG2000: SSE optimisation of DWT decoding
Nicolas Bertrand
nicoinattendu at gmail.com
Fri Oct 6 18:30:57 EEST 2017
From: Maxime Taisant <maximetaisant at hotmail.fr>
---
libavcodec/jpeg2000dwt.c | 45 +-
libavcodec/jpeg2000dwt.h | 5 +
libavcodec/x86/jpeg2000dsp.asm | 1339 +++++++++++++++++++++++++++++++++++++
libavcodec/x86/jpeg2000dsp_init.c | 119 ++++
tests/checkasm/jpeg2000dsp.c | 1 +
5 files changed, 1496 insertions(+), 13 deletions(-)
diff --git a/libavcodec/jpeg2000dwt.c b/libavcodec/jpeg2000dwt.c
index 55dd5e89b5..1a0c3fc034 100644
--- a/libavcodec/jpeg2000dwt.c
+++ b/libavcodec/jpeg2000dwt.c
@@ -30,6 +30,7 @@
#include "libavutil/mem.h"
#include "jpeg2000dwt.h"
#include "internal.h"
+#include "libavutil/timer.h"
/* Defines for 9/7 DWT lifting parameters.
* Parameters are in float. */
@@ -558,7 +559,7 @@ int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
}
switch (type) {
case FF_DWT97:
- s->f_linebuf = av_malloc_array((maxlen + 12), sizeof(*s->f_linebuf));
+ s->f_linebuf = av_malloc_array(4*(maxlen + 12), sizeof(*s->f_linebuf));
if (!s->f_linebuf)
return AVERROR(ENOMEM);
break;
@@ -575,6 +576,11 @@ int ff_jpeg2000_dwt_init(DWTContext *s, int border[2][2],
default:
return -1;
}
+
+ s->sse = 0;
+ if (ARCH_X86)
+ ff_jpeg2000dwt_init_x86(s, type);
+
return 0;
}
@@ -601,18 +607,23 @@ int ff_dwt_decode(DWTContext *s, void *t)
if (s->ndeclevels == 0)
return 0;
- switch (s->type) {
- case FF_DWT97:
- dwt_decode97_float(s, t);
- break;
- case FF_DWT97_INT:
- dwt_decode97_int(s, t);
- break;
- case FF_DWT53:
- dwt_decode53(s, t);
- break;
- default:
- return -1;
+ switch (s->type) {
+ case FF_DWT97:
+ /* ARCH_X86 is a compile-time constant: on other targets the SSE call
+ * folds away, so no reference to the x86-only symbol is emitted. */
+ if (ARCH_X86 && s->sse)
+ dwt_decode97_float_sse(s, t);
+ else
+ dwt_decode97_float(s, t);
+ break;
+ case FF_DWT97_INT:
+ dwt_decode97_int(s, t);
+ break;
+ case FF_DWT53:
+ dwt_decode53(s, t);
+ break;
+ default:
+ return -1;
}
return 0;
}
diff --git a/libavcodec/jpeg2000dwt.h b/libavcodec/jpeg2000dwt.h
index 718d183ac1..622a404b79 100644
--- a/libavcodec/jpeg2000dwt.h
+++ b/libavcodec/jpeg2000dwt.h
@@ -48,6 +48,7 @@ typedef struct DWTContext {
uint8_t type; ///< 0 for 9/7; 1 for 5/3
int32_t *i_linebuf; ///< int buffer used by transform
float *f_linebuf; ///< float buffer used by transform
+ int sse;
} DWTContext;
/**
@@ -65,4 +66,8 @@ int ff_dwt_decode(DWTContext *s, void *t);
void ff_dwt_destroy(DWTContext *s);
+void dwt_decode97_float_sse(DWTContext *s, float *t);
+
+void ff_jpeg2000dwt_init_x86(DWTContext *s, int type);
+
#endif /* AVCODEC_JPEG2000DWT_H */
diff --git a/libavcodec/x86/jpeg2000dsp.asm b/libavcodec/x86/jpeg2000dsp.asm
index 56b5fbd606..b5d5b9a04b 100644
--- a/libavcodec/x86/jpeg2000dsp.asm
+++ b/libavcodec/x86/jpeg2000dsp.asm
@@ -2,6 +2,7 @@
;* SIMD-optimized JPEG2000 DSP functions
;* Copyright (c) 2014 Nicolas Bertrand
;* Copyright (c) 2015 James Almer
+;* Copyright (c) 2017 Maxime Taisant
;*
;* This file is part of FFmpeg.
;*
@@ -29,6 +30,16 @@ pf_ict1: times 8 dd 0.34413
pf_ict2: times 8 dd 0.71414
pf_ict3: times 8 dd 1.772
+F_LFTG_K: dd 1.230174104914001
+F_LFTG_X: dd 0.812893066115961
+
+F_LFTG_ALPHA: times 8 dd 1.586134342059924
+F_LFTG_BETA: times 8 dd 0.052980118572961
+F_LFTG_GAMMA: times 8 dd 0.882911075530934
+F_LFTG_DELTA: times 8 dd 0.443506852043971
+
+TWO: dd 2.0
+
SECTION .text
;***********************************************************************
@@ -142,3 +153,1331 @@ RCT_INT
INIT_YMM avx2
RCT_INT
%endif
+
+;***********************************************************************
+; ff_sr_1d97_float_<opt>(float *line, int i0, int i1)
+;***********************************************************************
+%macro SR1D97FLOAT 1
+cglobal sr_1d97_float, 3, 5, %1, line, i0, i1, j0, j1
+ mov j0q, i0q
+ mov j1q, i1q
+ add j0q, 1
+ cmp j1q, j0q
+ jg %%extend
+ sub j0q, 2
+ jnz %%else
+ movss m0, [lineq+4]
+ movss m1, [F_LFTG_K]
+ movss m2, [TWO]
+ divss m1, m2
+ mulss m0, m1
+ movss [lineq+4], m0
+ jmp %%end
+
+%%else:
+ movss m0, [lineq]
+ movss m1, [F_LFTG_X]
+ mulss m0, m1
+ movss [lineq], m0
+ jmp %%end
+
+%%extend:
+ shl i0d, 2
+ shl i1d, 2
+ mov j0q, i0q
+ mov j1q, i1q
+ movups m0, [lineq+j0q+4]
+ shufps m0, m0, q0123
+ movups [lineq+j0q-16], m0
+ movups m0, [lineq+j1q-20]
+ shufps m0, m0, q0123
+ movups [lineq+j1q], m0
+
+ movups m3, [F_LFTG_DELTA]
+ mov j0q, i0q
+ mov j1q, i1q
+ shr j0q, 1
+ sub j0q, 4
+ shr j1q, 1
+ add j1q, 8
+ cmp j0q, j1q
+ jge %%beginloop2
+%%loop1:
+ add j0q, 12
+ cmp j0q, j1q
+ jge %%endloop1
+
+ ;line{2*i,2*(i+1),2*(i+2),2*(i+3)} -= F_LFTG_DELTA*(line{2*i-1,2*(i+1)-1,2*(i+2)-1,2*(i+3)-1}+line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1})
+ movups m0, [lineq+2*j0q-28]
+ movups m4, [lineq+2*j0q-12]
+ movaps m1, m0
+ shufps m0, m4, q3131
+ shufps m1, m4, q2020
+ movups m2, [lineq+2*j0q-24]
+ movups m5, [lineq+2*j0q-8]
+ shufps m2, m5, q3131
+ addps m2, m1
+ mulps m2, m3
+ subps m0, m2
+ movaps m4, m1
+ movlhps m1, m0
+ shufps m1, m1, q3120
+ shufps m4, m0, q3232
+ shufps m4, m4, q3120
+ movups [lineq+2*j0q-28], m1
+ movups [lineq+2*j0q-12], m4
+
+ add j0q, 4
+ cmp j0q, j1q
+ jge %%beginloop2
+ jmp %%loop1
+
+%%endloop1:
+ sub j0q, 12
+%%littleloop1:
+ movss m0, [lineq+2*j0q]
+ movss m1, [lineq+2*j0q-4]
+ movss m2, [lineq+2*j0q+4]
+ addss m1, m2
+ mulss m1, m3
+ subss m0, m1
+ movss [lineq+2*j0q], m0
+ add j0q, 4
+ cmp j0q, j1q
+ jl %%littleloop1
+
+%%beginloop2:
+ movups m3, [F_LFTG_GAMMA]
+ mov j0q, i0q
+ mov j1q, i1q
+ shr j0q, 1
+ sub j0q, 4
+ shr j1q, 1
+ add j1q, 4
+ cmp j0q, j1q
+ jge %%beginloop3
+%%loop2:
+ add j0q, 12
+ cmp j0q, j1q
+ jge %%endloop2
+
+ ;line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1} -= F_LFTG_GAMMA*(line{2*i,2*(i+1),2*(i+2),2*(i+3)}+line{2*i+2,2*(i+1)+2,2*(i+2)+2,2*(i+3)+2})
+ movups m0, [lineq+2*j0q-24]
+ movups m4, [lineq+2*j0q-8]
+ movaps m1, m0
+ shufps m0, m4, q3131
+ shufps m1, m4, q2020
+ movups m2, [lineq+2*j0q-20]
+ movups m5, [lineq+2*j0q-4]
+ shufps m2, m5, q3131
+ addps m2, m1
+ mulps m2, m3
+ subps m0, m2
+ movaps m4, m1
+ movlhps m1, m0
+ shufps m1, m1, q3120
+ shufps m4, m0, q3232
+ shufps m4, m4, q3120
+ movups [lineq+2*j0q-24], m1
+ movups [lineq+2*j0q-8], m4
+
+ add j0q, 4
+ cmp j0q, j1q
+ jge %%beginloop3
+ jmp %%loop2
+
+%%endloop2:
+ sub j0q, 12
+%%littleloop2:
+ movss m0, [lineq+2*j0q+4]
+ movss m1, [lineq+2*j0q]
+ movss m2, [lineq+2*j0q+8]
+ addss m1, m2
+ mulss m1, m3
+ subss m0, m1
+ movss [lineq+2*j0q+4], m0
+ add j0q, 4
+ cmp j0q, j1q
+ jl %%littleloop2
+
+%%beginloop3:
+ movups m3, [F_LFTG_BETA]
+ mov j0q, i0q
+ mov j1q, i1q
+ shr j0q, 1
+ sub j0q, 4
+ shr j1q, 1
+ add j1q, 8
+ cmp j0q, j1q
+ jge %%beginloop4
+%%loop3:
+ add j0q, 12
+ cmp j0q, j1q
+ jge %%endloop3
+
+ ;line{2*i,2*(i+1),2*(i+2),2*(i+3)} += F_LFTG_BETA*(line{2*i-1,2*(i+1)-1,2*(i+2)-1,2*(i+3)-1}+line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1})
+ movups m0, [lineq+2*j0q-28]
+ movups m4, [lineq+2*j0q-12]
+ movaps m1, m0
+ shufps m0, m4, q3131
+ shufps m1, m4, q2020
+ movups m2, [lineq+2*j0q-24]
+ movups m5, [lineq+2*j0q-8]
+ shufps m2, m5, q3131
+ addps m2, m1
+ mulps m2, m3
+ addps m0, m2
+ movaps m4, m1
+ movlhps m1, m0
+ shufps m1, m1, q3120
+ shufps m4, m0, q3232
+ shufps m4, m4, q3120
+ movups [lineq+2*j0q-28], m1
+ movups [lineq+2*j0q-12], m4
+
+ add j0q, 4
+ cmp j0q, j1q
+ jge %%beginloop4
+ jmp %%loop3
+
+%%endloop3:
+ sub j0q, 12
+%%littleloop3:
+ movss m0, [lineq+2*j0q]
+ movss m1, [lineq+2*j0q-4]
+ movss m2, [lineq+2*j0q+4]
+ addss m1, m2
+ mulss m1, m3
+ addss m0, m1
+ movss [lineq+2*j0q], m0
+ add j0q, 4
+ cmp j0q, j1q
+ jl %%littleloop3
+
+%%beginloop4:
+ movups m3, [F_LFTG_ALPHA]
+ mov j0q, i0q
+ mov j1q, i1q
+ shr j0q, 1
+ sub j0q, 4
+ shr j1q, 1
+ add j1q, 4
+ cmp j0q, j1q
+ jge %%end
+%%loop4:
+ add j0q, 12
+ cmp j0q, j1q
+ jge %%endloop4
+
+ ;line{2*i+1,2*(i+1)+1,2*(i+2)+1,2*(i+3)+1} += F_LFTG_ALPHA*(line{2*i,2*(i+1),2*(i+2),2*(i+3)}+line{2*i+2,2*(i+1)+2,2*(i+2)+2,2*(i+3)+2})
+ movups m0, [lineq+2*j0q-24]
+ movups m4, [lineq+2*j0q-8]
+ movaps m1, m0
+ shufps m0, m4, q3131
+ shufps m1, m4, q2020
+ movups m2, [lineq+2*j0q-20]
+ movups m5, [lineq+2*j0q-4]
+ shufps m2, m5, q3131
+ addps m2, m1
+ mulps m2, m3
+ addps m0, m2
+ movaps m4, m1
+ movlhps m1, m0
+ shufps m1, m1, q3120
+ shufps m4, m0, q3232
+ shufps m4, m4, q3120
+ movups [lineq+2*j0q-24], m1
+ movups [lineq+2*j0q-8], m4
+
+ add j0q, 4
+ cmp j0q, j1q
+ jge %%end
+ jmp %%loop4
+
+%%endloop4:
+ sub j0q, 12
+%%littleloop4:
+ movss m0, [lineq+2*j0q+4]
+ movss m1, [lineq+2*j0q]
+ movss m2, [lineq+2*j0q+8]
+ addss m1, m2
+ mulss m1, m3
+ addss m0, m1
+ movss [lineq+2*j0q+4], m0
+ add j0q, 4
+ cmp j0q, j1q
+ jl %%littleloop4
+
+%%end:
+ REP_RET
+%endmacro
+
+INIT_XMM sse
+SR1D97FLOAT 6
+
+%macro SR1D97FLOAT_ 5 ; p, i0, i1, tmp0, tmp1
+ mov %4, %2
+ mov %5, %3
+ add %4, 1
+ cmp %5, %4
+ jg %%extend
+ sub %4, 2
+ jnz %%else
+ movss m0, [%1+4]
+ movss m1, [F_LFTG_K]
+ movss m2, [TWO]
+ divss m1, m2
+ mulss m0, m1
+ movss [%1+4], m0
+ jmp %%end
+
+%%else:
+ movss m0, [%1]
+ movss m1, [F_LFTG_X]
+ mulss m0, m1
+ movss [%1], m0
+ jmp %%end
+
+%%extend:
+ shl %2, 2
+ shl %3, 2
+ mov %4, %2
+ mov %5, %3
+ movups m0, [%1+%4+4]
+ shufps m0, m0, q0123
+ movups [%1+%4-16], m0
+ movups m0, [%1+%5-20]
+ shufps m0, m0, q0123
+ movups [%1+%5], m0
+
+ movups m3, [F_LFTG_DELTA]
+ mov %4, %2
+ mov %5, %3
+ shr %4, 1
+ sub %4, 4
+ shr %5, 1
+ add %5, 8
+ cmp %4, %5
+ jge %%beginloop2
+%%loop1:
+ add %4, 12
+ cmp %4, %5
+ jge %%endloop1
+
+ movups m0, [%1+2*%4-28]
+ movups m4, [%1+2*%4-12]
+ movaps m1, m0
+ shufps m0, m4, q3131
+ shufps m1, m4, q2020
+ movups m2, [%1+2*%4-24]
+ movups m5, [%1+2*%4-8]
+ shufps m2, m5, q3131
+ addps m2, m1
+ mulps m2, m3
+ subps m0, m2
+ movaps m4, m1
+ movlhps m1, m0
+ shufps m1, m1, q3120
+ shufps m4, m0, q3232
+ shufps m4, m4, q3120
+ movups [%1+2*%4-28], m1
+ movups [%1+2*%4-12], m4
+
+ add %4, 4
+ cmp %4, %5
+ jge %%beginloop2
+ jmp %%loop1
+
+%%endloop1:
+ sub %4, 12
+%%littleloop1:
+ movss m0, [%1+2*%4]
+ movss m1, [%1+2*%4-4]
+ movss m2, [%1+2*%4+4]
+ addss m1, m2
+ mulss m1, m3
+ subss m0, m1
+ movss [%1+2*%4], m0
+ add %4, 4
+ cmp %4, %5
+ jl %%littleloop1
+
+%%beginloop2:
+ movups m3, [F_LFTG_GAMMA]
+ mov %4, %2
+ mov %5, %3
+ shr %4, 1
+ sub %4, 4
+ shr %5, 1
+ add %5, 4
+ cmp %4, %5
+ jge %%beginloop3
+%%loop2:
+ add %4, 12
+ cmp %4, %5
+ jge %%endloop2
+
+ movups m0, [%1+2*%4-24]
+ movups m4, [%1+2*%4-8]
+ movaps m1, m0
+ shufps m0, m4, q3131
+ shufps m1, m4, q2020
+ movups m2, [%1+2*%4-20]
+ movups m5, [%1+2*%4-4]
+ shufps m2, m5, q3131
+ addps m2, m1
+ mulps m2, m3
+ subps m0, m2
+ movaps m4, m1
+ movlhps m1, m0
+ shufps m1, m1, q3120
+ shufps m4, m0, q3232
+ shufps m4, m4, q3120
+ movups [%1+2*%4-24], m1
+ movups [%1+2*%4-8], m4
+
+ add %4, 4
+ cmp %4, %5
+ jge %%beginloop3
+ jmp %%loop2
+
+%%endloop2:
+ sub %4, 12
+%%littleloop2:
+ movss m0, [%1+2*%4+4]
+ movss m1, [%1+2*%4]
+ movss m2, [%1+2*%4+8]
+ addss m1, m2
+ mulss m1, m3
+ subss m0, m1
+ movss [%1+2*%4+4], m0
+ add %4, 4
+ cmp %4, %5
+ jl %%littleloop2
+
+%%beginloop3:
+ movups m3, [F_LFTG_BETA]
+ mov %4, %2
+ mov %5, %3
+ shr %4, 1
+ sub %4, 4
+ shr %5, 1
+ add %5, 8
+ cmp %4, %5
+ jge %%beginloop4
+%%loop3:
+ add %4, 12
+ cmp %4, %5
+ jge %%endloop3
+
+ movups m0, [%1+2*%4-28]
+ movups m4, [%1+2*%4-12]
+ movaps m1, m0
+ shufps m0, m4, q3131
+ shufps m1, m4, q2020
+ movups m2, [%1+2*%4-24]
+ movups m5, [%1+2*%4-8]
+ shufps m2, m5, q3131
+ addps m2, m1
+ mulps m2, m3
+ addps m0, m2
+ movaps m4, m1
+ movlhps m1, m0
+ shufps m1, m1, q3120
+ shufps m4, m0, q3232
+ shufps m4, m4, q3120
+ movups [%1+2*%4-28], m1
+ movups [%1+2*%4-12], m4
+
+ add %4, 4
+ cmp %4, %5
+ jge %%beginloop4
+ jmp %%loop3
+
+%%endloop3:
+ sub %4, 12
+%%littleloop3:
+ movss m0, [%1+2*%4]
+ movss m1, [%1+2*%4-4]
+ movss m2, [%1+2*%4+4]
+ addss m1, m2
+ mulss m1, m3
+ addss m0, m1
+ movss [%1+2*%4], m0
+ add %4, 4
+ cmp %4, %5
+ jl %%littleloop3
+
+%%beginloop4:
+ movups m3, [F_LFTG_ALPHA]
+ mov %4, %2
+ mov %5, %3
+ shr %4, 1
+ sub %4, 4
+ shr %5, 1
+ add %5, 4
+ cmp %4, %5
+ jge %%end
+%%loop4:
+ add %4, 12
+ cmp %4, %5
+ jge %%endloop4
+
+ movups m0, [%1+2*%4-24]
+ movups m4, [%1+2*%4-8]
+ movaps m1, m0
+ shufps m0, m4, q3131
+ shufps m1, m4, q2020
+ movups m2, [%1+2*%4-20]
+ movups m5, [%1+2*%4-4]
+ shufps m2, m5, q3131
+ addps m2, m1
+ mulps m2, m3
+ addps m0, m2
+ movaps m4, m1
+ movlhps m1, m0
+ shufps m1, m1, q3120
+ shufps m4, m0, q3232
+ shufps m4, m4, q3120
+ movups [%1+2*%4-24], m1
+ movups [%1+2*%4-8], m4
+
+ add %4, 4
+ cmp %4, %5
+ jge %%end
+ jmp %%loop4
+
+%%endloop4:
+ sub %4, 12
+%%littleloop4:
+ movss m0, [%1+2*%4+4]
+ movss m1, [%1+2*%4]
+ movss m2, [%1+2*%4+8]
+ addss m1, m2
+ mulss m1, m3
+ addss m0, m1
+ movss [%1+2*%4+4], m0
+ add %4, 4
+ cmp %4, %5
+ jl %%littleloop4
+
+%%end:
+ shr %2, 2
+ shr %3, 2
+%endmacro
+
+
+;***********************************************************************
+; ff_hor_sd_float_<opt>(float *line, float *data, int mh, int lh, int lv, int w)
+;***********************************************************************
+%macro HORSDFLOAT 1
+cglobal hor_sd_float, 6, 12, %1, line, data, mh, lh, lv, w, l, lp, i0, i1, j0, j1
+ mov lq, mhq
+ shl lq, 2
+ add lq, lineq
+ shl lhq, 2
+
+ mov lpq, 0
+%%mainloop:
+ ;j0 = w*lp+j
+ mov j0q, wq
+ imul j0q, lpq
+
+ ;j1 = (lh-mh+1)/2 + j0
+ mov j1q, lhq
+ shr j1q, 2
+ sub j1q, mhq
+ add j1q, 1
+ shr j1q, 1
+ add j1q, j0q
+
+ shl j0q, 2
+ shl j1q, 2
+
+ ;i1 = 1-mh
+ mov i1q, 1
+ sub i1q, mhq
+ shl i1q, 2
+
+ ;i0 = mh
+ mov i0q, mhq
+ shl i0q, 2
+
+ cmp i0q, i1q
+ jg %%i1i0
+
+;i0 < i1
+ cmp i1q, lhq
+ jge %%i0
+
+ add i0q, 4
+ cmp i0q, i1q
+ jne %%inci0
+
+;i1 = i0+1
+%%beginloopi0i1: ; entered with i0 == i1 (i0 was advanced for the adjacency test); step i0 back
+ sub i0q, 4
+
+%%loopi0i1:
+ add i1q, 24
+ cmp i1q, lhq
+ jge %%endloopi0i1
+
+ ;l{i0,i0+2,i0+4,i0+6} <- data[j0:j0+3]
+ ;l{i0,i0+3,i0+5,i0+7} = l{i1,i1+2,i1+4,i1+6} <- data[j1:j1+3]
+ movups m0, [dataq+j0q]
+ movups m2, [dataq+j1q]
+ movaps m1, m0
+ movlhps m0, m2
+ shufps m0, m0, q3120
+ shufps m1, m2, q3232
+ shufps m1, m1, q3120
+ movups [lq+i0q], m0
+ movups [lq+i0q+16], m1
+
+ add i1q, 8
+ add i0q, 32
+ add j0q, 16
+ add j1q, 16
+ cmp i1q, lhq
+ jl %%loopi0i1
+ cmp i0q, lhq
+ jge %%sr_1d
+
+ ;i1>=lh & i0<lh
+ movss m0, [dataq+j0q]
+ movss [lq+i0q], m0
+ jmp %%sr_1d
+
+;i1 + 6 >= lh
+%%endloopi0i1:
+ sub i1q, 24
+%%littleloopi0i1:
+
+ ;l[i0] <- data[j0]
+ ;l[i1] <- data[j1]
+ movss m0, [dataq+j0q]
+ movss m1, [dataq+j1q]
+ movss [lq+i0q], m0
+ movss [lq+i1q], m1
+
+ add i0q, 8
+ add i1q, 8
+ add j0q, 4
+ add j1q, 4
+ cmp i1q, lhq
+ jl %%littleloopi0i1
+ cmp i0q, lhq
+ jge %%sr_1d
+
+ ;i1>=lh & i0<lh
+ movss m0, [dataq+j0q]
+ movss [lq+i0q], m0
+ jmp %%sr_1d
+
+;i1 < i0
+%%i1i0:
+ cmp i0q, lhq
+ jge %%i1
+
+ add i1q, 4
+ cmp i0q, i1q
+ jne %%inci1
+
+;i0 = i1+1: interleave data[j1..] (stored at i1) with data[j0..] (stored at i0) into l
+%%beginloopi1i0: ; entered with i0 == i1 (i1 was advanced for the adjacency test); step i1 back
+ sub i1q, 4
+
+%%loopi1i0:
+ add i0q, 24 ; look-ahead on the larger index: is i0 + 6 floats still below lh?
+ cmp i0q, lhq
+ jge %%endloopi1i0
+
+ ;l{i1,i1+2,i1+4,i1+6} <- data[j1:j1+3]
+ ;l{i1+1,i1+3,i1+5,i1+7} = l{i0,i0+2,i0+4,i0+6} <- data[j0:j0+3]
+ movups m0, [dataq+j1q]
+ movups m2, [dataq+j0q]
+ movaps m1, m0
+ movlhps m0, m2
+ shufps m0, m0, q3120
+ shufps m1, m2, q3232
+ shufps m1, m1, q3120
+ movups [lq+i1q], m0
+ movups [lq+i1q+16], m1
+
+ add i0q, 8 ; together with the +24 look-ahead: i0 advances by 8 floats, like i1
+ add i1q, 32
+ add j0q, 16
+ add j1q, 16
+ cmp i0q, lhq
+ jl %%loopi1i0
+ cmp i1q, lhq
+ jge %%sr_1d
+
+ ;i0>=lh & i1<lh
+ movss m0, [dataq+j1q]
+ movss [lq+i1q], m0
+ jmp %%sr_1d
+
+%%endloopi1i0:
+ sub i0q, 24 ; undo the look-ahead, which was applied to i0 (not i1) at %%loopi1i0
+%%littleloopi1i0:
+
+ ;l[i0] <- data[j0]
+ ;l[i1] <- data[j1]
+ movss m0, [dataq+j1q]
+ movss m1, [dataq+j0q]
+ movss [lq+i1q], m0
+ movss [lq+i0q], m1
+
+ add i0q, 8
+ add i1q, 8
+ add j0q, 4
+ add j1q, 4
+ cmp i0q, lhq ; i0 is the larger index here, so it bounds the paired copy
+ jl %%littleloopi1i0
+ cmp i1q, lhq
+ jge %%sr_1d
+
+ ;i0>=lh & i1<lh
+ movss m0, [dataq+j1q]
+ movss [lq+i1q], m0
+ jmp %%sr_1d
+
+;i0<i1 & i1>=lh
+%%i0:
+ cmp i0q, lhq
+ jge %%sr_1d
+ movss m0, [dataq+j0q]
+ movss [lq+i0q], m0
+ add i0q, 8
+ add j0q, 4
+ jmp %%i0
+
+;i1<i0 & i0>=lh
+%%i1:
+ cmp i1q, lhq
+ jge %%sr_1d
+ movss m0, [dataq+j1q]
+ movss [lq+i1q], m0
+ add i1q, 8
+ add j1q, 4
+ jmp %%i1
+
+;i0 < i1-1
+%%inci0:
+ cmp i0q, lhq
+ jge %%sr_1d
+ movss m0, [dataq+j0q]
+ movss [lq+i0q-4], m0
+ add i0q, 8
+ add j0q, 4
+ cmp i0q, i1q
+ je %%beginloopi0i1
+ jmp %%inci0
+
+;i1 < i0-1
+%%inci1:
+ cmp i1q, lhq
+ jge %%sr_1d
+ movss m0, [dataq+j1q]
+ movss [lq+i1q-4], m0
+ add i1q, 8
+ add j1q, 4
+ cmp i0q, i1q
+ je %%beginloopi1i0
+ jmp %%inci1
+
+%%sr_1d:
+ mov i0q, mhq
+ mov i1q, lhq
+ shr i1q, 2
+ add i1q, mhq
+ SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+
+ mov i0q, 0
+ cmp i0q, lhq
+ jge %%endmainloop
+ mov j0q, wq
+ imul j0q, lpq
+ shl j0q, 2
+%%subloop3:
+ add i0q, 12
+ cmp i0q, lhq
+ jge %%endsubloop3
+
+ movups m0, [lq+i0q-12]
+ movups [dataq+j0q], m0
+
+ add i0q, 4
+ add j0q, 16
+ cmp i0q, lhq
+ jge %%endmainloop
+ jmp %%subloop3
+
+%%endsubloop3:
+ sub i0q, 12
+%%littlesubloop3:
+ movss m0, [lq+i0q]
+ movss [dataq+j0q], m0
+
+ add i0q, 4
+ add j0q, 4
+ cmp i0q, lhq
+ jl %%littlesubloop3
+
+%%endmainloop:
+ add lpq, 1
+ cmp lpq, lvq
+ jl %%mainloop
+
+ REP_RET
+%endmacro
+
+INIT_XMM sse
+HORSDFLOAT 6
+
+;***********************************************************************
+; ff_ver_sd_float_<opt>(float *line, float *data, int mv, int lh, int lv, int w)
+;***********************************************************************
+%macro VERSDFLOAT 1
+cglobal ver_sd_float, 6, 12, %1, line, data, mv, lh, lv, w, lp, i0, i1, j0, j1, inc
+ shl mvq, 2
+ add lineq, mvq
+ mov incq, lvq
+ add incq, 12
+ shl incq, 2
+ shl lvq, 2
+ shl wq, 2
+
+ mov lpq, 0
+
+%%mainloop:
+ ;j0 = w*j+lp
+ mov j0q, lpq
+
+ add lpq, 3
+ cmp lpq, lhq
+ jge %%beginmainloop2
+
+ shr lvq, 2
+ shr wq, 2
+ ;j1 = w*(lv-mv+1)/2 + j0
+ mov j1q, lvq
+ sub j1q, mvq
+ add j1q, 1
+ shr j1q, 1
+ imul j1q, wq
+ add j1q, j0q
+
+ shl lvq, 2
+ shl wq, 2
+ shl j1q, 2
+ shl j0q, 2
+
+ ;i1 = 1-mv
+ mov i1q, 4
+ sub i1q, mvq
+
+ ;i0 = mv
+ mov i0q, mvq
+
+ cmp i0q, i1q
+ jg %%i1i0
+
+;i0 < i1
+ cmp i1q, lvq
+ jge %%i0
+
+ add i0q, 4
+ cmp i0q, i1q
+ jne %%inci0
+
+;i1 = i0+1
+%%beginloopi0i1: ; entered with i0 == i1 (i0 was advanced for the adjacency test); step i0 back
+ sub i0q, 4
+
+%%loopi0i1:
+; add i1q, 12
+; cmp i1q, lvq
+; jge %%endloopi0i1
+
+; movlps m0, [dataq+j0q]
+; movhps m0, [dataq+j1q]
+; movlps m1, [dataq+j0q+8]
+; movhps m1, [dataq+j1q+8]
+; add j0q, wq
+; add j1q, wq
+; movlps m2, [dataq+j0q]
+; movhps m2, [dataq+j1q]
+; movlps m3, [dataq+j0q+8]
+; movhps m3, [dataq+j1q+8]
+; movaps m4, m0
+; shufps m0, m2, q2020
+; shufps m4, m2, q3131
+; movaps m2, m4
+; movaps m4, m1
+; shufps m1, m3, q2020
+; shufps m4, m3, q3131
+; movaps m3, m4
+; movups [lineq+i0q], m0
+; add lineq, incq
+; movups [lineq+i0q], m2
+; add lineq, incq
+; movups [lineq+i0q], m1
+; add lineq, incq
+; movups [lineq+i0q], m3
+; sub lineq, incq
+; sub lineq, incq
+; sub lineq, incq
+
+; add i1q, 4
+; add i0q, 16
+; add j0q, wq
+; add j1q, wq
+; cmp i1q, lvq
+; jl %%loopi0i1
+; cmp i0q, lvq
+; jl %%lasti0
+; jmp %%sr_1d
+
+;i1 + 3 >= lv
+%%endloopi0i1:
+; sub i1q, 12
+%%littleloopi0i1:
+
+ movss m0, [dataq+j0q]
+ movss m1, [dataq+j1q]
+ movss [lineq+i0q], m0
+ movss [lineq+i0q+4], m1
+ movss m0, [dataq+j0q+4]
+ movss m1, [dataq+j1q+4]
+ add lineq, incq
+ movss [lineq+i0q], m0
+ movss [lineq+i0q+4], m1
+ movss m0, [dataq+j0q+8]
+ movss m1, [dataq+j1q+8]
+ add lineq, incq
+ movss [lineq+i0q], m0
+ movss [lineq+i0q+4], m1
+ movss m0, [dataq+j0q+12]
+ movss m1, [dataq+j1q+12]
+ add lineq, incq
+ movss [lineq+i0q], m0
+ movss [lineq+i0q+4], m1
+ sub lineq, incq
+ sub lineq, incq
+ sub lineq, incq
+
+ add i1q, 8
+ add i0q, 8
+ add j0q, wq
+ add j1q, wq
+ cmp i1q, lvq
+ jl %%littleloopi0i1
+ cmp i0q, lvq
+ jge %%sr_1d
+
+%%lasti0:
+ movss m0, [dataq+j0q]
+ movss [lineq+i0q], m0
+ add lineq, incq
+ movss m0, [dataq+j0q+4]
+ movss [lineq+i0q], m0
+ add lineq, incq
+ movss m0, [dataq+j0q+8]
+ movss [lineq+i0q], m0
+ add lineq, incq
+ movss m0, [dataq+j0q+12]
+ movss [lineq+i0q], m0
+ sub lineq, incq
+ sub lineq, incq
+ sub lineq, incq
+ jmp %%sr_1d
+
+;i1 < i0
+%%i1i0:
+ cmp i0q, lvq
+ jge %%i1
+
+ add i1q, 4
+ cmp i0q, i1q
+ jne %%inci1
+
+;i0 = i1+1
+%%beginloopi1i0: ; entered with i0 == i1 (i1 was advanced for the adjacency test); step i1 back
+ sub i1q, 4
+
+%%loopi1i0:
+; add i0q, 12
+; cmp i0q, lvq
+; jge %%endloopi0i1
+
+; movlps m0, [dataq+j1q]
+; movhps m0, [dataq+j0q]
+; movlps m1, [dataq+j1q+8]
+; movhps m1, [dataq+j0q+8]
+; add j0q, wq
+; add j1q, wq
+; movlps m2, [dataq+j1q]
+; movhps m2, [dataq+j0q]
+; movlps m3, [dataq+j1q+8]
+; movhps m3, [dataq+j0q+8]
+; movaps m4, m0
+; shufps m0, m2, q2020
+; shufps m4, m2, q3131
+; movaps m2, m4
+; movaps m4, m1
+; shufps m1, m3, q2020
+; shufps m4, m3, q3131
+; movaps m3, m4
+; movups [lineq+i1q], m0
+; add lineq, incq
+; movups [lineq+i1q], m2
+; add lineq, incq
+; movups [lineq+i1q], m1
+; add lineq, incq
+; movups [lineq+i1q], m3
+; sub lineq, incq
+; sub lineq, incq
+; sub lineq, incq
+
+; add i0q, 4
+; add i1q, 16
+; add j1q, wq
+; add j0q, wq
+; cmp i0q, lvq
+; jl %%loopi1i0
+; cmp i1q, lvq
+; jl %%lasti1
+; jmp %%sr_1d
+
+%%endloopi1i0:
+; sub i1q, 12
+%%littleloopi1i0:
+
+ movss m0, [dataq+j1q]
+ movss m1, [dataq+j0q]
+ movss [lineq+i1q], m0
+ movss [lineq+i1q+4], m1
+ movss m0, [dataq+j1q+4]
+ movss m1, [dataq+j0q+4]
+ add lineq, incq
+ movss [lineq+i1q], m0
+ movss [lineq+i1q+4], m1
+ movss m0, [dataq+j1q+8]
+ movss m1, [dataq+j0q+8]
+ add lineq, incq
+ movss [lineq+i1q], m0
+ movss [lineq+i1q+4], m1
+ movss m0, [dataq+j1q+12]
+ movss m1, [dataq+j0q+12]
+ add lineq, incq
+ movss [lineq+i1q], m0
+ movss [lineq+i1q+4], m1
+ sub lineq, incq
+ sub lineq, incq
+ sub lineq, incq
+
+ add i1q, 8
+ add i0q, 8
+ add j0q, wq
+ add j1q, wq
+ cmp i0q, lvq
+ jl %%littleloopi1i0
+ cmp i1q, lvq
+ jge %%sr_1d
+
+%%lasti1:
+ movss m0, [dataq+j1q]
+ movss [lineq+i1q], m0
+ add lineq, incq
+ movss m0, [dataq+j1q+4]
+ movss [lineq+i1q], m0
+ add lineq, incq
+ movss m0, [dataq+j1q+8]
+ movss [lineq+i1q], m0
+ add lineq, incq
+ movss m0, [dataq+j1q+12]
+ movss [lineq+i1q], m0
+ sub lineq, incq
+ sub lineq, incq
+ sub lineq, incq
+ jmp %%sr_1d
+
+;i0<i1 & i1>=lv
+%%i0:
+ cmp i0q, lvq
+ jge %%sr_1d
+ movss m0, [dataq+j0q]
+ movss [lineq+i0q], m0
+ add lineq, incq
+ movss m0, [dataq+j0q+4]
+ movss [lineq+i0q], m0
+ add lineq, incq
+ movss m0, [dataq+j0q+8]
+ movss [lineq+i0q], m0
+ add lineq, incq
+ movss m0, [dataq+j0q+12]
+ movss [lineq+i0q], m0
+ sub lineq, incq
+ sub lineq, incq
+ sub lineq, incq
+ add i0q, 8
+ add j0q, wq
+ jmp %%i0
+
+;i1<i0 & i0>=lh
+%%i1:
+ cmp i1q, lvq
+ jge %%sr_1d
+ movss m0, [dataq+j1q]
+ movss [lineq+i1q], m0
+ add lineq, incq
+ movss m0, [dataq+j1q+4]
+ movss [lineq+i1q], m0
+ add lineq, incq
+ movss m0, [dataq+j1q+8]
+ movss [lineq+i1q], m0
+ add lineq, incq
+ movss m0, [dataq+j1q+12]
+ movss [lineq+i1q], m0
+ sub lineq, incq
+ sub lineq, incq
+ sub lineq, incq
+ add i1q, 8
+ add j1q, wq
+ jmp %%i1
+
+;i0 < i1-1
+%%inci0:
+ cmp i0q, lvq
+ jge %%sr_1d
+ movss m0, [dataq+j0q]
+ movss [lineq+i0q-4], m0
+ add lineq, incq
+ movss m0, [dataq+j0q+4]
+ movss [lineq+i0q-4], m0
+ add lineq, incq
+ movss m0, [dataq+j0q+8]
+ movss [lineq+i0q-4], m0
+ add lineq, incq
+ movss m0, [dataq+j0q+12]
+ movss [lineq+i0q-4], m0
+ sub lineq, incq
+ sub lineq, incq
+ sub lineq, incq
+ add i0q, 8
+ add j0q, wq
+ cmp i0q, i1q
+ je %%beginloopi0i1
+ jmp %%inci0
+
+;i1 < i0-1
+%%inci1:
+ cmp i1q, lvq
+ jge %%sr_1d
+ movss m0, [dataq+j1q]
+ movss [lineq+i1q-4], m0
+ add lineq, incq
+ movss m0, [dataq+j1q+4]
+ movss [lineq+i1q-4], m0
+ add lineq, incq
+ movss m0, [dataq+j1q+8]
+ movss [lineq+i1q-4], m0
+ add lineq, incq
+ movss m0, [dataq+j1q+12]
+ movss [lineq+i1q-4], m0
+ sub lineq, incq
+ sub lineq, incq
+ sub lineq, incq
+ add i1q, 8
+ add j1q, wq
+ cmp i0q, i1q
+ je %%beginloopi1i0
+ jmp %%inci1
+
+%%sr_1d:
+ sub lineq, mvq
+ mov i0q, mvq
+ mov i1q, lvq
+ add i1q, mvq
+ shr i0q, 2
+ shr i1q, 2
+ SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+ add lineq, incq
+ mov i0q, mvq
+ mov i1q, lvq
+ add i1q, mvq
+ shr i0q, 2
+ shr i1q, 2
+ SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+ add lineq, incq
+ mov i0q, mvq
+ mov i1q, lvq
+ add i1q, mvq
+ shr i0q, 2
+ shr i1q, 2
+ SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+ add lineq, incq
+ mov i0q, mvq
+ mov i1q, lvq
+ add i1q, mvq
+ shr i0q, 2
+ shr i1q, 2
+ SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+ sub lineq, incq
+ sub lineq, incq
+ sub lineq, incq
+ add lineq, mvq
+
+ mov i0q, 0
+ ;cmp i0q, lvq
+ ;jge %%endmainloop
+ mov j0q, lpq
+ sub j0q, 3
+ shl j0q, 2
+%%loop3:
+ add i0q, 12
+ cmp i0q, lvq
+ jge %%endloop3
+
+ movups m0, [lineq+i0q-12]
+ add lineq, incq
+ movups m1, [lineq+i0q-12]
+ add lineq, incq
+ movups m2, [lineq+i0q-12]
+ add lineq, incq
+ movups m3, [lineq+i0q-12]
+ movaps m4, m0
+ movaps m5, m2
+ movlhps m0, m1
+ movlhps m2, m3
+ movaps m6, m0
+ shufps m0, m2, q2020
+ shufps m6, m2, q3131
+ movaps m2, m6
+ movhlps m1, m4
+ movhlps m3, m5
+ movaps m6, m1
+ shufps m1, m3, q2020
+ shufps m6, m3, q3131
+ movaps m3, m6
+ movups [dataq+j0q], m0
+ add j0q, wq
+ movups [dataq+j0q], m2
+ add j0q, wq
+ movups [dataq+j0q], m1
+ add j0q, wq
+ movups [dataq+j0q], m3
+ add j0q, wq
+ sub lineq, incq
+ sub lineq, incq
+ sub lineq, incq
+
+ add i0q, 4
+ cmp i0q, lvq
+ jge %%endmainloop
+ jmp %%loop3
+
+%%endloop3:
+ sub i0q, 12
+
+%%littleloop3:
+ movss m0, [lineq+i0q]
+ movss [dataq+j0q], m0
+ add lineq, incq
+ movss m0, [lineq+i0q]
+ movss [dataq+j0q+4], m0
+ add lineq, incq
+ movss m0, [lineq+i0q]
+ movss [dataq+j0q+8], m0
+ add lineq, incq
+ movss m0, [lineq+i0q]
+ movss [dataq+j0q+12], m0
+ sub lineq, incq
+ sub lineq, incq
+ sub lineq, incq
+
+ add i0q, 4
+ add j0q, wq
+ cmp i0q, lvq
+ jl %%littleloop3
+
+%%endmainloop:
+ add lpq, 1
+ cmp lpq, lhq
+ jl %%mainloop
+ jmp %%end
+
+%%beginmainloop2:
+ sub lpq, 3
+%%mainloop2:
+ ;j0 = w*j+lp
+ mov j0q, lpq
+ shl j0q, 2
+
+ ;i0 = mv
+ mov i0q, mvq
+
+ cmp i0q, lvq
+ jge %%beginloop5
+%%loop4:
+ movss m0, [dataq+j0q]
+ movss [lineq+i0q], m0
+
+ add j0q, wq
+ add i0q, 8
+ cmp i0q, lvq
+ jl %%loop4
+
+%%beginloop5:
+ ;i0 = 1-mv
+ mov i0q, 4
+ sub i0q, mvq
+ cmp i0q, lvq
+ jge %%sr_1d_2
+%%loop5:
+ movss m0, [dataq+j0q]
+ movss [lineq+i0q], m0
+
+ add j0q, wq
+ add i0q, 8
+ cmp i0q, lvq
+ jl %%loop5
+
+%%sr_1d_2:
+ sub lineq, mvq
+ mov i0q, mvq
+ mov i1q, lvq
+ add i1q, mvq
+ shr i1q, 2
+ shr i0q, 2
+ SR1D97FLOAT_ lineq, i0q, i1q, j0q, j1q
+ add lineq, mvq
+
+ mov i0q, 0
+ cmp i0q, lvq
+ jge %%endmainloop
+ mov j0q, lpq
+ shl j0q, 2
+%%loop6:
+ movss m0, [lineq+i0q]
+ movss [dataq+j0q], m0
+
+ add j0q, wq
+ add i0q, 4
+ cmp i0q, lvq
+ jl %%loop6
+
+%%endmainloop2:
+ add lpq, 1
+ cmp lpq, lhq
+ jl %%mainloop2
+
+%%end:
+ REP_RET
+%endmacro
+
+INIT_XMM sse
+VERSDFLOAT 7 ; the %%loop3 transpose uses m0-m6, so seven XMM registers must be declared
diff --git a/libavcodec/x86/jpeg2000dsp_init.c b/libavcodec/x86/jpeg2000dsp_init.c
index baa81383ea..04cd01379d 100644
--- a/libavcodec/x86/jpeg2000dsp_init.c
+++ b/libavcodec/x86/jpeg2000dsp_init.c
@@ -19,16 +19,25 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "libavutil/avassert.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/jpeg2000dsp.h"
+#include "libavcodec/jpeg2000dwt.h"
+#include <stdio.h>
void ff_ict_float_sse(void *src0, void *src1, void *src2, int csize);
void ff_ict_float_avx(void *src0, void *src1, void *src2, int csize);
void ff_rct_int_sse2 (void *src0, void *src1, void *src2, int csize);
void ff_rct_int_avx2 (void *src0, void *src1, void *src2, int csize);
+/* Integer arguments are pointer-sized because the assembly reads them through
+ * full-width registers; a plain int leaves the upper half undefined on x86-64. */
+void ff_sr_1d97_float_sse(float *line, intptr_t i0, intptr_t i1);
+void ff_hor_sd_float_sse(float *line, float *data, intptr_t mh, intptr_t lh, intptr_t lv, intptr_t w);
+void ff_ver_sd_float_sse(float *line, float *data, intptr_t mv, intptr_t lh, intptr_t lv, intptr_t w);
+
av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
@@ -48,3 +55,48 @@ av_cold void ff_jpeg2000dsp_init_x86(Jpeg2000DSPContext *c)
c->mct_decode[FF_DWT53] = ff_rct_int_avx2;
}
}
+
+/**
+ * Select the SSE inverse-DWT path for a context.
+ * Sets s->sse when the CPU reports external SSE support and the transform is
+ * the float 9/7 one (FF_DWT97); otherwise s is left untouched.
+ * NOTE(review): the asm kernels request 12 GPRs via cglobal, which x86inc
+ * only provides on x86-64 -- confirm 32-bit builds are guarded.
+ */
+av_cold void ff_jpeg2000dwt_init_x86(DWTContext *s, int type)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_SSE(cpu_flags) && type == FF_DWT97)
+ s->sse = 1;
+}
+
+/**
+ * Float 9/7 inverse DWT built on the SSE assembly kernels.
+ * For each decomposition level the horizontal pass runs first, then the
+ * vertical pass; both work in place on t.
+ *
+ * @param s DWT context; s->f_linebuf provides the scratch line storage
+ * @param t coefficient plane, consecutive rows are w floats apart
+ */
+void dwt_decode97_float_sse(DWTContext *s, float *t)
+{
+ int lev;
+ int w = s->linelen[s->ndeclevels - 1][0];
+ /* position at index 0 of line range [0-5,w+5] cf. extend function */
+ float *line = s->f_linebuf + 5;
+ float *data = t;
+
+ for (lev = 0; lev < s->ndeclevels; lev++) {
+ int lh = s->linelen[lev][0],
+ lv = s->linelen[lev][1],
+ mh = s->mod[lev][0],
+ mv = s->mod[lev][1];
+
+ // HOR_SD
+ ff_hor_sd_float_sse(line, data, mh, lh, lv, w);
+
+ // VER_SD
+ ff_ver_sd_float_sse(line, data, mv, lh, lv, w);
+ }
+}
diff --git a/tests/checkasm/jpeg2000dsp.c b/tests/checkasm/jpeg2000dsp.c
index 48559df085..92f3264674 100644
--- a/tests/checkasm/jpeg2000dsp.c
+++ b/tests/checkasm/jpeg2000dsp.c
@@ -20,6 +20,7 @@
#include "checkasm.h"
#include "libavcodec/jpeg2000dsp.h"
+#include "libavcodec/jpeg2000dwt.h"
#include "libavutil/common.h"
#include "libavutil/internal.h"
#include "libavutil/intreadwrite.h"
--
2.11.0
More information about the ffmpeg-devel
mailing list