[FFmpeg-devel] [PATCH] more APE optimization

Loren Merritt lorenm
Fri Dec 4 16:19:56 CET 2009


After applying this patch, the only remaining reference to scalarproduct_int16()
is in acelp_pitch_delay.c. As far as I can tell, acelp_pitch_delay.c has never
been compiled. Is it someone's work in progress, or can I delete
scalarproduct_int16() as well?

--Loren Merritt
-------------- next part --------------
29-105% faster apply_filter, 6-90% faster APE decoding on Core 2.
(CPUs other than Core 2 will probably gain much less: most of the speedup comes from avoiding cacheline-split loads with SSSE3, and I haven't written the full gamut of other cacheline-split avoidance modes.)

diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index bdbf338..ecb24b6 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -664,22 +664,16 @@ static void init_filter(APEContext * ctx, APEFilter *f, int16_t * buf, int order
     do_init_filter(&f[1], buf + order * 3 + HISTORY_SIZE, order);
 }
 
-static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits)
+static void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits)
 {
     int res;
     int absres;
 
     while (count--) {
         /* round fixedpoint scalar product */
-        res = (ctx->dsp.scalarproduct_int16(f->delay - order, f->coeffs, order, 0) + (1 << (fracbits - 1))) >> fracbits;
-
-        if (*data < 0)
-            ctx->dsp.add_int16(f->coeffs, f->adaptcoeffs - order, order);
-        else if (*data > 0)
-            ctx->dsp.sub_int16(f->coeffs, f->adaptcoeffs - order, order);
-
+        res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order, f->adaptcoeffs - order, order, APESIGN(*data));
+        res = (res + (1 << (fracbits - 1))) >> fracbits;
         res += *data;
-
         *data++ = res;
 
         /* Update the output history */
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index a04b8a4..552f9a8 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -4298,18 +4298,6 @@ void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, i
     }
 }
 
-static void add_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-       *v1++ += *v2++;
-}
-
-static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
-{
-    while (order--)
-        *v1++ -= *v2++;
-}
-
 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
 {
     int res = 0;
@@ -4320,6 +4308,16 @@ static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int
     return res;
 }
 
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+{ /* C version: returns the sum of v1[i]*v2[i] over 'order' elements and updates v1 in place */
+    int res = 0;
+    while (order--) {
+        res += *v1 * *v2++;   /* accumulate using v1[i] as it was before this call modifies it */
+        *v1++ += *v3++ * mul; /* then v1[i] += v3[i] * mul, stored back into the int16_t element */
+    }
+    return res;
+}
+
 #define W0 2048
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -4848,9 +4846,8 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->vector_clipf = vector_clipf_c;
     c->float_to_int16 = ff_float_to_int16_c;
     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
-    c->add_int16 = add_int16_c;
-    c->sub_int16 = sub_int16_c;
     c->scalarproduct_int16 = scalarproduct_int16_c;
+    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
     c->scalarproduct_float = scalarproduct_float_c;
     c->butterflies_float = butterflies_float_c;
     c->vector_fmul_scalar = vector_fmul_scalar_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index c097461..f1f7993 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -562,21 +562,17 @@ typedef struct DSPContext {
 
     /* ape functions */
     /**
-     * Add contents of the second vector to the first one.
+     * Calculate scalar product of v1 and v2,
+     * and v1[i] += v3[i] * mul
      * @param len length of vectors, should be multiple of 16
      */
-    void (*add_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
-    /**
-     * Add contents of the second vector to the first one.
-     * @param len length of vectors, should be multiple of 16
-     */
-    void (*sub_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
+    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, int16_t *v2, int16_t *v3, int len, int mul);
     /**
      * Calculate scalar product of two vectors.
      * @param len length of vectors, should be multiple of 16
      * @param shift number of bits to discard from product
      */
-    int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift);
+    int32_t (*scalarproduct_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len, int shift);
 
     /* rv30 functions */
     qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 93d4af5..8406b44 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2384,12 +2384,11 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
-void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
-void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
-void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
-void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
 int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
 int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
+int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
 int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
@@ -2951,9 +2950,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         }
         if(mm_flags & FF_MM_MMX2){
 #if HAVE_YASM
-            c->add_int16 = ff_add_int16_mmx2;
-            c->sub_int16 = ff_sub_int16_mmx2;
             c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
 #endif
         }
         if(mm_flags & FF_MM_SSE){
@@ -2975,11 +2973,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->float_to_int16 = float_to_int16_sse2;
             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
 #if HAVE_YASM
-            c->add_int16 = ff_add_int16_sse2;
-            c->sub_int16 = ff_sub_int16_sse2;
             c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
 #endif
         }
+        if((mm_flags & FF_MM_SSSE3) && !(mm_flags & (FF_MM_SSE42|FF_MM_3DNOW)) && HAVE_YASM) // cacheline split
+            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
     }
 
     if (CONFIG_ENCODERS)
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index c8a4230..2199959 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -100,43 +100,8 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2
 
 
 %macro SCALARPRODUCT 1
-; void add_int16(int16_t * v1, int16_t * v2, int order)
-cglobal add_int16_%1, 3,3,2, v1, v2, order
-    shl orderq, 1
-    add v1q, orderq
-    add v2q, orderq
-    neg orderq
-.loop:
-    movu    m0, [v2q + orderq]
-    movu    m1, [v2q + orderq + mmsize]
-    paddw   m0, [v1q + orderq]
-    paddw   m1, [v1q + orderq + mmsize]
-    mova    [v1q + orderq], m0
-    mova    [v1q + orderq + mmsize], m1
-    add     orderq, mmsize*2
-    jl .loop
-    REP_RET
-
-; void sub_int16(int16_t * v1, int16_t * v2, int order)
-cglobal sub_int16_%1, 3,3,4, v1, v2, order
-    shl orderq, 1
-    add v1q, orderq
-    add v2q, orderq
-    neg orderq
-.loop:
-    movu    m2, [v2q + orderq]
-    movu    m3, [v2q + orderq + mmsize]
-    mova    m0, [v1q + orderq]
-    mova    m1, [v1q + orderq + mmsize]
-    psubw   m0, m2
-    psubw   m1, m3
-    mova    [v1q + orderq], m0
-    mova    [v1q + orderq + mmsize], m1
-    add     orderq, mmsize*2
-    jl .loop
-    REP_RET
-
-; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+; int scalarproduct_int16(int16_t * v1, int16_t * v2, int order, int shift)
+; FIXME is this actually used?
 cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
     shl orderq, 1
     add v1q, orderq
@@ -145,10 +110,10 @@ cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
     movd    m3, shiftm
     pxor    m2, m2
 .loop:
-    movu    m0, [v1q + orderq]
-    movu    m1, [v1q + orderq + mmsize]
-    pmaddwd m0, [v2q + orderq]
-    pmaddwd m1, [v2q + orderq + mmsize]
+    movu    m0, [v2q + orderq]
+    movu    m1, [v2q + orderq + mmsize]
+    pmaddwd m0, [v1q + orderq]
+    pmaddwd m1, [v1q + orderq + mmsize]
     paddd   m2, m0
     paddd   m2, m1
     add     orderq, mmsize*2
@@ -165,6 +130,51 @@ cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
     paddd   m2, m0
     movd   eax, m2
     RET
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul ; 4 args loaded: order is used as a register (orderq) below
+    shl orderq, 1 ; order counts int16 elements; convert it to a byte count
+    movd    m7, mulm
+%if mmsize == 16
+    pshuflw m7, m7, 0 ; replicate the low word of mul across the low four words
+    punpcklqdq m7, m7 ; ...and copy those into the high half: mul in every word lane
+%else
+    pshufw  m7, m7, 0 ; replicate the low word of mul across all four words
+%endif
+    pxor    m6, m6 ; m6 accumulates the dword partial sums of the scalar product
+    add v1q, orderq ; point all three vectors at their ends...
+    add v2q, orderq
+    add v3q, orderq
+    neg orderq ; ...and index them with a negative offset that counts up to zero
+.loop:
+    movu    m0, [v2q + orderq] ; v2 is loaded unaligned
+    movu    m1, [v2q + orderq + mmsize]
+    mova    m4, [v1q + orderq] ; v1 is loaded and stored with aligned moves
+    mova    m5, [v1q + orderq + mmsize]
+    movu    m2, [v3q + orderq] ; v3 is loaded unaligned
+    movu    m3, [v3q + orderq + mmsize]
+    pmaddwd m0, m4 ; v1*v2 products, adjacent pairs summed into dwords
+    pmaddwd m1, m5
+    pmullw  m2, m7 ; low 16 bits of v3*mul
+    pmullw  m3, m7
+    paddd   m6, m0
+    paddd   m6, m1
+    paddw   m2, m4 ; new v1 = old v1 + v3*mul (wrapping 16-bit add)
+    paddw   m3, m5
+    mova    [v1q + orderq], m2
+    mova    [v1q + orderq + mmsize], m3
+    add     orderq, mmsize*2 ; two vectors per iteration
+    jl .loop ; continue while the offset is still negative
+%if mmsize == 16
+    movhlps m0, m6 ; fold the high qword of the accumulator onto the low qword
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e ; swap the two low dwords
+%else
+    pshufw  m0, m6, 0x4e ; swap the two dwords
+%endif
+    paddd   m6, m0 ; low dword now holds the full sum
+    movd   eax, m6 ; return value
+    RET
 %endmacro
 
 INIT_MMX
@@ -172,6 +182,87 @@ SCALARPRODUCT mmx2
 INIT_XMM
 SCALARPRODUCT sse2
 
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1: ; one copy of the loop body; the macro argument is the byte shift applied to the aligned v2/v3 loads (0 = none)
+    sub     orderq, mmsize*2 ; step back two vectors; no later instruction in the body writes flags, so this feeds the jg below
+%if %1
+    mova    m1, m4 ; m4 still holds the aligned v2 block loaded just above the current pair
+    mova    m4, [v2q + orderq]
+    mova    m0, [v2q + orderq + mmsize]
+    palignr m1, m0, %1 ; m1 = 16 bytes starting at the shift offset in m0, continuing into the old m4
+    palignr m0, m4, %1 ; m0 = 16 bytes starting at the shift offset in m4, continuing into the block loaded into m0
+    mova    m3, m5 ; same scheme for v3, with m5 as the carried block
+    mova    m5, [v3q + orderq]
+    mova    m2, [v3q + orderq + mmsize]
+    palignr m3, m2, %1
+    palignr m2, m5, %1
+%else
+    mova    m0, [v2q + orderq] ; zero shift: plain aligned loads, m4/m5 are not used
+    mova    m1, [v2q + orderq + mmsize]
+    mova    m2, [v3q + orderq]
+    mova    m3, [v3q + orderq + mmsize]
+%endif
+    pmaddwd m0, [v1q + orderq] ; v1*v2 products, adjacent pairs summed into dwords
+    pmaddwd m1, [v1q + orderq + mmsize]
+    pmullw  m2, m7 ; low 16 bits of v3*m7
+    pmullw  m3, m7
+    paddw   m2, [v1q + orderq] ; new v1 = old v1 + v3*m7 (wrapping 16-bit add)
+    paddw   m3, [v1q + orderq + mmsize]
+    paddd   m6, m0 ; accumulate the dword partial sums in m6
+    paddd   m6, m1
+    mova    [v1q + orderq], m2 ; v1 is written only after both reads of it above
+    mova    [v1q + orderq + mmsize], m3
+    jg .loop%1 ; loop while the offset left by the sub above is still positive
+%if %1
+    jmp .end ; copies with a non-zero shift jump to .end; the zero-shift copy falls through instead
+%endif
+%endmacro
+
+; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16_ssse3, 4,5,8, v1, v2, v3, order, mul
+    shl orderq, 1 ; order counts int16 elements; convert it to a byte offset
+    movd    m7, mulm ; mul is read here, before r4 is reused as scratch below
+    pshuflw m7, m7, 0
+    punpcklqdq m7, m7 ; m7 = low word of mul in all eight word lanes
+    pxor    m6, m6 ; m6 accumulates the dword partial sums
+    mov    r4d, v2d
+    and    r4d, 15 ; r4d = byte offset of v2 within its 16-byte block
+    and    v2q, ~15 ; round v2 down to a 16-byte boundary
+    and    v3q, ~15 ; v3 is rounded down too, but only v2's offset is kept - NOTE(review): assumes v3 has the same misalignment as v2
+    mova    m4, [v2q + orderq] ; prime the carried blocks for the palignr loops - NOTE(review): when v2 is already aligned this reads the 16 bytes past the vector end
+    mova    m5, [v3q + orderq]
+    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+    cmp    r4d, 0
+    je .loop0
+    cmp    r4d, 2
+    je .loop2
+    cmp    r4d, 4
+    je .loop4
+    cmp    r4d, 6
+    je .loop6
+    cmp    r4d, 8
+    je .loop8
+    cmp    r4d, 10
+    je .loop10
+    cmp    r4d, 12
+    je .loop12 ; any offset not matched above falls through into the 14-byte copy that follows
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end: ; reached by the jmp in each non-zero copy, or by fall-through from the zero copy
+    movhlps m0, m6 ; fold the high qword of the accumulator onto the low qword
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e ; swap the two low dwords
+    paddd   m6, m0 ; low dword now holds the full sum
+    movd   eax, m6 ; return value
+    RET
+
 
 
 ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)



More information about the ffmpeg-devel mailing list