[FFmpeg-cvslog] r20720 - trunk/libavcodec/apedec.c

Loren Merritt lorenm
Thu Dec 3 18:56:35 CET 2009


On Thu, 3 Dec 2009, lorenm wrote:

> Author: lorenm
> Date: Thu Dec  3 18:48:54 2009
> New Revision: 20720
>
> Log:
> avoid an unpredictable branch
> 20% faster predictor_update_filter

I can get another 15% from SIMD. But converting this patch to runtime CPU 
detection would be too annoying, so I won't.

--Loren Merritt
-------------- next part --------------
diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index a15cd9b..f4d5208 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -119,8 +119,8 @@ typedef struct APEPredictor {
     int32_t filterA[2];
     int32_t filterB[2];
 
-    int32_t coeffsA[2][4];  ///< adaption coefficients
-    int32_t coeffsB[2][5];  ///< adaption coefficients
+    DECLARE_ALIGNED_16(int32_t, coeffsA[2][4]);  ///< adaption coefficients
+    DECLARE_ALIGNED_16(int32_t, coeffsB[2][8]);  ///< adaption coefficients, [2][5] rounded for alignment
     int32_t historybuffer[HISTORY_SIZE + PREDICTOR_SIZE];
 } APEPredictor;
 
@@ -475,7 +475,7 @@ static void init_entropy_decoder(APEContext * ctx)
 }
 
 static const int32_t initial_coeffs[4] = {
-    360, 317, -109, 98
+    98, -109, 317, 360
 };
 
 static void init_predictor_decoder(APEContext * ctx)
@@ -510,10 +510,10 @@ static av_always_inline int predictor_update_filter(APEPredictor *p, const int d
     p->buf[delayA - 1] = p->buf[delayA] - p->buf[delayA - 1];
     p->buf[adaptA - 1] = APESIGN(p->buf[delayA - 1]);
 
-    predictionA = p->buf[delayA    ] * p->coeffsA[filter][0] +
-                  p->buf[delayA - 1] * p->coeffsA[filter][1] +
-                  p->buf[delayA - 2] * p->coeffsA[filter][2] +
-                  p->buf[delayA - 3] * p->coeffsA[filter][3];
+    predictionA = p->buf[delayA    ] * p->coeffsA[filter][3] +
+                  p->buf[delayA - 1] * p->coeffsA[filter][2] +
+                  p->buf[delayA - 2] * p->coeffsA[filter][1] +
+                  p->buf[delayA - 3] * p->coeffsA[filter][0];
 
     /*  Apply a scaled first-order filter compression */
     p->buf[delayB]     = p->filterA[filter ^ 1] - ((p->filterB[filter] * 31) >> 5);
@@ -522,25 +522,57 @@ static av_always_inline int predictor_update_filter(APEPredictor *p, const int d
     p->buf[adaptB - 1] = APESIGN(p->buf[delayB - 1]);
     p->filterB[filter] = p->filterA[filter ^ 1];
 
-    predictionB = p->buf[delayB    ] * p->coeffsB[filter][0] +
-                  p->buf[delayB - 1] * p->coeffsB[filter][1] +
+    predictionB = p->buf[delayB    ] * p->coeffsB[filter][4] +
+                  p->buf[delayB - 1] * p->coeffsB[filter][3] +
                   p->buf[delayB - 2] * p->coeffsB[filter][2] +
-                  p->buf[delayB - 3] * p->coeffsB[filter][3] +
-                  p->buf[delayB - 4] * p->coeffsB[filter][4];
+                  p->buf[delayB - 3] * p->coeffsB[filter][1] +
+                  p->buf[delayB - 4] * p->coeffsB[filter][0];
+
+#if 1
+    sign = APESIGN(decoded);
+    p->coeffsB[filter][4] += p->buf[adaptB] * sign;
+    asm volatile(
+        "movd %0, %%xmm2 \n"
+        "pshufd $0, %%xmm2, %%xmm2 \n"
+        "movdqu %1, %%xmm0 \n"
+        "movdqu %2, %%xmm1 \n"
+        ::"g"(sign),
+          "m"(p->buf[adaptA - 3]), // FIXME doesn't fully specify dependencies
+          "m"(p->buf[adaptB - 4])
+    );
+#endif
 
     p->lastA[filter] = decoded + ((predictionA + (predictionB >> 1)) >> 10);
     p->filterA[filter] = p->lastA[filter] + ((p->filterA[filter] * 31) >> 5);
 
+#if 0
     sign = APESIGN(decoded);
-    p->coeffsA[filter][0] += p->buf[adaptA    ] * sign;
-    p->coeffsA[filter][1] += p->buf[adaptA - 1] * sign;
-    p->coeffsA[filter][2] += p->buf[adaptA - 2] * sign;
-    p->coeffsA[filter][3] += p->buf[adaptA - 3] * sign;
-    p->coeffsB[filter][0] += p->buf[adaptB    ] * sign;
-    p->coeffsB[filter][1] += p->buf[adaptB - 1] * sign;
+    p->coeffsA[filter][3] += p->buf[adaptA    ] * sign;
+    p->coeffsA[filter][2] += p->buf[adaptA - 1] * sign;
+    p->coeffsA[filter][1] += p->buf[adaptA - 2] * sign;
+    p->coeffsA[filter][0] += p->buf[adaptA - 3] * sign;
+    p->coeffsB[filter][4] += p->buf[adaptB    ] * sign;
+    p->coeffsB[filter][3] += p->buf[adaptB - 1] * sign;
     p->coeffsB[filter][2] += p->buf[adaptB - 2] * sign;
-    p->coeffsB[filter][3] += p->buf[adaptB - 3] * sign;
-    p->coeffsB[filter][4] += p->buf[adaptB - 4] * sign;
+    p->coeffsB[filter][1] += p->buf[adaptB - 3] * sign;
+    p->coeffsB[filter][0] += p->buf[adaptB - 4] * sign;
+#else
+    asm volatile(
+        "movdqa %%xmm2, %%xmm3 \n"
+        "pslld $31, %%xmm2 \n"
+        "psrad $31, %%xmm2 \n"
+        "psignd %%xmm3, %%xmm0 \n"
+        "psignd %%xmm3, %%xmm1 \n"
+        "pand %%xmm2, %%xmm0 \n"
+        "pand %%xmm2, %%xmm1 \n"
+        "paddd %0, %%xmm0 \n"
+        "paddd %1, %%xmm1 \n"
+        "movdqa %%xmm0, %0 \n"
+        "movdqa %%xmm1, %1 \n"
+        :"=m"(p->coeffsA[filter]),
+         "=m"(p->coeffsB[filter])
+    );
+#endif
 
     return p->filterA[filter];
 }
@@ -583,10 +615,10 @@ static void predictor_decode_mono(APEContext * ctx, int count)
         p->buf[YDELAYA] = currentA;
         p->buf[YDELAYA - 1] = p->buf[YDELAYA] - p->buf[YDELAYA - 1];
 
-        predictionA = p->buf[YDELAYA    ] * p->coeffsA[0][0] +
-                      p->buf[YDELAYA - 1] * p->coeffsA[0][1] +
-                      p->buf[YDELAYA - 2] * p->coeffsA[0][2] +
-                      p->buf[YDELAYA - 3] * p->coeffsA[0][3];
+        predictionA = p->buf[YDELAYA    ] * p->coeffsA[0][3] +
+                      p->buf[YDELAYA - 1] * p->coeffsA[0][2] +
+                      p->buf[YDELAYA - 2] * p->coeffsA[0][1] +
+                      p->buf[YDELAYA - 3] * p->coeffsA[0][0];
 
         currentA = A + (predictionA >> 10);
 
@@ -594,10 +626,10 @@ static void predictor_decode_mono(APEContext * ctx, int count)
         p->buf[YADAPTCOEFFSA - 1] = APESIGN(p->buf[YDELAYA - 1]);
 
         sign = APESIGN(A);
-        p->coeffsA[0][0] += p->buf[YADAPTCOEFFSA    ] * sign;
-        p->coeffsA[0][1] += p->buf[YADAPTCOEFFSA - 1] * sign;
-        p->coeffsA[0][2] += p->buf[YADAPTCOEFFSA - 2] * sign;
-        p->coeffsA[0][3] += p->buf[YADAPTCOEFFSA - 3] * sign;
+        p->coeffsA[0][3] += p->buf[YADAPTCOEFFSA    ] * sign;
+        p->coeffsA[0][2] += p->buf[YADAPTCOEFFSA - 1] * sign;
+        p->coeffsA[0][1] += p->buf[YADAPTCOEFFSA - 2] * sign;
+        p->coeffsA[0][0] += p->buf[YADAPTCOEFFSA - 3] * sign;
 
         p->buf++;
 



More information about the ffmpeg-cvslog mailing list