[FFmpeg-cvslog] r20720 - trunk/libavcodec/apedec.c
Loren Merritt
lorenm
Thu Dec 3 18:56:35 CET 2009
On Thu, 3 Dec 2009, lorenm wrote:
> Author: lorenm
> Date: Thu Dec 3 18:48:54 2009
> New Revision: 20720
>
> Log:
> avoid an unpredictable branch
> 20% faster predictor_update_filter
I can get another 15% from SIMD, but converting this patch to runtime CPU
detection would be too annoying, so I won't.
--Loren Merritt
-------------- next part --------------
diff --git a/libavcodec/apedec.c b/libavcodec/apedec.c
index a15cd9b..f4d5208 100644
--- a/libavcodec/apedec.c
+++ b/libavcodec/apedec.c
@@ -119,8 +119,8 @@ typedef struct APEPredictor {
int32_t filterA[2];
int32_t filterB[2];
- int32_t coeffsA[2][4]; ///< adaption coefficients
- int32_t coeffsB[2][5]; ///< adaption coefficients
+ DECLARE_ALIGNED_16(int32_t, coeffsA[2][4]); ///< adaption coefficients
+ DECLARE_ALIGNED_16(int32_t, coeffsB[2][8]); ///< adaption coefficients, [2][5] rounded for alignment
int32_t historybuffer[HISTORY_SIZE + PREDICTOR_SIZE];
} APEPredictor;
@@ -475,7 +475,7 @@ static void init_entropy_decoder(APEContext * ctx)
}
static const int32_t initial_coeffs[4] = {
- 360, 317, -109, 98
+ 98, -109, 317, 360
};
static void init_predictor_decoder(APEContext * ctx)
@@ -510,10 +510,10 @@ static av_always_inline int predictor_update_filter(APEPredictor *p, const int d
p->buf[delayA - 1] = p->buf[delayA] - p->buf[delayA - 1];
p->buf[adaptA - 1] = APESIGN(p->buf[delayA - 1]);
- predictionA = p->buf[delayA ] * p->coeffsA[filter][0] +
- p->buf[delayA - 1] * p->coeffsA[filter][1] +
- p->buf[delayA - 2] * p->coeffsA[filter][2] +
- p->buf[delayA - 3] * p->coeffsA[filter][3];
+ predictionA = p->buf[delayA ] * p->coeffsA[filter][3] +
+ p->buf[delayA - 1] * p->coeffsA[filter][2] +
+ p->buf[delayA - 2] * p->coeffsA[filter][1] +
+ p->buf[delayA - 3] * p->coeffsA[filter][0];
/* Apply a scaled first-order filter compression */
p->buf[delayB] = p->filterA[filter ^ 1] - ((p->filterB[filter] * 31) >> 5);
@@ -522,25 +522,57 @@ static av_always_inline int predictor_update_filter(APEPredictor *p, const int d
p->buf[adaptB - 1] = APESIGN(p->buf[delayB - 1]);
p->filterB[filter] = p->filterA[filter ^ 1];
- predictionB = p->buf[delayB ] * p->coeffsB[filter][0] +
- p->buf[delayB - 1] * p->coeffsB[filter][1] +
+ predictionB = p->buf[delayB ] * p->coeffsB[filter][4] +
+ p->buf[delayB - 1] * p->coeffsB[filter][3] +
p->buf[delayB - 2] * p->coeffsB[filter][2] +
- p->buf[delayB - 3] * p->coeffsB[filter][3] +
- p->buf[delayB - 4] * p->coeffsB[filter][4];
+ p->buf[delayB - 3] * p->coeffsB[filter][1] +
+ p->buf[delayB - 4] * p->coeffsB[filter][0];
+
+#if 1
+ sign = APESIGN(decoded);
+ p->coeffsB[filter][4] += p->buf[adaptB] * sign;
+ asm volatile(
+ "movd %0, %%xmm2 \n"
+ "pshufd $0, %%xmm2, %%xmm2 \n"
+ "movdqu %1, %%xmm0 \n"
+ "movdqu %2, %%xmm1 \n"
+ ::"g"(sign),
+ "m"(p->buf[adaptA - 3]), // FIXME doesn't fully specify dependencies
+ "m"(p->buf[adaptB - 4])
+ );
+#endif
p->lastA[filter] = decoded + ((predictionA + (predictionB >> 1)) >> 10);
p->filterA[filter] = p->lastA[filter] + ((p->filterA[filter] * 31) >> 5);
+#if 0
sign = APESIGN(decoded);
- p->coeffsA[filter][0] += p->buf[adaptA ] * sign;
- p->coeffsA[filter][1] += p->buf[adaptA - 1] * sign;
- p->coeffsA[filter][2] += p->buf[adaptA - 2] * sign;
- p->coeffsA[filter][3] += p->buf[adaptA - 3] * sign;
- p->coeffsB[filter][0] += p->buf[adaptB ] * sign;
- p->coeffsB[filter][1] += p->buf[adaptB - 1] * sign;
+ p->coeffsA[filter][3] += p->buf[adaptA ] * sign;
+ p->coeffsA[filter][2] += p->buf[adaptA - 1] * sign;
+ p->coeffsA[filter][1] += p->buf[adaptA - 2] * sign;
+ p->coeffsA[filter][0] += p->buf[adaptA - 3] * sign;
+ p->coeffsB[filter][4] += p->buf[adaptB ] * sign;
+ p->coeffsB[filter][3] += p->buf[adaptB - 1] * sign;
p->coeffsB[filter][2] += p->buf[adaptB - 2] * sign;
- p->coeffsB[filter][3] += p->buf[adaptB - 3] * sign;
- p->coeffsB[filter][4] += p->buf[adaptB - 4] * sign;
+ p->coeffsB[filter][1] += p->buf[adaptB - 3] * sign;
+ p->coeffsB[filter][0] += p->buf[adaptB - 4] * sign;
+#else
+ asm volatile(
+ "movdqa %%xmm2, %%xmm3 \n"
+ "pslld $31, %%xmm2 \n"
+ "psrad $31, %%xmm2 \n"
+ "psignd %%xmm3, %%xmm0 \n"
+ "psignd %%xmm3, %%xmm1 \n"
+ "pand %%xmm2, %%xmm0 \n"
+ "pand %%xmm2, %%xmm1 \n"
+ "paddd %0, %%xmm0 \n"
+ "paddd %1, %%xmm1 \n"
+ "movdqa %%xmm0, %0 \n"
+ "movdqa %%xmm1, %1 \n"
+ :"=m"(p->coeffsA[filter]),
+ "=m"(p->coeffsB[filter])
+ );
+#endif
return p->filterA[filter];
}
@@ -583,10 +615,10 @@ static void predictor_decode_mono(APEContext * ctx, int count)
p->buf[YDELAYA] = currentA;
p->buf[YDELAYA - 1] = p->buf[YDELAYA] - p->buf[YDELAYA - 1];
- predictionA = p->buf[YDELAYA ] * p->coeffsA[0][0] +
- p->buf[YDELAYA - 1] * p->coeffsA[0][1] +
- p->buf[YDELAYA - 2] * p->coeffsA[0][2] +
- p->buf[YDELAYA - 3] * p->coeffsA[0][3];
+ predictionA = p->buf[YDELAYA ] * p->coeffsA[0][3] +
+ p->buf[YDELAYA - 1] * p->coeffsA[0][2] +
+ p->buf[YDELAYA - 2] * p->coeffsA[0][1] +
+ p->buf[YDELAYA - 3] * p->coeffsA[0][0];
currentA = A + (predictionA >> 10);
@@ -594,10 +626,10 @@ static void predictor_decode_mono(APEContext * ctx, int count)
p->buf[YADAPTCOEFFSA - 1] = APESIGN(p->buf[YDELAYA - 1]);
sign = APESIGN(A);
- p->coeffsA[0][0] += p->buf[YADAPTCOEFFSA ] * sign;
- p->coeffsA[0][1] += p->buf[YADAPTCOEFFSA - 1] * sign;
- p->coeffsA[0][2] += p->buf[YADAPTCOEFFSA - 2] * sign;
- p->coeffsA[0][3] += p->buf[YADAPTCOEFFSA - 3] * sign;
+ p->coeffsA[0][3] += p->buf[YADAPTCOEFFSA ] * sign;
+ p->coeffsA[0][2] += p->buf[YADAPTCOEFFSA - 1] * sign;
+ p->coeffsA[0][1] += p->buf[YADAPTCOEFFSA - 2] * sign;
+ p->coeffsA[0][0] += p->buf[YADAPTCOEFFSA - 3] * sign;
p->buf++;
More information about the ffmpeg-cvslog
mailing list