[FFmpeg-devel] [PATCH] Dsputilize some functions from APE decode 1/2 - Altivec implementation

Sat Jul 5 20:18:13 CEST 2008

On Sat, Jul 05, 2008 at 09:17:08PM +0300, Kostya wrote:
> Here's Altivec version of $subj. SSE2 version will follow next week.

I know the attached patch was approved, but this will give more speedup.
-------------- next part --------------
Index: libavcodec/ppc/dsputil_altivec.c
===================================================================

--- libavcodec/ppc/dsputil_altivec.c	(revision 14044)
+++ libavcodec/ppc/dsputil_altivec.c	(working copy)
@@ -1484,6 +1484,65 @@
 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
 }
 
+/* v1[i] += v2[i] for 0 <= i < order; v1 must be 16-byte aligned, v2 may be unaligned. */
+static void add_int16_altivec(int16_t * v1, int16_t * v2, int order)
+{
+    int i;
+    register vector signed short vec;
+
+    /* Eight lanes per pass, so order is effectively rounded up to a multiple of 8. */
+    for(i = 0; i < order; i += 8, v1 += 8, v2 += 8){
+        /* Unaligned load: offset 15 (not 16) never touches a block past the wanted data. */
+        vec = vec_perm(vec_ld(0, v2), vec_ld(15, v2), vec_lvsl(0, v2));
+        vec_st(vec_add(vec_ld(0, v1), vec), 0, v1);
+    }
+}
+
+/* v1[i] -= v2[i] for 0 <= i < order; v1 must be 16-byte aligned, v2 may be unaligned. */
+static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order)
+{
+    int i;
+    register vector signed short vec;
+
+    /* Eight lanes per pass, so order is effectively rounded up to a multiple of 8. */
+    for(i = 0; i < order; i += 8, v1 += 8, v2 += 8){
+        /* Unaligned load: offset 15 (not 16) never touches a block past the wanted data. */
+        vec = vec_perm(vec_ld(0, v2), vec_ld(15, v2), vec_lvsl(0, v2));
+        vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
+    }
+}
+
+/* Dot product of v1 and v2 over order elements (v2 16-byte aligned, v1 may be unaligned). */
+static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order, const int shift)
+{
+    int i;
+    register vector signed short vec1;
+    register const vector signed int zero = vec_splat_s32(0);
+    register vector signed int res = vec_splat_s32(0), t;
+    register vector unsigned int shifts;
+    int32_t ires __attribute__((aligned(16)));
+
+    /* vec_splat_u32() only accepts a 5-bit literal, so build the shift count bit by bit. */
+    shifts = (vector unsigned int)zero;
+    if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
+    if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
+    if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
+    if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
+    if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));
+
+    for(i = 0; i < order; i += 8, v1 += 8, v2 += 8){
+        /* Unaligned load: offset 15 (not 16) never touches a block past the wanted data. */
+        vec1 = vec_perm(vec_ld(0, v1), vec_ld(15, v1), vec_lvsl(0, v1));
+        t = vec_msum(vec1, vec_ld(0, v2), zero); /* each lane = sum of two adjacent products */
+        t = vec_sra(t, shifts); /* lanes are signed: arithmetic shift, vec_sr would zero-fill */
+        res = vec_sums(t, res); /* saturating sum across lanes, accumulated in element 3 */
+    }
+    res = vec_splat(res, 3);
+    vec_ste(res, 0, &ires);
+    return ires;
+}
+
+
 void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
 {
     c->pix_abs[0][1] = sad16_x2_altivec;
@@ -1515,4 +1574,8 @@
     c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
     if (ENABLE_VORBIS_DECODER)
         c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
+
+    c->add_int16 = add_int16_altivec;
+    c->sub_int16 = sub_int16_altivec;
+    c->scalarproduct_int16 = scalarproduct_int16_altivec;
 }