[Ffmpeg-devel] [RFC] VC1 Transform in AltiVec

Kostya kostya.shishkov
Tue Jul 18 05:46:23 CEST 2006


Here is my first attempt to optimize something with processor-specific instructions.
A patch to vc1.c is provided.

Please note that:
a) It is AltiVec-only, so don't try to compile it on x86 or on a machine without AltiVec support.
b) It's just a hack to demonstrate that it works; in the future this will go to ppc/vc1_altivec.c. One quick way to check it against the scalar code is sketched below.
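
Here is the kind of throwaway check I mean (not part of the patch; the helper name is made up, it assumes the scalar entry point is vc1_inv_trans(block, M, N) as in the second hunk, it has to live inside vc1.c, and the AltiVec dispatch the patch adds must be disabled for the reference run, otherwise the AltiVec path gets compared against itself):

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

static int vc1_8x8_selftest(void)
{
    /* vec_ld()/vec_st() need 16-byte aligned data */
    DCTELEM ref[64] __attribute__((aligned(16)));
    DCTELEM alt[64] __attribute__((aligned(16)));
    int i;

    for(i = 0; i < 64; i++)
        ref[i] = alt[i] = (rand() & 0x3FF) - 512; /* small random coefficients */

    vc1_inv_trans(ref, 8, 8);   /* scalar reference (with AltiVec dispatch off) */
    vc1_8x8_altivec(alt);       /* AltiVec version under test                   */

    if(memcmp(ref, alt, sizeof(ref))){
        printf("vc1_8x8_altivec: output differs from scalar code\n");
        return -1;
    }
    return 0;
}

Running it a few thousand times with different random blocks is enough to catch the obvious mistakes.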

The TRANSPOSE8() macro was taken from ppc/mpegvideo_altivec.c.
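For anyone seeing it for the first time: it transposes eight vectors of eight 16-bit coefficients (one 8x8 block) in three rounds of vec_mergeh()/vec_mergel(). A plain scalar equivalent to cross-check against (not part of the patch) is simply:

static void transpose8x8_ref(DCTELEM m[64])
{
    int i, j;
    for(i = 0; i < 8; i++){
        for(j = i + 1; j < 8; j++){
            DCTELEM tmp = m[i*8 + j];
            m[i*8 + j] = m[j*8 + i];
            m[j*8 + i] = tmp;
        }
    }
}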

I'd like to hear from people who know this stuff whether I took the right approach (and any further optimization suggestions).
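
For reviewers checking the arithmetic: STEP8() builds the VC-1 transform multipliers purely from shifts and adds, e.g. the src0/src4 pair becomes 12*(src0 +/- src4) + rounder and the src2/src6 pair becomes 16*src2 + 6*src6. A scalar version of the first identity, just to make the trick explicit (not part of the patch):

#include <assert.h>

static void step8_mul_check(int a, int b, int rnd)
{
    int t = (a + b) << 2;       /* vec_sl(vec_add(src0, src4), vec_2)         */
    t = (t + t) + (t + rnd);    /* vec_add(vec_add(t0, t0), vec_add(t0, rnd)) */
    assert(t == 12 * (a + b) + rnd);
}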

An MMX version will follow.
-------------- next part --------------
--- vc1_svn.c	2006-07-16 07:47:53.000000000 +0300
+++ vc1.c	2006-07-17 19:09:12.000000000 +0300
@@ -716,6 +716,192 @@
     return 0;
 }
 
+#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
+do { \
+    __typeof__(a)  _A1, _B1, _C1, _D1, _E1, _F1, _G1, _H1; \
+    __typeof__(a)  _A2, _B2, _C2, _D2, _E2, _F2, _G2, _H2; \
+ \
+    _A1 = vec_mergeh (a, e); \
+    _B1 = vec_mergel (a, e); \
+    _C1 = vec_mergeh (b, f); \
+    _D1 = vec_mergel (b, f); \
+    _E1 = vec_mergeh (c, g); \
+    _F1 = vec_mergel (c, g); \
+    _G1 = vec_mergeh (d, h); \
+    _H1 = vec_mergel (d, h); \
+ \
+    _A2 = vec_mergeh (_A1, _E1); \
+    _B2 = vec_mergel (_A1, _E1); \
+    _C2 = vec_mergeh (_B1, _F1); \
+    _D2 = vec_mergel (_B1, _F1); \
+    _E2 = vec_mergeh (_C1, _G1); \
+    _F2 = vec_mergel (_C1, _G1); \
+    _G2 = vec_mergeh (_D1, _H1); \
+    _H2 = vec_mergel (_D1, _H1); \
+ \
+    a = vec_mergeh (_A2, _E2); \
+    b = vec_mergel (_A2, _E2); \
+    c = vec_mergeh (_B2, _F2); \
+    d = vec_mergel (_B2, _F2); \
+    e = vec_mergeh (_C2, _G2); \
+    f = vec_mergel (_C2, _G2); \
+    g = vec_mergeh (_D2, _H2); \
+    h = vec_mergel (_D2, _H2); \
+} while (0)
+
+#define STEP8(src0, src1, src2, src3, src4, src5, src6, src7, rnd_add) \
+ do {\
+    t0 = vec_sl(vec_add(src0, src4), vec_2); \
+    t0 = vec_add(vec_add(t0, t0), vec_add(t0, rnd_add)); \
+    t1 = vec_sl(vec_sub(src0, src4), vec_2); \
+    t1 = vec_add(vec_add(t1, t1), vec_add(t1, rnd_add)); \
+    t2 = vec_sl(vec_add(vec_add(src6, src6), src6), vec_1); \
+    t2 = vec_add(t2, vec_sl(src2, vec_4)); \
+    t3 = vec_sl(vec_add(vec_add(src2, src2), src2), vec_1); \
+    t3 = vec_sub(t3, vec_sl(src6, vec_4)); \
+\
+    t4 = vec_add(t0, t2); \
+    t5 = vec_add(t1, t3); \
+    t6 = vec_sub(t1, t3); \
+    t7 = vec_sub(t0, t2); \
+\
+    t0 = vec_sl(vec_add(src1, src3), vec_1); \
+    t0 = vec_sl(vec_add(t0, src5), vec_1); \
+    t0 = vec_sl(vec_add(t0, src7), vec_2); \
+    t0 = vec_add(t0, vec_sub(src5, src3)); \
+\
+    t1 = vec_sl(vec_sub(src1, src5), vec_1); \
+    t1 = vec_sl(vec_sub(t1, src7), vec_1); \
+    t1 = vec_sl(vec_sub(t1, src3), vec_2); \
+    t1 = vec_sub(t1, vec_add(src1, src7)); \
+\
+    t2 = vec_sl(vec_sub(src7, src3), vec_1); \
+    t2 = vec_sl(vec_add(t2, src1), vec_1); \
+    t2 = vec_sl(vec_add(t2, src5), vec_2); \
+    t2 = vec_add(t2, vec_sub(src1, src7)); \
+\
+    t3 = vec_sl(vec_sub(src5, src7), vec_1); \
+    t3 = vec_sl(vec_sub(t3, src3), vec_1); \
+    t3 = vec_sl(vec_add(t3, src1), vec_2); \
+    t3 = vec_sub(t3, vec_add(src3, src5)); \
+}while(0)
+
+#define SHIFT_HOR(src0, src1, src2, src3, src4, src5, src6, src7) \
+do { \
+    src0 = vec_sra(vec_add(t4, t0), vec_3); \
+    src1 = vec_sra(vec_add(t5, t1), vec_3); \
+    src2 = vec_sra(vec_add(t6, t2), vec_3); \
+    src3 = vec_sra(vec_add(t7, t3), vec_3); \
+    src4 = vec_sra(vec_sub(t7, t3), vec_3); \
+    src5 = vec_sra(vec_sub(t6, t2), vec_3); \
+    src6 = vec_sra(vec_sub(t5, t1), vec_3); \
+    src7 = vec_sra(vec_sub(t4, t0), vec_3); \
+}while(0)
+
+#define SHIFT_VERT(src0, src1, src2, src3, src4, src5, src6, src7) \
+do { \
+    src0 = vec_sra(vec_add(t4, t0), vec_7); \
+    src1 = vec_sra(vec_add(t5, t1), vec_7); \
+    src2 = vec_sra(vec_add(t6, t2), vec_7); \
+    src3 = vec_sra(vec_add(t7, t3), vec_7); \
+    src4 = vec_sra(vec_add(vec_sub(t7, t3), vec_1), vec_7); \
+    src5 = vec_sra(vec_add(vec_sub(t6, t2), vec_1), vec_7); \
+    src6 = vec_sra(vec_add(vec_sub(t5, t1), vec_1), vec_7); \
+    src7 = vec_sra(vec_add(vec_sub(t4, t0), vec_1), vec_7); \
+}while(0)
+
+static void vc1_8x8_altivec(DCTELEM block[64])
+{
+    vector signed short ssrc0, ssrc1, ssrc2, ssrc3, ssrc4, ssrc5, ssrc6, ssrc7;
+    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
+    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
+    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
+    const vector signed int vec_64 = {64, 64, 64, 64};
+    const vector signed int vec_7 = {7, 7, 7, 7};
+    const vector signed int vec_4 = {4, 4, 4, 4};
+    const vector signed int vec_3 = {3, 3, 3, 3};
+    const vector signed int vec_2 = {2, 2, 2, 2};
+    const vector signed int vec_1 = {1, 1, 1, 1};
+
+    ssrc0 = vec_ld(  0, block);
+    ssrc1 = vec_ld( 16, block);
+    ssrc2 = vec_ld( 32, block);
+    ssrc3 = vec_ld( 48, block);
+    ssrc4 = vec_ld( 64, block);
+    ssrc5 = vec_ld( 80, block);
+    ssrc6 = vec_ld( 96, block);
+    ssrc7 = vec_ld(112, block);
+
+    TRANSPOSE8(ssrc0, ssrc1, ssrc2, ssrc3, ssrc4, ssrc5, ssrc6, ssrc7);
+    s0 = vec_unpackl(ssrc0);
+    s1 = vec_unpackl(ssrc1);
+    s2 = vec_unpackl(ssrc2);
+    s3 = vec_unpackl(ssrc3);
+    s4 = vec_unpackl(ssrc4);
+    s5 = vec_unpackl(ssrc5);
+    s6 = vec_unpackl(ssrc6);
+    s7 = vec_unpackl(ssrc7);
+    s8 = vec_unpackh(ssrc0);
+    s9 = vec_unpackh(ssrc1);
+    sA = vec_unpackh(ssrc2);
+    sB = vec_unpackh(ssrc3);
+    sC = vec_unpackh(ssrc4);
+    sD = vec_unpackh(ssrc5);
+    sE = vec_unpackh(ssrc6);
+    sF = vec_unpackh(ssrc7);
+
+    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4);
+    SHIFT_HOR(s0, s1, s2, s3, s4, s5, s6, s7);
+    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4);
+    SHIFT_HOR(s8, s9, sA, sB, sC, sD, sE, sF);
+    ssrc0 = vec_pack(s8, s0);
+    ssrc1 = vec_pack(s9, s1);
+    ssrc2 = vec_pack(sA, s2);
+    ssrc3 = vec_pack(sB, s3);
+    ssrc4 = vec_pack(sC, s4);
+    ssrc5 = vec_pack(sD, s5);
+    ssrc6 = vec_pack(sE, s6);
+    ssrc7 = vec_pack(sF, s7);
+
+    TRANSPOSE8(ssrc0, ssrc1, ssrc2, ssrc3, ssrc4, ssrc5, ssrc6, ssrc7);
+    s0 = vec_unpackl(ssrc0);
+    s1 = vec_unpackl(ssrc1);
+    s2 = vec_unpackl(ssrc2);
+    s3 = vec_unpackl(ssrc3);
+    s4 = vec_unpackl(ssrc4);
+    s5 = vec_unpackl(ssrc5);
+    s6 = vec_unpackl(ssrc6);
+    s7 = vec_unpackl(ssrc7);
+    s8 = vec_unpackh(ssrc0);
+    s9 = vec_unpackh(ssrc1);
+    sA = vec_unpackh(ssrc2);
+    sB = vec_unpackh(ssrc3);
+    sC = vec_unpackh(ssrc4);
+    sD = vec_unpackh(ssrc5);
+    sE = vec_unpackh(ssrc6);
+    sF = vec_unpackh(ssrc7);
+    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64);
+    SHIFT_VERT(s0, s1, s2, s3, s4, s5, s6, s7);
+    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64);
+    SHIFT_VERT(s8, s9, sA, sB, sC, sD, sE, sF);
+    ssrc0 = vec_pack(s8, s0);
+    ssrc1 = vec_pack(s9, s1);
+    ssrc2 = vec_pack(sA, s2);
+    ssrc3 = vec_pack(sB, s3);
+    ssrc4 = vec_pack(sC, s4);
+    ssrc5 = vec_pack(sD, s5);
+    ssrc6 = vec_pack(sE, s6);
+    ssrc7 = vec_pack(sF, s7);
+
+    vec_st(ssrc0,  0, block);
+    vec_st(ssrc1, 16, block);
+    vec_st(ssrc2, 32, block);
+    vec_st(ssrc3, 48, block);
+    vec_st(ssrc4, 64, block);
+    vec_st(ssrc5, 80, block);
+    vec_st(ssrc6, 96, block);
+    vec_st(ssrc7,112, block);
+}
 
 /** Do inverse transform
  */
@@ -725,7 +911,11 @@
     register int t1,t2,t3,t4,t5,t6,t7,t8;
     DCTELEM *src, *dst;
 
+    if((M&N) == 8){
+        vc1_8x8_altivec(block);
+        return;
+    }
     src = block;
     dst = block;
     if(M==4){
         for(i = 0; i < N; i++){


