[FFmpeg-devel] [PATCH] Altivec version of-altivec h264_h-v_loop_filter_luma

Fri May 11 23:10:31 CEST 2007

Guillaume POIRIER wrote:
> 
> is that any better?

yes, thank you

> +/* A routine to read an unaligned vector.  Thanks for the example code Apple */
> +static inline vector unsigned char read_unaligned(int offset, uint8_t *src)

I'd move this to a common header with a Doxygen comment.

> +#define transpose4x16(r0, r1, r2, r3) {      \
> +    register vec_u8_t r4;                    \
> +    register vec_u8_t r5;                    \
> +    register vec_u8_t r6;                    \
> +    register vec_u8_t r7;                    \
> +                                             \
> +    r4 = vec_mergeh(r0, r2);  /*0, 2 set 0*/ \
> +    r5 = vec_mergel(r0, r2);  /*0, 2 set 1*/ \
> +    r6 = vec_mergeh(r1, r3);  /*1, 3 set 0*/ \
> +    r7 = vec_mergel(r1, r3);  /*1, 3 set 1*/ \
> +                                             \
> +    r0 = vec_mergeh(r4, r6);  /*all set 0*/  \
> +    r1 = vec_mergel(r4, r6);  /*all set 1*/  \
> +    r2 = vec_mergeh(r5, r7);  /*all set 2*/  \
> +    r3 = vec_mergel(r5, r7);  /*all set 3*/  \
> +}
> +
> +static inline void write16x4(uint8_t *dst, int dst_stride,
> +        register vec_u8_t r0, register vec_u8_t r1,
> +        register vec_u8_t r2, register vec_u8_t r3) {
> +    DECLARE_ALIGNED_16(unsigned char, result[64]);
> +    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
> +    int int_dst_stride = dst_stride/4;
> +
> +    vec_st(r0, 0, result);
> +    vec_st(r1, 16, result);
> +    vec_st(r2, 32, result);
> +    vec_st(r3, 48, result);
> +    /* there has to be a better way!!!! */
> +    *dst_int = *src_int;
> +    *(dst_int+   int_dst_stride) = *(src_int + 1);
> +    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
> +    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
> +    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
> +    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
> +    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
> +    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
> +    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
> +    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
> +    *(dst_int+10*int_dst_stride) = *(src_int + 10);
> +    *(dst_int+11*int_dst_stride) = *(src_int + 11);
> +    *(dst_int+12*int_dst_stride) = *(src_int + 12);
> +    *(dst_int+13*int_dst_stride) = *(src_int + 13);
> +    *(dst_int+14*int_dst_stride) = *(src_int + 14);
> +    *(dst_int+15*int_dst_stride) = *(src_int + 15);
> +}
> +
> +/* This function does an 6x16 transpose on data in src, and stores it in dst */
> +#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\

Wouldn't it be possible to factor something out in order to save some lvsl instructions?

> +    register vec_u8_t r0  = read_unaligned(0,             src);\
> +    register vec_u8_t r1  = read_unaligned(   src_stride, src);\
> +    register vec_u8_t r2  = read_unaligned(2* src_stride, src);\
> +    register vec_u8_t r3  = read_unaligned(3* src_stride, src);\
> +    register vec_u8_t r4  = read_unaligned(4* src_stride, src);\
> +    register vec_u8_t r5  = read_unaligned(5* src_stride, src);\
> +    register vec_u8_t r6  = read_unaligned(6* src_stride, src);\
> +    register vec_u8_t r7  = read_unaligned(7* src_stride, src);\
> +    register vec_u8_t r14 = read_unaligned(14*src_stride, src);\
> +    register vec_u8_t r15 = read_unaligned(15*src_stride, src);\
> +                                                               \
> +    r8  = read_unaligned( 8*src_stride, src);                  \
> +    r9  = read_unaligned( 9*src_stride, src);                  \
> +    r10 = read_unaligned(10*src_stride, src);                  \
> +    r11 = read_unaligned(11*src_stride, src);                  \
> +    r12 = read_unaligned(12*src_stride, src);                  \
> +    r13 = read_unaligned(13*src_stride, src);                  \
> +                                                               \
> +// out: o = |x-y| < a
> +static inline vec_u8_t diff_lt_altivec (register vec_u8_t x,
> +                                       register vec_u8_t y,
> +                                       register vec_u8_t a) {
> +

Isn't there a simpler way?

> +    register vec_u8_t diff = vec_subs(x, y);
> +    register vec_u8_t diffneg = vec_subs(y, x);
> +    register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
> +    o = vec_cmplt(o, a);
> +    return o;
> +}

I'm too tired to read further...

lu

-- 

Luca Barbato

Gentoo/linux Gentoo/PPC
http://dev.gentoo.org/~lu_zero