[FFmpeg-devel] [PATCH] Altivec version of-altivec h264_h-v_loop_filter_luma
Luca Barbato
lu_zero
Fri May 11 23:10:31 CEST 2007
Guillaume POIRIER wrote:
>
> is that any better?
yes, thank you
> +/* A routine to read an unaligned vector. Thanks for the example code Apple */
> +static inline vector unsigned char read_unaligned(int offset, uint8_t *src)
I'd move this to a common header with a Doxygen comment.
> +#define transpose4x16(r0, r1, r2, r3) { \
> + register vec_u8_t r4; \
> + register vec_u8_t r5; \
> + register vec_u8_t r6; \
> + register vec_u8_t r7; \
> + \
> + r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \
> + r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \
> + r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \
> + r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \
> + \
> + r0 = vec_mergeh(r4, r6); /*all set 0*/ \
> + r1 = vec_mergel(r4, r6); /*all set 1*/ \
> + r2 = vec_mergeh(r5, r7); /*all set 2*/ \
> + r3 = vec_mergel(r5, r7); /*all set 3*/ \
> +}
> +
> +static inline void write16x4(uint8_t *dst, int dst_stride,
> + register vec_u8_t r0, register vec_u8_t r1,
> + register vec_u8_t r2, register vec_u8_t r3) {
> + DECLARE_ALIGNED_16(unsigned char, result[64]);
> + uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
> + int int_dst_stride = dst_stride/4;
> +
> + vec_st(r0, 0, result);
> + vec_st(r1, 16, result);
> + vec_st(r2, 32, result);
> + vec_st(r3, 48, result);
> + /* there has to be a better way!!!! */
> + *dst_int = *src_int;
> + *(dst_int+ int_dst_stride) = *(src_int + 1);
> + *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
> + *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
> + *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
> + *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
> + *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
> + *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
> + *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
> + *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
> + *(dst_int+10*int_dst_stride) = *(src_int + 10);
> + *(dst_int+11*int_dst_stride) = *(src_int + 11);
> + *(dst_int+12*int_dst_stride) = *(src_int + 12);
> + *(dst_int+13*int_dst_stride) = *(src_int + 13);
> + *(dst_int+14*int_dst_stride) = *(src_int + 14);
> + *(dst_int+15*int_dst_stride) = *(src_int + 15);
> +}
> +
> +/* This function does an 6x16 transpose on data in src, and stores it in dst */
> +#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
Wouldn't it be possible to factor something out in order to save some lvsl instructions?
> + register vec_u8_t r0 = read_unaligned(0, src);\
> + register vec_u8_t r1 = read_unaligned( src_stride, src);\
> + register vec_u8_t r2 = read_unaligned(2* src_stride, src);\
> + register vec_u8_t r3 = read_unaligned(3* src_stride, src);\
> + register vec_u8_t r4 = read_unaligned(4* src_stride, src);\
> + register vec_u8_t r5 = read_unaligned(5* src_stride, src);\
> + register vec_u8_t r6 = read_unaligned(6* src_stride, src);\
> + register vec_u8_t r7 = read_unaligned(7* src_stride, src);\
> + register vec_u8_t r14 = read_unaligned(14*src_stride, src);\
> + register vec_u8_t r15 = read_unaligned(15*src_stride, src);\
> + \
> + r8 = read_unaligned( 8*src_stride, src); \
> + r9 = read_unaligned( 9*src_stride, src); \
> + r10 = read_unaligned(10*src_stride, src); \
> + r11 = read_unaligned(11*src_stride, src); \
> + r12 = read_unaligned(12*src_stride, src); \
> + r13 = read_unaligned(13*src_stride, src); \
> + \
> +// out: o = |x-y| < a
> +static inline vec_u8_t diff_lt_altivec (register vec_u8_t x,
> + register vec_u8_t y,
> + register vec_u8_t a) {
> +
Isn't there a simpler way?
> + register vec_u8_t diff = vec_subs(x, y);
> + register vec_u8_t diffneg = vec_subs(y, x);
> + register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
> + o = vec_cmplt(o, a);
> + return o;
> +}
I'm too tired to read further...
lu
--
Luca Barbato
Gentoo/linux Gentoo/PPC
http://dev.gentoo.org/~lu_zero
More information about the ffmpeg-devel
mailing list