[Ffmpeg-devel] [PATCH] put_mpeg4_qpel16_h_lowpass altivec, take 2
Luca Barbato
lu_zero
Sun Nov 26 17:23:35 CET 2006
Brian Foley wrote:
> On Mon, Nov 20, 2006 at 02:43:17AM +0100, Michael Niedermayer wrote:
>> Hi
>>
>> On Sun, Nov 19, 2006 at 11:20:14PM +0000, Brian Foley wrote:
>>> +static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
>> code duplication, move copy_block17 to a common header like dsputil.h or
>> dsputil_internal.h or whatever, dont copy and paste it
>
> OK, done. I've moved all the copy_block* stuff into dsputil.h. sh4 was
> using a copy of this too as it turns out.
make it a separate patch alone.
>
> I'm not really sure the best way to handle this. I could have an aligned
> load in an 'if (((int) src & 0xf) == 0)', but I suspect the branch would
> hurt us quite badly. The other approach is to have 3 other functions
> where we assert src1 or src2 or both (and their strides) are aligned.
requires benchmark =/
>
> I guess I should look at it with simg4 at some point, but there are much
> more CPU intensive functions that could be sped up first.
try using valgrind callgrind and oprofile if you could ^^;
>
> Interesting. I implemented your suggestion, and it actually does work out
> a little faster, and it saves on setting up a couple of constant vectors
> too, which is no bad thing. The other tweaks suggested here save reading
> another constant vector from memory too.
good =)
> I've tidied up the use of types so it compiles cleanly with GCC 3.4 on
> Linux/PPC. Hopefully it should be about ready to commit now.
Comment inlined as usual
>
> +
> +static void put_pixels16_l2_altivec(uint8_t *dst, const uint8_t *src1,
> + const uint8_t *src2, int dst_stride, int src_stride1,
> + int src_stride2, int h)
> +{
> + register vector unsigned char src1v, src2v, dstv;
> + register vector unsigned char tmp1, tmp2, mask, edges, align;
> + int i;
> +
> + for(i=0; i<h; i++) {
> + /* Unaligned load */
> + src1v = vec_perm(
> + vec_ld(0, src1), vec_ld(15, src1), vec_lvsl(0, src1));
> + src2v = vec_perm(
> + vec_ld(0, src2), vec_ld(15, src2), vec_lvsl(0, src2));
if the stride is a multiple of 16 you could put vec_lvsl out the loop
> +
> + /*Altivec's vec_avg is exactly the (a+b+1)>>1 that we want */
> + dstv = vec_avg(src1v, src2v);
> +
> + if ((int)dst & 0xf) {
> + /* Unaligned store */
> + tmp2 = vec_ld(15, dst);
> + tmp1 = vec_ld(0, dst);
> +
> + mask = vec_lvsl(0, dst);
> + edges = vec_perm(tmp2, tmp1, mask);
> + align = vec_lvsr(0, dst);
same for mask and align
> +static void put_mpeg4_qpel16_h_lowpass_altivec(uint8_t *dst, uint8_t *src,
> + int dstStride, int srcStride, int height)
> +{
> + POWERPC_PERF_DECLARE(put_mpeg4_qpel16_h_lowpass_altivec, 1);
are you really using those stuff? if not you could avoid them (less
lines is always better)
> Index: ppc/dsputil_ppc.c
> ===================================================================
> --- ppc/dsputil_ppc.c (revision 7166)
> +++ ppc/dsputil_ppc.c (working copy)
> @@ -35,6 +35,7 @@
>
> void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx);
>
> +void dsputil_mpeg4_init_altivec(DSPContext* c, AVCodecContext *avctx);
> void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx);
> void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx);
> void snow_init_altivec(DSPContext* c, AVCodecContext *avctx);
> @@ -274,7 +275,12 @@
> }
>
> #ifdef HAVE_ALTIVEC
> - if(ENABLE_H264_DECODER) dsputil_h264_init_ppc(c, avctx);
> +#ifdef ENABLE_H264_DECODER
> + dsputil_h264_init_ppc(c, avctx);
> +#endif
cosmetic....
> +#ifdef ENABLE_MPEG4_DECODER
> + dsputil_mpeg4_init_altivec(c, avctx);
> +#endif
>
> if (has_altivec()) {
> mm_flags |= MM_ALTIVEC;
> Index: dsputil.c
> ===================================================================
> --- dsputil.c (revision 7166)
> +++ dsputil.c (working copy)
> @@ -1513,83 +1513,7 @@
> }
> }
>
> -static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
> -{
> - int i;
> - for(i=0; i<h; i++)
> - {
> - ST16(dst , LD16(src ));
> - dst+=dstStride;
> - src+=srcStride;
> - }
> -}
>
> -static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
> -{
> - int i;
> - for(i=0; i<h; i++)
> - {
> - ST32(dst , LD32(src ));
> - dst+=dstStride;
> - src+=srcStride;
> - }
> -}
> -
> -static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
> -{
> - int i;
> - for(i=0; i<h; i++)
> - {
> - ST32(dst , LD32(src ));
> - ST32(dst+4 , LD32(src+4 ));
> - dst+=dstStride;
> - src+=srcStride;
> - }
> -}
> -
> -static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
> -{
> - int i;
> - for(i=0; i<h; i++)
> - {
> - ST32(dst , LD32(src ));
> - ST32(dst+4 , LD32(src+4 ));
> - ST32(dst+8 , LD32(src+8 ));
> - ST32(dst+12, LD32(src+12));
> - dst+=dstStride;
> - src+=srcStride;
> - }
> -}
> -
> -static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
> -{
> - int i;
> - for(i=0; i<h; i++)
> - {
> - ST32(dst , LD32(src ));
> - ST32(dst+4 , LD32(src+4 ));
> - ST32(dst+8 , LD32(src+8 ));
> - ST32(dst+12, LD32(src+12));
> - dst[16]= src[16];
> - dst+=dstStride;
> - src+=srcStride;
> - }
> -}
> -
> -static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
> -{
> - int i;
> - for(i=0; i<h; i++)
> - {
> - ST32(dst , LD32(src ));
> - ST32(dst+4 , LD32(src+4 ));
> - dst[8]= src[8];
> - dst+=dstStride;
> - src+=srcStride;
> - }
> -}
> -
> -
> #define QPEL_MC(r, OPNAME, RND, OP) \
> static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
> uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
Make the following a separate patch please
> Index: dsputil.h
> ===================================================================
> --- dsputil.h (revision 7166)
> +++ dsputil.h (working copy)
I'm unsure about it, those function aren't mpeg4-only?
> Index: Makefile
> ===================================================================
> --- Makefile (revision 7166)
> +++ Makefile (working copy)
> @@ -383,6 +383,7 @@
> sh4/dsputil_align.o \
>
> OBJS-$(TARGET_ALTIVEC) += ppc/dsputil_altivec.o \
> + ppc/mpeg4_altivec.o \
> ppc/mpegvideo_altivec.o \
> ppc/idct_altivec.o \
> ppc/fft_altivec.o \
keep it in the other patch
> Index: sh4/qpel.c
> ===================================================================
> --- sh4/qpel.c (revision 7166)
> +++ sh4/qpel.c (working copy)
--
Luca Barbato
Gentoo/linux Gentoo/PPC
http://dev.gentoo.org/~lu_zero
More information about the ffmpeg-devel
mailing list