[FFmpeg-devel] [PATCH] H.264: SSE2 weight/biweight functions
Michael Niedermayer
michaelni
Thu Jan 8 01:48:15 CET 2009
On Mon, Jan 05, 2009 at 05:39:21PM -0500, Jason Garrett-Glaser wrote:
> On Mon, Jan 5, 2009 at 5:31 PM, Jason Garrett-Glaser
> <darkshikari at gmail.com> wrote:
> > On Mon, Jan 5, 2009 at 5:25 PM, Diego Biurrun <diego at biurrun.de> wrote:
> >> On Mon, Jan 05, 2009 at 05:17:45PM -0500, Jason Garrett-Glaser wrote:
> >>> On Mon, Jan 5, 2009 at 4:55 PM, Jason Garrett-Glaser
> >>> <darkshikari at gmail.com> wrote:
> >>> > $subject
> >>>
> >>> The bad news: 10L in the above patch.
> >>>
> >>> The good news: now includes SSSE3 support.
> >>
> >> some nits..
> >>
> >>> --- libavcodec/x86/dsputil_mmx.c (revision 16439)
> >>> +++ libavcodec/x86/dsputil_mmx.c (working copy)
> >>> @@ -2852,6 +2852,18 @@
> >>> +
> >>> + c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
> >>> + c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
> >>> + c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
> >>> + c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
> >>> + c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
> >>> +
> >>> + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
> >>> + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
> >>> + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
> >>> + c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
> >>> + c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
> >>
> >> This could be more readable with a space before the =.
> >>
> >>> @@ -2873,6 +2885,11 @@
> >>> c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
> >>> c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
> >>> c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
> >>> + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
> >>> + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
> >>> + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
> >>> + c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
> >>> + c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
> >>
> >> ditto
> >>
> >>> --- libavcodec/x86/h264dsp_mmx.c (revision 16439)
> >>> +++ libavcodec/x86/h264dsp_mmx.c (working copy)
> >>> @@ -2239,6 +2239,47 @@
> >>>
> >>> + "movd %0, %%xmm4 \n\t"
> >>> + "movd %1, %%xmm5 \n\t"
> >>> + "movd %2, %%xmm6 \n\t"
> >>> + "pshuflw $0, %%xmm4, %%xmm4 \n\t"
> >>> + "pshuflw $0, %%xmm5, %%xmm5 \n\t"
> >>> + "punpcklqdq %%xmm4, %%xmm4 \n\t"
> >>> + "punpcklqdq %%xmm5, %%xmm5 \n\t"
> >>> + "pxor %%xmm7, %%xmm7 \n\t"
> >>
> >> This could be aligned for better readability, same below...
> >>
> >> Diego
> >> _______________________________________________
> >> ffmpeg-devel mailing list
> >> ffmpeg-devel at mplayerhq.hu
> >> https://lists.mplayerhq.hu/mailman/listinfo/ffmpeg-devel
> >>
> >
> > Done.
> >
> > Dark Shikari
> >
>
> Seems I forgot about weight sse2. Fixed as well.
[...]
> Index: libavcodec/x86/h264dsp_mmx.c
> ===================================================================
> --- libavcodec/x86/h264dsp_mmx.c (revision 16439)
> +++ libavcodec/x86/h264dsp_mmx.c (working copy)
> @@ -2239,6 +2239,47 @@
> }
> }
>
> +static inline void ff_h264_weight_WxH_sse2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
> +{
> + int x, y;
> + offset <<= log2_denom;
> + offset += (1 << log2_denom) >> 1;
considering that the c code does
offset <<= log2_denom; \
if(log2_denom) offset += 1<<(log2_denom-1); \
i think, this could be done once per slice instead of repeatly for
each block
then we pass
h->luma_weight[list][refn], h->luma_offset[list][refn]
currently, which is silly, a single table containing weight and offset
interleaved would mean passing just one pointer instead of 2 dereferenced
ones.
Also it may (but doesnt have to be) faster to do the pshuflw/punpcklqdq
also ahead per slice and store the 16byte repeated variant in the table.
> + __asm__ volatile(
> + "movd %0, %%xmm4 \n\t"
> + "movd %1, %%xmm5 \n\t"
> + "movd %2, %%xmm6 \n\t"
> + "pshuflw $0, %%xmm4, %%xmm4 \n\t"
> + "pshuflw $0, %%xmm5, %%xmm5 \n\t"
> + "punpcklqdq %%xmm4, %%xmm4 \n\t"
> + "punpcklqdq %%xmm5, %%xmm5 \n\t"
> + "pxor %%xmm7, %%xmm7 \n\t"
> + :: "g"(weight), "g"(offset), "g"(log2_denom)
> + );
> + for(y=0; y<h; y+=2){
> + for(x=0; x<w; x+=8){
> + __asm__ volatile(
> + "movq %0, %%xmm0 \n\t"
> + "movq %1, %%xmm1 \n\t"
> + "punpcklbw %%xmm7, %%xmm0 \n\t"
> + "punpcklbw %%xmm7, %%xmm1 \n\t"
> + "pmullw %%xmm4, %%xmm0 \n\t"
> + "pmullw %%xmm4, %%xmm1 \n\t"
> + "paddsw %%xmm5, %%xmm0 \n\t"
> + "paddsw %%xmm5, %%xmm1 \n\t"
> + "psraw %%xmm6, %%xmm0 \n\t"
> + "psraw %%xmm6, %%xmm1 \n\t"
> + "packuswb %%xmm7, %%xmm0 \n\t"
> + "packuswb %%xmm7, %%xmm1 \n\t"
> + "movq %%xmm0, %0 \n\t"
> + "movq %%xmm1, %1 \n\t"
> + : "+m"(*(uint64_t*)(dst+x)),
> + "+m"(*(uint64_t*)(dst+x+stride))
> + );
> + }
> + dst += 2*stride;
> + }
the loops should be done in asm, i dont trust gcc
> +}
> +
> static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
> {
> int x, y;
> @@ -2277,6 +2318,84 @@
> }
> }
>
> +static inline void ff_h264_biweight_WxH_sse2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
> +{
> + int x, y;
> + offset = ((offset + 1) | 1) << log2_denom;
> + __asm__ volatile(
> + "movd %0, %%xmm3 \n\t"
> + "movd %1, %%xmm4 \n\t"
> + "movd %2, %%xmm5 \n\t"
> + "movd %3, %%xmm6 \n\t"
> + "pshuflw $0, %%xmm3, %%xmm3 \n\t"
> + "pshuflw $0, %%xmm4, %%xmm4 \n\t"
> + "pshuflw $0, %%xmm5, %%xmm5 \n\t"
> + "punpcklqdq %%xmm3, %%xmm3 \n\t"
> + "punpcklqdq %%xmm4, %%xmm4 \n\t"
> + "punpcklqdq %%xmm5, %%xmm5 \n\t"
> + "pxor %%xmm7, %%xmm7 \n\t"
> + :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
> + );
this stuff similarly could be calculated per slice and stored in a
[ref0][ref1] table (for example the implicit weight table ...)
i think at least ...
(of course only if it is faster ...)
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
No human being will ever know the Truth, for even if they happen to say it
by chance, they would not even known they had done so. -- Xenophanes
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20090108/8c8eba12/attachment.pgp>
More information about the ffmpeg-devel
mailing list