[FFmpeg-devel] [PATCH 01/10] diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped
Rostislav Pehlivanov
atomnuker at gmail.com
Mon Jun 27 12:27:36 CEST 2016
On 24 June 2016 at 16:21, James Almer <jamrial at gmail.com> wrote:
> On 6/24/2016 8:44 AM, Rostislav Pehlivanov wrote:
> > From 86ecebfe70509329d6f5b8a587ae79d19f9c8154 Mon Sep 17 00:00:00 2001
> > From: Rostislav Pehlivanov <rpehlivanov at ob-encoder.com>
> > Date: Thu, 23 Jun 2016 18:06:55 +0100
> > Subject: [PATCH 1/2] diracdsp: add SIMD for the 10 bit version of
> > put_signed_rect_clamped
> >
> > Signed-off-by: Rostislav Pehlivanov <rpehlivanov at obe.tv>
> > ---
> > libavcodec/x86/diracdsp.asm | 45
> ++++++++++++++++++++++++++++++++++++++++++
> > libavcodec/x86/diracdsp_init.c | 10 ++++++++++
> > 2 files changed, 55 insertions(+)
> >
> > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> > index a042413..a0d6788 100644
> > --- a/libavcodec/x86/diracdsp.asm
> > +++ b/libavcodec/x86/diracdsp.asm
> > @@ -22,6 +22,8 @@
> >
> > SECTION_RODATA
> > pw_7: times 8 dw 7
> > +convert_to_unsigned_10bit: times 4 dd 0x200
> > +clip_10bit: times 8 dw 0x3ff
> >
> > cextern pw_3
> > cextern pw_16
> > @@ -263,3 +265,46 @@ ADD_RECT sse2
> > HPEL_FILTER sse2
> > ADD_OBMC 32, sse2
> > ADD_OBMC 16, sse2
> > +
> > +%if ARCH_X86_64 == 1
> > +INIT_XMM sse4
> > +
> > +; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const
> uint8_t *src, int src_stride, int width, int height)
> > +cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src,
> src_stride, w, h
> > +
> > + mov r6, srcq
> > + mov r7, dstq
> > + mov r8, wq
> > + pxor m2, m2
> > + mova m3, [clip_10bit]
> > + mova m4, [convert_to_unsigned_10bit]
> > +
> > + .loop_h:
> > + mov srcq, r6
> > + mov dstq, r7
> > + mov wq, r8
> > +
> > + .loop_w:
> > + movu m0, [srcq+0*mmsize]
> > + movu m1, [srcq+1*mmsize]
> > +
> > + paddd m0, m4
> > + paddd m1, m4
> > + packusdw m0, m0, m1
> > + CLIPW m0, m2, m3 ; packusdw saturates so it's fine
> > +
> > + movu [dstq], m0
> > +
> > + add srcq, 2*mmsize
> > + add dstq, 1*mmsize
> > + sub wq, 8
> > + jl .loop_w
>
> Since you're subtracting from w now, this should be a jump-if-greater (jg).
>
> Also, use wd, not wq, since it comes from the stack on Win64. With MSVC
> x86_64, AFAIK, there's no guarantee that the upper half of the register
> is zeroed for an int argument.
>
> > +
> > + add r6, src_strideq
> > + add r7, dst_strideq
> > + sub hq, 1
> > + jl .loop_h
>
> Ditto.
>
> Alternatively, as I said before, you could just change the prototypes to
> use ptrdiff_t instead of int.
>
> > +
> > + RET
> > +
> > +%endif
> > diff --git a/libavcodec/x86/diracdsp_init.c
> b/libavcodec/x86/diracdsp_init.c
> > index 5fae798..7fa554e 100644
> > --- a/libavcodec/x86/diracdsp_init.c
> > +++ b/libavcodec/x86/diracdsp_init.c
> > @@ -46,6 +46,10 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int
> dst_stride, const int16_t *src,
> > void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const
> int16_t *src, int src_stride, int width, int height);
> > void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride,
> const int16_t *src, int src_stride, int width, int height);
> >
> > +#if ARCH_X86_64
> > +void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride,
> const uint8_t *src, int src_stride, int width, int height);
> > +#endif
> > +
> > #if HAVE_YASM
> >
> > #define HPEL_FILTER(MMSIZE, EXT)
> \
> > @@ -184,4 +188,10 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
> > c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
> > c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
> > }
> > +
> > +#if ARCH_X86_64
> > + if (EXTERNAL_SSE4(mm_flags)) {
> > + c->put_signed_rect_clamped[1] =
> ff_put_signed_rect_clamped_10_sse4;
> > + }
> > +#endif
> > }
> > --
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
I've attached a new patch; it should be fine now.
I chose not to change w and h to 64 bits, since that would require more changes to
existing code.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-diracdsp-add-SIMD-for-the-10-bit-version-of-put_sign.patch
Type: text/x-patch
Size: 3077 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160627/67c4284c/attachment.bin>
More information about the ffmpeg-devel
mailing list