[FFmpeg-devel] [PATCH] vp9/x86: add ff_vp9_loop_filter_[vh]_16_16_sse2().

Clément Bœsch u at pkh.me
Fri Jan 17 07:47:43 CET 2014


On Fri, Jan 17, 2014 at 03:40:36AM -0300, James Almer wrote:
> Similar gains in performance as the SSSE3 version
> 
> Signed-off-by: James Almer <jamrial at gmail.com>
> ---
>  libavcodec/x86/vp9dsp_init.c | 19 +++++++++++++++----
>  libavcodec/x86/vp9lpf.asm    | 10 ++++------
>  2 files changed, 19 insertions(+), 10 deletions(-)
> 
> diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
> index 900efb3..ab3396e 100644
> --- a/libavcodec/x86/vp9dsp_init.c
> +++ b/libavcodec/x86/vp9dsp_init.c
> @@ -177,10 +177,17 @@ itxfm_func(idct, idct, 32, avx);
>  #undef itxfm_func
>  #undef itxfm_funcs
>  
> -void ff_vp9_loop_filter_v_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
> -void ff_vp9_loop_filter_v_16_16_avx  (uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
> -void ff_vp9_loop_filter_h_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
> -void ff_vp9_loop_filter_h_16_16_avx  (uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
> +#define lpf_funcs(size1, size2, opt) \
> +void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
> +                                                    int E, int I, int H); \
> +void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \
> +                                                    int E, int I, int H)
> +
> +lpf_funcs(16, 16, sse2);
> +lpf_funcs(16, 16, ssse3);
> +lpf_funcs(16, 16, avx);
> +
> +#undef lpf_funcs
>  
>  #endif /* HAVE_YASM */
>  
> @@ -230,6 +237,10 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
>          init_fpel(2, 1, 16, avg, sse2);
>          init_fpel(1, 1, 32, avg, sse2);
>          init_fpel(0, 1, 64, avg, sse2);
> +        if (ARCH_X86_64) {
> +            dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2;
> +            dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2;
> +        }
>      }
>  
>      if (EXTERNAL_SSSE3(cpu_flags)) {
> diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
> index c5e5df9..60caf73 100644
> --- a/libavcodec/x86/vp9lpf.asm
> +++ b/libavcodec/x86/vp9lpf.asm
> @@ -285,10 +285,8 @@ SECTION .text
>  
>      ; calc fm mask
>      pxor                m0, m0

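I think you don't need this "pxor m0, m0" with the sse2 version of
SPLATB_REG, since that version should not use the zero register; the same
applies to the pxor before the H splat below.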

> -    movd                m2, Id
> -    movd                m3, Ed
> -    pshufb              m2, m0                          ; I I I I ...
> -    pshufb              m3, m0                          ; E E E E ...
> +    SPLATB_REG          m2, I, m0                       ; I I I I ...
> +    SPLATB_REG          m3, E, m0                       ; E E E E ...
>      mova                m0, [pb_80]
>      pxor                m2, m0
>      pxor                m3, m0
> @@ -341,8 +339,7 @@ SECTION .text
>      pand                m2, m1
>      ABSSUB              m4, m10, m11, m5                ; abs(p1 - p0)
>      pxor                m0, m0
> -    movd                m7, Hd
> -    pshufb              m7, m0                          ; H H H H ...
> +    SPLATB_REG          m7, H, m0                       ; H H H H ...
>      pxor                m7, m8
>      pxor                m4, m8
>      pcmpgtb             m0, m4, m7                      ; abs(p1 - p0) > H (1/2 hev condition)
> @@ -665,6 +662,7 @@ cglobal vp9_loop_filter_h_16_16, 5,8,16, 256, dst, stride, E, I, H, mstride, dst
>      RET
>  %endmacro
>  
> +LPF_16_16_VH sse2
>  LPF_16_16_VH ssse3
>  LPF_16_16_VH avx
>  

-- 
Clément B.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 490 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140117/596a4cf6/attachment.asc>


More information about the ffmpeg-devel mailing list