[FFmpeg-devel] [PATCH 1/3] diracdec: add 10-bit Haar SIMD functions

Thu Jul 26 18:29:11 EEST 2018

On 26 July 2018 at 12:28, James Darnley <jdarnley at obe.tv> wrote:

> Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
> relevant transform.
> C:    119fps
> SSE2: 204fps
> AVX:  206fps
> AVX2: 221fps
>
> timer measurements, haar horizontal compose:
>     sse2: 3.68x faster (45143 vs. 12279 decicycles) compared with C
>     avx:  3.68x faster (45143 vs. 12275 decicycles) compared with C
>     avx2: 5.16x faster (45143 vs.  8742 decicycles) compared with C
> haar vertical compose:
>     sse2: 1.64x faster (31792 vs. 19377 decicycles) compared with C
>     avx:  1.58x faster (31792 vs. 20090 decicycles) compared with C
>     avx2: 1.66x faster (31792 vs. 19157 decicycles) compared with C
> ---
>  libavcodec/dirac_dwt.c                |   7 +-
>  libavcodec/dirac_dwt.h                |   1 +
>  libavcodec/x86/Makefile               |   6 +-
>  libavcodec/x86/dirac_dwt_10bit.asm    | 160 ++++++++++++++++++++++++++
>  libavcodec/x86/dirac_dwt_init_10bit.c |  76 ++++++++++++
>  5 files changed, 247 insertions(+), 3 deletions(-)
>  create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm
>  create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c
>
> diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c
> index cc08f8865a..86bee5bb9b 100644
> --- a/libavcodec/dirac_dwt.c
> +++ b/libavcodec/dirac_dwt.c
> @@ -59,8 +59,13 @@ int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p,
> enum dwt_type type,
>          return AVERROR_INVALIDDATA;
>      }
>
> -    if (ARCH_X86 && bit_depth == 8)
> +#if ARCH_X86
> +    if (bit_depth == 8)
>          ff_spatial_idwt_init_x86(d, type);
> +    else if (bit_depth == 10)
> +        ff_spatial_idwt_init_10bit_x86(d, type);
> +#endif
> +
>      return 0;
>  }
>
> diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h
> index 994dc21d70..1ad7b9a821 100644
> --- a/libavcodec/dirac_dwt.h
> +++ b/libavcodec/dirac_dwt.h
> @@ -88,6 +88,7 @@ enum dwt_type {
>  int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type,
>                           int decomposition_count, int bit_depth);
>  void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type);
> +void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type);
>
>  void ff_spatial_idwt_slice2(DWTContext *d, int y);
>
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 2350c8bbee..590d83c167 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -7,7 +7,8 @@ OBJS-$(CONFIG_BLOCKDSP)                +=
> x86/blockdsp_init.o
>  OBJS-$(CONFIG_BSWAPDSP)                += x86/bswapdsp_init.o
>  OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
>  OBJS-$(CONFIG_DIRAC_DECODER)           += x86/diracdsp_init.o           \
> -                                          x86/dirac_dwt_init.o
> +                                          x86/dirac_dwt_init.o \
> +                                          x86/dirac_dwt_init_10bit.o
>  OBJS-$(CONFIG_FDCTDSP)                 += x86/fdctdsp_init.o
>  OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
>  OBJS-$(CONFIG_FLACDSP)                 += x86/flacdsp_init.o
> @@ -153,7 +154,8 @@ X86ASM-OBJS-$(CONFIG_APNG_DECODER)     += x86/pngdsp.o
>  X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o
>  X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
>  X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o                \
> -                                          x86/dirac_dwt.o
> +                                          x86/dirac_dwt.o \
> +                                          x86/dirac_dwt_10bit.o
>  X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER)    += x86/dnxhdenc.o
>  X86ASM-OBJS-$(CONFIG_EXR_DECODER)      += x86/exrdsp.o
>  X86ASM-OBJS-$(CONFIG_FLAC_DECODER)     += x86/flacdsp.o
> diff --git a/libavcodec/x86/dirac_dwt_10bit.asm
> b/libavcodec/x86/dirac_dwt_10bit.asm
> new file mode 100644
> index 0000000000..baea91329e
> --- /dev/null
> +++ b/libavcodec/x86/dirac_dwt_10bit.asm
> @@ -0,0 +1,160 @@
> +;**********************************************************
> ********************
> +;* x86 optimized discrete 10-bit wavelet trasnform
> +;* Copyright (c) 2018 James Darnley
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> +;**********************************************************
> ********************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +cextern pd_1
> +
> +SECTION .text
> +
> +%macro HAAR_VERTICAL 0
>
+
>

See below.

+cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w
> +    DECLARE_REG_TMP 4,5
> +
> +    mova  m2, [pd_1]
> +    mov  r3d, wd
> +    and   wd, ~(mmsize/4 - 1)
> +    shl   wd, 2
> +    add  b0q, wq
> +    add  b1q, wq
> +    neg   wq
> +
> +    ALIGN 16
> +    .loop_simd:
> +        mova m0, [b0q + wq]
> +        mova m1, [b1q + wq]
> +        paddd m3, m1, m2
> +        psrad m3, 1
> +        psubd m0, m3
> +        paddd m1, m0
> +        mova [b0q + wq], m0
> +        mova [b1q + wq], m1
> +        add wq, mmsize
> +    jl .loop_simd
> +
> +    and  r3d, mmsize/4 - 1
> +    jz .end
> +    .loop_scalar:
> +        mov t0d, [b0q]
> +        mov t1d, [b1q]
> +        mov r2d, t1d
> +        add r2d, 1
> +        sar r2d, 1
> +        sub t0d, r2d
> +        add t1d, t0d
> +        mov [b0q], t0d
> +        mov [b1q], t1d
> +
> +        add b0q, 4
> +        add b1q, 4
> +        sub r3d, 1
> +    jg .loop_scalar
> +
> +    .end:
> +RET
> +
> +%endmacro
> +
> +%macro HAAR_HORIZONTAL 0
>
+
>

Could you remove this newline from every patch? All asm I've written and
seen keep them without a newline. It made me think there's something in the
asm which checked the value of the macro, not that the entire function is
macro'd.

+cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
> x, b2
> +    DECLARE_REG_TMP 2,5
> +    %if ARCH_X86_64
> +        %define tail r6d
> +    %else
> +        %define tail dword wm
> +    %endif
> +
> +    mova m2, [pd_1]
> +    xor xd, xd
> +    shr wd, 1
> +    mov tail, wd
> +    lea b2q, [bq + 4*wq]
> +
> +    ALIGN 16
> +    .loop_lo:
> +        mova m0, [bq  + 4*xq]
> +        movu m1, [b2q + 4*xq]
> +        paddd m1, m2
> +        psrad m1, 1
> +        psubd m0, m1
> +        mova [temp_q + 4*xq], m0
> +        add xd, mmsize/4
> +        cmp xd, wd
> +    jl .loop_lo
> +
> +    xor xd, xd
> +    and wd, ~(mmsize/4 - 1)
> +
> +    ALIGN 16
> +    .loop_hi:
> +        mova m0, [temp_q + 4*xq]
> +        movu m1, [b2q    + 4*xq]
> +        paddd m1, m0
> +        paddd m0, m2
> +        paddd m1, m2
> +        psrad m0, 1
> +        psrad m1, 1
> +        SBUTTERFLY dq, 0,1,3
> +        %if cpuflag(avx2)
> +            SBUTTERFLY dqqq, 0,1,3
> +        %endif
> +        mova [bq + 8*xq], m0
> +        mova [bq + 8*xq + mmsize], m1
> +        add xd, mmsize/4
> +        cmp xd, wd
> +    jl .loop_hi
> +
> +    and tail, mmsize/4 - 1
> +    jz .end
> +    .loop_scalar:
> +        mov t0d, [temp_q + 4*xq]
> +        mov t1d, [b2q    + 4*xq]
> +        add t1d, t0d
> +        add t0d, 1
> +        add t1d, 1
> +        sar t0d, 1
> +        sar t1d, 1
> +        mov [bq + 8*xq], t0d
> +        mov [bq + 8*xq + 4], t1d
> +        add  xq, 1
> +        sub tail, 1
> +    jg .loop_scalar
> +
> +    .end:
> +REP_RET
> +
> +%endmacro
> +
> +INIT_XMM sse2
> +HAAR_HORIZONTAL
> +HAAR_VERTICAL
> +
> +INIT_XMM avx
> +HAAR_HORIZONTAL
> +HAAR_VERTICAL
>

You're not using any avx functions in that version, not unless a macro'd
instruction inserts one for you. I think you should remove the avx version
then.
Also since you always have a HAAR_HORIZONTAL and HAAR_VERTICAL macros per
version you can just make a single macro to do both versions at the same
time.

+
> +INIT_YMM avx2
> +HAAR_HORIZONTAL
> +HAAR_VERTICAL
> diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c
> b/libavcodec/x86/dirac_dwt_init_10bit.c
> new file mode 100644
> index 0000000000..289862d728
> --- /dev/null
> +++ b/libavcodec/x86/dirac_dwt_init_10bit.c
> @@ -0,0 +1,76 @@
> +/*
> + * x86 optimized discrete wavelet transform
> + * Copyright (c) 2018 James Darnley
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "libavutil/x86/asm.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavcodec/dirac_dwt.h"
> +
> +void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int
> width_align);
> +void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int
> width_align);
> +void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int
> width_align);
> +
> +void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int
> width_align);
> +void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int
> width_align);
> +void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int
> width_align);
> +
> +av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type
> type)
> +{
> +#if HAVE_X86ASM
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (EXTERNAL_SSE2(cpu_flags)) {
> +        switch (type) {
> +            case DWT_DIRAC_HAAR0:
> +                d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_sse2;
> +                break;
> +            case DWT_DIRAC_HAAR1:
> +                d->horizontal_compose = (void*)ff_horizontal_compose_
> haar_10bit_sse2;
> +                d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_sse2;
> +                break;
> +        }
> +    }
> +
> +    if (EXTERNAL_AVX(cpu_flags)) {
> +        switch (type) {
> +            case DWT_DIRAC_HAAR0:
> +                d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_avx;
> +                break;
> +            case DWT_DIRAC_HAAR1:
> +                d->horizontal_compose = (void*)ff_horizontal_compose_
> haar_10bit_avx;
> +                d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_avx;
> +                break;
> +        }
>

We keep switches and cases on the same line.

+    }
> +
> +    if (EXTERNAL_AVX2(cpu_flags)) {
> +        switch (type) {
> +            case DWT_DIRAC_HAAR0:
> +                d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_avx2;
> +                break;
> +            case DWT_DIRAC_HAAR1:
> +                d->horizontal_compose = (void*)ff_horizontal_compose_
> haar_10bit_avx2;
> +                d->vertical_compose = (void*)ff_vertical_compose_
> haar_10bit_avx2;
> +                break;
> +        }

Same.