[FFmpeg-devel] [PATCH 1/3] diracdec: add 10-bit Haar SIMD functions

Fri Jul 27 14:47:57 EEST 2018

On 2018-07-26 17:29, Rostislav Pehlivanov wrote:
> On 26 July 2018 at 12:28, James Darnley <jdarnley at obe.tv> wrote:
> +cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w
>> +    DECLARE_REG_TMP 4,5
>> +
>> +    mova  m2, [pd_1]
>> +    mov  r3d, wd
>> +    and   wd, ~(mmsize/4 - 1)
>> +    shl   wd, 2
>> +    add  b0q, wq
>> +    add  b1q, wq
>> +    neg   wq
>> +
>> +    ALIGN 16
>> +    .loop_simd:
>> +        mova m0, [b0q + wq]
>> +        mova m1, [b1q + wq]
>> +        paddd m3, m1, m2
>> +        psrad m3, 1
>> +        psubd m0, m3
>> +        paddd m1, m0
>> +        mova [b0q + wq], m0
>> +        mova [b1q + wq], m1
>> +        add wq, mmsize
>> +    jl .loop_simd
>> +
>> +    and  r3d, mmsize/4 - 1
>> +    jz .end
>> +    .loop_scalar:
>> +        mov t0d, [b0q]
>> +        mov t1d, [b1q]
>> +        mov r2d, t1d
>> +        add r2d, 1
>> +        sar r2d, 1
>> +        sub t0d, r2d
>> +        add t1d, t0d
>> +        mov [b0q], t0d
>> +        mov [b1q], t1d
>> +
>> +        add b0q, 4
>> +        add b1q, 4
>> +        sub r3d, 1
>> +    jg .loop_scalar
>> +
>> +    .end:
>> +RET
>> +
>> +%endmacro
>> +
>> +%macro HAAR_HORIZONTAL 0
>>
> +
>>
> 
> Could you remove this newline from every patch? All asm I've written and
> seen keep them without a newline. It made me think there's something in the
> asm which checked the value of the macro, not that the entire function is
> macro'd.

What?  I don't understand what you mean.  Do you think I have too many
blank lines between things?

> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
>> x, b2
>> +    DECLARE_REG_TMP 2,5
>> +    %if ARCH_X86_64
>> +        %define tail r6d
>> +    %else
>> +        %define tail dword wm
>> +    %endif
>> +
>> +    mova m2, [pd_1]
>> +    xor xd, xd
>> +    shr wd, 1
>> +    mov tail, wd
>> +    lea b2q, [bq + 4*wq]
>> +
>> +    ALIGN 16
>> +    .loop_lo:
>> +        mova m0, [bq  + 4*xq]
>> +        movu m1, [b2q + 4*xq]
>> +        paddd m1, m2
>> +        psrad m1, 1
>> +        psubd m0, m1
>> +        mova [temp_q + 4*xq], m0
>> +        add xd, mmsize/4
>> +        cmp xd, wd
>> +    jl .loop_lo
>> +
>> +    xor xd, xd
>> +    and wd, ~(mmsize/4 - 1)
>> +
>> +    ALIGN 16
>> +    .loop_hi:
>> +        mova m0, [temp_q + 4*xq]
>> +        movu m1, [b2q    + 4*xq]
>> +        paddd m1, m0
>> +        paddd m0, m2
>> +        paddd m1, m2
>> +        psrad m0, 1
>> +        psrad m1, 1
>> +        SBUTTERFLY dq, 0,1,3
>> +        %if cpuflag(avx2)
>> +            SBUTTERFLY dqqq, 0,1,3
>> +        %endif
>> +        mova [bq + 8*xq], m0
>> +        mova [bq + 8*xq + mmsize], m1
>> +        add xd, mmsize/4
>> +        cmp xd, wd
>> +    jl .loop_hi
>> +
>> +    and tail, mmsize/4 - 1
>> +    jz .end
>> +    .loop_scalar:
>> +        mov t0d, [temp_q + 4*xq]
>> +        mov t1d, [b2q    + 4*xq]
>> +        add t1d, t0d
>> +        add t0d, 1
>> +        add t1d, 1
>> +        sar t0d, 1
>> +        sar t1d, 1
>> +        mov [bq + 8*xq], t0d
>> +        mov [bq + 8*xq + 4], t1d
>> +        add  xq, 1
>> +        sub tail, 1
>> +    jg .loop_scalar
>> +
>> +    .end:
>> +REP_RET
>> +
>> +%endmacro
>> +
>> +INIT_XMM sse2
>> +HAAR_HORIZONTAL
>> +HAAR_VERTICAL
>> +
>> +INIT_XMM avx
>> +HAAR_HORIZONTAL
>> +HAAR_VERTICAL
>>
> 
> You're not using any avx functions in that version, not unless a macro'd
> instruction inserts one for you. I think you should remove the avx version
> then.
> Also since you always have a HAAR_HORIZONTAL and HAAR_VERTICAL macros per
> version you can just make a single macro to do both versions at the same
> time.

Now that I think about it there will be only one 3-operand instruction
in the SBUTTERFLY and the vertical function also only has 1.  I will
remove it.

I can merge the two macros but I will look back at what I've done
previously.  I think it is usually 1 macro per function.

> +
>> +INIT_YMM avx2
>> +HAAR_HORIZONTAL
>> +HAAR_VERTICAL
>> diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c
>> b/libavcodec/x86/dirac_dwt_init_10bit.c
>> new file mode 100644
>> index 0000000000..289862d728
>> --- /dev/null
>> +++ b/libavcodec/x86/dirac_dwt_init_10bit.c
>> @@ -0,0 +1,76 @@
>> +/*
>> + * x86 optimized discrete wavelet transform
>> + * Copyright (c) 2018 James Darnley
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>> + */
>> +
>> +#include "libavutil/x86/asm.h"
>> +#include "libavutil/x86/cpu.h"
>> +#include "libavcodec/dirac_dwt.h"
>> +
>> +void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int
>> width_align);
>> +void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int
>> width_align);
>> +void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int
>> width_align);
>> +
>> +void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int
>> width_align);
>> +void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int
>> width_align);
>> +void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int
>> width_align);
>> +
>> +av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type
>> type)
>> +{
>> +#if HAVE_X86ASM
>> +    int cpu_flags = av_get_cpu_flags();
>> +
>> +    if (EXTERNAL_SSE2(cpu_flags)) {
>> +        switch (type) {
>> +            case DWT_DIRAC_HAAR0:
>> +                d->vertical_compose = (void*)ff_vertical_compose_
>> haar_10bit_sse2;
>> +                break;
>> +            case DWT_DIRAC_HAAR1:
>> +                d->horizontal_compose = (void*)ff_horizontal_compose_
>> haar_10bit_sse2;
>> +                d->vertical_compose = (void*)ff_vertical_compose_
>> haar_10bit_sse2;
>> +                break;
>> +        }
>> +    }
>> +
>> +    if (EXTERNAL_AVX(cpu_flags)) {
>> +        switch (type) {
>> +            case DWT_DIRAC_HAAR0:
>> +                d->vertical_compose = (void*)ff_vertical_compose_
>> haar_10bit_avx;
>> +                break;
>> +            case DWT_DIRAC_HAAR1:
>> +                d->horizontal_compose = (void*)ff_horizontal_compose_
>> haar_10bit_avx;
>> +                d->vertical_compose = (void*)ff_vertical_compose_
>> haar_10bit_avx;
>> +                break;
>> +        }
>>
> 
> We keep switches and cases on the same line.

I had a look and it is the most common style even though not everywhere
holds to it.  Also `indent` does it that way.  So I will change it and
keep it in mind for the future.

>> +
>> +%macro HAAR_HORIZONTAL 0
>> +
>> +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w,
>> x, b2
>> +    DECLARE_REG_TMP 2,5
>> +    %if ARCH_X86_64
>> +        %define tail r6d
>> +    %else
>> +        %define tail dword wm
>> +    %endif
>> +
>>
> 
> You can remove this whole bit, the init function only gets called if
> ARCH_X86_64 is true.

Where did you get that from?  I don't require 64-bit for this.