[FFmpeg-devel] [PATCH 3/4] lavc/aarch64: add HEVC sao_band NEON

Martin Storsjö martin at martin.st
Sun Jan 17 01:14:09 EET 2021


On Thu, 7 Jan 2021, Josh Dekker wrote:

> Only works for 8x8.
>
> Signed-off-by: Josh Dekker <josh at itanimul.li>
> ---
> libavcodec/aarch64/Makefile           |  3 +-
> libavcodec/aarch64/hevcdsp_init.c     |  7 +++
> libavcodec/aarch64/hevcdsp_sao_neon.S | 87 +++++++++++++++++++++++++++
> 3 files changed, 96 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/hevcdsp_sao_neon.S
>
> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index 42d80bf74c..1f54fc31f4 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -55,7 +55,8 @@ NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
> NEON-OBJS-$(CONFIG_AAC_DECODER)         += aarch64/aacpsdsp_neon.o
> NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/synth_filter_neon.o
> NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_add_res_neon.o      \
> -                                           aarch64/hevcdsp_idct_neon.o
> +                                           aarch64/hevcdsp_idct_neon.o         \
> +                                           aarch64/hevcdsp_sao_neon.o
> NEON-OBJS-$(CONFIG_OPUS_DECODER)        += aarch64/opusdsp_neon.o
> NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
> NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
> diff --git a/libavcodec/aarch64/hevcdsp_init.c b/libavcodec/aarch64/hevcdsp_init.c
> index 2cd7ef3a6c..8f0a923ab1 100644
> --- a/libavcodec/aarch64/hevcdsp_init.c
> +++ b/libavcodec/aarch64/hevcdsp_init.c
> @@ -23,6 +23,11 @@
> #include "libavcodec/hevcdsp.h"
> #include "libavcodec/avcodec.h"
> 
> +void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
> +                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
> +                                  int16_t *sao_offset_val, int sao_left_class,
> +                                  int width, int height);
> +
> void ff_hevc_idct_4x4_dc_8_neon(int16_t *coeffs);
> void ff_hevc_idct_8x8_dc_8_neon(int16_t *coeffs);
> void ff_hevc_idct_16x16_dc_8_neon(int16_t *coeffs);
> @@ -53,6 +58,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
> {
>     int cpu_flags = av_get_cpu_flags();
>     if (have_neon(cpu_flags) && bit_depth == 8) {
> +        c->sao_band_filter[0]          = ff_hevc_sao_band_filter_8x8_8_neon;
> +
>         c->add_residual[0]             = ff_hevc_add_residual_4x4_8_neon;
>         c->add_residual[1]             = ff_hevc_add_residual_8x8_8_neon;
>         c->add_residual[2]             = ff_hevc_add_residual_16x16_8_neon;
> diff --git a/libavcodec/aarch64/hevcdsp_sao_neon.S b/libavcodec/aarch64/hevcdsp_sao_neon.S
> new file mode 100644
> index 0000000000..25b6c25117
> --- /dev/null
> +++ b/libavcodec/aarch64/hevcdsp_sao_neon.S
> @@ -0,0 +1,87 @@
> +/* -*-arm64-*-
> + * vim: syntax=arm64asm
> + *
> + * AArch64 NEON optimised SAO functions for HEVC decoding
> + *
> + * Copyright (c) 2020 Josh Dekker <josh at itanimul.li>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +// void sao_band_filter(uint8_t *_dst, uint8_t *_src,
> +//                      ptrdiff_t stride_dst, ptrdiff_t stride_src,
> +//                      int16_t *sao_offset_val, int sao_left_class,
> +//                      int width, int height)
> +function ff_hevc_sao_band_filter_8x8_8_neon, export=1
> +    sub sp, sp, #64
> +    stp xzr, xzr, [sp]
> +    stp xzr, xzr, [sp, #16]
> +    stp xzr, xzr, [sp, #32]
> +    stp xzr, xzr, [sp, #48]
> +    mov w8, #4
> +.setup:

Please use a numeric anonymous label for this one; gas-preprocessor 
doesn't handle these kinds of labels. This actually creates a real symbol 
(though, a non-external one) here; to make it just a local label, you'd 
prefix it with .L on ELF and with L on MachO. In dav1d there's a macro in 
asm.S for doing this, but for here, I'd just suggest you'd go with 1: 
instead, for simplicity.

> +    ldrsh x9, [x4, x8, lsl #1] // x9 = sao_offset_val[k+1]
> +    subs w8, w8, #1
> +    add w10, w8, w5 // x10 = k + sao_left_class
> +    and w10, w10, #0x1F
> +    strh w9, [sp, x10, lsl #1]
> +    bne .setup
> +    ld1 {v16.16B-v19.16B}, [sp], #64

MSVC's armasm64 can't handle these register lists with a dash; 
gas-preprocessor does try to expand them, but it turned out it had a minor 
bug. I'll send patches for fixing that though.

> +    movi v20.8H, #1
> +0:  // beginning of line
> +    mov w8, w6
> +8:
> +    // Simple layout for accessing 16bit values
> +    // with 8bit LUT.
> +    //
> +    //   00  01  02  03  04  05  06  07
> +    // +----------------------------------->
> +    // |xDE#xAD|xCA#xFE|xBE#xEF|xFE#xED|....
> +    // +----------------------------------->
> +    //    i-0     i-1     i-2     i-3
> +    // dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
> +    ld1 {v2.8B}, [x1]
> +    // load src[x]
> +    ushll v0.8H, v2.8B, #0

uxtl

> +    // >> shift
> +    ushr v2.8H, v0.8H, #3 // BIT_DEPTH - 3
> +    // x2 (access lower short)
> +    shl v1.8H, v2.8H, #1 // low (x2, accessing short)
> +    // +1 access upper short
> +    add v3.8H, v1.8H, v20.8H
> +    // shift insert index to upper byte
> +    sli v1.8H, v3.8H, #8
> +    // table
> +    tbx v2.16B, {v16.16B-v19.16B}, v1.16B
> +    // src[x] + table
> +    add v1.8H, v0.8H, v2.8H
> +    // clip + narrow
> +    sqxtun v4.8B, v1.8H
> +    // store
> +    st1 {v4.8B}, [x0]

This all looks very serial to me, most instructions depend on the output 
from the previous one, so most of them will stall for a couple 
instructions. tbx in particular has a rather high latency. If possible, 
I'd suggest unrolling the loop so you operate on two registers of data at 
the same time.

Didn't try following the code algorithm though, but it overall looks 
pretty sensible.

// Martin



More information about the ffmpeg-devel mailing list