[FFmpeg-devel] [PATCH v3 1/3] aarch64/vvc: Add w_avg

Fri Sep 27 14:20:26 EEST 2024

On Thu, 26 Sep 2024, Zhao Zhili wrote:

> From: Zhao Zhili <zhilizhao at tencent.com>
>
> w_avg_8_2x2_c:                                           0.0 ( 0.00x)
> w_avg_8_2x2_neon:                                        0.0 ( 0.00x)
> w_avg_8_4x4_c:                                           0.2 ( 1.00x)
> w_avg_8_4x4_neon:                                        0.0 ( 0.00x)
> w_avg_8_8x8_c:                                           1.2 ( 1.00x)
> w_avg_8_8x8_neon:                                        0.2 ( 5.00x)
> w_avg_8_16x16_c:                                         4.2 ( 1.00x)
> w_avg_8_16x16_neon:                                      0.8 ( 5.67x)
> w_avg_8_32x32_c:                                        16.2 ( 1.00x)
> w_avg_8_32x32_neon:                                      2.5 ( 6.50x)
> w_avg_8_64x64_c:                                        64.5 ( 1.00x)
> w_avg_8_64x64_neon:                                      9.0 ( 7.17x)
> w_avg_8_128x128_c:                                     269.5 ( 1.00x)
> w_avg_8_128x128_neon:                                   35.5 ( 7.59x)
> w_avg_10_2x2_c:                                          0.2 ( 1.00x)
> w_avg_10_2x2_neon:                                       0.2 ( 1.00x)
> w_avg_10_4x4_c:                                          0.2 ( 1.00x)
> w_avg_10_4x4_neon:                                       0.2 ( 1.00x)
> w_avg_10_8x8_c:                                          1.0 ( 1.00x)
> w_avg_10_8x8_neon:                                       0.2 ( 4.00x)
> w_avg_10_16x16_c:                                        4.2 ( 1.00x)
> w_avg_10_16x16_neon:                                     0.8 ( 5.67x)
> w_avg_10_32x32_c:                                       16.2 ( 1.00x)
> w_avg_10_32x32_neon:                                     2.5 ( 6.50x)
> w_avg_10_64x64_c:                                       66.2 ( 1.00x)
> w_avg_10_64x64_neon:                                    10.0 ( 6.62x)
> w_avg_10_128x128_c:                                    277.8 ( 1.00x)
> w_avg_10_128x128_neon:                                  39.8 ( 6.99x)
> w_avg_12_2x2_c:                                          0.0 ( 0.00x)
> w_avg_12_2x2_neon:                                       0.2 ( 0.00x)
> w_avg_12_4x4_c:                                          0.2 ( 1.00x)
> w_avg_12_4x4_neon:                                       0.0 ( 0.00x)
> w_avg_12_8x8_c:                                          1.2 ( 1.00x)
> w_avg_12_8x8_neon:                                       0.5 ( 2.50x)
> w_avg_12_16x16_c:                                        4.8 ( 1.00x)
> w_avg_12_16x16_neon:                                     0.8 ( 6.33x)
> w_avg_12_32x32_c:                                       17.0 ( 1.00x)
> w_avg_12_32x32_neon:                                     2.8 ( 6.18x)
> w_avg_12_64x64_c:                                       64.0 ( 1.00x)
> w_avg_12_64x64_neon:                                    10.0 ( 6.40x)
> w_avg_12_128x128_c:                                    269.2 ( 1.00x)
> w_avg_12_128x128_neon:                                  42.0 ( 6.41x)
> ---
> libavcodec/aarch64/vvc/dsp_init.c | 34 +++++++++++
> libavcodec/aarch64/vvc/inter.S    | 99 +++++++++++++++++++++++++------
> 2 files changed, 116 insertions(+), 17 deletions(-)
>
> diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
> index ad767d17e2..41d0e02d62 100644
> --- a/libavcodec/aarch64/vvc/dsp_init.c
> +++ b/libavcodec/aarch64/vvc/dsp_init.c
> @@ -52,6 +52,37 @@ void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride,
>                         const int16_t *src0, const int16_t *src1, int width,
>                         int height);
>
> +void ff_vvc_w_avg_8_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
> +                         const int16_t *src0, const int16_t *src1,
> +                         int width, int height,
> +                         uintptr_t w0_w1, uintptr_t offset_shift);
> +void ff_vvc_w_avg_10_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
> +                         const int16_t *src0, const int16_t *src1,
> +                         int width, int height,
> +                         uintptr_t w0_w1, uintptr_t offset_shift);
> +void ff_vvc_w_avg_12_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
> +                          const int16_t *src0, const int16_t *src1,
> +                          int width, int height,
> +                          uintptr_t w0_w1, uintptr_t offset_shift);
> +/* When passing arguments to functions, Apple platforms diverge from the ARM64
> + * standard ABI, that we can't implement the function directly in asm.

I would prefer to reword the comment a little bit - as it _is_ possible to 
implement directly in assembly, it just is messy wrt portability.

What about:

When passing arguments to functions, Apple platforms diverge from the 
ARM64 standard ABI, for functions that require passing arguments on the 
stack. To simplify portability in the assembly function interface, use a 
different function signature that doesn't require passing arguments on the 
stack.

> + */
> +#define W_AVG_FUN(bit_depth) \
> +static void vvc_w_avg_ ## bit_depth(uint8_t *dst, ptrdiff_t dst_stride, \
> +    const int16_t *src0, const int16_t *src1, int width, int height, \
> +    int denom, int w0, int w1, int o0, int o1) \
> +{ \
> +    const int shift = denom + FFMAX(3, 15 - bit_depth); \
> +    const int offset = ((o0 + o1) * (1 << (bit_depth - 8)) + 1) * (1 << (shift - 1)); \

These local variables don't need to be const either.

The rest of this patchset seems fine, thanks!

// Martin