[FFmpeg-devel] [PATCH 1/2] avcodec/aarch64/vvc: Optimize vvc_avg{8, 10, 12}
Zhao Zhili
quinkblack at foxmail.com
Thu Feb 20 09:20:08 EET 2025
> On Feb 20, 2025, at 01:40, Krzysztof Pyrkosz via ffmpeg-devel <ffmpeg-devel at ffmpeg.org> wrote:
>
> ---
>
> This patch replaces the integer widening with a halving addition, and the
> multi-step "emulated" rounding shift with a single asm instruction that does
> exactly that. The same pattern repeats in other functions in this file; I
> fixed some of them in the succeeding patch. There is a lot of performance to
> be gained there.
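>
> For reference, the per-vector core of the new 10/12-bit path boils down to the
> following (a sketch; register choice and shift count follow the patch below,
> with bit_depth standing in for the macro parameter):
>
>     shadd   v0.8h, v0.8h, v1.8h                 // (src0 + src1) >> 1, the halved sum fits in 16 bits
>     srshr   v0.8h, v0.8h, #(15 - 1 - bit_depth) // rounding shift; combined with shadd this equals
>                                                 // (src0 + src1 + (1 << (14 - bit_depth))) >> (15 - bit_depth)
>     smax    v0.8h, v0.8h, v16.8h                // clamp to 0
>     smin    v0.8h, v0.8h, v17.8h                // clamp to (1 << bit_depth) - 1
>
> In the 8-bit path a single sqrshrun performs the rounding shift, the clamp and
> the narrowing in one instruction.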
>
> I didn't modify the existing function because it includes a few extra steps
> solely for the sake of the shared w_avg implementation (and every cycle
> matters), but also because I find this linear version easier to digest and
> understand.
>
> Besides, I noticed that removing the smin and smax instructions used to clamp
> the values in the 10 and 12 bit_depth instantiations does not affect the
> checkasm result, but it does break FATE.
>
> Benchmarks before and after (for each CPU, the first block is the existing
> code, the second is this patch):
> A78
> avg_8_2x2_neon: 21.0 ( 1.55x)
> avg_8_4x4_neon: 25.8 ( 3.05x)
> avg_8_8x8_neon: 45.0 ( 5.86x)
> avg_8_16x16_neon: 178.5 ( 5.49x)
> avg_8_32x32_neon: 709.2 ( 6.20x)
> avg_8_64x64_neon: 2686.2 ( 6.12x)
> avg_8_128x128_neon: 10734.2 ( 5.88x)
> avg_10_2x2_neon: 19.0 ( 1.75x)
> avg_10_4x4_neon: 28.2 ( 2.76x)
> avg_10_8x8_neon: 44.0 ( 5.82x)
> avg_10_16x16_neon: 179.5 ( 4.81x)
> avg_10_32x32_neon: 680.8 ( 5.58x)
> avg_10_64x64_neon: 2536.8 ( 5.40x)
> avg_10_128x128_neon: 10079.0 ( 5.22x)
> avg_12_2x2_neon: 20.8 ( 1.59x)
> avg_12_4x4_neon: 25.2 ( 3.09x)
> avg_12_8x8_neon: 44.0 ( 5.79x)
> avg_12_16x16_neon: 182.2 ( 4.80x)
> avg_12_32x32_neon: 696.2 ( 5.46x)
> avg_12_64x64_neon: 2548.2 ( 5.38x)
> avg_12_128x128_neon: 10133.8 ( 5.19x)
>
> avg_8_2x2_neon: 16.5 ( 1.98x)
> avg_8_4x4_neon: 26.2 ( 2.93x)
> avg_8_8x8_neon: 31.8 ( 8.55x)
> avg_8_16x16_neon: 82.0 (12.02x)
> avg_8_32x32_neon: 310.2 (14.12x)
> avg_8_64x64_neon: 897.8 (18.26x)
> avg_8_128x128_neon: 3608.5 (17.37x)
> avg_10_2x2_neon: 19.5 ( 1.69x)
> avg_10_4x4_neon: 28.0 ( 2.79x)
> avg_10_8x8_neon: 34.8 ( 7.32x)
> avg_10_16x16_neon: 119.8 ( 7.35x)
> avg_10_32x32_neon: 444.2 ( 8.51x)
> avg_10_64x64_neon: 1711.8 ( 8.00x)
> avg_10_128x128_neon: 7065.2 ( 7.43x)
> avg_12_2x2_neon: 19.5 ( 1.71x)
> avg_12_4x4_neon: 24.2 ( 3.22x)
> avg_12_8x8_neon: 33.8 ( 7.57x)
> avg_12_16x16_neon: 120.2 ( 7.33x)
> avg_12_32x32_neon: 442.5 ( 8.53x)
> avg_12_64x64_neon: 1706.2 ( 8.02x)
> avg_12_128x128_neon: 7010.0 ( 7.46x)
>
> A72
> avg_8_2x2_neon: 30.2 ( 1.48x)
> avg_8_4x4_neon: 40.0 ( 3.10x)
> avg_8_8x8_neon: 91.0 ( 4.14x)
> avg_8_16x16_neon: 340.4 ( 3.92x)
> avg_8_32x32_neon: 1220.7 ( 4.67x)
> avg_8_64x64_neon: 5823.4 ( 3.88x)
> avg_8_128x128_neon: 17430.5 ( 4.73x)
> avg_10_2x2_neon: 34.0 ( 1.66x)
> avg_10_4x4_neon: 45.2 ( 2.73x)
> avg_10_8x8_neon: 97.5 ( 3.87x)
> avg_10_16x16_neon: 317.7 ( 3.90x)
> avg_10_32x32_neon: 1376.2 ( 4.21x)
> avg_10_64x64_neon: 5228.1 ( 3.71x)
> avg_10_128x128_neon: 16722.2 ( 4.17x)
> avg_12_2x2_neon: 31.7 ( 1.76x)
> avg_12_4x4_neon: 36.0 ( 3.44x)
> avg_12_8x8_neon: 91.7 ( 4.10x)
> avg_12_16x16_neon: 297.2 ( 4.13x)
> avg_12_32x32_neon: 1400.5 ( 4.14x)
> avg_12_64x64_neon: 5379.1 ( 3.51x)
> avg_12_128x128_neon: 16715.7 ( 4.17x)
>
> avg_8_2x2_neon: 33.7 ( 1.72x)
> avg_8_4x4_neon: 45.5 ( 2.84x)
> avg_8_8x8_neon: 65.0 ( 5.98x)
> avg_8_16x16_neon: 171.0 ( 7.81x)
> avg_8_32x32_neon: 558.2 (10.05x)
> avg_8_64x64_neon: 2006.5 (10.61x)
> avg_8_128x128_neon: 9158.7 ( 8.96x)
> avg_10_2x2_neon: 38.0 ( 1.92x)
> avg_10_4x4_neon: 53.2 ( 2.69x)
> avg_10_8x8_neon: 95.2 ( 4.08x)
> avg_10_16x16_neon: 243.0 ( 5.02x)
> avg_10_32x32_neon: 891.7 ( 5.64x)
> avg_10_64x64_neon: 3357.7 ( 5.60x)
> avg_10_128x128_neon: 12411.7 ( 5.56x)
> avg_12_2x2_neon: 34.7 ( 1.97x)
> avg_12_4x4_neon: 53.2 ( 2.68x)
> avg_12_8x8_neon: 91.7 ( 4.22x)
> avg_12_16x16_neon: 239.0 ( 5.08x)
> avg_12_32x32_neon: 895.7 ( 5.62x)
> avg_12_64x64_neon: 3317.5 ( 5.67x)
> avg_12_128x128_neon: 12358.5 ( 5.58x)
>
>
> A53
> avg_8_2x2_neon: 58.3 ( 1.41x)
> avg_8_4x4_neon: 101.8 ( 2.21x)
> avg_8_8x8_neon: 178.6 ( 4.53x)
> avg_8_16x16_neon: 569.5 ( 5.01x)
> avg_8_32x32_neon: 1962.5 ( 5.50x)
> avg_8_64x64_neon: 8327.8 ( 5.18x)
> avg_8_128x128_neon: 31631.3 ( 5.34x)
> avg_10_2x2_neon: 54.5 ( 1.56x)
> avg_10_4x4_neon: 88.8 ( 2.53x)
> avg_10_8x8_neon: 163.6 ( 4.97x)
> avg_10_16x16_neon: 550.5 ( 5.16x)
> avg_10_32x32_neon: 1942.5 ( 5.64x)
> avg_10_64x64_neon: 8783.5 ( 4.98x)
> avg_10_128x128_neon: 32617.0 ( 5.25x)
> avg_12_2x2_neon: 53.3 ( 1.66x)
> avg_12_4x4_neon: 86.8 ( 2.61x)
> avg_12_8x8_neon: 156.6 ( 5.12x)
> avg_12_16x16_neon: 541.3 ( 5.25x)
> avg_12_32x32_neon: 1955.3 ( 5.59x)
> avg_12_64x64_neon: 8686.0 ( 5.06x)
> avg_12_128x128_neon: 32487.5 ( 5.25x)
>
> avg_8_2x2_neon: 39.5 ( 1.96x)
> avg_8_4x4_neon: 65.3 ( 3.41x)
> avg_8_8x8_neon: 168.8 ( 4.79x)
> avg_8_16x16_neon: 348.0 ( 8.20x)
> avg_8_32x32_neon: 1207.5 ( 8.98x)
> avg_8_64x64_neon: 6032.3 ( 7.17x)
> avg_8_128x128_neon: 22008.5 ( 7.69x)
> avg_10_2x2_neon: 55.5 ( 1.52x)
> avg_10_4x4_neon: 73.8 ( 3.08x)
> avg_10_8x8_neon: 157.8 ( 5.12x)
> avg_10_16x16_neon: 445.0 ( 6.43x)
> avg_10_32x32_neon: 1587.3 ( 6.87x)
> avg_10_64x64_neon: 7738.0 ( 5.68x)
> avg_10_128x128_neon: 27813.8 ( 6.14x)
> avg_12_2x2_neon: 48.3 ( 1.80x)
> avg_12_4x4_neon: 77.0 ( 2.95x)
> avg_12_8x8_neon: 161.5 ( 4.98x)
> avg_12_16x16_neon: 433.5 ( 6.59x)
> avg_12_32x32_neon: 1622.0 ( 6.75x)
> avg_12_64x64_neon: 7844.5 ( 5.60x)
> avg_12_128x128_neon: 26999.5 ( 6.34x)
>
> Krzysztof
>
> libavcodec/aarch64/vvc/inter.S | 124 ++++++++++++++++++++++++++++++++-
> 1 file changed, 121 insertions(+), 3 deletions(-)
>
> diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
> index 0edc861f97..c9d698ee29 100644
> --- a/libavcodec/aarch64/vvc/inter.S
> +++ b/libavcodec/aarch64/vvc/inter.S
> @@ -217,13 +217,131 @@ function ff_vvc_\type\()_\bit_depth\()_neon, export=1
> endfunc
> .endm
>
> -vvc_avg avg, 8
> -vvc_avg avg, 10
> -vvc_avg avg, 12
> vvc_avg w_avg, 8
> vvc_avg w_avg, 10
> vvc_avg w_avg, 12
>
> +.macro vvc_avg2 bit_depth
> +function ff_vvc_avg_\bit_depth\()_neon, export=1
> + mov x10, #(VVC_MAX_PB_SIZE * 2)
> + movi v16.8h, #0
> + movi v17.16b, #255
> + ushr v17.8h, v17.8h, #(16 - \bit_depth)
Please set v16/v17 only for bit_depth > 8; the 8-bit paths never use them. LGTM otherwise.
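Something along these lines, reusing the .if \bit_depth == 8 convention already used in this macro (untested sketch):

        mov             x10, #(VVC_MAX_PB_SIZE * 2)
.if \bit_depth > 8
        movi            v16.8h, #0
        movi            v17.16b, #255
        ushr            v17.8h, v17.8h, #(16 - \bit_depth)
.endif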
> +
> + cmp w4, #8
> + b.gt 16f
> + b.eq 8f
> + cmp w4, #4
> + b.eq 4f
> +
> +2: // width == 2
> + ldr s0, [x2]
> + subs w5, w5, #1
> + ldr s1, [x3]
> +.if \bit_depth == 8
> + shadd v0.4h, v0.4h, v1.4h
> + sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth)
> + str h0, [x0]
> +.else
> + shadd v0.4h, v0.4h, v1.4h
> + srshr v0.4h, v0.4h, #(15 - 1 - \bit_depth)
> + smax v0.4h, v0.4h, v16.4h
> + smin v0.4h, v0.4h, v17.4h
> + str s0, [x0]
> +.endif
> + add x2, x2, #(VVC_MAX_PB_SIZE * 2)
> + add x3, x3, #(VVC_MAX_PB_SIZE * 2)
> + add x0, x0, x1
> + b.ne 2b
> + ret
> +
> +4: // width == 4
> + ldr d0, [x2]
> + subs w5, w5, #1
> + ldr d1, [x3]
> +.if \bit_depth == 8
> + shadd v0.4h, v0.4h, v1.4h
> + sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth)
> + str s0, [x0]
> +.else
> + shadd v0.4h, v0.4h, v1.4h
> + srshr v0.4h, v0.4h, #(15 - 1 - \bit_depth)
> + smax v0.4h, v0.4h, v16.4h
> + smin v0.4h, v0.4h, v17.4h
> + str d0, [x0]
> +.endif
> + add x2, x2, #(VVC_MAX_PB_SIZE * 2)
> + add x3, x3, #(VVC_MAX_PB_SIZE * 2)
> + add x0, x0, x1
> + b.ne 4b
> + ret
> +
> +8: // width == 8
> + ldr q0, [x2]
> + subs w5, w5, #1
> + ldr q1, [x3]
> +.if \bit_depth == 8
> + shadd v0.8h, v0.8h, v1.8h
> + sqrshrun v0.8b, v0.8h, #(15 - 1 - \bit_depth)
> + str d0, [x0]
> +.else
> + shadd v0.8h, v0.8h, v1.8h
> + srshr v0.8h, v0.8h, #(15 - 1 - \bit_depth)
> + smax v0.8h, v0.8h, v16.8h
> + smin v0.8h, v0.8h, v17.8h
> + str q0, [x0]
> +.endif
> + add x2, x2, #(VVC_MAX_PB_SIZE * 2)
> + add x3, x3, #(VVC_MAX_PB_SIZE * 2)
> + add x0, x0, x1
> + b.ne 8b
> + ret
> +
> +16: // width >= 16
> +.if \bit_depth == 8
> + sub x1, x1, w4, sxtw
> +.else
> + sub x1, x1, w4, sxtw #1
> +.endif
> + sub x10, x10, w4, sxtw #1
> +3:
> + mov w6, w4 // width
> +1:
> + ldp q0, q1, [x2], #32
> + subs w6, w6, #16
> + ldp q2, q3, [x3], #32
> +.if \bit_depth == 8
> + shadd v4.8h, v0.8h, v2.8h
> + shadd v5.8h, v1.8h, v3.8h
> + sqrshrun v0.8b, v4.8h, #6
> + sqrshrun2 v0.16b, v5.8h, #6
> + st1 {v0.16b}, [x0], #16
> +.else
> + shadd v4.8h, v0.8h, v2.8h
> + shadd v5.8h, v1.8h, v3.8h
> + srshr v0.8h, v4.8h, #(15 - 1 - \bit_depth)
> + srshr v1.8h, v5.8h, #(15 - 1 - \bit_depth)
> + smax v0.8h, v0.8h, v16.8h
> + smax v1.8h, v1.8h, v16.8h
> + smin v0.8h, v0.8h, v17.8h
> + smin v1.8h, v1.8h, v17.8h
> + stp q0, q1, [x0], #32
> +.endif
> + b.ne 1b
> +
> + subs w5, w5, #1
> + add x2, x2, x10
> + add x3, x3, x10
> + add x0, x0, x1
> + b.ne 3b
> + ret
> +endfunc
> +.endm
> +
> +vvc_avg2 8
> +vvc_avg2 10
> +vvc_avg2 12
> +
> /* x0: int16_t *dst
> * x1: const uint8_t *_src
> * x2: ptrdiff_t _src_stride
> --
> 2.47.2
>