[FFmpeg-devel] [PATCH] swscale: rgb_to_yuv neon optimizations
Dmitriy Kovalenko
dmtr.kovalenko at outlook.com
Thu May 22 09:55:39 EEST 2025
Bumping on the review for this one
On 19/05/2025 21:50, Dmitriy Kovalenko wrote:
> I've found quite a few ways to optimize existing ffmpeg's rgb to yuv
> subsampled conversion. In this patch stack I'll try to
> improve the performance.
>
> This particular set of changes is a small improvement to all the
> existing functions and macro. The biggest performance gain is
> coming from post loading increment of the pointer and immediate
> pref etching of the memory blocks and interleaving the multiplication
> shifting operations of
> different registers for better scheduling.
>
> Also changed a bunch of places where cmp + b.le was used instead
> of one instruction cbnz/tbnz and some other small cleanups.
>
> Here are checkasm results on the macbook pro with the latest M4 max
>
> <before>
>
> bgra_to_uv_1080_c: 257.5 ( 1.00x)
> bgra_to_uv_1080_neon: 211.9 ( 1.22x)
> bgra_to_uv_1920_c: 467.1 ( 1.00x)
> bgra_to_uv_1920_neon: 379.3 ( 1.23x)
> bgra_to_uv_half_1080_c: 198.9 ( 1.00x)
> bgra_to_uv_half_1080_neon: 125.7 ( 1.58x)
> bgra_to_uv_half_1920_c: 346.3 ( 1.00x)
> bgra_to_uv_half_1920_neon: 223.7 ( 1.55x)
>
> <after>
>
> bgra_to_uv_1080_c: 268.3 ( 1.00x)
> bgra_to_uv_1080_neon: 176.0 ( 1.53x)
> bgra_to_uv_1920_c: 456.6 ( 1.00x)
> bgra_to_uv_1920_neon: 307.7 ( 1.48x)
> bgra_to_uv_half_1080_c: 193.2 ( 1.00x)
> bgra_to_uv_half_1080_neon: 96.8 ( 2.00x)
> bgra_to_uv_half_1920_c: 347.2 ( 1.00x)
> bgra_to_uv_half_1920_neon: 182.6 ( 1.92x)
>
> With my proprietary test on IOS it gives around 70% of performance
> improvement converting bgra 1920x1920 image to yuv420p
>
> On my linux arm cortex-r processing the performance improvement not that
> visible but still consistently faster by 5-10% than the current
> implementation.
>
> Signed-off-by: Dmitriy Kovalenko <dmtr.kovalenko at outlook.com>
> ---
> libswscale/aarch64/input.S | 166 +++++++++++++++++++++++++------------
> 1 file changed, 112 insertions(+), 54 deletions(-)
>
> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
> index c1c0adffc8..ee8eb24c14 100644
> --- a/libswscale/aarch64/input.S
> +++ b/libswscale/aarch64/input.S
> @@ -1,5 +1,4 @@
> -/*
> - * Copyright (c) 2024 Zhao Zhili <quinkblack at foxmail.com>
> +/* Copyright (c) 2024 Zhao Zhili <quinkblack at foxmail.com>
> *
> * This file is part of FFmpeg.
> *
> @@ -57,20 +56,41 @@
> sqshrn2 \dst\().8h, \dst2\().4s, \right_shift //
> dst_higher_half = dst2 >> right_shift
> .endm
> +// interleaved product version of the rgb to yuv gives slightly
> better performance on non-performant mobile +.macro
> rgb_to_uv_interleaved_product r, g, b, u_coef0, u_coef1, u_coef2,
> v_coef0, v_coef1, v_coef2, u_dst1, u_dst2, v_dst1, v_dst2, u_dst,
> v_dst, right_shift
> + smlal \u_dst1\().4s, \u_coef0\().4h, \r\().4h // U += ru *
> r (first 4)
> + smlal \v_dst1\().4s, \v_coef0\().4h, \r\().4h // V += rv *
> r (first 4)
> + smlal2 \u_dst2\().4s, \u_coef0\().8h, \r\().8h // U += ru *
> r (second 4)
> + smlal2 \v_dst2\().4s, \v_coef0\().8h, \r\().8h // V += rv *
> r (second 4)
> + + smlal \u_dst1\().4s, \u_coef1\().4h, \g\().4h // U +=
> gu * g (first 4)
> + smlal \v_dst1\().4s, \v_coef1\().4h, \g\().4h // V += gv *
> g (first 4)
> + smlal2 \u_dst2\().4s, \u_coef1\().8h, \g\().8h // U += gu *
> g (second 4)
> + smlal2 \v_dst2\().4s, \v_coef1\().8h, \g\().8h // V += gv *
> g (second 4)
> + + smlal \u_dst1\().4s, \u_coef2\().4h, \b\().4h // U +=
> bu * b (first 4)
> + smlal \v_dst1\().4s, \v_coef2\().4h, \b\().4h // V += bv *
> b (first 4)
> + smlal2 \u_dst2\().4s, \u_coef2\().8h, \b\().8h // U += bu *
> b (second 4)
> + smlal2 \v_dst2\().4s, \v_coef2\().8h, \b\().8h // V += bv *
> b (second 4)
> +
> + sqshrn \u_dst\().4h, \u_dst1\().4s, \right_shift // U first 4
> pixels
> + sqshrn2 \u_dst\().8h, \u_dst2\().4s, \right_shift // U all 8
> pixels
> + sqshrn \v_dst\().4h, \v_dst1\().4s, \right_shift // V first 4
> pixels
> + sqshrn2 \v_dst\().8h, \v_dst2\().4s, \right_shift // V all 8
> pixels
> +.endm
> +
> .macro rgbToY_neon fmt_bgr, fmt_rgb, element, alpha_first=0
> function ff_\fmt_bgr\()ToY_neon, export=1
> - cmp w4, #0 // check width > 0
> + cbz w4, 3f // check width > 0
> ldp w12, w11, [x5] // w12: ry, w11: gy
> ldr w10, [x5, #8] // w10: by
> - b.gt 4f
> - ret
> + b 4f
> endfunc
> function ff_\fmt_rgb\()ToY_neon, export=1
> - cmp w4, #0 // check width > 0
> + cbz w4, 3f // check width > 0
> ldp w10, w11, [x5] // w10: ry, w11: gy
> ldr w12, [x5, #8] // w12: by
> - b.le 3f
> 4:
> mov w9, #256 // w9 = 1 <<
> (RGB2YUV_SHIFT - 7)
> movk w9, #8, lsl #16 // w9 += 32 <<
> (RGB2YUV_SHIFT - 1)
> @@ -158,8 +178,7 @@ rgbToY_neon abgr32, argb32, element=4, alpha_first=1
> .macro rgbToUV_half_neon fmt_bgr, fmt_rgb, element, alpha_first=0
> function ff_\fmt_bgr\()ToUV_half_neon, export=1
> - cmp w5, #0 // check width > 0
> - b.le 3f
> + cbz w5, 3f // check width > 0
> ldp w12, w11, [x6, #12]
> ldp w10, w15, [x6, #20]
> @@ -168,7 +187,7 @@ function ff_\fmt_bgr\()ToUV_half_neon, export=1
> endfunc
> function ff_\fmt_rgb\()ToUV_half_neon, export=1
> - cmp w5, #0 // check width > 0
> + cmp w5, #0 // check width > 0
> b.le 3f
> ldp w10, w11, [x6, #12] // w10: ru, w11: gu
> @@ -178,32 +197,41 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
> cmp w5, #8
> rgb_set_uv_coeff half=1
> b.lt 2f
> -1:
> +1: // load 16 pixels and prefetch memory for the next block
> .if \element == 3
> - ld3 { v16.16b, v17.16b, v18.16b }, [x3]
> + ld3 { v16.16b, v17.16b, v18.16b }, [x3], #48
> + prfm pldl1strm, [x3, #48]
> .else
> - ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3]
> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3],
> #64
> + prfm pldl1strm, [x3, #64]
> .endif
> + .if \alpha_first
> - uaddlp v21.8h, v19.16b
> - uaddlp v20.8h, v18.16b
> - uaddlp v19.8h, v17.16b
> + uaddlp v21.8h, v19.16b // v21: summed b pairs
> + uaddlp v20.8h, v18.16b // v20: summed g pairs
> + uaddlp v19.8h, v17.16b // v19: summed r pairs
> .else
> - uaddlp v19.8h, v16.16b // v19: r
> - uaddlp v20.8h, v17.16b // v20: g
> - uaddlp v21.8h, v18.16b // v21: b
> + uaddlp v19.8h, v16.16b // v19: summed r pairs
> + uaddlp v20.8h, v17.16b // v20: summed g pairs
> + uaddlp v21.8h, v18.16b // v21: summed b pairs
> .endif
> - rgb_to_yuv_product v19, v20, v21, v22, v23, v16, v0, v1, v2,
> #10
> - rgb_to_yuv_product v19, v20, v21, v24, v25, v17, v3, v4, v5, #10
> - sub w5, w5, #8 // width -= 8
> - add x3, x3, #(16*\element)
> - cmp w5, #8 // width >= 8 ?
> + mov v22.16b, v6.16b // U first half
> + mov v23.16b, v6.16b // U second half
> + mov v24.16b, v6.16b // V first half
> + mov v25.16b, v6.16b // V second half
> +
> + rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3,
> v4, v5, v22, v23, v24, v25, v16, v17, #10
> +
> str q16, [x0], #16 // store dst_u
> str q17, [x1], #16 // store dst_v
> + + sub w5, w5, #8 // width -= 8
> + cmp w5, #8 // width >= 8 ?
> b.ge 1b
> - cbz w5, 3f
> -2:
> + cbz w5, 3f // No pixels left? Exit
> +
> +2: // Scalar fallback for remaining pixels
> .if \alpha_first
> rgb_load_add_half 1, 5, 2, 6, 3, 7
> .else
> @@ -213,21 +241,24 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
> rgb_load_add_half 0, 4, 1, 5, 2, 6
> .endif
> .endif
> -
> smaddl x8, w2, w10, x9 // dst_u = ru * r +
> const_offset
> + smaddl x16, w2, w13, x9 // dst_v = rv * r +
> const_offset (parallel)
> + smaddl x8, w4, w11, x8 // dst_u +=
> gu * g
> + smaddl x16, w4, w14, x16 // dst_v += gv * g
> (parallel)
> + smaddl x8, w7, w12, x8 // dst_u +=
> bu * b
> - asr x8, x8, #10 // dst_u >>= 10
> + smaddl x16, w7, w15, x16 // dst_v += bv * b
> (parallel)
> + + asr w8, w8, #10 // dst_u >>= 10
> + asr w16, w16, #10 // dst_v >>= 10
> + strh w8, [x0], #2 // store dst_u
> -
> - smaddl x8, w2, w13, x9 // dst_v = rv * r +
> const_offset
> - smaddl x8, w4, w14, x8 // dst_v += gv * g
> - smaddl x8, w7, w15, x8 // dst_v += bv * b
> - asr x8, x8, #10 // dst_v >>= 10
> - sub w5, w5, #1
> - add x3, x3, #(2*\element)
> - strh w8, [x1], #2 // store dst_v
> - cbnz w5, 2b
> + strh w16, [x1], #2 // store dst_v
> + + sub w5, w5, #1 // width--
> + add x3, x3, #(2*\element) // Advance source
> pointer
> + cbnz w5, 2b // Process next pixel
> if any left
> 3:
> ret
> endfunc
> @@ -244,9 +275,9 @@ function ff_\fmt_bgr\()ToUV_neon, export=1
> cmp w5, #0 // check width > 0
> b.le 3f
> - ldp w12, w11, [x6, #12]
> - ldp w10, w15, [x6, #20]
> - ldp w14, w13, [x6, #28]
> + ldp w12, w11, [x6, #12] // bu, gu
> + ldp w10, w15, [x6, #20] // ru, bv
> + ldp w14, w13, [x6, #28] // gv, rv
> b 4f
> endfunc
> @@ -263,21 +294,48 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
> b.lt 2f
> 1:
> .if \alpha_first
> - argb_to_yuv_load_rgb x3
> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3],
> #64
> + uxtl v21.8h, v19.8b // v21: b
> + uxtl2 v24.8h, v19.16b // v24: b
> + uxtl v19.8h, v17.8b // v19: r
> + uxtl v20.8h, v18.8b // v20: g
> + uxtl2 v22.8h, v17.16b // v22: r
> + uxtl2 v23.8h, v18.16b // v23: g
> .else
> - rgb_to_yuv_load_rgb x3, \element
> + .if \element == 3
> + ld3 { v16.16b, v17.16b, v18.16b }, [x3], #48
> + prfm pldl1strm, [x3, #48]
> + .else // element == 4
> + ld4 { v16.16b, v17.16b, v18.16b, v19.16b },
> [x3], #64
> + prfm pldl1strm, [x3, #64]
> + .endif
> + uxtl v19.8h, v16.8b // v19: r
> + uxtl v20.8h, v17.8b // v20: g
> + uxtl v21.8h, v18.8b // v21: b
> + uxtl2 v22.8h, v16.16b // v22: r
> + uxtl2 v23.8h, v17.16b // v23: g
> + uxtl2 v24.8h, v18.16b // v24: b
> .endif
> - rgb_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
> - rgb_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
> - rgb_to_yuv_product v19, v20, v21, v25, v26, v18, v3, v4, v5, #9
> - rgb_to_yuv_product v22, v23, v24, v27, v28, v19, v3, v4, v5, #9
> - sub w5, w5, #16
> - add x3, x3, #(16*\element)
> - cmp w5, #16
> - stp q16, q17, [x0], #32 // store to dst_u
> - stp q18, q19, [x1], #32 // store to dst_v
> + // process 2 groups of 8 pixels
> + mov v25.16b, v6.16b // U_dst1 =
> const_offset (32-bit accumulators)
> + mov v26.16b, v6.16b // U_dst2 = const_offset
> + mov v27.16b, v6.16b // V_dst1 = const_offset
> + mov v28.16b, v6.16b // V_dst2 = const_offset
> + rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3,
> v4, v5, v25, v26, v27, v28, v16, v18, #9
> +
> + mov v25.16b, v6.16b + mov v26.16b, v6.16b
> + mov v27.16b, v6.16b
> + mov v28.16b, v6.16b
> + rgb_to_uv_interleaved_product v22, v23, v24, v0, v1, v2, v3,
> v4, v5, v25, v26, v27, v28, v17, v19, #9
> +
> + sub w5, w5, #16 // width -= 16
> + cmp w5, #16 // width >= 16 ?
> + stp q16, q17, [x0], #32 // store to dst_u
> (post-increment)
> + stp q18, q19, [x1], #32 // store to dst_v
> (post-increment)
> b.ge 1b
> - cbz w5, 3f
> + cbz w5, 3f // No pixels left? Exit
> +
> 2:
> .if \alpha_first
> ldrb w16, [x3, #1] // w16: r
> @@ -292,13 +350,13 @@ function ff_\fmt_rgb\()ToUV_neon, export=1
> smaddl x8, w16, w10, x9 // x8 = ru * r +
> const_offset
> smaddl x8, w17, w11, x8 // x8 += gu * g
> smaddl x8, w4, w12, x8 // x8 += bu * b
> - asr w8, w8, #9 // x8 >>= 9
> + asr x8, x8, #9 // x8 >>= 9
> strh w8, [x0], #2 // store to dst_u
> smaddl x8, w16, w13, x9 // x8 = rv * r +
> const_offset
> smaddl x8, w17, w14, x8 // x8 += gv * g
> smaddl x8, w4, w15, x8 // x8 += bv * b
> - asr w8, w8, #9 // x8 >>= 9
> + asr x8, x8, #9 // x8 >>= 9
> sub w5, w5, #1 // width--
> add x3, x3, #\element
> strh w8, [x1], #2 // store to dst_v
More information about the ffmpeg-devel
mailing list