[FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 16 pixels at a time

Martin Storsjö martin at martin.st
Fri May 30 10:08:18 EEST 2025


On Thu, 29 May 2025, Martin Storsjö wrote:

> On Tue, 27 May 2025, Dmitriy Kovalenko wrote:
>
>> This patch integrates so-called double buffering: we load 2 batch
>> elements at a time and then process them in parallel. On modern ARM
>> processors, especially Apple Silicon, it gives a visible benefit; for
>> subsampled pixel processing it is especially nice because it allows
>> reading elements with 2 instructions and writing with a single one
>> (which is usually the slowest part).
>> 
>> Including the previous patch in the stack, on a MacBook Pro M4 Max 
>> rgb_to_yuv_half
>> in checkasm speeds up to 2x over the C version
>> ---
>> libswscale/aarch64/input.S | 332 ++++++++++++++++++++++++++++++++++---
>> 1 file changed, 309 insertions(+), 23 deletions(-)
>> 
>> diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
>> index ee8eb24c14..59d66d0022 100644
>> --- a/libswscale/aarch64/input.S
>> +++ b/libswscale/aarch64/input.S
>> @@ -194,40 +194,94 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
>>         ldp             w12, w13, [x6, #20]     // w12: bu, w13: rv
>>         ldp             w14, w15, [x6, #28]     // w14: gv, w15: bv
>> 4:
>> -        cmp             w5, #8
>>         rgb_set_uv_coeff half=1
>> -        b.lt            2f
>> -1:  // load 16 pixels and prefetch memory for the next block
>> +
>> +        cmp             w5, #16
>> +        b.lt            2f                      // Go directly to scalar 
>> if < 16
>> +
>> +1:
>>     .if \element == 3
>> -        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48
>> -        prfm            pldl1strm, [x3, #48]
>> +        ld3             { v16.16b, v17.16b, v18.16b }, [x3], #48  // First 
>> 16 pixels
>> +        ld3             { v26.16b, v27.16b, v28.16b }, [x3], #48  // 
>> Second 16 pixels
>> +        prfm            pldl1keep, [x3, #96]
>>     .else
>> -        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
>> -        prfm            pldl1strm, [x3, #64]
>> +        ld4             { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64 
>> // First 16 pixels
>> +        ld4             { v26.16b, v27.16b, v28.16b, v29.16b }, [x3], #64 
>> // Second 16 pixels
>> +        prfm            pldl1keep, [x3, #128]
>>     .endif
>> 
>> +    // **Sum adjacent pixel pairs**
>>     .if \alpha_first
>> -        uaddlp          v21.8h, v19.16b         // v21: summed b pairs
>> -        uaddlp          v20.8h, v18.16b         // v20: summed g pairs
>> -        uaddlp          v19.8h, v17.16b         // v19: summed r pairs
>> +        uaddlp          v21.8h, v19.16b         // Block 1: B sums
>> +        uaddlp          v20.8h, v18.16b         // Block 1: G sums
>> +        uaddlp          v19.8h, v17.16b         // Block 1: R sums
>> +        uaddlp          v31.8h, v29.16b         // Block 2: B sums
>> +        uaddlp          v30.8h, v28.16b         // Block 2: G sums
>> +        uaddlp          v29.8h, v27.16b         // Block 2: R sums
>>     .else
>> -        uaddlp          v19.8h, v16.16b         // v19: summed r pairs
>> -        uaddlp          v20.8h, v17.16b         // v20: summed g pairs
>> -        uaddlp          v21.8h, v18.16b         // v21: summed b pairs
>> +        uaddlp          v19.8h, v16.16b         // Block 1: R sums
>> +        uaddlp          v20.8h, v17.16b         // Block 1: G sums
>> +        uaddlp          v21.8h, v18.16b         // Block 1: B sums
>> +        uaddlp          v29.8h, v26.16b         // Block 2: R sums
>> +        uaddlp          v30.8h, v27.16b         // Block 2: G sums
>> +        uaddlp          v31.8h, v28.16b         // Block 2: B sums
>>     .endif
>> 
>> -        mov             v22.16b, v6.16b         // U first half
>> -        mov             v23.16b, v6.16b         // U second half
>> -        mov             v24.16b, v6.16b         // V first half
>> -        mov             v25.16b, v6.16b         // V second half
>> -
>> -        rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, 
>> v5, v22, v23, v24, v25, v16, v17, #10
>> +        // init accumulators for both blocks
>> +        mov             v7.16b, v6.16b          //  U_low
>> +        mov             v8.16b, v6.16b          //  U_high
>> +        mov             v9.16b, v6.16b          //  V_low
>> +        mov             v10.16b, v6.16b         //  V_high
>> +        mov             v11.16b, v6.16b         //  U_low
>> +        mov             v12.16b, v6.16b         //  U_high
>> +        mov             v13.16b, v6.16b         //  V_low
>> +        mov             v14.16b, v6.16b         //  V_high
>> +
>> +        smlal           v7.4s, v0.4h, v19.4h    // U += ru * r (0-3)
>> +        smlal           v9.4s, v3.4h, v19.4h    // V += rv * r (0-3)
>> +        smlal           v11.4s, v0.4h, v29.4h   // U += ru * r (0-3)
>> +        smlal           v13.4s, v3.4h, v29.4h   // V += rv * r (0-3)
>> +
>> +        smlal2          v8.4s, v0.8h, v19.8h    // U += ru * r (4-7)
>> +        smlal2          v10.4s, v3.8h, v19.8h   // V += rv * r (4-7)
>> +        smlal2          v12.4s, v0.8h, v29.8h   // U += ru * r (4-7)
>> +        smlal2          v14.4s, v3.8h, v29.8h   // V += rv * r (4-7)
>> +
>> +        smlal           v7.4s, v1.4h, v20.4h    // U += gu * g (0-3)
>> +        smlal           v9.4s, v4.4h, v20.4h    // V += gv * g (0-3)
>> +        smlal           v11.4s, v1.4h, v30.4h   // U += gu * g (0-3)
>> +        smlal           v13.4s, v4.4h, v30.4h   // V += gv * g (0-3)
>> +
>> +        smlal2          v8.4s, v1.8h, v20.8h    // U += gu * g (4-7)
>> +        smlal2          v10.4s, v4.8h, v20.8h   // V += gv * g (4-7)
>> +        smlal2          v12.4s, v1.8h, v30.8h   // U += gu * g (4-7)
>> +        smlal2          v14.4s, v4.8h, v30.8h   // V += gv * g (4-7)
>> +
>> +        smlal           v7.4s, v2.4h, v21.4h    // U += bu * b (0-3)
>> +        smlal           v9.4s, v5.4h, v21.4h    // V += bv * b (0-3)
>> +        smlal           v11.4s, v2.4h, v31.4h   // U += bu * b (0-3)
>> +        smlal           v13.4s, v5.4h, v31.4h   // V += bv * b (0-3)
>> +
>> +        smlal2          v8.4s, v2.8h, v21.8h    // U += bu * b (4-7)
>> +        smlal2          v10.4s, v5.8h, v21.8h   // V += bv * b (4-7)
>> +        smlal2          v12.4s, v2.8h, v31.8h   // U += bu * b (4-7)
>> +        smlal2          v14.4s, v5.8h, v31.8h   // V += bv * b (4-7)
>> +
>> +        sqshrn          v16.4h, v7.4s, #10      // U (0-3)
>> +        sqshrn          v17.4h, v9.4s, #10      // V (0-3)
>> +        sqshrn          v22.4h, v11.4s, #10     // U (0-3)
>> +        sqshrn          v23.4h, v13.4s, #10     // V (0-3)
>> +
>> +        sqshrn2         v16.8h, v8.4s, #10      // U (0-7)
>> +        sqshrn2         v17.8h, v10.4s, #10     // V (0-7)
>> +        sqshrn2         v22.8h, v12.4s, #10     // U (0-7)
>> +        sqshrn2         v23.8h, v14.4s, #10     // V (0-7)
>> 
>> -        str             q16, [x0], #16          // store dst_u
>> -        str             q17, [x1], #16          // store dst_v
>> +        stp             q16, q22, [x0], #32     // Store all 16 U values
>> +        stp             q17, q23, [x1], #32     // Store all 16 V values
>> 
>> -        sub             w5, w5, #8              // width -= 8
>> -        cmp             w5, #8                  // width >= 8 ?
>> +        sub             w5, w5, #16             // width -= 16
>> +        cmp             w5, #16                 // width >= 16 ?
>>         b.ge            1b
>>         cbz             w5, 3f                  // No pixels left? Exit
>> 
>> @@ -459,3 +513,235 @@ endfunc
>> 
>> DISABLE_DOTPROD
>> #endif
>> +
>> +.macro rgbToUV_half_neon_double fmt_bgr, fmt_rgb, element, alpha_first=0
>> +function ff_\fmt_bgr\()ToUV_half_neon_double, export=1
>> +        cbz             w5, 9f                  // exit immediately if 
>> width is 0
>> +        cmp             w5, #16                 // check if we have at 
>> least 16 pixels
>> +        b.lt            _ff_\fmt_bgr\()ToUV_half_neon
>
> Also, with that fixed, this fails to properly back up and restore registers 
> v8-v15; checkasm doesn't notice this on macOS, but on Linux and windows, 
> checkasm has a call wrapper which does detect such issues.

This comment is still unaddressed, checkasm still fails on Linux and 
Windows.

// Martin


More information about the ffmpeg-devel mailing list