[FFmpeg-devel] [PATCH 2/2] swscale: Neon rgb_to_yuv_half process 32 pixels at a time
Dmitriy Kovalenko
dmtr.kovalenko at outlook.com
Sat May 31 12:11:45 EEST 2025
This patch integrates so-called double buffering: we load two batches
of elements at a time and then process them in parallel. On modern ARM
processors, especially Apple Silicon, this gives a visible benefit. It
is especially nice for subsampled pixel processing, because it lets us
read the elements with two load instructions and write the results with
a single store (especially noticeable on platforms with slower memory,
such as iOS devices).

With the previous patch in this stack included, rgb_to_yuv_half in
checkasm reaches 2x the speed of the C version on a MacBook Pro M4 Max.
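
The idea, as a sketch in C with Neon intrinsics (an illustrative
example with made-up names, not the code in this patch; it only
pairwise-sums a single channel):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Sum horizontal pixel pairs of the R channel of RGBA input.
     * width is the number of output elements, assumed here to be a
     * multiple of 16 for brevity. */
    void sum_r_pairs_double_buffered(const uint8_t *src,
                                     uint16_t *dst, int width)
    {
        for (int i = 0; i < width; i += 16) {
            uint8x16x4_t a = vld4q_u8(src);      /* batch 1: 16 pixels */
            uint8x16x4_t b = vld4q_u8(src + 64); /* batch 2: 16 pixels */
            src += 128;

            /* the batches are independent, so the second load can
             * overlap the first batch's arithmetic in the pipeline */
            uint16x8_t ra = vpaddlq_u8(a.val[0]); /* summed r pairs, 1 */
            uint16x8_t rb = vpaddlq_u8(b.val[0]); /* summed r pairs, 2 */

            vst1q_u16(dst,     ra);
            vst1q_u16(dst + 8, rb);
            dst += 16;
        }
    }

In the real kernel the whole U/V computation sits between the loads and
the stores, and a paired stp writes both batches' results to each plane
with a single instruction.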
---
libswscale/aarch64/input.S | 133 ++++++++++++++++++++++++++-----------
1 file changed, 94 insertions(+), 39 deletions(-)
diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
index 260a26e965..b90ca05996 100644
--- a/libswscale/aarch64/input.S
+++ b/libswscale/aarch64/input.S
@@ -178,7 +178,7 @@ rgbToY_neon abgr32, argb32, element=4, alpha_first=1
.macro rgbToUV_half_neon fmt_bgr, fmt_rgb, element, alpha_first=0
function ff_\fmt_bgr\()ToUV_half_neon, export=1
- cbz w5, 3f // check width > 0
+ cbz w5, 3f // check width > 0
ldp w12, w11, [x6, #12]
ldp w10, w15, [x6, #20]
@@ -187,49 +187,104 @@ function ff_\fmt_bgr\()ToUV_half_neon, export=1
endfunc
function ff_\fmt_rgb\()ToUV_half_neon, export=1
- cmp w5, #0 // check width > 0
+ cmp w5, #0 // check width > 0
b.le 3f
- ldp w10, w11, [x6, #12] // w10: ru, w11: gu
- ldp w12, w13, [x6, #20] // w12: bu, w13: rv
- ldp w14, w15, [x6, #28] // w14: gv, w15: bv
+ ldp w10, w11, [x6, #12] // w10: ru, w11: gu
+ ldp w12, w13, [x6, #20] // w12: bu, w13: rv
+ ldp w14, w15, [x6, #28] // w14: gv, w15: bv
4:
- cmp w5, #8
rgb_set_uv_coeff half=1
+
+ cmp w5, #16 // width >= 16 ?
b.lt 2f
-1: // load 16 pixels
+
+1: // load 32 pixels (two batches of 16)
.if \element == 3
ld3 { v16.16b, v17.16b, v18.16b }, [x3], #48
+ ld3 { v26.16b, v27.16b, v28.16b }, [x3], #48 // second batch
.else
ld4 { v16.16b, v17.16b, v18.16b, v19.16b }, [x3], #64
+ ld4 { v26.16b, v27.16b, v28.16b, v29.16b }, [x3], #64 // second batch
.endif
.if \alpha_first
- uaddlp v21.8h, v19.16b // v21: summed b pairs
- uaddlp v20.8h, v18.16b // v20: summed g pairs
- uaddlp v19.8h, v17.16b // v19: summed r pairs
+ uaddlp v21.8h, v19.16b // v21: summed b pairs
+ uaddlp v20.8h, v18.16b // v20: summed g pairs
+ uaddlp v19.8h, v17.16b // v19: summed r pairs
+ uaddlp v31.8h, v29.16b // v31: summed b pairs (batch 2)
+ uaddlp v30.8h, v28.16b // v30: summed g pairs (batch 2)
+ uaddlp v29.8h, v27.16b // v29: summed r pairs (batch 2)
.else
- uaddlp v19.8h, v16.16b // v19: summed r pairs
- uaddlp v20.8h, v17.16b // v20: summed g pairs
- uaddlp v21.8h, v18.16b // v21: summed b pairs
+ uaddlp v19.8h, v16.16b // v19: summed r pairs
+ uaddlp v20.8h, v17.16b // v20: summed g pairs
+ uaddlp v21.8h, v18.16b // v21: summed b pairs
+ uaddlp v29.8h, v26.16b // v29: summed r pairs (batch 2)
+ uaddlp v30.8h, v27.16b // v30: summed g pairs (batch 2)
+ uaddlp v31.8h, v28.16b // v31: summed b pairs (batch 2)
.endif
- mov v22.16b, v6.16b // U first half
- mov v23.16b, v6.16b // U second half
- mov v24.16b, v6.16b // V first half
- mov v25.16b, v6.16b // V second half
-
- rgb_to_uv_interleaved_product v19, v20, v21, v0, v1, v2, v3, v4, v5, v22, v23, v24, v25, v16, v17, #10
-
- str q16, [x0], #16 // store dst_u
- str q17, [x1], #16 // store dst_v
+ mov v7.16b, v6.16b // U1 lo accumulator, init with const offset
+ mov v16.16b, v6.16b // U1 hi accumulator
+ mov v17.16b, v6.16b // V1 lo accumulator
+ mov v18.16b, v6.16b // V1 hi accumulator
+ mov v26.16b, v6.16b // U2 lo accumulator
+ mov v27.16b, v6.16b // U2 hi accumulator
+ mov v28.16b, v6.16b // V2 lo accumulator
+ mov v25.16b, v6.16b // V2 hi accumulator
- sub w5, w5, #8 // width -= 8
- cmp w5, #8 // width >= 8 ?
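+ // Interleaved widening multiply-accumulates: the U/V accumulators
+ // for both batches advance together, keeping independent work in flight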
+ smlal v7.4s, v0.4h, v19.4h // U1 lo += ru * r1
+ smlal v17.4s, v3.4h, v19.4h // V1 lo += rv * r1
+ smlal v26.4s, v0.4h, v29.4h // U2 lo += ru * r2
+ smlal v28.4s, v3.4h, v29.4h // V2 lo += rv * r2
+
+ smlal2 v16.4s, v0.8h, v19.8h // U1 hi += ru * r1
+ smlal2 v18.4s, v3.8h, v19.8h // V1 hi += rv * r1
+ smlal2 v27.4s, v0.8h, v29.8h // U2 hi += ru * r2
+ smlal2 v25.4s, v3.8h, v29.8h // V2 hi += rv * r2
+
+ smlal v7.4s, v1.4h, v20.4h // U1 lo += gu * g1
+ smlal v17.4s, v4.4h, v20.4h // V1 lo += gv * g1
+ smlal v26.4s, v1.4h, v30.4h // U2 lo += gu * g2
+ smlal v28.4s, v4.4h, v30.4h // V2 lo += gv * g2
+
+ smlal2 v16.4s, v1.8h, v20.8h // U1 hi += gu * g1
+ smlal2 v18.4s, v4.8h, v20.8h // V1 hi += gv * g1
+ smlal2 v27.4s, v1.8h, v30.8h // U2 hi += gu * g2
+ smlal2 v25.4s, v4.8h, v30.8h // V2 hi += gv * g2
+
+ smlal v7.4s, v2.4h, v21.4h // U1 lo += bu * b1
+ smlal v17.4s, v5.4h, v21.4h // V1 lo += bv * b1
+ smlal v26.4s, v2.4h, v31.4h // U2 lo += bu * b2
+ smlal v28.4s, v5.4h, v31.4h // V2 lo += bv * b2
+
+ smlal2 v16.4s, v2.8h, v21.8h // U1 hi += bu * b1
+ smlal2 v18.4s, v5.8h, v21.8h // V1 hi += bv * b1
+ smlal2 v27.4s, v2.8h, v31.8h // U2 hi += bu * b2
+ smlal2 v25.4s, v5.8h, v31.8h // V2 hi += bv * b2
+
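+ // Narrow the 32-bit accumulators back to 16 bits (saturating >> 10)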
+ sqshrn v19.4h, v7.4s, #10 // U1 lo
+ sqshrn v20.4h, v17.4s, #10 // V1 lo
+ sqshrn v22.4h, v26.4s, #10 // U2 lo
+ sqshrn v23.4h, v28.4s, #10 // V2 lo
+
+ sqshrn2 v19.8h, v16.4s, #10 // U1 hi
+ sqshrn2 v20.8h, v18.4s, #10 // V1 hi
+ sqshrn2 v22.8h, v27.4s, #10 // U2 hi
+ sqshrn2 v23.8h, v25.4s, #10 // V2 hi
+
+ stp q19, q22, [x0], #32 // store dst_u (both batches)
+ stp q20, q23, [x1], #32 // store dst_v (both batches)
+
+ sub w5, w5, #16 // width -= 16
+ cmp w5, #16 // width >= 16 ?
b.ge 1b
- cbz w5, 3f // No pixels left? Exit
+ cbz w5, 3f // No pixels left? Exit
-2: // Scalar fallback for remaining pixels
+2: // Scalar fallback for remaining pixels
.if \alpha_first
rgb_load_add_half 1, 5, 2, 6, 3, 7
.else
@@ -239,24 +291,24 @@ function ff_\fmt_rgb\()ToUV_half_neon, export=1
rgb_load_add_half 0, 4, 1, 5, 2, 6
.endif
.endif
- smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset
- smaddl x16, w2, w13, x9 // dst_v = rv * r + const_offset (parallel)
+ smaddl x8, w2, w10, x9 // dst_u = ru * r + const_offset
+ smaddl x16, w2, w13, x9 // dst_v = rv * r + const_offset (parallel)
- smaddl x8, w4, w11, x8 // dst_u += gu * g
- smaddl x16, w4, w14, x16 // dst_v += gv * g (parallel)
+ smaddl x8, w4, w11, x8 // dst_u += gu * g
+ smaddl x16, w4, w14, x16 // dst_v += gv * g (parallel)
- smaddl x8, w7, w12, x8 // dst_u += bu * b
- smaddl x16, w7, w15, x16 // dst_v += bv * b (parallel)
+ smaddl x8, w7, w12, x8 // dst_u += bu * b
+ smaddl x16, w7, w15, x16 // dst_v += bv * b (parallel)
- asr w8, w8, #10 // dst_u >>= 10
- asr w16, w16, #10 // dst_v >>= 10
+ asr w8, w8, #10 // dst_u >>= 10
+ asr w16, w16, #10 // dst_v >>= 10
- strh w8, [x0], #2 // store dst_u
- strh w16, [x1], #2 // store dst_v
+ strh w8, [x0], #2 // store dst_u
+ strh w16, [x1], #2 // store dst_v
- sub w5, w5, #1 // width--
- add x3, x3, #(2*\element) // Advance source pointer
- cbnz w5, 2b // Process next pixel if any left
+ sub w5, w5, #1 // width--
+ add x3, x3, #(2*\element) // Advance source pointer
+ cbnz w5, 2b // Process next pixel if any left
3:
ret
endfunc
--
2.49.0