[FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2planeX_10_c_template()

Logaprakash Ramajayam logaprakash.ramajayam at multicorewareinc.com
Fri Jun 6 11:44:59 EEST 2025


Verified with the FATE test suite and the gha-aarch64 GitHub Actions workflow.

>From 34cdef26eaebcf98916e9881b3a04f4f698f09c6 Mon Sep 17 00:00:00 2001
From: Logaprakash Ramajayam <logaprakash.ramajayam at multicorewareinc.com>
Date: Thu, 5 Jun 2025 01:33:39 -0700
Subject: [PATCH] swscale/aarch64/output: Implement neon assembly for
 yuv2planeX_10_c_template()
---
 libswscale/aarch64/output.S  | 167 +++++++++++++++++++++++++++++++++++
 libswscale/aarch64/swscale.c |  38 ++++++++
 2 files changed, 205 insertions(+)

diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index 190c438870..e039e820ae 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -20,6 +20,173 @@

 #include "libavutil/aarch64/asm.S"

+function ff_yuv2planeX_10_neon, export=1
+// Vertical filter for 9- to 14-bit planar output. For every pixel i:
+//   dest[i] = clip((bias + sum_j src[j][i] * filter[j]) >> shift, 0, max)
+// where shift = 27 - output_bits, bias = 1 << (shift - 1) and
+// max = (1 << output_bits) - 1. Each pixel is byte-swapped if big_endian != 0.
+//
+// x0 = filter (int16_t*)
+// w1 = filterSize
+// x2 = src (int16_t**)
+// x3 = dest (uint16_t*)
+// w4 = dstW
+// w5 = big_endian
+// w6 = output_bits
+//
+// Pixels are produced 16 at a time when dstW is a multiple of 16 and 8 at a
+// time otherwise, so up to 7 pixels past dstW are read and written when dstW
+// is not a multiple of 8.
+// Only caller-saved registers are used: AAPCS64 requires the low 64 bits of
+// v8-v15 to be preserved, so the constants live in v20-v22 and the narrowed
+// output in v23/v24.
+
+        mov             w8, #27
+        sub             w8, w8, w6                      // shift = 11 + 16 - output_bits
+        mov             w10, #1
+        sub             w9, w8, #1
+        lsl             w9, w10, w9                     // bias = 1 << (shift - 1)
+        dup             v1.4s, w9                       // v1 = bias in every lane
+        neg             w9, w8
+        dup             v20.4s, w9                      // v20 = -shift: sshl by it shifts right
+        movi            v21.4s, #0                      // v21 = lower clip bound
+        lsl             w10, w10, w6
+        sub             w10, w10, #1                    // (1U << output_bits) - 1
+        dup             v22.4s, w10                     // v22 = upper clip bound
+
+        mov             x7, #0                          // i = 0
+        tst             w4, #15                         // if dstW divisible by 16, process 16 elements
+        b.ne            5f                              // else process 8 elements
+
+1:  // 16-pixel loop
+        mov             v3.16b, v1.16b                  // start every accumulator at the bias
+        mov             v4.16b, v1.16b
+        mov             v5.16b, v1.16b
+        mov             v6.16b, v1.16b
+        mov             w11, w1                         // taps left = filterSize
+        mov             x12, x2                         // srcp = src
+        mov             x13, x0                         // filterp = filter
+        tbz             w11, #0, 2f                     // even filterSize: pairs only
+
+        // Odd filterSize: consume one tap up front so the pair loop below
+        // never loads src[filterSize] or filter[filterSize].
+        ldr             x14, [x12], #8                  // src[0]
+        ldr             h7, [x13], #2                   // filter[0]
+        add             x14, x14, x7, lsl #1            // &src[0][i]
+        ld1             {v16.8h, v17.8h}, [x14]
+        smlal           v3.4s,  v16.4h, v7.h[0]
+        smlal2          v4.4s,  v16.8h, v7.h[0]
+        smlal           v5.4s,  v17.4h, v7.h[0]
+        smlal2          v6.4s,  v17.8h, v7.h[0]
+        subs            w11, w11, #1
+        b.eq            3f                              // filterSize == 1: no pairs left
+
+2:  // Filter loop, two taps per iteration
+        ldp             x14, x15, [x12], #16            // get 2 pointers: src[j] and src[j+1]
+        ldr             s7, [x13], #4                   // load filter[j] and filter[j+1]
+        add             x14, x14, x7, lsl #1            // &src[j][i]
+        add             x15, x15, x7, lsl #1            // &src[j+1][i]
+        ld1             {v16.8h, v17.8h}, [x14]
+        ld1             {v18.8h, v19.8h}, [x15]
+        // Multiply-accumulate
+        smlal           v3.4s,  v16.4h, v7.h[0]
+        smlal2          v4.4s,  v16.8h, v7.h[0]
+        smlal           v5.4s,  v17.4h, v7.h[0]
+        smlal2          v6.4s,  v17.8h, v7.h[0]
+        smlal           v3.4s,  v18.4h, v7.h[1]
+        smlal2          v4.4s,  v18.8h, v7.h[1]
+        smlal           v5.4s,  v19.4h, v7.h[1]
+        smlal2          v6.4s,  v19.8h, v7.h[1]
+        subs            w11, w11, #2                    // taps left -= 2
+        b.gt            2b                              // continue filter loop
+
+3:  // Shift results
+        sshl            v3.4s,  v3.4s, v20.4s
+        sshl            v4.4s,  v4.4s, v20.4s
+        sshl            v5.4s,  v5.4s, v20.4s
+        sshl            v6.4s,  v6.4s, v20.4s
+
+        // Clamp to 0
+        smax            v3.4s,  v3.4s, v21.4s
+        smax            v4.4s,  v4.4s, v21.4s
+        smax            v5.4s,  v5.4s, v21.4s
+        smax            v6.4s,  v6.4s, v21.4s
+
+        // Clip upper bound
+        smin            v3.4s,  v3.4s, v22.4s
+        smin            v4.4s,  v4.4s, v22.4s
+        smin            v5.4s,  v5.4s, v22.4s
+        smin            v6.4s,  v6.4s, v22.4s
+
+        // Narrow to 16-bit
+        xtn             v23.4h, v3.4s
+        xtn2            v23.8h, v4.4s
+        xtn             v24.4h, v5.4s
+        xtn2            v24.8h, v6.4s
+        cbz             w5, 4f                          // Check if big endian
+        rev16           v23.16b, v23.16b
+        rev16           v24.16b, v24.16b                // Swap bytes for big endian
+4:
+        // Store 16 pixels
+        st1             {v23.8h, v24.8h}, [x3], #32
+        add             x7, x7, #16                     // i = i + 16
+        subs            w4, w4, #16                     // dstW = dstW - 16
+        b.gt            1b                              // Continue loop
+        ret
+
+5:  // 8-pixel loop
+        mov             v3.16b, v1.16b
+        mov             v4.16b, v1.16b
+        mov             w11, w1
+        mov             x12, x2
+        mov             x13, x0
+        tbz             w11, #0, 6f                     // even filterSize: pairs only
+
+        // Odd filterSize: single leading tap, as in the 16-pixel loop
+        ldr             x14, [x12], #8
+        ldr             h7, [x13], #2
+        add             x14, x14, x7, lsl #1
+        ld1             {v16.8h}, [x14]
+        smlal           v3.4s, v16.4h, v7.h[0]
+        smlal2          v4.4s, v16.8h, v7.h[0]
+        subs            w11, w11, #1
+        b.eq            7f
+
+6:  // Filter loop
+        ldp             x14, x15, [x12], #16
+        ldr             s7, [x13], #4
+        add             x14, x14, x7, lsl #1
+        add             x15, x15, x7, lsl #1
+        ld1             {v16.8h}, [x14]
+        ld1             {v17.8h}, [x15]
+        // Multiply-accumulate
+        smlal           v3.4s, v16.4h, v7.h[0]
+        smlal2          v4.4s, v16.8h, v7.h[0]
+        smlal           v3.4s, v17.4h, v7.h[1]
+        smlal2          v4.4s, v17.8h, v7.h[1]
+        subs            w11, w11, #2                    // taps left -= 2
+        b.gt            6b                              // loop until filterSize consumed
+
+7:  // Shift, clip to [0, (1 << output_bits) - 1] and narrow to 16-bit
+        sshl            v3.4s, v3.4s, v20.4s
+        sshl            v4.4s, v4.4s, v20.4s
+        smax            v3.4s, v3.4s, v21.4s
+        smax            v4.4s, v4.4s, v21.4s
+        smin            v3.4s, v3.4s, v22.4s
+        smin            v4.4s, v4.4s, v22.4s
+        xtn             v23.4h, v3.4s
+        xtn2            v23.8h, v4.4s
+        cbz             w5, 8f                          // Check if big endian
+        rev16           v23.16b, v23.16b                // Swap bytes for big endian
+8:
+        // Store 8 pixels
+        st1             {v23.8h}, [x3], #16
+        add             x7, x7, #8                      // i = i + 8
+        subs            w4, w4, #8                      // dstW = dstW - 8
+        b.gt            5b                              // Continue Loop
+        ret
+endfunc
+
 function ff_yuv2planeX_8_neon, export=1
 // x0 - const int16_t *filter,
 // x1 - int filterSize,
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 6e5a721c1f..23cdb7d26e 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -158,6 +158,29 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \

 ALL_SCALE_FUNCS(neon);

+void ff_yuv2planeX_10_neon(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint16_t *dest, int dstW,
+                           int big_endian, int output_bits);
+/* Define yuv2planeX_<depth><endian>_neon(): forwards to ff_yuv2planeX_<kernel>_neon()
+ * with be_flag and depth as the last two arguments; dither and offset are not used. */
+#define yuv2NBPS(depth, endian, be_flag, kernel, coeff_t) \
+static void yuv2planeX_ ## depth ## endian ## _neon(const int16_t *filter, int filterSize, \
+                                                    const int16_t **src, uint8_t *dest, int dstW, \
+                                                    const uint8_t *dither, int offset) \
+{ \
+    ff_yuv2planeX_ ## kernel ## _neon(filter, filterSize, (const coeff_t **) src, \
+                                      (uint16_t *) dest, dstW, be_flag, depth); \
+}
+
+yuv2NBPS(9,  BE, 1, 10, int16_t)
+yuv2NBPS(9,  LE, 0, 10, int16_t)
+yuv2NBPS(10, BE, 1, 10, int16_t)
+yuv2NBPS(10, LE, 0, 10, int16_t)
+yuv2NBPS(12, BE, 1, 10, int16_t)
+yuv2NBPS(12, LE, 0, 10, int16_t)
+yuv2NBPS(14, BE, 1, 10, int16_t)
+yuv2NBPS(14, LE, 0, 10, int16_t)
+
 void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
                           const int16_t **src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset);
@@ -268,6 +291,8 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
 av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
 {
     int cpu_flags = av_get_cpu_flags();
+    enum AVPixelFormat dstFormat = c->opts.dst_format;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);

     if (have_neon(cpu_flags)) {
         ASSIGN_SCALE_FUNC(c->hyScale, c->hLumFilterSize, neon);
@@ -276,6 +301,19 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
         }
+
+        if (isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat)) {
+            if (desc->comp[0].depth == 9) {
+                c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_neon  : yuv2planeX_9LE_neon;
+            } else if (desc->comp[0].depth == 10) {
+                c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_neon  : yuv2planeX_10LE_neon;
+            } else if (desc->comp[0].depth == 12) {
+                c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_neon  : yuv2planeX_12LE_neon;
+            } else if (desc->comp[0].depth == 14) {
+                c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_neon  : yuv2planeX_14LE_neon;
+            } else
+            av_assert0(0);
+        }
         switch (c->opts.src_format) {
         case AV_PIX_FMT_ABGR:
             c->lumToYV12 = ff_abgr32ToY_neon;
--
2.36.0.windows.1


-------------- next part --------------
A non-text attachment was scrubbed...
Name: Aarch64-Implement-neon-assembly-yuv2planeX_10_c_template.patch
Type: application/octet-stream
Size: 9819 bytes
Desc: Aarch64-Implement-neon-assembly-yuv2planeX_10_c_template.patch
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20250606/5526ceca/attachment.obj>


More information about the ffmpeg-devel mailing list