[FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c()
Harshitha Sarangu Suresh
harshitha at multicorewareinc.com
Mon Jun 2 07:36:14 EEST 2025
>From 7260822a578130a713c1455cca6cdd06f1540db8 Mon Sep 17 00:00:00 2001
From: Harshitha Suresh <harshitha at multicorewareinc.com>
Date: Mon, 19 May 2025 22:37:20 +0530
Subject: [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c()
yuv2nv12cX_2_512_accurate_c: 3508.8 ( 1.00x)
yuv2nv12cX_2_512_accurate_neon: 369.2 ( 9.50x)
yuv2nv12cX_2_512_approximate_c: 3499.0 ( 1.00x)
yuv2nv12cX_2_512_approximate_neon: 370.2 ( 9.45x)
yuv2nv12cX_4_512_accurate_c: 4683.0 ( 1.00x)
yuv2nv12cX_4_512_accurate_neon: 568.8 ( 8.23x)
yuv2nv12cX_4_512_approximate_c: 4682.6 ( 1.00x)
yuv2nv12cX_4_512_approximate_neon: 569.9 ( 8.22x)
yuv2nv12cX_8_512_accurate_c: 7243.0 ( 1.00x)
yuv2nv12cX_8_512_accurate_neon: 937.6 ( 7.72x)
yuv2nv12cX_8_512_approximate_c: 7235.9 ( 1.00x)
yuv2nv12cX_8_512_approximate_neon: 938.3 ( 7.71x)
yuv2nv12cX_16_512_accurate_c: 13749.7 ( 1.00x)
yuv2nv12cX_16_512_accurate_neon: 1708.1 ( 8.05x)
yuv2nv12cX_16_512_approximate_c: 13750.0 ( 1.00x)
yuv2nv12cX_16_512_approximate_neon: 1708.6 ( 8.05x)
---
libswscale/aarch64/output.S | 308 +++++++++++++++++++++++++++++++++++
libswscale/aarch64/swscale.c | 18 ++
2 files changed, 326 insertions(+)
diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index 190c438870..8eb89e8b54 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -226,3 +226,311 @@ function ff_yuv2plane1_8_neon, export=1
b.gt 2b // loop until width consumed
ret
endfunc
+
+// void ff_yuv2nv12cX_{notswapped,swapped}_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+// const int16_t *chrFilter, int chrFilterSize,
+// const int16_t **chrUSrc, const int16_t **chrVSrc,
+// uint8_t *dest, int chrDstW)
+
+function ff_yuv2nv12cX_notswapped_neon, export=1
+        // Vertical chroma filter producing interleaved 8-bit U,V byte pairs:
+        //   u = (chrDither[i & 7] << 12)       + sum over j of chrUSrc[j][i] * chrFilter[j]
+        //   v = (chrDither[(i + 3) & 7] << 12) + sum over j of chrVSrc[j][i] * chrFilter[j]
+        //   dest[2 * i] = clamp(u >> 19, 0, 255), dest[2 * i + 1] = clamp(v >> 19, 0, 255)
+        // w0: dstFormat, never read
+        // x1: chrDither, 8 dither bytes
+        // x2: chrFilter, int16_t coefficients
+        // w3: chrFilterSize, tap count (the tap loop always runs at least once)
+        // x4: chrUSrc, x5: chrVSrc, arrays of int16_t row pointers, one per tap
+        // x6: dest, receives 2 bytes per pixel
+        // w7: chrDstW, consumed in whole blocks of 8 or 16 pixels
+        // Scratch: x8-x14, v0-v6, v16-v31; the stack is not touched
+
+        // Build the accumulator seeds once; they are reused for every block
+        ld1             {v0.8b}, [x1]
+        ext             v1.8b, v0.8b, v0.8b, #3         // v1[i] = v0[(i + 3) & 7]
+
+        uxtl            v0.8h, v0.8b
+        uxtl            v1.8h, v1.8b
+
+        ushll           v2.4s, v0.4h, #12               // U seed, lanes 0-3
+        ushll2          v3.4s, v0.8h, #12               // U seed, lanes 4-7
+        ushll           v4.4s, v1.4h, #12               // V seed, lanes 0-3
+        ushll2          v5.4s, v1.8h, #12               // V seed, lanes 4-7
+
+        mov             x8, #0                          // x8 = i, first pixel of the current block
+        tst             w7, #15
+        b.ne            3f                              // chrDstW % 16 != 0: use the 8-pixel loop
+
+        // ---------------------------------------------
+        // 16 pixels per iteration
+        // ---------------------------------------------
+        // v16-v19 accumulate pixels i..i+7 and v20-v23 pixels i+8..i+15;
+        // both halves start from the same seeds (the dither index is i & 7).
+1:
+        mov             v16.16b, v2.16b                 // U, lanes 0-3
+        mov             v17.16b, v3.16b                 // U, lanes 4-7
+        mov             v18.16b, v4.16b                 // V, lanes 0-3
+        mov             v19.16b, v5.16b                 // V, lanes 4-7
+
+        mov             v20.16b, v2.16b
+        mov             v21.16b, v3.16b
+        mov             v22.16b, v4.16b
+        mov             v23.16b, v5.16b
+
+        mov             x10, x2                         // cursor into chrFilter
+        mov             x11, x4                         // cursor into chrUSrc
+        mov             x12, x5                         // cursor into chrVSrc
+        mov             w9, w3                          // taps remaining
+
+        // Tap loop: acc += src[j][i..] * chrFilter[j], signed 16x16 -> 32 bit
+2:
+        ldr             h6, [x10], #2                   // v6.h[0] = chrFilter[j]
+
+        ldr             x13, [x11], #8                  // chrUSrc[j]
+        ldr             x14, [x12], #8                  // chrVSrc[j]
+        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
+
+        ld1             {v24.8h, v25.8h}, [x13]         // U: i..i+7 -> v24, i+8..i+15 -> v25
+        ld1             {v26.8h, v27.8h}, [x14]         // V: i..i+7 -> v26, i+8..i+15 -> v27
+
+        smlal           v16.4s, v24.4h, v6.h[0]
+        smlal2          v17.4s, v24.8h, v6.h[0]
+        smlal           v18.4s, v26.4h, v6.h[0]
+        smlal2          v19.4s, v26.8h, v6.h[0]
+
+        smlal           v20.4s, v25.4h, v6.h[0]
+        smlal2          v21.4s, v25.8h, v6.h[0]
+        smlal           v22.4s, v27.4h, v6.h[0]
+        smlal2          v23.4s, v27.8h, v6.h[0]
+
+        subs            w9, w9, #1
+        b.gt            2b
+
+        // Two saturating narrowing shifts (>> 16, then >> 3) clamp acc >> 19 to 0..255
+        sqshrun         v28.4h, v16.4s, #16
+        sqshrun2        v28.8h, v17.4s, #16
+        sqshrun         v29.4h, v18.4s, #16
+        sqshrun2        v29.8h, v19.4s, #16
+        uqshrn          v30.8b, v28.8h, #3              // U, pixels i..i+7
+        uqshrn          v31.8b, v29.8h, #3              // V, pixels i..i+7
+
+        sqshrun         v28.4h, v20.4s, #16
+        sqshrun2        v28.8h, v21.4s, #16
+        sqshrun         v29.4h, v22.4s, #16
+        sqshrun2        v29.8h, v23.4s, #16
+        uqshrn          v24.8b, v28.8h, #3              // U, pixels i+8..i+15
+        uqshrn          v25.8b, v29.8h, #3              // V, pixels i+8..i+15
+
+        // st2 interleaves its two registers: U0 V0 U1 V1 ...
+        st2             {v30.8b, v31.8b}, [x6], #16
+        st2             {v24.8b, v25.8b}, [x6], #16
+
+        add             x8, x8, #16                     // i += 16 (flags untouched)
+        subs            w7, w7, #16
+        b.gt            1b
+        ret
+
+        // ---------------------------------------------
+        // 8 pixels per iteration
+        // ---------------------------------------------
+        // The width is consumed in whole blocks, so when chrDstW is not a
+        // multiple of 8 the last pass also reads and writes past chrDstW.
+3:
+        mov             v16.16b, v2.16b                 // U, lanes 0-3
+        mov             v17.16b, v3.16b                 // U, lanes 4-7
+        mov             v18.16b, v4.16b                 // V, lanes 0-3
+        mov             v19.16b, v5.16b                 // V, lanes 4-7
+
+        mov             x10, x2                         // cursor into chrFilter
+        mov             x11, x4                         // cursor into chrUSrc
+        mov             x12, x5                         // cursor into chrVSrc
+        mov             w9, w3                          // taps remaining
+
+        // Tap loop: acc += src[j][i..i+7] * chrFilter[j], signed 16x16 -> 32 bit
+4:
+        ldr             h6, [x10], #2                   // v6.h[0] = chrFilter[j]
+
+        ldr             x13, [x11], #8                  // chrUSrc[j]
+        ldr             x14, [x12], #8                  // chrVSrc[j]
+        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
+
+        ld1             {v20.8h}, [x13]                 // U, pixels i..i+7
+        ld1             {v21.8h}, [x14]                 // V, pixels i..i+7
+
+        smlal           v16.4s, v20.4h, v6.h[0]
+        smlal2          v17.4s, v20.8h, v6.h[0]
+        smlal           v18.4s, v21.4h, v6.h[0]
+        smlal2          v19.4s, v21.8h, v6.h[0]
+
+        subs            w9, w9, #1
+        b.gt            4b
+
+        // Two saturating narrowing shifts (>> 16, then >> 3) clamp acc >> 19 to 0..255
+        sqshrun         v26.4h, v16.4s, #16
+        sqshrun2        v26.8h, v17.4s, #16
+        sqshrun         v27.4h, v18.4s, #16
+        sqshrun2        v27.8h, v19.4s, #16
+        uqshrn          v28.8b, v26.8h, #3              // U
+        uqshrn          v29.8b, v27.8h, #3              // V
+
+        // st2 interleaves its two registers: U0 V0 U1 V1 ...
+        st2             {v28.8b, v29.8b}, [x6], #16
+
+        add             x8, x8, #8                      // i += 8 (flags untouched)
+        subs            w7, w7, #8
+        b.gt            3b
+        ret
+endfunc
+
+function ff_yuv2nv12cX_swapped_neon, export=1
+        // Vertical chroma filter for swapped-chroma output: dest[2*i] = V, dest[2*i+1] = U
+        // w0 - dstFormat (unused)
+        // x1 - uint8_t *chrDither (8 bytes are read)
+        // x2 - int16_t *chrFilter
+        // w3 - int chrFilterSize (the tap loop always runs at least once)
+        // x4, x5 - int16_t **chrUSrc, **chrVSrc (one row pointer per tap)
+        // x6 - uint8_t *dest (2 bytes are written per pixel)
+        // w7 - int chrDstW (consumed in whole blocks of 8 or 16 pixels)
+
+        ld1             {v0.8b}, [x1]                   // chrDither[0..7]
+        ext             v1.8b, v0.8b, v0.8b, #3         // Rotate for V: chrDither[(i + 3) & 7]
+
+        uxtl            v0.8h, v0.8b
+        uxtl            v1.8h, v1.8b
+
+        ushll           v2.4s, v0.4h, #12               // U dither << 12, lanes 0-3
+        ushll2          v3.4s, v0.8h, #12               // U dither << 12, lanes 4-7
+        ushll           v4.4s, v1.4h, #12               // V dither << 12, lanes 0-3
+        ushll2          v5.4s, v1.8h, #12               // V dither << 12, lanes 4-7
+
+        // Widths that are a multiple of 16 take the unrolled loop, all others the 8-pixel loop
+        tst             w7, #15                         // chrDstW % 16 == 0 ?
+        b.ne            .Lswapped_process_8_pixels
+
+        // =============================================
+        // 16-pixel processing path
+        // =============================================
+        mov             x8, #0                          // i = 0
+.Lswapped_loop_16_pixels:
+        // Both 8-pixel halves start from the same dither seeds (index is i & 7)
+        mov             v16.16b, v2.16b                 // U acc, pixels 0-3
+        mov             v17.16b, v3.16b                 // U acc, pixels 4-7
+        mov             v18.16b, v4.16b                 // V acc, pixels 0-3
+        mov             v19.16b, v5.16b                 // V acc, pixels 4-7
+
+        mov             v20.16b, v2.16b                 // U acc, pixels 8-11
+        mov             v21.16b, v3.16b                 // U acc, pixels 12-15
+        mov             v22.16b, v4.16b                 // V acc, pixels 8-11
+        mov             v23.16b, v5.16b                 // V acc, pixels 12-15
+
+        mov             w9, w3                          // chrFilterSize counter
+        mov             x10, x2                         // chrFilter pointer
+        mov             x11, x4                         // chrUSrc base
+        mov             x12, x5                         // chrVSrc base
+
+.Lswapped_filter_loop_16:
+        ldr             h6, [x10], #2                   // v6.h[0] = chrFilter[j]
+
+        // Row pointers for tap j, offset to pixel i and to pixel i + 8
+        ldr             x13, [x11], #8                  // chrUSrc[j]
+        ldr             x14, [x12], #8                  // chrVSrc[j]
+        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
+        add             x15, x13, #16                   // &chrUSrc[j][i+8] (8 samples * 2 bytes)
+        add             x16, x14, #16                   // &chrVSrc[j][i+8]
+
+        ld1             {v24.8h}, [x13]                 // U samples 0-7
+        ld1             {v25.8h}, [x14]                 // V samples 0-7
+
+        ld1             {v26.8h}, [x15]                 // U samples 8-15
+        ld1             {v27.8h}, [x16]                 // V samples 8-15
+
+        smlal           v16.4s, v24.4h, v6.h[0]
+        smlal2          v17.4s, v24.8h, v6.h[0]
+        smlal           v18.4s, v25.4h, v6.h[0]
+        smlal2          v19.4s, v25.8h, v6.h[0]
+
+        smlal           v20.4s, v26.4h, v6.h[0]
+        smlal2          v21.4s, v26.8h, v6.h[0]
+        smlal           v22.4s, v27.4h, v6.h[0]
+        smlal2          v23.4s, v27.8h, v6.h[0]
+
+        subs            w9, w9, #1
+        b.gt            .Lswapped_filter_loop_16
+        // Saturating >> 16 then >> 3 clamps acc >> 19 to 0..255; V lands in the first st2 register
+        sqshrun         v28.4h, v16.4s, #16
+        sqshrun2        v28.8h, v17.4s, #16
+        sqshrun         v29.4h, v18.4s, #16
+        sqshrun2        v29.8h, v19.4s, #16
+        uqshrn          v31.8b, v28.8h, #3              // U -> second st2 register (odd bytes)
+        uqshrn          v30.8b, v29.8h, #3              // V -> first st2 register (even bytes)
+
+        sqshrun         v28.4h, v20.4s, #16
+        sqshrun2        v28.8h, v21.4s, #16
+        sqshrun         v29.4h, v22.4s, #16
+        sqshrun2        v29.8h, v23.4s, #16
+        uqshrn          v25.8b, v28.8h, #3              // U -> second st2 register (odd bytes)
+        uqshrn          v24.8b, v29.8h, #3              // V -> first st2 register (even bytes)
+
+        // Store both 8-pixel blocks as V0 U0 V1 U1 ... (st2 needs consecutive registers)
+        st2             {v30.8b, v31.8b}, [x6], #16
+        st2             {v24.8b, v25.8b}, [x6], #16
+
+        subs            w7, w7, #16
+        add             x8, x8, #16                     // i += 16 (does not touch the flags)
+        b.gt            .Lswapped_loop_16_pixels
+        ret
+
+        // =============================================
+        // 8-pixel processing path
+        // =============================================
+.Lswapped_process_8_pixels:
+        mov             x8, #0                          // i = 0
+.Lswapped_loop_8_pixels:
+        // Initialize accumulators with dither
+        mov             v16.16b, v2.16b                 // U acc low
+        mov             v17.16b, v3.16b                 // U acc high
+        mov             v18.16b, v4.16b                 // V acc low
+        mov             v19.16b, v5.16b                 // V acc high
+
+        mov             w9, w3                          // chrFilterSize counter
+        mov             x10, x2                         // chrFilter pointer
+        mov             x11, x4                         // chrUSrc base
+        mov             x12, x5                         // chrVSrc base
+
+.Lswapped_filter_loop_8:
+        ldr             h6, [x10], #2                   // v6.h[0] = chrFilter[j]
+
+        ldr             x13, [x11], #8                  // chrUSrc[j]
+        ldr             x14, [x12], #8                  // chrVSrc[j]
+        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
+
+        ld1             {v20.8h}, [x13]                 // U samples
+        ld1             {v21.8h}, [x14]                 // V samples
+
+        smlal           v16.4s, v20.4h, v6.h[0]
+        smlal2          v17.4s, v20.8h, v6.h[0]
+        smlal           v18.4s, v21.4h, v6.h[0]
+        smlal2          v19.4s, v21.8h, v6.h[0]
+
+        subs            w9, w9, #1
+        b.gt            .Lswapped_filter_loop_8
+        // Saturating >> 16 then >> 3 clamps acc >> 19 to 0..255; V lands in the first st2 register
+        sqshrun         v26.4h, v16.4s, #16
+        sqshrun2        v26.8h, v17.4s, #16
+        sqshrun         v27.4h, v18.4s, #16
+        sqshrun2        v27.8h, v19.4s, #16
+        uqshrn          v29.8b, v26.8h, #3              // U -> second st2 register (odd bytes)
+        uqshrn          v28.8b, v27.8h, #3              // V -> first st2 register (even bytes)
+        // Interleaved store: V0 U0 V1 U1 ...
+        st2             {v28.8b, v29.8b}, [x6], #16
+
+        subs            w7, w7, #8
+        add             x8, x8, #8                      // i += 8 (does not touch the flags)
+        b.gt            .Lswapped_loop_8_pixels
+        ret
+endfunc
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 6e5a721c1f..0e57112f42 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -168,6 +168,16 @@ void ff_yuv2plane1_8_neon(
const uint8_t *dither,
int offset);
+/* NEON vertical chroma scalers writing interleaved 8-bit chroma; both take the
+ * same arguments. ASSIGN_YUV2NV12_FUNC installs the notswapped variant when
+ * !isSwappedChroma(dstFormat) and the swapped variant otherwise. */
+void ff_yuv2nv12cX_notswapped_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+ const int16_t *chrFilter, int chrFilterSize,
+ const int16_t **chrUSrc, const int16_t **chrVSrc,
+ uint8_t *dest, int chrDstW);
+
+void ff_yuv2nv12cX_swapped_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+ const int16_t *chrFilter, int chrFilterSize,
+ const int16_t **chrUSrc, const int16_t **chrVSrc,
+ uint8_t *dest, int chrDstW);
+
#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt) do { \
if (c->srcBpc == 8) { \
if(c->dstBpc <= 14) { \
@@ -201,6 +211,12 @@ void ff_yuv2plane1_8_neon(
default: break; \
}
+#define ASSIGN_YUV2NV12_FUNC(yuv2nv12fn, opt, dstFormat) \
+ if(!isSwappedChroma(dstFormat)) \
+ yuv2nv12fn = ff_yuv2nv12cX_notswapped_ ## opt; \
+ else \
+ yuv2nv12fn = ff_yuv2nv12cX_swapped_ ## opt;
+
#define NEON_INPUT(name) \
void ff_##name##ToY_neon(uint8_t *dst, const uint8_t *src, const uint8_t *, \
const uint8_t *, int w, uint32_t *coeffs, void *); \
@@ -275,7 +291,9 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon);
if (c->dstBpc == 8) {
c->yuv2planeX = ff_yuv2planeX_8_neon;
+ ASSIGN_YUV2NV12_FUNC(c->yuv2nv12cX, neon, c->opts.dst_format);
}
+
switch (c->opts.src_format) {
case AV_PIX_FMT_ABGR:
c->lumToYV12 = ff_abgr32ToY_neon;
--
2.36.0.windows.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: swscale-aarch64-output-Implement-neon-assembly-fo.patch
Type: application/octet-stream
Size: 14784 bytes
Desc: swscale-aarch64-output-Implement-neon-assembly-fo.patch
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20250602/b4105351/attachment.obj>
More information about the ffmpeg-devel
mailing list