[FFmpeg-devel] [PATCH] swscale/output: Implement neon intrinsics for yuv2nv12cX_c()
Harshitha Sarangu Suresh
harshitha at multicorewareinc.com
Thu May 22 16:54:15 EEST 2025
This optimization provides a 6x performance improvement for the module. The speedup was measured by adding timers inside both the C function and the optimized NEON intrinsics function.
From 1deceb0394a5acdf70677870dc252fd66a91dd9f Mon Sep 17 00:00:00 2001
From: Harshitha Suresh <harshitha at multicorewareinc.com>
Date: Mon, 19 May 2025 22:37:20 +0530
Subject: [PATCH] swscale/output: Implement neon intrinsics for yuv2nv12cX_c()
---
libswscale/aarch64/swscale.c | 151 +++++++++++++++++++++++++++++++++++
1 file changed, 151 insertions(+)
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 6e5a721c1f..fb59c3f1b0 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -21,6 +21,9 @@
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/aarch64/cpu.h"
+#if defined (__aarch64__)
+#include <arm_neon.h>
+#endif
void ff_hscale16to15_4_neon_asm(int shift, int16_t *_dst, int dstW,
const uint8_t *_src, const int16_t *filter,
@@ -142,6 +145,153 @@ static void ff_hscale16to19_X4_neon(SwsInternal *c, int16_t *_dst, int dstW,
}
+/**
+ * Vertically filter the U and V source lines and write the result as one
+ * interleaved 8-bit chroma plane (NEON intrinsics version of yuv2nv12cX_c()).
+ *
+ * Every output byte is
+ *     clip_uint8(((dither << 12) + sum_j src[j][i] * chrFilter[j]) >> 19)
+ * where U uses chrDither[i & 7] and V uses chrDither[(i + 3) & 7].
+ *
+ * @param dstFormat     destination pixel format; isSwappedChroma() selects
+ *                      between U,V and V,U byte order in dest
+ * @param chrDither     dither values, indexed modulo 8
+ * @param chrFilter     chrFilterSize vertical filter coefficients
+ * @param chrFilterSize number of filter taps, i.e. of source lines per plane
+ * @param chrUSrc       chrFilterSize pointers to U source lines
+ * @param chrVSrc       chrFilterSize pointers to V source lines
+ * @param dest          output buffer; 2 * chrDstW bytes are written
+ * @param chrDstW       number of chroma sample pairs to produce
+ */
+static void ff_yuv2nv12cX_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+                               const int16_t *chrFilter, int chrFilterSize,
+                               const int16_t **chrUSrc, const int16_t **chrVSrc,
+                               uint8_t *dest, int chrDstW)
+{
+    /* The two byte orders differ only in which plane lands in the first
+     * byte of each pair, so select the planes and their dither phases once
+     * and share a single code path for both layouts. */
+    const int swapped     = isSwappedChroma(dstFormat);
+    const int16_t **src0  = swapped ? chrVSrc : chrUSrc; /* -> dest[2 * i]     */
+    const int16_t **src1  = swapped ? chrUSrc : chrVSrc; /* -> dest[2 * i + 1] */
+    const int phase0      = swapped ? 3 : 0;             /* V dither is offset by 3 */
+    const int phase1      = swapped ? 0 : 3;
+    int32_t dither0[8], dither1[8];
+    int32x4_t bias0_lo, bias0_hi, bias1_lo, bias1_hi;
+    int i, j;
+
+    /* The vector loop advances in steps of 8 and the dither index is taken
+     * modulo 8, so the same eight start values apply to every vector. */
+    for (i = 0; i < 8; i++) {
+        dither0[i] = chrDither[(i + phase0) & 7] << 12;
+        dither1[i] = chrDither[(i + phase1) & 7] << 12;
+    }
+    bias0_lo = vld1q_s32(&dither0[0]);
+    bias0_hi = vld1q_s32(&dither0[4]);
+    bias1_lo = vld1q_s32(&dither1[0]);
+    bias1_hi = vld1q_s32(&dither1[4]);
+
+    /* Main loop: 8 chroma pairs (16 output bytes) per iteration. */
+    for (i = 0; i <= chrDstW - 8; i += 8) {
+        int32x4_t acc0_lo = bias0_lo, acc0_hi = bias0_hi;
+        int32x4_t acc1_lo = bias1_lo, acc1_hi = bias1_hi;
+        uint8x8x2_t out;
+
+        for (j = 0; j < chrFilterSize; j++) {
+            const int16_t   coeff = chrFilter[j];
+            const int16x8_t s0    = vld1q_s16(&src0[j][i]);
+            const int16x8_t s1    = vld1q_s16(&src1[j][i]);
+
+            /* Widening multiply-accumulate: int16 * int16 -> int32. */
+            acc0_lo = vmlal_n_s16(acc0_lo, vget_low_s16(s0),  coeff);
+            acc0_hi = vmlal_n_s16(acc0_hi, vget_high_s16(s0), coeff);
+            acc1_lo = vmlal_n_s16(acc1_lo, vget_low_s16(s1),  coeff);
+            acc1_hi = vmlal_n_s16(acc1_hi, vget_high_s16(s1), coeff);
+        }
+
+        /* Shift right by 19, then saturate int32 -> int16 -> uint8. */
+        out.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(vshrq_n_s32(acc0_lo, 19)),
+                                              vqmovn_s32(vshrq_n_s32(acc0_hi, 19))));
+        out.val[1] = vqmovun_s16(vcombine_s16(vqmovn_s32(vshrq_n_s32(acc1_lo, 19)),
+                                              vqmovn_s32(vshrq_n_s32(acc1_hi, 19))));
+
+        /* vst2 interleaves val[0] and val[1] element by element. */
+        vst2_u8(dest + 2 * i, out);
+    }
+
+    /* Scalar tail for the pairs that do not fill a whole vector. */
+    for (; i < chrDstW; i++) {
+        int first  = chrDither[(i + phase0) & 7] << 12;
+        int second = chrDither[(i + phase1) & 7] << 12;
+
+        for (j = 0; j < chrFilterSize; j++) {
+            first  += src0[j][i] * chrFilter[j];
+            second += src1[j][i] * chrFilter[j];
+        }
+
+        dest[2 * i]     = av_clip_uint8(first  >> 19);
+        dest[2 * i + 1] = av_clip_uint8(second >> 19);
+    }
+}
+
#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
SwsInternal *c, int16_t *data, \
@@ -275,6 +425,7 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon);
if (c->dstBpc == 8) {
c->yuv2planeX = ff_yuv2planeX_8_neon;
+ c->yuv2nv12cX = ff_yuv2nv12cX_neon;
}
switch (c->opts.src_format) {
case AV_PIX_FMT_ABGR:
--
2.36.0.windows.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-swscale-output-Implement-neon-intrinsics-for-yuv2nv1.patch
Type: application/octet-stream
Size: 7037 bytes
Desc: 0001-swscale-output-Implement-neon-intrinsics-for-yuv2nv1.patch
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20250522/936f514a/attachment.obj>
More information about the ffmpeg-devel
mailing list