[FFmpeg-devel] [PATCH] swscale/output: Implement neon intrinsics for yuv2planeX_10_c_template()
Harshitha Sarangu Suresh
harshitha at multicorewareinc.com
Mon May 26 11:40:17 EEST 2025
Hi,
Did you get a chance to review this patch?
Get Outlook for Android<https://aka.ms/AAb9ysg>
________________________________
From: Harshitha Sarangu Suresh
Sent: Thursday, May 22, 2025 7:27:31 PM
To: ffmpeg-devel at ffmpeg.org <ffmpeg-devel at ffmpeg.org>
Cc: Dash Santosh Sathyanarayanan <dash.sathyanarayanan at multicorewareinc.com>
Subject: [FFmpeg-devel] [PATCH] swscale/output: Implement neon intrinsics for yuv2planeX_10_c_template()
This optimization provides 5x improvement for the module. The boost in performance was calculated by adding C timers inside the C function and the optimized neon intrinsic function.
>From 904144c2db9e5e72d56360c4c2eb38d426852901 Mon Sep 17 00:00:00 2001
From: Harshitha Suresh <harshitha at multicorewareinc.com>
Date: Thu, 22 May 2025 10:23:55 +0530
Subject: [PATCH] swscale/output: Implement neon intrinsics for
yuv2planeX_10_c_template()
---
libswscale/output.c | 76 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 75 insertions(+), 1 deletion(-)
diff --git a/libswscale/output.c b/libswscale/output.c
index c37649e7ce..345df5ce59 100644
--- a/libswscale/output.c
+++ b/libswscale/output.c
@@ -22,7 +22,9 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
-
+#if defined (__aarch64__)
+#include <arm_neon.h>
+#endif
#include "libavutil/attributes.h"
#include "libavutil/avutil.h"
#include "libavutil/avassert.h"
@@ -337,6 +339,77 @@ yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW,
}
}
+
+#if defined (__aarch64__) && !defined(__APPLE__)
+static av_always_inline void
+yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
+ const int16_t **src, uint16_t *dest, int dstW,
+ int big_endian, int output_bits)
+{
+ const int shift = 11 + 16 - output_bits;
+ const int bias = 1 << (shift - 1);
+ const int clip_max = (1 << output_bits) - 1;
+ int i;
+
+ for (i = 0; i < dstW; i += 16) {
+ int32x4_t sum0_lo = vdupq_n_s32(bias);
+ int32x4_t sum0_hi = vdupq_n_s32(bias);
+ int32x4_t sum1_lo = vdupq_n_s32(bias);
+ int32x4_t sum1_hi = vdupq_n_s32(bias);
+
+ for (int j = 0; j < filterSize; j++) {
+ int16x8_t src_vec0 = vld1q_s16(&src[j][i]);
+ int16x8_t src_vec1 = vld1q_s16(&src[j][i + 8]);
+ int16x8_t filter_val = vdupq_n_s16(filter[j]);
+
+ sum0_lo = vmlal_s16(sum0_lo, vget_low_s16(src_vec0), vget_low_s16(filter_val));
+ sum0_hi = vmlal_s16(sum0_hi, vget_high_s16(src_vec0), vget_high_s16(filter_val));
+ sum1_lo = vmlal_s16(sum1_lo, vget_low_s16(src_vec1), vget_low_s16(filter_val));
+ sum1_hi = vmlal_s16(sum1_hi, vget_high_s16(src_vec1), vget_high_s16(filter_val));
+ }
+
+ // Right shift with rounding
+ int32x4_t shift_vec = vdupq_n_s32(-shift);
+ sum0_lo = vshlq_s32(sum0_lo, shift_vec);
+ sum0_hi = vshlq_s32(sum0_hi, shift_vec);
+ sum1_lo = vshlq_s32(sum1_lo, shift_vec);
+ sum1_hi = vshlq_s32(sum1_hi, shift_vec);
+
+ // Clip to output_bits range
+ sum0_lo = vmaxq_s32(vminq_s32(sum0_lo, vdupq_n_s32(clip_max)), vdupq_n_s32(0));
+ sum0_hi = vmaxq_s32(vminq_s32(sum0_hi, vdupq_n_s32(clip_max)), vdupq_n_s32(0));
+ sum1_lo = vmaxq_s32(vminq_s32(sum1_lo, vdupq_n_s32(clip_max)), vdupq_n_s32(0));
+ sum1_hi = vmaxq_s32(vminq_s32(sum1_hi, vdupq_n_s32(clip_max)), vdupq_n_s32(0));
+
+ // Convert to 16-bit
+ uint16x8_t result0 = vcombine_u16(
+ vreinterpret_u16_s16(vmovn_s32(sum0_lo)),
+ vreinterpret_u16_s16(vmovn_s32(sum0_hi))
+ );
+ uint16x8_t result1 = vcombine_u16(
+ vreinterpret_u16_s16(vmovn_s32(sum1_lo)),
+ vreinterpret_u16_s16(vmovn_s32(sum1_hi))
+ );
+
+ // Store with proper endianness
+ if (big_endian) {
+ result0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(result0)));
+ result1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(result1)));
+ }
+ vst1q_u16(&dest[i], result0);
+ vst1q_u16(&dest[i + 8], result1);
+ }
+
+ // Handle remaining pixels
+ for (; i < dstW; i++) {
+ int val = bias;
+ for (int j = 0; j < filterSize; j++) {
+ val += src[j][i] * filter[j];
+ }
+ output_pixel(&dest[i], val);
+ }
+}
+#else
static av_always_inline void
yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
const int16_t **src, uint16_t *dest, int dstW,
@@ -355,6 +428,7 @@ yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
output_pixel(&dest[i], val);
}
}
+#endif
#undef output_pixel
--
2.36.0.windows.1
More information about the ffmpeg-devel
mailing list