[FFmpeg-cvslog] swscale: [LA] Optimize yuv2plane1_8_c.
Shiyou Yin
git at videolan.org
Fri Apr 12 00:57:04 EEST 2024
ffmpeg | branch: master | Shiyou Yin <yinshiyou-hf at loongson.cn> | Sat Mar 16 11:03:32 2024 +0800| [8b76df914285b1e10460c16134715531050e7a74] | committer: Michael Niedermayer
swscale: [LA] Optimize yuv2plane1_8_c.
Reviewed-by: colleague of Shiyou Yin
Signed-off-by: Michael Niedermayer <michael at niedermayer.cc>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=8b76df914285b1e10460c16134715531050e7a74
---
libswscale/loongarch/output.S | 254 +++++++++++++++++++++++++-
libswscale/loongarch/output_lasx.c | 23 ++-
libswscale/loongarch/output_lsx.c | 22 ++-
libswscale/loongarch/swscale_init_loongarch.c | 12 +-
libswscale/loongarch/swscale_loongarch.h | 29 ++-
5 files changed, 324 insertions(+), 16 deletions(-)
diff --git a/libswscale/loongarch/output.S b/libswscale/loongarch/output.S
index b44bac502a..d71667e38a 100644
--- a/libswscale/loongarch/output.S
+++ b/libswscale/loongarch/output.S
@@ -23,11 +23,11 @@
#include "libavcodec/loongarch/loongson_asm.S"
-/* static void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+/* static void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
* const int16_t **src, uint8_t *dest, int dstW,
* const uint8_t *dither, int offset)
*/
-function ff_yuv2planeX_8_lsx
+function yuv2planeX_8_lsx
addi.w t1, a6, 1
addi.w t2, a6, 2
addi.w t3, a6, 3
@@ -136,3 +136,253 @@ function ff_yuv2planeX_8_lsx
blt zero, a4, .DEST
.END:
endfunc
+
+/* LSX (128-bit) 8-bit single-source plane output with an 8-entry dither; works on 8 pixels per vector step.
+ * void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
+ * const uint8_t *dither, int offset)
+ * Register use below: a0=src, a1=dest, a2=dstW, a3=dither, a4=offset. Per pixel: dest[i] = clip255((src[i] + dither[(offset + i) & 7]) >> 7). */
+function yuv2plane1_8_lsx
+ addi.w t1, a4, 1 /* t1..t7 = offset + 1 .. offset + 7 */
+ addi.w t2, a4, 2
+ addi.w t3, a4, 3
+ addi.w t4, a4, 4
+ addi.w t5, a4, 5
+ addi.w t6, a4, 6
+ addi.w t7, a4, 7
+ andi t0, a4, 7 /* t0..t7 = (offset + k) & 7: dither indices for 8 consecutive pixels */
+ andi t1, t1, 7
+ andi t2, t2, 7
+ andi t3, t3, 7
+ andi t4, t4, 7
+ andi t5, t5, 7
+ andi t6, t6, 7
+ andi t7, t7, 7
+ ldx.bu t0, a3, t0 /* fetch the 8 dither bytes, zero-extended */
+ ldx.bu t1, a3, t1
+ ldx.bu t2, a3, t2
+ ldx.bu t3, a3, t3
+ ldx.bu t4, a3, t4
+ ldx.bu t5, a3, t5
+ ldx.bu t6, a3, t6
+ ldx.bu t7, a3, t7
+ vinsgr2vr.h vr1, t0, 0 /* place them in the 8 halfword lanes of vr1 */
+ vinsgr2vr.h vr1, t1, 1
+ vinsgr2vr.h vr1, t2, 2
+ vinsgr2vr.h vr1, t3, 3
+ vinsgr2vr.h vr1, t4, 4
+ vinsgr2vr.h vr1, t5, 5
+ vinsgr2vr.h vr1, t6, 6
+ vinsgr2vr.h vr1, t7, 7
+ vsub.h vr0, vr0, vr0 /* vr0 = 0 */
+ vilvl.h vr2, vr0, vr1 /* interleave with zero: vr2 = dither lanes 0..3 widened to 32 bits */
+ vilvh.h vr3, vr0, vr1 /* vr3 = dither lanes 4..7 widened to 32 bits */
+
+ andi t8, a2, 7 /* t8 = dstW % 8 (leftover pixels) */
+ srli.d a2, a2, 3 /* a2 = number of full 8-pixel groups */
+ beqz a2, 2f /* no full group: skip the main loop */
+1: /* main loop: one 8-pixel group per iteration */
+ vld vr1, a0, 0 /* load 8 int16 samples */
+ addi.d a0, a0, 16 /* src += 8 samples */
+ vshuf4i.d vr0, vr1, 8 /* presumably moves the low 4 samples into the upper half of vr0 for vexth -- confirm against the LSX manual */
+ vexth.w.h vr4, vr0 /* sign-extend upper-half halfwords to 32 bits (intended: samples 0..3) */
+ vexth.w.h vr5, vr1 /* samples 4..7 sign-extended to 32 bits */
+
+ vadd.w vr4, vr2, vr4 /* add dither */
+ vadd.w vr5, vr3, vr5
+ vsrai.w vr4, vr4, 7 /* arithmetic shift right by 7 */
+ vsrai.w vr5, vr5, 7
+ vclip255.w vr4, vr4 /* clamp to 0..255 */
+ vclip255.w vr5, vr5
+ vpickev.h vr1, vr5, vr4 /* keep even halfwords: narrow 32 -> 16 bits */
+ vpickev.b vr1, vr1, vr1 /* keep even bytes: narrow 16 -> 8 bits */
+ fst.d f1, a1, 0 /* store 8 result bytes (f1 is used as the low 64 bits of vr1) */
+ addi.d a1, a1, 8 /* dest += 8 */
+ addi.d a2, a2, -1
+ bnez a2, 1b
+2:
+ beqz t8, 4f /* dstW is a multiple of 8: nothing left */
+3: /* tail: recompute the final 8 pixels, i.e. the group that ends exactly at dstW */
+ add.w a4, a4, t8 /* advance the dither phase by t8 so it matches the re-positioned group */
+ addi.w t1, a4, 1
+ addi.w t2, a4, 2
+ addi.w t3, a4, 3
+ addi.w t4, a4, 4
+ addi.w t5, a4, 5
+ addi.w t6, a4, 6
+ addi.w t7, a4, 7
+ andi t0, a4, 7 /* rebuild the 8 dither indices for the new phase */
+ andi t1, t1, 7
+ andi t2, t2, 7
+ andi t3, t3, 7
+ andi t4, t4, 7
+ andi t5, t5, 7
+ andi t6, t6, 7
+ andi t7, t7, 7
+ ldx.bu t0, a3, t0
+ ldx.bu t1, a3, t1
+ ldx.bu t2, a3, t2
+ ldx.bu t3, a3, t3
+ ldx.bu t4, a3, t4
+ ldx.bu t5, a3, t5
+ ldx.bu t6, a3, t6
+ ldx.bu t7, a3, t7
+ vinsgr2vr.h vr1, t0, 0
+ vinsgr2vr.h vr1, t1, 1
+ vinsgr2vr.h vr1, t2, 2
+ vinsgr2vr.h vr1, t3, 3
+ vinsgr2vr.h vr1, t4, 4
+ vinsgr2vr.h vr1, t5, 5
+ vinsgr2vr.h vr1, t6, 6
+ vinsgr2vr.h vr1, t7, 7
+ vsub.h vr0, vr0, vr0
+ vilvl.h vr2, vr0, vr1
+ vilvh.h vr3, vr0, vr1
+
+ addi.d a0, a0, -16 /* src: step back one whole group ... */
+ add.d a0, a0, t8 /* ... then forward t8 samples (2 bytes each, hence two adds) */
+ add.d a0, a0, t8
+ addi.d a1, a1, -8 /* dest: step back 8 bytes ... */
+ add.d a1, a1, t8 /* ... then forward t8 bytes; the overlapped pixels are rewritten with the same dither phase */
+
+ vld vr1, a0, 0 /* NOTE(review): if the main loop was skipped (dstW < 8), a0/a1 were never advanced, so this load and the store below address memory before src/dest -- confirm callers guarantee dstW >= 8 */
+ vshuf4i.d vr0, vr1, 8
+ vexth.w.h vr4, vr0
+ vexth.w.h vr5, vr1
+
+ vadd.w vr4, vr2, vr4
+ vadd.w vr5, vr3, vr5
+ vsrai.w vr4, vr4, 7
+ vsrai.w vr5, vr5, 7
+ vclip255.w vr4, vr4
+ vclip255.w vr5, vr5
+ vpickev.h vr1, vr5, vr4
+ vpickev.b vr1, vr1, vr1
+ fst.d f1, a1, 0 /* store the last 8 output bytes, ending at dest + dstW */
+4:
+endfunc
+
+function yuv2plane1_8_lasx /* LASX (256-bit) variant, 16 pixels per vector step; a0=src, a1=dest, a2=dstW, a3=dither, a4=offset; dest[i] = clip255((src[i] + dither[(offset + i) & 7]) >> 7) */
+ addi.w t1, a4, 1 /* t1..t7 = offset + 1 .. offset + 7 */
+ addi.w t2, a4, 2
+ addi.w t3, a4, 3
+ addi.w t4, a4, 4
+ addi.w t5, a4, 5
+ addi.w t6, a4, 6
+ addi.w t7, a4, 7
+ andi t0, a4, 7 /* t0..t7 = (offset + k) & 7: the 8 dither indices */
+ andi t1, t1, 7
+ andi t2, t2, 7
+ andi t3, t3, 7
+ andi t4, t4, 7
+ andi t5, t5, 7
+ andi t6, t6, 7
+ andi t7, t7, 7
+ ldx.bu t0, a3, t0 /* fetch the 8 dither bytes, zero-extended */
+ ldx.bu t1, a3, t1
+ ldx.bu t2, a3, t2
+ ldx.bu t3, a3, t3
+ ldx.bu t4, a3, t4
+ ldx.bu t5, a3, t5
+ ldx.bu t6, a3, t6
+ ldx.bu t7, a3, t7
+ vinsgr2vr.h vr1, t0, 0 /* place them in the 8 halfword lanes of vr1 (low 128 bits of xr1) */
+ vinsgr2vr.h vr1, t1, 1
+ vinsgr2vr.h vr1, t2, 2
+ vinsgr2vr.h vr1, t3, 3
+ vinsgr2vr.h vr1, t4, 4
+ vinsgr2vr.h vr1, t5, 5
+ vinsgr2vr.h vr1, t6, 6
+ vinsgr2vr.h vr1, t7, 7
+ xvpermi.q xr1, xr1, 0 /* presumably duplicates the low 128 bits into both halves of xr1 -- confirm against the LASX manual */
+ xvsub.h xr0, xr0, xr0 /* xr0 = 0 */
+ xvilvl.h xr2, xr0, xr1 /* interleave with zero: dither lanes 0..3 widened to 32 bits, in each 128-bit half */
+ xvilvh.h xr3, xr0, xr1 /* dither lanes 4..7 widened to 32 bits, in each 128-bit half */
+
+ andi t8, a2, 15 /* t8 = dstW % 16 (leftover pixels) */
+ srli.d a2, a2, 4 /* a2 = number of full 16-pixel groups */
+ beqz a2, 2f /* no full group: skip the main loop */
+1: /* main loop: one 16-pixel group per iteration */
+ xvld xr1, a0, 0 /* load 16 int16 samples */
+ addi.d a0, a0, 32 /* src += 16 samples */
+ xvpermi.d xr0, xr1, 0xa0 /* selector 0xa0 picks doublewords 0,0,2,2: each half's low 4 samples are copied into that half's upper part */
+ xvexth.w.h xr4, xr0 /* sign-extend to 32 bits (intended: samples 0..3 and 8..11) */
+ xvexth.w.h xr5, xr1 /* samples 4..7 and 12..15 sign-extended to 32 bits */
+
+ xvadd.w xr4, xr2, xr4 /* add dither; pixels 8..15 reuse the same 8-entry phase as 0..7 */
+ xvadd.w xr5, xr3, xr5
+ xvsrai.w xr4, xr4, 7 /* arithmetic shift right by 7 */
+ xvsrai.w xr5, xr5, 7
+ xvclip255.w xr4, xr4 /* clamp to 0..255 */
+ xvclip255.w xr5, xr5
+ xvpickev.h xr1, xr5, xr4 /* keep even halfwords: narrow 32 -> 16 bits */
+ xvpickev.b xr0, xr1, xr1 /* keep even bytes: narrow 16 -> 8 bits */
+ xvpermi.q xr1, xr0, 1 /* presumably brings the upper 128 bits of xr0 into the low half of xr1 so f1 holds pixels 8..15 -- confirm */
+ fst.d f0, a1, 0 /* store the first 8 result bytes (f0 is used as the low 64 bits of xr0) */
+ fst.d f1, a1, 8 /* store the second 8 result bytes */
+ addi.d a1, a1, 16 /* dest += 16 */
+ addi.d a2, a2, -1
+ bnez a2, 1b
+2:
+ beqz t8, 4f /* dstW is a multiple of 16: nothing left */
+3: /* tail: recompute the final 16 pixels, i.e. the group that ends exactly at dstW */
+ add.w a4, a4, t8 /* advance the dither phase by t8 so it matches the re-positioned group */
+ addi.w t1, a4, 1
+ addi.w t2, a4, 2
+ addi.w t3, a4, 3
+ addi.w t4, a4, 4
+ addi.w t5, a4, 5
+ addi.w t6, a4, 6
+ addi.w t7, a4, 7
+ andi t0, a4, 7 /* rebuild the 8 dither indices for the new phase */
+ andi t1, t1, 7
+ andi t2, t2, 7
+ andi t3, t3, 7
+ andi t4, t4, 7
+ andi t5, t5, 7
+ andi t6, t6, 7
+ andi t7, t7, 7
+ ldx.bu t0, a3, t0
+ ldx.bu t1, a3, t1
+ ldx.bu t2, a3, t2
+ ldx.bu t3, a3, t3
+ ldx.bu t4, a3, t4
+ ldx.bu t5, a3, t5
+ ldx.bu t6, a3, t6
+ ldx.bu t7, a3, t7
+ vinsgr2vr.h vr1, t0, 0
+ vinsgr2vr.h vr1, t1, 1
+ vinsgr2vr.h vr1, t2, 2
+ vinsgr2vr.h vr1, t3, 3
+ vinsgr2vr.h vr1, t4, 4
+ vinsgr2vr.h vr1, t5, 5
+ vinsgr2vr.h vr1, t6, 6
+ vinsgr2vr.h vr1, t7, 7
+ xvpermi.q xr1, xr1, 0
+ xvsub.h xr0, xr0, xr0
+ xvilvl.h xr2, xr0, xr1
+ xvilvh.h xr3, xr0, xr1
+
+ addi.d a0, a0, -32 /* src: step back one whole group ... */
+ add.d a0, a0, t8 /* ... then forward t8 samples (2 bytes each, hence two adds) */
+ add.d a0, a0, t8
+ addi.d a1, a1, -16 /* dest: step back 16 bytes ... */
+ add.d a1, a1, t8 /* ... then forward t8 bytes; the overlapped pixels are rewritten with the same dither phase */
+
+ xvld xr1, a0, 0 /* NOTE(review): if the main loop was skipped (dstW < 16), a0/a1 were never advanced, so this load and the stores below address memory before src/dest -- confirm callers guarantee dstW >= 16 */
+ xvpermi.d xr0, xr1, 0xa0
+ xvexth.w.h xr4, xr0
+ xvexth.w.h xr5, xr1
+
+ xvadd.w xr4, xr2, xr4
+ xvadd.w xr5, xr3, xr5
+ xvsrai.w xr4, xr4, 7
+ xvsrai.w xr5, xr5, 7
+ xvclip255.w xr4, xr4
+ xvclip255.w xr5, xr5
+ xvpickev.h xr1, xr5, xr4
+ xvpickev.b xr0, xr1, xr1
+ xvpermi.q xr1, xr0, 1
+ fst.d f0, a1, 0 /* store the last 16 output bytes, ending at dest + dstW */
+ fst.d f1, a1, 8
+4:
+endfunc
diff --git a/libswscale/loongarch/output_lasx.c b/libswscale/loongarch/output_lasx.c
index 277d7063e6..bc8ab8cf36 100644
--- a/libswscale/loongarch/output_lasx.c
+++ b/libswscale/loongarch/output_lasx.c
@@ -22,7 +22,7 @@
#include "swscale_loongarch.h"
#include "libavutil/loongarch/loongson_intrinsics.h"
-void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
+void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
@@ -1775,8 +1775,27 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
-av_cold void ff_sws_init_output_lasx(SwsContext *c)
+av_cold void ff_sws_init_output_lasx(SwsContext *c,
+ yuv2planar1_fn *yuv2plane1,
+ yuv2planarX_fn *yuv2planeX,
+ yuv2interleavedX_fn *yuv2nv12cX,
+ yuv2packed1_fn *yuv2packed1,
+ yuv2packed2_fn *yuv2packed2,
+ yuv2packedX_fn *yuv2packedX,
+ yuv2anyX_fn *yuv2anyX)
{
+ enum AVPixelFormat dstFormat = c->dstFormat;
+
+ /* Add initialization once optimized */
+ if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
+ } else if (is16BPS(dstFormat)) {
+ } else if (isNBPS(dstFormat)) {
+ } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
+ } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
+ } else {
+ *yuv2plane1 = yuv2plane1_8_lasx;
+ *yuv2planeX = yuv2planeX_8_lasx;
+ }
if(c->flags & SWS_FULL_CHR_H_INT) {
switch (c->dstFormat) {
diff --git a/libswscale/loongarch/output_lsx.c b/libswscale/loongarch/output_lsx.c
index 768cc3abc6..de9b1534ee 100644
--- a/libswscale/loongarch/output_lsx.c
+++ b/libswscale/loongarch/output_lsx.c
@@ -1624,8 +1624,28 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
-av_cold void ff_sws_init_output_lsx(SwsContext *c)
+av_cold void ff_sws_init_output_lsx(SwsContext *c,
+ yuv2planar1_fn *yuv2plane1,
+ yuv2planarX_fn *yuv2planeX,
+ yuv2interleavedX_fn *yuv2nv12cX,
+ yuv2packed1_fn *yuv2packed1,
+ yuv2packed2_fn *yuv2packed2,
+ yuv2packedX_fn *yuv2packedX,
+ yuv2anyX_fn *yuv2anyX)
{
+ enum AVPixelFormat dstFormat = c->dstFormat;
+
+ /* Add initialization once optimized */
+ if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
+ } else if (is16BPS(dstFormat)) {
+ } else if (isNBPS(dstFormat)) {
+ } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
+ } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
+ } else {
+ *yuv2plane1 = yuv2plane1_8_lsx;
+ *yuv2planeX = yuv2planeX_8_lsx;
+ }
+
if(c->flags & SWS_FULL_CHR_H_INT) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGBA:
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 6d2786c55f..04d2553fa4 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -60,7 +60,9 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_lsx(cpu_flags)) {
- ff_sws_init_output_lsx(c);
+ ff_sws_init_output_lsx(c, &c->yuv2plane1, &c->yuv2planeX,
+ &c->yuv2nv12cX, &c->yuv2packed1,
+ &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
@@ -80,12 +82,12 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
}
break;
}
- if (c->dstBpc == 8)
- c->yuv2planeX = ff_yuv2planeX_8_lsx;
}
#if HAVE_LASX
if (have_lasx(cpu_flags)) {
- ff_sws_init_output_lasx(c);
+ ff_sws_init_output_lasx(c, &c->yuv2plane1, &c->yuv2planeX,
+ &c->yuv2nv12cX, &c->yuv2packed1,
+ &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@@ -105,8 +107,6 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
}
break;
}
- if (c->dstBpc == 8)
- c->yuv2planeX = ff_yuv2planeX_8_lasx;
}
#endif // #if HAVE_LASX
ff_sws_init_range_convert_loongarch(c);
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index c96b085982..ea93881f8e 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -61,11 +61,21 @@ void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4], int width,
int32_t *rgb2yuv, void *opq);
-void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
-av_cold void ff_sws_init_output_lsx(SwsContext *c);
+void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
+
+av_cold void ff_sws_init_output_lsx(SwsContext *c,
+ yuv2planar1_fn *yuv2plane1,
+ yuv2planarX_fn *yuv2planeX,
+ yuv2interleavedX_fn *yuv2nv12cX,
+ yuv2packed1_fn *yuv2packed1,
+ yuv2packed2_fn *yuv2packed2,
+ yuv2packedX_fn *yuv2packedX,
+ yuv2anyX_fn *yuv2anyX);
int yuv420_rgb24_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
@@ -135,12 +145,21 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride);
-void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
+void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
-av_cold void ff_sws_init_output_lasx(SwsContext *c);
-
+void yuv2plane1_8_lasx(const int16_t *src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
+
+av_cold void ff_sws_init_output_lasx(SwsContext *c,
+ yuv2planar1_fn *yuv2plane1,
+ yuv2planarX_fn *yuv2planeX,
+ yuv2interleavedX_fn *yuv2nv12cX,
+ yuv2packed1_fn *yuv2packed1,
+ yuv2packed2_fn *yuv2packed2,
+ yuv2packedX_fn *yuv2packedX,
+ yuv2anyX_fn *yuv2anyX);
#endif // #if HAVE_LASX
#endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */
More information about the ffmpeg-cvslog
mailing list