[FFmpeg-cvslog] swscale: [LA] Optimize yuv2plane1_8_c.

Fri Apr 12 00:57:04 EEST 2024

ffmpeg | branch: master | Shiyou Yin <yinshiyou-hf at loongson.cn> | Sat Mar 16 11:03:32 2024 +0800| [8b76df914285b1e10460c16134715531050e7a74] | committer: Michael Niedermayer

swscale: [LA] Optimize yuv2plane1_8_c.

Reviewed-by: colleague of Shiyou Yin
Signed-off-by: Michael Niedermayer <michael at niedermayer.cc>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=8b76df914285b1e10460c16134715531050e7a74
---

 libswscale/loongarch/output.S                 | 254 +++++++++++++++++++++++++-
 libswscale/loongarch/output_lasx.c            |  23 ++-
 libswscale/loongarch/output_lsx.c             |  22 ++-
 libswscale/loongarch/swscale_init_loongarch.c |  12 +-
 libswscale/loongarch/swscale_loongarch.h      |  29 ++-
 5 files changed, 324 insertions(+), 16 deletions(-)

diff --git a/libswscale/loongarch/output.S b/libswscale/loongarch/output.S
index b44bac502a..d71667e38a 100644
--- a/libswscale/loongarch/output.S
+++ b/libswscale/loongarch/output.S
@@ -23,11 +23,11 @@
 
 #include "libavcodec/loongarch/loongson_asm.S"
 
-/* static void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+/* static void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
  *                                 const int16_t **src, uint8_t *dest, int dstW,
  *                                 const uint8_t *dither, int offset)
  */
-function ff_yuv2planeX_8_lsx
+function yuv2planeX_8_lsx
     addi.w          t1,     a6,     1
     addi.w          t2,     a6,     2
     addi.w          t3,     a6,     3
@@ -136,3 +136,253 @@ function ff_yuv2planeX_8_lsx
     blt             zero,   a4,     .DEST
 .END:
 endfunc
+
+/* dest[i] = clip(0..255, (src[i] + dither[(i + offset) & 7]) >> 7), eight pixels per 128-bit pass.
+ * void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
+ *                       const uint8_t *dither, int offset)
+ */
+function yuv2plane1_8_lsx
+    addi.w       t1,    a4,    1    /* t1..t7 = offset + 1..7 (a4 is the 5th argument, offset) */
+    addi.w       t2,    a4,    2
+    addi.w       t3,    a4,    3
+    addi.w       t4,    a4,    4
+    addi.w       t5,    a4,    5
+    addi.w       t6,    a4,    6
+    addi.w       t7,    a4,    7
+    andi         t0,    a4,    7    /* t0..t7 = (offset + k) & 7, index into the dither table */
+    andi         t1,    t1,    7
+    andi         t2,    t2,    7
+    andi         t3,    t3,    7
+    andi         t4,    t4,    7
+    andi         t5,    t5,    7
+    andi         t6,    t6,    7
+    andi         t7,    t7,    7
+    ldx.bu       t0,    a3,    t0   /* t0..t7 = dither[(offset + k) & 7], loaded as unsigned bytes */
+    ldx.bu       t1,    a3,    t1
+    ldx.bu       t2,    a3,    t2
+    ldx.bu       t3,    a3,    t3
+    ldx.bu       t4,    a3,    t4
+    ldx.bu       t5,    a3,    t5
+    ldx.bu       t6,    a3,    t6
+    ldx.bu       t7,    a3,    t7
+    vinsgr2vr.h  vr1,   t0,    0    /* vr1 halfword lane k = dither value for pixel k of a group of 8 */
+    vinsgr2vr.h  vr1,   t1,    1
+    vinsgr2vr.h  vr1,   t2,    2
+    vinsgr2vr.h  vr1,   t3,    3
+    vinsgr2vr.h  vr1,   t4,    4
+    vinsgr2vr.h  vr1,   t5,    5
+    vinsgr2vr.h  vr1,   t6,    6
+    vinsgr2vr.h  vr1,   t7,    7
+    vsub.h       vr0,   vr0,   vr0  /* vr0 = 0 */
+    vilvl.h      vr2,   vr0,   vr1  /* vr2 = dither values 0..3 interleaved with zero, i.e. widened to 32-bit lanes */
+    vilvh.h      vr3,   vr0,   vr1  /* vr3 = dither values 4..7 widened the same way */
+
+    andi         t8,    a2,    7    /* t8 = dstW % 8, pixels left over after the full groups */
+    srli.d       a2,    a2,    3    /* a2 = dstW / 8, number of full 8-pixel groups */
+    beqz         a2,    2f          /* no full group: go straight to the tail check */
+1:
+    vld          vr1,   a0,    0    /* load 8 int16 samples from src */
+    addi.d       a0,    a0,    16   /* src advances by 8 samples (16 bytes) */
+    vshuf4i.d    vr0,   vr1,   8    /* put the low 64 bits of vr1 (samples 0..3) in the high half of vr0; the low half of vr0 is not used below */
+    vexth.w.h    vr4,   vr0         /* vr4 = samples 0..3 widened to 32 bit */
+    vexth.w.h    vr5,   vr1         /* vr5 = samples 4..7 widened to 32 bit */
+
+    vadd.w       vr4,   vr2,   vr4  /* add the per-pixel dither */
+    vadd.w       vr5,   vr3,   vr5
+    vsrai.w      vr4,   vr4,   7    /* arithmetic shift right by 7 */
+    vsrai.w      vr5,   vr5,   7
+    vclip255.w   vr4,   vr4         /* clamp each 32-bit lane to 0..255 */
+    vclip255.w   vr5,   vr5
+    vpickev.h    vr1,   vr5,   vr4  /* narrow 32 -> 16 bit: 8 halfwords, pixels 0..7 in order */
+    vpickev.b    vr1,   vr1,   vr1  /* narrow 16 -> 8 bit: the low 8 bytes hold the 8 output pixels */
+    fst.d        f1,    a1,    0    /* store the low 64 bits of vr1 (8 pixels) to dest */
+    addi.d       a1,    a1,    8    /* dest advances by 8 pixels */
+    addi.d       a2,    a2,    -1
+    bnez         a2,    1b
+2:
+    beqz         t8,    4f          /* dstW is a multiple of 8: nothing left to do */
+3:
+    add.w        a4,    a4,    t8   /* tail handles pixels dstW-8..dstW-1; (dstW - 8 + offset) & 7 == (t8 + offset) & 7, so rebuild dither from offset + t8 */
+    addi.w       t1,    a4,    1
+    addi.w       t2,    a4,    2
+    addi.w       t3,    a4,    3
+    addi.w       t4,    a4,    4
+    addi.w       t5,    a4,    5
+    addi.w       t6,    a4,    6
+    addi.w       t7,    a4,    7
+    andi         t0,    a4,    7
+    andi         t1,    t1,    7
+    andi         t2,    t2,    7
+    andi         t3,    t3,    7
+    andi         t4,    t4,    7
+    andi         t5,    t5,    7
+    andi         t6,    t6,    7
+    andi         t7,    t7,    7
+    ldx.bu       t0,    a3,    t0
+    ldx.bu       t1,    a3,    t1
+    ldx.bu       t2,    a3,    t2
+    ldx.bu       t3,    a3,    t3
+    ldx.bu       t4,    a3,    t4
+    ldx.bu       t5,    a3,    t5
+    ldx.bu       t6,    a3,    t6
+    ldx.bu       t7,    a3,    t7
+    vinsgr2vr.h  vr1,   t0,    0
+    vinsgr2vr.h  vr1,   t1,    1
+    vinsgr2vr.h  vr1,   t2,    2
+    vinsgr2vr.h  vr1,   t3,    3
+    vinsgr2vr.h  vr1,   t4,    4
+    vinsgr2vr.h  vr1,   t5,    5
+    vinsgr2vr.h  vr1,   t6,    6
+    vinsgr2vr.h  vr1,   t7,    7
+    vsub.h       vr0,   vr0,   vr0
+    vilvl.h      vr2,   vr0,   vr1
+    vilvh.h      vr3,   vr0,   vr1
+
+    addi.d       a0,    a0,    -16  /* src += 2 * t8 - 16 bytes: the last vector ends exactly at sample dstW-1 and overlaps the previous group */
+    add.d        a0,    a0,    t8
+    add.d        a0,    a0,    t8
+    addi.d       a1,    a1,    -8   /* dest += t8 - 8. NOTE(review): if dstW < 8 the loop above never ran, so a0/a1 end up below their entry values - confirm callers never pass dstW < 8 */
+    add.d        a1,    a1,    t8
+
+    vld          vr1,   a0,    0    /* same widen / add dither / >> 7 / clamp / narrow sequence as the main loop */
+    vshuf4i.d    vr0,   vr1,   8
+    vexth.w.h    vr4,   vr0
+    vexth.w.h    vr5,   vr1
+
+    vadd.w       vr4,   vr2,   vr4
+    vadd.w       vr5,   vr3,   vr5
+    vsrai.w      vr4,   vr4,   7
+    vsrai.w      vr5,   vr5,   7
+    vclip255.w   vr4,   vr4
+    vclip255.w   vr5,   vr5
+    vpickev.h    vr1,   vr5,   vr4
+    vpickev.b    vr1,   vr1,   vr1
+    fst.d        f1,    a1,    0    /* rewrites the overlapped pixels and writes the final t8 pixels */
+4:
+endfunc
+
+function yuv2plane1_8_lasx    /* 256-bit variant of yuv2plane1_8_lsx: same (src, dest, dstW, dither, offset) use of a0..a4, 16 pixels per pass */
+    addi.w       t1,    a4,    1    /* t1..t7 = offset + 1..7 */
+    addi.w       t2,    a4,    2
+    addi.w       t3,    a4,    3
+    addi.w       t4,    a4,    4
+    addi.w       t5,    a4,    5
+    addi.w       t6,    a4,    6
+    addi.w       t7,    a4,    7
+    andi         t0,    a4,    7    /* t0..t7 = (offset + k) & 7, index into the dither table */
+    andi         t1,    t1,    7
+    andi         t2,    t2,    7
+    andi         t3,    t3,    7
+    andi         t4,    t4,    7
+    andi         t5,    t5,    7
+    andi         t6,    t6,    7
+    andi         t7,    t7,    7
+    ldx.bu       t0,    a3,    t0   /* t0..t7 = dither[(offset + k) & 7], loaded as unsigned bytes */
+    ldx.bu       t1,    a3,    t1
+    ldx.bu       t2,    a3,    t2
+    ldx.bu       t3,    a3,    t3
+    ldx.bu       t4,    a3,    t4
+    ldx.bu       t5,    a3,    t5
+    ldx.bu       t6,    a3,    t6
+    ldx.bu       t7,    a3,    t7
+    vinsgr2vr.h  vr1,   t0,    0    /* low 128 bits of xr1: halfword lane k = dither value k */
+    vinsgr2vr.h  vr1,   t1,    1
+    vinsgr2vr.h  vr1,   t2,    2
+    vinsgr2vr.h  vr1,   t3,    3
+    vinsgr2vr.h  vr1,   t4,    4
+    vinsgr2vr.h  vr1,   t5,    5
+    vinsgr2vr.h  vr1,   t6,    6
+    vinsgr2vr.h  vr1,   t7,    7
+    xvpermi.q    xr1,   xr1,   0    /* replicate the low 128 bits into the high half, so both halves hold the 8 dither values */
+    xvsub.h      xr0,   xr0,   xr0  /* xr0 = 0 */
+    xvilvl.h     xr2,   xr0,   xr1  /* xr2 = dither values 0..3 widened to 32 bit, in each 128-bit half */
+    xvilvh.h     xr3,   xr0,   xr1  /* xr3 = dither values 4..7 widened to 32 bit, in each 128-bit half */
+
+    andi         t8,    a2,    15   /* t8 = dstW % 16, pixels left over after the full groups */
+    srli.d       a2,    a2,    4    /* a2 = dstW / 16, number of full 16-pixel groups */
+    beqz         a2,    2f          /* no full group: go straight to the tail check */
+1:
+    xvld         xr1,   a0,    0    /* load 16 int16 samples from src */
+    addi.d       a0,    a0,    32   /* src advances by 16 samples (32 bytes) */
+    xvpermi.d    xr0,   xr1,   0xa0 /* xr0 doublewords = {d0, d0, d2, d2} of xr1, so the upper half of each 128-bit lane is samples 0..3 / 8..11 */
+    xvexth.w.h   xr4,   xr0         /* xr4 = samples 0..3 and 8..11 widened to 32 bit */
+    xvexth.w.h   xr5,   xr1         /* xr5 = samples 4..7 and 12..15 widened to 32 bit */
+
+    xvadd.w      xr4,   xr2,   xr4  /* add the per-pixel dither (index repeats every 8 pixels) */
+    xvadd.w      xr5,   xr3,   xr5
+    xvsrai.w     xr4,   xr4,   7    /* arithmetic shift right by 7 */
+    xvsrai.w     xr5,   xr5,   7
+    xvclip255.w  xr4,   xr4         /* clamp each 32-bit lane to 0..255 */
+    xvclip255.w  xr5,   xr5
+    xvpickev.h   xr1,   xr5,   xr4  /* narrow 32 -> 16 bit: pixels 0..7 in the low lane, 8..15 in the high lane */
+    xvpickev.b   xr0,   xr1,   xr1  /* narrow 16 -> 8 bit: low 8 bytes of each lane hold that lane's pixels */
+    xvpermi.q    xr1,   xr0,   1    /* bring the high lane of xr0 (pixels 8..15) down into the low lane of xr1 */
+    fst.d        f0,    a1,    0    /* store pixels 0..7 */
+    fst.d        f1,    a1,    8    /* store pixels 8..15 */
+    addi.d       a1,    a1,    16   /* dest advances by 16 pixels */
+    addi.d       a2,    a2,    -1
+    bnez         a2,    1b
+2:
+    beqz         t8,    4f          /* dstW is a multiple of 16: nothing left to do */
+3:
+    add.w        a4,    a4,    t8   /* tail handles pixels dstW-16..dstW-1; (dstW - 16 + offset) & 7 == (t8 + offset) & 7, so rebuild dither from offset + t8 */
+    addi.w       t1,    a4,    1
+    addi.w       t2,    a4,    2
+    addi.w       t3,    a4,    3
+    addi.w       t4,    a4,    4
+    addi.w       t5,    a4,    5
+    addi.w       t6,    a4,    6
+    addi.w       t7,    a4,    7
+    andi         t0,    a4,    7
+    andi         t1,    t1,    7
+    andi         t2,    t2,    7
+    andi         t3,    t3,    7
+    andi         t4,    t4,    7
+    andi         t5,    t5,    7
+    andi         t6,    t6,    7
+    andi         t7,    t7,    7
+    ldx.bu       t0,    a3,    t0
+    ldx.bu       t1,    a3,    t1
+    ldx.bu       t2,    a3,    t2
+    ldx.bu       t3,    a3,    t3
+    ldx.bu       t4,    a3,    t4
+    ldx.bu       t5,    a3,    t5
+    ldx.bu       t6,    a3,    t6
+    ldx.bu       t7,    a3,    t7
+    vinsgr2vr.h  vr1,   t0,    0
+    vinsgr2vr.h  vr1,   t1,    1
+    vinsgr2vr.h  vr1,   t2,    2
+    vinsgr2vr.h  vr1,   t3,    3
+    vinsgr2vr.h  vr1,   t4,    4
+    vinsgr2vr.h  vr1,   t5,    5
+    vinsgr2vr.h  vr1,   t6,    6
+    vinsgr2vr.h  vr1,   t7,    7
+    xvpermi.q    xr1,   xr1,   0
+    xvsub.h      xr0,   xr0,   xr0
+    xvilvl.h     xr2,   xr0,   xr1
+    xvilvh.h     xr3,   xr0,   xr1
+
+    addi.d       a0,    a0,    -32  /* src += 2 * t8 - 32 bytes: the last vector ends exactly at sample dstW-1 and overlaps the previous group */
+    add.d        a0,    a0,    t8
+    add.d        a0,    a0,    t8
+    addi.d       a1,    a1,    -16  /* dest += t8 - 16. NOTE(review): if dstW < 16 the loop above never ran, so a0/a1 end up below their entry values - confirm callers never pass dstW < 16 */
+    add.d        a1,    a1,    t8
+
+    xvld         xr1,   a0,    0    /* same widen / add dither / >> 7 / clamp / narrow sequence as the main loop */
+    xvpermi.d    xr0,   xr1,   0xa0
+    xvexth.w.h   xr4,   xr0
+    xvexth.w.h   xr5,   xr1
+
+    xvadd.w      xr4,   xr2,   xr4
+    xvadd.w      xr5,   xr3,   xr5
+    xvsrai.w     xr4,   xr4,   7
+    xvsrai.w     xr5,   xr5,   7
+    xvclip255.w  xr4,   xr4
+    xvclip255.w  xr5,   xr5
+    xvpickev.h   xr1,   xr5,   xr4
+    xvpickev.b   xr0,   xr1,   xr1
+    xvpermi.q    xr1,   xr0,   1
+    fst.d        f0,    a1,    0    /* rewrites the overlapped pixels and writes the final t8 pixels */
+    fst.d        f1,    a1,    8
+4:
+endfunc
diff --git a/libswscale/loongarch/output_lasx.c b/libswscale/loongarch/output_lasx.c
index 277d7063e6..bc8ab8cf36 100644
--- a/libswscale/loongarch/output_lasx.c
+++ b/libswscale/loongarch/output_lasx.c
@@ -22,7 +22,7 @@
 #include "swscale_loongarch.h"
 #include "libavutil/loongarch/loongson_intrinsics.h"
 
-void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
+void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
                           const int16_t **src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset)
 {
@@ -1775,8 +1775,27 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full,   AV_PIX_FMT_BGR8,  0)
 YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full,   AV_PIX_FMT_RGB8,  0)
 
 
-av_cold void ff_sws_init_output_lasx(SwsContext *c)
+av_cold void ff_sws_init_output_lasx(SwsContext *c,
+                                     yuv2planar1_fn *yuv2plane1,
+                                     yuv2planarX_fn *yuv2planeX,
+                                     yuv2interleavedX_fn *yuv2nv12cX,
+                                     yuv2packed1_fn *yuv2packed1,
+                                     yuv2packed2_fn *yuv2packed2,
+                                     yuv2packedX_fn *yuv2packedX,
+                                     yuv2anyX_fn *yuv2anyX)
 {
+    enum AVPixelFormat dstFormat = c->dstFormat;
+
+    /* Add initialization once optimized */
+    if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
+    } else if (is16BPS(dstFormat)) {
+    } else if (isNBPS(dstFormat)) {
+    } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
+    } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
+    } else {
+        *yuv2plane1 = yuv2plane1_8_lasx;
+        *yuv2planeX = yuv2planeX_8_lasx;
+    }
 
     if(c->flags & SWS_FULL_CHR_H_INT) {
         switch (c->dstFormat) {
diff --git a/libswscale/loongarch/output_lsx.c b/libswscale/loongarch/output_lsx.c
index 768cc3abc6..de9b1534ee 100644
--- a/libswscale/loongarch/output_lsx.c
+++ b/libswscale/loongarch/output_lsx.c
@@ -1624,8 +1624,28 @@ YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full,   AV_PIX_FMT_BGR8,  0)
 YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full,   AV_PIX_FMT_RGB8,  0)
 
 
-av_cold void ff_sws_init_output_lsx(SwsContext *c)
+av_cold void ff_sws_init_output_lsx(SwsContext *c,
+                                    yuv2planar1_fn *yuv2plane1,
+                                    yuv2planarX_fn *yuv2planeX,
+                                    yuv2interleavedX_fn *yuv2nv12cX,
+                                    yuv2packed1_fn *yuv2packed1,
+                                    yuv2packed2_fn *yuv2packed2,
+                                    yuv2packedX_fn *yuv2packedX,
+                                    yuv2anyX_fn *yuv2anyX)
 {
+    enum AVPixelFormat dstFormat = c->dstFormat;
+
+    /* Add initialization once optimized */
+    if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
+    } else if (is16BPS(dstFormat)) {
+    } else if (isNBPS(dstFormat)) {
+    } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
+    } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
+    } else {
+        *yuv2plane1 = yuv2plane1_8_lsx;
+        *yuv2planeX = yuv2planeX_8_lsx;
+    }
+
     if(c->flags & SWS_FULL_CHR_H_INT) {
         switch (c->dstFormat) {
         case AV_PIX_FMT_RGBA:
diff --git a/libswscale/loongarch/swscale_init_loongarch.c b/libswscale/loongarch/swscale_init_loongarch.c
index 6d2786c55f..04d2553fa4 100644
--- a/libswscale/loongarch/swscale_init_loongarch.c
+++ b/libswscale/loongarch/swscale_init_loongarch.c
@@ -60,7 +60,9 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
     if (have_lsx(cpu_flags)) {
-        ff_sws_init_output_lsx(c);
+        ff_sws_init_output_lsx(c, &c->yuv2plane1, &c->yuv2planeX,
+                               &c->yuv2nv12cX, &c->yuv2packed1,
+                               &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
         if (c->srcBpc == 8) {
             if (c->dstBpc <= 14) {
                 c->hyScale = c->hcScale = ff_hscale_8_to_15_lsx;
@@ -80,12 +82,12 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
             }
             break;
         }
-        if (c->dstBpc == 8)
-            c->yuv2planeX = ff_yuv2planeX_8_lsx;
     }
 #if HAVE_LASX
     if (have_lasx(cpu_flags)) {
-        ff_sws_init_output_lasx(c);
+        ff_sws_init_output_lasx(c, &c->yuv2plane1, &c->yuv2planeX,
+                                &c->yuv2nv12cX, &c->yuv2packed1,
+                                &c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
         if (c->srcBpc == 8) {
             if (c->dstBpc <= 14) {
                 c->hyScale = c->hcScale = ff_hscale_8_to_15_lasx;
@@ -105,8 +107,6 @@ av_cold void ff_sws_init_swscale_loongarch(SwsContext *c)
             }
             break;
         }
-        if (c->dstBpc == 8)
-            c->yuv2planeX = ff_yuv2planeX_8_lasx;
     }
 #endif // #if HAVE_LASX
     ff_sws_init_range_convert_loongarch(c);
diff --git a/libswscale/loongarch/swscale_loongarch.h b/libswscale/loongarch/swscale_loongarch.h
index c96b085982..ea93881f8e 100644
--- a/libswscale/loongarch/swscale_loongarch.h
+++ b/libswscale/loongarch/swscale_loongarch.h
@@ -61,11 +61,21 @@ void planar_rgb_to_uv_lsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4],
 void planar_rgb_to_y_lsx(uint8_t *_dst, const uint8_t *src[4], int width,
                          int32_t *rgb2yuv, void *opq);
 
-void ff_yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
+void yuv2planeX_8_lsx(const int16_t *filter, int filterSize,
                          const int16_t **src, uint8_t *dest, int dstW,
                          const uint8_t *dither, int offset);
 
-av_cold void ff_sws_init_output_lsx(SwsContext *c);
+void yuv2plane1_8_lsx(const int16_t *src, uint8_t *dest, int dstW,
+                      const uint8_t *dither, int offset);
+
+av_cold void ff_sws_init_output_lsx(SwsContext *c,
+                                    yuv2planar1_fn *yuv2plane1,
+                                    yuv2planarX_fn *yuv2planeX,
+                                    yuv2interleavedX_fn *yuv2nv12cX,
+                                    yuv2packed1_fn *yuv2packed1,
+                                    yuv2packed2_fn *yuv2packed2,
+                                    yuv2packedX_fn *yuv2packedX,
+                                    yuv2anyX_fn *yuv2anyX);
 
 int yuv420_rgb24_lsx(SwsContext *c, const uint8_t *src[], int srcStride[],
                      int srcSliceY, int srcSliceH, uint8_t *dst[], int dstStride[]);
@@ -135,12 +145,21 @@ void ff_interleave_bytes_lasx(const uint8_t *src1, const uint8_t *src2,
                               uint8_t *dest, int width, int height,
                               int src1Stride, int src2Stride, int dstStride);
 
-void ff_yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
+void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
                           const int16_t **src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset);
 
-av_cold void ff_sws_init_output_lasx(SwsContext *c);
-
+void yuv2plane1_8_lasx(const int16_t *src, uint8_t *dest, int dstW,
+                      const uint8_t *dither, int offset);
+
+av_cold void ff_sws_init_output_lasx(SwsContext *c,
+                                     yuv2planar1_fn *yuv2plane1,
+                                     yuv2planarX_fn *yuv2planeX,
+                                     yuv2interleavedX_fn *yuv2nv12cX,
+                                     yuv2packed1_fn *yuv2packed1,
+                                     yuv2packed2_fn *yuv2packed2,
+                                     yuv2packedX_fn *yuv2packedX,
+                                     yuv2anyX_fn *yuv2anyX);
 #endif // #if HAVE_LASX
 
 #endif /* SWSCALE_LOONGARCH_SWSCALE_LOONGARCH_H */