[FFmpeg-devel] [PATCH v1 3/6] avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt
jinbo
jinbo at loongson.cn
Fri Dec 22 12:52:11 EET 2023
tests/checkasm/checkasm:             C      LSX    LASX
put_hevc_pel_uni_w_pixels4_8_c:      2.7    1.0
put_hevc_pel_uni_w_pixels6_8_c:      6.2    2.0    1.5
put_hevc_pel_uni_w_pixels8_8_c:     10.7    2.5    1.7
put_hevc_pel_uni_w_pixels12_8_c:    23.0    5.5    5.0
put_hevc_pel_uni_w_pixels16_8_c:    41.0    8.2    5.0
put_hevc_pel_uni_w_pixels24_8_c:    91.0   19.7   13.2
put_hevc_pel_uni_w_pixels32_8_c:   161.7   32.5   16.2
put_hevc_pel_uni_w_pixels48_8_c:   354.5   73.7   43.0
put_hevc_pel_uni_w_pixels64_8_c:   641.5  130.0   64.2
The speedup of decoding H265 4K 30FPS 30Mbps on a 3A6000 with
8 threads is 1 fps (47 fps -> 48 fps).
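
For reference, these routines vectorize the unweighted-copy ("pixels") case of
HEVC uni-directional weighted prediction. Below is a minimal scalar sketch of
the 8-bit operation, using the same argument order as the prototypes added by
this patch; the helper name and the standalone form are only illustrative (the
actual C reference lives in hevcdsp_template.c). The LOAD_VAR macro in
hevc_mc.S computes the same shift/offset pair: shift = denom + 6,
offset = 1 << (shift - 1).

    #include <stddef.h>
    #include <stdint.h>
    #include "libavutil/common.h"   /* av_clip_uint8() */

    /* Scalar sketch of pel_uni_w_pixels for bit depth 8 (illustrative only). */
    static void pel_uni_w_pixels_ref(uint8_t *dst, ptrdiff_t dst_stride,
                                     const uint8_t *src, ptrdiff_t src_stride,
                                     int height, int denom, int wx, int ox,
                                     intptr_t mx, intptr_t my, int width)
    {
        /* mx/my are unused in the pixels (no-interpolation) case. */
        int shift  = denom + 6;          /* denom + 14 - bit_depth, bit_depth = 8 */
        int offset = 1 << (shift - 1);   /* rounding bias */

        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                /* widen, scale by 64, weight, round/shift, add ox, clip */
                int v = ((src[x] << 6) * wx + offset) >> shift;
                dst[x] = av_clip_uint8(v + ox);
            }
            src += src_stride;
            dst += dst_stride;
        }
    }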
---
libavcodec/loongarch/Makefile | 3 +-
libavcodec/loongarch/hevc_mc.S | 471 ++++++++++++++++++
libavcodec/loongarch/hevcdsp_init_loongarch.c | 43 ++
libavcodec/loongarch/hevcdsp_lasx.h | 53 ++
libavcodec/loongarch/hevcdsp_lsx.h | 27 +
5 files changed, 596 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/loongarch/hevc_mc.S
create mode 100644 libavcodec/loongarch/hevcdsp_lasx.h
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 07ea97f803..ad98cd4054 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -28,7 +28,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \
loongarch/hevc_mc_bi_lsx.o \
loongarch/hevc_mc_uni_lsx.o \
loongarch/hevc_mc_uniw_lsx.o \
- loongarch/hevc_add_res.o
+ loongarch/hevc_add_res.o \
+ loongarch/hevc_mc.o
LSX-OBJS-$(CONFIG_H264DSP) += loongarch/h264idct.o \
loongarch/h264idct_loongarch.o \
loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S
new file mode 100644
index 0000000000..c5d553effe
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc.S
@@ -0,0 +1,471 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+.macro LOAD_VAR bit
+ addi.w t1, a5, 6 //shift
+ addi.w t3, zero, 1 //one
+ sub.w t4, t1, t3
+ sll.w t3, t3, t4 //offset
+.if \bit == 128
+ vreplgr2vr.w vr1, a6 //wx
+ vreplgr2vr.w vr2, t3 //offset
+ vreplgr2vr.w vr3, t1 //shift
+ vreplgr2vr.w vr4, a7 //ox
+.else
+ xvreplgr2vr.w xr1, a6
+ xvreplgr2vr.w xr2, t3
+ xvreplgr2vr.w xr3, t1
+ xvreplgr2vr.w xr4, a7
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS8_LSX src0, dst0, w
+ vldrepl.d vr0, \src0, 0
+ vsllwil.hu.bu vr0, vr0, 0
+ vexth.wu.hu vr5, vr0
+ vsllwil.wu.hu vr0, vr0, 0
+ vslli.w vr0, vr0, 6
+ vslli.w vr5, vr5, 6
+ vmul.w vr0, vr0, vr1
+ vmul.w vr5, vr5, vr1
+ vadd.w vr0, vr0, vr2
+ vadd.w vr5, vr5, vr2
+ vsra.w vr0, vr0, vr3
+ vsra.w vr5, vr5, vr3
+ vadd.w vr0, vr0, vr4
+ vadd.w vr5, vr5, vr4
+ vssrani.h.w vr5, vr0, 0
+ vssrani.bu.h vr5, vr5, 0
+.if \w == 6
+ fst.s f5, \dst0, 0
+ vstelm.h vr5, \dst0, 4, 2
+.else
+ fst.d f5, \dst0, 0
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS8x2_LASX src0, dst0, w
+ vldrepl.d vr0, \src0, 0
+ add.d t2, \src0, a3
+ vldrepl.d vr5, t2, 0
+ xvpermi.q xr0, xr5, 0x02
+ xvsllwil.hu.bu xr0, xr0, 0
+ xvexth.wu.hu xr5, xr0
+ xvsllwil.wu.hu xr0, xr0, 0
+ xvslli.w xr0, xr0, 6
+ xvslli.w xr5, xr5, 6
+ xvmul.w xr0, xr0, xr1
+ xvmul.w xr5, xr5, xr1
+ xvadd.w xr0, xr0, xr2
+ xvadd.w xr5, xr5, xr2
+ xvsra.w xr0, xr0, xr3
+ xvsra.w xr5, xr5, xr3
+ xvadd.w xr0, xr0, xr4
+ xvadd.w xr5, xr5, xr4
+ xvssrani.h.w xr5, xr0, 0
+ xvpermi.q xr0, xr5, 0x01
+ xvssrani.bu.h xr0, xr5, 0
+ add.d t3, \dst0, a1
+.if \w == 6
+ vstelm.w vr0, \dst0, 0, 0
+ vstelm.h vr0, \dst0, 4, 2
+ vstelm.w vr0, t3, 0, 2
+ vstelm.h vr0, t3, 4, 6
+.else
+ vstelm.d vr0, \dst0, 0, 0
+ vstelm.d vr0, t3, 0, 1
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS16_LSX src0, dst0
+ vld vr0, \src0, 0
+ vexth.hu.bu vr7, vr0
+ vexth.wu.hu vr8, vr7
+ vsllwil.wu.hu vr7, vr7, 0
+ vsllwil.hu.bu vr5, vr0, 0
+ vexth.wu.hu vr6, vr5
+ vsllwil.wu.hu vr5, vr5, 0
+ vslli.w vr5, vr5, 6
+ vslli.w vr6, vr6, 6
+ vslli.w vr7, vr7, 6
+ vslli.w vr8, vr8, 6
+ vmul.w vr5, vr5, vr1
+ vmul.w vr6, vr6, vr1
+ vmul.w vr7, vr7, vr1
+ vmul.w vr8, vr8, vr1
+ vadd.w vr5, vr5, vr2
+ vadd.w vr6, vr6, vr2
+ vadd.w vr7, vr7, vr2
+ vadd.w vr8, vr8, vr2
+ vsra.w vr5, vr5, vr3
+ vsra.w vr6, vr6, vr3
+ vsra.w vr7, vr7, vr3
+ vsra.w vr8, vr8, vr3
+ vadd.w vr5, vr5, vr4
+ vadd.w vr6, vr6, vr4
+ vadd.w vr7, vr7, vr4
+ vadd.w vr8, vr8, vr4
+ vssrani.h.w vr6, vr5, 0
+ vssrani.h.w vr8, vr7, 0
+ vssrani.bu.h vr8, vr6, 0
+ vst vr8, \dst0, 0
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS16_LASX src0, dst0
+ vld vr0, \src0, 0
+ xvpermi.d xr0, xr0, 0xd8
+ xvsllwil.hu.bu xr0, xr0, 0
+ xvexth.wu.hu xr6, xr0
+ xvsllwil.wu.hu xr5, xr0, 0
+ xvslli.w xr5, xr5, 6
+ xvslli.w xr6, xr6, 6
+ xvmul.w xr5, xr5, xr1
+ xvmul.w xr6, xr6, xr1
+ xvadd.w xr5, xr5, xr2
+ xvadd.w xr6, xr6, xr2
+ xvsra.w xr5, xr5, xr3
+ xvsra.w xr6, xr6, xr3
+ xvadd.w xr5, xr5, xr4
+ xvadd.w xr6, xr6, xr4
+ xvssrani.h.w xr6, xr5, 0
+ xvpermi.q xr7, xr6, 0x01
+ xvssrani.bu.h xr7, xr6, 0
+ vst vr7, \dst0, 0
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS32_LASX src0, dst0, w
+.if \w == 16
+ vld vr0, \src0, 0
+ add.d t2, \src0, a3
+ vld vr5, t2, 0
+ xvpermi.q xr0, xr5, 0x02
+.else //w=24/32
+ xvld xr0, \src0, 0
+.endif
+ xvexth.hu.bu xr7, xr0
+ xvexth.wu.hu xr8, xr7
+ xvsllwil.wu.hu xr7, xr7, 0
+ xvsllwil.hu.bu xr5, xr0, 0
+ xvexth.wu.hu xr6, xr5
+ xvsllwil.wu.hu xr5, xr5, 0
+ xvslli.w xr5, xr5, 6
+ xvslli.w xr6, xr6, 6
+ xvslli.w xr7, xr7, 6
+ xvslli.w xr8, xr8, 6
+ xvmul.w xr5, xr5, xr1
+ xvmul.w xr6, xr6, xr1
+ xvmul.w xr7, xr7, xr1
+ xvmul.w xr8, xr8, xr1
+ xvadd.w xr5, xr5, xr2
+ xvadd.w xr6, xr6, xr2
+ xvadd.w xr7, xr7, xr2
+ xvadd.w xr8, xr8, xr2
+ xvsra.w xr5, xr5, xr3
+ xvsra.w xr6, xr6, xr3
+ xvsra.w xr7, xr7, xr3
+ xvsra.w xr8, xr8, xr3
+ xvadd.w xr5, xr5, xr4
+ xvadd.w xr6, xr6, xr4
+ xvadd.w xr7, xr7, xr4
+ xvadd.w xr8, xr8, xr4
+ xvssrani.h.w xr6, xr5, 0
+ xvssrani.h.w xr8, xr7, 0
+ xvssrani.bu.h xr8, xr6, 0
+.if \w == 16
+ vst vr8, \dst0, 0
+ add.d t2, \dst0, a1
+ xvpermi.q xr8, xr8, 0x01
+ vst vr8, t2, 0
+.elseif \w == 24
+ vst vr8, \dst0, 0
+ xvstelm.d xr8, \dst0, 16, 2
+.else
+ xvst xr8, \dst0, 0
+.endif
+.endm
+
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx
+ LOAD_VAR 128
+ srli.w t0, a4, 1
+.LOOP_PIXELS4:
+ vldrepl.w vr0, a2, 0
+ add.d t1, a2, a3
+ vldrepl.w vr5, t1, 0
+ vsllwil.hu.bu vr0, vr0, 0
+ vsllwil.wu.hu vr0, vr0, 0
+ vsllwil.hu.bu vr5, vr5, 0
+ vsllwil.wu.hu vr5, vr5, 0
+ vslli.w vr0, vr0, 6
+ vslli.w vr5, vr5, 6
+ vmul.w vr0, vr0, vr1
+ vmul.w vr5, vr5, vr1
+ vadd.w vr0, vr0, vr2
+ vadd.w vr5, vr5, vr2
+ vsra.w vr0, vr0, vr3
+ vsra.w vr5, vr5, vr3
+ vadd.w vr0, vr0, vr4
+ vadd.w vr5, vr5, vr4
+ vssrani.h.w vr5, vr0, 0
+ vssrani.bu.h vr5, vr5, 0
+ fst.s f5, a0, 0
+ add.d t2, a0, a1
+ vstelm.w vr5, t2, 0, 1
+ alsl.d a2, a3, a2, 1
+ alsl.d a0, a1, a0, 1
+ addi.w t0, t0, -1
+ bnez t0, .LOOP_PIXELS4
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx
+ LOAD_VAR 128
+.LOOP_PIXELS6:
+ HEVC_PEL_UNI_W_PIXELS8_LSX a2, a0, 6
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a4, a4, -1
+ bnez a4, .LOOP_PIXELS6
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx
+ LOAD_VAR 256
+ srli.w t0, a4, 1
+.LOOP_PIXELS6_LASX:
+ HEVC_PEL_UNI_W_PIXELS8x2_LASX a2, a0, 6
+ alsl.d a2, a3, a2, 1
+ alsl.d a0, a1, a0, 1
+ addi.w t0, t0, -1
+ bnez t0, .LOOP_PIXELS6_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx
+ LOAD_VAR 128
+.LOOP_PIXELS8:
+ HEVC_PEL_UNI_W_PIXELS8_LSX a2, a0, 8
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a4, a4, -1
+ bnez a4, .LOOP_PIXELS8
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx
+ LOAD_VAR 256
+ srli.w t0, a4, 1
+.LOOP_PIXELS8_LASX:
+ HEVC_PEL_UNI_W_PIXELS8x2_LASX a2, a0, 8
+ alsl.d a2, a3, a2, 1
+ alsl.d a0, a1, a0, 1
+ addi.w t0, t0, -1
+ bnez t0, .LOOP_PIXELS8_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx
+ LOAD_VAR 128
+.LOOP_PIXELS12:
+ vld vr0, a2, 0
+ vexth.hu.bu vr7, vr0
+ vsllwil.wu.hu vr7, vr7, 0
+ vsllwil.hu.bu vr5, vr0, 0
+ vexth.wu.hu vr6, vr5
+ vsllwil.wu.hu vr5, vr5, 0
+ vslli.w vr5, vr5, 6
+ vslli.w vr6, vr6, 6
+ vslli.w vr7, vr7, 6
+ vmul.w vr5, vr5, vr1
+ vmul.w vr6, vr6, vr1
+ vmul.w vr7, vr7, vr1
+ vadd.w vr5, vr5, vr2
+ vadd.w vr6, vr6, vr2
+ vadd.w vr7, vr7, vr2
+ vsra.w vr5, vr5, vr3
+ vsra.w vr6, vr6, vr3
+ vsra.w vr7, vr7, vr3
+ vadd.w vr5, vr5, vr4
+ vadd.w vr6, vr6, vr4
+ vadd.w vr7, vr7, vr4
+ vssrani.h.w vr6, vr5, 0
+ vssrani.h.w vr7, vr7, 0
+ vssrani.bu.h vr7, vr6, 0
+ fst.d f7, a0, 0
+ vstelm.w vr7, a0, 8, 2
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a4, a4, -1
+ bnez a4, .LOOP_PIXELS12
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx
+ LOAD_VAR 256
+.LOOP_PIXELS12_LASX:
+ vld vr0, a2, 0
+ xvpermi.d xr0, xr0, 0xd8
+ xvsllwil.hu.bu xr0, xr0, 0
+ xvexth.wu.hu xr6, xr0
+ xvsllwil.wu.hu xr5, xr0, 0
+ xvslli.w xr5, xr5, 6
+ xvslli.w xr6, xr6, 6
+ xvmul.w xr5, xr5, xr1
+ xvmul.w xr6, xr6, xr1
+ xvadd.w xr5, xr5, xr2
+ xvadd.w xr6, xr6, xr2
+ xvsra.w xr5, xr5, xr3
+ xvsra.w xr6, xr6, xr3
+ xvadd.w xr5, xr5, xr4
+ xvadd.w xr6, xr6, xr4
+ xvssrani.h.w xr6, xr5, 0
+ xvpermi.q xr7, xr6, 0x01
+ xvssrani.bu.h xr7, xr6, 0
+ fst.d f7, a0, 0
+ vstelm.w vr7, a0, 8, 2
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a4, a4, -1
+ bnez a4, .LOOP_PIXELS12_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx
+ LOAD_VAR 128
+.LOOP_PIXELS16:
+ HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a4, a4, -1
+ bnez a4, .LOOP_PIXELS16
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx
+ LOAD_VAR 256
+ srli.w t0, a4, 1
+.LOOP_PIXELS16_LASX:
+ HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 16
+ alsl.d a2, a3, a2, 1
+ alsl.d a0, a1, a0, 1
+ addi.w t0, t0, -1
+ bnez t0, .LOOP_PIXELS16_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx
+ LOAD_VAR 128
+.LOOP_PIXELS24:
+ HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+ addi.d t0, a2, 16
+ addi.d t1, a0, 16
+ HEVC_PEL_UNI_W_PIXELS8_LSX t0, t1, 8
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a4, a4, -1
+ bnez a4, .LOOP_PIXELS24
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx
+ LOAD_VAR 256
+.LOOP_PIXELS24_LASX:
+ HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 24
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a4, a4, -1
+ bnez a4, .LOOP_PIXELS24_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx
+ LOAD_VAR 128
+.LOOP_PIXELS32:
+ HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+ addi.d t0, a2, 16
+ addi.d t1, a0, 16
+ HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a4, a4, -1
+ bnez a4, .LOOP_PIXELS32
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx
+ LOAD_VAR 256
+.LOOP_PIXELS32_LASX:
+ HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a4, a4, -1
+ bnez a4, .LOOP_PIXELS32_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx
+ LOAD_VAR 128
+.LOOP_PIXELS48:
+ HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+ addi.d t0, a2, 16
+ addi.d t1, a0, 16
+ HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+ addi.d t0, a2, 32
+ addi.d t1, a0, 32
+ HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a4, a4, -1
+ bnez a4, .LOOP_PIXELS48
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx
+ LOAD_VAR 256
+.LOOP_PIXELS48_LASX:
+ HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
+ addi.d t0, a2, 32
+ addi.d t1, a0, 32
+ HEVC_PEL_UNI_W_PIXELS16_LASX t0, t1
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a4, a4, -1
+ bnez a4, .LOOP_PIXELS48_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx
+ LOAD_VAR 128
+.LOOP_PIXELS64:
+ HEVC_PEL_UNI_W_PIXELS16_LSX a2, a0
+ addi.d t0, a2, 16
+ addi.d t1, a0, 16
+ HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+ addi.d t0, a2, 32
+ addi.d t1, a0, 32
+ HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+ addi.d t0, a2, 48
+ addi.d t1, a0, 48
+ HEVC_PEL_UNI_W_PIXELS16_LSX t0, t1
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a4, a4, -1
+ bnez a4, .LOOP_PIXELS64
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx
+ LOAD_VAR 256
+.LOOP_PIXELS64_LASX:
+ HEVC_PEL_UNI_W_PIXELS32_LASX a2, a0, 32
+ addi.d t0, a2, 32
+ addi.d t1, a0, 32
+ HEVC_PEL_UNI_W_PIXELS32_LASX t0, t1, 32
+ add.d a2, a2, a3
+ add.d a0, a0, a1
+ addi.w a4, a4, -1
+ bnez a4, .LOOP_PIXELS64_LASX
+endfunc
diff --git a/libavcodec/loongarch/hevcdsp_init_loongarch.c b/libavcodec/loongarch/hevcdsp_init_loongarch.c
index a8f753dc86..d0ee99d6b5 100644
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@@ -22,6 +22,7 @@
#include "libavutil/loongarch/cpu.h"
#include "hevcdsp_lsx.h"
+#include "hevcdsp_lasx.h"
void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
{
@@ -160,6 +161,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_lsx;
c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_lsx;
+ c->put_hevc_qpel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+ c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+ c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+ c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+ c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+ c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+ c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+ c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+ c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
+ c->put_hevc_epel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;
+ c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+ c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+ c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+ c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+ c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+ c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+ c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+ c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx;
c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx;
c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx;
@@ -196,4 +217,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->add_residual[3] = ff_hevc_add_residual32x32_8_lsx;
}
}
+
+ if (have_lasx(cpu_flags)) {
+ if (bit_depth == 8) {
+ c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+ c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+ c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+ c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+ c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+ c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+ c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+ c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+
+ c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;
+ c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+ c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+ c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+ c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+ c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+ c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+ c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+ }
+ }
}
diff --git a/libavcodec/loongarch/hevcdsp_lasx.h b/libavcodec/loongarch/hevcdsp_lasx.h
new file mode 100644
index 0000000000..819c3c3ecf
--- /dev/null
+++ b/libavcodec/loongarch/hevcdsp_lasx.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
+#define AVCODEC_LOONGARCH_HEVCDSP_LASX_H
+
+#include "libavcodec/hevcdsp.h"
+
+#define PEL_UNI_W(PEL, DIR, WIDTH) \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lasx(uint8_t *dst, \
+ ptrdiff_t \
+ dst_stride, \
+ const uint8_t *src, \
+ ptrdiff_t \
+ src_stride, \
+ int height, \
+ int denom, \
+ int wx, \
+ int ox, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width)
+
+PEL_UNI_W(pel, pixels, 6);
+PEL_UNI_W(pel, pixels, 8);
+PEL_UNI_W(pel, pixels, 12);
+PEL_UNI_W(pel, pixels, 16);
+PEL_UNI_W(pel, pixels, 24);
+PEL_UNI_W(pel, pixels, 32);
+PEL_UNI_W(pel, pixels, 48);
+PEL_UNI_W(pel, pixels, 64);
+
+#undef PEL_UNI_W
+
+#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
diff --git a/libavcodec/loongarch/hevcdsp_lsx.h b/libavcodec/loongarch/hevcdsp_lsx.h
index ac509984fd..0d724a90ef 100644
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@@ -232,4 +232,31 @@ void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t s
void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+#define PEL_UNI_W(PEL, DIR, WIDTH) \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
+ ptrdiff_t \
+ dst_stride, \
+ const uint8_t *src, \
+ ptrdiff_t \
+ src_stride, \
+ int height, \
+ int denom, \
+ int wx, \
+ int ox, \
+ intptr_t mx, \
+ intptr_t my, \
+ int width)
+
+PEL_UNI_W(pel, pixels, 4);
+PEL_UNI_W(pel, pixels, 6);
+PEL_UNI_W(pel, pixels, 8);
+PEL_UNI_W(pel, pixels, 12);
+PEL_UNI_W(pel, pixels, 16);
+PEL_UNI_W(pel, pixels, 24);
+PEL_UNI_W(pel, pixels, 32);
+PEL_UNI_W(pel, pixels, 48);
+PEL_UNI_W(pel, pixels, 64);
+
+#undef PEL_UNI_W
+
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H
--
2.20.1