[FFmpeg-devel] [PATCH] avcodec/x86: add cfhdenc SIMD

Paul B Mahol onemda at gmail.com
Wed Feb 24 18:41:45 EET 2021


Signed-off-by: Paul B Mahol <onemda at gmail.com>
---
 libavcodec/cfhdencdsp.c          |   3 +
 libavcodec/x86/Makefile          |   2 +
 libavcodec/x86/cfhdencdsp.asm    | 429 +++++++++++++++++++++++++++++++
 libavcodec/x86/cfhdencdsp_init.c |  48 ++++
 4 files changed, 482 insertions(+)
 create mode 100644 libavcodec/x86/cfhdencdsp.asm
 create mode 100644 libavcodec/x86/cfhdencdsp_init.c

diff --git a/libavcodec/cfhdencdsp.c b/libavcodec/cfhdencdsp.c
index 0becb76d1d..b979e9e09a 100644
--- a/libavcodec/cfhdencdsp.c
+++ b/libavcodec/cfhdencdsp.c
@@ -73,4 +73,7 @@ av_cold void ff_cfhdencdsp_init(CFHDEncDSPContext *c)
 {
     c->horiz_filter = horiz_filter;
     c->vert_filter = vert_filter;
+
+    if (ARCH_X86)
+        ff_cfhdencdsp_init_x86(c);
 }
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 884dc0c759..6361161180 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -51,6 +51,7 @@ OBJS-$(CONFIG_ALAC_DECODER)            += x86/alacdsp_init.o
 OBJS-$(CONFIG_APNG_DECODER)            += x86/pngdsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_CFHD_DECODER)            += x86/cfhddsp_init.o
+OBJS-$(CONFIG_CFHD_ENCODER)            += x86/cfhdencdsp_init.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
 OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
@@ -154,6 +155,7 @@ X86ASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o
 X86ASM-OBJS-$(CONFIG_ALAC_DECODER)     += x86/alacdsp.o
 X86ASM-OBJS-$(CONFIG_APNG_DECODER)     += x86/pngdsp.o
 X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o
+X86ASM-OBJS-$(CONFIG_CFHD_ENCODER)     += x86/cfhdencdsp.o
 X86ASM-OBJS-$(CONFIG_CFHD_DECODER)     += x86/cfhddsp.o
 X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
 X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o                \
diff --git a/libavcodec/x86/cfhdencdsp.asm b/libavcodec/x86/cfhdencdsp.asm
new file mode 100644
index 0000000000..1753b7829a
--- /dev/null
+++ b/libavcodec/x86/cfhdencdsp.asm
@@ -0,0 +1,429 @@
+;******************************************************************************
+;* x86-optimized functions for the CFHD encoder
+;* Copyright (c) 2021 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+; pmaddwd weights: every pair (a, b) of adjacent words gives one dword w0*a + w1*b
+pw_p1_n1:  dw  1, -1, 1, -1, 1, -1, 1, -1          ; a - b
+pw_n1_p1:  dw  -1, 1, -1, 1, -1, 1, -1, 1          ; b - a
+pw_p5_n11: dw  5, -11, 5, -11, 5, -11, 5, -11      ; 5a - 11b
+pw_n5_p11: dw -5, 11, -5, 11, -5, 11, -5, 11       ; 11b - 5a
+pw_p11_n5: dw 11, -5, 11, -5, 11, -5, 11, -5       ; 11a - 5b
+pw_n11_p5: dw -11, 5, -11, 5, -11, 5, -11, 5       ; 5b - 11a
+pd_4:  times 4 dd  4                               ; rounding term added before the >> 3
+pw_p4: times 8 dw  4                               ; 4a + 4b
+pw_n4: times 8 dw -4                               ; -4a - 4b
+pw_p1: times 8 dw  1                               ; a + b
+pw_n1: times 8 dw -1                               ; -a - b
+
+SECTION .text
+
+%if ARCH_X86_64
+;------------------------------------------------------------------------------
+; void ff_cfhdenc_horiz_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
+;                                   ptrdiff_t in_stride, ptrdiff_t low_stride,
+;                                   ptrdiff_t high_stride, int width, int height)
+;
+; Splits each row of `width` int16 samples into a low band (pair sums) and a
+; high band of width/2 samples each, every result saturated to int16:
+;   low[k]     = in[2k] + in[2k+1]
+;   high[0]    = (5*in[0] - 11*in[1] + 4*in[2] + 4*in[3] - in[4] - in[5] + 4) >> 3
+;   high[k]    = ((in[2k+2] + in[2k+3] - in[2k-2] - in[2k-1] + 4) >> 3)
+;                + in[2k] - in[2k+1]
+;   high[last] = (11*in[w-2] - 5*in[w-1] - 4*in[w-3] - 4*in[w-4]
+;                 + in[w-5] + in[w-6] + 4) >> 3
+; The three strides arrive in int16 elements and are scaled to bytes on entry.
+; The vector loop always runs at least once and works on whole 16-byte output
+; vectors, so it can read and write past the end of a row.
+; NOTE(review): presumably callers pad their buffers for this - confirm.
+;------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cfhdenc_horiz_filter, 8, 10, 11, input, low, high, istride, lwidth, hwidth, width, y, x, temp
+    ; strides are ptrdiff_t: shift the full 64-bit registers, not the low halves
+    shl  istrideq, 1
+    shl   lwidthq, 1
+    shl   hwidthq, 1
+    ; width/height are int: the upper halves of their registers are undefined
+    movsxdifnidn widthq, widthd
+    movsxdifnidn     yq, yd
+    mova       m7, [pd_4]
+    mova       m8, [pw_p1]
+    mova       m9, [pw_n1]
+    mova       m10,[pw_p1_n1]
+    neg        yq                          ; count rows from -height up to zero
+
+.looph:
+    ; left edge, scalar: low[0] = in[0] + in[1]
+    movsx          xq, word [inputq]
+    movsx       tempq, word [inputq + 2]
+    add         tempq, xq
+    movd          xm0, tempd               ; packssdw clamps the 32-bit value to int16
+    packssdw       m0, m0
+    pextrw      tempd, xm0, 0
+    mov   word [lowq], tempw
+    ; high[0] = (5*in[0] - 11*in[1] + 4*in[2] + 4*in[3] - in[4] - in[5] + 4) >> 3
+    movsx          xq, word [inputq]
+    imul           xq, 5
+    movsx       tempq, word [inputq + 2]
+    imul        tempq, -11
+    add         tempq, xq
+    movsx          xq, word [inputq + 4]
+    imul           xq, 4
+    add         tempq, xq
+    movsx          xq, word [inputq + 6]
+    imul           xq, 4
+    add         tempq, xq
+    movsx          xq, word [inputq + 8]
+    imul           xq, -1
+    add         tempq, xq
+    movsx          xq, word [inputq + 10]
+    imul           xq, -1
+    add         tempq, xq
+    add         tempq, 4
+    sar         tempq, 3
+    movd          xm0, tempd
+    packssdw       m0, m0
+    pextrw      tempd, xm0, 0
+    mov  word [highq], tempw
+
+    ; xq = byte offset into the low/high rows (input offset is 2*xq); skip output 0
+    mov            xq, 2
+.loopw:
+    ; eight low outputs: in[2k] + in[2k+1]
+    movu           m0, [inputq + xq * 2]
+    movu           m1, [inputq + xq * 2 + mmsize]
+    pmaddwd        m0, m8
+    pmaddwd        m1, m8
+    packssdw       m0, m1
+    movu    [lowq+xq], m0
+    ; eight high outputs, first term: -(in[2k-2] + in[2k-1])
+    movu           m2, [inputq + xq * 2 - 4]
+    movu           m3, [inputq + xq * 2 - 4 + mmsize]
+    pmaddwd        m2, m9
+    pmaddwd        m3, m9
+    ; add in[2k+2] + in[2k+3], round and shift right by 3
+    movu           m0, [inputq + xq * 2 + 4]
+    movu           m1, [inputq + xq * 2 + 4 + mmsize]
+    pmaddwd        m0, m8
+    pmaddwd        m1, m8
+    paddd          m0, m2
+    paddd          m1, m3
+    paddd          m0, m7
+    paddd          m1, m7
+    psrad          m0, 3
+    psrad          m1, 3
+    ; add the unshifted difference in[2k] - in[2k+1], then saturate to int16
+    movu           m5, [inputq + xq * 2 + 0]
+    movu           m6, [inputq + xq * 2 + mmsize]
+    pmaddwd        m5, m10
+    pmaddwd        m6, m10
+    paddd          m0, m5
+    paddd          m1, m6
+    packssdw       m0, m1
+    movu   [highq+xq], m0
+    add            xq, mmsize
+    cmp            xq, widthq
+    jl .loopw
+
+    ; point all three rows at their end: width bytes of output, 2*width of input
+    add          lowq, widthq
+    add         highq, widthq
+    add        inputq, widthq
+    add        inputq, widthq
+    ; right edge, scalar: low[last] = in[w-2] + in[w-1]
+    movsx          xq, word [inputq - 4]
+    movsx       tempq, word [inputq - 2]
+    add         tempq, xq
+    movd          xm0, tempd
+    packssdw       m0, m0
+    pextrw      tempd, xm0, 0
+    mov word [lowq-2], tempw
+    ; high[last] = (11*in[w-2] - 5*in[w-1] - 4*in[w-3] - 4*in[w-4] + in[w-5] + in[w-6] + 4) >> 3
+    movsx       tempq, word [inputq - 4]
+    imul        tempq, 11
+    movsx          xq, word [inputq - 2]
+    imul           xq, -5
+    add         tempq, xq
+    movsx          xq, word [inputq - 6]
+    imul           xq, -4
+    add         tempq, xq
+    movsx          xq, word [inputq - 8]
+    imul           xq, -4
+    add         tempq, xq
+    movsx          xq, word [inputq - 10]
+    add         tempq, xq
+    movsx          xq, word [inputq - 12]
+    add         tempq, xq
+    add         tempq, 4
+    sar         tempq, 3
+    movd          xm0, tempd
+    packssdw       m0, m0
+    pextrw      tempd, xm0, 0
+    mov word [highq-2], tempw
+
+    ; rewind to the row start, then step every pointer by its own byte stride
+    sub        inputq, widthq
+    sub        inputq, widthq
+    sub         highq, widthq
+    sub          lowq, widthq
+    add          lowq, lwidthq
+    add         highq, hwidthq
+    add        inputq, istrideq
+    add            yq, 1
+    jl .looph
+
+    RET
+%endif
+
+%if ARCH_X86_64
+;------------------------------------------------------------------------------
+; void ff_cfhdenc_vert_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
+;                                  ptrdiff_t in_stride, ptrdiff_t low_stride,
+;                                  ptrdiff_t high_stride, int width, int height)
+;
+; Splits the input vertically into a low and a high band, eight int16 columns
+; per iteration. With r[y] the input row y and h = height, every result is
+; saturated to int16:
+;   low[j]     = r[2j] + r[2j+1]
+;   high[0]    = (5*r[0] - 11*r[1] + 4*r[2] + 4*r[3] - r[4] - r[5] + 4) >> 3
+;   high[j]    = ((r[2j+2] + r[2j+3] - r[2j-2] - r[2j-1] + 4) >> 3)
+;                + r[2j] - r[2j+1]
+;   high[last] = (11*r[h-2] - 5*r[h-1] - 4*r[h-3] - 4*r[h-4]
+;                 + r[h-5] + r[h-6] + 4) >> 3
+; yq counts input rows in steps of two; output row y/2 is addressed at byte
+; offset y * stride, so the low/high strides are used without scaling.
+; NOTE(review): the row loop runs at least once and the bottom edge expects yq
+; to end at height - 2, so presumably height is even and > 4 - confirm.
+;------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cfhdenc_vert_filter, 8, 11, 12, input, low, high, istride, lwidth, hwidth, width, height, x, y, pos
+    ; in_stride is ptrdiff_t: scale the whole 64-bit register to bytes
+    shl  istrideq, 1
+    ; width is int: the 32-bit shift also zeroes the undefined upper half
+    shl    widthd, 1
+    ; height is int: sign-extend it before the 64-bit compare in .looph
+    movsxdifnidn heightq, heightd
+    sub   heightq, 2
+    xor        xq, xq
+
+    mova       m7, [pd_4]
+    mova       m8, [pw_p1]
+    mova       m9, [pw_n1]
+    mova       m10,[pw_p1_n1]
+    mova       m11,[pw_n1_p1]
+
+.loopw:
+    mov        yq, 2
+    ; top edge: low[0] = r[0] + r[1] (saturating word add)
+    mov      posq, xq
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    paddsw     m0, m1
+    movu    [lowq + xq], m0
+
+    ; load r[0] .. r[5] into m0 .. m5
+    mov      posq, xq
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    add      posq, istrideq
+    movu       m2, [inputq + posq]
+    add      posq, istrideq
+    movu       m3, [inputq + posq]
+    add      posq, istrideq
+    movu       m4, [inputq + posq]
+    add      posq, istrideq
+    movu       m5, [inputq + posq]
+
+    ; pair up rows for pmaddwd: punpcklwd yields (r[a], r[a+1]) for columns 0-3;
+    ; punpckhwd has its operands swapped, so columns 4-7 hold (r[a+1], r[a])
+    mova       m6, m0
+    punpcklwd  m0, m1
+    punpckhwd  m1, m6
+    mova       m6, m2
+    punpcklwd  m2, m3
+    punpckhwd  m3, m6
+    mova       m6, m4
+    punpcklwd  m4, m5
+    punpckhwd  m5, m6
+
+    ; high[0] = (5*r[0] - 11*r[1] + 4*r[2] + 4*r[3] - r[4] - r[5] + 4) >> 3
+    pmaddwd    m0, [pw_p5_n11]
+    pmaddwd    m1, [pw_n11_p5]
+    pmaddwd    m2, [pw_p4]
+    pmaddwd    m3, [pw_p4]
+    pmaddwd    m4, m9
+    pmaddwd    m5, m9
+    paddd      m0, m2
+    paddd      m1, m3
+    paddd      m0, m4
+    paddd      m1, m5
+    paddd      m0, m7
+    paddd      m1, m7
+    psrad      m0, 3
+    psrad      m1, 3
+    packssdw   m0, m1
+    movu   [highq + xq], m0
+
+.looph:
+    ; low[y/2] = r[y] + r[y+1]
+    mov      posq, istrideq
+    imul     posq, yq
+    add      posq, xq
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    paddsw     m0, m1
+    ; low row y/2 starts at byte offset y * low_stride
+    mov      posq, lwidthq
+    imul     posq, yq
+    add      posq, xq
+    movu    [lowq + posq], m0
+
+    ; load r[y-2] .. r[y+3] into m0 .. m5 (yq is restored right after)
+    add        yq, -2
+    mov      posq, istrideq
+    imul     posq, yq
+    add      posq, xq
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    add      posq, istrideq
+    movu       m2, [inputq + posq]
+    add      posq, istrideq
+    movu       m3, [inputq + posq]
+    add      posq, istrideq
+    movu       m4, [inputq + posq]
+    add      posq, istrideq
+    movu       m5, [inputq + posq]
+    add        yq, 2
+
+    ; pair up rows: (r[a], r[a+1]) for columns 0-3, (r[a+1], r[a]) for columns 4-7
+    mova       m6, m0
+    punpcklwd  m0, m1
+    punpckhwd  m1, m6
+    mova       m6, m2
+    punpcklwd  m2, m3
+    punpckhwd  m3, m6
+    mova       m6, m4
+    punpcklwd  m4, m5
+    punpckhwd  m5, m6
+
+    ; (r[y+2] + r[y+3] - r[y-2] - r[y-1] + 4) >> 3 in m0/m1, r[y] - r[y+1] in m2/m3
+    pmaddwd    m0, m9
+    pmaddwd    m1, m9
+    pmaddwd    m2, m10
+    pmaddwd    m3, m11
+    pmaddwd    m4, m8
+    pmaddwd    m5, m8
+    paddd      m0, m4
+    paddd      m1, m5
+    paddd      m0, m7
+    paddd      m1, m7
+    psrad      m0, 3
+    psrad      m1, 3
+    ; the difference term is added unshifted, then everything saturates to int16
+    paddd      m0, m2
+    paddd      m1, m3
+    packssdw   m0, m1
+    ; high row y/2 starts at byte offset y * high_stride
+    mov      posq, hwidthq
+    imul     posq, yq
+    add      posq, xq
+    movu   [highq + posq], m0
+
+    add        yq, 2
+    cmp        yq, heightq
+    jl .looph
+
+    ; bottom edge, yq is the first row of the final pair: low[last] = r[y] + r[y+1]
+    mov      posq, istrideq
+    imul     posq, yq
+    add      posq, xq
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    paddsw     m0, m1
+    mov      posq, lwidthq
+    imul     posq, yq
+    add      posq, xq
+    movu    [lowq + posq], m0
+
+    ; load the last six rows r[y-4] .. r[y+1] into m0 .. m5
+    sub        yq, 4
+    mov      posq, istrideq
+    imul     posq, yq
+    add      posq, xq
+    movu       m0, [inputq + posq]
+    add      posq, istrideq
+    movu       m1, [inputq + posq]
+    add      posq, istrideq
+    movu       m2, [inputq + posq]
+    add      posq, istrideq
+    movu       m3, [inputq + posq]
+    add      posq, istrideq
+    movu       m4, [inputq + posq]
+    add      posq, istrideq
+    movu       m5, [inputq + posq]
+    add        yq, 4
+
+    ; pair up rows: (r[a], r[a+1]) for columns 0-3, (r[a+1], r[a]) for columns 4-7
+    mova       m6, m0
+    punpcklwd  m0, m1
+    punpckhwd  m1, m6
+    mova       m6, m2
+    punpcklwd  m2, m3
+    punpckhwd  m3, m6
+    mova       m6, m4
+    punpcklwd  m4, m5
+    punpckhwd  m5, m6
+
+    ; high[last]: weights (1, 1), (-4, -4) and (11, -5) on the three row pairs
+    pmaddwd    m0, m8
+    pmaddwd    m1, m8
+    pmaddwd    m2, [pw_n4]
+    pmaddwd    m3, [pw_n4]
+    pmaddwd    m4, [pw_p11_n5]
+    pmaddwd    m5, [pw_n5_p11]
+    paddd      m4, m2
+    paddd      m5, m3
+    paddd      m4, m0
+    paddd      m5, m1
+    paddd      m4, m7
+    paddd      m5, m7
+    psrad      m4, 3
+    psrad      m5, 3
+    packssdw   m4, m5
+
+    ; the last high row again sits at byte offset y * high_stride
+    mov      posq, hwidthq
+    imul     posq, yq
+    add      posq, xq
+    movu   [highq + posq], m4
+
+    ; move on to the next eight columns (xq and widthq are byte offsets)
+    add        xq, mmsize
+    cmp        xq, widthq
+    jl .loopw
+    RET
+%endif
diff --git a/libavcodec/x86/cfhdencdsp_init.c b/libavcodec/x86/cfhdencdsp_init.c
new file mode 100644
index 0000000000..28f1dd504d
--- /dev/null
+++ b/libavcodec/x86/cfhdencdsp_init.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/cfhdencdsp.h"
+
+void ff_cfhdenc_horiz_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
+                                  ptrdiff_t in_stride, ptrdiff_t low_stride,
+                                  ptrdiff_t high_stride,
+                                  int width, int height);
+void ff_cfhdenc_vert_filter_sse2(int16_t *input, int16_t *low, int16_t *high,
+                                 ptrdiff_t in_stride, ptrdiff_t low_stride,
+                                 ptrdiff_t high_stride,
+                                 int width, int height);
+
+av_cold void ff_cfhdencdsp_init_x86(CFHDEncDSPContext *c)
+{
+#if ARCH_X86_64
+    /* Query the flags inside the guard: no unused variable on 32-bit x86. */
+    int cpu_flags = av_get_cpu_flags();
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->horiz_filter = ff_cfhdenc_horiz_filter_sse2;
+        c->vert_filter = ff_cfhdenc_vert_filter_sse2;
+    }
+#endif
+}
-- 
2.17.1



More information about the ffmpeg-devel mailing list