[FFmpeg-devel] [PATCH 1/3] diracdec: add 10-bit Haar SIMD functions

James Darnley jdarnley at obe.tv
Thu Jul 26 14:28:06 EEST 2018


Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the
relevant transform.
C:    119fps
SSE2: 204fps
AVX:  206fps
AVX2: 221fps

timer measurements, haar horizontal compose:
    sse2: 3.68x faster (45143 vs. 12279 decicycles) compared with C
    avx:  3.68x faster (45143 vs. 12275 decicycles) compared with C
    avx2: 5.16x faster (45143 vs.  8742 decicycles) compared with C
haar vertical compose:
    sse2: 1.64x faster (31792 vs. 19377 decicycles) compared with C
    avx:  1.58x faster (31792 vs. 20090 decicycles) compared with C
    avx2: 1.66x faster (31792 vs. 19157 decicycles) compared with C
---
 libavcodec/dirac_dwt.c                |   7 +-
 libavcodec/dirac_dwt.h                |   1 +
 libavcodec/x86/Makefile               |   6 +-
 libavcodec/x86/dirac_dwt_10bit.asm    | 160 ++++++++++++++++++++++++++
 libavcodec/x86/dirac_dwt_init_10bit.c |  76 ++++++++++++
 5 files changed, 247 insertions(+), 3 deletions(-)
 create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm
 create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c

diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c
index cc08f8865a..86bee5bb9b 100644
--- a/libavcodec/dirac_dwt.c
+++ b/libavcodec/dirac_dwt.c
@@ -59,8 +59,13 @@ int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type,
         return AVERROR_INVALIDDATA;
     }
 
-    if (ARCH_X86 && bit_depth == 8)
+#if ARCH_X86
+    if (bit_depth == 8)
         ff_spatial_idwt_init_x86(d, type);
+    else if (bit_depth == 10)
+        ff_spatial_idwt_init_10bit_x86(d, type);
+#endif
+
     return 0;
 }
 
diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h
index 994dc21d70..1ad7b9a821 100644
--- a/libavcodec/dirac_dwt.h
+++ b/libavcodec/dirac_dwt.h
@@ -88,6 +88,7 @@ enum dwt_type {
 int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type,
                          int decomposition_count, int bit_depth);
 void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type);
+void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type);
 
 void ff_spatial_idwt_slice2(DWTContext *d, int y);
 
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 2350c8bbee..590d83c167 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -7,7 +7,8 @@ OBJS-$(CONFIG_BLOCKDSP)                += x86/blockdsp_init.o
 OBJS-$(CONFIG_BSWAPDSP)                += x86/bswapdsp_init.o
 OBJS-$(CONFIG_DCT)                     += x86/dct_init.o
 OBJS-$(CONFIG_DIRAC_DECODER)           += x86/diracdsp_init.o           \
-                                          x86/dirac_dwt_init.o
+                                          x86/dirac_dwt_init.o \
+                                          x86/dirac_dwt_init_10bit.o
 OBJS-$(CONFIG_FDCTDSP)                 += x86/fdctdsp_init.o
 OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
 OBJS-$(CONFIG_FLACDSP)                 += x86/flacdsp_init.o
@@ -153,7 +154,8 @@ X86ASM-OBJS-$(CONFIG_APNG_DECODER)     += x86/pngdsp.o
 X86ASM-OBJS-$(CONFIG_CAVS_DECODER)     += x86/cavsidct.o
 X86ASM-OBJS-$(CONFIG_DCA_DECODER)      += x86/dcadsp.o x86/synth_filter.o
 X86ASM-OBJS-$(CONFIG_DIRAC_DECODER)    += x86/diracdsp.o                \
-                                          x86/dirac_dwt.o
+                                          x86/dirac_dwt.o \
+                                          x86/dirac_dwt_10bit.o
 X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER)    += x86/dnxhdenc.o
 X86ASM-OBJS-$(CONFIG_EXR_DECODER)      += x86/exrdsp.o
 X86ASM-OBJS-$(CONFIG_FLAC_DECODER)     += x86/flacdsp.o
diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm
new file mode 100644
index 0000000000..baea91329e
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -0,0 +1,160 @@
+;******************************************************************************
+;* x86 optimized 10-bit discrete wavelet transform
+;* Copyright (c) 2018 James Darnley
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pd_1
+
+SECTION .text
+
+%macro HAAR_VERTICAL 0
+; In-place Haar vertical compose of two dword rows: b0 -= (b1 + 1) >> 1, then b1 += b0.
+cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w
+    DECLARE_REG_TMP 4,5
+
+    mova  m2, [pd_1]              ; external constant, added before the >> 1 (the scalar tail adds 1 there)
+    mov  r3d, wd                  ; keep the unrounded width for the scalar tail
+    and   wd, ~(mmsize/4 - 1)     ; round down to whole vectors of mmsize/4 dwords
+    shl   wd, 2                   ; dword count -> byte count
+    add  b0q, wq                  ; move both pointers past the vector part ...
+    add  b1q, wq
+    neg   wq                      ; ... and index it with a negative offset that counts up to 0
+    ; NOTE(review): the loop below is bottom-tested, so it runs once even when w < mmsize/4; the scalar tail then reprocesses those elements -- confirm callers never pass such a width
+    ALIGN 16
+    .loop_simd:
+        mova m0, [b0q + wq]       ; aligned accesses: both rows must be vector aligned
+        mova m1, [b1q + wq]
+        paddd m3, m1, m2          ; m3 = b1 + 1
+        psrad m3, 1               ; m3 = (b1 + 1) >> 1, arithmetic shift
+        psubd m0, m3              ; b0 -= m3
+        paddd m1, m0              ; b1 += updated b0
+        mova [b0q + wq], m0
+        mova [b1q + wq], m1
+        add wq, mmsize
+    jl .loop_simd                 ; flags come from the add: loop while the offset is still negative
+
+    and  r3d, mmsize/4 - 1        ; elements left over after the whole vectors
+    jz .end
+    .loop_scalar:                 ; same arithmetic, one dword at a time; b0q/b1q already point at the tail
+        mov t0d, [b0q]
+        mov t1d, [b1q]
+        mov r2d, t1d              ; r2 (w) is free to use as scratch here
+        add r2d, 1
+        sar r2d, 1
+        sub t0d, r2d
+        add t1d, t0d
+        mov [b0q], t0d
+        mov [b1q], t1d
+
+        add b0q, 4
+        add b1q, 4
+        sub r3d, 1
+    jg .loop_scalar
+
+    .end:
+RET
+
+%endmacro
+
+%macro HAAR_HORIZONTAL 0
+; Haar horizontal compose of one dword row: low-pass into temp_, then high-pass, (x + 1) >> 1 and interleave back into b.
+cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w, x, b2
+    DECLARE_REG_TMP 2,5
+    %if ARCH_X86_64
+        %define tail r6d
+    %else
+        %define tail dword wm
+    %endif
+    ; tail holds w/2 for the scalar remainder: a 7th register on x86-64, otherwise the memory form of the w argument
+    mova m2, [pd_1]               ; external constant, used as the +1 rounding term (the scalar tail adds 1 there)
+    xor xd, xd                    ; x = 0, element index
+    shr wd, 1                     ; w = w / 2, length of each half of the row
+    mov tail, wd
+    lea b2q, [bq + 4*wq]          ; b2 = second half of b
+    ; NOTE(review): .loop_lo is bottom-tested and steps by whole vectors, so it covers w/2 rounded up and touches b/b2/temp_ past w/2 -- assumes padded buffers, confirm
+    ALIGN 16
+    .loop_lo:                     ; temp_[x] = b[x] - ((b2[x] + 1) >> 1)
+        mova m0, [bq  + 4*xq]
+        movu m1, [b2q + 4*xq]     ; unaligned load: b2's offset depends on w
+        paddd m1, m2
+        psrad m1, 1
+        psubd m0, m1
+        mova [temp_q + 4*xq], m0
+        add xd, mmsize/4
+        cmp xd, wd
+    jl .loop_lo
+
+    xor xd, xd
+    and wd, ~(mmsize/4 - 1)       ; the vector pass below handles only whole vectors of w/2
+    ; NOTE(review): .loop_hi is bottom-tested too; if the rounded-down width is 0 it still runs once and leaves x at mmsize/4 for the scalar tail -- confirm w/2 >= mmsize/4
+    ALIGN 16
+    .loop_hi:
+        mova m0, [temp_q + 4*xq]
+        movu m1, [b2q    + 4*xq]
+        paddd m1, m0              ; m1 = b2[x] + temp_[x]
+        paddd m0, m2
+        paddd m1, m2
+        psrad m0, 1               ; m0 = (temp_[x] + 1) >> 1
+        psrad m1, 1               ; m1 = (b2[x] + temp_[x] + 1) >> 1
+        SBUTTERFLY dq, 0,1,3      ; interleave dwords of m0/m1 (m3 is scratch); the scalar tail shows the layout b[2x], b[2x+1]
+        %if cpuflag(avx2)
+            SBUTTERFLY dqqq, 0,1,3 ; presumably restores element order across the two 128-bit lanes -- confirm against x86util.asm
+        %endif
+        mova [bq + 8*xq], m0      ; two output vectors per input vector, hence the 8*x byte offset
+        mova [bq + 8*xq + mmsize], m1
+        add xd, mmsize/4
+        cmp xd, wd
+    jl .loop_hi
+
+    and tail, mmsize/4 - 1        ; elements of w/2 left over after the whole vectors
+    jz .end
+    .loop_scalar:                 ; continues from the x reached by .loop_hi
+        mov t0d, [temp_q + 4*xq]  ; t0 shares r2 with w, which is no longer needed here
+        mov t1d, [b2q    + 4*xq]
+        add t1d, t0d
+        add t0d, 1
+        add t1d, 1
+        sar t0d, 1
+        sar t1d, 1
+        mov [bq + 8*xq], t0d
+        mov [bq + 8*xq + 4], t1d
+        add  xq, 1
+        sub tail, 1
+    jg .loop_scalar
+
+    .end:
+REP_RET
+
+%endmacro
+; Assemble both macros once per instruction set; INIT_XMM/INIT_YMM choose the vector register width used for mmsize.
+INIT_XMM sse2
+HAAR_HORIZONTAL
+HAAR_VERTICAL
+
+INIT_XMM avx
+HAAR_HORIZONTAL
+HAAR_VERTICAL
+
+INIT_YMM avx2
+HAAR_HORIZONTAL
+HAAR_VERTICAL
diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c
new file mode 100644
index 0000000000..289862d728
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt_init_10bit.c
@@ -0,0 +1,76 @@
+/*
+ * x86 optimized discrete wavelet transform
+ * Copyright (c) 2018 James Darnley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dirac_dwt.h"
+/* Implemented in dirac_dwt_10bit.asm, where the horizontal arguments are named (b, temp_, w). */
+void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align);
+void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align);
+void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align);
+/* Vertical variants: the asm names the arguments (b0, b1, w) and has its own scalar loop for a non-vector-multiple width. */
+void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align);
+void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align);
+void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align);
+/* Install the 10-bit x86 Haar compose functions on d; DWT types other than HAAR0/HAAR1 are left untouched. */
+av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+    /* The checks run SSE2, then AVX, then AVX2, so a later match overrides an earlier one. */
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        switch (type) {
+            case DWT_DIRAC_HAAR0: /* vertical only; NOTE(review): the asm horizontal always applies (x + 1) >> 1, presumably matching HAAR1 only -- confirm */
+                d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_sse2;
+                break;
+            case DWT_DIRAC_HAAR1:
+                d->horizontal_compose = (void*)ff_horizontal_compose_haar_10bit_sse2;
+                d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_sse2;
+                break;
+        }
+    }
+
+    if (EXTERNAL_AVX(cpu_flags)) {
+        switch (type) {
+            case DWT_DIRAC_HAAR0:
+                d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_avx;
+                break;
+            case DWT_DIRAC_HAAR1:
+                d->horizontal_compose = (void*)ff_horizontal_compose_haar_10bit_avx;
+                d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_avx;
+                break;
+        }
+    }
+
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        switch (type) {
+            case DWT_DIRAC_HAAR0:
+                d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_avx2;
+                break;
+            case DWT_DIRAC_HAAR1:
+                d->horizontal_compose = (void*)ff_horizontal_compose_haar_10bit_avx2;
+                d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_avx2;
+                break;
+        }
+    }
+
+#endif // HAVE_X86ASM
+}
-- 
2.18.0



More information about the ffmpeg-devel mailing list