[FFmpeg-devel] [PATCH] avcodec/cfhd: add x86 SIMD

Mon Aug 17 16:30:38 EEST 2020

Quoting Paul B Mahol (2020-08-14 14:24:25)
>From 874fd9e604a6dcd55cca77c7256a633e5739da77 Mon Sep 17 00:00:00 2001
>From: Paul B Mahol <onemda at gmail.com>
>Date: Sun, 9 Aug 2020 17:47:34 +0200
>Subject: [PATCH] avcodec/cfhd: add x86 SIMD
>
>---
> libavcodec/Makefile           |   2 +-
> libavcodec/cfhd.c             | 271 ++++++++--------------------------
> libavcodec/cfhd.h             |   3 +
> libavcodec/cfhddsp.c          | 110 ++++++++++++++
> libavcodec/cfhddsp.h          |  42 ++++++
> libavcodec/x86/Makefile       |   2 +
> libavcodec/x86/cfhddsp.asm    | 227 ++++++++++++++++++++++++++++
> libavcodec/x86/cfhddsp_init.c |  44 ++++++
> 8 files changed, 488 insertions(+), 213 deletions(-)
> create mode 100644 libavcodec/cfhddsp.c
> create mode 100644 libavcodec/cfhddsp.h
> create mode 100644 libavcodec/x86/cfhddsp.asm
> create mode 100644 libavcodec/x86/cfhddsp_init.c

It would be way easier to review if the rearrangement of the C code was
separate from adding the x86 SIMD.

Also, checkasm tests would make it easier to test and benchmark.

>
>--- /dev/null
>+++ b/libavcodec/x86/cfhddsp.asm
>@@ -0,0 +1,227 @@
>+;******************************************************************************
>+;* x86-optimized functions for the CFHD decoder
>+;* Copyright (c) 2020 Paul B Mahol
>+;*
>+;* This file is part of FFmpeg.
>+;*
>+;* FFmpeg is free software; you can redistribute it and/or
>+;* modify it under the terms of the GNU Lesser General Public
>+;* License as published by the Free Software Foundation; either
>+;* version 2.1 of the License, or (at your option) any later version.
>+;*
>+;* FFmpeg is distributed in the hope that it will be useful,
>+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>+;* Lesser General Public License for more details.
>+;*
>+;* You should have received a copy of the GNU Lesser General Public
>+;* License along with FFmpeg; if not, write to the Free Software
>+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+;******************************************************************************
>+
>+%include "libavutil/x86/x86util.asm"
>+
>+SECTION_RODATA
>+
>+factor_p1_p1: dw 1,  1, 1,  1, 1,  1, 1,  1,
>+factor_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1,
>+factor_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1,
>+pd_4: times 4 dd 4
>+pw_0: times 8 dw 0
>+pw_1023: times 8 dw 1023
>+pw_4095: times 8 dw 4095
>+
>+SECTION .text
>+
>+%macro CFHD_HORIZ_FILTER 1
>+%if %1 == 1023
>+cglobal cfhd_horiz_filter_clip10, 5, 6, 8, output, low, high, width, bpc
>+    DEFINE_ARGS    output, low, high, width, x, temp
>+%elif %1 == 4095
>+cglobal cfhd_horiz_filter_clip12, 5, 6, 8, output, low, high, width, bpc
>+    DEFINE_ARGS    output, low, high, width, x, temp
>+%else
>+cglobal cfhd_horiz_filter, 4, 6, 8, output, low, high, width, x
>+    DEFINE_ARGS    output, low, high, width, x, temp
>+%endif

I don't think you need DEFINE_ARGS, temp can be added directly to
cglobal invocation.

Also, bpc seems unused.

Then you could call the macro with the bit depth as a parameter and
%define pixel_max ((1 << bpc) - 1)
for extra readability

>+    shl        widthd, 1
>+
>+    movsx          xq, word [lowq]
>+    imul           xq, 11
>+
>+    movsx       tempq, word [lowq + 2]
>+    imul        tempq, -4
>+    add         tempq, xq
>+
>+    movsx          xq, word [lowq + 4]
>+    add         tempq, xq
>+    add         tempq, 4
>+    sar         tempq, 3
>+
>+    movsx          xq, word [highq]
>+    add         tempq, xq
>+    sar         tempq, 1
>+
>+%if %1
>+    movd          xm0, tempd
>+    CLIPW          m0, [pw_0], [pw_%1]
>+    pextrw  [outputq], xm0, 0
>+%else
>+    mov  word [outputq], tempw
>+%endif
>+
>+    movsx          xq, word [lowq]
>+    imul           xq, 5
>+
>+    movsx       tempq, word [lowq + 2]
>+    imul        tempq, 4
>+    add         tempq, xq
>+
>+    movsx          xq, word [lowq + 4]
>+    sub         tempq, xq
>+    add         tempq, 4
>+    sar         tempq, 3
>+
>+    movsx          xq, word [highq]
>+    sub         tempq, xq
>+    sar         tempq, 1
>+
>+%if %1
>+    movd          xm0, tempd
>+    CLIPW          m0, [pw_0], [pw_%1]
>+    pextrw [outputq + 2], xm0, 0
>+%else
>+    mov  word [outputq + 2], tempw
>+%endif
>+
>+    mov            xq, 0
>+
>+.loop:
>+    movu           m4, [lowq + xq]
>+    movu           m1, [lowq + xq + 4]
>+
>+    mova           m5, m4
>+    punpcklwd      m4, m1
>+    punpckhwd      m5, m1
>+
>+    mova           m6, m4
>+    mova           m7, m5
>+
>+    pmaddwd        m4, [factor_p1_n1]
>+    pmaddwd        m5, [factor_p1_n1]
>+    pmaddwd        m6, [factor_n1_p1]
>+    pmaddwd        m7, [factor_n1_p1]
>+
>+    paddd          m4, [pd_4]
>+    paddd          m5, [pd_4]
>+    paddd          m6, [pd_4]
>+    paddd          m7, [pd_4]

I'd drop 32bit compatibility and load those constants into registers.

>+
>+    psrad          m4, 3
>+    psrad          m5, 3
>+    psrad          m6, 3
>+    psrad          m7, 3
>+
>+    movu           m2, [lowq + xq + 2]
>+    movu           m3, [highq + xq + 2]
>+
>+    mova           m0, m2
>+    punpcklwd      m2, m3
>+    punpckhwd      m0, m3
>+
>+    mova           m1, m2
>+    mova           m3, m0
>+
>+    pmaddwd        m2, [factor_p1_p1]
>+    pmaddwd        m0, [factor_p1_p1]
>+    pmaddwd        m1, [factor_p1_n1]
>+    pmaddwd        m3, [factor_p1_n1]
>+
>+    paddd          m2, m4
>+    paddd          m0, m5
>+    paddd          m1, m6
>+    paddd          m3, m7
>+
>+    psrad          m2, 1
>+    psrad          m0, 1
>+    psrad          m1, 1
>+    psrad          m3, 1
>+
>+    packssdw       m2, m0
>+    packssdw       m1, m3
>+
>+    mova           m0, m2
>+    punpcklwd      m2, m1
>+    punpckhwd      m0, m1
>+
>+%if %1
>+    CLIPW          m2, [pw_0], [pw_%1]
>+    CLIPW          m0, [pw_0], [pw_%1]
>+%endif
>+
>+    movu  [outputq + xq * 2 + 4], m2
>+    movu  [outputq + xq * 2 + mmsize + 4], m0

Is it guaranteed that this does not write out of bounds? Seems you
assume output to be at least 18 pixels, but I don't see anything in
cfhd.c enforcing that.

-- 
Anton Khirnov