[FFmpeg-devel] [PATCH] avcodec/cfhd: add x86 SIMD

Paul B Mahol onemda at gmail.com
Mon Aug 17 16:48:33 EEST 2020


On 8/17/20, Anton Khirnov <anton at khirnov.net> wrote:
> Quoting Paul B Mahol (2020-08-14 14:24:25)
> >From 874fd9e604a6dcd55cca77c7256a633e5739da77 Mon Sep 17 00:00:00 2001
>>From: Paul B Mahol <onemda at gmail.com>
>>Date: Sun, 9 Aug 2020 17:47:34 +0200
>>Subject: [PATCH] avcodec/cfhd: add x86 SIMD
>>
>>---
>> libavcodec/Makefile           |   2 +-
>> libavcodec/cfhd.c             | 271 ++++++++--------------------------
>> libavcodec/cfhd.h             |   3 +
>> libavcodec/cfhddsp.c          | 110 ++++++++++++++
>> libavcodec/cfhddsp.h          |  42 ++++++
>> libavcodec/x86/Makefile       |   2 +
>> libavcodec/x86/cfhddsp.asm    | 227 ++++++++++++++++++++++++++++
>> libavcodec/x86/cfhddsp_init.c |  44 ++++++
>> 8 files changed, 488 insertions(+), 213 deletions(-)
>> create mode 100644 libavcodec/cfhddsp.c
>> create mode 100644 libavcodec/cfhddsp.h
>> create mode 100644 libavcodec/x86/cfhddsp.asm
>> create mode 100644 libavcodec/x86/cfhddsp_init.c
>
> It would be way easier to review if the rearrangement of the C code was
> separate from adding the x86 SIMD.
>
> Also, checkasm tests would make it easier to test and benchmark.

Unfortunately you are reviewing old version of patch.

>
>>
>>--- /dev/null
>>+++ b/libavcodec/x86/cfhddsp.asm
>>@@ -0,0 +1,227 @@
>>+;******************************************************************************
>>+;* x86-optimized functions for the CFHD decoder
>>+;* Copyright (c) 2020 Paul B Mahol
>>+;*
>>+;* This file is part of FFmpeg.
>>+;*
>>+;* FFmpeg is free software; you can redistribute it and/or
>>+;* modify it under the terms of the GNU Lesser General Public
>>+;* License as published by the Free Software Foundation; either
>>+;* version 2.1 of the License, or (at your option) any later version.
>>+;*
>>+;* FFmpeg is distributed in the hope that it will be useful,
>>+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>>+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>+;* Lesser General Public License for more details.
>>+;*
>>+;* You should have received a copy of the GNU Lesser General Public
>>+;* License along with FFmpeg; if not, write to the Free Software
>>+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>>+;******************************************************************************
>>+
>>+%include "libavutil/x86/x86util.asm"
>>+
>>+SECTION_RODATA
>>+
>>+factor_p1_p1: dw 1,  1, 1,  1, 1,  1, 1,  1,
>>+factor_p1_n1: dw 1, -1, 1, -1, 1, -1, 1, -1,
>>+factor_n1_p1: dw -1, 1, -1, 1, -1, 1, -1, 1,
>>+pd_4: times 4 dd 4
>>+pw_0: times 8 dw 0
>>+pw_1023: times 8 dw 1023
>>+pw_4095: times 8 dw 4095
>>+
>>+SECTION .text
>>+
>>+%macro CFHD_HORIZ_FILTER 1
>>+%if %1 == 1023
>>+cglobal cfhd_horiz_filter_clip10, 5, 6, 8, output, low, high, width, bpc
>>+    DEFINE_ARGS    output, low, high, width, x, temp
>>+%elif %1 == 4095
>>+cglobal cfhd_horiz_filter_clip12, 5, 6, 8, output, low, high, width, bpc
>>+    DEFINE_ARGS    output, low, high, width, x, temp
>>+%else
>>+cglobal cfhd_horiz_filter, 4, 6, 8, output, low, high, width, x
>>+    DEFINE_ARGS    output, low, high, width, x, temp
>>+%endif
>
> I don't think you need DEFINE_ARGS, temp can be added directly to
> cglobal invocation.
>
> Also, bpc seems unused.
>
> Then you could call the macro with the bit depth as a parameter and
> %define pixel_max ((1 << bpc) - 1)
> for extra readability
>
>>+    shl        widthd, 1
>>+
>>+    movsx          xq, word [lowq]
>>+    imul           xq, 11
>>+
>>+    movsx       tempq, word [lowq + 2]
>>+    imul        tempq, -4
>>+    add         tempq, xq
>>+
>>+    movsx          xq, word [lowq + 4]
>>+    add         tempq, xq
>>+    add         tempq, 4
>>+    sar         tempq, 3
>>+
>>+    movsx          xq, word [highq]
>>+    add         tempq, xq
>>+    sar         tempq, 1
>>+
>>+%if %1
>>+    movd          xm0, tempd
>>+    CLIPW          m0, [pw_0], [pw_%1]
>>+    pextrw  [outputq], xm0, 0
>>+%else
>>+    mov  word [outputq], tempw
>>+%endif
>>+
>>+    movsx          xq, word [lowq]
>>+    imul           xq, 5
>>+
>>+    movsx       tempq, word [lowq + 2]
>>+    imul        tempq, 4
>>+    add         tempq, xq
>>+
>>+    movsx          xq, word [lowq + 4]
>>+    sub         tempq, xq
>>+    add         tempq, 4
>>+    sar         tempq, 3
>>+
>>+    movsx          xq, word [highq]
>>+    sub         tempq, xq
>>+    sar         tempq, 1
>>+
>>+%if %1
>>+    movd          xm0, tempd
>>+    CLIPW          m0, [pw_0], [pw_%1]
>>+    pextrw [outputq + 2], xm0, 0
>>+%else
>>+    mov  word [outputq + 2], tempw
>>+%endif
>>+
>>+    mov            xq, 0
>>+
>>+.loop:
>>+    movu           m4, [lowq + xq]
>>+    movu           m1, [lowq + xq + 4]
>>+
>>+    mova           m5, m4
>>+    punpcklwd      m4, m1
>>+    punpckhwd      m5, m1
>>+
>>+    mova           m6, m4
>>+    mova           m7, m5
>>+
>>+    pmaddwd        m4, [factor_p1_n1]
>>+    pmaddwd        m5, [factor_p1_n1]
>>+    pmaddwd        m6, [factor_n1_p1]
>>+    pmaddwd        m7, [factor_n1_p1]
>>+
>>+    paddd          m4, [pd_4]
>>+    paddd          m5, [pd_4]
>>+    paddd          m6, [pd_4]
>>+    paddd          m7, [pd_4]
>
> I'd drop 32bit compatibility and load those constants into registers.
>
>>+
>>+    psrad          m4, 3
>>+    psrad          m5, 3
>>+    psrad          m6, 3
>>+    psrad          m7, 3
>>+
>>+    movu           m2, [lowq + xq + 2]
>>+    movu           m3, [highq + xq + 2]
>>+
>>+    mova           m0, m2
>>+    punpcklwd      m2, m3
>>+    punpckhwd      m0, m3
>>+
>>+    mova           m1, m2
>>+    mova           m3, m0
>>+
>>+    pmaddwd        m2, [factor_p1_p1]
>>+    pmaddwd        m0, [factor_p1_p1]
>>+    pmaddwd        m1, [factor_p1_n1]
>>+    pmaddwd        m3, [factor_p1_n1]
>>+
>>+    paddd          m2, m4
>>+    paddd          m0, m5
>>+    paddd          m1, m6
>>+    paddd          m3, m7
>>+
>>+    psrad          m2, 1
>>+    psrad          m0, 1
>>+    psrad          m1, 1
>>+    psrad          m3, 1
>>+
>>+    packssdw       m2, m0
>>+    packssdw       m1, m3
>>+
>>+    mova           m0, m2
>>+    punpcklwd      m2, m1
>>+    punpckhwd      m0, m1
>>+
>>+%if %1
>>+    CLIPW          m2, [pw_0], [pw_%1]
>>+    CLIPW          m0, [pw_0], [pw_%1]
>>+%endif
>>+
>>+    movu  [outputq + xq * 2 + 4], m2
>>+    movu  [outputq + xq * 2 + mmsize + 4], m0
>
> Is it guaranteed that this does not write out of bounds? Seems you
> assume output to be at least 18 pixels, but I don't see anything in
> cfhd.c enforcing that.

See + 64 in latest posted patch on this mailing list.

>
> --
> Anton Khirnov
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".


More information about the ffmpeg-devel mailing list