[FFmpeg-devel] [PATCH] avfilter/vf_transpose: add x86 SIMD

James Almer jamrial at gmail.com
Mon Oct 21 20:01:42 EEST 2019


On 10/21/2019 12:45 PM, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda at gmail.com>
> ---
>  libavfilter/transpose.h             |  10 +++
>  libavfilter/vf_transpose.c          |  18 ++---
>  libavfilter/x86/Makefile            |   2 +
>  libavfilter/x86/vf_transpose.asm    | 104 ++++++++++++++++++++++++++++
>  libavfilter/x86/vf_transpose_init.c |  49 +++++++++++++
>  5 files changed, 174 insertions(+), 9 deletions(-)
>  create mode 100644 libavfilter/x86/vf_transpose.asm
>  create mode 100644 libavfilter/x86/vf_transpose_init.c
> 
> diff --git a/libavfilter/transpose.h b/libavfilter/transpose.h
> index aa262b9487..f73a42864f 100644
> --- a/libavfilter/transpose.h
> +++ b/libavfilter/transpose.h
> @@ -34,4 +34,14 @@ enum TransposeDir {
>      TRANSPOSE_VFLIP,
>  };
>  
> +typedef struct TransVtable {
> +    void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize,
> +                          uint8_t *dst, ptrdiff_t dst_linesize);
> +    void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize,
> +                            uint8_t *dst, ptrdiff_t dst_linesize,
> +                            int w, int h);
> +} TransVtable;
> +
> +void ff_transpose_init_x86(TransVtable *v, int pixstep);
> +
>  #endif
> diff --git a/libavfilter/vf_transpose.c b/libavfilter/vf_transpose.c
> index dd54947bd9..16ac6c311a 100644
> --- a/libavfilter/vf_transpose.c
> +++ b/libavfilter/vf_transpose.c
> @@ -40,14 +40,6 @@
>  #include "video.h"
>  #include "transpose.h"
>  
> -typedef struct TransVtable {
> -    void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize,
> -                          uint8_t *dst, ptrdiff_t dst_linesize);
> -    void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize,
> -                            uint8_t *dst, ptrdiff_t dst_linesize,
> -                            int w, int h);
> -} TransVtable;
> -
>  typedef struct TransContext {
>      const AVClass *class;
>      int hsub, vsub;
> @@ -243,7 +235,15 @@ static int config_props_output(AVFilterLink *outlink)
>          }
>      }
>  
> -    av_log(ctx, AV_LOG_VERBOSE,
> +    if (ARCH_X86) {
> +        for (int i = 0; i < 4; i++) {
> +            TransVtable *v = &s->vtables[i];
> +
> +            ff_transpose_init_x86(v, s->pixsteps[i]);
> +        }
> +    }
> +
> +     av_log(ctx, AV_LOG_VERBOSE,
>             "w:%d h:%d dir:%d -> w:%d h:%d rotation:%s vflip:%d\n",
>             inlink->w, inlink->h, s->dir, outlink->w, outlink->h,
>             s->dir == 1 || s->dir == 3 ? "clockwise" : "counterclockwise",
> diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
> index 06f832e36c..8d97e46c3f 100644
> --- a/libavfilter/x86/Makefile
> +++ b/libavfilter/x86/Makefile
> @@ -31,6 +31,7 @@ OBJS-$(CONFIG_STEREO3D_FILTER)               += x86/vf_stereo3d_init.o
>  OBJS-$(CONFIG_TBLEND_FILTER)                 += x86/vf_blend_init.o
>  OBJS-$(CONFIG_THRESHOLD_FILTER)              += x86/vf_threshold_init.o
>  OBJS-$(CONFIG_TINTERLACE_FILTER)             += x86/vf_tinterlace_init.o
> +OBJS-$(CONFIG_TRANSPOSE_FILTER)              += x86/vf_transpose_init.o
>  OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
>  OBJS-$(CONFIG_V360_FILTER)                   += x86/vf_v360_init.o
>  OBJS-$(CONFIG_W3FDIF_FILTER)                 += x86/vf_w3fdif_init.o
> @@ -69,6 +70,7 @@ X86ASM-OBJS-$(CONFIG_STEREO3D_FILTER)        += x86/vf_stereo3d.o
>  X86ASM-OBJS-$(CONFIG_TBLEND_FILTER)          += x86/vf_blend.o
>  X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER)       += x86/vf_threshold.o
>  X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER)      += x86/vf_interlace.o
> +X86ASM-OBJS-$(CONFIG_TRANSPOSE_FILTER)       += x86/vf_transpose.o
>  X86ASM-OBJS-$(CONFIG_VOLUME_FILTER)          += x86/af_volume.o
>  X86ASM-OBJS-$(CONFIG_V360_FILTER)            += x86/vf_v360.o
>  X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER)          += x86/vf_w3fdif.o
> diff --git a/libavfilter/x86/vf_transpose.asm b/libavfilter/x86/vf_transpose.asm
> new file mode 100644
> index 0000000000..6d925d5d97
> --- /dev/null
> +++ b/libavfilter/x86/vf_transpose.asm
> @@ -0,0 +1,104 @@
> +;*****************************************************************************
> +;* x86-optimized functions for transpose filter
> +;*
> +;* Copyright (C) 2019 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION .text
> +
> +;------------------------------------------------------------------------------
> +; void ff_transpose_8x8(uint8_t *src, ptrdiff_t src_linesize,
> +;                       uint8_t *dst, ptrdiff_t dst_linesize)
> +;------------------------------------------------------------------------------
> +
> +INIT_XMM sse4
> +cglobal transpose_8x8_8, 4,4,8, src, src_linesize, dst, dst_linesize
> +    movu    m0, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m1, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m2, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m3, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m4, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m5, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m6, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m7, [srcq]
> +
> +    TRANSPOSE_8X8B 0, 1, 2, 3, 4, 5, 6, 7
> +
> +    movq [dstq], m0
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m1
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m2
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m3
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m4
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m5
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m6
> +    add    dstq, dst_linesizeq
> +    movq [dstq], m7
> +    RET
> +
> +INIT_XMM sse4
> +cglobal transpose_8x8_16, 4,4,9, src, src_linesize, dst, dst_linesize
> +    movu    m0, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m1, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m2, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m3, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m4, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m5, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m6, [srcq]
> +    add    srcq, src_linesizeq
> +    movu    m7, [srcq]
> +
> +    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8

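On x86_32, TRANSPOSE8x8W needs memory arguments for its scratch space (either
reserved stack space, or the dst buffer when it's aligned), because only eight
XMM registers (m0-m7) are available there and m8 cannot be used as the
temporary. As written, this will not compile on x86_32.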

If you would rather not do that, just wrap this one function in x86_64
preprocessor checks, both here in the asm and where it is declared and
assigned in vf_transpose_init.c below.

> +
> +    movu [dstq], m0
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m1
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m2
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m3
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m4
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m5
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m6
> +    add    dstq, dst_linesizeq
> +    movu [dstq], m7
> +    RET
> diff --git a/libavfilter/x86/vf_transpose_init.c b/libavfilter/x86/vf_transpose_init.c
> new file mode 100644
> index 0000000000..4f5acd5e56
> --- /dev/null
> +++ b/libavfilter/x86/vf_transpose_init.c
> @@ -0,0 +1,49 @@
> +/*
> + * Copyright (C) 2019 Paul B Mahol
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/mem.h"
> +#include "libavutil/x86/asm.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavfilter/transpose.h"
> +
> +void ff_transpose_8x8_8_sse4(uint8_t *src,
> +                             ptrdiff_t src_linesize,
> +                             uint8_t *dst,
> +                             ptrdiff_t dst_linesize);
> +
> +void ff_transpose_8x8_16_sse4(uint8_t *src,
> +                              ptrdiff_t src_linesize,
> +                              uint8_t *dst,
> +                              ptrdiff_t dst_linesize);
> +
> +av_cold void ff_transpose_init_x86(TransVtable *v, int pixstep)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (EXTERNAL_SSE4(cpu_flags) && pixstep == 1) {
> +        v->transpose_8x8 = ff_transpose_8x8_8_sse4;
> +    }
> +
> +    if (EXTERNAL_SSE4(cpu_flags) && pixstep == 2) {
> +        v->transpose_8x8 = ff_transpose_8x8_16_sse4;
> +    }
> +}
> 



More information about the ffmpeg-devel mailing list