[FFmpeg-devel] [PATCH] avfilter/vf_lut3d: add x86-optimized tetrahedral interpolation
chen
chenm003 at 163.com
Wed Sep 29 04:37:41 EEST 2021
Hello,
Excuse me, have you considered using FMADD (fused multiply-add) on AVX2 platforms?
For example:
+ mulps m7, m7, m14
+ addps m0, m0, m7
==>
fmadd231ps m0,m7,m14
Regards,
Min Chen
2021-09-29 09:18:05,mindmark at gmail.com
>From: Mark Reid <mindmark at gmail.com>
>
>Only supports float and 16-bit planar formats at the moment.
>Mainly focused on AVX and AVX2 optimizations, but SSE2 does seem to offer some
>speed gains.
>
>f32 1920x1080 1 thread with prelut
>c impl
>1389936500 UNITS in lut3d->interp, 1 runs, 0 skips
>1425800240 UNITS in lut3d->interp, 2 runs, 0 skips
>1433312777 UNITS in lut3d->interp, 4 runs, 0 skips
>1443346798 UNITS in lut3d->interp, 8 runs, 0 skips
>
>sse2
>948662320 UNITS in lut3d->interp, 1 runs, 0 skips
>1101247540 UNITS in lut3d->interp, 2 runs, 0 skips
>1050645695 UNITS in lut3d->interp, 4 runs, 0 skips
>1041102937 UNITS in lut3d->interp, 8 runs, 0 skips
>
>avx
>633837000 UNITS in lut3d->interp, 1 runs, 0 skips
>669452850 UNITS in lut3d->interp, 2 runs, 0 skips
>650716580 UNITS in lut3d->interp, 4 runs, 0 skips
>644698550 UNITS in lut3d->interp, 8 runs, 0 skips
>
>avx2
>354940020 UNITS in lut3d->interp, 1 runs, 0 skips
>362384340 UNITS in lut3d->interp, 2 runs, 0 skips
>356799020 UNITS in lut3d->interp, 4 runs, 0 skips
>357276815 UNITS in lut3d->interp, 8 runs, 0 skips
>
>gbrap16 1920x1080 1 thread with prelut
>c impl
>1445071160 UNITS in lut3d->interp, 1 runs, 0 skips
>1477959120 UNITS in lut3d->interp, 2 runs, 0 skips
>1472102670 UNITS in lut3d->interp, 4 runs, 0 skips
>1462579330 UNITS in lut3d->interp, 8 runs, 0 skips
>
>sse2
>1035437580 UNITS in lut3d->interp, 1 runs, 0 skips
>1050139710 UNITS in lut3d->interp, 2 runs, 0 skips
>1070147205 UNITS in lut3d->interp, 4 runs, 0 skips
>1064583037 UNITS in lut3d->interp, 8 runs, 0 skips
>
>avx
>678089880 UNITS in lut3d->interp, 1 runs, 0 skips
>679112485 UNITS in lut3d->interp, 2 runs, 0 skips
>695527212 UNITS in lut3d->interp, 4 runs, 0 skips
>691300053 UNITS in lut3d->interp, 8 runs, 0 skips
>
>avx2
>372671340 UNITS in lut3d->interp, 1 runs, 0 skips
>373449870 UNITS in lut3d->interp, 2 runs, 0 skips
>383725625 UNITS in lut3d->interp, 4 runs, 0 skips
>382860848 UNITS in lut3d->interp, 8 runs, 0 skips
>
>---
> libavfilter/lut3d.h | 83 ++++
> libavfilter/vf_lut3d.c | 61 +--
> libavfilter/x86/Makefile | 2 +
> libavfilter/x86/vf_lut3d.asm | 757 ++++++++++++++++++++++++++++++++
> libavfilter/x86/vf_lut3d_init.c | 88 ++++
> 5 files changed, 935 insertions(+), 56 deletions(-)
> create mode 100644 libavfilter/lut3d.h
> create mode 100644 libavfilter/x86/vf_lut3d.asm
> create mode 100644 libavfilter/x86/vf_lut3d_init.c
>
>diff --git a/libavfilter/lut3d.h b/libavfilter/lut3d.h
>new file mode 100644
>index 0000000000..ded2a036a5
>--- /dev/null
>+++ b/libavfilter/lut3d.h
>@@ -0,0 +1,83 @@
>+/*
>+ * Copyright (c) 2013 Clément Bœsch
>+ * Copyright (c) 2018 Paul B Mahol
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+ */
>+#ifndef AVFILTER_LUT3D_H
>+#define AVFILTER_LUT3D_H
>+
>+#include "libavutil/pixdesc.h"
>+#include "framesync.h"
>+#include "avfilter.h"
>+
>+enum interp_mode {
>+ INTERPOLATE_NEAREST,
>+ INTERPOLATE_TRILINEAR,
>+ INTERPOLATE_TETRAHEDRAL,
>+ INTERPOLATE_PYRAMID,
>+ INTERPOLATE_PRISM,
>+ NB_INTERP_MODE
>+};
>+
>+struct rgbvec {
>+ float r, g, b;
>+};
>+
>+/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT
>+ * of 512x512 (64x64x64) */
>+#define MAX_LEVEL 256
>+#define PRELUT_SIZE 65536
>+
>+typedef struct Lut3DPreLut {
>+ int size;
>+ float min[3];
>+ float max[3];
>+ float scale[3];
>+ float* lut[3];
>+} Lut3DPreLut;
>+
>+typedef struct LUT3DContext {
>+ const AVClass *class;
>+ struct rgbvec *lut;
>+ int lutsize;
>+ int lutsize2;
>+ struct rgbvec scale;
>+ int interpolation; ///<interp_mode
>+ char *file;
>+ uint8_t rgba_map[4];
>+ int step;
>+ avfilter_action_func *interp;
>+ Lut3DPreLut prelut;
>+#if CONFIG_HALDCLUT_FILTER
>+ uint8_t clut_rgba_map[4];
>+ int clut_step;
>+ int clut_bits;
>+ int clut_planar;
>+ int clut_float;
>+ int clut_width;
>+ FFFrameSync fs;
>+#endif
>+} LUT3DContext;
>+
>+typedef struct ThreadData {
>+ AVFrame *in, *out;
>+} ThreadData;
>+
>+void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc);
>+
>+#endif /* AVFILTER_LUT3D_H */
>\ No newline at end of file
>diff --git a/libavfilter/vf_lut3d.c b/libavfilter/vf_lut3d.c
>index 9fbda833b9..1fd0af06db 100644
>--- a/libavfilter/vf_lut3d.c
>+++ b/libavfilter/vf_lut3d.c
>@@ -31,73 +31,18 @@
> #include "libavutil/intreadwrite.h"
> #include "libavutil/intfloat.h"
> #include "libavutil/avassert.h"
>-#include "libavutil/pixdesc.h"
> #include "libavutil/avstring.h"
>-#include "avfilter.h"
> #include "drawutils.h"
> #include "formats.h"
>-#include "framesync.h"
> #include "internal.h"
> #include "video.h"
>+#include "lut3d.h"
>
> #define R 0
> #define G 1
> #define B 2
> #define A 3
>
>-enum interp_mode {
>- INTERPOLATE_NEAREST,
>- INTERPOLATE_TRILINEAR,
>- INTERPOLATE_TETRAHEDRAL,
>- INTERPOLATE_PYRAMID,
>- INTERPOLATE_PRISM,
>- NB_INTERP_MODE
>-};
>-
>-struct rgbvec {
>- float r, g, b;
>-};
>-
>-/* 3D LUT don't often go up to level 32, but it is common to have a Hald CLUT
>- * of 512x512 (64x64x64) */
>-#define MAX_LEVEL 256
>-#define PRELUT_SIZE 65536
>-
>-typedef struct Lut3DPreLut {
>- int size;
>- float min[3];
>- float max[3];
>- float scale[3];
>- float* lut[3];
>-} Lut3DPreLut;
>-
>-typedef struct LUT3DContext {
>- const AVClass *class;
>- int interpolation; ///<interp_mode
>- char *file;
>- uint8_t rgba_map[4];
>- int step;
>- avfilter_action_func *interp;
>- struct rgbvec scale;
>- struct rgbvec *lut;
>- int lutsize;
>- int lutsize2;
>- Lut3DPreLut prelut;
>-#if CONFIG_HALDCLUT_FILTER
>- uint8_t clut_rgba_map[4];
>- int clut_step;
>- int clut_bits;
>- int clut_planar;
>- int clut_float;
>- int clut_width;
>- FFFrameSync fs;
>-#endif
>-} LUT3DContext;
>-
>-typedef struct ThreadData {
>- AVFrame *in, *out;
>-} ThreadData;
>-
> #define OFFSET(x) offsetof(LUT3DContext, x)
> #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
> #define TFLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
>@@ -1207,6 +1152,10 @@ static int config_input(AVFilterLink *inlink)
> av_assert0(0);
> }
>
>+ if (ARCH_X86) {
>+ ff_lut3d_init_x86(lut3d, desc);
>+ }
>+
> return 0;
> }
>
>diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
>index 016a5b3511..a29941eaeb 100644
>--- a/libavfilter/x86/Makefile
>+++ b/libavfilter/x86/Makefile
>@@ -17,6 +17,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o
> OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o
> OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_tinterlace_init.o
> OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o
>+OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d_init.o
> OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp_init.o
> OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge_init.o
> OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o
>@@ -57,6 +58,7 @@ X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
> X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
> X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
> X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o
>+X86ASM-OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d.o
> X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp.o
> X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o
> X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o
>diff --git a/libavfilter/x86/vf_lut3d.asm b/libavfilter/x86/vf_lut3d.asm
>new file mode 100644
>index 0000000000..b3d7c3962b
>--- /dev/null
>+++ b/libavfilter/x86/vf_lut3d.asm
>@@ -0,0 +1,757 @@
>+;*****************************************************************************
>+;* x86-optimized functions for lut3d filter
>+;*
>+;* Copyright (c) 2021 Mark Reid <mindmark at gmail.com>
>+;*
>+;* This file is part of FFmpeg.
>+;*
>+;* FFmpeg is free software; you can redistribute it and/or
>+;* modify it under the terms of the GNU Lesser General Public
>+;* License as published by the Free Software Foundation; either
>+;* version 2.1 of the License, or (at your option) any later version.
>+;*
>+;* FFmpeg is distributed in the hope that it will be useful,
>+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>+;* Lesser General Public License for more details.
>+;*
>+;* You should have received a copy of the GNU Lesser General Public
>+;* License along with FFmpeg; if not, write to the Free Software
>+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+;******************************************************************************
>+
>+%include "libavutil/x86/x86util.asm"
>+
>+SECTION_RODATA
>+pd_1f: times 8 dd 1.0
>+pd_3f: times 8 dd 3.0
>+
>+; used to limit rshifts as they are more expensive in avx1
>+pd_001: times 8 dd 001b
>+pd_010: times 8 dd 010b
>+pd_100: times 8 dd 100b
>+
>+pd_65535f: times 8 dd 65535.0
>+pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0
>+
>+pb_shuffle16: db 0, 1, 0x80, 0x80, \
>+ 2, 3, 0x80, 0x80, \
>+ 4, 5, 0x80, 0x80, \
>+ 6, 7, 0x80, 0x80
>+
>+pb_lo_pack_shuffle16: db 0, 1, 4, 5, \
>+ 8, 9, 12, 13, \
>+ 0x80, 0x80, 0x80, 0x80, \
>+ 0x80, 0x80, 0x80, 0x80
>+
>+pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \
>+ 0x80, 0x80, 0x80, 0x80, \
>+ 0, 1, 4, 5, \
>+ 8, 9, 12, 13
>+
>+; tetrahedral table --------------------------------------------
>+; name: x2| x1| x0| cxxb| cxxa
>+; values: r 00| r 00| r 00| c011 011| c001 001
>+; g 01| g 01| g 01| c101 101| c010 010
>+; b 10| b 10| b 10| c110 110| c100 100
>+
>+; g>b b | g | r | c110 | c100
>+pd_tetra_table0: times 8 dd (10b << 10) | (01b << 8) | (00b << 6) | (110b << 3) | 100b
>+; r>b g | b | r | c101 | c100
>+pd_tetra_table1: times 8 dd (01b << 10) | (10b << 8) | (00b << 6) | (101b << 3) | 100b
>+; else g | r | b | c101 | c001
>+pd_tetra_table2: times 8 dd (01b << 10) | (00b << 8) | (10b << 6) | (101b << 3) | 001b
>+; b>g r | g | b | c011 | c001
>+pd_tetra_table3: times 8 dd (00b << 10) | (01b << 8) | (10b << 6) | (011b << 3) | 001b
>+; b>r r | b | g | c011 | c010
>+pd_tetra_table4: times 8 dd (00b << 10) | (10b << 8) | (01b << 6) | (011b << 3) | 010b
>+; else b | r | g | c110 | c010
>+pd_tetra_table5: times 8 dd (10b << 10) | (00b << 8) | (01b << 6) | (110b << 3) | 010b
>+
>+SECTION .text
>+
>+struc Lut3DPreLut
>+ .size: resd 1
>+ .min: resd 3
>+ .max: resd 3
>+ .scale: resd 3
>+ .lut: resq 3
>+endstruc
>+
>+struc LUT3DContext
>+ .class: resq 1
>+ .lut: resq 1
>+ .lutsize: resd 1
>+ .lutsize2: resd 1
>+ .scale: resd 3
>+endstruc
>+
>+%define AV_NUM_DATA_POINTERS 8
>+
>+struc AVFrame
>+ .data: resq AV_NUM_DATA_POINTERS
>+ .linesize: resd AV_NUM_DATA_POINTERS
>+ .extended_data: resq 1
>+ .width: resd 1
>+ .height: resd 1
>+endstruc
>+
>+%define rm rsp
>+%define gm rsp+mmsize
>+%define bm rsp+(mmsize*2)
>+
>+%define lut3dsizem [rsp+mmsize*3]
>+%define lut3dsize2m [rsp+mmsize*4]
>+%define lut3dmaxm [rsp+mmsize*5]
>+%define prelutmaxm [rsp+mmsize*6]
>+
>+%define scalerm [rsp+mmsize*7]
>+%define scalegm [rsp+mmsize*8]
>+%define scalebm [rsp+mmsize*9]
>+
>+%define prelutminrm [rsp+mmsize*10]
>+%define prelutmingm [rsp+mmsize*11]
>+%define prelutminbm [rsp+mmsize*12]
>+
>+%define prelutscalerm [rsp+mmsize*13]
>+%define prelutscalegm [rsp+mmsize*14]
>+%define prelutscalebm [rsp+mmsize*15]
>+
>+; data pointers
>+%define srcrm [rsp+mmsize*16 + 0]
>+%define srcgm [rsp+mmsize*16 + 8]
>+%define srcbm [rsp+mmsize*16 + 16]
>+%define srcam [rsp+mmsize*16 + 24]
>+
>+%define dstrm [rsp+mmsize*16 + 32]
>+%define dstgm [rsp+mmsize*16 + 40]
>+%define dstbm [rsp+mmsize*16 + 48]
>+%define dstam [rsp+mmsize*16 + 56]
>+
>+%macro FETCH_PRELUT_PN 3
>+ mov tmp2d, [rm + %3]
>+ mov tmp3d, [gm + %3]
>+ movss xm%1, [tmpq + tmp2q*4]
>+ movss xm%2, [tmpq + tmp3q*4]
>+ movss [rm + %3], xm%1
>+ movss [gm + %3], xm%2
>+%endmacro
>+
>+; 1 - p
>+; 2 - n
>+; 3 - p indices
>+; 4 - n indices
>+%macro GATHER_PRELUT 4
>+ %if cpuflag(avx2)
>+ vpcmpeqb m7, m7
>+ vgatherdps m%1, [tmpq + m%3*4], m7 ; p
>+ vpcmpeqb m9, m9
>+ vgatherdps m%2, [tmpq + m%4*4], m9 ; n
>+ %else
>+ mova [rm], m%3
>+ mova [gm], m%4
>+ FETCH_PRELUT_PN %1, %2, 0
>+ FETCH_PRELUT_PN %1, %2, 4
>+ FETCH_PRELUT_PN %1, %2, 8
>+ FETCH_PRELUT_PN %1, %2, 12
>+ %if mmsize > 16
>+ FETCH_PRELUT_PN %1, %2, 16
>+ FETCH_PRELUT_PN %1, %2, 20
>+ FETCH_PRELUT_PN %1, %2, 24
>+ FETCH_PRELUT_PN %1, %2, 28
>+ %endif
>+ movu m%1, [rm]
>+ movu m%2, [gm]
>+ %endif
>+%endmacro
>+
>+%macro FLOORPS 2
>+ %if mmsize > 16
>+ vroundps %1, %2, 0x01
>+ %else
>+ cvttps2dq %1, %2
>+ cvtdq2ps %1, %1
>+ %endif
>+%endmacro
>+
>+; 1 - dst
>+; 2 - index
>+; 3 - min
>+; 4 - scale
>+; assumes lut max m13, m14 1.0f, zero m15
>+%macro APPLY_PRELUT 4
>+ ; scale
>+ subps m5, m%1, %3 ; v - min
>+ mulps m5, m5, %4 ; v * scale
>+ ; clamp
>+ maxps m5, m5, m15 ; max zero
>+ minps m5, m5, m13 ; min lut max
>+
>+ FLOORPS m3, m5 ; prev index
>+ subps m5, m5, m3 ; d
>+ addps m4, m3, m14 ; p+1 = n index
>+ minps m4, m4, m13 ; clamp n idex
>+
>+ mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8]
>+ cvttps2dq m6, m3
>+ cvttps2dq m10, m4
>+ GATHER_PRELUT 3, 4, 6, 10
>+
>+ ; lerp
>+ subps m8, m4, m3
>+ mulps m8, m8, m5
>+ addps m%1, m8, m3
>+%endmacro
>+
>+; 1 - dst
>+; 2 - scale
>+; assumes lut max m13, zero m15
>+%macro APPLY_SCALE 2
>+ mulps m%1, m%1, %2
>+ maxps m%1, m%1, m15
>+ minps m%1, m%1, m13
>+%endmacro
>+
>+%macro BLEND 4
>+%if mmsize > 16
>+ vblendvps %1, %2, %3, %4
>+%else
>+ %ifidni %1,%2
>+ %error operand 1 must not equal operand 2
>+ %endif
>+ %ifidni %1,%3
>+ %error operand 1 must not equal operand 3
>+ %endif
>+ mova %1, %2
>+ xorps %1, %3
>+ andps %1, %4
>+ xorps %1, %2
>+%endif
>+%endmacro
>+
>+; sets NaNs to zero; +inf and -inf are handled later by the min/max clamps
>+%macro SANITIZE_F 1
>+ cmpps m5, %1, %1, 0x0 ; nan == nan = False
>+ %if mmsize <= 16
>+ mova m6, %1
>+ BLEND %1, m15, m6, m5
>+ %else
>+ BLEND %1, m15, %1, m5
>+ %endif
>+%endmacro
>+
>+%macro ADD3 4
>+ addps %1, %2, %3
>+ addps %1, %1, %4
>+%endmacro
>+
>+%macro CMP_EQUAL 3
>+%if cpuflag(avx2)
>+ vpcmpeqd %1, %2, %3
>+%elif cpuflag(avx)
>+ cmpps %1, %2, %3, 0x0
>+%else
>+ pcmpeqd %1, %2, %3
>+%endif
>+%endmacro
>+
>+%macro SHIFT_RIGHT 2
>+%if mmsize <= 16
>+ psrld xm%1, %2
>+%elif cpuflag(avx2)
>+ vpsrld m%1, m%1, %2
>+%else
>+ vextractf128 xm15, m%1, 1
>+ psrld xm%1, %2
>+ psrld xm15, %2
>+ vinsertf128 m%1, m%1, xm15, 1
>+%endif
>+%endmacro
>+
>+%macro FETCH_LUT3D_RGB 4
>+ mov tmp2d, [rm + %4]
>+ movss xm%1, [tmpq + tmp2q*4 + 0]
>+ movss xm%2, [tmpq + tmp2q*4 + 4]
>+ movss xm%3, [tmpq + tmp2q*4 + 8]
>+ movss [rm + %4], xm%1
>+ movss [gm + %4], xm%2
>+ movss [bm + %4], xm%3
>+%endmacro
>+
>+; 1 - dstr
>+; 2 - dstg
>+; 3 - dstb
>+; 4 - indices
>+%macro GATHER_LUT3D_INDICES 4
>+%if cpuflag(avx2)
>+ vpcmpeqb m3, m3
>+ vgatherdps m%1, [tmpq + m%4*4 + 0], m3
>+ vpcmpeqb m14, m14
>+ vgatherdps m%2, [tmpq + m%4*4 + 4], m14
>+ vpcmpeqb m15, m15
>+ vgatherdps m%3, [tmpq + m%4*4 + 8], m15
>+%else
>+ movu [rm], m%4
>+ FETCH_LUT3D_RGB %1, %2, %3, 0
>+ FETCH_LUT3D_RGB %1, %2, %3, 4
>+ FETCH_LUT3D_RGB %1, %2, %3, 8
>+ FETCH_LUT3D_RGB %1, %2, %3, 12
>+%if mmsize > 16
>+ FETCH_LUT3D_RGB %1, %2, %3, 16
>+ FETCH_LUT3D_RGB %1, %2, %3, 20
>+ FETCH_LUT3D_RGB %1, %2, %3, 24
>+ FETCH_LUT3D_RGB %1, %2, %3, 28
>+%endif
>+ movu m%1, [rm]
>+ movu m%2, [gm]
>+ movu m%3, [bm]
>+%endif
>+%endmacro
>+
>+%macro interp_tetrahedral 0
>+ %define d_r m0
>+ %define d_g m1
>+ %define d_b m2
>+
>+ %define prev_r m3
>+ %define prev_g m4
>+ %define prev_b m5
>+
>+ %define next_r m6
>+ %define next_g m7
>+ %define next_b m8
>+
>+ %define x0 m4
>+ %define x1 m5
>+ %define x2 m6
>+
>+ ; setup prev index
>+ FLOORPS prev_r, m0
>+ FLOORPS prev_g, m1
>+ FLOORPS prev_b, m2
>+
>+ ; setup deltas
>+ subps d_r, m0, prev_r
>+ subps d_g, m1, prev_g
>+ subps d_b, m2, prev_b
>+
>+ ; calculate select mask m9
>+ movu m6, [pd_tetra_table2]
>+ cmpps m7, d_r, d_b, 0x1E ; r > b CMP_GT_OQ
>+ BLEND m10, m6, [pd_tetra_table1], m7
>+ cmpps m7, d_g, d_b, 0x1E ; g > b CMP_GT_OQ
>+ BLEND m6, m10, [pd_tetra_table0], m7
>+
>+ movu m10, [pd_tetra_table5]
>+ cmpps m7, d_b, d_r, 0x1E ; b > r CMP_GT_OQ
>+ BLEND m9, m10, [pd_tetra_table4], m7
>+ cmpps m7, d_b, d_g, 0x1E ; b > g CMP_GT_OQ
>+ BLEND m10, m9, [pd_tetra_table3], m7
>+
>+ cmpps m7, d_r, d_g, 0x1E ; r > g CMP_GT_OQ
>+ BLEND m9, m10, m6, m7
>+
>+ ; setup next index
>+ addps next_r, prev_r, m14 ; +1
>+ minps next_r, next_r, m13 ; clamp lutmax
>+
>+ addps next_g, prev_g, m14 ; +1
>+ minps next_g, next_g, m13 ; clamp lutmax
>+
>+ addps next_b, prev_b, m14 ; +1
>+ minps next_b, next_b, m13 ; clamp lutmax
>+
>+ ; prescale indices
>+ mulps prev_r, prev_r, lut3dsize2m
>+ mulps next_r, next_r, lut3dsize2m
>+
>+ mulps prev_g, prev_g, lut3dsizem
>+ mulps next_g, next_g, lut3dsizem
>+
>+ mulps prev_b, prev_b, [pd_3f]
>+ mulps next_b, next_b, [pd_3f]
>+
>+ movu m14, [pd_001]
>+
>+ ; cxxa m10
>+ ; b
>+ andps m15, m9, m14
>+ CMP_EQUAL m15, m15, m14
>+ BLEND m10, prev_b, next_b, m15
>+
>+ ; g
>+ andps m15, m9, [pd_010]
>+ CMP_EQUAL m15, m15, [pd_010]
>+ BLEND m12, prev_g, next_g, m15
>+
>+ ; r
>+ andps m15, m9, [pd_100]
>+ CMP_EQUAL m15, m15, [pd_100]
>+ BLEND m13, prev_r, next_r, m15
>+
>+ ADD3 m10, m10, m12, m13
>+
>+ SHIFT_RIGHT 9, 3 ; 3
>+
>+ ; cxxb m11;
>+ ; b
>+ andps m15, m9, m14
>+ CMP_EQUAL m15, m15, m14
>+ BLEND m11, prev_b, next_b, m15
>+
>+ ; g
>+ andps m15, m9, [pd_010]
>+ CMP_EQUAL m15, m15, [pd_010]
>+ BLEND m12, prev_g, next_g, m15
>+
>+ ; r
>+ andps m15, m9, [pd_100]
>+ CMP_EQUAL m15, m15, [pd_100]
>+ BLEND m13, prev_r, next_r, m15
>+
>+ ADD3 m11, m11, m12, m13
>+
>+ ; c000 m12;
>+ ADD3 m12, prev_r, prev_g, prev_b
>+
>+ ; c111 m13;
>+ ADD3 m13, next_r, next_g, next_b
>+
>+ SHIFT_RIGHT 9, 3 ; 6
>+
>+ ; x0, m4
>+ andps m15, m9, m14
>+ CMP_EQUAL m15, m15, m14
>+ BLEND m7, d_r, d_g, m15 ; r,g
>+
>+ andps m15, m9, [pd_010]
>+ CMP_EQUAL m15, m15, [pd_010]
>+ BLEND x0, m7, d_b, m15 ; b
>+
>+ ; x1, m5
>+ andps m15, m9, [pd_100]
>+ CMP_EQUAL m15, m15, [pd_100]
>+ BLEND m7, d_r, d_g, m15 ; r,g
>+
>+ SHIFT_RIGHT 9, 3 ; 9
>+
>+ andps m15, m9, m14
>+ CMP_EQUAL m15, m15, m14
>+ BLEND x1, m7, d_b, m15 ; b
>+
>+ ; x2, m6
>+ andps m15, m9, [pd_010]
>+ CMP_EQUAL m15, m15, [pd_010]
>+ BLEND m7, d_r, d_g, m15 ; r,g
>+
>+ andps m15, m9, [pd_100]
>+ CMP_EQUAL m15, m15, [pd_100]
>+ BLEND x2, m7, d_b, m15 ; b
>+
>+ ; convert indices to integer
>+ cvttps2dq m12, m12
>+ cvttps2dq m10, m10
>+ cvttps2dq m11, m11
>+ cvttps2dq m13, m13
>+
>+ ; now the gathering festival
>+ mov tmpq, [ctxq + LUT3DContext.lut]
>+
>+ GATHER_LUT3D_INDICES 0, 1, 2, 12
>+ movu m14, [pd_1f]
>+ subps m14, m14, x0; 1 - x0
>+
>+ mulps m0, m0, m14
>+ mulps m1, m1, m14
>+ mulps m2, m2, m14
>+
>+ GATHER_LUT3D_INDICES 7, 8, 9, 10
>+ subps m14, x0, x1; x0 - x1
>+ mulps m7, m7, m14
>+ addps m0, m0, m7
>+
>+ mulps m8, m8, m14
>+ addps m1, m1, m8
>+
>+ mulps m9, m9, m14
>+ addps m2, m2, m9
>+
>+ GATHER_LUT3D_INDICES 7, 8, 9, 11
>+ subps m14, x1, x2; x1 - x2
>+
>+ mulps m7, m7, m14
>+ addps m0, m0, m7
>+
>+ mulps m8, m8, m14
>+ addps m1, m1, m8
>+
>+ mulps m9, m9, m14
>+ addps m2, m2, m9
>+
>+ GATHER_LUT3D_INDICES 7, 8, 9, 13
>+ mulps m7, m7, x2
>+ addps m0, m0, m7
>+
>+ mulps m8, m8, x2
>+ addps m1, m1, m8
>+
>+ mulps m9, m9, x2
>+ addps m2, m2, m9
>+%endmacro
>+
>+%macro INIT_DATA_PTR 3
>+ mov ptrq, [%2 + AVFrame.data + %3 * 8]
>+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4]
>+ imul tmpd, slice_startd
>+ add ptrq, tmpq
>+ mov %1, ptrq
>+%endmacro
>+
>+%macro INC_DATA_PTR 3
>+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4]
>+ mov ptrq, %1
>+ add ptrq, tmpq
>+ mov %1, ptrq
>+%endmacro
>+
>+%macro LOAD16 2
>+ mov ptrq, %2
>+ %if mmsize > 16
>+ movu xm%1, [ptrq + xq*2]
>+ %else
>+ movsd xm%1, [ptrq + xq*2]
>+ %endif
>+ %if cpuflag(avx2)
>+ vpmovzxwd m%1, xm%1
>+ %else
>+ %if mmsize > 16
>+ pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0)
>+ pshufb xm%1, xm6 ; pb_shuffle16
>+ pshufb xm4, xm6 ; pb_shuffle16
>+ vinsertf128 m%1, m%1, xm4, 1
>+ %else
>+ pshufd xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0)
>+ pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>+ pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>+ %endif
>+ %endif
>+ cvtdq2ps m%1, m%1
>+ mulps m%1, m%1, m7 ; pd_65535_invf
>+%endmacro
>+
>+%macro STORE16 2
>+ mulps m%2, m%2, m5 ; [pd_65535f]
>+ minps m%2, m%2, m5 ; [pd_65535f]
>+ maxps m%2, m%2, m15 ; zero
>+ cvttps2dq m%2, m%2
>+ %if mmsize > 16
>+ vextractf128 xm4, m%2, 1
>+ pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16]
>+ pshufb xm4, xm7 ; [pb_hi_pack_shuffle16]
>+ por xm%2, xm4
>+ %else
>+ pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>+ pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>+ pshufd xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0)
>+ %endif
>+ mov ptrq, %1
>+ %if mmsize > 16
>+ movu [ptrq + xq*2], xm%2
>+ %else
>+ movsd [ptrq + xq*2], xm%2
>+ %endif
>+%endmacro
>+
>+; 1 - interp method
>+; 2 - format_name
>+; 3 - depth
>+; 4 - is float format
>+%macro DEFINE_INTERP_FUNC 4
>+cglobal interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut, src_image, dst_image, slice_start, slice_end, has_alpha, width, x, ptr, tmp, tmp2, tmp3
>+ ; store lut max and lutsize
>+ mov tmpd, dword [ctxq + LUT3DContext.lutsize]
>+ cvtsi2ss xm0, tmpd
>+ mulss xm0, xm0, [pd_3f]
>+ VBROADCASTSS m0, xm0
>+ mova lut3dsizem, m0
>+ sub tmpd, 1
>+ cvtsi2ss xm0, tmpd
>+ VBROADCASTSS m0, xm0
>+ mova lut3dmaxm, m0
>+
>+ ; scale_r
>+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4]
>+ VBROADCASTSS m1, xm1
>+ mova scalerm, m1
>+
>+ ; scale_g
>+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4]
>+ VBROADCASTSS m1, xm1
>+ mova scalegm, m1
>+
>+ ; scale_b
>+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4]
>+ VBROADCASTSS m1, xm1
>+ mova scalebm, m1
>+
>+ ; store lutsize2
>+ cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2]
>+ mulss xm0, xm0, [pd_3f]
>+ VBROADCASTSS m0, xm0
>+ mova lut3dsize2m, m0
>+
>+ ; init prelut values
>+ cmp prelutq, 0
>+ je %%skip_init_prelut
>+ mov tmpd, dword [prelutq + Lut3DPreLut.size]
>+ sub tmpd, 1
>+ cvtsi2ss xm0, tmpd
>+ VBROADCASTSS m0, xm0
>+ mova prelutmaxm, m0
>+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4]
>+ mova prelutminrm, m0
>+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4]
>+ mova prelutmingm, m0
>+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4]
>+ mova prelutminbm, m0
>+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 0*4]
>+ mova prelutscalerm, m0
>+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 1*4]
>+ mova prelutscalegm, m0
>+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4]
>+ mova prelutscalebm, m0
>+ %%skip_init_prelut:
>+
>+ mov widthd, [src_imageq + AVFrame.width]
>+
>+ ; gbra pixel order
>+ INIT_DATA_PTR srcrm, src_imageq, 2
>+ INIT_DATA_PTR srcgm, src_imageq, 0
>+ INIT_DATA_PTR srcbm, src_imageq, 1
>+ INIT_DATA_PTR srcam, src_imageq, 3
>+
>+ INIT_DATA_PTR dstrm, dst_imageq, 2
>+ INIT_DATA_PTR dstgm, dst_imageq, 0
>+ INIT_DATA_PTR dstbm, dst_imageq, 1
>+ INIT_DATA_PTR dstam, dst_imageq, 3
>+
>+ %%loop_y:
>+ xor xq, xq
>+ %%loop_x:
>+ movu m14, [pd_1f]
>+ xorps m15, m15, m15
>+ %if %4 ; float
>+ mov ptrq, srcrm
>+ movu m0, [ptrq + xq*4]
>+ mov ptrq, srcgm
>+ movu m1, [ptrq + xq*4]
>+ mov ptrq, srcbm
>+ movu m2, [ptrq + xq*4]
>+ SANITIZE_F m0
>+ SANITIZE_F m1
>+ SANITIZE_F m2
>+ %else
>+ ; constants for LOAD16
>+ movu m7, [pd_65535_invf]
>+ %if notcpuflag(avx2) && mmsize >= 32
>+ movu xm6, [pb_shuffle16]
>+ %endif
>+ LOAD16 0, srcrm
>+ LOAD16 1, srcgm
>+ LOAD16 2, srcbm
>+ %endif
>+
>+ cmp prelutq, 0
>+ je %%skip_prelut
>+ mova m13, prelutmaxm
>+ APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm
>+ APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm
>+ APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm
>+ %%skip_prelut:
>+
>+ mova m13, lut3dmaxm
>+ APPLY_SCALE 0, scalerm
>+ APPLY_SCALE 1, scalegm
>+ APPLY_SCALE 2, scalebm
>+
>+ interp_%1
>+
>+ %if %4 ; float
>+ mov ptrq, dstrm
>+ movu [ptrq + xq*4], m0
>+ mov ptrq, dstgm
>+ movu [ptrq + xq*4], m1
>+ mov ptrq, dstbm
>+ movu [ptrq + xq*4], m2
>+ cmp has_alphad, 0
>+ je %%skip_alphaf
>+ mov ptrq, srcam
>+ movu m0, [ptrq + xq*4]
>+ mov ptrq, dstam
>+ movu [ptrq + xq*4], m0
>+ %%skip_alphaf:
>+ %else
>+ ; constants for STORE16
>+ movu m5, [pd_65535f]
>+ %if mmsize > 16
>+ movu xm6, [pb_lo_pack_shuffle16]
>+ movu xm7, [pb_hi_pack_shuffle16]
>+ %endif
>+
>+ xorps m15, m15, m15
>+ STORE16 dstrm, 0
>+ STORE16 dstgm, 1
>+ STORE16 dstbm, 2
>+
>+ cmp has_alphad, 0
>+ je %%skip_alpha
>+ %if mmsize > 16
>+ mov ptrq, srcam
>+ movu xm0, [ptrq + xq*2]
>+ mov ptrq, dstam
>+ movu [ptrq + xq*2], xm0
>+ %else
>+ mov ptrq, srcam
>+ movsd xm0, [ptrq + xq*2]
>+ mov ptrq, dstam
>+ movsd [ptrq + xq*2], xm0
>+ %endif
>+
>+ %%skip_alpha:
>+ %endif
>+
>+ add xq, mmsize/4
>+ cmp xd, widthd
>+ jl %%loop_x
>+
>+ INC_DATA_PTR srcrm, src_imageq, 2
>+ INC_DATA_PTR srcgm, src_imageq, 0
>+ INC_DATA_PTR srcbm, src_imageq, 1
>+ INC_DATA_PTR srcam, src_imageq, 3
>+
>+ INC_DATA_PTR dstrm, dst_imageq, 2
>+ INC_DATA_PTR dstgm, dst_imageq, 0
>+ INC_DATA_PTR dstbm, dst_imageq, 1
>+ INC_DATA_PTR dstam, dst_imageq, 3
>+
>+ inc slice_startd
>+ cmp slice_startd, slice_endd
>+ jl %%loop_y
>+
>+ RET
>+%endmacro
>+%if ARCH_X86_64
>+ %if HAVE_AVX2_EXTERNAL
>+ INIT_YMM avx2
>+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
>+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
>+ %endif
>+ %if HAVE_AVX_EXTERNAL
>+ INIT_YMM avx
>+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
>+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
>+ %endif
>+ INIT_XMM sse2
>+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
>+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
>+%endif
>\ No newline at end of file
>diff --git a/libavfilter/x86/vf_lut3d_init.c b/libavfilter/x86/vf_lut3d_init.c
>new file mode 100644
>index 0000000000..9b9b36e4af
>--- /dev/null
>+++ b/libavfilter/x86/vf_lut3d_init.c
>@@ -0,0 +1,88 @@
>+/*
>+ * Copyright (c) 2021 Mark Reid <mindmark at gmail.com>
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+ */
>+
>+#include "libavutil/attributes.h"
>+#include "libavutil/cpu.h"
>+#include "libavutil/x86/cpu.h"
>+#include "libavfilter/lut3d.h"
>+
>+#define DEFINE_INTERP_FUNC(name, format, opt) \
>+void ff_interp_##name##_##format##_##opt(LUT3DContext *lut3d, Lut3DPreLut *prelut, AVFrame *src, AVFrame *dst, int slice_start, int slice_end, int has_alpha); \
>+static int interp_##name##_##format##_##opt(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) \
>+{ \
>+ LUT3DContext *lut3d = ctx->priv; \
>+ Lut3DPreLut *prelut = lut3d->prelut.size > 0? &lut3d->prelut: NULL; \
>+ ThreadData *td = arg; \
>+ AVFrame *in = td->in; \
>+ AVFrame *out = td->out; \
>+ int has_alpha = in->linesize[3] && out != in; \
>+ int slice_start = (in->height * jobnr ) / nb_jobs; \
>+ int slice_end = (in->height * (jobnr+1)) / nb_jobs; \
>+ ff_interp_##name##_##format##_##opt(lut3d, prelut, in, out, slice_start, slice_end, has_alpha); \
>+ return 0; \
>+}
>+
>+#if ARCH_X86_64
>+#if HAVE_AVX2_EXTERNAL
>+ DEFINE_INTERP_FUNC(tetrahedral, pf32, avx2)
>+ DEFINE_INTERP_FUNC(tetrahedral, p16, avx2)
>+#endif
>+#if HAVE_AVX_EXTERNAL
>+ DEFINE_INTERP_FUNC(tetrahedral, pf32, avx)
>+ DEFINE_INTERP_FUNC(tetrahedral, p16, avx)
>+#endif
>+ DEFINE_INTERP_FUNC(tetrahedral, pf32, sse2)
>+ DEFINE_INTERP_FUNC(tetrahedral, p16, sse2)
>+#endif
>+
>+
>+av_cold void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc)
>+{
>+ int cpu_flags = av_get_cpu_flags();
>+ int planar = desc->flags & AV_PIX_FMT_FLAG_PLANAR;
>+ int isfloat = desc->flags & AV_PIX_FMT_FLAG_FLOAT;
>+ int depth = desc->comp[0].depth;
>+
>+#if ARCH_X86_64
>+ if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interpolation == INTERPOLATE_TETRAHEDRAL && planar) {
>+#if HAVE_AVX2_EXTERNAL
>+ if (isfloat && planar) {
>+ s->interp = interp_tetrahedral_pf32_avx2;
>+ } else if (depth == 16) {
>+ s->interp = interp_tetrahedral_p16_avx2;
>+ }
>+#endif
>+ } else if (EXTERNAL_AVX_FAST(cpu_flags) && s->interpolation == INTERPOLATE_TETRAHEDRAL && planar) {
>+#if HAVE_AVX_EXTERNAL
>+ if (isfloat) {
>+ s->interp = interp_tetrahedral_pf32_avx;
>+ } else if (depth == 16) {
>+ s->interp = interp_tetrahedral_p16_avx;
>+ }
>+#endif
>+ } else if (EXTERNAL_SSE2(cpu_flags) && s->interpolation == INTERPOLATE_TETRAHEDRAL && planar) {
>+ if (isfloat) {
>+ s->interp = interp_tetrahedral_pf32_sse2;
>+ } else if (depth == 16) {
>+ s->interp = interp_tetrahedral_p16_sse2;
>+ }
>+ }
>+#endif
>+}
>--
>2.31.1.windows.1
>
>_______________________________________________
>ffmpeg-devel mailing list
>ffmpeg-devel at ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
More information about the ffmpeg-devel
mailing list