[FFmpeg-devel] [PATCH] avfilter/vf_lut3d: add x86-optimized tetrahedral interpolation
Mark Reid
mindmark at gmail.com
Mon Oct 4 06:49:52 EEST 2021
On Wed, Sep 29, 2021 at 10:27 AM Mark Reid <mindmark at gmail.com> wrote:
>
>
> On Tue, Sep 28, 2021 at 6:38 PM chen <chenm003 at 163.com> wrote:
>
>> Hello,
>>
>>
>> Excuse me, how about using FMADD on AVX2 platforms?
>>
>>
>> For example
>> + mulps m7, m7, m14
>> + addps m0, m0, m7
>>
>> ==>
>>
>>
>> fmadd231ps m0,m7,m14
>>
>>
> Interesting, does having AVX2 guarantee having FMA instructions?
>
>
I'm still not 100% certain that all AVX2 CPUs have FMA instructions, so I'll
add a cpuflags check for FMA too. I also came up with a faster way to
calculate x0, x1, x2 without the lookup table.
I will send a new patch.
>
>> Regards,
>> Min Chen
>>
>>
>> 2021-09-29 09:18:05,mindmark at gmail.com
>> >From: Mark Reid <mindmark at gmail.com>
>> >
>> >Only supports float and 16-bit planar formats at the moment.
>> >Mainly focused on AVX and AVX2 optimizations, but SSE2 does seem to offer
>> some
>> >speed gains.
>> >
>> >f32 1920x1080 1 thread with prelut
>> >c impl
>> >1389936500 UNITS in lut3d->interp, 1 runs, 0 skips
>> >1425800240 UNITS in lut3d->interp, 2 runs, 0 skips
>> >1433312777 UNITS in lut3d->interp, 4 runs, 0 skips
>> >1443346798 UNITS in lut3d->interp, 8 runs, 0 skips
>> >
>> >sse2
>> >948662320 UNITS in lut3d->interp, 1 runs, 0 skips
>> >1101247540 UNITS in lut3d->interp, 2 runs, 0 skips
>> >1050645695 UNITS in lut3d->interp, 4 runs, 0 skips
>> >1041102937 UNITS in lut3d->interp, 8 runs, 0 skips
>> >
>> >avx
>> >633837000 UNITS in lut3d->interp, 1 runs, 0 skips
>> >669452850 UNITS in lut3d->interp, 2 runs, 0 skips
>> >650716580 UNITS in lut3d->interp, 4 runs, 0 skips
>> >644698550 UNITS in lut3d->interp, 8 runs, 0 skips
>> >
>> >avx2
>> >354940020 UNITS in lut3d->interp, 1 runs, 0 skips
>> >362384340 UNITS in lut3d->interp, 2 runs, 0 skips
>> >356799020 UNITS in lut3d->interp, 4 runs, 0 skips
>> >357276815 UNITS in lut3d->interp, 8 runs, 0 skips
>> >
>> >gbrap16 1920x1080 1 thread with prelut
>> >c impl
>> >1445071160 UNITS in lut3d->interp, 1 runs, 0 skips
>> >1477959120 UNITS in lut3d->interp, 2 runs, 0 skips
>> >1472102670 UNITS in lut3d->interp, 4 runs, 0 skips
>> >1462579330 UNITS in lut3d->interp, 8 runs, 0 skips
>> >
>> >sse2
>> >1035437580 UNITS in lut3d->interp, 1 runs, 0 skips
>> >1050139710 UNITS in lut3d->interp, 2 runs, 0 skips
>> >1070147205 UNITS in lut3d->interp, 4 runs, 0 skips
>> >1064583037 UNITS in lut3d->interp, 8 runs, 0 skips
>> >
>> >avx
>> >678089880 UNITS in lut3d->interp, 1 runs, 0 skips
>> >679112485 UNITS in lut3d->interp, 2 runs, 0 skips
>> >695527212 UNITS in lut3d->interp, 4 runs, 0 skips
>> >691300053 UNITS in lut3d->interp, 8 runs, 0 skips
>> >
>> >avx2
>> >372671340 UNITS in lut3d->interp, 1 runs, 0 skips
>> >373449870 UNITS in lut3d->interp, 2 runs, 0 skips
>> >383725625 UNITS in lut3d->interp, 4 runs, 0 skips
>> >382860848 UNITS in lut3d->interp, 8 runs, 0 skips
>> >
>> >---
>> > libavfilter/lut3d.h | 83 ++++
>> > libavfilter/vf_lut3d.c | 61 +--
>> > libavfilter/x86/Makefile | 2 +
>> > libavfilter/x86/vf_lut3d.asm | 757 ++++++++++++++++++++++++++++++++
>> > libavfilter/x86/vf_lut3d_init.c | 88 ++++
>> > 5 files changed, 935 insertions(+), 56 deletions(-)
>> > create mode 100644 libavfilter/lut3d.h
>> > create mode 100644 libavfilter/x86/vf_lut3d.asm
>> > create mode 100644 libavfilter/x86/vf_lut3d_init.c
>> >
>> >diff --git a/libavfilter/lut3d.h b/libavfilter/lut3d.h
>> >new file mode 100644
>> >index 0000000000..ded2a036a5
>> >--- /dev/null
>> >+++ b/libavfilter/lut3d.h
>> >@@ -0,0 +1,83 @@
>> >+/*
>> >+ * Copyright (c) 2013 Clément Bœsch
>> >+ * Copyright (c) 2018 Paul B Mahol
>> >+ *
>> >+ * This file is part of FFmpeg.
>> >+ *
>> >+ * FFmpeg is free software; you can redistribute it and/or
>> >+ * modify it under the terms of the GNU Lesser General Public
>> >+ * License as published by the Free Software Foundation; either
>> >+ * version 2.1 of the License, or (at your option) any later version.
>> >+ *
>> >+ * FFmpeg is distributed in the hope that it will be useful,
>> >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> >+ * Lesser General Public License for more details.
>> >+ *
>> >+ * You should have received a copy of the GNU Lesser General Public
>> >+ * License along with FFmpeg; if not, write to the Free Software
>> >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>> >+ */
>> >+#ifndef AVFILTER_LUT3D_H
>> >+#define AVFILTER_LUT3D_H
>> >+
>> >+#include "libavutil/pixdesc.h"
>> >+#include "framesync.h"
>> >+#include "avfilter.h"
>> >+
>> >+enum interp_mode {
>> >+ INTERPOLATE_NEAREST,
>> >+ INTERPOLATE_TRILINEAR,
>> >+ INTERPOLATE_TETRAHEDRAL,
>> >+ INTERPOLATE_PYRAMID,
>> >+ INTERPOLATE_PRISM,
>> >+ NB_INTERP_MODE
>> >+};
>> >+
>> >+struct rgbvec {
>> >+ float r, g, b;
>> >+};
>> >+
>> >+/* 3D LUT don't often go up to level 32, but it is common to have a
>> Hald CLUT
>> >+ * of 512x512 (64x64x64) */
>> >+#define MAX_LEVEL 256
>> >+#define PRELUT_SIZE 65536
>> >+
>> >+typedef struct Lut3DPreLut {
>> >+ int size;
>> >+ float min[3];
>> >+ float max[3];
>> >+ float scale[3];
>> >+ float* lut[3];
>> >+} Lut3DPreLut;
>> >+
>> >+typedef struct LUT3DContext {
>> >+ const AVClass *class;
>> >+ struct rgbvec *lut;
>> >+ int lutsize;
>> >+ int lutsize2;
>> >+ struct rgbvec scale;
>> >+ int interpolation; ///<interp_mode
>> >+ char *file;
>> >+ uint8_t rgba_map[4];
>> >+ int step;
>> >+ avfilter_action_func *interp;
>> >+ Lut3DPreLut prelut;
>> >+#if CONFIG_HALDCLUT_FILTER
>> >+ uint8_t clut_rgba_map[4];
>> >+ int clut_step;
>> >+ int clut_bits;
>> >+ int clut_planar;
>> >+ int clut_float;
>> >+ int clut_width;
>> >+ FFFrameSync fs;
>> >+#endif
>> >+} LUT3DContext;
>> >+
>> >+typedef struct ThreadData {
>> >+ AVFrame *in, *out;
>> >+} ThreadData;
>> >+
>> >+void ff_lut3d_init_x86(LUT3DContext *s, const AVPixFmtDescriptor *desc);
>> >+
>> >+#endif /* AVFILTER_LUT3D_H */
>> >\ No newline at end of file
>> >diff --git a/libavfilter/vf_lut3d.c b/libavfilter/vf_lut3d.c
>> >index 9fbda833b9..1fd0af06db 100644
>> >--- a/libavfilter/vf_lut3d.c
>> >+++ b/libavfilter/vf_lut3d.c
>> >@@ -31,73 +31,18 @@
>> > #include "libavutil/intreadwrite.h"
>> > #include "libavutil/intfloat.h"
>> > #include "libavutil/avassert.h"
>> >-#include "libavutil/pixdesc.h"
>> > #include "libavutil/avstring.h"
>> >-#include "avfilter.h"
>> > #include "drawutils.h"
>> > #include "formats.h"
>> >-#include "framesync.h"
>> > #include "internal.h"
>> > #include "video.h"
>> >+#include "lut3d.h"
>> >
>> > #define R 0
>> > #define G 1
>> > #define B 2
>> > #define A 3
>> >
>> >-enum interp_mode {
>> >- INTERPOLATE_NEAREST,
>> >- INTERPOLATE_TRILINEAR,
>> >- INTERPOLATE_TETRAHEDRAL,
>> >- INTERPOLATE_PYRAMID,
>> >- INTERPOLATE_PRISM,
>> >- NB_INTERP_MODE
>> >-};
>> >-
>> >-struct rgbvec {
>> >- float r, g, b;
>> >-};
>> >-
>> >-/* 3D LUT don't often go up to level 32, but it is common to have a
>> Hald CLUT
>> >- * of 512x512 (64x64x64) */
>> >-#define MAX_LEVEL 256
>> >-#define PRELUT_SIZE 65536
>> >-
>> >-typedef struct Lut3DPreLut {
>> >- int size;
>> >- float min[3];
>> >- float max[3];
>> >- float scale[3];
>> >- float* lut[3];
>> >-} Lut3DPreLut;
>> >-
>> >-typedef struct LUT3DContext {
>> >- const AVClass *class;
>> >- int interpolation; ///<interp_mode
>> >- char *file;
>> >- uint8_t rgba_map[4];
>> >- int step;
>> >- avfilter_action_func *interp;
>> >- struct rgbvec scale;
>> >- struct rgbvec *lut;
>> >- int lutsize;
>> >- int lutsize2;
>> >- Lut3DPreLut prelut;
>> >-#if CONFIG_HALDCLUT_FILTER
>> >- uint8_t clut_rgba_map[4];
>> >- int clut_step;
>> >- int clut_bits;
>> >- int clut_planar;
>> >- int clut_float;
>> >- int clut_width;
>> >- FFFrameSync fs;
>> >-#endif
>> >-} LUT3DContext;
>> >-
>> >-typedef struct ThreadData {
>> >- AVFrame *in, *out;
>> >-} ThreadData;
>> >-
>> > #define OFFSET(x) offsetof(LUT3DContext, x)
>> > #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
>> > #define TFLAGS
>> AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
>> >@@ -1207,6 +1152,10 @@ static int config_input(AVFilterLink *inlink)
>> > av_assert0(0);
>> > }
>> >
>> >+ if (ARCH_X86) {
>> >+ ff_lut3d_init_x86(lut3d, desc);
>> >+ }
>> >+
>> > return 0;
>> > }
>> >
>> >diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
>> >index 016a5b3511..a29941eaeb 100644
>> >--- a/libavfilter/x86/Makefile
>> >+++ b/libavfilter/x86/Makefile
>> >@@ -17,6 +17,7 @@ OBJS-$(CONFIG_HQDN3D_FILTER) +=
>> x86/vf_hqdn3d_init.o
>> > OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o
>> > OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_tinterlace_init.o
>> > OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o
>> >+OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d_init.o
>> > OBJS-$(CONFIG_MASKEDCLAMP_FILTER) +=
>> x86/vf_maskedclamp_init.o
>> > OBJS-$(CONFIG_MASKEDMERGE_FILTER) +=
>> x86/vf_maskedmerge_init.o
>> > OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o
>> >@@ -57,6 +58,7 @@ X86ASM-OBJS-$(CONFIG_HQDN3D_FILTER) +=
>> x86/vf_hqdn3d.o
>> > X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
>> > X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
>> > X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o
>> >+X86ASM-OBJS-$(CONFIG_LUT3D_FILTER) += x86/vf_lut3d.o
>> > X86ASM-OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += x86/vf_maskedclamp.o
>> > X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o
>> > X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o
>> >diff --git a/libavfilter/x86/vf_lut3d.asm b/libavfilter/x86/vf_lut3d.asm
>> >new file mode 100644
>> >index 0000000000..b3d7c3962b
>> >--- /dev/null
>> >+++ b/libavfilter/x86/vf_lut3d.asm
>> >@@ -0,0 +1,757 @@
>>
>> >+;*****************************************************************************
>> >+;* x86-optimized functions for lut3d filter
>> >+;*
>> >+;* Copyright (c) 2021 Mark Reid <mindmark at gmail.com>
>> >+;*
>> >+;* This file is part of FFmpeg.
>> >+;*
>> >+;* FFmpeg is free software; you can redistribute it and/or
>> >+;* modify it under the terms of the GNU Lesser General Public
>> >+;* License as published by the Free Software Foundation; either
>> >+;* version 2.1 of the License, or (at your option) any later version.
>> >+;*
>> >+;* FFmpeg is distributed in the hope that it will be useful,
>> >+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>> >+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> >+;* Lesser General Public License for more details.
>> >+;*
>> >+;* You should have received a copy of the GNU Lesser General Public
>> >+;* License along with FFmpeg; if not, write to the Free Software
>> >+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>>
>> >+;******************************************************************************
>> >+
>> >+%include "libavutil/x86/x86util.asm"
>> >+
>> >+SECTION_RODATA
>> >+pd_1f: times 8 dd 1.0
>> >+pd_3f: times 8 dd 3.0
>> >+
>> >+; used to limit rshifts as they are more expensive in avx1
>> >+pd_001: times 8 dd 001b
>> >+pd_010: times 8 dd 010b
>> >+pd_100: times 8 dd 100b
>> >+
>> >+pd_65535f: times 8 dd 65535.0
>> >+pd_65535_invf: times 8 dd 0x37800080 ;1.0/65535.0
>> >+
>> >+pb_shuffle16: db 0, 1, 0x80, 0x80, \
>> >+ 2, 3, 0x80, 0x80, \
>> >+ 4, 5, 0x80, 0x80, \
>> >+ 6, 7, 0x80, 0x80
>> >+
>> >+pb_lo_pack_shuffle16: db 0, 1, 4, 5, \
>> >+ 8, 9, 12, 13, \
>> >+ 0x80, 0x80, 0x80, 0x80, \
>> >+ 0x80, 0x80, 0x80, 0x80
>> >+
>> >+pb_hi_pack_shuffle16: db 0x80, 0x80, 0x80, 0x80, \
>> >+ 0x80, 0x80, 0x80, 0x80, \
>> >+ 0, 1, 4, 5, \
>> >+ 8, 9, 12, 13
>> >+
>> >+; tetrahedral table --------------------------------------------
>> >+; name: x2| x1| x0| cxxb| cxxa
>> >+; values: r 00| r 00| r 00| c011 011| c001 001
>> >+; g 01| g 01| g 01| c101 101| c010 010
>> >+; b 10| b 10| b 10| c110 110| c100 100
>> >+
>> >+; g>b b | g | r |
>> c110 | c100
>> >+pd_tetra_table0: times 8 dd (10b << 10) | (01b << 8) | (00b << 6) |
>> (110b << 3) | 100b
>> >+; r>b g | b | r |
>> c101 | c100
>> >+pd_tetra_table1: times 8 dd (01b << 10) | (10b << 8) | (00b << 6) |
>> (101b << 3) | 100b
>> >+; else g | r | b |
>> c101 | c001
>> >+pd_tetra_table2: times 8 dd (01b << 10) | (00b << 8) | (10b << 6) |
>> (101b << 3) | 001b
>> >+; b>g r | g | b |
>> c011 | c001
>> >+pd_tetra_table3: times 8 dd (00b << 10) | (01b << 8) | (10b << 6) |
>> (011b << 3) | 001b
>> >+; b>r r | b | g |
>> c011 | c010
>> >+pd_tetra_table4: times 8 dd (00b << 10) | (10b << 8) | (01b << 6) |
>> (011b << 3) | 010b
>> >+; else b | r | g |
>> c110 | c010
>> >+pd_tetra_table5: times 8 dd (10b << 10) | (00b << 8) | (01b << 6) |
>> (110b << 3) | 010b
>> >+
>> >+SECTION .text
>> >+
>> >+struc Lut3DPreLut
>> >+ .size: resd 1
>> >+ .min: resd 3
>> >+ .max: resd 3
>> >+ .scale: resd 3
>> >+ .lut: resq 3
>> >+endstruc
>> >+
>> >+struc LUT3DContext
>> >+ .class: resq 1
>> >+ .lut: resq 1
>> >+ .lutsize: resd 1
>> >+ .lutsize2: resd 1
>> >+ .scale: resd 3
>> >+endstruc
>> >+
>> >+%define AV_NUM_DATA_POINTERS 8
>> >+
>> >+struc AVFrame
>> >+ .data: resq AV_NUM_DATA_POINTERS
>> >+ .linesize: resd AV_NUM_DATA_POINTERS
>> >+ .extended_data: resq 1
>> >+ .width: resd 1
>> >+ .height: resd 1
>> >+endstruc
>> >+
>> >+%define rm rsp
>> >+%define gm rsp+mmsize
>> >+%define bm rsp+(mmsize*2)
>> >+
>> >+%define lut3dsizem [rsp+mmsize*3]
>> >+%define lut3dsize2m [rsp+mmsize*4]
>> >+%define lut3dmaxm [rsp+mmsize*5]
>> >+%define prelutmaxm [rsp+mmsize*6]
>> >+
>> >+%define scalerm [rsp+mmsize*7]
>> >+%define scalegm [rsp+mmsize*8]
>> >+%define scalebm [rsp+mmsize*9]
>> >+
>> >+%define prelutminrm [rsp+mmsize*10]
>> >+%define prelutmingm [rsp+mmsize*11]
>> >+%define prelutminbm [rsp+mmsize*12]
>> >+
>> >+%define prelutscalerm [rsp+mmsize*13]
>> >+%define prelutscalegm [rsp+mmsize*14]
>> >+%define prelutscalebm [rsp+mmsize*15]
>> >+
>> >+; data pointers
>> >+%define srcrm [rsp+mmsize*16 + 0]
>> >+%define srcgm [rsp+mmsize*16 + 8]
>> >+%define srcbm [rsp+mmsize*16 + 16]
>> >+%define srcam [rsp+mmsize*16 + 24]
>> >+
>> >+%define dstrm [rsp+mmsize*16 + 32]
>> >+%define dstgm [rsp+mmsize*16 + 40]
>> >+%define dstbm [rsp+mmsize*16 + 48]
>> >+%define dstam [rsp+mmsize*16 + 56]
>> >+
>> >+%macro FETCH_PRELUT_PN 3
>> >+ mov tmp2d, [rm + %3]
>> >+ mov tmp3d, [gm + %3]
>> >+ movss xm%1, [tmpq + tmp2q*4]
>> >+ movss xm%2, [tmpq + tmp3q*4]
>> >+ movss [rm + %3], xm%1
>> >+ movss [gm + %3], xm%2
>> >+%endmacro
>> >+
>> >+; 1 - p
>> >+; 2 - n
>> >+; 3 - p indices
>> >+; 4 - n indices
>> >+%macro GATHER_PRELUT 4
>> >+ %if cpuflag(avx2)
>> >+ vpcmpeqb m7, m7
>> >+ vgatherdps m%1, [tmpq + m%3*4], m7 ; p
>> >+ vpcmpeqb m9, m9
>> >+ vgatherdps m%2, [tmpq + m%4*4], m9 ; n
>> >+ %else
>> >+ mova [rm], m%3
>> >+ mova [gm], m%4
>> >+ FETCH_PRELUT_PN %1, %2, 0
>> >+ FETCH_PRELUT_PN %1, %2, 4
>> >+ FETCH_PRELUT_PN %1, %2, 8
>> >+ FETCH_PRELUT_PN %1, %2, 12
>> >+ %if mmsize > 16
>> >+ FETCH_PRELUT_PN %1, %2, 16
>> >+ FETCH_PRELUT_PN %1, %2, 20
>> >+ FETCH_PRELUT_PN %1, %2, 24
>> >+ FETCH_PRELUT_PN %1, %2, 28
>> >+ %endif
>> >+ movu m%1, [rm]
>> >+ movu m%2, [gm]
>> >+ %endif
>> >+%endmacro
>> >+
>> >+%macro FLOORPS 2
>> >+ %if mmsize > 16
>> >+ vroundps %1, %2, 0x01
>> >+ %else
>> >+ cvttps2dq %1, %2
>> >+ cvtdq2ps %1, %1
>> >+ %endif
>> >+%endmacro
>> >+
>> >+; 1 - dst
>> >+; 2 - index
>> >+; 3 - min
>> >+; 4 - scale
>> >+; assumes lut max m13, m14 1.0f, zero m15
>> >+%macro APPLY_PRELUT 4
>> >+ ; scale
>> >+ subps m5, m%1, %3 ; v - min
>> >+ mulps m5, m5, %4 ; v * scale
>> >+ ; clamp
>> >+ maxps m5, m5, m15 ; max zero
>> >+ minps m5, m5, m13 ; min lut max
>> >+
>> >+ FLOORPS m3, m5 ; prev index
>> >+ subps m5, m5, m3 ; d
>> >+ addps m4, m3, m14 ; p+1 = n index
>> >+ minps m4, m4, m13 ; clamp n idex
>> >+
>> >+ mov tmpq, [prelutq + Lut3DPreLut.lut + %2*8]
>> >+ cvttps2dq m6, m3
>> >+ cvttps2dq m10, m4
>> >+ GATHER_PRELUT 3, 4, 6, 10
>> >+
>> >+ ; lerp
>> >+ subps m8, m4, m3
>> >+ mulps m8, m8, m5
>> >+ addps m%1, m8, m3
>> >+%endmacro
>> >+
>> >+; 1 - dst
>> >+; 2 - scale
>> >+; assumes lut max m13, zero m15
>> >+%macro APPLY_SCALE 2
>> >+ mulps m%1, m%1, %2
>> >+ maxps m%1, m%1, m15
>> >+ minps m%1, m%1, m13
>> >+%endmacro
>> >+
>> >+%macro BLEND 4
>> >+%if mmsize > 16
>> >+ vblendvps %1, %2, %3, %4
>> >+%else
>> >+ %ifidni %1,%2
>> >+ %error operand 1 must not equal operand 2
>> >+ %endif
>> >+ %ifidni %1,%3
>> >+ %error operand 1 must not equal operand 3
>> >+ %endif
>> >+ mova %1, %2
>> >+ xorps %1, %3
>> >+ andps %1, %4
>> >+ xorps %1, %2
>> >+%endif
>> >+%endmacro
>> >+
>> >+; sets NaNs to zero; +inf/-inf are handled later by the min/max clamps
>> >+%macro SANITIZE_F 1
>> >+ cmpps m5, %1, %1, 0x0 ; nan == nan = False
>> >+ %if mmsize <= 16
>> >+ mova m6, %1
>> >+ BLEND %1, m15, m6, m5
>> >+ %else
>> >+ BLEND %1, m15, %1, m5
>> >+ %endif
>> >+%endmacro
>> >+
>> >+%macro ADD3 4
>> >+ addps %1, %2, %3
>> >+ addps %1, %1, %4
>> >+%endmacro
>> >+
>> >+%macro CMP_EQUAL 3
>> >+%if cpuflag(avx2)
>> >+ vpcmpeqd %1, %2, %3
>> >+%elif cpuflag(avx)
>> >+ cmpps %1, %2, %3, 0x0
>> >+%else
>> >+ pcmpeqd %1, %2, %3
>> >+%endif
>> >+%endmacro
>> >+
>> >+%macro SHIFT_RIGHT 2
>> >+%if mmsize <= 16
>> >+ psrld xm%1, %2
>> >+%elif cpuflag(avx2)
>> >+ vpsrld m%1, m%1, %2
>> >+%else
>> >+ vextractf128 xm15, m%1, 1
>> >+ psrld xm%1, %2
>> >+ psrld xm15, %2
>> >+ vinsertf128 m%1, m%1, xm15, 1
>> >+%endif
>> >+%endmacro
>> >+
>> >+%macro FETCH_LUT3D_RGB 4
>> >+ mov tmp2d, [rm + %4]
>> >+ movss xm%1, [tmpq + tmp2q*4 + 0]
>> >+ movss xm%2, [tmpq + tmp2q*4 + 4]
>> >+ movss xm%3, [tmpq + tmp2q*4 + 8]
>> >+ movss [rm + %4], xm%1
>> >+ movss [gm + %4], xm%2
>> >+ movss [bm + %4], xm%3
>> >+%endmacro
>> >+
>> >+; 1 - dstr
>> >+; 2 - dstg
>> >+; 3 - dstb
>> >+; 4 - indices
>> >+%macro GATHER_LUT3D_INDICES 4
>> >+%if cpuflag(avx2)
>> >+ vpcmpeqb m3, m3
>> >+ vgatherdps m%1, [tmpq + m%4*4 + 0], m3
>> >+ vpcmpeqb m14, m14
>> >+ vgatherdps m%2, [tmpq + m%4*4 + 4], m14
>> >+ vpcmpeqb m15, m15
>> >+ vgatherdps m%3, [tmpq + m%4*4 + 8], m15
>> >+%else
>> >+ movu [rm], m%4
>> >+ FETCH_LUT3D_RGB %1, %2, %3, 0
>> >+ FETCH_LUT3D_RGB %1, %2, %3, 4
>> >+ FETCH_LUT3D_RGB %1, %2, %3, 8
>> >+ FETCH_LUT3D_RGB %1, %2, %3, 12
>> >+%if mmsize > 16
>> >+ FETCH_LUT3D_RGB %1, %2, %3, 16
>> >+ FETCH_LUT3D_RGB %1, %2, %3, 20
>> >+ FETCH_LUT3D_RGB %1, %2, %3, 24
>> >+ FETCH_LUT3D_RGB %1, %2, %3, 28
>> >+%endif
>> >+ movu m%1, [rm]
>> >+ movu m%2, [gm]
>> >+ movu m%3, [bm]
>> >+%endif
>> >+%endmacro
>> >+
>> >+%macro interp_tetrahedral 0
>> >+ %define d_r m0
>> >+ %define d_g m1
>> >+ %define d_b m2
>> >+
>> >+ %define prev_r m3
>> >+ %define prev_g m4
>> >+ %define prev_b m5
>> >+
>> >+ %define next_r m6
>> >+ %define next_g m7
>> >+ %define next_b m8
>> >+
>> >+ %define x0 m4
>> >+ %define x1 m5
>> >+ %define x2 m6
>> >+
>> >+ ; setup prev index
>> >+ FLOORPS prev_r, m0
>> >+ FLOORPS prev_g, m1
>> >+ FLOORPS prev_b, m2
>> >+
>> >+ ; setup deltas
>> >+ subps d_r, m0, prev_r
>> >+ subps d_g, m1, prev_g
>> >+ subps d_b, m2, prev_b
>> >+
>> >+ ; calculate select mask m9
>> >+ movu m6, [pd_tetra_table2]
>> >+ cmpps m7, d_r, d_b, 0x1E ; r > b CMP_GT_OQ
>> >+ BLEND m10, m6, [pd_tetra_table1], m7
>> >+ cmpps m7, d_g, d_b, 0x1E ; g > b CMP_GT_OQ
>> >+ BLEND m6, m10, [pd_tetra_table0], m7
>> >+
>> >+ movu m10, [pd_tetra_table5]
>> >+ cmpps m7, d_b, d_r, 0x1E ; b > r CMP_GT_OQ
>> >+ BLEND m9, m10, [pd_tetra_table4], m7
>> >+ cmpps m7, d_b, d_g, 0x1E ; b > g CMP_GT_OQ
>> >+ BLEND m10, m9, [pd_tetra_table3], m7
>> >+
>> >+ cmpps m7, d_r, d_g, 0x1E ; r > g CMP_GT_OQ
>> >+ BLEND m9, m10, m6, m7
>> >+
>> >+ ; setup next index
>> >+ addps next_r, prev_r, m14 ; +1
>> >+ minps next_r, next_r, m13 ; clamp lutmax
>> >+
>> >+ addps next_g, prev_g, m14 ; +1
>> >+ minps next_g, next_g, m13 ; clamp lutmax
>> >+
>> >+ addps next_b, prev_b, m14 ; +1
>> >+ minps next_b, next_b, m13 ; clamp lutmax
>> >+
>> >+ ; prescale indices
>> >+ mulps prev_r, prev_r, lut3dsize2m
>> >+ mulps next_r, next_r, lut3dsize2m
>> >+
>> >+ mulps prev_g, prev_g, lut3dsizem
>> >+ mulps next_g, next_g, lut3dsizem
>> >+
>> >+ mulps prev_b, prev_b, [pd_3f]
>> >+ mulps next_b, next_b, [pd_3f]
>> >+
>> >+ movu m14, [pd_001]
>> >+
>> >+ ; cxxa m10
>> >+ ; b
>> >+ andps m15, m9, m14
>> >+ CMP_EQUAL m15, m15, m14
>> >+ BLEND m10, prev_b, next_b, m15
>> >+
>> >+ ; g
>> >+ andps m15, m9, [pd_010]
>> >+ CMP_EQUAL m15, m15, [pd_010]
>> >+ BLEND m12, prev_g, next_g, m15
>> >+
>> >+ ; r
>> >+ andps m15, m9, [pd_100]
>> >+ CMP_EQUAL m15, m15, [pd_100]
>> >+ BLEND m13, prev_r, next_r, m15
>> >+
>> >+ ADD3 m10, m10, m12, m13
>> >+
>> >+ SHIFT_RIGHT 9, 3 ; 3
>> >+
>> >+ ; cxxb m11;
>> >+ ; b
>> >+ andps m15, m9, m14
>> >+ CMP_EQUAL m15, m15, m14
>> >+ BLEND m11, prev_b, next_b, m15
>> >+
>> >+ ; g
>> >+ andps m15, m9, [pd_010]
>> >+ CMP_EQUAL m15, m15, [pd_010]
>> >+ BLEND m12, prev_g, next_g, m15
>> >+
>> >+ ; r
>> >+ andps m15, m9, [pd_100]
>> >+ CMP_EQUAL m15, m15, [pd_100]
>> >+ BLEND m13, prev_r, next_r, m15
>> >+
>> >+ ADD3 m11, m11, m12, m13
>> >+
>> >+ ; c000 m12;
>> >+ ADD3 m12, prev_r, prev_g, prev_b
>> >+
>> >+ ; c111 m13;
>> >+ ADD3 m13, next_r, next_g, next_b
>> >+
>> >+ SHIFT_RIGHT 9, 3 ; 6
>> >+
>> >+ ; x0, m4
>> >+ andps m15, m9, m14
>> >+ CMP_EQUAL m15, m15, m14
>> >+ BLEND m7, d_r, d_g, m15 ; r,g
>> >+
>> >+ andps m15, m9, [pd_010]
>> >+ CMP_EQUAL m15, m15, [pd_010]
>> >+ BLEND x0, m7, d_b, m15 ; b
>> >+
>> >+ ; x1, m5
>> >+ andps m15, m9, [pd_100]
>> >+ CMP_EQUAL m15, m15, [pd_100]
>> >+ BLEND m7, d_r, d_g, m15 ; r,g
>> >+
>> >+ SHIFT_RIGHT 9, 3 ; 9
>> >+
>> >+ andps m15, m9, m14
>> >+ CMP_EQUAL m15, m15, m14
>> >+ BLEND x1, m7, d_b, m15 ; b
>> >+
>> >+ ; x2, m6
>> >+ andps m15, m9, [pd_010]
>> >+ CMP_EQUAL m15, m15, [pd_010]
>> >+ BLEND m7, d_r, d_g, m15 ; r,g
>> >+
>> >+ andps m15, m9, [pd_100]
>> >+ CMP_EQUAL m15, m15, [pd_100]
>> >+ BLEND x2, m7, d_b, m15 ; b
>> >+
>> >+ ; convert indices to integer
>> >+ cvttps2dq m12, m12
>> >+ cvttps2dq m10, m10
>> >+ cvttps2dq m11, m11
>> >+ cvttps2dq m13, m13
>> >+
>> >+ ; now the gathering festival
>> >+ mov tmpq, [ctxq + LUT3DContext.lut]
>> >+
>> >+ GATHER_LUT3D_INDICES 0, 1, 2, 12
>> >+ movu m14, [pd_1f]
>> >+ subps m14, m14, x0; 1 - x0
>> >+
>> >+ mulps m0, m0, m14
>> >+ mulps m1, m1, m14
>> >+ mulps m2, m2, m14
>> >+
>> >+ GATHER_LUT3D_INDICES 7, 8, 9, 10
>> >+ subps m14, x0, x1; x0 - x1
>> >+ mulps m7, m7, m14
>> >+ addps m0, m0, m7
>> >+
>> >+ mulps m8, m8, m14
>> >+ addps m1, m1, m8
>> >+
>> >+ mulps m9, m9, m14
>> >+ addps m2, m2, m9
>> >+
>> >+ GATHER_LUT3D_INDICES 7, 8, 9, 11
>> >+ subps m14, x1, x2; x1 - x2
>> >+
>> >+ mulps m7, m7, m14
>> >+ addps m0, m0, m7
>> >+
>> >+ mulps m8, m8, m14
>> >+ addps m1, m1, m8
>> >+
>> >+ mulps m9, m9, m14
>> >+ addps m2, m2, m9
>> >+
>> >+ GATHER_LUT3D_INDICES 7, 8, 9, 13
>> >+ mulps m7, m7, x2
>> >+ addps m0, m0, m7
>> >+
>> >+ mulps m8, m8, x2
>> >+ addps m1, m1, m8
>> >+
>> >+ mulps m9, m9, x2
>> >+ addps m2, m2, m9
>> >+%endmacro
>> >+
>> >+%macro INIT_DATA_PTR 3
>> >+ mov ptrq, [%2 + AVFrame.data + %3 * 8]
>> >+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4]
>> >+ imul tmpd, slice_startd
>> >+ add ptrq, tmpq
>> >+ mov %1, ptrq
>> >+%endmacro
>> >+
>> >+%macro INC_DATA_PTR 3
>> >+ mov tmpd, [%2 + AVFrame.linesize + %3 * 4]
>> >+ mov ptrq, %1
>> >+ add ptrq, tmpq
>> >+ mov %1, ptrq
>> >+%endmacro
>> >+
>> >+%macro LOAD16 2
>> >+ mov ptrq, %2
>> >+ %if mmsize > 16
>> >+ movu xm%1, [ptrq + xq*2]
>> >+ %else
>> >+ movsd xm%1, [ptrq + xq*2]
>> >+ %endif
>> >+ %if cpuflag(avx2)
>> >+ vpmovzxwd m%1, xm%1
>> >+ %else
>> >+ %if mmsize > 16
>> >+ pshufd xm4, xm%1, (1 << 6 | 0 << 4 | 3 << 2 | 2 << 0)
>> >+ pshufb xm%1, xm6 ; pb_shuffle16
>> >+ pshufb xm4, xm6 ; pb_shuffle16
>> >+ vinsertf128 m%1, m%1, xm4, 1
>> >+ %else
>> >+ pshufd xm%1, xm%1, (3 << 6 | 1 << 4 | 3 << 2 | 0 << 0)
>> >+ pshuflw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>> >+ pshufhw xm%1, xm%1, (2 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>> >+ %endif
>> >+ %endif
>> >+ cvtdq2ps m%1, m%1
>> >+ mulps m%1, m%1, m7 ; pd_65535_invf
>> >+%endmacro
>> >+
>> >+%macro STORE16 2
>> >+ mulps m%2, m%2, m5 ; [pd_65535f]
>> >+ minps m%2, m%2, m5 ; [pd_65535f]
>> >+ maxps m%2, m%2, m15 ; zero
>> >+ cvttps2dq m%2, m%2
>> >+ %if mmsize > 16
>> >+ vextractf128 xm4, m%2, 1
>> >+ pshufb xm%2, xm6 ; [pb_lo_pack_shuffle16]
>> >+ pshufb xm4, xm7 ; [pb_hi_pack_shuffle16]
>> >+ por xm%2, xm4
>> >+ %else
>> >+ pshuflw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>> >+ pshufhw xm%2, xm%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
>> >+ pshufd xm%2, xm%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0)
>> >+ %endif
>> >+ mov ptrq, %1
>> >+ %if mmsize > 16
>> >+ movu [ptrq + xq*2], xm%2
>> >+ %else
>> >+ movsd [ptrq + xq*2], xm%2
>> >+ %endif
>> >+%endmacro
>> >+
>> >+; 1 - interp method
>> >+; 2 - format_name
>> >+; 3 - depth
>> >+; 4 - is float format
>> >+%macro DEFINE_INTERP_FUNC 4
>> >+cglobal interp_%1_%2, 7, 13, 16, mmsize*16+(8*8), ctx, prelut,
>> src_image, dst_image, slice_start, slice_end, has_alpha, width, x, ptr,
>> tmp, tmp2, tmp3
>> >+ ; store lut max and lutsize
>> >+ mov tmpd, dword [ctxq + LUT3DContext.lutsize]
>> >+ cvtsi2ss xm0, tmpd
>> >+ mulss xm0, xm0, [pd_3f]
>> >+ VBROADCASTSS m0, xm0
>> >+ mova lut3dsizem, m0
>> >+ sub tmpd, 1
>> >+ cvtsi2ss xm0, tmpd
>> >+ VBROADCASTSS m0, xm0
>> >+ mova lut3dmaxm, m0
>> >+
>> >+ ; scale_r
>> >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 0*4]
>> >+ VBROADCASTSS m1, xm1
>> >+ mova scalerm, m1
>> >+
>> >+ ; scale_g
>> >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 1*4]
>> >+ VBROADCASTSS m1, xm1
>> >+ mova scalegm, m1
>> >+
>> >+ ; scale_b
>> >+ mulss xm1, xm0, dword [ctxq + LUT3DContext.scale + 2*4]
>> >+ VBROADCASTSS m1, xm1
>> >+ mova scalebm, m1
>> >+
>> >+ ; store lutsize2
>> >+ cvtsi2ss xm0, dword [ctxq + LUT3DContext.lutsize2]
>> >+ mulss xm0, xm0, [pd_3f]
>> >+ VBROADCASTSS m0, xm0
>> >+ mova lut3dsize2m, m0
>> >+
>> >+ ; init prelut values
>> >+ cmp prelutq, 0
>> >+ je %%skip_init_prelut
>> >+ mov tmpd, dword [prelutq + Lut3DPreLut.size]
>> >+ sub tmpd, 1
>> >+ cvtsi2ss xm0, tmpd
>> >+ VBROADCASTSS m0, xm0
>> >+ mova prelutmaxm, m0
>> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 0*4]
>> >+ mova prelutminrm, m0
>> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 1*4]
>> >+ mova prelutmingm, m0
>> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.min + 2*4]
>> >+ mova prelutminbm, m0
>> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 0*4]
>> >+ mova prelutscalerm, m0
>> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 1*4]
>> >+ mova prelutscalegm, m0
>> >+ VBROADCASTSS m0, dword [prelutq + Lut3DPreLut.scale + 2*4]
>> >+ mova prelutscalebm, m0
>> >+ %%skip_init_prelut:
>> >+
>> >+ mov widthd, [src_imageq + AVFrame.width]
>> >+
>> >+ ; gbra pixel order
>> >+ INIT_DATA_PTR srcrm, src_imageq, 2
>> >+ INIT_DATA_PTR srcgm, src_imageq, 0
>> >+ INIT_DATA_PTR srcbm, src_imageq, 1
>> >+ INIT_DATA_PTR srcam, src_imageq, 3
>> >+
>> >+ INIT_DATA_PTR dstrm, dst_imageq, 2
>> >+ INIT_DATA_PTR dstgm, dst_imageq, 0
>> >+ INIT_DATA_PTR dstbm, dst_imageq, 1
>> >+ INIT_DATA_PTR dstam, dst_imageq, 3
>> >+
>> >+ %%loop_y:
>> >+ xor xq, xq
>> >+ %%loop_x:
>> >+ movu m14, [pd_1f]
>> >+ xorps m15, m15, m15
>> >+ %if %4 ; float
>> >+ mov ptrq, srcrm
>> >+ movu m0, [ptrq + xq*4]
>> >+ mov ptrq, srcgm
>> >+ movu m1, [ptrq + xq*4]
>> >+ mov ptrq, srcbm
>> >+ movu m2, [ptrq + xq*4]
>> >+ SANITIZE_F m0
>> >+ SANITIZE_F m1
>> >+ SANITIZE_F m2
>> >+ %else
>> >+ ; constants for LOAD16
>> >+ movu m7, [pd_65535_invf]
>> >+ %if notcpuflag(avx2) && mmsize >= 32
>> >+ movu xm6, [pb_shuffle16]
>> >+ %endif
>> >+ LOAD16 0, srcrm
>> >+ LOAD16 1, srcgm
>> >+ LOAD16 2, srcbm
>> >+ %endif
>> >+
>> >+ cmp prelutq, 0
>> >+ je %%skip_prelut
>> >+ mova m13, prelutmaxm
>> >+ APPLY_PRELUT 0, 0, prelutminrm, prelutscalerm
>> >+ APPLY_PRELUT 1, 1, prelutmingm, prelutscalegm
>> >+ APPLY_PRELUT 2, 2, prelutminbm, prelutscalebm
>> >+ %%skip_prelut:
>> >+
>> >+ mova m13, lut3dmaxm
>> >+ APPLY_SCALE 0, scalerm
>> >+ APPLY_SCALE 1, scalegm
>> >+ APPLY_SCALE 2, scalebm
>> >+
>> >+ interp_%1
>> >+
>> >+ %if %4 ; float
>> >+ mov ptrq, dstrm
>> >+ movu [ptrq + xq*4], m0
>> >+ mov ptrq, dstgm
>> >+ movu [ptrq + xq*4], m1
>> >+ mov ptrq, dstbm
>> >+ movu [ptrq + xq*4], m2
>> >+ cmp has_alphad, 0
>> >+ je %%skip_alphaf
>> >+ mov ptrq, srcam
>> >+ movu m0, [ptrq + xq*4]
>> >+ mov ptrq, dstam
>> >+ movu [ptrq + xq*4], m0
>> >+ %%skip_alphaf:
>> >+ %else
>> >+ ; constants for STORE16
>> >+ movu m5, [pd_65535f]
>> >+ %if mmsize > 16
>> >+ movu xm6, [pb_lo_pack_shuffle16]
>> >+ movu xm7, [pb_hi_pack_shuffle16]
>> >+ %endif
>> >+
>> >+ xorps m15, m15, m15
>> >+ STORE16 dstrm, 0
>> >+ STORE16 dstgm, 1
>> >+ STORE16 dstbm, 2
>> >+
>> >+ cmp has_alphad, 0
>> >+ je %%skip_alpha
>> >+ %if mmsize > 16
>> >+ mov ptrq, srcam
>> >+ movu xm0, [ptrq + xq*2]
>> >+ mov ptrq, dstam
>> >+ movu [ptrq + xq*2], xm0
>> >+ %else
>> >+ mov ptrq, srcam
>> >+ movsd xm0, [ptrq + xq*2]
>> >+ mov ptrq, dstam
>> >+ movsd [ptrq + xq*2], xm0
>> >+ %endif
>> >+
>> >+ %%skip_alpha:
>> >+ %endif
>> >+
>> >+ add xq, mmsize/4
>> >+ cmp xd, widthd
>> >+ jl %%loop_x
>> >+
>> >+ INC_DATA_PTR srcrm, src_imageq, 2
>> >+ INC_DATA_PTR srcgm, src_imageq, 0
>> >+ INC_DATA_PTR srcbm, src_imageq, 1
>> >+ INC_DATA_PTR srcam, src_imageq, 3
>> >+
>> >+ INC_DATA_PTR dstrm, dst_imageq, 2
>> >+ INC_DATA_PTR dstgm, dst_imageq, 0
>> >+ INC_DATA_PTR dstbm, dst_imageq, 1
>> >+ INC_DATA_PTR dstam, dst_imageq, 3
>> >+
>> >+ inc slice_startd
>> >+ cmp slice_startd, slice_endd
>> >+ jl %%loop_y
>> >+
>> >+ RET
>> >+%endmacro
>> >+%if ARCH_X86_64
>> >+ %if HAVE_AVX2_EXTERNAL
>> >+ INIT_YMM avx2
>> >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
>> >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
>> >+ %endif
>> >+ %if HAVE_AVX_EXTERNAL
>> >+ INIT_YMM avx
>> >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
>> >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
>> >+ %endif
>> >+ INIT_XMM sse2
>> >+ DEFINE_INTERP_FUNC tetrahedral, pf32, 32, 1
>> >+ DEFINE_INTERP_FUNC tetrahedral, p16, 16, 0
>> >+%endif
>> >\ No newline at end of file
>> >diff --git a/libavfilter/x86/vf_lut3d_init.c
>> b/libavfilter/x86/vf_lut3d_init.c
>> >new file mode 100644
>> >index 0000000000..9b9b36e4af
>> >--- /dev/null
>> >+++ b/libavfilter/x86/vf_lut3d_init.c
>> >@@ -0,0 +1,88 @@
>> >+/*
>> >+ * Copyright (c) 2021 Mark Reid <mindmark at gmail.com>
>> >+ *
>> >+ * This file is part of FFmpeg.
>> >+ *
>> >+ * FFmpeg is free software; you can redistribute it and/or
>> >+ * modify it under the terms of the GNU Lesser General Public
>> >+ * License as published by the Free Software Foundation; either
>> >+ * version 2.1 of the License, or (at your option) any later version.
>> >+ *
>> >+ * FFmpeg is distributed in the hope that it will be useful,
>> >+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> >+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> >+ * Lesser General Public License for more details.
>> >+ *
>> >+ * You should have received a copy of the GNU Lesser General Public
>> >+ * License along with FFmpeg; if not, write to the Free Software
>> >+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>> >+ */
>> >+
>> >+#include "libavutil/attributes.h"
>> >+#include "libavutil/cpu.h"
>> >+#include "libavutil/x86/cpu.h"
>> >+#include "libavfilter/lut3d.h"
>> >+
>> >+#define DEFINE_INTERP_FUNC(name, format, opt)
>>
>> \
>> >+void ff_interp_##name##_##format##_##opt(LUT3DContext *lut3d,
>> Lut3DPreLut *prelut, AVFrame *src, AVFrame *dst, int slice_start, int
>> slice_end, int has_alpha); \
>> >+static int interp_##name##_##format##_##opt(AVFilterContext *ctx, void
>> *arg, int jobnr, int nb_jobs)
>> \
>> >+{
>>
>> \
>> >+ LUT3DContext *lut3d = ctx->priv;
>>
>> \
>> >+ Lut3DPreLut *prelut = lut3d->prelut.size > 0? &lut3d->prelut:
>> NULL;
>> \
>> >+ ThreadData *td = arg;
>>
>> \
>> >+ AVFrame *in = td->in;
>>
>> \
>> >+ AVFrame *out = td->out;
>>
>> \
>> >+ int has_alpha = in->linesize[3] && out != in;
>>
>> \
>> >+ int slice_start = (in->height * jobnr ) / nb_jobs;
>>
>> \
>> >+ int slice_end = (in->height * (jobnr+1)) / nb_jobs;
>>
>> \
>> >+ ff_interp_##name##_##format##_##opt(lut3d, prelut, in, out,
>> slice_start, slice_end, has_alpha);
>> \
>> >+ return 0;
>>
>> \
>> >+}
>> >+
>> >+#if ARCH_X86_64
>> >+#if HAVE_AVX2_EXTERNAL
>> >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, avx2)
>> >+ DEFINE_INTERP_FUNC(tetrahedral, p16, avx2)
>> >+#endif
>> >+#if HAVE_AVX_EXTERNAL
>> >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, avx)
>> >+ DEFINE_INTERP_FUNC(tetrahedral, p16, avx)
>> >+#endif
>> >+ DEFINE_INTERP_FUNC(tetrahedral, pf32, sse2)
>> >+ DEFINE_INTERP_FUNC(tetrahedral, p16, sse2)
>> >+#endif
>> >+
>> >+
>> >+av_cold void ff_lut3d_init_x86(LUT3DContext *s, const
>> AVPixFmtDescriptor *desc)
>> >+{
>> >+ int cpu_flags = av_get_cpu_flags();
>> >+ int planar = desc->flags & AV_PIX_FMT_FLAG_PLANAR;
>> >+ int isfloat = desc->flags & AV_PIX_FMT_FLAG_FLOAT;
>> >+ int depth = desc->comp[0].depth;
>> >+
>> >+#if ARCH_X86_64
>> >+ if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interpolation ==
>> INTERPOLATE_TETRAHEDRAL && planar) {
>> >+#if HAVE_AVX2_EXTERNAL
>> >+ if (isfloat && planar) {
>> >+ s->interp = interp_tetrahedral_pf32_avx2;
>> >+ } else if (depth == 16) {
>> >+ s->interp = interp_tetrahedral_p16_avx2;
>> >+ }
>> >+#endif
>> >+ } else if (EXTERNAL_AVX_FAST(cpu_flags) && s->interpolation ==
>> INTERPOLATE_TETRAHEDRAL && planar) {
>> >+#if HAVE_AVX_EXTERNAL
>> >+ if (isfloat) {
>> >+ s->interp = interp_tetrahedral_pf32_avx;
>> >+ } else if (depth == 16) {
>> >+ s->interp = interp_tetrahedral_p16_avx;
>> >+ }
>> >+#endif
>> >+ } else if (EXTERNAL_SSE2(cpu_flags) && s->interpolation ==
>> INTERPOLATE_TETRAHEDRAL && planar) {
>> >+ if (isfloat) {
>> >+ s->interp = interp_tetrahedral_pf32_sse2;
>> >+ } else if (depth == 16) {
>> >+ s->interp = interp_tetrahedral_p16_sse2;
>> >+ }
>> >+ }
>> >+#endif
>> >+}
>> >--
>> >2.31.1.windows.1
>> >
>> >_______________________________________________
>> >ffmpeg-devel mailing list
>> >ffmpeg-devel at ffmpeg.org
>> >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>> >
>> >To unsubscribe, visit link above, or email
>> >ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel at ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
>>
>
More information about the ffmpeg-devel
mailing list