[FFmpeg-devel] [PATCH 4/5] avfilter/vf_v360: x86 SIMD for interpolations
Paul B Mahol
onemda at gmail.com
Thu Sep 5 11:52:54 EEST 2019
Signed-off-by: Paul B Mahol <onemda at gmail.com>
---
libavfilter/v360.h | 113 ++++++++++++++++
libavfilter/vf_v360.c | 236 ++++++++++++---------------------
libavfilter/x86/Makefile | 2 +
libavfilter/x86/vf_v360.asm | 98 ++++++++++++++
libavfilter/x86/vf_v360_init.c | 43 ++++++
5 files changed, 343 insertions(+), 149 deletions(-)
create mode 100644 libavfilter/v360.h
create mode 100644 libavfilter/x86/vf_v360.asm
create mode 100644 libavfilter/x86/vf_v360_init.c
diff --git a/libavfilter/v360.h b/libavfilter/v360.h
new file mode 100644
index 0000000000..a0eefdec16
--- /dev/null
+++ b/libavfilter/v360.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2019 Eugene Lyapustin
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_V360_H
+#define AVFILTER_V360_H
+#include "avfilter.h"
+
+enum Projections {
+ EQUIRECTANGULAR,
+ CUBEMAP_3_2,
+ CUBEMAP_6_1,
+ EQUIANGULAR,
+ FLAT,
+ DUAL_FISHEYE,
+ BARREL,
+ CUBEMAP_1_6,
+ NB_PROJECTIONS,
+};
+
+enum InterpMethod {
+ NEAREST,
+ BILINEAR,
+ BICUBIC,
+ LANCZOS,
+ NB_INTERP_METHODS,
+};
+
+enum Faces {
+ TOP_LEFT,
+ TOP_MIDDLE,
+ TOP_RIGHT,
+ BOTTOM_LEFT,
+ BOTTOM_MIDDLE,
+ BOTTOM_RIGHT,
+ NB_FACES,
+};
+
+enum Direction {
+ RIGHT, ///< Axis +X
+ LEFT, ///< Axis -X
+ UP, ///< Axis +Y
+ DOWN, ///< Axis -Y
+ FRONT, ///< Axis -Z
+ BACK, ///< Axis +Z
+ NB_DIRECTIONS,
+};
+
+enum Rotation {
+ ROT_0,
+ ROT_90,
+ ROT_180,
+ ROT_270,
+ NB_ROTATIONS,
+};
+
+typedef struct V360Context {
+ const AVClass *class;
+ int in, out; ///< s->in is switched over enum Projections values in config_output; out is presumably the output counterpart (TODO confirm)
+ int interp; ///< interpolation method, compared against enum InterpMethod values in ff_v360_init()
+ int width, height;
+ char* in_forder;
+ char* out_forder;
+ char* in_frot;
+ char* out_frot;
+
+ int in_cubemap_face_order[6];
+ int out_cubemap_direction_order[6];
+ int in_cubemap_face_rotation[6];
+ int out_cubemap_face_rotation[6];
+
+ float in_pad, out_pad;
+
+ float yaw, pitch, roll;
+
+ int h_flip, v_flip, d_flip;
+
+ float h_fov, v_fov;
+ float flat_range[3];
+
+ int planewidth[4], planeheight[4]; ///< per-plane row length / row count iterated by the remap slice functions
+ int inplanewidth[4], inplaneheight[4];
+ int nb_planes; ///< number of planes the remap slice functions loop over
+
+ uint16_t *u[4], *v[4]; ///< per-plane source column (u) and source row (v) indices read by the line remappers
+ int16_t *ker[4]; ///< per-plane interpolation weights read by the line remappers
+
+ int (*remap_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs); ///< same signature as the remapN_Mbit_slice() functions
+
+ void (*remap_line)(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
+ const uint16_t *u, const uint16_t *v, const int16_t *ker); ///< remaps one output row; set by ff_v360_init() and possibly replaced by ff_v360_init_x86()
+} V360Context;
+
+void ff_v360_init(V360Context *s, int depth);
+void ff_v360_init_x86(V360Context *s, int depth);
+
+#endif /* AVFILTER_V360_H */
diff --git a/libavfilter/vf_v360.c b/libavfilter/vf_v360.c
index fc120097d9..e69aa7e8c5 100644
--- a/libavfilter/vf_v360.c
+++ b/libavfilter/vf_v360.c
@@ -41,88 +41,7 @@
#include "formats.h"
#include "internal.h"
#include "video.h"
-
-enum Projections {
- EQUIRECTANGULAR,
- CUBEMAP_3_2,
- CUBEMAP_6_1,
- EQUIANGULAR,
- FLAT,
- DUAL_FISHEYE,
- BARREL,
- CUBEMAP_1_6,
- NB_PROJECTIONS,
-};
-
-enum InterpMethod {
- NEAREST,
- BILINEAR,
- BICUBIC,
- LANCZOS,
- NB_INTERP_METHODS,
-};
-
-enum Faces {
- TOP_LEFT,
- TOP_MIDDLE,
- TOP_RIGHT,
- BOTTOM_LEFT,
- BOTTOM_MIDDLE,
- BOTTOM_RIGHT,
- NB_FACES,
-};
-
-enum Direction {
- RIGHT, ///< Axis +X
- LEFT, ///< Axis -X
- UP, ///< Axis +Y
- DOWN, ///< Axis -Y
- FRONT, ///< Axis -Z
- BACK, ///< Axis +Z
- NB_DIRECTIONS,
-};
-
-enum Rotation {
- ROT_0,
- ROT_90,
- ROT_180,
- ROT_270,
- NB_ROTATIONS,
-};
-
-typedef struct V360Context {
- const AVClass *class;
- int in, out;
- int interp;
- int width, height;
- char* in_forder;
- char* out_forder;
- char* in_frot;
- char* out_frot;
-
- int in_cubemap_face_order[6];
- int out_cubemap_direction_order[6];
- int in_cubemap_face_rotation[6];
- int out_cubemap_face_rotation[6];
-
- float in_pad, out_pad;
-
- float yaw, pitch, roll;
-
- int h_flip, v_flip, d_flip;
-
- float h_fov, v_fov;
- float flat_range[3];
-
- int planewidth[4], planeheight[4];
- int inplanewidth[4], inplaneheight[4];
- int nb_planes;
-
- uint16_t *u[4], *v[4];
- int16_t *ker[4];
-
- int (*remap_slice)(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs);
-} V360Context;
+#include "v360.h"
typedef struct ThreadData {
AVFrame *in;
@@ -251,47 +170,26 @@ static int query_formats(AVFilterContext *ctx)
return ff_set_common_formats(ctx, fmts_list);
}
-/**
- * Generate no-interpolation remapping function with a given pixel depth.
- *
- * @param bits number of bits per pixel
- * @param div number of bytes per pixel
- */
-#define DEFINE_REMAP1(bits, div) \
-static int remap1_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) \
-{ \
- ThreadData *td = (ThreadData*)arg; \
- const V360Context *s = ctx->priv; \
- const AVFrame *in = td->in; \
- AVFrame *out = td->out; \
- \
- int plane, x, y; \
- \
- for (plane = 0; plane < s->nb_planes; plane++) { \
- const int in_linesize = in->linesize[plane] / div; \
- const int out_linesize = out->linesize[plane] / div; \
- const uint##bits##_t *src = (const uint##bits##_t *)in->data[plane]; \
- uint##bits##_t *dst = (uint##bits##_t *)out->data[plane]; \
- const int width = s->planewidth[plane]; \
- const int height = s->planeheight[plane]; \
- \
- const int slice_start = (height * jobnr ) / nb_jobs; \
- const int slice_end = (height * (jobnr + 1)) / nb_jobs; \
- \
- for (y = slice_start; y < slice_end; y++) { \
- const uint16_t *u = s->u[plane] + y * width; \
- const uint16_t *v = s->v[plane] + y * width; \
- uint##bits##_t *d = dst + y * out_linesize; \
- for (x = 0; x < width; x++) \
- *d++ = src[v[x] * in_linesize + u[x]]; \
- } \
- } \
- \
- return 0; \
+#define DEFINE_REMAP1_LINE(bits, div) /* generates remap1_<bits>bit_line_c(): one source sample per output sample, no weighting */ \
+static void remap1_##bits##bit_line_c(uint8_t *dst, int width, const uint8_t *src, \
+ ptrdiff_t in_linesize, \
+ const uint16_t *u, const uint16_t *v, const int16_t *ker) /* ker is not read by this variant */ \
+{ \
+ const uint##bits##_t *s = (const uint##bits##_t *)src; \
+ uint##bits##_t *d = (uint##bits##_t *)dst; \
+ \
+ in_linesize /= div; /* byte stride -> stride in samples (div is 1 for 8 bit, 2 for 16 bit) */ \
+ \
+ for (int x = 0; x < width; x++) { \
+ const uint16_t *uu = u + x; \
+ const uint16_t *vv = v + x; \
+ \
+ d[x] = s[vv[0] * in_linesize + uu[0]]; /* v selects the source row, u the source column */ \
+ } \
}
-DEFINE_REMAP1( 8, 1)
-DEFINE_REMAP1(16, 2)
+DEFINE_REMAP1_LINE( 8, 1)
+DEFINE_REMAP1_LINE(16, 2)
typedef struct XYRemap {
uint16_t u[4][4];
@@ -304,9 +202,8 @@ typedef struct XYRemap {
*
* @param ws size of interpolation window
* @param bits number of bits per pixel
- * @param div number of bytes per pixel
*/
-#define DEFINE_REMAP(ws, bits, div) \
+#define DEFINE_REMAP(ws, bits) \
static int remap##ws##_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs) \
{ \
ThreadData *td = (ThreadData*)arg; \
@@ -314,48 +211,87 @@ static int remap##ws##_##bits##bit_slice(AVFilterContext *ctx, void *arg, int jo
const AVFrame *in = td->in; \
AVFrame *out = td->out; \
\
- int plane, x, y, i, j; \
- \
- for (plane = 0; plane < s->nb_planes; plane++) { \
- const int in_linesize = in->linesize[plane] / div; \
- const int out_linesize = out->linesize[plane] / div; \
- const uint##bits##_t *src = (const uint##bits##_t *)in->data[plane]; \
- uint##bits##_t *dst = (uint##bits##_t *)out->data[plane]; \
+ for (int plane = 0; plane < s->nb_planes; plane++) { \
+ const int in_linesize = in->linesize[plane]; \
+ const int out_linesize = out->linesize[plane]; \
+ const uint8_t *src = in->data[plane]; \
+ uint8_t *dst = out->data[plane]; \
const int width = s->planewidth[plane]; \
const int height = s->planeheight[plane]; \
\
const int slice_start = (height * jobnr ) / nb_jobs; \
const int slice_end = (height * (jobnr + 1)) / nb_jobs; \
\
- for (y = slice_start; y < slice_end; y++) { \
- uint##bits##_t *d = dst + y * out_linesize; \
+ for (int y = slice_start; y < slice_end; y++) { \
const uint16_t *u = s->u[plane] + y * width * ws * ws; \
const uint16_t *v = s->v[plane] + y * width * ws * ws; \
const int16_t *ker = s->ker[plane] + y * width * ws * ws; \
- for (x = 0; x < width; x++) { \
- const uint16_t *uu = u + x * ws * ws; \
- const uint16_t *vv = v + x * ws * ws; \
- const int16_t *kker = ker + x * ws * ws; \
- int tmp = 0; \
- \
- for (i = 0; i < ws; i++) { \
- for (j = 0; j < ws; j++) { \
- tmp += kker[i * ws + j] * src[vv[i * ws + j] * in_linesize + uu[i * ws + j]]; \
- } \
- } \
\
- *d++ = av_clip_uint##bits(tmp >> (15 - ws)); \
- } \
+ s->remap_line(dst + y * out_linesize, width, src, in_linesize, u, v, ker); \
} \
} \
\
return 0; \
}
-DEFINE_REMAP(2, 8, 1)
-DEFINE_REMAP(4, 8, 1)
-DEFINE_REMAP(2, 16, 2)
-DEFINE_REMAP(4, 16, 2)
+DEFINE_REMAP(1, 8)
+DEFINE_REMAP(2, 8)
+DEFINE_REMAP(4, 8)
+DEFINE_REMAP(1, 16)
+DEFINE_REMAP(2, 16)
+DEFINE_REMAP(4, 16)
+
+#define DEFINE_REMAP_LINE(ws, bits, div) /* generates remap<ws>_<bits>bit_line_c(): weighted sum over a ws x ws window */ \
+static void remap##ws##_##bits##bit_line_c(uint8_t *dst, int width, const uint8_t *src, \
+ ptrdiff_t in_linesize, \
+ const uint16_t *u, const uint16_t *v, const int16_t *ker) \
+{ \
+ const uint##bits##_t *s = (const uint##bits##_t *)src; \
+ uint##bits##_t *d = (uint##bits##_t *)dst; \
+ \
+ in_linesize /= div; /* byte stride -> stride in samples (div is 1 for 8 bit, 2 for 16 bit) */ \
+ \
+ for (int x = 0; x < width; x++) { \
+ const uint16_t *uu = u + x * ws * ws; /* each output sample owns ws*ws consecutive map entries */ \
+ const uint16_t *vv = v + x * ws * ws; \
+ const int16_t *kker = ker + x * ws * ws; \
+ int tmp = 0; \
+ \
+ for (int i = 0; i < ws; i++) { \
+ for (int j = 0; j < ws; j++) { \
+ tmp += kker[i * ws + j] * s[vv[i * ws + j] * in_linesize + uu[i * ws + j]]; /* weight * source sample at (row v, column u) */ \
+ } \
+ } \
+ \
+ d[x] = av_clip_uint##bits(tmp >> (15 - ws)); /* presumably the weights carry 15 - ws fractional bits (TODO confirm); result clipped to the uint<bits>_t range */ \
+ } \
+}
+
+DEFINE_REMAP_LINE(2, 8, 1)
+DEFINE_REMAP_LINE(4, 8, 1)
+DEFINE_REMAP_LINE(2, 16, 2)
+DEFINE_REMAP_LINE(4, 16, 2)
+
+void ff_v360_init(V360Context *s, int depth) /* selects s->remap_line from s->interp and the sample depth */
+{
+ switch (s->interp) {
+ case NEAREST:
+ s->remap_line = depth <= 8 ? remap1_8bit_line_c : remap1_16bit_line_c;
+ break;
+ case BILINEAR:
+ s->remap_line = depth <= 8 ? remap2_8bit_line_c : remap2_16bit_line_c;
+ break;
+ case BICUBIC:
+ s->remap_line = depth <= 8 ? remap4_8bit_line_c : remap4_16bit_line_c;
+ break;
+ case LANCZOS: /* uses the same 4x4-window routines as BICUBIC */
+ s->remap_line = depth <= 8 ? remap4_8bit_line_c : remap4_16bit_line_c;
+ break;
+ } /* no default: any other s->interp value leaves s->remap_line untouched */
+
+ if (ARCH_X86_64) /* the x86 init may replace the C routine chosen above */
+ ff_v360_init_x86(s, depth);
+}
/**
* Save nearest pixel coordinates for remapping.
@@ -2038,6 +1974,8 @@ static int config_output(AVFilterLink *outlink)
av_assert0(0);
}
+ ff_v360_init(s, depth);
+
switch (s->in) {
case EQUIRECTANGULAR:
in_transform = xyz_to_equirect;
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 8dc0b0e6d4..f12993e606 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -31,6 +31,7 @@ OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend_init.o
OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold_init.o
OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o
OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o
+OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360_init.o
OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o
OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o
@@ -66,5 +67,6 @@ X86ASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o
X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold.o
X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o
X86ASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o
+X86ASM-OBJS-$(CONFIG_V360_FILTER) += x86/vf_v360.o
X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif.o
X86ASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
diff --git a/libavfilter/x86/vf_v360.asm b/libavfilter/x86/vf_v360.asm
new file mode 100644
index 0000000000..e1efe2e3a3
--- /dev/null
+++ b/libavfilter/x86/vf_v360.asm
@@ -0,0 +1,98 @@
+;*****************************************************************************
+;* x86-optimized functions for v360 filter
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+ALIGN 32
+
+pb_mask: db 0,4,8,12,5,5,5,5,5,5,5,5,5,5,5,5
+pd_255: times 4 dd 255
+
+SECTION .text
+
+; void ff_remapN_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
+; const uint16_t *u, const uint16_t *v, const int16_t *ker); N = 1 (nearest) or 2 (bilinear)
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal remap1_8bit_line, 6, 7, 7, dst, width, src, in_linesize, u, v, x ; nearest remap of one 8-bit row; the 7th C argument (ker) is never loaded
+    movsxdifnidn widthq, widthd            ; widen the int width so it can be compared with xq
+    xor xq, xq                             ; x = 0, index of the next output pixel
+    movd xm0, in_linesized
+    pcmpeqw m4, m4                         ; m4 = all ones, template for the gather mask
+    VBROADCASTI128 m6, [pb_mask]           ; shuffle that packs bytes 0,4,8,12 of each lane
+    vpbroadcastd m0, xm0                   ; m0 = in_linesize in every dword
+
+    .loop:
+        pmovzxwd m1, [vq + xq * 2]         ; 8 source rows; zero-extended because v is uint16_t
+        pmovzxwd m2, [uq + xq * 2]         ; 8 source columns; zero-extended because u is uint16_t
+
+        pmulld m1, m0
+        paddd m1, m2                       ; byte offset = v * in_linesize + u, as in the C version
+        mova m2, m4                        ; the gather clears its mask register, so refill it every iteration
+        vpgatherdd m5, [srcq + m1], m2     ; fetches a whole dword per pixel; only its low byte is used
+        pshufb m1, m5, m6                  ; low byte of each dword -> low 4 bytes of each 128-bit lane
+        vextracti128 xm2, m1, 1
+        movd [dstq+xq], xm1                ; pixels x .. x+3
+        movd [dstq+xq+4], xm2              ; pixels x+4 .. x+7
+
+        add xq, mmsize / 4                 ; 8 pixels per iteration
+        cmp xq, widthq
+        jl .loop                           ; NOTE(review): no tail handling, so a width that is not a multiple of 8 reads u/v and writes dst past width; confirm the buffers are padded
+    RET
+
+INIT_YMM avx2
+cglobal remap2_8bit_line, 7, 8, 8, dst, width, src, in_linesize, u, v, ker, x ; bilinear remap of one 8-bit row; 8 GPRs requested, NOTE(review): presumably x86-64 only, confirm 32-bit builds are guarded
+    movsxdifnidn widthq, widthd            ; widen the int width so it can be compared with xq
+    xor xq, xq                             ; x = 0, index of the next output pixel
+    movd xm0, in_linesized
+    pcmpeqw m7, m7                         ; m7 = all ones, template for the gather mask
+    vpbroadcastd m0, xm0                   ; m0 = in_linesize in every dword
+    movd xm6, [pd_255]
+    vpbroadcastd m6, xm6                   ; m6 = 255 in every dword, isolates the gathered byte
+    VBROADCASTI128 m5, [pb_mask]
+
+    .loop:
+        pmovsxwd m1, [kerq + xq * 8]       ; 2 pixels x 4 weights; sign-extended because ker is int16_t
+        pmovzxwd m2, [vq + xq * 8]         ; 2 pixels x 4 source rows; zero-extended because v is uint16_t
+        pmovzxwd m3, [uq + xq * 8]         ; 2 pixels x 4 source columns; zero-extended because u is uint16_t
+
+        pmulld m4, m2, m0
+        paddd m4, m3                       ; byte offset = v * in_linesize + u for each tap
+        mova m3, m7                        ; the gather clears its mask register, so refill it every iteration
+        vpgatherdd m2, [srcq + m4], m3     ; fetches a whole dword per tap
+        pand m2, m6                        ; keep only the addressed byte
+        pmulld m2, m1                      ; weight * sample
+        phaddd m2, m2
+        phaddd m1, m2, m2                  ; each 128-bit lane now holds the sum of its 4 products
+        psrld m1, m1, 0xd                  ; >> 13, i.e. 15 - ws with ws = 2 as in the C version
+        pshufb m1, m1, m5
+        vextracti128 xm2, m1, 1
+
+        pextrb [dstq+xq], xm1, 0           ; NOTE(review): low byte stored without saturation, unlike av_clip_uint8() in the C version; confirm the weights keep the sum in 0..255
+        pextrb [dstq+xq+1], xm2, 0
+
+        add xq, mmsize / 16                ; 2 pixels per iteration
+        cmp xq, widthq
+        jl .loop                           ; NOTE(review): no tail handling, so an odd width processes one pixel past width; confirm the buffers are padded
+    RET
+%endif
diff --git a/libavfilter/x86/vf_v360_init.c b/libavfilter/x86/vf_v360_init.c
new file mode 100644
index 0000000000..b781fb13d1
--- /dev/null
+++ b/libavfilter/x86/vf_v360_init.c
@@ -0,0 +1,43 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/v360.h"
+
+void ff_remap1_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
+ const uint16_t *u, const uint16_t *v, const int16_t *ker);
+
+void ff_remap2_8bit_line_avx2(uint8_t *dst, int width, const uint8_t *src, ptrdiff_t in_linesize,
+ const uint16_t *u, const uint16_t *v, const int16_t *ker);
+
+av_cold void ff_v360_init_x86(V360Context *s, int depth) /* installs the AVX2 line remappers when the CPU flags allow it */
+{
+#if ARCH_X86_64
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interp == NEAREST && depth <= 8) /* nearest, depth <= 8 */
+ s->remap_line = ff_remap1_8bit_line_avx2;
+
+ if (EXTERNAL_AVX2_FAST(cpu_flags) && s->interp == BILINEAR && depth <= 8) /* bilinear, depth <= 8 */
+ s->remap_line = ff_remap2_8bit_line_avx2;
+#endif /* ARCH_X86_64; in every other case s->remap_line is left as the caller set it */
+}
--
2.17.1
More information about the ffmpeg-devel
mailing list