[FFmpeg-devel] [RFC] lavfi: add scale_opencl filter.
Ruiling Song
ruiling.song at intel.com
Wed Jan 30 10:13:02 EET 2019
Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
This patch depends on the colorspace patchset I sent earlier
(https://patchwork.ffmpeg.org/patch/11820/).
Although I am still working on some minor functionality,
I hope somebody can comment on the overall design.
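For reference, a typical invocation would look roughly like the following
(only a sketch; device selection and the surrounding format/hwupload steps
depend on the local setup):

  ffmpeg -init_hw_device opencl=ocl -filter_hw_device ocl -i input.mp4 \
         -vf 'format=yuv420p,hwupload,scale_opencl=w=1280:h=720:format=nv12:algorithm=bicubic,hwdownload,format=nv12' \
         -c:v libx264 output.mp4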
Ruiling
configure | 1 +
libavfilter/Makefile | 2 +
libavfilter/allfilters.c | 1 +
libavfilter/opencl/scale.cl | 252 ++++++++++++++++
libavfilter/opencl_source.h | 1 +
libavfilter/vf_scale_opencl.c | 682 ++++++++++++++++++++++++++++++++++++++++++
6 files changed, 939 insertions(+)
create mode 100644 libavfilter/opencl/scale.cl
create mode 100644 libavfilter/vf_scale_opencl.c
diff --git a/configure b/configure
index ec8f70d..5640137 100755
--- a/configure
+++ b/configure
@@ -3450,6 +3450,7 @@ rubberband_filter_deps="librubberband"
sab_filter_deps="gpl swscale"
scale2ref_filter_deps="swscale"
scale_filter_deps="swscale"
+scale_opencl_filter_deps="opencl"
scale_qsv_filter_deps="libmfx"
select_filter_select="scene_sad"
sharpness_vaapi_filter_deps="vaapi"
diff --git a/libavfilter/Makefile b/libavfilter/Makefile
index bc642ac..9de7d44 100644
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -343,6 +343,8 @@ OBJS-$(CONFIG_SCALE_FILTER) += vf_scale.o scale.o
OBJS-$(CONFIG_SCALE_CUDA_FILTER) += vf_scale_cuda.o vf_scale_cuda.ptx.o \
cuda_check.o
OBJS-$(CONFIG_SCALE_NPP_FILTER) += vf_scale_npp.o scale.o cuda_check.o
+OBJS-$(CONFIG_SCALE_OPENCL_FILTER) += vf_scale_opencl.o opencl.o \
+ opencl/scale.o
OBJS-$(CONFIG_SCALE_QSV_FILTER) += vf_scale_qsv.o
OBJS-$(CONFIG_SCALE_VAAPI_FILTER) += vf_scale_vaapi.o scale.o vaapi_vpp.o
OBJS-$(CONFIG_SCALE2REF_FILTER) += vf_scale.o scale.o
diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
index c51ae0f..5708d16 100644
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -325,6 +325,7 @@ extern AVFilter ff_vf_sab;
extern AVFilter ff_vf_scale;
extern AVFilter ff_vf_scale_cuda;
extern AVFilter ff_vf_scale_npp;
+extern AVFilter ff_vf_scale_opencl;
extern AVFilter ff_vf_scale_qsv;
extern AVFilter ff_vf_scale_vaapi;
extern AVFilter ff_vf_scale2ref;
diff --git a/libavfilter/opencl/scale.cl b/libavfilter/opencl/scale.cl
new file mode 100644
index 0000000..5d3deda
--- /dev/null
+++ b/libavfilter/opencl/scale.cl
@@ -0,0 +1,252 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+extern float3 yuv2rgb(float, float, float);
+extern float3 rgb2yuv(float, float, float);
+
+const sampler_t sampler_nearest = (CLK_NORMALIZED_COORDS_FALSE |
+ CLK_ADDRESS_CLAMP |
+ CLK_FILTER_NEAREST);
+
+const sampler_t sampler_linear = (CLK_NORMALIZED_COORDS_FALSE |
+ CLK_ADDRESS_CLAMP |
+ CLK_FILTER_LINEAR);
+
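+// Each sampling function maps the destination pixel centre (x + 0.5, y + 0.5)
+// into the source image using the horizontal/vertical scale factors.
+// neighbor() and bilinear() rely on the hardware sampler; generic_filter()
+// applies the precomputed separable coefficient tables instead.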
+float4 neighbor(image2d_t img, float vscale,
+ float hscale, int x, int y,
+ __constant float *coff_x,
+ __constant float *coff_y,
+ int2 filter_size)
+{
+ float xi = ((float)x + 0.5f) * hscale;
+ float yi = ((float)y + 0.5f) * vscale;
+
+ return read_imagef(img, sampler_nearest, (float2)(xi, yi));
+}
+
+float4 bilinear(image2d_t img, float vscale,
+ float hscale, int x, int y,
+ __constant float *coff_x,
+ __constant float *coff_y,
+ int2 filter_size)
+{
+ float xi = ((float)x + 0.5f) * hscale;
+ float yi = ((float)y + 0.5f) * vscale;
+
+ return read_imagef(img, sampler_linear, (float2)(xi, yi));
+}
+
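+// Separable filtering with per-destination-sample coefficient tables:
+//   dst(x, y) = sum_i coff_y[y * filter_size.y + i] *
+//               sum_j coff_x[x * filter_size.x + j] * src(src_pos + (dx, dy))
+// with src_pos = floor(src_coord - 0.5f) and
+// (dx, dy) = (filter_size.x / 2 - j, filter_size.y / 2 - i).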
+float4 generic_filter(image2d_t img, float vscale, float hscale, int x, int y,
+ __constant float *coff_x, __constant float *coff_y,
+ int2 filter_size)
+{
+ int2 dst_pos = (int2)(x, y);
+ float2 src_coord = (convert_float2(dst_pos) + 0.5f) *
+ (float2)(hscale, vscale);
+ int2 src_pos = convert_int2(floor(src_coord - 0.5f));
+
+ float4 color = 0.0f;
+ for (int i = 0; i < filter_size.y; ++i) {
+ float4 sum = 0.0f;
+ for (int j = 0; j < filter_size.x; ++j) {
+ int x_offset = filter_size.x / 2 - j;
+ int y_offset = filter_size.y / 2 - i;
+ float4 c = read_imagef(img, sampler_nearest,
+ src_pos + (int2)(x_offset, y_offset));
+ sum += c * coff_x[dst_pos.x * filter_size.x + j];
+ }
+ color += sum * coff_y[dst_pos.y * filter_size.y + i];
+ }
+ return color;
+}
+
+__kernel void scale(__write_only image2d_t dst,
+ __read_only image2d_t src,
+ int dst_width, int dst_height,
+ int src_width, int src_height,
+ __constant float *coff_x,
+ __constant float *coff_y,
+ int2 filter_size)
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ float vscale = (float)src_height / (float)dst_height;
+ float hscale = (float)src_width / (float)dst_width;
+
+ float4 sum = algorithm(src, vscale, hscale, x, y,
+ coff_x, coff_y, filter_size);
+
+ if (x < dst_width && y < dst_height)
+ write_imagef(dst, (int2)(x, y), sum);
+}
+
+// Read the chroma values from 'img'; the results are returned in 'color[4]'.
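+// Each call covers the chroma samples of one 2x2 block of output samples:
+// chroma_h/chroma_v == 2 means full chroma resolution in that direction, so
+// two distinct values are computed; == 1 means the single subsampled value
+// is replicated.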
+inline void read_chroma(float4 color[4], image2d_t img,
+ float scalev, float scaleh, int xchr, int ychr,
+ __constant float *coff_x, __constant float *coff_y,
+ int2 filter_size, int chroma_h, int chroma_v)
+{
+ color[0] = algorithm(img, scalev, scaleh, xchr, ychr, coff_x, coff_y,
+ filter_size);
+
+ if (chroma_v == 2) {
+ color[2] = algorithm(img, scalev, scaleh, xchr, ychr + 1, coff_x, coff_y,
+ filter_size);
+ if (chroma_h == 2) {
+ color[1] = algorithm(img, scalev, scaleh, xchr + 1, ychr, coff_x,
+ coff_y, filter_size);
+ color[3] = algorithm(img, scalev, scaleh, xchr + 1, ychr + 1, coff_x,
+ coff_y, filter_size);
+ } else {
+ color[1] = color[0];
+ color[3] = color[2];
+ }
+ } else {
+ color[1] = color[2] = color[3] = color[0];
+ }
+}
+
+inline void write_chroma(write_only image2d_t img, float4 c[4],
+ int xchr, int ychr, int chroma_h, int chroma_v)
+{
+ write_imagef(img, (int2)(xchr, ychr), c[0]);
+
+ if (chroma_v == 2) {
+ write_imagef(img, (int2)(xchr, ychr + 1), c[2]);
+ if (chroma_h == 2) {
+ write_imagef(img, (int2)(xchr + 1, ychr), c[1]);
+ write_imagef(img, (int2)(xchr + 1, ychr + 1), c[3]);
+ }
+ }
+}
+
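+// Combined scale and format conversion. Each work-item produces a 2x2 block
+// of destination samples in plane 0 plus the matching chroma samples; the
+// SRC_*/DST_* macros and the optional YUV2RGB/RGB2YUV defines are generated
+// on the host side (see prepare_opencl_header in vf_scale_opencl.c).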
+__kernel void convert(__write_only image2d_t dst0,
+ __write_only image2d_t dst1,
+ __write_only image2d_t dst2,
+ __write_only image2d_t dst3,
+ __read_only image2d_t src0,
+ __read_only image2d_t src1,
+ __read_only image2d_t src2,
+ __read_only image2d_t src3,
+ int dst_width, int dst_height,
+ int src_width, int src_height,
+ __constant float *coff_x,
+ __constant float *coff_y,
+ int2 filter_size)
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ int x2 = x * 2;
+ int y2 = y * 2;
+ int xchr = SRC_CHROMA_H * x;
+ int ychr = SRC_CHROMA_V * y;
+ int i;
+ float4 color[4], color0[4], color1[4], color2[4], color3[4];
+
+ float scalev = (float)src_height / (float)dst_height;
+ float scaleh = (float)src_width / (float)dst_width;
+ color0[0] = algorithm(src0, scalev, scaleh, x2, y2,
+ coff_x, coff_y, filter_size);
+ color0[1] = algorithm(src0, scalev, scaleh, x2 + 1, y2,
+ coff_x, coff_y, filter_size);
+ color0[2] = algorithm(src0, scalev, scaleh, x2, y2 + 1,
+ coff_x, coff_y, filter_size);
+ color0[3] = algorithm(src0, scalev, scaleh, x2 + 1, y2 + 1,
+ coff_x, coff_y, filter_size);
+ #pragma unroll
+ for (i = 0; i < 4; i++)
+ color[i] = color0[i];
+
+ if (SRC_IMGS > 1) {
+ read_chroma(color1, src1, scalev, scaleh, xchr, ychr,
+ coff_x, coff_y, filter_size,
+ SRC_CHROMA_H, SRC_CHROMA_V);
+ #pragma unroll
+ for (i = 0; i < 4; i++)
+ color[i].yz = color1[i].xy;
+ }
+
+ if (SRC_IMGS > 2) {
+ read_chroma(color2, src2, scalev, scaleh, xchr, ychr,
+ coff_x, coff_y, filter_size,
+ SRC_CHROMA_H, SRC_CHROMA_V);
+ #pragma unroll
+ for (i = 0; i < 4; i++)
+ color[i].z = color2[i].x;
+ }
+
+ if (SRC_IMGS > 3) {
+ color3[0] = algorithm(src3, scalev, scaleh, x2, y2,
+ coff_x, coff_y, filter_size);
+ color3[1] = algorithm(src3, scalev, scaleh, x2 + 1, y2,
+ coff_x, coff_y, filter_size);
+ color3[2] = algorithm(src3, scalev, scaleh, x2, y2 + 1,
+ coff_x, coff_y, filter_size);
+ color3[3] = algorithm(src3, scalev, scaleh, x2 + 1, y2 + 1,
+ coff_x, coff_y, filter_size);
+ #pragma unroll
+ for (i = 0; i < 4; i++)
+ color[i].w = color3[i].x;
+ }
+
+    // Optional YUV/RGB conversion, enabled on the host via YUV2RGB/RGB2YUV.
+ #ifdef YUV2RGB
+ #pragma unroll
+ for (i = 0; i < 4; i++)
+ color[i].xyz = yuv2rgb(color[i].x, color[i].y, color[i].z);
+ #endif
+
+ #ifdef RGB2YUV
+ #pragma unroll
+ for (i = 0; i < 4; i++)
+ color[i].xyz = rgb2yuv(color[i].x, color[i].y, color[i].z);
+
+ #endif
+
+ xchr = DST_CHROMA_H * x;
+ ychr = DST_CHROMA_V * y;
+
+ if (x2 < dst_width && y2 < dst_height) {
+ write_imagef(dst0, (int2)(x2, y2 ), color[0]);
+ write_imagef(dst0, (int2)(x2 + 1, y2 ), color[1]);
+ write_imagef(dst0, (int2)(x2, y2 + 1), color[2]);
+ write_imagef(dst0, (int2)(x2 + 1, y2 + 1), color[3]);
+ }
+
+ if (DST_IMGS > 1 && x2 < dst_width && y2 < dst_height) {
+ float4 c2[4];
+ #pragma unroll
+ for (i = 0; i < 4; i++)
+ c2[i] = color[i].yzyz;
+ write_chroma(dst1, c2, xchr, ychr, DST_CHROMA_H, DST_CHROMA_V);
+
+ if (DST_IMGS > 2) {
+ #pragma unroll
+ for (i = 0; i < 4; i++)
+ c2[i] = color[i].zzzz;
+ write_chroma(dst2, c2, xchr, ychr, DST_CHROMA_H, DST_CHROMA_V);
+ }
+
+ if (DST_IMGS > 3) {
+ write_imagef(dst3, (int2)(x2, y2), color[0].wwww);
+ write_imagef(dst3, (int2)(x2 + 1, y2), color[1].wwww);
+ write_imagef(dst3, (int2)(x2, y2 + 1), color[2].wwww);
+ write_imagef(dst3, (int2)(x2 + 1, y2 + 1), color[3].wwww);
+ }
+ }
+}
diff --git a/libavfilter/opencl_source.h b/libavfilter/opencl_source.h
index 4118138..3dd7634 100644
--- a/libavfilter/opencl_source.h
+++ b/libavfilter/opencl_source.h
@@ -24,6 +24,7 @@ extern const char *ff_opencl_source_colorspace_common;
extern const char *ff_opencl_source_convolution;
extern const char *ff_opencl_source_neighbor;
extern const char *ff_opencl_source_overlay;
+extern const char *ff_opencl_source_scale;
extern const char *ff_opencl_source_tonemap;
extern const char *ff_opencl_source_transpose;
extern const char *ff_opencl_source_unsharp;
diff --git a/libavfilter/vf_scale_opencl.c b/libavfilter/vf_scale_opencl.c
new file mode 100644
index 0000000..929a5dd
--- /dev/null
+++ b/libavfilter/vf_scale_opencl.c
@@ -0,0 +1,682 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/bprint.h"
+#include "libavutil/common.h"
+#include "libavutil/imgutils.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/pixdesc.h"
+
+#include "avfilter.h"
+#include "colorspace.h"
+#include "internal.h"
+#include "opencl.h"
+#include "opencl_source.h"
+#include "scale.h"
+#include "video.h"
+
+enum ScaleAlgorithm {
+ SCALE_NEIGHBOR = 0,
+ SCALE_BILINEAR,
+ SCALE_BICUBIC,
+};
+
+typedef struct SurfaceInfo {
+ int plane_id[4];
+ int nb_planes;
+ int is_rgb;
+ int width;
+ int height;
+    int chroma_v; ///< chroma_height / half_luma_height
+    int chroma_h; ///< chroma_width / half_luma_width
+} SurfaceInfo;
+
+typedef struct ScaleInfo {
+ SurfaceInfo src;
+ SurfaceInfo dst;
+} ScaleInfo;
+
+typedef struct ScaleOpenCLContext {
+ OpenCLFilterContext ocf;
+ enum AVPixelFormat format;
+ enum AVColorSpace colorspace, colorspace_in, colorspace_out;
+ enum AVColorRange range, range_in, range_out;
+ enum AVChromaLocation chroma_loc;
+ enum ScaleAlgorithm algorithm;
+
+ char *w_expr;
+ char *h_expr;
+ char *format_str;
+ int initialised;
+ cl_kernel kernel;
+ cl_command_queue command_queue;
+ cl_mem coff_x;
+ cl_mem coff_y;
+ cl_int2 filter_size;
+ ScaleInfo job;
+} ScaleOpenCLContext;
+
+static const char *algo_str[3] =
+{
+ "neighbor",
+ "bilinear",
+ "generic_filter"
+};
+
+static const enum AVPixelFormat supported_formats[] = {
+ AV_PIX_FMT_YUV444P,
+ AV_PIX_FMT_YUV422P,
+ AV_PIX_FMT_YUV420P,
+ AV_PIX_FMT_NV12,
+ AV_PIX_FMT_GBRP,
+ AV_PIX_FMT_RGBA,
+ AV_PIX_FMT_BGRA,
+ AV_PIX_FMT_ARGB
+};
+
+static void gather_surface_info(AVFilterContext *avctx, SurfaceInfo *info,
+ AVFrame *frame)
+{
+ const AVPixFmtDescriptor *desc;
+ AVHWFramesContext *hwctx;
+ enum AVPixelFormat format;
+ int plane_id, plane_id_last;
+ int i, nb_comp;
+
+ hwctx = (AVHWFramesContext *)frame->hw_frames_ctx->data;
+ format = hwctx->sw_format;
+ desc = av_pix_fmt_desc_get(format);
+ info->is_rgb = desc->flags & AV_PIX_FMT_FLAG_RGB;
+
+ nb_comp = desc->nb_components;
+ plane_id_last = -1;
+
+ info->width = frame->width;
+ info->height = frame->height;
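+    // chroma_h/chroma_v are 2 when chroma has full resolution in that
+    // direction and 1 when it is subsampled by two; they become the
+    // SRC_/DST_CHROMA_* macros in scale.cl.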
+ info->chroma_h = 2 / (1 << desc->log2_chroma_w);
+ info->chroma_v = 2 / (1 << desc->log2_chroma_h);
+ for (i = 0; i < nb_comp; i++) {
+ plane_id = desc->comp[i].plane;
+
+ if (plane_id != plane_id_last) {
+ info->plane_id[info->nb_planes] = plane_id;
+ info->nb_planes++;
+ }
+ plane_id_last = plane_id;
+ }
+}
+
+static int init_jobs(AVFilterContext *avctx, AVFrame *in, AVFrame *out)
+{
+ ScaleOpenCLContext *ctx = avctx->priv;
+ ScaleInfo *info = &ctx->job;
+
+ gather_surface_info(avctx, &info->src, in);
+ gather_surface_info(avctx, &info->dst, out);
+
+ av_log(avctx, AV_LOG_DEBUG, "Scale Job Info:\n");
+ av_log(avctx, AV_LOG_DEBUG, "src: planes: %d width: %d height: %d, "
+ "plane-id (%d %d %d %d)\n", info->src.nb_planes,
+ info->src.width, info->src.height,
+ info->src.plane_id[0], info->src.plane_id[1],
+ info->src.plane_id[2], info->src.plane_id[3]);
+
+ av_log(avctx, AV_LOG_DEBUG, "dst: planes: %d width: %d height: %d, "
+ "plane-id (%d %d %d %d)\n", info->dst.nb_planes,
+ info->dst.width, info->dst.height,
+ info->dst.plane_id[0], info->dst.plane_id[1],
+ info->dst.plane_id[2], info->dst.plane_id[3]);
+
+ return 0;
+}
+
+static const int filter_size[] = {
+ [SCALE_NEIGHBOR] = 1,
+ [SCALE_BILINEAR] = 1,
+ [SCALE_BICUBIC] = 4,
+};
+
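+// Mitchell-Netravali piecewise cubic with B = 0 and C = 0.5, i.e. the
+// Catmull-Rom spline; its support is [-2, 2].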
+static float catmullrom(float x)
+{
+ float x2, x3;
+ const float B = 0.0f;
+ const float C = 0.5f;
+ x = x < 0.0f ? -x : x;
+ x2 = x * x;
+ x3 = x2 * x;
+
+    if (x < 1.0f) {
+ return ((12 - 9 * B - 6 * C) * x3 +
+ (-18 + 12 * B + 6 * C) * x2 +
+ (6 - 2 * B)) / 6.0f;
+    } else if (x < 2.0f) {
+ return (( -B - 6 * C) * x3 +
+ ( 6 * B + 30 * C ) * x2 +
+ (-( 12 * B ) - 48 * C) * x +
+ 8 * B + 24 * C) / 6.0f;
+ } else {
+ return 0.0f;
+ }
+}
+
+static float filter(enum ScaleAlgorithm algo, float f)
+{
+ switch (algo) {
+ case SCALE_BICUBIC:
+ return catmullrom(f);
+ default:
+ return f;
+ }
+}
+
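+// Fill 'buf' with 'length' rows of 'filter_size' tap weights, one row per
+// destination sample: buf[i * filter_size + k] is the normalized weight of
+// the k-th source tap for destination sample i.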
+static void fill_filter_table(AVFilterContext *avctx, float *buf, int length,
+ int filter_size, float scale)
+{
+ ScaleOpenCLContext *ctx = avctx->priv;
+ int i, k;
+ float sum;
+ for (i = 0; i < length; i++) {
+ float dst_in_src = (i + 0.5) * scale - 0.5;
+ float t = dst_in_src - floor(dst_in_src);
+ sum = 0.0;
+ for (k = 0; k < filter_size; k++) {
+ float fpos = ((float)(filter_size / 2 - k) - t);
+            float ff = filter(ctx->algorithm, fpos / FFMAX(scale, 1.0f));
+ buf[i * filter_size + k] = ff;
+ sum += ff;
+ }
+
+ for (k = 0; k < filter_size; k++) {
+ buf[i * filter_size + k] /= sum;
+ }
+ }
+}
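+
+// Build the per-instance header prepended to scale.cl: colorspace constants
+// and matrices plus the algorithm and plane-layout macros (SRC_*/DST_*).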
+static int prepare_opencl_header(AVFilterContext *avctx, AVBPrint *header)
+{
+ ScaleOpenCLContext *ctx = avctx->priv;
+ double rgb2yuv[3][3], yuv2rgb[3][3];
+ const struct LumaCoefficients *luma_src;
+
+ luma_src = ff_get_luma_coefficients(ctx->colorspace_in);
+ if (!luma_src) {
+ av_log(avctx, AV_LOG_ERROR, "unsupported input colorspace %d\n",
+ ctx->colorspace_in);
+ return AVERROR(EINVAL);
+ }
+    // luma_dst must also be defined for the shared OpenCL source to compile;
+    // currently only identical source and destination colorspaces are
+    // supported, so the source coefficients are reused.
+ av_bprintf(header, "constant float3 luma_src = {%.4ff, %.4ff, %.4ff};\n",
+ luma_src->cr, luma_src->cg, luma_src->cb);
+ av_bprintf(header, "constant float3 luma_dst = {%.4ff, %.4ff, %.4ff};\n",
+ luma_src->cr, luma_src->cg, luma_src->cb);
+ av_bprintf(header, "#define RGB2RGB_PASSTHROUGH\n");
+
+ ff_fill_rgb2yuv_table(luma_src, rgb2yuv);
+ ff_opencl_print_const_matrix_3x3(header, "yuv_matrix", rgb2yuv);
+ ff_matrix_invert_3x3(rgb2yuv, yuv2rgb);
+ ff_opencl_print_const_matrix_3x3(header, "rgb_matrix", yuv2rgb);
+
+ if (ctx->job.src.is_rgb && !ctx->job.dst.is_rgb) {
+ av_bprintf(header, "#define RGB2YUV\n");
+ } else if (!ctx->job.src.is_rgb && ctx->job.dst.is_rgb) {
+ av_bprintf(header, "#define YUV2RGB\n");
+ }
+
+ if (!ctx->job.src.is_rgb && ctx->range_in == AVCOL_RANGE_JPEG)
+ av_bprintf(header, "#define FULL_RANGE_IN\n");
+
+ if (!ctx->job.dst.is_rgb && ctx->range_out == AVCOL_RANGE_JPEG)
+ av_bprintf(header, "#define FULL_RANGE_OUT\n");
+
+ av_bprintf(header, "#define chroma_loc %d\n", (int)ctx->chroma_loc);
+ av_bprintf(header, "#define algorithm %s\n", algo_str[ctx->algorithm]);
+ av_bprintf(header, "#define DST_IMGS %d\n", ctx->job.dst.nb_planes);
+ av_bprintf(header, "#define SRC_IMGS %d\n", ctx->job.src.nb_planes);
+ av_bprintf(header, "#define SRC_CHROMA_V %d\n", ctx->job.src.chroma_v);
+ av_bprintf(header, "#define SRC_CHROMA_H %d\n", ctx->job.src.chroma_h);
+ av_bprintf(header, "#define DST_CHROMA_V %d\n", ctx->job.dst.chroma_v);
+ av_bprintf(header, "#define DST_CHROMA_H %d\n", ctx->job.dst.chroma_h);
+ return 0;
+}
+
+#define OPENCL_SOURCE_NB 3
+
+static int scale_opencl_init(AVFilterContext *avctx)
+{
+ ScaleOpenCLContext *ctx = avctx->priv;
+ const char *opencl_sources[OPENCL_SOURCE_NB];
+ float *coff_x = NULL, *coff_y = NULL;
+ cl_int cle;
+ int err;
+ int filter_size_x, filter_size_y, size_factor;
+ float vscale, hscale;
+ const char *k_name;
+ AVBPrint header;
+
+ size_factor = filter_size[ctx->algorithm];
+    vscale = (float)ctx->job.src.height / ctx->job.dst.height;
+    hscale = (float)ctx->job.src.width / ctx->job.dst.width;
+
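+    // Widen the filter support by the scale factor when downscaling so the
+    // resampler also acts as a low-pass (anti-aliasing) filter.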
+ filter_size_x = ceil(size_factor * FFMAX(1, hscale));
+ filter_size_y = ceil(size_factor * FFMAX(1, vscale));
+ ctx->filter_size.s[0] = filter_size_x;
+ ctx->filter_size.s[1] = filter_size_y;
+
+    if (ctx->algorithm != SCALE_BILINEAR &&
+        ctx->algorithm != SCALE_NEIGHBOR) {
+        // Bilinear and nearest-neighbor sampling need no coefficient tables.
+        coff_x = av_malloc_array(filter_size_x * ctx->job.dst.width,
+                                 sizeof(float));
+        coff_y = av_malloc_array(filter_size_y * ctx->job.dst.height,
+                                 sizeof(float));
+        if (!coff_x || !coff_y) {
+            err = AVERROR(ENOMEM);
+            goto fail;
+        }
+
+ fill_filter_table(avctx, coff_x, ctx->job.dst.width, filter_size_x,
+ hscale);
+ fill_filter_table(avctx, coff_y, ctx->job.dst.height, filter_size_y,
+ vscale);
+
+ ctx->coff_x = clCreateBuffer(ctx->ocf.hwctx->context,
+ CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
+ ctx->job.dst.width * filter_size_x *
+ sizeof(cl_float),
+ coff_x, &cle);
+
+ ctx->coff_y = clCreateBuffer(ctx->ocf.hwctx->context,
+ CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
+ ctx->job.dst.height * filter_size_y *
+ sizeof(cl_float),
+ coff_y, &cle);
+ av_freep(&coff_x);
+ av_freep(&coff_y);
+ }
+
+ av_bprint_init(&header, 1024, AV_BPRINT_SIZE_AUTOMATIC);
+ err = prepare_opencl_header(avctx, &header);
+ if (err < 0)
+ goto fail;
+ av_log(avctx, AV_LOG_DEBUG, "Generated OpenCL header:\n%s\n", header.str);
+ opencl_sources[0] = header.str;
+ opencl_sources[1] = ff_opencl_source_scale;
+ opencl_sources[2] = ff_opencl_source_colorspace_common;
+ err = ff_opencl_filter_load_program(avctx, opencl_sources, OPENCL_SOURCE_NB);
+
+ av_bprint_finalize(&header, NULL);
+
+ if (err < 0)
+ goto fail;
+
+ ctx->command_queue = clCreateCommandQueue(ctx->ocf.hwctx->context,
+ ctx->ocf.hwctx->device_id,
+ 0, &cle);
+ CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL "
+ "command queue %d.\n", cle);
+
+ k_name = ctx->format != AV_PIX_FMT_NONE ? "convert" : "scale";
+ ctx->kernel = clCreateKernel(ctx->ocf.program, k_name, &cle);
+ CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create kernel %d.\n", cle);
+
+ ctx->initialised = 1;
+ return 0;
+
+fail:
+ if (ctx->command_queue)
+ clReleaseCommandQueue(ctx->command_queue);
+ if (ctx->kernel)
+ clReleaseKernel(ctx->kernel);
+
+ if (coff_x)
+ av_freep(&coff_x);
+ if (coff_y)
+ av_freep(&coff_y);
+
+ av_bprint_finalize(&header, NULL);
+ return err;
+}
+
+static int is_fmt_supported(enum AVPixelFormat fmt)
+{
+ int i;
+
+ for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
+ if (supported_formats[i] == fmt)
+ return 1;
+
+ return 0;
+}
+
+static int scale_opencl_config_output(AVFilterLink *outlink)
+{
+ AVFilterContext *avctx = outlink->src;
+ ScaleOpenCLContext *s = avctx->priv;
+ AVFilterLink *inlink = outlink->src->inputs[0];
+ AVHWFramesContext *in_frames_ctx;
+ enum AVPixelFormat in_format;
+ int w, h, ret;
+
+ if (!strcmp(s->format_str, "same")) {
+ s->format = AV_PIX_FMT_NONE;
+ } else {
+ s->format = av_get_pix_fmt(s->format_str);
+ if (s->format == AV_PIX_FMT_NONE) {
+ av_log(avctx, AV_LOG_ERROR, "Unrecognized pixel format: %s\n", s->format_str);
+ return AVERROR(EINVAL);
+ }
+ }
+
+ if ((ret = ff_scale_eval_dimensions(s,
+ s->w_expr, s->h_expr,
+ inlink, outlink,
+ &w, &h)) < 0)
+ return ret;
+
+ s->ocf.output_width = w;
+ s->ocf.output_height = h;
+ if (s->format != AV_PIX_FMT_NONE)
+ s->ocf.output_format = s->format;
+
+ ret = ff_opencl_filter_config_output(outlink);
+ if (ret < 0)
+ return ret;
+
+ in_frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
+ in_format = in_frames_ctx->sw_format;
+
+ if (!is_fmt_supported(in_format)) {
+ av_log(avctx, AV_LOG_ERROR, "unsupported input format %s\n",
+ av_get_pix_fmt_name(in_format));
+ return AVERROR(ENOSYS);
+ }
+
+ if (!is_fmt_supported(s->ocf.output_format)) {
+ av_log(avctx, AV_LOG_ERROR, "unsupported output format %s\n",
+ av_get_pix_fmt_name(s->ocf.output_format));
+ return AVERROR(ENOSYS);
+ }
+
+ if (inlink->sample_aspect_ratio.num)
+ outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h*inlink->w,
+ outlink->w*inlink->h},
+ inlink->sample_aspect_ratio);
+ else
+ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
+ return 0;
+}
+
+static int enqueue_simple_scale(AVFilterContext *avctx, cl_command_queue queue,
+ cl_kernel kernel, AVFrame *input,
+ AVFrame *output)
+{
+ ScaleOpenCLContext *ctx = avctx->priv;
+ size_t global_work[2];
+ cl_int cle;
+ cl_mem src, dst;
+ cl_int src_width, src_height, dst_width, dst_height;
+ int err, plane, is_chroma;
+
+ for (plane = 0; plane < FF_ARRAY_ELEMS(output->data); plane++) {
+ dst = (cl_mem)output->data[plane];
+ src = (cl_mem)input->data[plane];
+
+        if (!dst)
+            break;
+
+ is_chroma = (plane == 1 || plane == 2);
+ if (!is_chroma) {
+ src_width = input->width;
+ src_height = input->height;
+ dst_width = output->width;
+ dst_height = output->height;
+ } else {
+ src_width = input->width * ctx->job.src.chroma_h / 2;
+ src_height = input->height * ctx->job.src.chroma_v / 2;
+ dst_width = output->width * ctx->job.dst.chroma_h / 2;
+ dst_height = output->height * ctx->job.dst.chroma_v / 2;
+ }
+
+ CL_SET_KERNEL_ARG(kernel, 0, cl_mem, &dst);
+ CL_SET_KERNEL_ARG(kernel, 1, cl_mem, &src);
+ CL_SET_KERNEL_ARG(kernel, 2, cl_int, &dst_width);
+ CL_SET_KERNEL_ARG(kernel, 3, cl_int, &dst_height);
+ CL_SET_KERNEL_ARG(kernel, 4, cl_int, &src_width);
+ CL_SET_KERNEL_ARG(kernel, 5, cl_int, &src_height);
+ CL_SET_KERNEL_ARG(kernel, 6, cl_mem, &ctx->coff_x);
+ CL_SET_KERNEL_ARG(kernel, 7, cl_mem, &ctx->coff_y);
+ CL_SET_KERNEL_ARG(kernel, 8, cl_int2, &ctx->filter_size);
+
+ err = ff_opencl_filter_work_size_from_image(avctx, global_work, output,
+ plane, 8);
+ if (err < 0)
+ return err;
+
+ cle = clEnqueueNDRangeKernel(ctx->command_queue, kernel, 2, NULL,
+ global_work, NULL,
+ 0, NULL, NULL);
+ CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue kernel: %d.\n", cle);
+
+ }
+ return 0;
+fail:
+ return err;
+}
+
+static int enqueue_convert(AVFilterContext *avctx, cl_command_queue queue,
+ cl_kernel kernel, AVFrame *in, AVFrame *out)
+{
+ ScaleOpenCLContext *ctx = avctx->priv;
+ cl_int cle;
+ cl_mem mem;
+ int err;
+ size_t global_work[2];
+
+ global_work[0] = ctx->job.dst.width / 2;
+ global_work[1] = ctx->job.dst.height / 2;
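+
+    // The kernel always takes four destination and four source images;
+    // plane_id[] entries beyond nb_planes default to 0, so unused slots alias
+    // plane 0 and are never accessed thanks to the DST_IMGS/SRC_IMGS guards.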
+#define SET_DST(idx) \
+mem = (cl_mem)out->data[ctx->job.dst.plane_id[idx]]; \
+CL_SET_KERNEL_ARG(kernel, idx, cl_mem, &mem);
+
+#define SET_SRC(idx) \
+mem = (cl_mem)in->data[ctx->job.src.plane_id[idx]]; \
+CL_SET_KERNEL_ARG(kernel, idx + 4, cl_mem, &mem);
+
+ SET_DST(0)
+ SET_DST(1)
+ SET_DST(2)
+ SET_DST(3)
+
+ SET_SRC(0)
+ SET_SRC(1)
+ SET_SRC(2)
+ SET_SRC(3)
+#undef SET_DST
+#undef SET_SRC
+ CL_SET_KERNEL_ARG(kernel, 8, cl_int, &ctx->job.dst.width);
+ CL_SET_KERNEL_ARG(kernel, 9, cl_int, &ctx->job.dst.height);
+ CL_SET_KERNEL_ARG(kernel, 10, cl_int, &ctx->job.src.width);
+ CL_SET_KERNEL_ARG(kernel, 11, cl_int, &ctx->job.src.height);
+ CL_SET_KERNEL_ARG(kernel, 12, cl_mem, &ctx->coff_x);
+ CL_SET_KERNEL_ARG(kernel, 13, cl_mem, &ctx->coff_y);
+ CL_SET_KERNEL_ARG(kernel, 14, cl_int2, &ctx->filter_size);
+
+ cle = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
+ global_work, NULL,
+ 0, NULL, NULL);
+ CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue kernel: %d.\n", cle);
+
+ return 0;
+fail:
+ return err;
+}
+
+static int scale_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input)
+{
+ AVFilterContext *avctx = inlink->dst;
+ AVFilterLink *outlink = avctx->outputs[0];
+ ScaleOpenCLContext *ctx = avctx->priv;
+ AVFrame *output = NULL;
+ cl_kernel kernel;
+ cl_int cle;
+ int err;
+
+ av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n",
+ av_get_pix_fmt_name(input->format),
+ input->width, input->height, input->pts);
+
+ if (!input->hw_frames_ctx)
+ return AVERROR(EINVAL);
+
+ output = ff_get_video_buffer(outlink, outlink->w, outlink->h);
+ if (!output) {
+ err = AVERROR(ENOMEM);
+ goto fail;
+ }
+
+ err = av_frame_copy_props(output, input);
+ if (err < 0)
+ goto fail;
+
+ if (ctx->colorspace != -1)
+ ctx->colorspace_in = ctx->colorspace;
+ else
+ ctx->colorspace_in = input->colorspace;
+ // default to bt601
+ if (ctx->colorspace_in == AVCOL_SPC_UNSPECIFIED)
+ ctx->colorspace_in = AVCOL_SPC_BT470BG;
+ ctx->range_in = input->color_range;
+
+ if (!ctx->initialised) {
+ err = init_jobs(avctx, input, output);
+ if (err < 0)
+ goto fail;
+
+ err = scale_opencl_init(avctx);
+ if (err < 0)
+ goto fail;
+ }
+
+ kernel = ctx->kernel;
+
+ if (ctx->format != AV_PIX_FMT_NONE)
+ err = enqueue_convert(avctx, ctx->command_queue, kernel, input, output);
+ else
+ err = enqueue_simple_scale(avctx, ctx->command_queue, kernel, input,
+ output);
+ if (err < 0)
+ goto fail;
+
+ cle = clFinish(ctx->command_queue);
+ CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish command queue: %d.\n", cle);
+
+ av_frame_free(&input);
+
+ av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n",
+ av_get_pix_fmt_name(output->format),
+ output->width, output->height, output->pts);
+
+ return ff_filter_frame(outlink, output);
+
+fail:
+ clFinish(ctx->command_queue);
+ av_frame_free(&input);
+ av_frame_free(&output);
+ return err;
+}
+
+static av_cold void scale_opencl_uninit(AVFilterContext *avctx)
+{
+ ScaleOpenCLContext *ctx = avctx->priv;
+ cl_int cle;
+
+ if (ctx->kernel) {
+ cle = clReleaseKernel(ctx->kernel);
+ if (cle != CL_SUCCESS)
+ av_log(avctx, AV_LOG_ERROR, "Failed to release "
+ "kernel: %d.\n", cle);
+ }
+
+ if (ctx->command_queue) {
+ cle = clReleaseCommandQueue(ctx->command_queue);
+ if (cle != CL_SUCCESS)
+ av_log(avctx, AV_LOG_ERROR, "Failed to release "
+ "command queue: %d.\n", cle);
+ }
+
+    if (ctx->coff_x)
+        clReleaseMemObject(ctx->coff_x);
+
+    if (ctx->coff_y)
+        clReleaseMemObject(ctx->coff_y);
+
+ ff_opencl_filter_uninit(avctx);
+}
+
+#define OFFSET(x) offsetof(ScaleOpenCLContext, x)
+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
+static const AVOption scale_opencl_options[] = {
+ { "w", "Output video width", OFFSET(w_expr), AV_OPT_TYPE_STRING, { .str = "iw" }, .flags = FLAGS },
+ { "h", "Output video height", OFFSET(h_expr), AV_OPT_TYPE_STRING, { .str = "ih" }, .flags = FLAGS },
+ { "format", "Output pixel format", OFFSET(format_str), AV_OPT_TYPE_STRING, { .str = "same" }, .flags = FLAGS },
+ { "in_color_matrix", "set input YCbCr type", OFFSET(colorspace), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT_MAX, FLAGS, "matrix" },
+ { "bt709", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709}, 0, 0, FLAGS, "matrix" },
+ { "fcc", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_FCC}, 0, 0, FLAGS, "matrix" },
+ { "bt470bg", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT470BG}, 0, 0, FLAGS, "matrix" },
+ { "smpte170m", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_SMPTE170M}, 0, 0, FLAGS, "matrix" },
+ { "smpte240m", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_SMPTE240M}, 0, 0, FLAGS, "matrix" },
+ { "bt2020", 0, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT2020_NCL}, 0, 0, FLAGS, "matrix" },
+ { "algorithm", "Scaling algorithm", OFFSET(algorithm), AV_OPT_TYPE_INT, { .i64 = SCALE_BILINEAR }, INT_MIN, INT_MAX, FLAGS, "algorithm" },
+ { "neighbor", "nearest neighbor", 0, AV_OPT_TYPE_CONST, { .i64 = SCALE_NEIGHBOR}, 0, 0, FLAGS, "algorithm" },
+ { "bilinear", "bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SCALE_BILINEAR}, 0, 0, FLAGS, "algorithm" },
+ { "bicubic", "bicubic", 0, AV_OPT_TYPE_CONST, { .i64 = SCALE_BICUBIC}, 0, 0, FLAGS, "algorithm" },
+ { NULL }
+};
+
+AVFILTER_DEFINE_CLASS(scale_opencl);
+
+static const AVFilterPad scale_opencl_inputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO,
+ .filter_frame = &scale_opencl_filter_frame,
+ .config_props = &ff_opencl_filter_config_input,
+ },
+ { NULL }
+};
+
+static const AVFilterPad scale_opencl_outputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO,
+ .config_props = &scale_opencl_config_output,
+ },
+ { NULL }
+};
+
+AVFilter ff_vf_scale_opencl = {
+ .name = "scale_opencl",
+    .description    = NULL_IF_CONFIG_SMALL("Scale the input video size and/or convert the image format, using OpenCL."),
+ .priv_size = sizeof(ScaleOpenCLContext),
+ .priv_class = &scale_opencl_class,
+ .init = &ff_opencl_filter_init,
+ .uninit = &scale_opencl_uninit,
+ .query_formats = &ff_opencl_filter_query_formats,
+ .inputs = scale_opencl_inputs,
+ .outputs = scale_opencl_outputs,
+ .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
+};
--
2.7.4