[FFmpeg-devel] [PATCH 07/17] swscale/optimizer: add high-level ops optimizer
Niklas Haas
ffmpeg at haasn.xyz
Sun May 18 17:59:43 EEST 2025
From: Niklas Haas <git at haasn.dev>
This is responsible for taking a "naive" ops list and optimizing it
as much as possible. Also includes a small analyzer that generates component
metadata for use by the optimizer.
---
libswscale/Makefile | 1 +
libswscale/ops.h | 12 +
libswscale/ops_optimizer.c | 781 +++++++++++++++++++++++++++++++++++++
3 files changed, 794 insertions(+)
create mode 100644 libswscale/ops_optimizer.c
diff --git a/libswscale/Makefile b/libswscale/Makefile
index e0beef4e69..810c9dee78 100644
--- a/libswscale/Makefile
+++ b/libswscale/Makefile
@@ -16,6 +16,7 @@ OBJS = alphablend.o \
input.o \
lut3d.o \
ops.o \
+ ops_optimizer.o \
options.o \
output.o \
rgb2rgb.o \
diff --git a/libswscale/ops.h b/libswscale/ops.h
index 85462ae337..ae65d578b3 100644
--- a/libswscale/ops.h
+++ b/libswscale/ops.h
@@ -237,4 +237,16 @@ void ff_sws_op_list_remove_at(SwsOpList *ops, int index, int count);
*/
void ff_sws_op_list_print(void *log_ctx, int log_level, const SwsOpList *ops);
+/**
+ * Infer + propagate known information about components. Called automatically
+ * when needed by the optimizer and compiler.
+ */
+void ff_sws_op_list_update_comps(SwsOpList *ops);
+
+/**
+ * Fuse compatible operations, eliminate redundant ones, and replace
+ * some operations with more efficient alternatives.
+ */
+int ff_sws_op_list_optimize(SwsOpList *ops);
+
#endif
diff --git a/libswscale/ops_optimizer.c b/libswscale/ops_optimizer.c
new file mode 100644
index 0000000000..829e691b3f
--- /dev/null
+++ b/libswscale/ops_optimizer.c
@@ -0,0 +1,781 @@
+/**
+ * Copyright (C) 2025 Niklas Haas
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/avassert.h"
+#include "libavutil/rational.h"
+
+#include "ops.h"
+
/* Shorthand for constructing an integer-valued AVRational */
#define Q(N) ((AVRational) { N, 1 })

/* Evaluate x, propagating a negative error code to the caller;
 * requires a local `int ret` in scope */
#define RET(x) \
    do { \
        if ((ret = (x)) < 0) \
            return ret; \
    } while (0)
+
/* Returns true for operations that are independent per channel. These can
 * usually be commuted freely with other such operations. */
static bool op_type_is_independent(SwsOpType op)
{
    switch (op) {
    case SWS_OP_SWAP_BYTES:
    case SWS_OP_LSHIFT:
    case SWS_OP_RSHIFT:
    case SWS_OP_CONVERT:
    case SWS_OP_DITHER:
    case SWS_OP_MIN:
    case SWS_OP_MAX:
    case SWS_OP_SCALE:
        return true;
    case SWS_OP_INVALID:
    case SWS_OP_READ:
    case SWS_OP_WRITE:
    case SWS_OP_SWIZZLE:
    case SWS_OP_CLEAR:
    case SWS_OP_LINEAR:
    case SWS_OP_PACK:
    case SWS_OP_UNPACK:
        return false;
    case SWS_OP_TYPE_NB:
        break;
    }

    av_assert0(!"Invalid operation type!");
    return false;
}
+
/* Returns the integer factor that replicates a `from`-sized value across a
 * `to`-sized word, e.g. 8-bit -> 16-bit yields 0x0101 so that
 * 0xAB * 0x0101 == 0xABAB. Used to recognize integer range expansion.
 * NOTE(review): assumes the result fits in an int, i.e. dst is at most
 * 4 bytes — confirm no larger pixel sizes reach this helper. */
static AVRational expand_factor(SwsPixelType from, SwsPixelType to)
{
    const int src = ff_sws_pixel_type_size(from);
    const int dst = ff_sws_pixel_type_size(to);
    int scale = 0;
    for (int i = 0; i < dst / src; i++)
        scale = scale << src * 8 | 1;
    return Q(scale);
}
+
+/* merge_comp_flags() forms a monoid with flags_identity as the null element */
+static const unsigned flags_identity = SWS_COMP_ZERO | SWS_COMP_EXACT;
+static unsigned merge_comp_flags(unsigned a, unsigned b)
+{
+ const unsigned flags_or = SWS_COMP_GARBAGE;
+ const unsigned flags_and = SWS_COMP_ZERO | SWS_COMP_EXACT;
+ return ((a & b) & flags_and) | ((a | b) & flags_or);
+}
+
/**
 * Infer + propagate known information about components.
 *
 * Two passes over the op list:
 *  - a forwards pass that tracks, per component, value flags
 *    (zero / exact / garbage) and conservative min/max bounds as seen
 *    after each operation;
 *  - a backwards pass that marks components whose value is never consumed
 *    by any later operation ("unused").
 * The results are stored in each op's `comps` field for the optimizer.
 */
void ff_sws_op_list_update_comps(SwsOpList *ops)
{
    /* Seed for the backwards pass: nothing after the last op uses anything */
    SwsComps next = { .unused = {true, true, true, true} };
    /* Seed for the forwards pass: before the first read, all components
     * contain garbage */
    SwsComps prev = { .flags = {
        SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, SWS_COMP_GARBAGE, SWS_COMP_GARBAGE,
    }};

    /* Forwards pass, propagates knowledge about the incoming pixel values */
    for (int n = 0; n < ops->num_ops; n++) {
        SwsOp *op = &ops->ops[n];

        /* Prefill min/max values automatically; may have to be fixed in
         * special cases */
        memcpy(op->comps.min, prev.min, sizeof(prev.min));
        memcpy(op->comps.max, prev.max, sizeof(prev.max));
        ff_sws_apply_op_q(op, op->comps.min);
        ff_sws_apply_op_q(op, op->comps.max);

        switch (op->op) {
        case SWS_OP_READ:
            for (int i = 0; i < op->rw.elems; i++) {
                if (ff_sws_pixel_type_is_int(op->type)) {
                    int bits = 8 * ff_sws_pixel_type_size(op->type);
                    if (!op->rw.packed && ops->src.desc) {
                        /* Use legal value range from pixdesc if available;
                         * we don't need to do this for packed formats because
                         * non-byte-aligned packed formats will necessarily go
                         * through SWS_OP_UNPACK anyway */
                        for (int c = 0; c < 4; c++) {
                            if (ops->src.desc->comp[c].plane == i) {
                                bits = ops->src.desc->comp[c].depth;
                                break;
                            }
                        }
                    }

                    /* Freshly read integers are exact and bounded by depth */
                    op->comps.flags[i] = SWS_COMP_EXACT;
                    op->comps.min[i] = Q(0);
                    op->comps.max[i] = Q((1 << bits) - 1);
                }
            }
            for (int i = op->rw.elems; i < 4; i++)
                op->comps.flags[i] = prev.flags[i];
            break;
        case SWS_OP_WRITE:
            /* Writing out garbage would be a bug in op list construction */
            for (int i = 0; i < op->rw.elems; i++)
                av_assert1(!(prev.flags[i] & SWS_COMP_GARBAGE));
            /* fall through */
        case SWS_OP_SWAP_BYTES:
        case SWS_OP_LSHIFT:
        case SWS_OP_RSHIFT:
        case SWS_OP_MIN:
        case SWS_OP_MAX:
            /* Linearly propagate flags per component */
            for (int i = 0; i < 4; i++)
                op->comps.flags[i] = prev.flags[i];
            break;
        case SWS_OP_DITHER:
            /* Strip zero flag because of the nonzero dithering offset */
            for (int i = 0; i < 4; i++)
                op->comps.flags[i] = prev.flags[i] & ~SWS_COMP_ZERO;
            break;
        case SWS_OP_UNPACK:
            /* All unpacked components inherit from the packed input in
             * comp 0; unpopulated lanes become garbage */
            for (int i = 0; i < 4; i++) {
                if (op->pack.pattern[i])
                    op->comps.flags[i] = prev.flags[0];
                else
                    op->comps.flags[i] = SWS_COMP_GARBAGE;
            }
            break;
        case SWS_OP_PACK: {
            /* The packed output in comp 0 merges all packed inputs */
            unsigned flags = flags_identity;
            for (int i = 0; i < 4; i++) {
                if (op->pack.pattern[i])
                    flags = merge_comp_flags(flags, prev.flags[i]);
                if (i > 0) /* clear remaining comps for sanity */
                    op->comps.flags[i] = SWS_COMP_GARBAGE;
            }
            op->comps.flags[0] = flags;
            break;
        }
        case SWS_OP_CLEAR:
            for (int i = 0; i < 4; i++) {
                /* q4[i].den != 0 means this component gets cleared */
                if (op->c.q4[i].den) {
                    if (op->c.q4[i].num == 0) {
                        op->comps.flags[i] = SWS_COMP_ZERO | SWS_COMP_EXACT;
                    } else if (op->c.q4[i].den == 1) {
                        op->comps.flags[i] = SWS_COMP_EXACT;
                    }
                } else {
                    op->comps.flags[i] = prev.flags[i];
                }
            }
            break;
        case SWS_OP_SWIZZLE:
            for (int i = 0; i < 4; i++)
                op->comps.flags[i] = prev.flags[op->swizzle.in[i]];
            break;
        case SWS_OP_CONVERT:
            for (int i = 0; i < 4; i++) {
                op->comps.flags[i] = prev.flags[i];
                /* Conversion to an integer type makes the value exact */
                if (ff_sws_pixel_type_is_int(op->convert.to))
                    op->comps.flags[i] |= SWS_COMP_EXACT;
            }
            break;
        case SWS_OP_LINEAR:
            /* Each output row merges flags of all contributing inputs and
             * accumulates its min/max bounds coefficient by coefficient */
            for (int i = 0; i < 4; i++) {
                unsigned flags = flags_identity;
                AVRational min = Q(0), max = Q(0);
                for (int j = 0; j < 4; j++) {
                    const AVRational k = op->lin.m[i][j];
                    AVRational mink = av_mul_q(prev.min[j], k);
                    AVRational maxk = av_mul_q(prev.max[j], k);
                    if (k.num) {
                        flags = merge_comp_flags(flags, prev.flags[j]);
                        if (k.den != 1) /* fractional coefficient */
                            flags &= ~SWS_COMP_EXACT;
                        if (k.num < 0) /* negative factor flips the bounds */
                            FFSWAP(AVRational, mink, maxk);
                        min = av_add_q(min, mink);
                        max = av_add_q(max, maxk);
                    }
                }
                if (op->lin.m[i][4].num) { /* nonzero offset */
                    flags &= ~SWS_COMP_ZERO;
                    if (op->lin.m[i][4].den != 1) /* fractional offset */
                        flags &= ~SWS_COMP_EXACT;
                    min = av_add_q(min, op->lin.m[i][4]);
                    max = av_add_q(max, op->lin.m[i][4]);
                }
                op->comps.flags[i] = flags;
                op->comps.min[i] = min;
                op->comps.max[i] = max;
            }
            break;
        case SWS_OP_SCALE:
            for (int i = 0; i < 4; i++) {
                op->comps.flags[i] = prev.flags[i];
                if (op->c.q.den != 1) /* fractional scale */
                    op->comps.flags[i] &= ~SWS_COMP_EXACT;
                if (op->c.q.num < 0) /* negative scale flips the bounds */
                    FFSWAP(AVRational, op->comps.min[i], op->comps.max[i]);
            }
            break;

        case SWS_OP_INVALID:
        case SWS_OP_TYPE_NB:
            av_assert0(!"Invalid operation type!");
        }

        prev = op->comps;
    }

    /* Backwards pass, solves for component dependencies */
    for (int n = ops->num_ops - 1; n >= 0; n--) {
        SwsOp *op = &ops->ops[n];

        switch (op->op) {
        case SWS_OP_READ:
        case SWS_OP_WRITE:
            /* A read overwrites its components (prior values unused);
             * a write consumes them */
            for (int i = 0; i < op->rw.elems; i++)
                op->comps.unused[i] = op->op == SWS_OP_READ;
            for (int i = op->rw.elems; i < 4; i++)
                op->comps.unused[i] = next.unused[i];
            break;
        case SWS_OP_SWAP_BYTES:
        case SWS_OP_LSHIFT:
        case SWS_OP_RSHIFT:
        case SWS_OP_CONVERT:
        case SWS_OP_DITHER:
        case SWS_OP_MIN:
        case SWS_OP_MAX:
        case SWS_OP_SCALE:
            /* Per-channel ops pass "unused" state through unchanged */
            for (int i = 0; i < 4; i++)
                op->comps.unused[i] = next.unused[i];
            break;
        case SWS_OP_UNPACK: {
            /* The packed input (comp 0) is unused only if every unpacked
             * output is unused */
            bool unused = true;
            for (int i = 0; i < 4; i++) {
                if (op->pack.pattern[i])
                    unused &= next.unused[i];
                op->comps.unused[i] = i > 0;
            }
            op->comps.unused[0] = unused;
            break;
        }
        case SWS_OP_PACK:
            /* Each packed input is needed iff the packed output is */
            for (int i = 0; i < 4; i++) {
                if (op->pack.pattern[i])
                    op->comps.unused[i] = next.unused[0];
                else
                    op->comps.unused[i] = true;
            }
            break;
        case SWS_OP_CLEAR:
            /* A cleared component's previous value is never needed */
            for (int i = 0; i < 4; i++) {
                if (op->c.q4[i].den)
                    op->comps.unused[i] = true;
                else
                    op->comps.unused[i] = next.unused[i];
            }
            break;
        case SWS_OP_SWIZZLE: {
            /* An input is used if any output lane selecting it is used */
            bool unused[4] = { true, true, true, true };
            for (int i = 0; i < 4; i++)
                unused[op->swizzle.in[i]] &= next.unused[i];
            for (int i = 0; i < 4; i++)
                op->comps.unused[i] = unused[i];
            break;
        }
        case SWS_OP_LINEAR:
            /* Input j is used if any used output row has a nonzero
             * coefficient for it */
            for (int j = 0; j < 4; j++) {
                bool unused = true;
                for (int i = 0; i < 4; i++) {
                    if (op->lin.m[i][j].num)
                        unused &= next.unused[i];
                }
                op->comps.unused[j] = unused;
            }
            break;
        }

        next = op->comps;
    }
}
+
/* returns log2(x) only if x is a power of two, or 0 otherwise */
static int exact_log2(const int x)
{
    /* Reject non-positive values and anything with more than one bit set */
    if (x <= 0 || (x & (x - 1)) != 0)
        return 0;

    /* x is a power of two; find the position of its single set bit */
    int p = 0;
    while ((x >> p) > 1)
        p++;
    return p;
}
+
+static int exact_log2_q(const AVRational x)
+{
+ if (x.den == 1)
+ return exact_log2(x.num);
+ else if (x.num == 1)
+ return -exact_log2(x.den);
+ else
+ return 0;
+}
+
/**
 * Checks whether a linear operation reduces to multiplication by a single
 * scalar constant (a constant diagonal with no offset). If so, writes the
 * common factor to *out_scale and returns true; returns false otherwise.
 */
static bool extract_scalar(const SwsLinearOp *c, SwsComps prev, SwsComps next,
                           SwsConst *out_scale)
{
    SwsConst scale = {0};

    /* There are components not on the main diagonal */
    if (c->mask & ~SWS_MASK_DIAG4)
        return false;

    for (int i = 0; i < 4; i++) {
        const AVRational s = c->m[i][i];
        /* Skip components that are known zero or never consumed */
        if ((prev.flags[i] & SWS_COMP_ZERO) || next.unused[i])
            continue;
        /* All relevant diagonal entries must agree on one value */
        if (scale.q.den && av_cmp_q(s, scale.q))
            return false;
        scale.q = s;
    }

    if (scale.q.den)
        *out_scale = scale;
    return scale.q.den;
}
+
/* Extracts an integer clear operation (subset) from the given linear op.
 * Rows whose output is a constant (every coefficient zero or applied to a
 * known-zero input, with an integer offset) are reset to identity in *c and
 * their offsets collected into *out_clear. Returns true if at least one row
 * was extracted. */
static bool extract_constant_rows(SwsLinearOp *c, SwsComps prev,
                                  SwsConst *out_clear)
{
    SwsConst clear = {0};
    bool ret = false;

    for (int i = 0; i < 4; i++) {
        bool const_row = c->m[i][4].den == 1; /* offset is integer */
        for (int j = 0; j < 4; j++) {
            const_row &= c->m[i][j].num == 0 || /* scalar is zero */
                         (prev.flags[j] & SWS_COMP_ZERO); /* input is zero */
        }
        if (const_row && (c->mask & SWS_MASK_ROW(i))) {
            /* Move the row's constant offset into the clear op and reset
             * the row to identity */
            clear.q4[i] = c->m[i][4];
            for (int j = 0; j < 5; j++)
                c->m[i][j] = Q(i == j);
            c->mask &= ~SWS_MASK_ROW(i);
            ret = true;
        }
    }

    if (ret)
        *out_clear = clear;
    return ret;
}
+
/* Unswizzle a linear operation by aligning single-input rows with
 * their corresponding diagonal. On success, *op is rewritten with the
 * coefficients moved onto the diagonal, the required input permutation is
 * returned in *out_swiz, and true is returned. Returns false if any row
 * mixes multiple inputs or no permutation was needed. */
static bool extract_swizzle(SwsLinearOp *op, SwsComps prev, SwsSwizzleOp *out_swiz)
{
    SwsSwizzleOp swiz = SWS_SWIZZLE(0, 1, 2, 3);
    SwsLinearOp c = *op;

    for (int i = 0; i < 4; i++) {
        /* Find the single contributing input for this row, if any */
        int idx = -1;
        for (int j = 0; j < 4; j++) {
            if (!c.m[i][j].num || (prev.flags[j] & SWS_COMP_ZERO))
                continue;
            if (idx >= 0)
                return false; /* multiple inputs */
            idx = j;
        }

        if (idx >= 0 && idx != i) {
            /* Move coefficient to the diagonal */
            c.m[i][i] = c.m[i][idx];
            c.m[i][idx] = Q(0);
            swiz.in[i] = idx;
        }
    }

    if (swiz.mask == SWS_SWIZZLE(0, 1, 2, 3).mask)
        return false; /* no swizzle was identified */

    c.mask = ff_sws_linear_mask(c);
    *out_swiz = swiz;
    *op = c;
    return true;
}
+
+static void op_copy_flags(SwsOp *op, const SwsOp *op2)
+{
+ for (int i = 0; i < 4; i++)
+ op->comps.flags[i] = op2->comps.flags[i];
+}
+
/**
 * Fuse compatible and eliminate redundant operations, as well as replacing
 * some operations with more efficient alternatives.
 *
 * Applies local rewrite rules to the op list; after every successful
 * rewrite, control jumps back to `retry` so the component metadata is
 * refreshed and the scan restarts, until a fixed point is reached.
 *
 * Returns 0 on success or a negative error code on failure.
 */
int ff_sws_op_list_optimize(SwsOpList *ops)
{
    int ret;

retry:
    ff_sws_op_list_update_comps(ops);

    for (int n = 0; n < ops->num_ops;) {
        /* `dummy` stands in for the (nonexistent) neighbor at either end of
         * the list, so `prev`/`next` are always dereferenceable */
        SwsOp dummy = {0};
        SwsOp *op = &ops->ops[n];
        SwsOp *prev = n ? &ops->ops[n - 1] : &dummy;
        SwsOp *next = n + 1 < ops->num_ops ? &ops->ops[n + 1] : &dummy;

        /* common helper variable */
        bool noop = true;

        switch (op->op) {
        case SWS_OP_READ:
            /* Optimized further into refcopy / memcpy */
            if (next->op == SWS_OP_WRITE &&
                next->rw.elems == op->rw.elems &&
                next->rw.packed == op->rw.packed &&
                next->rw.frac == op->rw.frac)
            {
                ff_sws_op_list_remove_at(ops, n, 2);
                /* a plain read+write pair must be the entire program */
                av_assert1(ops->num_ops == 0);
                return 0;
            }

            /* Skip reading extra unneeded components */
            if (!op->rw.packed) {
                int needed = op->rw.elems;
                while (needed > 0 && next->comps.unused[needed - 1])
                    needed--;
                if (op->rw.elems != needed) {
                    op->rw.elems = needed;
                    op->rw.packed &= op->rw.elems > 1;
                    goto retry;
                }
            }
            break;

        case SWS_OP_SWAP_BYTES:
            /* Redundant (double) swap */
            if (next->op == SWS_OP_SWAP_BYTES) {
                ff_sws_op_list_remove_at(ops, n, 2);
                goto retry;
            }
            break;

        case SWS_OP_UNPACK:
            /* Redundant unpack+pack */
            if (next->op == SWS_OP_PACK && next->type == op->type &&
                next->pack.pattern[0] == op->pack.pattern[0] &&
                next->pack.pattern[1] == op->pack.pattern[1] &&
                next->pack.pattern[2] == op->pack.pattern[2] &&
                next->pack.pattern[3] == op->pack.pattern[3])
            {
                ff_sws_op_list_remove_at(ops, n, 2);
                goto retry;
            }
            break;

        case SWS_OP_LSHIFT:
        case SWS_OP_RSHIFT:
            /* Two shifts in the same direction */
            if (next->op == op->op) {
                op->c.u += next->c.u;
                ff_sws_op_list_remove_at(ops, n + 1, 1);
                goto retry;
            }

            /* No-op shift */
            if (!op->c.u) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }
            break;

        case SWS_OP_CLEAR:
            for (int i = 0; i < 4; i++) {
                /* den == 0 marks a component this clear does not touch */
                if (!op->c.q4[i].den)
                    continue;

                if ((prev->comps.flags[i] & SWS_COMP_ZERO) &&
                    !(prev->comps.flags[i] & SWS_COMP_GARBAGE) &&
                    op->c.q4[i].num == 0)
                {
                    /* Redundant clear-to-zero of zero component */
                    op->c.q4[i].den = 0;
                } else if (next->comps.unused[i]) {
                    /* Unnecessary clear of unused component */
                    op->c.q4[i] = (AVRational) {0, 0};
                } else if (op->c.q4[i].den) {
                    noop = false;
                }
            }

            if (noop) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }

            /* Transitive clear */
            if (next->op == SWS_OP_CLEAR) {
                for (int i = 0; i < 4; i++) {
                    if (next->c.q4[i].den)
                        op->c.q4[i] = next->c.q4[i];
                }
                ff_sws_op_list_remove_at(ops, n + 1, 1);
                goto retry;
            }

            /* Prefer to clear as late as possible, to avoid doing
             * redundant work */
            if ((op_type_is_independent(next->op) && next->op != SWS_OP_SWAP_BYTES) ||
                next->op == SWS_OP_SWIZZLE)
            {
                if (next->op == SWS_OP_CONVERT)
                    op->type = next->convert.to;
                /* Transform the clear constants along with the values */
                ff_sws_apply_op_q(next, op->c.q4);
                FFSWAP(SwsOp, *op, *next);
                goto retry;
            }
            break;

        case SWS_OP_SWIZZLE: {
            bool seen[4] = {0};
            bool has_duplicates = false;
            for (int i = 0; i < 4; i++) {
                if (next->comps.unused[i])
                    continue;
                if (op->swizzle.in[i] != i)
                    noop = false;
                has_duplicates |= seen[op->swizzle.in[i]];
                seen[op->swizzle.in[i]] = true;
            }

            /* Identity swizzle */
            if (noop) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }

            /* Transitive swizzle */
            if (next->op == SWS_OP_SWIZZLE) {
                const SwsSwizzleOp orig = op->swizzle;
                for (int i = 0; i < 4; i++)
                    op->swizzle.in[i] = orig.in[next->swizzle.in[i]];
                op_copy_flags(op, next);
                ff_sws_op_list_remove_at(ops, n + 1, 1);
                goto retry;
            }

            /* Try to push swizzles with duplicates towards the output */
            if (has_duplicates && op_type_is_independent(next->op)) {
                if (next->op == SWS_OP_CONVERT)
                    op->type = next->convert.to;
                if (next->op == SWS_OP_MIN || next->op == SWS_OP_MAX) {
                    /* Un-swizzle the next operation */
                    const SwsConst c = next->c;
                    for (int i = 0; i < 4; i++) {
                        if (!next->comps.unused[i])
                            next->c.q4[op->swizzle.in[i]] = c.q4[i];
                    }
                }
                FFSWAP(SwsOp, *op, *next);
                goto retry;
            }
            break;
        }

        case SWS_OP_CONVERT:
            /* No-op conversion */
            if (op->type == op->convert.to) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }

            /* Transitive conversion */
            if (next->op == SWS_OP_CONVERT &&
                op->convert.expand == next->convert.expand)
            {
                av_assert1(op->convert.to == next->type);
                op->convert.to = next->convert.to;
                op_copy_flags(op, next);
                ff_sws_op_list_remove_at(ops, n + 1, 1);
                goto retry;
            }

            /* Conversion followed by integer expansion */
            if (next->op == SWS_OP_SCALE &&
                !av_cmp_q(next->c.q, expand_factor(op->type, op->convert.to)))
            {
                op->convert.expand = true;
                ff_sws_op_list_remove_at(ops, n + 1, 1);
                goto retry;
            }
            break;

        case SWS_OP_MIN:
            /* A min is a no-op if the clamp can never bind, i.e. the
             * constant is not below the known maximum of any used input */
            for (int i = 0; i < 4; i++) {
                if (next->comps.unused[i] || !op->c.q4[i].den)
                    continue;
                if (av_cmp_q(op->c.q4[i], prev->comps.max[i]) < 0)
                    noop = false;
            }

            if (noop) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }
            break;

        case SWS_OP_MAX:
            /* Symmetric to SWS_OP_MIN, against the known minimum */
            for (int i = 0; i < 4; i++) {
                if (next->comps.unused[i] || !op->c.q4[i].den)
                    continue;
                if (av_cmp_q(prev->comps.min[i], op->c.q4[i]) < 0)
                    noop = false;
            }

            if (noop) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }
            break;

        case SWS_OP_DITHER:
            /* Dithering exact values (or unused components) does nothing */
            for (int i = 0; i < 4; i++) {
                noop &= (prev->comps.flags[i] & SWS_COMP_EXACT) ||
                        next->comps.unused[i];
            }

            if (noop) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }
            break;

        case SWS_OP_LINEAR: {
            SwsSwizzleOp swizzle;
            SwsConst c;

            /* No-op (identity) linear operation */
            if (!op->lin.mask) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }

            if (next->op == SWS_OP_LINEAR) {
                /* 5x5 matrix multiplication after appending [ 0 0 0 0 1 ] */
                const SwsLinearOp m1 = op->lin;
                const SwsLinearOp m2 = next->lin;
                for (int i = 0; i < 4; i++) {
                    for (int j = 0; j < 5; j++) {
                        AVRational sum = Q(0);
                        for (int k = 0; k < 4; k++)
                            sum = av_add_q(sum, av_mul_q(m2.m[i][k], m1.m[k][j]));
                        if (j == 4) /* m1.m[4][j] == 1 */
                            sum = av_add_q(sum, m2.m[i][4]);
                        op->lin.m[i][j] = sum;
                    }
                }
                op_copy_flags(op, next);
                op->lin.mask = ff_sws_linear_mask(op->lin);
                ff_sws_op_list_remove_at(ops, n + 1, 1);
                goto retry;
            }

            /* Optimize away zero columns */
            for (int j = 0; j < 4; j++) {
                const uint32_t col = SWS_MASK_COL(j);
                if (!(prev->comps.flags[j] & SWS_COMP_ZERO) || !(op->lin.mask & col))
                    continue;
                for (int i = 0; i < 4; i++)
                    op->lin.m[i][j] = Q(i == j);
                op->lin.mask &= ~col;
                goto retry;
            }

            /* Optimize away unused rows */
            for (int i = 0; i < 4; i++) {
                const uint32_t row = SWS_MASK_ROW(i);
                if (!next->comps.unused[i] || !(op->lin.mask & row))
                    continue;
                for (int j = 0; j < 5; j++)
                    op->lin.m[i][j] = Q(i == j);
                op->lin.mask &= ~row;
                goto retry;
            }

            /* Convert constant rows to explicit clear instruction */
            if (extract_constant_rows(&op->lin, prev->comps, &c)) {
                RET(ff_sws_op_list_insert_at(ops, n + 1, &(SwsOp) {
                    .op = SWS_OP_CLEAR,
                    .type = op->type,
                    .comps = op->comps,
                    .c = c,
                }));
                goto retry;
            }

            /* Multiplication by scalar constant */
            if (extract_scalar(&op->lin, prev->comps, next->comps, &c)) {
                op->op = SWS_OP_SCALE;
                op->c = c;
                goto retry;
            }

            /* Swizzle by fixed pattern */
            if (extract_swizzle(&op->lin, prev->comps, &swizzle)) {
                RET(ff_sws_op_list_insert_at(ops, n, &(SwsOp) {
                    .op = SWS_OP_SWIZZLE,
                    .type = op->type,
                    .swizzle = swizzle,
                }));
                goto retry;
            }
            break;
        }

        case SWS_OP_SCALE: {
            const int factor2 = exact_log2_q(op->c.q);

            /* No-op scaling */
            if (op->c.q.num == 1 && op->c.q.den == 1) {
                ff_sws_op_list_remove_at(ops, n, 1);
                goto retry;
            }

            /* Scaling by integer before conversion to int */
            if (op->c.q.den == 1 &&
                next->op == SWS_OP_CONVERT &&
                ff_sws_pixel_type_is_int(next->convert.to))
            {
                op->type = next->convert.to;
                FFSWAP(SwsOp, *op, *next);
                goto retry;
            }

            /* Scaling by exact power of two */
            if (factor2 && ff_sws_pixel_type_is_int(op->type)) {
                op->op = factor2 > 0 ? SWS_OP_LSHIFT : SWS_OP_RSHIFT;
                op->c.u = FFABS(factor2);
                goto retry;
            }
            break;
        }
        }

        /* No optimization triggered, move on to next operation */
        n++;
    }

    return 0;
}
--
2.49.0
More information about the ffmpeg-devel
mailing list