[FFmpeg-devel] [PATCH] vf_overlay: unroll Y plane output computation in blend_slice()
Stefano Sabatini
stefasab at gmail.com
Mon Oct 31 12:46:45 CET 2011
Faster, as it avoids unnecessary comparison instructions.
Improves performance, from:
47528710 dezicycles in blend_slice, 8192 runs, 0 skips
to:
40958444 dezicycles in blend_slice, 8192 runs, 0 skips
---
libavfilter/vf_overlay.c | 36 ++++++++++++++++++++++++++++++------
1 files changed, 30 insertions(+), 6 deletions(-)
diff --git a/libavfilter/vf_overlay.c b/libavfilter/vf_overlay.c
index 0698299..22a14bf 100644
--- a/libavfilter/vf_overlay.c
+++ b/libavfilter/vf_overlay.c
@@ -425,15 +425,39 @@ static void blend_slice(AVFilterContext *ctx,
sp += src->linesize[0];
}
} else {
- for (i = 0; i < 3; i++) {
+ uint8_t *dp, *sp, *ap;
+
+ /* compute Y */
+ dp = dst->data[Y] + x + start_y * dst->linesize[Y];
+ sp = src->data[Y];
+ ap = src->data[A];
+
+ if (slice_y > y) {
+ sp += (slice_y - y) * src->linesize[Y];
+ ap += (slice_y - y) * src->linesize[A];
+ }
+ for (i = 0; i < height; i++) {
+ uint8_t *d = dp, *s = sp, *a = ap;
+ for (j = 0; j < width; j++) {
+ *d = FAST_DIV255(*d * (255 - *a) + *s * *a);
+ s++;
+ d++;
+ a++;
+ }
+ dp += dst->linesize[Y];
+ sp += src->linesize[Y];
+ ap += src->linesize[A];
+ }
+
+ /* compute U and V */
+ for (i = 1; i < A; i++) {
int hsub = i ? over->hsub : 0;
int vsub = i ? over->vsub : 0;
- uint8_t *dp = dst->data[i] + (x >> hsub) +
- (start_y >> vsub) * dst->linesize[i];
- uint8_t *sp = src->data[i];
- uint8_t *ap = src->data[3];
- int wp = FFALIGN(width, 1<<hsub) >> hsub;
+ int wp = FFALIGN(width, 1<<hsub) >> hsub;
int hp = FFALIGN(height, 1<<vsub) >> vsub;
+ dp = dst->data[i] + (x >> hsub) + (start_y >> vsub) * dst->linesize[i];
+ sp = src->data[i];
+ ap = src->data[3];
if (slice_y > y) {
sp += ((slice_y - y) >> vsub) * src->linesize[i];
ap += (slice_y - y) * src->linesize[3];
--
1.7.4.1
More information about the ffmpeg-devel
mailing list