[FFmpeg-devel] [PATCH 11/15] avfilter/vf_bwdif: Add neon for filter_line
John Cox
jc at kynesim.co.uk
Thu Jun 29 20:57:25 EEST 2023
Signed-off-by: John Cox <jc at kynesim.co.uk>
---
libavfilter/aarch64/vf_bwdif_init_aarch64.c | 21 ++
libavfilter/aarch64/vf_bwdif_neon.S | 215 ++++++++++++++++++++
2 files changed, 236 insertions(+)
diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
index e75cf2f204..21e67884ab 100644
--- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
@@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
int prefs3, int mrefs3, int parity, int clip_max);
+void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max);
+
+
+static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max)
+{
+ const int w0 = clip_max != 255 ? 0 : w & ~15;
+
+ ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1,
+ w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
+
+ if (w0 < w)
+ ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
+ w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
+}
static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
int w, int prefs, int mrefs, int prefs2, int mrefs2,
@@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
return;
s->filter_intra = filter_intra_helper;
+ s->filter_line = filter_line_helper;
s->filter_edge = filter_edge_helper;
}
diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
index a33b235882..675e97d966 100644
--- a/libavfilter/aarch64/vf_bwdif_neon.S
+++ b/libavfilter/aarch64/vf_bwdif_neon.S
@@ -128,6 +128,221 @@ coeffs:
.hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] = v0.h[5]
.hword 5077, 981 // sp[0] = v0.h[6]
+// ===========================================================================
+//
+// void filter_line(
+// void *dst1, // x0
+// void *prev1, // x1
+// void *cur1, // x2
+// void *next1, // x3
+// int w, // w4
+// int prefs, // w5
+// int mrefs, // w6
+// int prefs2, // w7
+// int mrefs2, // [sp, #0]
+// int prefs3, // [sp, #8]
+// int mrefs3, // [sp, #16]
+// int prefs4, // [sp, #24]
+// int mrefs4, // [sp, #32]
+// int parity, // [sp, #40]
+// int clip_max) // [sp, #48]
+
+function ff_bwdif_filter_line_neon, export=1
+ // Sanity check w
+ cmp w4, #0
+ ble 99f
+
+ // Rearrange regs to be the same as line3 for ease of debug!
+ mov w10, w4 // w10 = loop count
+ mov w9, w6 // w9 = mref
+ mov w12, w7 // w12 = pref2
+ mov w11, w5 // w11 = pref
+ ldr w8, [sp, #0] // w8 = mref2
+ ldr w7, [sp, #16] // w7 = mref3
+ ldr w6, [sp, #32] // w6 = mref4
+ ldr w13, [sp, #8] // w13 = pref3
+ ldr w14, [sp, #24] // w14 = pref4
+
+ mov x4, x3
+ mov x3, x2
+ mov x2, x1
+
+// #define prev2 cur
+// const uint8_t * restrict next2 = parity ? prev : next;
+ ldr w17, [sp, #40] // parity
+ cmp w17, #0
+ csel x17, x2, x4, ne
+
+ // We want all the V registers - save all the ones we must
+ stp d14, d15, [sp, #-64]!
+ stp d8, d9, [sp, #48]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #16]
+
+ ldr q0, coeffs
+
+// for (x = 0; x < w; x++) {
+// int diff0, diff2;
+// int d0, d2;
+// int temporal_diff0, temporal_diff2;
+//
+// int i1, i2;
+// int j1, j2;
+// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;
+
+10:
+// c0 = prev2[0] + next2[0]; // c0 = v20, v21
+// d0 = c0 >> 1; // d0 = v10
+// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11
+ ldr q31, [x3]
+ ldr q21, [x17]
+ uhadd v10.16b, v31.16b, v21.16b
+ uabd v11.16b, v31.16b, v21.16b
+ uaddl v20.8h, v21.8b, v31.8b
+ uaddl2 v21.8h, v21.16b, v31.16b
+
+ ldr q31, [x3, w6, SXTW]
+ ldr q23, [x17, w6, SXTW]
+
+// i1 = coef_hf[0] * c0; // i1 = v2-v5
+ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2]
+
+ ldr q30, [x3, w14, SXTW]
+ ldr q25, [x17, w14, SXTW]
+
+// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23
+ uaddl v22.8h, v23.8b, v31.8b
+ uaddl2 v23.8h, v23.16b, v31.16b
+
+// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12
+ uhadd v12.16b, v25.16b, v30.16b
+ uaddl v24.8h, v25.8b, v30.8b
+ uaddl2 v25.8h, v25.16b, v30.16b
+
+// m3 = cur[mrefs3]; // m3 = v20
+ ldr q20, [x3, w7, SXTW]
+
+// p3 = cur[prefs3]; // p3 = v21
+ ldr q21, [x3, w13, SXTW]
+
+// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25)
+ add v22.8h, v22.8h, v24.8h
+ add v23.8h, v23.8h, v25.8h
+ UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4]
+
+ ldr q29, [x3, w8, SXTW]
+ ldr q23, [x17, w8, SXTW]
+
+// i1 -= coef_lf[1] * 4 * (m3 + p3); // -
+ uaddl v30.8h, v20.8b, v21.8b
+ uaddl2 v31.8h, v20.16b, v21.16b
+
+ UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1]
+
+// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13
+ uhadd v13.16b, v23.16b, v29.16b
+ uaddl v22.8h, v23.8b, v29.8b
+ uaddl2 v23.8h, v23.16b, v29.16b
+
+ ldr q31, [x3, w12, SXTW]
+ ldr q27, [x17, w12, SXTW]
+
+// j1 += coef_hf[2] * (m2 + p6); // (-p6:v24,v25)
+ add v24.8h, v24.8h, v22.8h
+ add v25.8h, v25.8h, v23.8h
+ UMLAL4K v6, v7, v8, v9, v24, v25, v0.h[4]
+
+// m1 = cur[mrefs]; // m1 = v24
+ ldr q24, [x3, w9, SXTW]
+
+// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27
+// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14
+// d2 = p2 >> 1; // d2 = v15
+ uabd v14.16b, v31.16b, v27.16b
+ uhadd v15.16b, v31.16b, v27.16b
+ uaddl v26.8h, v27.8b, v31.8b
+ uaddl2 v27.8h, v27.16b, v31.16b
+
+// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*)
+ add v22.8h, v22.8h, v26.8h
+ add v23.8h, v23.8h, v27.8h
+ UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3]
+
+// p1 = cur[prefs]; // p1 = v22
+ ldr q22, [x3, w11, SXTW]
+
+// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17
+ uaddl v18.8h, v22.8b, v24.8b
+ uaddl2 v19.8h, v22.16b, v24.16b
+ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6]
+
+ uaddl v18.8h, v20.8b, v21.8b
+ uaddl2 v19.8h, v20.16b, v21.16b
+ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7]
+
+ SQSHRUNN v17, v28, v29, v30, v31, 13
+
+// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24
+ uaddl v26.8h, v24.8b, v22.8b
+ uaddl2 v27.8h, v24.16b, v22.16b
+ UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0]
+
+ ldr q31, [x2, w9, SXTW]
+ ldr q29, [x4, w9, SXTW]
+
+ ldr q30, [x2, w11, SXTW]
+ ldr q28, [x4, w11, SXTW]
+
+// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5*
+ SQSHRUNN v2, v2, v3, v4, v5, 15
+
+// {
+// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
+// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
+ uabd v30.16b, v22.16b, v30.16b
+ uabd v31.16b, v24.16b, v31.16b
+ uabd v28.16b, v22.16b, v28.16b
+ uabd v29.16b, v24.16b, v29.16b
+ uhadd v31.16b, v31.16b, v30.16b
+ uhadd v29.16b, v29.16b, v28.16b
+
+// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18
+ ushr v18.16b, v11.16b, #1
+ umax v18.16b, v18.16b, v31.16b
+ umax v18.16b, v18.16b, v29.16b
+
+ // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15
+ SPAT_CHECK v18, v13, v24, v10, v22, v15, v31, v30, v29, v28
+
+ // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18
+ INTERPOL v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29
+
+// dst[0] = av_clip_uint8(interpol);
+ str q2, [x0], #16
+// }
+//
+// dst++;
+// cur++;
+// prev++;
+// prev2++;
+// next++;
+// }
+
+ subs w10, w10, #16
+ add x2, x2, #16
+ add x3, x3, #16
+ add x4, x4, #16
+ add x17, x17, #16
+ bgt 10b
+
+ ldp d12, d13, [sp, #16]
+ ldp d10, d11, [sp, #32]
+ ldp d8, d9, [sp, #48]
+ ldp d14, d15, [sp], #64
+99:
+ ret
+endfunc
+
// ============================================================================
//
// void ff_bwdif_filter_edge_neon(
--
2.39.2
More information about the ffmpeg-devel
mailing list