[FFmpeg-devel] [PATCH 11/15] avfilter/vf_bwdif: Add neon for filter_line

Sun Jul 2 13:57:09 EEST 2023

On Sun, 2 Jul 2023 00:44:10 +0300 (EEST), you wrote:

>On Thu, 29 Jun 2023, John Cox wrote:
>
>> Signed-off-by: John Cox <jc at kynesim.co.uk>
>> ---
>> libavfilter/aarch64/vf_bwdif_init_aarch64.c |  21 ++
>> libavfilter/aarch64/vf_bwdif_neon.S         | 215 ++++++++++++++++++++
>> 2 files changed, 236 insertions(+)
>>
>> diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>> index e75cf2f204..21e67884ab 100644
>> --- a/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>> +++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
>> @@ -31,6 +31,26 @@ void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
>> void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
>>                                 int prefs3, int mrefs3, int parity, int clip_max);
>>
>> +void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
>> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
>> +                               int prefs3, int mrefs3, int prefs4, int mrefs4,
>> +                               int parity, int clip_max);
>> +
>> +
>> +static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
>> +                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
>> +                               int prefs3, int mrefs3, int prefs4, int mrefs4,
>> +                               int parity, int clip_max)
>> +{
>> +    const int w0 = clip_max != 255 ? 0 : w & ~15;
>> +
>> +    ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1,
>> +                              w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
>> +
>> +    if (w0 < w)
>> +        ff_bwdif_filter_line_c((char *)dst1 + w0, (char *)prev1 + w0, (char *)cur1 + w0, (char *)next1 + w0,
>> +                               w - w0, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
>> +}
>>
>> static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
>>                                int w, int prefs, int mrefs, int prefs2, int mrefs2,
>> @@ -71,6 +91,7 @@ ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
>>         return;
>>
>>     s->filter_intra = filter_intra_helper;
>> +    s->filter_line  = filter_line_helper;
>>     s->filter_edge  = filter_edge_helper;
>> }
>>
>> diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
>> index a33b235882..675e97d966 100644
>> --- a/libavfilter/aarch64/vf_bwdif_neon.S
>> +++ b/libavfilter/aarch64/vf_bwdif_neon.S
>> @@ -128,6 +128,221 @@ coeffs:
>>         .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], -hf[1] = v0.h[5]
>>         .hword          5077, 981                       // sp[0] = v0.h[6]
>>
>> +// ===========================================================================
>> +//
>> +// void filter_line(
>> +//      void *dst1,     // x0
>> +//      void *prev1,    // x1
>> +//      void *cur1,     // x2
>> +//      void *next1,    // x3
>> +//      int w,          // w4
>> +//      int prefs,      // w5
>> +//      int mrefs,      // w6
>> +//      int prefs2,     // w7
>> +//      int mrefs2,     // [sp, #0]
>> +//      int prefs3,     // [sp, #8]
>> +//      int mrefs3,     // [sp, #16]
>> +//      int prefs4,     // [sp, #24]
>> +//      int mrefs4,     // [sp, #32]
>> +//      int parity,     // [sp, #40]
>> +//      int clip_max)   // [sp, #48]
>> +
>> +function ff_bwdif_filter_line_neon, export=1
>> +        // Sanity check w
>> +        cmp             w4, #0
>> +        ble             99f
>> +
>> +        // Rearrange regs to be the same as line3 for ease of debug!
>> +        mov             w10, w4                         // w10 = loop count
>> +        mov             w9,  w6                         // w9  = mref
>> +        mov             w12, w7                         // w12 = pref2
>> +        mov             w11, w5                         // w11 = pref
>> +        ldr             w8,  [sp, #0]                   // w8 =  mref2
>> +        ldr             w7,  [sp, #16]                  // w7  = mref3
>> +        ldr             w6,  [sp, #32]                  // w6  = mref4
>> +        ldr             w13, [sp, #8]                   // w13 = pref3
>> +        ldr             w14, [sp, #24]                  // w14 = pref4
>
>Btw, remember that you can load two arguments from the stack at once with 
>ldp, e.g. "ldp x8, x13, [sp, #0]". If they're made intptr_t/ptrdiff_t, you 
>won't have an issue with garbage in the upper 32 bits either.

Fair point - I was indeed worrying about garbage in the upper half (and
this is not performance- or size-critical code).

>> +
>> +        mov             x4,  x3
>> +        mov             x3,  x2
>> +        mov             x2,  x1
>> +
>> +// #define prev2 cur
>> +//        const uint8_t * restrict next2 = parity ? prev : next;
>> +        ldr             w17, [sp, #40]                  // parity
>> +        cmp             w17, #0
>> +        csel            x17, x2, x4, ne
>> +
>> +        // We want all the V registers - save all the ones we must
>> +        stp             d14, d15, [sp, #-64]!
>> +        stp             d8,  d9,  [sp, #48]
>> +        stp             d10, d11, [sp, #32]
>> +        stp             d12, d13, [sp, #16]
>
>The order looks a bit weird here even if they end up sequential on the 
>stack. If you'd fill it from the bottom up, e.g.
>
>stp d8, d9, [sp, #-64]!
>stp d10, d11, [sp, #16]
>stp d12, d13, [sp, #32]
>stp d14, d15, [sp, #48]
>
>they're sequential both in code and on the stack.

Sure, I can tweak that.

JC

>// Martin