[FFmpeg-devel] [PATCH 1/6] avcodec/vc1: Arm 64-bit NEON deblocking filter fast paths

Ben Avison bavison at riscosopen.org
Thu Mar 17 20:58:14 EET 2022


Signed-off-by: Ben Avison <bavison at riscosopen.org>
---
 libavcodec/aarch64/Makefile              |   1 +
 libavcodec/aarch64/vc1dsp_init_aarch64.c |  14 +
 libavcodec/aarch64/vc1dsp_neon.S         | 698 +++++++++++++++++++++++
 3 files changed, 713 insertions(+)
 create mode 100644 libavcodec/aarch64/vc1dsp_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..5b25e4dfb9 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -48,6 +48,7 @@ NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
+NEON-OBJS-$(CONFIG_VC1DSP)              += aarch64/vc1dsp_neon.o
 NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
 
 # decoders/encoders
diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index 13dfd74940..edfb296b75 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -25,6 +25,13 @@
 
 #include "config.h"
 
+void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq);
+void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq);
+
 void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
 void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
@@ -39,6 +46,13 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
     int cpu_flags = av_get_cpu_flags();
 
     if (have_neon(cpu_flags)) {
+        dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon;
+        dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
+        dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
+        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_neon;
+        dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
+        dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
+
         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
         dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S
new file mode 100644
index 0000000000..fe8963545a
--- /dev/null
+++ b/libavcodec/aarch64/vc1dsp_neon.S
@@ -0,0 +1,698 @@
+/*
+ * VC1 AArch64 NEON optimisations
+ *
+ * Copyright (c) 2022 Ben Avison <bavison at riscosopen.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.align  5
+.Lcoeffs:
+.quad   0x00050002              // halfword lanes: h[0] = 2, h[1] = 5 (filter coefficients)
+
+// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
+// On entry:
+//   x0 -> top-left pel of lower block
+//   w1 = row stride, bytes
+//   w2 = PQUANT bitstream parameter
+function ff_vc1_v_loop_filter4_neon, export=1
+        sub     x3, x0, w1, sxtw #2
+        sxtw    x1, w1                  // technically, stride is signed int
+        ldr     d0, .Lcoeffs            // d0.h[0] = 2, d0.h[1] = 5
+        ld1     {v1.s}[0], [x0], x1     // P5
+        ld1     {v2.s}[0], [x3], x1     // P1
+        ld1     {v3.s}[0], [x3], x1     // P2
+        ld1     {v4.s}[0], [x0], x1     // P6
+        ld1     {v5.s}[0], [x3], x1     // P3
+        ld1     {v6.s}[0], [x0], x1     // P7
+        ld1     {v7.s}[0], [x3]         // P4
+        ld1     {v16.s}[0], [x0]        // P8
+        ushll   v17.8h, v1.8b, #1       // 2*P5
+        dup     v18.8h, w2              // pq
+        ushll   v2.8h, v2.8b, #1        // 2*P1
+        uxtl    v3.8h, v3.8b            // P2
+        uxtl    v4.8h, v4.8b            // P6
+        uxtl    v19.8h, v5.8b           // P3
+        mls     v2.4h, v3.4h, v0.h[1]   // 2*P1-5*P2
+        uxtl    v3.8h, v6.8b            // P7
+        mls     v17.4h, v4.4h, v0.h[1]  // 2*P5-5*P6
+        ushll   v5.8h, v5.8b, #1        // 2*P3
+        uxtl    v6.8h, v7.8b            // P4
+        mla     v17.4h, v3.4h, v0.h[1]  // 2*P5-5*P6+5*P7
+        uxtl    v3.8h, v16.8b           // P8
+        mla     v2.4h, v19.4h, v0.h[1]  // 2*P1-5*P2+5*P3
+        uxtl    v1.8h, v1.8b            // P5
+        mls     v5.4h, v6.4h, v0.h[1]   // 2*P3-5*P4
+        mls     v17.4h, v3.4h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
+        sub     v3.4h, v6.4h, v1.4h     // P4-P5
+        mls     v2.4h, v6.4h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
+        mla     v5.4h, v1.4h, v0.h[1]   // 2*P3-5*P4+5*P5
+        mls     v5.4h, v4.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
+        abs     v4.4h, v3.4h            // |P4-P5|
+        srshr   v7.4h, v17.4h, #3
+        srshr   v2.4h, v2.4h, #3
+        sshr    v4.4h, v4.4h, #1        // clip
+        srshr   v5.4h, v5.4h, #3
+        abs     v7.4h, v7.4h            // a2
+        sshr    v3.4h, v3.4h, #8        // clip_sign
+        abs     v2.4h, v2.4h            // a1
+        cmeq    v16.4h, v4.4h, #0       // test clip == 0
+        abs     v17.4h, v5.4h           // a0
+        sshr    v5.4h, v5.4h, #8        // a0_sign
+        cmhs    v19.4h, v2.4h, v7.4h    // test a1 >= a2
+        cmhs    v18.4h, v17.4h, v18.4h  // test a0 >= pq
+        sub     v3.4h, v3.4h, v5.4h     // clip_sign - a0_sign
+        bsl     v19.8b, v7.8b, v2.8b    // a3
+        orr     v2.8b, v16.8b, v18.8b   // test clip == 0 || a0 >= pq
+        uqsub   v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+        cmhs    v7.4h, v19.4h, v17.4h   // test a3 >= a0
+        mul     v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
+        orr     v5.8b, v2.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
+        mov     w0, v5.s[1]             // move to gp reg
+        ushr    v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
+        cmhs    v5.4h, v0.4h, v4.4h
+        tbnz    w0, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
+        bsl     v5.8b, v4.8b, v0.8b     // FFMIN(d, clip)
+        bic     v0.8b, v5.8b, v2.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
+        mls     v6.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
+        mla     v1.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
+        sqxtun  v0.8b, v6.8h            // saturating narrow updated P4 to u8
+        sqxtun  v1.8b, v1.8h            // saturating narrow updated P5 to u8
+        st1     {v0.s}[0], [x3], x1
+        st1     {v1.s}[0], [x3]
+1:      ret
+endfunc
+
+// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
+// On entry:
+//   x0 -> top-left pel of right block
+//   w1 = row stride, bytes
+//   w2 = PQUANT bitstream parameter
+function ff_vc1_h_loop_filter4_neon, export=1
+        sub     x3, x0, #4              // where to start reading
+        sxtw    x1, w1                  // technically, stride is signed int
+        ldr     d0, .Lcoeffs            // d0.h[0] = 2, d0.h[1] = 5
+        ld1     {v1.8b}, [x3], x1
+        sub     x0, x0, #1              // where to start writing
+        ld1     {v2.8b}, [x3], x1
+        ld1     {v3.8b}, [x3], x1
+        ld1     {v4.8b}, [x3]
+        dup     v5.8h, w2               // pq
+        trn1    v6.8b, v1.8b, v2.8b
+        trn2    v1.8b, v1.8b, v2.8b
+        trn1    v2.8b, v3.8b, v4.8b
+        trn2    v3.8b, v3.8b, v4.8b
+        trn1    v4.4h, v6.4h, v2.4h     // P1, P5
+        trn1    v7.4h, v1.4h, v3.4h     // P2, P6
+        trn2    v2.4h, v6.4h, v2.4h     // P3, P7
+        trn2    v1.4h, v1.4h, v3.4h     // P4, P8
+        ushll   v3.8h, v4.8b, #1        // 2*P1, 2*P5
+        uxtl    v6.8h, v7.8b            // P2, P6
+        uxtl    v7.8h, v2.8b            // P3, P7
+        uxtl    v1.8h, v1.8b            // P4, P8
+        mls     v3.8h, v6.8h, v0.h[1]   // 2*P1-5*P2, 2*P5-5*P6
+        ushll   v2.8h, v2.8b, #1        // 2*P3, 2*P7
+        uxtl    v4.8h, v4.8b            // P1, P5
+        mla     v3.8h, v7.8h, v0.h[1]   // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
+        mov     d6, v6.d[1]             // P6
+        mls     v3.8h, v1.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
+        mov     d4, v4.d[1]             // P5
+        mls     v2.4h, v1.4h, v0.h[1]   // 2*P3-5*P4
+        mla     v2.4h, v4.4h, v0.h[1]   // 2*P3-5*P4+5*P5
+        sub     v7.4h, v1.4h, v4.4h     // P4-P5
+        mls     v2.4h, v6.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
+        srshr   v3.8h, v3.8h, #3
+        abs     v6.4h, v7.4h            // |P4-P5|
+        sshr    v7.4h, v7.4h, #8        // clip_sign
+        srshr   v2.4h, v2.4h, #3
+        abs     v3.8h, v3.8h            // a1, a2
+        sshr    v6.4h, v6.4h, #1        // clip
+        mov     d16, v3.d[1]            // a2
+        abs     v17.4h, v2.4h           // a0
+        cmeq    v18.4h, v6.4h, #0       // test clip == 0
+        sshr    v2.4h, v2.4h, #8        // a0_sign
+        cmhs    v19.4h, v3.4h, v16.4h   // test a1 >= a2
+        cmhs    v5.4h, v17.4h, v5.4h    // test a0 >= pq
+        sub     v2.4h, v7.4h, v2.4h     // clip_sign - a0_sign
+        bsl     v19.8b, v16.8b, v3.8b   // a3
+        orr     v3.8b, v18.8b, v5.8b    // test clip == 0 || a0 >= pq
+        uqsub   v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+        cmhs    v7.4h, v19.4h, v17.4h   // test a3 >= a0
+        mul     v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
+        orr     v5.8b, v3.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
+        mov     w2, v5.s[1]             // move to gp reg
+        ushr    v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
+        cmhs    v5.4h, v0.4h, v6.4h
+        tbnz    w2, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
+        bsl     v5.8b, v6.8b, v0.8b     // FFMIN(d, clip)
+        bic     v0.8b, v5.8b, v3.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
+        mla     v4.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
+        mls     v1.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
+        sqxtun  v3.8b, v4.8h            // saturating narrow updated P5 to u8
+        sqxtun  v2.8b, v1.8h            // saturating narrow updated P4 to u8
+        st2     {v2.b, v3.b}[0], [x0], x1
+        st2     {v2.b, v3.b}[1], [x0], x1
+        st2     {v2.b, v3.b}[2], [x0], x1
+        st2     {v2.b, v3.b}[3], [x0]
+1:      ret
+endfunc
+
+// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
+// On entry:
+//   x0 -> top-left pel of lower block
+//   w1 = row stride, bytes
+//   w2 = PQUANT bitstream parameter
+function ff_vc1_v_loop_filter8_neon, export=1
+        sub     x3, x0, w1, sxtw #2
+        sxtw    x1, w1                  // technically, stride is signed int
+        ldr     d0, .Lcoeffs            // d0.h[0] = 2, d0.h[1] = 5
+        ld1     {v1.8b}, [x0], x1       // P5
+        movi    v2.2d, #0x0000ffff00000000 // mask selecting 2nd halfword of each doubleword
+        ld1     {v3.8b}, [x3], x1       // P1
+        ld1     {v4.8b}, [x3], x1       // P2
+        ld1     {v5.8b}, [x0], x1       // P6
+        ld1     {v6.8b}, [x3], x1       // P3
+        ld1     {v7.8b}, [x0], x1       // P7
+        ushll   v16.8h, v1.8b, #1       // 2*P5
+        ushll   v3.8h, v3.8b, #1        // 2*P1
+        ld1     {v17.8b}, [x3]          // P4
+        uxtl    v4.8h, v4.8b            // P2
+        ld1     {v18.8b}, [x0]          // P8
+        uxtl    v5.8h, v5.8b            // P6
+        dup     v19.8h, w2              // pq
+        uxtl    v20.8h, v6.8b           // P3
+        mls     v3.8h, v4.8h, v0.h[1]   // 2*P1-5*P2
+        uxtl    v4.8h, v7.8b            // P7
+        ushll   v6.8h, v6.8b, #1        // 2*P3
+        mls     v16.8h, v5.8h, v0.h[1]  // 2*P5-5*P6
+        uxtl    v7.8h, v17.8b           // P4
+        uxtl    v17.8h, v18.8b          // P8
+        mla     v16.8h, v4.8h, v0.h[1]  // 2*P5-5*P6+5*P7
+        uxtl    v1.8h, v1.8b            // P5
+        mla     v3.8h, v20.8h, v0.h[1]  // 2*P1-5*P2+5*P3
+        sub     v4.8h, v7.8h, v1.8h     // P4-P5
+        mls     v6.8h, v7.8h, v0.h[1]   // 2*P3-5*P4
+        mls     v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
+        abs     v17.8h, v4.8h           // |P4-P5|
+        sshr    v4.8h, v4.8h, #8        // clip_sign
+        mls     v3.8h, v7.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
+        sshr    v17.8h, v17.8h, #1      // clip
+        mla     v6.8h, v1.8h, v0.h[1]   // 2*P3-5*P4+5*P5
+        srshr   v16.8h, v16.8h, #3
+        mls     v6.8h, v5.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
+        cmeq    v5.8h, v17.8h, #0       // test clip == 0
+        srshr   v3.8h, v3.8h, #3
+        abs     v16.8h, v16.8h          // a2
+        abs     v3.8h, v3.8h            // a1
+        srshr   v6.8h, v6.8h, #3
+        cmhs    v18.8h, v3.8h, v16.8h   // test a1 >= a2
+        abs     v20.8h, v6.8h           // a0
+        sshr    v6.8h, v6.8h, #8        // a0_sign
+        bsl     v18.16b, v16.16b, v3.16b // a3
+        cmhs    v3.8h, v20.8h, v19.8h   // test a0 >= pq
+        sub     v4.8h, v4.8h, v6.8h     // clip_sign - a0_sign
+        uqsub   v6.8h, v20.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+        cmhs    v16.8h, v18.8h, v20.8h  // test a3 >= a0
+        orr     v3.16b, v5.16b, v3.16b  // test clip == 0 || a0 >= pq
+        mul     v0.8h, v6.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
+        orr     v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0
+        cmtst   v2.2d, v5.2d, v2.2d     // if 2nd of each group of 4 is not filtered, then none of the others in the group should be either
+        mov     w0, v5.s[1]             // move to gp reg
+        ushr    v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
+        mov     w2, v5.s[3]
+        orr     v2.16b, v3.16b, v2.16b
+        cmhs    v3.8h, v0.8h, v17.8h
+        and     w0, w0, w2
+        bsl     v3.16b, v17.16b, v0.16b // FFMIN(d, clip)
+        tbnz    w0, #0, 1f              // none of the 8 pixel pairs should be updated in this case
+        bic     v0.16b, v3.16b, v2.16b  // set each d to zero if it should not be filtered
+        mls     v7.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
+        mla     v1.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
+        sqxtun  v0.8b, v7.8h            // saturating narrow updated P4 to u8
+        sqxtun  v1.8b, v1.8h            // saturating narrow updated P5 to u8
+        st1     {v0.8b}, [x3], x1
+        st1     {v1.8b}, [x3]
+1:      ret
+endfunc
+
+// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
+// On entry:
+//   x0 -> top-left pel of right block
+//   w1 = row stride, bytes
+//   w2 = PQUANT bitstream parameter
+function ff_vc1_h_loop_filter8_neon, export=1
+        sub     x3, x0, #4              // where to start reading
+        sxtw    x1, w1                  // technically, stride is signed int
+        ldr     d0, .Lcoeffs            // d0.h[0] = 2, d0.h[1] = 5
+        ld1     {v1.8b}, [x3], x1       // P1[0], P2[0]...
+        sub     x0, x0, #1              // where to start writing
+        ld1     {v2.8b}, [x3], x1
+        add     x4, x0, x1, lsl #2      // x4 -> second group of 4 output rows
+        ld1     {v3.8b}, [x3], x1
+        ld1     {v4.8b}, [x3], x1
+        ld1     {v5.8b}, [x3], x1
+        ld1     {v6.8b}, [x3], x1
+        ld1     {v7.8b}, [x3], x1
+        trn1    v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
+        ld1     {v17.8b}, [x3]
+        trn2    v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
+        trn1    v2.8b, v3.8b, v4.8b     // P1[2], P1[3], P3[2]...
+        trn2    v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
+        dup     v4.8h, w2               // pq
+        trn1    v18.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
+        trn2    v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
+        trn1    v6.4h, v16.4h, v2.4h    // P1[0], P1[1], P1[2], P1[3], P5[0]...
+        trn1    v19.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
+        trn1    v20.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
+        trn2    v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
+        trn2    v2.4h, v16.4h, v2.4h    // P3[0], P3[1], P3[2], P3[3], P7[0]...
+        trn2    v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
+        trn1    v3.4h, v18.4h, v20.4h   // P1[4], P1[5], P1[6], P1[7], P5[4]...
+        trn1    v16.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
+        trn2    v17.4h, v18.4h, v20.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
+        trn2    v5.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
+        trn1    v7.2s, v6.2s, v3.2s     // P1
+        trn1    v18.2s, v19.2s, v16.2s  // P2
+        trn2    v3.2s, v6.2s, v3.2s     // P5
+        trn2    v6.2s, v19.2s, v16.2s   // P6
+        trn1    v16.2s, v2.2s, v17.2s   // P3
+        trn2    v2.2s, v2.2s, v17.2s    // P7
+        ushll   v7.8h, v7.8b, #1        // 2*P1
+        trn1    v17.2s, v1.2s, v5.2s    // P4
+        ushll   v19.8h, v3.8b, #1       // 2*P5
+        trn2    v1.2s, v1.2s, v5.2s     // P8
+        uxtl    v5.8h, v18.8b           // P2
+        uxtl    v6.8h, v6.8b            // P6
+        uxtl    v18.8h, v16.8b          // P3
+        mls     v7.8h, v5.8h, v0.h[1]   // 2*P1-5*P2
+        uxtl    v2.8h, v2.8b            // P7
+        ushll   v5.8h, v16.8b, #1       // 2*P3
+        mls     v19.8h, v6.8h, v0.h[1]  // 2*P5-5*P6
+        uxtl    v16.8h, v17.8b          // P4
+        uxtl    v1.8h, v1.8b            // P8
+        mla     v19.8h, v2.8h, v0.h[1]  // 2*P5-5*P6+5*P7
+        uxtl    v2.8h, v3.8b            // P5
+        mla     v7.8h, v18.8h, v0.h[1]  // 2*P1-5*P2+5*P3
+        sub     v3.8h, v16.8h, v2.8h    // P4-P5
+        mls     v5.8h, v16.8h, v0.h[1]  // 2*P3-5*P4
+        mls     v19.8h, v1.8h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
+        abs     v1.8h, v3.8h            // |P4-P5|
+        sshr    v3.8h, v3.8h, #8        // clip_sign
+        mls     v7.8h, v16.8h, v0.h[0]  // 2*P1-5*P2+5*P3-2*P4
+        sshr    v1.8h, v1.8h, #1        // clip
+        mla     v5.8h, v2.8h, v0.h[1]   // 2*P3-5*P4+5*P5
+        srshr   v17.8h, v19.8h, #3
+        mls     v5.8h, v6.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
+        cmeq    v6.8h, v1.8h, #0        // test clip == 0
+        srshr   v7.8h, v7.8h, #3
+        abs     v17.8h, v17.8h          // a2
+        abs     v7.8h, v7.8h            // a1
+        srshr   v5.8h, v5.8h, #3
+        cmhs    v18.8h, v7.8h, v17.8h   // test a1 >= a2
+        abs     v19.8h, v5.8h           // a0
+        sshr    v5.8h, v5.8h, #8        // a0_sign
+        bsl     v18.16b, v17.16b, v7.16b // a3
+        cmhs    v4.8h, v19.8h, v4.8h    // test a0 >= pq
+        sub     v3.8h, v3.8h, v5.8h     // clip_sign - a0_sign
+        uqsub   v5.8h, v19.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+        cmhs    v7.8h, v18.8h, v19.8h   // test a3 >= a0
+        orr     v4.16b, v6.16b, v4.16b  // test clip == 0 || a0 >= pq
+        mul     v0.8h, v5.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
+        orr     v5.16b, v4.16b, v7.16b  // test clip == 0 || a0 >= pq || a3 >= a0
+        mov     w2, v5.s[1]             // move to gp reg
+        ushr    v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
+        mov     w3, v5.s[3]
+        cmhs    v5.8h, v0.8h, v1.8h
+        and     w5, w2, w3
+        bsl     v5.16b, v1.16b, v0.16b  // FFMIN(d, clip)
+        tbnz    w5, #0, 2f              // none of the 8 pixel pairs should be updated in this case
+        bic     v0.16b, v5.16b, v4.16b  // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
+        mla     v2.8h, v0.8h, v3.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
+        mls     v16.8h, v0.8h, v3.8h    // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
+        sqxtun  v1.8b, v2.8h            // saturating narrow updated P5 to u8
+        sqxtun  v0.8b, v16.8h           // saturating narrow updated P4 to u8
+        tbnz    w2, #0, 1f              // none of the first 4 pixel pairs should be updated if so
+        st2     {v0.b, v1.b}[0], [x0], x1
+        st2     {v0.b, v1.b}[1], [x0], x1
+        st2     {v0.b, v1.b}[2], [x0], x1
+        st2     {v0.b, v1.b}[3], [x0]
+1:      tbnz    w3, #0, 2f              // none of the second 4 pixel pairs should be updated if so
+        st2     {v0.b, v1.b}[4], [x4], x1
+        st2     {v0.b, v1.b}[5], [x4], x1
+        st2     {v0.b, v1.b}[6], [x4], x1
+        st2     {v0.b, v1.b}[7], [x4]
+2:      ret
+endfunc
+
+// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
+// On entry:
+//   x0 -> top-left pel of lower block
+//   w1 = row stride, bytes
+//   w2 = PQUANT bitstream parameter
+function ff_vc1_v_loop_filter16_neon, export=1
+        sub     x3, x0, w1, sxtw #2
+        sxtw    x1, w1                  // technically, stride is signed int
+        ldr     d0, .Lcoeffs            // d0.h[0] = 2, d0.h[1] = 5
+        ld1     {v1.16b}, [x0], x1      // P5
+        movi    v2.2d, #0x0000ffff00000000 // mask selecting 2nd halfword of each doubleword
+        ld1     {v3.16b}, [x3], x1      // P1
+        ld1     {v4.16b}, [x3], x1      // P2
+        ld1     {v5.16b}, [x0], x1      // P6
+        ld1     {v6.16b}, [x3], x1      // P3
+        ld1     {v7.16b}, [x0], x1      // P7
+        ushll   v16.8h, v1.8b, #1       // 2*P5[0..7]
+        ushll   v17.8h, v3.8b, #1       // 2*P1[0..7]
+        ld1     {v18.16b}, [x3]         // P4
+        uxtl    v19.8h, v4.8b           // P2[0..7]
+        ld1     {v20.16b}, [x0]         // P8
+        uxtl    v21.8h, v5.8b           // P6[0..7]
+        dup     v22.8h, w2              // pq
+        ushll2  v3.8h, v3.16b, #1       // 2*P1[8..15]
+        mls     v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]
+        ushll2  v19.8h, v1.16b, #1      // 2*P5[8..15]
+        uxtl2   v4.8h, v4.16b           // P2[8..15]
+        mls     v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
+        uxtl2   v5.8h, v5.16b           // P6[8..15]
+        uxtl    v23.8h, v6.8b           // P3[0..7]
+        uxtl    v24.8h, v7.8b           // P7[0..7]
+        mls     v3.8h, v4.8h, v0.h[1]   // 2*P1[8..15]-5*P2[8..15]
+        ushll   v4.8h, v6.8b, #1        // 2*P3[0..7]
+        uxtl    v25.8h, v18.8b          // P4[0..7]
+        mls     v19.8h, v5.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]
+        uxtl2   v26.8h, v6.16b          // P3[8..15]
+        mla     v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
+        uxtl2   v7.8h, v7.16b           // P7[8..15]
+        ushll2  v6.8h, v6.16b, #1       // 2*P3[8..15]
+        mla     v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
+        uxtl2   v18.8h, v18.16b         // P4[8..15]
+        uxtl    v23.8h, v20.8b          // P8[0..7]
+        mls     v4.8h, v25.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
+        uxtl    v24.8h, v1.8b           // P5[0..7]
+        uxtl2   v20.8h, v20.16b         // P8[8..15]
+        mla     v3.8h, v26.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
+        uxtl2   v1.8h, v1.16b           // P5[8..15]
+        sub     v26.8h, v25.8h, v24.8h  // P4[0..7]-P5[0..7]
+        mla     v19.8h, v7.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
+        sub     v7.8h, v18.8h, v1.8h    // P4[8..15]-P5[8..15]
+        mls     v6.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]
+        abs     v27.8h, v26.8h          // |P4[0..7]-P5[0..7]|
+        sshr    v26.8h, v26.8h, #8      // clip_sign[0..7]
+        mls     v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
+        abs     v28.8h, v7.8h           // |P4[8..15]-P5[8..15]|
+        sshr    v27.8h, v27.8h, #1      // clip[0..7]
+        mls     v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
+        sshr    v7.8h, v7.8h, #8        // clip_sign[8..15]
+        sshr    v23.8h, v28.8h, #1      // clip[8..15]
+        mla     v4.8h, v24.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
+        cmeq    v28.8h, v27.8h, #0      // test clip[0..7] == 0
+        srshr   v17.8h, v17.8h, #3
+        mls     v3.8h, v18.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
+        cmeq    v29.8h, v23.8h, #0      // test clip[8..15] == 0
+        srshr   v16.8h, v16.8h, #3
+        mls     v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
+        abs     v17.8h, v17.8h          // a1[0..7]
+        mla     v6.8h, v1.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
+        srshr   v3.8h, v3.8h, #3
+        mls     v4.8h, v21.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
+        abs     v16.8h, v16.8h          // a2[0..7]
+        srshr   v19.8h, v19.8h, #3
+        mls     v6.8h, v5.8h, v0.h[0]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
+        cmhs    v5.8h, v17.8h, v16.8h   // test a1[0..7] >= a2[0..7]
+        abs     v3.8h, v3.8h            // a1[8..15]
+        srshr   v4.8h, v4.8h, #3
+        abs     v19.8h, v19.8h          // a2[8..15]
+        bsl     v5.16b, v16.16b, v17.16b // a3[0..7]
+        srshr   v6.8h, v6.8h, #3
+        cmhs    v16.8h, v3.8h, v19.8h   // test a1[8..15] >= a2[8..15]
+        abs     v17.8h, v4.8h           // a0[0..7]
+        sshr    v4.8h, v4.8h, #8        // a0_sign[0..7]
+        bsl     v16.16b, v19.16b, v3.16b // a3[8..15]
+        uqsub   v3.8h, v17.8h, v5.8h    // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+        abs     v19.8h, v6.8h           // a0[8..15]
+        cmhs    v20.8h, v17.8h, v22.8h  // test a0[0..7] >= pq
+        cmhs    v5.8h, v5.8h, v17.8h    // test a3[0..7] >= a0[0..7]
+        sub     v4.8h, v26.8h, v4.8h    // clip_sign[0..7] - a0_sign[0..7]
+        sshr    v6.8h, v6.8h, #8        // a0_sign[8..15]
+        mul     v3.8h, v3.8h, v0.h[1]   // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
+        uqsub   v17.8h, v19.8h, v16.8h  // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+        orr     v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq
+        cmhs    v21.8h, v19.8h, v22.8h  // test a0[8..15] >= pq
+        cmhs    v16.8h, v16.8h, v19.8h  // test a3[8..15] >= a0[8..15]
+        mul     v0.8h, v17.8h, v0.h[1]  // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
+        sub     v6.8h, v7.8h, v6.8h     // clip_sign[8..15] - a0_sign[8..15]
+        orr     v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
+        ushr    v3.8h, v3.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
+        orr     v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq
+        cmtst   v17.2d, v5.2d, v2.2d    // if 2nd of each group of 4 is not filtered, then none of the others in the group should be either
+        mov     w0, v5.s[1]             // move to gp reg
+        cmhs    v19.8h, v3.8h, v27.8h
+        ushr    v0.8h, v0.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
+        mov     w2, v5.s[3]
+        orr     v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
+        orr     v16.16b, v20.16b, v17.16b
+        bsl     v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7])
+        cmtst   v2.2d, v5.2d, v2.2d
+        cmhs    v3.8h, v0.8h, v23.8h
+        mov     w4, v5.s[1]
+        mov     w5, v5.s[3]
+        and     w0, w0, w2
+        bic     v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
+        orr     v2.16b, v7.16b, v2.16b
+        bsl     v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15])
+        mls     v25.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
+        and     w2, w4, w5
+        bic     v0.16b, v3.16b, v2.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
+        mla     v24.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
+        and     w0, w0, w2
+        mls     v18.8h, v0.8h, v6.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
+        sqxtun  v2.8b, v25.8h
+        tbnz    w0, #0, 1f              // none of the 16 pixel pairs should be updated in this case
+        mla     v1.8h, v0.8h, v6.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
+        sqxtun  v0.8b, v24.8h
+        sqxtun2 v2.16b, v18.8h
+        sqxtun2 v0.16b, v1.8h
+        st1     {v2.16b}, [x3], x1
+        st1     {v0.16b}, [x3]
+1:      ret
+endfunc
+
+// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
+// On entry:
+//   x0 -> top-left pel of right block
+//   w1 = row stride, bytes
+//   w2 = PQUANT bitstream parameter
+function ff_vc1_h_loop_filter16_neon, export=1
+        sub     x3, x0, #4              // where to start reading
+        sxtw    x1, w1                  // technically, stride is signed int
+        ldr     d0, .Lcoeffs
+        ld1     {v1.8b}, [x3], x1       // P1[0], P2[0]...
+        sub     x0, x0, #1              // where to start writing
+        ld1     {v2.8b}, [x3], x1
+        add     x4, x0, x1, lsl #3
+        ld1     {v3.8b}, [x3], x1
+        add     x5, x0, x1, lsl #2
+        ld1     {v4.8b}, [x3], x1
+        add     x6, x4, x1, lsl #2
+        ld1     {v5.8b}, [x3], x1
+        ld1     {v6.8b}, [x3], x1
+        ld1     {v7.8b}, [x3], x1
+        trn1    v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
+        ld1     {v17.8b}, [x3], x1
+        trn2    v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
+        ld1     {v2.8b}, [x3], x1
+        trn1    v18.8b, v3.8b, v4.8b    // P1[2], P1[3], P3[2]...
+        ld1     {v19.8b}, [x3], x1
+        trn2    v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
+        ld1     {v4.8b}, [x3], x1
+        trn1    v20.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
+        ld1     {v21.8b}, [x3], x1
+        trn2    v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
+        ld1     {v6.8b}, [x3], x1
+        trn1    v22.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
+        ld1     {v23.8b}, [x3], x1
+        trn2    v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
+        ld1     {v17.8b}, [x3], x1
+        trn1    v24.8b, v2.8b, v19.8b   // P1[8], P1[9], P3[8]...
+        ld1     {v25.8b}, [x3]
+        trn2    v2.8b, v2.8b, v19.8b    // P2[8], P2[9], P4[8]...
+        trn1    v19.4h, v16.4h, v18.4h  // P1[0], P1[1], P1[2], P1[3], P5[0]...
+        trn1    v26.8b, v4.8b, v21.8b   // P1[10], P1[11], P3[10]...
+        trn2    v4.8b, v4.8b, v21.8b    // P2[10], P2[11], P4[10]...
+        trn1    v21.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
+        trn1    v27.4h, v20.4h, v22.4h  // P1[4], P1[5], P1[6], P1[7], P5[4]...
+        trn1    v28.8b, v6.8b, v23.8b   // P1[12], P1[13], P3[12]...
+        trn2    v6.8b, v6.8b, v23.8b    // P2[12], P2[13], P4[12]...
+        trn1    v23.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
+        trn1    v29.4h, v24.4h, v26.4h  // P1[8], P1[9], P1[10], P1[11], P5[8]...
+        trn1    v30.8b, v17.8b, v25.8b  // P1[14], P1[15], P3[14]...
+        trn2    v17.8b, v17.8b, v25.8b  // P2[14], P2[15], P4[14]...
+        trn1    v25.4h, v2.4h, v4.4h    // P2[8], P2[9], P2[10], P2[11], P6[8]...
+        trn1    v31.2s, v19.2s, v27.2s  // P1[0..7]
+        trn2    v19.2s, v19.2s, v27.2s  // P5[0..7]
+        trn1    v27.2s, v21.2s, v23.2s  // P2[0..7]
+        trn2    v21.2s, v21.2s, v23.2s  // P6[0..7]
+        trn1    v23.4h, v28.4h, v30.4h  // P1[12], P1[13], P1[14], P1[15], P5[12]...
+        trn2    v16.4h, v16.4h, v18.4h  // P3[0], P3[1], P3[2], P3[3], P7[0]...
+        trn1    v18.4h, v6.4h, v17.4h   // P2[12], P2[13], P2[14], P2[15], P6[12]...
+        trn2    v20.4h, v20.4h, v22.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
+        trn2    v22.4h, v24.4h, v26.4h  // P3[8], P3[9], P3[10], P3[11], P7[8]...
+        trn1    v24.2s, v29.2s, v23.2s  // P1[8..15]
+        trn2    v23.2s, v29.2s, v23.2s  // P5[8..15]
+        trn1    v26.2s, v25.2s, v18.2s  // P2[8..15]
+        trn2    v18.2s, v25.2s, v18.2s  // P6[8..15]
+        trn2    v25.4h, v28.4h, v30.4h  // P3[12], P3[13], P3[14], P3[15], P7[12]...
+        trn2    v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
+        trn2    v3.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
+        trn2    v2.4h, v2.4h, v4.4h     // P4[8], P4[9], P4[10], P4[11], P8[8]...
+        trn2    v4.4h, v6.4h, v17.4h    // P4[12], P4[13], P4[14], P4[15], P8[12]...
+        ushll   v5.8h, v31.8b, #1       // 2*P1[0..7]
+        ushll   v6.8h, v19.8b, #1       // 2*P5[0..7]
+        trn1    v7.2s, v16.2s, v20.2s   // P3[0..7]
+        uxtl    v17.8h, v27.8b          // P2[0..7]
+        trn2    v16.2s, v16.2s, v20.2s  // P7[0..7]
+        uxtl    v20.8h, v21.8b          // P6[0..7]
+        trn1    v21.2s, v22.2s, v25.2s  // P3[8..15]
+        ushll   v24.8h, v24.8b, #1      // 2*P1[8..15]
+        trn2    v22.2s, v22.2s, v25.2s  // P7[8..15]
+        ushll   v25.8h, v23.8b, #1      // 2*P5[8..15]
+        trn1    v27.2s, v1.2s, v3.2s    // P4[0..7]
+        uxtl    v26.8h, v26.8b          // P2[8..15]
+        mls     v5.8h, v17.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]
+        uxtl    v17.8h, v18.8b          // P6[8..15]
+        mls     v6.8h, v20.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]
+        trn1    v18.2s, v2.2s, v4.2s    // P4[8..15]
+        uxtl    v28.8h, v7.8b           // P3[0..7]
+        mls     v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
+        uxtl    v16.8h, v16.8b          // P7[0..7]
+        uxtl    v26.8h, v21.8b          // P3[8..15]
+        mls     v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]
+        uxtl    v22.8h, v22.8b          // P7[8..15]
+        ushll   v7.8h, v7.8b, #1        // 2*P3[0..7]
+        uxtl    v27.8h, v27.8b          // P4[0..7]
+        trn2    v1.2s, v1.2s, v3.2s     // P8[0..7]
+        ushll   v3.8h, v21.8b, #1       // 2*P3[8..15]
+        trn2    v2.2s, v2.2s, v4.2s     // P8[8..15]
+        uxtl    v4.8h, v18.8b           // P4[8..15]
+        mla     v5.8h, v28.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
+        uxtl    v1.8h, v1.8b            // P8[0..7]
+        mla     v6.8h, v16.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
+        uxtl    v2.8h, v2.8b            // P8[8..15]
+        uxtl    v16.8h, v19.8b          // P5[0..7]
+        mla     v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
+        uxtl    v18.8h, v23.8b          // P5[8..15]
+        dup     v19.8h, w2              // pq
+        mla     v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
+        sub     v21.8h, v27.8h, v16.8h  // P4[0..7]-P5[0..7]
+        sub     v22.8h, v4.8h, v18.8h   // P4[8..15]-P5[8..15]
+        mls     v7.8h, v27.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
+        abs     v23.8h, v21.8h
+        mls     v3.8h, v4.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]
+        abs     v26.8h, v22.8h
+        sshr    v21.8h, v21.8h, #8      // clip_sign[0..7]
+        mls     v5.8h, v27.8h, v0.h[0]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
+        sshr    v23.8h, v23.8h, #1      // clip[0..7]
+        sshr    v26.8h, v26.8h, #1      // clip[8..15]
+        mls     v6.8h, v1.8h, v0.h[0]   // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
+        sshr    v1.8h, v22.8h, #8       // clip_sign[8..15]
+        cmeq    v22.8h, v23.8h, #0      // test clip[0..7] == 0
+        mls     v24.8h, v4.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
+        cmeq    v28.8h, v26.8h, #0      // test clip[8..15] == 0
+        srshr   v5.8h, v5.8h, #3
+        mls     v25.8h, v2.8h, v0.h[0]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
+        srshr   v2.8h, v6.8h, #3
+        mla     v7.8h, v16.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
+        srshr   v6.8h, v24.8h, #3
+        mla     v3.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
+        abs     v5.8h, v5.8h            // a1[0..7]
+        srshr   v24.8h, v25.8h, #3
+        mls     v3.8h, v17.8h, v0.h[0]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
+        abs     v2.8h, v2.8h            // a2[0..7]
+        abs     v6.8h, v6.8h            // a1[8..15]
+        mls     v7.8h, v20.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
+        abs     v17.8h, v24.8h          // a2[8..15]
+        cmhs    v20.8h, v5.8h, v2.8h    // test a1[0..7] >= a2[0..7]
+        srshr   v3.8h, v3.8h, #3
+        cmhs    v24.8h, v6.8h, v17.8h   // test a1[8..15] >= a2[8.15]
+        srshr   v7.8h, v7.8h, #3
+        bsl     v20.16b, v2.16b, v5.16b // a3[0..7]
+        abs     v2.8h, v3.8h            // a0[8..15]
+        sshr    v3.8h, v3.8h, #8        // a0_sign[8..15]
+        bsl     v24.16b, v17.16b, v6.16b // a3[8..15]
+        abs     v5.8h, v7.8h            // a0[0..7]
+        sshr    v6.8h, v7.8h, #8        // a0_sign[0..7]
+        cmhs    v7.8h, v2.8h, v19.8h    // test a0[8..15] >= pq
+        sub     v1.8h, v1.8h, v3.8h     // clip_sign[8..15] - a0_sign[8..15]
+        uqsub   v3.8h, v2.8h, v24.8h    // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+        cmhs    v2.8h, v24.8h, v2.8h    // test a3[8..15] >= a0[8..15]
+        uqsub   v17.8h, v5.8h, v20.8h   // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
+        cmhs    v19.8h, v5.8h, v19.8h   // test a0[0..7] >= pq
+        orr     v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq
+        sub     v6.8h, v21.8h, v6.8h    // clip_sign[0..7] - a0_sign[0..7]
+        mul     v3.8h, v3.8h, v0.h[1]   // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
+        cmhs    v5.8h, v20.8h, v5.8h    // test a3[0..7] >= a0[0..7]
+        orr     v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq
+        mul     v0.8h, v17.8h, v0.h[1]  // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
+        orr     v2.16b, v7.16b, v2.16b  // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
+        orr     v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
+        ushr    v3.8h, v3.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
+        mov     w7, v2.s[1]
+        mov     w8, v2.s[3]
+        ushr    v0.8h, v0.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
+        mov     w2, v5.s[1]             // move to gp reg
+        cmhs    v2.8h, v3.8h, v26.8h
+        mov     w3, v5.s[3]
+        cmhs    v5.8h, v0.8h, v23.8h
+        bsl     v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15])
+        and     w9, w7, w8
+        bsl     v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7])
+        and     w10, w2, w3
+        bic     v0.16b, v2.16b, v7.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
+        and     w9, w10, w9
+        bic     v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
+        mls     v4.8h, v0.8h, v1.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
+        tbnz    w9, #0, 4f              // none of the 16 pixel pairs should be updated in this case
+        mls     v27.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
+        mla     v16.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
+        sqxtun  v2.8b, v4.8h
+        mla     v18.8h, v0.8h, v1.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
+        sqxtun  v0.8b, v27.8h
+        sqxtun  v1.8b, v16.8h
+        sqxtun  v3.8b, v18.8h
+        tbnz    w2, #0, 1f
+        st2     {v0.b, v1.b}[0], [x0], x1
+        st2     {v0.b, v1.b}[1], [x0], x1
+        st2     {v0.b, v1.b}[2], [x0], x1
+        st2     {v0.b, v1.b}[3], [x0]
+1:      tbnz    w3, #0, 2f
+        st2     {v0.b, v1.b}[4], [x5], x1
+        st2     {v0.b, v1.b}[5], [x5], x1
+        st2     {v0.b, v1.b}[6], [x5], x1
+        st2     {v0.b, v1.b}[7], [x5]
+2:      tbnz    w7, #0, 3f
+        st2     {v2.b, v3.b}[0], [x4], x1
+        st2     {v2.b, v3.b}[1], [x4], x1
+        st2     {v2.b, v3.b}[2], [x4], x1
+        st2     {v2.b, v3.b}[3], [x4]
+3:      tbnz    w8, #0, 4f
+        st2     {v2.b, v3.b}[4], [x6], x1
+        st2     {v2.b, v3.b}[5], [x6], x1
+        st2     {v2.b, v3.b}[6], [x6], x1
+        st2     {v2.b, v3.b}[7], [x6]
+4:      ret
+endfunc
-- 
2.25.1



More information about the ffmpeg-devel mailing list