[FFmpeg-devel] [PATCH] lavc/aarch64: add hevc horizontal qpel/uni/bi

J. Dekker jdek at itanimul.li
Tue May 24 14:38:03 EEST 2022


checkasm --benchmark on Ampere Altra (Neoverse N1):

put_hevc_qpel_bi_h4_8_c: 173.7
put_hevc_qpel_bi_h4_8_neon: 77.0
put_hevc_qpel_bi_h6_8_c: 385.7
put_hevc_qpel_bi_h6_8_neon: 125.7
put_hevc_qpel_bi_h8_8_c: 680.7
put_hevc_qpel_bi_h8_8_neon: 137.5
put_hevc_qpel_bi_h12_8_c: 1480.0
put_hevc_qpel_bi_h12_8_neon: 438.5
put_hevc_qpel_bi_h16_8_c: 2663.2
put_hevc_qpel_bi_h16_8_neon: 561.5
put_hevc_qpel_bi_h24_8_c: 6039.0
put_hevc_qpel_bi_h24_8_neon: 1717.5
put_hevc_qpel_bi_h32_8_c: 11104.2
put_hevc_qpel_bi_h32_8_neon: 2222.0
put_hevc_qpel_bi_h48_8_c: 25175.2
put_hevc_qpel_bi_h48_8_neon: 4983.7
put_hevc_qpel_bi_h64_8_c: 42806.5
put_hevc_qpel_bi_h64_8_neon: 8848.5
put_hevc_qpel_h4_8_c: 149.7
put_hevc_qpel_h4_8_neon: 68.2
put_hevc_qpel_h6_8_c: 318.5
put_hevc_qpel_h6_8_neon: 105.2
put_hevc_qpel_h8_8_c: 577.0
put_hevc_qpel_h8_8_neon: 133.2
put_hevc_qpel_h12_8_c: 1276.0
put_hevc_qpel_h12_8_neon: 394.5
put_hevc_qpel_h16_8_c: 2278.2
put_hevc_qpel_h16_8_neon: 517.5
put_hevc_qpel_h24_8_c: 5081.7
put_hevc_qpel_h24_8_neon: 1546.5
put_hevc_qpel_h32_8_c: 9081.0
put_hevc_qpel_h32_8_neon: 2054.0
put_hevc_qpel_h48_8_c: 20280.7
put_hevc_qpel_h48_8_neon: 4615.5
put_hevc_qpel_h64_8_c: 36042.0
put_hevc_qpel_h64_8_neon: 8197.5
put_hevc_qpel_uni_h4_8_c: 165.5
put_hevc_qpel_uni_h4_8_neon: 73.5
put_hevc_qpel_uni_h6_8_c: 366.5
put_hevc_qpel_uni_h6_8_neon: 118.5
put_hevc_qpel_uni_h8_8_c: 661.7
put_hevc_qpel_uni_h8_8_neon: 138.2
put_hevc_qpel_uni_h12_8_c: 1440.5
put_hevc_qpel_uni_h12_8_neon: 399.5
put_hevc_qpel_uni_h16_8_c: 2489.0
put_hevc_qpel_uni_h16_8_neon: 532.2
put_hevc_qpel_uni_h24_8_c: 5896.5
put_hevc_qpel_uni_h24_8_neon: 1558.5
put_hevc_qpel_uni_h32_8_c: 10675.5
put_hevc_qpel_uni_h32_8_neon: 2092.2
put_hevc_qpel_uni_h48_8_c: 24103.0
put_hevc_qpel_uni_h48_8_neon: 4680.2
put_hevc_qpel_uni_h64_8_c: 42789.2
put_hevc_qpel_uni_h64_8_neon: 8330.0
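
For reference, a rough C sketch of what each horizontal filter computes
(illustrative only, not part of the patch; the helper name and combined
layout are hypothetical, but the coefficients correspond to the
qpel_filters table and the shifts to the sqrshrun #6/#7 used below):

    // One row of the 8-tap horizontal qpel filter, 8-bit input.
    // av_clip_uint8() is from libavutil/common.h.
    static void qpel_h_row_sketch(int16_t *dst, uint8_t *dst_uni, uint8_t *dst_bi,
                                  const uint8_t *src, const int16_t *src2,
                                  const int8_t filter[8], int width)
    {
        for (int x = 0; x < width; x++) {
            int sum = 0;
            for (int i = 0; i < 8; i++)
                sum += src[x + i - 3] * filter[i];
            dst[x]     = sum;                                      // put: raw intermediate
            dst_uni[x] = av_clip_uint8((sum + 32) >> 6);           // uni: round, shift, clip
            dst_bi[x]  = av_clip_uint8((sum + src2[x] + 64) >> 7); // bi: add src2, round, clip
        }
    }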

Signed-off-by: J. Dekker <jdek at itanimul.li>
---
 libavcodec/aarch64/Makefile               |   1 +
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  43 +-
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 520 ++++++++++++++++++++++
 3 files changed, 563 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index c8935f205e..2f95649c66 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -65,4 +65,5 @@ NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
                                            aarch64/vp9mc_neon.o
 NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
                                            aarch64/hevcdsp_init_aarch64.o      \
+                                           aarch64/hevcdsp_qpel_neon.o         \
                                            aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1e40be740c..ca2cb7cf97 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -58,7 +58,21 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
                                   int16_t *sao_offset_val, int sao_left_class,
                                   int width, int height);
 
-
+void ff_hevc_put_hevc_qpel_h4_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h6_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h8_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h12_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_h16_8_neon(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_uni_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_uni_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_uni_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_uni_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_uni_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h4_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h6_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h8_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h12_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -80,6 +94,33 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         // for the current size, but if enabled for bigger sizes, the cases
         // of non-multiple of 8 seem to arise.
 //        c->sao_band_filter[0]          = ff_hevc_sao_band_filter_8x8_8_neon;
+        c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_neon;
+        c->put_hevc_qpel[2][0][1] = ff_hevc_put_hevc_qpel_h6_8_neon;
+        c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_neon;
+        c->put_hevc_qpel[4][0][1] =
+        c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h12_8_neon;
+        c->put_hevc_qpel[5][0][1] =
+        c->put_hevc_qpel[7][0][1] =
+        c->put_hevc_qpel[8][0][1] =
+        c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h16_8_neon;
+        c->put_hevc_qpel_uni[1][0][1] = ff_hevc_put_hevc_qpel_uni_h4_8_neon;
+        c->put_hevc_qpel_uni[2][0][1] = ff_hevc_put_hevc_qpel_uni_h6_8_neon;
+        c->put_hevc_qpel_uni[3][0][1] = ff_hevc_put_hevc_qpel_uni_h8_8_neon;
+        c->put_hevc_qpel_uni[4][0][1] =
+        c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_qpel_uni_h12_8_neon;
+        c->put_hevc_qpel_uni[5][0][1] =
+        c->put_hevc_qpel_uni[7][0][1] =
+        c->put_hevc_qpel_uni[8][0][1] =
+        c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_qpel_uni_h16_8_neon;
+        c->put_hevc_qpel_bi[1][0][1] = ff_hevc_put_hevc_qpel_bi_h4_8_neon;
+        c->put_hevc_qpel_bi[2][0][1] = ff_hevc_put_hevc_qpel_bi_h6_8_neon;
+        c->put_hevc_qpel_bi[3][0][1] = ff_hevc_put_hevc_qpel_bi_h8_8_neon;
+        c->put_hevc_qpel_bi[4][0][1] =
+        c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_qpel_bi_h12_8_neon;
+        c->put_hevc_qpel_bi[5][0][1] =
+        c->put_hevc_qpel_bi[7][0][1] =
+        c->put_hevc_qpel_bi[8][0][1] =
+        c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
     }
     if (bit_depth == 10) {
         c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000000..bbaa32a9d9
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -0,0 +1,520 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
+const qpel_filters, align=4
+         .byte  0,  0,  0,  0,  0,  0, 0,  0
+         .byte -1,  4,-10, 58, 17, -5, 1,  0
+         .byte -1,  4,-11, 40, 40,-11, 4, -1
+         .byte  0,  1, -5, 17, 58,-10, 4, -1
+endconst
+
+.macro load_qpel_filter m
+         movrel          x15, qpel_filters
+         add             x15, x15, \m, lsl #3
+         ld1             {v0.8b}, [x15]
+         sxtl            v0.8h, v0.8b
+.endm
+
+// void put_hevc_qpel_h(int16_t *dst,
+//                      uint8_t *_src, ptrdiff_t _srcstride,
+//                      int height, intptr_t mx, intptr_t my, int width)
+
+// void put_hevc_qpel_uni_h(uint8_t *_dst,  ptrdiff_t _dststride,
+//                          uint8_t *_src, ptrdiff_t _srcstride,
+//                          int height, intptr_t mx, intptr_t my, int width)
+
+// void put_hevc_qpel_bi_h(uint8_t *_dst, ptrdiff_t _dststride,
+//                         uint8_t *_src, ptrdiff_t _srcstride,
+//                         int16_t *src2, int height, intptr_t mx,
+//                         intptr_t my, int width)
+
+.macro put_hevc type
+function ff_hevc_put_hevc_\type\()_h4_8_neon, export=1
+.ifc \type, qpel
+         load_qpel_filter x4
+         lsl             x10, x2, #1                  // src stride * 2
+         sub             x13, x1, #3                  // src1 = src - 3
+         mov             x15, #(MAX_PB_SIZE << 2)     // dst stride
+         add             x14, x13, x2                 // src2 = src1 + src stride
+         add             x17, x0, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
+.else
+.ifc \type, qpel_bi
+         load_qpel_filter x6
+         mov             x6, #(MAX_PB_SIZE << 2)      // rsrc stride << 1
+         add             x7, x4, #(MAX_PB_SIZE << 1)  // rsrc2
+.else
+         load_qpel_filter x5
+.endif
+         lsl             x10, x3, #1                  // src stride * 2
+         sub             x13, x2, #3                  // src1 = src - 3
+         lsl             x15, x1, #1                  // dst stride * 2
+         add             x14, x13, x3                 // src2 = src1 + src stride
+         add             x17, x0, x1                  // dst2 = dst1 + dst stride
+.endif
+0:       ld1             {v16.8b, v17.8b}, [x13], x10
+         ld1             {v18.8b, v19.8b}, [x14], x10
+.ifc \type, qpel_bi
+         ld1             {v25.8h}, [x4], x6
+         ld1             {v26.8h}, [x7], x6
+.endif
+         uxtl            v16.8h,  v16.8b
+         uxtl            v17.8h,  v17.8b
+         uxtl            v18.8h,  v18.8b
+         uxtl            v19.8h,  v19.8b
+
+         mul             v23.8h,  v16.8h, v0.h[0]
+         mul             v24.8h,  v18.8h, v0.h[0]
+
+.irpc i, 1234567
+         ext             v20.16b, v16.16b, v17.16b, #(2*\i)
+         ext             v21.16b, v18.16b, v19.16b, #(2*\i)
+         mla             v23.8h,  v20.8h, v0.h[\i]
+         mla             v24.8h,  v21.8h, v0.h[\i]
+.endr
+
+.ifc \type, qpel
+         subs            w3, w3, #2
+         st1             {v23.4h}, [ x0], x15
+         st1             {v24.4h}, [x17], x15
+.else
+.ifc \type, qpel_bi
+         subs            w5, w5, #2
+         sqadd           v23.8h, v23.8h, v25.8h
+         sqadd           v24.8h, v24.8h, v26.8h
+         sqrshrun        v23.8b, v23.8h, #7
+         sqrshrun        v24.8b, v24.8h, #7
+.else
+         subs            w4, w4, #2
+         sqrshrun        v23.8b, v23.8h, #6
+         sqrshrun        v24.8b, v24.8h, #6
+.endif
+         st1             {v23.s}[0], [ x0], x15
+         st1             {v24.s}[0], [x17], x15
+.endif
+         b.gt            0b // double line
+         ret
+endfunc
+
+function ff_hevc_put_hevc_\type\()_h6_8_neon, export=1
+.ifc \type, qpel
+         load_qpel_filter x4
+         lsl             x10, x2, #1                  // src stride * 2
+         sub             x13, x1, #3                  // src1 = src - 3
+         mov             x15, #(MAX_PB_SIZE * 4 - 8)  // dst stride
+         add             x14, x13, x2                 // src2 = src1 + src stride
+         add             x17, x0, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
+.else
+.ifc \type, qpel_bi
+         load_qpel_filter x6
+         mov             x6, #(MAX_PB_SIZE << 2)      // rsrc stride << 1
+         add             x7, x4, #(MAX_PB_SIZE << 1)  // rsrc2
+.else
+         load_qpel_filter x5
+.endif
+         lsl             x10, x3, #1                  // src stride * 2
+         sub             x13, x2, #3                  // src1 = src - 3
+         lsl             x15, x1, #1                  // dst stride * 2
+         subs            x15, x15, #4
+         add             x14, x13, x3                 // src2 = src1 + src stride
+         add             x17, x0, x1                  // dst2 = dst1 + dst stride
+.endif
+0:       ld1             {v16.8b, v17.8b}, [x13], x10
+         ld1             {v18.8b, v19.8b}, [x14], x10
+.ifc \type, qpel_bi
+         ld1             {v25.8h}, [x4], x6
+         ld1             {v26.8h}, [x7], x6
+.endif
+
+         uxtl            v16.8h,  v16.8b
+         uxtl            v17.8h,  v17.8b
+         uxtl            v18.8h,  v18.8b
+         uxtl            v19.8h,  v19.8b
+
+         mul             v23.8h,  v16.8h, v0.h[0]
+         mul             v24.8h,  v18.8h, v0.h[0]
+
+.irpc i, 1234567
+         ext             v20.16b, v16.16b, v17.16b, #(2*\i)
+         ext             v21.16b, v18.16b, v19.16b, #(2*\i)
+         mla             v23.8h,  v20.8h, v0.h[\i]
+         mla             v24.8h,  v21.8h, v0.h[\i]
+.endr
+
+.ifc \type, qpel
+         subs            w3, w3, #2
+         st1             {v23.4h},   [ x0], #8
+         st1             {v23.s}[2], [ x0], x15
+         st1             {v24.4h},   [x17], #8
+         st1             {v24.s}[2], [x17], x15
+.else
+.ifc \type, qpel_bi
+         subs            w5, w5, #2
+         sqadd           v23.8h, v23.8h, v25.8h
+         sqadd           v24.8h, v24.8h, v26.8h
+         sqrshrun        v23.8b, v23.8h, #7
+         sqrshrun        v24.8b, v24.8h, #7
+.else
+         subs            w4, w4, #2
+         sqrshrun        v23.8b, v23.8h, #6
+         sqrshrun        v24.8b, v24.8h, #6
+.endif
+         st1             {v23.s}[0], [ x0], #4
+         st1             {v23.h}[2], [ x0], x15
+         st1             {v24.s}[0], [x17], #4
+         st1             {v24.h}[2], [x17], x15
+.endif
+         b.gt            0b // double line
+         ret
+endfunc
+
+function ff_hevc_put_hevc_\type\()_h8_8_neon, export=1
+.ifc \type, qpel
+         load_qpel_filter x4
+         lsl             x10, x2, #1 // src stride * 2
+         sub             x13, x1, #3                  // src1 = src - 3
+         mov             x15, #(MAX_PB_SIZE << 2)     // dst stride
+         add             x14, x13, x2                 // src2 = src1 + src stride
+         add             x17, x0, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
+.else
+.ifc \type, qpel_bi
+         load_qpel_filter x6
+         mov             x6, #(MAX_PB_SIZE << 2)      // rsrc stride << 1
+         add             x7, x4, #(MAX_PB_SIZE << 1)  // rsrc2
+.else
+         load_qpel_filter x5
+.endif
+         lsl             x10, x3, #1                  // src stride * 2
+         sub             x13, x2, #3                  // src1 = src - 3
+         lsl             x15, x1, #1                  // dst stride * 2
+         add             x14, x13, x3                 // src2 = src1 + src stride
+         add             x17, x0, x1                  // dst2 = dst1 + dst stride
+.endif
+0:       ld1             {v16.8b, v17.8b}, [x13], x10
+         ld1             {v18.8b, v19.8b}, [x14], x10
+.ifc \type, qpel_bi
+         ld1             {v25.8h}, [x4], x6
+         ld1             {v26.8h}, [x7], x6
+.endif
+
+         uxtl            v16.8h,  v16.8b
+         uxtl            v17.8h,  v17.8b
+         uxtl            v18.8h,  v18.8b
+         uxtl            v19.8h,  v19.8b
+
+         mul             v23.8h,  v16.8h, v0.h[0]
+         mul             v24.8h,  v18.8h, v0.h[0]
+
+.irpc i, 1234567
+         ext             v20.16b, v16.16b, v17.16b, #(2*\i)
+         ext             v21.16b, v18.16b, v19.16b, #(2*\i)
+         mla             v23.8h,  v20.8h, v0.h[\i]
+         mla             v24.8h,  v21.8h, v0.h[\i]
+.endr
+
+.ifc \type, qpel
+         subs            w3, w3, #2
+         st1             {v23.8h}, [ x0], x15
+         st1             {v24.8h}, [x17], x15
+.else
+.ifc \type, qpel_bi
+         subs            w5, w5, #2
+         sqadd           v23.8h, v23.8h, v25.8h
+         sqadd           v24.8h, v24.8h, v26.8h
+         sqrshrun        v23.8b, v23.8h, #7
+         sqrshrun        v24.8b, v24.8h, #7
+.else
+         subs            w4, w4, #2
+         sqrshrun        v23.8b, v23.8h, #6
+         sqrshrun        v24.8b, v24.8h, #6
+.endif
+         st1             {v23.8b}, [ x0], x15
+         st1             {v24.8b}, [x17], x15
+.endif
+         b.gt            0b // double line
+         ret
+endfunc
+
+function ff_hevc_put_hevc_\type\()_h12_8_neon, export=1
+.ifc \type, qpel
+         load_qpel_filter x4
+         // blocks: w6 = width / 12
+         // (fast signed divide by 12, thank gcc for this one...)
+         mov             w8, #0xAAAB
+         movk            w8, #0x2AAA, lsl #16
+         smull           x15, w8, w6
+         asr             x15, x15, #33
+         sub             w6, w15, w6, asr #31
+
+         // src constants
+         lsl             x10, x2, #1 // src stride * 2
+         sub             x1, x1, #3  // src = src - 3
+
+         // dst constants
+         mov             x15, #(MAX_PB_SIZE * 4 - 16) // dst stride
+
+         // loop
+         mov             x8, xzr     // hblock
+0:       mov             w7, w3
+
+         // 12 * hblock
+         lsl             x12, x8, #3
+         add             x12, x12, x8, lsl #2
+
+         add             x13, x1, x12    // src1 = src0 + 12 * hblock
+         add             x14, x13, x2    // src2 = src1 + src stride
+
+         add             x16, x0, x12, lsl #1    // dst1 = dst0 + 12 * hblock * 2
+         add             x17, x16, #(MAX_PB_SIZE << 1) // dst2 = dst1 + dst stride
+.else
+         // blocks
+.ifc \type, qpel_bi
+         ldrh            w7, [sp]
+         load_qpel_filter x6
+.else
+         load_qpel_filter x5
+.endif
+         mov             w9, #0xAAAB
+         movk            w9, #0x2AAA, lsl #16
+         smull           x15, w9, w7
+         asr             x15, x15, #33
+         sub             w6, w15, w7, asr #31 // w6 = width / 12 (fast signed divide, as above)
+
+         // src constants
+         lsl             x10, x3, #1 // src stride * 2
+         sub             x2, x2, #3  // src = src - 3
+
+         // dst constants
+         lsl             x15, x1, #1 // dst stride * 2
+.ifc \type, qpel_bi
+         mov             x9, #(MAX_PB_SIZE << 2)
+.endif
+         sub             x15, x15, #8
+         // loop
+         mov             x8, xzr     // hblock
+0:
+.ifc \type, qpel_bi // height
+         mov             w7, w5
+.else
+         mov             w7, w4
+.endif
+         // 12 * hblock
+         lsl             x12, x8, #3
+         add             x12, x12, x8, lsl #2
+
+         add             x13, x2, x12    // src1 = src0 + 12 * hblock
+         add             x14, x13, x3    // src2 = src1 + src stride
+
+         add             x16, x0, x12    // dst1 = dst0 + 12 * hblock
+         add             x17, x16, x1    // dst2 = dst1 + dst stride
+.ifc \type, qpel_bi
+         add             x11, x4, x12, lsl #1 // rsrc1 = rsrc0 + 12 * hblock * 2
+         add             x12, x11, #(MAX_PB_SIZE << 1) // rsrc2 = rsrc1 + rsrc stride
+.endif
+.endif
+1:       ld1             {v16.8b-v18.8b}, [x13], x10
+         ld1             {v19.8b-v21.8b}, [x14], x10
+
+         uxtl            v16.8h,  v16.8b
+         uxtl            v17.8h,  v17.8b
+         uxtl            v18.8h,  v18.8b
+
+         uxtl            v19.8h,  v19.8b
+         uxtl            v20.8h,  v20.8b
+         uxtl            v21.8h,  v21.8b
+
+         mul             v26.8h,  v16.8h, v0.h[0]
+         mul             v27.8h,  v17.8h, v0.h[0]
+         mul             v28.8h,  v19.8h, v0.h[0]
+         mul             v29.8h,  v20.8h, v0.h[0]
+
+.irpc i, 1234567
+         ext             v22.16b, v16.16b, v17.16b, #(2*\i)
+         ext             v23.16b, v17.16b, v18.16b, #(2*\i)
+
+         ext             v24.16b, v19.16b, v20.16b, #(2*\i)
+         ext             v25.16b, v20.16b, v21.16b, #(2*\i)
+
+         mla             v26.8h,  v22.8h, v0.h[\i]
+         mla             v27.8h,  v23.8h, v0.h[\i]
+
+         mla             v28.8h,  v24.8h, v0.h[\i]
+         mla             v29.8h,  v25.8h, v0.h[\i]
+.endr
+         subs            w7, w7, #2
+.ifc \type, qpel
+         st1             {v26.8h}, [x16], #16
+         st1             {v27.4h}, [x16], x15
+         st1             {v28.8h}, [x17], #16
+         st1             {v29.4h}, [x17], x15
+.else
+.ifc \type, qpel_bi
+         ld1             {v16.8h, v17.8h}, [x11], x9
+         ld1             {v18.8h, v19.8h}, [x12], x9
+         sqadd           v26.8h, v26.8h, v16.8h
+         sqadd           v27.8h, v27.8h, v17.8h
+         sqadd           v28.8h, v28.8h, v18.8h
+         sqadd           v29.8h, v29.8h, v19.8h
+         sqrshrun        v26.8b, v26.8h, #7
+         sqrshrun        v27.8b, v27.8h, #7
+         sqrshrun        v28.8b, v28.8h, #7
+         sqrshrun        v29.8b, v29.8h, #7
+.else
+         sqrshrun        v26.8b, v26.8h, #6
+         sqrshrun        v27.8b, v27.8h, #6
+         sqrshrun        v28.8b, v28.8h, #6
+         sqrshrun        v29.8b, v29.8h, #6
+.endif
+         st1             {v26.8b},   [x16], #8
+         st1             {v27.s}[0], [x16], x15
+         st1             {v28.8b},   [x17], #8
+         st1             {v29.s}[0], [x17], x15
+.endif
+         b.gt            1b // double line
+         add             x8, x8, #1
+         cmp             x8, x6
+         b.lt            0b // line of blocks
+         ret
+endfunc
+
+function ff_hevc_put_hevc_\type\()_h16_8_neon, export=1
+         mov             x8, xzr     // hblock
+.ifc \type, qpel
+         load_qpel_filter x4
+         // blocks
+         lsr             w6, w6, #4  // horizontal block count
+         // src constants
+         lsl             x10, x2, #1 // src stride * 2
+         sub             x1, x1, #3  // src = src - 3
+         // dst constants
+         mov             x15, #(MAX_PB_SIZE * 4 - 16) // dst stride
+         // loop
+0:       mov             w7, w3      // reset height
+
+         add             x13, x1, x8, lsl #4
+         add             x14, x13, x2         // src2 = src1 + src stride
+
+         add             x16, x0, x8, lsl #5  // dst1 = dst0 + hblock * 16 * 2
+         add             x17, x16, #(MAX_PB_SIZE << 1) // dst2 = dst1 + 64 * 2
+.else
+.ifc \type, qpel_bi
+         mov             x9, #(MAX_PB_SIZE << 2)
+         ldrh            w7, [sp]
+         load_qpel_filter x6
+.else
+         load_qpel_filter x5
+.endif
+         // blocks
+         lsr             w6, w7, #4  // horizontal block count
+         // src constants
+         lsl             x10, x3, #1 // src stride * 2
+         sub             x2, x2, #3  // src = src - 3
+         // dst constants
+         lsl             x15, x1, #1 // dst stride * 2
+         sub             x15, x15, #8
+         // loop
+0:
+.ifc \type, qpel_bi // height
+         mov             w7, w5
+.else
+         mov             w7, w4
+.endif
+
+         add             x13, x2, x8, lsl #4  // src1 = src0 + hblock * 16
+         add             x14, x13, x3         // src2 = src1 + src stride
+
+         add             x16, x0, x8, lsl #4  // dst1 = dst0 + hblock * 16
+         add             x17, x16, x1         // dst2 = dst1 + dst stride
+.ifc \type, qpel_bi
+         add             x11, x4, x8, lsl #5 // rsrc1 = rsrc0 + 16 * hblock * 2
+         add             x12, x11, #(MAX_PB_SIZE << 1) // rsrc2 = rsrc1 + rsrc stride
+.endif
+.endif
+1:       ld1             {v16.8b-v18.8b}, [x13], x10
+         ld1             {v19.8b-v21.8b}, [x14], x10
+
+         uxtl            v16.8h,  v16.8b
+         uxtl            v17.8h,  v17.8b
+         uxtl            v18.8h,  v18.8b
+
+         uxtl            v19.8h,  v19.8b
+         uxtl            v20.8h,  v20.8b
+         uxtl            v21.8h,  v21.8b
+
+         mul             v26.8h,  v16.8h, v0.h[0]
+         mul             v27.8h,  v17.8h, v0.h[0]
+         mul             v28.8h,  v19.8h, v0.h[0]
+         mul             v29.8h,  v20.8h, v0.h[0]
+
+.irpc i, 1234567
+         ext             v22.16b, v16.16b, v17.16b, #(2*\i)
+         ext             v23.16b, v17.16b, v18.16b, #(2*\i)
+
+         ext             v24.16b, v19.16b, v20.16b, #(2*\i)
+         ext             v25.16b, v20.16b, v21.16b, #(2*\i)
+
+         mla             v26.8h,  v22.8h, v0.h[\i]
+         mla             v27.8h,  v23.8h, v0.h[\i]
+
+         mla             v28.8h,  v24.8h, v0.h[\i]
+         mla             v29.8h,  v25.8h, v0.h[\i]
+.endr
+         subs            w7, w7, #2
+.ifc \type, qpel
+         st1             {v26.8h}, [x16], #16
+         st1             {v27.8h}, [x16], x15
+         st1             {v28.8h}, [x17], #16
+         st1             {v29.8h}, [x17], x15
+.else
+.ifc \type, qpel_bi
+         ld1             {v16.8h, v17.8h}, [x11], x9
+         ld1             {v18.8h, v19.8h}, [x12], x9
+         sqadd           v26.8h, v26.8h, v16.8h
+         sqadd           v27.8h, v27.8h, v17.8h
+         sqadd           v28.8h, v28.8h, v18.8h
+         sqadd           v29.8h, v29.8h, v19.8h
+         sqrshrun        v26.8b, v26.8h, #7
+         sqrshrun        v27.8b, v27.8h, #7
+         sqrshrun        v28.8b, v28.8h, #7
+         sqrshrun        v29.8b, v29.8h, #7
+.else
+         sqrshrun        v26.8b, v26.8h, #6
+         sqrshrun        v27.8b, v27.8h, #6
+         sqrshrun        v28.8b, v28.8h, #6
+         sqrshrun        v29.8b, v29.8h, #6
+.endif
+         st1             {v26.8b}, [x16], #8
+         st1             {v27.8b}, [x16], x15
+         st1             {v28.8b}, [x17], #8
+         st1             {v29.8b}, [x17], x15
+.endif
+         b.gt            1b // double line
+         add             x8, x8, #1
+         cmp             x8, x6
+         b.lt            0b // horizontal tiling
+         ret
+endfunc
+.endm
+
+put_hevc qpel
+put_hevc qpel_uni
+put_hevc qpel_bi
-- 
2.32.0 (Apple Git-132)


