[FFmpeg-cvslog] aarch64: h264 qpel NEON optimizations

Janne Grunau git at videolan.org
Wed Jan 15 15:19:04 CET 2014


ffmpeg | branch: master | Janne Grunau <janne-libav at jannau.net> | Wed Dec 18 15:56:50 2013 +0100| [d5dd8c7bf0f0d77c581db3236e0d938f06fd5591] | committer: Janne Grunau

aarch64: h264 qpel NEON optimizations

Ported from ARMv7 NEON.
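
For reference, every half-pel position computed here uses H.264's 6-tap
filter (1, -5, 20, 20, -5, 1) with rounding. A scalar sketch of the
horizontal pass that the lowpass_8 macro vectorizes (function and
variable names are illustrative, not part of the patch):

    #include <stddef.h>
    #include <stdint.h>

    static uint8_t clip_u8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* dst[x] = clip(((src[x-2] + src[x+3]) - 5 * (src[x-1] + src[x+2])
     *                + 20 * (src[x] + src[x+1]) + 16) >> 5) */
    static void h264_qpel_h_lowpass_c(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t dst_stride,
                                      ptrdiff_t src_stride, int w, int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                int v = (src[x - 2] + src[x + 3])
                      - 5 * (src[x - 1] + src[x + 2])
                      + 20 * (src[x] + src[x + 1]);
                dst[x] = clip_u8((v + 16) >> 5);
            }
            src += src_stride;
            dst += dst_stride;
        }
    }

The quarter-pel positions average this result with the nearest full- or
half-pel samples (the _l2 helpers below), and the 2D (hv) positions run
the filter in both directions with a single (v + 512) >> 10 rounding at
the end.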

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=d5dd8c7bf0f0d77c581db3236e0d938f06fd5591
---

 libavcodec/aarch64/Makefile                |    2 +
 libavcodec/aarch64/h264qpel_init_aarch64.c |  172 +++++
 libavcodec/aarch64/h264qpel_neon.S         |  934 ++++++++++++++++++++++++++++
 libavcodec/aarch64/neon.S                  |   64 ++
 libavcodec/h264qpel.c                      |    2 +
 libavcodec/h264qpel.h                      |    1 +
 6 files changed, 1175 insertions(+)

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 06e3c77..1d80d9a 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -1,7 +1,9 @@
 OBJS-$(CONFIG_H264CHROMA)               += aarch64/h264chroma_init_aarch64.o
 OBJS-$(CONFIG_H264DSP)                  += aarch64/h264dsp_init_aarch64.o
+OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
 OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
 OBJS-$(CONFIG_VC1_DECODER)              += aarch64/vc1dsp_init_aarch64.o
 
 NEON-OBJS-$(CONFIG_H264CHROMA)          += aarch64/h264cmc_neon.o
 NEON-OBJS-$(CONFIG_H264DSP)             += aarch64/h264idct_neon.o
+NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o
diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c
new file mode 100644
index 0000000..11611df
--- /dev/null
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
@@ -0,0 +1,172 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/h264qpel.h"
+
+void ff_put_h264_qpel16_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+
+void ff_put_h264_qpel8_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+
+void ff_avg_h264_qpel16_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+
+void ff_avg_h264_qpel8_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+
+av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
+{
+    const int high_bit_depth = bit_depth > 8;
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags) && !high_bit_depth) {
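+        /* Tables are indexed [size][x + 4 * y]: size 0 is 16x16, size 1
+         * is 8x8, and x/y are the quarter-pel offsets encoded in the
+         * mcXY names.  The mc00 entries are left unset here, so the
+         * default (full-pel copy) implementations remain in place. */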
+        /* c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; */
+        c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
+        c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
+        c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
+        c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
+        c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
+        c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
+        c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
+        c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
+        c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
+        c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
+        c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
+        c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
+        c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
+        c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
+        c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;
+
+        /* c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; */
+        c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
+        c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
+        c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
+        c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
+        c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
+        c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
+        c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
+        c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
+        c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
+        c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
+        c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
+        c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
+        c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
+        c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
+        c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;
+
+        /* c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; */
+        c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon;
+        c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon;
+        c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon;
+        c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon;
+        c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon;
+        c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon;
+        c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon;
+        c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon;
+        c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon;
+        c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon;
+        c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon;
+        c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon;
+        c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon;
+        c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon;
+        c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon;
+
+        /* c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon; */
+        c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon;
+        c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon;
+        c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon;
+        c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon;
+        c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon;
+        c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon;
+        c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon;
+        c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon;
+        c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon;
+        c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon;
+        c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon;
+        c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon;
+        c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
+        c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
+        c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
+    }
+}
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
new file mode 100644
index 0000000..731dc06
--- /dev/null
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -0,0 +1,934 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav at jannau.net>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+        /* H.264 qpel MC */
+
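+// Loads the filter constants into v6: v6.h[0] = 5, v6.h[1] = 20.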
+.macro  lowpass_const   r
+        movz            \r, #20, lsl #16
+        movk            \r, #5
+        mov             v6.S[0], \r
+.endm
+
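+// Horizontal lowpass of two 8-pixel rows held in r0:r1 and r2:r3 with
+// the (1, -5, 20, 20, -5, 1) filter; results in d0/d1, narrowed to
+// 8 bit with (v + 16) >> 5 unless narrow=0.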
+// trashes v0-v5
+.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
+        ext             v2.8B,      \r0\().8B, \r1\().8B, #2
+        ext             v3.8B,      \r0\().8B, \r1\().8B, #3
+        uaddl           v2.8H,      v2.8B,     v3.8B
+        ext             v4.8B,      \r0\().8B, \r1\().8B, #1
+        ext             v5.8B,      \r0\().8B, \r1\().8B, #4
+        uaddl           v4.8H,      v4.8B,     v5.8B
+        ext             v1.8B,      \r0\().8B, \r1\().8B, #5
+        uaddl           \d0\().8H,  \r0\().8B, v1.8B
+        ext             v0.8B,      \r2\().8B, \r3\().8B, #2
+        mla             \d0\().8H,  v2.8H,     v6.H[1]
+        ext             v1.8B,      \r2\().8B, \r3\().8B, #3
+        uaddl           v0.8H,      v0.8B,     v1.8B
+        ext             v1.8B,      \r2\().8B, \r3\().8B, #1
+        mls             \d0\().8H,  v4.8H,     v6.H[0]
+        ext             v3.8B,      \r2\().8B, \r3\().8B, #4
+        uaddl           v1.8H,      v1.8B,     v3.8B
+        ext             v2.8B,      \r2\().8B, \r3\().8B, #5
+        uaddl           \d1\().8H,  \r2\().8B, v2.8B
+        mla             \d1\().8H,  v0.8H,     v6.H[1]
+        mls             \d1\().8H,  v1.8H,     v6.H[0]
+  .if \narrow
+        sqrshrun        \d0\().8B,  \d0\().8H, #5
+        sqrshrun        \d1\().8B,  \d1\().8H, #5
+  .endif
+.endm
+
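+// Horizontal lowpass of two 16-byte rows (r0, r1), each filtered in
+// place to 8 halfwords at full 16-bit precision for the first pass of
+// the 2D (hv) filter.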
+// trashes v0-v5, v7, v30-v31
+.macro  lowpass_8H      r0,  r1
+        ext             v0.16B,     \r0\().16B, \r0\().16B, #2
+        ext             v1.16B,     \r0\().16B, \r0\().16B, #3
+        uaddl           v0.8H,      v0.8B,      v1.8B
+        ext             v2.16B,     \r0\().16B, \r0\().16B, #1
+        ext             v3.16B,     \r0\().16B, \r0\().16B, #4
+        uaddl           v2.8H,      v2.8B,      v3.8B
+        ext             v30.16B,    \r0\().16B, \r0\().16B, #5
+        uaddl           \r0\().8H,  \r0\().8B,  v30.8B
+        ext             v4.16B,     \r1\().16B, \r1\().16B, #2
+        mla             \r0\().8H,  v0.8H,      v6.H[1]
+        ext             v5.16B,     \r1\().16B, \r1\().16B, #3
+        uaddl           v4.8H,      v4.8B,      v5.8B
+        ext             v7.16B,     \r1\().16B, \r1\().16B, #1
+        mls             \r0\().8H,  v2.8H,      v6.H[0]
+        ext             v0.16B,     \r1\().16B, \r1\().16B, #4
+        uaddl           v7.8H,      v7.8B,      v0.8B
+        ext             v31.16B,    \r1\().16B, \r1\().16B, #5
+        uaddl           \r1\().8H,  \r1\().8B,  v31.8B
+        mla             \r1\().8H,  v4.8H,      v6.H[1]
+        mls             \r1\().8H,  v7.8H,      v6.H[0]
+.endm
+
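+// Single-row variant of lowpass_8.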
+// trashes v2-v5, v30
+.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
+        ext             v2.8B,     \r0\().8B, \r1\().8B, #2
+        ext             v3.8B,     \r0\().8B, \r1\().8B, #3
+        uaddl           v2.8H,     v2.8B,     v3.8B
+        ext             v4.8B,     \r0\().8B, \r1\().8B, #1
+        ext             v5.8B,     \r0\().8B, \r1\().8B, #4
+        uaddl           v4.8H,     v4.8B,     v5.8B
+        ext             v30.8B,    \r0\().8B, \r1\().8B, #5
+        uaddl           \d0\().8H, \r0\().8B, v30.8B
+        mla             \d0\().8H, v2.8H,     v6.H[1]
+        mls             \d0\().8H, v4.8H,     v6.H[0]
+  .if \narrow
+        sqrshrun        \d0\().8B, \d0\().8H, #5
+  .endif
+.endm
+
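+// Vertical (second) pass of the 2D filter on transposed 16-bit
+// intermediates: 32-bit accumulation, with x*20 = (x<<4) + (x<<2) and
+// x*5 = x + (x<<2), rounded with (v + 512) >> 10 and saturate-narrowed
+// into r2.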
+// trashes v0-v7
+.macro  lowpass_8.16    r0,  r1,  r2
+        ext             v1.16B,     \r0\().16B, \r1\().16B, #4
+        ext             v0.16B,     \r0\().16B, \r1\().16B, #6
+        saddl           v5.4S,      v1.4H,      v0.4H
+        ext             v2.16B,     \r0\().16B, \r1\().16B, #2
+        saddl2          v1.4S,      v1.8H,      v0.8H
+        ext             v3.16B,     \r0\().16B, \r1\().16B, #8
+        saddl           v6.4S,      v2.4H,      v3.4H
+        ext             \r1\().16B, \r0\().16B, \r1\().16B, #10
+        saddl2          v2.4S,      v2.8H,      v3.8H
+        saddl           v0.4S,      \r0\().4H,  \r1\().4H
+        saddl2          v4.4S,      \r0\().8H,  \r1\().8H
+
+        shl             v3.4S,  v5.4S,  #4
+        shl             v5.4S,  v5.4S,  #2
+        shl             v7.4S,  v6.4S,  #2
+        add             v5.4S,  v5.4S,  v3.4S
+        add             v6.4S,  v6.4S,  v7.4S
+
+        shl             v3.4S,  v1.4S,  #4
+        shl             v1.4S,  v1.4S,  #2
+        shl             v7.4S,  v2.4S,  #2
+        add             v1.4S,  v1.4S,  v3.4S
+        add             v2.4S,  v2.4S,  v7.4S
+
+        add             v5.4S,  v5.4S,  v0.4S
+        sub             v5.4S,  v5.4S,  v6.4S
+
+        add             v1.4S,  v1.4S,  v4.4S
+        sub             v1.4S,  v1.4S,  v2.4S
+
+        rshrn           v5.4H,  v5.4S,  #10
+        rshrn2          v5.8H,  v1.4S,  #10
+
+        sqxtun          \r2\().8B,  v5.8H
+.endm
+
+function put_h264_qpel16_h_lowpass_neon_packed
+        mov             x4,  x30
+        mov             x12, #16
+        mov             x3,  #8
+        bl              put_h264_qpel8_h_lowpass_neon
+        sub             x1,  x1,  x2, lsl #4
+        add             x1,  x1,  #8
+        mov             x12, #16
+        mov             x30, x4
+        b               put_h264_qpel8_h_lowpass_neon
+endfunc
+
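+// Horizontal lowpass entry points: x0 = dst, x1 = src - 2, x2 = src
+// stride, x3 = dst stride, x12 = height in rows.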
+.macro  h264_qpel_h_lowpass type
+function \type\()_h264_qpel16_h_lowpass_neon
+        mov             x13, x30
+        mov             x12, #16
+        bl              \type\()_h264_qpel8_h_lowpass_neon
+        sub             x0,  x0,  x3, lsl #4
+        sub             x1,  x1,  x2, lsl #4
+        add             x0,  x0,  #8
+        add             x1,  x1,  #8
+        mov             x12, #16
+        mov             x30, x13
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_neon
+1:      ld1             {v28.8B, v29.8B}, [x1], x2
+        ld1             {v16.8B, v17.8B}, [x1], x2
+        subs            x12, x12, #2
+        lowpass_8       v28, v29, v16, v17, v28, v16
+  .ifc \type,avg
+        ld1             {v2.8B},    [x0], x3
+        urhadd          v28.8B, v28.8B,  v2.8B
+        ld1             {v3.8B},    [x0]
+        urhadd          v16.8B, v16.8B, v3.8B
+        sub             x0,  x0,  x3
+  .endif
+        st1             {v28.8B},    [x0], x3
+        st1             {v16.8B},    [x0], x3
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+        h264_qpel_h_lowpass put
+        h264_qpel_h_lowpass avg
+
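+// The _l2 variants additionally average the lowpass result with a
+// second source row read from x3 before storing (and, for avg, with
+// the destination).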
+.macro  h264_qpel_h_lowpass_l2 type
+function \type\()_h264_qpel16_h_lowpass_l2_neon
+        mov             x13, x30
+        mov             x12, #16
+        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
+        sub             x0,  x0,  x2, lsl #4
+        sub             x1,  x1,  x2, lsl #4
+        sub             x3,  x3,  x2, lsl #4
+        add             x0,  x0,  #8
+        add             x1,  x1,  #8
+        add             x3,  x3,  #8
+        mov             x12, #16
+        mov             x30, x13
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_l2_neon
+1:      ld1             {v26.8B, v27.8B}, [x1], x2
+        ld1             {v16.8B, v17.8B}, [x1], x2
+        ld1             {v28.8B},     [x3], x2
+        ld1             {v29.8B},     [x3], x2
+        subs            x12, x12, #2
+        lowpass_8       v26, v27, v16, v17, v26, v27
+        urhadd          v26.8B, v26.8B, v28.8B
+        urhadd          v27.8B, v27.8B, v29.8B
+  .ifc \type,avg
+        ld1             {v2.8B},      [x0], x2
+        urhadd          v26.8B, v26.8B, v2.8B
+        ld1             {v3.8B},      [x0]
+        urhadd          v27.8B, v27.8B, v3.8B
+        sub             x0,  x0,  x2
+  .endif
+        st1             {v26.8B},     [x0], x2
+        st1             {v27.8B},     [x0], x2
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+        h264_qpel_h_lowpass_l2 put
+        h264_qpel_h_lowpass_l2 avg
+
+function put_h264_qpel16_v_lowpass_neon_packed
+        mov             x4,  x30
+        mov             x2,  #8
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             x1,  x1,  x3, lsl #2
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #8
+        bl              put_h264_qpel8_v_lowpass_neon
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+        b               put_h264_qpel8_v_lowpass_neon
+endfunc
+
+.macro  h264_qpel_v_lowpass type
+function \type\()_h264_qpel16_v_lowpass_neon
+        mov             x4,  x30
+        bl              \type\()_h264_qpel8_v_lowpass_neon
+        sub             x1,  x1,  x3, lsl #2
+        bl              \type\()_h264_qpel8_v_lowpass_neon
+        sub             x0,  x0,  x2, lsl #4
+        add             x0,  x0,  #8
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #8
+        bl              \type\()_h264_qpel8_v_lowpass_neon
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_neon
+        ld1             {v16.8B}, [x1], x3
+        ld1             {v18.8B}, [x1], x3
+        ld1             {v20.8B}, [x1], x3
+        ld1             {v22.8B}, [x1], x3
+        ld1             {v24.8B}, [x1], x3
+        ld1             {v26.8B}, [x1], x3
+        ld1             {v28.8B}, [x1], x3
+        ld1             {v30.8B}, [x1], x3
+        ld1             {v17.8B}, [x1], x3
+        ld1             {v19.8B}, [x1], x3
+        ld1             {v21.8B}, [x1], x3
+        ld1             {v23.8B}, [x1], x3
+        ld1             {v25.8B}, [x1]
+
+        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
+        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
+        lowpass_8       v16, v17, v18, v19, v16, v17
+        lowpass_8       v20, v21, v22, v23, v18, v19
+        lowpass_8       v24, v25, v26, v27, v20, v21
+        lowpass_8       v28, v29, v30, v31, v22, v23
+        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+
+  .ifc \type,avg
+        ld1             {v24.8B},  [x0], x2
+        urhadd          v16.8B, v16.8B, v24.8B
+        ld1             {v25.8B}, [x0], x2
+        urhadd          v17.8B, v17.8B, v25.8B
+        ld1             {v26.8B}, [x0], x2
+        urhadd          v18.8B, v18.8B, v26.8B
+        ld1             {v27.8B}, [x0], x2
+        urhadd          v19.8B, v19.8B, v27.8B
+        ld1             {v28.8B}, [x0], x2
+        urhadd          v20.8B, v20.8B, v28.8B
+        ld1             {v29.8B}, [x0], x2
+        urhadd          v21.8B, v21.8B, v29.8B
+        ld1             {v30.8B}, [x0], x2
+        urhadd          v22.8B, v22.8B, v30.8B
+        ld1             {v31.8B}, [x0], x2
+        urhadd          v23.8B, v23.8B, v31.8B
+        sub             x0,  x0,  x2,  lsl #3
+  .endif
+
+        st1             {v16.8B}, [x0], x2
+        st1             {v17.8B}, [x0], x2
+        st1             {v18.8B}, [x0], x2
+        st1             {v19.8B}, [x0], x2
+        st1             {v20.8B}, [x0], x2
+        st1             {v21.8B}, [x0], x2
+        st1             {v22.8B}, [x0], x2
+        st1             {v23.8B}, [x0], x2
+
+        ret
+endfunc
+.endm
+
+        h264_qpel_v_lowpass put
+        h264_qpel_v_lowpass avg
+
+.macro  h264_qpel_v_lowpass_l2 type
+function \type\()_h264_qpel16_v_lowpass_l2_neon
+        mov             x4,  x30
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+        sub             x1,  x1,  x3, lsl #2
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+        sub             x0,  x0,  x3, lsl #4
+        sub             x12, x12, x2, lsl #4
+        add             x0,  x0,  #8
+        add             x12, x12, #8
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #8
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_l2_neon
+        ld1             {v16.8B}, [x1], x3
+        ld1             {v18.8B}, [x1], x3
+        ld1             {v20.8B}, [x1], x3
+        ld1             {v22.8B}, [x1], x3
+        ld1             {v24.8B}, [x1], x3
+        ld1             {v26.8B}, [x1], x3
+        ld1             {v28.8B}, [x1], x3
+        ld1             {v30.8B}, [x1], x3
+        ld1             {v17.8B}, [x1], x3
+        ld1             {v19.8B}, [x1], x3
+        ld1             {v21.8B}, [x1], x3
+        ld1             {v23.8B}, [x1], x3
+        ld1             {v25.8B}, [x1]
+
+        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
+        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
+        lowpass_8       v16, v17, v18, v19, v16, v17
+        lowpass_8       v20, v21, v22, v23, v18, v19
+        lowpass_8       v24, v25, v26, v27, v20, v21
+        lowpass_8       v28, v29, v30, v31, v22, v23
+        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+
+        ld1             {v24.8B},  [x12], x2
+        ld1             {v25.8B},  [x12], x2
+        ld1             {v26.8B},  [x12], x2
+        ld1             {v27.8B},  [x12], x2
+        ld1             {v28.8B},  [x12], x2
+        urhadd          v16.8B, v24.8B, v16.8B
+        urhadd          v17.8B, v25.8B, v17.8B
+        ld1             {v29.8B},  [x12], x2
+        urhadd          v18.8B, v26.8B, v18.8B
+        urhadd          v19.8B, v27.8B, v19.8B
+        ld1             {v30.8B}, [x12], x2
+        urhadd          v20.8B, v28.8B, v20.8B
+        urhadd          v21.8B, v29.8B, v21.8B
+        ld1             {v31.8B}, [x12], x2
+        urhadd          v22.8B, v30.8B, v22.8B
+        urhadd          v23.8B, v31.8B, v23.8B
+
+  .ifc \type,avg
+        ld1             {v24.8B}, [x0], x3
+        urhadd          v16.8B, v16.8B, v24.8B
+        ld1             {v25.8B}, [x0], x3
+        urhadd          v17.8B, v17.8B, v25.8B
+        ld1             {v26.8B}, [x0], x3
+        urhadd          v18.8B, v18.8B, v26.8B
+        ld1             {v27.8B}, [x0], x3
+        urhadd          v19.8B, v19.8B, v27.8B
+        ld1             {v28.8B}, [x0], x3
+        urhadd          v20.8B, v20.8B, v28.8B
+        ld1             {v29.8B}, [x0], x3
+        urhadd          v21.8B, v21.8B, v29.8B
+        ld1             {v30.8B}, [x0], x3
+        urhadd          v22.8B, v22.8B, v30.8B
+        ld1             {v31.8B}, [x0], x3
+        urhadd          v23.8B, v23.8B, v31.8B
+        sub             x0,  x0,  x3,  lsl #3
+  .endif
+
+        st1             {v16.8B}, [x0], x3
+        st1             {v17.8B}, [x0], x3
+        st1             {v18.8B}, [x0], x3
+        st1             {v19.8B}, [x0], x3
+        st1             {v20.8B}, [x0], x3
+        st1             {v21.8B}, [x0], x3
+        st1             {v22.8B}, [x0], x3
+        st1             {v23.8B}, [x0], x3
+
+        ret
+endfunc
+.endm
+
+        h264_qpel_v_lowpass_l2 put
+        h264_qpel_v_lowpass_l2 avg
+
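+// 2D lowpass of an 8x8 block: 16-bit horizontal pass over 13 input
+// rows, transpose, vertical pass, transpose back; result bytes land
+// in v16-v23.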
+function put_h264_qpel8_hv_lowpass_neon_top
+        lowpass_const   w12
+        ld1             {v16.8H}, [x1], x3
+        ld1             {v17.8H}, [x1], x3
+        ld1             {v18.8H}, [x1], x3
+        ld1             {v19.8H}, [x1], x3
+        ld1             {v20.8H}, [x1], x3
+        ld1             {v21.8H}, [x1], x3
+        ld1             {v22.8H}, [x1], x3
+        ld1             {v23.8H}, [x1], x3
+        ld1             {v24.8H}, [x1], x3
+        ld1             {v25.8H}, [x1], x3
+        ld1             {v26.8H}, [x1], x3
+        ld1             {v27.8H}, [x1], x3
+        ld1             {v28.8H}, [x1]
+        lowpass_8H      v16, v17
+        lowpass_8H      v18, v19
+        lowpass_8H      v20, v21
+        lowpass_8H      v22, v23
+        lowpass_8H      v24, v25
+        lowpass_8H      v26, v27
+        lowpass_8H      v28, v29
+
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1
+
+        lowpass_8.16    v16, v24, v16
+        lowpass_8.16    v17, v25, v17
+
+        lowpass_8.16    v18, v26, v18
+        lowpass_8.16    v19, v27, v19
+
+        lowpass_8.16    v20, v28, v20
+        lowpass_8.16    v21, v29, v21
+
+        lowpass_8.16    v22, v30, v22
+        lowpass_8.16    v23, v31, v23
+
+        transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+
+        ret
+endfunc
+
+.macro  h264_qpel8_hv_lowpass type
+function \type\()_h264_qpel8_hv_lowpass_neon
+        mov             x10, x30
+        bl              put_h264_qpel8_hv_lowpass_neon_top
+  .ifc \type,avg
+        ld1             {v0.8B},      [x0], x2
+        urhadd          v16.8B, v16.8B, v0.8B
+        ld1             {v1.8B},      [x0], x2
+        urhadd          v17.8B, v17.8B, v1.8B
+        ld1             {v2.8B},      [x0], x2
+        urhadd          v18.8B, v18.8B, v2.8B
+        ld1             {v3.8B},      [x0], x2
+        urhadd          v19.8B, v19.8B, v3.8B
+        ld1             {v4.8B},      [x0], x2
+        urhadd          v20.8B, v20.8B, v4.8B
+        ld1             {v5.8B},      [x0], x2
+        urhadd          v21.8B, v21.8B, v5.8B
+        ld1             {v6.8B},      [x0], x2
+        urhadd          v22.8B, v22.8B, v6.8B
+        ld1             {v7.8B},      [x0], x2
+        urhadd          v23.8B, v23.8B, v7.8B
+        sub             x0,  x0,  x2,  lsl #3
+  .endif
+
+        st1             {v16.8B},     [x0], x2
+        st1             {v17.8B},     [x0], x2
+        st1             {v18.8B},     [x0], x2
+        st1             {v19.8B},     [x0], x2
+        st1             {v20.8B},     [x0], x2
+        st1             {v21.8B},     [x0], x2
+        st1             {v22.8B},     [x0], x2
+        st1             {v23.8B},     [x0], x2
+
+        ret             x10
+endfunc
+.endm
+
+        h264_qpel8_hv_lowpass put
+        h264_qpel8_hv_lowpass avg
+
+.macro  h264_qpel8_hv_lowpass_l2 type
+function \type\()_h264_qpel8_hv_lowpass_l2_neon
+        mov             x10, x30
+        bl              put_h264_qpel8_hv_lowpass_neon_top
+
+        ld1             {v0.8B, v1.8B},  [x2], #16
+        ld1             {v2.8B, v3.8B},  [x2], #16
+        urhadd          v0.8B,  v0.8B,  v16.8B
+        urhadd          v1.8B,  v1.8B,  v17.8B
+        ld1             {v4.8B, v5.8B},  [x2], #16
+        urhadd          v2.8B,  v2.8B,  v18.8B
+        urhadd          v3.8B,  v3.8B,  v19.8B
+        ld1             {v6.8B, v7.8B},  [x2], #16
+        urhadd          v4.8B,  v4.8B,  v20.8B
+        urhadd          v5.8B,  v5.8B,  v21.8B
+        urhadd          v6.8B,  v6.8B,  v22.8B
+        urhadd          v7.8B,  v7.8B,  v23.8B
+  .ifc \type,avg
+        ld1             {v16.8B},     [x0], x3
+        urhadd          v0.8B,  v0.8B,  v16.8B
+        ld1             {v17.8B},     [x0], x3
+        urhadd          v1.8B,  v1.8B,  v17.8B
+        ld1             {v18.8B},     [x0], x3
+        urhadd          v2.8B,  v2.8B,  v18.8B
+        ld1             {v19.8B},     [x0], x3
+        urhadd          v3.8B,  v3.8B,  v19.8B
+        ld1             {v20.8B},     [x0], x3
+        urhadd          v4.8B,  v4.8B,  v20.8B
+        ld1             {v21.8B},     [x0], x3
+        urhadd          v5.8B,  v5.8B,  v21.8B
+        ld1             {v22.8B},     [x0], x3
+        urhadd          v6.8B,  v6.8B,  v22.8B
+        ld1             {v23.8B},     [x0], x3
+        urhadd          v7.8B,  v7.8B,  v23.8B
+        sub             x0,  x0,  x3,  lsl #3
+  .endif
+        st1             {v0.8B},      [x0], x3
+        st1             {v1.8B},      [x0], x3
+        st1             {v2.8B},      [x0], x3
+        st1             {v3.8B},      [x0], x3
+        st1             {v4.8B},      [x0], x3
+        st1             {v5.8B},      [x0], x3
+        st1             {v6.8B},      [x0], x3
+        st1             {v7.8B},      [x0], x3
+
+        ret             x10
+endfunc
+.endm
+
+        h264_qpel8_hv_lowpass_l2 put
+        h264_qpel8_hv_lowpass_l2 avg
+
+.macro  h264_qpel16_hv  type
+function \type\()_h264_qpel16_hv_lowpass_neon
+        mov             x13, x30
+        bl              \type\()_h264_qpel8_hv_lowpass_neon
+        sub             x1,  x1,  x3, lsl #2
+        bl              \type\()_h264_qpel8_hv_lowpass_neon
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #8
+        sub             x0,  x0,  x2, lsl #4
+        add             x0,  x0,  #8
+        bl              \type\()_h264_qpel8_hv_lowpass_neon
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x13
+        b               \type\()_h264_qpel8_hv_lowpass_neon
+endfunc
+
+function \type\()_h264_qpel16_hv_lowpass_l2_neon
+        mov             x13, x30
+        sub             x2,  x4,  #256
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
+        sub             x1,  x1,  x3, lsl #2
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #8
+        sub             x0,  x0,  x3, lsl #4
+        add             x0,  x0,  #8
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x13
+        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
+endfunc
+.endm
+
+        h264_qpel16_hv put
+        h264_qpel16_hv avg
+
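+// Public mcXY entry points: X/Y give the quarter-pel offset; each one
+// adjusts the src/dst pointers and branches to the shared lowpass code.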
+.macro  h264_qpel8      type
+function ff_\type\()_h264_qpel8_mc10_neon, export=1
+        lowpass_const   w3
+        mov             x3,  x1
+        sub             x1,  x1,  #2
+        mov             x12, #8
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel8_mc20_neon, export=1
+        lowpass_const   w3
+        sub             x1,  x1,  #2
+        mov             x3,  x2
+        mov             x12, #8
+        b               \type\()_h264_qpel8_h_lowpass_neon
+endfunc
+
+function ff_\type\()_h264_qpel8_mc30_neon, export=1
+        lowpass_const   w3
+        add             x3,  x1,  #1
+        sub             x1,  x1,  #2
+        mov             x12, #8
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel8_mc01_neon, export=1
+        mov             x14, x30
+        mov             x12, x1
+\type\()_h264_qpel8_mc01:
+        lowpass_const   w3
+        mov             x3,  x2
+        sub             x1,  x1,  x2, lsl #1
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc11_neon, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+\type\()_h264_qpel8_mc11:
+        lowpass_const   w3
+        mov             x11, sp
+        sub             sp,  sp,  #64
+        mov             x0,  sp
+        sub             x1,  x1,  #2
+        mov             x3,  #8
+        mov             x12, #8
+        bl              put_h264_qpel8_h_lowpass_neon
+        mov             x0,  x8
+        mov             x3,  x2
+        mov             x12, sp
+        sub             x1,  x9,  x2, lsl #1
+        mov             x2,  #8
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc21_neon, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+\type\()_h264_qpel8_mc21:
+        lowpass_const   w3
+        mov             x11, sp
+        sub             sp,  sp,  #(8*8+16*12)
+        sub             x1,  x1,  #2
+        mov             x3,  #8
+        mov             x0,  sp
+        mov             x12, #8
+        bl              put_h264_qpel8_h_lowpass_neon
+        mov             x4,  x0
+        mov             x0,  x8
+        sub             x1,  x9,  x2, lsl #1
+        sub             x1,  x1,  #2
+        mov             x3,  x2
+        sub             x2,  x4,  #64
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc31_neon, export=1
+        add             x1,  x1,  #1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        sub             x1,  x1,  #1
+        b               \type\()_h264_qpel8_mc11
+endfunc
+
+function ff_\type\()_h264_qpel8_mc02_neon, export=1
+        mov             x14, x30
+        lowpass_const   w3
+        sub             x1,  x1,  x2, lsl #1
+        mov             x3,  x2
+        bl              \type\()_h264_qpel8_v_lowpass_neon
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc12_neon, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+\type\()_h264_qpel8_mc12:
+        lowpass_const   w3
+        mov             x11, sp
+        sub             sp,  sp,  #(8*8+16*12)
+        sub             x1,  x1,  x2, lsl #1
+        mov             x3,  x2
+        mov             x2,  #8
+        mov             x0,  sp
+        bl              put_h264_qpel8_v_lowpass_neon
+        mov             x4,  x0
+        mov             x0,  x8
+        sub             x1,  x9,  x3, lsl #1
+        sub             x1,  x1,  #2
+        sub             x2,  x4,  #64
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc22_neon, export=1
+        mov             x14, x30
+        mov             x11, sp
+        sub             x1,  x1,  x2, lsl #1
+        sub             x1,  x1,  #2
+        mov             x3,  x2
+        bl              \type\()_h264_qpel8_hv_lowpass_neon
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc32_neon, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  #1
+        b               \type\()_h264_qpel8_mc12
+endfunc
+
+function ff_\type\()_h264_qpel8_mc03_neon, export=1
+        mov             x14, x30
+        add             x12, x1,  x2
+        b               \type\()_h264_qpel8_mc01
+endfunc
+
+function ff_\type\()_h264_qpel8_mc13_neon, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        b               \type\()_h264_qpel8_mc11
+endfunc
+
+function ff_\type\()_h264_qpel8_mc23_neon, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        b               \type\()_h264_qpel8_mc21
+endfunc
+
+function ff_\type\()_h264_qpel8_mc33_neon, export=1
+        add             x1,  x1,  #1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        sub             x1,  x1,  #1
+        b               \type\()_h264_qpel8_mc11
+endfunc
+.endm
+
+        h264_qpel8 put
+        h264_qpel8 avg
+
+.macro  h264_qpel16     type
+function ff_\type\()_h264_qpel16_mc10_neon, export=1
+        lowpass_const   w3
+        mov             x3,  x1
+        sub             x1,  x1,  #2
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel16_mc20_neon, export=1
+        lowpass_const   w3
+        sub             x1,  x1,  #2
+        mov             x3,  x2
+        b               \type\()_h264_qpel16_h_lowpass_neon
+endfunc
+
+function ff_\type\()_h264_qpel16_mc30_neon, export=1
+        lowpass_const   w3
+        add             x3,  x1,  #1
+        sub             x1,  x1,  #2
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel16_mc01_neon, export=1
+        mov             x14, x30
+        mov             x12, x1
+\type\()_h264_qpel16_mc01:
+        lowpass_const   w3
+        mov             x3,  x2
+        sub             x1,  x1,  x2, lsl #1
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc11_neon, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+\type\()_h264_qpel16_mc11:
+        lowpass_const   w3
+        mov             x11, sp
+        sub             sp,  sp,  #256
+        mov             x0,  sp
+        sub             x1,  x1,  #2
+        mov             x3,  #16
+        bl              put_h264_qpel16_h_lowpass_neon
+        mov             x0,  x8
+        mov             x3,  x2
+        mov             x12, sp
+        sub             x1,  x9,  x2, lsl #1
+        mov             x2,  #16
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc21_neon, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+\type\()_h264_qpel16_mc21:
+        lowpass_const   w3
+        mov             x11, sp
+        sub             sp,  sp,  #(16*16+16*12)
+        sub             x1,  x1,  #2
+        mov             x0,  sp
+        bl              put_h264_qpel16_h_lowpass_neon_packed
+        mov             x4,  x0
+        mov             x0,  x8
+        sub             x1,  x9,  x2, lsl #1
+        sub             x1,  x1,  #2
+        mov             x3,  x2
+        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc31_neon, export=1
+        add             x1,  x1,  #1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        sub             x1,  x1,  #1
+        b               \type\()_h264_qpel16_mc11
+endfunc
+
+function ff_\type\()_h264_qpel16_mc02_neon, export=1
+        mov             x14, x30
+        lowpass_const   w3
+        sub             x1,  x1,  x2, lsl #1
+        mov             x3,  x2
+        bl              \type\()_h264_qpel16_v_lowpass_neon
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc12_neon, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+\type\()_h264_qpel16_mc12:
+        lowpass_const   w3
+        mov             x11, sp
+        sub             sp,  sp,  #(16*16+16*12)
+        sub             x1,  x1,  x2, lsl #1
+        mov             x0,  sp
+        mov             x3,  x2
+        bl              put_h264_qpel16_v_lowpass_neon_packed
+        mov             x4,  x0
+        mov             x0,  x8
+        sub             x1,  x9,  x3, lsl #1
+        sub             x1,  x1,  #2
+        mov             x2,  x3
+        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc22_neon, export=1
+        mov             x14, x30
+        lowpass_const   w3
+        mov             x11, sp
+        sub             x1,  x1,  x2, lsl #1
+        sub             x1,  x1,  #2
+        mov             x3,  x2
+        bl              \type\()_h264_qpel16_hv_lowpass_neon
+        mov             sp,  x11 // restore stack
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc32_neon, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  #1
+        b               \type\()_h264_qpel16_mc12
+endfunc
+
+function ff_\type\()_h264_qpel16_mc03_neon, export=1
+        mov             x14, x30
+        add             x12, x1,  x2
+        b               \type\()_h264_qpel16_mc01
+endfunc
+
+function ff_\type\()_h264_qpel16_mc13_neon, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        b               \type\()_h264_qpel16_mc11
+endfunc
+
+function ff_\type\()_h264_qpel16_mc23_neon, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        b               \type\()_h264_qpel16_mc21
+endfunc
+
+function ff_\type\()_h264_qpel16_mc33_neon, export=1
+        add             x1,  x1,  #1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        sub             x1,  x1,  #1
+        b               \type\()_h264_qpel16_mc11
+endfunc
+.endm
+
+        h264_qpel16 put
+        h264_qpel16 avg
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 3af9bcd..c4310d7 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -16,6 +16,70 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
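+// Transpose an 8x8 block of bytes held one row per register in r0-r7;
+// r8 and r9 are used as scratch.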
+.macro  transpose_8x8B  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
+        trn1            \r8\().8B,  \r0\().8B,  \r1\().8B
+        trn2            \r9\().8B,  \r0\().8B,  \r1\().8B
+        trn1            \r1\().8B,  \r2\().8B,  \r3\().8B
+        trn2            \r3\().8B,  \r2\().8B,  \r3\().8B
+        trn1            \r0\().8B,  \r4\().8B,  \r5\().8B
+        trn2            \r5\().8B,  \r4\().8B,  \r5\().8B
+        trn1            \r2\().8B,  \r6\().8B,  \r7\().8B
+        trn2            \r7\().8B,  \r6\().8B,  \r7\().8B
+
+        trn1            \r4\().4H,  \r0\().4H,  \r2\().4H
+        trn2            \r2\().4H,  \r0\().4H,  \r2\().4H
+        trn1            \r6\().4H,  \r5\().4H,  \r7\().4H
+        trn2            \r7\().4H,  \r5\().4H,  \r7\().4H
+        trn1            \r5\().4H,  \r9\().4H,  \r3\().4H
+        trn2            \r9\().4H,  \r9\().4H,  \r3\().4H
+        trn1            \r3\().4H,  \r8\().4H,  \r1\().4H
+        trn2            \r8\().4H,  \r8\().4H,  \r1\().4H
+
+        trn1            \r0\().2S,  \r3\().2S,  \r4\().2S
+        trn2            \r4\().2S,  \r3\().2S,  \r4\().2S
+
+        trn1            \r1\().2S,  \r5\().2S,  \r6\().2S
+        trn2            \r5\().2S,  \r5\().2S,  \r6\().2S
+
+        trn2            \r6\().2S,  \r8\().2S,  \r2\().2S
+        trn1            \r2\().2S,  \r8\().2S,  \r2\().2S
+
+        trn1            \r3\().2S,  \r9\().2S,  \r7\().2S
+        trn2            \r7\().2S,  \r9\().2S,  \r7\().2S
+.endm
+
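+// Transpose the two 8x8 halves of eight 16-byte rows independently
+// (i.e. two side-by-side 8x8 byte transposes); t0 and t1 are scratch.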
+.macro  transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+        trn1            \t0\().16B, \r0\().16B, \r1\().16B
+        trn2            \t1\().16B, \r0\().16B, \r1\().16B
+        trn1            \r1\().16B, \r2\().16B, \r3\().16B
+        trn2            \r3\().16B, \r2\().16B, \r3\().16B
+        trn1            \r0\().16B, \r4\().16B, \r5\().16B
+        trn2            \r5\().16B, \r4\().16B, \r5\().16B
+        trn1            \r2\().16B, \r6\().16B, \r7\().16B
+        trn2            \r7\().16B, \r6\().16B, \r7\().16B
+
+        trn1            \r4\().8H,  \r0\().8H,  \r2\().8H
+        trn2            \r2\().8H,  \r0\().8H,  \r2\().8H
+        trn1            \r6\().8H,  \r5\().8H,  \r7\().8H
+        trn2            \r7\().8H,  \r5\().8H,  \r7\().8H
+        trn1            \r5\().8H,  \t1\().8H,  \r3\().8H
+        trn2            \t1\().8H,  \t1\().8H,  \r3\().8H
+        trn1            \r3\().8H,  \t0\().8H,  \r1\().8H
+        trn2            \t0\().8H,  \t0\().8H,  \r1\().8H
+
+        trn1            \r0\().4S,  \r3\().4S,  \r4\().4S
+        trn2            \r4\().4S,  \r3\().4S,  \r4\().4S
+
+        trn1            \r1\().4S,  \r5\().4S,  \r6\().4S
+        trn2            \r5\().4S,  \r5\().4S,  \r6\().4S
+
+        trn2            \r6\().4S,  \t0\().4S,  \r2\().4S
+        trn1            \r2\().4S,  \t0\().4S,  \r2\().4S
+
+        trn1            \r3\().4S,  \t1\().4S,  \r7\().4S
+        trn2            \r7\().4S,  \t1\().4S,  \r7\().4S
+.endm
+
 .macro  transpose_4x4H  r0, r1, r2, r3, r4, r5, r6, r7
         trn1            \r4\().4H,  \r0\().4H,  \r1\().4H
         trn2            \r5\().4H,  \r0\().4H,  \r1\().4H
diff --git a/libavcodec/h264qpel.c b/libavcodec/h264qpel.c
index 24c9299..ec46da2 100644
--- a/libavcodec/h264qpel.c
+++ b/libavcodec/h264qpel.c
@@ -78,6 +78,8 @@ av_cold void ff_h264qpel_init(H264QpelContext *c, int bit_depth)
         break;
     }
 
+    if (ARCH_AARCH64)
+        ff_h264qpel_init_aarch64(c, bit_depth);
     if (ARCH_ARM)
         ff_h264qpel_init_arm(c, bit_depth);
     if (ARCH_PPC)
diff --git a/libavcodec/h264qpel.h b/libavcodec/h264qpel.h
index d761775..202e97d 100644
--- a/libavcodec/h264qpel.h
+++ b/libavcodec/h264qpel.h
@@ -31,6 +31,7 @@ typedef struct H264QpelContext {
 
 void ff_h264qpel_init(H264QpelContext *c, int bit_depth);
 
+void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_ppc(H264QpelContext *c, int bit_depth);
 void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth);


