[FFmpeg-devel] [PATCH] lavc/aarch64: h264qpel, add 10-bit lowpass_8 based functions

Mikhail Nitenko mnitenko at gmail.com
Thu Aug 19 23:53:18 EEST 2021


Benchmarks:                        A53     A72
avg_h264_qpel_8_mc01_10_c:        932.7   638.5
avg_h264_qpel_8_mc01_10_neon:     397.7   212.2
avg_h264_qpel_8_mc02_10_c:        946.2   691.2
avg_h264_qpel_8_mc02_10_neon:     365.0   199.0
avg_h264_qpel_8_mc03_10_c:        932.7   639.5
avg_h264_qpel_8_mc03_10_neon:     399.2   214.0
avg_h264_qpel_8_mc10_10_c:       1441.7   810.2
avg_h264_qpel_8_mc10_10_neon:     341.7   156.0
avg_h264_qpel_8_mc11_10_c:       2158.0  1330.0
avg_h264_qpel_8_mc11_10_neon:     671.0   343.5
avg_h264_qpel_8_mc13_10_c:       2163.7  1327.7
avg_h264_qpel_8_mc13_10_neon:     673.0   335.0
avg_h264_qpel_8_mc20_10_c:       1434.0   769.5
avg_h264_qpel_8_mc20_10_neon:     309.7   140.5
avg_h264_qpel_8_mc30_10_c:       1448.2   802.0
avg_h264_qpel_8_mc30_10_neon:     357.7   156.7
avg_h264_qpel_8_mc31_10_c:       2188.5  1329.2
avg_h264_qpel_8_mc31_10_neon:     699.0   346.2
avg_h264_qpel_8_mc33_10_c:       2192.2  1337.5
avg_h264_qpel_8_mc33_10_neon:     700.0   349.0
avg_h264_qpel_16_mc01_10_c:      3768.5  2583.5
avg_h264_qpel_16_mc01_10_neon:   1572.5   854.5
avg_h264_qpel_16_mc02_10_c:      3783.0  2736.2
avg_h264_qpel_16_mc02_10_neon:   1442.7   796.7
avg_h264_qpel_16_mc03_10_c:      3789.5  2572.5
avg_h264_qpel_16_mc03_10_neon:   1574.0   854.2
avg_h264_qpel_16_mc10_10_c:      5879.0  3276.0
avg_h264_qpel_16_mc10_10_neon:   1331.5   611.0
avg_h264_qpel_16_mc11_10_c:      8711.7  5344.0
avg_h264_qpel_16_mc11_10_neon:   2634.0  1349.0
avg_h264_qpel_16_mc13_10_c:      8645.0  5309.2
avg_h264_qpel_16_mc13_10_neon:   2630.7  1356.5
avg_h264_qpel_16_mc20_10_c:      5722.5  3111.0
avg_h264_qpel_16_mc20_10_neon:   1203.5   561.0
avg_h264_qpel_16_mc30_10_c:      5926.0  3252.0
avg_h264_qpel_16_mc30_10_neon:   1395.5   613.5
avg_h264_qpel_16_mc31_10_c:      8722.2  5310.2
avg_h264_qpel_16_mc31_10_neon:   2739.7  1382.2
avg_h264_qpel_16_mc33_10_c:      8754.7  5312.7
avg_h264_qpel_16_mc33_10_neon:   2735.7  1402.7
put_h264_qpel_8_mc01_10_c:        854.7   589.0
put_h264_qpel_8_mc01_10_neon:     356.7   196.2
put_h264_qpel_8_mc02_10_c:        780.0   548.5
put_h264_qpel_8_mc02_10_neon:     324.0   181.2
put_h264_qpel_8_mc03_10_c:        854.7   591.7
put_h264_qpel_8_mc03_10_neon:     358.2   199.0
put_h264_qpel_8_mc10_10_c:       1364.7   754.2
put_h264_qpel_8_mc10_10_neon:     305.7   140.7
put_h264_qpel_8_mc11_10_c:       2079.0  1282.2
put_h264_qpel_8_mc11_10_neon:     630.0   328.2
put_h264_qpel_8_mc13_10_c:       2078.5  1279.0
put_h264_qpel_8_mc13_10_neon:     632.0   322.5
put_h264_qpel_8_mc20_10_c:       1221.5   683.7
put_h264_qpel_8_mc20_10_neon:     273.7   125.0
put_h264_qpel_8_mc30_10_c:       1377.2   758.0
put_h264_qpel_8_mc30_10_neon:     326.7   141.5
put_h264_qpel_8_mc31_10_c:       2107.0  1278.5
put_h264_qpel_8_mc31_10_neon:     658.0   331.2
put_h264_qpel_8_mc33_10_c:       2107.0  1285.0
put_h264_qpel_8_mc33_10_neon:     659.0   332.0
put_h264_qpel_16_mc01_10_c:      3529.7  2412.5
put_h264_qpel_16_mc01_10_neon:   1408.5   786.5
put_h264_qpel_16_mc02_10_c:      3151.5  2121.0
put_h264_qpel_16_mc02_10_neon:   1278.7   725.5
put_h264_qpel_16_mc03_10_c:      3546.5  2375.5
put_h264_qpel_16_mc03_10_neon:   1410.0   787.7
put_h264_qpel_16_mc10_10_c:      5511.5  2999.0
put_h264_qpel_16_mc10_10_neon:   1187.5   558.2
put_h264_qpel_16_mc11_10_c:      8424.2  5137.7
put_h264_qpel_16_mc11_10_neon:   2465.0  1277.7
put_h264_qpel_16_mc13_10_c:      8597.2  5127.7
put_h264_qpel_16_mc13_10_neon:   2466.7  1290.5
put_h264_qpel_16_mc20_10_c:      4894.5  2745.7
put_h264_qpel_16_mc20_10_neon:   1059.5   494.2
put_h264_qpel_16_mc30_10_c:      5576.5  3035.0
put_h264_qpel_16_mc30_10_neon:   1251.5   558.2
put_h264_qpel_16_mc31_10_c:      8695.5  5150.5
put_h264_qpel_16_mc31_10_neon:   2570.7  1320.5
put_h264_qpel_16_mc33_10_c:      8702.5  5131.2
put_h264_qpel_16_mc33_10_neon:   2571.7  1337.0
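
The new lowpass_8_10 macro is the 10-bit counterpart of the existing
lowpass_8: it applies the H.264 6-tap half-sample filter
(1, -5, 20, 20, -5, 1) to rows of 16-bit samples, widening to 32 bits
for the accumulation, then narrows with rounding (>> 5) and clips the
result to the 10-bit range [0, 1023]. A rough scalar model of one
output sample (illustrative helper only, not part of the patch):

    #include <stdint.h>

    /* src points at sample x - 2 of a row of 10-bit samples */
    static uint16_t lowpass_one_10(const uint16_t *src)
    {
        int sum = src[0] - 5 * src[1] + 20 * src[2]
                + 20 * src[3] - 5 * src[4] + src[5];
        sum = (sum + 16) >> 5;   /* sqrshrun #5: rounded narrowing shift */
        if (sum < 0)
            return 0;            /* unsigned saturation in sqrshrun */
        if (sum > 1023)
            return 1023;         /* smin with 0x03FF built by mvni */
        return (uint16_t)sum;
    }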

Signed-off-by: Mikhail Nitenko <mnitenko@gmail.com>
---
 libavcodec/aarch64/h264qpel_init_aarch64.c |  91 +++-
 libavcodec/aarch64/h264qpel_neon.S         | 515 +++++++++++++++++++++
 2 files changed, 604 insertions(+), 2 deletions(-)
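
For reviewers: the put/avg_h264_qpel_pixels_tab slot of an mcXY
function encodes its quarter-pel offsets as x + 4 * y; the positions
not implemented here (mc00 and mc21/mc12/mc22/mc32/mc23) simply keep
the C fallback. A small illustrative helper (not part of the patch):

    /* mcXY -> table slot, e.g. mc31 -> 3 + 4*1 = 7 */
    static inline int qpel_tab_index(int x, int y)
    {
        return x + 4 * y;
    }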

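The mc20/mc02 cases run the horizontal or vertical lowpass directly;
the _l2 variants average the lowpass output with a second reference,
either the neighbouring full-pel samples (mc10/mc30, mc01/mc03) or,
for the diagonal cases (mc11/mc31/mc13/mc33), a temporary
horizontal-lowpass result kept on the stack:

     8 rows x  8 samples x 2 bytes = 128 bytes  (sub sp, sp, #128)
    16 rows x 16 samples x 2 bytes = 512 bytes  (sub sp, sp, #512)
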
diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c
index 77f41d9a21..93fa5246c4 100644
--- a/libavcodec/aarch64/h264qpel_init_aarch64.c
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
@@ -95,12 +95,55 @@ void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t str
 void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 
+void ff_put_h264_qpel16_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_put_h264_qpel8_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_avg_h264_qpel16_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_avg_h264_qpel8_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
 av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
 {
-    const int high_bit_depth = bit_depth > 8;
     int cpu_flags = av_get_cpu_flags();
 
-    if (have_neon(cpu_flags) && !high_bit_depth) {
+    if (have_neon(cpu_flags) && bit_depth <= 8) {
         c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
         c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
         c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
@@ -168,5 +211,49 @@ av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
         c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
         c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
         c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
+    } else if (have_neon(cpu_flags) && bit_depth == 10) {
+        c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon_10;
+        c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon_10;
+        c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon_10;
+        c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon_10;
+
+        c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon_10;
+        c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon_10;
+        c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon_10;
+        c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon_10;
+
+        c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon_10;
+
+        c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon_10;
     }
 }
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index d27cfac494..eb18469b7f 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -932,3 +932,518 @@ endfunc
 
         h264_qpel16 put
         h264_qpel16 avg
+
+//trashes v0-v5
+.macro  lowpass_8_10    r0,  r1,  r2,  r3,  d0,  d1
+        ext             v2.16B,     \r0\().16B,  \r1\().16B, #4
+        ext             v3.16B,     \r0\().16B,  \r1\().16B, #6
+        add             v2.8H,      v2.8H,       v3.8H
+        ext             v4.16B,     \r0\().16B,  \r1\().16B, #2
+        ext             v5.16B,     \r0\().16B,  \r1\().16B, #8
+        add             v4.8H,      v4.8H,       v5.8H
+        ext             v1.16B,     \r0\().16B,  \r1\().16B, #10
+        uaddl2          \d1\().4S,  \r0\().8H,   v1.8H
+        uaddl           \d0\().4S,  \r0\().4H,   v1.4H
+        ext             v0.16B,     \r2\().16B,  \r3\().16B, #4
+        umlal           \d0\().4S,  v2.4H,       v6.H[1]
+        umlal2          \d1\().4S,  v2.8H,       v6.H[1]
+        ext             v1.16B,     \r2\().16B,  \r3\().16B, #6
+        add             v0.8H,      v0.8H,       v1.8H
+        ext             v1.16B,     \r2\().16B,  \r3\().16B, #2
+        umlsl           \d0\().4S,  v4.4H,       v6.H[0]
+        umlsl2          \d1\().4S,  v4.8H,       v6.H[0]
+        sqrshrun        \d0\().4H,  \d0\().4S,   #5
+        sqrshrun2       \d0\().8H,  \d1\().4S,   #5
+        ext             v3.16B,     \r2\().16B,  \r3\().16B, #8
+        add             v1.8H,      v1.8H,       v3.8H
+        ext             v2.16B,     \r2\().16B,  \r3\().16B, #10
+        uaddl           v3.4S,      \r2\().4H,   v2.4H
+        uaddl2          v4.4S,      \r2\().8H,   v2.8H
+        umlal           v3.4S,      v0.4H,       v6.H[1]
+        umlal2          v4.4S,      v0.8H,       v6.H[1]
+        umlsl           v3.4S,      v1.4H,       v6.H[0]
+        umlsl2          v4.4S,      v1.8H,       v6.H[0]
+        mvni            v5.8H,      #0xFC,       lsl #8 // 1023 for clipping
+        sqrshrun        \d1\().4H,  v3.4S,       #5
+        sqrshrun2       \d1\().8H,  v4.4S,       #5
+        smin            \d0\().8H,  \d0\().8H,   v5.8H
+        smin            \d1\().8H,  \d1\().8H,   v5.8H
+.endm
+
+function put_h264_qpel16_h_lowpass_neon_packed_10
+        mov             x4,  x30
+        mov             x12, #32
+        mov             x3,  #16
+        bl              put_h264_qpel8_h_lowpass_neon_10
+        sub             x1,  x1,  x2, lsl #4
+        add             x1,  x1,  #16
+        mov             x12, #32
+        mov             x30, x4
+        b               put_h264_qpel8_h_lowpass_neon_10
+endfunc
+
+.macro  h264_qpel_h_lowpass_10 type
+function \type\()_h264_qpel16_h_lowpass_neon_10
+        mov             x13, x30
+        mov             x12, #32
+        bl              \type\()_h264_qpel8_h_lowpass_neon_10
+        sub             x0,  x0,  x3, lsl #4
+        sub             x1,  x1,  x2, lsl #4
+        add             x0,  x0,  #16
+        add             x1,  x1,  #16
+        mov             x12, #32
+        mov             x30, x13
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_neon_10
+1:      ld1             {v28.8H, v29.8H}, [x1], x2
+        ld1             {v16.8H, v17.8H}, [x1], x2
+        subs            x12, x12, #4
+        lowpass_8_10    v28, v29, v16, v17, v28, v20
+  .ifc \type,avg
+        ld1             {v2.8H},    [x0], x3
+        urhadd          v28.8H, v28.8H,  v2.8H
+        ld1             {v3.8H},    [x0]
+        urhadd          v20.8H, v20.8H, v3.8H
+        sub             x0,  x0,  x3
+  .endif
+        st1             {v28.8H},    [x0], x3
+        st1             {v20.8H},    [x0], x3
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+        h264_qpel_h_lowpass_10 put
+        h264_qpel_h_lowpass_10 avg
+
+.macro h264_qpel_h_lowpass_l2_10 type
+function \type\()_h264_qpel16_h_lowpass_l2_neon_10
+        mov             x13, x30
+        mov             x12, #32
+        bl              \type\()_h264_qpel8_h_lowpass_l2_neon_10
+        sub             x0,  x0,  x2, lsl #4
+        sub             x1,  x1,  x2, lsl #4
+        sub             x3,  x3,  x2, lsl #4
+        add             x0,  x0,  #16
+        add             x1,  x1,  #16
+        add             x3,  x3,  #16
+        mov             x12, #32
+        mov             x30, x13
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_l2_neon_10
+1:      ld1             {v26.8H, v27.8H}, [x1], x2
+        ld1             {v16.8H, v17.8H}, [x1], x2
+        ld1             {v28.8H},     [x3], x2
+        ld1             {v29.8H},     [x3], x2
+        subs            x12, x12, #4
+        lowpass_8_10    v26, v27, v16, v17, v26, v27
+        urhadd          v26.8H, v26.8H, v28.8H
+        urhadd          v27.8H, v27.8H, v29.8H
+  .ifc \type,avg
+        ld1             {v2.8H},      [x0], x2
+        urhadd          v26.8H, v26.8H, v2.8H
+        ld1             {v3.8H},      [x0]
+        urhadd          v27.8H, v27.8H, v3.8H
+        sub             x0,  x0,  x2
+  .endif
+        st1             {v26.8H},     [x0], x2
+        st1             {v27.8H},     [x0], x2
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+        h264_qpel_h_lowpass_l2_10 put
+        h264_qpel_h_lowpass_l2_10 avg
+
+function put_h264_qpel16_v_lowpass_neon_packed_10
+        mov             x4,  x30
+        mov             x2,  #16
+        bl              put_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        bl              put_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #16
+        bl              put_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+        b               put_h264_qpel8_v_lowpass_neon_10
+endfunc
+
+.macro  h264_qpel_v_lowpass_10 type
+function \type\()_h264_qpel16_v_lowpass_neon_10
+        mov             x4,  x30
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x0,  x0,  x2, lsl #4
+        add             x0,  x0,  #16
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #16
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_neon_10
+        ld1             {v16.8H}, [x1], x3
+        ld1             {v18.8H}, [x1], x3
+        ld1             {v20.8H}, [x1], x3
+        ld1             {v22.8H}, [x1], x3
+        ld1             {v24.8H}, [x1], x3
+        ld1             {v26.8H}, [x1], x3
+        ld1             {v28.8H}, [x1], x3
+        ld1             {v30.8H}, [x1], x3
+        ld1             {v17.8H}, [x1], x3
+        ld1             {v19.8H}, [x1], x3
+        ld1             {v21.8H}, [x1], x3
+        ld1             {v23.8H}, [x1], x3
+        ld1             {v25.8H}, [x1]
+
+        transpose_8x8H  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
+        transpose_8x8H  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
+        lowpass_8_10    v16, v17, v18, v19, v16, v17
+        lowpass_8_10    v20, v21, v22, v23, v18, v19
+        lowpass_8_10    v24, v25, v26, v27, v20, v21
+        lowpass_8_10    v28, v29, v30, v31, v22, v23
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+
+  .ifc \type,avg
+        ld1             {v24.8H},  [x0], x2
+        urhadd          v16.8H, v16.8H, v24.8H
+        ld1             {v25.8H}, [x0], x2
+        urhadd          v17.8H, v17.8H, v25.8H
+        ld1             {v26.8H}, [x0], x2
+        urhadd          v18.8H, v18.8H, v26.8H
+        ld1             {v27.8H}, [x0], x2
+        urhadd          v19.8H, v19.8H, v27.8H
+        ld1             {v28.8H}, [x0], x2
+        urhadd          v20.8H, v20.8H, v28.8H
+        ld1             {v29.8H}, [x0], x2
+        urhadd          v21.8H, v21.8H, v29.8H
+        ld1             {v30.8H}, [x0], x2
+        urhadd          v22.8H, v22.8H, v30.8H
+        ld1             {v31.8H}, [x0], x2
+        urhadd          v23.8H, v23.8H, v31.8H
+        sub             x0,  x0,  x2,  lsl #3
+  .endif
+
+        st1             {v16.8H}, [x0], x2
+        st1             {v17.8H}, [x0], x2
+        st1             {v18.8H}, [x0], x2
+        st1             {v19.8H}, [x0], x2
+        st1             {v20.8H}, [x0], x2
+        st1             {v21.8H}, [x0], x2
+        st1             {v22.8H}, [x0], x2
+        st1             {v23.8H}, [x0], x2
+
+        ret
+endfunc
+.endm
+
+        h264_qpel_v_lowpass_10 put
+        h264_qpel_v_lowpass_10 avg
+
+.macro  h264_qpel_v_lowpass_l2_10 type
+function \type\()_h264_qpel16_v_lowpass_l2_neon_10
+        mov             x4,  x30
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        sub             x0,  x0,  x3, lsl #4
+        sub             x12, x12, x2, lsl #4
+        add             x0,  x0,  #16
+        add             x12, x12, #16
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #16
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        ld1             {v16.8H}, [x1], x3
+        ld1             {v18.8H}, [x1], x3
+        ld1             {v20.8H}, [x1], x3
+        ld1             {v22.8H}, [x1], x3
+        ld1             {v24.8H}, [x1], x3
+        ld1             {v26.8H}, [x1], x3
+        ld1             {v28.8H}, [x1], x3
+        ld1             {v30.8H}, [x1], x3
+        ld1             {v17.8H}, [x1], x3
+        ld1             {v19.8H}, [x1], x3
+        ld1             {v21.8H}, [x1], x3
+        ld1             {v23.8H}, [x1], x3
+        ld1             {v25.8H}, [x1]
+
+        transpose_8x8H  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
+        transpose_8x8H  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
+        lowpass_8_10    v16, v17, v18, v19, v16, v17
+        lowpass_8_10    v20, v21, v22, v23, v18, v19
+        lowpass_8_10    v24, v25, v26, v27, v20, v21
+        lowpass_8_10    v28, v29, v30, v31, v22, v23
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+
+        ld1             {v24.8H},  [x12], x2
+        ld1             {v25.8H},  [x12], x2
+        ld1             {v26.8H},  [x12], x2
+        ld1             {v27.8H},  [x12], x2
+        ld1             {v28.8H},  [x12], x2
+        urhadd          v16.8H, v24.8H, v16.8H
+        urhadd          v17.8H, v25.8H, v17.8H
+        ld1             {v29.8H},  [x12], x2
+        urhadd          v18.8H, v26.8H, v18.8H
+        urhadd          v19.8H, v27.8H, v19.8H
+        ld1             {v30.8H}, [x12], x2
+        urhadd          v20.8H, v28.8H, v20.8H
+        urhadd          v21.8H, v29.8H, v21.8H
+        ld1             {v31.8H}, [x12], x2
+        urhadd          v22.8H, v30.8H, v22.8H
+        urhadd          v23.8H, v31.8H, v23.8H
+
+  .ifc \type,avg
+        ld1             {v24.8H}, [x0], x3
+        urhadd          v16.8H, v16.8H, v24.8H
+        ld1             {v25.8H}, [x0], x3
+        urhadd          v17.8H, v17.8H, v25.8H
+        ld1             {v26.8H}, [x0], x3
+        urhadd          v18.8H, v18.8H, v26.8H
+        ld1             {v27.8H}, [x0], x3
+        urhadd          v19.8H, v19.8H, v27.8H
+        ld1             {v28.8H}, [x0], x3
+        urhadd          v20.8H, v20.8H, v28.8H
+        ld1             {v29.8H}, [x0], x3
+        urhadd          v21.8H, v21.8H, v29.8H
+        ld1             {v30.8H}, [x0], x3
+        urhadd          v22.8H, v22.8H, v30.8H
+        ld1             {v31.8H}, [x0], x3
+        urhadd          v23.8H, v23.8H, v31.8H
+        sub             x0,  x0,  x3,  lsl #3
+  .endif
+
+        st1             {v16.8H}, [x0], x3
+        st1             {v17.8H}, [x0], x3
+        st1             {v18.8H}, [x0], x3
+        st1             {v19.8H}, [x0], x3
+        st1             {v20.8H}, [x0], x3
+        st1             {v21.8H}, [x0], x3
+        st1             {v22.8H}, [x0], x3
+        st1             {v23.8H}, [x0], x3
+
+        ret
+endfunc
+.endm
+
+        h264_qpel_v_lowpass_l2_10 put
+        h264_qpel_v_lowpass_l2_10 avg
+
+.macro  h264_qpel8_10   type
+function ff_\type\()_h264_qpel8_mc10_neon_10, export=1
+        lowpass_const   w3
+        mov             x3,  x1
+        sub             x1,  x1,  #4
+        mov             x12, #16
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc20_neon_10, export=1
+        lowpass_const   w3
+        sub             x1,  x1,  #4
+        mov             x3,  x2
+        mov             x12, #16
+        b               \type\()_h264_qpel8_h_lowpass_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc30_neon_10, export=1
+        lowpass_const   w3
+        add             x3,  x1,  #2
+        sub             x1,  x1,  #4
+        mov             x12, #16
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc01_neon_10, export=1
+        mov             x14, x30
+        mov             x12, x1
+\type\()_h264_qpel8_mc01_10:
+        lowpass_const   w3
+        mov             x3,  x2
+        sub             x1,  x1,  x2, lsl #1
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc11_neon_10, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+\type\()_h264_qpel8_mc11_10:
+        lowpass_const   w3
+        mov             x11, sp
+        sub             sp,  sp,  #128
+        mov             x0,  sp
+        sub             x1,  x1,  #4
+        mov             x3,  #16
+        mov             x12, #16
+        bl              put_h264_qpel8_h_lowpass_neon_10
+        mov             x0,  x8
+        mov             x3,  x2
+        mov             x12, sp
+        sub             x1,  x9,  x2, lsl #1
+        mov             x2,  #16
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc31_neon_10, export=1
+        add             x1,  x1,  #2
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        sub             x1,  x1,  #2
+        b               \type\()_h264_qpel8_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc02_neon_10, export=1
+        mov             x14, x30
+        lowpass_const   w3
+        sub             x1,  x1,  x2, lsl #1
+        mov             x3,  x2
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc03_neon_10, export=1
+        mov             x14, x30
+        add             x12, x1,  x2
+        b               \type\()_h264_qpel8_mc01_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc13_neon_10, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        b               \type\()_h264_qpel8_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc33_neon_10, export=1
+        add             x1,  x1,  #2
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        sub             x1,  x1,  #2
+        b               \type\()_h264_qpel8_mc11_10
+endfunc
+.endm
+
+        h264_qpel8_10 put
+        h264_qpel8_10 avg
+
+.macro  h264_qpel16_10     type
+function ff_\type\()_h264_qpel16_mc10_neon_10, export=1
+        lowpass_const   w3
+        mov             x3,  x1
+        sub             x1,  x1,  #4
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc20_neon_10, export=1
+        lowpass_const   w3
+        sub             x1,  x1,  #4
+        mov             x3,  x2
+        b               \type\()_h264_qpel16_h_lowpass_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc30_neon_10, export=1
+        lowpass_const   w3
+        add             x3,  x1,  #2
+        sub             x1,  x1,  #4
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc01_neon_10, export=1
+        mov             x14, x30
+        mov             x12, x1
+\type\()_h264_qpel16_mc01_10:
+        lowpass_const   w3
+        mov             x3,  x2
+        sub             x1,  x1,  x2, lsl #1
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon_10
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc11_neon_10, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+\type\()_h264_qpel16_mc11_10:
+        lowpass_const   w3
+        mov             x11, sp
+        sub             sp,  sp,  #512
+        mov             x0,  sp
+        sub             x1,  x1,  #4
+        mov             x3,  #32
+        bl              put_h264_qpel16_h_lowpass_neon_10
+        mov             x0,  x8
+        mov             x3,  x2
+        mov             x12, sp
+        sub             x1,  x9,  x2, lsl #1
+        mov             x2,  #32
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon_10
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc31_neon_10, export=1
+        add             x1,  x1,  #2
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        sub             x1,  x1,  #2
+        b               \type\()_h264_qpel16_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc02_neon_10, export=1
+        mov             x14, x30
+        lowpass_const   w3
+        sub             x1,  x1,  x2, lsl #1
+        mov             x3,  x2
+        bl              \type\()_h264_qpel16_v_lowpass_neon_10
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc03_neon_10, export=1
+        mov             x14, x30
+        add             x12, x1,  x2
+        b               \type\()_h264_qpel16_mc01_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc13_neon_10, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        b               \type\()_h264_qpel16_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc33_neon_10, export=1
+        add             x1,  x1,  #2
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        sub             x1,  x1,  #2
+        b               \type\()_h264_qpel16_mc11_10
+endfunc
+.endm
+
+        h264_qpel16_10 put
+        h264_qpel16_10 avg
-- 
2.32.0


