[FFmpeg-devel] [PATCH] lavc/aarch64: h264qpel, add lowpass_8 based functions

Mikhail Nitenko mnitenko at gmail.com
Mon Dec 4 12:00:35 EET 2023


Benchmarks                         A53      A55     A72     A76
avg_h264_qpel_8_mc01_10_c:        936.5    924.0   656.0   504.7
avg_h264_qpel_8_mc01_10_neon:     234.7    202.0   120.7    63.2
avg_h264_qpel_8_mc02_10_c:        921.0    920.0   669.2   493.7
avg_h264_qpel_8_mc02_10_neon:     202.0    173.2   102.7    58.5
avg_h264_qpel_8_mc03_10_c:        936.5    924.0   656.0   509.5
avg_h264_qpel_8_mc03_10_neon:     236.2    203.7   120.0    63.2
avg_h264_qpel_8_mc10_10_c:       1441.0   1437.7   806.7   478.5
avg_h264_qpel_8_mc10_10_neon:     325.7    324.0   153.7    94.2
avg_h264_qpel_8_mc11_10_c:       2160.7   2148.2  1366.7   906.7
avg_h264_qpel_8_mc11_10_neon:     492.0    464.0   242.5   134.5
avg_h264_qpel_8_mc13_10_c:       2157.0   2138.2  1357.0   908.2
avg_h264_qpel_8_mc13_10_neon:     494.0    467.2   242.0   140.0
avg_h264_qpel_8_mc20_10_c:       1433.5   1410.0   785.2   486.0
avg_h264_qpel_8_mc20_10_neon:     293.7    289.7   138.0    91.5
avg_h264_qpel_8_mc30_10_c:       1458.5   1461.7   813.7   483.2
avg_h264_qpel_8_mc30_10_neon:     341.7    339.2   154.0    95.2
avg_h264_qpel_8_mc31_10_c:       2194.7   2197.2  1358.7   928.0
avg_h264_qpel_8_mc31_10_neon:     520.0    495.0   245.5   142.5
avg_h264_qpel_8_mc33_10_c:       2188.0   2205.5  1356.7   910.7
avg_h264_qpel_8_mc33_10_neon:     521.0    494.5   245.7   145.7
avg_h264_qpel_16_mc01_10_c:      3717.2   3595.0  2610.0  2012.0
avg_h264_qpel_16_mc01_10_neon:    920.5    791.5   483.2   240.5
avg_h264_qpel_16_mc02_10_c:      3684.0   3633.0  2659.0  1919.7
avg_h264_qpel_16_mc02_10_neon:    790.7    678.2   409.2   217.0
avg_h264_qpel_16_mc03_10_c:      3726.5   3596.0  2606.7  2010.0
avg_h264_qpel_16_mc03_10_neon:    922.0    792.5   483.2   239.7
avg_h264_qpel_16_mc10_10_c:      5912.0   5803.2  3241.5  1916.7
avg_h264_qpel_16_mc10_10_neon:   1267.5   1277.2   616.5   365.0
avg_h264_qpel_16_mc11_10_c:      8599.2   8482.5  5338.0  3616.2
avg_h264_qpel_16_mc11_10_neon:   1913.0   1827.0   956.2   542.2
avg_h264_qpel_16_mc13_10_c:      8643.7   8488.5  5388.0  3628.5
avg_h264_qpel_16_mc13_10_neon:   1914.7   1828.7   969.2   530.5
avg_h264_qpel_16_mc20_10_c:      5719.5   5641.0  3147.0  1946.2
avg_h264_qpel_16_mc20_10_neon:   1139.5   1150.0   539.5   344.0
avg_h264_qpel_16_mc30_10_c:      5930.0   5872.5  3267.5  1918.0
avg_h264_qpel_16_mc30_10_neon:   1331.5   1341.2   616.5   369.5
avg_h264_qpel_16_mc31_10_c:      8758.7   8697.7  5353.0  3630.7
avg_h264_qpel_16_mc31_10_neon:   2018.7   1941.7   982.2   574.7
avg_h264_qpel_16_mc33_10_c:      8683.2   8675.2  5339.2  3634.7
avg_h264_qpel_16_mc33_10_neon:   2019.7   1940.2   994.5   566.0
put_h264_qpel_8_mc01_10_c:        854.2    843.0   599.2   478.0
put_h264_qpel_8_mc01_10_neon:     192.7    168.0   101.7    56.7
put_h264_qpel_8_mc02_10_c:        766.5    760.0   550.2   441.0
put_h264_qpel_8_mc02_10_neon:     160.0    139.2    88.7    53.0
put_h264_qpel_8_mc03_10_c:        854.2    843.0   599.2   479.0
put_h264_qpel_8_mc03_10_neon:     194.2    169.7   102.0    56.2
put_h264_qpel_8_mc10_10_c:       1352.7   1353.7   749.7   446.7
put_h264_qpel_8_mc10_10_neon:     289.7    294.2   135.5    88.5
put_h264_qpel_8_mc11_10_c:       2080.0   2066.2  1309.5   876.7
put_h264_qpel_8_mc11_10_neon:     450.0    429.7   229.7   131.2
put_h264_qpel_8_mc13_10_c:       2074.7   2060.2  1294.5   870.5
put_h264_qpel_8_mc13_10_neon:     452.5    434.5   226.5   130.0
put_h264_qpel_8_mc20_10_c:       1221.5   1216.0   684.5   399.7
put_h264_qpel_8_mc20_10_neon:     257.7    262.5   121.2    78.7
put_h264_qpel_8_mc30_10_c:       1379.0   1374.7   757.2   449.5
put_h264_qpel_8_mc30_10_neon:     305.7    310.2   135.5    86.5
put_h264_qpel_8_mc31_10_c:       2109.2   2119.7  1299.5   878.0
put_h264_qpel_8_mc31_10_neon:     478.0    458.5   226.0   137.2
put_h264_qpel_8_mc33_10_c:       2101.5   2115.2  1306.5   887.0
put_h264_qpel_8_mc33_10_neon:     479.0    458.7   229.7   141.7
put_h264_qpel_16_mc01_10_c:      3485.7   3396.7  2460.5  1914.5
put_h264_qpel_16_mc01_10_neon:    752.5    665.5   397.0   213.2
put_h264_qpel_16_mc02_10_c:      3103.5   3023.2  2154.7  1720.7
put_h264_qpel_16_mc02_10_neon:    622.7    551.2   347.7   196.2
put_h264_qpel_16_mc03_10_c:      3486.2   3394.0  2436.5  1917.7
put_h264_qpel_16_mc03_10_neon:    754.0    666.5   397.0   215.7
put_h264_qpel_16_mc10_10_c:      5533.0   5488.5  2989.0  1783.0
put_h264_qpel_16_mc10_10_neon:   1123.5   1165.2   535.2   334.7
put_h264_qpel_16_mc11_10_c:      8437.7   8281.2  5209.0  3510.7
put_h264_qpel_16_mc11_10_neon:   1745.0   1697.0   878.5   513.5
put_h264_qpel_16_mc13_10_c:      8567.7   8468.0  5221.5  3528.0
put_h264_qpel_16_mc13_10_neon:   1751.7   1698.2   889.2   507.0
put_h264_qpel_16_mc20_10_c:      4907.5   4885.0  2786.2  1607.5
put_h264_qpel_16_mc20_10_neon:    995.5   1034.5   475.5   307.0
put_h264_qpel_16_mc30_10_c:      5579.7   5537.7  3045.2  1789.5
put_h264_qpel_16_mc30_10_neon:   1187.5   1231.2   532.5   334.5
put_h264_qpel_16_mc31_10_c:      8677.2   8672.5  5204.2  3516.0
put_h264_qpel_16_mc31_10_neon:   1850.7   1813.2   893.0   545.2
put_h264_qpel_16_mc33_10_c:      8688.7   8671.2  5223.2  3512.0
put_h264_qpel_16_mc33_10_neon:   1851.7   1814.2   908.5   535.2

Signed-off-by: Mikhail Nitenko <mnitenko at gmail.com>
---

I remodeled the patch (as Martin once suggested): it no longer
widens to 32 bits in lowpass_8_10, and it also uses the much
faster lowpass_8_10_v.

 libavcodec/aarch64/h264qpel_init_aarch64.c |  91 +++-
 libavcodec/aarch64/h264qpel_neon.S         | 532 +++++++++++++++++++++
 2 files changed, 621 insertions(+), 2 deletions(-)

diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c
index 77f41d9a21..93fa5246c4 100644
--- a/libavcodec/aarch64/h264qpel_init_aarch64.c
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
@@ -95,12 +95,55 @@ void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t str
 void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
 
+void ff_put_h264_qpel16_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_put_h264_qpel8_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_avg_h264_qpel16_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_avg_h264_qpel8_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
 av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
 {
-    const int high_bit_depth = bit_depth > 8;
     int cpu_flags = av_get_cpu_flags();
 
-    if (have_neon(cpu_flags) && !high_bit_depth) {
+    if (have_neon(cpu_flags) && bit_depth <= 8) {
         c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
         c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
         c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
@@ -168,5 +211,49 @@ av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
         c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
         c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
         c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
+    } else if (have_neon(cpu_flags) && bit_depth == 10) {
+        c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon_10;
+        c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon_10;
+        c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon_10;
+        c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon_10;
+        c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon_10;
+
+        c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon_10;
+        c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon_10;
+        c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon_10;
+        c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon_10;
+        c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon_10;
+
+        c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon_10;
+        c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon_10;
+
+        c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon_10;
+        c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon_10;
     }
 }
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index f4475d96f9..31130a57fd 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -933,3 +933,535 @@ endfunc
 
         h264_qpel16     put
         h264_qpel16     avg
+
+//trashes v0-v5
+.macro  lowpass_8_10    r0,  r1,  r2,  r3,  d0,  d1
+        ext             v2.16B,     \r0\().16B,  \r1\().16B, #4
+        ext             v3.16B,     \r0\().16B,  \r1\().16B, #6
+        add             v2.8H,      v2.8H,       v3.8H
+        ext             v4.16B,     \r0\().16B,  \r1\().16B, #2
+        ext             v5.16B,     \r0\().16B,  \r1\().16B, #8
+        add             v4.8H,      v4.8H,       v5.8H
+        ext             v1.16B,     \r0\().16B,  \r1\().16B, #10
+
+        add             \d0\().8H,  \r0\().8H,   v1.8H
+        ext             v0.16B,     \r2\().16B,  \r3\().16B, #4
+        mla             \d0\().8H,  v2.8H,       v6.H[1]
+        ext             v1.16B,     \r2\().16B,  \r3\().16B, #6
+        add             v0.8H,      v0.8H,       v1.8H
+        ext             v1.16B,     \r2\().16B,  \r3\().16B, #2
+        mul             v5.8H,      v4.8H,       v6.H[0]
+        uqsub           \d0\().8H,  \d0\().8H,   v5.8H
+        urshr           \d0\().8H,  \d0\().8H,   #5
+
+        ext             v3.16B,     \r2\().16B,  \r3\().16B, #8
+        add             v1.8H,      v1.8H,       v3.8H
+        ext             v2.16B,     \r2\().16B,  \r3\().16B, #10
+
+        add             \d1\().8H,  \r2\().8H,   v2.8H
+        mla             \d1\().8H,  v0.8H,       v6.H[1]
+        mul             v5.8H,      v1.8H,       v6.H[0]
+        uqsub           \d1\().8H,  \d1\().8H,   v5.8H
+        mvni            v5.8h,      #0xFC,       lsl #8 // 1023 for clipping
+        urshr           \d1\().8H,  \d1\().8H,   #5
+
+        umin            \d0\().8H,  \d0\().8H,   v5.8h
+        umin            \d1\().8H,  \d1\().8H,   v5.8h
+.endm
+
+//trashes v0-v4
+.macro lowpass_8_10_v   r0,  r1,  r2,  r3,  r4,  r5,  r6,  d0,  d1
+        add             v2.8H,      \r2\().8H,   \r3\().8H
+        add             v0.8H,      \r3\().8H,   \r4\().8H
+        add             v4.8H,      \r1\().8H,   \r4\().8H
+        add             v1.8H,      \r2\().8H,   \r5\().8H
+
+        add             \d0\().8H,  \r0\().8H,   \r5\().8H
+        add             \d1\().8H,  \r1\().8H,   \r6\().8H
+        mla             \d0\().8H,  v2.8H,       v6.H[1]
+        mla             \d1\().8H,  v0.8H,       v6.H[1]
+        mul             v2.8H,      v4.8H,       v6.H[0]
+        mul             v0.8H,      v1.8H,       v6.H[0]
+        uqsub           \d0\().8H,  \d0\().8H,   v2.8H
+        uqsub           \d1\().8H,  \d1\().8H,   v0.8H
+
+        mvni            v0.8H,      #0xFC,       lsl #8 // 1023 for clipping
+
+        urshr           \d0\().8H,  \d0\().8H,   #5
+        urshr           \d1\().8H,  \d1\().8H,   #5
+
+        umin            \d0\().8H,  \d0\().8H,   v0.8H
+        umin            \d1\().8H,  \d1\().8H,   v0.8H
+.endm
+
+function put_h264_qpel16_h_lowpass_neon_packed_10
+        mov             x4,  x30
+        mov             x12, #32
+        mov             x3,  #16
+        bl              put_h264_qpel8_h_lowpass_neon_10
+        sub             x1,  x1,  x2, lsl #4
+        add             x1,  x1,  #16
+        mov             x12, #32
+        mov             x30, x4
+        b               put_h264_qpel8_h_lowpass_neon_10
+endfunc
+
+.macro  h264_qpel_h_lowpass_10 type
+function \type\()_h264_qpel16_h_lowpass_neon_10
+        mov             x13, x30
+        mov             x12, #32
+        bl              \type\()_h264_qpel8_h_lowpass_neon_10
+        sub             x0,  x0,  x3, lsl #4
+        sub             x1,  x1,  x2, lsl #4
+        add             x0,  x0,  #16
+        add             x1,  x1,  #16
+        mov             x12, #32
+        mov             x30, x13
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_neon_10
+1:      ld1             {v28.8H, v29.8H}, [x1], x2
+        ld1             {v16.8H, v17.8H}, [x1], x2
+        subs            x12, x12, #4
+        lowpass_8_10    v28, v29, v16, v17, v28, v20
+  .ifc \type,avg
+        ld1             {v2.8H},    [x0], x3
+        ld1             {v3.8H},    [x0]
+        urhadd          v28.8H, v28.8H, v2.8H
+        urhadd          v20.8H, v20.8H, v3.8H
+        sub             x0,  x0,  x3
+  .endif
+        st1             {v28.8H},    [x0], x3
+        st1             {v20.8H},    [x0], x3
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+        h264_qpel_h_lowpass_10 put
+        h264_qpel_h_lowpass_10 avg
+
+.macro h264_qpel_h_lowpass_l2_10 type
+function \type\()_h264_qpel16_h_lowpass_l2_neon_10
+        mov             x13, x30
+        mov             x12, #32
+        bl              \type\()_h264_qpel8_h_lowpass_l2_neon_10
+        sub             x0,  x0,  x2, lsl #4
+        sub             x1,  x1,  x2, lsl #4
+        sub             x3,  x3,  x2, lsl #4
+        add             x0,  x0,  #16
+        add             x1,  x1,  #16
+        add             x3,  x3,  #16
+        mov             x12, #32
+        mov             x30, x13
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_l2_neon_10
+1:      ld1             {v26.8H, v27.8H}, [x1], x2
+        ld1             {v16.8H, v17.8H}, [x1], x2
+        ld1             {v28.8H},     [x3], x2
+        ld1             {v29.8H},     [x3], x2
+        subs            x12, x12, #4
+        lowpass_8_10    v26, v27, v16, v17, v26, v27
+        urhadd          v26.8H, v26.8H, v28.8H
+        urhadd          v27.8H, v27.8H, v29.8H
+  .ifc \type,avg
+        ld1             {v2.8H},      [x0], x2
+        ld1             {v3.8H},      [x0]
+        urhadd          v26.8H, v26.8H, v2.8H
+        urhadd          v27.8H, v27.8H, v3.8H
+        sub             x0,  x0,  x2
+  .endif
+        st1             {v26.8H},     [x0], x2
+        st1             {v27.8H},     [x0], x2
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+        h264_qpel_h_lowpass_l2_10 put
+        h264_qpel_h_lowpass_l2_10 avg
+
+function put_h264_qpel16_v_lowpass_neon_packed_10
+        mov             x4,  x30
+        mov             x2,  #16
+        bl              put_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        bl              put_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #16
+        bl              put_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+        b               put_h264_qpel8_v_lowpass_neon_10
+endfunc
+
+.macro  h264_qpel_v_lowpass_10 type
+function \type\()_h264_qpel16_v_lowpass_neon_10
+        mov             x4,  x30
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x0,  x0,  x2, lsl #4
+        add             x0,  x0,  #16
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #16
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_neon_10
+        ld1             {v16.8H}, [x1], x3
+        ld1             {v17.8H}, [x1], x3
+        ld1             {v18.8H}, [x1], x3
+        ld1             {v19.8H}, [x1], x3
+        ld1             {v20.8H}, [x1], x3
+        ld1             {v21.8H}, [x1], x3
+        ld1             {v22.8H}, [x1], x3
+        ld1             {v23.8H}, [x1], x3
+        ld1             {v24.8H}, [x1], x3
+        ld1             {v25.8H}, [x1], x3
+        ld1             {v26.8H}, [x1], x3
+        ld1             {v27.8H}, [x1], x3
+        ld1             {v28.8H}, [x1]
+
+        lowpass_8_10_v  v16, v17, v18, v19, v20, v21, v22, v16, v17
+        lowpass_8_10_v  v18, v19, v20, v21, v22, v23, v24, v18, v19
+        lowpass_8_10_v  v20, v21, v22, v23, v24, v25, v26, v20, v21
+        lowpass_8_10_v  v22, v23, v24, v25, v26, v27, v28, v22, v23
+
+  .ifc \type,avg
+        ld1             {v24.8H},  [x0], x2
+        ld1             {v25.8H}, [x0], x2
+        ld1             {v26.8H}, [x0], x2
+        urhadd          v16.8H, v16.8H, v24.8H
+        ld1             {v27.8H}, [x0], x2
+        urhadd          v17.8H, v17.8H, v25.8H
+        ld1             {v28.8H}, [x0], x2
+        urhadd          v18.8H, v18.8H, v26.8H
+        ld1             {v29.8H}, [x0], x2
+        urhadd          v19.8H, v19.8H, v27.8H
+        ld1             {v30.8H}, [x0], x2
+        urhadd          v20.8H, v20.8H, v28.8H
+        ld1             {v31.8H}, [x0], x2
+        urhadd          v21.8H, v21.8H, v29.8H
+        urhadd          v22.8H, v22.8H, v30.8H
+        urhadd          v23.8H, v23.8H, v31.8H
+        sub             x0,  x0,  x2,  lsl #3
+  .endif
+
+        st1             {v16.8H}, [x0], x2
+        st1             {v17.8H}, [x0], x2
+        st1             {v18.8H}, [x0], x2
+        st1             {v19.8H}, [x0], x2
+        st1             {v20.8H}, [x0], x2
+        st1             {v21.8H}, [x0], x2
+        st1             {v22.8H}, [x0], x2
+        st1             {v23.8H}, [x0], x2
+
+        ret
+endfunc
+.endm
+
+        h264_qpel_v_lowpass_10 put
+        h264_qpel_v_lowpass_10 avg
+
+.macro  h264_qpel_v_lowpass_l2_10 type
+function \type\()_h264_qpel16_v_lowpass_l2_neon_10
+        mov             x4,  x30
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        sub             x0,  x0,  x3, lsl #4
+        sub             x12, x12, x2, lsl #4
+        add             x0,  x0,  #16
+        add             x12, x12, #16
+        sub             x1,  x1,  x3, lsl #4
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #16
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        ld1             {v16.8H}, [x1], x3
+        ld1             {v17.8H}, [x1], x3
+        ld1             {v18.8H}, [x1], x3
+        ld1             {v19.8H}, [x1], x3
+        ld1             {v20.8H}, [x1], x3
+        ld1             {v21.8H}, [x1], x3
+        ld1             {v22.8H}, [x1], x3
+        ld1             {v23.8H}, [x1], x3
+        ld1             {v24.8H}, [x1], x3
+        ld1             {v25.8H}, [x1], x3
+        ld1             {v26.8H}, [x1], x3
+        ld1             {v27.8H}, [x1], x3
+        ld1             {v28.8H}, [x1]
+
+        lowpass_8_10_v  v16, v17, v18, v19, v20, v21, v22, v16, v17
+        lowpass_8_10_v  v18, v19, v20, v21, v22, v23, v24, v18, v19
+        lowpass_8_10_v  v20, v21, v22, v23, v24, v25, v26, v20, v21
+        lowpass_8_10_v  v22, v23, v24, v25, v26, v27, v28, v22, v23
+
+        ld1             {v24.8H},  [x12], x2
+        ld1             {v25.8H},  [x12], x2
+        ld1             {v26.8H},  [x12], x2
+        ld1             {v27.8H},  [x12], x2
+        ld1             {v28.8H},  [x12], x2
+        urhadd          v16.8H, v24.8H, v16.8H
+        urhadd          v17.8H, v25.8H, v17.8H
+        ld1             {v29.8H},  [x12], x2
+        urhadd          v18.8H, v26.8H, v18.8H
+        urhadd          v19.8H, v27.8H, v19.8H
+        ld1             {v30.8H}, [x12], x2
+        urhadd          v20.8H, v28.8H, v20.8H
+        urhadd          v21.8H, v29.8H, v21.8H
+        ld1             {v31.8H}, [x12], x2
+        urhadd          v22.8H, v30.8H, v22.8H
+        urhadd          v23.8H, v31.8H, v23.8H
+
+  .ifc \type,avg
+        ld1             {v24.8H}, [x0], x3
+        ld1             {v25.8H}, [x0], x3
+        ld1             {v26.8H}, [x0], x3
+        urhadd          v16.8H, v16.8H, v24.8H
+        ld1             {v27.8H}, [x0], x3
+        urhadd          v17.8H, v17.8H, v25.8H
+        ld1             {v28.8H}, [x0], x3
+        urhadd          v18.8H, v18.8H, v26.8H
+        ld1             {v29.8H}, [x0], x3
+        urhadd          v19.8H, v19.8H, v27.8H
+        ld1             {v30.8H}, [x0], x3
+        urhadd          v20.8H, v20.8H, v28.8H
+        ld1             {v31.8H}, [x0], x3
+        urhadd          v21.8H, v21.8H, v29.8H
+        urhadd          v22.8H, v22.8H, v30.8H
+        urhadd          v23.8H, v23.8H, v31.8H
+        sub             x0,  x0,  x3,  lsl #3
+  .endif
+
+        st1             {v16.8H}, [x0], x3
+        st1             {v17.8H}, [x0], x3
+        st1             {v18.8H}, [x0], x3
+        st1             {v19.8H}, [x0], x3
+        st1             {v20.8H}, [x0], x3
+        st1             {v21.8H}, [x0], x3
+        st1             {v22.8H}, [x0], x3
+        st1             {v23.8H}, [x0], x3
+
+        ret
+endfunc
+.endm
+
+        h264_qpel_v_lowpass_l2_10 put
+        h264_qpel_v_lowpass_l2_10 avg
+
+.macro  h264_qpel8_10   type
+function ff_\type\()_h264_qpel8_mc10_neon_10, export=1
+        lowpass_const   w3
+        mov             x3,  x1
+        sub             x1,  x1,  #4
+        mov             x12, #16
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc20_neon_10, export=1
+        lowpass_const   w3
+        sub             x1,  x1,  #4
+        mov             x3,  x2
+        mov             x12, #16
+        b               \type\()_h264_qpel8_h_lowpass_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc30_neon_10, export=1
+        lowpass_const   w3
+        add             x3,  x1,  #2
+        sub             x1,  x1,  #4
+        mov             x12, #16
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc01_neon_10, export=1
+        mov             x14, x30
+        mov             x12, x1
+\type\()_h264_qpel8_mc01_10:
+        lowpass_const   w3
+        mov             x3,  x2
+        sub             x1,  x1,  x2, lsl #1
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc11_neon_10, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+\type\()_h264_qpel8_mc11_10:
+        lowpass_const   w3
+        mov             x11, sp
+        sub             sp,  sp,  #128
+        mov             x0,  sp
+        sub             x1,  x1,  #4
+        mov             x3,  #16
+        mov             x12, #16
+        bl              put_h264_qpel8_h_lowpass_neon_10
+        mov             x0,  x8
+        mov             x3,  x2
+        mov             x12, sp
+        sub             x1,  x9,  x2, lsl #1
+        mov             x2,  #16
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc31_neon_10, export=1
+        add             x1,  x1,  #2
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        sub             x1,  x1,  #2
+        b               \type\()_h264_qpel8_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc02_neon_10, export=1
+        mov             x14, x30
+        lowpass_const   w3
+        sub             x1,  x1,  x2, lsl #1
+        mov             x3,  x2
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel8_mc03_neon_10, export=1
+        mov             x14, x30
+        add             x12, x1,  x2
+        b               \type\()_h264_qpel8_mc01_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc13_neon_10, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        b               \type\()_h264_qpel8_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel8_mc33_neon_10, export=1
+        add             x1,  x1,  #2
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        sub             x1,  x1,  #2
+        b               \type\()_h264_qpel8_mc11_10
+endfunc
+.endm
+
+        h264_qpel8_10 put
+        h264_qpel8_10 avg
+
+.macro  h264_qpel16_10     type
+function ff_\type\()_h264_qpel16_mc10_neon_10, export=1
+        lowpass_const   w3
+        mov             x3,  x1
+        sub             x1,  x1,  #4
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc20_neon_10, export=1
+        lowpass_const   w3
+        sub             x1,  x1,  #4
+        mov             x3,  x2
+        b               \type\()_h264_qpel16_h_lowpass_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc30_neon_10, export=1
+        lowpass_const   w3
+        add             x3,  x1,  #2
+        sub             x1,  x1,  #4
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc01_neon_10, export=1
+        mov             x14, x30
+        mov             x12, x1
+\type\()_h264_qpel16_mc01_10:
+        lowpass_const   w3
+        mov             x3,  x2
+        sub             x1,  x1,  x2, lsl #1
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon_10
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc11_neon_10, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+\type\()_h264_qpel16_mc11_10:
+        lowpass_const   w3
+        mov             x11, sp
+        sub             sp,  sp,  #512
+        mov             x0,  sp
+        sub             x1,  x1,  #4
+        mov             x3,  #32
+        bl              put_h264_qpel16_h_lowpass_neon_10
+        mov             x0,  x8
+        mov             x3,  x2
+        mov             x12, sp
+        sub             x1,  x9,  x2, lsl #1
+        mov             x2,  #32
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon_10
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc31_neon_10, export=1
+        add             x1,  x1,  #2
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        sub             x1,  x1,  #2
+        b               \type\()_h264_qpel16_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc02_neon_10, export=1
+        mov             x14, x30
+        lowpass_const   w3
+        sub             x1,  x1,  x2, lsl #1
+        mov             x3,  x2
+        bl              \type\()_h264_qpel16_v_lowpass_neon_10
+        ret             x14
+endfunc
+
+function ff_\type\()_h264_qpel16_mc03_neon_10, export=1
+        mov             x14, x30
+        add             x12, x1,  x2
+        b               \type\()_h264_qpel16_mc01_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc13_neon_10, export=1
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        b               \type\()_h264_qpel16_mc11_10
+endfunc
+
+function ff_\type\()_h264_qpel16_mc33_neon_10, export=1
+        add             x1,  x1,  #2
+        mov             x14, x30
+        mov             x8,  x0
+        mov             x9,  x1
+        add             x1,  x1,  x2
+        sub             x1,  x1,  #2
+        b               \type\()_h264_qpel16_mc11_10
+endfunc
+.endm
+
+        h264_qpel16_10 put
+        h264_qpel16_10 avg
-- 
2.34.1



More information about the ffmpeg-devel mailing list