[FFmpeg-devel] [PATCH] lavc/aarch64: h264qpel, add lowpass_8 based functions
Mikhail Nitenko
mnitenko at gmail.com
Mon Dec 4 12:00:35 EET 2023
Benchmarks A53 A55 A72 A76
avg_h264_qpel_8_mc01_10_c: 936.5 924.0 656.0 504.7
avg_h264_qpel_8_mc01_10_neon: 234.7 202.0 120.7 63.2
avg_h264_qpel_8_mc02_10_c: 921.0 920.0 669.2 493.7
avg_h264_qpel_8_mc02_10_neon: 202.0 173.2 102.7 58.5
avg_h264_qpel_8_mc03_10_c: 936.5 924.0 656.0 509.5
avg_h264_qpel_8_mc03_10_neon: 236.2 203.7 120.0 63.2
avg_h264_qpel_8_mc10_10_c: 1441.0 1437.7 806.7 478.5
avg_h264_qpel_8_mc10_10_neon: 325.7 324.0 153.7 94.2
avg_h264_qpel_8_mc11_10_c: 2160.7 2148.2 1366.7 906.7
avg_h264_qpel_8_mc11_10_neon: 492.0 464.0 242.5 134.5
avg_h264_qpel_8_mc13_10_c: 2157.0 2138.2 1357.0 908.2
avg_h264_qpel_8_mc13_10_neon: 494.0 467.2 242.0 140.0
avg_h264_qpel_8_mc20_10_c: 1433.5 1410.0 785.2 486.0
avg_h264_qpel_8_mc20_10_neon: 293.7 289.7 138.0 91.5
avg_h264_qpel_8_mc30_10_c: 1458.5 1461.7 813.7 483.2
avg_h264_qpel_8_mc30_10_neon: 341.7 339.2 154.0 95.2
avg_h264_qpel_8_mc31_10_c: 2194.7 2197.2 1358.7 928.0
avg_h264_qpel_8_mc31_10_neon: 520.0 495.0 245.5 142.5
avg_h264_qpel_8_mc33_10_c: 2188.0 2205.5 1356.7 910.7
avg_h264_qpel_8_mc33_10_neon: 521.0 494.5 245.7 145.7
avg_h264_qpel_16_mc01_10_c: 3717.2 3595.0 2610.0 2012.0
avg_h264_qpel_16_mc01_10_neon: 920.5 791.5 483.2 240.5
avg_h264_qpel_16_mc02_10_c: 3684.0 3633.0 2659.0 1919.7
avg_h264_qpel_16_mc02_10_neon: 790.7 678.2 409.2 217.0
avg_h264_qpel_16_mc03_10_c: 3726.5 3596.0 2606.7 2010.0
avg_h264_qpel_16_mc03_10_neon: 922.0 792.5 483.2 239.7
avg_h264_qpel_16_mc10_10_c: 5912.0 5803.2 3241.5 1916.7
avg_h264_qpel_16_mc10_10_neon: 1267.5 1277.2 616.5 365.0
avg_h264_qpel_16_mc11_10_c: 8599.2 8482.5 5338.0 3616.2
avg_h264_qpel_16_mc11_10_neon: 1913.0 1827.0 956.2 542.2
avg_h264_qpel_16_mc13_10_c: 8643.7 8488.5 5388.0 3628.5
avg_h264_qpel_16_mc13_10_neon: 1914.7 1828.7 969.2 530.5
avg_h264_qpel_16_mc20_10_c: 5719.5 5641.0 3147.0 1946.2
avg_h264_qpel_16_mc20_10_neon: 1139.5 1150.0 539.5 344.0
avg_h264_qpel_16_mc30_10_c: 5930.0 5872.5 3267.5 1918.0
avg_h264_qpel_16_mc30_10_neon: 1331.5 1341.2 616.5 369.5
avg_h264_qpel_16_mc31_10_c: 8758.7 8697.7 5353.0 3630.7
avg_h264_qpel_16_mc31_10_neon: 2018.7 1941.7 982.2 574.7
avg_h264_qpel_16_mc33_10_c: 8683.2 8675.2 5339.2 3634.7
avg_h264_qpel_16_mc33_10_neon: 2019.7 1940.2 994.5 566.0
put_h264_qpel_8_mc01_10_c: 854.2 843.0 599.2 478.0
put_h264_qpel_8_mc01_10_neon: 192.7 168.0 101.7 56.7
put_h264_qpel_8_mc02_10_c: 766.5 760.0 550.2 441.0
put_h264_qpel_8_mc02_10_neon: 160.0 139.2 88.7 53.0
put_h264_qpel_8_mc03_10_c: 854.2 843.0 599.2 479.0
put_h264_qpel_8_mc03_10_neon: 194.2 169.7 102.0 56.2
put_h264_qpel_8_mc10_10_c: 1352.7 1353.7 749.7 446.7
put_h264_qpel_8_mc10_10_neon: 289.7 294.2 135.5 88.5
put_h264_qpel_8_mc11_10_c: 2080.0 2066.2 1309.5 876.7
put_h264_qpel_8_mc11_10_neon: 450.0 429.7 229.7 131.2
put_h264_qpel_8_mc13_10_c: 2074.7 2060.2 1294.5 870.5
put_h264_qpel_8_mc13_10_neon: 452.5 434.5 226.5 130.0
put_h264_qpel_8_mc20_10_c: 1221.5 1216.0 684.5 399.7
put_h264_qpel_8_mc20_10_neon: 257.7 262.5 121.2 78.7
put_h264_qpel_8_mc30_10_c: 1379.0 1374.7 757.2 449.5
put_h264_qpel_8_mc30_10_neon: 305.7 310.2 135.5 86.5
put_h264_qpel_8_mc31_10_c: 2109.2 2119.7 1299.5 878.0
put_h264_qpel_8_mc31_10_neon: 478.0 458.5 226.0 137.2
put_h264_qpel_8_mc33_10_c: 2101.5 2115.2 1306.5 887.0
put_h264_qpel_8_mc33_10_neon: 479.0 458.7 229.7 141.7
put_h264_qpel_16_mc01_10_c: 3485.7 3396.7 2460.5 1914.5
put_h264_qpel_16_mc01_10_neon: 752.5 665.5 397.0 213.2
put_h264_qpel_16_mc02_10_c: 3103.5 3023.2 2154.7 1720.7
put_h264_qpel_16_mc02_10_neon: 622.7 551.2 347.7 196.2
put_h264_qpel_16_mc03_10_c: 3486.2 3394.0 2436.5 1917.7
put_h264_qpel_16_mc03_10_neon: 754.0 666.5 397.0 215.7
put_h264_qpel_16_mc10_10_c: 5533.0 5488.5 2989.0 1783.0
put_h264_qpel_16_mc10_10_neon: 1123.5 1165.2 535.2 334.7
put_h264_qpel_16_mc11_10_c: 8437.7 8281.2 5209.0 3510.7
put_h264_qpel_16_mc11_10_neon: 1745.0 1697.0 878.5 513.5
put_h264_qpel_16_mc13_10_c: 8567.7 8468.0 5221.5 3528.0
put_h264_qpel_16_mc13_10_neon: 1751.7 1698.2 889.2 507.0
put_h264_qpel_16_mc20_10_c: 4907.5 4885.0 2786.2 1607.5
put_h264_qpel_16_mc20_10_neon: 995.5 1034.5 475.5 307.0
put_h264_qpel_16_mc30_10_c: 5579.7 5537.7 3045.2 1789.5
put_h264_qpel_16_mc30_10_neon: 1187.5 1231.2 532.5 334.5
put_h264_qpel_16_mc31_10_c: 8677.2 8672.5 5204.2 3516.0
put_h264_qpel_16_mc31_10_neon: 1850.7 1813.2 893.0 545.2
put_h264_qpel_16_mc33_10_c: 8688.7 8671.2 5223.2 3512.0
put_h264_qpel_16_mc33_10_neon: 1851.7 1814.2 908.5 535.2
Signed-off-by: Mikhail Nitenko <mnitenko at gmail.com>
---
I reworked the patch (as Martin once suggested): it no longer
widens to 32 bits in lowpass_8_10, and it also uses the much
faster lowpass_8_10_v.
libavcodec/aarch64/h264qpel_init_aarch64.c | 91 +++-
libavcodec/aarch64/h264qpel_neon.S | 532 +++++++++++++++++++++
2 files changed, 621 insertions(+), 2 deletions(-)
diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c
index 77f41d9a21..93fa5246c4 100644
--- a/libavcodec/aarch64/h264qpel_init_aarch64.c
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
@@ -95,12 +95,55 @@ void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t str
void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel16_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_put_h264_qpel8_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_put_h264_qpel8_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_avg_h264_qpel16_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel16_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
+void ff_avg_h264_qpel8_mc10_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc20_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc30_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc01_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc11_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc31_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc02_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc03_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc13_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+void ff_avg_h264_qpel8_mc33_neon_10(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
+
av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
{
- const int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
- if (have_neon(cpu_flags) && !high_bit_depth) {
+ if (have_neon(cpu_flags) && bit_depth <= 8) {
c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
@@ -168,5 +211,49 @@ av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
+ } else if (have_neon(cpu_flags) && bit_depth == 10) {
+ c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon_10;
+ c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon_10;
+ c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon_10;
+ c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon_10;
+ c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon_10;
+ c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon_10;
+ c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon_10;
+ c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon_10;
+ c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon_10;
+ c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon_10;
+
+ c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon_10;
+ c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon_10;
+ c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon_10;
+ c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon_10;
+ c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon_10;
+ c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon_10;
+ c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon_10;
+ c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon_10;
+ c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon_10;
+ c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon_10;
+
+ c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon_10;
+ c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon_10;
+
+ c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon_10;
+ c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon_10;
}
}
diff --git a/libavcodec/aarch64/h264qpel_neon.S b/libavcodec/aarch64/h264qpel_neon.S
index f4475d96f9..31130a57fd 100644
--- a/libavcodec/aarch64/h264qpel_neon.S
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@ -933,3 +933,535 @@ endfunc
h264_qpel16 put
h264_qpel16 avg
+
+// Horizontal 6-tap lowpass over two rows of 16-bit samples (8 outputs per row).
+// \r0:\r1 and \r2:\r3 each hold 16 consecutive samples of one row. Output
+// lane i of \d0 (first row) / \d1 (second row) is
+//   s[i] + s[i+5] + v6.h[1] * (s[i+2] + s[i+3]) - v6.h[0] * (s[i+1] + s[i+4])
+// with the subtraction saturating at 0, followed by a rounding shift right
+// by 5 and a clamp to 1023.
+// v6.h[0] / v6.h[1] must be preloaded by the caller (presumably 5 and 20 from
+// lowpass_const, which is defined outside this hunk - TODO confirm).
+// \d0 may alias \r0 and \d1 may alias \r1: each source is fully read before
+// the matching destination is first written. Destinations must not be v0-v5.
+//trashes v0-v5
+.macro lowpass_8_10 r0, r1, r2, r3, d0, d1
+ ext v2.16B, \r0\().16B, \r1\().16B, #4 // row 0: s[i+2]
+ ext v3.16B, \r0\().16B, \r1\().16B, #6 // row 0: s[i+3]
+ add v2.8H, v2.8H, v3.8H // row 0: centre pair s[i+2] + s[i+3]
+ ext v4.16B, \r0\().16B, \r1\().16B, #2 // row 0: s[i+1]
+ ext v5.16B, \r0\().16B, \r1\().16B, #8 // row 0: s[i+4]
+ add v4.8H, v4.8H, v5.8H // row 0: inner pair s[i+1] + s[i+4]
+ ext v1.16B, \r0\().16B, \r1\().16B, #10 // row 0: s[i+5]
+
+ add \d0\().8H, \r0\().8H, v1.8H // row 0: outer pair s[i] + s[i+5]
+ ext v0.16B, \r2\().16B, \r3\().16B, #4 // row 1: s[i+2]
+ mla \d0\().8H, v2.8H, v6.H[1] // row 0: += v6.h[1] * centre pair
+ ext v1.16B, \r2\().16B, \r3\().16B, #6 // row 1: s[i+3]
+ add v0.8H, v0.8H, v1.8H // row 1: centre pair
+ ext v1.16B, \r2\().16B, \r3\().16B, #2 // row 1: s[i+1]
+ mul v5.8H, v4.8H, v6.H[0] // row 0: v6.h[0] * inner pair
+ uqsub \d0\().8H, \d0\().8H, v5.8H // row 0: unsigned saturating subtract, floors at 0
+ urshr \d0\().8H, \d0\().8H, #5 // row 0: rounding shift right by 5
+
+ ext v3.16B, \r2\().16B, \r3\().16B, #8 // row 1: s[i+4]
+ add v1.8H, v1.8H, v3.8H // row 1: inner pair
+ ext v2.16B, \r2\().16B, \r3\().16B, #10 // row 1: s[i+5]
+
+ add \d1\().8H, \r2\().8H, v2.8H // row 1: outer pair s[i] + s[i+5]
+ mla \d1\().8H, v0.8H, v6.H[1] // row 1: += v6.h[1] * centre pair
+ mul v5.8H, v1.8H, v6.H[0] // row 1: v6.h[0] * inner pair
+ uqsub \d1\().8H, \d1\().8H, v5.8H // row 1: unsigned saturating subtract, floors at 0
+ mvni v5.8h, #0xFC, lsl #8 // 1023 for clipping
+ urshr \d1\().8H, \d1\().8H, #5 // row 1: rounding shift right by 5
+
+ umin \d0\().8H, \d0\().8H, v5.8h // row 0: clamp to 1023
+ umin \d1\().8H, \d1\().8H, v5.8h // row 1: clamp to 1023
+.endm
+
+// Vertical 6-tap lowpass: two output rows from seven consecutive input rows
+// of eight 16-bit samples each. Per lane:
+//   \d0 = \r0 + \r5 + v6.h[1] * (\r2 + \r3) - v6.h[0] * (\r1 + \r4)
+//   \d1 = \r1 + \r6 + v6.h[1] * (\r3 + \r4) - v6.h[0] * (\r2 + \r5)
+// with the subtraction saturating at 0, followed by a rounding shift right
+// by 5 and a clamp to 1023.
+// v6.h[0] / v6.h[1] must be preloaded by the caller (presumably 5 and 20 from
+// lowpass_const, which is defined outside this hunk - TODO confirm).
+// \d0 may alias \r0 and \d1 may alias \r1.
+//trashes v0, v1, v2 and v4 (v3 is not written)
+.macro lowpass_8_10_v r0, r1, r2, r3, r4, r5, r6, d0, d1
+ add v2.8H, \r2\().8H, \r3\().8H // centre pair for \d0
+ add v0.8H, \r3\().8H, \r4\().8H // centre pair for \d1
+ add v4.8H, \r1\().8H, \r4\().8H // inner pair for \d0
+ add v1.8H, \r2\().8H, \r5\().8H // inner pair for \d1
+
+ add \d0\().8H, \r0\().8H, \r5\().8H // outer pair for \d0 (last read of \r0)
+ add \d1\().8H, \r1\().8H, \r6\().8H // outer pair for \d1 (last read of \r1)
+ mla \d0\().8H, v2.8H, v6.H[1] // += v6.h[1] * centre pair
+ mla \d1\().8H, v0.8H, v6.H[1]
+ mul v2.8H, v4.8H, v6.H[0] // v6.h[0] * inner pair
+ mul v0.8H, v1.8H, v6.H[0]
+ uqsub \d0\().8H, \d0\().8H, v2.8H // unsigned saturating subtract, floors at 0
+ uqsub \d1\().8H, \d1\().8H, v0.8H
+
+ mvni v0.8H, #0xFC, lsl #8 // 1023 for clipping
+
+ urshr \d0\().8H, \d0\().8H, #5 // rounding shift right by 5
+ urshr \d1\().8H, \d1\().8H, #5
+
+ umin \d0\().8H, \d0\().8H, v0.8H // clamp to 1023
+ umin \d1\().8H, \d1\().8H, v0.8H
+.endm
+
+// Horizontal lowpass (put) of a 16-row, 16-sample-wide block of 16-bit
+// samples into a packed buffer: the left 8 columns of all 16 rows are
+// written first, immediately followed by the right 8 columns, 16 bytes per
+// row.
+//   x0 = destination, x1 = source, x2 = source stride in bytes
+// Clobbers x3, x4, x12 plus everything put_h264_qpel8_h_lowpass_neon_10
+// touches. Tail-calls the 8-wide helper for the second half.
+function put_h264_qpel16_h_lowpass_neon_packed_10
+        mov             x3,  #16                // packed row pitch: 8 samples * 2 bytes
+        mov             x4,  x30                // keep the return address across the bl
+        mov             x12, #32                // helper counts down by 4 per row pair: 16 rows
+        bl              put_h264_qpel8_h_lowpass_neon_10
+        add             x1,  x1,  #16           // step right by 8 samples ...
+        sub             x1,  x1,  x2, lsl #4    // ... and back up 16 rows
+        mov             x30, x4
+        mov             x12, #32                // counter reached zero, reload for 16 rows
+        b               put_h264_qpel8_h_lowpass_neon_10
+endfunc
+
+// Horizontal lowpass for 16-bit samples, put/avg selected by \type.
+//   x0 = destination, x3 = destination stride in bytes
+//   x1 = source (two 8-sample vectors are loaded per row), x2 = source stride
+//   x12 = 2 * number of rows (8-wide entry point only; the loop handles two
+//         rows per iteration and subtracts 4)
+// The 16-wide entry point filters the left 8 columns with a call, rewinds
+// both pointers by 16 rows, steps 16 bytes to the right and then falls
+// through into the 8-wide function for the right half.
+.macro  h264_qpel_h_lowpass_10 type
+function \type\()_h264_qpel16_h_lowpass_neon_10
+        mov             x12, #32                // 16 rows
+        mov             x13, x30                // keep the return address across the bl
+        bl              \type\()_h264_qpel8_h_lowpass_neon_10
+        sub             x1,  x1,  x2, lsl #4    // source: back up 16 rows
+        sub             x0,  x0,  x3, lsl #4    // destination: back up 16 rows
+        add             x1,  x1,  #16           // both: 8 samples to the right
+        add             x0,  x0,  #16
+        mov             x30, x13
+        mov             x12, #32                // 16 rows again, then fall through
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_neon_10
+1:      ld1             {v28.8h, v29.8h}, [x1], x2      // first row of the pair
+        ld1             {v16.8h, v17.8h}, [x1], x2      // second row of the pair
+        subs            x12, x12, #4                    // flags are consumed by b.ne below
+        lowpass_8_10    v28, v29, v16, v17, v28, v20
+  .ifc \type,avg
+        ld1             {v2.8h},  [x0], x3              // current destination rows
+        ld1             {v3.8h},  [x0]
+        urhadd          v28.8h, v28.8h, v2.8h           // rounding average with destination
+        urhadd          v20.8h, v20.8h, v3.8h
+        sub             x0,  x0,  x3                    // undo the single post-increment
+  .endif
+        st1             {v28.8h}, [x0], x3
+        st1             {v20.8h}, [x0], x3
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+        h264_qpel_h_lowpass_10 put
+        h264_qpel_h_lowpass_10 avg
+
+// Horizontal lowpass for 16-bit samples whose result is rounding-averaged
+// with a second input before being stored (put) or further averaged with
+// the destination (avg).
+//   x0 = destination, x1 = filter source, x3 = second input
+//   x2 = stride in bytes, shared by all three pointers
+//   x12 = 2 * number of rows (8-wide entry point only)
+// The 16-wide entry point handles the left 8 columns with a call, rewinds
+// the three pointers by 16 rows, steps 16 bytes to the right and falls
+// through into the 8-wide function.
+.macro  h264_qpel_h_lowpass_l2_10 type
+function \type\()_h264_qpel16_h_lowpass_l2_neon_10
+        mov             x12, #32                // 16 rows
+        mov             x13, x30                // keep the return address across the bl
+        bl              \type\()_h264_qpel8_h_lowpass_l2_neon_10
+        sub             x0,  x0,  x2, lsl #4    // destination: up 16 rows, right 8 samples
+        add             x0,  x0,  #16
+        sub             x1,  x1,  x2, lsl #4    // filter source: likewise
+        add             x1,  x1,  #16
+        sub             x3,  x3,  x2, lsl #4    // second input: likewise
+        add             x3,  x3,  #16
+        mov             x30, x13
+        mov             x12, #32                // 16 rows again, then fall through
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_l2_neon_10
+1:      ld1             {v26.8h, v27.8h}, [x1], x2      // first row of the pair
+        ld1             {v16.8h, v17.8h}, [x1], x2      // second row of the pair
+        ld1             {v28.8h}, [x3], x2              // matching rows of the second input
+        ld1             {v29.8h}, [x3], x2
+        subs            x12, x12, #4                    // flags are consumed by b.ne below
+        lowpass_8_10    v26, v27, v16, v17, v26, v27
+        urhadd          v26.8h, v26.8h, v28.8h          // rounding average with second input
+        urhadd          v27.8h, v27.8h, v29.8h
+  .ifc \type,avg
+        ld1             {v2.8h},  [x0], x2              // current destination rows
+        ld1             {v3.8h},  [x0]
+        urhadd          v26.8h, v26.8h, v2.8h
+        urhadd          v27.8h, v27.8h, v3.8h
+        sub             x0,  x0,  x2                    // undo the single post-increment
+  .endif
+        st1             {v26.8h}, [x0], x2
+        st1             {v27.8h}, [x0], x2
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+        h264_qpel_h_lowpass_l2_10 put
+        h264_qpel_h_lowpass_l2_10 avg
+
+// Vertical lowpass (put) of a 16-row, 16-sample-wide block of 16-bit samples
+// into a packed buffer: the left 8 columns of all 16 rows are written
+// first, immediately followed by the right 8 columns, 16 bytes per row.
+//   x0 = destination, x1 = source, x3 = source stride in bytes
+// Clobbers x2 and x4 plus everything put_h264_qpel8_v_lowpass_neon_10
+// touches. Each helper call reads 13 source rows and leaves x1 on the 13th,
+// so x1 is moved back 4 rows to start the next 8 output rows.
+// This is the 16-bit counterpart of the 8-bit packed helper, so it must call
+// the _10 routine and use a 16-byte (8 samples * 2 bytes) pitch and column
+// step rather than the 8-byte values of the 8-bit code.
+function put_h264_qpel16_v_lowpass_neon_packed_10
+        mov             x4,  x30                // keep the return address across the bl's
+        mov             x2,  #16                // packed row pitch: 8 samples * 2 bytes
+        bl              put_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2    // back 4 rows: input for output rows 8..15
+        bl              put_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #4    // back 16 + 4 rows: top of the block again
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #16           // 8 samples to the right
+        bl              put_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4
+        b               put_h264_qpel8_v_lowpass_neon_10
+endfunc
+
+// Vertical lowpass for 16-bit samples, put/avg selected by \type.
+//   x0 = destination, x2 = destination stride in bytes
+//   x1 = source, x3 = source stride in bytes
+// The 8-wide function reads 13 rows starting at x1 and writes 8 rows; it
+// leaves x1 on the 13th row and x0 just past the 8th output row. Callers in
+// this file pass x1 = src - 2 * stride.
+// The 16-wide entry point runs the 8-wide function on the four 8x8
+// quadrants (top-left, bottom-left, top-right, then falls through for
+// bottom-right), using x4 to hold the return address.
+.macro  h264_qpel_v_lowpass_10 type
+function \type\()_h264_qpel16_v_lowpass_neon_10
+        mov             x4,  x30
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2    // back 4 rows: input for output rows 8..15
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #4    // source: back 16 + 4 rows ...
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #16           // ... and 8 samples to the right
+        sub             x0,  x0,  x2, lsl #4    // destination: back 16 rows ...
+        add             x0,  x0,  #16           // ... and 8 samples to the right
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4                 // then fall through for the last quadrant
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_neon_10
+        ld1             {v16.8h}, [x1], x3
+        ld1             {v17.8h}, [x1], x3
+        ld1             {v18.8h}, [x1], x3
+        ld1             {v19.8h}, [x1], x3
+        ld1             {v20.8h}, [x1], x3
+        ld1             {v21.8h}, [x1], x3
+        ld1             {v22.8h}, [x1], x3
+        ld1             {v23.8h}, [x1], x3
+        ld1             {v24.8h}, [x1], x3
+        ld1             {v25.8h}, [x1], x3
+        ld1             {v26.8h}, [x1], x3
+        ld1             {v27.8h}, [x1], x3
+        ld1             {v28.8h}, [x1]                  // 13th row, no post-increment
+
+        // Two output rows per invocation, written over the two oldest inputs.
+        lowpass_8_10_v  v16, v17, v18, v19, v20, v21, v22, v16, v17
+        lowpass_8_10_v  v18, v19, v20, v21, v22, v23, v24, v18, v19
+        lowpass_8_10_v  v20, v21, v22, v23, v24, v25, v26, v20, v21
+        lowpass_8_10_v  v22, v23, v24, v25, v26, v27, v28, v22, v23
+
+  .ifc \type,avg
+        // Rounding average with the 8 rows currently in the destination.
+        ld1             {v24.8h}, [x0], x2
+        ld1             {v25.8h}, [x0], x2
+        ld1             {v26.8h}, [x0], x2
+        urhadd          v16.8h, v16.8h, v24.8h
+        ld1             {v27.8h}, [x0], x2
+        urhadd          v17.8h, v17.8h, v25.8h
+        ld1             {v28.8h}, [x0], x2
+        urhadd          v18.8h, v18.8h, v26.8h
+        ld1             {v29.8h}, [x0], x2
+        urhadd          v19.8h, v19.8h, v27.8h
+        ld1             {v30.8h}, [x0], x2
+        urhadd          v20.8h, v20.8h, v28.8h
+        ld1             {v31.8h}, [x0], x2
+        urhadd          v21.8h, v21.8h, v29.8h
+        urhadd          v22.8h, v22.8h, v30.8h
+        urhadd          v23.8h, v23.8h, v31.8h
+        sub             x0,  x0,  x2, lsl #3            // rewind the 8 rows just read
+  .endif
+
+        st1             {v16.8h}, [x0], x2
+        st1             {v17.8h}, [x0], x2
+        st1             {v18.8h}, [x0], x2
+        st1             {v19.8h}, [x0], x2
+        st1             {v20.8h}, [x0], x2
+        st1             {v21.8h}, [x0], x2
+        st1             {v22.8h}, [x0], x2
+        st1             {v23.8h}, [x0], x2
+
+        ret
+endfunc
+.endm
+
+        h264_qpel_v_lowpass_10 put
+        h264_qpel_v_lowpass_10 avg
+
+// Vertical lowpass for 16-bit samples whose result is rounding-averaged
+// with a second input before being stored (put) or further averaged with
+// the destination (avg).
+//   x0  = destination, x1 = filter source; both use stride x3 (bytes)
+//   x12 = second input, stride x2 (bytes)
+// The 8-wide function reads 13 rows starting at x1 and 8 rows from x12 and
+// writes 8 rows; it leaves x1 on the 13th row and x0 / x12 just past the
+// 8th row.
+// The 16-wide entry point runs the 8-wide function on the four 8x8
+// quadrants (top-left, bottom-left, top-right, then falls through for
+// bottom-right), using x4 to hold the return address.
+.macro  h264_qpel_v_lowpass_l2_10 type
+function \type\()_h264_qpel16_v_lowpass_l2_neon_10
+        mov             x4,  x30
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        sub             x1,  x1,  x3, lsl #2    // back 4 rows: input for output rows 8..15
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        sub             x1,  x1,  x3, lsl #4    // filter source: back 16 + 4 rows ...
+        sub             x1,  x1,  x3, lsl #2
+        add             x1,  x1,  #16           // ... and 8 samples to the right
+        sub             x0,  x0,  x3, lsl #4    // destination: back 16 rows, right 8 samples
+        add             x0,  x0,  #16
+        sub             x12, x12, x2, lsl #4    // second input: back 16 rows, right 8 samples
+        add             x12, x12, #16
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        sub             x1,  x1,  x3, lsl #2
+        mov             x30, x4                 // then fall through for the last quadrant
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        ld1             {v16.8h}, [x1], x3
+        ld1             {v17.8h}, [x1], x3
+        ld1             {v18.8h}, [x1], x3
+        ld1             {v19.8h}, [x1], x3
+        ld1             {v20.8h}, [x1], x3
+        ld1             {v21.8h}, [x1], x3
+        ld1             {v22.8h}, [x1], x3
+        ld1             {v23.8h}, [x1], x3
+        ld1             {v24.8h}, [x1], x3
+        ld1             {v25.8h}, [x1], x3
+        ld1             {v26.8h}, [x1], x3
+        ld1             {v27.8h}, [x1], x3
+        ld1             {v28.8h}, [x1]                  // 13th row, no post-increment
+
+        // Two output rows per invocation, written over the two oldest inputs.
+        lowpass_8_10_v  v16, v17, v18, v19, v20, v21, v22, v16, v17
+        lowpass_8_10_v  v18, v19, v20, v21, v22, v23, v24, v18, v19
+        lowpass_8_10_v  v20, v21, v22, v23, v24, v25, v26, v20, v21
+        lowpass_8_10_v  v22, v23, v24, v25, v26, v27, v28, v22, v23
+
+        // Rounding average with the 8 rows of the second input.
+        ld1             {v24.8h}, [x12], x2
+        ld1             {v25.8h}, [x12], x2
+        ld1             {v26.8h}, [x12], x2
+        ld1             {v27.8h}, [x12], x2
+        ld1             {v28.8h}, [x12], x2
+        urhadd          v16.8h, v24.8h, v16.8h
+        urhadd          v17.8h, v25.8h, v17.8h
+        ld1             {v29.8h}, [x12], x2
+        urhadd          v18.8h, v26.8h, v18.8h
+        urhadd          v19.8h, v27.8h, v19.8h
+        ld1             {v30.8h}, [x12], x2
+        urhadd          v20.8h, v28.8h, v20.8h
+        urhadd          v21.8h, v29.8h, v21.8h
+        ld1             {v31.8h}, [x12], x2
+        urhadd          v22.8h, v30.8h, v22.8h
+        urhadd          v23.8h, v31.8h, v23.8h
+
+  .ifc \type,avg
+        // Rounding average with the 8 rows currently in the destination.
+        ld1             {v24.8h}, [x0], x3
+        ld1             {v25.8h}, [x0], x3
+        ld1             {v26.8h}, [x0], x3
+        urhadd          v16.8h, v16.8h, v24.8h
+        ld1             {v27.8h}, [x0], x3
+        urhadd          v17.8h, v17.8h, v25.8h
+        ld1             {v28.8h}, [x0], x3
+        urhadd          v18.8h, v18.8h, v26.8h
+        ld1             {v29.8h}, [x0], x3
+        urhadd          v19.8h, v19.8h, v27.8h
+        ld1             {v30.8h}, [x0], x3
+        urhadd          v20.8h, v20.8h, v28.8h
+        ld1             {v31.8h}, [x0], x3
+        urhadd          v21.8h, v21.8h, v29.8h
+        urhadd          v22.8h, v22.8h, v30.8h
+        urhadd          v23.8h, v23.8h, v31.8h
+        sub             x0,  x0,  x3, lsl #3            // rewind the 8 rows just read
+  .endif
+
+        st1             {v16.8h}, [x0], x3
+        st1             {v17.8h}, [x0], x3
+        st1             {v18.8h}, [x0], x3
+        st1             {v19.8h}, [x0], x3
+        st1             {v20.8h}, [x0], x3
+        st1             {v21.8h}, [x0], x3
+        st1             {v22.8h}, [x0], x3
+        st1             {v23.8h}, [x0], x3
+
+        ret
+endfunc
+.endm
+
+        h264_qpel_v_lowpass_l2_10 put
+        h264_qpel_v_lowpass_l2_10 avg
+
+// Exported 8x8 entry points for 16-bit samples, put/avg selected by \type.
+// All take x0 = dst, x1 = src, x2 = stride in bytes (matching the C
+// prototypes dst, src, stride) and set up registers for the shared lowpass
+// helpers. lowpass_const loads the filter taps and uses w3 as scratch, so
+// x3 is only assigned after it. Samples are 2 bytes wide, hence the #2 / #4
+// byte offsets for 1 / 2 samples.
+.macro  h264_qpel8_10 type
+// mc10: horizontal lowpass averaged with the source at the same position.
+function ff_\type\()_h264_qpel8_mc10_neon_10, export=1
+        lowpass_const   w3
+        mov             x12, #16                // 8 rows
+        mov             x3,  x1                 // second input: unfiltered source
+        sub             x1,  x1,  #4            // filter input starts 2 samples to the left
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon_10
+endfunc
+
+// mc20: plain horizontal lowpass.
+function ff_\type\()_h264_qpel8_mc20_neon_10, export=1
+        lowpass_const   w3
+        mov             x3,  x2                 // destination stride = source stride
+        mov             x12, #16                // 8 rows
+        sub             x1,  x1,  #4            // filter input starts 2 samples to the left
+        b               \type\()_h264_qpel8_h_lowpass_neon_10
+endfunc
+
+// mc30: horizontal lowpass averaged with the source one sample to the right.
+function ff_\type\()_h264_qpel8_mc30_neon_10, export=1
+        lowpass_const   w3
+        mov             x12, #16                // 8 rows
+        add             x3,  x1,  #2            // second input: source + 1 sample
+        sub             x1,  x1,  #4            // filter input starts 2 samples to the left
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon_10
+endfunc
+
+// mc01: vertical lowpass averaged with the source at the same position.
+// mc03 enters at the local label with x12 / x14 already set.
+function ff_\type\()_h264_qpel8_mc01_neon_10, export=1
+        mov             x12, x1                 // second input: unfiltered source
+        mov             x14, x30                // return address, used by ret x14
+\type\()_h264_qpel8_mc01_10:
+        lowpass_const   w3
+        sub             x1,  x1,  x2, lsl #1    // filter input starts 2 rows above
+        mov             x3,  x2                 // stride for filter source and destination
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        ret             x14
+endfunc
+
+// mc11: horizontal lowpass into a 128-byte stack buffer (8 rows * 16 bytes),
+// then vertical lowpass averaged with that buffer.
+// mc31 / mc13 / mc33 enter at the local label with x1 = horizontal filter
+// row, x8 = dst, x9 = vertical filter column, x14 = return address.
+function ff_\type\()_h264_qpel8_mc11_neon_10, export=1
+        mov             x8,  x0
+        mov             x9,  x1
+        mov             x14, x30
+\type\()_h264_qpel8_mc11_10:
+        lowpass_const   w3
+        mov             x11, sp                 // remember sp to release the buffer later
+        sub             sp,  sp,  #128
+        sub             x1,  x1,  #4            // filter input starts 2 samples to the left
+        mov             x0,  sp                 // horizontal pass writes to the buffer
+        mov             x12, #16                // 8 rows
+        mov             x3,  #16                // buffer pitch: 8 samples * 2 bytes
+        bl              put_h264_qpel8_h_lowpass_neon_10
+        mov             x12, sp                 // second input: the buffer ...
+        mov             x0,  x8
+        mov             x3,  x2                 // real stride for source and destination
+        sub             x1,  x9,  x2, lsl #1    // filter input starts 2 rows above
+        mov             x2,  #16                // ... read with the buffer pitch
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon_10
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+// mc31: as mc11, with the vertical filter one sample to the right.
+function ff_\type\()_h264_qpel8_mc31_neon_10, export=1
+        add             x9,  x1,  #2            // vertical column: source + 1 sample
+        mov             x8,  x0
+        mov             x14, x30
+        b               \type\()_h264_qpel8_mc11_10
+endfunc
+
+// mc02: plain vertical lowpass.
+function ff_\type\()_h264_qpel8_mc02_neon_10, export=1
+        mov             x14, x30                // return address, used by ret x14
+        lowpass_const   w3
+        mov             x3,  x2                 // source stride = destination stride
+        sub             x1,  x1,  x2, lsl #1    // filter input starts 2 rows above
+        bl              \type\()_h264_qpel8_v_lowpass_neon_10
+        ret             x14
+endfunc
+
+// mc03: vertical lowpass averaged with the source one row below.
+function ff_\type\()_h264_qpel8_mc03_neon_10, export=1
+        add             x12, x1,  x2            // second input: source + 1 row
+        mov             x14, x30
+        b               \type\()_h264_qpel8_mc01_10
+endfunc
+
+// mc13: as mc11, with the horizontal filter one row below.
+function ff_\type\()_h264_qpel8_mc13_neon_10, export=1
+        mov             x9,  x1                 // vertical column: unshifted source
+        mov             x8,  x0
+        mov             x14, x30
+        add             x1,  x1,  x2            // horizontal row: source + 1 row
+        b               \type\()_h264_qpel8_mc11_10
+endfunc
+
+// mc33: as mc11, with the vertical filter one sample to the right and the
+// horizontal filter one row below.
+function ff_\type\()_h264_qpel8_mc33_neon_10, export=1
+        add             x9,  x1,  #2            // vertical column: source + 1 sample
+        add             x1,  x1,  x2            // horizontal row: source + 1 row
+        mov             x8,  x0
+        mov             x14, x30
+        b               \type\()_h264_qpel8_mc11_10
+endfunc
+.endm
+
+        h264_qpel8_10 put
+        h264_qpel8_10 avg
+
+// Exported 16x16 entry points for 16-bit samples, put/avg selected by \type.
+// All take x0 = dst, x1 = src, x2 = stride in bytes (matching the C
+// prototypes dst, src, stride) and set up registers for the shared 16-wide
+// lowpass helpers, which load their own row counter. lowpass_const loads
+// the filter taps and uses w3 as scratch, so x3 is only assigned after it.
+// Samples are 2 bytes wide, hence the #2 / #4 byte offsets for 1 / 2
+// samples.
+.macro  h264_qpel16_10 type
+// mc10: horizontal lowpass averaged with the source at the same position.
+function ff_\type\()_h264_qpel16_mc10_neon_10, export=1
+        lowpass_const   w3
+        mov             x3,  x1                 // second input: unfiltered source
+        sub             x1,  x1,  #4            // filter input starts 2 samples to the left
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon_10
+endfunc
+
+// mc20: plain horizontal lowpass.
+function ff_\type\()_h264_qpel16_mc20_neon_10, export=1
+        lowpass_const   w3
+        mov             x3,  x2                 // destination stride = source stride
+        sub             x1,  x1,  #4            // filter input starts 2 samples to the left
+        b               \type\()_h264_qpel16_h_lowpass_neon_10
+endfunc
+
+// mc30: horizontal lowpass averaged with the source one sample to the right.
+function ff_\type\()_h264_qpel16_mc30_neon_10, export=1
+        lowpass_const   w3
+        add             x3,  x1,  #2            // second input: source + 1 sample
+        sub             x1,  x1,  #4            // filter input starts 2 samples to the left
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon_10
+endfunc
+
+// mc01: vertical lowpass averaged with the source at the same position.
+// mc03 enters at the local label with x12 / x14 already set.
+function ff_\type\()_h264_qpel16_mc01_neon_10, export=1
+        mov             x12, x1                 // second input: unfiltered source
+        mov             x14, x30                // return address, used by ret x14
+\type\()_h264_qpel16_mc01_10:
+        lowpass_const   w3
+        sub             x1,  x1,  x2, lsl #1    // filter input starts 2 rows above
+        mov             x3,  x2                 // stride for filter source and destination
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon_10
+        ret             x14
+endfunc
+
+// mc11: horizontal lowpass into a 512-byte stack buffer (16 rows * 32
+// bytes), then vertical lowpass averaged with that buffer.
+// mc31 / mc13 / mc33 enter at the local label with x1 = horizontal filter
+// row, x8 = dst, x9 = vertical filter column, x14 = return address.
+function ff_\type\()_h264_qpel16_mc11_neon_10, export=1
+        mov             x8,  x0
+        mov             x9,  x1
+        mov             x14, x30
+\type\()_h264_qpel16_mc11_10:
+        lowpass_const   w3
+        mov             x11, sp                 // remember sp to release the buffer later
+        sub             sp,  sp,  #512
+        sub             x1,  x1,  #4            // filter input starts 2 samples to the left
+        mov             x0,  sp                 // horizontal pass writes to the buffer
+        mov             x3,  #32                // buffer pitch: 16 samples * 2 bytes
+        bl              put_h264_qpel16_h_lowpass_neon_10
+        mov             x12, sp                 // second input: the buffer ...
+        mov             x0,  x8
+        mov             x3,  x2                 // real stride for source and destination
+        sub             x1,  x9,  x2, lsl #1    // filter input starts 2 rows above
+        mov             x2,  #32                // ... read with the buffer pitch
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon_10
+        mov             sp,  x11
+        ret             x14
+endfunc
+
+// mc31: as mc11, with the vertical filter one sample to the right.
+function ff_\type\()_h264_qpel16_mc31_neon_10, export=1
+        add             x9,  x1,  #2            // vertical column: source + 1 sample
+        mov             x8,  x0
+        mov             x14, x30
+        b               \type\()_h264_qpel16_mc11_10
+endfunc
+
+// mc02: plain vertical lowpass.
+function ff_\type\()_h264_qpel16_mc02_neon_10, export=1
+        mov             x14, x30                // return address, used by ret x14
+        lowpass_const   w3
+        mov             x3,  x2                 // source stride = destination stride
+        sub             x1,  x1,  x2, lsl #1    // filter input starts 2 rows above
+        bl              \type\()_h264_qpel16_v_lowpass_neon_10
+        ret             x14
+endfunc
+
+// mc03: vertical lowpass averaged with the source one row below.
+function ff_\type\()_h264_qpel16_mc03_neon_10, export=1
+        add             x12, x1,  x2            // second input: source + 1 row
+        mov             x14, x30
+        b               \type\()_h264_qpel16_mc01_10
+endfunc
+
+// mc13: as mc11, with the horizontal filter one row below.
+function ff_\type\()_h264_qpel16_mc13_neon_10, export=1
+        mov             x9,  x1                 // vertical column: unshifted source
+        mov             x8,  x0
+        mov             x14, x30
+        add             x1,  x1,  x2            // horizontal row: source + 1 row
+        b               \type\()_h264_qpel16_mc11_10
+endfunc
+
+// mc33: as mc11, with the vertical filter one sample to the right and the
+// horizontal filter one row below.
+function ff_\type\()_h264_qpel16_mc33_neon_10, export=1
+        add             x9,  x1,  #2            // vertical column: source + 1 sample
+        add             x1,  x1,  x2            // horizontal row: source + 1 row
+        mov             x8,  x0
+        mov             x14, x30
+        b               \type\()_h264_qpel16_mc11_10
+endfunc
+.endm
+
+        h264_qpel16_10 put
+        h264_qpel16_10 avg
--
2.34.1
More information about the ffmpeg-devel
mailing list