[FFmpeg-devel] [PATCH 2/2] lavc/aarch64: add hevc epel/qpel assembly

Josh Dekker josh at itanimul.li
Wed Apr 28 22:50:26 EEST 2021


From: Rafal Dabrowa <fatwildcat at gmail.com>

Benchmarked with checkasm --bench on Apple M1 (C reference vs. NEON; lower is better):

put_hevc_epel_bi_h4_8_c: 69.9
put_hevc_epel_bi_h4_8_neon: 15.4
put_hevc_epel_bi_h6_8_c: 137.1
put_hevc_epel_bi_h6_8_neon: 31.9
put_hevc_epel_bi_h8_8_c: 124.6
put_hevc_epel_bi_h8_8_neon: 40.9
put_hevc_epel_bi_h12_8_c: 331.9
put_hevc_epel_bi_h12_8_neon: 72.4
put_hevc_epel_bi_h16_8_c: 383.4
put_hevc_epel_bi_h16_8_neon: 124.9
put_hevc_epel_bi_h24_8_c: 771.6
put_hevc_epel_bi_h24_8_neon: 209.6
put_hevc_epel_bi_h32_8_c: 1324.4
put_hevc_epel_bi_h32_8_neon: 389.4
put_hevc_epel_bi_h48_8_c: 2869.6
put_hevc_epel_bi_h48_8_neon: 730.1
put_hevc_epel_bi_h64_8_c: 4992.6
put_hevc_epel_bi_h64_8_neon: 1490.4
put_hevc_epel_bi_hv4_8_c: 163.4
put_hevc_epel_bi_hv4_8_neon: 38.4
put_hevc_epel_bi_hv6_8_c: 292.4
put_hevc_epel_bi_hv6_8_neon: 66.4
put_hevc_epel_bi_hv8_8_c: 375.6
put_hevc_epel_bi_hv8_8_neon: 62.4
put_hevc_epel_bi_hv12_8_c: 831.6
put_hevc_epel_bi_hv12_8_neon: 134.9
put_hevc_epel_bi_hv16_8_c: 1257.9
put_hevc_epel_bi_hv16_8_neon: 214.1
put_hevc_epel_bi_hv24_8_c: 2666.6
put_hevc_epel_bi_hv24_8_neon: 391.1
put_hevc_epel_bi_hv32_8_c: 4722.4
put_hevc_epel_bi_hv32_8_neon: 734.1
put_hevc_epel_bi_hv48_8_c: 10100.4
put_hevc_epel_bi_hv48_8_neon: 1570.4
put_hevc_epel_bi_hv64_8_c: 17613.4
put_hevc_epel_bi_hv64_8_neon: 2810.6
put_hevc_epel_bi_v4_8_c: 77.4
put_hevc_epel_bi_v4_8_neon: 18.6
put_hevc_epel_bi_v6_8_c: 142.1
put_hevc_epel_bi_v6_8_neon: 27.1
put_hevc_epel_bi_v8_8_c: 192.9
put_hevc_epel_bi_v8_8_neon: 9.1
put_hevc_epel_bi_v12_8_c: 415.6
put_hevc_epel_bi_v12_8_neon: 55.6
put_hevc_epel_bi_v16_8_c: 487.6
put_hevc_epel_bi_v16_8_neon: 61.9
put_hevc_epel_bi_v24_8_c: 957.4
put_hevc_epel_bi_v24_8_neon: 131.1
put_hevc_epel_bi_v32_8_c: 1540.4
put_hevc_epel_bi_v32_8_neon: 210.4
put_hevc_epel_bi_v48_8_c: 3242.9
put_hevc_epel_bi_v48_8_neon: 465.6
put_hevc_epel_bi_v64_8_c: 5441.1
put_hevc_epel_bi_v64_8_neon: 818.1
put_hevc_epel_h4_8_c: 41.6
put_hevc_epel_h4_8_neon: 8.4
put_hevc_epel_h6_8_c: 110.1
put_hevc_epel_h6_8_neon: 24.4
put_hevc_epel_h8_8_c: 41.6
put_hevc_epel_h8_8_neon: 17.6
put_hevc_epel_h12_8_c: 183.1
put_hevc_epel_h12_8_neon: 58.1
put_hevc_epel_h16_8_c: 146.6
put_hevc_epel_h16_8_neon: 83.4
put_hevc_epel_h24_8_c: 240.4
put_hevc_epel_h24_8_neon: 157.1
put_hevc_epel_h32_8_c: 431.1
put_hevc_epel_h32_8_neon: 292.1
put_hevc_epel_h48_8_c: 858.6
put_hevc_epel_h48_8_neon: 557.4
put_hevc_epel_h64_8_c: 1536.6
put_hevc_epel_h64_8_neon: 1116.6
put_hevc_epel_hv4_8_c: 152.6
put_hevc_epel_hv4_8_neon: 34.9
put_hevc_epel_hv6_8_c: 269.6
put_hevc_epel_hv6_8_neon: 61.6
put_hevc_epel_hv8_8_c: 307.4
put_hevc_epel_hv8_8_neon: 76.9
put_hevc_epel_hv12_8_c: 702.6
put_hevc_epel_hv12_8_neon: 113.1
put_hevc_epel_hv16_8_c: 1081.4
put_hevc_epel_hv16_8_neon: 190.6
put_hevc_epel_hv24_8_c: 2276.1
put_hevc_epel_hv24_8_neon: 345.1
put_hevc_epel_hv32_8_c: 4068.6
put_hevc_epel_hv32_8_neon: 780.4
put_hevc_epel_hv48_8_c: 8754.1
put_hevc_epel_hv48_8_neon: 1394.4
put_hevc_epel_hv64_8_c: 15402.1
put_hevc_epel_hv64_8_neon: 2616.6
put_hevc_epel_uni_hv4_8_c: 142.1
put_hevc_epel_uni_hv4_8_neon: 46.6
put_hevc_epel_uni_hv6_8_c: 298.4
put_hevc_epel_uni_hv6_8_neon: 72.4
put_hevc_epel_uni_hv8_8_c: 352.9
put_hevc_epel_uni_hv8_8_neon: 75.1
put_hevc_epel_uni_hv12_8_c: 776.6
put_hevc_epel_uni_hv12_8_neon: 125.9
put_hevc_epel_uni_hv16_8_c: 1216.1
put_hevc_epel_uni_hv16_8_neon: 199.1
put_hevc_epel_uni_hv24_8_c: 2577.9
put_hevc_epel_uni_hv24_8_neon: 386.6
put_hevc_epel_uni_hv32_8_c: 4554.9
put_hevc_epel_uni_hv32_8_neon: 710.9
put_hevc_epel_uni_hv48_8_c: 9869.1
put_hevc_epel_uni_hv48_8_neon: 1499.4
put_hevc_epel_uni_hv64_8_c: 17307.1
put_hevc_epel_uni_hv64_8_neon: 2750.6
put_hevc_epel_uni_v4_8_c: 59.9
put_hevc_epel_uni_v4_8_neon: 21.9
put_hevc_epel_uni_v6_8_c: 136.1
put_hevc_epel_uni_v6_8_neon: 19.6
put_hevc_epel_uni_v8_8_c: 222.4
put_hevc_epel_uni_v8_8_neon: 17.1
put_hevc_epel_uni_v12_8_c: 481.6
put_hevc_epel_uni_v12_8_neon: 42.4
put_hevc_epel_uni_v16_8_c: 424.4
put_hevc_epel_uni_v16_8_neon: 63.4
put_hevc_epel_uni_v24_8_c: 1184.1
put_hevc_epel_uni_v24_8_neon: 109.9
put_hevc_epel_uni_v32_8_c: 1401.1
put_hevc_epel_uni_v32_8_neon: 182.9
put_hevc_epel_uni_v48_8_c: 2933.9
put_hevc_epel_uni_v48_8_neon: 388.9
put_hevc_epel_uni_v64_8_c: 5044.9
put_hevc_epel_uni_v64_8_neon: 701.1
put_hevc_epel_v4_8_c: 31.9
put_hevc_epel_v4_8_neon: 13.4
put_hevc_epel_v6_8_c: 95.1
put_hevc_epel_v6_8_neon: 16.4
put_hevc_epel_v8_8_c: 98.9
put_hevc_epel_v8_8_neon: 26.1
put_hevc_epel_v12_8_c: 283.9
put_hevc_epel_v12_8_neon: 36.9
put_hevc_epel_v16_8_c: 229.6
put_hevc_epel_v16_8_neon: 41.9
put_hevc_epel_v24_8_c: 376.4
put_hevc_epel_v24_8_neon: 90.4
put_hevc_epel_v32_8_c: 577.4
put_hevc_epel_v32_8_neon: 188.4
put_hevc_epel_v48_8_c: 1058.4
put_hevc_epel_v48_8_neon: 350.6
put_hevc_epel_v64_8_c: 1647.4
put_hevc_epel_v64_8_neon: 647.9
put_hevc_pel_bi_pixels4_8_c: 39.1
put_hevc_pel_bi_pixels4_8_neon: 36.4
put_hevc_pel_bi_pixels6_8_c: 78.6
put_hevc_pel_bi_pixels6_8_neon: 0.-6
put_hevc_pel_bi_pixels8_8_c: 60.6
put_hevc_pel_bi_pixels8_8_neon: 14.1
put_hevc_pel_bi_pixels12_8_c: 186.1
put_hevc_pel_bi_pixels12_8_neon: 30.4
put_hevc_pel_bi_pixels16_8_c: 231.9
put_hevc_pel_bi_pixels16_8_neon: 32.1
put_hevc_pel_bi_pixels24_8_c: 454.1
put_hevc_pel_bi_pixels24_8_neon: 70.1
put_hevc_pel_bi_pixels32_8_c: 774.1
put_hevc_pel_bi_pixels32_8_neon: 102.1
put_hevc_pel_bi_pixels48_8_c: 1632.9
put_hevc_pel_bi_pixels48_8_neon: 220.4
put_hevc_pel_bi_pixels64_8_c: 2812.9
put_hevc_pel_bi_pixels64_8_neon: 402.4
put_hevc_pel_pixels4_8_c: 41.1
put_hevc_pel_pixels4_8_neon: 6.4
put_hevc_pel_pixels6_8_c: 45.1
put_hevc_pel_pixels6_8_neon: 5.4
put_hevc_pel_pixels8_8_c: 94.6
put_hevc_pel_pixels8_8_neon: 15.6
put_hevc_pel_pixels12_8_c: 198.6
put_hevc_pel_pixels12_8_neon: 15.4
put_hevc_pel_pixels16_8_c: 87.9
put_hevc_pel_pixels16_8_neon: 18.1
put_hevc_pel_pixels24_8_c: 310.6
put_hevc_pel_pixels24_8_neon: 39.6
put_hevc_pel_pixels32_8_c: 198.6
put_hevc_pel_pixels32_8_neon: 78.1
put_hevc_pel_pixels48_8_c: 372.4
put_hevc_pel_pixels48_8_neon: 173.1
put_hevc_pel_pixels64_8_c: 569.1
put_hevc_pel_pixels64_8_neon: 324.4
put_hevc_qpel_bi_h4_8_c: 101.4
put_hevc_qpel_bi_h4_8_neon: 34.6
put_hevc_qpel_bi_h6_8_c: 270.1
put_hevc_qpel_bi_h6_8_neon: 61.6
put_hevc_qpel_bi_h8_8_c: 165.6
put_hevc_qpel_bi_h8_8_neon: 62.9
put_hevc_qpel_bi_h12_8_c: 546.4
put_hevc_qpel_bi_h12_8_neon: 124.1
put_hevc_qpel_bi_h16_8_c: 536.9
put_hevc_qpel_bi_h16_8_neon: 178.6
put_hevc_qpel_bi_h24_8_c: 1151.6
put_hevc_qpel_bi_h24_8_neon: 316.6
put_hevc_qpel_bi_h32_8_c: 1981.4
put_hevc_qpel_bi_h32_8_neon: 575.4
put_hevc_qpel_bi_h48_8_c: 4336.6
put_hevc_qpel_bi_h48_8_neon: 1189.6
put_hevc_qpel_bi_h64_8_c: 7591.6
put_hevc_qpel_bi_h64_8_neon: 2184.9
put_hevc_qpel_bi_hv4_8_c: 438.9
put_hevc_qpel_bi_hv4_8_neon: 97.6
put_hevc_qpel_bi_hv6_8_c: 829.1
put_hevc_qpel_bi_hv6_8_neon: 131.4
put_hevc_qpel_bi_hv8_8_c: 983.9
put_hevc_qpel_bi_hv8_8_neon: 146.1
put_hevc_qpel_bi_hv12_8_c: 2050.9
put_hevc_qpel_bi_hv12_8_neon: 364.6
put_hevc_qpel_bi_hv16_8_c: 3028.4
put_hevc_qpel_bi_hv16_8_neon: 432.6
put_hevc_qpel_bi_hv24_8_c: 6294.9
put_hevc_qpel_bi_hv24_8_neon: 910.1
put_hevc_qpel_bi_hv32_8_c: 10583.4
put_hevc_qpel_bi_hv32_8_neon: 1345.9
put_hevc_qpel_bi_hv48_8_c: 22412.4
put_hevc_qpel_bi_hv48_8_neon: 2852.6
put_hevc_qpel_bi_hv64_8_c: 38653.9
put_hevc_qpel_bi_hv64_8_neon: 5094.1
put_hevc_qpel_bi_v4_8_c: 143.9
put_hevc_qpel_bi_v4_8_neon: 25.9
put_hevc_qpel_bi_v6_8_c: 296.6
put_hevc_qpel_bi_v6_8_neon: 35.1
put_hevc_qpel_bi_v8_8_c: 515.4
put_hevc_qpel_bi_v8_8_neon: 31.6
put_hevc_qpel_bi_v12_8_c: 1175.6
put_hevc_qpel_bi_v12_8_neon: 81.1
put_hevc_qpel_bi_v16_8_c: 2051.6
put_hevc_qpel_bi_v16_8_neon: 111.1
put_hevc_qpel_bi_v24_8_c: 4556.9
put_hevc_qpel_bi_v24_8_neon: 208.6
put_hevc_qpel_bi_v32_8_c: 8048.1
put_hevc_qpel_bi_v32_8_neon: 351.6
put_hevc_qpel_bi_v48_8_c: 18009.9
put_hevc_qpel_bi_v48_8_neon: 773.1
put_hevc_qpel_bi_v64_8_c: 31784.9
put_hevc_qpel_bi_v64_8_neon: 1370.6
put_hevc_qpel_h4_8_c: 120.1
put_hevc_qpel_h4_8_neon: 33.1
put_hevc_qpel_h6_8_c: 241.6
put_hevc_qpel_h6_8_neon: 29.1
put_hevc_qpel_h8_8_c: 70.6
put_hevc_qpel_h8_8_neon: 52.6
put_hevc_qpel_h12_8_c: 347.4
put_hevc_qpel_h12_8_neon: 111.1
put_hevc_qpel_h16_8_c: 180.4
put_hevc_qpel_h16_8_neon: 149.9
put_hevc_qpel_h24_8_c: 333.4
put_hevc_qpel_h24_8_neon: 289.1
put_hevc_qpel_h32_8_c: 597.1
put_hevc_qpel_h32_8_neon: 478.9
put_hevc_qpel_h48_8_c: 1262.6
put_hevc_qpel_h48_8_neon: 975.6
put_hevc_qpel_h64_8_c: 2212.4
put_hevc_qpel_h64_8_neon: 1831.9
put_hevc_qpel_hv4_8_c: 430.9
put_hevc_qpel_hv4_8_neon: 77.4
put_hevc_qpel_hv6_8_c: 785.9
put_hevc_qpel_hv6_8_neon: 122.9
put_hevc_qpel_hv8_8_c: 921.9
put_hevc_qpel_hv8_8_neon: 150.1
put_hevc_qpel_hv12_8_c: 1943.4
put_hevc_qpel_hv12_8_neon: 245.4
put_hevc_qpel_hv16_8_c: 2886.9
put_hevc_qpel_hv16_8_neon: 375.4
put_hevc_qpel_hv24_8_c: 5954.6
put_hevc_qpel_hv24_8_neon: 711.4
put_hevc_qpel_hv32_8_c: 9967.1
put_hevc_qpel_hv32_8_neon: 1161.1
put_hevc_qpel_hv48_8_c: 21173.1
put_hevc_qpel_hv48_8_neon: 2593.9
put_hevc_qpel_hv64_8_c: 37378.1
put_hevc_qpel_hv64_8_neon: 4470.4
put_hevc_qpel_uni_h4_8_c: 108.4
put_hevc_qpel_uni_h4_8_neon: 38.9
put_hevc_qpel_uni_h6_8_c: 237.9
put_hevc_qpel_uni_h6_8_neon: 54.6
put_hevc_qpel_uni_h8_8_c: 432.4
put_hevc_qpel_uni_h8_8_neon: 64.9
put_hevc_qpel_uni_h12_8_c: 1019.4
put_hevc_qpel_uni_h12_8_neon: 116.1
put_hevc_qpel_uni_h16_8_c: 463.6
put_hevc_qpel_uni_h16_8_neon: 153.1
put_hevc_qpel_uni_h24_8_c: 1919.4
put_hevc_qpel_uni_h24_8_neon: 292.1
put_hevc_qpel_uni_h32_8_c: 1800.6
put_hevc_qpel_uni_h32_8_neon: 496.9
put_hevc_qpel_uni_h48_8_c: 4056.1
put_hevc_qpel_uni_h48_8_neon: 1071.1
put_hevc_qpel_uni_h64_8_c: 7149.9
put_hevc_qpel_uni_h64_8_neon: 1820.6
put_hevc_qpel_uni_hv4_8_c: 444.6
put_hevc_qpel_uni_hv4_8_neon: 86.6
put_hevc_qpel_uni_hv6_8_c: 810.6
put_hevc_qpel_uni_hv6_8_neon: 121.9
put_hevc_qpel_uni_hv8_8_c: 949.6
put_hevc_qpel_uni_hv8_8_neon: 137.6
put_hevc_qpel_uni_hv12_8_c: 2021.6
put_hevc_qpel_uni_hv12_8_neon: 261.1
put_hevc_qpel_uni_hv16_8_c: 3004.6
put_hevc_qpel_uni_hv16_8_neon: 367.1
put_hevc_qpel_uni_hv24_8_c: 6204.9
put_hevc_qpel_uni_hv24_8_neon: 813.1
put_hevc_qpel_uni_hv32_8_c: 10447.4
put_hevc_qpel_uni_hv32_8_neon: 1216.4
put_hevc_qpel_uni_hv48_8_c: 22322.9
put_hevc_qpel_uni_hv48_8_neon: 2531.6
put_hevc_qpel_uni_hv64_8_c: 38859.9
put_hevc_qpel_uni_hv64_8_neon: 4528.9
put_hevc_qpel_uni_v4_8_c: 124.6
put_hevc_qpel_uni_v4_8_neon: 33.9
put_hevc_qpel_uni_v6_8_c: 260.6
put_hevc_qpel_uni_v6_8_neon: 28.6
put_hevc_qpel_uni_v8_8_c: 480.4
put_hevc_qpel_uni_v8_8_neon: 30.4
put_hevc_qpel_uni_v12_8_c: 1101.4
put_hevc_qpel_uni_v12_8_neon: 72.1
put_hevc_qpel_uni_v16_8_c: 720.4
put_hevc_qpel_uni_v16_8_neon: 87.4
put_hevc_qpel_uni_v24_8_c: 2443.4
put_hevc_qpel_uni_v24_8_neon: 253.9
put_hevc_qpel_uni_v32_8_c: 2328.6
put_hevc_qpel_uni_v32_8_neon: 311.4
put_hevc_qpel_uni_v48_8_c: 4856.9
put_hevc_qpel_uni_v48_8_neon: 692.6
put_hevc_qpel_uni_v64_8_c: 8169.9
put_hevc_qpel_uni_v64_8_neon: 1203.4
put_hevc_qpel_v4_8_c: 123.6
put_hevc_qpel_v4_8_neon: 26.1
put_hevc_qpel_v6_8_c: 259.9
put_hevc_qpel_v6_8_neon: 22.6
put_hevc_qpel_v8_8_c: 197.4
put_hevc_qpel_v8_8_neon: 24.9
put_hevc_qpel_v12_8_c: 561.4
put_hevc_qpel_v12_8_neon: 53.6
put_hevc_qpel_v16_8_c: 474.9
put_hevc_qpel_v16_8_neon: 75.4
put_hevc_qpel_v24_8_c: 799.9
put_hevc_qpel_v24_8_neon: 159.1
put_hevc_qpel_v32_8_c: 1214.1
put_hevc_qpel_v32_8_neon: 267.9
put_hevc_qpel_v48_8_c: 2217.6
put_hevc_qpel_v48_8_neon: 639.1
put_hevc_qpel_v64_8_c: 3495.4
put_hevc_qpel_v64_8_neon: 1081.1

Signed-off-by: Josh Dekker <josh at itanimul.li>
---
 libavcodec/aarch64/Makefile               |    4 +-
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 3931 ++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  118 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 5646 +++++++++++++++++++++
 4 files changed, 9698 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/hevcdsp_epel_neon.S
 create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..ebedc03bfa 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -61,6 +61,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
                                            aarch64/vp9lpf_neon.o               \
                                            aarch64/vp9mc_16bpp_neon.o          \
                                            aarch64/vp9mc_neon.o
-NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
+NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_epel_neon.o         \
+                                           aarch64/hevcdsp_idct_neon.o         \
                                            aarch64/hevcdsp_init_aarch64.o      \
+                                           aarch64/hevcdsp_qpel_neon.o         \
                                            aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
new file mode 100644
index 0000000000..0366fe8ae3
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -0,0 +1,3931 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
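+// dst buffers hold int16_t intermediates, so rows are MAX_PB_SIZE * 2 bytes apart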
+
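+// pel_pixels: x0 = int16_t *dst, x1 = uint8_t *src, x2 = srcstride,
+// x3 = height (argument order as in put_hevc_pel_pixels in hevcdsp.h);
+// pixels are widened to 16 bits and scaled by << 6.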
+function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2)
+1:      ld1            {v0.s}[0], [x1], x2
+        ushll           v4.8h, v0.8b, #6
+        st1            {v4.d}[0], [x0], x7
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2 - 8)
+1:      ld1            {v0.8b}, [x1], x2
+        ushll           v4.8h, v0.8b, #6
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x7
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2)
+1:      ld1            {v0.8b}, [x1], x2
+        ushll           v4.8h, v0.8b, #6
+        st1            {v4.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2 - 16)
+1:      ld1            {v0.8b, v1.8b}, [x1], x2
+        ushll           v4.8h, v0.8b, #6
+        st1            {v4.8h}, [x0], #16
+        ushll           v5.8h, v1.8b, #6
+        st1            {v5.d}[0], [x0], x7
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2)
+1:      ld1            {v0.8b, v1.8b}, [x1], x2
+        ushll           v4.8h, v0.8b, #6
+        ushll           v5.8h, v1.8b, #6
+        st1            {v4.8h, v5.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2)
+1:      ld1            {v0.8b, v1.8b, v2.8b}, [x1], x2
+        ushll           v4.8h, v0.8b, #6
+        ushll           v5.8h, v1.8b, #6
+        ushll           v6.8h, v2.8b, #6
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2)
+1:      ld1            {v0.8b, v1.8b, v2.8b, v3.8b}, [x1], x2
+        ushll           v4.8h, v0.8b, #6
+        ushll           v5.8h, v1.8b, #6
+        ushll           v6.8h, v2.8b, #6
+        ushll           v7.8h, v3.8b, #6
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE)
+1:      ld1            {v0.16b, v1.16b, v2.16b}, [x1], x2
+        ushll           v4.8h, v0.8b, #6
+        ushll2          v5.8h, v0.16b, #6
+        ushll           v6.8h, v1.8b, #6
+        ushll2          v7.8h, v1.16b, #6
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        ushll           v4.8h, v2.8b, #6
+        ushll2          v5.8h, v2.16b, #6
+        st1            {v4.8h, v5.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
+1:      ld1            {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
+        ushll           v4.8h, v0.8b, #6
+        ushll2          v5.8h, v0.16b, #6
+        ushll           v6.8h, v1.8b, #6
+        ushll2          v7.8h, v1.16b, #6
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #(MAX_PB_SIZE)
+        ushll           v4.8h, v2.8b, #6
+        ushll2          v5.8h, v2.16b, #6
+        ushll           v6.8h, v3.8b, #6
+        ushll2          v7.8h, v3.16b, #6
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #(MAX_PB_SIZE)
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
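+// pel_bi_pixels: x0 = uint8_t *dst, x1 = dststride, x2 = uint8_t *src,
+// x3 = srcstride, x4 = int16_t *src2, x5 = height; src is combined with
+// src2 by a saturating add, then rounded and narrowed with sqrshrun #7.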
+function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v0.s}[0], [x2], x3 // src
+        ushll           v16.8h, v0.8b, #6
+        ld1            {v20.4h}, [x4], x10 // src2
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqrshrun        v0.8b,  v16.8h, #7
+        st1            {v0.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE * 2 - 8)
+        sub             x1, x1, #4
+1:      ld1            {v0.8b}, [x2], x3
+        ushll           v16.8h, v0.8b, #6
+        ld1            {v20.4h}, [x4], #8
+        ld1            {v20.s}[2], [x4], x10
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqrshrun        v0.8b,  v16.8h, #7
+        st1            {v0.s}[0], [x0], #4
+        st1            {v0.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v0.8b}, [x2], x3     // src
+        ushll           v16.8h, v0.8b, #6
+        ld1            {v20.8h}, [x4], x10   // src2
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqrshrun        v0.8b,  v16.8h, #7
+        st1            {v0.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE * 2 - 16)
+        sub             x1, x1, #8
+1:      ld1            {v0.16b}, [x2], x3
+        ushll           v16.8h, v0.8b, #6
+        ushll2          v17.8h, v0.16b, #6
+        ld1            {v20.8h}, [x4], #16
+        ld1            {v21.4h}, [x4], x10
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqadd           v17.8h, v17.8h, v21.8h
+        sqrshrun        v0.8b,  v16.8h, #7
+        sqrshrun2       v0.16b, v17.8h, #7
+        st1            {v0.8b}, [x0], #8
+        st1            {v0.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v0.16b}, [x2], x3            // src
+        ushll           v16.8h, v0.8b, #6
+        ushll2          v17.8h, v0.16b, #6
+        ld1            {v20.8h, v21.8h}, [x4], x10   // src2
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqadd           v17.8h, v17.8h, v21.8h
+        sqrshrun        v0.8b,  v16.8h, #7
+        sqrshrun2       v0.16b, v17.8h, #7
+        st1            {v0.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v0.8b, v1.8b, v2.8b}, [x2], x3  // src
+        ushll           v16.8h, v0.8b, #6
+        ushll           v17.8h, v1.8b, #6
+        ushll           v18.8h, v2.8b, #6
+        ld1            {v20.8h, v21.8h, v22.8h}, [x4], x10   // src2
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqadd           v17.8h, v17.8h, v21.8h
+        sqadd           v18.8h, v18.8h, v22.8h
+        sqrshrun        v0.8b, v16.8h, #7
+        sqrshrun        v1.8b, v17.8h, #7
+        sqrshrun        v2.8b, v18.8h, #7
+        st1            {v0.8b, v1.8b, v2.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v0.16b, v1.16b}, [x2], x3            // src
+        ushll           v16.8h, v0.8b, #6
+        ushll2          v17.8h, v0.16b, #6
+        ushll           v18.8h, v1.8b, #6
+        ushll2          v19.8h, v1.16b, #6
+        ld1            {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], x10   // src2
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqadd           v17.8h, v17.8h, v21.8h
+        sqadd           v18.8h, v18.8h, v22.8h
+        sqadd           v19.8h, v19.8h, v23.8h
+        sqrshrun        v0.8b,  v16.8h, #7
+        sqrshrun2       v0.16b, v17.8h, #7
+        sqrshrun        v1.8b,  v18.8h, #7
+        sqrshrun2       v1.16b, v19.8h, #7
+        st1            {v0.16b, v1.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1
+        mov             x10, #(MAX_PB_SIZE)
+1:      ld1            {v0.16b, v1.16b, v2.16b}, [x2], x3            // src
+        ushll           v16.8h, v0.8b, #6
+        ushll2          v17.8h, v0.16b, #6
+        ushll           v18.8h, v1.8b, #6
+        ushll2          v19.8h, v1.16b, #6
+        ushll           v20.8h, v2.8b, #6
+        ushll2          v21.8h, v2.16b, #6
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE)        // src2
+        sqadd           v16.8h, v16.8h, v24.8h
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqadd           v18.8h, v18.8h, v26.8h
+        sqadd           v19.8h, v19.8h, v27.8h
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v20.8h, v20.8h, v24.8h
+        sqadd           v21.8h, v21.8h, v25.8h
+        sqrshrun        v0.8b, v16.8h, #7
+        sqrshrun2       v0.16b, v17.8h, #7
+        sqrshrun        v1.8b, v18.8h, #7
+        sqrshrun2       v1.16b, v19.8h, #7
+        sqrshrun        v2.8b, v20.8h, #7
+        sqrshrun2       v2.16b, v21.8h, #7
+        st1            {v0.16b, v1.16b, v2.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_bi_pixels64_8_neon, export=1
+1:      ld1            {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3            // src
+        ushll           v16.8h, v0.8b, #6
+        ushll2          v17.8h, v0.16b, #6
+        ushll           v18.8h, v1.8b, #6
+        ushll2          v19.8h, v1.16b, #6
+        ushll           v20.8h, v2.8b, #6
+        ushll2          v21.8h, v2.16b, #6
+        ushll           v22.8h, v3.8b, #6
+        ushll2          v23.8h, v3.16b, #6
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE)        // src2
+        sqadd           v16.8h, v16.8h, v24.8h
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqadd           v18.8h, v18.8h, v26.8h
+        sqadd           v19.8h, v19.8h, v27.8h
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE)
+        sqadd           v20.8h, v20.8h, v24.8h
+        sqadd           v21.8h, v21.8h, v25.8h
+        sqadd           v22.8h, v22.8h, v26.8h
+        sqadd           v23.8h, v23.8h, v27.8h
+        sqrshrun        v0.8b, v16.8h, #7
+        sqrshrun2       v0.16b, v17.8h, #7
+        sqrshrun        v1.8b, v18.8h, #7
+        sqrshrun2       v1.16b, v19.8h, #7
+        sqrshrun        v2.8b, v20.8h, #7
+        sqrshrun2       v2.16b, v21.8h, #7
+        sqrshrun        v3.8b, v22.8h, #7
+        sqrshrun2       v3.16b, v23.8h, #7
+        st1            {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
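+// HEVC chroma (epel) 4-tap filter coefficients, indexed by the fractional
+// position mx/my (0..7); position 0 is full-pel and unused.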
+.Lepel_filters:
+        .byte  0,  0,  0,  0
+        .byte -2, 58, 10, -2
+        .byte -4, 54, 16, -2
+        .byte -6, 46, 28, -4
+        .byte -4, 36, 36, -4
+        .byte -4, 28, 46, -6
+        .byte -2, 16, 54, -4
+        .byte -2, 10, 58, -2
+
+.macro load_epel_filterb freg, xreg
+        adr             \xreg, .Lepel_filters
+        add             \xreg, \xreg, \freg, lsl #2
+        ld4r           {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg] // filter
+        neg             v0.16b, v0.16b
+        neg             v3.16b, v3.16b
+.endm
+
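+// dst += f0*src1 + f1*src2 + f2*src3 + f3*src4. Taps 0 and 3 are negative
+// for every fractional position, so load_epel_filterb negates them and
+// umlsl subtracts the (now unsigned) products.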
+.macro calc_epelb dst, src1, src2, src3, src4
+        umlsl           \dst\().8h, \src1\().8b, v0.8b
+        umlal           \dst\().8h, \src2\().8b, v1.8b
+        umlal           \dst\().8h, \src3\().8b, v2.8b
+        umlsl           \dst\().8h, \src4\().8b, v3.8b
+.endm
+
+.macro calc_epelb2 dst, src1, src2, src3, src4
+        umlsl2          \dst\().8h, \src1\().16b, v0.16b
+        umlal2          \dst\().8h, \src2\().16b, v1.16b
+        umlal2          \dst\().8h, \src3\().16b, v2.16b
+        umlsl2          \dst\().8h, \src4\().16b, v3.16b
+.endm
+
+.macro load_epel_filterh freg, xreg
+        adr             \xreg, .Lepel_filters
+        add             \xreg, \xreg, \freg, lsl #2
+        ld1            {v0.8b}, [\xreg]
+        sxtl            v0.8h, v0.8b
+.endm
+
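+// Vertical pass over the 16-bit intermediates of the horizontal pass:
+// accumulate in 32 bits, then narrow back with a saturating >> 6.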
+.macro calc_epelh dst, src1, src2, src3, src4
+        smull           \dst\().4s, \src1\().4h, v0.h[0]
+        smlal           \dst\().4s, \src2\().4h, v0.h[1]
+        smlal           \dst\().4s, \src3\().4h, v0.h[2]
+        smlal           \dst\().4s, \src4\().4h, v0.h[3]
+        sqshrn          \dst\().4h, \dst\().4s, #6
+.endm
+
+.macro calc_epelh2 dst, tmp, src1, src2, src3, src4
+        smull2          \tmp\().4s, \src1\().8h, v0.h[0]
+        smlal2          \tmp\().4s, \src2\().8h, v0.h[1]
+        smlal2          \tmp\().4s, \src3\().8h, v0.h[2]
+        smlal2          \tmp\().4s, \src4\().8h, v0.h[3]
+        sqshrn2         \dst\().8h, \tmp\().4s, #6
+.endm
+
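+// Horizontal filtering: byte-wise right shifts of the source row (ushr by
+// 8/16/24 bits within each doubleword) stand in for src[x+1]..src[x+3].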
+function ff_hevc_put_hevc_epel_h4_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1, #1
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v4.8b}, [x1], x2
+        ushr            v5.2d, v4.2d, #8
+        ushr            v6.2d, v5.2d, #8
+        ushr            v7.2d, v6.2d, #8
+        movi            v16.8h, #0
+        calc_epelb      v16, v4, v5, v6, v7
+        st1            {v16.4h}, [x0], x10
+        subs            x3, x3, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h6_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1,  x1, #1
+        sub             x2,  x2, #8
+        mov             x10, #(MAX_PB_SIZE * 2 - 8)
+1:      ld1            {v24.8b},  [x1], #8
+        ushr            v26.2d, v24.2d, #8
+        ushr            v27.2d, v26.2d, #8
+        ushr            v28.2d, v27.2d, #8
+        movi            v16.8h,   #0
+        ld1            {v28.b}[5], [x1], x2
+        calc_epelb      v16, v24, v26, v27, v28
+        st1            {v16.4h},   [x0], #8
+        st1            {v16.s}[2], [x0], x10
+        subs            x3, x3,   #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h8_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1, #1
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld2            {v24.8b, v25.8b}, [x1], x2
+        ushr            v26.2d, v24.2d, #8
+        ushr            v27.2d, v25.2d, #8
+        ushr            v28.2d, v26.2d, #8
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        st2            {v16.4h, v17.4h}, [x0], x10
+        subs            x3, x3, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h12_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1, #1
+        mov             x10, #(MAX_PB_SIZE * 2 - 16)
+1:      ld2            {v24.8b, v25.8b}, [x1], x2
+        ushr            v26.2d, v24.2d, #8
+        ushr            v27.2d, v25.2d, #8
+        ushr            v28.2d, v26.2d, #8
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        zip1            v18.8h, v16.8h, v17.8h
+        zip2            v19.8h, v16.8h, v17.8h
+        st1            {v18.8h},   [x0], #16
+        st1            {v19.d}[0], [x0], x10
+        subs            x3, x3, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h16_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1,  #1
+        sub             x2, x2, #16
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld2            {v24.8b, v25.8b}, [x1], #16
+        ld1            {v20.s}[0], [x1], x2
+        ushr            v26.2d, v24.2d, #8
+        ushr            v27.2d, v25.2d, #8
+        mov             v26.b[7], v20.b[0]
+        mov             v27.b[7], v20.b[1]
+        ushr            v28.2d, v26.2d, #8
+        mov             v28.b[7], v20.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        st2            {v16.8h, v17.8h}, [x0], x10
+        subs            x3, x3, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h24_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1, #1
+        sub             x2, x2, #24
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld3            {v24.8b, v25.8b, v26.8b}, [x1], #24
+        ld1            {v20.s}[0], [x1], x2
+        ushr            v27.2d, v24.2d, #8
+        ushr            v28.2d, v25.2d, #8
+        ushr            v29.2d, v26.2d, #8
+        mov             v27.b[7], v20.b[0]
+        mov             v28.b[7], v20.b[1]
+        mov             v29.b[7], v20.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        calc_epelb      v18, v26, v27, v28, v29
+        st3            {v16.8h, v17.8h, v18.8h}, [x0], x10
+        subs            x3, x3, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h32_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1, #1
+        sub             x2, x2, #32
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld4            {v24.8b, v25.8b, v26.8b, v27.8b}, [x1], #32
+        ld1            {v20.s}[0], [x1], x2
+        ushr            v28.2d, v24.2d, #8
+        ushr            v29.2d, v25.2d, #8
+        ushr            v30.2d, v26.2d, #8
+        ins             v28.b[7], v20.b[0]
+        ins             v29.b[7], v20.b[1]
+        ins             v30.b[7], v20.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v19.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        calc_epelb      v18, v26, v27, v28, v29
+        calc_epelb      v19, v27, v28, v29, v30
+        st4            {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h48_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1, #1
+        sub             x2, x2, #48
+        mov             x5, #24
+        mov             x10, #(MAX_PB_SIZE * 2 - 48)
+1:      ld3            {v26.16b, v27.16b, v28.16b}, [x1], x5
+        ushr            v29.2d, v26.2d, #8
+        ushr            v30.2d, v27.2d, #8
+        ushr            v31.2d, v28.2d, #8
+        ld1            {v24.s}[0], [x1], x5
+        ld1            {v25.s}[0], [x1], x2
+        mov             v29.b[7], v24.b[0]
+        mov             v30.b[7], v24.b[1]
+        mov             v31.b[7], v24.b[2]
+        mov             v29.b[15], v25.b[0]
+        mov             v30.b[15], v25.b[1]
+        mov             v31.b[15], v25.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        calc_epelb      v16, v26, v27, v28, v29
+        calc_epelb2     v20, v26, v27, v28, v29
+        calc_epelb      v17, v27, v28, v29, v30
+        calc_epelb2     v21, v27, v28, v29, v30
+        calc_epelb      v18, v28, v29, v30, v31
+        calc_epelb2     v22, v28, v29, v30, v31
+        st3            {v16.8h, v17.8h, v18.8h}, [x0], #48
+        st3            {v20.8h, v21.8h, v22.8h}, [x0], x10
+        subs            x3, x3, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_h64_8_neon, export=1
+        load_epel_filterb x4, x5
+        sub             x1, x1, #1
+        sub             x2, x2, #64
+        mov             x7, #32
+1:      ld4            {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x7
+        ushr            v28.2d, v24.2d, #8
+        ushr            v29.2d, v25.2d, #8
+        ushr            v30.2d, v26.2d, #8
+        ld1            {v4.s}[0], [x1], x7
+        ld1            {v5.s}[0], [x1], x2
+        ins             v28.b[7],  v4.b[0]
+        ins             v28.b[15], v5.b[0]
+        ins             v29.b[7],  v4.b[1]
+        ins             v29.b[15], v5.b[1]
+        ins             v30.b[7],  v4.b[2]
+        ins             v30.b[15], v5.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v19.8h, #0
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb2     v20, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        calc_epelb2     v21, v25, v26, v27, v28
+        calc_epelb      v18, v26, v27, v28, v29
+        calc_epelb2     v22, v26, v27, v28, v29
+        calc_epelb      v19, v27, v28, v29, v30
+        calc_epelb2     v23, v27, v28, v29, v30
+        st4            {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
+        st4            {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
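+// Vertical filtering: three rows are preloaded and the loop body is
+// unrolled 4x, rotating the row registers so no data has to be moved
+// between iterations.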
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.s}[0], [x1], x2
+        ld1            {v17.s}[0], [x1], x2
+        ld1            {v18.s}[0], [x1], x2
+1:      ld1            {v19.s}[0], [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.s}[0], [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.s}[0], [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.s}[0], [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2 - 8)
+        ld1            {v16.8b}, [x1], x2
+        ld1            {v17.8b}, [x1], x2
+        ld1            {v18.8b}, [x1], x2
+1:      ld1            {v19.8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8b}, [x1], x2
+        ld1            {v17.8b}, [x1], x2
+        ld1            {v18.8b}, [x1], x2
+1:      ld1            {v19.8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x1], x2
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2 - 16)
+        ld1            {v16.16b}, [x1], x2
+        ld1            {v17.16b}, [x1], x2
+        ld1            {v18.16b}, [x1], x2
+1:      ld1            {v19.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        calc_epelb2     v5, v16, v17, v18, v19
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.d}[0], [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        calc_epelb2     v5, v17, v18, v19, v16
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.d}[0], [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        calc_epelb2     v5, v18, v19, v16, v17
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.d}[0], [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        calc_epelb2     v5, v19, v16, v17, v18
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.d}[0], [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b}, [x1], x2
+        ld1            {v17.16b}, [x1], x2
+        ld1            {v18.16b}, [x1], x2
+1:      ld1            {v19.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        calc_epelb2     v5, v16, v17, v18, v19
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        calc_epelb2     v5, v17, v18, v19, v16
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        calc_epelb2     v5, v18, v19, v16, v17
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        calc_epelb2     v5, v19, v16, v17, v18
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8b, v17.8b, v18.8b}, [x1], x2
+        ld1            {v19.8b, v20.8b, v21.8b}, [x1], x2
+        ld1            {v22.8b, v23.8b, v24.8b}, [x1], x2
+1:      ld1            {v25.8b, v26.8b, v27.8b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v16, v19, v22, v25
+        calc_epelb      v5, v17, v20, v23, v26
+        calc_epelb      v6, v18, v21, v24, v27
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8b, v17.8b, v18.8b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v19, v22, v25, v16
+        calc_epelb      v5, v20, v23, v26, v17
+        calc_epelb      v6, v21, v24, v27, v18
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.8b, v20.8b, v21.8b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v22, v25, v16, v19
+        calc_epelb      v5, v23, v26, v17, v20
+        calc_epelb      v6, v24, v27, v18, v21
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8b, v23.8b, v24.8b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v25, v16, v19, v22
+        calc_epelb      v5, v26, v17, v20, v23
+        calc_epelb      v6, v27, v18, v21, v24
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v32_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b, v17.16b}, [x1], x2
+        ld1            {v18.16b, v19.16b}, [x1], x2
+        ld1            {v20.16b, v21.16b}, [x1], x2
+1:      ld1            {v22.16b, v23.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v16, v18, v20, v22
+        calc_epelb2     v5, v16, v18, v20, v22
+        calc_epelb      v6, v17, v19, v21, v23
+        calc_epelb2     v7, v17, v19, v21, v23
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v18, v20, v22, v16
+        calc_epelb2     v5, v18, v20, v22, v16
+        calc_epelb      v6, v19, v21, v23, v17
+        calc_epelb2     v7, v19, v21, v23, v17
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.16b, v19.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v20, v22, v16, v18
+        calc_epelb2     v5, v20, v22, v16, v18
+        calc_epelb      v6, v21, v23, v17, v19
+        calc_epelb2     v7, v21, v23, v17, v19
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.16b, v21.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v22, v16, v18, v20
+        calc_epelb2     v5, v22, v16, v18, v20
+        calc_epelb      v6, v23, v17, v19, v21
+        calc_epelb2     v7, v23, v17, v19, v21
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v48_8_neon, export=1
+        load_epel_filterb x5, x4
+        sub             x1, x1, x2
+        mov             x10, #(MAX_PB_SIZE * 2 - 64)
+        ld1            {v16.16b, v17.16b, v18.16b}, [x1], x2
+        ld1            {v19.16b, v20.16b, v21.16b}, [x1], x2
+        ld1            {v22.16b, v23.16b, v24.16b}, [x1], x2
+1:      ld1            {v25.16b, v26.16b, v27.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4,  v16, v19, v22, v25
+        calc_epelb2     v5,  v16, v19, v22, v25
+        calc_epelb      v6,  v17, v20, v23, v26
+        calc_epelb2     v7,  v17, v20, v23, v26
+        calc_epelb      v28, v18, v21, v24, v27
+        calc_epelb2     v29, v18, v21, v24, v27
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v28.8h, v29.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b, v18.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4,  v19, v22, v25, v16
+        calc_epelb2     v5,  v19, v22, v25, v16
+        calc_epelb      v6,  v20, v23, v26, v17
+        calc_epelb2     v7,  v20, v23, v26, v17
+        calc_epelb      v28, v21, v24, v27, v18
+        calc_epelb2     v29, v21, v24, v27, v18
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v28.8h, v29.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.16b, v20.16b, v21.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4,  v22, v25, v16, v19
+        calc_epelb2     v5,  v22, v25, v16, v19
+        calc_epelb      v6,  v23, v26, v17, v20
+        calc_epelb2     v7,  v23, v26, v17, v20
+        calc_epelb      v28, v24, v27, v18, v21
+        calc_epelb2     v29, v24, v27, v18, v21
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v28.8h, v29.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.16b, v23.16b, v24.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4,  v25, v16, v19, v22
+        calc_epelb2     v5,  v25, v16, v19, v22
+        calc_epelb      v6,  v26, v17, v20, v23
+        calc_epelb2     v7,  v26, v17, v20, v23
+        calc_epelb      v28, v27, v18, v21, v24
+        calc_epelb2     v29, v27, v18, v21, v24
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v28.8h, v29.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v64_8_neon, export=1
+        load_epel_filterb x5, x4
+        stp             d8,  d9,  [sp, #-32]! // v8-v11 are callee-saved in AAPCS64
+        stp             d10, d11, [sp, #16]
+        sub             x1, x1, x2
+        ld1            {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
+        ld1            {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2
+        ld1            {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2
+1:      ld1            {v28.16b, v29.16b, v30.16b, v31.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v4,  v16, v20, v24, v28
+        calc_epelb2     v5,  v16, v20, v24, v28
+        calc_epelb      v6,  v17, v21, v25, v29
+        calc_epelb2     v7,  v17, v21, v25, v29
+        calc_epelb      v8,  v18, v22, v26, v30
+        calc_epelb2     v9,  v18, v22, v26, v30
+        calc_epelb      v10, v19, v23, v27, v31
+        calc_epelb2     v11, v19, v23, v27, v31
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v4,  v20, v24, v28, v16
+        calc_epelb2     v5,  v20, v24, v28, v16
+        calc_epelb      v6,  v21, v25, v29, v17
+        calc_epelb2     v7,  v21, v25, v29, v17
+        calc_epelb      v8,  v22, v26, v30, v18
+        calc_epelb2     v9,  v22, v26, v30, v18
+        calc_epelb      v10, v23, v27, v31, v19
+        calc_epelb2     v11, v23, v27, v31, v19
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v4,  v24, v28, v16, v20
+        calc_epelb2     v5,  v24, v28, v16, v20
+        calc_epelb      v6,  v25, v29, v17, v21
+        calc_epelb2     v7,  v25, v29, v17, v21
+        calc_epelb      v8,  v26, v30, v18, v22
+        calc_epelb2     v9,  v26, v30, v18, v22
+        calc_epelb      v10, v27, v31, v19, v23
+        calc_epelb2     v11, v27, v31, v19, v23
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], x2
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v4, v28, v16, v20, v24
+        calc_epelb2     v5, v28, v16, v20, v24
+        calc_epelb      v6, v29, v17, v21, v25
+        calc_epelb2     v7, v29, v17, v21, v25
+        calc_epelb      v8, v30, v18, v22, v26
+        calc_epelb2     v9, v30, v18, v22, v26
+        calc_epelb      v10, v31, v19, v23, v27
+        calc_epelb2     v11, v31, v19, v23, v27
+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #64
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ldp             d10, d11, [sp, #16]
+        ldp             d8,  d9,  [sp], #32
+        ret
+endfunc
+
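+// hv filtering: the horizontal filter is run first, into a temporary
+// buffer on the stack (height + 3 rows), then the vertical filter is
+// applied over the 16-bit intermediates.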
+function ff_hevc_put_hevc_epel_hv4_8_neon, export=1
+        add             x10, x3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             x3, x3, #3
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_epel_filterh x5, x4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.4h}, [sp], x10
+        ld1            {v17.4h}, [sp], x10
+        ld1            {v18.4h}, [sp], x10
+1:      ld1            {v19.4h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.4h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.4h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.4h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        st1            {v4.4h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv6_8_neon, export=1
+        add             x10,  x3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0,  x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             x3, x3, #3
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0,  x3, [sp], #16
+        load_epel_filterh x5, x4
+        mov             x5, #120
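+        // 6 results (12 bytes) per row: store 8 + 4 bytes, advancing
+        // 8 + 120 = 128 bytes, one MAX_PB_SIZE * 2 row per iteration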
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h}, [sp], x10
+        ld1            {v17.8h}, [sp], x10
+        ld1            {v18.8h}, [sp], x10
+1:      ld1            {v19.8h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x5
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x5
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x5
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        st1            {v4.d}[0], [x0], #8
+        st1            {v4.s}[2], [x0], x5
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv8_8_neon, export=1
+        add             x10, x3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             x3, x3, #3
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_epel_filterh x5, x4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h}, [sp], x10
+        ld1            {v17.8h}, [sp], x10
+        ld1            {v18.8h}, [sp], x10
+1:      ld1            {v19.8h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        st1            {v4.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv12_8_neon, export=1
+        add             x10, x3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             x3, x3, #3
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_epel_filterh x5, x4
+        mov             x5, #112
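+        // 12 results per row: 16 + 8 bytes stored, 16 + 112 = 128 bytes
+        // advanced, i.e. one MAX_PB_SIZE * 2 row per iteration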
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        ld1            {v20.8h, v21.8h}, [sp], x10
+1:      ld1            {v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.4h}, [x0], x5
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.4h}, [x0], x5
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.4h}, [x0], x5
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        st1            {v4.8h}, [x0], #16
+        st1            {v5.4h}, [x0], x5
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv16_8_neon, export=1
+        add             x10, x3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             x3, x3, #3
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_epel_filterh x5, x4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        ld1            {v20.8h, v21.8h}, [sp], x10
+1:      ld1            {v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        calc_epelh2     v5, v6, v17, v19, v21, v23
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        calc_epelh2     v5, v6, v19, v21, v23, v17
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        calc_epelh2     v5, v6, v21, v23, v17, v19
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        calc_epelh2     v5, v6, v23, v17, v19, v21
+        st1            {v4.8h, v5.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv24_8_neon, export=1
+        add             x10, x3, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2
+        add             x3, x3, #3
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_epel_filterh x5, x4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h, v17.8h, v18.8h}, [sp], x10
+        ld1            {v19.8h, v20.8h, v21.8h}, [sp], x10
+        ld1            {v22.8h, v23.8h, v24.8h}, [sp], x10
+1:      ld1            {v25.8h, v26.8h, v27.8h}, [sp], x10
+        calc_epelh      v4, v16, v19, v22, v25
+        calc_epelh2     v4, v5, v16, v19, v22, v25
+        calc_epelh      v5, v17, v20, v23, v26
+        calc_epelh2     v5, v6, v17, v20, v23, v26
+        calc_epelh      v6, v18, v21, v24, v27
+        calc_epelh2     v6, v7, v18, v21, v24, v27
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h, v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v22, v25, v16
+        calc_epelh2     v4, v5, v19, v22, v25, v16
+        calc_epelh      v5, v20, v23, v26, v17
+        calc_epelh2     v5, v6, v20, v23, v26, v17
+        calc_epelh      v6, v21, v24, v27, v18
+        calc_epelh2     v6, v7, v21, v24, v27, v18
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.8h, v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v25, v16, v19
+        calc_epelh2     v4, v5, v22, v25, v16, v19
+        calc_epelh      v5, v23, v26, v17, v20
+        calc_epelh2     v5, v6, v23, v26, v17, v20
+        calc_epelh      v6, v24, v27, v18, v21
+        calc_epelh2     v6, v7, v24, v27, v18, v21
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8h, v23.8h, v24.8h}, [sp], x10
+        calc_epelh      v4, v25, v16, v19, v22
+        calc_epelh2     v4, v5, v25, v16, v19, v22
+        calc_epelh      v5, v26, v17, v20, v23
+        calc_epelh2     v5, v6, v26, v17, v20, v23
+        calc_epelh      v6, v27, v18, v21, v24
+        calc_epelh2     v6, v7, v27, v18, v21, v24
+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x10
+        subs            x3, x3, #1
+        b.ne            1b
+2:      ret
+endfunc
+
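+// Widths above 24 are composed from two (or, for 64, four) calls to
+// the 16/24 column variants, advancing dst (int16, 2 bytes per column)
+// and src between the calls; x6 is set to the width of each partial
+// call.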
+function ff_hevc_put_hevc_epel_hv32_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #32
+        add             x1, x1, #16
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv48_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        mov             x6, #24
+        bl              X(ff_hevc_put_hevc_epel_hv24_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #48
+        add             x1, x1, #24
+        mov             x6, #24
+        bl              X(ff_hevc_put_hevc_epel_hv24_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_hv64_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon)
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             x0, x0, #32
+        add             x1, x1, #16
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon)
+        ldp             x4, x5, [sp]
+        ldp             x2, x3, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        add             x0, x0, #64
+        add             x1, x1, #32
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #96
+        add             x1, x1, #48
+        mov             x6, #16
+        bl              X(ff_hevc_put_hevc_epel_hv16_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
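+// uni_v: plain 4-tap vertical filter, narrowed straight to 8 bit with
+// a rounding shift by 6; the unrolled loop rotates the input rows
+// through the row registers so nothing is moved between iterations.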
+function ff_hevc_put_hevc_epel_uni_v4_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        ld1            {v16.s}[0], [x2], x3
+        ld1            {v17.s}[0], [x2], x3
+        ld1            {v18.s}[0], [x2], x3
+1:      ld1            {v19.s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v6_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        sub             x1, x1, #4
+        ld1            {v16.8b}, [x2], x3
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+1:      ld1            {v19.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v8_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        ld1            {v16.8b}, [x2], x3
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+1:      ld1            {v19.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        sqrshrun        v4.8b,  v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v12_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        sub             x1, x1, #8
+        ld1            {v16.16b}, [x2], x3
+        ld1            {v17.16b}, [x2], x3
+        ld1            {v18.16b}, [x2], x3
+1:      ld1            {v19.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        calc_epelb2     v5, v16, v17, v18, v19
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        calc_epelb2     v5, v17, v18, v19, v16
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        calc_epelb2     v5, v18, v19, v16, v17
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        calc_epelb2     v5, v19, v16, v17, v18
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v16_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        ld1            {v16.16b}, [x2], x3
+        ld1            {v17.16b}, [x2], x3
+        ld1            {v18.16b}, [x2], x3
+1:      ld1            {v19.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        calc_epelb2     v5, v16, v17, v18, v19
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        calc_epelb2     v5, v17, v18, v19, v16
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        calc_epelb2     v5, v18, v19, v16, v17
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        calc_epelb2     v5, v19, v16, v17, v18
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v24_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        ld1            {v16.8b, v17.8b, v18.8b}, [x2], x3
+        ld1            {v19.8b, v20.8b, v21.8b}, [x2], x3
+        ld1            {v22.8b, v23.8b, v24.8b}, [x2], x3
+1:      ld1            {v25.8b, v26.8b, v27.8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v16, v19, v22, v25
+        calc_epelb      v5, v17, v20, v23, v26
+        calc_epelb      v6, v18, v21, v24, v27
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun        v5.8b,  v5.8h, #6
+        sqrshrun        v6.8b,  v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8b, v17.8b, v18.8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v19, v22, v25, v16
+        calc_epelb      v5, v20, v23, v26, v17
+        calc_epelb      v6, v21, v24, v27, v18
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun        v5.8b,  v5.8h, #6
+        sqrshrun        v6.8b,  v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.8b, v20.8b, v21.8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v22, v25, v16, v19
+        calc_epelb      v5, v23, v26, v17, v20
+        calc_epelb      v6, v24, v27, v18, v21
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun        v5.8b,  v5.8h, #6
+        sqrshrun        v6.8b,  v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.8b, v23.8b, v24.8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v25, v16, v19, v22
+        calc_epelb      v5, v26, v17, v20, v23
+        calc_epelb      v6, v27, v18, v21, v24
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun        v5.8b,  v5.8h, #6
+        sqrshrun        v6.8b,  v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v32_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        ld1            {v16.16b, v17.16b}, [x2], x3
+        ld1            {v18.16b, v19.16b}, [x2], x3
+        ld1            {v20.16b, v21.16b}, [x2], x3
+1:      ld1            {v22.16b, v23.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v16, v18, v20, v22
+        calc_epelb2     v5, v16, v18, v20, v22
+        calc_epelb      v6, v17, v19, v21, v23
+        calc_epelb2     v7, v17, v19, v21, v23
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v18, v20, v22, v16
+        calc_epelb2     v5, v18, v20, v22, v16
+        calc_epelb      v6, v19, v21, v23, v17
+        calc_epelb2     v7, v19, v21, v23, v17
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.16b, v19.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v20, v22, v16, v18
+        calc_epelb2     v5, v20, v22, v16, v18
+        calc_epelb      v6, v21, v23, v17, v19
+        calc_epelb2     v7, v21, v23, v17, v19
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.16b, v21.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v22, v16, v18, v20
+        calc_epelb2     v5, v22, v16, v18, v20
+        calc_epelb      v6, v23, v17, v19, v21
+        calc_epelb2     v7, v23, v17, v19, v21
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v48_8_neon, export=1
+        load_epel_filterb x6, x5
+        sub             x2, x2, x3
+        ld1            {v16.16b, v17.16b, v18.16b}, [x2], x3
+        ld1            {v19.16b, v20.16b, v21.16b}, [x2], x3
+        ld1            {v22.16b, v23.16b, v24.16b}, [x2], x3
+1:      ld1            {v25.16b, v26.16b, v27.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v4, v16, v19, v22, v25
+        calc_epelb2     v5, v16, v19, v22, v25
+        calc_epelb      v6, v17, v20, v23, v26
+        calc_epelb2     v7, v17, v20, v23, v26
+        calc_epelb      v28, v18, v21, v24, v27
+        calc_epelb2     v29, v18, v21, v24, v27
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v28.8h, #6
+        sqrshrun2       v6.16b,  v29.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b, v18.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v28, v21, v24, v27, v18
+        calc_epelb2     v29, v21, v24, v27, v18
+        calc_epelb      v4, v19, v22, v25, v16
+        calc_epelb2     v5, v19, v22, v25, v16
+        calc_epelb      v6, v20, v23, v26, v17
+        calc_epelb2     v7, v20, v23, v26, v17
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v28.8h, #6
+        sqrshrun2       v6.16b,  v29.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.16b, v20.16b, v21.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v28, v24, v27, v18, v21
+        calc_epelb2     v29, v24, v27, v18, v21
+        calc_epelb      v4, v22, v25, v16, v19
+        calc_epelb2     v5, v22, v25, v16, v19
+        calc_epelb      v6, v23, v26, v17, v20
+        calc_epelb2     v7, v23, v26, v17, v20
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v28.8h, #6
+        sqrshrun2       v6.16b,  v29.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.16b, v23.16b, v24.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_epelb      v28, v27, v18, v21, v24
+        calc_epelb2     v29, v27, v18, v21, v24
+        calc_epelb      v4, v25, v16, v19, v22
+        calc_epelb2     v5, v25, v16, v19, v22
+        calc_epelb      v6, v26, v17, v20, v23
+        calc_epelb2     v7, v26, v17, v20, v23
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v28.8h, #6
+        sqrshrun2       v6.16b,  v29.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_v64_8_neon, export=1
+        load_epel_filterb x6, x5
+        // v8-v11 are clobbered below; their low halves (d8-d11) are
+        // callee-saved under AAPCS64, so spill them around the loop
+        stp             d8, d9, [sp, #-16]!
+        stp             d10, d11, [sp, #-16]!
+        sub             x2, x2, x3
+        ld1            {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+        ld1            {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+        ld1            {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3
+1:      ld1            {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v10, v19, v23, v27, v31
+        calc_epelb2     v11, v19, v23, v27, v31
+        calc_epelb      v4, v16, v20, v24, v28
+        calc_epelb2     v5, v16, v20, v24, v28
+        calc_epelb      v6, v17, v21, v25, v29
+        calc_epelb2     v7, v17, v21, v25, v29
+        calc_epelb      v8, v18, v22, v26, v30
+        calc_epelb2     v9, v18, v22, v26, v30
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v8.8h, #6
+        sqrshrun2       v6.16b,  v9.8h, #6
+        sqrshrun        v7.8b,  v10.8h, #6
+        sqrshrun2       v7.16b,  v11.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v10, v23, v27, v31, v19
+        calc_epelb2     v11, v23, v27, v31, v19
+        calc_epelb      v4, v20, v24, v28, v16
+        calc_epelb2     v5, v20, v24, v28, v16
+        calc_epelb      v6, v21, v25, v29, v17
+        calc_epelb2     v7, v21, v25, v29, v17
+        calc_epelb      v8, v22, v26, v30, v18
+        calc_epelb2     v9, v22, v26, v30, v18
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v8.8h, #6
+        sqrshrun2       v6.16b,  v9.8h, #6
+        sqrshrun        v7.8b,  v10.8h, #6
+        sqrshrun2       v7.16b,  v11.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v10, v27, v31, v19, v23
+        calc_epelb2     v11, v27, v31, v19, v23
+        calc_epelb      v4, v24, v28, v16, v20
+        calc_epelb2     v5, v24, v28, v16, v20
+        calc_epelb      v6, v25, v29, v17, v21
+        calc_epelb2     v7, v25, v29, v17, v21
+        calc_epelb      v8, v26, v30, v18, v22
+        calc_epelb2     v9, v26, v30, v18, v22
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v8.8h, #6
+        sqrshrun2       v6.16b,  v9.8h, #6
+        sqrshrun        v7.8b,  v10.8h, #6
+        sqrshrun2       v7.16b,  v11.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_epelb      v10, v31, v19, v23, v27
+        calc_epelb2     v11, v31, v19, v23, v27
+        calc_epelb      v4, v28, v16, v20, v24
+        calc_epelb2     v5, v28, v16, v20, v24
+        calc_epelb      v6, v29, v17, v21, v25
+        calc_epelb2     v7, v29, v17, v21, v25
+        calc_epelb      v8, v30, v18, v22, v26
+        calc_epelb2     v9, v30, v18, v22, v26
+        sqrshrun        v4.8b,  v4.8h, #6
+        sqrshrun2       v4.16b,  v5.8h, #6
+        sqrshrun        v5.8b,  v6.8h, #6
+        sqrshrun2       v5.16b,  v7.8h, #6
+        sqrshrun        v6.8b,  v8.8h, #6
+        sqrshrun2       v6.16b,  v9.8h, #6
+        sqrshrun        v7.8b,  v10.8h, #6
+        sqrshrun2       v7.16b,  v11.8h, #6
+        st1            {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ldp             d10, d11, [sp], #16
+        ldp             d8, d9, [sp], #16
+        ret
+endfunc
+
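+// uni_hv: h filter into the stack buffer as in the hv functions above,
+// then a vertical pass over the int16 rows that narrows to 8 bit.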
+function ff_hevc_put_hevc_epel_uni_hv4_8_neon, export=1
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
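+        // rearrange x0-x4 into the h filter's arguments: dst = the
+        // tmp_array above the spills, src backed up one row, and
+        // height + 3 rows to prime the vertical taps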
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.4h}, [sp], x10
+        ld1            {v17.4h}, [sp], x10
+        ld1            {v18.4h}, [sp], x10
+1:      ld1            {v19.4h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.4h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.4h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.4h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv6_8_neon, export=1
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        sub             x1, x1, #4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h}, [sp], x10
+        ld1            {v17.8h}, [sp], x10
+        ld1            {v18.8h}, [sp], x10
+1:      ld1            {v19.8h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv8_8_neon, export=1
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h}, [sp], x10
+        ld1            {v17.8h}, [sp], x10
+        ld1            {v18.8h}, [sp], x10
+1:      ld1            {v19.8h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        sqrshrun        v4.8b, v4.8h, #6
+        st1            {v4.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv12_8_neon, export=1
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        sub             x1, x1, #8
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        ld1            {v20.8h, v21.8h}, [sp], x10
+1:      ld1            {v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv16_8_neon, export=1
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        ld1            {v20.8h, v21.8h}, [sp], x10
+1:      ld1            {v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        calc_epelh2     v5, v6, v17, v19, v21, v23
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        calc_epelh2     v5, v6, v19, v21, v23, v17
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        calc_epelh2     v5, v6, v21, v23, v17, v19
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        calc_epelh2     v5, v6, v23, v17, v19, v21
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun2       v4.16b, v5.8h, #6
+        st1            {v4.16b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv24_8_neon, export=1
+        add             x10, x4, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             xzr, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x6, x5
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h, v17.8h, v18.8h}, [sp], x10
+        ld1            {v19.8h, v20.8h, v21.8h}, [sp], x10
+        ld1            {v22.8h, v23.8h, v24.8h}, [sp], x10
+1:      ld1            {v25.8h, v26.8h, v27.8h}, [sp], x10
+        calc_epelh      v4, v16, v19, v22, v25
+        calc_epelh2     v4, v5, v16, v19, v22, v25
+        calc_epelh      v5, v17, v20, v23, v26
+        calc_epelh2     v5, v6, v17, v20, v23, v26
+        calc_epelh      v6, v18, v21, v24, v27
+        calc_epelh2     v6, v7, v18, v21, v24, v27
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun        v5.8b, v5.8h, #6
+        sqrshrun        v6.8b, v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h, v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v22, v25, v16
+        calc_epelh2     v4, v5, v19, v22, v25, v16
+        calc_epelh      v5, v20, v23, v26, v17
+        calc_epelh2     v5, v6, v20, v23, v26, v17
+        calc_epelh      v6, v21, v24, v27, v18
+        calc_epelh2     v6, v7, v21, v24, v27, v18
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun        v5.8b, v5.8h, #6
+        sqrshrun        v6.8b, v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.8h, v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v25, v16, v19
+        calc_epelh2     v4, v5, v22, v25, v16, v19
+        calc_epelh      v5, v23, v26, v17, v20
+        calc_epelh2     v5, v6, v23, v26, v17, v20
+        calc_epelh      v6, v24, v27, v18, v21
+        calc_epelh2     v6, v7, v24, v27, v18, v21
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun        v5.8b, v5.8h, #6
+        sqrshrun        v6.8b, v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.8h, v23.8h, v24.8h}, [sp], x10
+        calc_epelh      v4, v25, v16, v19, v22
+        calc_epelh2     v4, v5, v25, v16, v19, v22
+        calc_epelh      v5, v26, v17, v20, v23
+        calc_epelh2     v5, v6, v26, v17, v20, v23
+        calc_epelh      v6, v27, v18, v21, v24
+        calc_epelh2     v6, v7, v27, v18, v21, v24
+        sqrshrun        v4.8b, v4.8h, #6
+        sqrshrun        v5.8b, v5.8h, #6
+        sqrshrun        v6.8b, v6.8h, #6
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
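+// The wide uni_hv sizes are likewise composed from 16/24 column calls,
+// with x7 set to the width of each partial call and dst/src advanced
+// between them.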
+function ff_hevc_put_hevc_epel_uni_hv32_8_neon, export=1
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #16
+        add             x2, x2, #16
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv48_8_neon, export=1
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x7, #24
+        bl              X(ff_hevc_put_hevc_epel_uni_hv24_8_neon)
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #24
+        add             x2, x2, #24
+        mov             x7, #24
+        bl              X(ff_hevc_put_hevc_epel_uni_hv24_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_hv64_8_neon, export=1
+        stp             x0, x30, [sp, #-16]!
+        stp             x1, x2, [sp, #-16]!
+        stp             x3, x4, [sp, #-16]!
+        stp             x5, x6, [sp, #-16]!
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #16
+        add             x2, x2, #16
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+        ldp             x5, x6, [sp]
+        ldp             x3, x4, [sp, #16]
+        ldp             x1, x2, [sp, #32]
+        ldr             x0, [sp, #48]
+        add             x0, x0, #32
+        add             x2, x2, #32
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+        ldp             x5, x6, [sp], #16
+        ldp             x3, x4, [sp], #16
+        ldp             x1, x2, [sp], #16
+        ldr             x0, [sp]
+        add             x0, x0, #48
+        add             x2, x2, #48
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_epel_uni_hv16_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
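+// bi_h: the filtered row is saturating-added to the int16 intermediate
+// of the first prediction (x4, rows MAX_PB_SIZE * 2 bytes apart) and
+// narrowed back to 8 bit with a rounding shift by 7.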
+function ff_hevc_put_hevc_epel_bi_h4_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x2, x2, #1
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v4.8b}, [x2], x3
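+        // v4 holds pixels x..x+7; the 64 bit byte shifts derive the
+        // x+1, x+2 and x+3 tap inputs without further loads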
+        ushr            v5.2d, v4.2d, #8
+        ushr            v6.2d, v5.2d, #8
+        ushr            v7.2d, v6.2d, #8
+        movi            v16.8h, #0
+        calc_epelb      v16, v4, v5, v6, v7
+        ld1            {v20.4h}, [x4], x10
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqrshrun        v4.8b, v16.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h6_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x1, x1, #4
+        sub             x2, x2, #1
+        sub             x3, x3, #8
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v24.8b}, [x2], #8
+        ushr            v26.2d, v24.2d, #8
+        ushr            v27.2d, v26.2d, #8
+        ushr            v28.2d, v27.2d, #8
+        movi            v16.8h, #0
+        ld1            {v28.b}[5], [x2], x3
+        calc_epelb      v16, v24, v26, v27, v28
+        ld1            {v20.8h}, [x4], x10
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        st1            {v16.s}[0], [x0], #4
+        st1            {v16.h}[2], [x0], x1
+        subs            x5, x5, #1   // height
+        b.ne            1b
+        ret
+endfunc
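+// For width 8 the row is loaded de-interleaved (ld2) into even/odd byte
+// lanes, so each ushr by one byte yields the next filter tap for all
+// lanes at once; zip1 re-interleaves the two result vectors into pixel
+// order before the src2 addition.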
+
+function ff_hevc_put_hevc_epel_bi_h8_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x2, x2, #1
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld2            {v24.8b, v25.8b}, [x2], x3
+        ushr            v26.2d, v24.2d, #8
+        ushr            v27.2d, v25.2d, #8
+        ushr            v28.2d, v26.2d, #8
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        zip1            v16.8h, v16.8h, v17.8h
+        ld1            {v20.8h}, [x4], x10
+        sqadd           v16.8h, v16.8h, v20.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        st1            {v16.8b}, [x0], x1
+        subs            x5, x5, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h12_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x1, x1, #8
+        sub             x2, x2, #1
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld2            {v24.8b, v25.8b}, [x2], x3
+        ushr            v26.2d, v24.2d, #8
+        ushr            v27.2d, v25.2d, #8
+        ushr            v28.2d, v26.2d, #8
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        zip1            v18.8h, v16.8h, v17.8h
+        zip2            v19.8h, v16.8h, v17.8h
+        ld1            {v20.8h, v21.8h}, [x4], x10
+        sqadd           v18.8h, v18.8h, v20.8h
+        sqadd           v19.8h, v19.8h, v21.8h
+        sqrshrun        v20.8b, v18.8h, #7
+        sqrshrun        v21.8b, v19.8h, #7
+        st1            {v20.8b}, [x0], #8
+        st1            {v21.s}[0], [x0], x1
+        subs            x5, x5, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
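+// Same even/odd scheme as the 8-wide case, but here src2 is loaded with
+// ld2 and the result stored with st2, so the de-interleaved
+// intermediates never have to be zipped back into pixel order.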
+function ff_hevc_put_hevc_epel_bi_h16_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x2, x2, #1
+        sub             x3, x3, #16
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld2            {v24.8b, v25.8b}, [x2], #16
+        ld1            {v20.s}[0], [x2], x3
+        ushr            v26.2d, v24.2d, #8
+        ushr            v27.2d, v25.2d, #8
+        mov             v26.b[7], v20.b[0]
+        mov             v27.b[7], v20.b[1]
+        ushr            v28.2d, v26.2d, #8
+        mov             v28.b[7], v20.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        ld2            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v16.8h, v16.8h, v24.8h
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqrshrun        v4.8b, v16.8h, #7
+        sqrshrun        v5.8b, v17.8h, #7
+        st2            {v4.8b, v5.8b}, [x0], x1
+        subs            x5, x5, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h24_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x2, x2, #1
+        sub             x3, x3, #24
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld3            {v24.8b, v25.8b, v26.8b}, [x2], #24
+        ld1            {v20.s}[0], [x2], x3
+        ushr            v27.2d, v24.2d, #8
+        ushr            v28.2d, v25.2d, #8
+        ushr            v29.2d, v26.2d, #8
+        mov             v27.b[7], v20.b[0]
+        mov             v28.b[7], v20.b[1]
+        mov             v29.b[7], v20.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        calc_epelb      v18, v26, v27, v28, v29
+        ld3            {v24.8h, v25.8h, v26.8h}, [x4], x10
+        sqadd           v16.8h, v16.8h, v24.8h
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqadd           v18.8h, v18.8h, v26.8h
+        sqrshrun        v4.8b, v16.8h, #7
+        sqrshrun        v5.8b, v17.8h, #7
+        sqrshrun        v6.8b, v18.8h, #7
+        st3            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x5, x5, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h32_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x2, x2, #1
+        sub             x3, x3, #32
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld4            {v24.8b, v25.8b, v26.8b, v27.8b}, [x2], #32
+        ld1            {v20.s}[0], [x2], x3
+        ushr            v28.2d, v24.2d, #8
+        ushr            v29.2d, v25.2d, #8
+        ushr            v30.2d, v26.2d, #8
+        ins             v28.b[7], v20.b[0]
+        ins             v29.b[7], v20.b[1]
+        ins             v30.b[7], v20.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v19.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        calc_epelb      v18, v26, v27, v28, v29
+        calc_epelb      v19, v27, v28, v29, v30
+        ld4            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+        sqadd           v16.8h, v16.8h, v24.8h
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqadd           v18.8h, v18.8h, v26.8h
+        sqadd           v19.8h, v19.8h, v27.8h
+        sqrshrun        v4.8b, v16.8h, #7
+        sqrshrun        v5.8b, v17.8h, #7
+        sqrshrun        v6.8b, v18.8h, #7
+        sqrshrun        v7.8b, v19.8h, #7
+        st4            {v4.8b, v5.8b, v6.8b, v7.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
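+// From 48 pixels up, full 16-byte lanes are used and calc_epelb2 handles
+// the upper halves.  ushr cannot move bytes across the 64-bit lane
+// boundary, so the pixels falling into b[7]/b[15] of the shifted vectors
+// are reloaded from memory (v24/v25) and inserted by hand.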
+function ff_hevc_put_hevc_epel_bi_h48_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x2, x2, #1
+        sub             x3, x3, #48
+        mov             x7, #24
+        mov             x10, #(MAX_PB_SIZE * 2 - 48)
+1:      ld3            {v26.16b, v27.16b, v28.16b}, [x2], x7
+        ushr            v29.2d, v26.2d, #8
+        ushr            v30.2d, v27.2d, #8
+        ushr            v31.2d, v28.2d, #8
+        ld1            {v24.s}[0], [x2], x7
+        ld1            {v25.s}[0], [x2], x3
+        mov             v29.b[7], v24.b[0]
+        mov             v30.b[7], v24.b[1]
+        mov             v31.b[7], v24.b[2]
+        mov             v29.b[15], v25.b[0]
+        mov             v30.b[15], v25.b[1]
+        mov             v31.b[15], v25.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        calc_epelb      v16, v26, v27, v28, v29
+        calc_epelb2     v20, v26, v27, v28, v29
+        calc_epelb      v17, v27, v28, v29, v30
+        calc_epelb2     v21, v27, v28, v29, v30
+        calc_epelb      v18, v28, v29, v30, v31
+        calc_epelb2     v22, v28, v29, v30, v31
+        ld3            {v24.8h, v25.8h, v26.8h}, [x4], #48
+        sqadd           v16.8h, v16.8h, v24.8h
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqadd           v18.8h, v18.8h, v26.8h
+        ld3            {v27.8h, v28.8h, v29.8h}, [x4], x10
+        sqadd           v20.8h, v20.8h, v27.8h
+        sqadd           v21.8h, v21.8h, v28.8h
+        sqadd           v22.8h, v22.8h, v29.8h
+        sqrshrun        v4.8b, v16.8h, #7
+        sqrshrun        v5.8b, v17.8h, #7
+        sqrshrun        v6.8b, v18.8h, #7
+        sqrshrun2       v4.16b, v20.8h, #7
+        sqrshrun2       v5.16b, v21.8h, #7
+        sqrshrun2       v6.16b, v22.8h, #7
+        st3            {v4.16b, v5.16b, v6.16b}, [x0], x1
+        subs            x5, x5, #1   // height
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_h64_8_neon, export=1
+        load_epel_filterb x6, x7
+        sub             x2, x2, #1
+        sub             x3, x3, #64
+        mov             x7, #32
+1:      ld4            {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x7
+        ushr            v28.2d, v24.2d, #8
+        ushr            v29.2d, v25.2d, #8
+        ushr            v30.2d, v26.2d, #8
+        ld1            {v4.s}[0], [x2], x7
+        ld1            {v5.s}[0], [x2], x3
+        ins             v28.b[7], v4.b[0]
+        ins             v28.b[15], v5.b[0]
+        ins             v29.b[7], v4.b[1]
+        ins             v29.b[15], v5.b[1]
+        ins             v30.b[7], v4.b[2]
+        ins             v30.b[15], v5.b[2]
+        movi            v16.8h, #0
+        movi            v17.8h, #0
+        movi            v18.8h, #0
+        movi            v19.8h, #0
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        calc_epelb      v16, v24, v25, v26, v27
+        calc_epelb2     v20, v24, v25, v26, v27
+        calc_epelb      v17, v25, v26, v27, v28
+        calc_epelb2     v21, v25, v26, v27, v28
+        calc_epelb      v18, v26, v27, v28, v29
+        calc_epelb2     v22, v26, v27, v28, v29
+        calc_epelb      v19, v27, v28, v29, v30
+        calc_epelb2     v23, v27, v28, v29, v30
+        ld4            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #64
+        sqadd           v16.8h, v16.8h, v24.8h
+        sqadd           v17.8h, v17.8h, v25.8h
+        sqadd           v18.8h, v18.8h, v26.8h
+        sqadd           v19.8h, v19.8h, v27.8h
+        ld4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
+        sqadd           v20.8h, v20.8h, v28.8h
+        sqadd           v21.8h, v21.8h, v29.8h
+        sqadd           v22.8h, v22.8h, v30.8h
+        sqadd           v23.8h, v23.8h, v31.8h
+        sqrshrun        v4.8b, v16.8h, #7
+        sqrshrun        v5.8b, v17.8h, #7
+        sqrshrun        v6.8b, v18.8h, #7
+        sqrshrun        v7.8b, v19.8h, #7
+        sqrshrun2       v4.16b, v20.8h, #7
+        sqrshrun2       v5.16b, v21.8h, #7
+        sqrshrun2       v6.16b, v22.8h, #7
+        sqrshrun2       v7.16b, v23.8h, #7
+        st4            {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
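+// epel_bi_v*: vertical filtering keeps a sliding window of four source
+// rows in registers.  The loop is unrolled four times, rotating which
+// register receives the newest row, so no data is moved between
+// registers; every stage loads one row and retests the remaining height.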
+function ff_hevc_put_hevc_epel_bi_v4_8_neon, export=1
+        load_epel_filterb x7, x6
+        sub             x2, x2, x3
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.s}[0], [x2], x3
+        ld1            {v17.s}[0], [x2], x3
+        ld1            {v18.s}[0], [x2], x3
+1:      ld1            {v19.s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        ld1            {v24.4h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        ld1            {v24.4h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        ld1            {v24.4h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.s}[0], [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        ld1            {v24.4h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v6_8_neon, export=1
+        load_epel_filterb x7, x6
+        sub             x2, x2, x3
+        sub             x1, x1, #4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8b}, [x2], x3
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+1:      ld1            {v19.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v8_8_neon, export=1
+        load_epel_filterb x7, x6
+        sub             x2, x2, x3
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8b}, [x2], x3
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+1:      ld1            {v19.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v4.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v12_8_neon, export=1
+        load_epel_filterb x7, x6
+        sub             x1, x1, #8
+        sub             x2, x2, x3
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b}, [x2], x3
+        ld1            {v17.16b}, [x2], x3
+        ld1            {v18.16b}, [x2], x3
+1:      ld1            {v19.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        calc_epelb2     v5, v16, v17, v18, v19
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        calc_epelb2     v5, v17, v18, v19, v16
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        calc_epelb2     v5, v18, v19, v16, v17
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        calc_epelb2     v5, v19, v16, v17, v18
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v16_8_neon, export=1
+        load_epel_filterb x7, x6
+        sub             x2, x2, x3
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b}, [x2], x3
+        ld1            {v17.16b}, [x2], x3
+        ld1            {v18.16b}, [x2], x3
+1:      ld1            {v19.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v16, v17, v18, v19
+        calc_epelb2     v5, v16, v17, v18, v19
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v17, v18, v19, v16
+        calc_epelb2     v5, v17, v18, v19, v16
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v18, v19, v16, v17
+        calc_epelb2     v5, v18, v19, v16, v17
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        calc_epelb      v4, v19, v16, v17, v18
+        calc_epelb2     v5, v19, v16, v17, v18
+        ld1            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v24_8_neon, export=1
+        load_epel_filterb x7, x6
+        sub             x2, x2, x3
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8b, v17.8b, v18.8b}, [x2], x3
+        ld1            {v19.8b, v20.8b, v21.8b}, [x2], x3
+        ld1            {v22.8b, v23.8b, v24.8b}, [x2], x3
+1:      ld1            {v25.8b, v26.8b, v27.8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v16, v19, v22, v25
+        calc_epelb      v5, v17, v20, v23, v26
+        calc_epelb      v6, v18, v21, v24, v27
+        ld1            {v28.8h, v29.8h, v30.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v28.8h
+        sqadd           v5.8h, v5.8h, v29.8h
+        sqadd           v6.8h, v6.8h, v30.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun        v5.8b, v5.8h, #7
+        sqrshrun        v6.8b, v6.8h, #7
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8b, v17.8b, v18.8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v19, v22, v25, v16
+        calc_epelb      v5, v20, v23, v26, v17
+        calc_epelb      v6, v21, v24, v27, v18
+        ld1            {v28.8h, v29.8h, v30.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v28.8h
+        sqadd           v5.8h, v5.8h, v29.8h
+        sqadd           v6.8h, v6.8h, v30.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun        v5.8b, v5.8h, #7
+        sqrshrun        v6.8b, v6.8h, #7
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.8b, v20.8b, v21.8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v22, v25, v16, v19
+        calc_epelb      v5, v23, v26, v17, v20
+        calc_epelb      v6, v24, v27, v18, v21
+        ld1            {v28.8h, v29.8h, v30.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v28.8h
+        sqadd           v5.8h, v5.8h, v29.8h
+        sqadd           v6.8h, v6.8h, v30.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun        v5.8b, v5.8h, #7
+        sqrshrun        v6.8b, v6.8h, #7
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.8b, v23.8b, v24.8b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        calc_epelb      v4, v25, v16, v19, v22
+        calc_epelb      v5, v26, v17, v20, v23
+        calc_epelb      v6, v27, v18, v21, v24
+        ld1            {v28.8h, v29.8h, v30.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v28.8h
+        sqadd           v5.8h, v5.8h, v29.8h
+        sqadd           v6.8h, v6.8h, v30.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun        v5.8b, v5.8h, #7
+        sqrshrun        v6.8b, v6.8h, #7
+        st1            {v4.8b, v5.8b, v6.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v32_8_neon, export=1
+        load_epel_filterb x7, x6
+        sub             x2, x2, x3
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b, v17.16b}, [x2], x3
+        ld1            {v18.16b, v19.16b}, [x2], x3
+        ld1            {v20.16b, v21.16b}, [x2], x3
+1:      ld1            {v22.16b, v23.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v16, v18, v20, v22
+        calc_epelb2     v5, v16, v18, v20, v22
+        calc_epelb      v6, v17, v19, v21, v23
+        calc_epelb2     v7, v17, v19, v21, v23
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqadd           v6.8h, v6.8h, v26.8h
+        sqadd           v7.8h, v7.8h, v27.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        sqrshrun        v5.8b, v6.8h, #7
+        sqrshrun2       v5.16b, v7.8h, #7
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v18, v20, v22, v16
+        calc_epelb2     v5, v18, v20, v22, v16
+        calc_epelb      v6, v19, v21, v23, v17
+        calc_epelb2     v7, v19, v21, v23, v17
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqadd           v6.8h, v6.8h, v26.8h
+        sqadd           v7.8h, v7.8h, v27.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        sqrshrun        v5.8b, v6.8h, #7
+        sqrshrun2       v5.16b, v7.8h, #7
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.16b, v19.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v20, v22, v16, v18
+        calc_epelb2     v5, v20, v22, v16, v18
+        calc_epelb      v6, v21, v23, v17, v19
+        calc_epelb2     v7, v21, v23, v17, v19
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqadd           v6.8h, v6.8h, v26.8h
+        sqadd           v7.8h, v7.8h, v27.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        sqrshrun        v5.8b, v6.8h, #7
+        sqrshrun2       v5.16b, v7.8h, #7
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.16b, v21.16b}, [x2], x3
+        movi            v4.8h, #0
+        movi            v5.8h, #0
+        movi            v6.8h, #0
+        movi            v7.8h, #0
+        calc_epelb      v4, v22, v16, v18, v20
+        calc_epelb2     v5, v22, v16, v18, v20
+        calc_epelb      v6, v23, v17, v19, v21
+        calc_epelb2     v7, v23, v17, v19, v21
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v24.8h
+        sqadd           v5.8h, v5.8h, v25.8h
+        sqadd           v6.8h, v6.8h, v26.8h
+        sqadd           v7.8h, v7.8h, v27.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        sqrshrun        v5.8b, v6.8h, #7
+        sqrshrun2       v5.16b, v7.8h, #7
+        st1            {v4.16b, v5.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
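+// 48- and 64-wide bi_v: two passes of the 24-/32-wide kernel.  x7 (my)
+// is reloaded for the second pass because the callee clobbers it; dst
+// and src advance by the slice width and src2 by twice that, since it
+// holds int16 samples.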
+function ff_hevc_put_hevc_epel_bi_v48_8_neon, export=1
+        stp             x7, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_bi_v24_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #24
+        add             x2, x2, #24
+        add             x4, x4, #48
+        ldr             x7, [sp]
+        bl              X(ff_hevc_put_hevc_epel_bi_v24_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_v64_8_neon, export=1
+        stp             x7, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_bi_v32_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #32
+        add             x2, x2, #32
+        add             x4, x4, #64
+        ldr             x7, [sp]
+        bl              X(ff_hevc_put_hevc_epel_bi_v32_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
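+// epel_bi_hv*: separable two-pass filtering.  The horizontal kernel
+// writes height+3 rows of int16 into a stack buffer of (x5+3)<<7 bytes
+// (MAX_PB_SIZE*2 per row); the vertical pass then consumes that buffer
+// with post-indexed loads from sp, so the stack is fully popped exactly
+// when the last row has been read.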
+function ff_hevc_put_hevc_epel_bi_hv4_8_neon, export=1
+        add             x10, x5, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x5, #3
+        mov             x4, x6
+        mov             x5, x7
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x7, x6
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.4h}, [sp], x10
+        ld1            {v17.4h}, [sp], x10
+        ld1            {v18.4h}, [sp], x10
+1:      ld1            {v19.4h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        ld1            {v6.4h}, [x4], x10
+        sqadd           v4.4h, v4.4h, v6.4h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.4h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        ld1            {v6.4h}, [x4], x10
+        sqadd           v4.4h, v4.4h, v6.4h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.4h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        ld1            {v6.4h}, [x4], x10
+        sqadd           v4.4h, v4.4h, v6.4h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.4h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        ld1            {v6.4h}, [x4], x10
+        sqadd           v4.4h, v4.4h, v6.4h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv6_8_neon, export=1
+        add             x10, x5, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x5, #3
+        mov             x4, x6
+        mov             x5, x7
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x7, x6
+        sub             x1, x1, #4
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h}, [sp], x10
+        ld1            {v17.8h}, [sp], x10
+        ld1            {v18.8h}, [sp], x10
+1:      ld1            {v19.8h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.s}[0], [x0], #4
+        st1            {v4.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv8_8_neon, export=1
+        add             x10, x5, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x5, #3
+        mov             x4, x6
+        mov             x5, x7
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x7, x6
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h}, [sp], x10
+        ld1            {v17.8h}, [sp], x10
+        ld1            {v18.8h}, [sp], x10
+1:      ld1            {v19.8h}, [sp], x10
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x10
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x10
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        ld1            {v6.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv12_8_neon, export=1
+        add             x10, x5, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x5, #3
+        mov             x4, x6
+        mov             x5, x7
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x7, x6
+        sub             x1, x1, #8
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        ld1            {v20.8h, v21.8h}, [sp], x10
+1:      ld1            {v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.8b}, [x0], #8
+        st1            {v4.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv16_8_neon, export=1
+        add             x10, x5, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x5, #3
+        mov             x4, x6
+        mov             x5, x7
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x7, x6
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        ld1            {v20.8h, v21.8h}, [sp], x10
+1:      ld1            {v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        calc_epelh2     v5, v6, v17, v19, v21, v23
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x10
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        calc_epelh2     v5, v6, v19, v21, v23, v17
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        calc_epelh2     v5, v6, v21, v23, v17, v19
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        calc_epelh2     v5, v6, v23, v17, v19, v21
+        ld1            {v6.8h, v7.8h}, [x4], x10
+        sqadd           v4.8h, v4.8h, v6.8h
+        sqadd           v5.8h, v5.8h, v7.8h
+        sqrshrun        v4.8b, v4.8h, #7
+        sqrshrun2       v4.16b, v5.8h, #7
+        st1            {v4.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv24_8_neon, export=1
+        add             x10, x5, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x5, #3
+        mov             x4, x6
+        mov             x5, x7
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x7, x6
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h, v17.8h, v18.8h}, [sp], x10
+        ld1            {v19.8h, v20.8h, v21.8h}, [sp], x10
+        ld1            {v22.8h, v23.8h, v24.8h}, [sp], x10
+1:      ld1            {v25.8h, v26.8h, v27.8h}, [sp], x10
+        calc_epelh      v1, v16, v19, v22, v25
+        calc_epelh2     v1, v2, v16, v19, v22, v25
+        calc_epelh      v2, v17, v20, v23, v26
+        calc_epelh2     v2, v3, v17, v20, v23, v26
+        calc_epelh      v3, v18, v21, v24, v27
+        calc_epelh2     v3, v4, v18, v21, v24, v27
+        ld1            {v4.8h, v5.8h, v6.8h}, [x4], x10
+        sqadd           v1.8h, v1.8h, v4.8h
+        sqadd           v2.8h, v2.8h, v5.8h
+        sqadd           v3.8h, v3.8h, v6.8h
+        sqrshrun        v1.8b, v1.8h, #7
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h, v18.8h}, [sp], x10
+        calc_epelh      v1, v19, v22, v25, v16
+        calc_epelh2     v1, v2, v19, v22, v25, v16
+        calc_epelh      v2, v20, v23, v26, v17
+        calc_epelh2     v2, v3, v20, v23, v26, v17
+        calc_epelh      v3, v21, v24, v27, v18
+        calc_epelh2     v3, v4, v21, v24, v27, v18
+        ld1            {v4.8h, v5.8h, v6.8h}, [x4], x10
+        sqadd           v1.8h, v1.8h, v4.8h
+        sqadd           v2.8h, v2.8h, v5.8h
+        sqadd           v3.8h, v3.8h, v6.8h
+        sqrshrun        v1.8b, v1.8h, #7
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.8h, v20.8h, v21.8h}, [sp], x10
+        calc_epelh      v1, v22, v25, v16, v19
+        calc_epelh2     v1, v2, v22, v25, v16, v19
+        calc_epelh      v2, v23, v26, v17, v20
+        calc_epelh2     v2, v3, v23, v26, v17, v20
+        calc_epelh      v3, v24, v27, v18, v21
+        calc_epelh2     v3, v4, v24, v27, v18, v21
+        ld1            {v4.8h, v5.8h, v6.8h}, [x4], x10
+        sqadd           v1.8h, v1.8h, v4.8h
+        sqadd           v2.8h, v2.8h, v5.8h
+        sqadd           v3.8h, v3.8h, v6.8h
+        sqrshrun        v1.8b, v1.8h, #7
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.8h, v23.8h, v24.8h}, [sp], x10
+        calc_epelh      v1, v25, v16, v19, v22
+        calc_epelh2     v1, v2, v25, v16, v19, v22
+        calc_epelh      v2, v26, v17, v20, v23
+        calc_epelh2     v2, v3, v26, v17, v20, v23
+        calc_epelh      v3, v27, v18, v21, v24
+        calc_epelh2     v3, v4, v27, v18, v21, v24
+        ld1            {v4.8h, v5.8h, v6.8h}, [x4], x10
+        sqadd           v1.8h, v1.8h, v4.8h
+        sqadd           v2.8h, v2.8h, v5.8h
+        sqadd           v3.8h, v3.8h, v6.8h
+        sqrshrun        v1.8b, v1.8h, #7
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ret
+endfunc
+
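+// The 32-wide vertical pass needs v8 as scratch for the src2 rows;
+// AAPCS64 requires d8-d15 (the low halves of v8-v15) to be preserved,
+// so v8 is saved across the function.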
+function ff_hevc_put_hevc_epel_bi_hv32_8_neon, export=1
+        sub             sp, sp, #16
+        st1            {v8.16b}, [sp]
+        add             x10, x5, #3
+        lsl             x10, x10, #7
+        sub             sp, sp, x10     // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3
+        mov             x2, x3
+        add             x3, x5, #3
+        mov             x4, x6
+        mov             x5, x7
+        bl              X(ff_hevc_put_hevc_epel_h32_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_epel_filterh x7, x6
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h, v17.8h, v18.8h, v19.8h}, [sp], x10
+        ld1            {v20.8h, v21.8h, v22.8h, v23.8h}, [sp], x10
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [sp], x10
+1:      ld1            {v28.8h, v29.8h, v30.8h, v31.8h}, [sp], x10
+        calc_epelh      v1, v16, v20, v24, v28
+        calc_epelh2     v1, v2, v16, v20, v24, v28
+        calc_epelh      v2, v17, v21, v25, v29
+        calc_epelh2     v2, v3, v17, v21, v25, v29
+        calc_epelh      v3, v18, v22, v26, v30
+        calc_epelh2     v3, v4, v18, v22, v26, v30
+        calc_epelh      v4, v19, v23, v27, v31
+        calc_epelh2     v4, v5, v19, v23, v27, v31
+        ld1            {v5.8h, v6.8h, v7.8h, v8.8h}, [x4], x10
+        sqadd           v1.8h, v1.8h, v5.8h
+        sqadd           v2.8h, v2.8h, v6.8h
+        sqadd           v3.8h, v3.8h, v7.8h
+        sqadd           v4.8h, v4.8h, v8.8h
+        sqrshrun        v1.8b, v1.8h, #7
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b, v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h, v18.8h, v19.8h}, [sp], x10
+        calc_epelh      v1, v20, v24, v28, v16
+        calc_epelh2     v1, v2, v20, v24, v28, v16
+        calc_epelh      v2, v21, v25, v29, v17
+        calc_epelh2     v2, v3, v21, v25, v29, v17
+        calc_epelh      v3, v22, v26, v30, v18
+        calc_epelh2     v3, v4, v22, v26, v30, v18
+        calc_epelh      v4, v23, v27, v31, v19
+        calc_epelh2     v4, v5, v23, v27, v31, v19
+        ld1            {v5.8h, v6.8h, v7.8h, v8.8h}, [x4], x10
+        sqadd           v1.8h, v1.8h, v5.8h
+        sqadd           v2.8h, v2.8h, v6.8h
+        sqadd           v3.8h, v3.8h, v7.8h
+        sqadd           v4.8h, v4.8h, v8.8h
+        sqrshrun        v1.8b, v1.8h, #7
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b, v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h, v22.8h, v23.8h}, [sp], x10
+        calc_epelh      v1, v24, v28, v16, v20
+        calc_epelh2     v1, v2, v24, v28, v16, v20
+        calc_epelh      v2, v25, v29, v17, v21
+        calc_epelh2     v2, v3, v25, v29, v17, v21
+        calc_epelh      v3, v26, v30, v18, v22
+        calc_epelh2     v3, v4, v26, v30, v18, v22
+        calc_epelh      v4, v27, v31, v19, v23
+        calc_epelh2     v4, v5, v27, v31, v19, v23
+        ld1            {v5.8h, v6.8h, v7.8h, v8.8h}, [x4], x10
+        sqadd           v1.8h, v1.8h, v5.8h
+        sqadd           v2.8h, v2.8h, v6.8h
+        sqadd           v3.8h, v3.8h, v7.8h
+        sqadd           v4.8h, v4.8h, v8.8h
+        sqrshrun        v1.8b, v1.8h, #7
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b, v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [sp], x10
+        calc_epelh      v1, v28, v16, v20, v24
+        calc_epelh2     v1, v2, v28, v16, v20, v24
+        calc_epelh      v2, v29, v17, v21, v25
+        calc_epelh2     v2, v3, v29, v17, v21, v25
+        calc_epelh      v3, v30, v18, v22, v26
+        calc_epelh2     v3, v4, v30, v18, v22, v26
+        calc_epelh      v4, v31, v19, v23, v27
+        calc_epelh2     v4, v5, v31, v19, v23, v27
+        ld1            {v5.8h, v6.8h, v7.8h, v8.8h}, [x4], x10
+        sqadd           v1.8h, v1.8h, v5.8h
+        sqadd           v2.8h, v2.8h, v6.8h
+        sqadd           v3.8h, v3.8h, v7.8h
+        sqadd           v4.8h, v4.8h, v8.8h
+        sqrshrun        v1.8b, v1.8h, #7
+        sqrshrun        v2.8b, v2.8h, #7
+        sqrshrun        v3.8b, v3.8h, #7
+        sqrshrun        v4.8b, v4.8h, #7
+        st1            {v1.8b, v2.8b, v3.8b, v4.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+2:      ld1            {v8.16b}, [sp], #16
+        ret
+endfunc
+
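+// As with the other large widths, hv48/hv64 save all eight argument
+// registers around the first half-width call, then rerun it with dst and
+// src offset by 24/32 pixels and src2 by 48/64 bytes.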
+function ff_hevc_put_hevc_epel_bi_hv48_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x6, x7, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_bi_hv24_8_neon)
+        ldp             x6, x7, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #24
+        add             x2, x2, #24
+        add             x4, x4, #48
+        bl              X(ff_hevc_put_hevc_epel_bi_hv24_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_bi_hv64_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x6, x7, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_epel_bi_hv32_8_neon)
+        ldp             x6, x7, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #32
+        add             x2, x2, #32
+        add             x4, x4, #64
+        bl              X(ff_hevc_put_hevc_epel_bi_hv32_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index c785e46f79..0e107deea6 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -58,7 +58,106 @@ void ff_hevc_sao_band_filter_8x8_8_neon(uint8_t *_dst, uint8_t *_src,
                                   int16_t *sao_offset_val, int sao_left_class,
                                   int width, int height);
 
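+/* Declare the nine width variants (4..64) of one NEON function family. */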
+#define NEON8_FNPROTO(fn, args) \
+    void ff_hevc_put_hevc_##fn##4_8_neon args; \
+    void ff_hevc_put_hevc_##fn##6_8_neon args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon args; \
+    void ff_hevc_put_hevc_##fn##12_8_neon args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon args; \
+    void ff_hevc_put_hevc_##fn##24_8_neon args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon args; \
+    void ff_hevc_put_hevc_##fn##48_8_neon args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon args;
 
+NEON8_FNPROTO(pel_pixels, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(pel_bi_pixels, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_h, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_v, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_hv, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_uni_v, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_uni_hv, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_bi_v, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(epel_bi_hv, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_v, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_hv, (int16_t *dst,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_h, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_v, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_uni_hv, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_h, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_v, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
+NEON8_FNPROTO(qpel_bi_hv, (uint8_t *dst, ptrdiff_t dststride,
+        uint8_t *src, ptrdiff_t srcstride, int16_t *src2,
+        int height, intptr_t mx, intptr_t my, int width));
+
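+/* Assign the NEON implementations for all nine block widths to one
+ * [size][vertical][horizontal] slot; sizes 1..9 map to widths 4..64. */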
+#define NEON8_FNASSIGN(member, v, h, fn) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon;
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -76,6 +172,28 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->idct_dc[2]                  = ff_hevc_idct_16x16_dc_8_neon;
         c->idct_dc[3]                  = ff_hevc_idct_32x32_dc_8_neon;
         c->sao_band_filter[0]          = ff_hevc_sao_band_filter_8x8_8_neon;
+
+        NEON8_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels);
+        NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h);
+        NEON8_FNASSIGN(c->put_hevc_epel, 1, 0, epel_v);
+        NEON8_FNASSIGN(c->put_hevc_epel, 1, 1, epel_hv);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 1, epel_uni_hv);
+        NEON8_FNASSIGN(c->put_hevc_epel_bi, 0, 0, pel_bi_pixels);
+        NEON8_FNASSIGN(c->put_hevc_epel_bi, 0, 1, epel_bi_h);
+        NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 0, epel_bi_v);
+        NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 1, epel_bi_hv);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 0, qpel_v);
+        NEON8_FNASSIGN(c->put_hevc_qpel, 1, 1, qpel_hv);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 1, qpel_uni_h);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 0, qpel_uni_v);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 1, 1, qpel_uni_hv);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 0, pel_bi_pixels);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 1, qpel_bi_h);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 0, qpel_bi_v);
+        NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv);
     }
     if (bit_depth == 10) {
         c->add_residual[0]             = ff_hevc_add_residual_4x4_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
new file mode 100644
index 0000000000..cc2e9c51f9
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -0,0 +1,5646 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
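+// HEVC luma 8-tap quarter-sample interpolation filter coefficients, one row
+// per fractional position (0..3).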
+.Lqpel_filters:
+        .byte  0,  0,  0,  0,  0,  0, 0,  0
+        .byte -1,  4,-10, 58, 17, -5, 1,  0
+        .byte -1,  4,-11, 40, 40,-11, 4, -1
+        .byte  0,  1, -5, 17, 58,-10, 4, -1
+
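+// Splat the eight filter taps for phase \freg across v0-v7. The taps that
+// are never positive (0, 2, 5, 7) are negated so the filter can be applied
+// to unsigned pixels with umlal/umlsl.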
+.macro load_qpel_filterb freg, xreg
+        adr             \xreg, .Lqpel_filters
+        add             \xreg, \xreg, \freg, lsl #3
+        ld4r           {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg], #4
+        ld4r           {v4.16b, v5.16b, v6.16b, v7.16b}, [\xreg]
+        neg             v0.16b, v0.16b
+        neg             v2.16b, v2.16b
+        neg             v5.16b, v5.16b
+        neg             v7.16b, v7.16b
+.endm
+
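+// Apply the 8-tap filter to the low 8 bytes of eight source registers,
+// accumulating widened 16-bit results into \dst (which must be zeroed).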
+.macro calc_qpelb dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umlsl           \dst\().8h, \src0\().8b, v0.8b
+        umlal           \dst\().8h, \src1\().8b, v1.8b
+        umlsl           \dst\().8h, \src2\().8b, v2.8b
+        umlal           \dst\().8h, \src3\().8b, v3.8b
+        umlal           \dst\().8h, \src4\().8b, v4.8b
+        umlsl           \dst\().8h, \src5\().8b, v5.8b
+        umlal           \dst\().8h, \src6\().8b, v6.8b
+        umlsl           \dst\().8h, \src7\().8b, v7.8b
+.endm
+
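+// As calc_qpelb, but for the high 8 bytes of each source register.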
+.macro calc_qpelb2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlal2          \dst\().8h, \src1\().16b, v1.16b
+        umlsl2          \dst\().8h, \src2\().16b, v2.16b
+        umlal2          \dst\().8h, \src3\().16b, v3.16b
+        umlal2          \dst\().8h, \src4\().16b, v4.16b
+        umlsl2          \dst\().8h, \src5\().16b, v5.16b
+        umlal2          \dst\().8h, \src6\().16b, v6.16b
+        umlsl2          \dst\().8h, \src7\().16b, v7.16b
+.endm
+
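+// Load the eight taps for phase \freg as signed 16-bit values into v0, for
+// the second-pass filtering of 16-bit intermediates in the hv functions.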
+.macro load_qpel_filterh freg, xreg
+        adr             \xreg, .Lqpel_filters
+        add             \xreg, \xreg, \freg, lsl #3
+        ld1            {v0.8b}, [\xreg]
+        sxtl            v0.8h, v0.8b
+.endm
+
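+// Apply the 8-tap filter to 16-bit intermediates with 32-bit accumulation,
+// then shift/narrow with \op by \shift (default 6). calc_qpelh2 below does
+// the same for the high halves of the source registers.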
+.macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
+        smull           \dst\().4s, \src0\().4h, v0.h[0]
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        smlal           \dst\().4s, \src4\().4h, v0.h[4]
+        smlal           \dst\().4s, \src5\().4h, v0.h[5]
+        smlal           \dst\().4s, \src6\().4h, v0.h[6]
+        smlal           \dst\().4s, \src7\().4h, v0.h[7]
+.ifeqs "\op", "sshr"
+        sshr            \dst\().4s, \dst\().4s, \shift
+.else
+        \op             \dst\().4h, \dst\().4s, \shift
+.endif
+.endm
+
+.macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
+        smull2          \dstt\().4s, \src0\().8h, v0.h[0]
+        smlal2          \dstt\().4s, \src1\().8h, v0.h[1]
+        smlal2          \dstt\().4s, \src2\().8h, v0.h[2]
+        smlal2          \dstt\().4s, \src3\().8h, v0.h[3]
+        smlal2          \dstt\().4s, \src4\().8h, v0.h[4]
+        smlal2          \dstt\().4s, \src5\().8h, v0.h[5]
+        smlal2          \dstt\().4s, \src6\().8h, v0.h[6]
+        smlal2          \dstt\().4s, \src7\().8h, v0.h[7]
+.ifeqs "\op", "sshr"
+        sshr            \dst\().4s, \dstt\().4s, \shift
+.else
+        \op             \dst\().8h, \dstt\().4s, \shift
+.endif
+.endm
+
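+// qpel_h* arguments: x0 = dst (int16_t, stride MAX_PB_SIZE * 2 bytes),
+// x1 = src, x2 = srcstride, x3 = height, x4 = mx (filter phase).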
+function ff_hevc_put_hevc_qpel_h4_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        sub             x2, x2, #8
+        mov             x14, #(MAX_PB_SIZE * 2)
+1:      ld1            {v16.8b}, [x1], #8
+        ld1            {v17.s}[0], [x1], x2
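+        // Each ushr shifts the source window right by one pixel; where the
+        // lane is needed, the incoming byte is inserted from the tail load.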
+        ushr            v18.2d, v16.2d, #8
+        mov             v18.b[7], v17.b[0]
+        ushr            v19.2d, v18.2d, #8
+        mov             v19.b[7], v17.b[1]
+        ushr            v20.2d, v19.2d, #8
+        mov             v20.b[7], v17.b[2]
+        ushr            v21.2d, v20.2d, #8
+        ushr            v22.2d, v21.2d, #8
+        ushr            v23.2d, v22.2d, #8
+        ushr            v24.2d, v23.2d, #8
+        movi            v28.8h, #0
+        calc_qpelb      v28, v16, v18, v19, v20, v21, v22, v23, v24
+        st1            {v28.4h}, [x0], x14
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h6_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        mov             x14, #(MAX_PB_SIZE * 2 - 8)
+1:      ld1            {v16.8b, v17.8b}, [x1], x2
+        ushr            v18.2d, v16.2d, #8
+        mov             v18.b[7], v17.b[0]
+        ushr            v19.2d, v18.2d, #8
+        mov             v19.b[7], v17.b[1]
+        ushr            v20.2d, v19.2d, #8
+        mov             v20.b[7], v17.b[2]
+        ushr            v21.2d, v20.2d, #8
+        mov             v21.b[7], v17.b[3]
+        ushr            v22.2d, v21.2d, #8
+        mov             v22.b[7], v17.b[4]
+        ushr            v23.2d, v22.2d, #8
+        ushr            v24.2d, v23.2d, #8
+        movi            v28.8h, #0
+        calc_qpelb      v28, v16, v18, v19, v20, v21, v22, v23, v24
+        st1            {v28.4h}, [x0], #8
+        st1            {v28.s}[2], [x0], x14
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h8_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        mov             x14, #(MAX_PB_SIZE * 2)
+1:      ld1            {v16.8b, v17.8b}, [x1], x2
+        ushr            v18.2d, v16.2d, #8
+        mov             v18.b[7], v17.b[0]
+        ushr            v19.2d, v18.2d, #8
+        mov             v19.b[7], v17.b[1]
+        ushr            v20.2d, v19.2d, #8
+        mov             v20.b[7], v17.b[2]
+        ushr            v21.2d, v20.2d, #8
+        mov             v21.b[7], v17.b[3]
+        ushr            v22.2d, v21.2d, #8
+        mov             v22.b[7], v17.b[4]
+        ushr            v23.2d, v22.2d, #8
+        mov             v23.b[7], v17.b[5]
+        ushr            v24.2d, v23.2d, #8
+        mov             v24.b[7], v17.b[6]
+        movi            v28.8h, #0
+        calc_qpelb      v28, v16, v18, v19, v20, v21, v22, v23, v24
+        st1            {v28.8h}, [x0], x14
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
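+// The wider horizontal functions deinterleave the source with ld2/ld3/ld4,
+// so each accumulator filters every 2nd/3rd/4th pixel; the results are
+// re-interleaved with zip or st2/st3/st4.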
+function ff_hevc_put_hevc_qpel_h12_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        sub             x2, x2, #16
+        mov             x14, #(MAX_PB_SIZE * 2 - 16)
+1:      ld2            {v16.8b, v17.8b}, [x1], #16
+        ld1            {v27.s}[0], [x1], x2
+        ushr            v18.2d, v16.2d, #8
+        ushr            v19.2d, v17.2d, #8
+        mov             v18.b[7], v27.b[0]
+        mov             v19.b[7], v27.b[1]
+        ushr            v20.2d, v18.2d, #8
+        ushr            v21.2d, v19.2d, #8
+        mov             v20.b[7], v27.b[2]
+        mov             v21.b[7], v27.b[3]
+        ushr            v22.2d, v20.2d, #8
+        ushr            v23.2d, v21.2d, #8
+        ushr            v24.2d, v22.2d, #8
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
+        zip1            v16.8h, v28.8h, v29.8h
+        zip2            v17.8h, v28.8h, v29.8h
+        st1            {v16.8h}, [x0], #16
+        st1            {v17.4h}, [x0], x14
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h16_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        sub             x2, x2, #16
+        mov             x14, #(MAX_PB_SIZE * 2)
+1:      ld2            {v16.8b, v17.8b}, [x1], #16
+        ld1            {v27.8b}, [x1], x2
+        ushr            v18.2d, v16.2d, #8
+        ushr            v19.2d, v17.2d, #8
+        mov             v18.b[7], v27.b[0]
+        mov             v19.b[7], v27.b[1]
+        ushr            v20.2d, v18.2d, #8
+        ushr            v21.2d, v19.2d, #8
+        mov             v20.b[7], v27.b[2]
+        mov             v21.b[7], v27.b[3]
+        ushr            v22.2d, v20.2d, #8
+        ushr            v23.2d, v21.2d, #8
+        mov             v22.b[7], v27.b[4]
+        mov             v23.b[7], v27.b[5]
+        ushr            v24.2d, v22.2d, #8
+        mov             v24.b[7], v27.b[6]
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
+        st2            {v28.8h, v29.8h}, [x0], x14
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h24_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        sub             x2, x2, #24
+        mov             x14, #(MAX_PB_SIZE * 2)
+1:      ld3            {v16.8b, v17.8b, v18.8b}, [x1], #24
+        ld1            {v27.8b}, [x1], x2
+        ushr            v19.2d, v16.2d, #8
+        ushr            v20.2d, v17.2d, #8
+        ushr            v21.2d, v18.2d, #8
+        mov             v19.b[7], v27.b[0]
+        mov             v20.b[7], v27.b[1]
+        mov             v21.b[7], v27.b[2]
+        ushr            v22.2d, v19.2d, #8
+        ushr            v23.2d, v20.2d, #8
+        ushr            v24.2d, v21.2d, #8
+        mov             v22.b[7], v27.b[3]
+        mov             v23.b[7], v27.b[4]
+        mov             v24.b[7], v27.b[5]
+        ushr            v25.2d, v22.2d, #8
+        mov             v25.b[7], v27.b[6]
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb      v30, v18, v19, v20, v21, v22, v23, v24, v25
+        st3            {v28.8h, v29.8h, v30.8h}, [x0], x14
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h32_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        sub             x2, x2, #32
+        mov             x14, #(MAX_PB_SIZE * 2)
+1:      ld4            {v16.8b, v17.8b, v18.8b, v19.8b}, [x1], #32
+        ld1            {v27.8b}, [x1], x2
+        ushr            v20.2d, v16.2d, #8
+        ushr            v21.2d, v17.2d, #8
+        ushr            v22.2d, v18.2d, #8
+        ushr            v23.2d, v19.2d, #8
+        mov             v20.b[7], v27.b[0]
+        mov             v21.b[7], v27.b[1]
+        mov             v22.b[7], v27.b[2]
+        mov             v23.b[7], v27.b[3]
+        ushr            v24.2d, v20.2d, #8
+        ushr            v25.2d, v21.2d, #8
+        ushr            v26.2d, v22.2d, #8
+        mov             v24.b[7], v27.b[4]
+        mov             v25.b[7], v27.b[5]
+        mov             v26.b[7], v27.b[6]
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        movi            v31.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb      v30, v18, v19, v20, v21, v22, v23, v24, v25
+        calc_qpelb      v31, v19, v20, v21, v22, v23, v24, v25, v26
+        st4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h48_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        sub             x2, x2, #48
+        mov             x7, #24
+        mov             x14, #(MAX_PB_SIZE * 2 - 48)
+1:      ld3            {v16.16b, v17.16b, v18.16b}, [x1], x7
+        ld1            {v26.8b}, [x1], x7
+        ld1            {v27.8b}, [x1], x2
+        ushr            v19.2d, v16.2d, #8
+        ushr            v20.2d, v17.2d, #8
+        ushr            v21.2d, v18.2d, #8
+        mov             v19.b[7], v26.b[0]
+        mov             v19.b[15], v27.b[0]
+        mov             v20.b[7], v26.b[1]
+        mov             v20.b[15], v27.b[1]
+        mov             v21.b[7], v26.b[2]
+        mov             v21.b[15], v27.b[2]
+        ushr            v22.2d, v19.2d, #8
+        ushr            v23.2d, v20.2d, #8
+        ushr            v24.2d, v21.2d, #8
+        mov             v22.b[7], v26.b[3]
+        mov             v22.b[15], v27.b[3]
+        mov             v23.b[7], v26.b[4]
+        mov             v23.b[15], v27.b[4]
+        mov             v24.b[7], v26.b[5]
+        mov             v24.b[15], v27.b[5]
+        ushr            v25.2d, v22.2d, #8
+        mov             v25.b[7], v26.b[6]
+        mov             v25.b[15], v27.b[6]
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb      v30, v18, v19, v20, v21, v22, v23, v24, v25
+        st3            {v28.8h, v29.8h, v30.8h}, [x0], #48
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        calc_qpelb2     v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb2     v30, v18, v19, v20, v21, v22, v23, v24, v25
+        st3            {v28.8h, v29.8h, v30.8h}, [x0], x14
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h64_8_neon, export=1
+        load_qpel_filterb x4, x5
+        sub             x1, x1, #3
+        sub             x2, x2, #64
+        mov             x7, #32
+1:      ld4            {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x7
+        ld1            {v27.8b}, [x1], x7
+        ld1            {v28.8b}, [x1], x2
+        ushr            v20.2d, v16.2d, #8
+        ushr            v21.2d, v17.2d, #8
+        ushr            v22.2d, v18.2d, #8
+        ushr            v23.2d, v19.2d, #8
+        mov             v20.b[7], v27.b[0]
+        mov             v21.b[7], v27.b[1]
+        mov             v22.b[7], v27.b[2]
+        mov             v23.b[7], v27.b[3]
+        mov             v20.b[15], v28.b[0]
+        mov             v21.b[15], v28.b[1]
+        mov             v22.b[15], v28.b[2]
+        mov             v23.b[15], v28.b[3]
+        ushr            v24.2d, v20.2d, #8
+        ushr            v25.2d, v21.2d, #8
+        ushr            v26.2d, v22.2d, #8
+        mov             v24.b[7], v27.b[4]
+        mov             v25.b[7], v27.b[5]
+        mov             v26.b[7], v27.b[6]
+        mov             v24.b[15], v28.b[4]
+        mov             v25.b[15], v28.b[5]
+        mov             v26.b[15], v28.b[6]
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        movi            v31.8h, #0
+        calc_qpelb      v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb      v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb      v30, v18, v19, v20, v21, v22, v23, v24, v25
+        calc_qpelb      v31, v19, v20, v21, v22, v23, v24, v25, v26
+        st4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64
+        movi            v28.8h, #0
+        movi            v29.8h, #0
+        movi            v30.8h, #0
+        movi            v31.8h, #0
+        calc_qpelb2     v28, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v29, v17, v18, v19, v20, v21, v22, v23, v24
+        calc_qpelb2     v30, v18, v19, v20, v21, v22, v23, v24, v25
+        calc_qpelb2     v31, v19, v20, v21, v22, v23, v24, v25, v26
+        st4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64
+        subs            x3, x3, #1
+        b.ne            1b
+        ret
+endfunc
+
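+// The vertical functions (phase in x5 = my) keep eight source rows in
+// registers and rotate them through an 8-times unrolled loop, so each
+// iteration loads just one new row.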
+function ff_hevc_put_hevc_qpel_v4_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.s}[0], [x1], x2
+        ld1            {v17.s}[0], [x1], x2
+        ld1            {v18.s}[0], [x1], x2
+        ld1            {v19.s}[0], [x1], x2
+        ld1            {v20.s}[0], [x1], x2
+        ld1            {v21.s}[0], [x1], x2
+        ld1            {v22.s}[0], [x1], x2
+1:      ld1            {v23.s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.s}[0], [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        st1            {v24.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v6_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        mov             x9, #(MAX_PB_SIZE * 2 - 8)
+        ld1            {v16.8b}, [x1], x2
+        ld1            {v17.8b}, [x1], x2
+        ld1            {v18.8b}, [x1], x2
+        ld1            {v19.8b}, [x1], x2
+        ld1            {v20.8b}, [x1], x2
+        ld1            {v21.8b}, [x1], x2
+        ld1            {v22.8b}, [x1], x2
+1:      ld1            {v23.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        st1            {v24.4h}, [x0], #8
+        st1            {v24.s}[2], [x0], x9
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v8_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8b}, [x1], x2
+        ld1            {v17.8b}, [x1], x2
+        ld1            {v18.8b}, [x1], x2
+        ld1            {v19.8b}, [x1], x2
+        ld1            {v20.8b}, [x1], x2
+        ld1            {v21.8b}, [x1], x2
+        ld1            {v22.8b}, [x1], x2
+1:      ld1            {v23.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8b}, [x1], x2
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        st1            {v24.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v12_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        mov             x9, #(MAX_PB_SIZE * 2 - 16)
+        ld1            {v16.16b}, [x1], x2
+        ld1            {v17.16b}, [x1], x2
+        ld1            {v18.16b}, [x1], x2
+        ld1            {v19.16b}, [x1], x2
+        ld1            {v20.16b}, [x1], x2
+        ld1            {v21.16b}, [x1], x2
+        ld1            {v22.16b}, [x1], x2
+1:      ld1            {v23.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v25, v16, v17, v18, v19, v20, v21, v22, v23
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        calc_qpelb2     v25, v17, v18, v19, v20, v21, v22, v23, v16
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        calc_qpelb2     v25, v18, v19, v20, v21, v22, v23, v16, v17
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        calc_qpelb2     v25, v19, v20, v21, v22, v23, v16, v17, v18
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        calc_qpelb2     v25, v20, v21, v22, v23, v16, v17, v18, v19
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        calc_qpelb2     v25, v21, v22, v23, v16, v17, v18, v19, v20
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        calc_qpelb2     v25, v22, v23, v16, v17, v18, v19, v20, v21
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        calc_qpelb2     v25, v23, v16, v17, v18, v19, v20, v21, v22
+        st1            {v24.8h}, [x0], #16
+        st1            {v25.4h}, [x0], x9
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v16_8_neon, export=1
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b}, [x1], x2
+        ld1            {v17.16b}, [x1], x2
+        ld1            {v18.16b}, [x1], x2
+        ld1            {v19.16b}, [x1], x2
+        ld1            {v20.16b}, [x1], x2
+        ld1            {v21.16b}, [x1], x2
+        ld1            {v22.16b}, [x1], x2
+1:      ld1            {v23.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v25, v16, v17, v18, v19, v20, v21, v22, v23
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        calc_qpelb2     v25, v17, v18, v19, v20, v21, v22, v23, v16
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        calc_qpelb2     v25, v18, v19, v20, v21, v22, v23, v16, v17
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        calc_qpelb2     v25, v19, v20, v21, v22, v23, v16, v17, v18
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        calc_qpelb2     v25, v20, v21, v22, v23, v16, v17, v18, v19
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        calc_qpelb2     v25, v21, v22, v23, v16, v17, v18, v19, v20
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        calc_qpelb2     v25, v22, v23, v16, v17, v18, v19, v20, v21
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.16b}, [x1], x2
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        calc_qpelb2     v25, v23, v16, v17, v18, v19, v20, v21, v22
+        st1            {v24.8h, v25.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+// TODO: this reads 32 bytes per row where only 24 are needed.
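+// v8-v10 are callee-saved (AAPCS64), so they are spilled around the loop.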
+function ff_hevc_put_hevc_qpel_v24_8_neon, export=1
+        sub             sp, sp, #48
+        st1            {v8.16b, v9.16b, v10.16b}, [sp]
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b, v17.16b}, [x1], x2
+        ld1            {v18.16b, v19.16b}, [x1], x2
+        ld1            {v20.16b, v21.16b}, [x1], x2
+        ld1            {v22.16b, v23.16b}, [x1], x2
+        ld1            {v24.16b, v25.16b}, [x1], x2
+        ld1            {v26.16b, v27.16b}, [x1], x2
+        ld1            {v28.16b, v29.16b}, [x1], x2
+1:      ld1            {v30.16b, v31.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v16, v18, v20, v22, v24, v26, v28, v30
+        calc_qpelb2     v9,  v16, v18, v20, v22, v24, v26, v28, v30
+        calc_qpelb      v10, v17, v19, v21, v23, v25, v27, v29, v31
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb2     v9,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb      v10, v19, v21, v23, v25, v27, v29, v31, v17
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.16b, v19.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb2     v9,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb      v10, v21, v23, v25, v27, v29, v31, v17, v19
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.16b, v21.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb2     v9,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb      v10, v23, v25, v27, v29, v31, v17, v19, v21
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.16b, v23.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb2     v9,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb      v10, v25, v27, v29, v31, v17, v19, v21, v23
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v24.16b, v25.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb2     v9,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb      v10, v27, v29, v31, v17, v19, v21, v23, v25
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v26.16b, v27.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb2     v9,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb      v10, v29, v31, v17, v19, v21, v23, v25, v27
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v28.16b, v29.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        calc_qpelb      v8,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb2     v9,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb      v10, v31, v17, v19, v21, v23, v25, v27, v29
+        st1            {v8.8h, v9.8h, v10.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ld1            {v8.16b, v9.16b, v10.16b}, [sp], #48
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_v32_8_neon, export=1
+        sub             sp, sp, #64
+        st1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b, v17.16b}, [x1], x2
+        ld1            {v18.16b, v19.16b}, [x1], x2
+        ld1            {v20.16b, v21.16b}, [x1], x2
+        ld1            {v22.16b, v23.16b}, [x1], x2
+        ld1            {v24.16b, v25.16b}, [x1], x2
+        ld1            {v26.16b, v27.16b}, [x1], x2
+        ld1            {v28.16b, v29.16b}, [x1], x2
+1:      ld1            {v30.16b, v31.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v16, v18, v20, v22, v24, v26, v28, v30
+        calc_qpelb2     v9,  v16, v18, v20, v22, v24, v26, v28, v30
+        calc_qpelb      v10, v17, v19, v21, v23, v25, v27, v29, v31
+        calc_qpelb2     v11, v17, v19, v21, v23, v25, v27, v29, v31
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.16b, v17.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb2     v9,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb      v10, v19, v21, v23, v25, v27, v29, v31, v17
+        calc_qpelb2     v11, v19, v21, v23, v25, v27, v29, v31, v17
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.16b, v19.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb2     v9,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb      v10, v21, v23, v25, v27, v29, v31, v17, v19
+        calc_qpelb2     v11, v21, v23, v25, v27, v29, v31, v17, v19
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.16b, v21.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb2     v9,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb      v10, v23, v25, v27, v29, v31, v17, v19, v21
+        calc_qpelb2     v11, v23, v25, v27, v29, v31, v17, v19, v21
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.16b, v23.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb2     v9,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb      v10, v25, v27, v29, v31, v17, v19, v21, v23
+        calc_qpelb2     v11, v25, v27, v29, v31, v17, v19, v21, v23
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v24.16b, v25.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb2     v9,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb      v10, v27, v29, v31, v17, v19, v21, v23, v25
+        calc_qpelb2     v11, v27, v29, v31, v17, v19, v21, v23, v25
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v26.16b, v27.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb2     v9,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb      v10, v29, v31, v17, v19, v21, v23, v25, v27
+        calc_qpelb2     v11, v29, v31, v17, v19, v21, v23, v25, v27
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v28.16b, v29.16b}, [x1], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb2     v9,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb      v10, v31, v17, v19, v21, v23, v25, v27, v29
+        calc_qpelb2     v11, v31, v17, v19, v21, v23, v25, v27, v29
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], x9
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ld1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], #64
+        ret
+endfunc
+
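+// 48-wide vertical is two 24-wide calls: dst advances 24 int16_t elements
+// (48 bytes), src 24 pixels; x5 (my) is reloaded from the stack in between.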
+function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
+        stp             x5, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        ldr             x5, [sp]
+        add             x0, x0, #48
+        add             x1, x1, #24
+        bl              X(ff_hevc_put_hevc_qpel_v24_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
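+// Width 64 is processed as two 32-pixel wide strips: x6 (width) counts down
+// by 32 per outer iteration.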
+function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
+        sub             sp, sp, #64
+        st1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+        load_qpel_filterb x5, x4
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        mov             x9, #(MAX_PB_SIZE * 2)
+1:      mov             x11, x3         // height
+        mov             x10, x0         // dst
+        mov             x8, x1          // src
+
+        ld1            {v16.16b, v17.16b}, [x8], x2
+        ld1            {v18.16b, v19.16b}, [x8], x2
+        ld1            {v20.16b, v21.16b}, [x8], x2
+        ld1            {v22.16b, v23.16b}, [x8], x2
+        ld1            {v24.16b, v25.16b}, [x8], x2
+        ld1            {v26.16b, v27.16b}, [x8], x2
+        ld1            {v28.16b, v29.16b}, [x8], x2
+2:      ld1            {v30.16b, v31.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v16, v18, v20, v22, v24, v26, v28, v30
+        calc_qpelb2     v9,  v16, v18, v20, v22, v24, v26, v28, v30
+        calc_qpelb      v10, v17, v19, v21, v23, v25, v27, v29, v31
+        calc_qpelb2     v11, v17, v19, v21, v23, v25, v27, v29, v31
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v16.16b, v17.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb2     v9,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb      v10, v19, v21, v23, v25, v27, v29, v31, v17
+        calc_qpelb2     v11, v19, v21, v23, v25, v27, v29, v31, v17
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v18.16b, v19.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb2     v9,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb      v10, v21, v23, v25, v27, v29, v31, v17, v19
+        calc_qpelb2     v11, v21, v23, v25, v27, v29, v31, v17, v19
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v20.16b, v21.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb2     v9,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb      v10, v23, v25, v27, v29, v31, v17, v19, v21
+        calc_qpelb2     v11, v23, v25, v27, v29, v31, v17, v19, v21
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v22.16b, v23.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb2     v9,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb      v10, v25, v27, v29, v31, v17, v19, v21, v23
+        calc_qpelb2     v11, v25, v27, v29, v31, v17, v19, v21, v23
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v24.16b, v25.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb2     v9,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb      v10, v27, v29, v31, v17, v19, v21, v23, v25
+        calc_qpelb2     v11, v27, v29, v31, v17, v19, v21, v23, v25
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v26.16b, v27.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb2     v9,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb      v10, v29, v31, v17, v19, v21, v23, v25, v27
+        calc_qpelb2     v11, v29, v31, v17, v19, v21, v23, v25, v27
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v28.16b, v29.16b}, [x8], x2
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb2     v9,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb      v10, v31, v17, v19, v21, v23, v25, v27, v29
+        calc_qpelb2     v11, v31, v17, v19, v21, v23, v25, v27, v29
+        st1            {v8.8h, v9.8h, v10.8h, v11.8h}, [x10], x9
+        subs            x11, x11, #1
+        b.hi            2b
+
+3:      add             x0, x0, #64
+        add             x1, x1, #32
+        subs            x6, x6, #32
+        b.hi            1b
+        ld1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], #64
+        ret
+endfunc
+
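+// The hv functions first run the horizontal filter into a temporary buffer
+// of (height + 7) rows on the stack, then apply the vertical filter to the
+// 16-bit intermediates. The post-indexed loads consume the whole buffer, so
+// sp is back at its original value when the loop exits.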
+function ff_hevc_put_hevc_qpel_hv4_8_neon, export=1
+        add             x10, x3, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        add             x3, x3, #7
+        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_qpel_filterh x5, x4
+        mov             x7, #128
+        ld1            {v16.4h}, [sp], x7
+        ld1            {v17.4h}, [sp], x7
+        ld1            {v18.4h}, [sp], x7
+        ld1            {v19.4h}, [sp], x7
+        ld1            {v20.4h}, [sp], x7
+        ld1            {v21.4h}, [sp], x7
+        ld1            {v22.4h}, [sp], x7
+1:      ld1            {v23.4h}, [sp], x7
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.4h}, [sp], x7
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.4h}, [sp], x7
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.4h}, [sp], x7
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.4h}, [sp], x7
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.4h}, [sp], x7
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.4h}, [sp], x7
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.4h}, [sp], x7
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn
+        st1            {v1.4h}, [x0], x7
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
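+// Same two-pass scheme as hv4; rows of 6 are stored as one 4h store plus
+// one 32 bit lane store, with x8 = 120 so that 8 + 120 bytes advance x0
+// by exactly one 128 byte dst row.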
+function ff_hevc_put_hevc_qpel_hv6_8_neon, export=1
+        add             x10, x3, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        add             x3, x3, #7
+        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_qpel_filterh x5, x4
+        mov             x7, #128
+        mov             x8, #120
+        ld1            {v16.8h}, [sp], x7
+        ld1            {v17.8h}, [sp], x7
+        ld1            {v18.8h}, [sp], x7
+        ld1            {v19.8h}, [sp], x7
+        ld1            {v20.8h}, [sp], x7
+        ld1            {v21.8h}, [sp], x7
+        ld1            {v22.8h}, [sp], x7
+1:      ld1            {v23.8h}, [sp], x7
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn
+        calc_qpelh2     v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x7
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn
+        calc_qpelh2     v1, v2, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x7
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn
+        calc_qpelh2     v1, v2, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x7
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn
+        calc_qpelh2     v1, v2, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.8h}, [sp], x7
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn
+        calc_qpelh2     v1, v2, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8h}, [sp], x7
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn
+        calc_qpelh2     v1, v2, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.8h}, [sp], x7
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn
+        calc_qpelh2     v1, v2, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8h}, [sp], x7
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn
+        calc_qpelh2     v1, v2, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn2
+        st1            {v1.4h}, [x0], #8
+        st1            {v1.s}[2], [x0], x8
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv8_8_neon, export=1
+        add             x10, x3, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        add             x3, x3, #7
+        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_qpel_filterh x5, x4
+        mov             x7, #128
+        ld1            {v16.8h}, [sp], x7
+        ld1            {v17.8h}, [sp], x7
+        ld1            {v18.8h}, [sp], x7
+        ld1            {v19.8h}, [sp], x7
+        ld1            {v20.8h}, [sp], x7
+        ld1            {v21.8h}, [sp], x7
+        ld1            {v22.8h}, [sp], x7
+1:      ld1            {v23.8h}, [sp], x7
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn
+        calc_qpelh2     v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x7
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn
+        calc_qpelh2     v1, v2, v17, v18, v19, v20, v21, v22, v23, v16, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x7
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn
+        calc_qpelh2     v1, v2, v18, v19, v20, v21, v22, v23, v16, v17, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x7
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn
+        calc_qpelh2     v1, v2, v19, v20, v21, v22, v23, v16, v17, v18, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v19.8h}, [sp], x7
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn
+        calc_qpelh2     v1, v2, v20, v21, v22, v23, v16, v17, v18, v19, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8h}, [sp], x7
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn
+        calc_qpelh2     v1, v2, v21, v22, v23, v16, v17, v18, v19, v20, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v21.8h}, [sp], x7
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn
+        calc_qpelh2     v1, v2, v22, v23, v16, v17, v18, v19, v20, v21, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8h}, [sp], x7
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn
+        calc_qpelh2     v1, v2, v23, v16, v17, v18, v19, v20, v21, v22, sqshrn2
+        st1            {v1.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
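+// hv8/hv12/hv16 follow the hv4 pattern at wider widths: intermediates are
+// held in register pairs, and 12-wide rows are stored as 8h + 4h with
+// x8 = 112 (16 + 112 = one 128 byte dst row).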
+function ff_hevc_put_hevc_qpel_hv12_8_neon, export=1
+        add             x10, x3, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        add             x3, x3, #7
+        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_qpel_filterh x5, x4
+        mov             x7, #128
+        mov             x8, #112
+        ld1            {v16.8h, v17.8h}, [sp], x7
+        ld1            {v18.8h, v19.8h}, [sp], x7
+        ld1            {v20.8h, v21.8h}, [sp], x7
+        ld1            {v22.8h, v23.8h}, [sp], x7
+        ld1            {v24.8h, v25.8h}, [sp], x7
+        ld1            {v26.8h, v27.8h}, [sp], x7
+        ld1            {v28.8h, v29.8h}, [sp], x7
+1:      ld1            {v30.8h, v31.8h}, [sp], x7
+        calc_qpelh      v1, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn
+        calc_qpelh2     v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn2
+        calc_qpelh      v2, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x7
+        calc_qpelh      v1, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn
+        calc_qpelh2     v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn2
+        calc_qpelh      v2, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x7
+        calc_qpelh      v1, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn
+        calc_qpelh2     v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn2
+        calc_qpelh      v2, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x7
+        calc_qpelh      v1, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn
+        calc_qpelh2     v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn2
+        calc_qpelh      v2, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8h, v23.8h}, [sp], x7
+        calc_qpelh      v1, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn
+        calc_qpelh2     v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn2
+        calc_qpelh      v2, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v24.8h, v25.8h}, [sp], x7
+        calc_qpelh      v1, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn
+        calc_qpelh2     v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn2
+        calc_qpelh      v2, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v26.8h, v27.8h}, [sp], x7
+        calc_qpelh      v1, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn
+        calc_qpelh2     v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn2
+        calc_qpelh      v2, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v28.8h, v29.8h}, [sp], x7
+        calc_qpelh      v1, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn
+        calc_qpelh2     v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn2
+        calc_qpelh      v2, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn
+        st1            {v1.8h}, [x0], #16
+        st1            {v2.4h}, [x0], x8
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_hv16_8_neon, export=1
+        add             x10, x3, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        add             x3, x3, #7
+        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_qpel_filterh x5, x4
+        mov             x7, #128
+        ld1            {v16.8h, v17.8h}, [sp], x7
+        ld1            {v18.8h, v19.8h}, [sp], x7
+        ld1            {v20.8h, v21.8h}, [sp], x7
+        ld1            {v22.8h, v23.8h}, [sp], x7
+        ld1            {v24.8h, v25.8h}, [sp], x7
+        ld1            {v26.8h, v27.8h}, [sp], x7
+        ld1            {v28.8h, v29.8h}, [sp], x7
+1:      ld1            {v30.8h, v31.8h}, [sp], x7
+        calc_qpelh      v1, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn
+        calc_qpelh2     v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn2
+        calc_qpelh      v2, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn
+        calc_qpelh2     v2, v3, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x7
+        calc_qpelh      v1, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn
+        calc_qpelh2     v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn2
+        calc_qpelh      v2, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn
+        calc_qpelh2     v2, v3, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x7
+        calc_qpelh      v1, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn
+        calc_qpelh2     v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn2
+        calc_qpelh      v2, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn
+        calc_qpelh2     v2, v3, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x7
+        calc_qpelh      v1, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn
+        calc_qpelh2     v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn2
+        calc_qpelh      v2, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn
+        calc_qpelh2     v2, v3, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v22.8h, v23.8h}, [sp], x7
+        calc_qpelh      v1, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn
+        calc_qpelh2     v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn2
+        calc_qpelh      v2, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn
+        calc_qpelh2     v2, v3, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v24.8h, v25.8h}, [sp], x7
+        calc_qpelh      v1, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn
+        calc_qpelh2     v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn2
+        calc_qpelh      v2, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn
+        calc_qpelh2     v2, v3, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v26.8h, v27.8h}, [sp], x7
+        calc_qpelh      v1, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn
+        calc_qpelh2     v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn2
+        calc_qpelh      v2, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn
+        calc_qpelh2     v2, v3, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v28.8h, v29.8h}, [sp], x7
+        calc_qpelh      v1, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn
+        calc_qpelh2     v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn2
+        calc_qpelh      v2, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn
+        calc_qpelh2     v2, v3, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn2
+        st1            {v1.8h, v2.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ret
+endfunc
+
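+// hv24 needs 24 SIMD registers for its three-register rows, so the
+// callee-saved v8-v15 are spilled to the stack first and restored at
+// label 2 once the vertical loop has consumed the whole tmp array.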
+function ff_hevc_put_hevc_qpel_hv24_8_neon, export=1
+        sub             sp, sp, #64
+        st1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+        sub             sp, sp, #64
+        st1            {v12.16b, v13.16b, v14.16b, v15.16b}, [sp]
+        add             x10, x3, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        add             x3, x3, #7
+        bl              X(ff_hevc_put_hevc_qpel_h24_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_qpel_filterh x5, x4
+        mov             x7, #128
+        ld1            {v8.8h, v9.8h, v10.8h}, [sp], x7
+        ld1            {v11.8h, v12.8h, v13.8h}, [sp], x7
+        ld1            {v14.8h, v15.8h, v16.8h}, [sp], x7
+        ld1            {v17.8h, v18.8h, v19.8h}, [sp], x7
+        ld1            {v20.8h, v21.8h, v22.8h}, [sp], x7
+        ld1            {v23.8h, v24.8h, v25.8h}, [sp], x7
+        ld1            {v26.8h, v27.8h, v28.8h}, [sp], x7
+1:      ld1            {v29.8h, v30.8h, v31.8h}, [sp], x7
+        calc_qpelh      v1, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn
+        calc_qpelh2     v1, v2, v8, v11, v14, v17, v20, v23, v26, v29, sqshrn2
+        calc_qpelh      v2, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn
+        calc_qpelh2     v2, v3, v9, v12, v15, v18, v21, v24, v27, v30, sqshrn2
+        calc_qpelh      v3, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn
+        calc_qpelh2     v3, v4, v10, v13, v16, v19, v22, v25, v28, v31, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v8.8h, v9.8h, v10.8h}, [sp], x7
+        calc_qpelh      v1, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn
+        calc_qpelh2     v1, v2, v11, v14, v17, v20, v23, v26, v29, v8, sqshrn2
+        calc_qpelh      v2, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn
+        calc_qpelh2     v2, v3, v12, v15, v18, v21, v24, v27, v30, v9, sqshrn2
+        calc_qpelh      v3, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn
+        calc_qpelh2     v3, v4, v13, v16, v19, v22, v25, v28, v31, v10, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v11.8h, v12.8h, v13.8h}, [sp], x7
+        calc_qpelh      v1, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn
+        calc_qpelh2     v1, v2, v14, v17, v20, v23, v26, v29, v8, v11, sqshrn2
+        calc_qpelh      v2, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn
+        calc_qpelh2     v2, v3, v15, v18, v21, v24, v27, v30, v9, v12, sqshrn2
+        calc_qpelh      v3, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn
+        calc_qpelh2     v3, v4, v16, v19, v22, v25, v28, v31, v10, v13, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v14.8h, v15.8h, v16.8h}, [sp], x7
+        calc_qpelh      v1, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn
+        calc_qpelh2     v1, v2, v17, v20, v23, v26, v29, v8, v11, v14, sqshrn2
+        calc_qpelh      v2, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn
+        calc_qpelh2     v2, v3, v18, v21, v24, v27, v30, v9, v12, v15, sqshrn2
+        calc_qpelh      v3, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn
+        calc_qpelh2     v3, v4, v19, v22, v25, v28, v31, v10, v13, v16, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v17.8h, v18.8h, v19.8h}, [sp], x7
+        calc_qpelh      v1, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn
+        calc_qpelh2     v1, v2, v20, v23, v26, v29, v8, v11, v14, v17, sqshrn2
+        calc_qpelh      v2, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn
+        calc_qpelh2     v2, v3, v21, v24, v27, v30, v9, v12, v15, v18, sqshrn2
+        calc_qpelh      v3, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn
+        calc_qpelh2     v3, v4, v22, v25, v28, v31, v10, v13, v16, v19, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h, v22.8h}, [sp], x7
+        calc_qpelh      v1, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn
+        calc_qpelh2     v1, v2, v23, v26, v29, v8, v11, v14, v17, v20, sqshrn2
+        calc_qpelh      v2, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn
+        calc_qpelh2     v2, v3, v24, v27, v30, v9, v12, v15, v18, v21, sqshrn2
+        calc_qpelh      v3, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn
+        calc_qpelh2     v3, v4, v25, v28, v31, v10, v13, v16, v19, v22, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v23.8h, v24.8h, v25.8h}, [sp], x7
+        calc_qpelh      v1, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn
+        calc_qpelh2     v1, v2, v26, v29, v8, v11, v14, v17, v20, v23, sqshrn2
+        calc_qpelh      v2, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn
+        calc_qpelh2     v2, v3, v27, v30, v9, v12, v15, v18, v21, v24, sqshrn2
+        calc_qpelh      v3, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn
+        calc_qpelh2     v3, v4, v28, v31, v10, v13, v16, v19, v22, v25, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.eq            2f
+
+        ld1            {v26.8h, v27.8h, v28.8h}, [sp], x7
+        calc_qpelh      v1, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn
+        calc_qpelh2     v1, v2, v29, v8, v11, v14, v17, v20, v23, v26, sqshrn2
+        calc_qpelh      v2, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn
+        calc_qpelh2     v2, v3, v30, v9, v12, v15, v18, v21, v24, v27, sqshrn2
+        calc_qpelh      v3, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn
+        calc_qpelh2     v3, v4, v31, v10, v13, v16, v19, v22, v25, v28, sqshrn2
+        st1            {v1.8h, v2.8h, v3.8h}, [x0], x7
+        subs            x3, x3, #1
+        b.hi            1b
+2:      ld1            {v12.16b, v13.16b, v14.16b, v15.16b}, [sp], #64
+        ld1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], #64
+        ret
+endfunc
+
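+// hv32 (also reused for hv64) loops over the width in 16 column stripes:
+// x9/x5/x8 hold per-stripe copies of height, dst and src. After the last
+// stripe, sp has advanced past part of the first tmp row, so the epilogue
+// frees the rest of that row plus the remaining (height + 6) rows.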
+function ff_hevc_put_hevc_qpel_hv32_8_neon, export=1
+        add             x10, x3, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x3, [sp, #-16]!
+        stp             x5, x30, [sp, #-16]!
+        add             x0, sp, #32
+        sub             x1, x1, x2, lsl #1
+        sub             x1, x1, x2
+        add             x3, x3, #7
+        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon)
+        ldp             x5, x30, [sp], #16
+        ldp             x0, x3, [sp], #16
+        load_qpel_filterh x5, x4
+        mov             x7, #128
+1:      mov             x9, x3          // height
+        mov             x5, x0          // dst
+        mov             x8, sp          // src
+
+        ld1            {v16.8h, v17.8h}, [x8], x7
+        ld1            {v18.8h, v19.8h}, [x8], x7
+        ld1            {v20.8h, v21.8h}, [x8], x7
+        ld1            {v22.8h, v23.8h}, [x8], x7
+        ld1            {v24.8h, v25.8h}, [x8], x7
+        ld1            {v26.8h, v27.8h}, [x8], x7
+        ld1            {v28.8h, v29.8h}, [x8], x7
+2:      ld1            {v30.8h, v31.8h}, [x8], x7
+        calc_qpelh      v1, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn
+        calc_qpelh2     v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqshrn2
+        calc_qpelh      v2, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn
+        calc_qpelh2     v2, v3, v17, v19, v21, v23, v25, v27, v29, v31, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.eq            3f
+
+        ld1            {v16.8h, v17.8h}, [x8], x7
+        calc_qpelh      v1, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn
+        calc_qpelh2     v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqshrn2
+        calc_qpelh      v2, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn
+        calc_qpelh2     v2, v3, v19, v21, v23, v25, v27, v29, v31, v17, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.eq            3f
+
+        ld1            {v18.8h, v19.8h}, [x8], x7
+        calc_qpelh      v1, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn
+        calc_qpelh2     v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqshrn2
+        calc_qpelh      v2, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn
+        calc_qpelh2     v2, v3, v21, v23, v25, v27, v29, v31, v17, v19, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.eq            3f
+
+        ld1            {v20.8h, v21.8h}, [x8], x7
+        calc_qpelh      v1, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn
+        calc_qpelh2     v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqshrn2
+        calc_qpelh      v2, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn
+        calc_qpelh2     v2, v3, v23, v25, v27, v29, v31, v17, v19, v21, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.eq            3f
+
+        ld1            {v22.8h, v23.8h}, [x8], x7
+        calc_qpelh      v1, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn
+        calc_qpelh2     v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqshrn2
+        calc_qpelh      v2, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn
+        calc_qpelh2     v2, v3, v25, v27, v29, v31, v17, v19, v21, v23, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.eq            3f
+
+        ld1            {v24.8h, v25.8h}, [x8], x7
+        calc_qpelh      v1, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn
+        calc_qpelh2     v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqshrn2
+        calc_qpelh      v2, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn
+        calc_qpelh2     v2, v3, v27, v29, v31, v17, v19, v21, v23, v25, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.eq            3f
+
+        ld1            {v26.8h, v27.8h}, [x8], x7
+        calc_qpelh      v1, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn
+        calc_qpelh2     v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqshrn2
+        calc_qpelh      v2, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn
+        calc_qpelh2     v2, v3, v29, v31, v17, v19, v21, v23, v25, v27, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.eq            3f
+
+        ld1            {v28.8h, v29.8h}, [x8], x7
+        calc_qpelh      v1, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn
+        calc_qpelh2     v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqshrn2
+        calc_qpelh      v2, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn
+        calc_qpelh2     v2, v3, v31, v17, v19, v21, v23, v25, v27, v29, sqshrn2
+        st1            {v1.8h, v2.8h}, [x5], x7
+        subs            x9, x9, #1
+        b.hi            2b
+
+3:      add             x0, x0, #32
+        add             sp, sp, #32
+        subs            x6, x6, #16
+        b.hi            1b
+
+        add             sp, sp, #64          // discard rest of first line
+        add             x10, x3, #6
+        lsl             x10, x10, #7
+        add             sp, sp, x10         // tmp_array without first line
+        ret
+endfunc
+
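+// hv48 is composed of two hv24 calls offset by 24 pixels; xzr is stored
+// as padding so the stp/ldp pairs keep the stack 16 byte aligned.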
+function ff_hevc_put_hevc_qpel_hv48_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #48
+        add             x1, x1, #24
+        bl              X(ff_hevc_put_hevc_qpel_hv24_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
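+// hv64: two hv32 calls, each handling a 32 pixel wide half (x6 = width).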
+function ff_hevc_put_hevc_qpel_hv64_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        mov             x6, #32
+        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #64
+        add             x1, x1, #32
+        mov             x6, #32
+        bl              X(ff_hevc_put_hevc_qpel_hv32_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
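+// The uni_h filters convolve horizontally and write clipped pixels
+// directly (x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
+// x4 = height, x5 = mx, per the hevcdsp uni prototype). The 8-tap filter
+// is applied with umlal/umlsl against the unsigned coefficient bytes from
+// load_qpel_filterb; the sign of each tap is baked into the choice of
+// umlal vs umlsl. The source window slides one byte at a time via ushr
+// plus a lane insert from v17, and sqrshrun by 6 narrows the 16 bit
+// accumulator back to pixels.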
+function ff_hevc_put_hevc_qpel_uni_h4_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[0]
+        umlal           v20.8h, v16.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[1]
+        umlsl           v20.8h, v16.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[2]
+        umlal           v20.8h, v16.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[3]
+        umlal           v20.8h, v16.8b, v4.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[4]
+        umlsl           v20.8h, v16.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[5]
+        umlal           v20.8h, v16.8b, v6.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[6]
+        umlsl           v20.8h, v16.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        st1            {v20.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h6_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+        sub             x1, x1, #4
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[0]
+        umlal           v20.8h, v16.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[1]
+        umlsl           v20.8h, v16.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[2]
+        umlal           v20.8h, v16.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[3]
+        umlal           v20.8h, v16.8b, v4.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[4]
+        umlsl           v20.8h, v16.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[5]
+        umlal           v20.8h, v16.8b, v6.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[6]
+        umlsl           v20.8h, v16.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        st1            {v20.s}[0], [x0], #4
+        st1            {v20.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h8_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[0]
+        umlal           v20.8h, v16.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[1]
+        umlsl           v20.8h, v16.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[2]
+        umlal           v20.8h, v16.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[3]
+        umlal           v20.8h, v16.8b, v4.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[4]
+        umlsl           v20.8h, v16.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[5]
+        umlal           v20.8h, v16.8b, v6.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[6]
+        umlsl           v20.8h, v16.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        st1            {v20.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
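+// uni_h12 and uni_h16 de-interleave the source with ld2 so the even and
+// odd phases are filtered in separate accumulators (v20/v21); the bytes
+// shifted out of the window are refilled from x12, which holds the next
+// source bytes and is consumed 8 bits at a time with lsr.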
+function ff_hevc_put_hevc_qpel_uni_h12_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+        sub             x1, x1, #8
+1:      ld2            {v16.8b, v17.8b}, [x2]
+        ldr             w12, [x2, #16]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v16.8b, v1.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v20.8h, v16.8b, v2.8b
+        umlal           v20.8h, v17.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        umlsl           v21.8h, v17.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        ushr            v17.2d, v17.2d, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        ushr            v17.2d, v17.2d, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        zip1            v16.8h, v20.8h, v21.8h
+        zip2            v17.8h, v20.8h, v21.8h
+        sqrshrun        v20.8b, v16.8h, #6
+        sqrshrun2       v20.16b, v17.8h, #6
+        st1            {v20.8b}, [x0], #8
+        st1            {v20.s}[2], [x0], x1
+        add             x2, x2, x3
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h16_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld2            {v16.8b, v17.8b}, [x2]
+        ldr             x12, [x2, #16]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v16.8b, v1.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v20.8h, v16.8b, v2.8b
+        umlal           v20.8h, v17.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        sqrshrun        v21.8b, v21.8h, #6
+        st2            {v20.8b, v21.8b}, [x0], x1
+        add             x2, x2, x3
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
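+// uni_h24: same scheme with a 3-way ld3 de-interleave and three
+// accumulators.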
+function ff_hevc_put_hevc_qpel_uni_h24_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld3            {v16.8b, v17.8b, v18.8b}, [x2]
+        ldr             x12, [x2, #24]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v16.8b, v2.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v16.8b, v1.8b
+        umlsl           v22.8h, v17.8b, v2.8b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v3.8b
+        umlal           v20.8h, v17.8b, v4.8b
+        umlsl           v20.8h, v18.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v21.8h, v17.8b, v3.8b
+        umlal           v21.8h, v18.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v22.8h, v18.8b, v3.8b
+        umlal           v22.8h, v16.8b, v4.8b
+        umlsl           v22.8h, v17.8b, v5.8b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v18.8b, v7.8b
+        umlal           v22.8h, v18.8b, v6.8b
+        umlsl           v22.8h, v16.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        sqrshrun        v21.8b, v21.8h, #6
+        sqrshrun        v22.8b, v22.8h, #6
+        st3            {v20.8b, v21.8b, v22.8b}, [x0], x1
+        add             x2, x2, x3
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
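+// uni_h32: 4-way ld4 de-interleave and four accumulators.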
+function ff_hevc_put_hevc_qpel_uni_h32_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld4            {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
+        ldr             x12, [x2, #32]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        umlal           v20.8h, v19.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v19.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v19.8b, v1.8b
+        umlsl           v22.8h, v16.8b, v2.8b
+        umlal           v22.8h, v17.8b, v3.8b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v23.8h, v19.8b, v0.8b
+        umlal           v23.8h, v16.8b, v1.8b
+        umlsl           v23.8h, v17.8b, v2.8b
+        umlal           v23.8h, v18.8b, v3.8b
+        ushr            v19.2d, v19.2d, #8
+        mov             v19.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        umlal           v20.8h, v18.8b, v6.8b
+        umlsl           v20.8h, v19.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v18.8b, v5.8b
+        umlal           v21.8h, v19.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v22.8h, v18.8b, v4.8b
+        umlsl           v22.8h, v19.8b, v5.8b
+        umlal           v22.8h, v16.8b, v6.8b
+        umlsl           v22.8h, v17.8b, v7.8b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        umlal           v23.8h, v19.8b, v4.8b
+        umlsl           v23.8h, v16.8b, v5.8b
+        umlal           v23.8h, v17.8b, v6.8b
+        umlsl           v23.8h, v18.8b, v7.8b
+        sqrshrun        v20.8b, v20.8h, #6
+        sqrshrun        v21.8b, v21.8h, #6
+        sqrshrun        v22.8b, v22.8h, #6
+        sqrshrun        v23.8b, v23.8h, #6
+        st4            {v20.8b, v21.8b, v22.8b, v23.8b}, [x0], x1
+        add             x2, x2, x3
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
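+// uni_h48 and uni_h64 use the 16 byte ld3/ld4 variants, filtering the low
+// and high halves with umlal/umlsl and umlal2/umlsl2 respectively; two
+// spill registers feed the lane inserts (x12 into b[7] for the low half,
+// x13 into b[15] for the high one).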
+function ff_hevc_put_hevc_qpel_uni_h48_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld3            {v16.16b, v17.16b, v18.16b}, [x2]
+        ldr             x12, [x2, #24]
+        ldr             x13, [x2, #48]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        umlsl2          v23.8h, v16.16b, v0.16b
+        umlal2          v23.8h, v17.16b, v1.16b
+        umlsl2          v23.8h, v18.16b, v2.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        mov             v16.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v16.8b, v2.8b
+        umlsl2          v24.8h, v17.16b, v0.16b
+        umlal2          v24.8h, v18.16b, v1.16b
+        umlsl2          v24.8h, v16.16b, v2.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        mov             v17.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v16.8b, v1.8b
+        umlsl           v22.8h, v17.8b, v2.8b
+        umlsl2          v25.8h, v18.16b, v0.16b
+        umlal2          v25.8h, v16.16b, v1.16b
+        umlsl2          v25.8h, v17.16b, v2.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        mov             v18.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v20.8h, v16.8b, v3.8b
+        umlal           v20.8h, v17.8b, v4.8b
+        umlsl           v20.8h, v18.8b, v5.8b
+        umlal2          v23.8h, v16.16b, v3.16b
+        umlal2          v23.8h, v17.16b, v4.16b
+        umlsl2          v23.8h, v18.16b, v5.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        mov             v16.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v21.8h, v17.8b, v3.8b
+        umlal           v21.8h, v18.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        umlal2          v24.8h, v17.16b, v3.16b
+        umlal2          v24.8h, v18.16b, v4.16b
+        umlsl2          v24.8h, v16.16b, v5.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        mov             v17.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v22.8h, v18.8b, v3.8b
+        umlal           v22.8h, v16.8b, v4.8b
+        umlsl           v22.8h, v17.8b, v5.8b
+        umlal2          v25.8h, v18.16b, v3.16b
+        umlal2          v25.8h, v16.16b, v4.16b
+        umlsl2          v25.8h, v17.16b, v5.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        mov             v18.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        umlal2          v23.8h, v16.16b, v6.16b
+        umlsl2          v23.8h, v17.16b, v7.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        mov             v16.b[15], w13
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v18.8b, v7.8b
+        umlal2          v24.8h, v17.16b, v6.16b
+        umlsl2          v24.8h, v18.16b, v7.16b
+        umlal           v22.8h, v18.8b, v6.8b
+        umlsl           v22.8h, v16.8b, v7.8b
+        umlal2          v25.8h, v18.16b, v6.16b
+        umlsl2          v25.8h, v16.16b, v7.16b
+        sqrshrun        v20.8b, v20.8h, #6
+        sqrshrun        v21.8b, v21.8h, #6
+        sqrshrun        v22.8b, v22.8h, #6
+        sqrshrun2       v20.16b, v23.8h, #6
+        sqrshrun2       v21.16b, v24.8h, #6
+        sqrshrun2       v22.16b, v25.8h, #6
+        st3            {v20.16b, v21.16b, v22.16b}, [x0], x1
+        add             x2, x2, x3
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_h64_8_neon, export=1
+        load_qpel_filterb x5, x6
+        sub             x2, x2, #3
+1:      ld4            {v16.16b, v17.16b, v18.16b, v19.16b}, [x2]
+        ldr             x12, [x2, #32]
+        ldr             x13, [x2, #64]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        movi            v26.8h, #0
+        movi            v27.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        umlal           v20.8h, v19.8b, v3.8b
+        umlsl2          v24.8h, v16.16b, v0.16b
+        umlal2          v24.8h, v17.16b, v1.16b
+        umlsl2          v24.8h, v18.16b, v2.16b
+        umlal2          v24.8h, v19.16b, v3.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        mov             v16.b[15], w13
+        lsr             x13, x13, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v19.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        umlsl2          v25.8h, v17.16b, v0.16b
+        umlal2          v25.8h, v18.16b, v1.16b
+        umlsl2          v25.8h, v19.16b, v2.16b
+        umlal2          v25.8h, v16.16b, v3.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        mov             v17.b[15], w13
+        lsr             x13, x13, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v19.8b, v1.8b
+        umlsl           v22.8h, v16.8b, v2.8b
+        umlal           v22.8h, v17.8b, v3.8b
+        umlsl2          v26.8h, v18.16b, v0.16b
+        umlal2          v26.8h, v19.16b, v1.16b
+        umlsl2          v26.8h, v16.16b, v2.16b
+        umlal2          v26.8h, v17.16b, v3.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        mov             v18.b[15], w13
+        lsr             x13, x13, #8
+        umlsl           v23.8h, v19.8b, v0.8b
+        umlal           v23.8h, v16.8b, v1.8b
+        umlsl           v23.8h, v17.8b, v2.8b
+        umlal           v23.8h, v18.8b, v3.8b
+        umlsl2          v27.8h, v19.16b, v0.16b
+        umlal2          v27.8h, v16.16b, v1.16b
+        umlsl2          v27.8h, v17.16b, v2.16b
+        umlal2          v27.8h, v18.16b, v3.16b
+        ushr            v19.2d, v19.2d, #8
+        mov             v19.b[7], w12
+        lsr             x12, x12, #8
+        mov             v19.b[15], w13
+        lsr             x13, x13, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        umlal           v20.8h, v18.8b, v6.8b
+        umlsl           v20.8h, v19.8b, v7.8b
+        umlal2          v24.8h, v16.16b, v4.16b
+        umlsl2          v24.8h, v17.16b, v5.16b
+        umlal2          v24.8h, v18.16b, v6.16b
+        umlsl2          v24.8h, v19.16b, v7.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        mov             v16.b[15], w13
+        lsr             x13, x13, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v18.8b, v5.8b
+        umlal           v21.8h, v19.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        umlal2          v25.8h, v17.16b, v4.16b
+        umlsl2          v25.8h, v18.16b, v5.16b
+        umlal2          v25.8h, v19.16b, v6.16b
+        umlsl2          v25.8h, v16.16b, v7.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        mov             v17.b[15], w13
+        lsr             x13, x13, #8
+        umlal           v22.8h, v18.8b, v4.8b
+        umlsl           v22.8h, v19.8b, v5.8b
+        umlal           v22.8h, v16.8b, v6.8b
+        umlsl           v22.8h, v17.8b, v7.8b
+        umlal2          v26.8h, v18.16b, v4.16b
+        umlsl2          v26.8h, v19.16b, v5.16b
+        umlal2          v26.8h, v16.16b, v6.16b
+        umlsl2          v26.8h, v17.16b, v7.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        mov             v18.b[15], w13
+        umlal           v23.8h, v19.8b, v4.8b
+        umlsl           v23.8h, v16.8b, v5.8b
+        umlal           v23.8h, v17.8b, v6.8b
+        umlsl           v23.8h, v18.8b, v7.8b
+        umlal2          v27.8h, v19.16b, v4.16b
+        umlsl2          v27.8h, v16.16b, v5.16b
+        umlal2          v27.8h, v17.16b, v6.16b
+        umlsl2          v27.8h, v18.16b, v7.16b
+        sqrshrun        v20.8b, v20.8h, #6
+        sqrshrun        v21.8b, v21.8h, #6
+        sqrshrun        v22.8b, v22.8h, #6
+        sqrshrun        v23.8b, v23.8h, #6
+        sqrshrun2       v20.16b, v24.8h, #6
+        sqrshrun2       v21.16b, v25.8h, #6
+        sqrshrun2       v22.16b, v26.8h, #6
+        sqrshrun2       v23.16b, v27.8h, #6
+        st4            {v20.16b, v21.16b, v22.16b, v23.16b}, [x0], x1
+        add             x2, x2, x3
+        subs            x4, x4, #1
+        b.ne            1b
+        ret
+endfunc
+
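+// The uni_v filters rewind src by 3 * srcstride and preload 7 rows; each
+// unrolled step then loads one new row, applies the vertical 8-tap filter
+// with calc_qpelb over a rotating window of eight registers, and stores
+// one clipped row. x6 = my selects the filter.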
+function ff_hevc_put_hevc_qpel_uni_v4_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        ld1            {v16.s}[0], [x2], x3
+        ld1            {v17.s}[0], [x2], x3
+        ld1            {v18.s}[0], [x2], x3
+        ld1            {v19.s}[0], [x2], x3
+        ld1            {v20.s}[0], [x2], x3
+        ld1            {v21.s}[0], [x2], x3
+        ld1            {v22.s}[0], [x2], x3
+1:      ld1            {v23.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v21.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
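+
+// The vertical loops are unrolled eight times: each step reloads only the
+// oldest of the eight row registers (v16-v23) and rotates the calc_qpelb
+// argument order, avoiding any register-to-register shuffling.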
+
+function ff_hevc_put_hevc_qpel_uni_v6_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        sub             x1, x1, #4
+        ld1            {v16.8b}, [x2], x3
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+        ld1            {v19.8b}, [x2], x3
+        ld1            {v20.8b}, [x2], x3
+        ld1            {v21.8b}, [x2], x3
+        ld1            {v22.8b}, [x2], x3
+1:      ld1            {v23.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v21.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.s}[0], [x0], #4
+        st1            {v24.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
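+
+// Width 6 is stored as 4 + 2 bytes per row (s[0], then h[2]); x1 was
+// reduced by 4 at entry so the second store's post-increment still lands
+// on the start of the next output row.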
+
+function ff_hevc_put_hevc_qpel_uni_v8_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        ld1            {v16.8b}, [x2], x3
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+        ld1            {v19.8b}, [x2], x3
+        ld1            {v20.8b}, [x2], x3
+        ld1            {v21.8b}, [x2], x3
+        ld1            {v22.8b}, [x2], x3
+1:      ld1            {v23.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v21.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        sqrshrun        v24.8b, v24.8h, #6
+        st1            {v24.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v12_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        sub             x1, x1, #8
+1:      mov             x11, x4         // height
+        mov             x10, x0         // dst
+        mov             x8, x2          // src
+
+        ld1            {v16.16b}, [x8], x3
+        ld1            {v17.16b}, [x8], x3
+        ld1            {v18.16b}, [x8], x3
+        ld1            {v19.16b}, [x8], x3
+        ld1            {v20.16b}, [x8], x3
+        ld1            {v21.16b}, [x8], x3
+        ld1            {v22.16b}, [x8], x3
+2:      ld1            {v23.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v25, v16, v17, v18, v19, v20, v21, v22, v23
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v16.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        calc_qpelb2     v25, v17, v18, v19, v20, v21, v22, v23, v16
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v17.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        calc_qpelb2     v25, v18, v19, v20, v21, v22, v23, v16, v17
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v18.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        calc_qpelb2     v25, v19, v20, v21, v22, v23, v16, v17, v18
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v19.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        calc_qpelb2     v25, v20, v21, v22, v23, v16, v17, v18, v19
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v20.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        calc_qpelb2     v25, v21, v22, v23, v16, v17, v18, v19, v20
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v21.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        calc_qpelb2     v25, v22, v23, v16, v17, v18, v19, v20, v21
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v22.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        calc_qpelb2     v25, v23, v16, v17, v18, v19, v20, v21, v22
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.8b}, [x10], #8
+        st1            {v24.s}[2], [x10], x1
+        subs            x11, x11, #1
+        b.hi            2b
+
+3:      add             x0, x0, #12
+        add             x2, x2, #12
+        subs            x7, x7, #12
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
+        load_qpel_filterb x6, x5
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+1:      mov             x11, x4         // height
+        mov             x10, x0         // dst
+        mov             x8, x2          // src
+
+        ld1            {v16.16b}, [x8], x3
+        ld1            {v17.16b}, [x8], x3
+        ld1            {v18.16b}, [x8], x3
+        ld1            {v19.16b}, [x8], x3
+        ld1            {v20.16b}, [x8], x3
+        ld1            {v21.16b}, [x8], x3
+        ld1            {v22.16b}, [x8], x3
+2:      ld1            {v23.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v25, v16, v17, v18, v19, v20, v21, v22, v23
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v16.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        calc_qpelb2     v25, v17, v18, v19, v20, v21, v22, v23, v16
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v17.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        calc_qpelb2     v25, v18, v19, v20, v21, v22, v23, v16, v17
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v18.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        calc_qpelb2     v25, v19, v20, v21, v22, v23, v16, v17, v18
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v19.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        calc_qpelb2     v25, v20, v21, v22, v23, v16, v17, v18, v19
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v20.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        calc_qpelb2     v25, v21, v22, v23, v16, v17, v18, v19, v20
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v21.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        calc_qpelb2     v25, v22, v23, v16, v17, v18, v19, v20, v21
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v22.16b}, [x8], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        calc_qpelb2     v25, v23, v16, v17, v18, v19, v20, v21, v22
+        sqrshrun        v24.8b, v24.8h, #6
+        sqrshrun2       v24.16b, v25.8h, #6
+        st1            {v24.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.hi            2b
+
+3:      add             x0, x0, #16
+        add             x2, x2, #16
+        subs            x7, x7, #16
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
+        b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
+endfunc
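+
+// Widths 24, 32, 48 and 64 are handled by the 12- and 16-column versions
+// above, which loop over the full width in x7 (12 or 16 columns per pass).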
+
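+// The uni_hv versions work in two passes: the matching h filter writes
+// height + 7 rows of 16-bit intermediates into a stack tmp_array (row
+// stride MAX_PB_SIZE * 2), then the vertical 8-tap pass reads them back
+// and narrows with a #12 shift (6 fractional bits from each pass). The
+// relevant arguments and the return address are saved across the bl.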
+function ff_hevc_put_hevc_qpel_uni_hv4_8_neon, export=1
+        add             x10, x4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x30, xzr, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon)
+        ldp             x30, xzr, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x6, x5
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.4h}, [sp], x9
+        ld1            {v17.4h}, [sp], x9
+        ld1            {v18.4h}, [sp], x9
+        ld1            {v19.4h}, [sp], x9
+        ld1            {v20.4h}, [sp], x9
+        ld1            {v21.4h}, [sp], x9
+        ld1            {v22.4h}, [sp], x9
+1:      ld1            {v23.4h}, [sp], x9
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.4h}, [sp], x9
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.4h}, [sp], x9
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.4h}, [sp], x9
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.4h}, [sp], x9
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.4h}, [sp], x9
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v21.4h}, [sp], x9
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.4h}, [sp], x9
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv6_8_neon, export=1
+        add             x10, x4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x30, xzr, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon)
+        ldp             x30, xzr, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x6, x5
+        sub             x1, x1, #4
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h}, [sp], x9
+        ld1            {v17.8h}, [sp], x9
+        ld1            {v18.8h}, [sp], x9
+        ld1            {v19.8h}, [sp], x9
+        ld1            {v20.8h}, [sp], x9
+        ld1            {v21.8h}, [sp], x9
+        ld1            {v22.8h}, [sp], x9
+1:      ld1            {v23.8h}, [sp], x9
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn, #12
+        calc_qpelh2     v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x9
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn, #12
+        calc_qpelh2     v1, v2, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x9
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn, #12
+        calc_qpelh2     v1, v2, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x9
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn, #12
+        calc_qpelh2     v1, v2, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.8h}, [sp], x9
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn, #12
+        calc_qpelh2     v1, v2, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.8h}, [sp], x9
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn, #12
+        calc_qpelh2     v1, v2, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v21.8h}, [sp], x9
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn, #12
+        calc_qpelh2     v1, v2, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.8h}, [sp], x9
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn, #12
+        calc_qpelh2     v1, v2, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x4, x4, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv8_8_neon, export=1
+        add             x10, x4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x30, xzr, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon)
+        ldp             x30, xzr, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x6, x5
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h}, [sp], x9
+        ld1            {v17.8h}, [sp], x9
+        ld1            {v18.8h}, [sp], x9
+        ld1            {v19.8h}, [sp], x9
+        ld1            {v20.8h}, [sp], x9
+        ld1            {v21.8h}, [sp], x9
+        ld1            {v22.8h}, [sp], x9
+1:      ld1            {v23.8h}, [sp], x9
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn, #12
+        calc_qpelh2     v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x9
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn, #12
+        calc_qpelh2     v1, v2, v17, v18, v19, v20, v21, v22, v23, v16, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x9
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn, #12
+        calc_qpelh2     v1, v2, v18, v19, v20, v21, v22, v23, v16, v17, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x9
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn, #12
+        calc_qpelh2     v1, v2, v19, v20, v21, v22, v23, v16, v17, v18, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v19.8h}, [sp], x9
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn, #12
+        calc_qpelh2     v1, v2, v20, v21, v22, v23, v16, v17, v18, v19, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.8h}, [sp], x9
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn, #12
+        calc_qpelh2     v1, v2, v21, v22, v23, v16, v17, v18, v19, v20, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v21.8h}, [sp], x9
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn, #12
+        calc_qpelh2     v1, v2, v22, v23, v16, v17, v18, v19, v20, v21, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.8h}, [sp], x9
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn, #12
+        calc_qpelh2     v1, v2, v23, v16, v17, v18, v19, v20, v21, v22, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x4, x4, #1
+        b.hi            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv12_8_neon, export=1
+        add             x10, x4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h12_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x6, x5
+        sub             x1, x1, #8
+        mov             x9, #(MAX_PB_SIZE * 2)
+
+        ld1            {v16.8h, v17.8h}, [sp], x9
+        ld1            {v18.8h, v19.8h}, [sp], x9
+        ld1            {v20.8h, v21.8h}, [sp], x9
+        ld1            {v22.8h, v23.8h}, [sp], x9
+        ld1            {v24.8h, v25.8h}, [sp], x9
+        ld1            {v26.8h, v27.8h}, [sp], x9
+        ld1            {v28.8h, v29.8h}, [sp], x9
+1:      ld1            {v30.8h, v31.8h}, [sp], x9
+        calc_qpelh      v1, v16, v18, v20, v22, v24, v26, v28, v30, sqrshrn, #12
+        calc_qpelh2     v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqrshrn2, #12
+        calc_qpelh      v2, v17, v19, v21, v23, v25, v27, v29, v31, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v16.8h, v17.8h}, [sp], x9
+        calc_qpelh      v1, v18, v20, v22, v24, v26, v28, v30, v16, sqrshrn, #12
+        calc_qpelh2     v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqrshrn2, #12
+        calc_qpelh      v2, v19, v21, v23, v25, v27, v29, v31, v17, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v18.8h, v19.8h}, [sp], x9
+        calc_qpelh      v1, v20, v22, v24, v26, v28, v30, v16, v18, sqrshrn, #12
+        calc_qpelh2     v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqrshrn2, #12
+        calc_qpelh      v2, v21, v23, v25, v27, v29, v31, v17, v19, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v20.8h, v21.8h}, [sp], x9
+        calc_qpelh      v1, v22, v24, v26, v28, v30, v16, v18, v20, sqrshrn, #12
+        calc_qpelh2     v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqrshrn2, #12
+        calc_qpelh      v2, v23, v25, v27, v29, v31, v17, v19, v21, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v22.8h, v23.8h}, [sp], x9
+        calc_qpelh      v1, v24, v26, v28, v30, v16, v18, v20, v22, sqrshrn, #12
+        calc_qpelh2     v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqrshrn2, #12
+        calc_qpelh      v2, v25, v27, v29, v31, v17, v19, v21, v23, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v24.8h, v25.8h}, [sp], x9
+        calc_qpelh      v1, v26, v28, v30, v16, v18, v20, v22, v24, sqrshrn, #12
+        calc_qpelh2     v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqrshrn2, #12
+        calc_qpelh      v2, v27, v29, v31, v17, v19, v21, v23, v25, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v26.8h, v27.8h}, [sp], x9
+        calc_qpelh      v1, v28, v30, v16, v18, v20, v22, v24, v26, sqrshrn, #12
+        calc_qpelh2     v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqrshrn2, #12
+        calc_qpelh      v2, v29, v31, v17, v19, v21, v23, v25, v27, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.eq            2f
+
+        ld1            {v28.8h, v29.8h}, [sp], x9
+        calc_qpelh      v1, v30, v16, v18, v20, v22, v24, v26, v28, sqrshrn, #12
+        calc_qpelh2     v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqrshrn2, #12
+        calc_qpelh      v2, v31, v17, v19, v21, v23, v25, v27, v29, sqrshrn, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.8b}, [x0], #8
+        st1            {v1.s}[2], [x0], x1
+        subs            x4, x4, #1
+        b.ne            1b
+2:      ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv16_8_neon, export=1
+        add             x10, x4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
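+// Shared vertical loop for the widths >= 16: hv32/48/64 run their own
+// horizontal pass and branch here. x12 = MAX_PB_SIZE * 2 - 2 * width is
+// the unread tail of a tmp row, consumed on exit below.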
+.Lqpel_uni_hv16_loop:
+        load_qpel_filterh x6, x5
+        mov             x9, #(MAX_PB_SIZE * 2)
+        sub             x12, x9, x7, lsl #1
+1:      mov             x11, x4         // height
+        mov             x10, x0         // dst
+        mov             x8, sp          // src
+
+        ld1            {v16.8h, v17.8h}, [x8], x9
+        ld1            {v18.8h, v19.8h}, [x8], x9
+        ld1            {v20.8h, v21.8h}, [x8], x9
+        ld1            {v22.8h, v23.8h}, [x8], x9
+        ld1            {v24.8h, v25.8h}, [x8], x9
+        ld1            {v26.8h, v27.8h}, [x8], x9
+        ld1            {v28.8h, v29.8h}, [x8], x9
+2:      ld1            {v30.8h, v31.8h}, [x8], x9
+        calc_qpelh      v1, v16, v18, v20, v22, v24, v26, v28, v30, sqrshrn, #12
+        calc_qpelh2     v1, v2, v16, v18, v20, v22, v24, v26, v28, v30, sqrshrn2, #12
+        calc_qpelh      v2, v17, v19, v21, v23, v25, v27, v29, v31, sqrshrn, #12
+        calc_qpelh2     v2, v3, v17, v19, v21, v23, v25, v27, v29, v31, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v16.8h, v17.8h}, [x8], x9
+        calc_qpelh      v1, v18, v20, v22, v24, v26, v28, v30, v16, sqrshrn, #12
+        calc_qpelh2     v1, v2, v18, v20, v22, v24, v26, v28, v30, v16, sqrshrn2, #12
+        calc_qpelh      v2, v19, v21, v23, v25, v27, v29, v31, v17, sqrshrn, #12
+        calc_qpelh2     v2, v3, v19, v21, v23, v25, v27, v29, v31, v17, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v18.8h, v19.8h}, [x8], x9
+        calc_qpelh      v1, v20, v22, v24, v26, v28, v30, v16, v18, sqrshrn, #12
+        calc_qpelh2     v1, v2, v20, v22, v24, v26, v28, v30, v16, v18, sqrshrn2, #12
+        calc_qpelh      v2, v21, v23, v25, v27, v29, v31, v17, v19, sqrshrn, #12
+        calc_qpelh2     v2, v3, v21, v23, v25, v27, v29, v31, v17, v19, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v20.8h, v21.8h}, [x8], x9
+        calc_qpelh      v1, v22, v24, v26, v28, v30, v16, v18, v20, sqrshrn, #12
+        calc_qpelh2     v1, v2, v22, v24, v26, v28, v30, v16, v18, v20, sqrshrn2, #12
+        calc_qpelh      v2, v23, v25, v27, v29, v31, v17, v19, v21, sqrshrn, #12
+        calc_qpelh2     v2, v3, v23, v25, v27, v29, v31, v17, v19, v21, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v22.8h, v23.8h}, [x8], x9
+        calc_qpelh      v1, v24, v26, v28, v30, v16, v18, v20, v22, sqrshrn, #12
+        calc_qpelh2     v1, v2, v24, v26, v28, v30, v16, v18, v20, v22, sqrshrn2, #12
+        calc_qpelh      v2, v25, v27, v29, v31, v17, v19, v21, v23, sqrshrn, #12
+        calc_qpelh2     v2, v3, v25, v27, v29, v31, v17, v19, v21, v23, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v24.8h, v25.8h}, [x8], x9
+        calc_qpelh      v1, v26, v28, v30, v16, v18, v20, v22, v24, sqrshrn, #12
+        calc_qpelh2     v1, v2, v26, v28, v30, v16, v18, v20, v22, v24, sqrshrn2, #12
+        calc_qpelh      v2, v27, v29, v31, v17, v19, v21, v23, v25, sqrshrn, #12
+        calc_qpelh2     v2, v3, v27, v29, v31, v17, v19, v21, v23, v25, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v26.8h, v27.8h}, [x8], x9
+        calc_qpelh      v1, v28, v30, v16, v18, v20, v22, v24, v26, sqrshrn, #12
+        calc_qpelh2     v1, v2, v28, v30, v16, v18, v20, v22, v24, v26, sqrshrn2, #12
+        calc_qpelh      v2, v29, v31, v17, v19, v21, v23, v25, v27, sqrshrn, #12
+        calc_qpelh2     v2, v3, v29, v31, v17, v19, v21, v23, v25, v27, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v28.8h, v29.8h}, [x8], x9
+        calc_qpelh      v1, v30, v16, v18, v20, v22, v24, v26, v28, sqrshrn, #12
+        calc_qpelh2     v1, v2, v30, v16, v18, v20, v22, v24, v26, v28, sqrshrn2, #12
+        calc_qpelh      v2, v31, v17, v19, v21, v23, v25, v27, v29, sqrshrn, #12
+        calc_qpelh2     v2, v3, v31, v17, v19, v21, v23, v25, v27, v29, sqrshrn2, #12
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.hi            2b
+
+3:      add             x0, x0, #16
+        add             sp, sp, #32
+        subs            x7, x7, #16
+        b.ne            1b
+        add             sp, sp, x12         // discard rest of first line
+        add             x10, x4, #6
+        lsl             x10, x10, #7
+        add             sp, sp, x10         // tmp_array without first line
+        ret
+endfunc
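+
+// The tmp_array is released in pieces: the column loop advanced sp by
+// 2 * width, x12 covers the rest of the first row, and (height + 6) * 128
+// frees the remaining rows, matching the (height + 7) * 128 allocation.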
+
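+// Width 24 is decomposed into one 16-wide uni_hv pass plus one 8-wide
+// pass at column 16, re-running the caller's arguments from the stack.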
+function ff_hevc_put_hevc_qpel_uni_hv24_8_neon, export=1
+        stp             x6, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        mov             x7, #16
+        bl              X(ff_hevc_put_hevc_qpel_uni_hv16_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        ldr             x6, [sp]        // peek at the saved my without popping
+        add             x0, x0, #16
+        add             x2, x2, #16
+        mov             x7, #8
+        bl              X(ff_hevc_put_hevc_qpel_uni_hv8_8_neon)
+        ldp             xzr, x30, [sp], #16 // drop the my slot, restore lr
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv32_8_neon, export=1
+        add             x10, x4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        b .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv48_8_neon, export=1
+        add             x10, x4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h48_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        b .Lqpel_uni_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_hv64_8_neon, export=1
+        add             x10, x4, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x6, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x4, #7
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_qpel_h64_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x6, [sp], #16
+        ldp             x0, x1, [sp], #16
+        b .Lqpel_uni_hv16_loop
+endfunc
+
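+// The bi variants additionally read one row of 16-bit values from src2
+// (x4, row stride MAX_PB_SIZE * 2) per output row, combine it with the
+// filtered result via sqadd and narrow with a #7 rounding shift. Here
+// x4 is src2, w5 height, x6 mx and x7 my.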
+function ff_hevc_put_hevc_qpel_bi_h4_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[0]
+        umlal           v20.8h, v16.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[1]
+        umlsl           v20.8h, v16.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[2]
+        umlal           v20.8h, v16.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[3]
+        umlal           v20.8h, v16.8b, v4.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[4]
+        umlsl           v20.8h, v16.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[5]
+        umlal           v20.8h, v16.8b, v6.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[6]
+        umlsl           v20.8h, v16.8b, v7.8b
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        st1            {v16.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h6_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        sub             x1, x1, #4
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[0]
+        umlal           v20.8h, v16.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[1]
+        umlsl           v20.8h, v16.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[2]
+        umlal           v20.8h, v16.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[3]
+        umlal           v20.8h, v16.8b, v4.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[4]
+        umlsl           v20.8h, v16.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[5]
+        umlal           v20.8h, v16.8b, v6.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[6]
+        umlsl           v20.8h, v16.8b, v7.8b
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        st1            {v16.s}[0], [x0], #4
+        st1            {v16.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h8_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld1            {v16.8b, v17.8b}, [x2], x3
+        movi            v20.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[0]
+        umlal           v20.8h, v16.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[1]
+        umlsl           v20.8h, v16.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[2]
+        umlal           v20.8h, v16.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[3]
+        umlal           v20.8h, v16.8b, v4.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[4]
+        umlsl           v20.8h, v16.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[5]
+        umlal           v20.8h, v16.8b, v6.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], v17.b[6]
+        umlsl           v20.8h, v16.8b, v7.8b
+        ld1            {v24.8h}, [x4], x10
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        st1            {v16.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
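+// bi_h12 and bi_h16 de-interleave even and odd columns with ld2 and run
+// the two phases in parallel, so each ushr window shift covers two filter
+// taps; h24/h32/h48 extend the same idea with ld3/ld4. h12 re-interleaves
+// with zip1 before storing, h16 stores straight through st2.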
+function ff_hevc_put_hevc_qpel_bi_h12_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        sub             x1, x1, #8
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld2            {v16.8b, v17.8b}, [x2]
+        ldr             w12, [x2, #16]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v16.8b, v1.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v20.8h, v16.8b, v2.8b
+        umlal           v20.8h, v17.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        umlsl           v21.8h, v17.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        ushr            v17.2d, v17.2d, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        ushr            v17.2d, v17.2d, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        ld2            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqadd           v17.8h, v21.8h, v25.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        sqrshrun        v17.8b, v17.8h, #7
+        zip1            v16.16b, v16.16b, v17.16b
+        st1            {v16.8b}, [x0], #8
+        st1            {v16.s}[2], [x0], x1
+        add             x2, x2, x3
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h16_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld2            {v16.8b, v17.8b}, [x2]
+        ldr             x12, [x2, #16]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v16.8b, v1.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v20.8h, v16.8b, v2.8b
+        umlal           v20.8h, v17.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        ld2            {v24.8h, v25.8h}, [x4], x10
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqadd           v17.8h, v21.8h, v25.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        sqrshrun        v17.8b, v17.8h, #7
+        st2            {v16.8b, v17.8b}, [x0], x1
+        add             x2, x2, x3
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h24_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld3            {v16.8b, v17.8b, v18.8b}, [x2]
+        ldr             x12, [x2, #24]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v16.8b, v2.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v16.8b, v1.8b
+        umlsl           v22.8h, v17.8b, v2.8b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v3.8b
+        umlal           v20.8h, v17.8b, v4.8b
+        umlsl           v20.8h, v18.8b, v5.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v21.8h, v17.8b, v3.8b
+        umlal           v21.8h, v18.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v22.8h, v18.8b, v3.8b
+        umlal           v22.8h, v16.8b, v4.8b
+        umlsl           v22.8h, v17.8b, v5.8b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v18.8b, v7.8b
+        umlal           v22.8h, v18.8b, v6.8b
+        umlsl           v22.8h, v16.8b, v7.8b
+        ld3            {v23.8h, v24.8h, v25.8h}, [x4], x10
+        sqadd           v16.8h, v20.8h, v23.8h
+        sqadd           v17.8h, v21.8h, v24.8h
+        sqadd           v18.8h, v22.8h, v25.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        sqrshrun        v17.8b, v17.8h, #7
+        sqrshrun        v18.8b, v18.8h, #7
+        st3            {v16.8b, v17.8b, v18.8b}, [x0], x1
+        add             x2, x2, x3
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_h32_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        mov             x10, #(MAX_PB_SIZE * 2)
+1:      ld4            {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
+        ldr             x12, [x2, #32]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        umlal           v20.8h, v19.8b, v3.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v19.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v19.8b, v1.8b
+        umlsl           v22.8h, v16.8b, v2.8b
+        umlal           v22.8h, v17.8b, v3.8b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        umlsl           v23.8h, v19.8b, v0.8b
+        umlal           v23.8h, v16.8b, v1.8b
+        umlsl           v23.8h, v17.8b, v2.8b
+        umlal           v23.8h, v18.8b, v3.8b
+        ushr            v19.2d, v19.2d, #8
+        mov             v19.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        umlal           v20.8h, v18.8b, v6.8b
+        umlsl           v20.8h, v19.8b, v7.8b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v18.8b, v5.8b
+        umlal           v21.8h, v19.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        umlal           v22.8h, v18.8b, v4.8b
+        umlsl           v22.8h, v19.8b, v5.8b
+        umlal           v22.8h, v16.8b, v6.8b
+        umlsl           v22.8h, v17.8b, v7.8b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        umlal           v23.8h, v19.8b, v4.8b
+        umlsl           v23.8h, v16.8b, v5.8b
+        umlal           v23.8h, v17.8b, v6.8b
+        umlsl           v23.8h, v18.8b, v7.8b
+        ld4            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], x10
+        sqadd           v16.8h, v20.8h, v24.8h
+        sqadd           v17.8h, v21.8h, v25.8h
+        sqadd           v18.8h, v22.8h, v26.8h
+        sqadd           v19.8h, v23.8h, v27.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        sqrshrun        v17.8b, v17.8h, #7
+        sqrshrun        v18.8b, v18.8h, #7
+        sqrshrun        v19.8b, v19.8h, #7
+        st4            {v16.8b, v17.8b, v18.8b, v19.8b}, [x0], x1
+        add             x2, x2, x3
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
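+// Width 48: same lane-shifting scheme with ld3 and 16-byte lanes, refilled
+// from x12 (bytes 24-31) and x13 (bytes 48-55); the umlal2/umlsl2 forms
+// handle the upper lane halves.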
+function ff_hevc_put_hevc_qpel_bi_h48_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+        mov             x10, #(MAX_PB_SIZE * 2 - 48) // src2 advance for the second ld3
+1:      ld3            {v16.16b, v17.16b, v18.16b}, [x2]
+        ldr             x12, [x2, #24]
+        ldr             x13, [x2, #48]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        umlsl2          v23.8h, v16.16b, v0.16b
+        umlal2          v23.8h, v17.16b, v1.16b
+        umlsl2          v23.8h, v18.16b, v2.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        mov             v16.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v16.8b, v2.8b
+        umlsl2          v24.8h, v17.16b, v0.16b
+        umlal2          v24.8h, v18.16b, v1.16b
+        umlsl2          v24.8h, v16.16b, v2.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        mov             v17.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v16.8b, v1.8b
+        umlsl           v22.8h, v17.8b, v2.8b
+        umlsl2          v25.8h, v18.16b, v0.16b
+        umlal2          v25.8h, v16.16b, v1.16b
+        umlsl2          v25.8h, v17.16b, v2.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        mov             v18.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v20.8h, v16.8b, v3.8b
+        umlal           v20.8h, v17.8b, v4.8b
+        umlsl           v20.8h, v18.8b, v5.8b
+        umlal2          v23.8h, v16.16b, v3.16b
+        umlal2          v23.8h, v17.16b, v4.16b
+        umlsl2          v23.8h, v18.16b, v5.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        mov             v16.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v21.8h, v17.8b, v3.8b
+        umlal           v21.8h, v18.8b, v4.8b
+        umlsl           v21.8h, v16.8b, v5.8b
+        umlal2          v24.8h, v17.16b, v3.16b
+        umlal2          v24.8h, v18.16b, v4.16b
+        umlsl2          v24.8h, v16.16b, v5.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        mov             v17.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v22.8h, v18.8b, v3.8b
+        umlal           v22.8h, v16.8b, v4.8b
+        umlsl           v22.8h, v17.8b, v5.8b
+        umlal2          v25.8h, v18.16b, v3.16b
+        umlal2          v25.8h, v16.16b, v4.16b
+        umlsl2          v25.8h, v17.16b, v5.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        mov             v18.b[15], w13
+        lsr             x12, x12, #8
+        lsr             x13, x13, #8
+        umlal           v20.8h, v16.8b, v6.8b
+        umlsl           v20.8h, v17.8b, v7.8b
+        umlal2          v23.8h, v16.16b, v6.16b
+        umlsl2          v23.8h, v17.16b, v7.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        mov             v16.b[15], w13
+        umlal           v21.8h, v17.8b, v6.8b
+        umlsl           v21.8h, v18.8b, v7.8b
+        umlal2          v24.8h, v17.16b, v6.16b
+        umlsl2          v24.8h, v18.16b, v7.16b
+        umlal           v22.8h, v18.8b, v6.8b
+        umlsl           v22.8h, v16.8b, v7.8b
+        umlal2          v25.8h, v18.16b, v6.16b
+        umlsl2          v25.8h, v16.16b, v7.16b
+        ld3            {v26.8h, v27.8h, v28.8h}, [x4], #48
+        sqadd           v16.8h, v20.8h, v26.8h
+        sqadd           v17.8h, v21.8h, v27.8h
+        sqadd           v18.8h, v22.8h, v28.8h
+        ld3            {v26.8h, v27.8h, v28.8h}, [x4], x10
+        sqadd           v19.8h, v23.8h, v26.8h
+        sqadd           v20.8h, v24.8h, v27.8h
+        sqadd           v21.8h, v25.8h, v28.8h
+        sqrshrun        v16.8b, v16.8h, #7
+        sqrshrun        v17.8b, v17.8h, #7
+        sqrshrun        v18.8b, v18.8h, #7
+        sqrshrun2       v16.16b, v19.8h, #7
+        sqrshrun2       v17.16b, v20.8h, #7
+        sqrshrun2       v18.16b, v21.8h, #7
+        st3            {v16.16b, v17.16b, v18.16b}, [x0], x1
+        add             x2, x2, x3
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
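+// Width 64: ld4 with 16-byte lanes, refilled from x12 (bytes 32-39) and
+// x13 (bytes 64-71); src2 is consumed with two ld4 loads per row.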
+function ff_hevc_put_hevc_qpel_bi_h64_8_neon, export=1
+        load_qpel_filterb x6, x7
+        sub             x2, x2, #3
+1:      ld4            {v16.16b, v17.16b, v18.16b, v19.16b}, [x2]
+        ldr             x12, [x2, #32]
+        ldr             x13, [x2, #64]
+        movi            v20.8h, #0
+        movi            v21.8h, #0
+        movi            v22.8h, #0
+        movi            v23.8h, #0
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        movi            v26.8h, #0
+        movi            v27.8h, #0
+        umlsl           v20.8h, v16.8b, v0.8b
+        umlal           v20.8h, v17.8b, v1.8b
+        umlsl           v20.8h, v18.8b, v2.8b
+        umlal           v20.8h, v19.8b, v3.8b
+        umlsl2          v24.8h, v16.16b, v0.16b
+        umlal2          v24.8h, v17.16b, v1.16b
+        umlsl2          v24.8h, v18.16b, v2.16b
+        umlal2          v24.8h, v19.16b, v3.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        mov             v16.b[15], w13
+        lsr             x13, x13, #8
+        umlsl           v21.8h, v17.8b, v0.8b
+        umlal           v21.8h, v18.8b, v1.8b
+        umlsl           v21.8h, v19.8b, v2.8b
+        umlal           v21.8h, v16.8b, v3.8b
+        umlsl2          v25.8h, v17.16b, v0.16b
+        umlal2          v25.8h, v18.16b, v1.16b
+        umlsl2          v25.8h, v19.16b, v2.16b
+        umlal2          v25.8h, v16.16b, v3.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        mov             v17.b[15], w13
+        lsr             x13, x13, #8
+        umlsl           v22.8h, v18.8b, v0.8b
+        umlal           v22.8h, v19.8b, v1.8b
+        umlsl           v22.8h, v16.8b, v2.8b
+        umlal           v22.8h, v17.8b, v3.8b
+        umlsl2          v26.8h, v18.16b, v0.16b
+        umlal2          v26.8h, v19.16b, v1.16b
+        umlsl2          v26.8h, v16.16b, v2.16b
+        umlal2          v26.8h, v17.16b, v3.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        lsr             x12, x12, #8
+        mov             v18.b[15], w13
+        lsr             x13, x13, #8
+        umlsl           v23.8h, v19.8b, v0.8b
+        umlal           v23.8h, v16.8b, v1.8b
+        umlsl           v23.8h, v17.8b, v2.8b
+        umlal           v23.8h, v18.8b, v3.8b
+        umlsl2          v27.8h, v19.16b, v0.16b
+        umlal2          v27.8h, v16.16b, v1.16b
+        umlsl2          v27.8h, v17.16b, v2.16b
+        umlal2          v27.8h, v18.16b, v3.16b
+        ushr            v19.2d, v19.2d, #8
+        mov             v19.b[7], w12
+        lsr             x12, x12, #8
+        mov             v19.b[15], w13
+        lsr             x13, x13, #8
+        umlal           v20.8h, v16.8b, v4.8b
+        umlsl           v20.8h, v17.8b, v5.8b
+        umlal           v20.8h, v18.8b, v6.8b
+        umlsl           v20.8h, v19.8b, v7.8b
+        umlal2          v24.8h, v16.16b, v4.16b
+        umlsl2          v24.8h, v17.16b, v5.16b
+        umlal2          v24.8h, v18.16b, v6.16b
+        umlsl2          v24.8h, v19.16b, v7.16b
+        ushr            v16.2d, v16.2d, #8
+        mov             v16.b[7], w12
+        lsr             x12, x12, #8
+        mov             v16.b[15], w13
+        lsr             x13, x13, #8
+        umlal           v21.8h, v17.8b, v4.8b
+        umlsl           v21.8h, v18.8b, v5.8b
+        umlal           v21.8h, v19.8b, v6.8b
+        umlsl           v21.8h, v16.8b, v7.8b
+        umlal2          v25.8h, v17.16b, v4.16b
+        umlsl2          v25.8h, v18.16b, v5.16b
+        umlal2          v25.8h, v19.16b, v6.16b
+        umlsl2          v25.8h, v16.16b, v7.16b
+        ushr            v17.2d, v17.2d, #8
+        mov             v17.b[7], w12
+        lsr             x12, x12, #8
+        mov             v17.b[15], w13
+        lsr             x13, x13, #8
+        umlal           v22.8h, v18.8b, v4.8b
+        umlsl           v22.8h, v19.8b, v5.8b
+        umlal           v22.8h, v16.8b, v6.8b
+        umlsl           v22.8h, v17.8b, v7.8b
+        umlal2          v26.8h, v18.16b, v4.16b
+        umlsl2          v26.8h, v19.16b, v5.16b
+        umlal2          v26.8h, v16.16b, v6.16b
+        umlsl2          v26.8h, v17.16b, v7.16b
+        ushr            v18.2d, v18.2d, #8
+        mov             v18.b[7], w12
+        mov             v18.b[15], w13
+        umlal           v23.8h, v19.8b, v4.8b
+        umlsl           v23.8h, v16.8b, v5.8b
+        umlal           v23.8h, v17.8b, v6.8b
+        umlsl           v23.8h, v18.8b, v7.8b
+        umlal2          v27.8h, v19.16b, v4.16b
+        umlsl2          v27.8h, v16.16b, v5.16b
+        umlal2          v27.8h, v17.16b, v6.16b
+        umlsl2          v27.8h, v18.16b, v7.16b
+        ld4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
+        sqadd           v20.8h, v20.8h, v28.8h
+        sqadd           v21.8h, v21.8h, v29.8h
+        sqadd           v22.8h, v22.8h, v30.8h
+        sqadd           v23.8h, v23.8h, v31.8h
+        ld4            {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
+        sqadd           v24.8h, v24.8h, v28.8h
+        sqadd           v25.8h, v25.8h, v29.8h
+        sqadd           v26.8h, v26.8h, v30.8h
+        sqadd           v27.8h, v27.8h, v31.8h
+        sqrshrun        v16.8b, v20.8h, #7
+        sqrshrun        v17.8b, v21.8h, #7
+        sqrshrun        v18.8b, v22.8h, #7
+        sqrshrun        v19.8b, v23.8h, #7
+        sqrshrun2       v16.16b, v24.8h, #7
+        sqrshrun2       v17.16b, v25.8h, #7
+        sqrshrun2       v18.16b, v26.8h, #7
+        sqrshrun2       v19.16b, v27.8h, #7
+        st4            {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
+        add             x2, x2, x3
+        subs            x5, x5, #1
+        b.ne            1b
+        ret
+endfunc
+
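+// Bi-pred 8-tap vertical filter, width 4. Seven rows are preloaded and the
+// loop is unrolled eight times so the row registers rotate through v16-v23
+// without moves; each iteration adds the src2 row and narrows with sqrshrun.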
+function ff_hevc_put_hevc_qpel_bi_v4_8_neon, export=1
+        load_qpel_filterb x7, x6
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3      // src -= 3 * srcstride
+        mov             x12, #(MAX_PB_SIZE * 2)
+        ld1            {v16.s}[0], [x2], x3
+        ld1            {v17.s}[0], [x2], x3
+        ld1            {v18.s}[0], [x2], x3
+        ld1            {v19.s}[0], [x2], x3
+        ld1            {v20.s}[0], [x2], x3
+        ld1            {v21.s}[0], [x2], x3
+        ld1            {v22.s}[0], [x2], x3
+1:      ld1            {v23.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.s}[0], [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        ld1            {v25.4h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret
+endfunc
+
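+// Width 6: same row rotation as v4, storing 4 + 2 bytes per output row.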
+function ff_hevc_put_hevc_qpel_bi_v6_8_neon, export=1
+        load_qpel_filterb x7, x6
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        sub             x1, x1, #4
+        mov             x12, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8b}, [x2], x3
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+        ld1            {v19.8b}, [x2], x3
+        ld1            {v20.8b}, [x2], x3
+        ld1            {v21.8b}, [x2], x3
+        ld1            {v22.8b}, [x2], x3
+1:      ld1            {v23.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.s}[0], [x0], #4
+        st1            {v25.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret
+endfunc
+
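+// Width 8: same row rotation as v4, with full 8-byte rows.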
+function ff_hevc_put_hevc_qpel_bi_v8_8_neon, export=1
+        load_qpel_filterb x7, x6
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        mov             x12, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8b}, [x2], x3
+        ld1            {v17.8b}, [x2], x3
+        ld1            {v18.8b}, [x2], x3
+        ld1            {v19.8b}, [x2], x3
+        ld1            {v20.8b}, [x2], x3
+        ld1            {v21.8b}, [x2], x3
+        ld1            {v22.8b}, [x2], x3
+1:      ld1            {v23.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.8b}, [x2], x3
+        movi            v24.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        ld1            {v25.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v25.8h
+        sqrshrun        v25.8b, v24.8h, #7
+        st1            {v25.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret
+endfunc
+
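+// Width 12: 16-byte row loads, calc_qpelb/calc_qpelb2 for the two halves,
+// 8 + 4 bytes stored per output row.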
+function ff_hevc_put_hevc_qpel_bi_v12_8_neon, export=1
+        load_qpel_filterb x7, x6
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        sub             x1, x1, #8
+        mov             x12, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b}, [x2], x3
+        ld1            {v17.16b}, [x2], x3
+        ld1            {v18.16b}, [x2], x3
+        ld1            {v19.16b}, [x2], x3
+        ld1            {v20.16b}, [x2], x3
+        ld1            {v21.16b}, [x2], x3
+        ld1            {v22.16b}, [x2], x3
+1:      ld1            {v23.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v25, v16, v17, v18, v19, v20, v21, v22, v23
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        calc_qpelb2     v25, v17, v18, v19, v20, v21, v22, v23, v16
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        calc_qpelb2     v25, v18, v19, v20, v21, v22, v23, v16, v17
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        calc_qpelb2     v25, v19, v20, v21, v22, v23, v16, v17, v18
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        calc_qpelb2     v25, v20, v21, v22, v23, v16, v17, v18, v19
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        calc_qpelb2     v25, v21, v22, v23, v16, v17, v18, v19, v20
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        calc_qpelb2     v25, v22, v23, v16, v17, v18, v19, v20, v21
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        calc_qpelb2     v25, v23, v16, v17, v18, v19, v20, v21, v22
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.8b}, [x0], #8
+        st1            {v26.s}[2], [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret
+endfunc
+
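+// Width 16: as v12, with full 16-byte stores.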
+function ff_hevc_put_hevc_qpel_bi_v16_8_neon, export=1
+        load_qpel_filterb x7, x6
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        mov             x12, #(MAX_PB_SIZE * 2)
+        ld1            {v16.16b}, [x2], x3
+        ld1            {v17.16b}, [x2], x3
+        ld1            {v18.16b}, [x2], x3
+        ld1            {v19.16b}, [x2], x3
+        ld1            {v20.16b}, [x2], x3
+        ld1            {v21.16b}, [x2], x3
+        ld1            {v22.16b}, [x2], x3
+1:      ld1            {v23.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v16, v17, v18, v19, v20, v21, v22, v23
+        calc_qpelb2     v25, v16, v17, v18, v19, v20, v21, v22, v23
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v17, v18, v19, v20, v21, v22, v23, v16
+        calc_qpelb2     v25, v17, v18, v19, v20, v21, v22, v23, v16
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v18, v19, v20, v21, v22, v23, v16, v17
+        calc_qpelb2     v25, v18, v19, v20, v21, v22, v23, v16, v17
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v19, v20, v21, v22, v23, v16, v17, v18
+        calc_qpelb2     v25, v19, v20, v21, v22, v23, v16, v17, v18
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v20, v21, v22, v23, v16, v17, v18, v19
+        calc_qpelb2     v25, v20, v21, v22, v23, v16, v17, v18, v19
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v21, v22, v23, v16, v17, v18, v19, v20
+        calc_qpelb2     v25, v21, v22, v23, v16, v17, v18, v19, v20
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v22, v23, v16, v17, v18, v19, v20, v21
+        calc_qpelb2     v25, v22, v23, v16, v17, v18, v19, v20, v21
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.16b}, [x2], x3
+        movi            v24.8h, #0
+        movi            v25.8h, #0
+        calc_qpelb      v24, v23, v16, v17, v18, v19, v20, v21, v22
+        calc_qpelb2     v25, v23, v16, v17, v18, v19, v20, v21, v22
+        ld1            {v26.8h, v27.8h}, [x4], x12   // src2
+        sqadd           v24.8h, v24.8h, v26.8h
+        sqadd           v25.8h, v25.8h, v27.8h
+        sqrshrun        v26.8b, v24.8h, #7
+        sqrshrun2       v26.16b, v25.8h, #7
+        st1            {v26.16b}, [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret
+endfunc
+
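+// Width 24 = 16 + 8: call the v16 and v8 versions, advancing dst/src by 16
+// and src2 by 32 bytes in between.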
+function ff_hevc_put_hevc_qpel_bi_v24_8_neon, export=1
+        stp             x7, x30, [sp, #-16]!  // my, lr
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        ldr             x7, [sp]        // my; keep x30 saved for the final ret
+        add             x0, x0, #16          // dst
+        add             x2, x2, #16          // src
+        add             x4, x4, #32          // src2 (int16_t)
+        bl              X(ff_hevc_put_hevc_qpel_bi_v8_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
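+// Width 32, also the core for v48/v64: the outer loop walks the width
+// argument (loaded from the stack) in 32-pixel columns. v8-v15 are
+// callee-saved, so they are spilled on entry and restored on exit.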
+function ff_hevc_put_hevc_qpel_bi_v32_8_neon, export=1
+        sub             sp, sp, #64
+        st1            {v12.16b, v13.16b, v14.16b, v15.16b}, [sp]
+        sub             sp, sp, #64
+        st1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp]
+        load_qpel_filterb x7, x6
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        ldr             w6, [sp, #128]  // width (stack arg, above the saved SIMD regs)
+        mov             x12, #(MAX_PB_SIZE * 2)
+1:      mov             x11, x5         // height
+        mov             x10, x0         // dst
+        mov             x8, x2          // src
+        mov             x9, x4          // src2
+
+        ld1            {v16.16b, v17.16b}, [x8], x3
+        ld1            {v18.16b, v19.16b}, [x8], x3
+        ld1            {v20.16b, v21.16b}, [x8], x3
+        ld1            {v22.16b, v23.16b}, [x8], x3
+        ld1            {v24.16b, v25.16b}, [x8], x3
+        ld1            {v26.16b, v27.16b}, [x8], x3
+        ld1            {v28.16b, v29.16b}, [x8], x3
+2:      ld1            {v30.16b, v31.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v16, v18, v20, v22, v24, v26, v28, v30
+        calc_qpelb2     v9,  v16, v18, v20, v22, v24, v26, v28, v30
+        calc_qpelb      v10, v17, v19, v21, v23, v25, v27, v29, v31
+        calc_qpelb2     v11, v17, v19, v21, v23, v25, v27, v29, v31
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v16.16b, v17.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb2     v9,  v18, v20, v22, v24, v26, v28, v30, v16
+        calc_qpelb      v10, v19, v21, v23, v25, v27, v29, v31, v17
+        calc_qpelb2     v11, v19, v21, v23, v25, v27, v29, v31, v17
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v18.16b, v19.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb2     v9,  v20, v22, v24, v26, v28, v30, v16, v18
+        calc_qpelb      v10, v21, v23, v25, v27, v29, v31, v17, v19
+        calc_qpelb2     v11, v21, v23, v25, v27, v29, v31, v17, v19
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v20.16b, v21.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb2     v9,  v22, v24, v26, v28, v30, v16, v18, v20
+        calc_qpelb      v10, v23, v25, v27, v29, v31, v17, v19, v21
+        calc_qpelb2     v11, v23, v25, v27, v29, v31, v17, v19, v21
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v22.16b, v23.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb2     v9,  v24, v26, v28, v30, v16, v18, v20, v22
+        calc_qpelb      v10, v25, v27, v29, v31, v17, v19, v21, v23
+        calc_qpelb2     v11, v25, v27, v29, v31, v17, v19, v21, v23
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v24.16b, v25.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb2     v9,  v26, v28, v30, v16, v18, v20, v22, v24
+        calc_qpelb      v10, v27, v29, v31, v17, v19, v21, v23, v25
+        calc_qpelb2     v11, v27, v29, v31, v17, v19, v21, v23, v25
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v26.16b, v27.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb2     v9,  v28, v30, v16, v18, v20, v22, v24, v26
+        calc_qpelb      v10, v29, v31, v17, v19, v21, v23, v25, v27
+        calc_qpelb2     v11, v29, v31, v17, v19, v21, v23, v25, v27
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v28.16b, v29.16b}, [x8], x3
+        movi            v8.8h, #0
+        movi            v9.8h, #0
+        movi            v10.8h, #0
+        movi            v11.8h, #0
+        calc_qpelb      v8,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb2     v9,  v30, v16, v18, v20, v22, v24, v26, v28
+        calc_qpelb      v10, v31, v17, v19, v21, v23, v25, v27, v29
+        calc_qpelb2     v11, v31, v17, v19, v21, v23, v25, v27, v29
+        ld1            {v12.8h, v13.8h, v14.8h, v15.8h}, [x9], x12   // src2
+        sqadd           v8.8h, v8.8h, v12.8h
+        sqadd           v9.8h, v9.8h, v13.8h
+        sqadd           v10.8h, v10.8h, v14.8h
+        sqadd           v11.8h, v11.8h, v15.8h
+        sqrshrun        v12.8b, v8.8h, #7
+        sqrshrun2       v12.16b, v9.8h, #7
+        sqrshrun        v13.8b, v10.8h, #7
+        sqrshrun2       v13.16b, v11.8h, #7
+        st1            {v12.16b, v13.16b}, [x10], x1
+        subs            x11, x11, #1
+        b.hi            2b
+
+3:      add             x0, x0, #32          // dst
+        add             x2, x2, #32          // src
+        add             x4, x4, #64          // src2
+        subs            x6, x6, #32
+        b.ne            1b
+        ld1            {v8.16b, v9.16b, v10.16b, v11.16b}, [sp], #64
+        ld1            {v12.16b, v13.16b, v14.16b, v15.16b}, [sp], #64
+        ret
+endfunc
+
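+// Width 48 = 32 + 16: run the v32 core with a width of 32 pushed on the
+// stack, then the v16 version for the remaining columns.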
+function ff_hevc_put_hevc_qpel_bi_v48_8_neon, export=1
+        stp             x7, x30, [sp, #-16]!  // my, lr
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        mov             x8, #32
+        stp             x8, x8, [sp, #-16]!   // stack width arg = 32 for the v32 core
+        bl              X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
+        ldp             x8, xzr, [sp], #16    // drop the width pair
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        ldr             x7, [sp]        // my; keep x30 saved for the final ret
+        add             x0, x0, #32          // dst
+        add             x2, x2, #32          // src
+        add             x4, x4, #64          // src2 (int16_t)
+        bl              X(ff_hevc_put_hevc_qpel_bi_v16_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
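+// Width 64: the v32 core loops over the caller's stack width argument as-is.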
+function ff_hevc_put_hevc_qpel_bi_v64_8_neon, export=1
+        b               X(ff_hevc_put_hevc_qpel_bi_v32_8_neon)
+endfunc
+
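+// Bi-pred 8-tap hv filter, width 4: the h4 filter fills a (height + 7)-row
+// temporary on the stack, then the vertical taps (calc_qpelh) and the src2
+// addition run from it. The post-incrementing loads from sp consume the
+// whole temporary, restoring sp by the time the loop exits.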
+function ff_hevc_put_hevc_qpel_bi_hv4_8_neon, export=1
+        add             x10, x5, #7
+        lsl             x10, x10, #7        // (height + 7) * MAX_PB_SIZE * 2
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48         // tmp_array, above the saved pairs
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3          // src - 3 * srcstride
+        mov             x2, x3              // srcstride
+        add             x3, x5, #7          // height + 7
+        mov             x4, x6              // mx
+        bl              X(ff_hevc_put_hevc_qpel_h4_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x7, x6
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.4h}, [sp], x9
+        ld1            {v17.4h}, [sp], x9
+        ld1            {v18.4h}, [sp], x9
+        ld1            {v19.4h}, [sp], x9
+        ld1            {v20.4h}, [sp], x9
+        ld1            {v21.4h}, [sp], x9
+        ld1            {v22.4h}, [sp], x9
+1:      ld1            {v23.4h}, [sp], x9
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.4h}, [sp], x9
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.4h}, [sp], x9
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.4h}, [sp], x9
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.4h}, [sp], x9
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.4h}, [sp], x9
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.4h}, [sp], x9
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.4h}, [sp], x9
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+        ld1            {v5.4h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        rshrn           v1.4h, v1.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret
+endfunc
+
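+// Width 6: as hv4, with calc_qpelh2 for the upper half and 4 + 2 byte stores.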
+function ff_hevc_put_hevc_qpel_bi_hv6_8_neon, export=1
+        add             x10, x5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x5, #7
+        mov             x4, x6
+        bl              X(ff_hevc_put_hevc_qpel_h6_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x7, x6
+        sub             x1, x1, #4
+        mov             x9, #(MAX_PB_SIZE * 2)
+        ld1            {v16.8h}, [sp], x9
+        ld1            {v17.8h}, [sp], x9
+        ld1            {v18.8h}, [sp], x9
+        ld1            {v19.8h}, [sp], x9
+        ld1            {v20.8h}, [sp], x9
+        ld1            {v21.8h}, [sp], x9
+        ld1            {v22.8h}, [sp], x9
+1:      ld1            {v23.8h}, [sp], x9
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+        calc_qpelh2     v2, v2, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x9
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+        calc_qpelh2     v2, v2, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x9
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+        calc_qpelh2     v2, v2, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x9
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+        calc_qpelh2     v2, v2, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.8h}, [sp], x9
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+        calc_qpelh2     v2, v2, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.8h}, [sp], x9
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+        calc_qpelh2     v2, v2, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.8h}, [sp], x9
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+        calc_qpelh2     v2, v2, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.8h}, [sp], x9
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+        calc_qpelh2     v2, v2, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.s}[0], [x0], #4
+        st1            {v1.h}[2], [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret
+endfunc
+
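+// Bi-prediction hv: arguments follow the hevcdsp *_bi prototype
+// (x0 dst, x1 dststride, x2 src, x3 srcstride, x4 src2, x5 height,
+// x6 mx, x7 my). The horizontal 8-tap pass writes height + 7 rows of
+// int16 into a stack tmp_array; the vertical pass below rotates an
+// 8-row window through v16-v23 and combines with src2, i.e. roughly
+// dst[x] = clip_u8(((qpel_v(tmp) >> 6) + src2[x] + 64) >> 7), where
+// the ">> 6" is assumed to live inside the calc_qpelh/sshr macros
+// (defined earlier in this file) and the "+ 64 >> 7" is the rounding
+// rshrn #7.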
+function ff_hevc_put_hevc_qpel_bi_hv8_8_neon, export=1
+        add             x10, x5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48         // tmp_array, above the 3 saved pairs
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3          // src - 3 * srcstride
+        mov             x2, x3              // srcstride
+        add             x3, x5, #7          // height + 7 rows
+        mov             x4, x6              // mx
+        bl              X(ff_hevc_put_hevc_qpel_h8_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        load_qpel_filterh x7, x6
+        mov             x9, #(MAX_PB_SIZE * 2) // tmp_array row stride in bytes
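+        // prime the first 7 rows of the vertical filter window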
+        ld1            {v16.8h}, [sp], x9
+        ld1            {v17.8h}, [sp], x9
+        ld1            {v18.8h}, [sp], x9
+        ld1            {v19.8h}, [sp], x9
+        ld1            {v20.8h}, [sp], x9
+        ld1            {v21.8h}, [sp], x9
+        ld1            {v22.8h}, [sp], x9
+1:      ld1            {v23.8h}, [sp], x9
+        calc_qpelh      v1, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+        calc_qpelh2     v2, v2, v16, v17, v18, v19, v20, v21, v22, v23, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v16.8h}, [sp], x9
+        calc_qpelh      v1, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+        calc_qpelh2     v2, v2, v17, v18, v19, v20, v21, v22, v23, v16, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v17.8h}, [sp], x9
+        calc_qpelh      v1, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+        calc_qpelh2     v2, v2, v18, v19, v20, v21, v22, v23, v16, v17, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v18.8h}, [sp], x9
+        calc_qpelh      v1, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+        calc_qpelh2     v2, v2, v19, v20, v21, v22, v23, v16, v17, v18, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v19.8h}, [sp], x9
+        calc_qpelh      v1, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+        calc_qpelh2     v2, v2, v20, v21, v22, v23, v16, v17, v18, v19, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v20.8h}, [sp], x9
+        calc_qpelh      v1, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+        calc_qpelh2     v2, v2, v21, v22, v23, v16, v17, v18, v19, v20, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v21.8h}, [sp], x9
+        calc_qpelh      v1, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+        calc_qpelh2     v2, v2, v22, v23, v16, v17, v18, v19, v20, v21, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.eq            2f
+
+        ld1            {v22.8h}, [sp], x9
+        calc_qpelh      v1, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+        calc_qpelh2     v2, v2, v23, v16, v17, v18, v19, v20, v21, v22, sshr
+        ld1            {v5.8h}, [x4], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        sqxtun          v1.8b, v1.8h
+        st1            {v1.8b}, [x0], x1
+        subs            x5, x5, #1
+        b.hi            1b
+2:      ret
+endfunc
+
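+// 12-wide bi hv is split into an 8-wide and a 4-wide call, the second
+// offset by 8 output pixels (+16 bytes into the int16 src2 plane).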
+function ff_hevc_put_hevc_qpel_bi_hv12_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x6, x7, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon)
+        ldp             x6, x7, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #8
+        add             x2, x2, #8
+        add             x4, x4, #16
+        bl              X(ff_hevc_put_hevc_qpel_bi_hv4_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
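+// 16-wide and larger bi hv: the horizontal pass fills the whole
+// tmp_array, then .Lqpel_bi_hv16_loop walks it in 16-column strips
+// (x6 = total width, x10 = columns remaining), loading two vectors
+// per row instead of one.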
+function ff_hevc_put_hevc_qpel_bi_hv16_8_neon, export=1
+        add             x10, x5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x5, #7
+        mov             x4, x6
+        bl              X(ff_hevc_put_hevc_qpel_h16_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        mov             x6, #16          // width
+.Lqpel_bi_hv16_loop:
+        load_qpel_filterh x7, x8
+        mov             x9, #(MAX_PB_SIZE * 2) // tmp_array row stride in bytes
+        mov             x10, x6
+
+1:      mov             x11, x5         // height
+        mov             x7, x0          // dst
+        mov             x8, sp          // src
+        mov             x12, x4         // src2
+
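+        // prime the 7-row vertical window, two vectors per row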
+        ld1            {v16.8h, v17.8h}, [x8], x9
+        ld1            {v18.8h, v19.8h}, [x8], x9
+        ld1            {v20.8h, v21.8h}, [x8], x9
+        ld1            {v22.8h, v23.8h}, [x8], x9
+        ld1            {v24.8h, v25.8h}, [x8], x9
+        ld1            {v26.8h, v27.8h}, [x8], x9
+        ld1            {v28.8h, v29.8h}, [x8], x9
+2:      ld1            {v30.8h, v31.8h}, [x8], x9
+        calc_qpelh      v1, v16, v18, v20, v22, v24, v26, v28, v30, sshr
+        calc_qpelh2     v2, v2, v16, v18, v20, v22, v24, v26, v28, v30, sshr
+        calc_qpelh      v3, v17, v19, v21, v23, v25, v27, v29, v31, sshr
+        calc_qpelh2     v4, v4, v17, v19, v21, v23, v25, v27, v29, v31, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v16.8h, v17.8h}, [x8], x9
+        calc_qpelh      v1, v18, v20, v22, v24, v26, v28, v30, v16, sshr
+        calc_qpelh2     v2, v2, v18, v20, v22, v24, v26, v28, v30, v16, sshr
+        calc_qpelh      v3, v19, v21, v23, v25, v27, v29, v31, v17, sshr
+        calc_qpelh2     v4, v4, v19, v21, v23, v25, v27, v29, v31, v17, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v18.8h, v19.8h}, [x8], x9
+        calc_qpelh      v1, v20, v22, v24, v26, v28, v30, v16, v18, sshr
+        calc_qpelh2     v2, v2, v20, v22, v24, v26, v28, v30, v16, v18, sshr
+        calc_qpelh      v3, v21, v23, v25, v27, v29, v31, v17, v19, sshr
+        calc_qpelh2     v4, v4, v21, v23, v25, v27, v29, v31, v17, v19, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v20.8h, v21.8h}, [x8], x9
+        calc_qpelh      v1, v22, v24, v26, v28, v30, v16, v18, v20, sshr
+        calc_qpelh2     v2, v2, v22, v24, v26, v28, v30, v16, v18, v20, sshr
+        calc_qpelh      v3, v23, v25, v27, v29, v31, v17, v19, v21, sshr
+        calc_qpelh2     v4, v4, v23, v25, v27, v29, v31, v17, v19, v21, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v22.8h, v23.8h}, [x8], x9
+        calc_qpelh      v1, v24, v26, v28, v30, v16, v18, v20, v22, sshr
+        calc_qpelh2     v2, v2, v24, v26, v28, v30, v16, v18, v20, v22, sshr
+        calc_qpelh      v3, v25, v27, v29, v31, v17, v19, v21, v23, sshr
+        calc_qpelh2     v4, v4, v25, v27, v29, v31, v17, v19, v21, v23, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v24.8h, v25.8h}, [x8], x9
+        calc_qpelh      v1, v26, v28, v30, v16, v18, v20, v22, v24, sshr
+        calc_qpelh2     v2, v2, v26, v28, v30, v16, v18, v20, v22, v24, sshr
+        calc_qpelh      v3, v27, v29, v31, v17, v19, v21, v23, v25, sshr
+        calc_qpelh2     v4, v4, v27, v29, v31, v17, v19, v21, v23, v25, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v26.8h, v27.8h}, [x8], x9
+        calc_qpelh      v1, v28, v30, v16, v18, v20, v22, v24, v26, sshr
+        calc_qpelh2     v2, v2, v28, v30, v16, v18, v20, v22, v24, v26, sshr
+        calc_qpelh      v3, v29, v31, v17, v19, v21, v23, v25, v27, sshr
+        calc_qpelh2     v4, v4, v29, v31, v17, v19, v21, v23, v25, v27, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.eq            3f
+
+        ld1            {v28.8h, v29.8h}, [x8], x9
+        calc_qpelh      v1, v30, v16, v18, v20, v22, v24, v26, v28, sshr
+        calc_qpelh2     v2, v2, v30, v16, v18, v20, v22, v24, v26, v28, sshr
+        calc_qpelh      v3, v31, v17, v19, v21, v23, v25, v27, v29, sshr
+        calc_qpelh2     v4, v4, v31, v17, v19, v21, v23, v25, v27, v29, sshr
+        ld1            {v5.8h, v6.8h}, [x12], x9 // src2
+        saddw           v1.4s, v1.4s, v5.4h
+        saddw2          v2.4s, v2.4s, v5.8h
+        saddw           v3.4s, v3.4s, v6.4h
+        saddw2          v4.4s, v4.4s, v6.8h
+        rshrn           v1.4h, v1.4s, #7
+        rshrn2          v1.8h, v2.4s, #7
+        rshrn           v2.4h, v3.4s, #7
+        rshrn2          v2.8h, v4.4s, #7
+        sqxtun          v1.8b, v1.8h
+        sqxtun2         v1.16b, v2.8h
+        st1            {v1.16b}, [x7], x1
+        subs            x11, x11, #1
+        b.ne            2b
+
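+        // strip done: step dst, tmp_array base (sp) and src2 to the next 16 columns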
+3:      add             x0, x0, #16
+        add             sp, sp, #32
+        add             x4, x4, #32
+        subs            x10, x10, #16
+        b.ne            1b
+        add             x10, x5, #7
+        lsl             x10, x10, #7
+        sub             x10, x10, x6, lsl #1 // minus the 2*width bytes of row 0 already popped
+        add             sp, sp, x10         // release the remaining tmp_array
+        ret
+endfunc
+
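+// 24-wide = one 16-wide call plus an 8-wide call offset by 16 pixels.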
+function ff_hevc_put_hevc_qpel_bi_hv24_8_neon, export=1
+        stp             xzr, x30, [sp, #-16]!
+        stp             x0, x1, [sp, #-16]!
+        stp             x2, x3, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x6, x7, [sp, #-16]!
+        bl              X(ff_hevc_put_hevc_qpel_bi_hv16_8_neon)
+        ldp             x6, x7, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x2, x3, [sp], #16
+        ldp             x0, x1, [sp], #16
+        add             x0, x0, #16
+        add             x2, x2, #16
+        add             x4, x4, #32
+        bl              X(ff_hevc_put_hevc_qpel_bi_hv8_8_neon)
+        ldp             xzr, x30, [sp], #16
+        ret
+endfunc
+
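+// The 32/48/64-wide variants differ only in the horizontal helper used
+// and the width handed to the shared strip loop above.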
+function ff_hevc_put_hevc_qpel_bi_hv32_8_neon, export=1
+        add             x10, x5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x5, #7
+        mov             x4, x6
+        bl              X(ff_hevc_put_hevc_qpel_h32_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        mov             x6, #32          // width
+        b               .Lqpel_bi_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv48_8_neon, export=1
+        add             x10, x5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x5, #7
+        mov             x4, x6
+        bl              X(ff_hevc_put_hevc_qpel_h48_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        mov             x6, #48          // width
+        b               .Lqpel_bi_hv16_loop
+endfunc
+
+function ff_hevc_put_hevc_qpel_bi_hv64_8_neon, export=1
+        add             x10, x5, #7
+        lsl             x10, x10, #7
+        sub             sp, sp, x10         // tmp_array
+        stp             x0, x1, [sp, #-16]!
+        stp             x4, x5, [sp, #-16]!
+        stp             x7, x30, [sp, #-16]!
+        add             x0, sp, #48
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             x3, x5, #7
+        mov             x4, x6
+        bl              X(ff_hevc_put_hevc_qpel_h64_8_neon)
+        ldp             x7, x30, [sp], #16
+        ldp             x4, x5, [sp], #16
+        ldp             x0, x1, [sp], #16
+        mov             x6, #64          // width
+        b               .Lqpel_bi_hv16_loop
+endfunc
-- 
2.30.1 (Apple Git-130)