[FFmpeg-devel] [PATCH] lavc/aarch64: new optimization for 8-bit hevc_pel_uni_w_pixels, qpel_uni_w_h, qpel_uni_w_v, qpel_uni_w_hv and qpel_h
Logan.Lyu
Logan.Lyu at myais.com.cn
Fri Jun 2 15:47:47 EEST 2023
Hi, Martin,
I'm sorry I made a stupid mistake, and it's fixed now.
If these patches are acceptable to you, I will submit some similar
patches soon.
Thanks.
在 2023/6/1 19:23, Martin Storsjö 写道:
> On Sun, 28 May 2023, Logan.Lyu wrote:
>
>>
>> 在 2023/5/28 12:36, Jean-Baptiste Kempf 写道:
>>> Hello,
>>>
>>> The last iteration still has the wrong name in the patchset.
>> Thanks for the reminder. I have corrected the name in git.
>
> Thanks, most of the issues in the patch seem to have been fixed -
> however there's one big breakage here. Also even if this is accepted,
> we'll have to wait for the dependency patches to be merged before
> these can go in though.
>
> For restoring the saved registers on the stack, you currently have this:
>
> ldp x19, x30, [sp]
> ldp x26, x27, [sp, #16]
> ldp x24, x25, [sp, #32]
> ldp x22, x23, [sp, #48]
> ldp x20, x21, [sp, #64]
> add sp, sp, #80
>
> You can avoid the extra add at the end by reordering them like this:
>
> ldp x26, x27, [sp, #16]
> ldp x24, x25, [sp, #32]
> ldp x22, x23, [sp, #48]
> ldp x20, x21, [sp, #64]
> ldp x19, x30, [sp], #80
>
> But the order/layout of the registers doesn't match how they are
> backed up. So when you run checkasm, you'll get these errors:
>
> I8MM:
> - hevc_pel.qpel [OK]
> put_hevc_qpel_uni_w_hv4_8_i8mm (failed to preserve register)
> put_hevc_qpel_uni_w_hv8_8_i8mm (failed to preserve register)
> put_hevc_qpel_uni_w_hv16_8_i8mm (failed to preserve register)
> put_hevc_qpel_uni_w_hv32_8_i8mm (failed to preserve register)
> put_hevc_qpel_uni_w_hv64_8_i8mm (failed to preserve register)
> - hevc_pel.qpel_uni_w [FAILED]
> checkasm: 5 of 1136 tests have failed
>
> It's easiest to make the epilogue a mirror copy of the prologue.
>
> Please rerun checkasm on a system that does support i8mm when posting
> updated patches.
>
> // Martin
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
-------------- next part --------------
From 8d5875ab393828b83163b98eb4b35837120f1322 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu at myais.com.cn>
Date: Wed, 3 May 2023 09:53:07 +0800
Subject: [PATCH 1/3] lavc/aarch64: new optimization for 8-bit
hevc_pel_uni_w_pixels and qpel_uni_w_v
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 51 ++
libavcodec/aarch64/hevcdsp_qpel_neon.S | 710 ++++++++++++++++++++++
2 files changed, 761 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index be1049a2ec..6b5341dd45 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -128,6 +128,52 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
mx, intptr_t my, int width);
+#define NEON8_FNPROTO(fn, args, ext) \
+ void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##6_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##12_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##24_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##48_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+#define NEON8_FNPROTO_PARTIAL_4(fn, args, ext) \
+ void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+ const uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox,
+ intptr_t mx, intptr_t my, int width),);
+
+NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
+ const uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox,
+ intptr_t mx, intptr_t my, int width),);
+
+#define NEON8_FNASSIGN(member, v, h, fn, ext) \
+ member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \
+ member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext; \
+ member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext; \
+ member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
+ member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+ member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
+ member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+ member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon##ext; \
+ member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
+#define NEON8_FNASSIGN_PARTIAL_4(member, v, h, fn, ext) \
+ member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \
+ member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext; \
+ member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+ member[7][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+ member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+ member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
{
if (!have_neon(av_get_cpu_flags())) return;
@@ -185,6 +231,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[7][0][1] =
c->put_hevc_qpel_bi[8][0][1] =
c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+
+ NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
+ NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
+ NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
+
}
if (bit_depth == 10) {
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 0e7b912678..51df52e1ea 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -30,6 +30,13 @@ const qpel_filters, align=4
.byte 0, 1, -5, 17, 58,-10, 4, -1
endconst
+const qpel_filters_abs, align=4
+ .byte 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 1, 4, 10, 58, 17, 5, 1, 0
+ .byte 1, 4, 11, 40, 40, 11, 4, 1
+ .byte 0, 1, 5, 17, 58, 10, 4, 1
+endconst
+
.macro load_filter m
movrel x15, qpel_filters
add x15, x15, \m, lsl #3
@@ -482,3 +489,706 @@ endfunc
put_hevc qpel
put_hevc qpel_uni
put_hevc qpel_bi
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
+ mov w10, #-6
+ sub w10, w10, w5
+ dup v30.8h, w6
+ dup v31.4s, w10
+ dup v29.4s, w7
+1:
+ ldr s0, [x2]
+ ldr s1, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ushll v0.8h, v0.8b, #6
+ ushll v1.8h, v1.8b, #6
+ smull v0.4s, v0.4h, v30.4h
+ smull v1.4s, v1.4h, v30.4h
+ sqrshl v0.4s, v0.4s, v31.4s
+ sqrshl v1.4s, v1.4s, v31.4s
+ sqadd v0.4s, v0.4s, v29.4s
+ sqadd v1.4s, v1.4s, v29.4s
+ sqxtn v0.4h, v0.4s
+ sqxtn v1.4h, v1.4s
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ str s0, [x0]
+ str s1, [x0, x1]
+ add x0, x0, x1, lsl #1
+ subs w4, w4, #2
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
+ mov w10, #-6
+ sub w10, w10, w5
+ dup v30.8h, w6
+ dup v31.4s, w10
+ dup v29.4s, w7
+ sub x1, x1, #4
+1:
+ ldr d0, [x2]
+ ldr d1, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ushll v0.8h, v0.8b, #6
+ ushll v1.8h, v1.8b, #6
+ smull v4.4s, v0.4h, v30.4h
+ smull2 v5.4s, v0.8h, v30.8h
+ smull v6.4s, v1.4h, v30.4h
+ smull2 v7.4s, v1.8h, v30.8h
+ sqrshl v4.4s, v4.4s, v31.4s
+ sqrshl v5.4s, v5.4s, v31.4s
+ sqrshl v6.4s, v6.4s, v31.4s
+ sqrshl v7.4s, v7.4s, v31.4s
+ sqadd v4.4s, v4.4s, v29.4s
+ sqadd v5.4s, v5.4s, v29.4s
+ sqadd v6.4s, v6.4s, v29.4s
+ sqadd v7.4s, v7.4s, v29.4s
+ sqxtn v0.4h, v4.4s
+ sqxtn2 v0.8h, v5.4s
+ sqxtn v1.4h, v6.4s
+ sqxtn2 v1.8h, v7.4s
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ str s0, [x0], #4
+ st1 {v0.h}[2], [x0], x1
+ str s1, [x0], #4
+ st1 {v1.h}[2], [x0], x1
+ subs w4, w4, #2
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
+ mov w10, #-6
+ sub w10, w10, w5
+ dup v30.8h, w6
+ dup v31.4s, w10
+ dup v29.4s, w7
+1:
+ ldr d0, [x2]
+ ldr d1, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ushll v0.8h, v0.8b, #6
+ ushll v1.8h, v1.8b, #6
+ smull v4.4s, v0.4h, v30.4h
+ smull2 v5.4s, v0.8h, v30.8h
+ smull v6.4s, v1.4h, v30.4h
+ smull2 v7.4s, v1.8h, v30.8h
+ sqrshl v4.4s, v4.4s, v31.4s
+ sqrshl v5.4s, v5.4s, v31.4s
+ sqrshl v6.4s, v6.4s, v31.4s
+ sqrshl v7.4s, v7.4s, v31.4s
+ sqadd v4.4s, v4.4s, v29.4s
+ sqadd v5.4s, v5.4s, v29.4s
+ sqadd v6.4s, v6.4s, v29.4s
+ sqadd v7.4s, v7.4s, v29.4s
+ sqxtn v0.4h, v4.4s
+ sqxtn2 v0.8h, v5.4s
+ sqxtn v1.4h, v6.4s
+ sqxtn2 v1.8h, v7.4s
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ str d0, [x0]
+ str d1, [x0, x1]
+ add x0, x0, x1, lsl #1
+ subs w4, w4, #2
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
+ mov w10, #-6
+ sub w10, w10, w5
+ dup v30.8h, w6
+ dup v31.4s, w10
+ dup v29.4s, w7
+ sub x1, x1, #8
+1:
+ ldr q0, [x2]
+ ldr q1, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ushll v4.8h, v0.8b, #6
+ ushll2 v5.8h, v0.16b, #6
+ ushll v6.8h, v1.8b, #6
+ ushll2 v7.8h, v1.16b, #6
+ smull v16.4s, v4.4h, v30.4h
+ smull2 v17.4s, v4.8h, v30.8h
+ smull v18.4s, v5.4h, v30.4h
+ smull2 v19.4s, v5.8h, v30.8h
+ smull v20.4s, v6.4h, v30.4h
+ smull2 v21.4s, v6.8h, v30.8h
+ smull v22.4s, v7.4h, v30.4h
+ smull2 v23.4s, v7.8h, v30.8h
+
+ sqrshl v16.4s, v16.4s, v31.4s
+ sqrshl v17.4s, v17.4s, v31.4s
+ sqrshl v18.4s, v18.4s, v31.4s
+ sqrshl v19.4s, v19.4s, v31.4s
+ sqrshl v20.4s, v20.4s, v31.4s
+ sqrshl v21.4s, v21.4s, v31.4s
+ sqrshl v22.4s, v22.4s, v31.4s
+ sqrshl v23.4s, v23.4s, v31.4s
+ sqadd v16.4s, v16.4s, v29.4s
+ sqadd v17.4s, v17.4s, v29.4s
+ sqadd v18.4s, v18.4s, v29.4s
+ sqadd v19.4s, v19.4s, v29.4s
+ sqadd v20.4s, v20.4s, v29.4s
+ sqadd v21.4s, v21.4s, v29.4s
+ sqadd v22.4s, v22.4s, v29.4s
+ sqadd v23.4s, v23.4s, v29.4s
+ sqxtn v0.4h, v16.4s
+ sqxtn2 v0.8h, v17.4s
+ sqxtn v1.4h, v18.4s
+ sqxtn2 v1.8h, v19.4s
+ sqxtn v2.4h, v20.4s
+ sqxtn2 v2.8h, v21.4s
+ sqxtn v3.4h, v22.4s
+ sqxtn2 v3.8h, v23.4s
+ sqxtun v0.8b, v0.8h
+ sqxtun2 v0.16b, v1.8h
+ sqxtun v2.8b, v2.8h
+ sqxtun2 v2.16b, v3.8h
+ str d0, [x0], #8
+ st1 {v0.s}[2], [x0], x1
+ str d2, [x0], #8
+ st1 {v2.s}[2], [x0], x1
+ subs w4, w4, #2
+ b.ne 1b
+ ret
+endfunc
+
+.macro PEL_UNI_W_PIXEL_CALC s0, t0, t1, d0, d1, d2, d3
+ ushll \t0\().8h, \s0\().8b, #6
+ ushll2 \t1\().8h, \s0\().16b, #6
+ smull \d0\().4s, \t0\().4h, v30.4h
+ smull2 \d1\().4s, \t0\().8h, v30.8h
+ smull \d2\().4s, \t1\().4h, v30.4h
+ smull2 \d3\().4s, \t1\().8h, v30.8h
+ sqrshl \d0\().4s, \d0\().4s, v31.4s
+ sqrshl \d1\().4s, \d1\().4s, v31.4s
+ sqrshl \d2\().4s, \d2\().4s, v31.4s
+ sqrshl \d3\().4s, \d3\().4s, v31.4s
+ sqadd \d0\().4s, \d0\().4s, v29.4s
+ sqadd \d1\().4s, \d1\().4s, v29.4s
+ sqadd \d2\().4s, \d2\().4s, v29.4s
+ sqadd \d3\().4s, \d3\().4s, v29.4s
+ sqxtn \t0\().4h, \d0\().4s
+ sqxtn2 \t0\().8h, \d1\().4s
+ sqxtn \t1\().4h, \d2\().4s
+ sqxtn2 \t1\().8h, \d3\().4s
+ sqxtun \s0\().8b, \t0\().8h
+ sqxtun2 \s0\().16b, \t1\().8h
+.endm
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
+ mov w10, #-6
+ sub w10, w10, w5
+ dup v30.8h, w6
+ dup v31.4s, w10
+ dup v29.4s, w7
+1:
+ ldr q0, [x2]
+ ldr q1, [x2, x3]
+ add x2, x2, x3, lsl #1
+ PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+ PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+ str q0, [x0]
+ str q1, [x0, x1]
+ add x0, x0, x1, lsl #1
+ subs w4, w4, #2
+ b.ne 1b
+ ret
+endfunc
+
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
+ mov w10, #-6
+ sub w10, w10, w5
+ dup v30.8h, w6
+ dup v31.4s, w10
+ dup v29.4s, w7
+1:
+ ld1 {v0.16b, v1.16b}, [x2], x3
+ ushll v4.8h, v0.8b, #6
+ ushll2 v5.8h, v0.16b, #6
+ ushll v6.8h, v1.8b, #6
+ smull v16.4s, v4.4h, v30.4h
+ smull2 v17.4s, v4.8h, v30.8h
+ smull v18.4s, v5.4h, v30.4h
+ smull2 v19.4s, v5.8h, v30.8h
+ smull v20.4s, v6.4h, v30.4h
+ smull2 v21.4s, v6.8h, v30.8h
+ sqrshl v16.4s, v16.4s, v31.4s
+ sqrshl v17.4s, v17.4s, v31.4s
+ sqrshl v18.4s, v18.4s, v31.4s
+ sqrshl v19.4s, v19.4s, v31.4s
+ sqrshl v20.4s, v20.4s, v31.4s
+ sqrshl v21.4s, v21.4s, v31.4s
+ sqadd v16.4s, v16.4s, v29.4s
+ sqadd v17.4s, v17.4s, v29.4s
+ sqadd v18.4s, v18.4s, v29.4s
+ sqadd v19.4s, v19.4s, v29.4s
+ sqadd v20.4s, v20.4s, v29.4s
+ sqadd v21.4s, v21.4s, v29.4s
+ sqxtn v0.4h, v16.4s
+ sqxtn2 v0.8h, v17.4s
+ sqxtn v1.4h, v18.4s
+ sqxtn2 v1.8h, v19.4s
+ sqxtn v2.4h, v20.4s
+ sqxtn2 v2.8h, v21.4s
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ sqxtun v2.8b, v2.8h
+ st1 {v0.8b, v1.8b, v2.8b}, [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
+ mov w10, #-6
+ sub w10, w10, w5
+ dup v30.8h, w6
+ dup v31.4s, w10
+ dup v29.4s, w7
+1:
+ ld1 {v0.16b, v1.16b}, [x2], x3
+ PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+ PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+ st1 {v0.16b, v1.16b}, [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
+ mov w10, #-6
+ sub w10, w10, w5
+ dup v30.8h, w6
+ dup v31.4s, w10
+ dup v29.4s, w7
+1:
+ ld1 {v0.16b, v1.16b, v2.16b}, [x2], x3
+ PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+ PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+ PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+ st1 {v0.16b, v1.16b, v2.16b}, [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
+ mov w10, #-6
+ sub w10, w10, w5
+ dup v30.8h, w6
+ dup v31.4s, w10
+ dup v29.4s, w7
+1:
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
+ PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+ PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+ PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+ PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
+ st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+ ret
+endfunc
+
+.macro QPEL_UNI_W_V_HEADER
+ ldur x12, [sp, #8] // my
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, x3
+ movrel x9, qpel_filters_abs
+ add x9, x9, x12, lsl #3
+ ldr d28, [x9]
+ dup v0.16b, v28.b[0]
+ dup v1.16b, v28.b[1]
+ dup v2.16b, v28.b[2]
+ dup v3.16b, v28.b[3]
+ dup v4.16b, v28.b[4]
+ dup v5.16b, v28.b[5]
+ dup v6.16b, v28.b[6]
+ dup v7.16b, v28.b[7]
+
+ mov w10, #-6
+ sub w10, w10, w5
+ dup v30.8h, w6 // wx
+ dup v31.4s, w10 // shift
+ dup v29.4s, w7 // ox
+.endm
+
+.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
+ umull \dst\().8h, \src1\().8b, v1.8b
+ umlsl \dst\().8h, \src0\().8b, v0.8b
+ umlsl \dst\().8h, \src2\().8b, v2.8b
+ umlal \dst\().8h, \src3\().8b, v3.8b
+ umlal \dst\().8h, \src4\().8b, v4.8b
+ umlsl \dst\().8h, \src5\().8b, v5.8b
+ umlal \dst\().8h, \src6\().8b, v6.8b
+ umlsl \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+ umull2 \dst\().8h, \src1\().16b, v1.16b
+ umlsl2 \dst\().8h, \src0\().16b, v0.16b
+ umlsl2 \dst\().8h, \src2\().16b, v2.16b
+ umlal2 \dst\().8h, \src3\().16b, v3.16b
+ umlal2 \dst\().8h, \src4\().16b, v4.16b
+ umlsl2 \dst\().8h, \src5\().16b, v5.16b
+ umlal2 \dst\().8h, \src6\().16b, v6.16b
+ umlsl2 \dst\().8h, \src7\().16b, v7.16b
+.endm
+
+.macro QPEL_UNI_W_V_4
+ smull v24.4s, v24.4h, v30.4h
+ sqrshl v24.4s, v24.4s, v31.4s
+ sqadd v24.4s, v24.4s, v29.4s
+ sqxtn v24.4h, v24.4s
+ sqxtun v24.8b, v24.8h
+ st1 {v24.s}[0], [x0], x1
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
+ QPEL_UNI_W_V_HEADER
+ ldr s16, [x2]
+ ldr s17, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ldr s18, [x2]
+ ldr s19, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ldr s20, [x2]
+ ldr s21, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ldr s22, [x2]
+
+1: ldr s23, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v24, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_UNI_W_V_4
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr s16, [x2]
+ QPEL_FILTER_B v24, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_UNI_W_V_4
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr s17, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v24, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_UNI_W_V_4
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr s18, [x2]
+ QPEL_FILTER_B v24, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_UNI_W_V_4
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr s19, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v24, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_UNI_W_V_4
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr s20, [x2]
+ QPEL_FILTER_B v24, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_UNI_W_V_4
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr s21, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v24, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_UNI_W_V_4
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr s22, [x2]
+ QPEL_FILTER_B v24, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_UNI_W_V_4
+ subs w4, w4, #1
+ b.ne 1b
+2:
+ ret
+endfunc
+
+.macro QPEL_UNI_W_V_8
+ smull v24.4s, v26.4h, v30.4h
+ smull2 v25.4s, v26.8h, v30.8h
+ sqrshl v24.4s, v24.4s, v31.4s
+ sqrshl v25.4s, v25.4s, v31.4s
+ sqadd v24.4s, v24.4s, v29.4s
+ sqadd v25.4s, v25.4s, v29.4s
+ sqxtn v24.4h, v24.4s
+ sqxtn2 v24.8h, v25.4s
+ sqxtun v24.8b, v24.8h
+ st1 {v24.d}[0], [x0], x1
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
+ QPEL_UNI_W_V_HEADER
+ ldr d16, [x2]
+ ldr d17, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ldr d18, [x2]
+ ldr d19, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ldr d20, [x2]
+ ldr d21, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ldr d22, [x2]
+
+1: ldr d23, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v26, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_UNI_W_V_8
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr d16, [x2]
+ QPEL_FILTER_B v26, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_UNI_W_V_8
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr d17, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v26, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_UNI_W_V_8
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr d18, [x2]
+ QPEL_FILTER_B v26, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_UNI_W_V_8
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr d19, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v26, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_UNI_W_V_8
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr d20, [x2]
+ QPEL_FILTER_B v26, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_UNI_W_V_8
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr d21, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v26, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_UNI_W_V_8
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr d22, [x2]
+ QPEL_FILTER_B v26, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_UNI_W_V_8
+ subs w4, w4, #1
+ b.ne 1b
+2:
+ ret
+endfunc
+
+.macro QPEL_UNI_W_V_16
+ smull v24.4s, v26.4h, v30.4h
+ smull2 v25.4s, v26.8h, v30.8h
+ smull v26.4s, v27.4h, v30.4h
+ smull2 v27.4s, v27.8h, v30.8h
+ sqrshl v24.4s, v24.4s, v31.4s
+ sqrshl v25.4s, v25.4s, v31.4s
+ sqrshl v26.4s, v26.4s, v31.4s
+ sqrshl v27.4s, v27.4s, v31.4s
+ sqadd v24.4s, v24.4s, v29.4s
+ sqadd v25.4s, v25.4s, v29.4s
+ sqadd v26.4s, v26.4s, v29.4s
+ sqadd v27.4s, v27.4s, v29.4s
+ sqxtn v24.4h, v24.4s
+ sqxtn2 v24.8h, v25.4s
+ sqxtn v26.4h, v26.4s
+ sqxtn2 v26.8h, v27.4s
+ sqxtun v24.8b, v24.8h
+ sqxtun2 v24.16b, v26.8h
+ st1 {v24.16b}, [x0], x1
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
+ QPEL_UNI_W_V_HEADER
+ ldr q16, [x2]
+ ldr q17, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ldr q18, [x2]
+ ldr q19, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ldr q20, [x2]
+ ldr q21, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ldr q22, [x2]
+
+1: ldr q23, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v26, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_FILTER_B2 v27, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr q16, [x2]
+ QPEL_FILTER_B v26, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_FILTER_B2 v27, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr q17, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v26, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_FILTER_B2 v27, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr q18, [x2]
+ QPEL_FILTER_B v26, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_FILTER_B2 v27, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr q19, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v26, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_FILTER_B2 v27, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr q20, [x2]
+ QPEL_FILTER_B v26, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_FILTER_B2 v27, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr q21, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v26, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_FILTER_B2 v27, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr q22, [x2]
+ QPEL_FILTER_B v26, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_FILTER_B2 v27, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.ne 1b
+2:
+ ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
+ QPEL_UNI_W_V_HEADER
+ ldur w13, [sp, #16]
+ mov x14, x0
+ mov x15, x2
+ mov w11, w4
+
+3:
+ ldr q16, [x2]
+ ldr q17, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ldr q18, [x2]
+ ldr q19, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ldr q20, [x2]
+ ldr q21, [x2, x3]
+ add x2, x2, x3, lsl #1
+ ldr q22, [x2]
+
+
+1: ldr q23, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v26, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_FILTER_B2 v27, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr q16, [x2]
+ QPEL_FILTER_B v26, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_FILTER_B2 v27, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr q17, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v26, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_FILTER_B2 v27, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr q18, [x2]
+ QPEL_FILTER_B v26, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_FILTER_B2 v27, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr q19, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v26, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_FILTER_B2 v27, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr q20, [x2]
+ QPEL_FILTER_B v26, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_FILTER_B2 v27, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr q21, [x2, x3]
+ add x2, x2, x3, lsl #1
+ QPEL_FILTER_B v26, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_FILTER_B2 v27, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.eq 2f
+
+ ldr q22, [x2]
+ QPEL_FILTER_B v26, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_FILTER_B2 v27, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_UNI_W_V_16
+ subs w4, w4, #1
+ b.ne 1b
+2:
+ subs w13, w13, #16
+ add x14, x14, #16
+ add x15, x15, #16
+ mov x0, x14
+ mov x2, x15
+ mov w4, w11
+ b.hi 3b
+ ret
+endfunc
--
2.38.0.windows.1
-------------- next part --------------
From e01cd973488aa4d65e09a85b53ea0639477fc76e Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu at myais.com.cn>
Date: Fri, 5 May 2023 22:06:22 +0800
Subject: [PATCH 2/3] lavc/aarch64: new optimization for 8-bit
hevc_qpel_uni_w_h
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 15 +-
libavcodec/aarch64/hevcdsp_qpel_neon.S | 434 ++++++++++++++++++++++
2 files changed, 448 insertions(+), 1 deletion(-)
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 6b5341dd45..a7e62c7d15 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -145,6 +145,7 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
@@ -155,6 +156,12 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
int height, int denom, int wx, int ox,
intptr_t mx, intptr_t my, int width),);
+NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
+ const uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox,
+ intptr_t mx, intptr_t my, int width), _i8mm);
+
+
#define NEON8_FNASSIGN(member, v, h, fn, ext) \
member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \
member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext; \
@@ -174,9 +181,11 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
{
- if (!have_neon(av_get_cpu_flags())) return;
+ int cpu_flags = av_get_cpu_flags();
+ if (!have_neon(cpu_flags)) return;
if (bit_depth == 8) {
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_neon;
@@ -236,6 +245,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
+ if (have_i8mm(cpu_flags)) {
+ NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+ }
+
}
if (bit_depth == 10) {
c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 51df52e1ea..8e8b88c9ea 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -1192,3 +1192,437 @@ function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
b.hi 3b
ret
endfunc
+
+#if HAVE_I8MM
+// Shared prologue for the 8-bit qpel_uni_w_h kernels.
+// Args per prototype: x0 dst, x1 dststride, x2 src, x3 srcstride,
+// w4 height, w5 denom, w6 wx, w7 ox, [sp] mx.
+// Loads the 8 signed 8-bit filter taps selected by mx into both halves of
+// v28 (one 8-tap filter per 64-bit lane, for usdot), rewinds src by the
+// 3-pixel left context of the 8-tap filter, and broadcasts the weighted
+// prediction constants: v30 = wx, v31 = -(denom + 6) (negative shift count,
+// applied later via sqrshl as a rounding right shift), v29 = ox.
+.macro QPEL_UNI_W_H_HEADER
+ ldr x12, [sp] // mx
+ sub x2, x2, #3
+ movrel x9, qpel_filters
+ add x9, x9, x12, lsl #3 // 8 bytes per filter entry
+ ldr x11, [x9]
+ dup v28.2d, x11
+ mov w10, #-6
+ sub w10, w10, w5
+ dup v30.4s, w6 // wx
+ dup v31.4s, w10 // shift
+ dup v29.4s, w7 // ox
+.endm
+
+// Width 4, one row per iteration. ext builds the shifted windows and zip1
+// packs two windows per q-register; each usdot lane is a 4-tap partial sum,
+// so addp of the two accumulators yields the four full 8-tap results.
+function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_i8mm, export=1
+ QPEL_UNI_W_H_HEADER
+1:
+ ld1 {v0.16b}, [x2], x3
+ ext v1.16b, v0.16b, v0.16b, #1
+ ext v2.16b, v0.16b, v0.16b, #2
+ ext v3.16b, v0.16b, v0.16b, #3
+ zip1 v0.2d, v0.2d, v1.2d
+ zip1 v2.2d, v2.2d, v3.2d
+ movi v16.2d, #0
+ movi v17.2d, #0
+ usdot v16.4s, v0.16b, v28.16b
+ usdot v17.4s, v2.16b, v28.16b
+ addp v16.4s, v16.4s, v17.4s
+ mul v16.4s, v16.4s, v30.4s // * wx
+ sqrshl v16.4s, v16.4s, v31.4s // rounding >> (denom + 6)
+ sqadd v16.4s, v16.4s, v29.4s // + ox
+ sqxtn v16.4h, v16.4s
+ sqxtun v16.8b, v16.8h // narrow with unsigned saturation
+ str s16, [x0]
+ add x0, x0, x1
+ subs w4, w4, #1
+ b.hi 1b
+ ret
+endfunc
+
+// Width 6: four results in v16 (full 4s pipeline) plus two in v18
+// (2s half-width pipeline). The row is stored as 4 + 2 bytes, so x1 is
+// pre-decremented by the 4 bytes the first store post-increments.
+function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_i8mm, export=1
+ QPEL_UNI_W_H_HEADER
+ sub x1, x1, #4
+1:
+ ld1 {v0.16b}, [x2], x3
+ ext v1.16b, v0.16b, v0.16b, #1
+ ext v2.16b, v0.16b, v0.16b, #2
+ ext v3.16b, v0.16b, v0.16b, #3
+ ext v4.16b, v0.16b, v0.16b, #4
+ ext v5.16b, v0.16b, v0.16b, #5
+ zip1 v0.2d, v0.2d, v1.2d
+ zip1 v2.2d, v2.2d, v3.2d
+ zip1 v4.2d, v4.2d, v5.2d
+ movi v16.2d, #0
+ movi v17.2d, #0
+ movi v18.2d, #0
+ usdot v16.4s, v0.16b, v28.16b
+ usdot v17.4s, v2.16b, v28.16b
+ usdot v18.4s, v4.16b, v28.16b
+ addp v16.4s, v16.4s, v17.4s
+ addp v18.4s, v18.4s, v18.4s // pixels 4 and 5
+ mul v16.4s, v16.4s, v30.4s
+ mul v18.2s, v18.2s, v30.2s
+ sqrshl v16.4s, v16.4s, v31.4s
+ sqrshl v18.2s, v18.2s, v31.2s
+ sqadd v16.4s, v16.4s, v29.4s
+ sqadd v18.2s, v18.2s, v29.2s
+ sqxtn v16.4h, v16.4s
+ sqxtn2 v16.8h, v18.4s
+ sqxtun v16.8b, v16.8h
+ str s16, [x0], #4 // pixels 0-3
+ st1 {v16.h}[2], [x0], x1 // pixels 4-5
+ subs w4, w4, #1
+ b.hi 1b
+ ret
+endfunc
+
+
+// Filter + weight for four source q-registers. \d0 receives the weighted
+// 32-bit results for \s0/\s1, \d2 for \s2/\s3 (\d1/\d3 are scratch
+// accumulators consumed by the addp). v28/v29/v30/v31 as set up by
+// QPEL_UNI_W_H_HEADER.
+.macro QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
+ movi \d0\().2d, #0
+ movi \d1\().2d, #0
+ movi \d2\().2d, #0
+ movi \d3\().2d, #0
+ usdot \d0\().4s, \s0\().16b, v28.16b
+ usdot \d1\().4s, \s1\().16b, v28.16b
+ usdot \d2\().4s, \s2\().16b, v28.16b
+ usdot \d3\().4s, \s3\().16b, v28.16b
+ addp \d0\().4s, \d0\().4s, \d1\().4s
+ addp \d2\().4s, \d2\().4s, \d3\().4s
+ mul \d0\().4s, \d0\().4s, v30.4s // * wx
+ mul \d2\().4s, \d2\().4s, v30.4s
+ sqrshl \d0\().4s, \d0\().4s, v31.4s // rounding >> (denom + 6)
+ sqrshl \d2\().4s, \d2\().4s, v31.4s
+ sqadd \d0\().4s, \d0\().4s, v29.4s // + ox
+ sqadd \d2\().4s, \d2\().4s, v29.4s
+.endm
+
+// Same as QPEL_UNI_W_H_CALC but for a single pair of source registers:
+// \d0 gets the four weighted results, \d1 is scratch.
+.macro QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
+ movi \d0\().2d, #0
+ movi \d1\().2d, #0
+ usdot \d0\().4s, \s0\().16b, v28.16b
+ usdot \d1\().4s, \s1\().16b, v28.16b
+ addp \d0\().4s, \d0\().4s, \d1\().4s
+ mul \d0\().4s, \d0\().4s, v30.4s
+ sqrshl \d0\().4s, \d0\().4s, v31.4s
+ sqadd \d0\().4s, \d0\().4s, v29.4s
+.endm
+
+
+// Width 8: eight shifted windows packed pairwise via zip1, then one
+// QPEL_UNI_W_H_CALC produces all eight weighted results (v18/v20).
+function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_i8mm, export=1
+ QPEL_UNI_W_H_HEADER
+1:
+ ld1 {v16.16b, v17.16b}, [x2], x3
+ ext v1.16b, v16.16b, v17.16b, #1
+ ext v2.16b, v16.16b, v17.16b, #2
+ ext v3.16b, v16.16b, v17.16b, #3
+ ext v4.16b, v16.16b, v17.16b, #4
+ ext v5.16b, v16.16b, v17.16b, #5
+ ext v6.16b, v16.16b, v17.16b, #6
+ ext v7.16b, v16.16b, v17.16b, #7
+ zip1 v0.2d, v16.2d, v1.2d
+ zip1 v2.2d, v2.2d, v3.2d
+ zip1 v4.2d, v4.2d, v5.2d
+ zip1 v6.2d, v6.2d, v7.2d
+ QPEL_UNI_W_H_CALC v0, v2, v4, v6, v18, v19, v20, v21
+ sqxtn v18.4h, v18.4s
+ sqxtn2 v18.8h, v20.4s
+ sqxtun v18.8b, v18.8h
+ str d18, [x0]
+ add x0, x0, x1
+ subs w4, w4, #1
+ b.hi 1b
+ ret
+endfunc
+
+// Width 12: pixels 0-7 via the zip1/CALC path, pixels 8-11 via the zip2
+// high halves and the HALF macro. Row stored as 8 bytes at x0 plus
+// 4 bytes at x13 = x0 + 8.
+function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_i8mm, export=1
+ QPEL_UNI_W_H_HEADER
+ add x13, x0, #8
+1:
+ ld1 {v16.16b, v17.16b}, [x2], x3
+ ext v1.16b, v16.16b, v17.16b, #1
+ ext v2.16b, v16.16b, v17.16b, #2
+ ext v3.16b, v16.16b, v17.16b, #3
+ ext v4.16b, v16.16b, v17.16b, #4
+ ext v5.16b, v16.16b, v17.16b, #5
+ ext v6.16b, v16.16b, v17.16b, #6
+ ext v7.16b, v16.16b, v17.16b, #7
+ zip1 v18.2d, v16.2d, v1.2d
+ zip1 v19.2d, v2.2d, v3.2d
+ zip1 v20.2d, v4.2d, v5.2d
+ zip1 v21.2d, v6.2d, v7.2d
+ zip2 v22.2d, v16.2d, v1.2d
+ zip2 v23.2d, v2.2d, v3.2d
+ QPEL_UNI_W_H_CALC v18, v19, v20, v21, v0, v2, v4, v6
+ QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25
+ sqxtn v0.4h, v0.4s
+ sqxtn2 v0.8h, v4.4s
+ sqxtn v1.4h, v24.4s
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+
+ str d0, [x0] // pixels 0-7
+ str s1, [x13] // pixels 8-11
+ add x0, x0, x1
+ add x13, x13, x1
+ subs w4, w4, #1
+ b.hi 1b
+ ret
+endfunc
+
+// Width 16: the two CALC invocations produce results in the interleaved
+// order noted below; trn1/trn2 restore pixel order before narrowing.
+function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_i8mm, export=1
+ QPEL_UNI_W_H_HEADER
+1:
+ ld1 {v16.16b, v17.16b}, [x2], x3
+ ext v1.16b, v16.16b, v17.16b, #1
+ ext v2.16b, v16.16b, v17.16b, #2
+ ext v3.16b, v16.16b, v17.16b, #3
+ ext v4.16b, v16.16b, v17.16b, #4
+ ext v5.16b, v16.16b, v17.16b, #5
+ ext v6.16b, v16.16b, v17.16b, #6
+ ext v7.16b, v16.16b, v17.16b, #7
+ QPEL_UNI_W_H_CALC v16, v2, v1, v3, v18, v19, v20, v21 // v18: 0, 8, 2, 10 v20: 1, 9, 3, 11
+ QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25 // v22: 4, 12, 6, 14 v24: 5, 13, 7, 15
+ sqxtn v0.4h, v18.4s
+ sqxtn2 v0.8h, v22.4s
+ sqxtn v1.4h, v20.4s
+ sqxtn2 v1.8h, v24.4s
+ trn1 v2.8h, v0.8h, v1.8h // even/odd pixel interleave -> order
+ trn2 v3.8h, v0.8h, v1.8h
+ sqxtun v0.8b, v2.8h
+ sqxtun2 v0.16b, v3.8h
+ st1 {v0.16b}, [x0], x1
+ subs w4, w4, #1
+ b.hi 1b
+ ret
+endfunc
+
+// Width 24: pixels 0-15 exactly like the h16 kernel, pixels 16-23 like the
+// h8 kernel reusing v17 as its base. x1 is pre-decremented by the 16 bytes
+// the first store post-increments.
+function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_i8mm, export=1
+ QPEL_UNI_W_H_HEADER
+ sub x1, x1, #16
+1:
+ ld1 {v16.16b, v17.16b}, [x2], x3
+ ext v1.16b, v16.16b, v17.16b, #1
+ ext v2.16b, v16.16b, v17.16b, #2
+ ext v3.16b, v16.16b, v17.16b, #3
+ ext v4.16b, v16.16b, v17.16b, #4
+ ext v5.16b, v16.16b, v17.16b, #5
+ ext v6.16b, v16.16b, v17.16b, #6
+ ext v7.16b, v16.16b, v17.16b, #7
+ QPEL_UNI_W_H_CALC v16, v2, v1, v3, v18, v19, v20, v21
+ QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
+ sqxtn v18.4h, v18.4s
+ sqxtn2 v18.8h, v22.4s
+ sqxtn v19.4h, v20.4s
+ sqxtn2 v19.8h, v24.4s
+ trn1 v20.8h, v18.8h, v19.8h
+ trn2 v21.8h, v18.8h, v19.8h
+ sqxtun v26.8b, v20.8h
+ sqxtun2 v26.16b, v21.8h // 0-15
+ ext v1.16b, v17.16b, v17.16b, #1
+ ext v2.16b, v17.16b, v17.16b, #2
+ ext v3.16b, v17.16b, v17.16b, #3
+ ext v4.16b, v17.16b, v17.16b, #4
+ ext v5.16b, v17.16b, v17.16b, #5
+ ext v6.16b, v17.16b, v17.16b, #6
+ ext v7.16b, v17.16b, v17.16b, #7
+ zip1 v0.2d, v17.2d, v1.2d
+ zip1 v2.2d, v2.2d, v3.2d
+ zip1 v4.2d, v4.2d, v5.2d
+ zip1 v6.2d, v6.2d, v7.2d
+ QPEL_UNI_W_H_CALC v0, v2, v4, v6, v18, v19, v20, v21
+ sqxtn v18.4h, v18.4s
+ sqxtn2 v18.8h, v20.4s
+ sqxtun v27.8b, v18.8h // 16-23
+
+ st1 {v26.16b}, [x0], #16
+ st1 {v27.8b}, [x0], x1
+ subs w4, w4, #1
+ b.hi 1b
+ ret
+endfunc
+
+
+// Width 32: two back-to-back 16-pixel passes (h16 scheme) over the
+// v16/v17 and v17/v18 register pairs, one 32-byte store per row.
+function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_i8mm, export=1
+ QPEL_UNI_W_H_HEADER
+1:
+ ld1 {v16.16b, v17.16b, v18.16b}, [x2], x3
+ ext v1.16b, v16.16b, v17.16b, #1
+ ext v2.16b, v16.16b, v17.16b, #2
+ ext v3.16b, v16.16b, v17.16b, #3
+ ext v4.16b, v16.16b, v17.16b, #4
+ ext v5.16b, v16.16b, v17.16b, #5
+ ext v6.16b, v16.16b, v17.16b, #6
+ ext v7.16b, v16.16b, v17.16b, #7
+ QPEL_UNI_W_H_CALC v16, v2, v1, v3, v0, v19, v20, v21
+ QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
+ sqxtn v0.4h, v0.4s
+ sqxtn2 v0.8h, v22.4s
+ sqxtn v19.4h, v20.4s
+ sqxtn2 v19.8h, v24.4s
+ trn1 v20.8h, v0.8h, v19.8h
+ trn2 v21.8h, v0.8h, v19.8h
+ sqxtun v26.8b, v20.8h
+ sqxtun2 v26.16b, v21.8h // 0-15
+ ext v1.16b, v17.16b, v18.16b, #1
+ ext v2.16b, v17.16b, v18.16b, #2
+ ext v3.16b, v17.16b, v18.16b, #3
+ ext v4.16b, v17.16b, v18.16b, #4
+ ext v5.16b, v17.16b, v18.16b, #5
+ ext v6.16b, v17.16b, v18.16b, #6
+ ext v7.16b, v17.16b, v18.16b, #7
+ QPEL_UNI_W_H_CALC v17, v2, v1, v3, v0, v19, v20, v21
+ QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v23, v24, v25
+ sqxtn v0.4h, v0.4s
+ sqxtn2 v0.8h, v22.4s
+ sqxtn v19.4h, v20.4s
+ sqxtn2 v19.8h, v24.4s
+ trn1 v20.8h, v0.8h, v19.8h
+ trn2 v21.8h, v0.8h, v19.8h
+ sqxtun v27.8b, v20.8h
+ sqxtun2 v27.16b, v21.8h // 16-31
+ st1 {v26.16b, v27.16b}, [x0], x1
+ subs w4, w4, #1
+ b.hi 1b
+ ret
+endfunc
+
+// Width 48: three 16-pixel passes (h16 scheme) over consecutive source
+// register pairs; results gathered in v25/v26/v27 and stored together.
+function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_i8mm, export=1
+ QPEL_UNI_W_H_HEADER
+1:
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+ ext v1.16b, v16.16b, v17.16b, #1
+ ext v2.16b, v16.16b, v17.16b, #2
+ ext v3.16b, v16.16b, v17.16b, #3
+ ext v4.16b, v16.16b, v17.16b, #4
+ ext v5.16b, v16.16b, v17.16b, #5
+ ext v6.16b, v16.16b, v17.16b, #6
+ ext v7.16b, v16.16b, v17.16b, #7
+ QPEL_UNI_W_H_CALC v16, v2, v1, v3, v20, v24, v21, v0
+ QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
+ sqxtn v20.4h, v20.4s
+ sqxtn2 v20.8h, v22.4s
+ sqxtn v21.4h, v21.4s
+ sqxtn2 v21.8h, v23.4s
+ trn1 v22.8h, v20.8h, v21.8h
+ trn2 v23.8h, v20.8h, v21.8h
+ sqxtun v25.8b, v22.8h
+ sqxtun2 v25.16b, v23.8h // 0-15
+ ext v1.16b, v17.16b, v18.16b, #1
+ ext v2.16b, v17.16b, v18.16b, #2
+ ext v3.16b, v17.16b, v18.16b, #3
+ ext v4.16b, v17.16b, v18.16b, #4
+ ext v5.16b, v17.16b, v18.16b, #5
+ ext v6.16b, v17.16b, v18.16b, #6
+ ext v7.16b, v17.16b, v18.16b, #7
+ QPEL_UNI_W_H_CALC v17, v2, v1, v3, v20, v24, v21, v0
+ QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
+ sqxtn v20.4h, v20.4s
+ sqxtn2 v20.8h, v22.4s
+ sqxtn v21.4h, v21.4s
+ sqxtn2 v21.8h, v23.4s
+ trn1 v22.8h, v20.8h, v21.8h
+ trn2 v23.8h, v20.8h, v21.8h
+ sqxtun v26.8b, v22.8h
+ sqxtun2 v26.16b, v23.8h // 16-31
+ ext v1.16b, v18.16b, v19.16b, #1
+ ext v2.16b, v18.16b, v19.16b, #2
+ ext v3.16b, v18.16b, v19.16b, #3
+ ext v4.16b, v18.16b, v19.16b, #4
+ ext v5.16b, v18.16b, v19.16b, #5
+ ext v6.16b, v18.16b, v19.16b, #6
+ ext v7.16b, v18.16b, v19.16b, #7
+ QPEL_UNI_W_H_CALC v18, v2, v1, v3, v20, v24, v21, v0
+ QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
+ sqxtn v20.4h, v20.4s
+ sqxtn2 v20.8h, v22.4s
+ sqxtn v21.4h, v21.4s
+ sqxtn2 v21.8h, v23.4s
+ trn1 v22.8h, v20.8h, v21.8h
+ trn2 v23.8h, v20.8h, v21.8h
+ sqxtun v27.8b, v22.8h
+ sqxtun2 v27.16b, v23.8h // 32-47
+ st1 {v25.16b, v26.16b, v27.16b}, [x0], x1
+ subs w4, w4, #1
+ b.hi 1b
+ ret
+endfunc
+
+
+
+// Width 64: four 16-pixel passes. The source row is read as 64 bytes plus
+// a trailing 16-byte load (v0) for the last block's right context, hence
+// the stride adjustment; the row is stored as one 64-byte st1.
+function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_i8mm, export=1
+ QPEL_UNI_W_H_HEADER
+ sub x3, x3, #64 // compensate for the #64 post-increment below
+1:
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
+ ext v1.16b, v16.16b, v17.16b, #1
+ ext v2.16b, v16.16b, v17.16b, #2
+ ext v3.16b, v16.16b, v17.16b, #3
+ ext v4.16b, v16.16b, v17.16b, #4
+ ext v5.16b, v16.16b, v17.16b, #5
+ ext v6.16b, v16.16b, v17.16b, #6
+ ext v7.16b, v16.16b, v17.16b, #7
+ QPEL_UNI_W_H_CALC v16, v2, v1, v3, v20, v24, v21, v0
+ QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
+ sqxtn v20.4h, v20.4s
+ sqxtn2 v20.8h, v22.4s
+ sqxtn v21.4h, v21.4s
+ sqxtn2 v21.8h, v23.4s
+ trn1 v22.8h, v20.8h, v21.8h
+ trn2 v23.8h, v20.8h, v21.8h
+ sqxtun v16.8b, v22.8h
+ sqxtun2 v16.16b, v23.8h // 0-15
+ ext v1.16b, v17.16b, v18.16b, #1
+ ext v2.16b, v17.16b, v18.16b, #2
+ ext v3.16b, v17.16b, v18.16b, #3
+ ext v4.16b, v17.16b, v18.16b, #4
+ ext v5.16b, v17.16b, v18.16b, #5
+ ext v6.16b, v17.16b, v18.16b, #6
+ ext v7.16b, v17.16b, v18.16b, #7
+ QPEL_UNI_W_H_CALC v17, v2, v1, v3, v20, v24, v21, v0
+ QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
+ sqxtn v20.4h, v20.4s
+ sqxtn2 v20.8h, v22.4s
+ sqxtn v21.4h, v21.4s
+ sqxtn2 v21.8h, v23.4s
+ trn1 v22.8h, v20.8h, v21.8h
+ trn2 v23.8h, v20.8h, v21.8h
+ sqxtun v17.8b, v22.8h
+ sqxtun2 v17.16b, v23.8h // 16-31
+ ext v1.16b, v18.16b, v19.16b, #1
+ ext v2.16b, v18.16b, v19.16b, #2
+ ext v3.16b, v18.16b, v19.16b, #3
+ ext v4.16b, v18.16b, v19.16b, #4
+ ext v5.16b, v18.16b, v19.16b, #5
+ ext v6.16b, v18.16b, v19.16b, #6
+ ext v7.16b, v18.16b, v19.16b, #7
+ QPEL_UNI_W_H_CALC v18, v2, v1, v3, v20, v24, v21, v0
+ QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
+ ld1 {v0.16b}, [x2], x3 // right context for the last 16 pixels
+ sqxtn v20.4h, v20.4s
+ sqxtn2 v20.8h, v22.4s
+ sqxtn v21.4h, v21.4s
+ sqxtn2 v21.8h, v23.4s
+ trn1 v22.8h, v20.8h, v21.8h
+ trn2 v23.8h, v20.8h, v21.8h
+ sqxtun v18.8b, v22.8h
+ sqxtun2 v18.16b, v23.8h // 32-47
+ ext v1.16b, v19.16b, v0.16b, #1
+ ext v2.16b, v19.16b, v0.16b, #2
+ ext v3.16b, v19.16b, v0.16b, #3
+ ext v4.16b, v19.16b, v0.16b, #4
+ ext v5.16b, v19.16b, v0.16b, #5
+ ext v6.16b, v19.16b, v0.16b, #6
+ ext v7.16b, v19.16b, v0.16b, #7
+ QPEL_UNI_W_H_CALC v19, v2, v1, v3, v20, v24, v21, v0
+ QPEL_UNI_W_H_CALC v4, v6, v5, v7, v22, v24, v23, v0
+ sqxtn v20.4h, v20.4s
+ sqxtn2 v20.8h, v22.4s
+ sqxtn v21.4h, v21.4s
+ sqxtn2 v21.8h, v23.4s
+ trn1 v22.8h, v20.8h, v21.8h
+ trn2 v23.8h, v20.8h, v21.8h
+ sqxtun v19.8b, v22.8h
+ sqxtun2 v19.16b, v23.8h // 48-63
+
+ st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
+ subs w4, w4, #1
+ b.hi 1b
+ ret
+endfunc
+
+#endif // HAVE_I8MM
--
2.38.0.windows.1
-------------- next part --------------
From 7e0ebc6fb41816a01918cdc6521300140f2cb95d Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu at myais.com.cn>
Date: Sun, 28 May 2023 09:56:51 +0800
Subject: [PATCH 3/3] lavc/aarch64: new optimization for 8-bit hevc_qpel_h and
 hevc_qpel_uni_w_hv
---
libavcodec/aarch64/hevcdsp_init_aarch64.c | 24 +
libavcodec/aarch64/hevcdsp_qpel_neon.S | 1079 +++++++++++++++++++++
2 files changed, 1103 insertions(+)
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index a7e62c7d15..483a9d5253 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -145,6 +145,13 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+#define NEON8_FNPROTO_PARTIAL_5(fn, args, ext) \
+ void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+ void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
@@ -156,11 +163,20 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst, ptrdiff_t _dststride,
int height, int denom, int wx, int ox,
intptr_t mx, intptr_t my, int width),);
+
+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+ const uint8_t *_src, ptrdiff_t _srcstride,
+ int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
const uint8_t *_src, ptrdiff_t _srcstride,
int height, int denom, int wx, int ox,
intptr_t mx, intptr_t my, int width), _i8mm);
+NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride,
+ const uint8_t *_src, ptrdiff_t _srcstride,
+ int height, int denom, int wx, int ox,
+ intptr_t mx, intptr_t my, int width), _i8mm);
#define NEON8_FNASSIGN(member, v, h, fn, ext) \
member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \
@@ -181,6 +197,12 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst, ptrdiff_t _dststride,
member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+#define NEON8_FNASSIGN_PARTIAL_5(member, v, h, fn, ext) \
+ member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \
+ member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext; \
+ member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+ member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+ member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
{
@@ -247,6 +269,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
if (have_i8mm(cpu_flags)) {
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+ NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
+ NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
}
}
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 8e8b88c9ea..ed659cfe9b 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -1625,4 +1625,1083 @@ function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_i8mm, export=1
ret
endfunc
+// Shared prologue for the 8-bit qpel_h kernels (int16 intermediate output).
+// Args per prototype: x0 dst (int16_t), x1 src, x2 srcstride, w3 height,
+// x4 mx. Loads the 8 filter taps selected by mx into both halves of v31
+// and rewinds src by the filter's 3-pixel left context. No weighting here;
+// results stay in 16-bit intermediate precision.
+.macro QPEL_H_HEADER
+ movrel x9, qpel_filters
+ add x9, x9, x4, lsl #3
+ ldr x11, [x9]
+ dup v31.2d, x11
+ sub x1, x1, #3
+.endm
+
+// qpel_h, width 4: same windowing as qpel_uni_w_h4, but the raw filter
+// sums are only narrowed to int16 and stored with the fixed
+// MAX_PB_SIZE * 2 destination row stride.
+function ff_hevc_put_hevc_qpel_h4_8_neon_i8mm, export=1
+ QPEL_H_HEADER
+ mov x10, #MAX_PB_SIZE * 2
+1:
+ ld1 {v0.16b}, [x1], x2
+ ext v1.16b, v0.16b, v0.16b, #1
+ ext v2.16b, v0.16b, v0.16b, #2
+ ext v3.16b, v0.16b, v0.16b, #3
+ zip1 v0.2d, v0.2d, v1.2d
+ zip1 v2.2d, v2.2d, v3.2d
+ movi v16.2d, #0
+ movi v17.2d, #0
+ usdot v16.4s, v0.16b, v31.16b
+ usdot v17.4s, v2.16b, v31.16b
+ addp v16.4s, v16.4s, v17.4s
+ sqxtn v16.4h, v16.4s
+ str d16, [x0]
+ add x0, x0, x10
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+// qpel_h, width 6: 4 results in v16 plus 2 in v18; the 12-byte row is
+// written as 8 bytes at x0 and 4 bytes at x15 = x0 + 8.
+function ff_hevc_put_hevc_qpel_h6_8_neon_i8mm, export=1
+ QPEL_H_HEADER
+ mov x10, #MAX_PB_SIZE * 2
+ add x15, x0, #8
+1:
+ ld1 {v0.16b}, [x1], x2
+ ext v1.16b, v0.16b, v0.16b, #1
+ ext v2.16b, v0.16b, v0.16b, #2
+ ext v3.16b, v0.16b, v0.16b, #3
+ ext v4.16b, v0.16b, v0.16b, #4
+ ext v5.16b, v0.16b, v0.16b, #5
+ zip1 v0.2d, v0.2d, v1.2d
+ zip1 v2.2d, v2.2d, v3.2d
+ zip1 v4.2d, v4.2d, v5.2d
+ movi v16.2d, #0
+ movi v17.2d, #0
+ movi v18.2d, #0
+ usdot v16.4s, v0.16b, v31.16b
+ usdot v17.4s, v2.16b, v31.16b
+ usdot v18.4s, v4.16b, v31.16b
+ addp v16.4s, v16.4s, v17.4s
+ addp v18.4s, v18.4s, v18.4s
+ sqxtn v16.4h, v16.4s
+ sqxtn v18.4h, v18.4s
+ str d16, [x0] // pixels 0-3
+ str s18, [x15] // pixels 4-5
+ add x0, x0, x10
+ add x15, x15, x10
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+// qpel_h, width 8: eight windows packed pairwise, four usdot accumulators,
+// narrowed to one q-register of int16 per row.
+function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1
+ QPEL_H_HEADER
+ mov x10, #MAX_PB_SIZE * 2
+1:
+ ld1 {v0.16b}, [x1], x2
+ ext v1.16b, v0.16b, v0.16b, #1
+ ext v2.16b, v0.16b, v0.16b, #2
+ ext v3.16b, v0.16b, v0.16b, #3
+ ext v4.16b, v0.16b, v0.16b, #4
+ ext v5.16b, v0.16b, v0.16b, #5
+ ext v6.16b, v0.16b, v0.16b, #6
+ ext v7.16b, v0.16b, v0.16b, #7
+ zip1 v0.2d, v0.2d, v1.2d
+ zip1 v2.2d, v2.2d, v3.2d
+ zip1 v4.2d, v4.2d, v5.2d
+ zip1 v6.2d, v6.2d, v7.2d
+ movi v16.2d, #0
+ movi v17.2d, #0
+ movi v18.2d, #0
+ movi v19.2d, #0
+ usdot v16.4s, v0.16b, v31.16b
+ usdot v17.4s, v2.16b, v31.16b
+ usdot v18.4s, v4.16b, v31.16b
+ usdot v19.4s, v6.16b, v31.16b
+ addp v16.4s, v16.4s, v17.4s
+ addp v18.4s, v18.4s, v19.4s
+ sqxtn v16.4h, v16.4s
+ sqxtn2 v16.8h, v18.4s
+ str q16, [x0]
+ add x0, x0, x10
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+// Four usdot accumulations against the filter in v31; each lane of the
+// four destinations is a 4-tap partial sum (callers addp to combine).
+.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
+ movi \d0\().2d, #0
+ movi \d1\().2d, #0
+ movi \d2\().2d, #0
+ movi \d3\().2d, #0
+ usdot \d0\().4s, \s0\().16b, v31.16b
+ usdot \d1\().4s, \s1\().16b, v31.16b
+ usdot \d2\().4s, \s2\().16b, v31.16b
+ usdot \d3\().4s, \s3\().16b, v31.16b
+.endm
+
+// qpel_h, width 12: pixels 0-7 via QPEL_H_CALC + addp/trn reordering,
+// pixels 8-11 from the zip1-packed upper windows; stored as 16 + 8 bytes
+// (x15 = x0 + 16).
+function ff_hevc_put_hevc_qpel_h12_8_neon_i8mm, export=1
+ QPEL_H_HEADER
+ mov x10, #MAX_PB_SIZE * 2
+ add x15, x0, #16
+1:
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ext v1.16b, v16.16b, v17.16b, #1
+ ext v2.16b, v16.16b, v17.16b, #2
+ ext v3.16b, v16.16b, v17.16b, #3
+ ext v4.16b, v16.16b, v17.16b, #4
+ ext v5.16b, v16.16b, v17.16b, #5
+ ext v6.16b, v16.16b, v17.16b, #6
+ ext v7.16b, v16.16b, v17.16b, #7
+ zip1 v18.2d, v4.2d, v5.2d
+ zip1 v19.2d, v6.2d, v7.2d
+ QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
+ addp v20.4s, v20.4s, v22.4s
+ addp v21.4s, v21.4s, v23.4s
+ movi v24.2d, #0
+ movi v25.2d, #0
+ usdot v24.4s, v18.16b, v31.16b
+ usdot v25.4s, v19.16b, v31.16b
+ addp v24.4s, v24.4s, v25.4s
+ trn1 v26.4s, v20.4s, v21.4s // de-interleave even/odd pixels
+ trn2 v27.4s, v20.4s, v21.4s
+ sqxtn v26.4h, v26.4s
+ sqxtn v27.4h, v27.4s
+ sqxtn2 v26.8h, v24.4s
+
+ str q26, [x0] // pixels 0-7
+ str d27, [x15] // pixels 8-11
+ add x0, x0, x10
+ add x15, x15, x10
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+// qpel_h, width 16: two QPEL_H_CALC groups, addp to full 8-tap sums,
+// trn1/trn2 to restore pixel order, 32-byte int16 row store.
+// Also used as the horizontal pass of qpel_uni_w_hv16.
+function ff_hevc_put_hevc_qpel_h16_8_neon_i8mm, export=1
+ QPEL_H_HEADER
+ mov x10, #MAX_PB_SIZE * 2
+1:
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ext v1.16b, v16.16b, v17.16b, #1
+ ext v2.16b, v16.16b, v17.16b, #2
+ ext v3.16b, v16.16b, v17.16b, #3
+ ext v4.16b, v16.16b, v17.16b, #4
+ ext v5.16b, v16.16b, v17.16b, #5
+ ext v6.16b, v16.16b, v17.16b, #6
+ ext v7.16b, v16.16b, v17.16b, #7
+
+ QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
+ QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
+
+ addp v20.4s, v20.4s, v22.4s
+ addp v21.4s, v21.4s, v23.4s
+ addp v24.4s, v24.4s, v26.4s
+ addp v25.4s, v25.4s, v27.4s
+
+ trn1 v22.4s, v20.4s, v21.4s
+ trn2 v23.4s, v20.4s, v21.4s
+ trn1 v26.4s, v24.4s, v25.4s
+ trn2 v27.4s, v24.4s, v25.4s
+
+ sqxtn v18.4h, v22.4s
+ sqxtn2 v18.8h, v26.4s
+ sqxtn v19.4h, v23.4s
+ sqxtn2 v19.8h, v27.4s
+
+ stp q18, q19, [x0]
+ add x0, x0, x10
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+// qpel_h, width 24: a 16-pixel pass (as in qpel_h16) plus an 8-pixel pass
+// based at v17; the second store goes through x15 = x0 + 32.
+function ff_hevc_put_hevc_qpel_h24_8_neon_i8mm, export=1
+ QPEL_H_HEADER
+ mov x10, #MAX_PB_SIZE * 2
+ add x15, x0, #32
+1:
+ ld1 {v16.16b, v17.16b}, [x1], x2
+ ext v1.16b, v16.16b, v17.16b, #1
+ ext v2.16b, v16.16b, v17.16b, #2
+ ext v3.16b, v16.16b, v17.16b, #3
+ ext v4.16b, v16.16b, v17.16b, #4
+ ext v5.16b, v16.16b, v17.16b, #5
+ ext v6.16b, v16.16b, v17.16b, #6
+ ext v7.16b, v16.16b, v17.16b, #7
+ QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
+ QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
+ addp v20.4s, v20.4s, v22.4s
+ addp v21.4s, v21.4s, v23.4s
+ addp v24.4s, v24.4s, v26.4s
+ addp v25.4s, v25.4s, v27.4s
+ trn1 v22.4s, v20.4s, v21.4s
+ trn2 v23.4s, v20.4s, v21.4s
+ trn1 v26.4s, v24.4s, v25.4s
+ trn2 v27.4s, v24.4s, v25.4s
+ sqxtn v18.4h, v22.4s
+ sqxtn2 v18.8h, v26.4s
+ sqxtn v19.4h, v23.4s
+ sqxtn2 v19.8h, v27.4s
+ stp q18, q19, [x0] // pixels 0-15
+ add x0, x0, x10
+ ext v1.16b, v17.16b, v17.16b, #1
+ ext v2.16b, v17.16b, v17.16b, #2
+ ext v3.16b, v17.16b, v17.16b, #3
+ ext v4.16b, v17.16b, v17.16b, #4
+ ext v5.16b, v17.16b, v17.16b, #5
+ ext v6.16b, v17.16b, v17.16b, #6
+ ext v7.16b, v17.16b, v17.16b, #7
+ zip1 v0.2d, v17.2d, v1.2d
+ zip1 v2.2d, v2.2d, v3.2d
+ zip1 v4.2d, v4.2d, v5.2d
+ zip1 v6.2d, v6.2d, v7.2d
+ QPEL_H_CALC v0, v2, v4, v6, v20, v21, v22, v23
+ addp v20.4s, v20.4s, v21.4s
+ addp v22.4s, v22.4s, v23.4s
+ sqxtn v20.4h, v20.4s
+ sqxtn2 v20.8h, v22.4s
+ str q20, [x15] // pixels 16-23
+ add x15, x15, x10
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+// qpel_h, width 32: two 16-pixel passes per row (v16/v17 then v17/v18),
+// second half stored through x15 = x0 + 32. Also used as the horizontal
+// pass of qpel_uni_w_hv32.
+function ff_hevc_put_hevc_qpel_h32_8_neon_i8mm, export=1
+ QPEL_H_HEADER
+ mov x10, #MAX_PB_SIZE * 2
+ add x15, x0, #32
+1:
+ ld1 {v16.16b, v17.16b, v18.16b}, [x1], x2
+ ext v1.16b, v16.16b, v17.16b, #1
+ ext v2.16b, v16.16b, v17.16b, #2
+ ext v3.16b, v16.16b, v17.16b, #3
+ ext v4.16b, v16.16b, v17.16b, #4
+ ext v5.16b, v16.16b, v17.16b, #5
+ ext v6.16b, v16.16b, v17.16b, #6
+ ext v7.16b, v16.16b, v17.16b, #7
+ QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
+ QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
+ addp v20.4s, v20.4s, v22.4s
+ addp v21.4s, v21.4s, v23.4s
+ addp v24.4s, v24.4s, v26.4s
+ addp v25.4s, v25.4s, v27.4s
+ trn1 v22.4s, v20.4s, v21.4s
+ trn2 v23.4s, v20.4s, v21.4s
+ trn1 v26.4s, v24.4s, v25.4s
+ trn2 v27.4s, v24.4s, v25.4s
+ sqxtn v20.4h, v22.4s
+ sqxtn2 v20.8h, v26.4s
+ sqxtn v21.4h, v23.4s
+ sqxtn2 v21.8h, v27.4s
+ stp q20, q21, [x0] // pixels 0-15
+ add x0, x0, x10
+ ext v1.16b, v17.16b, v18.16b, #1
+ ext v2.16b, v17.16b, v18.16b, #2
+ ext v3.16b, v17.16b, v18.16b, #3
+ ext v4.16b, v17.16b, v18.16b, #4
+ ext v5.16b, v17.16b, v18.16b, #5
+ ext v6.16b, v17.16b, v18.16b, #6
+ ext v7.16b, v17.16b, v18.16b, #7
+ QPEL_H_CALC v17, v1, v2, v3, v20, v21, v22, v23
+ QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
+ addp v20.4s, v20.4s, v22.4s
+ addp v21.4s, v21.4s, v23.4s
+ addp v24.4s, v24.4s, v26.4s
+ addp v25.4s, v25.4s, v27.4s
+ trn1 v22.4s, v20.4s, v21.4s
+ trn2 v23.4s, v20.4s, v21.4s
+ trn1 v26.4s, v24.4s, v25.4s
+ trn2 v27.4s, v24.4s, v25.4s
+ sqxtn v20.4h, v22.4s
+ sqxtn2 v20.8h, v26.4s
+ sqxtn v21.4h, v23.4s
+ sqxtn2 v21.8h, v27.4s
+ stp q20, q21, [x15] // pixels 16-31
+ add x15, x15, x10
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+// qpel_h, width 48: three 16-pixel passes; the first two stores
+// post-increment x0 by 32 each, so x10 is the row stride minus 64.
+function ff_hevc_put_hevc_qpel_h48_8_neon_i8mm, export=1
+ QPEL_H_HEADER
+ mov x10, #MAX_PB_SIZE * 2 - 64
+1:
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
+ ext v1.16b, v16.16b, v17.16b, #1
+ ext v2.16b, v16.16b, v17.16b, #2
+ ext v3.16b, v16.16b, v17.16b, #3
+ ext v4.16b, v16.16b, v17.16b, #4
+ ext v5.16b, v16.16b, v17.16b, #5
+ ext v6.16b, v16.16b, v17.16b, #6
+ ext v7.16b, v16.16b, v17.16b, #7
+ QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
+ QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
+ addp v20.4s, v20.4s, v22.4s
+ addp v21.4s, v21.4s, v23.4s
+ addp v24.4s, v24.4s, v26.4s
+ addp v25.4s, v25.4s, v27.4s
+ trn1 v22.4s, v20.4s, v21.4s
+ trn2 v23.4s, v20.4s, v21.4s
+ trn1 v26.4s, v24.4s, v25.4s
+ trn2 v27.4s, v24.4s, v25.4s
+ sqxtn v20.4h, v22.4s
+ sqxtn2 v20.8h, v26.4s
+ sqxtn v21.4h, v23.4s
+ sqxtn2 v21.8h, v27.4s
+ stp q20, q21, [x0], #32 // pixels 0-15
+
+ ext v1.16b, v17.16b, v18.16b, #1
+ ext v2.16b, v17.16b, v18.16b, #2
+ ext v3.16b, v17.16b, v18.16b, #3
+ ext v4.16b, v17.16b, v18.16b, #4
+ ext v5.16b, v17.16b, v18.16b, #5
+ ext v6.16b, v17.16b, v18.16b, #6
+ ext v7.16b, v17.16b, v18.16b, #7
+ QPEL_H_CALC v17, v1, v2, v3, v20, v21, v22, v23
+ QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
+ addp v20.4s, v20.4s, v22.4s
+ addp v21.4s, v21.4s, v23.4s
+ addp v24.4s, v24.4s, v26.4s
+ addp v25.4s, v25.4s, v27.4s
+ trn1 v22.4s, v20.4s, v21.4s
+ trn2 v23.4s, v20.4s, v21.4s
+ trn1 v26.4s, v24.4s, v25.4s
+ trn2 v27.4s, v24.4s, v25.4s
+ sqxtn v20.4h, v22.4s
+ sqxtn2 v20.8h, v26.4s
+ sqxtn v21.4h, v23.4s
+ sqxtn2 v21.8h, v27.4s
+ stp q20, q21, [x0], #32 // pixels 16-31
+ ext v1.16b, v18.16b, v19.16b, #1
+ ext v2.16b, v18.16b, v19.16b, #2
+ ext v3.16b, v18.16b, v19.16b, #3
+ ext v4.16b, v18.16b, v19.16b, #4
+ ext v5.16b, v18.16b, v19.16b, #5
+ ext v6.16b, v18.16b, v19.16b, #6
+ ext v7.16b, v18.16b, v19.16b, #7
+ QPEL_H_CALC v18, v1, v2, v3, v20, v21, v22, v23
+ QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
+ addp v20.4s, v20.4s, v22.4s
+ addp v21.4s, v21.4s, v23.4s
+ addp v24.4s, v24.4s, v26.4s
+ addp v25.4s, v25.4s, v27.4s
+ trn1 v22.4s, v20.4s, v21.4s
+ trn2 v23.4s, v20.4s, v21.4s
+ trn1 v26.4s, v24.4s, v25.4s
+ trn2 v27.4s, v24.4s, v25.4s
+ sqxtn v20.4h, v22.4s
+ sqxtn2 v20.8h, v26.4s
+ sqxtn v21.4h, v23.4s
+ sqxtn2 v21.8h, v27.4s
+ stp q20, q21, [x0] // pixels 32-47
+ add x0, x0, x10
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+// qpel_h, width 64: four 16-pixel passes. The source row is 64 bytes plus
+// an 8-byte tail load (v28) for the last block's right context; x2 is
+// reduced by 64 to compensate. All four stores post-increment x0 by 32,
+// which totals exactly MAX_PB_SIZE * 2, so no separate stride add.
+// Also used as the horizontal pass of qpel_uni_w_hv64.
+function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, export=1
+ QPEL_H_HEADER
+ sub x2, x2, #64
+1:
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
+ ext v1.16b, v16.16b, v17.16b, #1
+ ext v2.16b, v16.16b, v17.16b, #2
+ ext v3.16b, v16.16b, v17.16b, #3
+ ext v4.16b, v16.16b, v17.16b, #4
+ ext v5.16b, v16.16b, v17.16b, #5
+ ext v6.16b, v16.16b, v17.16b, #6
+ ext v7.16b, v16.16b, v17.16b, #7
+ QPEL_H_CALC v16, v1, v2, v3, v20, v21, v22, v23
+ QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
+ addp v20.4s, v20.4s, v22.4s
+ addp v21.4s, v21.4s, v23.4s
+ addp v24.4s, v24.4s, v26.4s
+ addp v25.4s, v25.4s, v27.4s
+ trn1 v22.4s, v20.4s, v21.4s
+ trn2 v23.4s, v20.4s, v21.4s
+ trn1 v26.4s, v24.4s, v25.4s
+ trn2 v27.4s, v24.4s, v25.4s
+ sqxtn v20.4h, v22.4s
+ sqxtn2 v20.8h, v26.4s
+ sqxtn v21.4h, v23.4s
+ sqxtn2 v21.8h, v27.4s
+ stp q20, q21, [x0], #32 // pixels 0-15
+
+ ext v1.16b, v17.16b, v18.16b, #1
+ ext v2.16b, v17.16b, v18.16b, #2
+ ext v3.16b, v17.16b, v18.16b, #3
+ ext v4.16b, v17.16b, v18.16b, #4
+ ext v5.16b, v17.16b, v18.16b, #5
+ ext v6.16b, v17.16b, v18.16b, #6
+ ext v7.16b, v17.16b, v18.16b, #7
+ QPEL_H_CALC v17, v1, v2, v3, v20, v21, v22, v23
+ QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
+ addp v20.4s, v20.4s, v22.4s
+ addp v21.4s, v21.4s, v23.4s
+ addp v24.4s, v24.4s, v26.4s
+ addp v25.4s, v25.4s, v27.4s
+ trn1 v22.4s, v20.4s, v21.4s
+ trn2 v23.4s, v20.4s, v21.4s
+ trn1 v26.4s, v24.4s, v25.4s
+ trn2 v27.4s, v24.4s, v25.4s
+ sqxtn v20.4h, v22.4s
+ sqxtn2 v20.8h, v26.4s
+ sqxtn v21.4h, v23.4s
+ sqxtn2 v21.8h, v27.4s
+ stp q20, q21, [x0], #32 // pixels 16-31
+ ext v1.16b, v18.16b, v19.16b, #1
+ ext v2.16b, v18.16b, v19.16b, #2
+ ext v3.16b, v18.16b, v19.16b, #3
+ ext v4.16b, v18.16b, v19.16b, #4
+ ext v5.16b, v18.16b, v19.16b, #5
+ ext v6.16b, v18.16b, v19.16b, #6
+ ext v7.16b, v18.16b, v19.16b, #7
+ QPEL_H_CALC v18, v1, v2, v3, v20, v21, v22, v23
+ QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
+ addp v20.4s, v20.4s, v22.4s
+ addp v21.4s, v21.4s, v23.4s
+ addp v24.4s, v24.4s, v26.4s
+ addp v25.4s, v25.4s, v27.4s
+ trn1 v22.4s, v20.4s, v21.4s
+ trn2 v23.4s, v20.4s, v21.4s
+ trn1 v26.4s, v24.4s, v25.4s
+ trn2 v27.4s, v24.4s, v25.4s
+ sqxtn v20.4h, v22.4s
+ sqxtn2 v20.8h, v26.4s
+ sqxtn v21.4h, v23.4s
+ sqxtn2 v21.8h, v27.4s
+ stp q20, q21, [x0], #32 // pixels 32-47
+ ld1 {v28.8b}, [x1], x2 // right context for the last block
+ ext v1.16b, v19.16b, v28.16b, #1
+ ext v2.16b, v19.16b, v28.16b, #2
+ ext v3.16b, v19.16b, v28.16b, #3
+ ext v4.16b, v19.16b, v28.16b, #4
+ ext v5.16b, v19.16b, v28.16b, #5
+ ext v6.16b, v19.16b, v28.16b, #6
+ ext v7.16b, v19.16b, v28.16b, #7
+ QPEL_H_CALC v19, v1, v2, v3, v20, v21, v22, v23
+ QPEL_H_CALC v4, v5, v6, v7, v24, v25, v26, v27
+ addp v20.4s, v20.4s, v22.4s
+ addp v21.4s, v21.4s, v23.4s
+ addp v24.4s, v24.4s, v26.4s
+ addp v25.4s, v25.4s, v27.4s
+ trn1 v22.4s, v20.4s, v21.4s
+ trn2 v23.4s, v20.4s, v21.4s
+ trn1 v26.4s, v24.4s, v25.4s
+ trn2 v27.4s, v24.4s, v25.4s
+ sqxtn v20.4h, v22.4s
+ sqxtn2 v20.8h, v26.4s
+ sqxtn v21.4h, v23.4s
+ sqxtn2 v21.8h, v27.4s
+ stp q20, q21, [x0], #32 // pixels 48-63
+ subs w3, w3, #1
+ b.ne 1b
+ ret
+endfunc
+
+// Prologue shared by all qpel_uni_w_hv sizes. Saves the callee-saved
+// registers used below — NOTE: the store layout must stay an exact mirror
+// of the loads in QPEL_UNI_W_HV_END or checkasm reports
+// "failed to preserve register". Carves a scratch buffer off the stack
+// (9088 = MAX_PB_SIZE * 2 * (64 + 7) bytes, int16 rows), then calls the
+// matching qpel_h kernel to run the horizontal pass over height + 7 rows
+// starting 3 rows above src. After the call, loads the vertical filter
+// selected by my and widens it to .8h in v0, and broadcasts the weighting
+// constants. On exit: x20/x21 = dst/dststride, w22 = height,
+// x10 = MAX_PB_SIZE * 2 (scratch row stride), v28 = wx, v29 = ox,
+// v30 = -(denom + 6), sp -> scratch buffer.
+.macro QPEL_UNI_W_HV_HEADER width
+ ldp x14, x15, [sp] // mx, my
+ ldr w13, [sp, #16] // width
+ stp x19, x30, [sp, #-80]!
+ stp x20, x21, [sp, #16]
+ stp x22, x23, [sp, #32]
+ stp x24, x25, [sp, #48]
+ stp x26, x27, [sp, #64]
+ mov x19, sp // saved for epilogue sp restore
+ mov x11, #9088
+ sub sp, sp, x11
+ mov x20, x0
+ mov x21, x1
+ mov x0, sp // horizontal pass writes to scratch
+ sub x1, x2, x3, lsl #1
+ sub x1, x1, x3 // src -= 3 * srcstride
+ mov x2, x3
+ add w3, w4, #7 // height + 7 rows for the 8-tap filter
+ mov w22, w4 // height
+ mov x4, x14 // mx
+ mov x23, x15 // my
+ mov w24, w6 // wx
+ mov w25, w7 // ox
+ mov w26, #-6
+ sub w26, w26, w5 // -shift
+ mov w27, w13 // width
+ bl X(ff_hevc_put_hevc_qpel_h\width\()_8_neon_i8mm)
+ movrel x9, qpel_filters
+ add x9, x9, x23, lsl #3
+ ld1 {v0.8b}, [x9]
+ sxtl v0.8h, v0.8b // vertical taps widened to int16
+ mov x10, #(MAX_PB_SIZE * 2)
+ dup v28.4s, w24
+ dup v29.4s, w25
+ dup v30.4s, w26
+.endm
+
+// Epilogue: drop the scratch buffer (sp was saved in x19) and restore the
+// callee-saved registers. The offsets/pairs must remain the exact mirror
+// of the stores in QPEL_UNI_W_HV_HEADER (register-preservation is checked
+// by checkasm).
+.macro QPEL_UNI_W_HV_END
+ mov sp, x19
+ ldp x20, x21, [sp, #16]
+ ldp x22, x23, [sp, #32]
+ ldp x24, x25, [sp, #48]
+ ldp x26, x27, [sp, #64]
+ ldp x19, x30, [sp], #80
+.endm
+
+// Weight + narrow + store one 4-pixel output row from the 32-bit vertical
+// filter sums in v26 (>> 6 first to drop intermediate headroom).
+.macro QPEL_UNI_W_HV_4
+ sshr v26.4s, v26.4s, #6
+ mul v24.4s, v26.4s, v28.4s // * wx
+ sqrshl v24.4s, v24.4s, v30.4s // rounding >> (denom + 6)
+ sqadd v24.4s, v24.4s, v29.4s // + ox
+ sqxtn v24.4h, v24.4s
+ sqxtun v24.8b, v24.8h
+ st1 {v24.s}[0], [x20], x21
+.endm
+
+// 8-tap vertical filter over the low .4h halves of eight int16 rows,
+// widening multiply-accumulate into \dst (32-bit). Taps come from v0.
+.macro QPEL_FILTER_H dst, src0, src1, src2, src3, src4, src5, src6, src7
+ smull \dst\().4s, \src0\().4h, v0.h[0]
+ smlal \dst\().4s, \src1\().4h, v0.h[1]
+ smlal \dst\().4s, \src2\().4h, v0.h[2]
+ smlal \dst\().4s, \src3\().4h, v0.h[3]
+ smlal \dst\().4s, \src4\().4h, v0.h[4]
+ smlal \dst\().4s, \src5\().4h, v0.h[5]
+ smlal \dst\().4s, \src6\().4h, v0.h[6]
+ smlal \dst\().4s, \src7\().4h, v0.h[7]
+.endm
+
+// Same as QPEL_FILTER_H but over the high .8h halves (smull2/smlal2).
+.macro QPEL_FILTER_H2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+ smull2 \dst\().4s, \src0\().8h, v0.h[0]
+ smlal2 \dst\().4s, \src1\().8h, v0.h[1]
+ smlal2 \dst\().4s, \src2\().8h, v0.h[2]
+ smlal2 \dst\().4s, \src3\().8h, v0.h[3]
+ smlal2 \dst\().4s, \src4\().8h, v0.h[4]
+ smlal2 \dst\().4s, \src5\().8h, v0.h[5]
+ smlal2 \dst\().4s, \src6\().8h, v0.h[6]
+ smlal2 \dst\().4s, \src7\().8h, v0.h[7]
+.endm
+
+// qpel_uni_w_hv, width 4. The horizontal pass (done by the HEADER) left
+// int16 rows in the scratch buffer at sp with stride x10; the vertical
+// pass keeps the last 8 rows in a software ring of d16-d23 and is unrolled
+// 8x so the oldest register is simply overwritten each step. sp itself
+// walks down the scratch buffer (restored from x19 in the epilogue).
+function ff_hevc_put_hevc_qpel_uni_w_hv4_8_neon_i8mm, export=1
+ QPEL_UNI_W_HV_HEADER 4
+ ldr d16, [sp]
+ ldr d17, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ldr d18, [sp]
+ ldr d19, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ldr d20, [sp]
+ ldr d21, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ldr d22, [sp]
+ add sp, sp, x10
+1:
+ ldr d23, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_UNI_W_HV_4
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldr d16, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_UNI_W_HV_4
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldr d17, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_UNI_W_HV_4
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldr d18, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_UNI_W_HV_4
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldr d19, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_UNI_W_HV_4
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldr d20, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_UNI_W_HV_4
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldr d21, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_UNI_W_HV_4
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldr d22, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_UNI_W_HV_4
+ subs w22, w22, #1
+ b.hi 1b
+
+2:
+ QPEL_UNI_W_HV_END
+ ret
+endfunc
+
+// Scale, weight, offset, saturate and store one 8-pixel output row:
+// >>6 drops the intermediate precision, mul by v28 applies the weight,
+// sqrshl by v30 applies the denominator shift (presumably a negative
+// per-lane shift amount set up by QPEL_UNI_W_HV_HEADER, i.e. a rounding
+// right shift -- confirm there), sqadd v29 adds the offset, then the
+// result is saturated to unsigned 8-bit and 8 bytes are stored at x20,
+// which advances by the stride in x21.
+.macro QPEL_UNI_W_HV_8
+ sshr v26.4s, v26.4s, #6
+ sshr v27.4s, v27.4s, #6
+ mul v24.4s, v26.4s, v28.4s
+ mul v25.4s, v27.4s, v28.4s
+ sqrshl v24.4s, v24.4s, v30.4s
+ sqrshl v25.4s, v25.4s, v30.4s
+ sqadd v24.4s, v24.4s, v29.4s
+ sqadd v25.4s, v25.4s, v29.4s
+ sqxtn v24.4h, v24.4s
+ sqxtn2 v24.8h, v25.4s
+ sqxtun v24.8b, v24.8h
+ st1 {v24.d}[0], [x20], x21
+.endm
+
+// 8-wide uni-weighted qpel HV, 8-bit. Same software-pipelined structure as
+// the 4-wide version, but each row is a q register (8 x 16-bit samples) and
+// both filter halves (QPEL_FILTER_H/H2) are computed per row.
+function ff_hevc_put_hevc_qpel_uni_w_hv8_8_neon_i8mm, export=1
+ QPEL_UNI_W_HV_HEADER 8
+ // Prime the sliding window v16..v22 with the first seven rows.
+ ldr q16, [sp]
+ ldr q17, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ldr q18, [sp]
+ ldr q19, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ldr q20, [sp]
+ ldr q21, [sp, x10]
+ add sp, sp, x10, lsl #1
+ ldr q22, [sp]
+ add sp, sp, x10
+ // 8x-unrolled loop rotating the v16..v23 window by one row per step.
+1:
+ ldr q23, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_FILTER_H2 v27, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_UNI_W_HV_8
+ subs w22, w22, #1              // one output row done; w22 rows remain
+ b.eq 2f
+
+ ldr q16, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_FILTER_H2 v27, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_UNI_W_HV_8
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldr q17, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_FILTER_H2 v27, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_UNI_W_HV_8
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldr q18, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_FILTER_H2 v27, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_UNI_W_HV_8
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldr q19, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_FILTER_H2 v27, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_UNI_W_HV_8
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldr q20, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_FILTER_H2 v27, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_UNI_W_HV_8
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldr q21, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_FILTER_H2 v27, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_UNI_W_HV_8
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldr q22, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v26, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_FILTER_H2 v27, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_UNI_W_HV_8
+ subs w22, w22, #1
+ b.hi 1b
+
+2:
+ QPEL_UNI_W_HV_END
+ ret
+endfunc
+
+// Scale, weight, offset, saturate and store one 16-pixel output row from
+// the four 32-bit accumulators v24..v27. Same pipeline as QPEL_UNI_W_HV_8
+// (>>6, weight v28, sqrshl v30, offset v29, narrow to u8) but over four
+// vectors, stored as 16 bytes at x20 with stride x21.
+.macro QPEL_UNI_W_HV_16
+ sshr v24.4s, v24.4s, #6
+ sshr v25.4s, v25.4s, #6
+ sshr v26.4s, v26.4s, #6
+ sshr v27.4s, v27.4s, #6
+ mul v24.4s, v24.4s, v28.4s
+ mul v25.4s, v25.4s, v28.4s
+ mul v26.4s, v26.4s, v28.4s
+ mul v27.4s, v27.4s, v28.4s
+ sqrshl v24.4s, v24.4s, v30.4s
+ sqrshl v25.4s, v25.4s, v30.4s
+ sqrshl v26.4s, v26.4s, v30.4s
+ sqrshl v27.4s, v27.4s, v30.4s
+ sqadd v24.4s, v24.4s, v29.4s
+ sqadd v25.4s, v25.4s, v29.4s
+ sqadd v26.4s, v26.4s, v29.4s
+ sqadd v27.4s, v27.4s, v29.4s
+ sqxtn v24.4h, v24.4s
+ sqxtn2 v24.8h, v25.4s
+ sqxtn v26.4h, v26.4s
+ sqxtn2 v26.8h, v27.4s
+ sqxtun v24.8b, v24.8h
+ sqxtun2 v24.16b, v26.8h
+
+ st1 {v24.16b}, [x20], x21
+.endm
+
+// 16-wide uni-weighted qpel HV, 8-bit. Each intermediate row is 16 x 16-bit
+// samples loaded as a q-register pair: v16..v23/v31 hold the low 8 samples
+// of the sliding window, v1..v7/v31 the high 8 (v31 doubles as the newest
+// high half). Per row, two filter halves are computed for each 8-sample
+// group, feeding the four accumulators of QPEL_UNI_W_HV_16.
+function ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_i8mm, export=1
+ QPEL_UNI_W_HV_HEADER 16
+ // Prime both sliding windows with the first seven rows.
+ ldp q16, q1, [sp]
+ add sp, sp, x10
+ ldp q17, q2, [sp]
+ add sp, sp, x10
+ ldp q18, q3, [sp]
+ add sp, sp, x10
+ ldp q19, q4, [sp]
+ add sp, sp, x10
+ ldp q20, q5, [sp]
+ add sp, sp, x10
+ ldp q21, q6, [sp]
+ add sp, sp, x10
+ ldp q22, q7, [sp]
+ add sp, sp, x10
+ // 8x-unrolled loop rotating both windows by one row per step.
+1:
+ ldp q23, q31, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31
+ QPEL_FILTER_H2 v27, v1, v2, v3, v4, v5, v6, v7, v31
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1              // one output row done; w22 rows remain
+ b.eq 2f
+
+ ldp q16, q1, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_FILTER_H v26, v2, v3, v4, v5, v6, v7, v31, v1
+ QPEL_FILTER_H2 v27, v2, v3, v4, v5, v6, v7, v31, v1
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q17, q2, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2
+ QPEL_FILTER_H2 v27, v3, v4, v5, v6, v7, v31, v1, v2
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q18, q3, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3
+ QPEL_FILTER_H2 v27, v4, v5, v6, v7, v31, v1, v2, v3
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q19, q4, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_FILTER_H2 v25, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4
+ QPEL_FILTER_H2 v27, v5, v6, v7, v31, v1, v2, v3, v4
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q20, q5, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5
+ QPEL_FILTER_H2 v27, v6, v7, v31, v1, v2, v3, v4, v5
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q21, q6, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6
+ QPEL_FILTER_H2 v27, v7, v31, v1, v2, v3, v4, v5, v6
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q22, q7, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7
+ QPEL_FILTER_H2 v27, v31, v1, v2, v3, v4, v5, v6, v7
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.hi 1b
+
+2:
+ QPEL_UNI_W_HV_END
+ ret
+endfunc
+
+
+// 32-wide uni-weighted qpel HV, 8-bit: processed as 16-pixel-wide column
+// strips. The inner (label 1) body is the same 8x-unrolled sliding window
+// as the 16-wide function; the outer loop (label 3) advances to the next
+// strip. x11/x13 remember the current strip's start in the intermediate
+// buffer and in dst, w12 the height, and w27 is presumed to hold the
+// remaining width (set up by QPEL_UNI_W_HV_HEADER -- confirm there).
+function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
+ QPEL_UNI_W_HV_HEADER 32
+ mov x11, sp
+ mov w12, w22
+ mov x13, x20
+3:
+ // Prime the two sliding windows with the first seven rows of this strip.
+ ldp q16, q1, [sp]
+ add sp, sp, x10
+ ldp q17, q2, [sp]
+ add sp, sp, x10
+ ldp q18, q3, [sp]
+ add sp, sp, x10
+ ldp q19, q4, [sp]
+ add sp, sp, x10
+ ldp q20, q5, [sp]
+ add sp, sp, x10
+ ldp q21, q6, [sp]
+ add sp, sp, x10
+ ldp q22, q7, [sp]
+ add sp, sp, x10
+1:
+ ldp q23, q31, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31
+ QPEL_FILTER_H2 v27, v1, v2, v3, v4, v5, v6, v7, v31
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q16, q1, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_FILTER_H v26, v2, v3, v4, v5, v6, v7, v31, v1
+ QPEL_FILTER_H2 v27, v2, v3, v4, v5, v6, v7, v31, v1
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q17, q2, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2
+ QPEL_FILTER_H2 v27, v3, v4, v5, v6, v7, v31, v1, v2
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q18, q3, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3
+ QPEL_FILTER_H2 v27, v4, v5, v6, v7, v31, v1, v2, v3
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q19, q4, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_FILTER_H2 v25, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4
+ QPEL_FILTER_H2 v27, v5, v6, v7, v31, v1, v2, v3, v4
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q20, q5, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5
+ QPEL_FILTER_H2 v27, v6, v7, v31, v1, v2, v3, v4, v5
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q21, q6, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6
+ QPEL_FILTER_H2 v27, v7, v31, v1, v2, v3, v4, v5, v6
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q22, q7, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7
+ QPEL_FILTER_H2 v27, v31, v1, v2, v3, v4, v5, v6, v7
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.hi 1b
+2:
+ subs w27, w27, #16             // strip done; any width remaining?
+ add sp, x11, #32               // intermediates: next strip (16 x 2 bytes)
+ add x20, x13, #16              // dst: next strip (16 x 1 byte)
+ mov w22, w12                   // reload height
+ mov x11, sp
+ mov x13, x20
+ b.hi 3b
+ QPEL_UNI_W_HV_END
+ ret
+endfunc
+
+// 64-wide uni-weighted qpel HV, 8-bit. NOTE(review): apart from the HEADER
+// width argument, this body is byte-identical to the 32-wide function above
+// (same 16-pixel strip loop driven by w27) -- the two could share a macro.
+function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
+ QPEL_UNI_W_HV_HEADER 64
+ mov x11, sp                    // strip start in intermediate buffer
+ mov w12, w22                   // save height for later strips
+ mov x13, x20                   // strip start in dst
+3:
+ // Prime the two sliding windows with the first seven rows of this strip.
+ ldp q16, q1, [sp]
+ add sp, sp, x10
+ ldp q17, q2, [sp]
+ add sp, sp, x10
+ ldp q18, q3, [sp]
+ add sp, sp, x10
+ ldp q19, q4, [sp]
+ add sp, sp, x10
+ ldp q20, q5, [sp]
+ add sp, sp, x10
+ ldp q21, q6, [sp]
+ add sp, sp, x10
+ ldp q22, q7, [sp]
+ add sp, sp, x10
+1:
+ ldp q23, q31, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_FILTER_H2 v25, v16, v17, v18, v19, v20, v21, v22, v23
+ QPEL_FILTER_H v26, v1, v2, v3, v4, v5, v6, v7, v31
+ QPEL_FILTER_H2 v27, v1, v2, v3, v4, v5, v6, v7, v31
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q16, q1, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_FILTER_H2 v25, v17, v18, v19, v20, v21, v22, v23, v16
+ QPEL_FILTER_H v26, v2, v3, v4, v5, v6, v7, v31, v1
+ QPEL_FILTER_H2 v27, v2, v3, v4, v5, v6, v7, v31, v1
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q17, q2, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_FILTER_H2 v25, v18, v19, v20, v21, v22, v23, v16, v17
+ QPEL_FILTER_H v26, v3, v4, v5, v6, v7, v31, v1, v2
+ QPEL_FILTER_H2 v27, v3, v4, v5, v6, v7, v31, v1, v2
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q18, q3, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_FILTER_H2 v25, v19, v20, v21, v22, v23, v16, v17, v18
+ QPEL_FILTER_H v26, v4, v5, v6, v7, v31, v1, v2, v3
+ QPEL_FILTER_H2 v27, v4, v5, v6, v7, v31, v1, v2, v3
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q19, q4, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_FILTER_H2 v25, v20, v21, v22, v23, v16, v17, v18, v19
+ QPEL_FILTER_H v26, v5, v6, v7, v31, v1, v2, v3, v4
+ QPEL_FILTER_H2 v27, v5, v6, v7, v31, v1, v2, v3, v4
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q20, q5, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_FILTER_H2 v25, v21, v22, v23, v16, v17, v18, v19, v20
+ QPEL_FILTER_H v26, v6, v7, v31, v1, v2, v3, v4, v5
+ QPEL_FILTER_H2 v27, v6, v7, v31, v1, v2, v3, v4, v5
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q21, q6, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_FILTER_H2 v25, v22, v23, v16, v17, v18, v19, v20, v21
+ QPEL_FILTER_H v26, v7, v31, v1, v2, v3, v4, v5, v6
+ QPEL_FILTER_H2 v27, v7, v31, v1, v2, v3, v4, v5, v6
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.eq 2f
+
+ ldp q22, q7, [sp]
+ add sp, sp, x10
+ QPEL_FILTER_H v24, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_FILTER_H2 v25, v23, v16, v17, v18, v19, v20, v21, v22
+ QPEL_FILTER_H v26, v31, v1, v2, v3, v4, v5, v6, v7
+ QPEL_FILTER_H2 v27, v31, v1, v2, v3, v4, v5, v6, v7
+ QPEL_UNI_W_HV_16
+ subs w22, w22, #1
+ b.hi 1b
+2:
+ subs w27, w27, #16             // strip done; any width remaining?
+ add sp, x11, #32               // intermediates: next strip (16 x 2 bytes)
+ add x20, x13, #16              // dst: next strip (16 x 1 byte)
+ mov w22, w12                   // reload height
+ mov x11, sp
+ mov x13, x20
+ b.hi 3b
+ QPEL_UNI_W_HV_END
+ ret
+endfunc
+
#endif // HAVE_I8MM
+
--
2.38.0.windows.1
More information about the ffmpeg-devel
mailing list