[FFmpeg-cvslog] avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 10bpc inverse transforms
Henrik Gramner
git at videolan.org
Mon May 26 16:38:14 EEST 2025
ffmpeg | branch: master | Henrik Gramner <gramner at twoorioles.com> | Wed May 21 14:49:17 2025 +0200| [eda0ac7e5f09e179bcdc946ddef543e8c44a4792] | committer: Henrik Gramner
avcodec/x86/vp9: Add AVX-512ICL for 16x16 and 32x32 10bpc inverse transforms
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=eda0ac7e5f09e179bcdc946ddef543e8c44a4792
---
libavcodec/vp9dec.h | 4 +-
libavcodec/x86/Makefile | 1 +
libavcodec/x86/vp9dsp_init_16bpp_template.c | 8 +
libavcodec/x86/vp9itxfm_16bpp_avx512.asm | 1165 +++++++++++++++++++++++++++
4 files changed, 1176 insertions(+), 2 deletions(-)
diff --git a/libavcodec/vp9dec.h b/libavcodec/vp9dec.h
index 851ee9f6dd..e41f47a82a 100644
--- a/libavcodec/vp9dec.h
+++ b/libavcodec/vp9dec.h
@@ -220,8 +220,8 @@ struct VP9TileData {
DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
// block reconstruction intermediates
- DECLARE_ALIGNED(32, uint8_t, tmp_y)[64 * 64 * 2];
- DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][64 * 64 * 2];
+ DECLARE_ALIGNED(64, uint8_t, tmp_y)[64 * 64 * 2];
+ DECLARE_ALIGNED(64, uint8_t, tmp_uv)[2][64 * 64 * 2];
struct { int x, y; } min_mv, max_mv;
int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index bf752f5da2..89ee8dc726 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -186,6 +186,7 @@ X86ASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
x86/vp9itxfm.o \
x86/vp9itxfm_avx512.o \
x86/vp9itxfm_16bpp.o \
+ x86/vp9itxfm_16bpp_avx512.o \
x86/vp9lpf.o \
x86/vp9lpf_16bpp.o \
x86/vp9mc.o \
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
index f93ea2468e..db775f7c1a 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp_template.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -127,6 +127,8 @@ decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
#if BPC == 10
decl_itxfm_func(idct, idct, 4, BPC, mmxext);
decl_itxfm_funcs(4, BPC, ssse3);
+decl_itxfm_funcs(16, BPC, avx512icl);
+decl_itxfm_func(idct, idct, 32, BPC, avx512icl);
#else
decl_itxfm_func(idct, idct, 4, BPC, sse2);
#endif
@@ -233,6 +235,12 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
#endif
}
+#if ARCH_X86_64 && BPC == 10
+ if (EXTERNAL_AVX512ICL(cpu_flags)) {
+ init_itx_funcs(TX_16X16, 16, BPC, avx512icl);
+ init_itx_func_one(TX_32X32, idct, idct, 32, BPC, avx512icl);
+ }
+#endif
#endif /* HAVE_X86ASM */
ff_vp9dsp_init_16bpp_x86(dsp);
diff --git a/libavcodec/x86/vp9itxfm_16bpp_avx512.asm b/libavcodec/x86/vp9itxfm_16bpp_avx512.asm
new file mode 100644
index 0000000000..11d1e453a7
--- /dev/null
+++ b/libavcodec/x86/vp9itxfm_16bpp_avx512.asm
@@ -0,0 +1,1165 @@
+;******************************************************************************
+;* VP9 IDCT SIMD optimizations
+;*
+;* Copyright (C) 2025 Two Orioles, LLC
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+%if ARCH_X86_64 && HAVE_AVX512ICL_EXTERNAL
+
+SECTION_RODATA 64
+
+; The following set of constants is ordered to form the
+; qword shuffle mask { 0, 2, 4, 6, 1, 3, 5, 7 }
+%define deintq_perm pd_5520
+pd_5520: dd 5520
+pd_9760: dd 9760
+pd_10394: dd 10394
+pd_15426: dd 15426
+pd_804: dd 804
+pd_2404: dd 2404
+pd_6270: dd 6270
+pd_9102: dd 9102
+pd_11585: dd 11585
+pd_12665: dd 12665
+pd_7723: dd 7723
+pd_14811: dd 14811
+pd_7005: dd 7005
+pd_14053: dd 14053
+pd_8423: dd 8423
+pd_13623: dd 13623
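A note on the trick described above: vpermq with a 512-bit vector index only reads the low 3 bits of each qword of the index operand, so the sixteen dwords above double as the qword deinterleave permutation (deintq_perm) used later by the write routines. A standalone C check of that property (not part of the build, purely illustrative):

    #include <stdio.h>

    int main(void)
    {
        /* low dword of each qword formed by the constant block above */
        static const unsigned lo_dword[8] = {
            5520, 10394, 804, 6270, 11585, 7723, 7005, 8423
        };
        for (int i = 0; i < 8; i++)
            printf("%u ", lo_dword[i] & 7); /* prints: 0 2 4 6 1 3 5 7 */
        printf("\n");
        return 0;
    }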
+
+pixel_clip: times 2 dw 0x7c00
+pixel_clip6: dd 2031648 ; 32 + (pixel_clip << 6)
+pd_532480: dd 532480 ; 8192 + (32 << 14)
+pd_8192: dd 8192
+
+pd_1606: dd 1606
+pd_3196: dd 3196
+pd_3981: dd 3981
+pd_4756: dd 4756
+pd_11003: dd 11003
+pd_12140: dd 12140
+pd_13160: dd 13160
+pd_14449: dd 14449
+pd_15137: dd 15137
+pd_15679: dd 15679
+pd_15893: dd 15893
+pd_16069: dd 16069
+pd_16207: dd 16207
+pd_16305: dd 16305
+pd_16364: dd 16364
+
+SECTION .text
+
+%define o_base (deintq_perm+128)
+%define o(x) (r5 - o_base + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 14
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 14
+; skip round/shift if rnd is not a number
+%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], inv_dst2
+%if %8 < 32
+ pmulld m%4, m%1, m%8
+ pmulld m%3, m%2, m%8
+%else
+ vpbroadcastd m%3, [o(pd_%8)]
+ pmulld m%4, m%1, m%3
+ pmulld m%3, m%2
+%endif
+%if %7 < 32
+ pmulld m%1, m%7
+ pmulld m%2, m%7
+%else
+ vpbroadcastd m%5, [o(pd_%7)]
+ pmulld m%1, m%5
+ pmulld m%2, m%5
+%endif
+%if %9
+ psubd m%4, m%6, m%4
+ psubd m%2, m%4, m%2
+%else
+%ifnum %6
+ paddd m%4, m%6
+%endif
+ paddd m%2, m%4
+%endif
+%ifnum %6
+ paddd m%1, m%6
+%endif
+ psubd m%1, m%3
+%ifnum %6
+ psrad m%2, 14
+ psrad m%1, 14
+%endif
+%endmacro
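For reference, a scalar C sketch of what this macro computes when rnd is numeric (the helper name and the 64-bit intermediate are illustrative; the asm works on 32-bit lanes with pmulld, and VP9's trig constants are 14-bit fixed point, hence the +8192 and >>14):

    #include <stdint.h>

    /* in1/in2 are updated in place, like the dst/src register pair. */
    static void itx_mulsub_2d(int32_t *in1, int32_t *in2, int coef1, int coef2)
    {
        int64_t s1 = *in1, s2 = *in2;
        *in1 = (int32_t)((s1 * coef1 - s2 * coef2 + 8192) >> 14);
        *in2 = (int32_t)((s1 * coef2 + s2 * coef1 + 8192) >> 14);
    }

The optional inv_dst2 argument negates the second result, and passing _ for rnd skips the rounding and shift so the caller can fold them into a later stage.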
+
+%macro WRAP_YMM 1+
+ INIT_YMM cpuname
+ %1
+ INIT_ZMM cpuname
+%endmacro
+
+%macro TRANSPOSE_4D 5 ; in[1-4], tmp
+ punpckhdq m%5, m%3, m%4 ; c2 d2 c3 d3
+ punpckldq m%3, m%4 ; c0 d0 c1 d1
+ punpckhdq m%4, m%1, m%2 ; a2 b2 a3 b3
+ punpckldq m%1, m%2 ; a0 b0 a1 b1
+ punpckhqdq m%2, m%1, m%3 ; a1 b1 c1 d1
+ punpcklqdq m%1, m%3 ; a0 b0 c0 d0
+ punpcklqdq m%3, m%4, m%5 ; a2 b2 c2 d2
+ punpckhqdq m%4, m%5 ; a3 b3 c3 d3
+%endmacro
+
+%macro TRANSPOSE_4DQ 5 ; in[1-4], tmp
+ vshufi32x4 m%5, m%3, m%4, q3232 ; c2 c3 d2 d3
+ vinserti32x8 m%3, ym%4, 1 ; c0 c1 d0 d1
+ vshufi32x4 m%4, m%1, m%2, q3232 ; a2 a3 b2 b3
+ vinserti32x8 m%1, ym%2, 1 ; a0 a1 b0 b1
+ vshufi32x4 m%2, m%1, m%3, q3131 ; a1 b1 c1 d1
+ vshufi32x4 m%1, m%3, q2020 ; a0 b0 c0 d0
+ vshufi32x4 m%3, m%4, m%5, q2020 ; a2 b2 c2 d2
+ vshufi32x4 m%4, m%5, q3131 ; a3 b3 c3 d3
+%endmacro
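Both macros implement the same 4x4 transpose pattern at different granularities: TRANSPOSE_4D shuffles 32-bit elements within each 128-bit lane, while TRANSPOSE_4DQ shuffles whole 128-bit lanes across four registers. Applied together over a block of sixteen zmm registers (as in .pass1_end below) the net effect appears to be an ordinary 16x16 transpose of 32-bit coefficients; a scalar reference of that net effect, for orientation only:

    #include <stdint.h>

    static void transpose_16x16(int32_t m[16][16])
    {
        for (int i = 0; i < 16; i++)
            for (int j = 0; j < i; j++) {
                int32_t t = m[i][j];
                m[i][j]   = m[j][i];
                m[j][i]   = t;
            }
    }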
+
+%macro INV_TXFM_FN 3-4 0 ; type1, type2, size, eob_offset
+cglobal vp9_i%1_i%2_%3_add_10, 4, 5, 0, dst, stride, c, eob, tx2
+ %define %%p1 m(vp9_i%1_%3_internal_10)
+ lea r5, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(vp9_i%2_%3_internal_10).pass2]
+%ifidn %1_%2, dct_dct
+ dec eobd
+ jnz %%p1
+%else
+%if %4
+ add eobd, %4
+%endif
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
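A rough C model of the dispatch this wrapper sets up: tx2q acts as a continuation register holding the entry point of the second (column) transform, the first (row) transform ends with "jmp tx2q", and the dct_dct variants short-circuit eob == 1 into a DC-only path (the other variants just bias eob and enter the first transform). All names below are illustrative, not part of the patch:

    #include <stdio.h>

    typedef void (*pass_fn)(void);
    static pass_fn tx2;                     /* stands in for the tx2q register */

    static void idct16_pass2(void) { puts("pass 2 (columns)"); }
    static void idct16_pass1(void) { puts("pass 1 (rows)"); tx2(); /* jmp tx2q */ }
    static void dc_only(void)      { puts("DC-only shortcut"); }

    static void inv_txfm_add(int eob, int is_dct_dct) /* variant is compile-time in the asm */
    {
        tx2 = idct16_pass2;                 /* lea tx2q, [...pass2]  */
        if (is_dct_dct && eob == 1) {       /* dec eobd; jnz %%p1    */
            dc_only();
            return;
        }
        idct16_pass1();
    }

    int main(void)
    {
        inv_txfm_add(1, 1);                 /* DC-only case          */
        inv_txfm_add(64, 1);                /* full two-pass case    */
        return 0;
    }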
+
+%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, 16x16, %3
+%ifidn %1_%2, dct_dct
+ imul r6d, [cq], 11585
+ vpbroadcastd ym3, [o(pixel_clip)]
+ mov [cq], r3d
+ add r6d, 8192
+ sar r6d, 14
+ imul r6d, 11585
+ or r3d, 8
+ add r6d, 532480
+ sar r6d, 20
+ vpbroadcastw ym2, r6d
+ paddsw ym2, ym3
+.dconly_loop:
+ paddsw ym0, ym2, [dstq+strideq*0]
+ paddsw ym1, ym2, [dstq+strideq*1]
+ psubusw ym0, ym3
+ psubusw ym1, ym3
+ mova [dstq+strideq*0], ym0
+ mova [dstq+strideq*1], ym1
+ lea dstq, [dstq+strideq*2]
+ dec r3d
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
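The DC-only path above boils down to the following scalar arithmetic (a sketch assuming 10-bit output, so pixels clip to [0, 1023]; function and variable names are illustrative, and stride is in pixels here rather than bytes). The second multiply folds in the final >>6 output shift, which is where the 532480 = 8192 + (32 << 14) rounding constant comes from; the asm does the clip branchlessly with paddsw/psubusw against the 0x7c00 pixel_clip constant, clears the DC coefficient in cq, and reuses the zeroed eob register as the row-pair loop counter:

    #include <stddef.h>
    #include <stdint.h>

    static void idct16x16_dc_only_add(uint16_t *dst, ptrdiff_t stride, int coef)
    {
        int t  = (coef * 11585 + 8192) >> 14;   /* 1st pass, DC term only  */
        int dc = (t * 11585 + 532480) >> 20;    /* 2nd pass + >>6 rounding */
        for (int y = 0; y < 16; y++, dst += stride)
            for (int x = 0; x < 16; x++) {
                int v = dst[x] + dc;
                dst[x] = v < 0 ? 0 : v > 1023 ? 1023 : v;
            }
    }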
+
+%macro IDCT16_PART1 0
+%if mmsize == 64
+.main_part1_fast:
+%endif
+ pmulld m15, m1, [o(pd_16305)] {bcstd} ; t15a
+ pmulld m1, [o(pd_1606)] {bcstd} ; t8a
+ pmulld m9, m7, [o(pd_10394)] {bcstd} ; t9a
+ pmulld m7, [o(pd_12665)] {bcstd} ; t14a
+ pmulld m11, m5, [o(pd_14449)] {bcstd} ; t13a
+ pmulld m5, [o(pd_7723)] {bcstd} ; t10a
+ pmulld m13, m3, [o(pd_4756)] {bcstd} ; t11a
+ pmulld m3, [o(pd_15679)] {bcstd} ; t12a
+ pmulld m10, m6, [o(pd_9102)] {bcstd} ; t5a
+ pmulld m6, [o(pd_13623)] {bcstd} ; t6a
+ pmulld m14, m2, [o(pd_16069)] {bcstd} ; t7a
+ pmulld m2, [o(pd_3196)] {bcstd} ; t4a
+ pmulld m12, m4, [o(pd_15137)] {bcstd} ; t3
+ pmulld m4, [o(pd_6270)] {bcstd} ; t2
+ pmulld m0, m21
+ REPX {psubd x, m20, x}, m9, m13, m10
+ paddd m0, m20
+ mova m18, m0
+%if mmsize == 64 ; for the ymm variant we only ever use the fast path
+ jmp %%main_part1b
+.main_part1:
+ ITX_MULSUB_2D 1, 15, 16, 17, 18, _, 1606, 16305 ; t8a, t15a
+ ITX_MULSUB_2D 9, 7, 16, 17, 18, _, 12665, 10394 ; t9a, t14a
+ ITX_MULSUB_2D 5, 11, 16, 17, 18, _, 7723, 14449 ; t10a, t13a
+ ITX_MULSUB_2D 13, 3, 16, 17, 18, _, 15679, 4756 ; t11a, t12a
+ ITX_MULSUB_2D 10, 6, 16, 17, 18, _, 13623, 9102 ; t5a, t6a
+ ITX_MULSUB_2D 2, 14, 16, 17, 18, _, 3196, 16069 ; t4a, t7a
+ ITX_MULSUB_2D 4, 12, 16, 17, 18, _, 6270, 15137 ; t2, t3
+ pmulld m0, m21
+ pmulld m8, m21
+ REPX {paddd x, m20}, m0, m9, m13, m10
+ psubd m18, m0, m8 ; t1
+ paddd m0, m8 ; t0
+%%main_part1b:
+%endif
+ vpbroadcastd m19, [o(pd_15137)]
+ vpbroadcastd m16, [o(pd_6270)]
+ REPX {paddd x, m20}, m15, m7, m1, m11, m3, m5
+ REPX {psrad x, 14 }, m15, m7, m1, m9, m11, m3, m5, m13
+ paddd m17, m15, m7 ; t15
+ psubd m15, m7 ; t14
+ psubd m7, m3, m11 ; t13
+ paddd m3, m11 ; t12
+ psubd m11, m13, m5 ; t10
+ paddd m5, m13 ; t11
+ psubd m13, m1, m9 ; t9
+ paddd m1, m9 ; t8
+ ITX_MULSUB_2D 15, 13, 8, 9, _, 20, 16, 19 ; t9a, t14a
+ ITX_MULSUB_2D 7, 11, 8, 9, _, 20, 16, 19, 2 ; t13a, t10a
+ paddd m16, m1, m5 ; t8a
+ psubd m1, m5 ; t11a
+ paddd m8, m15, m11 ; t9
+ psubd m15, m11 ; t10
+ psubd m11, m17, m3 ; t12a
+ paddd m17, m3 ; t15a
+ psubd m9, m13, m7 ; t13
+ paddd m13, m7 ; t14
+ REPX {pmulld x, m21}, m11, m9, m1, m15
+ REPX {paddd x, m20}, m2, m6, m14
+ REPX {psrad x, 14 }, m10, m2, m6, m14
+ psubd m3, m2, m10 ; t5a
+ paddd m10, m2 ; t4
+ paddd m11, m20
+ psubd m5, m11, m1 ; t11
+ paddd m11, m1 ; t12
+ psubd m1, m14, m6 ; t6a
+ paddd m14, m6 ; t7
+ pmulld m1, m21
+ pmulld m3, m21
+ paddd m4, m20
+ paddd m12, m20
+ REPX {psrad x, 14 }, m4, m12, m0, m18
+ paddd m9, m20
+ paddd m2, m9, m15 ; t13a
+ psubd m9, m15 ; t10a
+ paddd m1, m20
+ psubd m6, m1, m3 ; t5
+ paddd m1, m3 ; t6
+ REPX {psrad x, 14}, m6, m1, m11, m5, m2, m9
+%endmacro
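The .main_part1_fast entry relies on the upper half of the inputs being zero: with one operand of each rotation known to be zero, ITX_MULSUB_2D degenerates into two plain multiplies, which is exactly the pmulld block at the top of the macro (the psubd-from-rnd lines handle the outputs whose surviving product enters with a negative sign). A scalar sketch of the degenerate case, with illustrative names:

    #include <stdint.h>

    /* Rotation with the second input known to be zero. */
    static void itx_mul_fast(int32_t in1, int coef1, int coef2,
                             int32_t *out1, int32_t *out2)
    {
        *out1 = (int32_t)(((int64_t)in1 * coef1 + 8192) >> 14);
        *out2 = (int32_t)(((int64_t)in1 * coef2 + 8192) >> 14);
    }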
+
+%macro IDCT16_PART2 0
+ psubd m3, m0, m12 ; t3
+ paddd m0, m12 ; t0
+ psubd m12, m18, m4 ; t2
+ paddd m18, m4 ; t1
+ psubd m4, m3, m10 ; t4
+ paddd m3, m10 ; t3
+ psubd m10, m12, m6 ; t5
+ paddd m12, m6 ; t2
+ psubd m6, m18, m1 ; t6
+ paddd m1, m18 ; t1
+ psubd m7, m0, m14 ; t7
+ paddd m0, m14 ; t0
+ psubd m15, m0, m17 ; out15
+ paddd m0, m17 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m12, m2 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m5 ; out11
+ paddd m4, m5 ; out4
+ paddd m5, m10, m9 ; out5
+ psubd m10, m9 ; out10
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, m16 ; out8
+ paddd m7, m16 ; out7
+%endmacro
+
+INIT_ZMM avx512icl
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, adst, 39-23-1
+
+cglobal vp9_idct_16x16_internal_10, 0, 7, 22, dst, stride, c, eob, tx2
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 1]
+ mova m2, [cq+64* 2]
+ mova m3, [cq+64* 3]
+ mova m4, [cq+64* 4]
+ mova m5, [cq+64* 5]
+ mova m6, [cq+64* 6]
+ mova m7, [cq+64* 7]
+ vpbroadcastd m20, [o(pd_8192)]
+ vpbroadcastd m21, [o(pd_11585)]
+ sub eobd, 38
+ jl .pass1_fast
+ mova m8, [cq+64* 8]
+ mova m9, [cq+64* 9]
+ mova m10, [cq+64*10]
+ mova m11, [cq+64*11]
+ mova m12, [cq+64*12]
+ mova m13, [cq+64*13]
+ mova m14, [cq+64*14]
+ mova m15, [cq+64*15]
+ call .main_part1
+ call .main_part2
+.pass1_end:
+ TRANSPOSE_4DQ 0, 4, 8, 12, 16
+ TRANSPOSE_4DQ 1, 5, 9, 13, 16
+ TRANSPOSE_4DQ 2, 6, 10, 14, 16
+ TRANSPOSE_4DQ 3, 7, 11, 15, 16
+ TRANSPOSE_4D 8, 9, 10, 11, 16
+ TRANSPOSE_4D 12, 13, 14, 15, 16
+ mov r6d, 64*12
+ jmp .pass1_transpose_end
+.pass1_fast:
+ WRAP_YMM IDCT16_PART1
+ WRAP_YMM IDCT16_PART2
+.pass1_fast_end:
+ vinserti32x8 m0, ym4, 1
+ vinserti32x8 m8, ym12, 1
+ vinserti32x8 m1, ym5, 1
+ vinserti32x8 m9, ym13, 1
+ vinserti32x8 m2, ym6, 1
+ vinserti32x8 m10, ym14, 1
+ vinserti32x8 m3, ym7, 1
+ vinserti32x8 m11, ym15, 1
+ vshufi32x4 m4, m0, m8, q3131
+ vshufi32x4 m0, m8, q2020
+ vshufi32x4 m5, m1, m9, q3131
+ vshufi32x4 m1, m9, q2020
+ vshufi32x4 m6, m2, m10, q3131
+ vshufi32x4 m2, m10, q2020
+ vshufi32x4 m7, m3, m11, q3131
+ vshufi32x4 m3, m11, q2020
+ mov r6d, 64*4
+.pass1_transpose_end:
+ pxor m16, m16
+.zero_loop:
+ mova [cq+r6+64*0], m16
+ mova [cq+r6+64*1], m16
+ mova [cq+r6+64*2], m16
+ mova [cq+r6+64*3], m16
+ sub r6d, 64*4
+ jge .zero_loop
+ TRANSPOSE_4D 0, 1, 2, 3, 16
+ TRANSPOSE_4D 4, 5, 6, 7, 16
+ jmp tx2q
+.pass2:
+ test eobd, eobd
+ jl .pass2_fast
+ call .main_part1
+ jmp .pass2_end
+.pass2_fast:
+ call .main_part1_fast
+.pass2_end:
+ vpbroadcastd m3, [o(pixel_clip6)]
+ paddd m0, m3
+ paddd m18, m3
+ call .main_part2
+ REPX {psrad x, 6}, m0, m1, m2, m3
+ packssdw m0, m1
+ lea r6, [strideq*3]
+ packssdw m1, m2, m3
+ mova m2, [o(deintq_perm)]
+ vpbroadcastd m3, [o(pixel_clip)]
+ REPX {psrad x, 6}, m4, m5, m6, m7
+ call .write_16x4
+ packssdw m0, m4, m5
+ packssdw m1, m6, m7
+ REPX {psrad x, 6}, m8, m9, m10, m11
+ call .write_16x4
+ packssdw m0, m8, m9
+ packssdw m1, m10, m11
+.pass2_end2:
+ REPX {psrad x, 6}, m12, m13, m14, m15
+ call .write_16x4
+ packssdw m0, m12, m13
+ packssdw m1, m14, m15
+ call .write_16x4
+ RET
+ALIGN function_align
+.write_16x4:
+ mova ym16, [dstq+strideq*0]
+ vinserti32x8 m16, [dstq+strideq*1], 1
+ mova ym17, [dstq+strideq*2]
+ vinserti32x8 m17, [dstq+r6 ], 1
+ vpermq m0, m2, m0
+ vpermq m1, m2, m1
+ paddsw m16, m0
+ paddsw m17, m1
+ psubusw m16, m3
+ psubusw m17, m3
+ mova [dstq+strideq*0], ym16
+ vextracti32x8 [dstq+strideq*1], m16, 1
+ mova [dstq+strideq*2], ym17
+ vextracti32x8 [dstq+r6 ], m17, 1
+ lea dstq, [dstq+strideq*4]
+ ret
+ALIGN function_align
+ IDCT16_PART1
+ ret
+ALIGN function_align
+.main_part2:
+ IDCT16_PART2
+ ret
+
+%macro IADST16_PART1 0
+%if mmsize == 64
+.main_part1_fast:
+%endif
+ pmulld m15, m0, [o(pd_16364)] {bcstd} ; t1
+ pmulld m0, [o(pd_804)] {bcstd} ; t0
+ pmulld m13, m2, [o(pd_15893)] {bcstd} ; t3
+ pmulld m2, [o(pd_3981)] {bcstd} ; t2
+ pmulld m11, m4, [o(pd_14811)] {bcstd} ; t5
+ pmulld m4, [o(pd_7005)] {bcstd} ; t4
+ pmulld m9, m6, [o(pd_13160)] {bcstd} ; t7
+ pmulld m6, [o(pd_9760)] {bcstd} ; t6
+ pmulld m8, m7, [o(pd_11003)] {bcstd} ; t8
+ pmulld m7, [o(pd_12140)] {bcstd} ; t9
+ pmulld m10, m5, [o(pd_8423)] {bcstd} ; t10
+ pmulld m5, [o(pd_14053)] {bcstd} ; t11
+ pmulld m12, m3, [o(pd_5520)] {bcstd} ; t12
+ pmulld m3, [o(pd_15426)] {bcstd} ; t13
+ pmulld m14, m1, [o(pd_2404)] {bcstd} ; t14
+ pmulld m1, [o(pd_16207)] {bcstd} ; t15
+ REPX {psubd x, m20, x}, m15, m13, m11, m9
+%if mmsize == 64 ; for the ymm variant we only ever use the fast path
+ jmp %%main_part1b
+ALIGN function_align
+.main_part1:
+ ITX_MULSUB_2D 15, 0, 16, 17, 18, _, 804, 16364 ; t1, t0
+ ITX_MULSUB_2D 13, 2, 16, 17, 18, _, 3981, 15893 ; t3, t2
+ ITX_MULSUB_2D 11, 4, 16, 17, 18, _, 7005, 14811 ; t5, t4
+ ITX_MULSUB_2D 9, 6, 16, 17, 18, _, 9760, 13160 ; t7, t6
+ ITX_MULSUB_2D 7, 8, 16, 17, 18, _, 12140, 11003 ; t9, t8
+ ITX_MULSUB_2D 5, 10, 16, 17, 18, _, 14053, 8423 ; t11, t10
+ ITX_MULSUB_2D 3, 12, 16, 17, 18, _, 15426, 5520 ; t13, t12
+ ITX_MULSUB_2D 1, 14, 16, 17, 18, _, 16207, 2404 ; t15, t14
+ REPX {paddd x, m20}, m15, m13, m11, m9
+%%main_part1b:
+%endif
+ REPX {paddd x, m20}, m0, m2, m4, m6
+ psubd m16, m2, m10 ; t10a
+ paddd m2, m10 ; t2a
+ psubd m10, m9, m1 ; t15a
+ paddd m9, m1 ; t7a
+ psubd m1, m13, m5 ; t11a
+ paddd m13, m5 ; t3a
+ psubd m5, m6, m14 ; t14a
+ paddd m6, m14 ; t6a
+ REPX {psrad x, 14}, m16, m10, m1, m5
+ psubd m14, m0, m8 ; t8a
+ paddd m0, m8 ; t0a
+ psubd m8, m15, m7 ; t9a
+ paddd m15, m7 ; t1a
+ psubd m7, m4, m12 ; t12a
+ paddd m4, m12 ; t4a
+ paddd m12, m11, m3 ; t5a
+ psubd m11, m3 ; t13a
+ REPX {psrad x, 14}, m14, m8, m7, m11
+ vpbroadcastd m19, [o(pd_9102)]
+ vpbroadcastd m18, [o(pd_13623)]
+ ITX_MULSUB_2D 16, 1, 3, 17, _, _, 18, 19 ; t11, t10
+ ITX_MULSUB_2D 10, 5, 3, 17, _, _, 19, 18 ; t14, t15
+ vpbroadcastd m19, [o(pd_16069)]
+ vpbroadcastd m18, [o(pd_3196)]
+ ITX_MULSUB_2D 14, 8, 3, 17, _, _, 18, 19 ; t9, t8
+ ITX_MULSUB_2D 11, 7, 3, 17, _, _, 19, 18 ; t12, t13
+ vpbroadcastd m19, [o(pd_6270)]
+ vpbroadcastd m18, [o(pd_15137)]
+ REPX {psrad x, 14}, m15, m12, m0, m4
+ psubd m3, m15, m12 ; t5
+ paddd m15, m12 ; t1
+ psubd m12, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ REPX {psrad x, 14}, m2, m6, m13, m9
+ psubd m4, m2, m6 ; t6
+ paddd m2, m6 ; t2
+ psubd m6, m13, m9 ; t7
+ paddd m9, m13 ; t3
+ REPX {paddd x, m20}, m8, m14, m1, m16
+ psubd m13, m8, m11 ; t12a
+ paddd m8, m11 ; t8a
+ psubd m11, m14, m7 ; t13a
+ paddd m14, m7 ; t9a
+ psubd m7, m1, m10 ; t14a
+ paddd m1, m10 ; t10a
+ psubd m10, m16, m5 ; t15a
+ paddd m16, m5 ; t11a
+ REPX {psrad x, 14}, m13, m11, m7, m10
+ ITX_MULSUB_2D 12, 3, 5, 17, _, _, 19, 18 ; t5a, t4a
+ ITX_MULSUB_2D 6, 4, 5, 17, _, _, 18, 19 ; t6a, t7a
+ ITX_MULSUB_2D 13, 11, 5, 17, _, _, 19, 18 ; t13, t12
+ ITX_MULSUB_2D 10, 7, 5, 17, _, _, 18, 19 ; t14, t15
+ REPX {psrad x, 14}, m8, m1, m14, m16
+ psubd m5, m8, m1 ; t10
+ paddd m1, m8 ; -out1
+ psubd m8, m15, m9 ; t3a
+ paddd m15, m9 ; -out15
+ psubd m9, m14, m16 ; t11
+ paddd m14, m16 ; out14
+ psubd m16, m0, m2 ; t2a
+ paddd m0, m2 ; out0
+ REPX {paddd x, m20}, m11, m13, m12, m3
+ paddd m2, m11, m10 ; out2
+ psubd m11, m10 ; t14a
+ psubd m10, m13, m7 ; t15a
+ paddd m13, m7 ; -out13
+ psubd m7, m12, m4 ; t7
+ paddd m12, m4 ; out12
+ psubd m4, m3, m6 ; t6
+ paddd m3, m6 ; -out3
+ REPX {psrad x, 14}, m10, m7, m11, m4
+ REPX {pmulld x, m21}, m9, m10, m7, m8, m5, m11, m4, m16
+ REPX {psrad x, 14}, m2, m13, m12, m3
+%endmacro
+
+%macro IADST16_PART2 0
+ paddd m9, m20
+ psubd m10, m20, m10
+ paddd m7, m20
+ psubd m8, m20, m8
+ paddd m6, m9, m5 ; out6
+ psubd m9, m5 ; out9
+ psubd m5, m10, m11 ; out5
+ paddd m10, m11 ; out10
+ psubd m11, m7, m4 ; out11
+ paddd m4, m7 ; out4
+ psubd m7, m8, m16 ; out7
+ paddd m8, m16 ; out8
+%endmacro
+
+%macro IADST16_PASS1_END 0
+ pxor m16, m16
+ psubd m1, m16, m1
+ psubd m3, m16, m3
+ psubd m13, m16, m13
+ psubd m15, m16, m15
+ REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11
+%endmacro
+
+INV_TXFM_16X16_FN adst, dct, 39-18
+INV_TXFM_16X16_FN adst, adst
+
+cglobal vp9_iadst_16x16_internal_10, 0, 7, 22, dst, stride, c, eob, tx2
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 1]
+ mova m2, [cq+64* 2]
+ mova m3, [cq+64* 3]
+ mova m4, [cq+64* 4]
+ mova m5, [cq+64* 5]
+ mova m6, [cq+64* 6]
+ mova m7, [cq+64* 7]
+ vpbroadcastd m20, [o(pd_8192)]
+ vpbroadcastd m21, [o(pd_11585)]
+ sub eobd, 39
+ jl .pass1_fast
+ mova m8, [cq+64* 8]
+ mova m9, [cq+64* 9]
+ mova m10, [cq+64*10]
+ mova m11, [cq+64*11]
+ mova m12, [cq+64*12]
+ mova m13, [cq+64*13]
+ mova m14, [cq+64*14]
+ mova m15, [cq+64*15]
+ call .main_part1
+ call .main_part2
+ IADST16_PASS1_END
+ jmp m(vp9_idct_16x16_internal_10).pass1_end
+.pass1_fast:
+ WRAP_YMM IADST16_PART1
+ WRAP_YMM IADST16_PART2
+ WRAP_YMM IADST16_PASS1_END
+ jmp m(vp9_idct_16x16_internal_10).pass1_fast_end
+.pass2:
+ test eobd, eobd
+ jl .pass2_fast
+ call .main_part1
+ jmp .pass2_end
+.pass2_fast:
+ call .main_part1_fast
+.pass2_end:
+ vpbroadcastd m20, [o(pd_532480)]
+ call .main_part2
+ vpbroadcastd m16, [o(pixel_clip6)]
+ REPX {paddd x, m16}, m0, m2, m12, m14
+ REPX {psubd x, m16, x}, m1, m3, m13, m15
+ REPX {psrad x, 6}, m0, m1, m2, m3
+ packssdw m0, m1
+ lea r6, [strideq*3]
+ packssdw m1, m2, m3
+ mova m2, [o(deintq_perm)]
+ vpbroadcastd m3, [o(pixel_clip)]
+ REPX {psrad x, 20}, m4, m5, m6, m7
+ call m(vp9_idct_16x16_internal_10).write_16x4
+ packssdw m0, m4, m5
+ packssdw m1, m6, m7
+ paddsw m0, m3
+ paddsw m1, m3
+ REPX {psrad x, 20}, m8, m9, m10, m11
+ call m(vp9_idct_16x16_internal_10).write_16x4
+ packssdw m0, m8, m9
+ packssdw m1, m10, m11
+ paddsw m0, m3
+ paddsw m1, m3
+ jmp m(vp9_idct_16x16_internal_10).pass2_end2
+ALIGN function_align
+ IADST16_PART1
+ ret
+ALIGN function_align
+.main_part2:
+ IADST16_PART2
+ ret
+
+cglobal vp9_idct_idct_32x32_add_10, 4, 7, 23, 64*64, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ dec eobd
+ jnz .pass1
+ imul r6d, [cq], 11585
+ vpbroadcastd m3, [o(pixel_clip)]
+ mov [cq], r3d
+ add r6d, 8192
+ sar r6d, 14
+ imul r6d, 11585
+ or r3d, 16
+ add r6d, 532480
+ sar r6d, 20
+ vpbroadcastw m2, r6d
+ paddsw m2, m3
+.dconly_loop:
+ paddsw m0, m2, [dstq+strideq*0]
+ paddsw m1, m2, [dstq+strideq*1]
+ psubusw m0, m3
+ psubusw m1, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ dec r3d
+ jg .dconly_loop
+ RET
+.pass1:
+ vpbroadcastd m20, [o(pd_8192)]
+ vpbroadcastd m21, [o(pd_11585)]
+ cmp eobd, 135
+ jl .pass1_fast
+ add cq, 64
+ lea r4, [rsp+64*8]
+ cmp eobd, 579
+ jl .pass1_right_fast
+ mov r6d, 128*28
+ call .pass1_main
+ jmp .pass1_right_end
+.pass1_right_fast: ; bottomright quadrant is zero
+ mova m0, [cq+128* 1]
+ mova m1, [cq+128* 3]
+ mova m2, [cq+128* 5]
+ mova m3, [cq+128* 7]
+ mova m4, [cq+128* 9]
+ mova m5, [cq+128*11]
+ mova m6, [cq+128*13]
+ mova m7, [cq+128*15]
+ call .main_fast
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 2]
+ mova m2, [cq+128* 4]
+ mova m3, [cq+128* 6]
+ mova m4, [cq+128* 8]
+ mova m5, [cq+128*10]
+ mova m6, [cq+128*12]
+ mova m7, [cq+128*14]
+ call m(vp9_idct_16x16_internal_10).main_part1_fast
+ mov r6d, 128*12
+ call .pass1_main_end
+.pass1_right_end:
+ mova [r4+64* 8], m0
+ mova [r4+64* 9], m1
+ mova [r4+64*10], m2
+ mova [r4+64*11], m3
+ mova [r4+64*12], m4
+ mova [r4+64*13], m5
+ mova [r4+64*14], m6
+ mova [r4+64*15], m7
+ mova [r4+64*16], m16
+ mova [r4+64*17], m17
+ mova [r4+64*18], m18
+ mova [r4+64*19], m19
+ mova [r4+64*20], m8
+ mova [r4+64*21], m9
+ mova [r4+64*22], m10
+ mova [r4+64*23], m11
+ sub cq, 64
+ sub r4, 64*8
+ mov r6d, 128*28
+ call .pass1_main
+ mova m12, [r4+64*20]
+ mova m13, [r4+64*21]
+ mova m14, [r4+64*22]
+ mova m15, [r4+64*23]
+ mova [r4+64*20], m8
+ mova [r4+64*21], m9
+ mova [r4+64*22], m10
+ mova [r4+64*23], m11
+ mova m8, [r4+64*16]
+ mova m9, [r4+64*17]
+ mova m10, [r4+64*18]
+ mova m11, [r4+64*19]
+ mova [r4+64*16], m16
+ mova [r4+64*17], m17
+ mova [r4+64*18], m18
+ mova [r4+64*19], m19
+ call .main
+ mova m0, [r4+64*16]
+ mova m1, [r4+64*17]
+ mova m2, [r4+64*18]
+ mova m3, [r4+64*19]
+ mova m4, [r4+64*20]
+ mova m5, [r4+64*21]
+ mova m6, [r4+64*22]
+ mova m7, [r4+64*23]
+ mova m8, [r4+64*24]
+ mova m9, [r4+64*25]
+ mova m10, [r4+64*26]
+ mova m11, [r4+64*27]
+ mova m12, [r4+64*28]
+ mova m13, [r4+64*29]
+ mova m14, [r4+64*30]
+ mova m15, [r4+64*31]
+ call m(vp9_idct_16x16_internal_10).main_part1
+ call .pass2_main_left
+ mova m8, [r4+64* 8]
+ mova m9, [r4+64* 9]
+ mova m10, [r4+64*10]
+ mova m11, [r4+64*11]
+ mova m12, [r4+64*12]
+ mova m13, [r4+64*13]
+ mova m14, [r4+64*14]
+ mova m15, [r4+64*15]
+ TRANSPOSE_4DQ 8, 10, 12, 14, 16
+ TRANSPOSE_4DQ 9, 11, 13, 15, 16
+ call .main
+ call .pass2_main_right
+ mova m8, [r4+64*24]
+ mova m9, [r4+64*25]
+ mova m10, [r4+64*26]
+ mova m11, [r4+64*27]
+ mova m12, [r4+64*28]
+ mova m13, [r4+64*29]
+ mova m14, [r4+64*30]
+ mova m15, [r4+64*31]
+ TRANSPOSE_4DQ 8, 10, 12, 14, 16
+ TRANSPOSE_4DQ 9, 11, 13, 15, 16
+ call m(vp9_idct_16x16_internal_10).main_part1
+ jmp .pass2_end
+.pass1_fast:
+ mova m0, [cq+128* 1]
+ mova m1, [cq+128* 3]
+ mova m2, [cq+128* 5]
+ mova m3, [cq+128* 7]
+ mova m4, [cq+128* 9]
+ mova m5, [cq+128*11]
+ mova m6, [cq+128*13]
+ mova m7, [cq+128*15]
+ mov r4, rsp
+ call .main_fast
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 2]
+ mova m2, [cq+128* 4]
+ mova m3, [cq+128* 6]
+ mova m4, [cq+128* 8]
+ mova m5, [cq+128*10]
+ mova m6, [cq+128*12]
+ mova m7, [cq+128*14]
+ call m(vp9_idct_16x16_internal_10).main_part1_fast
+ call m(vp9_idct_16x16_internal_10).main_part2
+ mov r6d, 128*12
+ call .pass1_main_end2
+ mova [r4+64*16], m16
+ mova [r4+64*17], m17
+ mova [r4+64*18], m18
+ mova [r4+64*19], m19
+ mova [r4+64*20], m8
+ mova [r4+64*21], m9
+ mova [r4+64*22], m10
+ mova [r4+64*23], m11
+ call .main_fast
+ mova m0, [r4+64*16]
+ mova m1, [r4+64*17]
+ mova m2, [r4+64*18]
+ mova m3, [r4+64*19]
+ mova m4, [r4+64*20]
+ mova m5, [r4+64*21]
+ mova m6, [r4+64*22]
+ mova m7, [r4+64*23]
+ call m(vp9_idct_16x16_internal_10).main_part1_fast
+ call .pass2_main_left
+ call .main_fast
+ call .pass2_main_right
+ call m(vp9_idct_16x16_internal_10).main_part1_fast
+.pass2_end:
+ paddd m0, m22
+ paddd m18, m22
+ call m(vp9_idct_16x16_internal_10).main_part2
+ mova m20, [o(deintq_perm)]
+ rorx r2, strideq, 59 ; strideq*32
+ vpbroadcastd m21, [o(pixel_clip)]
+ add r2, dstq
+%assign i 0
+%rep 16
+ mova m16, [r4+64*(15-i)]
+ mova m17, [r4+64*(i-16)]
+ mova m18, [r4-64*(17+i)]
+ paddd m19, m %+ i, m16
+ psubd m0, m %+ i, m16
+ call .write_32x2
+ %assign i i+1
+%endrep
+ RET
+ALIGN function_align
+.write_32x2:
+ paddd m16, m17, m18
+ psubd m17, m18
+ REPX {psrad x, 6}, m19, m16, m0, m17
+ packssdw m16, m19
+ packssdw m17, m0
+ sub r2, strideq
+ vpermq m16, m20, m16
+ vpermq m17, m20, m17
+ paddsw m16, [dstq]
+ paddsw m17, [r2 ]
+ psubusw m16, m21
+ psubusw m17, m21
+ mova [dstq], m16
+ mova [r2 ], m17
+ add dstq, strideq
+ ret
+ALIGN function_align
+.pass1_main:
+ mova m0, [cq+128* 1]
+ mova m1, [cq+128* 3]
+ mova m2, [cq+128* 5]
+ mova m3, [cq+128* 7]
+ mova m4, [cq+128* 9]
+ mova m5, [cq+128*11]
+ mova m6, [cq+128*13]
+ mova m7, [cq+128*15]
+ mova m8, [cq+128*17]
+ mova m9, [cq+128*19]
+ mova m10, [cq+128*21]
+ mova m11, [cq+128*23]
+ mova m12, [cq+128*25]
+ mova m13, [cq+128*27]
+ mova m14, [cq+128*29]
+ mova m15, [cq+128*31]
+ call .main
+ mova m0, [cq+128* 0]
+ mova m1, [cq+128* 2]
+ mova m2, [cq+128* 4]
+ mova m3, [cq+128* 6]
+ mova m4, [cq+128* 8]
+ mova m5, [cq+128*10]
+ mova m6, [cq+128*12]
+ mova m7, [cq+128*14]
+ mova m8, [cq+128*16]
+ mova m9, [cq+128*18]
+ mova m10, [cq+128*20]
+ mova m11, [cq+128*22]
+ mova m12, [cq+128*24]
+ mova m13, [cq+128*26]
+ mova m14, [cq+128*28]
+ mova m15, [cq+128*30]
+ call m(vp9_idct_16x16_internal_10).main_part1
+.pass1_main_end:
+ call m(vp9_idct_16x16_internal_10).main_part2
+.pass1_main_end2:
+ pxor m16, m16
+.pass1_zero_loop:
+ mova [cq+r6+128*0], m16
+ mova [cq+r6+128*1], m16
+ mova [cq+r6+128*2], m16
+ mova [cq+r6+128*3], m16
+ sub r6d, 128*4
+ jge .pass1_zero_loop
+ mova m16, [r4+64*15]
+ mova m19, [r4+64*14]
+ mova m22, [r4+64*13]
+ mova m17, [r4+64*12]
+ psubd m18, m0, m16
+ paddd m16, m0
+ paddd m0, m19, m1
+ psubd m19, m1, m19
+ paddd m1, m17, m3
+ psubd m3, m17
+ paddd m17, m2, m22
+ psubd m2, m22
+ TRANSPOSE_4D 3, 2, 19, 18, 22 ; 28 29 30 31
+ TRANSPOSE_4D 16, 0, 17, 1, 22 ; 0 1 2 3
+ mova [r4+64*54], m3
+ mova [r4+64*55], m19
+ mova [r4+64*38], m2
+ mova [r4+64*39], m18
+ mova m2, [r4+64*11]
+ mova m19, [r4+64*10]
+ mova m3, [r4+64* 9]
+ mova m22, [r4+64* 8]
+ paddd m18, m4, m2
+ psubd m4, m2
+ paddd m2, m5, m19
+ psubd m5, m19
+ paddd m19, m6, m3
+ psubd m6, m3
+ paddd m3, m7, m22
+ psubd m7, m22
+ TRANSPOSE_4D 7, 6, 5, 4, 22 ; 24 25 26 27
+ TRANSPOSE_4D 18, 2, 19, 3, 22 ; 4 5 6 7
+ mova [r4+64*52], m7
+ mova [r4+64*53], m5
+ mova [r4+64*36], m6
+ mova [r4+64*37], m4
+ mova m7, [r4+64* 7]
+ mova m4, [r4+64* 6]
+ mova m5, [r4+64* 5]
+ mova m22, [r4+64* 4]
+ psubd m6, m8, m7
+ paddd m8, m7
+ psubd m7, m9, m4
+ paddd m4, m9
+ paddd m9, m10, m5
+ psubd m10, m5
+ paddd m5, m11, m22
+ psubd m11, m22
+ TRANSPOSE_4D 11, 10, 7, 6, 22 ; 20 21 22 23
+ TRANSPOSE_4D 8, 4, 9, 5, 22 ; 8 9 10 11
+ mova [r4+64*50], m11
+ mova [r4+64*51], m7
+ mova [r4+64*34], m10
+ mova [r4+64*35], m6
+ mova m6, [r4+64* 3]
+ mova m11, [r4+64* 2]
+ mova m7, [r4+64* 1]
+ mova m22, [r4+64* 0]
+ paddd m10, m12, m6
+ psubd m12, m6
+ paddd m6, m13, m11
+ psubd m13, m11
+ paddd m11, m14, m7
+ psubd m14, m7
+ paddd m7, m15, m22
+ psubd m15, m22
+ TRANSPOSE_4D 15, 14, 13, 12, 22 ; 16 17 18 19
+ TRANSPOSE_4D 10, 6, 11, 7, 22 ; 12 13 14 15
+ mova [r4+64*48], m15
+ mova [r4+64*49], m13
+ mova [r4+64*32], m14
+ mova [r4+64*33], m12
+ TRANSPOSE_4DQ 0, 2, 4, 6, 22
+ TRANSPOSE_4DQ 1, 3, 5, 7, 22
+ TRANSPOSE_4DQ 16, 18, 8, 10, 22
+ TRANSPOSE_4DQ 17, 19, 9, 11, 22
+ ret
+ALIGN function_align
+.pass2_main_left:
+ vpbroadcastd m22, [o(pixel_clip6)]
+ paddd m0, m22
+ paddd m18, m22
+ call m(vp9_idct_16x16_internal_10).main_part2
+ mova [r4+64*16], m0
+ mova [r4+64*17], m1
+ mova [r4+64*18], m2
+ mova [r4+64*19], m3
+ mova [r4+64*20], m4
+ mova [r4+64*21], m5
+ mova [r4+64*22], m6
+ mova [r4+64*23], m7
+ mova [r4+64*24], m8
+ mova [r4+64*25], m9
+ mova [r4+64*26], m10
+ mova [r4+64*27], m11
+ mova [r4+64*28], m12
+ mova [r4+64*29], m13
+ mova [r4+64*30], m14
+ mova [r4+64*31], m15
+ add r4, 64*32
+ mova m0, [r4+64* 0]
+ mova m1, [r4+64* 1]
+ mova m2, [r4+64* 2]
+ mova m3, [r4+64* 3]
+ mova m4, [r4+64* 4]
+ mova m5, [r4+64* 5]
+ mova m6, [r4+64* 6]
+ mova m7, [r4+64* 7]
+ jmp .pass2_main_transpose
+ALIGN function_align
+.pass2_main_right:
+ mova m0, [r4+64*16]
+ mova m1, [r4+64*17]
+ mova m2, [r4+64*18]
+ mova m3, [r4+64*19]
+ mova m4, [r4+64*20]
+ mova m5, [r4+64*21]
+ mova m6, [r4+64*22]
+ mova m7, [r4+64*23]
+.pass2_main_transpose:
+ TRANSPOSE_4DQ 0, 2, 4, 6, 8
+ TRANSPOSE_4DQ 1, 3, 5, 7, 8
+ ret
+ALIGN function_align
+.main_fast:
+ pmulld m15, m0, [o(pd_16364)] {1to16} ; t31a
+ pmulld m0, [o(pd_804)] {1to16} ; t16a
+ pmulld m8, m7, [o(pd_11003)] {1to16} ; t17a
+ pmulld m7, [o(pd_12140)] {1to16} ; t30a
+ pmulld m11, m4, [o(pd_14811)] {1to16} ; t29a
+ pmulld m4, [o(pd_7005)] {1to16} ; t18a
+ pmulld m12, m3, [o(pd_5520)] {1to16} ; t19a
+ pmulld m3, [o(pd_15426)] {1to16} ; t28a
+ pmulld m13, m2, [o(pd_15893)] {1to16} ; t27a
+ pmulld m2, [o(pd_3981)] {1to16} ; t20a
+ pmulld m10, m5, [o(pd_8423)] {1to16} ; t21a
+ pmulld m5, [o(pd_14053)] {1to16} ; t26a
+ pmulld m9, m6, [o(pd_13160)] {1to16} ; t25a
+ pmulld m6, [o(pd_9760)] {1to16} ; t22a
+ pmulld m14, m1, [o(pd_2404)] {1to16} ; t23a
+ pmulld m1, [o(pd_16207)] {1to16} ; t24a
+ REPX {psubd x, m20, x}, m8, m12, m10, m14
+ jmp .main2
+ALIGN function_align
+.main:
+ ITX_MULSUB_2D 0, 15, 16, 17, 18, _, 804, 16364 ; t16a, t31a
+ ITX_MULSUB_2D 8, 7, 16, 17, 18, _, 12140, 11003 ; t17a, t30a
+ ITX_MULSUB_2D 4, 11, 16, 17, 18, _, 7005, 14811 ; t18a, t29a
+ ITX_MULSUB_2D 12, 3, 16, 17, 18, _, 15426, 5520 ; t19a, t28a
+ ITX_MULSUB_2D 2, 13, 16, 17, 18, _, 3981, 15893 ; t20a, t27a
+ ITX_MULSUB_2D 10, 5, 16, 17, 18, _, 14053, 8423 ; t21a, t26a
+ ITX_MULSUB_2D 6, 9, 16, 17, 18, _, 9760, 13160 ; t22a, t25a
+ ITX_MULSUB_2D 14, 1, 16, 17, 18, _, 16207, 2404 ; t23a, t24a
+ REPX {paddd x, m20}, m8, m12, m10, m14
+.main2:
+ REPX {paddd x, m20}, m0, m15, m7, m4, m3, m11
+ REPX {psrad x, 14 }, m8, m0, m15, m7, m12, m4, m3, m11
+ psubd m16, m0, m8 ; t17
+ paddd m0, m8 ; t16
+ psubd m8, m15, m7 ; t30
+ paddd m15, m7 ; t31
+ paddd m7, m12, m4 ; t19
+ psubd m12, m4 ; t18
+ paddd m4, m3, m11 ; t28
+ psubd m3, m11 ; t29
+ REPX {paddd x, m20}, m2, m13, m5, m6, m1, m9
+ REPX {psrad x, 14 }, m10, m2, m13, m5, m14, m6, m1, m9
+ psubd m11, m2, m10 ; t21
+ paddd m2, m10 ; t20
+ psubd m10, m13, m5 ; t26
+ paddd m13, m5 ; t27
+ psubd m5, m14, m6 ; t22
+ paddd m6, m14 ; t23
+ psubd m14, m1, m9 ; t25
+ paddd m9, m1 ; t24
+ vpbroadcastd m19, [o(pd_16069)]
+ vpbroadcastd m18, [o(pd_3196)]
+ ITX_MULSUB_2D 8, 16, 1, 17, _, 20, 18, 19 ; t17a, t30a
+ ITX_MULSUB_2D 3, 12, 1, 17, _, 20, 18, 19, 1 ; t29a, t18a
+ vpbroadcastd m19, [o(pd_9102)]
+ vpbroadcastd m18, [o(pd_13623)]
+ ITX_MULSUB_2D 10, 11, 1, 17, _, 20, 18, 19 ; t21a, t26a
+ ITX_MULSUB_2D 14, 5, 1, 17, _, 20, 18, 19, 1 ; t25a, t22a
+ paddd m1, m6, m2 ; t23a
+ psubd m6, m2 ; t20a
+ psubd m2, m9, m13 ; t27a
+ paddd m9, m13 ; t24a
+ psubd m13, m15, m4 ; t28a
+ paddd m15, m4 ; t31a
+ psubd m4, m8, m12 ; t18
+ paddd m8, m12 ; t17
+ psubd m12, m0, m7 ; t19a
+ paddd m0, m7 ; t16a
+ psubd m7, m16, m3 ; t29
+ paddd m3, m16 ; t30
+ paddd m16, m5, m10 ; t22
+ psubd m5, m10 ; t21
+ psubd m10, m14, m11 ; t26
+ paddd m14, m11 ; t25
+ vpbroadcastd m19, [o(pd_15137)]
+ vpbroadcastd m18, [o(pd_6270)]
+ ITX_MULSUB_2D 13, 12, 11, 17, _, 20, 18, 19 ; t19, t28
+ ITX_MULSUB_2D 2, 6, 11, 17, _, 20, 18, 19, 1 ; t27, t20
+ ITX_MULSUB_2D 7, 4, 11, 17, _, 20, 18, 19 ; t18a, t29a
+ ITX_MULSUB_2D 10, 5, 11, 17, _, 20, 18, 19, 1 ; t26a, t21a
+ psubd m11, m0, m1 ; t23
+ paddd m0, m1 ; t16
+ paddd m1, m16, m8 ; t17a
+ psubd m16, m8, m16 ; t22a
+ psubd m8, m15, m9 ; t24
+ paddd m15, m9 ; t31
+ psubd m9, m3, m14 ; t25a
+ paddd m14, m3 ; t30a
+ paddd m3, m6, m13 ; t19a
+ psubd m6, m13, m6 ; t20a
+ paddd m13, m10, m4 ; t29
+ psubd m10, m4, m10 ; t26
+ psubd m4, m12, m2 ; t27a
+ paddd m12, m2 ; t28a
+ paddd m2, m7, m5 ; t18
+ psubd m7, m5 ; t21
+ REPX {pmulld x, m21}, m10, m8, m4, m9, m7, m11, m6, m16
+ mova [r4+64* 0], m0
+ mova [r4+64* 1], m1
+ mova [r4+64* 2], m2
+ mova [r4+64* 3], m3
+ mova [r4+64*12], m12
+ mova [r4+64*13], m13
+ mova [r4+64*14], m14
+ mova [r4+64*15], m15
+ REPX {paddd x, m20}, m10, m8, m4, m9
+ psubd m5, m10, m7 ; t21a
+ paddd m10, m7 ; t26a
+ psubd m7, m8, m11 ; t23a
+ paddd m8, m11 ; t24a
+ REPX {psrad x, 14 }, m5, m10, m7, m8
+ paddd m11, m4, m6 ; t27
+ psubd m4, m6 ; t20
+ psubd m6, m9, m16 ; t22
+ paddd m9, m16 ; t25
+ REPX {psrad x, 14 }, m11, m4, m6, m9
+ mova [r4+64* 4], m4
+ mova [r4+64* 5], m5
+ mova [r4+64* 6], m6
+ mova [r4+64* 7], m7
+ mova [r4+64* 8], m8
+ mova [r4+64* 9], m9
+ mova [r4+64*10], m10
+ mova [r4+64*11], m11
+ ret
+
+%endif