[FFmpeg-devel] [PATCH 6/8] sbcenc: add MMX optimizations
James Almer
jamrial at gmail.com
Wed Dec 20 23:26:38 EET 2017
On 12/20/2017 5:06 PM, Aurelien Jacobs wrote:
> On Wed, Dec 20, 2017 at 03:47:35PM -0300, James Almer wrote:
>> On 12/17/2017 6:47 PM, Aurelien Jacobs wrote:
>>> This was originally based on libsbc, and was fully integrated into ffmpeg.
>>> ---
>>> libavcodec/sbcdsp.c | 3 +
>>> libavcodec/sbcdsp.h | 2 +
>>> libavcodec/x86/Makefile | 2 +
>>> libavcodec/x86/sbcdsp.asm | 284 +++++++++++++++++++++++++++++++++++++++++++
>>> libavcodec/x86/sbcdsp_init.c | 51 ++++++++
>>> 5 files changed, 342 insertions(+)
>>> create mode 100644 libavcodec/x86/sbcdsp.asm
>>> create mode 100644 libavcodec/x86/sbcdsp_init.c
>>>
>>> diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c
>>> index 16faf5ba9b..9bb60cdd5e 100644
>>> --- a/libavcodec/sbcdsp.c
>>> +++ b/libavcodec/sbcdsp.c
>>> @@ -387,4 +387,7 @@ av_cold void ff_sbcdsp_init(SBCDSPContext *s)
>>> /* Default implementation for scale factors calculation */
>>> s->sbc_calc_scalefactors = sbc_calc_scalefactors;
>>> s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j;
>>> +
>>> + if (ARCH_X86)
>>> + ff_sbcdsp_init_x86(s);
>>> }
>>> diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h
>>> index 66ed7d324e..127e6a8a11 100644
>>> --- a/libavcodec/sbcdsp.h
>>> +++ b/libavcodec/sbcdsp.h
>>> @@ -80,4 +80,6 @@ struct sbc_dsp_context {
>>> */
>>> void ff_sbcdsp_init(SBCDSPContext *s);
>>>
>>> +void ff_sbcdsp_init_x86(SBCDSPContext *s);
>>> +
>>> #endif /* AVCODEC_SBCDSP_H */
>>> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
>>> index a805cd37b4..2350c8bbee 100644
>>> --- a/libavcodec/x86/Makefile
>>> +++ b/libavcodec/x86/Makefile
>>> @@ -63,6 +63,7 @@ OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
>>> OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
>>> OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
>>> OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o
>>> +OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o
>>> OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
>>> OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o
>>> OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
>>> @@ -172,6 +173,7 @@ X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
>>> X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
>>> X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
>>> X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
>>> +X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o
>>> X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
>>> X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o
>>> X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
>>> diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm
>>> new file mode 100644
>>> index 0000000000..00b48a821b
>>> --- /dev/null
>>> +++ b/libavcodec/x86/sbcdsp.asm
>>> @@ -0,0 +1,284 @@
>>> +;******************************************************************************
>>> +;* SIMD optimized SBC encoder DSP functions
>>> +;*
>>> +;* Copyright (C) 2017 Aurelien Jacobs <aurel at gnuage.org>
>>> +;* Copyright (C) 2008-2010 Nokia Corporation
>>> +;* Copyright (C) 2004-2010 Marcel Holtmann <marcel at holtmann.org>
>>> +;* Copyright (C) 2004-2005 Henryk Ploetz <henryk at ploetzli.ch>
>>> +;* Copyright (C) 2005-2006 Brad Midgley <bmidgley at xmission.com>
>>> +;*
>>> +;* This file is part of FFmpeg.
>>> +;*
>>> +;* FFmpeg is free software; you can redistribute it and/or
>>> +;* modify it under the terms of the GNU Lesser General Public
>>> +;* License as published by the Free Software Foundation; either
>>> +;* version 2.1 of the License, or (at your option) any later version.
>>> +;*
>>> +;* FFmpeg is distributed in the hope that it will be useful,
>>> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>>> +;* Lesser General Public License for more details.
>>> +;*
>>> +;* You should have received a copy of the GNU Lesser General Public
>>> +;* License along with FFmpeg; if not, write to the Free Software
>>> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>>> +;******************************************************************************
>>> +
>>> +%include "libavutil/x86/x86util.asm"
>>> +
>>> +SECTION_RODATA
>>> +
>>> +scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1)
>>> +
>>> +SECTION .text
>>> +
>>> +;*******************************************************************
>>> +;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
>>> +;*******************************************************************
>>> +INIT_MMX mmx
>>> +cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
>>> + movq m0, [inq]
>>> + movq m1, [inq+8]
>>> + pmaddwd m0, [constsq]
>>> + pmaddwd m1, [constsq+8]
>>> + paddd m0, [scale_mask]
>>> + paddd m1, [scale_mask]
>>> +
>>> + movq m2, [inq+16]
>>> + movq m3, [inq+24]
>>> + pmaddwd m2, [constsq+16]
>>> + pmaddwd m3, [constsq+24]
>>> + paddd m0, m2
>>> + paddd m1, m3
>>> +
>>> + movq m2, [inq+32]
>>> + movq m3, [inq+40]
>>> + pmaddwd m2, [constsq+32]
>>> + pmaddwd m3, [constsq+40]
>>> + paddd m0, m2
>>> + paddd m1, m3
>>> +
>>> + movq m2, [inq+48]
>>> + movq m3, [inq+56]
>>> + pmaddwd m2, [constsq+48]
>>> + pmaddwd m3, [constsq+56]
>>> + paddd m0, m2
>>> + paddd m1, m3
>>> +
>>> + movq m2, [inq+64]
>>> + movq m3, [inq+72]
>>> + pmaddwd m2, [constsq+64]
>>> + pmaddwd m3, [constsq+72]
>>> + paddd m0, m2
>>> + paddd m1, m3
>>> +
>>> + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE
>>> + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE
>>> + packssdw m0, m0
>>> + packssdw m1, m1
>>> +
>>> + movq m2, m0
>>> + pmaddwd m0, [constsq+80]
>>> + pmaddwd m2, [constsq+88]
>>> +
>>> + movq m3, m1
>>> + pmaddwd m1, [constsq+96]
>>> + pmaddwd m3, [constsq+104]
>>> + paddd m0, m1
>>> + paddd m2, m3
>>> +
>>> + movq [outq ], m0
>>> + movq [outq+8], m2
>>> +
>>> + RET
>>> +
>>> +
>>> +
>>> +;*******************************************************************
>>> +;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
>>> +;*******************************************************************
>>> +INIT_MMX mmx
>>> +cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
>>> + movq m0, [inq]
>>> + movq m1, [inq+8]
>>> + movq m2, [inq+16]
>>> + movq m3, [inq+24]
>>> + pmaddwd m0, [constsq]
>>> + pmaddwd m1, [constsq+8]
>>> + pmaddwd m2, [constsq+16]
>>> + pmaddwd m3, [constsq+24]
>>> + paddd m0, [scale_mask]
>>> + paddd m1, [scale_mask]
>>> + paddd m2, [scale_mask]
>>> + paddd m3, [scale_mask]
>>> +
>>> + movq m4, [inq+32]
>>> + movq m5, [inq+40]
>>> + movq m6, [inq+48]
>>> + movq m7, [inq+56]
>>> + pmaddwd m4, [constsq+32]
>>> + pmaddwd m5, [constsq+40]
>>> + pmaddwd m6, [constsq+48]
>>> + pmaddwd m7, [constsq+56]
>>> + paddd m0, m4
>>> + paddd m1, m5
>>> + paddd m2, m6
>>> + paddd m3, m7
>>> +
>>> + movq m4, [inq+64]
>>> + movq m5, [inq+72]
>>> + movq m6, [inq+80]
>>> + movq m7, [inq+88]
>>> + pmaddwd m4, [constsq+64]
>>> + pmaddwd m5, [constsq+72]
>>> + pmaddwd m6, [constsq+80]
>>> + pmaddwd m7, [constsq+88]
>>> + paddd m0, m4
>>> + paddd m1, m5
>>> + paddd m2, m6
>>> + paddd m3, m7
>>> +
>>> + movq m4, [inq+96]
>>> + movq m5, [inq+104]
>>> + movq m6, [inq+112]
>>> + movq m7, [inq+120]
>>> + pmaddwd m4, [constsq+96]
>>> + pmaddwd m5, [constsq+104]
>>> + pmaddwd m6, [constsq+112]
>>> + pmaddwd m7, [constsq+120]
>>> + paddd m0, m4
>>> + paddd m1, m5
>>> + paddd m2, m6
>>> + paddd m3, m7
>>> +
>>> + movq m4, [inq+128]
>>> + movq m5, [inq+136]
>>> + movq m6, [inq+144]
>>> + movq m7, [inq+152]
>>> + pmaddwd m4, [constsq+128]
>>> + pmaddwd m5, [constsq+136]
>>> + pmaddwd m6, [constsq+144]
>>> + pmaddwd m7, [constsq+152]
>>> + paddd m0, m4
>>> + paddd m1, m5
>>> + paddd m2, m6
>>> + paddd m3, m7
>>> +
>>> + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE
>>> + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE
>>> + psrad m2, 16 ; SBC_PROTO_FIXED_SCALE
>>> + psrad m3, 16 ; SBC_PROTO_FIXED_SCALE
>>> +
>>> + packssdw m0, m0
>>> + packssdw m1, m1
>>> + packssdw m2, m2
>>> + packssdw m3, m3
>>> +
>>> + movq m4, m0
>>> + movq m5, m0
>>> + pmaddwd m4, [constsq+160]
>>> + pmaddwd m5, [constsq+168]
>>> +
>>> + movq m6, m1
>>> + movq m7, m1
>>> + pmaddwd m6, [constsq+192]
>>> + pmaddwd m7, [constsq+200]
>>> + paddd m4, m6
>>> + paddd m5, m7
>>> +
>>> + movq m6, m2
>>> + movq m7, m2
>>> + pmaddwd m6, [constsq+224]
>>> + pmaddwd m7, [constsq+232]
>>> + paddd m4, m6
>>> + paddd m5, m7
>>> +
>>> + movq m6, m3
>>> + movq m7, m3
>>> + pmaddwd m6, [constsq+256]
>>> + pmaddwd m7, [constsq+264]
>>> + paddd m4, m6
>>> + paddd m5, m7
>>> +
>>> + movq [outq ], m4
>>> + movq [outq+8], m5
>>> +
>>> + movq m5, m0
>>> + pmaddwd m0, [constsq+176]
>>> + pmaddwd m5, [constsq+184]
>>> +
>>> + movq m7, m1
>>> + pmaddwd m1, [constsq+208]
>>> + pmaddwd m7, [constsq+216]
>>> + paddd m0, m1
>>> + paddd m5, m7
>>> +
>>> + movq m7, m2
>>> + pmaddwd m2, [constsq+240]
>>> + pmaddwd m7, [constsq+248]
>>> + paddd m0, m2
>>> + paddd m5, m7
>>> +
>>> + movq m7, m3
>>> + pmaddwd m3, [constsq+272]
>>> + pmaddwd m7, [constsq+280]
>>> + paddd m0, m3
>>> + paddd m5, m7
>>> +
>>> + movq [outq+16], m0
>>> + movq [outq+24], m5
>>> +
>>> + RET
>>> +
>>> +
>>> +;*******************************************************************
>>> +;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
>>> +; uint32_t scale_factor[2][8],
>>> +; int blocks, int channels, int subbands)
>>> +;*******************************************************************
>>> +INIT_MMX mmx
>>> +cglobal sbc_calc_scalefactors, 5, 7, 3, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk
>>> + ; subbands = 4 * subbands * channels
>>> + shl subbandsq, 2
>>> + cmp channelsq, 2
>>> + jl .loop_1
>>> + shl subbandsq, 1
>>> +
>>> +.loop_1:
>>> + sub subbandsq, 8
>>> + lea ptrq, [sb_sample_fq + subbandsq]
>>> +
>>> + ; blk = (blocks - 1) * 64;
>>> + lea blkq, [blocksq - 1]
>>> + shl blkq, 6
>>> +
>>> + movq m0, [scale_mask]
>>> +.loop_2:
>>> + movq m1, [ptrq+blkq]
>>> + pxor m2, m2
>>> + pcmpgtd m1, m2
>>> + paddd m1, [ptrq+blkq]
>>> + pcmpgtd m2, m1
>>> + pxor m1, m2
>>> +
>>> + por m0, m1
>>> +
>>> + sub blkq, 64
>>> + jns .loop_2
>>> +
>>> + movd blkd, m0
>>> + psrlq m0, 32
>>> + bsr blkd, blkd
>>> + sub blkd, 15 ; SCALE_OUT_BITS
>>> + mov [scale_factorq + subbandsq], blkd
>>> +
>>> + movd blkd, m0
>>> + bsr blkd, blkd
>>> + sub blkd, 15 ; SCALE_OUT_BITS
>>> + mov [scale_factorq + subbandsq + 4], blkd
>>> +
>>> + cmp subbandsq, 0
>>> + jg .loop_1
>>> +
>>> + emms
>>> + RET
>>
>> These should be done in SSE2. There's no reason for them to be MMX.
>
> There is at least one reason for it to be MMX. It is existing legacy
> code that is ported to ffmpeg. So the MMX code is here and working.
The code had to be adapted to x86inc syntax, so it's not a copy paste
done without effort. Said effort could have also been used to turn them
into SSE2, as it's trivial at least for the first two.
I'm not discrediting your work in porting these, I'm saying that said
work should have gone into making them up to date as well.
I'm also not blocking this patch if you decide to not make them SSE2.
I'll point a few things about the existing asm in a separate email.
More information about the ffmpeg-devel
mailing list