[FFmpeg-devel] [PATCH 6/8] sbcenc: add MMX optimizations
James Almer
jamrial at gmail.com
Wed Dec 20 23:58:19 EET 2017
On 12/17/2017 6:47 PM, Aurelien Jacobs wrote:
> This was originally based on libsbc, and was fully integrated into ffmpeg.
> ---
> libavcodec/sbcdsp.c | 3 +
> libavcodec/sbcdsp.h | 2 +
> libavcodec/x86/Makefile | 2 +
> libavcodec/x86/sbcdsp.asm | 284 +++++++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/sbcdsp_init.c | 51 ++++++++
> 5 files changed, 342 insertions(+)
> create mode 100644 libavcodec/x86/sbcdsp.asm
> create mode 100644 libavcodec/x86/sbcdsp_init.c
>
> diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c
> index 16faf5ba9b..9bb60cdd5e 100644
> --- a/libavcodec/sbcdsp.c
> +++ b/libavcodec/sbcdsp.c
> @@ -387,4 +387,7 @@ av_cold void ff_sbcdsp_init(SBCDSPContext *s)
> /* Default implementation for scale factors calculation */
> s->sbc_calc_scalefactors = sbc_calc_scalefactors;
> s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j;
> +
> + if (ARCH_X86)
> + ff_sbcdsp_init_x86(s);
> }
> diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h
> index 66ed7d324e..127e6a8a11 100644
> --- a/libavcodec/sbcdsp.h
> +++ b/libavcodec/sbcdsp.h
> @@ -80,4 +80,6 @@ struct sbc_dsp_context {
> */
> void ff_sbcdsp_init(SBCDSPContext *s);
>
> +void ff_sbcdsp_init_x86(SBCDSPContext *s);
> +
> #endif /* AVCODEC_SBCDSP_H */
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index a805cd37b4..2350c8bbee 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -63,6 +63,7 @@ OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
> OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
> OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
> OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o
> +OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o
> OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o
> OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o
> OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o
> @@ -172,6 +173,7 @@ X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
> X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
> X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
> X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o
> +X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o
> X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o
> X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o
> X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
> diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm
> new file mode 100644
> index 0000000000..00b48a821b
> --- /dev/null
> +++ b/libavcodec/x86/sbcdsp.asm
> @@ -0,0 +1,284 @@
> +;******************************************************************************
> +;* SIMD optimized SBC encoder DSP functions
> +;*
> +;* Copyright (C) 2017 Aurelien Jacobs <aurel at gnuage.org>
> +;* Copyright (C) 2008-2010 Nokia Corporation
> +;* Copyright (C) 2004-2010 Marcel Holtmann <marcel at holtmann.org>
> +;* Copyright (C) 2004-2005 Henryk Ploetz <henryk at ploetzli.ch>
> +;* Copyright (C) 2005-2006 Brad Midgley <bmidgley at xmission.com>
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +
> +scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1)
> +
> +SECTION .text
> +
> +;*******************************************************************
> +;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
> +;*******************************************************************
> +INIT_MMX mmx
> +cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
> + movq m0, [inq]
> + movq m1, [inq+8]
> + pmaddwd m0, [constsq]
> + pmaddwd m1, [constsq+8]
> + paddd m0, [scale_mask]
> + paddd m1, [scale_mask]
> +
> + movq m2, [inq+16]
> + movq m3, [inq+24]
> + pmaddwd m2, [constsq+16]
> + pmaddwd m3, [constsq+24]
> + paddd m0, m2
> + paddd m1, m3
> +
> + movq m2, [inq+32]
> + movq m3, [inq+40]
> + pmaddwd m2, [constsq+32]
> + pmaddwd m3, [constsq+40]
> + paddd m0, m2
> + paddd m1, m3
> +
> + movq m2, [inq+48]
> + movq m3, [inq+56]
> + pmaddwd m2, [constsq+48]
> + pmaddwd m3, [constsq+56]
> + paddd m0, m2
> + paddd m1, m3
> +
> + movq m2, [inq+64]
> + movq m3, [inq+72]
> + pmaddwd m2, [constsq+64]
> + pmaddwd m3, [constsq+72]
> + paddd m0, m2
> + paddd m1, m3
> +
> + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE
> + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE
> + packssdw m0, m0
> + packssdw m1, m1
> +
> + movq m2, m0
> + pmaddwd m0, [constsq+80]
> + pmaddwd m2, [constsq+88]
> +
> + movq m3, m1
> + pmaddwd m1, [constsq+96]
> + pmaddwd m3, [constsq+104]
> + paddd m0, m1
> + paddd m2, m3
> +
> + movq [outq ], m0
> + movq [outq+8], m2
> +
> + RET
> +
> +
> +
> +;*******************************************************************
> +;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
> +;*******************************************************************
> +INIT_MMX mmx
> +cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
> + movq m0, [inq]
> + movq m1, [inq+8]
> + movq m2, [inq+16]
> + movq m3, [inq+24]
> + pmaddwd m0, [constsq]
> + pmaddwd m1, [constsq+8]
> + pmaddwd m2, [constsq+16]
> + pmaddwd m3, [constsq+24]
> + paddd m0, [scale_mask]
> + paddd m1, [scale_mask]
> + paddd m2, [scale_mask]
> + paddd m3, [scale_mask]
> +
> + movq m4, [inq+32]
> + movq m5, [inq+40]
> + movq m6, [inq+48]
> + movq m7, [inq+56]
> + pmaddwd m4, [constsq+32]
> + pmaddwd m5, [constsq+40]
> + pmaddwd m6, [constsq+48]
> + pmaddwd m7, [constsq+56]
> + paddd m0, m4
> + paddd m1, m5
> + paddd m2, m6
> + paddd m3, m7
> +
> + movq m4, [inq+64]
> + movq m5, [inq+72]
> + movq m6, [inq+80]
> + movq m7, [inq+88]
> + pmaddwd m4, [constsq+64]
> + pmaddwd m5, [constsq+72]
> + pmaddwd m6, [constsq+80]
> + pmaddwd m7, [constsq+88]
> + paddd m0, m4
> + paddd m1, m5
> + paddd m2, m6
> + paddd m3, m7
> +
> + movq m4, [inq+96]
> + movq m5, [inq+104]
> + movq m6, [inq+112]
> + movq m7, [inq+120]
> + pmaddwd m4, [constsq+96]
> + pmaddwd m5, [constsq+104]
> + pmaddwd m6, [constsq+112]
> + pmaddwd m7, [constsq+120]
> + paddd m0, m4
> + paddd m1, m5
> + paddd m2, m6
> + paddd m3, m7
> +
> + movq m4, [inq+128]
> + movq m5, [inq+136]
> + movq m6, [inq+144]
> + movq m7, [inq+152]
> + pmaddwd m4, [constsq+128]
> + pmaddwd m5, [constsq+136]
> + pmaddwd m6, [constsq+144]
> + pmaddwd m7, [constsq+152]
> + paddd m0, m4
> + paddd m1, m5
> + paddd m2, m6
> + paddd m3, m7
> +
> + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE
> + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE
> + psrad m2, 16 ; SBC_PROTO_FIXED_SCALE
> + psrad m3, 16 ; SBC_PROTO_FIXED_SCALE
> +
> + packssdw m0, m0
> + packssdw m1, m1
> + packssdw m2, m2
> + packssdw m3, m3
> +
> + movq m4, m0
> + movq m5, m0
> + pmaddwd m4, [constsq+160]
> + pmaddwd m5, [constsq+168]
> +
> + movq m6, m1
> + movq m7, m1
> + pmaddwd m6, [constsq+192]
> + pmaddwd m7, [constsq+200]
> + paddd m4, m6
> + paddd m5, m7
> +
> + movq m6, m2
> + movq m7, m2
> + pmaddwd m6, [constsq+224]
> + pmaddwd m7, [constsq+232]
> + paddd m4, m6
> + paddd m5, m7
> +
> + movq m6, m3
> + movq m7, m3
> + pmaddwd m6, [constsq+256]
> + pmaddwd m7, [constsq+264]
> + paddd m4, m6
> + paddd m5, m7
> +
> + movq [outq ], m4
> + movq [outq+8], m5
> +
> + movq m5, m0
> + pmaddwd m0, [constsq+176]
> + pmaddwd m5, [constsq+184]
> +
> + movq m7, m1
> + pmaddwd m1, [constsq+208]
> + pmaddwd m7, [constsq+216]
> + paddd m0, m1
> + paddd m5, m7
> +
> + movq m7, m2
> + pmaddwd m2, [constsq+240]
> + pmaddwd m7, [constsq+248]
> + paddd m0, m2
> + paddd m5, m7
> +
> + movq m7, m3
> + pmaddwd m3, [constsq+272]
> + pmaddwd m7, [constsq+280]
> + paddd m0, m3
> + paddd m5, m7
> +
> + movq [outq+16], m0
> + movq [outq+24], m5
> +
> + RET
> +
> +
> +;*******************************************************************
> +;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8],
> +; uint32_t scale_factor[2][8],
> +; int blocks, int channels, int subbands)
> +;*******************************************************************
> +INIT_MMX mmx
> +cglobal sbc_calc_scalefactors, 5, 7, 3, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk
On x86_64 (Windows at least), the high 32 bits of registers storing int
arguments may contain garbage, so you need to work around it.
For blocks, add a "movsxdifnidn blocksq, blocksd" instruction line at
the beginning of the function. For the other two see below.
> + ; subbands = 4 * subbands * channels
> + shl subbandsq, 2
shl subbandsd, 2
Writing to the 32-bit register implicitly zeroes the high 32 bits of
the full 64-bit register.
> + cmp channelsq, 2
cmp channelsd, 2
If either of the two above can have negative values, then you'll also
have to use movsxdifnidn on them and keep the q suffix.
> + jl .loop_1
> + shl subbandsq, 1
> +
> +.loop_1:
> + sub subbandsq, 8
> + lea ptrq, [sb_sample_fq + subbandsq]
> +
> + ; blk = (blocks - 1) * 64;
> + lea blkq, [blocksq - 1]
> + shl blkq, 6
> +
> + movq m0, [scale_mask]
You could keep the scale_mask constant in a separate register, loaded
once before .loop_1, to avoid having to reload it from memory on every
iteration of that outer loop.
> +.loop_2:
> + movq m1, [ptrq+blkq]
> + pxor m2, m2
> + pcmpgtd m1, m2
> + paddd m1, [ptrq+blkq]
Similarly, you could load [ptrq+blkq] once into a register and reuse it
here instead of reading it from memory twice, although I'm not sure it
will make much difference, to be honest.
> + pcmpgtd m2, m1
> + pxor m1, m2
> +
> + por m0, m1
> +
> + sub blkq, 64
> + jns .loop_2
> +
> + movd blkd, m0
> + psrlq m0, 32
> + bsr blkd, blkd
> + sub blkd, 15 ; SCALE_OUT_BITS
> + mov [scale_factorq + subbandsq], blkd
> +
> + movd blkd, m0
> + bsr blkd, blkd
> + sub blkd, 15 ; SCALE_OUT_BITS
> + mov [scale_factorq + subbandsq + 4], blkd
> +
> + cmp subbandsq, 0
> + jg .loop_1
> +
> + emms
> + RET
> diff --git a/libavcodec/x86/sbcdsp_init.c b/libavcodec/x86/sbcdsp_init.c
> new file mode 100644
> index 0000000000..86effecfdf
> --- /dev/null
> +++ b/libavcodec/x86/sbcdsp_init.c
> @@ -0,0 +1,51 @@
> +/*
> + * Bluetooth low-complexity, subband codec (SBC)
> + *
> + * Copyright (C) 2017 Aurelien Jacobs <aurel at gnuage.org>
> + * Copyright (C) 2008-2010 Nokia Corporation
> + * Copyright (C) 2004-2010 Marcel Holtmann <marcel at holtmann.org>
> + * Copyright (C) 2004-2005 Henryk Ploetz <henryk at ploetzli.ch>
> + * Copyright (C) 2005-2006 Brad Midgley <bmidgley at xmission.com>
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +/**
> + * @file
> + * SBC MMX optimization for some basic "building bricks"
> + */
> +
> +#include "libavutil/cpu.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavcodec/sbcdsp.h"
> +
> +void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
> +void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
> +void ff_sbc_calc_scalefactors_mmx(int32_t sb_sample_f[16][2][8],
> + uint32_t scale_factor[2][8],
> + int blocks, int channels, int subbands);
> +
> +av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
> +{
> + int cpu_flags = av_get_cpu_flags();
> +
> + if (EXTERNAL_MMX(cpu_flags)) {
> + s->sbc_analyze_4 = ff_sbc_analyze_4_mmx;
> + s->sbc_analyze_8 = ff_sbc_analyze_8_mmx;
> + s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
> + }
> +}
>
More information about the ffmpeg-devel
mailing list