[FFmpeg-devel] [PATCH 3/3] x86: sbrdsp: implement SSE2 hf_apply_noise

Michael Niedermayer michaelni at gmx.at
Sat Apr 13 00:53:20 CEST 2013


On Fri, Apr 12, 2013 at 07:14:58PM +0200, Christophe Gisquet wrote:
> Hi,
> 
> 2013/4/12 Michael Niedermayer <michaelni at gmx.at>:
> > Applying this, or 2/3 and this, I get
> > libavcodec/x86/sbrdsp.asm:357: error: (PROLOGUE:2) cannot reference symbol `NREGS' in preprocessor
> > libavcodec/x86/sbrdsp.asm:364: error: (PROLOGUE:2) cannot reference symbol `NREGS' in preprocessor
> > libavcodec/x86/sbrdsp.asm:367: error: (LOAD_NST:1) cannot reference symbol `NREGS' in preprocessor
> > libavcodec/x86/sbrdsp.asm:373: error: (PROLOGUE:2) cannot reference symbol `NREGS' in preprocessor
> > libavcodec/x86/sbrdsp.asm:380: error: (PROLOGUE:2) cannot reference symbol `NREGS' in preprocessor
> > libavcodec/x86/sbrdsp.asm:383: error: (LOAD_NST:1) cannot reference symbol `NREGS' in preprocessor
> > libavcodec/x86/sbrdsp.asm:394: error: cannot reference symbol `NREGS' in preprocessor
> 
> Indeed, bad conflict resolution after rebasing I guess.
> 
> Here's a fixed version, for which fate-aac runs fine on win32 and win64.
> 
> --
> Christophe

>  aacsbrdata.h      |    6 ++-
>  x86/sbrdsp.asm    |  108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  x86/sbrdsp_init.c |   16 ++++++++
>  3 files changed, 129 insertions(+), 1 deletion(-)
> e69ce59238c11ae266ec0794bdca182af62d0b73  0002-x86-sbrdsp-implement-SSE2-hf_apply_noise.patch
> From bf49837b8cfef2652c6dfe46031b8685ed9af217 Mon Sep 17 00:00:00 2001
> From: Christophe Gisquet <christophe.gisquet at gmail.com>
> Date: Wed, 10 Apr 2013 00:42:38 +0200
> Subject: [PATCH 2/2] x86: sbrdsp: implement SSE2 hf_apply_noise
> 
> 233 to 107 cycles on Arrandale and Win64.
> Replacing the multiplication by s_m[m] by a pand and a pxor with
> appropriate vectors is slower. Unrolling is a 15-cycle win.
> An SSE version was 4 cycles slower.
> ---
>  libavcodec/aacsbrdata.h      |   6 ++-
>  libavcodec/x86/sbrdsp.asm    | 108 +++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/sbrdsp_init.c |  16 +++++++
>  3 files changed, 129 insertions(+), 1 deletion(-)
> 
> diff --git a/libavcodec/aacsbrdata.h b/libavcodec/aacsbrdata.h
> index dd7a827..12575ee 100644
> --- a/libavcodec/aacsbrdata.h
> +++ b/libavcodec/aacsbrdata.h
> @@ -352,7 +352,7 @@ static DECLARE_ALIGNED(32, float, sbr_qmf_window_us)[640] = {
>       0.8537385600,
>  };
>  
> -/* First two entries repeated at end to simplify SIMD implementations. */
> +/* First eight entries repeated at end to simplify SIMD implementations. */
>  const DECLARE_ALIGNED(16, float, ff_sbr_noise_table)[][2] = {
>  {-0.99948153278296, -0.59483417516607}, { 0.97113454393991, -0.67528515225647},
>  { 0.14130051758487, -0.95090983575689}, {-0.47005496701697, -0.37340549728647},
> @@ -610,7 +610,11 @@ const DECLARE_ALIGNED(16, float, ff_sbr_noise_table)[][2] = {
>  {-0.93412041758744,  0.41374052024363}, { 0.96063943315511,  0.93116709541280},
>  { 0.97534253457837,  0.86150930812689}, { 0.99642466504163,  0.70190043427512},
>  {-0.94705089665984, -0.29580042814306}, { 0.91599807087376, -0.98147830385781},
> +// Start of duplicated table
>  {-0.99948153278296, -0.59483417516607}, { 0.97113454393991, -0.67528515225647},
> +{ 0.14130051758487, -0.95090983575689}, {-0.47005496701697, -0.37340549728647},
> +{ 0.80705063769351,  0.29653668284408}, {-0.38981478896926,  0.89572605717087},
> +{-0.01053049862020, -0.66959058036166}, {-0.91266367957293, -0.11522938140034},
>  };
>  
>  #endif /* AVCODEC_AACSBRDATA_H */
> diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
> index 099382a..e0c2088 100644
> --- a/libavcodec/x86/sbrdsp.asm
> +++ b/libavcodec/x86/sbrdsp.asm
> @@ -26,6 +26,12 @@ SECTION_RODATA
>  ps_mask         times 2 dd 1<<31, 0
>  ps_mask2        times 2 dd 0, 1<<31
>  ps_neg          times 4 dd 1<<31
> +ps_noise0       times 2 dd  1.0,  0.0,
> +ps_noise2       times 2 dd -1.0,  0.0
> +ps_noise13      dd  0.0,  1.0, 0.0, -1.0
> +                dd  0.0, -1.0, 0.0,  1.0
> +                dd  0.0,  1.0, 0.0, -1.0
> +cextern         sbr_noise_table
>  
>  SECTION_TEXT
>  
> @@ -334,3 +340,105 @@ cglobal sbr_qmf_deint_neg, 2,3,3,v,src,vrev
>      cmp        vq, vrevq
>      jl      .loop
>      REP_RET
> +
> +%if WIN64
> +%define NREGS 0
> +%elifdef PIC
> +%define NREGS 1
> +%else
> +%define NREGS 0
> +%endif
> +
> +%macro LOAD_NST  1
> +%if NREGS
> +    lea       r5q, [%1]
> +    mova       m0, [kxq + r5q]
> +%else
> +    mova       m0, [kxq + %1]
> +%endif
> +%endmacro
> +
> +INIT_XMM sse2
> +; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
> +;                      const float *q_filt, int noise,
> +;                      int kx, int m_max)
> +cglobal sbr_hf_apply_noise_0, 5,5+NREGS,8, Y,s_m,q_filt,noise,kx,m_max
> +    mova       m0, [ps_noise0]
> +    jmp apply_noise_main
> +
> +; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
> +;                      const float *q_filt, int noise,
> +;                      int kx, int m_max)
> +cglobal sbr_hf_apply_noise_1, 5,5+NREGS,8, Y,s_m,q_filt,noise,kx,m_max
> +    and       kxq, 1
> +    shl       kxq, 4
> +    LOAD_NST  ps_noise13
> +    jmp apply_noise_main
> +
> +; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
> +;                      const float *q_filt, int noise,
> +;                      int kx, int m_max)
> +cglobal sbr_hf_apply_noise_2, 5,5+NREGS,8, Y,s_m,q_filt,noise,kx,m_max
> +    mova       m0, [ps_noise2]
> +    jmp apply_noise_main
> +
> +; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
> +;                      const float *q_filt, int noise,
> +;                      int kx, int m_max)
> +cglobal sbr_hf_apply_noise_3, 5,5+NREGS,8, Y,s_m,q_filt,noise,kx,m_max
> +    and       kxq, 1
> +    shl       kxq, 4
> +    LOAD_NST  ps_noise13+16
> +
> +apply_noise_main:
> +%if ARCH_X86_64 == 0 || WIN64
> +    mov       kxd, m_maxm
> +%define count kxq
> +%else
> +%define count m_maxq
> +%endif
> +    dec    noiseq
> +    shl    count, 2
> +%if NREGS
> +    lea       r5q, [sbr_noise_table]

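count and r5q end up being the same register here on x86_64 Linux shared
(PIC) builds: count is m_maxq, the sixth argument, i.e. r5, so this lea
overwrites the loop count that was just computed.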

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

While the State exists there can be no freedom; when there is freedom there
will be no State. -- Vladimir Lenin
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130413/089247ce/attachment.asc>


More information about the ffmpeg-devel mailing list