[FFmpeg-cvslog] x86/swr: convert resample_{common, linear}_double_sse2 to yasm
James Almer
git at videolan.org
Tue Jul 1 18:31:38 CEST 2014
ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Mon Jun 30 13:06:00 2014 -0300| [dd2c9034b174a2b17f8e3ed972c49720bab1d4c1] | committer: Michael Niedermayer
x86/swr: convert resample_{common, linear}_double_sse2 to yasm
Signed-off-by: James Almer <jamrial at gmail.com>
312531 -> 311528 decicycles
Signed-off-by: Michael Niedermayer <michaelni at gmx.at>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=dd2c9034b174a2b17f8e3ed972c49720bab1d4c1
---
libswresample/resample_template.c | 22 +-----
libswresample/x86/resample.asm | 122 ++++++++++++++++++----------------
libswresample/x86/resample_mmx.h | 72 --------------------
libswresample/x86/resample_x86_dsp.c | 27 +++-----
4 files changed, 74 insertions(+), 169 deletions(-)
diff --git a/libswresample/resample_template.c b/libswresample/resample_template.c
index 2a64f50..4f1638e 100644
--- a/libswresample/resample_template.c
+++ b/libswresample/resample_template.c
@@ -25,23 +25,15 @@
* @author Michael Niedermayer <michaelni at gmx.at>
*/
-#if defined(TEMPLATE_RESAMPLE_DBL) \
- || defined(TEMPLATE_RESAMPLE_DBL_SSE2)
+#if defined(TEMPLATE_RESAMPLE_DBL)
+# define RENAME(N) N ## _double
# define FILTER_SHIFT 0
# define DELEM double
# define FELEM double
# define FELEM2 double
# define OUT(d, v) d = v
-# if defined(TEMPLATE_RESAMPLE_DBL)
-# define RENAME(N) N ## _double
-# elif defined(TEMPLATE_RESAMPLE_DBL_SSE2)
-# define COMMON_CORE COMMON_CORE_DBL_SSE2
-# define LINEAR_CORE LINEAR_CORE_DBL_SSE2
-# define RENAME(N) N ## _double_sse2
-# endif
-
#elif defined(TEMPLATE_RESAMPLE_FLT)
# define RENAME(N) N ## _float
@@ -104,16 +96,12 @@ int RENAME(swri_resample_common)(ResampleContext *c,
for (dst_index = 0; dst_index < n; dst_index++) {
FELEM *filter = ((FELEM *) c->filter_bank) + c->filter_alloc * index;
-#ifdef COMMON_CORE
- COMMON_CORE
-#else
FELEM2 val=0;
int i;
for (i = 0; i < c->filter_length; i++) {
val += src[sample_index + i] * (FELEM2)filter[i];
}
OUT(dst[dst_index], val);
-#endif
frac += c->dst_incr_mod;
index += c->dst_incr_div;
@@ -150,15 +138,11 @@ int RENAME(swri_resample_linear)(ResampleContext *c,
FELEM *filter = ((FELEM *) c->filter_bank) + c->filter_alloc * index;
FELEM2 val=0, v2 = 0;
-#ifdef LINEAR_CORE
- LINEAR_CORE
-#else
int i;
for (i = 0; i < c->filter_length; i++) {
val += src[sample_index + i] * (FELEM2)filter[i];
v2 += src[sample_index + i] * (FELEM2)filter[i + c->filter_alloc];
}
-#endif
#ifdef FELEML
val += (v2 - val) * (FELEML) frac / c->src_incr;
#else
@@ -188,8 +172,6 @@ int RENAME(swri_resample_linear)(ResampleContext *c,
return sample_index;
}
-#undef COMMON_CORE
-#undef LINEAR_CORE
#undef RENAME
#undef FILTER_SHIFT
#undef DELEM
diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm
index 2fe03c8..bce1389 100644
--- a/libswresample/x86/resample.asm
+++ b/libswresample/x86/resample.asm
@@ -50,11 +50,12 @@ endstruc
SECTION_RODATA
pf_1: dd 1.0
+pdbl_1: dq 1.0
pd_0x4000: dd 0x4000
SECTION .text
-%macro RESAMPLE_FNS 3 ; format [float or int16], bps, log2_bps
+%macro RESAMPLE_FNS 3-5 ; format [float or int16], bps, log2_bps, float op suffix [s or d], 1.0 constant
; int resample_common_$format(ResampleContext *ctx, $format *dst,
; const $format *src, int size, int update_ctx)
%if ARCH_X86_64 ; unix64 and win64
@@ -165,21 +166,21 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
lea filterq, [min_filter_count_x4q+filterq*%2]
mov min_filter_count_x4q, min_filter_length_x4q
%endif
-%ifidn %1, float
- xorps m0, m0, m0
-%else ; int16
+%ifidn %1, int16
movd m0, [pd_0x4000]
+%else ; float/double
+ xorps m0, m0, m0
%endif
align 16
.inner_loop:
movu m1, [srcq+min_filter_count_x4q*1]
-%ifidn %1, float
- mulps m1, m1, [filterq+min_filter_count_x4q*1]
- addps m0, m0, m1
-%else ; int16
+%ifidn %1, int16
pmaddwd m1, [filterq+min_filter_count_x4q*1]
paddd m0, m1
+%else ; float/double
+ mulp%4 m1, m1, [filterq+min_filter_count_x4q*1]
+ addp%4 m0, m0, m1
%endif
add min_filter_count_x4q, mmsize
js .inner_loop
@@ -189,16 +190,7 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
addps xm0, xm1
%endif
- ; horizontal sum & store
-%ifidn %1, float
- movhlps xm1, xm0
- addps xm0, xm1
- shufps xm1, xm0, xm0, q0001
- add fracd, dst_incr_modd
- addps xm0, xm1
- add indexd, dst_incr_divd
- movss [dstq], xm0
-%else ; int16
+%ifidn %1, int16
%if mmsize == 16
pshufd m1, m0, q0032
paddd m0, m1
@@ -212,6 +204,17 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
packssdw m0, m0
add indexd, dst_incr_divd
movd [dstq], m0
+%else ; float/double
+ ; horizontal sum & store
+ movhlps xm1, xm0
+%ifidn %1, float
+ addps xm0, xm1
+ shufps xm1, xm0, xm0, q0001
+%endif
+ add fracd, dst_incr_modd
+ addp%4 xm0, xm1
+ add indexd, dst_incr_divd
+ movs%4 [dstq], xm0
%endif
cmp fracd, src_incrd
jl .skip
@@ -307,12 +310,12 @@ cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_shift, index,
mov ctx_stackq, ctxq
mov phase_mask_stackd, phase_maskd
mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
-%ifidn %1, float
- cvtsi2ss xm0, src_incrd
- movss xm4, [pf_1]
- divss xm4, xm0
-%else ; int16
+%ifidn %1, int16
movd m4, [pd_0x4000]
+%else ; float/double
+ cvtsi2s%4 xm0, src_incrd
+ movs%4 xm4, [%5]
+ divs%4 xm4, xm0
%endif
mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
shl min_filter_len_x4d, %3
@@ -360,12 +363,12 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
mov r3, dword [ctxq+ResampleContext.src_incr]
PUSH dword [ctxq+ResampleContext.phase_mask]
PUSH r3d
-%ifidn %1, float
- cvtsi2ss xm0, r3d
- movss xm4, [pf_1]
- divss xm4, xm0
-%else ; int16
+%ifidn %1, int16
movd m4, [pd_0x4000]
+%else ; float/double
+ cvtsi2s%4 xm0, r3d
+ movs%4 xm4, [%5]
+ divs%4 xm4, xm0
%endif
mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
mov indexd, [ctxq+ResampleContext.index]
@@ -409,27 +412,27 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
mov filter2q, filter1q
add filter2q, filter_alloc_x4q
%endif
-%ifidn %1, float
- xorps m0, m0, m0
- xorps m2, m2, m2
-%else ; int16
+%ifidn %1, int16
mova m0, m4
mova m2, m4
+%else ; float/double
+ xorps m0, m0, m0
+ xorps m2, m2, m2
%endif
align 16
.inner_loop:
movu m1, [srcq+min_filter_count_x4q*1]
-%ifidn %1, float
- mulps m3, m1, [filter2q+min_filter_count_x4q*1]
- mulps m1, m1, [filter1q+min_filter_count_x4q*1]
- addps m2, m2, m3
- addps m0, m0, m1
-%else ; int16
+%ifidn %1, int16
pmaddwd m3, m1, [filter2q+min_filter_count_x4q*1]
pmaddwd m1, [filter1q+min_filter_count_x4q*1]
paddd m2, m3
paddd m0, m1
+%else ; float/double
+ mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1]
+ mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1]
+ addp%4 m2, m2, m3
+ addp%4 m0, m0, m1
%endif
add min_filter_count_x4q, mmsize
js .inner_loop
@@ -441,24 +444,7 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
addps xm2, xm3
%endif
-%ifidn %1, float
- ; val += (v2 - val) * (FELEML) frac / c->src_incr;
- cvtsi2ss xm1, fracd
- subps xm2, xm0
- mulps xm1, xm4
- shufps xm1, xm1, q0000
- mulps xm2, xm1
- addps xm0, xm2
-
- ; horizontal sum & store
- movhlps xm1, xm0
- addps xm0, xm1
- shufps xm1, xm0, xm0, q0001
- add fracd, dst_incr_modd
- addps xm0, xm1
- add indexd, dst_incr_divd
- movss [dstq], xm0
-%else ; int16
+%ifidn %1, int16
%if mmsize == 16
pshufd m3, m2, q0032
pshufd m1, m0, q0032
@@ -491,6 +477,25 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
; - 32bit: eax=r0[filter1], edx=r2[filter2]
; - win64: eax=r6[filter1], edx=r1[todo]
; - unix64: eax=r6[filter1], edx=r2[todo]
+%else ; float/double
+ ; val += (v2 - val) * (FELEML) frac / c->src_incr;
+ cvtsi2s%4 xm1, fracd
+ subp%4 xm2, xm0
+ mulp%4 xm1, xm4
+ shufp%4 xm1, xm1, q0000
+ mulp%4 xm2, xm1
+ addp%4 xm0, xm2
+
+ ; horizontal sum & store
+ movhlps xm1, xm0
+%ifidn %1, float
+ addps xm0, xm1
+ shufps xm1, xm0, xm0, q0001
+%endif
+ add fracd, dst_incr_modd
+ addp%4 xm0, xm1
+ add indexd, dst_incr_divd
+ movs%4 [dstq], xm0
%endif
cmp fracd, src_incrd
jl .skip
@@ -553,11 +558,11 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
%endmacro
INIT_XMM sse
-RESAMPLE_FNS float, 4, 2
+RESAMPLE_FNS float, 4, 2, s, pf_1
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
-RESAMPLE_FNS float, 4, 2
+RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if ARCH_X86_32
@@ -567,3 +572,4 @@ RESAMPLE_FNS int16, 2, 1
INIT_XMM sse2
RESAMPLE_FNS int16, 2, 1
+RESAMPLE_FNS double, 8, 3, d, pdbl_1
diff --git a/libswresample/x86/resample_mmx.h b/libswresample/x86/resample_mmx.h
deleted file mode 100644
index b0ea496..0000000
--- a/libswresample/x86/resample_mmx.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2012 Michael Niedermayer <michaelni at gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/x86/asm.h"
-#include "libavutil/cpu.h"
-#include "libswresample/swresample_internal.h"
-
-#define COMMON_CORE_DBL_SSE2 \
- x86_reg len= -8*c->filter_length;\
-__asm__ volatile(\
- "xorpd %%xmm0, %%xmm0 \n\t"\
- "1: \n\t"\
- "movupd (%1, %0), %%xmm1 \n\t"\
- "mulpd (%2, %0), %%xmm1 \n\t"\
- "addpd %%xmm1, %%xmm0 \n\t"\
- "add $16, %0 \n\t"\
- " js 1b \n\t"\
- "movhlps %%xmm0, %%xmm1 \n\t"\
- "addpd %%xmm1, %%xmm0 \n\t"\
- "movsd %%xmm0, (%3) \n\t"\
- : "+r" (len)\
- : "r" (((uint8_t*)(src+sample_index))-len),\
- "r" (((uint8_t*)filter)-len),\
- "r" (dst+dst_index)\
- XMM_CLOBBERS_ONLY("%xmm0", "%xmm1")\
-);
-
-#define LINEAR_CORE_DBL_SSE2 \
- x86_reg len= -8*c->filter_length;\
-__asm__ volatile(\
- "xorpd %%xmm0, %%xmm0 \n\t"\
- "xorpd %%xmm2, %%xmm2 \n\t"\
- "1: \n\t"\
- "movupd (%3, %0), %%xmm1 \n\t"\
- "movapd %%xmm1, %%xmm3 \n\t"\
- "mulpd (%4, %0), %%xmm1 \n\t"\
- "mulpd (%5, %0), %%xmm3 \n\t"\
- "addpd %%xmm1, %%xmm0 \n\t"\
- "addpd %%xmm3, %%xmm2 \n\t"\
- "add $16, %0 \n\t"\
- " js 1b \n\t"\
- "movhlps %%xmm0, %%xmm1 \n\t"\
- "movhlps %%xmm2, %%xmm3 \n\t"\
- "addpd %%xmm1, %%xmm0 \n\t"\
- "addpd %%xmm3, %%xmm2 \n\t"\
- "movsd %%xmm0, %1 \n\t"\
- "movsd %%xmm2, %2 \n\t"\
- : "+r" (len),\
- "=m" (val),\
- "=m" (v2)\
- : "r" (((uint8_t*)(src+sample_index))-len),\
- "r" (((uint8_t*)filter)-len),\
- "r" (((uint8_t*)(filter+c->filter_alloc))-len)\
- XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
-);
diff --git a/libswresample/x86/resample_x86_dsp.c b/libswresample/x86/resample_x86_dsp.c
index 5130ecd..9049da6 100644
--- a/libswresample/x86/resample_x86_dsp.c
+++ b/libswresample/x86/resample_x86_dsp.c
@@ -27,21 +27,6 @@
#include "libswresample/resample.h"
-int swri_resample_common_double_sse2(ResampleContext *c, double *dst, const double *src, int n, int update_ctx);
-int swri_resample_linear_double_sse2(ResampleContext *c, double *dst, const double *src, int n, int update_ctx);
-
-#if HAVE_SSE2_INLINE
-#define DO_RESAMPLE_ONE 0
-
-#include "resample_mmx.h"
-
-#define TEMPLATE_RESAMPLE_DBL_SSE2
-#include "libswresample/resample_template.c"
-#undef TEMPLATE_RESAMPLE_DBL_SSE2
-#endif
-
-#undef DO_RESAMPLE_ONE
-
int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst,
@@ -62,6 +47,11 @@ int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst,
int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst,
const uint8_t *src, int sz, int upd);
+int ff_resample_common_double_sse2(ResampleContext *c, uint8_t *dst,
+ const uint8_t *src, int sz, int upd);
+int ff_resample_linear_double_sse2(ResampleContext *c, uint8_t *dst,
+ const uint8_t *src, int sz, int upd);
+
void swresample_dsp_x86_init(ResampleContext *c)
{
int av_unused mm_flags = av_get_cpu_flags();
@@ -78,10 +68,9 @@ void swresample_dsp_x86_init(ResampleContext *c)
if (HAVE_SSE2_EXTERNAL && mm_flags & AV_CPU_FLAG_SSE2) {
c->dsp.resample_common[FNIDX(S16P)] = ff_resample_common_int16_sse2;
c->dsp.resample_linear[FNIDX(S16P)] = ff_resample_linear_int16_sse2;
- }
- if (HAVE_SSE2_INLINE && mm_flags & AV_CPU_FLAG_SSE2) {
- c->dsp.resample_common[FNIDX(DBLP)] = (resample_fn) swri_resample_common_double_sse2;
- c->dsp.resample_linear[FNIDX(DBLP)] = (resample_fn) swri_resample_linear_double_sse2;
+
+ c->dsp.resample_common[FNIDX(DBLP)] = ff_resample_common_double_sse2;
+ c->dsp.resample_linear[FNIDX(DBLP)] = ff_resample_linear_double_sse2;
}
if (HAVE_AVX_EXTERNAL && mm_flags & AV_CPU_FLAG_AVX) {
c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;
More information about the ffmpeg-cvslog mailing list