[FFmpeg-devel] [WIP] [PATCH 4/4] x86: dsputilenc: convert hf_noise*_mmx to yasm
Timothy Gu
timothygu99 at gmail.com
Fri May 30 05:56:04 CEST 2014
Signed-off-by: Timothy Gu <timothygu99 at gmail.com>
---
hf_noise16 segfaults with make fate-vsynth1-mpeg4-nsse, but I don't know
hot to fix it. Also, did I get the `call` right?
---
libavcodec/x86/dsputilenc.asm | 91 ++++++++++++++
libavcodec/x86/dsputilenc_mmx.c | 265 ++--------------------------------------
2 files changed, 101 insertions(+), 255 deletions(-)
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index 7064024..09bbe95 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -608,3 +608,94 @@ INIT_XMM sse2
SUM_ABS_DCTELEM 7, 2
INIT_XMM ssse3
SUM_ABS_DCTELEM 6, 2
+
+;------------------------------------------------------------------------------
+; int ff_hf_noise*_mmx(uint8_t *pix1, int lsize, int h)
+;------------------------------------------------------------------------------
+; %1 = 8/16. %2-5=m#
+%macro HF_NOISE_PART1 5
+ mova m%2, [pix1q]
+%if %1 == 8
+ mova m%3, m%2
+ psllq m%2, 8
+ psrlq m%3, 8
+ psrlq m%2, 8
+%else
+ mova m%3, [pix1q+1]
+%endif
+ mova m%4, m%2
+ mova m%5, m%3
+ punpcklbw m%2, m7
+ punpcklbw m%3, m7
+ punpckhbw m%4, m7
+ punpckhbw m%5, m7
+ psubw m%2, m%3
+ psubw m%4, m%5
+%endmacro
+
+; %1-2 = m#
+%macro HF_NOISE_PART2 2
+ psubw m0, m4
+ psubw m2, m5
+ pxor m3, m3
+ pxor m1, m1
+ pcmpgtw m3, m%1
+ pcmpgtw m1, m%2
+ pxor m%1, m3
+ pxor m%2, m1
+ psubw m%1, m3
+ psubw m%2, m1
+ paddw m%2, m%1
+ paddw m6, m%2
+%endmacro
+
+; %1 = 8/16
+%macro HF_NOISE 1
+cglobal hf_noise%1, 3,3,0, pix1, lsize, h
+ movsxdifnidn lsizeq, lsized
+%if %1 == 16
+ push pix1q
+ push hq
+%endif
+ sub hd, 2
+ pxor m7, m7
+ pxor m6, m6
+ HF_NOISE_PART1 %1, 0, 1, 2, 3
+ add pix1q, lsizeq
+ HF_NOISE_PART1 %1, 4, 1, 5, 3
+ HF_NOISE_PART2 0, 2
+ add pix1q, lsizeq
+.loop:
+ HF_NOISE_PART1 %1, 0, 1, 2, 3
+ HF_NOISE_PART2 4, 5
+ add pix1q, lsizeq
+ HF_NOISE_PART1 %1, 4, 1, 5, 3
+ HF_NOISE_PART2 0, 2
+ add pix1q, lsizeq
+ sub hd, 2
+ jne .loop
+
+ mova m0, m6
+ punpcklwd m0, m7
+ punpckhwd m6, m7
+ paddd m6, m0
+ mova m0, m6
+ psrlq m6, 32
+ paddd m0, m6
+%if %1 == 16
+ movd ebx, m0 ; ebx = result of hf_noise16;
+ pop hq ; restore h and pix1
+ pop pix1q
+ ; lsize is unchanged (except movsxd, which hf_noise8 is going to do anyway)
+ add pix1q, 8 ; pix1 = pix1 + 8;
+ call hf_noise8 ; eax = hf_noise8_mmx(pix1, lsize, h);
+ add eax, ebx ; eax = eax + ebx;
+%else
+ movd eax, m0 ; eax = result of hf_noise8;
+%endif
+ RET ; return eax;
+%endmacro
+
+INIT_MMX mmx
+HF_NOISE 8
+HF_NOISE 16
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index 13128d2..f215347 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -52,6 +52,8 @@ int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int line_size, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int line_size, int h);
+int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h);
+int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h);
#define hadamard_func(cpu) \
int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
@@ -64,255 +66,8 @@ hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)
-#if HAVE_INLINE_ASM
-
#if HAVE_YASM
-static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h)
-{
- int tmp;
-
- __asm__ volatile (
- "movl %3, %%ecx\n"
- "pxor %%mm7, %%mm7\n"
- "pxor %%mm6, %%mm6\n"
-
- "movq (%0), %%mm0\n"
- "movq %%mm0, %%mm1\n"
- "psllq $8, %%mm0\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm0\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm0\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm2\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
-
- "add %2, %0\n"
-
- "movq (%0), %%mm4\n"
- "movq %%mm4, %%mm1\n"
- "psllq $8, %%mm4\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm4\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm4\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm5\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2, %0\n"
- "1:\n"
-
- "movq (%0), %%mm0\n"
- "movq %%mm0, %%mm1\n"
- "psllq $8, %%mm0\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm0\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm0\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm2\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
- "psubw %%mm0, %%mm4\n"
- "psubw %%mm2, %%mm5\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm4, %%mm3\n\t"
- "pcmpgtw %%mm5, %%mm1\n\t"
- "pxor %%mm3, %%mm4\n"
- "pxor %%mm1, %%mm5\n"
- "psubw %%mm3, %%mm4\n"
- "psubw %%mm1, %%mm5\n"
- "paddw %%mm4, %%mm5\n"
- "paddw %%mm5, %%mm6\n"
-
- "add %2, %0\n"
-
- "movq (%0), %%mm4\n"
- "movq %%mm4, %%mm1\n"
- "psllq $8, %%mm4\n"
- "psrlq $8, %%mm1\n"
- "psrlq $8, %%mm4\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm4\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm5\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2, %0\n"
- "subl $2, %%ecx\n"
- " jnz 1b\n"
-
- "movq %%mm6, %%mm0\n"
- "punpcklwd %%mm7, %%mm0\n"
- "punpckhwd %%mm7, %%mm6\n"
- "paddd %%mm0, %%mm6\n"
-
- "movq %%mm6, %%mm0\n"
- "psrlq $32, %%mm6\n"
- "paddd %%mm6, %%mm0\n"
- "movd %%mm0, %1\n"
- : "+r" (pix1), "=r" (tmp)
- : "r" ((x86_reg) line_size), "g" (h - 2)
- : "%ecx");
-
- return tmp;
-}
-
-static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h)
-{
- int tmp;
- uint8_t *pix = pix1;
-
- __asm__ volatile (
- "movl %3, %%ecx\n"
- "pxor %%mm7, %%mm7\n"
- "pxor %%mm6, %%mm6\n"
-
- "movq (%0), %%mm0\n"
- "movq 1(%0), %%mm1\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm0\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm2\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
-
- "add %2, %0\n"
-
- "movq (%0), %%mm4\n"
- "movq 1(%0), %%mm1\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm4\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm5\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2, %0\n"
- "1:\n"
-
- "movq (%0), %%mm0\n"
- "movq 1(%0), %%mm1\n"
- "movq %%mm0, %%mm2\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm0\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm2\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm0\n"
- "psubw %%mm3, %%mm2\n"
- "psubw %%mm0, %%mm4\n"
- "psubw %%mm2, %%mm5\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm4, %%mm3\n\t"
- "pcmpgtw %%mm5, %%mm1\n\t"
- "pxor %%mm3, %%mm4\n"
- "pxor %%mm1, %%mm5\n"
- "psubw %%mm3, %%mm4\n"
- "psubw %%mm1, %%mm5\n"
- "paddw %%mm4, %%mm5\n"
- "paddw %%mm5, %%mm6\n"
-
- "add %2, %0\n"
-
- "movq (%0), %%mm4\n"
- "movq 1(%0), %%mm1\n"
- "movq %%mm4, %%mm5\n"
- "movq %%mm1, %%mm3\n"
- "punpcklbw %%mm7, %%mm4\n"
- "punpcklbw %%mm7, %%mm1\n"
- "punpckhbw %%mm7, %%mm5\n"
- "punpckhbw %%mm7, %%mm3\n"
- "psubw %%mm1, %%mm4\n"
- "psubw %%mm3, %%mm5\n"
- "psubw %%mm4, %%mm0\n"
- "psubw %%mm5, %%mm2\n"
- "pxor %%mm3, %%mm3\n"
- "pxor %%mm1, %%mm1\n"
- "pcmpgtw %%mm0, %%mm3\n\t"
- "pcmpgtw %%mm2, %%mm1\n\t"
- "pxor %%mm3, %%mm0\n"
- "pxor %%mm1, %%mm2\n"
- "psubw %%mm3, %%mm0\n"
- "psubw %%mm1, %%mm2\n"
- "paddw %%mm0, %%mm2\n"
- "paddw %%mm2, %%mm6\n"
-
- "add %2, %0\n"
- "subl $2, %%ecx\n"
- " jnz 1b\n"
-
- "movq %%mm6, %%mm0\n"
- "punpcklwd %%mm7, %%mm0\n"
- "punpckhwd %%mm7, %%mm6\n"
- "paddd %%mm0, %%mm6\n"
-
- "movq %%mm6, %%mm0\n"
- "psrlq $32, %%mm6\n"
- "paddd %%mm6, %%mm0\n"
- "movd %%mm0, %1\n"
- : "+r" (pix1), "=r" (tmp)
- : "r" ((x86_reg) line_size), "g" (h - 2)
- : "%ecx");
-
- return tmp + hf_noise8_mmx(pix + 8, line_size, h);
-}
-
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
int line_size, int h)
{
@@ -322,8 +77,8 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
else
score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h);
- score2 = hf_noise16_mmx(pix1, line_size, h) -
- hf_noise16_mmx(pix2, line_size, h);
+ score2 = ff_hf_noise16_mmx(pix1, line_size, h) -
+ ff_hf_noise16_mmx(pix2, line_size, h);
if (c)
return score1 + FFABS(score2) * c->avctx->nsse_weight;
@@ -335,8 +90,8 @@ static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
int line_size, int h)
{
int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h);
- int score2 = hf_noise8_mmx(pix1, line_size, h) -
- hf_noise8_mmx(pix2, line_size, h);
+ int score2 = ff_hf_noise8_mmx(pix1, line_size, h) -
+ ff_hf_noise8_mmx(pix2, line_size, h);
if (c)
return score1 + FFABS(score2) * c->avctx->nsse_weight;
@@ -346,6 +101,8 @@ static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
#endif /* HAVE_YASM */
+#if HAVE_INLINE_ASM
+
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
int line_size, int h)
{
@@ -689,10 +446,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
c->vsad[4] = vsad_intra16_mmx;
-#if HAVE_YASM
- c->nsse[0] = nsse16_mmx;
- c->nsse[1] = nsse8_mmx;
-#endif /* HAVE_YASM */
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->vsad[0] = vsad16_mmx;
c->try_8x8basis = try_8x8basis_mmx;
@@ -741,6 +494,8 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
c->sse[0] = ff_sse16_mmx;
c->sse[1] = ff_sse8_mmx;
+ c->nsse[0] = nsse16_mmx;
+ c->nsse[1] = nsse8_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
--
1.9.1
More information about the ffmpeg-devel
mailing list