[FFmpeg-devel] [PATCH] h264: integrate clear_blocks calls with IDCT.
Ronald S. Bultje
rsbultje at gmail.com
Fri Feb 8 07:20:52 CET 2013
From: "Ronald S. Bultje" <rsbultje at gmail.com>
In the no-transform case, integrate it with put_pixels4/8(); in the
intra PCM case, do an explicit memset(0) call. Together, this makes
the H264 decoder almost independent of dsputil.
(PPC and ARM assembly not yet ported.)
---
libavcodec/h264.c | 3 +-
libavcodec/h264_mb_template.c | 12 ++---
libavcodec/h264idct_template.c | 16 ++++--
libavcodec/h264pred.h | 8 +--
libavcodec/h264pred_template.c | 28 ++++++----
libavcodec/svq3.c | 2 +
libavcodec/x86/h264_idct.asm | 108 ++++++++++++++++++++++++++++---------
libavcodec/x86/h264_idct_10bit.asm | 53 ++++++++++++++++--
8 files changed, 172 insertions(+), 58 deletions(-)
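Note for reviewers (not part of the commit message): the recurring pattern in
this patch is that each idct*_add function now zeroes the coefficients it has
consumed, so callers no longer need a separate dsp.clear_blocks() pass over
h->mb. A minimal C sketch of that contract, based on the DC-only 4x4 case; the
names idct_dc_add_and_clear and clip_uint8 are illustrative only, not the
actual template code below:

    #include <stdint.h>

    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* Add the reconstructed DC residual to dst, then clear the coefficient
     * so the block buffer is already zero for the next macroblock. */
    static void idct_dc_add_and_clear(uint8_t *dst, int16_t *block, int stride)
    {
        int i, j, dc = (block[0] + 32) >> 6;
        block[0] = 0; /* previously done by a separate dsp.clear_blocks() */
        for (j = 0; j < 4; j++)
            for (i = 0; i < 4; i++)
                dst[i + j * stride] = clip_uint8(dst[i + j * stride] + dc);
    }

The x86 versions in the diff do the same thing by storing zeros (mov word
[r1], 0, or movq/mova of a zeroed register) right after loading the
coefficients.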
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index cfcb552..def1a36 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -1270,8 +1270,7 @@ static int decode_update_thread_context(AVCodecContext *dst,
h->thread_context[0] = h;
- s->dsp.clear_blocks(h->mb);
- s->dsp.clear_blocks(h->mb + (24 * 16 << h->pixel_shift));
+ memset(h->mb, 0, 24 * 16 * 4 << h->pixel_shift);
}
/* frame_start may not be called for the next thread (if it's decoding
diff --git a/libavcodec/h264_mb_template.c b/libavcodec/h264_mb_template.c
index a841555..4fe2030 100644
--- a/libavcodec/h264_mb_template.c
+++ b/libavcodec/h264_mb_template.c
@@ -133,6 +133,7 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
}
}
}
+ memset(h->mb, 0, ff_h264_mb_sizes[h->sps.chroma_format_idc] * 2);
} else {
for (i = 0; i < 16; i++)
memcpy(dest_y + i * linesize, (uint8_t *)h->mb + i * 16, 16);
@@ -151,6 +152,7 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
}
}
}
+ memset(h->mb, 0, ff_h264_mb_sizes[h->sps.chroma_format_idc]);
}
} else {
if (IS_INTRA(mb_type)) {
@@ -260,10 +262,6 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
}
}
}
- if (h->cbp || IS_INTRA(mb_type)) {
- s->dsp.clear_blocks(h->mb);
- s->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
- }
}
#if !SIMPLE || BITS == 8
@@ -335,11 +333,13 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h)
for (j = 0; j < 16; j++)
tmp[j] = get_bits(&gb, bit_depth);
}
+ memset(h->mb, 0, 768 * 2);
} else {
for (p = 0; p < plane_count; p++)
for (i = 0; i < 16; i++)
memcpy(dest[p] + i * linesize,
(uint8_t *)h->mb + p * 256 + i * 16, 16);
+ memset(h->mb, 0, 768);
}
} else {
if (IS_INTRA(mb_type)) {
@@ -368,10 +368,6 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h)
PIXEL_SHIFT, block_offset, linesize,
dest[p], p);
}
- if (h->cbp || IS_INTRA(mb_type)) {
- s->dsp.clear_blocks(h->mb);
- s->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
- }
}
#endif
diff --git a/libavcodec/h264idct_template.c b/libavcodec/h264idct_template.c
index 4a029a0..702dbc9 100644
--- a/libavcodec/h264idct_template.c
+++ b/libavcodec/h264idct_template.c
@@ -79,6 +79,8 @@ void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
dst[i + 2*stride]= av_clip_pixel(dst[i + 2*stride] + ((z1 - z2) >> 6));
dst[i + 3*stride]= av_clip_pixel(dst[i + 3*stride] + ((z0 - z3) >> 6));
}
+
+ memset(block, 0, 16 * sizeof(dctcoef));
}
void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){
@@ -151,14 +153,18 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){
dst[i + 6*stride] = av_clip_pixel( dst[i + 6*stride] + ((b2 - b5) >> 6) );
dst[i + 7*stride] = av_clip_pixel( dst[i + 7*stride] + ((b0 - b7) >> 6) );
}
+
+ memset(block, 0, 64 * sizeof(dctcoef));
}
// assumes all AC coefs are 0
-void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *block, int stride){
+void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *_block, int stride){
int i, j;
- int dc = (((dctcoef*)block)[0] + 32) >> 6;
pixel *dst = (pixel*)_dst;
+ dctcoef *block = (dctcoef*)_block;
+ int dc = (block[0] + 32) >> 6;
stride >>= sizeof(pixel)-1;
+ block[0] = 0;
for( j = 0; j < 4; j++ )
{
for( i = 0; i < 4; i++ )
@@ -167,10 +173,12 @@ void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *block, int stride){
}
}
-void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *block, int stride){
+void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *_block, int stride){
int i, j;
- int dc = (((dctcoef*)block)[0] + 32) >> 6;
pixel *dst = (pixel*)_dst;
+ dctcoef *block = (dctcoef*)_block;
+ int dc = (block[0] + 32) >> 6;
+ block[0] = 0;
stride >>= sizeof(pixel)-1;
for( j = 0; j < 8; j++ )
{
diff --git a/libavcodec/h264pred.h b/libavcodec/h264pred.h
index 36b542b..ed67d2e 100644
--- a/libavcodec/h264pred.h
+++ b/libavcodec/h264pred.h
@@ -98,15 +98,15 @@ typedef struct H264PredContext {
void(*pred16x16[4 + 3 + 2])(uint8_t *src, ptrdiff_t stride);
void(*pred4x4_add[2])(uint8_t *pix /*align 4*/,
- const int16_t *block /*align 16*/, ptrdiff_t stride);
+ int16_t *block /*align 16*/, ptrdiff_t stride);
void(*pred8x8l_add[2])(uint8_t *pix /*align 8*/,
- const int16_t *block /*align 16*/, ptrdiff_t stride);
+ int16_t *block /*align 16*/, ptrdiff_t stride);
void(*pred8x8_add[3])(uint8_t *pix /*align 8*/,
const int *block_offset,
- const int16_t *block /*align 16*/, ptrdiff_t stride);
+ int16_t *block /*align 16*/, ptrdiff_t stride);
void(*pred16x16_add[3])(uint8_t *pix /*align 16*/,
const int *block_offset,
- const int16_t *block /*align 16*/, ptrdiff_t stride);
+ int16_t *block /*align 16*/, ptrdiff_t stride);
} H264PredContext;
void ff_h264_pred_init(H264PredContext *h, int codec_id,
diff --git a/libavcodec/h264pred_template.c b/libavcodec/h264pred_template.c
index e78f2d4..8d8d62e 100644
--- a/libavcodec/h264pred_template.c
+++ b/libavcodec/h264pred_template.c
@@ -1132,7 +1132,7 @@ static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
#undef PL
#undef SRC
-static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const int16_t *_block,
+static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block,
ptrdiff_t stride)
{
int i;
@@ -1149,9 +1149,11 @@ static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const int16_t *_block,
pix++;
block++;
}
+
+ memset(_block, 0, sizeof(dctcoef) * 16);
}
-static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const int16_t *_block,
+static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block,
ptrdiff_t stride)
{
int i;
@@ -1167,9 +1169,11 @@ static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const int16_t *_block,
pix+= stride;
block+= 4;
}
+
+ memset(_block, 0, sizeof(dctcoef) * 16);
}
-static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const int16_t *_block,
+static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block,
ptrdiff_t stride)
{
int i;
@@ -1190,9 +1194,11 @@ static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const int16_t *_block,
pix++;
block++;
}
+
+ memset(_block, 0, sizeof(dctcoef) * 64);
}
-static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const int16_t *_block,
+static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block,
ptrdiff_t stride)
{
int i;
@@ -1212,10 +1218,12 @@ static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const int16_t *_block,
pix+= stride;
block+= 8;
}
+
+ memset(_block, 0, sizeof(dctcoef) * 64);
}
static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset,
- const int16_t *block,
+ int16_t *block,
ptrdiff_t stride)
{
int i;
@@ -1225,7 +1233,7 @@ static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset,
static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix,
const int *block_offset,
- const int16_t *block,
+ int16_t *block,
ptrdiff_t stride)
{
int i;
@@ -1234,7 +1242,7 @@ static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix,
}
static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset,
- const int16_t *block, ptrdiff_t stride)
+ int16_t *block, ptrdiff_t stride)
{
int i;
for(i=0; i<4; i++)
@@ -1242,7 +1250,7 @@ static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset,
}
static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset,
- const int16_t *block, ptrdiff_t stride)
+ int16_t *block, ptrdiff_t stride)
{
int i;
for(i=0; i<4; i++)
@@ -1252,7 +1260,7 @@ static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset,
}
static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset,
- const int16_t *block,
+ int16_t *block,
ptrdiff_t stride)
{
int i;
@@ -1262,7 +1270,7 @@ static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset,
static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix,
const int *block_offset,
- const int16_t *block, ptrdiff_t stride)
+ int16_t *block, ptrdiff_t stride)
{
int i;
for(i=0; i<4; i++)
diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c
index b79e69b..1dc2a6f 100644
--- a/libavcodec/svq3.c
+++ b/libavcodec/svq3.c
@@ -210,6 +210,8 @@ void ff_svq3_add_idct_c(uint8_t *dst, int16_t *block,
dst[i + stride * 2] = av_clip_uint8(dst[i + stride * 2] + ((z1 - z2) * qmul + rr >> 20));
dst[i + stride * 3] = av_clip_uint8(dst[i + stride * 3] + ((z0 - z3) * qmul + rr >> 20));
}
+
+ memset(block, 0, 16 * sizeof(int16_t));
}
static inline int svq3_decode_block(GetBitContext *gb, int16_t *block,
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 8fef7b8..7b7eb05 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -70,6 +70,10 @@ SECTION .text
paddw m0, m6
IDCT4_1D w, 0, 1, 2, 3, 4, 5
pxor m7, m7
+ movq [%2+ 0], m7
+ movq [%2+ 8], m7
+ movq [%2+16], m7
+ movq [%2+24], m7
STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
lea %1, [%1+%3*2]
@@ -161,13 +165,31 @@ cglobal h264_idct_add_8, 3, 3, 0
%endmacro
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
-%macro IDCT8_ADD_MMX_END 3
+%macro IDCT8_ADD_MMX_END 3-4
IDCT8_1D_FULL %2
mova [%2 ], m5
mova [%2+16], m6
mova [%2+32], m7
pxor m7, m7
+%if %0 == 4
+ movq [%4+ 0], m7
+ movq [%4+ 8], m7
+ movq [%4+ 16], m7
+ movq [%4+ 24], m7
+ movq [%4+ 32], m7
+ movq [%4+ 40], m7
+ movq [%4+ 48], m7
+ movq [%4+ 56], m7
+ movq [%4+ 64], m7
+ movq [%4+ 72], m7
+ movq [%4+ 80], m7
+ movq [%4+ 88], m7
+ movq [%4+ 96], m7
+ movq [%4+104], m7
+ movq [%4+112], m7
+ movq [%4+120], m7
+%endif
STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
@@ -190,7 +212,7 @@ cglobal h264_idct8_add_8, 3, 4, 0
IDCT8_ADD_MMX_START r1 , rsp
IDCT8_ADD_MMX_START r1+8, rsp+64
lea r3, [r0+4]
- IDCT8_ADD_MMX_END r0 , rsp, r2
+ IDCT8_ADD_MMX_END r0 , rsp, r2, r1
IDCT8_ADD_MMX_END r3 , rsp+8, r2
ADD rsp, pad
@@ -233,6 +255,14 @@ cglobal h264_idct8_add_8, 3, 4, 0
SWAP 0, 8
SWAP 1, 9
%endif
+ mova [%2+ 0], m7
+ mova [%2+ 16], m7
+ mova [%2+ 32], m7
+ mova [%2+ 48], m7
+ mova [%2+ 64], m7
+ mova [%2+ 80], m7
+ mova [%2+ 96], m7
+ mova [%2+112], m7
lea %1, [%1+%3*4]
STORE_DIFF m4, m6, m7, [%1 ]
STORE_DIFF m5, m6, m7, [%1+%3 ]
@@ -246,19 +276,11 @@ cglobal h264_idct8_add_8, 3, 4, 10
IDCT8_ADD_SSE r0, r1, r2, r3
RET
-%macro DC_ADD_MMXEXT_INIT 2-3
-%if %0 == 2
- movsx %1, word [%1]
+%macro DC_ADD_MMXEXT_INIT 2
add %1, 32
sar %1, 6
movd m0, %1d
lea %1, [%2*3]
-%else
- add %3, 32
- sar %3, 6
- movd m0, %3d
- lea %3, [%2*3]
-%endif
pshufw m0, m0, 0
pxor m1, m1
psubw m1, m0
@@ -287,19 +309,44 @@ cglobal h264_idct8_add_8, 3, 4, 10
INIT_MMX mmxext
; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct_dc_add_8, 3, 3, 0
- DC_ADD_MMXEXT_INIT r1, r2
- DC_ADD_MMXEXT_OP movh, r0, r2, r1
+%if ARCH_X86_64
+cglobal h264_idct_dc_add_8, 3, 4, 0
+ movsx r3, word [r1]
+ mov word [r1], 0
+ DC_ADD_MMXEXT_INIT r3, r2
+ DC_ADD_MMXEXT_OP movh, r0, r2, r3
RET
; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
-cglobal h264_idct8_dc_add_8, 3, 3, 0
- DC_ADD_MMXEXT_INIT r1, r2
- DC_ADD_MMXEXT_OP mova, r0, r2, r1
+cglobal h264_idct8_dc_add_8, 3, 4, 0
+ movsx r3, word [r1]
+ mov word [r1], 0
+ DC_ADD_MMXEXT_INIT r3, r2
+ DC_ADD_MMXEXT_OP mova, r0, r2, r3
lea r0, [r0+r2*4]
- DC_ADD_MMXEXT_OP mova, r0, r2, r1
+ DC_ADD_MMXEXT_OP mova, r0, r2, r3
+ RET
+%else
+cglobal h264_idct_dc_add_8, 2, 3, 0
+ movsx r2, word [r1]
+ mov word [r1], 0
+ mov r1, r2m
+ DC_ADD_MMXEXT_INIT r2, r1
+ DC_ADD_MMXEXT_OP movh, r0, r1, r2
RET
+; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
+cglobal h264_idct8_dc_add_8, 2, 3, 0
+ movsx r2, word [r1]
+ mov word [r1], 0
+ mov r1, r2m
+ DC_ADD_MMXEXT_INIT r2, r1
+ DC_ADD_MMXEXT_OP mova, r0, r1, r2
+ lea r0, [r0+r1*4]
+ DC_ADD_MMXEXT_OP mova, r0, r1, r2
+ RET
+%endif
+
INIT_MMX mmx
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride, const uint8_t nnzc[6*8])
@@ -343,7 +390,7 @@ cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride,
add word [r2], 32
IDCT8_ADD_MMX_START r2 , rsp
IDCT8_ADD_MMX_START r2+8, rsp+64
- IDCT8_ADD_MMX_END r6 , rsp, r3
+ IDCT8_ADD_MMX_END r6 , rsp, r3, r2
mov r6d, dword [r1+r5*4]
lea r6, [r0+r6+4]
IDCT8_ADD_MMX_END r6 , rsp+8, r3
@@ -373,7 +420,8 @@ cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
movsx r6, word [r2]
test r6, r6
jz .no_dc
- DC_ADD_MMXEXT_INIT r2, r3, r6
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
@@ -450,7 +498,8 @@ cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, s
movsx r6, word [r2]
test r6, r6
jz .skipblock
- DC_ADD_MMXEXT_INIT r2, r3, r6
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
@@ -489,7 +538,8 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
movsx r6, word [r2]
test r6, r6
jz .no_dc
- DC_ADD_MMXEXT_INIT r2, r3, r6
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
@@ -515,7 +565,7 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
add word [r2], 32
IDCT8_ADD_MMX_START r2 , rsp
IDCT8_ADD_MMX_START r2+8, rsp+64
- IDCT8_ADD_MMX_END r6 , rsp, r3
+ IDCT8_ADD_MMX_END r6 , rsp, r3, r2
mov r6d, dword [r1+r5*4]
lea r6, [r0+r6+4]
IDCT8_ADD_MMX_END r6 , rsp+8, r3
@@ -547,7 +597,8 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, strid
test r6, r6
jz .no_dc
INIT_MMX cpuname
- DC_ADD_MMXEXT_INIT r2, r3, r6
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
@@ -650,7 +701,8 @@ h264_idct_add8_mmxext_plane:
movsx r6, word [r2]
test r6, r6
jz .skipblock
- DC_ADD_MMXEXT_INIT r2, r3, r6
+ mov word [r2], 0
+ DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
add r0, [dst2q]
@@ -693,7 +745,9 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext:
movd m0, [r2 ] ; 0 0 X D
+ mov word [r2+ 0], 0
punpcklwd m0, [r2+32] ; x X d D
+ mov word [r2+32], 0
paddsw m0, [pw_32]
psraw m0, 6
punpcklwd m0, m0 ; d d D D
@@ -723,6 +777,10 @@ h264_add8x4_idct_sse2:
paddw m0, [pw_32]
IDCT4_1D w,0,1,2,3,4,5
pxor m7, m7
+ mova [r2+ 0], m7
+ mova [r2+16], m7
+ mova [r2+32], m7
+ mova [r2+48], m7
STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
lea r0, [r0+r3*2]
STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index eb375f9..88fdb84 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -66,6 +66,10 @@ SECTION .text
paddd m0, [pd_32]
IDCT4_1D d,0,1,2,3,4,5
pxor m5, m5
+ mova [%2+ 0], m5
+ mova [%2+16], m5
+ mova [%2+32], m5
+ mova [%2+48], m5
STORE_DIFFx2 m0, m1, m4, m5, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m4, m5, %1, %3
@@ -100,6 +104,10 @@ add4x4_idct %+ SUFFIX:
paddd m0, [pd_32]
IDCT4_1D d,0,1,2,3,4,5
pxor m5, m5
+ mova [r2+ 0], m5
+ mova [r2+16], m5
+ mova [r2+32], m5
+ mova [r2+48], m5
STORE_DIFFx2 m0, m1, m4, m5, r5, r3
lea r5, [r5+r3*2]
STORE_DIFFx2 m2, m3, m4, m5, r5, r3
@@ -187,6 +195,7 @@ IDCT_ADD16_10
INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
movd m0, [r1]
+ mov dword [r1], 0
paddd m0, [pd_32]
psrad m0, 6
lea r1, [r2*3]
@@ -199,11 +208,11 @@ cglobal h264_idct_dc_add_10,3,3
; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 0
-cglobal h264_idct8_dc_add_10,3,3,7
- mov r1d, [r1]
- add r1, 32
- sar r1, 6
- movd m0, r1d
+cglobal h264_idct8_dc_add_10,3,4,7
+ movd m0, [r1]
+ mov dword [r1], 0
+ paddd m0, [pd_32]
+ psrad m0, 6
lea r1, [r2*3]
SPLATW m0, m0, 0
mova m6, [pw_pixel_max]
@@ -255,6 +264,8 @@ idct_dc_add %+ SUFFIX:
add r5, r0
movq m0, [r2+ 0]
movhps m0, [r2+64]
+ mov dword [r2+ 0], 0
+ mov dword [r2+64], 0
paddd m0, [pd_32]
psrad m0, 6
pshufhw m0, m0, 0
@@ -473,6 +484,22 @@ h264_idct8_add1_10 %+ SUFFIX:
packssdw m8, m0
paddsw m8, [r0]
pxor m0, m0
+ mova [r1+ 0], m0
+ mova [r1+ 16], m0
+ mova [r1+ 32], m0
+ mova [r1+ 48], m0
+ mova [r1+ 64], m0
+ mova [r1+ 80], m0
+ mova [r1+ 96], m0
+ mova [r1+112], m0
+ mova [r1+128], m0
+ mova [r1+144], m0
+ mova [r1+160], m0
+ mova [r1+176], m0
+ mova [r1+192], m0
+ mova [r1+208], m0
+ mova [r1+224], m0
+ mova [r1+240], m0
CLIPW m8, m0, [pw_pixel_max]
mova [r0], m8
mova m8, [pw_pixel_max]
@@ -492,6 +519,22 @@ h264_idct8_add1_10 %+ SUFFIX:
lea r3, [r0+8]
IDCT8_ADD_SSE_END r0, rsp, r2
IDCT8_ADD_SSE_END r3, rsp+16, r2
+ mova [r1+ 0], m7
+ mova [r1+ 16], m7
+ mova [r1+ 32], m7
+ mova [r1+ 48], m7
+ mova [r1+ 64], m7
+ mova [r1+ 80], m7
+ mova [r1+ 96], m7
+ mova [r1+112], m7
+ mova [r1+128], m7
+ mova [r1+144], m7
+ mova [r1+160], m7
+ mova [r1+176], m7
+ mova [r1+192], m7
+ mova [r1+208], m7
+ mova [r1+224], m7
+ mova [r1+240], m7
%endif ; ARCH_X86_64
add rsp, pad
--
1.7.11.3