[FFmpeg-cvslog] r10223 - in trunk/libavcodec/i386: dsputil_mmx.c snowdsp_mmx.c
Reimar Döffinger
Reimar.Doeffinger
Mon Aug 27 15:07:51 CEST 2007
Hello,
On Mon, Aug 27, 2007 at 01:05:03PM +0200, Michael Niedermayer wrote:
[...]
> if you want to fix it i dont mind helping, awnsering questions (even stupid
> questions about asm)
> i do mind fixing it myself because
> 1.i dont have a SSE2 system here, i only have ssh access to one which isnt
> that good for actually watching the result ...
> 2.if someone else fixes it that means someone else will afterwards be more
> familiar with the code and that means more potential developers who might
> improve it or other asm code in ffmpeg ...
Well, I do not have too much time, but still here is an attempt to
partially fix some things and add some helpful comments.
I think inner_add_yblock_bw_8_obmc_16_bh_even_sse2 works now, though
I only get a black-and-white image...
I don't claim the performance to be great or anything, and I am also
wondering about slice_buffer_get_line that is used by the C code but not
the asm...
Maybe you can comment on a few things, like which of the comments you
think should stay, whether adding the "&& add" is correct and, if so,
whether the add == 0 case is worth optimizing, etc.
Greetings,
Reimar Döffinger
-------------- next part --------------
Index: libavcodec/i386/snowdsp_mmx.c
===================================================================
--- libavcodec/i386/snowdsp_mmx.c (revision 10250)
+++ libavcodec/i386/snowdsp_mmx.c (working copy)
@@ -605,6 +605,15 @@
}
#endif //HAVE_7REGS
+/**
+ * Register usage:
+ * REG_c src_stride (2*src_stride in snow_inner_add_yblock_sse2_end_common1)
+ * REG_d block[ptr_offset]
+ * REG_S &obmc[2*y*obmc_stride]
+ * REG_D sb->line[src_y + 2*y][src_x]
+ * xmm7 0
+ * xmm3 0x00080008000800080008000800080008
+ */
#define snow_inner_add_yblock_sse2_header \
IDWTELEM * * dst_array = sb->line + src_y;\
long tmp;\
@@ -621,6 +630,10 @@
"mov (%%"REG_D"), %%"REG_D" \n\t"\
"add %3, %%"REG_D" \n\t"
+/* load two rows (8 bytes each) of block[ptr_offset] expanded to 16 bits into
+ * out_reg1 and out_reg2 and multiply by the 8 bytes at
+ * obmc[2*y*obmc_stride + s_offset] and obmc[(2*y + 1)*obmc_stride + s_offset]
+ */
#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
"movq (%%"REG_d"), %%"out_reg1" \n\t"\
@@ -689,47 +702,35 @@
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
+// xmm1/5 = obmc1[x] * block[3][x + y*src_stride]
snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
+// xmm1/5 += obmc2[x] * block[2][x + y*src_stride]
snow_inner_add_yblock_sse2_accum_8("2", "8")
+// xmm1/5 += obmc3[x] * block[1][x + y*src_stride]
snow_inner_add_yblock_sse2_accum_8("1", "128")
+// xmm1/5 += obmc4[x] * block[0][x + y*src_stride]
snow_inner_add_yblock_sse2_accum_8("0", "136")
"mov %0, %%"REG_d" \n\t"
- "movdqa (%%"REG_D"), %%xmm0 \n\t"
- "movdqa %%xmm1, %%xmm2 \n\t"
+ "movdqu (%%"REG_D"), %%xmm0 \n\t"
- "punpckhwd %%xmm7, %%xmm1 \n\t"
- "punpcklwd %%xmm7, %%xmm2 \n\t"
- "paddd %%xmm2, %%xmm0 \n\t"
- "movdqa 16(%%"REG_D"), %%xmm2 \n\t"
- "paddd %%xmm1, %%xmm2 \n\t"
+ "paddd %%xmm1, %%xmm0 \n\t"
"paddd %%xmm3, %%xmm0 \n\t"
- "paddd %%xmm3, %%xmm2 \n\t"
"mov %1, %%"REG_D" \n\t"
"mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
"add %3, %%"REG_D" \n\t"
- "movdqa (%%"REG_D"), %%xmm4 \n\t"
- "movdqa %%xmm5, %%xmm6 \n\t"
- "punpckhwd %%xmm7, %%xmm5 \n\t"
- "punpcklwd %%xmm7, %%xmm6 \n\t"
- "paddd %%xmm6, %%xmm4 \n\t"
- "movdqa 16(%%"REG_D"), %%xmm6 \n\t"
- "paddd %%xmm5, %%xmm6 \n\t"
+ "movdqu (%%"REG_D"), %%xmm4 \n\t"
+ "paddd %%xmm5, %%xmm4 \n\t"
"paddd %%xmm3, %%xmm4 \n\t"
- "paddd %%xmm3, %%xmm6 \n\t"
- "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
- "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
- "packssdw %%xmm2, %%xmm0 \n\t"
- "packuswb %%xmm7, %%xmm0 \n\t"
+ "psraw $4, %%xmm0 \n\t" /* FRAC_BITS. */
+ "packsswb %%xmm7, %%xmm0 \n\t"
"movq %%xmm0, (%%"REG_d") \n\t"
- "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
- "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
- "packssdw %%xmm6, %%xmm4 \n\t"
- "packuswb %%xmm7, %%xmm4 \n\t"
+ "psraw $4, %%xmm4 \n\t" /* FRAC_BITS. */
+ "packsswb %%xmm7, %%xmm4 \n\t"
"movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
snow_inner_add_yblock_sse2_end_8
}
@@ -851,9 +852,9 @@
void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
- if (b_w == 16)
+ if (b_w == 16 && add)
inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
- else if (b_w == 8 && obmc_stride == 16) {
+ else if (b_w == 8 && obmc_stride == 16 && add) {
if (!(b_h & 1))
inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else
More information about the ffmpeg-cvslog
mailing list