[FFmpeg-devel] [PATCH] MMX implementation of VC-1 inverse transforms
Michael Niedermayer
michaelni
Mon Jan 14 12:05:23 CET 2008
On Sun, Jan 13, 2008 at 05:10:30PM +0100, Christophe GISQUET wrote:
> Each function is around 2 times faster. Functions taking the most time
> now are top level ones or for entropy decoding.
[...]
> +DECLARE_ALIGNED_16(const uint64_t, ff_pw_6) = 0x0006000600060006LL;
duplicate
> +DECLARE_ALIGNED_16(const uint64_t, ff_pw_12) = 0x000C000C000C000CLL;
> +
> +static void vc1_inv_trans8_1d(DCTELEM *block, long int off, int rnd1, int rnd2, unsigned int shift)
> +{
> + DECLARE_ALIGNED_16(uint64_t, mm_rnd1);
> + DECLARE_ALIGNED_16(uint64_t, mm_rnd2);
> + /* Might already be aligned on the stack */
> + DECLARE_ALIGNED_16(uint64_t, mm_shift) = shift;
> +
> + asm volatile (
> + "movd %2, %%mm0 \n\t"
> + "movd %3, %%mm1 \n\t"
> + "punpcklwd %%mm0, %%mm0 \n\t"
> + "punpcklwd %%mm1, %%mm1 \n\t"
> + "punpckldq %%mm0, %%mm0 \n\t"
> + "punpckldq %%mm1, %%mm1 \n\t"
> + "movq %%mm0, %0 \n\t"
> + "movq %%mm1, %1 \n\t"
> + : "+m"(mm_rnd1), "+m"(mm_rnd2)
> + : "m"(rnd1), "m"(rnd2)
> + );
as rnd1 and rnd2 as well as shift are constants, building these in the inner
loops is completely unacceptable; you should pass int64_t arguments
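something along these lines (only a sketch; the PW() helper and the operand
numbering are made up here) avoids the punpck setup on every call:

    #define PW(x) ((uint64_t)(x) * 0x0001000100010001ULL)

    static void vc1_inv_trans8_1d(DCTELEM *block, long off,
                                  uint64_t rnd1, uint64_t rnd2, uint64_t shift)
    {
        asm volatile(
            /* ... */
            "paddw %2, %%mm5        \n\t" /* rnd1 used directly as "m" operand */
            "psraw %4, %%mm5        \n\t" /* shift count read the same way     */
            /* ... */
            : "+r"(block)
            : "r"(off), "m"(rnd1), "m"(rnd2), "m"(shift)
        );
    }

    /* callers then pass prebuilt values, e.g.
       vc1_inv_trans8_1d(block, 8*2, PW(4), PW(4), 3); */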
> +
> + asm volatile(
> + "movq (%0 ), %%mm1 \n\t"
> + "movq (%0,%1,4), %%mm2 \n\t"
> + "movq (%0,%1,2), %%mm3 \n\t"
> + "movq (%0,%2,2), %%mm4 \n\t"
> + "movq %%mm1, %%mm0 \n\t"
> + "psubw %%mm2, %%mm1 \n\t" /* s0-s4 */
> + "paddw %%mm2, %%mm0 \n\t" /* s0+s4*/
> + "movq %%mm3, %%mm5 \n\t"
> + "movq %%mm4, %%mm6 \n\t"
> + "pmullw ff_pw_12, %%mm0 \n\t" /* t1 */
> + "pmullw ff_pw_12, %%mm1 \n\t" /* t2 */
> + "psllw $4, %%mm3 \n\t" /* 16s2 */
> + "psllw $4, %%mm6 \n\t" /* 16s6 */
> + "pmullw ff_pw_6, %%mm5 \n\t" /* 6s2 */
> + "pmullw ff_pw_6, %%mm4 \n\t" /* 6s6 */
> + "psubw %%mm6, %%mm5 \n\t" /* t4 */
> + "paddw %%mm4, %%mm3 \n\t" /* t3 */
> +
> + "movq %%mm0, %%mm2 \n\t"
> + "movq %%mm1, %%mm4 \n\t"
> + "paddsw %%mm3, %%mm0 \n\t" /* t5 */
> + "psubsw %%mm3, %%mm2 \n\t" /* t8 */
> + "paddsw %%mm5, %%mm1 \n\t" /* t6 */
> + "psubsw %%mm5, %%mm4 \n\t" /* t7 */
> +
> + /* Storing results for later */
> + "movq %%mm0, (%0 ) \n\t" /* src[0] = t5 */
> + "movq %%mm1, (%0,%2,2) \n\t" /* src[2] = t6 */
> + "movq %%mm4, (%0,%1,2) \n\t" /* src[4] = t7 */
> + "movq %%mm2, (%0,%1,4) \n\t" /* src[6] = t8 */
you do not need temporary storage
the butterflies can be implemented like:
b+=a
a+=a
a-=b
and yes, do the odd part first if that's simpler!
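(written out, starting from a = a0, b = b0:

    b += a;    /* b = a0 + b0 */
    a += a;    /* a = 2*a0    */
    a -= b;    /* a = a0 - b0 */

so both butterfly outputs stay in registers and nothing goes through memory)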
> +
> +/* Using (a<<4 - a) and (a<<3 + a) instead of 15*a and 9*a is slower (2% total) */
> +#define LOAD_CONV(A, B, C, D, OFF) \
> + "movq (%0,"OFF"), %%mm"#A" \n\t" \
> + "movq %%mm"#A", %%mm"#B" \n\t" \
> + "movq %%mm"#A", %%mm"#C" \n\t" \
> + "movq %%mm"#A", %%mm"#D" \n\t" \
> + "psllw $4, %%mm"#A" \n\t" \
> + "pmullw "MANGLE(ff_pw_15)", %%mm"#B" \n\t" \
> + "pmullw "MANGLE(ff_pw_9)", %%mm"#C" \n\t" \
> + "psllw $2, %%mm"#D" \n\t"
> +
> + LOAD_CONV(1, 2, 3, 4, "%1")
> + LOAD_CONV(0, 5, 6, 7, "%2")
> + "paddw %%mm5, %%mm1 \n\t"
> + "psubw %%mm7, %%mm2 \n\t"
> + "psubw %%mm0, %%mm3 \n\t"
> + "psubw %%mm6, %%mm4 \n\t"
> + LOAD_CONV(0, 5, 6, 7, "%3")
> + "paddw %%mm6, %%mm1 \n\t"
> + "psubw %%mm0, %%mm2 \n\t"
> + "paddw %%mm7, %%mm3 \n\t"
> + "paddw %%mm5, %%mm4 \n\t"
> + LOAD_CONV(0, 5, 6, 7, "%4")
> + "paddw %%mm7, %%mm1 \n\t" /* t1 */
> + "psubw %%mm6, %%mm2 \n\t" /* t2 */
> + "paddw %%mm5, %%mm3 \n\t" /* t3 */
> + "psubw %%mm0, %%mm4 \n\t" /* t4 */
uhm
instead of
> + "movq (%0,"OFF"), %%mm0 \n\t" \
> + "movq %%mm0, %%mm5 \n\t" \
> + "movq %%mm0, %%mm6 \n\t" \
> + "movq %%mm0, %%mm7 \n\t" \
> + "psllw $4, %%mm0 \n\t" \
> + "pmullw "MANGLE(ff_pw_15)", %%mm5 \n\t" \
> + "pmullw "MANGLE(ff_pw_9)", %%mm6 \n\t" \
> + "psllw $2, %%mm7 \n\t"
> + "paddw %%mm5, %%mm1 \n\t"
> + "psubw %%mm7, %%mm2 \n\t"
> + "psubw %%mm0, %%mm3 \n\t"
> + "psubw %%mm6, %%mm4 \n\t"
you should at least do
> + "movq (%0,"OFF"), %%mm0 \n\t" \
> + "psubw %%mm0, %%mm1 \n\t"
> + "psubw %%mm0, %%mm4 \n\t"
> + "psllw $2, %%mm0 \n\t"
> + "psubw %%mm0, %%mm2 \n\t"
> + "paddw %%mm0, %%mm0 \n\t"
> + "psubw %%mm0, %%mm4 \n\t"
> + "paddw %%mm0, %%mm0 \n\t"
> + "psubw %%mm0, %%mm3 \n\t"
> + "paddw %%mm0, %%mm1 \n\t"
2 instructions fewer, 3 registers fewer, no multiply, no constants read from memory
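to see what the sequence does, here is the running value of mm0 in plain C
(accumulator names t1..t4 as in your comments, sketch only):

    int a = src[off];
    t1 -= a;              /* becomes +15a once the 16a is added back  */
    t4 -= a;              /* becomes  -9a once the  8a is subtracted  */
    a <<= 2;              /*  4a                                      */
    t2 -= a;              /* t2 -=  4a                                */
    a  += a;              /*  8a                                      */
    t4 -= a;              /* t4 -=  9a in total                       */
    a  += a;              /* 16a                                      */
    t3 -= a;              /* t3 -= 16a                                */
    t1 += a;              /* t1 += 15a in total                       */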
> +
> +/* Be aware that OUT2>OUT3 while OUT0<OUT1 */
> +#define RELOAD_SAVE(R0,R1, IN0,IN1, OUT0,OUT1, OUT2,OUT3) \
> + "movq (%0"IN0"), %%mm5 \n\t" /* reload tx */ \
> + "movq (%0"IN1"), %%mm6 \n\t" /* reload ty */ \
> + "movq %%mm5, %%mm0 \n\t" \
> + "movq %%mm6, %%mm7 \n\t" \
> + "paddw %5, %%mm5 \n\t" \
> + "paddw %5, %%mm6 \n\t" \
> + "paddw %6, %%mm0 \n\t" \
> + "paddw %6, %%mm7 \n\t" \
> + \
> + "paddw %%"#R0", %%mm5 \n\t" \
> + "paddw %%"#R1", %%mm6 \n\t" \
> + "psubw %%"#R0", %%mm0 \n\t" \
> + "psubw %%"#R1", %%mm7 \n\t" \
> + "psraw %7, %%mm5 \n\t" \
> + "psraw %7, %%mm6 \n\t" \
> + "psraw %7, %%mm0 \n\t" \
> + "psraw %7, %%mm7 \n\t" \
> + "movq %%mm5, (%0"OUT0") \n\t" \
> + "movq %%mm6, (%0"OUT1") \n\t" \
> + "movq %%mm0, (%0"OUT2") \n\t" \
> + "movq %%mm7, (%0"OUT3") \n\t"
> +
> + /* Output memory must contain input memory
> + to not overwrite other input */
> + RELOAD_SAVE(mm1,mm2, " ",",%2,2", " ",",%1", ",%4",",%2,2")
> + RELOAD_SAVE(mm3,mm4, ",%1,2",",%1,4", ",%1,2",",%2", ",%3",",%1,4")
> +
> + : "+r"(block)
the +4 can be factored out:
t3= 12*src[0] + 4;
t4= 12*src[4];
t1= t3 + t4;
t2= t3 - t4;
the same is true for the +64 in the second loop
> + : "r"(off), "r"(3*off), "r"(5*off), "r"(7*off),
this needlessly wastes 4 registers on loading constants,
and results in more complex and slower addressing
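since off is a compile-time constant (16 bytes) in both callers, a single
base register with immediate displacements is enough, e.g. (sketch):

    "movq 1*16(%0), %%mm1   \n\t"
    "movq 3*16(%0), %%mm2   \n\t"
    "movq 5*16(%0), %%mm3   \n\t"
    "movq 7*16(%0), %%mm4   \n\t"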
> + "m"(mm_rnd1), "m"(mm_rnd2), "m"(mm_shift)
> + );
> +}
> +
> +static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
> +{
> + transpose8x8_mmx(block);
all initial permutations (here a transpose) MUST be merged into the scantable
all other codecs do this too! vc1 won't become an exception
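roughly like this (sketch; the table names here are only illustrative):

    /* at init time, fold the transpose into the scan order so the
       coefficients already land transposed after entropy decoding */
    for (i = 0; i < 64; i++) {
        int j = scan_src[i];                              /* original position   */
        scan_permutated[i] = (j >> 3) | ((j & 7) << 3);   /* transposed position */
    }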
> + vc1_inv_trans8_1d(block+0,8*2, 4, 4, 3);
> + vc1_inv_trans8_1d(block+4,8*2, 4, 4, 3);
> + transpose8x8_mmx(block);
> + vc1_inv_trans8_1d(block+0,8*2, 64, 64+1, 7);
> + vc1_inv_trans8_1d(block+4,8*2, 64, 64+1, 7);
> +}
> +
> +DECLARE_ALIGNED_16(const uint64_t, ff_pw_10) = 0x000A000A000A000ALL;
> +DECLARE_ALIGNED_16(const uint64_t, ff_pw_17) = 0x0011001100110011LL;
> +DECLARE_ALIGNED_16(const uint64_t, ff_pw_22) = 0x0016001600160016LL;
> +
> +/**
> + * Input data: R0, R1, R2, R3
> + * Output data: R0, TMP1, R1, TMP3
> + */
> +#define IDCT4_1D(R0, R1, R2, R3, TMP1, TMP2, TMP3, SHIFT) \
> + "movq %%"#R0", %%"#TMP1" \n\t" \
> + "movq %%"#R1", %%"#TMP2" \n\t" \
> + "paddw %%"#R2", %%"#R0" \n\t" \
> + "psubw %%"#R2", %%"#TMP1" \n\t" \
> + "pmullw "MANGLE(ff_pw_17)", %%"#R0" \n\t" /* t1 */ \
> + "pmullw "MANGLE(ff_pw_17)", %%"#TMP1" \n\t" /* t2 */ \
> + "movq %%"#R3", %%"#R2" \n\t" \
> + "pmullw "MANGLE(ff_pw_22)", %%"#R1" \n\t" /* t3 */ \
> + "pmullw "MANGLE(ff_pw_22)", %%"#R3" \n\t" /* t4 */ \
> + "paddw %%mm7, %%"#R0" \n\t" /* Add bias */ \
> + "paddw %%mm7, %%"#TMP1" \n\t" \
same as above: the multiply can be done before the butterfly, and
thus 1 bias add can be avoided
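in C terms (sketch, names only illustrative):

    t1 = 17*s0 + bias;
    t2 = 17*s2;
    a  = t1 + t2;          /* 17*(s0+s2) + bias */
    b  = t1 - t2;          /* 17*(s0-s2) + bias */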
> + "pmullw "MANGLE(ff_pw_10)", %%"#TMP2" \n\t" /* t5 */ \
> + "pmullw "MANGLE(ff_pw_10)", %%"#R2" \n\t" /* t6 */ \
bad:
> + "movq %%"#R0", %%"#TMP3" \n\t" \
> + "paddw %%"#R1", %%"#R0" \n\t" /* t1+t3 */ \
> + "psubw %%"#R1", %%"#TMP3" \n\t" /* t1-t3 */ \
> + "movq %%"#TMP1", %%"#R1" \n\t" \
> + "psubw %%"#R3", %%"#TMP1" \n\t" /* t2-t4 */ \
> + "paddw %%"#R3", %%"#R1" \n\t" /* t2+t4 */ \
> + "paddw %%"#R2", %%"#R0" \n\t" /* t1+t3 + t6 */ \
> + "paddw %%"#TMP2", %%"#TMP1" \n\t" /* t2-t4 + t5 */ \
> + "psubw %%"#TMP2", %%"#R1" \n\t" /* t2+t4 - t5 */ \
> + "psubw %%"#R2", %%"#TMP3" \n\t" /* t1-t3 - t6 */ \
good:
"paddw %%"#R2", %%"#R1" \n\t" /* t3 + t6 */ \
"psubw %%"#TMP2", %%"#R3" \n\t" /* t4 - t5 */ \
"movq %%"#R0", %%"#TMP3" \n\t" \
"paddw %%"#R1", %%"#R0" \n\t" /* t1+t3 + t6 */ \
"psubw %%"#R1", %%"#TMP3" \n\t" /* t1-t3 - t6 */ \
"movq %%"#TMP1", %%"#R1" \n\t" \
"psubw %%"#R3", %%"#TMP1" \n\t" /* t2-t4 + t5 */ \
"paddw %%"#R3", %%"#R1" \n\t" /* t2+t4 - t5 */ \
please try to simplify your code instead of having others go over it and
do the work
this is a mere factorization of t3+t6 and t4-t5, which you calculate twice
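in C terms the factorization is simply (sketch, names only illustrative):

    u = t3 + t6;           /* each computed once */
    v = t4 - t5;
    out_a = t1 + u;        /* t1 + t3 + t6 */
    out_b = t1 - u;        /* t1 - t3 - t6 */
    out_c = t2 + v;        /* t2 + t4 - t5 */
    out_d = t2 - v;        /* t2 - t4 + t5 */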
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
I do not agree with what you have to say, but I'll defend to the death your
right to say it. -- Voltaire