[FFmpeg-devel] [PATCH] mmx implementation of vc-1 inverse transformations

Michael Niedermayer michaelni
Mon Jun 30 05:07:39 CEST 2008


On Sat, Jun 28, 2008 at 12:31:41PM +0200, Victor Pollex wrote:
> Michael Niedermayer wrote:
>> On Sun, Jun 22, 2008 at 03:21:53PM +0200, Victor Pollex wrote:
>>   
>>> Michael Niedermayer wrote:
>>>     
>>>> On Sat, Jun 21, 2008 at 03:37:44PM +0200, Victor Pollex wrote:
>>>>         
>>>>> Hi,
>>>>> as in subject.
>>>>>
>>>>> Victor Pollex
>>>>>             
>>>>         
>>>>> Index: libavcodec/i386/vc1dsp_mmx.c
>>>>> ===================================================================
>>>>> --- libavcodec/i386/vc1dsp_mmx.c	(revision 13854)
>>>>> +++ libavcodec/i386/vc1dsp_mmx.c	(working copy)
>>>>> @@ -1,6 +1,7 @@
>>>>>  /*
>>>>>   * VC-1 and WMV3 - DSP functions MMX-optimized
>>>>>   * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
>>>>> + * Copyright (c) 2008 Victor Pollex
>>>>>   *
>>>>>   * Permission is hereby granted, free of charge, to any person
>>>>>   * obtaining a copy of this software and associated documentation
>>>>> @@ -467,7 +468,609 @@
>>>>>  DECLARE_FUNCTION(3, 2)
>>>>>  DECLARE_FUNCTION(3, 3)
>>>>>  +#define LOAD_4X4(stride,base,in)\
>>>>> +    "movq 0*"#stride"+"#base#in", %%mm0\n\t"\
>>>>> +    "movq 1*"#stride"+"#base#in", %%mm1\n\t"\
>>>>> +    "movq 2*"#stride"+"#base#in", %%mm2\n\t"\
>>>>> +    "movq 3*"#stride"+"#base#in", %%mm3\n\t"
>>>>>             
>>>> duplicate of LOAD4
>>>>         
>>> The only LOAD4 I found is in dsputilenc_mmx.c and it has a fixed stride of
>>> 8, but I also need a stride of 16. If I missed something, give me a hint.
>>>     
>>
>> LOAD4 loads 4 values and your LOAD_4X4 loads 4 values; they just differ in
>> which parts are hardcoded and which are settable through arguments. That
>> doesn't make them any less duplicates of each other. Simply add a stride
>> argument to the existing LOAD4 (in a separate patch!) and use it.
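>>
>> e.g. something like (untested, parameter names illustrative):
>>
>> #define LOAD4(stride, offset, in, a, b, c, d)\
>>     "movq 0*"#stride"+"#offset#in", "#a"\n\t"\
>>     "movq 1*"#stride"+"#offset#in", "#b"\n\t"\
>>     "movq 2*"#stride"+"#offset#in", "#c"\n\t"\
>>     "movq 3*"#stride"+"#offset#in", "#d"\n\t"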
>>
>>   
> Attached is a patch which moves the macros from dsputilenc_mmx.c to
> dsputil_mmx.h.
>> [...]
>>   
>>>>> +/*
>>>>> +    precondition:
>>>>> +        r0 = row0/col0
>>>>> +        r1 = row1/col1
>>>>> +        r2 = row2/col2
>>>>> +        r3 = row3/col3
>>>>> +
>>>>> +    postcondition:
>>>>> +        r0 = col0/row0
>>>>> +        r1 = col1/row1
>>>>> +        r2 = col2/row2
>>>>> +        r3 = col3/row3
>>>>> +        t0 = undefined
>>>>> +*/
>>>>> +#define TRANSPOSE_4X4(r0,r1,r2,r3,t0)\
>>>>> +    "movq      "#r2", "#t0"\n\t"\
>>>>> +    "punpcklwd "#r3", "#r2"\n\t"\
>>>>> +    "punpckhwd "#r3", "#t0"\n\t"\
>>>>> +    \
>>>>> +    "movq      "#r0", "#r3"\n\t"\
>>>>> +    "punpcklwd "#r1", "#r0"\n\t"\
>>>>> +    "punpckhwd "#r1", "#r3"\n\t"\
>>>>> +    \
>>>>> +    "movq      "#r0", "#r1"\n\t"\
>>>>> +    "punpckldq "#r2", "#r0"\n\t"\
>>>>> +    "punpckhdq "#r2", "#r1"\n\t"\
>>>>> +    \
>>>>> +    "movq      "#r3", "#r2"\n\t"\
>>>>> +    "punpckldq "#t0", "#r2"\n\t"\
>>>>> +    "punpckhdq "#t0", "#r3"\n\t"
>>>>>             
>>>> duplicate of TRANSPOSE4
>>>>         
>>> Basically yes, but here the output registers are the same, and in the same
>>> order, as the input registers, whereas with TRANSPOSE4 the input registers
>>> are a, b, c, d and the output registers are a, d, t, c according to the
>>> comments. I could try to rewrite part of the code to use TRANSPOSE4, but I
>>> would rather rewrite it so that I don't need to transpose in the first
>>> place.
>>>     
>>
>> It's entirely your decision how to solve it, as long as the solution is
>> optimal speed/size/readability-wise. But I will not approve a patch that
>> adds duplicated code (like TRANSPOSE4 / TRANSPOSE_4X4) when it is
>> avoidable.
>>
>>   
> Attached is a patch which now uses TRANSPOSE4, LOAD4 and STORE4. I tried to
> write a 4x4 row transformation without the need to transpose, but it ended
> up being slower.
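>
> For reference, the transpose pass itself just performs the usual 4x4
> transpose of 16-bit words; in scalar C terms it is nothing more than
> (illustrative only):
>
>     for (i = 0; i < 4; i++)
>         for (j = 0; j < 4; j++)
>             out[j][i] = in[i][j];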
>> [...]
>>   
>>>>> +static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
>>>>> +{
>>>>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>>> +    asm volatile(
>>>>> +    TRANSFORM_8X4_ROW(0x00, (%0), %1)
>>>>> +    TRANSFORM_8X4_ROW(0x40, (%0), %1)
>>>>> +
>>>>> +
>>>>> +    LOAD_4X4(0x10, 0x00, %1)
>>>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>>> +    STORE_4X4(0x10, 0x00, %1)
>>>>> +    LOAD_4X4(0x10, 0x40, %1)
>>>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>>> +    STORE_4X4(0x10, 0x40, %1)
>>>>> +    TRANSFORM_4X8_COL(0x00, %1, (%0))
>>>>> +
>>>>> +    LOAD_4X4(0x10, 0x08, %1)
>>>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>>> +    STORE_4X4(0x10, 0x08, %1)
>>>>> +    LOAD_4X4(0x10, 0x48, %1)
>>>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>>> +    STORE_4X4(0x10, 0x48, %1)
>>>>> +    TRANSFORM_4X8_COL(0x08, %1, (%0))
>>>>> +    : "+r"(block), "+m"(temp)
>>>>> +    :
>>>>> +    : "memory"
>>>>> +    );
>>>>> +}
>>>>> +
>>>>> +static void vc1_inv_trans_8x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>>>>> +{
>>>>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>>> +    asm volatile(
>>>>> +    TRANSFORM_8X4_ROW(0x00, (%0), %1)
>>>>> +
>>>>> +    LOAD_4X4(0x10, 0x00, %1)
>>>>> +    TRANSFORM_4X4_COL
>>>>> +    STORE_4X4(0x10, 0x00, (%0))
>>>>> +    LOAD_4X4(0x10, 0x08, %1)
>>>>> +    TRANSFORM_4X4_COL
>>>>> +    STORE_4X4(0x10, 0x08, (%0))
>>>>> +
>>>>> +    "pxor        %%mm7,     %%mm7\n\t"
>>>>> +    LOAD_4X4(0x08, 0x00, (%0))
>>>>> +    LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
>>>>> +    "add %3, %2\n\t"
>>>>> +    LOAD_4X4(0x08, 0x20, (%0))
>>>>> +    LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
>>>>> +    : "+r"(block), "+m"(temp), "+r"(dest)
>>>>> +    : "r"(linesize)
>>>>> +    : "memory"
>>>>> +    );
>>>>> +}
>>>>> +
>>>>> +static void vc1_inv_trans_4x8_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>>>>> +{
>>>>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>>> +    asm volatile(
>>>>> +    LOAD_4X4(0x10, 0x00, (%0))
>>>>> +    TRANSFORM_4X4_ROW
>>>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>>> +    STORE_4X4(0x10, 0x00, %1)
>>>>> +    LOAD_4X4(0x10, 0x40, (%0))
>>>>> +    TRANSFORM_4X4_ROW
>>>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>>> +    STORE_4X4(0x10, 0x40, %1)
>>>>> +
>>>>> +    TRANSFORM_4X8_COL(0x00, %1, (%0))
>>>>> +
>>>>> +    "pxor     %%mm7,   %%mm7\n\t"
>>>>> +    LOAD_4X4(0x10, 0x00, (%0))
>>>>> +    LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
>>>>> +    "add %3, %2\n\t"
>>>>> +    LOAD_4X4(0x10, 0x40, (%0))
>>>>> +    LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
>>>>> +    : "+r"(block), "+m"(temp), "+r"(dest)
>>>>> +    : "r"(linesize)
>>>>> +    : "memory"
>>>>> +    );
>>>>> +}
>>>>> +
>>>>> +static void vc1_inv_trans_4x4_mmx(uint8_t *dest, int linesize, DCTELEM *block)
>>>>> +{
>>>>> +    asm volatile(
>>>>> +    LOAD_4X4(0x10, 0x00, (%1))
>>>>> +    TRANSFORM_4X4_ROW
>>>>> +    TRANSFORM_4X4_COL
>>>>> +    "pxor     %%mm7,   %%mm7\n\t"
>>>>> +    LOAD_ADD_CLAMP_STORE_4X4(%0, %2)
>>>>> +    : "+r"(dest)
>>>>> +    : "r"(block), "r"(linesize)
>>>>> +    : "memory"
>>>>> +    );
>>>>> +}
>>>>>             
>>>> I do not think that brute-force duplicating and unrolling of all variants
>>>> is optimal. Also, benchmarks are needed: C vs. your MMX vs. MMX code with
>>>> no duplicated transforms.
>>>>
>>>> [...]
>>>>
>>>> [...]
>>>>         
>>> What do you mean by duplicated transforms? Do you mean that, for example,
>>> the 4x4 row transformation should be a function instead of a macro?
>>>     
>>
>> Yes (non-inlined functions, or just plain asm that uses loops/calls,
>> whatever); I meant this should be tried. Code cache is a limited resource,
>> and your code after macro expansion is likely large. I do not know which
>> way would be faster, of course, but it should be tested.
>>
>>   
> I tried it with non-inlined functions for the 8x8 transformation, but for
> me it was slower.
>>   
>>> As for benchmarks, I used START_TIMER and STOP_TIMER from libavutil with
>>> mingw 4.3.0-tdm3. I don't know if it is an appropriate method, but I did
>>> it somewhat like this:
>>>
>>> for(i = 0; i < (1 << 19); ++i) {
>>>    memcpy(block1, block, 64 * sizeof(short));
>>>    START_TIMER
>>>    vc1_inv_trans_4x4_c(dest, 8, block1);
>>>    STOP_TIMER("vc1_inv_trans_4x4_c")
>>> }
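>>>
>>> (The memcpy re-initializes block1 on each iteration; presumably this is
>>> needed because the transform modifies the coefficient block in place.)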
>>>     
>>
>> The transforms should be tested in the real code in vc1.c; just put
>> START/STOP_TIMER around the call to the transform.
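>>
>> e.g. roughly (the exact call site and context in vc1.c may differ):
>>
>> {START_TIMER
>> s->dsp.vc1_inv_trans_8x8(block);
>> STOP_TIMER("vc1_inv_trans_8x8")}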
>>
>>
>> [...]
>>
>>   
> These benchmarks were done with mingw 3.4.5.
> For the C version:
> 19801 dezicycles in vc1_inv_trans_8x8, 1048466 runs, 110 skips
> 4069 dezicycles in vc1_inv_trans_4x4, 262046 runs, 98 skips
> 9432 dezicycles in vc1_inv_trans_4x8, 524263 runs, 25 skips
> 8056 dezicycles in vc1_inv_trans_8x4, 524259 runs, 29 skips
> 3992 dezicycles in vc1_inv_trans_4x4, 524148 runs, 140 skips
> 19743 dezicycles in vc1_inv_trans_8x8, 2096925 runs, 227 skips
> 9393 dezicycles in vc1_inv_trans_4x8, 1048529 runs, 47 skips
> 8006 dezicycles in vc1_inv_trans_8x4, 1048520 runs, 56 skips
> 3929 dezicycles in vc1_inv_trans_4x4, 1048312 runs, 264 skips
> 3893 dezicycles in vc1_inv_trans_4x4, 2096685 runs, 467 skips
> 7950 dezicycles in vc1_inv_trans_8x4, 2097040 runs, 112 skips
> 9358 dezicycles in vc1_inv_trans_4x8, 2097028 runs, 124 skips
> 19529 dezicycles in vc1_inv_trans_8x8, 4193864 runs, 440 skips
>
> For the MMX version:
> 7157 dezicycles in vc1_inv_trans_8x8, 1048200 runs, 376 skips
> 2567 dezicycles in vc1_inv_trans_4x4, 261770 runs, 374 skips
> 4454 dezicycles in vc1_inv_trans_4x8, 523938 runs, 350 skips
> 3949 dezicycles in vc1_inv_trans_8x4, 523877 runs, 411 skips
> 2532 dezicycles in vc1_inv_trans_4x4, 523793 runs, 495 skips
> 7159 dezicycles in vc1_inv_trans_8x8, 2096437 runs, 715 skips
> 4424 dezicycles in vc1_inv_trans_4x8, 1047981 runs, 595 skips
> 3914 dezicycles in vc1_inv_trans_8x4, 1047851 runs, 725 skips
> 2512 dezicycles in vc1_inv_trans_4x4, 1047902 runs, 674 skips
> 2501 dezicycles in vc1_inv_trans_4x4, 2096179 runs, 973 skips
> 3896 dezicycles in vc1_inv_trans_8x4, 2095994 runs, 1158 skips
> 4412 dezicycles in vc1_inv_trans_4x8, 2096082 runs, 1070 skips
> 7142 dezicycles in vc1_inv_trans_8x8, 4193046 runs, 1258 skips
>
> If you are wondering why the C version of the 8x8 transformation is no
> longer as fast as the MMX version, as my last post suggested: I mistakenly
> benchmarked the MMX version twice last time. I attached a patch which adds
> START/STOP_TIMER around the calls so that someone else can check.
>
> [...]
>>   
>>> +#define SRA2(r0,r1,c0)\
>>> +    "psraw $"#c0", "#r0"\n\t" /* r0 >> c0 */\
>>> +    "psraw $"#c0", "#r1"\n\t" /* r1 >> c0 */
>>> +
>>> +/*
>>> +    postcondition:
>>> +        r0 = r0 >> c0;
>>> +        r1 = r1 >> c0;
>>> +        r2 = r2 >> c0;
>>> +        r3 = r3 >> c0;
>>> +*/
>>> +#define SRA4(r0,r1,r2,r3,c0)\
>>> +    SRA2(r0,r1,c0)\
>>> +    SRA2(r2,r3,c0)
>>>     
>>
>> I think generic macros like:
>>
>> #define OPC_SS_AB(opc, dst0, dst1, src) \
>>     opc" "src", "dst0"\n\t" \
>>     opc" "src", "dst1"\n\t"
>>
>> #define OPC_SSSS_ABCD(opc, dst0, dst1, dst2, dst3, src) \
>>     OPC_SS_AB(opc, dst0, dst1, src) \
>>     OPC_SS_AB(opc, dst2, dst3, src)
>>
>> would be simpler.
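>>
>> SRA4(r0,r1,r2,r3,c0) would then be replaced by e.g. (registers and shift
>> count illustrative):
>>
>> OPC_SSSS_ABCD("psraw", "%%mm0", "%%mm1", "%%mm2", "%%mm3", "$3")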
>>
>>
>> [...]
>>   
>>> +DECLARE_ALIGNED_16(static const int16_t, constants[]) = {
>>> +  X4( 4),
>>> +  X4(64)
>>> +};
>>>     
>>
>> Also, please reduce the number of macros in your code; this hurts
>> readability a lot. That means dropping at least every macro that is bigger
>> than the space it saves:
>>
>> 4,4,4,4
>> 64,64,64,64
>>
>> vs.
>>
>> #define X4(x) x, x, x, x
>> X4( 4),
>> X4(64)
>>
>> being an example.
>>
>> [...]
>>   
> Hopefully all issues have been addressed.
>

> Index: libavcodec/i386/dsputil_mmx.h
> ===================================================================
> --- libavcodec/i386/dsputil_mmx.h	(revision 14018)
> +++ libavcodec/i386/dsputil_mmx.h	(working copy)
> @@ -57,6 +57,18 @@
>  extern const double ff_pd_1[2];
>  extern const double ff_pd_2[2];
>  
> +#define LOAD4(s,o,in,a,b,c,d)\
> +    "movq 0*"#s"+"#o#in", "#a"\n\t"\
> +    "movq 1*"#s"+"#o#in", "#b"\n\t"\
> +    "movq 2*"#s"+"#o#in", "#c"\n\t"\
> +    "movq 3*"#s"+"#o#in", "#d"\n\t"
> +
> +#define STORE4(s,o,out,a,b,c,d)\
> +    "movq "#a", 0*"#s"+"#o#out"\n\t"\
> +    "movq "#b", 1*"#s"+"#o#out"\n\t"\
> +    "movq "#c", 2*"#s"+"#o#out"\n\t"\
> +    "movq "#d", 3*"#s"+"#o#out"\n\t"
> +

The o+out and o+in parameter pairs seem a little redundant; one address
argument for each should do, I think. And I would rename s to stride for
better readability.
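
e.g. (untested; one possible reading of the above):

#define LOAD4(stride, addr, a, b, c, d)\
    "movq 0*"#stride"+"#addr", "#a"\n\t"\
    "movq 1*"#stride"+"#addr", "#b"\n\t"\
    "movq 2*"#stride"+"#addr", "#c"\n\t"\
    "movq 3*"#stride"+"#addr", "#d"\n\t"

used as e.g. LOAD4(0x10, 0x48(%0), %%mm0, %%mm1, %%mm2, %%mm3).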



[...]

> +/*
> +    postcondition:
> +        dst0 = [15:0](dst0 + src);
> +        dst1 = [15:0](dst1 - src);
> +*/
> +#define ADD1SUB1(src, dst0, dst1)\
> +    "paddw "#src", "#dst0"\n\t" /* dst0 + src */\
> +    "psubw "#src", "#dst1"\n\t" /* dst1 - src */

I think this code is self-explanatory and doesn't need a comment.


[...]

> +#define LOAD_ADD_CLAMP_STORE_8X1(io,t0,t1,r0,r1)\
> +    "movq      "#io", "#t0"\n\t"\
> +    "movq      "#t0", "#t1"\n\t"\
> +    "punpcklbw %%mm7, "#t0"\n\t"\
> +    "punpckhbw %%mm7, "#t1"\n\t"\
> +    "paddw     "#r0", "#t0"\n\t"\
> +    "paddw     "#r1", "#t1"\n\t"\
> +    "packuswb  "#t1", "#t0"\n\t"\
> +    "movq      "#t0", "#io"\n\t"

Some of the movq seem redundant.
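
In scalar terms the macro is just a saturating add of one row of residuals
to 8 pixels, i.e. roughly (names illustrative):

    for (i = 0; i < 8; i++)
        dst[i] = av_clip_uint8(dst[i] + residual[i]);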


[...]

> +    STORE4(0x10,0x00,(%0),%%mm3,%%mm0,%%mm2,%%mm4)
> +
> +    LOAD4(0x10,0x08,%1,%%mm0,%%mm1,%%mm2,%%mm3)
> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
> +    TRANSFORM_4X4_COL(%%mm0,%%mm3,%%mm4,%%mm2,%%mm1,%%mm5,%%mm6,%%mm7,%4)
> +    STORE4(0x10,0x08,(%0),%%mm3,%%mm0,%%mm2,%%mm4)
> +
> +    "pxor %%mm7, %%mm7\n\t"
> +    LOAD4(0x08,0x00,(%0),%%mm0,%%mm1,%%mm2,%%mm3)
> +    LOAD_ADD_CLAMP_STORE_8X2(%2,%3)
> +    "add     %3,    %2\n\t"
> +    LOAD4(0x08,0x20,(%0),%%mm0,%%mm1,%%mm2,%%mm3)
> +    LOAD_ADD_CLAMP_STORE_8X2(%2,%3)

Redundant store+load: the column-pass results are stored to (%0) and then
immediately reloaded for the add/clamp stage; keeping them in registers
would avoid the round trip.


[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

When the tyrant has disposed of foreign enemies by conquest or treaty, and
there is nothing more to fear from them, then he is always stirring up
some war or other, in order that the people may require a leader. -- Plato