[FFmpeg-devel] [PATCH] mmx implementation of vc-1 inverse transformations

Victor Pollex victor.pollex
Thu Jul 3 14:51:18 CEST 2008


Michael Niedermayer schrieb:
> On Sat, Jun 28, 2008 at 12:31:41PM +0200, Victor Pollex wrote:
>   
>> Michael Niedermayer schrieb:
>>     
>>> On Sun, Jun 22, 2008 at 03:21:53PM +0200, Victor Pollex wrote:
>>>   
>>>       
>>>> Michael Niedermayer schrieb:
>>>>     
>>>>         
>>>>> On Sat, Jun 21, 2008 at 03:37:44PM +0200, Victor Pollex wrote:
>>>>>         
>>>>>           
>>>>>> Hi,
>>>>>> as in subject.
>>>>>>
>>>>>> Victor Pollex
>>>>>>             
>>>>>>             
>>>>>         
>>>>>           
>>>>>> Index: libavcodec/i386/vc1dsp_mmx.c
>>>>>> ===================================================================
>>>>>> --- libavcodec/i386/vc1dsp_mmx.c	(Revision 13854)
>>>>>> +++ libavcodec/i386/vc1dsp_mmx.c	(Arbeitskopie)
>>>>>> @@ -1,6 +1,7 @@
>>>>>>  /*
>>>>>>   * VC-1 and WMV3 - DSP functions MMX-optimized
>>>>>>   * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
>>>>>> + * Copyright (c) 2008 Victor Pollex
>>>>>>   *
>>>>>>   * Permission is hereby granted, free of charge, to any person
>>>>>>   * obtaining a copy of this software and associated documentation
>>>>>> @@ -467,7 +468,609 @@
>>>>>>  DECLARE_FUNCTION(3, 2)
>>>>>>  DECLARE_FUNCTION(3, 3)
>>>>>>  +#define LOAD_4X4(stride,base,in)\
>>>>>> +    "movq 0*"#stride"+"#base#in", %%mm0\n\t"\
>>>>>> +    "movq 1*"#stride"+"#base#in", %%mm1\n\t"\
>>>>>> +    "movq 2*"#stride"+"#base#in", %%mm2\n\t"\
>>>>>> +    "movq 3*"#stride"+"#base#in", %%mm3\n\t"
>>>>>>             
>>>>>>             
>>>>> duplicate of LOAD4
>>>>>         
>>>>>           
>>>> the only LOAD4 I found is in dsputilenc_mmx.c and has a fixed stride of 
>>>> 8, but I also need a stride of 16. If I missed something give me a hint.
>>>>     
>>>>         
>>> LOAD4 loads 4 and your LOAD_4X4 also loads 4; they have slightly different 
>>> things
>>> that are hardcoded and that are settable through arguments. This doesn't 
>>> make
>>> them less duplicates of each other. Simply add a stride to the existing LOAD4
>>> (in a separate patch!) and use it.
>>>
>>>   
>>>       
>> attached patch which moves the macros from dsputilenc_mmx.c to 
>> dsputil_mmx.h
>>     
>>> [...]
>>>   
>>>       
>>>>>> +/*
>>>>>> +    precondition:
>>>>>> +        r0 = row0/col0
>>>>>> +        r1 = row1/col1
>>>>>> +        r2 = row2/col2
>>>>>> +        r3 = row3/col3
>>>>>> +
>>>>>> +    postcondition:
>>>>>> +        r0 = col0/row0
>>>>>> +        r1 = col1/row1
>>>>>> +        r2 = col2/row2
>>>>>> +        r3 = col3/row3
>>>>>> +        t0 = undefined
>>>>>> +*/
>>>>>> +#define TRANSPOSE_4X4(r0,r1,r2,r3,t0)\
>>>>>> +    "movq      "#r2", "#t0"\n\t"\
>>>>>> +    "punpcklwd "#r3", "#r2"\n\t"\
>>>>>> +    "punpckhwd "#r3", "#t0"\n\t"\
>>>>>> +    \
>>>>>> +    "movq      "#r0", "#r3"\n\t"\
>>>>>> +    "punpcklwd "#r1", "#r0"\n\t"\
>>>>>> +    "punpckhwd "#r1", "#r3"\n\t"\
>>>>>> +    \
>>>>>> +    "movq      "#r0", "#r1"\n\t"\
>>>>>> +    "punpckldq "#r2", "#r0"\n\t"\
>>>>>> +    "punpckhdq "#r2", "#r1"\n\t"\
>>>>>> +    \
>>>>>> +    "movq      "#r3", "#r2"\n\t"\
>>>>>> +    "punpckldq "#t0", "#r2"\n\t"\
>>>>>> +    "punpckhdq "#t0", "#r3"\n\t"
>>>>>>             
>>>>>>             
>>>>> duplicate of TRANSPOSE4
>>>>>         
>>>>>           
>>>> basically yes, but here the output registers are the same and in the same 
>>>> order as the input registers, whereas with TRANSPOSE4 the input registers are 
>>>> a, b, c, d and the output registers are a, d, t, c according to the 
>>>> comments. If I feel like it, I could try to rewrite part of the code to use 
>>>> TRANSPOSE4, but I would rather try to rewrite it so that I don't need to 
>>>> transpose it in the first place.
>>>>     
>>>>         
>>> It's entirely your decision how to solve it, as long as the solution is
>>> optimal speed/size/readability-wise. But I will not approve a
>>> patch that adds duplicated code (like TRANSPOSE4 / TRANSPOSE_4X4) when it
>>> is avoidable
>>>
>>>   
>>>       
>> attached patch which now uses TRANSPOSE4, LOAD4, STORE4. I tried to write a 
>> 4x4 row transformation without the need to transpose, but it ended up being 
>> slower.
>>     
>>> [...]
>>>   
>>>       
>>>>>> +static void vc1_inv_trans_8x8_mmx(DCTELEM block[64])
>>>>>> +{
>>>>>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>>>> +    asm volatile(
>>>>>> +    TRANSFORM_8X4_ROW(0x00, (%0), %1)
>>>>>> +    TRANSFORM_8X4_ROW(0x40, (%0), %1)
>>>>>> +
>>>>>> +
>>>>>> +    LOAD_4X4(0x10, 0x00, %1)
>>>>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>>>> +    STORE_4X4(0x10, 0x00, %1)
>>>>>> +    LOAD_4X4(0x10, 0x40, %1)
>>>>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>>>> +    STORE_4X4(0x10, 0x40, %1)
>>>>>> +    TRANSFORM_4X8_COL(0x00, %1, (%0))
>>>>>> +
>>>>>> +    LOAD_4X4(0x10, 0x08, %1)
>>>>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>>>> +    STORE_4X4(0x10, 0x08, %1)
>>>>>> +    LOAD_4X4(0x10, 0x48, %1)
>>>>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>>>> +    STORE_4X4(0x10, 0x48, %1)
>>>>>> +    TRANSFORM_4X8_COL(0x08, %1, (%0))
>>>>>> +    : "+r"(block), "+m"(temp)
>>>>>> +    :
>>>>>> +    : "memory"
>>>>>> +    );
>>>>>> +}
>>>>>> +
>>>>>> +static void vc1_inv_trans_8x4_mmx(uint8_t *dest, int linesize, DCTELEM 
>>>>>> *block)
>>>>>> +{
>>>>>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>>>> +    asm volatile(
>>>>>> +    TRANSFORM_8X4_ROW(0x00, (%0), %1)
>>>>>> +
>>>>>> +    LOAD_4X4(0x10, 0x00, %1)
>>>>>> +    TRANSFORM_4X4_COL
>>>>>> +    STORE_4X4(0x10, 0x00, (%0))
>>>>>> +    LOAD_4X4(0x10, 0x08, %1)
>>>>>> +    TRANSFORM_4X4_COL
>>>>>> +    STORE_4X4(0x10, 0x08, (%0))
>>>>>> +
>>>>>> +    "pxor        %%mm7,     %%mm7\n\t"
>>>>>> +    LOAD_4X4(0x08, 0x00, (%0))
>>>>>> +    LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
>>>>>> +    "add %3, %2\n\t"
>>>>>> +    LOAD_4X4(0x08, 0x20, (%0))
>>>>>> +    LOAD_ADD_CLAMP_STORE_8X2(%2, %3)
>>>>>> +    : "+r"(block), "+m"(temp), "+r"(dest)
>>>>>> +    : "r"(linesize)
>>>>>> +    : "memory"
>>>>>> +    );
>>>>>> +}
>>>>>> +
>>>>>> +static void vc1_inv_trans_4x8_mmx(uint8_t *dest, int linesize, DCTELEM 
>>>>>> *block)
>>>>>> +{
>>>>>> +    DECLARE_ALIGNED_16(int16_t, temp[64]);
>>>>>> +    asm volatile(
>>>>>> +    LOAD_4X4(0x10, 0x00, (%0))
>>>>>> +    TRANSFORM_4X4_ROW
>>>>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>>>> +    STORE_4X4(0x10, 0x00, %1)
>>>>>> +    LOAD_4X4(0x10, 0x40, (%0))
>>>>>> +    TRANSFORM_4X4_ROW
>>>>>> +    TRANSPOSE_4X4(%%mm1, %%mm0, %%mm3, %%mm2, %%mm4)
>>>>>> +    STORE_4X4(0x10, 0x40, %1)
>>>>>> +
>>>>>> +    TRANSFORM_4X8_COL(0x00, %1, (%0))
>>>>>> +
>>>>>> +    "pxor     %%mm7,   %%mm7\n\t"
>>>>>> +    LOAD_4X4(0x10, 0x00, (%0))
>>>>>> +    LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
>>>>>> +    "add %3, %2\n\t"
>>>>>> +    LOAD_4X4(0x10, 0x40, (%0))
>>>>>> +    LOAD_ADD_CLAMP_STORE_4X4(%2, %3)
>>>>>> +    : "+r"(block), "+m"(temp), "+r"(dest)
>>>>>> +    : "r"(linesize)
>>>>>> +    : "memory"
>>>>>> +    );
>>>>>> +}
>>>>>> +
>>>>>> +static void vc1_inv_trans_4x4_mmx(uint8_t *dest, int linesize, DCTELEM 
>>>>>> *block)
>>>>>> +{
>>>>>> +    asm volatile(
>>>>>> +    LOAD_4X4(0x10, 0x00, (%1))
>>>>>> +    TRANSFORM_4X4_ROW
>>>>>> +    TRANSFORM_4X4_COL
>>>>>> +    "pxor     %%mm7,   %%mm7\n\t"
>>>>>> +    LOAD_ADD_CLAMP_STORE_4X4(%0, %2)
>>>>>> +    : "+r"(dest)
>>>>>> +    : "r"(block), "r"(linesize)
>>>>>> +    : "memory"
>>>>>> +    );
>>>>>> +}
>>>>>>             
>>>>>>             
>>>>> I do not think that brute-force duplicating and unrolling of all 
>>>>> variants
>>>>> is optimal. Also, benchmarks are needed comparing the C version, your MMX
>>>>> version, and MMX code with no duplicated transforms
>>>>>
>>>>> [...]
>>>>>         
>>>>>           
>>>> what do you mean with duplicated transforms? Do you mean that for example 
>>>> the 4x4 row transformation should be a method instead of a macro?
>>>>     
>>>>         
>>> Yes (non-inlined methods/functions, or just plain asm that uses loops/calls,
>>> whatever) — I meant this should be tried; code cache
>>> is a limited resource and your code after macro expansion is likely large.
>>> I do not know which way would be faster, of course, but it should be 
>>> tested.
>>>
>>>   
>>>       
>> I tried it with not inlined methods for the 8x8 transformation, but for me 
>> it was slower.
>>     
>>>   
>>>       
>>>> As for benchmarks I used START_TIMER and STOP_TIMER from libavutil with 
>>>> mingw 4.3.0-tdm3
>>>> I don't know if it is an appropriate method, but I did it somewhat like 
>>>> this
>>>>
>>>> for(i = 0; i < (1 << 19); ++i) {
>>>>    memcpy(block1, block, 64 * sizeof(short));
>>>>    START_TIMER
>>>>    vc1_inv_trans_4x4_c(dest, 8, block1);
>>>>    STOP_TIMER("vc1_inv_trans_4x4_c")
>>>> }
>>>>     
>>>>         
>>> The transforms should be tested in the real code that is in vc1.c
>>> just put START/STOP_TIMER over the call to the transform
>>>
>>>
>>> [...]
>>>
>>>   
>>>       
>> these have been done with mingw 3.4.5
>> for the c version
>> 19801 dezicycles in vc1_inv_trans_8x8, 1048466 runs, 110 skips
>> 4069 dezicycles in vc1_inv_trans_4x4, 262046 runs, 98 skips
>> 9432 dezicycles in vc1_inv_trans_4x8, 524263 runs, 25 skips
>> 8056 dezicycles in vc1_inv_trans_8x4, 524259 runs, 29 skips
>> 3992 dezicycles in vc1_inv_trans_4x4, 524148 runs, 140 skips
>> 19743 dezicycles in vc1_inv_trans_8x8, 2096925 runs, 227 skips
>> 9393 dezicycles in vc1_inv_trans_4x8, 1048529 runs, 47 skips
>> 8006 dezicycles in vc1_inv_trans_8x4, 1048520 runs, 56 skips
>> 3929 dezicycles in vc1_inv_trans_4x4, 1048312 runs, 264 skips
>> 3893 dezicycles in vc1_inv_trans_4x4, 2096685 runs, 467 skips
>> 7950 dezicycles in vc1_inv_trans_8x4, 2097040 runs, 112 skips
>> 9358 dezicycles in vc1_inv_trans_4x8, 2097028 runs, 124 skips
>> 19529 dezicycles in vc1_inv_trans_8x8, 4193864 runs, 440 skips
>>
>> for the mmx version
>> 7157 dezicycles in vc1_inv_trans_8x8, 1048200 runs, 376 skips
>> 2567 dezicycles in vc1_inv_trans_4x4, 261770 runs, 374 skips
>> 4454 dezicycles in vc1_inv_trans_4x8, 523938 runs, 350 skips
>> 3949 dezicycles in vc1_inv_trans_8x4, 523877 runs, 411 skips
>> 2532 dezicycles in vc1_inv_trans_4x4, 523793 runs, 495 skips
>> 7159 dezicycles in vc1_inv_trans_8x8, 2096437 runs, 715 skips
>> 4424 dezicycles in vc1_inv_trans_4x8, 1047981 runs, 595 skips
>> 3914 dezicycles in vc1_inv_trans_8x4, 1047851 runs, 725 skips
>> 2512 dezicycles in vc1_inv_trans_4x4, 1047902 runs, 674 skips
>> 2501 dezicycles in vc1_inv_trans_4x4, 2096179 runs, 973 skips
>> 3896 dezicycles in vc1_inv_trans_8x4, 2095994 runs, 1158 skips
>> 4412 dezicycles in vc1_inv_trans_4x8, 2096082 runs, 1070 skips
>> 7142 dezicycles in vc1_inv_trans_8x8, 4193046 runs, 1258 skips
>>
>> If you are wondering why the C version of the 8x8 transformation isn't as 
>> slow relative to the MMX version as my last post suggested: I mistakenly 
>> benchmarked the MMX version twice last time. I attached a patch which adds 
>> the START/STOP_TIMER around the calls for someone else to check.
>>
>> [...]
>>     
>>>   
>>>       
>>>> +#define SRA2(r0,r1,c0)\
>>>> +    "psraw $"#c0", "#r0"\n\t" /* r0 >> c0 */\
>>>> +    "psraw $"#c0", "#r1"\n\t" /* r1 >> c0 */
>>>> +
>>>> +/*
>>>> +    postcondition:
>>>> +        r0 = r0 >> c0;
>>>> +        r1 = r1 >> c0;
>>>> +        r2 = r2 >> c0;
>>>> +        r3 = r3 >> c0;
>>>> +*/
>>>> +#define SRA4(r0,r1,r2,r3,c0)\
>>>> +    SRA2(r0,r1,c0)\
>>>> +    SRA2(r2,r3,c0)
>>>>     
>>>>         
>>> I think a generic macro like:
>>>
>>> OPC_SS_AB(opc, dst0, dst1, src) \
>>>     opc" "src","dst0"\n\t" \
>>>     opc" "src","dst1"\n\t"
>>>
>>> OPC_SSSS_ABCD(opc, dst0, dst1, dst2, dst3, src) \
>>>     OPC_SS_AB(opc, dst0, dst1, src) \
>>>     OPC_SS_AB(opc, dst2, dst3, src) \
>>>
>>> would be simpler
>>>
>>>
>>> [...]
>>>   
>>>       
>>>> +DECLARE_ALIGNED_16(static const int16_t, constants[]) = {
>>>> +  X4( 4),
>>>> +  X4(64)
>>>> +};
>>>>     
>>>>         
>>> Also, please reduce the number of macros in your code; this hurts 
>>> readability a
>>> lot.
>>> That means at least every macro that is bigger than the space it saves.
>>> 4,4,4,4
>>> 64,64,64,64
>>>
>>> vs.
>>>
>>> X4( 4),
>>> X4(64)
>>> #define X4(x) x, x, x, x
>>>
>>> being an example
>>>
>>> [...]
>>>   
>>>       
>> hopefully all issues have been addressed.
>>
>>     
>
>   
>> Index: libavcodec/i386/dsputil_mmx.h
>> ===================================================================
>> --- libavcodec/i386/dsputil_mmx.h	(Revision 14018)
>> +++ libavcodec/i386/dsputil_mmx.h	(Arbeitskopie)
>> @@ -57,6 +57,18 @@
>>  extern const double ff_pd_1[2];
>>  extern const double ff_pd_2[2];
>>  
>> +#define LOAD4(s,o,in,a,b,c,d)\
>> +    "movq 0*"#s"+"#o#in", "#a"\n\t"\
>> +    "movq 1*"#s"+"#o#in", "#b"\n\t"\
>> +    "movq 2*"#s"+"#o#in", "#c"\n\t"\
>> +    "movq 3*"#s"+"#o#in", "#d"\n\t"
>> +
>> +#define STORE4(s,o,out,a,b,c,d)\
>> +    "movq "#a", 0*"#s"+"#o#out"\n\t"\
>> +    "movq "#b", 1*"#s"+"#o#out"\n\t"\
>> +    "movq "#c", 2*"#s"+"#o#out"\n\t"\
>> +    "movq "#d", 3*"#s"+"#o#out"\n\t"
>> +
>>     
>
> The o-out and o-in seem a little redundant; one for each should do, I think,
> and I would rename s to stride for better readability
>   
done
>
>
> [...]
>
>   
>> +/*
>> +    postcondition:
>> +        dst0 = [15:0](dst0 + src);
>> +        dst1 = [15:0](dst1 - src);
>> +*/
>> +#define ADD1SUB1(src, dst0, dst1)\
>> +    "paddw "#src", "#dst0"\n\t" /* dst0 + src */\
>> +    "psubw "#src", "#dst1"\n\t" /* dst1 - src */
>>     
>
> i think this code is self explanatory and doesnt need a comment
>   
ok
>
> [...]
>
>   
>> +#define LOAD_ADD_CLAMP_STORE_8X1(io,t0,t1,r0,r1)\
>> +    "movq      "#io", "#t0"\n\t"\
>> +    "movq      "#t0", "#t1"\n\t"\
>> +    "punpcklbw %%mm7, "#t0"\n\t"\
>> +    "punpckhbw %%mm7, "#t1"\n\t"\
>> +    "paddw     "#r0", "#t0"\n\t"\
>> +    "paddw     "#r1", "#t1"\n\t"\
>> +    "packuswb  "#t1", "#t0"\n\t"\
>> +    "movq      "#t0", "#io"\n\t"
>>     
>
> some of the movq seem redundant
>   
I'm sorry but I don't see it. Although this is now obsolete, I'd still 
like to know which one is redundant and why.
>
> [...]
>
>   
>> +    STORE4(0x10,0x00,(%0),%%mm3,%%mm0,%%mm2,%%mm4)
>> +
>> +    LOAD4(0x10,0x08,%1,%%mm0,%%mm1,%%mm2,%%mm3)
>> +    TRANSPOSE4(%%mm0,%%mm1,%%mm2,%%mm3,%%mm4)
>> +    TRANSFORM_4X4_COL(%%mm0,%%mm3,%%mm4,%%mm2,%%mm1,%%mm5,%%mm6,%%mm7,%4)
>> +    STORE4(0x10,0x08,(%0),%%mm3,%%mm0,%%mm2,%%mm4)
>> +
>> +    "pxor %%mm7, %%mm7\n\t"
>> +    LOAD4(0x08,0x00,(%0),%%mm0,%%mm1,%%mm2,%%mm3)
>> +    LOAD_ADD_CLAMP_STORE_8X2(%2,%3)
>> +    "add     %3,    %2\n\t"
>> +    LOAD4(0x08,0x20,(%0),%%mm0,%%mm1,%%mm2,%%mm3)
>> +    LOAD_ADD_CLAMP_STORE_8X2(%2,%3)
>>     
>
> redundant store+load 
>
>
> [...]
ok

-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: dsputil_v2.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080703/30327ef0/attachment.asc>
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: vc1_inv_trans_mmx_v4.patch
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20080703/30327ef0/attachment.txt>



More information about the ffmpeg-devel mailing list