[FFmpeg-devel] [FFMpeg-Devel] [PATCH 4/7] Replaced BLOCK_HEIGHT macro with block_height and block_width variables

Michael Niedermayer michaelni at gmx.at
Fri Mar 13 19:53:31 CET 2015


On Fri, Mar 13, 2015 at 02:15:11PM -0400, Tucker DiNapoli wrote:
> This change is to allow support for different sized blocks, which will
> be necessary for sse and avx. My plan is for the code to still act on
> 8x8 blocks, but to process multiple 8x8 blocks in parallel when using
> sse/avx.
> ---
>  libpostproc/postprocess.c          |  3 ---
>  libpostproc/postprocess_c.c        | 36 ++++++++++++++++++------------------
>  libpostproc/postprocess_internal.h | 17 ++++++++++++++++-
>  libpostproc/postprocess_template.c | 18 +++++++++---------
>  4 files changed, 43 insertions(+), 31 deletions(-)
> 
> diff --git a/libpostproc/postprocess.c b/libpostproc/postprocess.c
> index 2cdd988..3090869 100644
> --- a/libpostproc/postprocess.c
> +++ b/libpostproc/postprocess.c
> @@ -115,9 +115,6 @@ const char *postproc_license(void)
>  
>  #define GET_MODE_BUFFER_SIZE 500
>  #define OPTIONS_ARRAY_SIZE 10
> -#define BLOCK_SIZE 8
> -#define TEMP_STRIDE 8
> -//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
>  
>  #if ARCH_X86 && HAVE_INLINE_ASM
>  DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
> diff --git a/libpostproc/postprocess_c.c b/libpostproc/postprocess_c.c
> index 3d3b738..5660c64 100644
> --- a/libpostproc/postprocess_c.c
> +++ b/libpostproc/postprocess_c.c
> @@ -32,7 +32,7 @@ static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *
>      const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
>      const int dcThreshold= dcOffset*2 + 1;
>  
> -    for(y=0; y<BLOCK_SIZE; y++){
> +    for(y=0; y<block_height; y++){
>          numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
>          numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
>          numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
> @@ -56,7 +56,7 @@ static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c
>      const int dcThreshold= dcOffset*2 + 1;
>  
>      src+= stride*4; // src points to begin of the 8x8 Block
> -    for(y=0; y<BLOCK_SIZE-1; y++){
> +    for(y=0; y<block_height-1; y++){
>          numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
>          numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
>          numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
> @@ -90,7 +90,7 @@ static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
>  {
>      int x;
>      src+= stride*4;
> -    for(x=0; x<BLOCK_SIZE; x+=4){
> +    for(x=0; x<block_width; x+=4){
>          if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
>          if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
>          if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
> @@ -120,7 +120,7 @@ static inline int vertClassify_C(const uint8_t src[], int stride, const PPContex
>  static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
>  {
>      int y;
> -    for(y=0; y<BLOCK_SIZE; y++){
> +    for(y=0; y<block_height; y++){
>          const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
>  
>          if(FFABS(middleEnergy) < 8*c->QP){
> @@ -159,7 +159,7 @@ static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext
>  static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
>  {
>      int y;
> -    for(y=0; y<BLOCK_SIZE; y++){
> +    for(y=0; y<block_height; y++){
>          const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
>          const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
>  
> @@ -229,7 +229,7 @@ static inline void horizX1Filter(uint8_t *src, int stride, int QP)
>          }
>      }
>  
> -    for(y=0; y<BLOCK_SIZE; y++){
> +    for(y=0; y<block_height; y++){
>          int a= src[1] - src[2];
>          int b= src[3] - src[4];
>          int c= src[5] - src[6];
> @@ -392,7 +392,7 @@ static inline void doVertLowPass_C(uint8_t *src, int stride, PPContext *c)
>      const int l9= stride + l8;
>      int x;
>      src+= stride*3;
> -    for(x=0; x<BLOCK_SIZE; x++){
> +    for(x=0; x<block_width; x++){
>          const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
>          const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
>  
> @@ -443,7 +443,7 @@ static inline void vertX1Filter_C(uint8_t *src, int stride, PPContext *co)
>      int x;
>  
>      src+= stride*3;
> -    for(x=0; x<BLOCK_SIZE; x++){
> +    for(x=0; x<block_width; x++){
>          int a= src[l3] - src[l4];
>          int b= src[l4] - src[l5];
>          int c= src[l5] - src[l6];
> @@ -478,7 +478,7 @@ static inline void doVertDefFilter_C(uint8_t src[], int stride, PPContext *c)
>  //    const int l9= stride + l8;
>      int x;
>      src+= stride*3;
> -    for(x=0; x<BLOCK_SIZE; x++){
> +    for(x=0; x<block_width; x++){
>          const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
>          if(FFABS(middleEnergy) < 8*c->QP){
>              const int q=(src[l4] - src[l5])/2;
> @@ -881,13 +881,13 @@ static inline void blockCopy_C(uint8_t dst[], int dstStride, const uint8_t src[]
>  {
>      int i;
>      if(levelFix){
> -    for(i=0; i<8; i++)
> +    for(i=0; i<block_height; i++)
>          memcpy( &(dst[dstStride*i]),
> -                &(src[srcStride*i]), BLOCK_SIZE);
> +                &(src[srcStride*i]), block_width);
>      }else{
> -    for(i=0; i<8; i++)
> +    for(i=0; i<block_height; i++)
>          memcpy( &(dst[dstStride*i]),
> -                &(src[srcStride*i]), BLOCK_SIZE);
> +                &(src[srcStride*i]), block_width);
>      }
>  }
>  
> @@ -908,7 +908,7 @@ static inline void duplicate_C(uint8_t src[], int stride)
>   * Filter array of bytes (Y or U or V values)
>   */
>  static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
> -                                const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
> +                          const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
>  {
>      DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
>      int x,y;
> @@ -999,7 +999,7 @@ static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int
>      }
>  
>      /* copy & deinterlace first row of blocks */
> -    y=-BLOCK_SIZE;
> +    y=-block_height;
>      {
>          const uint8_t *srcBlock= &(src[y*srcStride]);
>          uint8_t *dstBlock= tempDst + dstStride;
> @@ -1007,7 +1007,7 @@ static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int
>          // From this point on it is guaranteed that we can read and write 16 lines downward
>          // finish 1 block before the next otherwise we might have a problem
>          // with the L1 Cache of the P4 ... or only a few blocks at a time or something
> -        for(x=0; x<width; x+=BLOCK_SIZE){
> +        for(x=0; x<width; x+=block_width){
>  
>  
>              blockCopy_C(dstBlock + dstStride*8, dstStride,
> @@ -1043,7 +1043,7 @@ static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int
>          }
>      }
>  
> -    for(y=0; y<height; y+=BLOCK_SIZE){
> +    for(y=0; y<height; y+=block_height){
>          //1% speedup if these are here instead of the inner loop
>          const uint8_t *srcBlock= &(src[y*srcStride]);
>          uint8_t *dstBlock= &(dst[y*dstStride]);
> @@ -1077,7 +1077,7 @@ static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int
>          // From this point on it is guaranteed that we can read and write 16 lines downward
>          // finish 1 block before the next otherwise we might have a problem
>          // with the L1 Cache of the P4 ... or only a few blocks at a time or something
> -        for(x=0; x<width; x+=BLOCK_SIZE){
> +        for(x=0; x<width; x+=block_width){
>              const int stride= dstStride;
>              if(isColor){
>                  QP= QPptr[x>>qpHShift];
> diff --git a/libpostproc/postprocess_internal.h b/libpostproc/postprocess_internal.h
> index 1ebd974..5a7be1f 100644
> --- a/libpostproc/postprocess_internal.h
> +++ b/libpostproc/postprocess_internal.h
> @@ -174,5 +174,20 @@ static inline void linecpy(void *dest, const void *src, int lines, int stride) {
>          memcpy((uint8_t*)dest+(lines-1)*stride, (const uint8_t*)src+(lines-1)*stride, -lines*stride);
>      }
>  }
> -
> +/*
> +   Currently blocks are always 8xN bytes, where N is determined by the size of
> +   the simd registers being used
> +*/
> +static const int block_height = 8;
> +#if ARCH_X86 && !CONFIG_RUNTIME_CPUDETECT
> +#if HAVE_AVX2
> +static const int block_width = 32;
> +#elif HAVE_SSE2
> +static const int block_width = 16;
> +#else
> +static const int block_width = 8;
> +#endif
> +#else
> +static int block_width; //determined at runtime
> +#endif
>  #endif /* POSTPROC_POSTPROCESS_INTERNAL_H */

non constant globals are not allowed, there can be multiple instances
of the postprocess library and each might in theory use a different
set of cpu flags and optimizations

also this does not work
all tests fail:
TEST    filter-pp
--- ./tests/ref/fate/filter-pp  2015-03-13 19:34:25.103761168 +0100
+++ tests/data/fate/filter-pp   2015-03-13 19:49:45.559780559 +0100
@@ -1 +1 @@
-pp                  3730f1ed7bf244ce059d110e21f39bbd
+pp
\ No newline at end of file
Test filter-pp failed. Look at tests/data/fate/filter-pp.err for details.
make: *** [fate-filter-pp] Error 139
TEST    filter-pp1
--- ./tests/ref/fate/filter-pp1 2015-03-13 19:34:25.103761168 +0100
+++ tests/data/fate/filter-pp1  2015-03-13 19:49:45.647780561 +0100
@@ -1 +1 @@
-pp1                 cb9f884e27a5be11f72afc9b517efd10
+pp1
\ No newline at end of file
Test filter-pp1 failed. Look at tests/data/fate/filter-pp1.err for details.
make: *** [fate-filter-pp1] Error 139
TEST    filter-pp2
--- ./tests/ref/fate/filter-pp2 2015-03-13 19:34:25.103761168 +0100
+++ tests/data/fate/filter-pp2  2015-03-13 19:49:45.731780563 +0100
@@ -1 +1 @@
-pp2                 975d4c1d489e42ddbd4a27464e8355af
+pp2
\ No newline at end of file
Test filter-pp2 failed. Look at tests/data/fate/filter-pp2.err for details.
make: *** [fate-filter-pp2] Error 139
TEST    filter-pp3
--- ./tests/ref/fate/filter-pp3 2015-03-13 19:34:25.103761168 +0100
+++ tests/data/fate/filter-pp3  2015-03-13 19:49:45.811780565 +0100
@@ -1 +1 @@
-pp3                 ef0f10f1859af2f75717e8c9d64ee38a
+pp3
\ No newline at end of file
Test filter-pp3 failed. Look at tests/data/fate/filter-pp3.err for details.
make: *** [fate-filter-pp3] Error 139
TEST    filter-pp4
--- ./tests/ref/fate/filter-pp4 2015-03-13 19:34:25.103761168 +0100
+++ tests/data/fate/filter-pp4  2015-03-13 19:49:45.883780566 +0100
@@ -1 +1 @@
-pp4                 0a2895c619ab9c6c22fd7cffb25070a8
+pp4
\ No newline at end of file
Test filter-pp4 failed. Look at tests/data/fate/filter-pp4.err for details.
make: *** [fate-filter-pp4] Error 139
TEST    filter-pp5
--- ./tests/ref/fate/filter-pp5 2015-03-13 19:34:25.103761168 +0100
+++ tests/data/fate/filter-pp5  2015-03-13 19:49:45.963780568 +0100
@@ -1 +1 @@
-pp5                 5fc6703d42bd98942e5dd104ce220291
+pp5
\ No newline at end of file
Test filter-pp5 failed. Look at tests/data/fate/filter-pp5.err for details.
make: *** [fate-filter-pp5] Error 139
TEST    filter-pp6
--- ./tests/ref/fate/filter-pp6 2015-03-13 19:34:25.103761168 +0100
+++ tests/data/fate/filter-pp6  2015-03-13 19:49:46.035780569 +0100
@@ -1 +1 @@
-pp6                 93b508d07bfcf4703aa7dff2d2ef5c03
+pp6
\ No newline at end of file
Test filter-pp6 failed. Look at tests/data/fate/filter-pp6.err for details.
make: *** [fate-filter-pp6] Error 139

and i do not understand how this could work, the QP parameter is at
a granularity of 16x16 in the luma plane and 8x8 in the chroma planes
the existing code uses one QP or 2 adjacent ones while processing a
8x8 block. If you increase the block size there will be more QP
parameters to consider

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

If a bugfix only changes things apparently unrelated to the bug with no
further explanation, that is a good sign that the bugfix is wrong.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20150313/32616914/attachment.asc>


More information about the ffmpeg-devel mailing list