[FFmpeg-devel] [FFMpeg-Devel][GSoC][PATCH 3/6] postproc: Moved inline asm for packing QP to separate function

Wed Apr 22 22:27:28 CEST 2015

This patch contains the code for the avx2/sse2 versions of the new
function, but they are deliberately ignored, since the support for
avx2/sse2 isn't yet present (the next commit fixes this).

This is a temporary measure until full sse2/avx2 implementation is
complete, but it works with sse2/avx2 as inline asm. 

Moving this to a separate file would add overhead due to having to call a function.
If that is a reasonable trade-off for removing inline asm, then I can easily do that.
---
 libpostproc/postprocess_template.c | 61 +++++++++++++++++++++++++++++++-------
 1 file changed, 51 insertions(+), 10 deletions(-)

diff --git a/libpostproc/postprocess_template.c b/libpostproc/postprocess_template.c
index b7296c4..083be9d 100644
--- a/libpostproc/postprocess_template.c
+++ b/libpostproc/postprocess_template.c
@@ -3249,7 +3249,6 @@ static inline void RENAME(prefetchnta)(const void *p)
         : : "r" (p)
     );
 }
-
 static inline void RENAME(prefetcht0)(const void *p)
 {
     __asm__ volatile(   "prefetcht0 (%0)\n\t"
@@ -3305,6 +3304,55 @@ static inline void RENAME(prefetcht2)(const void *p)
     return;
 }
 #endif
+/*
+  Temporary: the inline asm should eventually be removed and moved to another
+  file (at some call overhead). Uses an explicit register to avoid the issues
+  with inline avx; M0/MOVA hide the simd width, as in the yasm code.
+  NOTE(review): c is passed by value, so the store to c.pQPb_block appears to
+  hit a copy the caller never sees -- confirm a PPContext pointer was intended.
+*/
+#if TEMPLATE_PP_MMX
+static inline void RENAME(packQP)(PPContext c)
+{
+#if 0 //TEMPLATE_PP_AVX2 goes here (currently disabled)
+    __asm__ volatile(
+        "vmovdqa (%1), %%ymm0\n\t" // load from c.QP_block
+        "vpermq $0, %%ymm0, %%ymm0 \n\t" // replicate the lowest quadword
+        "vpunpcklbw %%ymm0, %%ymm0, %%ymm0  \n\t" // low bytes doubled: ..., QP, QP
+        "vpunpcklwd %%ymm0, %%ymm0, %%ymm0  \n\t" // low words doubled: ..., QP x4
+        "vpunpckldq %%ymm0, %%ymm0, %%ymm0  \n\t" //QP,...,QP
+        "vpunpcklqdq %%ymm0, %%ymm0, %%ymm0  \n\t" //copy to upper quadword(s)
+        "vmovdqa %%ymm0, %0" // store to c.pQPb_block
+        : "=m" (c.pQPb_block)
+        : "r" (c.QP_block)
+        : "%ymm0"
+    );
+#else // non-AVX2 path: register and move instruction chosen by macros
+#if 0 //TEMPLATE_PP_SSE2 goes here (currently disabled)
+#define M0 "%xmm0"
+#define MOVA "movdqa"
+#else // plain MMX
+#define M0 "%mm0"
+#define MOVA "movq"
+#endif
+    __asm__ volatile(
+        MOVA" (%1), %"M0"\n\t" // load from c.QP_block
+        "punpcklbw %"M0", %"M0"  \n\t" // low bytes doubled: ..., QP, QP
+        "punpcklwd %"M0", %"M0"  \n\t" // low words doubled: ..., QP x4
+        "punpckldq %"M0", %"M0"  \n\t" //QP,...,QP
+#if 0 //TEMPLATE_PP_SSE2
+        "punpcklqdq %"M0", %"M0"  \n\t" //copy to upper quadword(s)
+#endif
+        MOVA" %"M0", %0" // store to c.pQPb_block
+        : "=m" (c.pQPb_block)
+        : "r" (c.QP_block)
+        : M0
+    );
+#undef M0
+#undef MOVA
+#endif // AVX2 / non-AVX2
+}
+#endif // TEMPLATE_PP_MMX
 /**
  * Filter array of bytes (Y or U or V values)
  */
@@ -3516,6 +3564,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
             for(qp_index=0; qp_index < (endx-startx)/BLOCK_SIZE; qp_index++){
                 QP = QPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
                 nonBQP = nonBQPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
+
             if(!isColor){
                 QP= (QP* QPCorrecture + 256*128)>>16;
                 nonBQP= (nonBQP* QPCorrecture + 256*128)>>16;
@@ -3524,15 +3573,7 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
             c.QP_block[qp_index] = QP;
             c.nonBQP_block[qp_index] = nonBQP;
 #if TEMPLATE_PP_MMX
-            __asm__ volatile(
-                "movd %1, %%mm7         \n\t"
-                "packuswb %%mm7, %%mm7  \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
-                "packuswb %%mm7, %%mm7  \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
-                "packuswb %%mm7, %%mm7  \n\t" // QP,..., QP
-                "movq %%mm7, %0         \n\t"
-                : "=m" (c.pQPb_block[qp_index])
-                : "r" (QP)
-            );
+            RENAME(packQP)(c);
 #endif
             }
           for(; x < endx; x+=BLOCK_SIZE){
-- 
2.3.5