[FFmpeg-devel] [PATCH 06/11] avcodec/mips: loongson optimize hpeldsp with mmi v1

周晓勇 ipfootball at 126.com
Tue May 17 08:51:50 CEST 2016


avcodec/mips: loongson optimize hpeldsp with mmi v1
1. The code is compatible with the O32 ABI.
2. Use uld and mtc1 to work around the gslwlc1 bug on the 3A2000 CPU (a bug in the gslwlc1 extension instruction under the O32 ABI).






在 2016-05-13 18:05:07,"周晓勇" <ipfootball at 126.com> 写道:

From 8212b9b5beecb6e2ba3f05a2a4c7f1704220c911 Mon Sep 17 00:00:00 2001
From: Zhou Xiaoyong <zhouxiaoyong at loongson.cn>
Date: Thu, 12 May 2016 01:59:03 +0800
Subject: [PATCH 06/11] avcodec/mips: loongson optimize hpeldsp with mmi v1


---
 libavcodec/mips/Makefile            |    1 +
 libavcodec/mips/hpeldsp_init_mips.c |   49 ++
 libavcodec/mips/hpeldsp_mips.h      |   87 +++
 libavcodec/mips/hpeldsp_mmi.c       | 1257 +++++++++++++++++++++++++++++++++++
 4 files changed, 1394 insertions(+)


diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index f66017a..3c43600 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -77,3 +77,4 @@ MMI-OBJS-$(CONFIG_MPEG4_DECODER)          += mips/xvid_idct_mmi.o
 MMI-OBJS-$(CONFIG_BLOCKDSP)               += mips/blockdsp_mmi.o
 MMI-OBJS-$(CONFIG_PIXBLOCKDSP)            += mips/pixblockdsp_mmi.o
 MMI-OBJS-$(CONFIG_H264QPEL)               += mips/h264qpel_mmi.o
+MMI-OBJS-$(CONFIG_HPELDSP)                += mips/hpeldsp_mmi.o
diff --git a/libavcodec/mips/hpeldsp_init_mips.c b/libavcodec/mips/hpeldsp_init_mips.c
index 82f2310..363a045 100644
--- a/libavcodec/mips/hpeldsp_init_mips.c
+++ b/libavcodec/mips/hpeldsp_init_mips.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar at imgtec.com)
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong at loongson.cn>
  *
  * This file is part of FFmpeg.
  *
@@ -65,9 +66,57 @@ static void ff_hpeldsp_init_msa(HpelDSPContext *c, int flags)
 }
 #endif  // #if HAVE_MSA
 
+#if HAVE_MMI
+static void ff_hpeldsp_init_mmi(HpelDSPContext *c, int flags)
+{   // Points entries of c's pixel-op tables at the Loongson MMI routines; flags is not read in this body.
+    c->put_pixels_tab[0][0] = ff_put_pixels16_8_mmi;       // first index 0: the pixels16 routines
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_8_mmi;    // second index 1: the _x2 routine
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_8_mmi;    // second index 2: the _y2 routine
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_8_mmi;   // second index 3: the _xy2 routine
+
+    c->put_pixels_tab[1][0] = ff_put_pixels8_8_mmi;        // first index 1: the pixels8 routines
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_8_mmi;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_8_mmi;
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_8_mmi;
+
+    c->put_pixels_tab[2][0] = ff_put_pixels4_8_mmi;        // first index 2: the pixels4 routines
+    c->put_pixels_tab[2][1] = ff_put_pixels4_x2_8_mmi;
+    c->put_pixels_tab[2][2] = ff_put_pixels4_y2_8_mmi;
+    c->put_pixels_tab[2][3] = ff_put_pixels4_xy2_8_mmi;
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_8_mmi;   // slot 0 reuses the plain put routine; no separate no_rnd copy is assigned
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_8_mmi;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_8_mmi;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_8_mmi;
+
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_8_mmi;    // same reuse of the plain put routine for the pixels8 row
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_8_mmi;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_8_mmi;
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_8_mmi;   // no put_no_rnd_pixels_tab[2][*] entry is assigned in this function
+
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_8_mmi;       // avg table uses the same index layout as put_pixels_tab above
+    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_8_mmi;
+    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_8_mmi;
+    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_8_mmi;
+
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_8_mmi;
+    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_8_mmi;
+    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_8_mmi;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_8_mmi;
+
+    c->avg_pixels_tab[2][0] = ff_avg_pixels4_8_mmi;
+    c->avg_pixels_tab[2][1] = ff_avg_pixels4_x2_8_mmi;
+    c->avg_pixels_tab[2][2] = ff_avg_pixels4_y2_8_mmi;
+    c->avg_pixels_tab[2][3] = ff_avg_pixels4_xy2_8_mmi;
+}
+#endif  // #if HAVE_MMI
+
 void ff_hpeldsp_init_mips(HpelDSPContext *c, int flags)
 {
 #if HAVE_MSA
     ff_hpeldsp_init_msa(c, flags);
 #endif  // #if HAVE_MSA
+#if HAVE_MMI
+    ff_hpeldsp_init_mmi(c, flags);  // called after the MSA init when both are enabled, so the entries it assigns are the ones left in c
+#endif  // #if HAVE_MMI
 }
diff --git a/libavcodec/mips/hpeldsp_mips.h b/libavcodec/mips/hpeldsp_mips.h
index f4ab53e..f527c1d 100644
--- a/libavcodec/mips/hpeldsp_mips.h
+++ b/libavcodec/mips/hpeldsp_mips.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar at imgtec.com)
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong at loongson.cn>
  *
  * This file is part of FFmpeg.
  *
@@ -84,4 +85,90 @@ void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
 void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int32_t h);
 
+void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,   // _l2 prototypes: two sources (src1, src2), each with its own stride, plus a destination stride
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_put_no_rnd_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h);
+
+void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,   // single-source prototypes: (block, pixels, line_size, h)
+    ptrdiff_t line_size, int32_t h);                                // NOTE(review): the definitions in hpeldsp_mmi.c spell h as plain int
+void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,   // no_rnd prototypes exist only for the _x2/_y2/_xy2 variants
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int32_t h);
+
 #endif  // #ifndef AVCODEC_MIPS_HPELDSP_MIPS_H
diff --git a/libavcodec/mips/hpeldsp_mmi.c b/libavcodec/mips/hpeldsp_mmi.c
new file mode 100644
index 0000000..4c46f00
--- /dev/null
+++ b/libavcodec/mips/hpeldsp_mmi.c
@@ -0,0 +1,1257 @@
+/*
+ * Loongson SIMD optimized hpeldsp
+ *
+ * Copyright (c) 2016 Loongson Technology Corporation Limited
+ * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "hpeldsp_mips.h"
+#include "libavcodec/bit_depth_template.c"
+#include "libavutil/mips/asmdefs.h"
+#include "constants.h"
+
+void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* Copies rows of 4 bytes from pixels to block; both pointers advance by line_size per row. */
+    double ftmp[2];     /* bound to FP registers ("f" constraints) that carry the row data */
+    mips_reg addr[2];   /* addr[0]: address of the second row of a pair; addr[1]: 2 * line_size */
+    int32_t low32;      /* GPR staging word: ulw fills it, mtc1 moves it into an FP register */
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t" /* addr1 = 2 * line_size, computed once */
+        "1:                                                             \n\t" /* loop head: each pass handles 4 rows */
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t" /* addr0 = start of row 1 */
+        "ulw        %[low32],   0x00(%[pixels])                         \n\t" /* unaligned 4-byte load of row 0; a wider load would read past the row */
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr0])                          \n\t" /* unaligned 4-byte load of row 1 */
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "swc1       %[ftmp0],   0x00(%[block])                          \n\t" /* store row 0; NOTE(review): swc1 presumably needs block 4-byte aligned - confirm */
+        "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t" /* store row 1 at block + line_size */
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t" /* step both pointers down two rows */
+        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t" /* rows 2 and 3: same sequence repeated */
+        "ulw        %[low32],   0x00(%[pixels])                         \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr0])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "swc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t" /* four rows done */
+        "bnez       %[h],       1b                                      \n\t" /* exits only at h == 0, so h must be a positive multiple of 4 */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [low32]"=&r"(low32),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels), /* pointers and h are updated in place by the asm */
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory" /* the asm writes through block */
+    );
+}
+
+void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* Copies rows of 8 bytes from pixels to block; both pointers advance by line_size per row. */
+    double ftmp[2];     /* bound to FP registers ("f" constraints) that carry the row data */
+    mips_reg addr[2];   /* addr[0]: address of the second row of a pair; addr[1]: 2 * line_size */
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t" /* addr1 = 2 * line_size, computed once */
+        "1:                                                             \n\t" /* loop head: each pass handles 4 rows */
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t" /* gsldlc1 + gsldrc1 pair: unaligned 8-byte load of row 0 */
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t" /* addr0 = start of row 1 */
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t" /* unaligned 8-byte load of row 1 */
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t" /* store row 0; NOTE(review): sdc1 presumably needs block 8-byte aligned - confirm */
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t" /* store row 1 at block + line_size */
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t" /* step both pointers down two rows */
+        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t" /* rows 2 and 3: same sequence repeated */
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t" /* four rows done */
+        "bnez       %[h],       1b                                      \n\t" /* exits only at h == 0, so h must be a positive multiple of 4 */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels), /* pointers and h are updated in place by the asm */
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory" /* the asm writes through block */
+    );
+}
+
+void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* Copies rows of 16 bytes from pixels to block; both pointers advance by line_size per row. */
+    double ftmp[4];     /* FP registers: ftmp0/ftmp2 hold the two halves of one row, ftmp1/ftmp3 those of the next */
+    mips_reg addr[2];   /* addr[0]: address of the second row of a pair; addr[1]: 2 * line_size */
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t" /* addr1 = 2 * line_size, computed once */
+        "1:                                                             \n\t" /* loop head: each pass handles 4 rows */
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t" /* gsldlc1 + gsldrc1 pair: unaligned load of row 0, bytes 0..7 */
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t" /* addr0 = start of row 1 */
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp2],   0x0f(%[pixels])                         \n\t" /* row 0, bytes 8..15 */
+        "gsldrc1    %[ftmp2],   0x08(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t" /* row 1, bytes 0..7 */
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x0f(%[addr0])                          \n\t" /* row 1, bytes 8..15 */
+        "gsldrc1    %[ftmp3],   0x08(%[addr0])                          \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t" /* store row 0; NOTE(review): sdc1 presumably needs block 8-byte aligned - confirm */
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t" /* store row 1 (low half) at block + line_size */
+        "sdc1       %[ftmp2],   0x08(%[block])                          \n\t"
+        "gssdxc1    %[ftmp3],   0x08(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t" /* step both pointers down two rows */
+        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t" /* rows 2 and 3: same sequence repeated */
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp2],   0x0f(%[pixels])                         \n\t"
+        "gsldrc1    %[ftmp2],   0x08(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x0f(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp3],   0x08(%[addr0])                          \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        "sdc1       %[ftmp2],   0x08(%[block])                          \n\t"
+        "gssdxc1    %[ftmp3],   0x08(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t" /* four rows done */
+        "bnez       %[h],       1b                                      \n\t" /* exits only at h == 0, so h must be a positive multiple of 4 */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels), /* pointers and h are updated in place by the asm */
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory" /* the asm writes through block */
+    );
+}
+
+void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* Averages rows of 4 bytes from pixels with the rows already in block and writes the result back to block. */
+    double ftmp[4];     /* FP registers: ftmp0/ftmp1 hold source rows, ftmp2/ftmp3 the matching destination rows */
+    mips_reg addr[3];   /* addr[0]: pixels + line_size; addr[1]: block + line_size; addr[2]: 2 * line_size */
+    int32_t low32;      /* GPR staging word: ulw fills it, mtc1 moves it into an FP register */
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t" /* addr2 = 2 * line_size, computed once */
+        "1:                                                             \n\t" /* loop head: each pass handles 4 rows */
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t" /* addr0 = start of source row 1 */
+        "ulw        %[low32],   0x00(%[pixels])                         \n\t" /* unaligned 4-byte load of source row 0; a wider load would read past the row */
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr0])                          \n\t" /* source row 1 */
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t" /* addr1 = start of destination row 1 */
+        "ulw        %[low32],   0x00(%[block])                          \n\t" /* current destination row 0 */
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr1])                          \n\t" /* current destination row 1 */
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t" /* byte-wise average of source and destination, row 0 */
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t" /* same for row 1 */
+        "swc1       %[ftmp0],   0x00(%[block])                          \n\t" /* only the low 4 bytes of each result are stored */
+        "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t" /* step both pointers down two rows */
+        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
+
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t" /* rows 2 and 3: same sequence repeated */
+        "ulw        %[low32],   0x00(%[pixels])                         \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr0])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
+        "ulw        %[low32],   0x00(%[block])                          \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "ulw        %[low32],   0x00(%[addr1])                          \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "swc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t" /* four rows done */
+        "bnez       %[h],       1b                                      \n\t" /* exits only at h == 0, so h must be a positive multiple of 4 */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),
+          [low32]"=&r"(low32),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels), /* pointers and h are updated in place by the asm */
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory" /* the asm reads and writes through block */
+    );
+}
+
+void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* Averages rows of 8 bytes from pixels with the rows already in block and writes the result back to block. */
+    double ftmp[4];     /* FP registers: ftmp0/ftmp1 hold source rows, ftmp2/ftmp3 the matching destination rows */
+    mips_reg addr[3];   /* addr[0]: pixels + line_size; addr[1]: block + line_size; addr[2]: 2 * line_size */
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t" /* addr2 = 2 * line_size, computed once */
+        "1:                                                             \n\t" /* loop head: each pass handles 4 rows */
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t" /* gsldlc1 + gsldrc1 pair: unaligned 8-byte load of source row 0 */
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t" /* addr0 = start of source row 1 */
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t" /* source row 1 */
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t" /* addr1 = start of destination row 1 */
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t" /* current destination row 0 */
+        "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t" /* current destination row 1 */
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t" /* byte-wise average of source and destination, row 0 */
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t" /* same for row 1 */
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t" /* store row 0; NOTE(review): sdc1 presumably needs block 8-byte aligned - confirm */
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t" /* store row 1 at block + line_size */
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t" /* step both pointers down two rows */
+        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
+
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t" /* rows 2 and 3: same sequence repeated */
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t" /* four rows done */
+        "bnez       %[h],       1b                                      \n\t" /* exits only at h == 0, so h must be a positive multiple of 4 */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels), /* pointers and h are updated in place by the asm */
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory" /* the asm reads and writes through block */
+    );
+}
+
+void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* Averages rows of 16 bytes from pixels with the rows already in block and writes the result back to block. */
+    double ftmp[8];     /* FP registers: ftmp0/4 and ftmp1/5 hold two source rows (low/high half), ftmp2/6 and ftmp3/7 the destination rows */
+    mips_reg addr[3];   /* addr[0]: pixels + line_size; addr[1]: block + line_size; addr[2]: 2 * line_size */
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t" /* addr2 = 2 * line_size, computed once */
+        "1:                                                             \n\t" /* loop head: each pass handles 4 rows */
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t" /* gsldlc1 + gsldrc1 pair: unaligned load of source row 0, bytes 0..7 */
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t" /* addr0 = start of source row 1 */
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp4],   0x0f(%[pixels])                         \n\t" /* source row 0, bytes 8..15 */
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t" /* addr1 = start of destination row 1 */
+        "gsldrc1    %[ftmp4],   0x08(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t" /* source row 1, bytes 0..7 */
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t" /* source row 1, bytes 8..15 */
+        "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t" /* current destination row 0, bytes 0..7 */
+        "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
+        "gsldlc1    %[ftmp6],   0x0f(%[block])                          \n\t" /* current destination row 0, bytes 8..15 */
+        "gsldrc1    %[ftmp6],   0x08(%[block])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t" /* current destination row 1, bytes 0..7 */
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t" /* current destination row 1, bytes 8..15 */
+        "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t" /* byte-wise average of source and destination halves */
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t" /* store row 0; NOTE(review): sdc1 presumably needs block 8-byte aligned - confirm */
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t" /* store row 1 (low half) at block + line_size */
+        "sdc1       %[ftmp4],   0x08(%[block])                          \n\t"
+        "gssdxc1    %[ftmp5],   0x08(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t" /* step both pointers down two rows */
+        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
+
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t" /* rows 2 and 3: same sequence repeated */
+        PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp4],   0x0f(%[pixels])                         \n\t"
+        PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
+        "gsldrc1    %[ftmp4],   0x08(%[pixels])                         \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
+        "gsldlc1    %[ftmp6],   0x0f(%[block])                          \n\t"
+        "gsldrc1    %[ftmp6],   0x08(%[block])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
+        "sdc1       %[ftmp4],   0x08(%[block])                          \n\t"
+        "gssdxc1    %[ftmp5],   0x08(%[block],  %[line_size])           \n\t"
+        PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
+        PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t" /* four rows done */
+        "bnez       %[h],       1b                                      \n\t" /* exits only at h == 0, so h must be a positive multiple of 4 */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),
+          [block]"+&r"(block),              [pixels]"+&r"(pixels), /* pointers and h are updated in place by the asm */
+          [h]"+&r"(h)
+        : [line_size]"r"((mips_reg)line_size)
+        : "memory" /* the asm reads and writes through block */
+    );
+}
+
+inline void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{   /*
+     * Combine two 4-byte-wide sources with pavgb (packed byte average) and
+     * store the result in dst; each buffer is walked with its own stride.
+     *
+     * dst          - destination, 4 bytes written per row (swc1 / gsswxc1)
+     * src1, src2   - the two sources that are averaged byte by byte
+     * dst_stride, src_stride1, src_stride2 - byte distance between rows
+     * h            - number of rows; one loop pass handles 4 rows and the
+     *                loop only exits when h reaches exactly 0, so h has to
+     *                be a positive multiple of 4
+     *
+     * addr2/addr3/addr4 hold the doubled strides used to step two rows at a
+     * time; addr0/addr1 point at the second row of each source pair.
+     *
+     * NOTE(review): uld is a doubleword load and only the low word is moved
+     * to the FPU with mtc1, so every source read presumably touches 8 bytes
+     * although 4 are used - confirm the over-read is safe for all callers.
+     */
+    double ftmp[4];
+    mips_reg addr[5];
+    uint64_t low32;
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+        "1:                                                             \n\t"  /* rows 0-1 of this pass */
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "uld        %[low32],   0x00(%[src1])                           \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "uld        %[low32],   0x00(%[addr0])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "uld        %[low32],   0x00(%[src2])                           \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "uld        %[low32],   0x00(%[addr1])                          \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"  /* rows 2-3 of this pass */
+        "uld        %[low32],   0x00(%[src1])                           \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "uld        %[low32],   0x00(%[addr0])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "uld        %[low32],   0x00(%[src2])                           \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "uld        %[low32],   0x00(%[addr1])                          \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"  /* 4 rows consumed */
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),
+          [low32]"=&r"(low32),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"
+    );
+}
+
+inline void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{   /*
+     * Combine two 8-byte-wide sources with pavgb (packed byte average) and
+     * store the result in dst; each buffer is walked with its own stride.
+     *
+     * dst          - destination, 8 bytes written per row (sdc1 / gssdxc1)
+     * src1, src2   - the two sources that are averaged byte by byte; each
+     *                row is fetched with a gsldlc1/gsldrc1 pair at offsets
+     *                7 and 0
+     * dst_stride, src_stride1, src_stride2 - byte distance between rows
+     * h            - number of rows; one loop pass handles 4 rows and the
+     *                loop only exits when h reaches exactly 0, so h has to
+     *                be a positive multiple of 4
+     *
+     * addr2/addr3/addr4 hold the doubled strides used to step two rows at a
+     * time; addr0/addr1 point at the second row of each source pair.
+     */
+    double ftmp[4];
+    mips_reg addr[5];
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+        "1:                                                             \n\t"  /* rows 0-1 of this pass */
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"  /* rows 2-3 of this pass */
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"  /* 4 rows consumed */
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"
+    );
+}
+
+inline void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{   /*
+     * Combine two 16-byte-wide sources with pavgb (packed byte average) and
+     * store the result in dst; each buffer is walked with its own stride.
+     *
+     * Every row is handled as two 8-byte halves inside the same pass:
+     * ftmp0-ftmp3 carry bytes 0-7 and ftmp4-ftmp7 carry bytes 8-15 (loads at
+     * offsets 0x08/0x0f, stores at offset 0x08).
+     *
+     * dst          - destination, 16 bytes written per row
+     * src1, src2   - the two sources that are averaged byte by byte
+     * dst_stride, src_stride1, src_stride2 - byte distance between rows
+     * h            - number of rows; one loop pass handles 4 rows and the
+     *                loop only exits when h reaches exactly 0, so h has to
+     *                be a positive multiple of 4
+     *
+     * addr2/addr3/addr4 hold the doubled strides used to step two rows at a
+     * time; addr0/addr1 point at the second row of each source pair.
+     */
+    double ftmp[8];
+    mips_reg addr[5];
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+        "1:                                                             \n\t"  /* rows 0-1 of this pass */
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp4],   0x0f(%[src1])                           \n\t"
+        "gsldrc1    %[ftmp4],   0x08(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp6],   0x0f(%[src2])                           \n\t"
+        "gsldrc1    %[ftmp6],   0x08(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        "sdc1       %[ftmp4],   0x08(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp5],   0x08(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"  /* rows 2-3 of this pass */
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp4],   0x0f(%[src1])                           \n\t"
+        "gsldrc1    %[ftmp4],   0x08(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp6],   0x0f(%[src2])                           \n\t"
+        "gsldrc1    %[ftmp6],   0x08(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        "sdc1       %[ftmp4],   0x08(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp5],   0x08(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"  /* 4 rows consumed */
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"
+    );
+}
+
+inline void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{   /*
+     * Two-stage 4-byte-wide average: src1 and src2 are combined with pavgb,
+     * then that result is combined with the bytes already in dst (second
+     * pavgb) and written back to dst.
+     *
+     * dst          - destination, read and then rewritten, 4 bytes per row
+     * src1, src2   - the two sources that are averaged byte by byte
+     * dst_stride, src_stride1, src_stride2 - byte distance between rows
+     * h            - number of rows; one loop pass handles 4 rows and the
+     *                loop only exits when h reaches exactly 0, so h has to
+     *                be a positive multiple of 4
+     *
+     * addr2/addr3/addr4 hold the doubled strides; addr0/addr1/addr5 point at
+     * the second row of src1/src2/dst within a row pair.
+     *
+     * NOTE(review): uld is a doubleword load and only the low word is moved
+     * to the FPU with mtc1, so every read (including the dst reads)
+     * presumably touches 8 bytes although 4 are used - confirm the
+     * over-read is safe for all callers.
+     */
+    double ftmp[6];
+    mips_reg addr[6];
+    uint64_t low32;
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+        "1:                                                             \n\t"  /* rows 0-1 of this pass */
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "uld        %[low32],   0x00(%[src1])                           \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "uld        %[low32],   0x00(%[addr0])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "uld        %[low32],   0x00(%[src2])                           \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "uld        %[low32],   0x00(%[addr1])                          \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
+        "uld        %[low32],   0x00(%[dst])                            \n\t"
+        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        "uld        %[low32],   0x00(%[addr5])                          \n\t"
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"  /* rows 2-3 of this pass */
+        "uld        %[low32],   0x00(%[src1])                           \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        "uld        %[low32],   0x00(%[addr0])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "uld        %[low32],   0x00(%[src2])                           \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "uld        %[low32],   0x00(%[addr1])                          \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
+        "uld        %[low32],   0x00(%[dst])                            \n\t"
+        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        "uld        %[low32],   0x00(%[addr5])                          \n\t"
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"  /* 4 rows consumed */
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [low32]"=&r"(low32),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"
+    );
+}
+
+inline void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{   /*
+     * Two-stage 8-byte-wide average: src1 and src2 are combined with pavgb,
+     * then that result is combined with the bytes already in dst (second
+     * pavgb) and written back to dst.
+     *
+     * dst          - destination, read and then rewritten, 8 bytes per row
+     * src1, src2   - the two sources that are averaged byte by byte
+     * dst_stride, src_stride1, src_stride2 - byte distance between rows
+     * h            - number of rows; one loop pass handles 4 rows and the
+     *                loop only exits when h reaches exactly 0, so h has to
+     *                be a positive multiple of 4
+     *
+     * addr2/addr3/addr4 hold the doubled strides; addr0/addr1/addr5 point at
+     * the second row of src1/src2/dst within a row pair.
+     */
+    double ftmp[6];
+    mips_reg addr[6];
+
+    __asm__ volatile (
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+        "1:                                                             \n\t"  /* rows 0-1 of this pass */
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
+        "gsldlc1    %[ftmp4],   0x07(%[dst])                            \n\t"
+        "gsldrc1    %[ftmp4],   0x00(%[dst])                            \n\t"
+        "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"  /* rows 2-3 of this pass */
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
+        "gsldlc1    %[ftmp4],   0x07(%[dst])                            \n\t"
+        "gsldrc1    %[ftmp4],   0x00(%[dst])                            \n\t"
+        "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"  /* 4 rows consumed */
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"
+    );
+}
+
+inline void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{   /* 16-byte-wide variant built from two independent 8-byte-wide calls:
+     * first bytes 0-7 of every row, then bytes 8-15 (all three pointers
+     * advanced by 8). Strides and h are passed through unchanged. */
+    ff_avg_pixels8_l2_8_mmi(dst, src1, src2, dst_stride, src_stride1,
+            src_stride2, h);
+    ff_avg_pixels8_l2_8_mmi(dst + 8, src1 + 8, src2 + 8, dst_stride,
+            src_stride1, src_stride2, h);
+}
+
+void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 4-byte-wide put of the average between each row of pixels and the
+     * same row shifted by one byte (pixels + 1). line_size is used as the
+     * stride of all three buffers and is narrowed from ptrdiff_t to the
+     * helper's int stride parameters. */
+    ff_put_pixels4_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
+            line_size, h);
+}
+
+void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 8-byte-wide put of the average between each row of pixels and the
+     * same row shifted by one byte (pixels + 1). line_size is used as the
+     * stride of all three buffers and is narrowed from ptrdiff_t to the
+     * helper's int stride parameters. */
+    ff_put_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
+            line_size, h);
+}
+
+void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 16-byte-wide put of the average between each row of pixels and the
+     * same row shifted by one byte (pixels + 1). line_size is used as the
+     * stride of all three buffers and is narrowed from ptrdiff_t to the
+     * helper's int stride parameters. */
+    ff_put_pixels16_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
+            line_size, h);
+}
+
+void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 4-byte-wide: the average of pixels and pixels + 1 is further averaged
+     * with the current contents of block by the avg helper. line_size is
+     * used as the stride of all three buffers and is narrowed from
+     * ptrdiff_t to the helper's int stride parameters. */
+    ff_avg_pixels4_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
+            line_size, h);
+}
+
+void ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 8-byte-wide: the average of pixels and pixels + 1 is further averaged
+     * with the current contents of block by the avg helper. line_size is
+     * used as the stride of all three buffers and is narrowed from
+     * ptrdiff_t to the helper's int stride parameters. */
+    ff_avg_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size, line_size,
+            line_size, h);
+}
+
+void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 16-byte-wide variant built from two 8-byte-wide x2 calls: bytes 0-7
+     * of every row first, then bytes 8-15. */
+    ff_avg_pixels8_x2_8_mmi(block, pixels, line_size, h);
+    ff_avg_pixels8_x2_8_mmi(block + 8, pixels + 8, line_size, h);
+}
+
+inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
+    int h)
+{   /*
+     * 8-byte-wide average of src1 and src2 computed on bit-complemented
+     * data: every input is XORed with an all-ones mask, the complemented
+     * values are combined with pavgb, and the result is XORed with the mask
+     * again before being stored to dst.
+     * NOTE(review): this presumably flips pavgb's rounding direction, which
+     * would match the "no_rnd" in the name - confirm against the pavgb
+     * definition.
+     *
+     * dst          - destination, 8 bytes written per row (sdc1 / gssdxc1)
+     * src1, src2   - the two sources that are averaged byte by byte
+     * dst_stride, src_stride1, src_stride2 - byte distance between rows
+     * h            - number of rows; one loop pass handles 4 rows and the
+     *                loop only exits when h reaches exactly 0, so h has to
+     *                be a positive multiple of 4
+     *
+     * addr2/addr3/addr4 hold the doubled strides used to step two rows at a
+     * time; addr0/addr1 point at the second row of each source pair.
+     */
+    double ftmp[5];
+    mips_reg addr[5];
+
+    __asm__ volatile (
+        "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"  /* register compared with itself: builds the XOR mask */
+        PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
+        PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
+        PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
+        "1:                                                             \n\t"  /* rows 0-1 of this pass */
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"  /* rows 2-3 of this pass */
+        PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
+        PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
+        PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
+        "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
+        PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
+
+        PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"  /* 4 rows consumed */
+        "bnez       %[h],       1b                                      \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),
+          [dst]"+&r"(dst),                  [src1]"+&r"(src1),
+          [src2]"+&r"(src2),                [h]"+&r"(h)
+        : [dst_stride]"r"((mips_reg)dst_stride),
+          [src_stride1]"r"((mips_reg)src_stride1),
+          [src_stride2]"r"((mips_reg)src_stride2)
+        : "memory"
+    );
+}
+
+void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 8-byte-wide no_rnd put of each row of pixels combined with the same
+     * row shifted by one byte (pixels + 1). line_size is used as the stride
+     * of all three buffers and is narrowed from ptrdiff_t to the helper's
+     * int stride parameters. */
+    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, pixels + 1, line_size,
+            line_size, line_size, h);
+}
+
+void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 16-byte-wide variant built from two 8-byte-wide no_rnd x2 calls:
+     * bytes 0-7 of every row first, then bytes 8-15. */
+    ff_put_no_rnd_pixels8_x2_8_mmi(block, pixels, line_size, h);
+    ff_put_no_rnd_pixels8_x2_8_mmi(block + 8, pixels + 8, line_size, h);
+}
+
+void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 4-byte-wide put of the average between each row of pixels and the
+     * row one line_size further on (pixels + line_size). line_size is used
+     * as the stride of all three buffers and is narrowed from ptrdiff_t to
+     * the helper's int stride parameters. */
+    ff_put_pixels4_l2_8_mmi(block, pixels, pixels + line_size, line_size,
+            line_size, line_size, h);
+}
+
+void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 8-byte-wide put of the average between each row of pixels and the
+     * row one line_size further on (pixels + line_size). line_size is used
+     * as the stride of all three buffers and is narrowed from ptrdiff_t to
+     * the helper's int stride parameters. */
+    ff_put_pixels8_l2_8_mmi(block, pixels, pixels + line_size, line_size,
+            line_size, line_size, h);
+}
+
+void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 16-byte-wide put of the average between each row of pixels and the
+     * row one line_size further on (pixels + line_size). line_size is used
+     * as the stride of all three buffers and is narrowed from ptrdiff_t to
+     * the helper's int stride parameters. */
+    ff_put_pixels16_l2_8_mmi(block, pixels, pixels + line_size, line_size,
+            line_size, line_size, h);
+}
+
+void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 4-byte-wide: the average of pixels and pixels + line_size is further
+     * averaged with the current contents of block by the avg helper.
+     * line_size is used as the stride of all three buffers and is narrowed
+     * from ptrdiff_t to the helper's int stride parameters. */
+    ff_avg_pixels4_l2_8_mmi(block, pixels, pixels + line_size, line_size,
+            line_size, line_size, h);
+}
+
+void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 8-byte-wide: the average of pixels and pixels + line_size is further
+     * averaged with the current contents of block by the avg helper.
+     * line_size is used as the stride of all three buffers and is narrowed
+     * from ptrdiff_t to the helper's int stride parameters. */
+    ff_avg_pixels8_l2_8_mmi(block, pixels, pixels + line_size, line_size,
+            line_size, line_size, h);
+}
+
+void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 16-byte-wide variant built from two 8-byte-wide y2 calls: bytes 0-7
+     * of every row first, then bytes 8-15. */
+    ff_avg_pixels8_y2_8_mmi(block, pixels, line_size, h);
+    ff_avg_pixels8_y2_8_mmi(block + 8, pixels + 8, line_size, h);
+}
+
+void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /* 8-byte-wide no_rnd put of each row of pixels combined with the row
+     * one line_size further on (pixels + line_size). line_size is used as
+     * the stride of all three buffers and is narrowed from ptrdiff_t to the
+     * helper's int stride parameters. */
+    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, pixels + line_size,
+            line_size, line_size, line_size, h);
+}
+
+/*
+ * y2 no-rounding put, 16 pixels wide: runs the 8-wide routine once at byte
+ * offset 0 and once at byte offset 8 of both block and pixels.
+ */
+void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    int x;
+
+    for (x = 0; x < 16; x += 8)
+        ff_put_no_rnd_pixels8_y2_8_mmi(block + x, pixels + x, line_size, h);
+}
+
+/*
+ * xy2 put, 4 pixels wide, plain C.
+ *
+ * Every output byte is (a + b + c + d + 2) >> 2, where a/b are a byte and its
+ * right neighbour in one source row and c/d the same pair one line_size
+ * further on.  Four bytes are handled per 32-bit word: the low two bits and
+ * the high six bits (pre-shifted by 2) of each byte are summed separately so
+ * that no carry crosses a byte boundary.
+ *
+ * Two output rows are written per loop pass, so an odd h is effectively
+ * rounded up to the next even value.
+ */
+void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    /* FIXME HIGH BIT DEPTH */
+    const uint32_t lo_mask = 0x03030303UL;
+    const uint32_t hi_mask = 0xFCFCFCFCUL;
+    const uint32_t bias    = 0x02020202UL;
+    const uint32_t keep    = 0x0F0F0F0FUL;
+    uint32_t left, right;
+    uint32_t lo_a, hi_a;    /* partial sums of the row pair used first  */
+    uint32_t lo_b, hi_b;    /* partial sums of the row pair used second */
+    int rows_left;
+
+    left  = AV_RN32(pixels);
+    right = AV_RN32(pixels + 1);
+    lo_a  = (left & lo_mask) + (right & lo_mask) + bias;
+    hi_a  = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
+    pixels += line_size;
+
+    for (rows_left = h; rows_left > 0; rows_left -= 2) {
+        left  = AV_RN32(pixels);
+        right = AV_RN32(pixels + 1);
+        lo_b  = (left & lo_mask) + (right & lo_mask);
+        hi_b  = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
+        /* keep drops the bits that the shift pulled in from the next byte */
+        *((uint32_t *) block) = hi_a + hi_b + (((lo_a + lo_b) >> 2) & keep);
+        pixels += line_size;
+        block  += line_size;
+
+        left  = AV_RN32(pixels);
+        right = AV_RN32(pixels + 1);
+        lo_a  = (left & lo_mask) + (right & lo_mask) + bias;
+        hi_a  = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
+        *((uint32_t *) block) = hi_a + hi_b + (((lo_a + lo_b) >> 2) & keep);
+        pixels += line_size;
+        block  += line_size;
+    }
+}
+
+void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{   /*
+     * xy2 put, 8 pixels wide.  Because of "#if 1" the inline-asm path is the
+     * one that is compiled; the C code after "#else" is excluded by the
+     * preprocessor and mirrors the other plain-C xy2 routines in this file.
+     *
+     * The asm widens bytes to 16-bit lanes and keeps, for each source row,
+     * the lane-wise sum of the row and the same row read one byte further on.
+     * Each output row is (sum(previous row) + sum(current row) + 2) >> 2,
+     * packed back to 8 bytes and stored at block + addr0.
+     *
+     * Two output rows are produced per loop pass and h is decremented by 2;
+     * the loop only exits when h is exactly 0 (bnez), so h is presumably
+     * always a positive even number -- TODO confirm against callers.
+     */
+#if 1
+    double ftmp[10];    /* outputs bound to FP registers used as MMI temporaries */
+    mips_reg addr[2];   /* addr[0]: running byte offset, addr[1]: current source address */
+
+    __asm__ volatile (
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t" /* ftmp7 = 0, used to zero-extend bytes */
+        "dli        %[addr0],   0x0f                                    \n\t"
+        "pcmpeqw    %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t" /* ftmp6 = all ones */
+        "dmtc1      %[addr0],   %[ftmp8]                                \n\t"
+        "dli        %[addr0],   0x01                                    \n\t"
+        "psrlh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t" /* >> 15: each halfword = 1 */
+        "dmtc1      %[addr0],   %[ftmp8]                                \n\t"
+        "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t" /* << 1: each halfword = 2 (rounding term) */
+
+        "dli        %[addr0],   0x02                                    \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t" /* ftmp0 = 8 bytes at pixels (unaligned pair) */
+        "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
+        "dmtc1      %[addr0],   %[ftmp9]                                \n\t" /* ftmp9 = 2, the final shift count */
+        "gsldlc1    %[ftmp4],   0x08(%[pixels])                         \n\t" /* ftmp4 = 8 bytes at pixels + 1 */
+        "gsldrc1    %[ftmp4],   0x01(%[pixels])                         \n\t"
+        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t" /* ftmp4/ftmp5 = first-row pair sums (low/high half) */
+        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "xor        %[addr0],   %[addr0],       %[addr0]                \n\t" /* addr0 = 0: byte offset of the current row */
+        PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t" /* pixels now points one line_size further on */
+        ".p2align   3                                                   \n\t"
+        "1:                                                             \n\t"
+        PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x08(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x01(%[addr1])                          \n\t"
+        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
+        "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t" /* ftmp0/ftmp1 = pair sums of the row just loaded */
+        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t" /* previous-row sums + 2 */
+        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t" /* >> 2 */
+        "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t" /* halfwords back to 8 bytes */
+        "gssdxc1    %[ftmp4],   0x00(%[block],  %[addr0])               \n\t" /* store first output row of this pass */
+        PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
+        PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
+        "gsldlc1    %[ftmp4],   0x08(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp4],   0x01(%[addr1])                          \n\t"
+        "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "paddush    %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t" /* ftmp4/ftmp5 = pair sums kept for the next pass */
+        "paddush    %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
+        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "paddush    %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "paddush    %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "gssdxc1    %[ftmp0],   0x00(%[block],  %[addr0])               \n\t" /* store second output row of this pass */
+        PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
+        PTR_ADDU   "%[h],       %[h],           -0x02                   \n\t" /* two rows done */
+        "bnez       %[h],       1b                                      \n\t" /* repeat until h == 0 */
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [h]"+&r"(h),                      [pixels]"+&r"(pixels) /* both are modified by the asm */
+        : [block]"r"(block),                [line_size]"r"((mips_reg)line_size)
+        : "memory"
+    );
+#else
+    /* FIXME HIGH BIT DEPTH */
+    int j;
+
+    for (j = 0; j < 2; j++) { /* two 4-byte columns make up the 8-byte width */
+        int i;
+        const uint32_t a = AV_RN32(pixels);
+        const uint32_t b = AV_RN32(pixels + 1);
+        uint32_t l0 = (a & 0x03030303UL) +
+                      (b & 0x03030303UL) +
+                           0x02020202UL;
+        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
+                      ((b & 0xFCFCFCFCUL) >> 2);
+        uint32_t l1, h1;
+
+        pixels += line_size;
+        for (i = 0; i < h; i += 2) {
+            uint32_t a = AV_RN32(pixels);
+            uint32_t b = AV_RN32(pixels + 1);
+            l1 = (a & 0x03030303UL) +
+                 (b & 0x03030303UL);
+            h1 = ((a & 0xFCFCFCFCUL) >> 2) +
+                 ((b & 0xFCFCFCFCUL) >> 2);
+            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+            pixels += line_size;
+            block  += line_size;
+            a  = AV_RN32(pixels);
+            b  = AV_RN32(pixels + 1);
+            l0 = (a & 0x03030303UL) +
+                 (b & 0x03030303UL) +
+                      0x02020202UL;
+            h0 = ((a & 0xFCFCFCFCUL) >> 2) +
+                 ((b & 0xFCFCFCFCUL) >> 2);
+            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
+            pixels += line_size;
+            block  += line_size;
+        }
+        pixels += 4 - line_size * (h + 1); /* back to the top row, 4 bytes to the right */
+        block  += 4 - line_size * h;
+    }
+#endif
+}
+
+/*
+ * xy2 put, 16 pixels wide: runs the 8-wide xy2 put routine once at byte
+ * offset 0 and once at byte offset 8 of both block and pixels.
+ */
+void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    int x;
+
+    for (x = 0; x < 16; x += 8)
+        ff_put_pixels8_xy2_8_mmi(block + x, pixels + x, line_size, h);
+}
+
+/*
+ * xy2 avg, 4 pixels wide, plain C.
+ *
+ * Computes (a + b + c + d + 2) >> 2 for each byte from a 2x2 source
+ * neighbourhood, four bytes at a time in a 32-bit word (low two bits and
+ * high six bits of every byte are summed separately so carries stay inside
+ * their byte), then combines that value with the word already stored in
+ * block through rnd_avg32().
+ *
+ * Two output rows are written per loop pass, so an odd h is effectively
+ * rounded up to the next even value.
+ */
+void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    /* FIXME HIGH BIT DEPTH */
+    const uint32_t lo_mask = 0x03030303UL;
+    const uint32_t hi_mask = 0xFCFCFCFCUL;
+    const uint32_t bias    = 0x02020202UL;
+    const uint32_t keep    = 0x0F0F0F0FUL;
+    uint32_t left, right;
+    uint32_t lo_a, hi_a;
+    uint32_t lo_b, hi_b;
+    uint32_t *dst;
+    int rows_left;
+
+    left  = AV_RN32(pixels);
+    right = AV_RN32(pixels + 1);
+    lo_a  = (left & lo_mask) + (right & lo_mask) + bias;
+    hi_a  = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
+    pixels += line_size;
+
+    for (rows_left = h; rows_left > 0; rows_left -= 2) {
+        left  = AV_RN32(pixels);
+        right = AV_RN32(pixels + 1);
+        lo_b  = (left & lo_mask) + (right & lo_mask);
+        hi_b  = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
+        dst   = (uint32_t *) block;
+        *dst  = rnd_avg32(*dst, hi_a + hi_b + (((lo_a + lo_b) >> 2) & keep));
+        pixels += line_size;
+        block  += line_size;
+
+        left  = AV_RN32(pixels);
+        right = AV_RN32(pixels + 1);
+        lo_a  = (left & lo_mask) + (right & lo_mask) + bias;
+        hi_a  = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
+        dst   = (uint32_t *) block;
+        *dst  = rnd_avg32(*dst, hi_a + hi_b + (((lo_a + lo_b) >> 2) & keep));
+        pixels += line_size;
+        block  += line_size;
+    }
+}
+
+/*
+ * xy2 avg, 8 pixels wide, plain C.
+ *
+ * The 8-byte width is processed as two 4-byte columns.  Within a column each
+ * output word is the per-byte (a + b + c + d + 2) >> 2 of a 2x2 source
+ * neighbourhood (low two bits and pre-shifted high six bits summed
+ * separately), combined with the word already in block via rnd_avg32().
+ * After a column, both pointers are moved 4 bytes right and back up by the
+ * row counts derived from h.
+ */
+void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    /* FIXME HIGH BIT DEPTH */
+    const uint32_t lo_mask = 0x03030303UL;
+    const uint32_t hi_mask = 0xFCFCFCFCUL;
+    const uint32_t bias    = 0x02020202UL;
+    const uint32_t keep    = 0x0F0F0F0FUL;
+    int col;
+
+    for (col = 0; col < 2; col++) {
+        uint32_t left  = AV_RN32(pixels);
+        uint32_t right = AV_RN32(pixels + 1);
+        uint32_t lo_a  = (left & lo_mask) + (right & lo_mask) + bias;
+        uint32_t hi_a  = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
+        uint32_t lo_b, hi_b;
+        uint32_t *dst;
+        int rows_left;
+
+        pixels += line_size;
+        for (rows_left = h; rows_left > 0; rows_left -= 2) {
+            left  = AV_RN32(pixels);
+            right = AV_RN32(pixels + 1);
+            lo_b  = (left & lo_mask) + (right & lo_mask);
+            hi_b  = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
+            dst   = (uint32_t *) block;
+            *dst  = rnd_avg32(*dst, hi_a + hi_b + (((lo_a + lo_b) >> 2) & keep));
+            pixels += line_size;
+            block  += line_size;
+
+            left  = AV_RN32(pixels);
+            right = AV_RN32(pixels + 1);
+            lo_a  = (left & lo_mask) + (right & lo_mask) + bias;
+            hi_a  = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
+            dst   = (uint32_t *) block;
+            *dst  = rnd_avg32(*dst, hi_a + hi_b + (((lo_a + lo_b) >> 2) & keep));
+            pixels += line_size;
+            block  += line_size;
+        }
+        /* step 4 bytes right and rewind to the top for the second column */
+        pixels += 4 - line_size * (h + 1);
+        block  += 4 - line_size * h;
+    }
+}
+
+/*
+ * xy2 avg, 16 pixels wide: runs the 8-wide xy2 avg routine once at byte
+ * offset 0 and once at byte offset 8 of both block and pixels.
+ */
+void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    int x;
+
+    for (x = 0; x < 16; x += 8)
+        ff_avg_pixels8_xy2_8_mmi(block + x, pixels + x, line_size, h);
+}
+
+/*
+ * xy2 no-rounding put, 8 pixels wide, plain C.
+ *
+ * Same scheme as the rounding xy2 routines but with a bias of 1 instead of
+ * 2, i.e. each output byte is (a + b + c + d + 1) >> 2 over a 2x2 source
+ * neighbourhood.  The 8-byte width is processed as two 4-byte columns; after
+ * a column, both pointers are moved 4 bytes right and back up by the row
+ * counts derived from h.
+ */
+void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    /* FIXME HIGH BIT DEPTH */
+    const uint32_t lo_mask = 0x03030303UL;
+    const uint32_t hi_mask = 0xFCFCFCFCUL;
+    const uint32_t bias    = 0x01010101UL;
+    const uint32_t keep    = 0x0F0F0F0FUL;
+    int col;
+
+    for (col = 0; col < 2; col++) {
+        uint32_t left  = AV_RN32(pixels);
+        uint32_t right = AV_RN32(pixels + 1);
+        uint32_t lo_a  = (left & lo_mask) + (right & lo_mask) + bias;
+        uint32_t hi_a  = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
+        uint32_t lo_b, hi_b;
+        int rows_left;
+
+        pixels += line_size;
+        for (rows_left = h; rows_left > 0; rows_left -= 2) {
+            left  = AV_RN32(pixels);
+            right = AV_RN32(pixels + 1);
+            lo_b  = (left & lo_mask) + (right & lo_mask);
+            hi_b  = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
+            *((uint32_t *) block) = hi_a + hi_b + (((lo_a + lo_b) >> 2) & keep);
+            pixels += line_size;
+            block  += line_size;
+
+            left  = AV_RN32(pixels);
+            right = AV_RN32(pixels + 1);
+            lo_a  = (left & lo_mask) + (right & lo_mask) + bias;
+            hi_a  = ((left & hi_mask) >> 2) + ((right & hi_mask) >> 2);
+            *((uint32_t *) block) = hi_a + hi_b + (((lo_a + lo_b) >> 2) & keep);
+            pixels += line_size;
+            block  += line_size;
+        }
+        /* step 4 bytes right and rewind to the top for the second column */
+        pixels += 4 - line_size * (h + 1);
+        block  += 4 - line_size * h;
+    }
+}
+
+/*
+ * xy2 no-rounding put, 16 pixels wide: runs the 8-wide routine once at byte
+ * offset 0 and once at byte offset 8 of both block and pixels.
+ */
+void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
+    ptrdiff_t line_size, int h)
+{
+    int x;
+
+    for (x = 0; x < 16; x += 8)
+        ff_put_no_rnd_pixels8_xy2_8_mmi(block + x, pixels + x, line_size, h);
+}
-- 
2.1.0



More information about the ffmpeg-devel mailing list