[FFmpeg-cvslog] avcodec/mips: [loongson] optimize memset in h264dsp.
Shiyou Yin
git at videolan.org
Sun Sep 2 04:38:29 EEST 2018
ffmpeg | branch: master | Shiyou Yin <yinshiyou-hf at loongson.cn> | Fri Aug 31 21:57:23 2018 +0800| [93b35a0555355ad6fe7a24b2a39b8d364f3403f8] | committer: Michael Niedermayer
avcodec/mips: [loongson] optimize memset in h264dsp.
Optimized memset with MMI in the following functions:
1. ff_h264_add_pixels4_8_mmi.
2. ff_h264_idct_add_8_mmi.
3. ff_h264_idct8_add_8_mmi.
This optimization improved H.264 decoding performance by about 1.3% (tested on a Loongson 3A3000).
Signed-off-by: Michael Niedermayer <michael at niedermayer.cc>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=93b35a0555355ad6fe7a24b2a39b8d364f3403f8
---
libavcodec/mips/h264dsp_mmi.c | 30 +++++++++++++++++++++---------
1 file changed, 21 insertions(+), 9 deletions(-)
diff --git a/libavcodec/mips/h264dsp_mmi.c b/libavcodec/mips/h264dsp_mmi.c
index ac6fa996ad..ac65a20db0 100644
--- a/libavcodec/mips/h264dsp_mmi.c
+++ b/libavcodec/mips/h264dsp_mmi.c
@@ -31,7 +31,6 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
{
double ftmp[9];
DECLARE_VAR_LOW32;
- DECLARE_VAR_ALL64;
__asm__ volatile (
"xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
@@ -59,12 +58,16 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
MMI_SWC1(%[ftmp2], %[dst1], 0x00)
MMI_SWC1(%[ftmp3], %[dst2], 0x00)
MMI_SWC1(%[ftmp4], %[dst3], 0x00)
+
+ /* memset(src, 0, 32); */
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[src]) \n\t"
+ "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[src]) \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
[ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
[ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
[ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
RESTRICT_ASM_LOW32
- RESTRICT_ASM_ALL64
[ftmp8]"=&f"(ftmp[8])
: [dst0]"r"(dst), [dst1]"r"(dst+stride),
[dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
@@ -72,7 +75,6 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
: "memory"
);
- memset(src, 0, 32);
}
void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
@@ -80,7 +82,6 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
double ftmp[12];
uint64_t tmp[1];
DECLARE_VAR_LOW32;
- DECLARE_VAR_ALL64;
DECLARE_VAR_ADDRT;
__asm__ volatile (
@@ -152,6 +153,11 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
MMI_SWC1(%[ftmp2], %[dst], 0x00)
"packushb %[ftmp0], %[ftmp0], %[ftmp7] \n\t"
MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+
+ /* memset(block, 0, 32) */
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[block]) \n\t"
+ "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[block]) \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
[ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
[ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
@@ -159,7 +165,6 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
[ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
[ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
RESTRICT_ASM_LOW32
- RESTRICT_ASM_ALL64
RESTRICT_ASM_ADDRT
[tmp0]"=&r"(tmp[0])
: [dst]"r"(dst), [block]"r"(block),
@@ -167,7 +172,6 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
: "memory"
);
- memset(block, 0, 32);
}
void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
@@ -176,7 +180,6 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
uint64_t tmp[7];
mips_reg addr[1];
DECLARE_VAR_LOW32;
- DECLARE_VAR_ALL64;
DECLARE_VAR_ADDRT;
__asm__ volatile (
@@ -617,6 +620,17 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
MMI_SWC1(%[ftmp6], %[addr0], 0x00)
MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
PTR_ADDIU "$29, $29, 0x20 \n\t"
+
+ /* memset(block, 0, 128) */
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gssqc1 %[ftmp0], %[ftmp0], 0x00(%[block]) \n\t"
+ "gssqc1 %[ftmp0], %[ftmp0], 0x10(%[block]) \n\t"
+ "gssqc1 %[ftmp0], %[ftmp0], 0x20(%[block]) \n\t"
+ "gssqc1 %[ftmp0], %[ftmp0], 0x30(%[block]) \n\t"
+ "gssqc1 %[ftmp0], %[ftmp0], 0x40(%[block]) \n\t"
+ "gssqc1 %[ftmp0], %[ftmp0], 0x50(%[block]) \n\t"
+ "gssqc1 %[ftmp0], %[ftmp0], 0x60(%[block]) \n\t"
+ "gssqc1 %[ftmp0], %[ftmp0], 0x70(%[block]) \n\t"
: [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
[ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
[ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
@@ -630,7 +644,6 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
[tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]),
[tmp6]"=&r"(tmp[6]),
RESTRICT_ASM_LOW32
- RESTRICT_ASM_ALL64
RESTRICT_ASM_ADDRT
[addr0]"=&r"(addr[0])
: [dst]"r"(dst), [block]"r"(block),
@@ -638,7 +651,6 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
: "$29","memory"
);
- memset(block, 0, 128);
}
void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
More information about the ffmpeg-cvslog mailing list