[MPlayer-cvslog] r36488 - trunk/libmpcodecs/ad_ffmpeg.c
reimar
subversion at mplayerhq.hu
Sat Oct 26 10:30:29 CEST 2013
Author: reimar
Date: Sat Oct 26 10:30:29 2013
New Revision: 36488
Log:
Add ARM NEON optimization of common channel interleave cases
(2 channels with 2- or 4-byte samples). While the code is certainly
far from optimal, it is about 3 times faster than the generic memcpy loop.
Modified:
trunk/libmpcodecs/ad_ffmpeg.c
Modified: trunk/libmpcodecs/ad_ffmpeg.c
==============================================================================
--- trunk/libmpcodecs/ad_ffmpeg.c Fri Oct 25 21:59:59 2013 (r36487)
+++ trunk/libmpcodecs/ad_ffmpeg.c Sat Oct 26 10:30:29 2013 (r36488)
@@ -220,6 +220,45 @@ static av_always_inline void copy_sample
{
size_t s, c, o = 0;
+#if HAVE_NEON
+ if (nb_channels == 2 && bps == 4) {
+ const unsigned char *src0 = src[0];
+ const unsigned char *src1 = src[1];
+ size_t aligned = nb_samples & ~7;
+ const unsigned char *src0_end = src0 + aligned*bps;
+ while (src0 < src0_end) {
+ __asm__ (
+ "vld1.32 {q0}, [%0]!\n\t"
+ "vld1.32 {q1}, [%1]!\n\t"
+ "vld1.32 {q2}, [%0]!\n\t"
+ "vld1.32 {q3}, [%1]!\n\t"
+ "vst2.32 {q0,q1}, [%2]!\n\t"
+ "vst2.32 {q2,q3}, [%2]!\n\t"
+ : "+&r"(src0), "+&r"(src1), "+&r"(dst)
+ :: "q0", "q1", "q2", "q3", "memory");
+ }
+ o += aligned*bps;
+ nb_samples -= aligned;
+ } else if (nb_channels == 2 && bps == 2) {
+ const unsigned char *src0 = src[0];
+ const unsigned char *src1 = src[1];
+ size_t aligned = nb_samples & ~15;
+ const unsigned char *src0_end = src0 + aligned*bps;
+ while (src0 < src0_end) {
+ __asm__ (
+ "vld1.16 {q0}, [%0]!\n\t"
+ "vld1.16 {q1}, [%1]!\n\t"
+ "vld1.16 {q2}, [%0]!\n\t"
+ "vld1.16 {q3}, [%1]!\n\t"
+ "vst2.16 {q0,q1}, [%2]!\n\t"
+ "vst2.16 {q2,q3}, [%2]!\n\t"
+ : "+&r"(src0), "+&r"(src1), "+&r"(dst)
+ :: "q0", "q1", "q2", "q3", "memory");
+ }
+ o += aligned*bps;
+ nb_samples -= aligned;
+ }
+#endif
for (s = 0; s < nb_samples; s++) {
for (c = 0; c < nb_channels; c++) {
memcpy(dst, src[c] + o, bps);
More information about the MPlayer-cvslog
mailing list