[Ffmpeg-devel] [PATCH] SSE2 implementation for motion_est_mmx.c
jserv at linux2.cc.ntu.edu.tw
jserv
Mon Jun 19 18:44:25 CEST 2006
Hello list,
I have implemented an SSE2 version of motion_est_mmx.c, and it runs
slightly faster than the original MMX2 implementation. Currently it
depends on SSE2 intrinsics, so HAVE_BUILTIN_VECTOR is required, which
means the code has to be built with GCC or the Intel C++ compiler.
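
For the curious, the core of every routine in the patch is the SSE2
packed-byte averaging intrinsic; a minimal stand-alone sketch of that
basic operation (illustration only, not part of the patch, the function
name avg_row16 is made up):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Average one 16-pixel row of the reference block into the
     * destination with upward rounding; each loop iteration in the
     * patch builds on this operation. */
    static void avg_row16(uint8_t *dest, const uint8_t *ref)
    {
        __m128i r = _mm_loadu_si128((const __m128i *) ref);      /* 16 reference pixels   */
        __m128i d = _mm_loadu_si128((const __m128i *) dest);     /* 16 destination pixels */
        _mm_storeu_si128((__m128i *) dest, _mm_avg_epu8(r, d));  /* (r + d + 1) >> 1      */
    }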
Sincerely,
Jim Huang (jserv)
http://jserv.sayya.org/
-------------- next part --------------
Index: libavcodec/i386/motion_est_mmx.c
===================================================================
--- libavcodec/i386/motion_est_mmx.c (revision 5498)
+++ libavcodec/i386/motion_est_mmx.c (working copy)
@@ -2,6 +2,7 @@
* MMX optimized motion estimation
* Copyright (c) 2001 Fabrice Bellard.
* Copyright (c) 2002-2004 Michael Niedermayer
+ * Copyright (c) 2006 Jim Huang <jserv.tw at gmail.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -22,6 +23,29 @@
#include "../dsputil.h"
#include "mmx.h"
+#if defined(HAVE_BUILTIN_VECTOR)
+#include <inttypes.h>
+#ifdef __GNUC__
+ #ifndef __forceinline
+ #define __forceinline __attribute__((__always_inline__)) inline
+ #endif
+    /* GCC needs to force inlining of intrinsic functions */
+ #define __inline __forceinline
+#endif
+#include <emmintrin.h>
+#ifdef __GNUC__
+ #undef __inline
+#endif
+
+/* 128-bit constant with every byte set to 1, used for the rounding
+ * correction in the *_xy2 functions below */
+static __m128i const_1_16_bytes;
+
+static void __attribute__((constructor)) mpeg2_MC_sse_ctor(void)
+{
+    const_1_16_bytes = _mm_set1_epi8(1);
+}
+
+#endif /* HAVE_BUILTIN_VECTOR */
+
static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
0x0000000000000000ULL,
0x0001000100010001ULL,
@@ -372,6 +396,208 @@ static int sad16_xy2_ ## suf(void *v, ui
PIX_SAD(mmx)
PIX_SAD(mmx2)
+
+#if defined(HAVE_BUILTIN_VECTOR)
+
+static void sad16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm0, xmm1, xmm2, xmm3;
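+        /* reference rows may be unaligned (loadu); destination rows are assumed 16-byte aligned (aligned load/store) */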
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm2 = _mm_load_si128((__m128i*) ecx);
+ xmm3 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm1 = _mm_avg_epu8(xmm1, xmm3);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void sad8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
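+        /* movlps/movhps-style half loads: pack two 8-pixel rows into one 128-bit register so a single pavgb averages both rows at once */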
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) ecx));
+        xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ }
+}
+
+static void sad16_x2_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm4 = _mm_load_si128((__m128i*) ecx);
+ xmm5 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm4);
+ xmm2 = _mm_avg_epu8(xmm2, xmm5);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void sad8_x2_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + 1)));
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) ecx));
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ }
+}
+
+static void sad16_y2_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+
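+    /* preload the first source row; each iteration carries the bottom row forward (xmm0 = xmm2) so it is not reloaded as the next top row */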
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm1 = _mm_avg_epu8(xmm1, xmm2);
+ xmm3 = _mm_load_si128((__m128i*) ecx);
+ xmm4 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm3);
+ xmm1 = _mm_avg_epu8(xmm1, xmm4);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm2;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void sad8_y2_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) ecx));
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ }
+}
+
+static void sad16_xy2_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm4 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm5 = _mm_loadu_si128((__m128i*) (edx + edi + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm1 = xmm5;
+ xmm5 = _mm_avg_epu8(xmm5, xmm4);
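+        /* subtract 1 from the shared middle row so the two cascaded pavgb averages do not both round upwards */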
+ xmm2 = _mm_subs_epu8(xmm2, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm2 = _mm_avg_epu8(xmm2, xmm5);
+ xmm5 = _mm_load_si128((__m128i*) ecx);
+ xmm6 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm5);
+ xmm2 = _mm_avg_epu8(xmm2, xmm6);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm4;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void sad8_xy2_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm2, xmm1, xmm3, xmm4;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (edx + 1)));
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (edx + eax + 1)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm3 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm3), (double*) (edx + eax + 1)));
+ xmm3 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm3), (double*) (edx + edi + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm0 = _mm_subs_epu8(xmm0, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm4 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm4), (double*) ecx));
+ xmm4 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm4), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm4);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ xmm2 = xmm3;
+ }
+}
+#endif /* HAVE_BUILTIN_VECTOR */
+
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
{
if (mm_flags & MM_MMX) {
@@ -403,4 +629,19 @@ void dsputil_init_pix_mmx(DSPContext* c,
c->pix_abs[1][3] = sad8_xy2_mmx2;
}
}
+#if defined(HAVE_BUILTIN_VECTOR)
+ if (mm_flags & MM_SSE2) {
+ c->pix_abs[0][0] = sad16_sse2;
+ c->pix_abs[0][1] = sad16_x2_sse2;
+ c->pix_abs[0][2] = sad16_y2_sse2;
+ c->pix_abs[0][3] = sad16_xy2_sse2;
+ c->pix_abs[1][0] = sad8_sse2;
+ c->pix_abs[1][1] = sad8_x2_sse2;
+ c->pix_abs[1][2] = sad8_y2_sse2;
+ c->pix_abs[1][3] = sad8_xy2_sse2;
+
+        c->sad[0] = sad16_sse2;
+        c->sad[1] = sad8_sse2;
+ }
+#endif /* HAVE_BUILTIN_VECTOR */
}