[Ffmpeg-devel] [PATCH] SSE2 implementation for motion_est_mmx.c
jserv at linux2.cc.ntu.edu.tw
jserv
Mon Jun 19 18:44:25 CEST 2006
Hello list,
I have implemented an SSE2 version of motion_est_mmx.c, and it runs
slightly faster than the original MMX2 implementation. Currently it
depends on SSE2 intrinsics, so HAVE_BUILTIN_VECTOR is required, which
means the code has to be built with GCC or the Intel C++ compiler.
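
For the curious, the core of every routine in the patch is the SSE2
packed-byte averaging intrinsic; a minimal stand-alone sketch of that
basic operation (illustration only, not part of the patch, the function
name avg_row16 is made up):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Average one 16-pixel row of the reference block into the
     * destination with upward rounding; each loop iteration in the
     * patch builds on this operation. */
    static void avg_row16(uint8_t *dest, const uint8_t *ref)
    {
        __m128i r = _mm_loadu_si128((const __m128i *) ref);      /* 16 reference pixels   */
        __m128i d = _mm_loadu_si128((const __m128i *) dest);     /* 16 destination pixels */
        _mm_storeu_si128((__m128i *) dest, _mm_avg_epu8(r, d));  /* (r + d + 1) >> 1      */
    }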
Sincerely,
Jim Huang (jserv)
http://jserv.sayya.org/
-------------- next part --------------
Index: libavcodec/i386/motion_est_mmx.c
===================================================================
--- libavcodec/i386/motion_est_mmx.c (revision 5498)
+++ libavcodec/i386/motion_est_mmx.c (working copy)
@@ -2,6 +2,7 @@
* MMX optimized motion estimation
* Copyright (c) 2001 Fabrice Bellard.
* Copyright (c) 2002-2004 Michael Niedermayer
+ * Copyright (c) 2006 Jim Huang <jserv.tw at gmail.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -22,6 +23,29 @@
#include "../dsputil.h"
#include "mmx.h"
+#if defined(HAVE_BUILTIN_VECTOR)
+#include <inttypes.h>
+#ifdef __GNUC__
+ #ifndef __forceinline
+ #define __forceinline __attribute__((__always_inline__)) inline
+ #endif
+    /* GCC needs to force inlining of intrinsic functions */
+ #define __inline __forceinline
+#endif
+#include <emmintrin.h>
+#ifdef __GNUC__
+ #undef __inline
+#endif
+
+/* 128-bit constant with every byte set to 1, used for the rounding
+ * correction in the *_xy2 functions below */
+static __m128i const_1_16_bytes;
+
+static void __attribute__((constructor)) mpeg2_MC_sse_ctor(void)
+{
+    const_1_16_bytes = _mm_set1_epi8(1);
+}
+
+#endif /* HAVE_BUILTIN_VECTOR */
+
static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
0x0000000000000000ULL,
0x0001000100010001ULL,
@@ -372,6 +396,208 @@ static int sad16_xy2_ ## suf(void *v, ui
PIX_SAD(mmx)
PIX_SAD(mmx2)
+
+#if defined(HAVE_BUILTIN_VECTOR)
+
+static void sad16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ __m128i xmm0, xmm1, xmm2, xmm3;
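+        /* reference rows may be unaligned (loadu); destination rows are assumed 16-byte aligned (aligned load/store) */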
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm2 = _mm_load_si128((__m128i*) ecx);
+ xmm3 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm1 = _mm_avg_epu8(xmm1, xmm3);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void sad8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
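+        /* movlps/movhps-style half loads: pack two 8-pixel rows into one 128-bit register so a single pavgb averages both rows at once */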
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) ecx));
+        xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ }
+}
+
+static void sad16_x2_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm4 = _mm_load_si128((__m128i*) ecx);
+ xmm5 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm4);
+ xmm2 = _mm_avg_epu8(xmm2, xmm5);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void sad8_x2_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2;
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + 1)));
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) ecx));
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ }
+}
+
+static void sad16_y2_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+
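+    /* preload the first source row; each iteration carries the bottom row forward (xmm0 = xmm2) so it is not reloaded as the next top row */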
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm1 = _mm_avg_epu8(xmm1, xmm2);
+ xmm3 = _mm_load_si128((__m128i*) ecx);
+ xmm4 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm3);
+ xmm1 = _mm_avg_epu8(xmm1, xmm4);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm2;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+ }
+}
+
+static void sad8_y2_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm0, xmm1, xmm2;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) ecx));
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ }
+}
+
+static void sad16_xy2_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_loadu_si128((__m128i*) edx);
+ xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+ xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+ xmm4 = _mm_loadu_si128((__m128i*) (edx + edi));
+ xmm5 = _mm_loadu_si128((__m128i*) (edx + edi + 1));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm1 = xmm5;
+ xmm5 = _mm_avg_epu8(xmm5, xmm4);
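+        /* subtract 1 from the shared middle row so the two cascaded pavgb averages do not both round upwards */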
+ xmm2 = _mm_subs_epu8(xmm2, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm2 = _mm_avg_epu8(xmm2, xmm5);
+ xmm5 = _mm_load_si128((__m128i*) ecx);
+ xmm6 = _mm_load_si128((__m128i*) (ecx + eax));
+ xmm0 = _mm_avg_epu8(xmm0, xmm5);
+ xmm2 = _mm_avg_epu8(xmm2, xmm6);
+ _mm_store_si128((__m128i*) ecx, xmm0);
+ xmm0 = xmm4;
+ _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+ }
+}
+
+static void sad8_xy2_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+ const uint8_t *edx = ref;
+ uint8_t *ecx = dest;
+ int esi = height;
+ int eax = stride;
+ int edi = eax + eax;
+ __m128i xmm7, xmm0, xmm2, xmm1, xmm3, xmm4;
+ xmm7 = const_1_16_bytes;
+ xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+ xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+ xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (edx + 1)));
+ xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (edx + eax + 1)));
+ for (; esi; edx += edi, ecx += edi, esi -= 2) {
+ xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+ xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+ xmm3 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm3), (double*) (edx + eax + 1)));
+ xmm3 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm3), (double*) (edx + edi + 1)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm1);
+ xmm2 = _mm_avg_epu8(xmm2, xmm3);
+ xmm0 = _mm_subs_epu8(xmm0, xmm7);
+ xmm0 = _mm_avg_epu8(xmm0, xmm2);
+ xmm4 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm4), (double*) ecx));
+ xmm4 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm4), (double*) (ecx + eax)));
+ xmm0 = _mm_avg_epu8(xmm0, xmm4);
+ _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+ _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+ xmm0 = xmm1;
+ xmm2 = xmm3;
+ }
+}
+#endif /* HAVE_BUILTIN_VECTOR */
+
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
{
if (mm_flags & MM_MMX) {
@@ -403,4 +629,19 @@ void dsputil_init_pix_mmx(DSPContext* c,
c->pix_abs[1][3] = sad8_xy2_mmx2;
}
}
+#if defined(HAVE_BUILTIN_VECTOR)
+ if (mm_flags & MM_SSE2) {
+ c->pix_abs[0][0] = sad16_sse2;
+ c->pix_abs[0][1] = sad16_x2_sse2;
+ c->pix_abs[0][2] = sad16_y2_sse2;
+ c->pix_abs[0][3] = sad16_xy2_sse2;
+ c->pix_abs[1][0] = sad8_sse2;
+ c->pix_abs[1][1] = sad8_x2_sse2;
+ c->pix_abs[1][2] = sad8_y2_sse2;
+ c->pix_abs[1][3] = sad8_xy2_sse2;
+
+        c->sad[0] = sad16_sse2;
+        c->sad[1] = sad8_sse2;
+ }
+#endif /* HAVE_BUILTIN_VECTOR */
}