[FFmpeg-cvslog] x86/intreadwrite: use intrinsics instead of inline asm for AV_COPY128

James Almer git at videolan.org
Wed Jul 10 19:26:40 EEST 2024


ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Wed Jul 10 13:00:20 2024 -0300| [bd1bcb07e0f29c135103a402d71b343a09ad1690] | committer: James Almer

x86/intreadwrite: use intrinsics instead of inline asm for AV_COPY128

This has the benefit of removing any SSE <-> AVX transition penalty that may
occur when the compiler emits VEX-encoded instructions around the
legacy-encoded inline asm.

Signed-off-by: James Almer <jamrial at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=bd1bcb07e0f29c135103a402d71b343a09ad1690
---

 configure                    |  5 ++++-
 libavutil/x86/intreadwrite.h | 20 +++++++-------------
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/configure b/configure
index f84fefeaab..7151ed1de3 100755
--- a/configure
+++ b/configure
@@ -2314,6 +2314,7 @@ HEADERS_LIST="
 
 INTRINSICS_LIST="
     intrinsics_neon
+    intrinsics_sse
     intrinsics_sse2
 "
 
@@ -2744,7 +2745,8 @@ armv6t2_deps="arm"
 armv8_deps="aarch64"
 neon_deps_any="aarch64 arm"
 intrinsics_neon_deps="neon"
-intrinsics_sse2_deps="sse2"
+intrinsics_sse_deps="sse"
+intrinsics_sse2_deps="sse2 intrinsics_sse"
 vfp_deps="arm"
 vfpv3_deps="vfp"
 setend_deps="arm"
@@ -6446,6 +6448,7 @@ elif enabled loongarch; then
 fi
 
 check_cc intrinsics_neon arm_neon.h "int16x8_t test = vdupq_n_s16(0)"
+check_cc intrinsics_sse immintrin.h "__m128 test = _mm_setzero_ps()"
 check_cc intrinsics_sse2 emmintrin.h "__m128i test = _mm_setzero_si128()"
 
 check_ldflags -Wl,--as-needed
diff --git a/libavutil/x86/intreadwrite.h b/libavutil/x86/intreadwrite.h
index 9bbef00dba..6546eb016c 100644
--- a/libavutil/x86/intreadwrite.h
+++ b/libavutil/x86/intreadwrite.h
@@ -22,29 +22,25 @@
 #define AVUTIL_X86_INTREADWRITE_H
 
 #include <stdint.h>
+#if HAVE_INTRINSICS_SSE
+#include <immintrin.h>
+#endif
 #if HAVE_INTRINSICS_SSE2
 #include <emmintrin.h>
 #endif
 #include "config.h"
 #include "libavutil/attributes.h"
 
-#if HAVE_MMX
-
-#ifdef __SSE__
+#if HAVE_INTRINSICS_SSE
 
 #define AV_COPY128 AV_COPY128
 static av_always_inline void AV_COPY128(void *d, const void *s)
 {
-    struct v {uint64_t v[2];};
-
-    __asm__("movaps   %1, %%xmm0  \n\t"
-            "movaps   %%xmm0, %0  \n\t"
-            : "=m"(*(struct v*)d)
-            : "m" (*(const struct v*)s)
-            : "xmm0");
+    __m128 tmp = _mm_load_ps(s);
+    _mm_store_ps(d, tmp);
 }
 
-#endif /* __SSE__ */
+#endif /* HAVE_INTRINSICS_SSE */
 
 #if HAVE_INTRINSICS_SSE2
 
@@ -57,6 +53,4 @@ static av_always_inline void AV_ZERO128(void *d)
 
 #endif /* HAVE_INTRINSICS_SSE2 */
 
-#endif /* HAVE_MMX */
-
 #endif /* AVUTIL_X86_INTREADWRITE_H */



More information about the ffmpeg-cvslog mailing list