[FFmpeg-cvslog] x86/intreadwrite: use intrinsics instead of inline asm for AV_ZERO128
James Almer
git at videolan.org
Wed Jul 10 19:26:38 EEST 2024
ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Mon Nov 14 02:32:33 2022 -0300| [4a04cca69af807ccf831da977a94350611967c4c] | committer: James Almer
x86/intreadwrite: use intrinsics instead of inline asm for AV_ZERO128
When called inside a loop, the inline asm version results in one pxor
unnecessarily emitted per iteration, as the contents of the __asm__() block are
opaque to the compiler's instruction scheduler.
This is not the case with intrinsics, where pxor will be emitted once with any
half-decent compiler.
This also has the benefit of removing any SSE -> AVX penalty that may happen
when the compiler emits VEX encoded instructions.
Signed-off-by: James Almer <jamrial at gmail.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=4a04cca69af807ccf831da977a94350611967c4c
---
configure | 3 +++
libavutil/x86/intreadwrite.h | 15 +++++++--------
2 files changed, 10 insertions(+), 8 deletions(-)
diff --git a/configure b/configure
index fa2e384350..f84fefeaab 100755
--- a/configure
+++ b/configure
@@ -2314,6 +2314,7 @@ HEADERS_LIST="
INTRINSICS_LIST="
intrinsics_neon
+ intrinsics_sse2
"
MATH_FUNCS="
@@ -2743,6 +2744,7 @@ armv6t2_deps="arm"
armv8_deps="aarch64"
neon_deps_any="aarch64 arm"
intrinsics_neon_deps="neon"
+intrinsics_sse2_deps="sse2"
vfp_deps="arm"
vfpv3_deps="vfp"
setend_deps="arm"
@@ -6444,6 +6446,7 @@ elif enabled loongarch; then
fi
check_cc intrinsics_neon arm_neon.h "int16x8_t test = vdupq_n_s16(0)"
+check_cc intrinsics_sse2 emmintrin.h "__m128i test = _mm_setzero_si128()"
check_ldflags -Wl,--as-needed
check_ldflags -Wl,-z,noexecstack
diff --git a/libavutil/x86/intreadwrite.h b/libavutil/x86/intreadwrite.h
index 5e57d6a8cd..9bbef00dba 100644
--- a/libavutil/x86/intreadwrite.h
+++ b/libavutil/x86/intreadwrite.h
@@ -22,6 +22,9 @@
#define AVUTIL_X86_INTREADWRITE_H
#include <stdint.h>
+#if HAVE_INTRINSICS_SSE2
+#include <emmintrin.h>
+#endif
#include "config.h"
#include "libavutil/attributes.h"
@@ -43,20 +46,16 @@ static av_always_inline void AV_COPY128(void *d, const void *s)
#endif /* __SSE__ */
-#ifdef __SSE2__
+#if HAVE_INTRINSICS_SSE2
#define AV_ZERO128 AV_ZERO128
static av_always_inline void AV_ZERO128(void *d)
{
- struct v {uint64_t v[2];};
-
- __asm__("pxor %%xmm0, %%xmm0 \n\t"
- "movdqa %%xmm0, %0 \n\t"
- : "=m"(*(struct v*)d)
- :: "xmm0");
+ __m128i zero = _mm_setzero_si128();
+ _mm_store_si128(d, zero);
}
-#endif /* __SSE2__ */
+#endif /* HAVE_INTRINSICS_SSE2 */
#endif /* HAVE_MMX */
More information about the ffmpeg-cvslog
mailing list