[FFmpeg-cvslog] lavc/vp9dsp: copy 8 pixels at once
Rémi Denis-Courmont
git at videolan.org
Mon Jul 29 21:16:49 EEST 2024
ffmpeg | branch: master | Rémi Denis-Courmont <remi at remlab.net> | Mon Jul 22 21:06:38 2024 +0300| [7aa6510fe1fc726f0acd22c5b2d2537c69099395] | committer: Rémi Denis-Courmont
lavc/vp9dsp: copy 8 pixels at once
In the 8-bit case, we can actually read/write 8 aligned pixel values per
load/store, which unsurprisingly tends to be faster on 64-bit systems (and
makes no difference on 32-bit systems). This requires ifdef'ing though.
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=7aa6510fe1fc726f0acd22c5b2d2537c69099395
---
libavcodec/vp9dsp_template.c | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/libavcodec/vp9dsp_template.c b/libavcodec/vp9dsp_template.c
index da3cc28e5e..9e5b25142d 100644
--- a/libavcodec/vp9dsp_template.c
+++ b/libavcodec/vp9dsp_template.c
@@ -49,14 +49,22 @@ static void vert_8x8_c(uint8_t *restrict _dst, ptrdiff_t stride,
{
pixel *dst = (pixel *) _dst;
const pixel *top = (const pixel *) _top;
+#if BIT_DEPTH == 8
+ uint64_t p8 = AV_RN64A(top);
+#else
pixel4 p4a = AV_RN4PA(top + 0);
pixel4 p4b = AV_RN4PA(top + 4);
+#endif
int y;
stride /= sizeof(pixel);
for (y = 0; y < 8; y++) {
+#if BIT_DEPTH == 8
+ AV_WN64A(dst, p8);
+#else
AV_WN4PA(dst + 0, p4a);
AV_WN4PA(dst + 4, p4b);
+#endif
dst += stride;
}
}
@@ -66,18 +74,28 @@ static void vert_16x16_c(uint8_t *restrict _dst, ptrdiff_t stride,
{
pixel *dst = (pixel *) _dst;
const pixel *top = (const pixel *) _top;
+#if BIT_DEPTH == 8
+ uint64_t p8a = AV_RN64A(top);
+ uint64_t p8b = AV_RN64A(top + 8);
+#else
pixel4 p4a = AV_RN4PA(top + 0);
pixel4 p4b = AV_RN4PA(top + 4);
pixel4 p4c = AV_RN4PA(top + 8);
pixel4 p4d = AV_RN4PA(top + 12);
+#endif
int y;
stride /= sizeof(pixel);
for (y = 0; y < 16; y++) {
+#if BIT_DEPTH == 8
+ AV_WN64A(dst + 0, p8a);
+ AV_WN64A(dst + 8, p8b);
+#else
AV_WN4PA(dst + 0, p4a);
AV_WN4PA(dst + 4, p4b);
AV_WN4PA(dst + 8, p4c);
AV_WN4PA(dst + 12, p4d);
+#endif
dst += stride;
}
}
@@ -87,6 +105,12 @@ static void vert_32x32_c(uint8_t *restrict _dst, ptrdiff_t stride,
{
pixel *dst = (pixel *) _dst;
const pixel *top = (const pixel *) _top;
+#if BIT_DEPTH == 8
+ uint64_t p8a = AV_RN64A(top);
+ uint64_t p8b = AV_RN64A(top + 8);
+ uint64_t p8c = AV_RN64A(top + 16);
+ uint64_t p8d = AV_RN64A(top + 24);
+#else
pixel4 p4a = AV_RN4PA(top + 0);
pixel4 p4b = AV_RN4PA(top + 4);
pixel4 p4c = AV_RN4PA(top + 8);
@@ -95,10 +119,17 @@ static void vert_32x32_c(uint8_t *restrict _dst, ptrdiff_t stride,
pixel4 p4f = AV_RN4PA(top + 20);
pixel4 p4g = AV_RN4PA(top + 24);
pixel4 p4h = AV_RN4PA(top + 28);
+#endif
int y;
stride /= sizeof(pixel);
for (y = 0; y < 32; y++) {
+#if BIT_DEPTH == 8
+ AV_WN64A(dst + 0, p8a);
+ AV_WN64A(dst + 8, p8b);
+ AV_WN64A(dst + 16, p8c);
+ AV_WN64A(dst + 24, p8d);
+#else
AV_WN4PA(dst + 0, p4a);
AV_WN4PA(dst + 4, p4b);
AV_WN4PA(dst + 8, p4c);
@@ -107,6 +138,7 @@ static void vert_32x32_c(uint8_t *restrict _dst, ptrdiff_t stride,
AV_WN4PA(dst + 20, p4f);
AV_WN4PA(dst + 24, p4g);
AV_WN4PA(dst + 28, p4h);
+#endif
dst += stride;
}
}
More information about the ffmpeg-cvslog
mailing list