[FFmpeg-cvslog] diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped
Rostislav Pehlivanov
git at videolan.org
Tue Jul 12 00:45:35 CEST 2016
ffmpeg | branch: master | Rostislav Pehlivanov <rpehlivanov at ob-encoder.com> | Thu Jun 23 18:06:55 2016 +0100| [bd61f3c6bfb83d7691e124a02394ae76737c26f4] | committer: Rostislav Pehlivanov
diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped
Signed-off-by: Rostislav Pehlivanov <rpehlivanov at obe.tv>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=bd61f3c6bfb83d7691e124a02394ae76737c26f4
---
libavcodec/x86/diracdsp.asm | 42 ++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/diracdsp_init.c | 4 ++++
2 files changed, 46 insertions(+)
diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index 8e9f0fb..d86b543 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -22,6 +22,8 @@
SECTION_RODATA
pw_7: times 8 dw 7
+convert_to_unsigned_10bit: times 4 dd 0x200
+clip_10bit: times 8 dw 0x3ff
cextern pw_3
cextern pw_16
@@ -300,3 +302,43 @@ cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
jg .loop_v
RET
+
+%if ARCH_X86_64 == 1 ; x86-64 only: the function below requests 9 GPRs (r0-r8)
+; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
+cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w, h
+    mov        r6, srcq                         ; r6 = start of the current source row
+    mov        r7, dstq                         ; r7 = start of the current destination row
+    mov        r8, wq                           ; r8 = width, reloaded into w for every row
+    pxor       m2, m2                           ; m2 = 0, lower clip bound
+    mova       m3, [clip_10bit]                 ; m3 = 0x3ff in every word, upper clip bound
+    mova       m4, [convert_to_unsigned_10bit]  ; m4 = 0x200 in every dword, signed->unsigned bias
+; Per row: dst16[x] = clip(src32[x] + 0x200, 0, 0x3ff); one iteration handles 2*mmsize source bytes.
+    .loop_h:
+    mov        srcq, r6                         ; rewind the working pointers to the row start
+    mov        dstq, r7
+    mov        wq, r8
+
+    .loop_w:
+    movu       m0, [srcq+0*mmsize]              ; first vector of 32-bit samples
+    movu       m1, [srcq+1*mmsize]              ; second vector of 32-bit samples
+
+    paddd      m0, m4                           ; add the 0x200 bias
+    paddd      m1, m4
+    packssdw   m0, m0, m1                       ; signed saturation: sums above 0x7fff stay positive
+    CLIPW      m0, m2, m3                       ; signed word clamp to [0, 0x3ff]; packusdw results >0x7fff would read as negative here
+
+    movu       [dstq], m0                       ; store one full vector of 16-bit pixels
+
+    add        srcq, 2*mmsize
+    add        dstq, 1*mmsize
+    sub        wd, 8                            ; whole vectors only: the last store may cover more than w pixels
+    jg         .loop_w
+
+    add        r6, src_strideq                  ; NOTE(review): strides are int in C but used as 64-bit; confirm upper bits
+    add        r7, dst_strideq                  ; strides are byte offsets added to the row pointers
+    sub        hd, 1
+    jg         .loop_h                          ; do-while shape: the body runs once even if h <= 0
+
+    RET
+
+%endif
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
index 26b885d..43aab6a 100644
--- a/libavcodec/x86/diracdsp_init.c
+++ b/libavcodec/x86/diracdsp_init.c
@@ -45,6 +45,9 @@ void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, i
void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+#if ARCH_X86_64
+void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
+#endif
void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h);
@@ -189,5 +192,6 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
if (EXTERNAL_SSE4(mm_flags)) {
c->dequant_subband[1] = ff_dequant_subband_32_sse4;
+ c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
}
}
More information about the ffmpeg-cvslog
mailing list