[FFmpeg-devel] [PATCH 01/10] diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped
Rostislav Pehlivanov
rpehlivanov at ob-encoder.com
Thu Jun 23 19:06:55 CEST 2016
Signed-off-by: Rostislav Pehlivanov <rpehlivanov at obe.tv>
---
libavcodec/x86/diracdsp.asm | 47 ++++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/diracdsp_init.c | 6 ++++++
2 files changed, 53 insertions(+)
diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index a042413..9db7b67 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -22,6 +22,8 @@
SECTION_RODATA
pw_7: times 8 dw 7
+convert_to_unsigned_10bit: times 4 dd 0x200 ; four dwords of 0x200 (1 << 9): the offset PUT_RECT_10 adds to each 32-bit sample
+clip_10bit: times 8 dw 0x3ff ; eight words of 0x3ff (1023): the upper clip bound PUT_RECT_10 applies to the packed words
cextern pw_3
cextern pw_16
@@ -172,6 +174,48 @@ cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w,
RET
%endm
+%macro PUT_RECT_10 0
+; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
+; Adds 0x200 to every 32-bit src sample and stores it to dst as a 16-bit word clamped to [0, 0x3ff].
+; Strides are in bytes. Rows are processed 8 samples per iteration (width is effectively rounded up to 8).
+; Uses 7 GPRs only (no r7/r8).
+cglobal put_signed_rect_clamped_10, 6, 7, 4, dst, dst_stride, src, src_stride, w, h, cnt
+%if ARCH_X86_64
+    ; int arguments have undefined upper 32 bits; sign-extend before using them as 64-bit values
+    movsxd   dst_strideq, dst_strided
+    movsxd   src_strideq, src_strided
+    movsxd   wq, wd
+    movsxd   hq, hd
+%endif
+    lea      srcq, [srcq+wq*4]        ; src/dst point at the row end; cnt runs from -width up to 0
+    lea      dstq, [dstq+wq*2]
+    neg      wq
+    neg      hq
+    mova     m2, [clip_10bit]
+    mova     m3, [convert_to_unsigned_10bit]
+
+.loop_h:
+    mov      cntq, wq
+
+.loop_w:
+    movu     m0, [srcq+cntq*4+0*mmsize]
+    movu     m1, [srcq+cntq*4+1*mmsize]
+    paddd    m0, m3
+    paddd    m1, m3
+    packusdw m0, m0, m1               ; dwords -> words, saturated to [0, 0xffff]
+    pminuw   m0, m2                   ; unsigned min: words >= 0x8000 must clip to 0x3ff, not 0
+    movu     [dstq+cntq*2], m0
+
+    add      cntq, 8
+    jl       .loop_w
+
+    add      srcq, src_strideq
+    add      dstq, dst_strideq
+    add      hq, 1
+    jl       .loop_h
+    RET
+%endm
+
%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
@@ -263,3 +307,6 @@ ADD_RECT sse2
HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2
+
+INIT_XMM sse4 ; PUT_RECT_10 relies on packusdw, which is an SSE4.1 instruction
+PUT_RECT_10
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
index 5fae798..4786eea 100644
--- a/libavcodec/x86/diracdsp_init.c
+++ b/libavcodec/x86/diracdsp_init.c
@@ -46,6 +46,8 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src,
void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
+void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
+
#if HAVE_YASM
#define HPEL_FILTER(MMSIZE, EXT) \
@@ -184,4 +186,8 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
}
+
+ if (EXTERNAL_SSE4(mm_flags)) {
+ c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
+ }
}
--
2.8.1.369.geae769a
More information about the ffmpeg-devel
mailing list