[FFmpeg-devel] [PATCH 2/3] simple_idct12: align C and x86
Christophe Gisquet
christophe.gisquet at gmail.com
Tue Oct 13 21:21:40 CEST 2015
omse results from dct-test for the 3 IDCTs:
C: 0.16915859 0.11848359 0.12913125
x86: 0.16883281 0.11849063 0.19041875
Using 14 and 17 as shifts substantially improves those, but actually
causes overflows and incorrect decoding of 12bpp content.
---
libavcodec/simple_idct_template.c | 17 ++++-------------
libavcodec/x86/idctdsp_init.c | 8 +++-----
libavcodec/x86/simple_idct10.asm | 7 +++----
3 files changed, 10 insertions(+), 22 deletions(-)
diff --git a/libavcodec/simple_idct_template.c b/libavcodec/simple_idct_template.c
index 0585679..c94c583 100644
--- a/libavcodec/simple_idct_template.c
+++ b/libavcodec/simple_idct_template.c
@@ -66,7 +66,6 @@
#elif BIT_DEPTH == 10 || BIT_DEPTH == 12
-# if BIT_DEPTH == 10
#define W1 22725 // 90901
#define W2 21407 // 85627
#define W3 19265 // 77062
@@ -75,6 +74,7 @@
#define W6 8867 // 35468
#define W7 4520 // 18081
+# if BIT_DEPTH == 10
# ifdef EXTRA_SHIFT
#define ROW_SHIFT 13
#define COL_SHIFT 18
@@ -84,19 +84,10 @@
#define COL_SHIFT 19
#define DC_SHIFT 2
# endif
-
# else
-#define W1 45451
-#define W2 42813
-#define W3 38531
-#define W4 32767
-#define W5 25746
-#define W6 17734
-#define W7 9041
-
-#define ROW_SHIFT 16
-#define COL_SHIFT 17
-#define DC_SHIFT -1
+#define ROW_SHIFT 15
+#define COL_SHIFT 16
+#define DC_SHIFT -1
# endif
#define MUL(a, b) ((a) * (b))
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index bcf7e5b..8b25ff9 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -86,11 +86,11 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
c->add_pixels_clamped = ff_add_pixels_clamped_sse2;
}
- if (ARCH_X86_64 && avctx->lowres == 0) {
- if (avctx->bits_per_raw_sample == 10 &&
+ if (ARCH_X86_64 && avctx->lowres == 0 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLE)) {
+ if (avctx->bits_per_raw_sample == 10) {
if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = ff_simple_idct10_put_sse2;
c->idct_add = NULL;
@@ -106,9 +106,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
}
}
- if (avctx->bits_per_raw_sample == 12 &&
- (avctx->idct_algo == FF_IDCT_AUTO ||
- avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+ if (avctx->bits_per_raw_sample == 12) {
if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = ff_simple_idct12_put_sse2;
c->idct_add = NULL;
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index cd83d61..c5ee05c 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -29,12 +29,11 @@
SECTION_RODATA
-cextern pw_2
-cextern pw_16
cextern pw_1023
cextern pw_4095
pd_round_12: times 4 dd 1<<(12-1)
pd_round_15: times 4 dd 1<<(15-1)
+pd_round_16: times 4 dd 1<<(16-1)
pd_round_19: times 4 dd 1<<(19-1)
%macro CONST_DEC 3
@@ -79,14 +78,14 @@ cglobal simple_idct10_put, 3, 3, 16
cglobal simple_idct12, 1, 1, 16
; coeffs are already 15bits, adding the offset would cause
; overflow in the input
- IDCT_FN "", 15, pw_2, 16
+ IDCT_FN "", 15, "", 16
RET
cglobal simple_idct12_put, 3, 3, 16
; range isn't known, so the C simple_idct range is used
; Also, using a bias on input overflows, so use the bias
; on output of the first butterfly instead
- IDCT_FN "", 15, pw_2, 16, 0, pw_4095
+ IDCT_FN "", 15, "", 16, 0, pw_4095
RET
%endmacro
--
2.6.0
More information about the ffmpeg-devel
mailing list