[FFmpeg-cvslog] rv40dsp x86: use only one register, for both increment and loop counter
Christophe GISQUET
git at videolan.org
Tue Apr 10 23:07:21 CEST 2012
ffmpeg | branch: master | Christophe GISQUET <christophe.gisquet at gmail.com> | Tue Mar 20 16:13:55 2012 +0100| [2130bd8f5b6504ea14cd41e33f5d4f431eb724f3] | committer: Ronald S. Bultje
rv40dsp x86: use only one register, for both increment and loop counter
Around 10 cycles faster for luma.
Signed-off-by: Ronald S. Bultje <rsbultje at gmail.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=2130bd8f5b6504ea14cd41e33f5d4f431eb724f3
---
libavcodec/x86/rv40dsp.asm | 43 ++++++++++++++++++++-----------------------
1 files changed, 20 insertions(+), 23 deletions(-)
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index 9028e74..721d3df 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -32,13 +32,14 @@ SECTION .text
; %1=5bits weights?, %2=dst %3=src1 %4=src3 %5=stride if sse2
%macro RV40_WCORE 4-5
- movh m4, [%3 + 0]
- movh m5, [%4 + 0]
+ movh m4, [%3 + r6 + 0]
+ movh m5, [%4 + r6 + 0]
%if %0 == 4
-%define OFFSET mmsize / 2
+%define OFFSET r6 + mmsize / 2
%else
; 8x8 block and sse2, stride was provided
-%define OFFSET %5
+%define OFFSET r6
+ add r6, r5
%endif
movh m6, [%3 + OFFSET]
movh m7, [%4 + OFFSET]
@@ -99,10 +100,12 @@ SECTION .text
packuswb m4, m6
%if %0 == 5
; Only called for 8x8 blocks and sse2
- movh [%2 + 0], m4
- movhps [%2 + %5], m4
+ sub r6, r5
+ movh [%2 + r6], m4
+ add r6, r5
+ movhps [%2 + r6], m4
%else
- mova [%2], m4
+ mova [%2 + r6], m4
%endif
%endmacro
@@ -115,26 +118,19 @@ SECTION .text
%endif
; Prepare for next loop
- add r0, r5
- add r1, r5
- add r2, r5
+ add r6, r5
%else
%ifidn %1, 8
RV40_WCORE %2, r0, r1, r2, r5
; Prepare 2 next lines
- lea r0, [r0 + 2 * r5]
- lea r1, [r1 + 2 * r5]
- lea r2, [r2 + 2 * r5]
+ add r6, r5
%else
RV40_WCORE %2, r0, r1, r2
; Prepare single next line
- add r0, r5
- add r1, r5
- add r2, r5
+ add r6, r5
%endif
%endif
- dec r6
%endmacro
; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
@@ -145,7 +141,7 @@ SECTION .text
; Therefore, we check here whether they are multiples of 2^9 for
; those simplifications to occur.
%macro RV40_WEIGHT 3
-cglobal rv40_weight_func_%1_%2, 6, 7, %3
+cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3)
mova m1, [shift_round]
%else
@@ -153,11 +149,12 @@ cglobal rv40_weight_func_%1_%2, 6, 7, %3
%endif
pxor m0, m0
; Set loop counter and increments
-%if mmsize == 8
- mov r6, %2
-%else
- mov r6, (%2 * %2) / mmsize
-%endif
+ mov r6, r5
+ shl r6, %3
+ add r0, r6
+ add r1, r6
+ add r2, r6
+ neg r6
movd m2, r3
movd m3, r4
More information about the ffmpeg-cvslog
mailing list