[FFmpeg-cvslog] aarch64: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32
Martin Storsjö
git at videolan.org
Sat Jan 14 22:36:16 EET 2017
ffmpeg | branch: master | Martin Storsjö <martin at martin.st> | Tue Jan 10 00:15:16 2017 +0200| [8b11a89c06b94632d545f67ca508bd9c05c435ac] | committer: Michael Niedermayer
aarch64: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32
This work is sponsored by, and copyright, Google.
Previously all subpartitions except the eob=1 (DC) case ran with
the same runtime:
vp9_inv_dct_dct_16x16_sub16_add_neon: 1373.2
vp9_inv_dct_dct_32x32_sub32_add_neon: 8089.0
By skipping individual 8x16 or 8x32 pixel slices in the first pass,
we reduce the runtime of these functions like this:
vp9_inv_dct_dct_16x16_sub1_add_neon: 235.3
vp9_inv_dct_dct_16x16_sub2_add_neon: 1036.7
vp9_inv_dct_dct_16x16_sub4_add_neon: 1036.7
vp9_inv_dct_dct_16x16_sub8_add_neon: 1036.7
vp9_inv_dct_dct_16x16_sub12_add_neon: 1372.1
vp9_inv_dct_dct_16x16_sub16_add_neon: 1372.1
vp9_inv_dct_dct_32x32_sub1_add_neon: 555.1
vp9_inv_dct_dct_32x32_sub2_add_neon: 5190.2
vp9_inv_dct_dct_32x32_sub4_add_neon: 5180.0
vp9_inv_dct_dct_32x32_sub8_add_neon: 5183.1
vp9_inv_dct_dct_32x32_sub12_add_neon: 6161.5
vp9_inv_dct_dct_32x32_sub16_add_neon: 6155.5
vp9_inv_dct_dct_32x32_sub20_add_neon: 7136.3
vp9_inv_dct_dct_32x32_sub24_add_neon: 7128.4
vp9_inv_dct_dct_32x32_sub28_add_neon: 8098.9
vp9_inv_dct_dct_32x32_sub32_add_neon: 8098.8
I.e., in general there is a very minor overhead for the full subpartition
case due to the additional cmps, but a significant speedup for the cases
when we only need to process a small part of the actual input data.
This is cherry-picked from libav commits
cad42fadcd2c2ae1b3676bb398844a1f521a2d7b and
a0c443a3980dc22eb02b067ac4cb9ffa2f9b04d2.
Signed-off-by: Michael Niedermayer <michael at niedermayer.cc>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=8b11a89c06b94632d545f67ca508bd9c05c435ac
---
libavcodec/aarch64/vp9itxfm_neon.S | 61 ++++++++++++++++++++++++++++++++++----
1 file changed, 56 insertions(+), 5 deletions(-)
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index e5fc612..82f1f41 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -588,6 +588,9 @@ endfunc
.macro store i, dst, inc
st1 {v\i\().8h}, [\dst], \inc
.endm
+.macro movi_v i, size, imm
+ movi v\i\()\size, \imm
+.endm
.macro load_clear i, src, inc
ld1 {v\i\().8h}, [\src]
st1 {v2.8h}, [\src], \inc
@@ -596,9 +599,8 @@ endfunc
// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x8 slice and store.
// x0 = dst (temp buffer)
-// x1 = unused
+// x1 = slice offset
// x2 = src
-// x3 = slice offset
// x9 = input stride
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_8x16_pass1_neon
@@ -616,14 +618,14 @@ function \txfm\()16_1d_8x16_pass1_neon
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
// Store the transposed 8x8 blocks horizontally.
- cmp x3, #8
+ cmp x1, #8
b.eq 1f
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
store \i, x0, #16
.endr
ret
1:
- // Special case: For the last input column (x3 == 8),
+ // Special case: For the last input column (x1 == 8),
// which would be stored as the last row in the temp buffer,
// don't store the first 8x8 block, but keep it in registers
// for the first slice of the second pass (where it is the
@@ -751,13 +753,36 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.irp i, 0, 8
add x0, sp, #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i == 8
+ cmp w3, #38
+ b.le 1f
+.endif
+.endif
+ mov x1, #\i
add x2, x6, #(\i*2)
- mov x3, #\i
bl \txfm1\()16_1d_8x16_pass1_neon
.endr
.ifc \txfm1\()_\txfm2,iadst_idct
ld1 {v0.8h,v1.8h}, [x10]
.endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+ b 3f
+1:
+ // Set v24-v31 to zero, for the in-register passthrough of
+ // coefficients to pass 2. Since we only do two slices, this can
+ // only ever happen for the second slice. So we only need to store
+ // zeros to the temp buffer for the second half of the buffer.
+ // Move x0 to the second half, and use x9 == 32 as increment.
+ add x0, x0, #16
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+ movi_v \i, .16b, #0
+ st1 {v24.8h}, [x0], x9
+.endr
+3:
+.endif
+
.irp i, 0, 8
add x0, x4, #(\i)
mov x1, x5
@@ -1073,12 +1098,17 @@ function idct32_1d_8x32_pass2_neon
ret
endfunc
+const min_eob_idct_idct_32, align=4
+ .short 0, 34, 135, 336
+endconst
+
function ff_vp9_idct_idct_32x32_add_neon, export=1
cmp w3, #1
b.eq idct32x32_dc_add_neon
movrel x10, idct_coeffs
add x11, x10, #32
+ movrel x12, min_eob_idct_idct_32, 2
mov x15, x30
@@ -1099,9 +1129,30 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
.irp i, 0, 8, 16, 24
add x0, sp, #(\i*64)
+.if \i > 0
+ ldrh w1, [x12], #2
+ cmp w3, w1
+ mov x1, #(32 - \i)/4
+ b.le 1f
+.endif
add x2, x6, #(\i*2)
bl idct32_1d_8x32_pass1_neon
.endr
+ b 3f
+
+1:
+ // Write zeros to the temp buffer for pass 2
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+2:
+ subs x1, x1, #1
+.rept 4
+ st1 {v16.8h-v19.8h}, [x0], #64
+.endr
+ b.ne 2b
+3:
.irp i, 0, 8, 16, 24
add x0, x4, #(\i)
mov x1, x5
More information about the ffmpeg-cvslog mailing list