[FFmpeg-cvslog] aarch64: vp9itxfm: Use a single lane ld1 instead of ld1r where possible

Sat Mar 11 13:53:21 EET 2017

ffmpeg | branch: master | Martin Storsjö <martin at martin.st> | Tue Jan  3 14:55:46 2017 +0200| [19a0f9529ccdb48696f0caa251fe36b1d30df739] | committer: Martin Storsjö

aarch64: vp9itxfm: Use a single lane ld1 instead of ld1r where possible

The ld1r is a leftover from the 32-bit arm version, where using an
all-lanes load instead of a single-lane load is beneficial on some cores.

Use a single-lane load where we don't need the semantics of ld1r
(replicating the loaded element to every lane of the vector).

This is cherry-picked from libav commit
ed8d293306e12c9b79022d37d39f48825ce7f2fa.

Signed-off-by: Martin Storsjö <martin at martin.st>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=19a0f9529ccdb48696f0caa251fe36b1d30df739
---

 libavcodec/aarch64/vp9itxfm_neon.S | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index df178d2..e42cc2d 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -255,7 +255,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
         cmp             w3,  #1
         b.ne            1f
         // DC-only for idct/idct
-        ld1r            {v2.4h},  [x2]
+        ld1             {v2.h}[0], [x2]
         smull           v2.4s,  v2.4h, v0.h[0]
         rshrn           v2.4h,  v2.4s, #14
         smull           v2.4s,  v2.4h, v0.h[0]
@@ -287,8 +287,8 @@ function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
 
         \txfm2\()4      v4,  v5,  v6,  v7
 2:
-        ld1r            {v0.2s},   [x0], x1
-        ld1r            {v1.2s},   [x0], x1
+        ld1             {v0.s}[0],   [x0], x1
+        ld1             {v1.s}[0],   [x0], x1
 .ifnc \txfm1,iwht
         srshr           v4.4h,  v4.4h,  #4
         srshr           v5.4h,  v5.4h,  #4
@@ -297,8 +297,8 @@ function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
 .endif
         uaddw           v4.8h,  v4.8h,  v0.8b
         uaddw           v5.8h,  v5.8h,  v1.8b
-        ld1r            {v2.2s},   [x0], x1
-        ld1r            {v3.2s},   [x0], x1
+        ld1             {v2.s}[0],   [x0], x1
+        ld1             {v3.s}[0],   [x0], x1
         sqxtun          v0.8b,  v4.8h
         sqxtun          v1.8b,  v5.8h
         sub             x0,  x0,  x1, lsl #2
@@ -394,7 +394,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
         cmp             w3,  #1
         b.ne            1f
         // DC-only for idct/idct
-        ld1r            {v2.4h},  [x2]
+        ld1             {v2.h}[0],  [x2]
         smull           v2.4s,  v2.4h, v0.h[0]
         rshrn           v2.4h,  v2.4s, #14
         smull           v2.4s,  v2.4h, v0.h[0]
@@ -485,7 +485,7 @@ function idct16x16_dc_add_neon
 
         movi            v1.4h, #0
 
-        ld1r            {v2.4h}, [x2]
+        ld1             {v2.h}[0], [x2]
         smull           v2.4s,  v2.4h, v0.h[0]
         rshrn           v2.4h,  v2.4s, #14
         smull           v2.4s,  v2.4h, v0.h[0]
@@ -1044,7 +1044,7 @@ function idct32x32_dc_add_neon
 
         movi            v1.4h, #0
 
-        ld1r            {v2.4h}, [x2]
+        ld1             {v2.h}[0], [x2]
         smull           v2.4s,  v2.4h,  v0.h[0]
         rshrn           v2.4h,  v2.4s,  #14
         smull           v2.4s,  v2.4h,  v0.h[0]