[FFmpeg-devel] [PATCH 28/33] Move hpel arm optimizations from dsputil to hpeldsp.
Ronald S. Bultje
rsbultje at gmail.com
Wed Feb 6 04:27:41 CET 2013
From: "Ronald S. Bultje" <rsbultje at gmail.com>
---
libavcodec/arm/Makefile | 9 +
libavcodec/arm/dsputil_arm.S | 584 ----------------------------------
libavcodec/arm/dsputil_armv6.S | 238 --------------
libavcodec/arm/dsputil_init_arm.c | 40 ---
libavcodec/arm/dsputil_init_armv6.c | 41 ---
libavcodec/arm/dsputil_init_neon.c | 59 ----
libavcodec/arm/dsputil_neon.S | 388 -----------------------
libavcodec/arm/hpeldsp_arm.S | 611 ++++++++++++++++++++++++++++++++++++
libavcodec/arm/hpeldsp_arm.h | 29 ++
libavcodec/arm/hpeldsp_armv6.S | 259 +++++++++++++++
libavcodec/arm/hpeldsp_init_arm.c | 68 ++++
libavcodec/arm/hpeldsp_init_armv6.c | 66 ++++
libavcodec/arm/hpeldsp_init_neon.c | 86 +++++
libavcodec/arm/hpeldsp_neon.S | 410 ++++++++++++++++++++++++
libavcodec/hpeldsp.c | 2 +-
15 files changed, 1539 insertions(+), 1351 deletions(-)
create mode 100644 libavcodec/arm/hpeldsp_arm.S
create mode 100644 libavcodec/arm/hpeldsp_arm.h
create mode 100644 libavcodec/arm/hpeldsp_armv6.S
create mode 100644 libavcodec/arm/hpeldsp_init_arm.c
create mode 100644 libavcodec/arm/hpeldsp_init_armv6.c
create mode 100644 libavcodec/arm/hpeldsp_init_neon.c
create mode 100644 libavcodec/arm/hpeldsp_neon.S
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 0c372a4..37edd27 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -30,6 +30,9 @@ OBJS-$(CONFIG_H264DSP) += arm/h264dsp_init_arm.o
OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o
+OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_arm.o \
+ arm/hpeldsp_init_arm.o
+
OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_init_arm.o
OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_init_arm.o \
arm/rv40dsp_init_arm.o \
@@ -57,6 +60,9 @@ ARMV6-OBJS += arm/dsputil_init_armv6.o \
arm/dsputil_armv6.o \
arm/simple_idct_armv6.o \
+ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_armv6.o \
+ arm/hpeldsp_init_armv6.o
+
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \
@@ -75,6 +81,9 @@ NEON-OBJS-$(CONFIG_H264PRED) += arm/h264pred_neon.o \
NEON-OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_neon.o \
+NEON-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_neon.o \
+ arm/hpeldsp_init_neon.o
+
NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o
NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/sbrdsp_neon.o \
diff --git a/libavcodec/arm/dsputil_arm.S b/libavcodec/arm/dsputil_arm.S
index 994b440..586a833 100644
--- a/libavcodec/arm/dsputil_arm.S
+++ b/libavcodec/arm/dsputil_arm.S
@@ -26,590 +26,6 @@
#define pld @
#endif
-.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
- mov \Rd0, \Rn0, lsr #(\shift * 8)
- mov \Rd1, \Rn1, lsr #(\shift * 8)
- mov \Rd2, \Rn2, lsr #(\shift * 8)
- mov \Rd3, \Rn3, lsr #(\shift * 8)
- orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
- orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
- orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
- orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
-.endm
-.macro ALIGN_DWORD shift, R0, R1, R2
- mov \R0, \R0, lsr #(\shift * 8)
- orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
- mov \R1, \R1, lsr #(\shift * 8)
- orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
-.endm
-.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
- mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
- mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
- orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
- orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
-.endm
-
-.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
- @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
- @ Rmask = 0xFEFEFEFE
- @ Rn = destroy
- eor \Rd0, \Rn0, \Rm0
- eor \Rd1, \Rn1, \Rm1
- orr \Rn0, \Rn0, \Rm0
- orr \Rn1, \Rn1, \Rm1
- and \Rd0, \Rd0, \Rmask
- and \Rd1, \Rd1, \Rmask
- sub \Rd0, \Rn0, \Rd0, lsr #1
- sub \Rd1, \Rn1, \Rd1, lsr #1
-.endm
-
-.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
- @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
- @ Rmask = 0xFEFEFEFE
- @ Rn = destroy
- eor \Rd0, \Rn0, \Rm0
- eor \Rd1, \Rn1, \Rm1
- and \Rn0, \Rn0, \Rm0
- and \Rn1, \Rn1, \Rm1
- and \Rd0, \Rd0, \Rmask
- and \Rd1, \Rd1, \Rmask
- add \Rd0, \Rn0, \Rd0, lsr #1
- add \Rd1, \Rn1, \Rd1, lsr #1
-.endm
-
-.macro JMP_ALIGN tmp, reg
- ands \tmp, \reg, #3
- bic \reg, \reg, #3
- beq 1f
- subs \tmp, \tmp, #1
- beq 2f
- subs \tmp, \tmp, #1
- beq 3f
- b 4f
-.endm
-
-@ ----------------------------------------------------------------
- .align 5
-function ff_put_pixels16_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- push {r4-r11, lr}
- JMP_ALIGN r5, r1
-1:
- ldm r1, {r4-r7}
- add r1, r1, r2
- stm r0, {r4-r7}
- pld [r1]
- subs r3, r3, #1
- add r0, r0, r2
- bne 1b
- pop {r4-r11, pc}
- .align 5
-2:
- ldm r1, {r4-r8}
- add r1, r1, r2
- ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
- pld [r1]
- subs r3, r3, #1
- stm r0, {r9-r12}
- add r0, r0, r2
- bne 2b
- pop {r4-r11, pc}
- .align 5
-3:
- ldm r1, {r4-r8}
- add r1, r1, r2
- ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
- pld [r1]
- subs r3, r3, #1
- stm r0, {r9-r12}
- add r0, r0, r2
- bne 3b
- pop {r4-r11, pc}
- .align 5
-4:
- ldm r1, {r4-r8}
- add r1, r1, r2
- ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
- pld [r1]
- subs r3, r3, #1
- stm r0, {r9-r12}
- add r0, r0, r2
- bne 4b
- pop {r4-r11,pc}
-endfunc
-
-@ ----------------------------------------------------------------
- .align 5
-function ff_put_pixels8_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- push {r4-r5,lr}
- JMP_ALIGN r5, r1
-1:
- ldm r1, {r4-r5}
- add r1, r1, r2
- subs r3, r3, #1
- pld [r1]
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 1b
- pop {r4-r5,pc}
- .align 5
-2:
- ldm r1, {r4-r5, r12}
- add r1, r1, r2
- ALIGN_DWORD 1, r4, r5, r12
- pld [r1]
- subs r3, r3, #1
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 2b
- pop {r4-r5,pc}
- .align 5
-3:
- ldm r1, {r4-r5, r12}
- add r1, r1, r2
- ALIGN_DWORD 2, r4, r5, r12
- pld [r1]
- subs r3, r3, #1
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 3b
- pop {r4-r5,pc}
- .align 5
-4:
- ldm r1, {r4-r5, r12}
- add r1, r1, r2
- ALIGN_DWORD 3, r4, r5, r12
- pld [r1]
- subs r3, r3, #1
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 4b
- pop {r4-r5,pc}
-endfunc
-
-@ ----------------------------------------------------------------
- .align 5
-function ff_put_pixels8_x2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- push {r4-r10,lr}
- ldr r12, =0xfefefefe
- JMP_ALIGN r5, r1
-1:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
- pld [r1]
- RND_AVG32 r8, r9, r4, r5, r6, r7, r12
- subs r3, r3, #1
- stm r0, {r8-r9}
- add r0, r0, r2
- bne 1b
- pop {r4-r10,pc}
- .align 5
-2:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
- ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
- pld [r1]
- RND_AVG32 r4, r5, r6, r7, r8, r9, r12
- subs r3, r3, #1
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 2b
- pop {r4-r10,pc}
- .align 5
-3:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
- ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
- pld [r1]
- RND_AVG32 r4, r5, r6, r7, r8, r9, r12
- subs r3, r3, #1
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 3b
- pop {r4-r10,pc}
- .align 5
-4:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
- pld [r1]
- RND_AVG32 r8, r9, r6, r7, r5, r10, r12
- subs r3, r3, #1
- stm r0, {r8-r9}
- add r0, r0, r2
- bne 4b
- pop {r4-r10,pc}
-endfunc
-
- .align 5
-function ff_put_no_rnd_pixels8_x2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- push {r4-r10,lr}
- ldr r12, =0xfefefefe
- JMP_ALIGN r5, r1
-1:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
- pld [r1]
- NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
- subs r3, r3, #1
- stm r0, {r8-r9}
- add r0, r0, r2
- bne 1b
- pop {r4-r10,pc}
- .align 5
-2:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
- ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
- pld [r1]
- NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
- subs r3, r3, #1
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 2b
- pop {r4-r10,pc}
- .align 5
-3:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
- ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
- pld [r1]
- NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
- subs r3, r3, #1
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 3b
- pop {r4-r10,pc}
- .align 5
-4:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
- pld [r1]
- NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
- subs r3, r3, #1
- stm r0, {r8-r9}
- add r0, r0, r2
- bne 4b
- pop {r4-r10,pc}
-endfunc
-
-
-@ ----------------------------------------------------------------
- .align 5
-function ff_put_pixels8_y2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- push {r4-r11,lr}
- mov r3, r3, lsr #1
- ldr r12, =0xfefefefe
- JMP_ALIGN r5, r1
-1:
- ldm r1, {r4-r5}
- add r1, r1, r2
-6: ldm r1, {r6-r7}
- add r1, r1, r2
- pld [r1]
- RND_AVG32 r8, r9, r4, r5, r6, r7, r12
- ldm r1, {r4-r5}
- add r1, r1, r2
- stm r0, {r8-r9}
- add r0, r0, r2
- pld [r1]
- RND_AVG32 r8, r9, r6, r7, r4, r5, r12
- subs r3, r3, #1
- stm r0, {r8-r9}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
- .align 5
-2:
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 1, r4, r5, r6
-6: ldm r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 1, r7, r8, r9
- RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 1, r4, r5, r6
- subs r3, r3, #1
- RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
- .align 5
-3:
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 2, r4, r5, r6
-6: ldm r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 2, r7, r8, r9
- RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 2, r4, r5, r6
- subs r3, r3, #1
- RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
- .align 5
-4:
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 3, r4, r5, r6
-6: ldm r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 3, r7, r8, r9
- RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 3, r4, r5, r6
- subs r3, r3, #1
- RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
-endfunc
-
- .align 5
-function ff_put_no_rnd_pixels8_y2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- push {r4-r11,lr}
- mov r3, r3, lsr #1
- ldr r12, =0xfefefefe
- JMP_ALIGN r5, r1
-1:
- ldm r1, {r4-r5}
- add r1, r1, r2
-6: ldm r1, {r6-r7}
- add r1, r1, r2
- pld [r1]
- NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
- ldm r1, {r4-r5}
- add r1, r1, r2
- stm r0, {r8-r9}
- add r0, r0, r2
- pld [r1]
- NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
- subs r3, r3, #1
- stm r0, {r8-r9}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
- .align 5
-2:
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 1, r4, r5, r6
-6: ldm r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 1, r7, r8, r9
- NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 1, r4, r5, r6
- subs r3, r3, #1
- NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
- .align 5
-3:
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 2, r4, r5, r6
-6: ldm r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 2, r7, r8, r9
- NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 2, r4, r5, r6
- subs r3, r3, #1
- NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
- .align 5
-4:
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 3, r4, r5, r6
-6: ldm r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 3, r7, r8, r9
- NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 3, r4, r5, r6
- subs r3, r3, #1
- NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
-endfunc
-
- .ltorg
-
-@ ----------------------------------------------------------------
-.macro RND_XY2_IT align, rnd
- @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
- @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
-.if \align == 0
- ldm r1, {r6-r8}
-.elseif \align == 3
- ldm r1, {r5-r7}
-.else
- ldm r1, {r8-r10}
-.endif
- add r1, r1, r2
- pld [r1]
-.if \align == 0
- ALIGN_DWORD_D 1, r4, r5, r6, r7, r8
-.elseif \align == 1
- ALIGN_DWORD_D 1, r4, r5, r8, r9, r10
- ALIGN_DWORD_D 2, r6, r7, r8, r9, r10
-.elseif \align == 2
- ALIGN_DWORD_D 2, r4, r5, r8, r9, r10
- ALIGN_DWORD_D 3, r6, r7, r8, r9, r10
-.elseif \align == 3
- ALIGN_DWORD_D 3, r4, r5, r5, r6, r7
-.endif
- ldr r14, =0x03030303
- tst r3, #1
- and r8, r4, r14
- and r9, r5, r14
- and r10, r6, r14
- and r11, r7, r14
- it eq
- andeq r14, r14, r14, \rnd #1
- add r8, r8, r10
- add r9, r9, r11
- ldr r12, =0xfcfcfcfc >> 2
- itt eq
- addeq r8, r8, r14
- addeq r9, r9, r14
- and r4, r12, r4, lsr #2
- and r5, r12, r5, lsr #2
- and r6, r12, r6, lsr #2
- and r7, r12, r7, lsr #2
- add r10, r4, r6
- add r11, r5, r7
- subs r3, r3, #1
-.endm
-
-.macro RND_XY2_EXPAND align, rnd
- RND_XY2_IT \align, \rnd
-6: push {r8-r11}
- RND_XY2_IT \align, \rnd
- pop {r4-r7}
- add r4, r4, r8
- add r5, r5, r9
- ldr r14, =0x0f0f0f0f
- add r6, r6, r10
- add r7, r7, r11
- and r4, r14, r4, lsr #2
- and r5, r14, r5, lsr #2
- add r4, r4, r6
- add r5, r5, r7
- stm r0, {r4-r5}
- add r0, r0, r2
- bge 6b
- pop {r4-r11,pc}
-.endm
-
- .align 5
-function ff_put_pixels8_xy2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- push {r4-r11,lr} @ R14 is also called LR
- JMP_ALIGN r5, r1
-1: RND_XY2_EXPAND 0, lsl
- .align 5
-2: RND_XY2_EXPAND 1, lsl
- .align 5
-3: RND_XY2_EXPAND 2, lsl
- .align 5
-4: RND_XY2_EXPAND 3, lsl
-endfunc
-
- .align 5
-function ff_put_no_rnd_pixels8_xy2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixles = unaligned
- pld [r1]
- push {r4-r11,lr}
- JMP_ALIGN r5, r1
-1: RND_XY2_EXPAND 0, lsr
- .align 5
-2: RND_XY2_EXPAND 1, lsr
- .align 5
-3: RND_XY2_EXPAND 2, lsr
- .align 5
-4: RND_XY2_EXPAND 3, lsr
-endfunc
-
.align 5
@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
function ff_add_pixels_clamped_arm, export=1
diff --git a/libavcodec/arm/dsputil_armv6.S b/libavcodec/arm/dsputil_armv6.S
index 0c54b52..6ec238b 100644
--- a/libavcodec/arm/dsputil_armv6.S
+++ b/libavcodec/arm/dsputil_armv6.S
@@ -20,244 +20,6 @@
#include "libavutil/arm/asm.S"
-.macro call_2x_pixels type, subp
-function ff_\type\()_pixels16\subp\()_armv6, export=1
- push {r0-r3, lr}
- bl ff_\type\()_pixels8\subp\()_armv6
- pop {r0-r3, lr}
- add r0, r0, #8
- add r1, r1, #8
- b ff_\type\()_pixels8\subp\()_armv6
-endfunc
-.endm
-
-call_2x_pixels avg
-call_2x_pixels put, _x2
-call_2x_pixels put, _y2
-call_2x_pixels put, _x2_no_rnd
-call_2x_pixels put, _y2_no_rnd
-
-function ff_put_pixels16_armv6, export=1
- push {r4-r11}
-1:
- ldr r5, [r1, #4]
- ldr r6, [r1, #8]
- ldr r7, [r1, #12]
- ldr_post r4, r1, r2
- strd r6, r7, [r0, #8]
- ldr r9, [r1, #4]
- strd_post r4, r5, r0, r2
- ldr r10, [r1, #8]
- ldr r11, [r1, #12]
- ldr_post r8, r1, r2
- strd r10, r11, [r0, #8]
- subs r3, r3, #2
- strd_post r8, r9, r0, r2
- bne 1b
-
- pop {r4-r11}
- bx lr
-endfunc
-
-function ff_put_pixels8_armv6, export=1
- push {r4-r7}
-1:
- ldr r5, [r1, #4]
- ldr_post r4, r1, r2
- ldr r7, [r1, #4]
- strd_post r4, r5, r0, r2
- ldr_post r6, r1, r2
- subs r3, r3, #2
- strd_post r6, r7, r0, r2
- bne 1b
-
- pop {r4-r7}
- bx lr
-endfunc
-
-function ff_put_pixels8_x2_armv6, export=1
- push {r4-r11, lr}
- mov r12, #1
- orr r12, r12, r12, lsl #8
- orr r12, r12, r12, lsl #16
-1:
- ldr r4, [r1]
- subs r3, r3, #2
- ldr r5, [r1, #4]
- ldr r7, [r1, #5]
- lsr r6, r4, #8
- ldr_pre r8, r1, r2
- orr r6, r6, r5, lsl #24
- ldr r9, [r1, #4]
- ldr r11, [r1, #5]
- lsr r10, r8, #8
- add r1, r1, r2
- orr r10, r10, r9, lsl #24
- eor r14, r4, r6
- uhadd8 r4, r4, r6
- eor r6, r5, r7
- uhadd8 r5, r5, r7
- and r14, r14, r12
- and r6, r6, r12
- uadd8 r4, r4, r14
- eor r14, r8, r10
- uadd8 r5, r5, r6
- eor r6, r9, r11
- uhadd8 r8, r8, r10
- and r14, r14, r12
- uhadd8 r9, r9, r11
- and r6, r6, r12
- uadd8 r8, r8, r14
- strd_post r4, r5, r0, r2
- uadd8 r9, r9, r6
- strd_post r8, r9, r0, r2
- bne 1b
-
- pop {r4-r11, pc}
-endfunc
-
-function ff_put_pixels8_y2_armv6, export=1
- push {r4-r11}
- mov r12, #1
- orr r12, r12, r12, lsl #8
- orr r12, r12, r12, lsl #16
- ldr r4, [r1]
- ldr r5, [r1, #4]
- ldr_pre r6, r1, r2
- ldr r7, [r1, #4]
-1:
- subs r3, r3, #2
- uhadd8 r8, r4, r6
- eor r10, r4, r6
- uhadd8 r9, r5, r7
- eor r11, r5, r7
- and r10, r10, r12
- ldr_pre r4, r1, r2
- uadd8 r8, r8, r10
- and r11, r11, r12
- uadd8 r9, r9, r11
- ldr r5, [r1, #4]
- uhadd8 r10, r4, r6
- eor r6, r4, r6
- uhadd8 r11, r5, r7
- and r6, r6, r12
- eor r7, r5, r7
- uadd8 r10, r10, r6
- and r7, r7, r12
- ldr_pre r6, r1, r2
- uadd8 r11, r11, r7
- strd_post r8, r9, r0, r2
- ldr r7, [r1, #4]
- strd_post r10, r11, r0, r2
- bne 1b
-
- pop {r4-r11}
- bx lr
-endfunc
-
-function ff_put_pixels8_x2_no_rnd_armv6, export=1
- push {r4-r9, lr}
-1:
- subs r3, r3, #2
- ldr r4, [r1]
- ldr r5, [r1, #4]
- ldr r7, [r1, #5]
- ldr_pre r8, r1, r2
- ldr r9, [r1, #4]
- ldr r14, [r1, #5]
- add r1, r1, r2
- lsr r6, r4, #8
- orr r6, r6, r5, lsl #24
- lsr r12, r8, #8
- orr r12, r12, r9, lsl #24
- uhadd8 r4, r4, r6
- uhadd8 r5, r5, r7
- uhadd8 r8, r8, r12
- uhadd8 r9, r9, r14
- stm r0, {r4,r5}
- add r0, r0, r2
- stm r0, {r8,r9}
- add r0, r0, r2
- bne 1b
-
- pop {r4-r9, pc}
-endfunc
-
-function ff_put_pixels8_y2_no_rnd_armv6, export=1
- push {r4-r9, lr}
- ldr r4, [r1]
- ldr r5, [r1, #4]
- ldr_pre r6, r1, r2
- ldr r7, [r1, #4]
-1:
- subs r3, r3, #2
- uhadd8 r8, r4, r6
- ldr_pre r4, r1, r2
- uhadd8 r9, r5, r7
- ldr r5, [r1, #4]
- uhadd8 r12, r4, r6
- ldr_pre r6, r1, r2
- uhadd8 r14, r5, r7
- ldr r7, [r1, #4]
- stm r0, {r8,r9}
- add r0, r0, r2
- stm r0, {r12,r14}
- add r0, r0, r2
- bne 1b
-
- pop {r4-r9, pc}
-endfunc
-
-function ff_avg_pixels8_armv6, export=1
- pld [r1, r2]
- push {r4-r10, lr}
- mov lr, #1
- orr lr, lr, lr, lsl #8
- orr lr, lr, lr, lsl #16
- ldrd r4, r5, [r0]
- ldr r10, [r1, #4]
- ldr_post r9, r1, r2
- subs r3, r3, #2
-1:
- pld [r1, r2]
- eor r8, r4, r9
- uhadd8 r4, r4, r9
- eor r12, r5, r10
- ldrd_reg r6, r7, r0, r2
- uhadd8 r5, r5, r10
- and r8, r8, lr
- ldr r10, [r1, #4]
- and r12, r12, lr
- uadd8 r4, r4, r8
- ldr_post r9, r1, r2
- eor r8, r6, r9
- uadd8 r5, r5, r12
- pld [r1, r2, lsl #1]
- eor r12, r7, r10
- uhadd8 r6, r6, r9
- strd_post r4, r5, r0, r2
- uhadd8 r7, r7, r10
- beq 2f
- and r8, r8, lr
- ldrd_reg r4, r5, r0, r2
- uadd8 r6, r6, r8
- ldr r10, [r1, #4]
- and r12, r12, lr
- subs r3, r3, #2
- uadd8 r7, r7, r12
- ldr_post r9, r1, r2
- strd_post r6, r7, r0, r2
- b 1b
-2:
- and r8, r8, lr
- and r12, r12, lr
- uadd8 r6, r6, r8
- uadd8 r7, r7, r12
- strd_post r6, r7, r0, r2
-
- pop {r4-r10, pc}
-endfunc
-
function ff_add_pixels_clamped_armv6, export=1
push {r4-r8,lr}
mov r3, #8
diff --git a/libavcodec/arm/dsputil_init_arm.c b/libavcodec/arm/dsputil_init_arm.c
index b3abed1..cb5133d 100644
--- a/libavcodec/arm/dsputil_init_arm.c
+++ b/libavcodec/arm/dsputil_init_arm.c
@@ -21,7 +21,6 @@
#include "libavutil/arm/cpu.h"
#include "libavcodec/dsputil.h"
-#include "libavcodec/bit_depth_template.c" // for CALL_2X_PIXELS
#include "dsputil_arm.h"
void ff_j_rev_dct_arm(int16_t *data);
@@ -31,24 +30,6 @@ void ff_simple_idct_arm(int16_t *data);
static void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
static void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
-void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-
-void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-
-void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
-
-CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8)
-CALL_2X_PIXELS(ff_put_pixels16_y2_arm, ff_put_pixels8_y2_arm, 8)
-CALL_2X_PIXELS(ff_put_pixels16_xy2_arm, ff_put_pixels8_xy2_arm, 8)
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm, ff_put_no_rnd_pixels8_x2_arm, 8)
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm, ff_put_no_rnd_pixels8_y2_arm, 8)
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8)
-
void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
int line_size);
@@ -77,7 +58,6 @@ static void simple_idct_arm_add(uint8_t *dest, int line_size, int16_t *block)
void ff_dsputil_init_arm(DSPContext* c, AVCodecContext *avctx)
{
- const int high_bit_depth = avctx->bits_per_raw_sample > 8;
int cpu_flags = av_get_cpu_flags();
ff_put_pixels_clamped = c->put_pixels_clamped;
@@ -100,26 +80,6 @@ void ff_dsputil_init_arm(DSPContext* c, AVCodecContext *avctx)
c->add_pixels_clamped = ff_add_pixels_clamped_arm;
- if (!high_bit_depth) {
- c->put_pixels_tab[0][0] = ff_put_pixels16_arm;
- c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
- c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
- c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
- c->put_pixels_tab[1][0] = ff_put_pixels8_arm;
- c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
- c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
- c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;
-
- c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm;
- c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
- c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
- c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
- c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm;
- c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
- c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
- c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;
- }
-
if (have_armv5te(cpu_flags)) ff_dsputil_init_armv5te(c, avctx);
if (have_armv6(cpu_flags)) ff_dsputil_init_armv6(c, avctx);
if (have_neon(cpu_flags)) ff_dsputil_init_neon(c, avctx);
diff --git a/libavcodec/arm/dsputil_init_armv6.c b/libavcodec/arm/dsputil_init_armv6.c
index dc126fe..6ec3ed7 100644
--- a/libavcodec/arm/dsputil_init_armv6.c
+++ b/libavcodec/arm/dsputil_init_armv6.c
@@ -28,24 +28,6 @@ void ff_simple_idct_armv6(int16_t *data);
void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data);
void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data);
-void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, int, int);
-
-void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
-
-void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, int, int);
-
-void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, int, int);
-
-void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
-
-void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, int, int);
-
void ff_add_pixels_clamped_armv6(const int16_t *block,
uint8_t *restrict pixels,
int line_size);
@@ -83,29 +65,6 @@ av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx)
c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
}
- if (!high_bit_depth) {
- c->put_pixels_tab[0][0] = ff_put_pixels16_armv6;
- c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
- c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
-/* c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */
- c->put_pixels_tab[1][0] = ff_put_pixels8_armv6;
- c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
- c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
-/* c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */
-
- c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6;
- c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
- c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
-/* c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */
- c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6;
- c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
- c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
-/* c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */
-
- c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6;
- c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
- }
-
if (!high_bit_depth)
c->get_pixels = ff_get_pixels_armv6;
c->add_pixels_clamped = ff_add_pixels_clamped_armv6;
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 3050e8c..1c70bd7 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -32,33 +32,6 @@ void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
void ff_clear_block_neon(int16_t *block);
void ff_clear_blocks_neon(int16_t *blocks);
-void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-
-void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int);
-void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, int, int);
-void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int);
-void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
-
void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);
@@ -100,38 +73,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
if (!high_bit_depth) {
c->clear_block = ff_clear_block_neon;
c->clear_blocks = ff_clear_blocks_neon;
-
- c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
- c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
- c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
- c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
- c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
- c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
- c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
- c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
-
- c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
- c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
- c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
- c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
- c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
- c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
- c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
- c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
-
- c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
- c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
- c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
- c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
- c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
- c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
- c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
- c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
-
- c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
- c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
- c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
- c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
}
c->add_pixels_clamped = ff_add_pixels_clamped_neon;
diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S
index 89d3643..307e122 100644
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -37,394 +37,6 @@ function ff_clear_blocks_neon, export=1
bx lr
endfunc
-.macro pixels16 rnd=1, avg=0
- .if \avg
- mov r12, r0
- .endif
-1: vld1.8 {q0}, [r1], r2
- vld1.8 {q1}, [r1], r2
- vld1.8 {q2}, [r1], r2
- pld [r1, r2, lsl #2]
- vld1.8 {q3}, [r1], r2
- pld [r1]
- pld [r1, r2]
- pld [r1, r2, lsl #1]
- .if \avg
- vld1.8 {q8}, [r12,:128], r2
- vrhadd.u8 q0, q0, q8
- vld1.8 {q9}, [r12,:128], r2
- vrhadd.u8 q1, q1, q9
- vld1.8 {q10}, [r12,:128], r2
- vrhadd.u8 q2, q2, q10
- vld1.8 {q11}, [r12,:128], r2
- vrhadd.u8 q3, q3, q11
- .endif
- subs r3, r3, #4
- vst1.64 {q0}, [r0,:128], r2
- vst1.64 {q1}, [r0,:128], r2
- vst1.64 {q2}, [r0,:128], r2
- vst1.64 {q3}, [r0,:128], r2
- bne 1b
- bx lr
-.endm
-
-.macro pixels16_x2 rnd=1, avg=0
-1: vld1.8 {d0-d2}, [r1], r2
- vld1.8 {d4-d6}, [r1], r2
- pld [r1]
- pld [r1, r2]
- subs r3, r3, #2
- vext.8 q1, q0, q1, #1
- avg q0, q0, q1
- vext.8 q3, q2, q3, #1
- avg q2, q2, q3
- .if \avg
- vld1.8 {q1}, [r0,:128], r2
- vld1.8 {q3}, [r0,:128]
- vrhadd.u8 q0, q0, q1
- vrhadd.u8 q2, q2, q3
- sub r0, r0, r2
- .endif
- vst1.8 {q0}, [r0,:128], r2
- vst1.8 {q2}, [r0,:128], r2
- bne 1b
- bx lr
-.endm
-
-.macro pixels16_y2 rnd=1, avg=0
- sub r3, r3, #2
- vld1.8 {q0}, [r1], r2
- vld1.8 {q1}, [r1], r2
-1: subs r3, r3, #2
- avg q2, q0, q1
- vld1.8 {q0}, [r1], r2
- avg q3, q0, q1
- vld1.8 {q1}, [r1], r2
- pld [r1]
- pld [r1, r2]
- .if \avg
- vld1.8 {q8}, [r0,:128], r2
- vld1.8 {q9}, [r0,:128]
- vrhadd.u8 q2, q2, q8
- vrhadd.u8 q3, q3, q9
- sub r0, r0, r2
- .endif
- vst1.8 {q2}, [r0,:128], r2
- vst1.8 {q3}, [r0,:128], r2
- bne 1b
-
- avg q2, q0, q1
- vld1.8 {q0}, [r1], r2
- avg q3, q0, q1
- .if \avg
- vld1.8 {q8}, [r0,:128], r2
- vld1.8 {q9}, [r0,:128]
- vrhadd.u8 q2, q2, q8
- vrhadd.u8 q3, q3, q9
- sub r0, r0, r2
- .endif
- vst1.8 {q2}, [r0,:128], r2
- vst1.8 {q3}, [r0,:128], r2
-
- bx lr
-.endm
-
-.macro pixels16_xy2 rnd=1, avg=0
- sub r3, r3, #2
- vld1.8 {d0-d2}, [r1], r2
- vld1.8 {d4-d6}, [r1], r2
-NRND vmov.i16 q13, #1
- pld [r1]
- pld [r1, r2]
- vext.8 q1, q0, q1, #1
- vext.8 q3, q2, q3, #1
- vaddl.u8 q8, d0, d2
- vaddl.u8 q10, d1, d3
- vaddl.u8 q9, d4, d6
- vaddl.u8 q11, d5, d7
-1: subs r3, r3, #2
- vld1.8 {d0-d2}, [r1], r2
- vadd.u16 q12, q8, q9
- pld [r1]
-NRND vadd.u16 q12, q12, q13
- vext.8 q15, q0, q1, #1
- vadd.u16 q1 , q10, q11
- shrn d28, q12, #2
-NRND vadd.u16 q1, q1, q13
- shrn d29, q1, #2
- .if \avg
- vld1.8 {q8}, [r0,:128]
- vrhadd.u8 q14, q14, q8
- .endif
- vaddl.u8 q8, d0, d30
- vld1.8 {d2-d4}, [r1], r2
- vaddl.u8 q10, d1, d31
- vst1.8 {q14}, [r0,:128], r2
- vadd.u16 q12, q8, q9
- pld [r1, r2]
-NRND vadd.u16 q12, q12, q13
- vext.8 q2, q1, q2, #1
- vadd.u16 q0, q10, q11
- shrn d30, q12, #2
-NRND vadd.u16 q0, q0, q13
- shrn d31, q0, #2
- .if \avg
- vld1.8 {q9}, [r0,:128]
- vrhadd.u8 q15, q15, q9
- .endif
- vaddl.u8 q9, d2, d4
- vaddl.u8 q11, d3, d5
- vst1.8 {q15}, [r0,:128], r2
- bgt 1b
-
- vld1.8 {d0-d2}, [r1], r2
- vadd.u16 q12, q8, q9
-NRND vadd.u16 q12, q12, q13
- vext.8 q15, q0, q1, #1
- vadd.u16 q1 , q10, q11
- shrn d28, q12, #2
-NRND vadd.u16 q1, q1, q13
- shrn d29, q1, #2
- .if \avg
- vld1.8 {q8}, [r0,:128]
- vrhadd.u8 q14, q14, q8
- .endif
- vaddl.u8 q8, d0, d30
- vaddl.u8 q10, d1, d31
- vst1.8 {q14}, [r0,:128], r2
- vadd.u16 q12, q8, q9
-NRND vadd.u16 q12, q12, q13
- vadd.u16 q0, q10, q11
- shrn d30, q12, #2
-NRND vadd.u16 q0, q0, q13
- shrn d31, q0, #2
- .if \avg
- vld1.8 {q9}, [r0,:128]
- vrhadd.u8 q15, q15, q9
- .endif
- vst1.8 {q15}, [r0,:128], r2
-
- bx lr
-.endm
-
-.macro pixels8 rnd=1, avg=0
-1: vld1.8 {d0}, [r1], r2
- vld1.8 {d1}, [r1], r2
- vld1.8 {d2}, [r1], r2
- pld [r1, r2, lsl #2]
- vld1.8 {d3}, [r1], r2
- pld [r1]
- pld [r1, r2]
- pld [r1, r2, lsl #1]
- .if \avg
- vld1.8 {d4}, [r0,:64], r2
- vrhadd.u8 d0, d0, d4
- vld1.8 {d5}, [r0,:64], r2
- vrhadd.u8 d1, d1, d5
- vld1.8 {d6}, [r0,:64], r2
- vrhadd.u8 d2, d2, d6
- vld1.8 {d7}, [r0,:64], r2
- vrhadd.u8 d3, d3, d7
- sub r0, r0, r2, lsl #2
- .endif
- subs r3, r3, #4
- vst1.8 {d0}, [r0,:64], r2
- vst1.8 {d1}, [r0,:64], r2
- vst1.8 {d2}, [r0,:64], r2
- vst1.8 {d3}, [r0,:64], r2
- bne 1b
- bx lr
-.endm
-
-.macro pixels8_x2 rnd=1, avg=0
-1: vld1.8 {q0}, [r1], r2
- vext.8 d1, d0, d1, #1
- vld1.8 {q1}, [r1], r2
- vext.8 d3, d2, d3, #1
- pld [r1]
- pld [r1, r2]
- subs r3, r3, #2
- vswp d1, d2
- avg q0, q0, q1
- .if \avg
- vld1.8 {d4}, [r0,:64], r2
- vld1.8 {d5}, [r0,:64]
- vrhadd.u8 q0, q0, q2
- sub r0, r0, r2
- .endif
- vst1.8 {d0}, [r0,:64], r2
- vst1.8 {d1}, [r0,:64], r2
- bne 1b
- bx lr
-.endm
-
-.macro pixels8_y2 rnd=1, avg=0
- sub r3, r3, #2
- vld1.8 {d0}, [r1], r2
- vld1.8 {d1}, [r1], r2
-1: subs r3, r3, #2
- avg d4, d0, d1
- vld1.8 {d0}, [r1], r2
- avg d5, d0, d1
- vld1.8 {d1}, [r1], r2
- pld [r1]
- pld [r1, r2]
- .if \avg
- vld1.8 {d2}, [r0,:64], r2
- vld1.8 {d3}, [r0,:64]
- vrhadd.u8 q2, q2, q1
- sub r0, r0, r2
- .endif
- vst1.8 {d4}, [r0,:64], r2
- vst1.8 {d5}, [r0,:64], r2
- bne 1b
-
- avg d4, d0, d1
- vld1.8 {d0}, [r1], r2
- avg d5, d0, d1
- .if \avg
- vld1.8 {d2}, [r0,:64], r2
- vld1.8 {d3}, [r0,:64]
- vrhadd.u8 q2, q2, q1
- sub r0, r0, r2
- .endif
- vst1.8 {d4}, [r0,:64], r2
- vst1.8 {d5}, [r0,:64], r2
-
- bx lr
-.endm
-
-.macro pixels8_xy2 rnd=1, avg=0
- sub r3, r3, #2
- vld1.8 {q0}, [r1], r2
- vld1.8 {q1}, [r1], r2
-NRND vmov.i16 q11, #1
- pld [r1]
- pld [r1, r2]
- vext.8 d4, d0, d1, #1
- vext.8 d6, d2, d3, #1
- vaddl.u8 q8, d0, d4
- vaddl.u8 q9, d2, d6
-1: subs r3, r3, #2
- vld1.8 {q0}, [r1], r2
- pld [r1]
- vadd.u16 q10, q8, q9
- vext.8 d4, d0, d1, #1
-NRND vadd.u16 q10, q10, q11
- vaddl.u8 q8, d0, d4
- shrn d5, q10, #2
- vld1.8 {q1}, [r1], r2
- vadd.u16 q10, q8, q9
- pld [r1, r2]
- .if \avg
- vld1.8 {d7}, [r0,:64]
- vrhadd.u8 d5, d5, d7
- .endif
-NRND vadd.u16 q10, q10, q11
- vst1.8 {d5}, [r0,:64], r2
- shrn d7, q10, #2
- .if \avg
- vld1.8 {d5}, [r0,:64]
- vrhadd.u8 d7, d7, d5
- .endif
- vext.8 d6, d2, d3, #1
- vaddl.u8 q9, d2, d6
- vst1.8 {d7}, [r0,:64], r2
- bgt 1b
-
- vld1.8 {q0}, [r1], r2
- vadd.u16 q10, q8, q9
- vext.8 d4, d0, d1, #1
-NRND vadd.u16 q10, q10, q11
- vaddl.u8 q8, d0, d4
- shrn d5, q10, #2
- vadd.u16 q10, q8, q9
- .if \avg
- vld1.8 {d7}, [r0,:64]
- vrhadd.u8 d5, d5, d7
- .endif
-NRND vadd.u16 q10, q10, q11
- vst1.8 {d5}, [r0,:64], r2
- shrn d7, q10, #2
- .if \avg
- vld1.8 {d5}, [r0,:64]
- vrhadd.u8 d7, d7, d5
- .endif
- vst1.8 {d7}, [r0,:64], r2
-
- bx lr
-.endm
-
-.macro pixfunc pfx, name, suf, rnd=1, avg=0
- .if \rnd
- .macro avg rd, rn, rm
- vrhadd.u8 \rd, \rn, \rm
- .endm
- .macro shrn rd, rn, rm
- vrshrn.u16 \rd, \rn, \rm
- .endm
- .macro NRND insn:vararg
- .endm
- .else
- .macro avg rd, rn, rm
- vhadd.u8 \rd, \rn, \rm
- .endm
- .macro shrn rd, rn, rm
- vshrn.u16 \rd, \rn, \rm
- .endm
- .macro NRND insn:vararg
- \insn
- .endm
- .endif
-function ff_\pfx\name\suf\()_neon, export=1
- \name \rnd, \avg
-endfunc
- .purgem avg
- .purgem shrn
- .purgem NRND
-.endm
-
-.macro pixfunc2 pfx, name, avg=0
- pixfunc \pfx, \name, rnd=1, avg=\avg
- pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
-.endm
-
-function ff_put_h264_qpel16_mc00_neon, export=1
- mov r3, #16
-endfunc
-
- pixfunc put_, pixels16, avg=0
- pixfunc2 put_, pixels16_x2, avg=0
- pixfunc2 put_, pixels16_y2, avg=0
- pixfunc2 put_, pixels16_xy2, avg=0
-
-function ff_avg_h264_qpel16_mc00_neon, export=1
- mov r3, #16
-endfunc
-
- pixfunc avg_, pixels16, avg=1
- pixfunc2 avg_, pixels16_x2, avg=1
- pixfunc2 avg_, pixels16_y2, avg=1
- pixfunc2 avg_, pixels16_xy2, avg=1
-
-function ff_put_h264_qpel8_mc00_neon, export=1
- mov r3, #8
-endfunc
-
- pixfunc put_, pixels8, avg=0
- pixfunc2 put_, pixels8_x2, avg=0
- pixfunc2 put_, pixels8_y2, avg=0
- pixfunc2 put_, pixels8_xy2, avg=0
-
-function ff_avg_h264_qpel8_mc00_neon, export=1
- mov r3, #8
-endfunc
-
- pixfunc avg_, pixels8, avg=1
- pixfunc avg_, pixels8_x2, avg=1
- pixfunc avg_, pixels8_y2, avg=1
- pixfunc avg_, pixels8_xy2, avg=1
-
function ff_put_pixels_clamped_neon, export=1
vld1.16 {d16-d19}, [r0,:128]!
vqmovun.s16 d0, q8
diff --git a/libavcodec/arm/hpeldsp_arm.S b/libavcodec/arm/hpeldsp_arm.S
new file mode 100644
index 0000000..2f3d311
--- /dev/null
+++ b/libavcodec/arm/hpeldsp_arm.S
@@ -0,0 +1,611 @@
+@
+@ ARMv4 optimized DSP utils
+@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
+@
+@ This file is part of FFmpeg.
+@
+@ FFmpeg is free software; you can redistribute it and/or
+@ modify it under the terms of the GNU Lesser General Public
+@ License as published by the Free Software Foundation; either
+@ version 2.1 of the License, or (at your option) any later version.
+@
+@ FFmpeg is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+@ Lesser General Public License for more details.
+@
+@ You should have received a copy of the GNU Lesser General Public
+@ License along with FFmpeg; if not, write to the Free Software
+@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+@
+
+#include "config.h"
+#include "libavutil/arm/asm.S"
+
+#if !HAVE_ARMV5TE_EXTERNAL
+#define pld @
+#endif
+
+.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
+ mov \Rd0, \Rn0, lsr #(\shift * 8) @ Rd[i] = Rn[i] >> (8 * shift); the five Rn inputs are only read
+ mov \Rd1, \Rn1, lsr #(\shift * 8)
+ mov \Rd2, \Rn2, lsr #(\shift * 8)
+ mov \Rd3, \Rn3, lsr #(\shift * 8)
+ orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) @ Rd[i] |= Rn[i+1] << (32 - 8 * shift): fill the vacated top bits
+ orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
+ orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
+ orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) @ the fifth source word only feeds the last output
+.endm
+.macro ALIGN_DWORD shift, R0, R1, R2
+ mov \R0, \R0, lsr #(\shift * 8) @ in-place variant: R0 and R1 are overwritten, R2 is only read
+ orr \R0, \R0, \R1, lsl #(32 - \shift * 8) @ reads the original R1, so it must precede the shift of R1 below
+ mov \R1, \R1, lsr #(\shift * 8)
+ orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
+.endm
+.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
+ mov \Rdst0, \Rsrc0, lsr #(\shift * 8) @ last read of Rsrc0, so Rdst1 may be the same register as Rsrc0
+ mov \Rdst1, \Rsrc1, lsr #(\shift * 8) @ Rdst[i] = Rsrc[i] >> (8 * shift)
+ orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) @ Rdst[i] |= Rsrc[i+1] << (32 - 8 * shift)
+ orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
+.endm
+
+.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
+ @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1), i.e. the per-byte average rounded up
+ @ Rmask must hold 0xFEFEFEFE (drops bit 0 of every byte so the shift cannot carry into the byte below)
+ @ Rn0/Rn1 are clobbered; Rm0/Rm1 and Rmask are only read
+ eor \Rd0, \Rn0, \Rm0
+ eor \Rd1, \Rn1, \Rm1
+ orr \Rn0, \Rn0, \Rm0
+ orr \Rn1, \Rn1, \Rm1
+ and \Rd0, \Rd0, \Rmask
+ and \Rd1, \Rd1, \Rmask
+ sub \Rd0, \Rn0, \Rd0, lsr #1
+ sub \Rd1, \Rn1, \Rd1, lsr #1
+.endm
+
+.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
+ @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1), i.e. the per-byte average rounded down
+ @ Rmask must hold 0xFEFEFEFE (drops bit 0 of every byte so the shift cannot carry into the byte below)
+ @ Rn0/Rn1 are clobbered; Rm0/Rm1 and Rmask are only read
+ eor \Rd0, \Rn0, \Rm0
+ eor \Rd1, \Rn1, \Rm1
+ and \Rn0, \Rn0, \Rm0
+ and \Rn1, \Rn1, \Rm1
+ and \Rd0, \Rd0, \Rmask
+ and \Rd1, \Rd1, \Rmask
+ add \Rd0, \Rn0, \Rd0, lsr #1
+ add \Rd1, \Rn1, \Rd1, lsr #1
+.endm
+
+.macro JMP_ALIGN tmp, reg
+ ands \tmp, \reg, #3 @ tmp = low two bits of reg; Z is set when they are zero
+ bic \reg, \reg, #3 @ round reg down to a multiple of 4 (flags are left untouched)
+ beq 1f @ low bits 0: go to local label 1 after the macro invocation
+ subs \tmp, \tmp, #1
+ beq 2f @ low bits 1: label 2
+ subs \tmp, \tmp, #1
+ beq 3f @ low bits 2: label 3
+ b 4f @ low bits 3: label 4
+.endm
+
+@ ----------------------------------------------------------------
+ .align 5
+function ff_put_pixels16_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r11, lr}
+ JMP_ALIGN r5, r1
+1:
+ ldm r1, {r4-r7}
+ add r1, r1, r2
+ stm r0, {r4-r7}
+ pld [r1]
+ subs r3, r3, #1
+ add r0, r0, r2
+ bne 1b
+ pop {r4-r11, pc}
+ .align 5
+2:
+ ldm r1, {r4-r8}
+ add r1, r1, r2
+ ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
+ pld [r1]
+ subs r3, r3, #1
+ stm r0, {r9-r12}
+ add r0, r0, r2
+ bne 2b
+ pop {r4-r11, pc}
+ .align 5
+3:
+ ldm r1, {r4-r8}
+ add r1, r1, r2
+ ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
+ pld [r1]
+ subs r3, r3, #1
+ stm r0, {r9-r12}
+ add r0, r0, r2
+ bne 3b
+ pop {r4-r11, pc}
+ .align 5
+4:
+ ldm r1, {r4-r8}
+ add r1, r1, r2
+ ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
+ pld [r1]
+ subs r3, r3, #1
+ stm r0, {r9-r12}
+ add r0, r0, r2
+ bne 4b
+ pop {r4-r11,pc}
+endfunc
+
+@ ----------------------------------------------------------------
+ .align 5
+function ff_put_pixels8_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r5,lr}
+ JMP_ALIGN r5, r1
+1:
+ ldm r1, {r4-r5}
+ add r1, r1, r2
+ subs r3, r3, #1
+ pld [r1]
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 1b
+ pop {r4-r5,pc}
+ .align 5
+2:
+ ldm r1, {r4-r5, r12}
+ add r1, r1, r2
+ ALIGN_DWORD 1, r4, r5, r12
+ pld [r1]
+ subs r3, r3, #1
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 2b
+ pop {r4-r5,pc}
+ .align 5
+3:
+ ldm r1, {r4-r5, r12}
+ add r1, r1, r2
+ ALIGN_DWORD 2, r4, r5, r12
+ pld [r1]
+ subs r3, r3, #1
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 3b
+ pop {r4-r5,pc}
+ .align 5
+4:
+ ldm r1, {r4-r5, r12}
+ add r1, r1, r2
+ ALIGN_DWORD 3, r4, r5, r12
+ pld [r1]
+ subs r3, r3, #1
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 4b
+ pop {r4-r5,pc}
+endfunc
+
+@ ----------------------------------------------------------------
+ .align 5
+function ff_put_pixels8_x2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r10,lr}
+ ldr r12, =0xfefefefe
+ JMP_ALIGN r5, r1
+1:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ subs r3, r3, #1
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ bne 1b
+ pop {r4-r10,pc}
+ .align 5
+2:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
+ ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 2b
+ pop {r4-r10,pc}
+ .align 5
+3:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
+ ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 3b
+ pop {r4-r10,pc}
+ .align 5
+4:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r8, r9, r6, r7, r5, r10, r12
+ subs r3, r3, #1
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ bne 4b
+ pop {r4-r10,pc}
+endfunc
+
+ .align 5
+function ff_put_no_rnd_pixels8_x2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r10,lr}
+ ldr r12, =0xfefefefe
+ JMP_ALIGN r5, r1
+1:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ subs r3, r3, #1
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ bne 1b
+ pop {r4-r10,pc}
+ .align 5
+2:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
+ ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 2b
+ pop {r4-r10,pc}
+ .align 5
+3:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
+ ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 3b
+ pop {r4-r10,pc}
+ .align 5
+4:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
+ subs r3, r3, #1
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ bne 4b
+ pop {r4-r10,pc}
+endfunc
+
+
+@ ----------------------------------------------------------------
+ .align 5
+function ff_put_pixels8_y2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r11,lr}
+ mov r3, r3, lsr #1
+ ldr r12, =0xfefefefe
+ JMP_ALIGN r5, r1
+1:
+ ldm r1, {r4-r5}
+ add r1, r1, r2
+6: ldm r1, {r6-r7}
+ add r1, r1, r2
+ pld [r1]
+ RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ ldm r1, {r4-r5}
+ add r1, r1, r2
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ pld [r1]
+ RND_AVG32 r8, r9, r6, r7, r4, r5, r12
+ subs r3, r3, #1
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+ .align 5
+2:
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 1, r4, r5, r6
+6: ldm r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 1, r7, r8, r9
+ RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 1, r4, r5, r6
+ subs r3, r3, #1
+ RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+ .align 5
+3:
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 2, r4, r5, r6
+6: ldm r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 2, r7, r8, r9
+ RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 2, r4, r5, r6
+ subs r3, r3, #1
+ RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+ .align 5
+4:
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 3, r4, r5, r6
+6: ldm r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 3, r7, r8, r9
+ RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 3, r4, r5, r6
+ subs r3, r3, #1
+ RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+endfunc
+
+ .align 5
+function ff_put_no_rnd_pixels8_y2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r11,lr}
+ mov r3, r3, lsr #1
+ ldr r12, =0xfefefefe
+ JMP_ALIGN r5, r1
+1:
+ ldm r1, {r4-r5}
+ add r1, r1, r2
+6: ldm r1, {r6-r7}
+ add r1, r1, r2
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ ldm r1, {r4-r5}
+ add r1, r1, r2
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
+ subs r3, r3, #1
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+ .align 5
+2:
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 1, r4, r5, r6
+6: ldm r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 1, r7, r8, r9
+ NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 1, r4, r5, r6
+ subs r3, r3, #1
+ NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+ .align 5
+3:
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 2, r4, r5, r6
+6: ldm r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 2, r7, r8, r9
+ NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 2, r4, r5, r6
+ subs r3, r3, #1
+ NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+ .align 5
+4:
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 3, r4, r5, r6
+6: ldm r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 3, r7, r8, r9
+ NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 3, r4, r5, r6
+ subs r3, r3, #1
+ NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+endfunc
+
+ .ltorg
+
+@ ----------------------------------------------------------------
+.macro RND_XY2_IT align, rnd
+ @ l1 = (a & 0x03030303) + (b & 0x03030303), plus a rounding constant when r3 is even
+ @ h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
+.if \align == 0
+ ldm r1, {r6-r8}
+.elseif \align == 3
+ ldm r1, {r5-r7}
+.else
+ ldm r1, {r8-r10}
+.endif
+ add r1, r1, r2
+ pld [r1]
+.if \align == 0
+ ALIGN_DWORD_D 1, r4, r5, r6, r7, r8
+.elseif \align == 1
+ ALIGN_DWORD_D 1, r4, r5, r8, r9, r10
+ ALIGN_DWORD_D 2, r6, r7, r8, r9, r10
+.elseif \align == 2
+ ALIGN_DWORD_D 2, r4, r5, r8, r9, r10
+ ALIGN_DWORD_D 3, r6, r7, r8, r9, r10
+.elseif \align == 3
+ ALIGN_DWORD_D 3, r4, r5, r5, r6, r7
+.endif
+ ldr r14, =0x03030303
+ tst r3, #1 @ Z is set when r3 is even; this drives the eq-conditional lines below
+ and r8, r4, r14 @ from here r4:r5 and r6:r7 are the inputs a and b of the formulas above
+ and r9, r5, r14
+ and r10, r6, r14
+ and r11, r7, r14
+ it eq
+ andeq r14, r14, r14, \rnd #1 @ rnd=lsl yields 0x02020202, rnd=lsr yields 0x01010101
+ add r8, r8, r10 @ r8:r9 = sum of the low two bits of each byte
+ add r9, r9, r11
+ ldr r12, =0xfcfcfcfc >> 2
+ itt eq
+ addeq r8, r8, r14 @ rounding constant is added on even r3 only
+ addeq r9, r9, r14
+ and r4, r12, r4, lsr #2
+ and r5, r12, r5, lsr #2
+ and r6, r12, r6, lsr #2
+ and r7, r12, r7, lsr #2
+ add r10, r4, r6 @ r10:r11 = h1
+ add r11, r5, r7
+ subs r3, r3, #1 @ leaves the flags for the invoking code to branch on
+.endm
+
+.macro RND_XY2_EXPAND align, rnd
+ RND_XY2_IT \align, \rnd
+6: push {r8-r11} @ keep the sums of the previous row (l1 in r8:r9, h1 in r10:r11)
+ RND_XY2_IT \align, \rnd
+ pop {r4-r7} @ previous row: l1 in r4:r5, h1 in r6:r7
+ add r4, r4, r8 @ low-bit sums of both rows
+ add r5, r5, r9
+ ldr r14, =0x0f0f0f0f
+ add r6, r6, r10 @ high-bit sums of both rows
+ add r7, r7, r11
+ and r4, r14, r4, lsr #2 @ divide the low-bit sum by 4 and keep four bits per byte
+ and r5, r14, r5, lsr #2
+ add r4, r4, r6
+ add r5, r5, r7
+ stm r0, {r4-r5} @ store one output row of eight bytes
+ add r0, r0, r2
+ bge 6b @ flags still come from the subs that ends RND_XY2_IT
+ pop {r4-r11,pc} @ pairs with the push {r4-r11,lr} in the invoking function
+.endm
+
+ .align 5
+function ff_put_pixels8_xy2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r11,lr} @ R14 is also called LR
+ JMP_ALIGN r5, r1
+1: RND_XY2_EXPAND 0, lsl
+ .align 5
+2: RND_XY2_EXPAND 1, lsl
+ .align 5
+3: RND_XY2_EXPAND 2, lsl
+ .align 5
+4: RND_XY2_EXPAND 3, lsl
+endfunc
+
+ .align 5
+function ff_put_no_rnd_pixels8_xy2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r11,lr}
+ JMP_ALIGN r5, r1
+1: RND_XY2_EXPAND 0, lsr
+ .align 5
+2: RND_XY2_EXPAND 1, lsr
+ .align 5
+3: RND_XY2_EXPAND 2, lsr
+ .align 5
+4: RND_XY2_EXPAND 3, lsr
+endfunc
diff --git a/libavcodec/arm/hpeldsp_arm.h b/libavcodec/arm/hpeldsp_arm.h
new file mode 100644
index 0000000..e79bc6f
--- /dev/null
+++ b/libavcodec/arm/hpeldsp_arm.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HPELDSP_H
+#define AVCODEC_ARM_HPELDSP_H
+
+#include "libavcodec/hpeldsp.h"
+
+void ff_hpeldsp_init_armv6(HpelDSPContext* c, int flags);
+void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags);
+
+#endif /* AVCODEC_ARM_HPELDSP_H */
diff --git a/libavcodec/arm/hpeldsp_armv6.S b/libavcodec/arm/hpeldsp_armv6.S
new file mode 100644
index 0000000..cd50150
--- /dev/null
+++ b/libavcodec/arm/hpeldsp_armv6.S
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro call_2x_pixels type, subp
+function ff_\type\()_pixels16\subp\()_armv6, export=1
+ push {r0-r3, lr}
+ bl ff_\type\()_pixels8\subp\()_armv6
+ pop {r0-r3, lr}
+ add r0, r0, #8
+ add r1, r1, #8
+ b ff_\type\()_pixels8\subp\()_armv6
+endfunc
+.endm
+
+call_2x_pixels avg
+call_2x_pixels put, _x2
+call_2x_pixels put, _y2
+call_2x_pixels put, _x2_no_rnd
+call_2x_pixels put, _y2_no_rnd
+
+function ff_put_pixels16_armv6, export=1
+ push {r4-r11}
+1:
+ ldr r5, [r1, #4]
+ ldr r6, [r1, #8]
+ ldr r7, [r1, #12]
+ ldr_post r4, r1, r2
+ strd r6, r7, [r0, #8]
+ ldr r9, [r1, #4]
+ strd_post r4, r5, r0, r2
+ ldr r10, [r1, #8]
+ ldr r11, [r1, #12]
+ ldr_post r8, r1, r2
+ strd r10, r11, [r0, #8]
+ subs r3, r3, #2
+ strd_post r8, r9, r0, r2
+ bne 1b
+
+ pop {r4-r11}
+ bx lr
+endfunc
+
+function ff_put_pixels8_armv6, export=1
+ push {r4-r7}
+1:
+ ldr r5, [r1, #4]
+ ldr_post r4, r1, r2
+ ldr r7, [r1, #4]
+ strd_post r4, r5, r0, r2
+ ldr_post r6, r1, r2
+ subs r3, r3, #2
+ strd_post r6, r7, r0, r2
+ bne 1b
+
+ pop {r4-r7}
+ bx lr
+endfunc
+
+function ff_put_pixels8_x2_armv6, export=1
+ push {r4-r11, lr}
+ mov r12, #1
+ orr r12, r12, r12, lsl #8
+ orr r12, r12, r12, lsl #16
+1:
+ ldr r4, [r1]
+ subs r3, r3, #2
+ ldr r5, [r1, #4]
+ ldr r7, [r1, #5]
+ lsr r6, r4, #8
+ ldr_pre r8, r1, r2
+ orr r6, r6, r5, lsl #24
+ ldr r9, [r1, #4]
+ ldr r11, [r1, #5]
+ lsr r10, r8, #8
+ add r1, r1, r2
+ orr r10, r10, r9, lsl #24
+ eor r14, r4, r6
+ uhadd8 r4, r4, r6
+ eor r6, r5, r7
+ uhadd8 r5, r5, r7
+ and r14, r14, r12
+ and r6, r6, r12
+ uadd8 r4, r4, r14
+ eor r14, r8, r10
+ uadd8 r5, r5, r6
+ eor r6, r9, r11
+ uhadd8 r8, r8, r10
+ and r14, r14, r12
+ uhadd8 r9, r9, r11
+ and r6, r6, r12
+ uadd8 r8, r8, r14
+ strd_post r4, r5, r0, r2
+ uadd8 r9, r9, r6
+ strd_post r8, r9, r0, r2
+ bne 1b
+
+ pop {r4-r11, pc}
+endfunc
+
+function ff_put_pixels8_y2_armv6, export=1
+ push {r4-r11}
+ mov r12, #1
+ orr r12, r12, r12, lsl #8
+ orr r12, r12, r12, lsl #16
+ ldr r4, [r1]
+ ldr r5, [r1, #4]
+ ldr_pre r6, r1, r2
+ ldr r7, [r1, #4]
+1:
+ subs r3, r3, #2
+ uhadd8 r8, r4, r6
+ eor r10, r4, r6
+ uhadd8 r9, r5, r7
+ eor r11, r5, r7
+ and r10, r10, r12
+ ldr_pre r4, r1, r2
+ uadd8 r8, r8, r10
+ and r11, r11, r12
+ uadd8 r9, r9, r11
+ ldr r5, [r1, #4]
+ uhadd8 r10, r4, r6
+ eor r6, r4, r6
+ uhadd8 r11, r5, r7
+ and r6, r6, r12
+ eor r7, r5, r7
+ uadd8 r10, r10, r6
+ and r7, r7, r12
+ ldr_pre r6, r1, r2
+ uadd8 r11, r11, r7
+ strd_post r8, r9, r0, r2
+ ldr r7, [r1, #4]
+ strd_post r10, r11, r0, r2
+ bne 1b
+
+ pop {r4-r11}
+ bx lr
+endfunc
+
+function ff_put_pixels8_x2_no_rnd_armv6, export=1
+ push {r4-r9, lr}
+1:
+ subs r3, r3, #2
+ ldr r4, [r1]
+ ldr r5, [r1, #4]
+ ldr r7, [r1, #5]
+ ldr_pre r8, r1, r2
+ ldr r9, [r1, #4]
+ ldr r14, [r1, #5]
+ add r1, r1, r2
+ lsr r6, r4, #8
+ orr r6, r6, r5, lsl #24
+ lsr r12, r8, #8
+ orr r12, r12, r9, lsl #24
+ uhadd8 r4, r4, r6
+ uhadd8 r5, r5, r7
+ uhadd8 r8, r8, r12
+ uhadd8 r9, r9, r14
+ stm r0, {r4,r5}
+ add r0, r0, r2
+ stm r0, {r8,r9}
+ add r0, r0, r2
+ bne 1b
+
+ pop {r4-r9, pc}
+endfunc
+
+function ff_put_pixels8_y2_no_rnd_armv6, export=1
+ push {r4-r9, lr}
+ ldr r4, [r1]
+ ldr r5, [r1, #4]
+ ldr_pre r6, r1, r2
+ ldr r7, [r1, #4]
+1:
+ subs r3, r3, #2
+ uhadd8 r8, r4, r6
+ ldr_pre r4, r1, r2
+ uhadd8 r9, r5, r7
+ ldr r5, [r1, #4]
+ uhadd8 r12, r4, r6
+ ldr_pre r6, r1, r2
+ uhadd8 r14, r5, r7
+ ldr r7, [r1, #4]
+ stm r0, {r8,r9}
+ add r0, r0, r2
+ stm r0, {r12,r14}
+ add r0, r0, r2
+ bne 1b
+
+ pop {r4-r9, pc}
+endfunc
+
+function ff_avg_pixels8_armv6, export=1
+ pld [r1, r2]
+ push {r4-r10, lr}
+ mov lr, #1
+ orr lr, lr, lr, lsl #8
+ orr lr, lr, lr, lsl #16
+ ldrd r4, r5, [r0]
+ ldr r10, [r1, #4]
+ ldr_post r9, r1, r2
+ subs r3, r3, #2
+1:
+ pld [r1, r2]
+ eor r8, r4, r9
+ uhadd8 r4, r4, r9
+ eor r12, r5, r10
+ ldrd_reg r6, r7, r0, r2
+ uhadd8 r5, r5, r10
+ and r8, r8, lr
+ ldr r10, [r1, #4]
+ and r12, r12, lr
+ uadd8 r4, r4, r8
+ ldr_post r9, r1, r2
+ eor r8, r6, r9
+ uadd8 r5, r5, r12
+ pld [r1, r2, lsl #1]
+ eor r12, r7, r10
+ uhadd8 r6, r6, r9
+ strd_post r4, r5, r0, r2
+ uhadd8 r7, r7, r10
+ beq 2f
+ and r8, r8, lr
+ ldrd_reg r4, r5, r0, r2
+ uadd8 r6, r6, r8
+ ldr r10, [r1, #4]
+ and r12, r12, lr
+ subs r3, r3, #2
+ uadd8 r7, r7, r12
+ ldr_post r9, r1, r2
+ strd_post r6, r7, r0, r2
+ b 1b
+2:
+ and r8, r8, lr
+ and r12, r12, lr
+ uadd8 r6, r6, r8
+ uadd8 r7, r7, r12
+ strd_post r6, r7, r0, r2
+
+ pop {r4-r10, pc}
+endfunc
diff --git a/libavcodec/arm/hpeldsp_init_arm.c b/libavcodec/arm/hpeldsp_init_arm.c
new file mode 100644
index 0000000..2cb087f
--- /dev/null
+++ b/libavcodec/arm/hpeldsp_init_arm.c
@@ -0,0 +1,68 @@
+/*
+ * ARM optimized DSP utils
+ * Copyright (c) 2001 Lionel Ulmer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/bit_depth_template.c" // for CALL_2X_PIXELS
+#include "hpeldsp_arm.h"
+
+void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+
+CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8)
+CALL_2X_PIXELS(ff_put_pixels16_y2_arm, ff_put_pixels8_y2_arm, 8)
+CALL_2X_PIXELS(ff_put_pixels16_xy2_arm, ff_put_pixels8_xy2_arm, 8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm, ff_put_no_rnd_pixels8_x2_arm, 8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm, ff_put_no_rnd_pixels8_y2_arm, 8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8)
+
+void ff_hpeldsp_init_arm(HpelDSPContext* c, int flags)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ c->put_pixels_tab[0][0] = ff_put_pixels16_arm; // row [0]: the pixels16 functions
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
+ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
+ c->put_pixels_tab[1][0] = ff_put_pixels8_arm; // row [1]: the pixels8 functions
+ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
+ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
+ c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;
+
+ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm; // index 0 reuses the plain put function
+ c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
+ c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
+ c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
+ c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm; // index 0 reuses the plain put function
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
+ c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;
+
+ if (have_armv6(cpu_flags)) ff_hpeldsp_init_armv6(c, flags); // overwrites entries assigned above
+ if (have_neon(cpu_flags)) ff_hpeldsp_init_neon(c, flags); // runs after the ARMv6 init
+}
diff --git a/libavcodec/arm/hpeldsp_init_armv6.c b/libavcodec/arm/hpeldsp_init_armv6.c
new file mode 100644
index 0000000..bb3afed
--- /dev/null
+++ b/libavcodec/arm/hpeldsp_init_armv6.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "hpeldsp_arm.h"
+
+void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, int, int); /* prototypes only; the bodies are not in this file (presumably hpeldsp_armv6.S - confirm) */
+void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, int, int);
+
+void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int); /* no_rnd variants exist for x2 and y2 only */
+void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
+
+void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, int, int); /* the only 16-wide avg routine declared here */
+
+void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, int, int); /* 8-wide counterparts of the declarations above */
+void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, int, int);
+
+void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
+
+void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, int, int);
+
+av_cold void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags) /* Install the ARMv6 routines into c; flags is not read here. */
+{
+ c->put_pixels_tab[0][0] = ff_put_pixels16_armv6; /* 16-wide row: plain, x2, y2 */
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
+/* c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */ /* disabled: the [0][3] slot is not written by this function */
+ c->put_pixels_tab[1][0] = ff_put_pixels8_armv6; /* 8-wide row: plain, x2, y2 */
+ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
+ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
+/* c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */ /* disabled: the [1][3] slot is not written by this function */
+
+ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6; /* column 0 reuses the same function as put_pixels_tab[0][0] */
+ c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
+ c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
+/* c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */ /* disabled: the [0][3] slot is not written by this function */
+ c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6; /* column 0 reuses the same function as put_pixels_tab[1][0] */
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
+/* c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */ /* disabled: the [1][3] slot is not written by this function */
+
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6; /* only column 0 of avg_pixels_tab is set here */
+ c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
+}
diff --git a/libavcodec/arm/hpeldsp_init_neon.c b/libavcodec/arm/hpeldsp_init_neon.c
new file mode 100644
index 0000000..2d87e3e
--- /dev/null
+++ b/libavcodec/arm/hpeldsp_init_neon.c
@@ -0,0 +1,86 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "hpeldsp_arm.h"
+
+void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int); /* prototypes for the ff_*_neon symbols emitted by the pixfunc/pixfunc2 macros in hpeldsp_neon.S */
+void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); /* no_rnd put variants: x2, y2, xy2 for both widths */
+void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+
+void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int); /* avg variants: plain, x2, y2, xy2 for both widths */
+void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, int, int);
+void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int);
+void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int); /* no_rnd avg variants are declared for the 16-wide case only */
+void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
+
+void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags) /* Install the NEON routines into every table slot listed below; flags is not read here. */
+{
+ c->put_pixels_tab[0][0] = ff_put_pixels16_neon; /* 16-wide row: plain, x2, y2, xy2 */
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
+ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
+ c->put_pixels_tab[1][0] = ff_put_pixels8_neon; /* 8-wide row, same column order */
+ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
+ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
+ c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
+
+ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; /* column 0 reuses the same function as put_pixels_tab[0][0] */
+ c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; /* column 0 reuses the same function as put_pixels_tab[1][0] */
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
+
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon; /* avg table: all four columns set for both widths */
+ c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
+ c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
+ c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
+ c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
+ c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
+ c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
+
+ c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon; /* indexed with a single subscript here; all four entries are 16-wide routines */
+ c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
+ c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
+ c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
+}
diff --git a/libavcodec/arm/hpeldsp_neon.S b/libavcodec/arm/hpeldsp_neon.S
new file mode 100644
index 0000000..cf4a6cf
--- /dev/null
+++ b/libavcodec/arm/hpeldsp_neon.S
@@ -0,0 +1,410 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans at mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro pixels16 rnd=1, avg=0 /* Body: copy 16-byte rows from [r1] to [r0], stride r2, r3 rows, four rows per pass; rnd is not referenced in this body */
+ .if \avg
+ mov r12, r0 /* separate read pointer for the destination so r0 stays the write pointer */
+ .endif
+1: vld1.8 {q0}, [r1], r2 /* four source rows into q0-q3, r1 advanced by r2 after each */
+ vld1.8 {q1}, [r1], r2
+ vld1.8 {q2}, [r1], r2
+ pld [r1, r2, lsl #2] /* prefetch hints for upcoming source rows */
+ vld1.8 {q3}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ pld [r1, r2, lsl #1]
+ .if \avg
+ vld1.8 {q8}, [r12,:128], r2 /* avg variant: rounding halving add of each source row with the current destination row */
+ vrhadd.u8 q0, q0, q8
+ vld1.8 {q9}, [r12,:128], r2
+ vrhadd.u8 q1, q1, q9
+ vld1.8 {q10}, [r12,:128], r2
+ vrhadd.u8 q2, q2, q10
+ vld1.8 {q11}, [r12,:128], r2
+ vrhadd.u8 q3, q3, q11
+ .endif
+ subs r3, r3, #4 /* four rows consumed; the flags feed the bne below */
+ vst1.64 {q0}, [r0,:128], r2 /* stores carry a 128-bit alignment qualifier on the destination address */
+ vst1.64 {q1}, [r0,:128], r2
+ vst1.64 {q2}, [r0,:128], r2
+ vst1.64 {q3}, [r0,:128], r2
+ bne 1b
+ bx lr
+.endm
+
+.macro pixels16_x2 rnd=1, avg=0 /* Body: each output byte is the halving add of a source byte and its right-hand neighbour; two 16-byte rows per pass */
+1: vld1.8 {d0-d2}, [r1], r2 /* 24 bytes per row are loaded so the one-byte-shifted row can be built */
+ vld1.8 {d4-d6}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ subs r3, r3, #2 /* two rows per pass; the flags feed the bne below */
+ vext.8 q1, q0, q1, #1 /* q1 = row 0 shifted left by one byte */
+ avg q0, q0, q1 /* avg is defined by pixfunc as vrhadd.u8 (rnd) or vhadd.u8 (no_rnd) */
+ vext.8 q3, q2, q3, #1 /* q3 = row 1 shifted left by one byte */
+ avg q2, q2, q3
+ .if \avg
+ vld1.8 {q1}, [r0,:128], r2 /* avg variant: merge with the two destination rows, always with vrhadd.u8 */
+ vld1.8 {q3}, [r0,:128]
+ vrhadd.u8 q0, q0, q1
+ vrhadd.u8 q2, q2, q3
+ sub r0, r0, r2 /* undo the post-increment of the first destination load before storing */
+ .endif
+ vst1.8 {q0}, [r0,:128], r2
+ vst1.8 {q2}, [r0,:128], r2
+ bne 1b
+ bx lr
+.endm
+
+.macro pixels16_y2 rnd=1, avg=0 /* Body: each output row is the halving add of a source row and the row below it; two 16-byte rows per pass */
+ sub r3, r3, #2 /* the last two output rows are produced after the loop */
+ vld1.8 {q0}, [r1], r2 /* prime q0/q1 with the first two source rows */
+ vld1.8 {q1}, [r1], r2
+1: subs r3, r3, #2
+ avg q2, q0, q1 /* avg is defined by pixfunc as vrhadd.u8 (rnd) or vhadd.u8 (no_rnd) */
+ vld1.8 {q0}, [r1], r2 /* q0 and q1 take turns holding the newest source row */
+ avg q3, q0, q1
+ vld1.8 {q1}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ .if \avg
+ vld1.8 {q8}, [r0,:128], r2 /* avg variant: merge with the two destination rows, always with vrhadd.u8 */
+ vld1.8 {q9}, [r0,:128]
+ vrhadd.u8 q2, q2, q8
+ vrhadd.u8 q3, q3, q9
+ sub r0, r0, r2 /* undo the post-increment of the first destination load before storing */
+ .endif
+ vst1.8 {q2}, [r0,:128], r2
+ vst1.8 {q3}, [r0,:128], r2
+ bne 1b
+
+ avg q2, q0, q1 /* tail: same two-row step, but only one further source row is loaded */
+ vld1.8 {q0}, [r1], r2
+ avg q3, q0, q1
+ .if \avg
+ vld1.8 {q8}, [r0,:128], r2
+ vld1.8 {q9}, [r0,:128]
+ vrhadd.u8 q2, q2, q8
+ vrhadd.u8 q3, q3, q9
+ sub r0, r0, r2
+ .endif
+ vst1.8 {q2}, [r0,:128], r2
+ vst1.8 {q3}, [r0,:128], r2
+
+ bx lr
+.endm
+
+.macro pixels16_xy2 rnd=1, avg=0 /* Body: each output byte is the sum of a 2x2 source neighbourhood narrowed with a shift by 2; two 16-byte rows per pass */
+ sub r3, r3, #2 /* the last two output rows are produced after the loop */
+ vld1.8 {d0-d2}, [r1], r2 /* 24 bytes per row so the one-byte-shifted row can be built */
+ vld1.8 {d4-d6}, [r1], r2
+NRND vmov.i16 q13, #1 /* emitted only in the no_rnd variant: per-lane bias of 1 added before the truncating shift */
+ pld [r1]
+ pld [r1, r2]
+ vext.8 q1, q0, q1, #1
+ vext.8 q3, q2, q3, #1
+ vaddl.u8 q8, d0, d2 /* q8/q10: 16-bit horizontal pair sums of the first row (low/high 8 columns) */
+ vaddl.u8 q10, d1, d3
+ vaddl.u8 q9, d4, d6 /* q9/q11: the same sums for the second row */
+ vaddl.u8 q11, d5, d7
+1: subs r3, r3, #2
+ vld1.8 {d0-d2}, [r1], r2
+ vadd.u16 q12, q8, q9 /* vertical add of the two rows of pair sums, low half */
+ pld [r1]
+NRND vadd.u16 q12, q12, q13
+ vext.8 q15, q0, q1, #1 /* q15 is scratch for the shifted new row before it is reused as an output register */
+ vadd.u16 q1 , q10, q11
+ shrn d28, q12, #2 /* shrn is defined by pixfunc as vrshrn.u16 (rnd) or vshrn.u16 (no_rnd) */
+NRND vadd.u16 q1, q1, q13
+ shrn d29, q1, #2
+ .if \avg
+ vld1.8 {q8}, [r0,:128] /* avg variant: merge output row q14 with the destination row, always with vrhadd.u8 */
+ vrhadd.u8 q14, q14, q8
+ .endif
+ vaddl.u8 q8, d0, d30 /* refresh q8/q10 from the row just loaded */
+ vld1.8 {d2-d4}, [r1], r2
+ vaddl.u8 q10, d1, d31
+ vst1.8 {q14}, [r0,:128], r2
+ vadd.u16 q12, q8, q9
+ pld [r1, r2]
+NRND vadd.u16 q12, q12, q13
+ vext.8 q2, q1, q2, #1
+ vadd.u16 q0, q10, q11
+ shrn d30, q12, #2
+NRND vadd.u16 q0, q0, q13
+ shrn d31, q0, #2
+ .if \avg
+ vld1.8 {q9}, [r0,:128]
+ vrhadd.u8 q15, q15, q9
+ .endif
+ vaddl.u8 q9, d2, d4 /* refresh q9/q11 from the second row loaded in this pass */
+ vaddl.u8 q11, d3, d5
+ vst1.8 {q15}, [r0,:128], r2
+ bgt 1b /* flags are still those of the subs at label 1 */
+
+ vld1.8 {d0-d2}, [r1], r2 /* tail: last two output rows, with only one further source row loaded */
+ vadd.u16 q12, q8, q9
+NRND vadd.u16 q12, q12, q13
+ vext.8 q15, q0, q1, #1
+ vadd.u16 q1 , q10, q11
+ shrn d28, q12, #2
+NRND vadd.u16 q1, q1, q13
+ shrn d29, q1, #2
+ .if \avg
+ vld1.8 {q8}, [r0,:128]
+ vrhadd.u8 q14, q14, q8
+ .endif
+ vaddl.u8 q8, d0, d30
+ vaddl.u8 q10, d1, d31
+ vst1.8 {q14}, [r0,:128], r2
+ vadd.u16 q12, q8, q9
+NRND vadd.u16 q12, q12, q13
+ vadd.u16 q0, q10, q11
+ shrn d30, q12, #2
+NRND vadd.u16 q0, q0, q13
+ shrn d31, q0, #2
+ .if \avg
+ vld1.8 {q9}, [r0,:128]
+ vrhadd.u8 q15, q15, q9
+ .endif
+ vst1.8 {q15}, [r0,:128], r2
+
+ bx lr
+.endm
+
+.macro pixels8 rnd=1, avg=0 /* Body: copy 8-byte rows from [r1] to [r0], stride r2, r3 rows, four rows per pass; rnd is not referenced in this body */
+1: vld1.8 {d0}, [r1], r2 /* four source rows into d0-d3 */
+ vld1.8 {d1}, [r1], r2
+ vld1.8 {d2}, [r1], r2
+ pld [r1, r2, lsl #2] /* prefetch hints for upcoming source rows */
+ vld1.8 {d3}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ pld [r1, r2, lsl #1]
+ .if \avg
+ vld1.8 {d4}, [r0,:64], r2 /* avg variant: rounding halving add of each source row with the current destination row */
+ vrhadd.u8 d0, d0, d4
+ vld1.8 {d5}, [r0,:64], r2
+ vrhadd.u8 d1, d1, d5
+ vld1.8 {d6}, [r0,:64], r2
+ vrhadd.u8 d2, d2, d6
+ vld1.8 {d7}, [r0,:64], r2
+ vrhadd.u8 d3, d3, d7
+ sub r0, r0, r2, lsl #2 /* rewind r0 by four rows; it was used as the read pointer above */
+ .endif
+ subs r3, r3, #4 /* four rows consumed; the flags feed the bne below */
+ vst1.8 {d0}, [r0,:64], r2 /* stores carry a 64-bit alignment qualifier on the destination address */
+ vst1.8 {d1}, [r0,:64], r2
+ vst1.8 {d2}, [r0,:64], r2
+ vst1.8 {d3}, [r0,:64], r2
+ bne 1b
+ bx lr
+.endm
+
+.macro pixels8_x2 rnd=1, avg=0 /* Body: each output byte is the halving add of a source byte and its right-hand neighbour; two 8-byte rows per pass */
+1: vld1.8 {q0}, [r1], r2 /* 16 bytes per row are loaded so the one-byte-shifted row can be built */
+ vext.8 d1, d0, d1, #1 /* d1 = row 0 shifted left by one byte */
+ vld1.8 {q1}, [r1], r2
+ vext.8 d3, d2, d3, #1 /* d3 = row 1 shifted left by one byte */
+ pld [r1]
+ pld [r1, r2]
+ subs r3, r3, #2 /* two rows per pass; the flags feed the bne below */
+ vswp d1, d2 /* regroup: q0 = {row 0, row 1}, q1 = {row 0 shifted, row 1 shifted} */
+ avg q0, q0, q1 /* one q-register avg covers both rows; avg is vrhadd.u8 (rnd) or vhadd.u8 (no_rnd) */
+ .if \avg
+ vld1.8 {d4}, [r0,:64], r2 /* avg variant: merge with the two destination rows, always with vrhadd.u8 */
+ vld1.8 {d5}, [r0,:64]
+ vrhadd.u8 q0, q0, q2
+ sub r0, r0, r2 /* undo the post-increment of the first destination load before storing */
+ .endif
+ vst1.8 {d0}, [r0,:64], r2
+ vst1.8 {d1}, [r0,:64], r2
+ bne 1b
+ bx lr
+.endm
+
+.macro pixels8_y2 rnd=1, avg=0 /* Body: each output row is the halving add of a source row and the row below it; two 8-byte rows per pass */
+ sub r3, r3, #2 /* the last two output rows are produced after the loop */
+ vld1.8 {d0}, [r1], r2 /* prime d0/d1 with the first two source rows */
+ vld1.8 {d1}, [r1], r2
+1: subs r3, r3, #2
+ avg d4, d0, d1 /* avg is defined by pixfunc as vrhadd.u8 (rnd) or vhadd.u8 (no_rnd) */
+ vld1.8 {d0}, [r1], r2 /* d0 and d1 take turns holding the newest source row */
+ avg d5, d0, d1
+ vld1.8 {d1}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ .if \avg
+ vld1.8 {d2}, [r0,:64], r2 /* avg variant: merge both output rows (q2) with the destination rows (q1) in one vrhadd.u8 */
+ vld1.8 {d3}, [r0,:64]
+ vrhadd.u8 q2, q2, q1
+ sub r0, r0, r2 /* undo the post-increment of the first destination load before storing */
+ .endif
+ vst1.8 {d4}, [r0,:64], r2
+ vst1.8 {d5}, [r0,:64], r2
+ bne 1b
+
+ avg d4, d0, d1 /* tail: same two-row step, but only one further source row is loaded */
+ vld1.8 {d0}, [r1], r2
+ avg d5, d0, d1
+ .if \avg
+ vld1.8 {d2}, [r0,:64], r2
+ vld1.8 {d3}, [r0,:64]
+ vrhadd.u8 q2, q2, q1
+ sub r0, r0, r2
+ .endif
+ vst1.8 {d4}, [r0,:64], r2
+ vst1.8 {d5}, [r0,:64], r2
+
+ bx lr
+.endm
+
+.macro pixels8_xy2 rnd=1, avg=0 /* Body: each output byte is the sum of a 2x2 source neighbourhood narrowed with a shift by 2; two 8-byte rows per pass */
+ sub r3, r3, #2 /* the last two output rows are produced after the loop */
+ vld1.8 {q0}, [r1], r2 /* 16 bytes per row so the one-byte-shifted row can be built */
+ vld1.8 {q1}, [r1], r2
+NRND vmov.i16 q11, #1 /* emitted only in the no_rnd variant: per-lane bias of 1 added before the truncating shift */
+ pld [r1]
+ pld [r1, r2]
+ vext.8 d4, d0, d1, #1
+ vext.8 d6, d2, d3, #1
+ vaddl.u8 q8, d0, d4 /* q8: 16-bit horizontal pair sums of the first row */
+ vaddl.u8 q9, d2, d6 /* q9: the same sums for the second row */
+1: subs r3, r3, #2
+ vld1.8 {q0}, [r1], r2
+ pld [r1]
+ vadd.u16 q10, q8, q9 /* vertical add of the two rows of pair sums */
+ vext.8 d4, d0, d1, #1
+NRND vadd.u16 q10, q10, q11
+ vaddl.u8 q8, d0, d4 /* refresh q8 from the row just loaded */
+ shrn d5, q10, #2 /* shrn is defined by pixfunc as vrshrn.u16 (rnd) or vshrn.u16 (no_rnd) */
+ vld1.8 {q1}, [r1], r2
+ vadd.u16 q10, q8, q9
+ pld [r1, r2]
+ .if \avg
+ vld1.8 {d7}, [r0,:64] /* avg variant: merge output row d5 with the destination row, always with vrhadd.u8 */
+ vrhadd.u8 d5, d5, d7
+ .endif
+NRND vadd.u16 q10, q10, q11
+ vst1.8 {d5}, [r0,:64], r2
+ shrn d7, q10, #2
+ .if \avg
+ vld1.8 {d5}, [r0,:64]
+ vrhadd.u8 d7, d7, d5
+ .endif
+ vext.8 d6, d2, d3, #1
+ vaddl.u8 q9, d2, d6 /* refresh q9 from the second row loaded in this pass */
+ vst1.8 {d7}, [r0,:64], r2
+ bgt 1b /* flags are still those of the subs at label 1 */
+
+ vld1.8 {q0}, [r1], r2 /* tail: last two output rows, with only one further source row loaded */
+ vadd.u16 q10, q8, q9
+ vext.8 d4, d0, d1, #1
+NRND vadd.u16 q10, q10, q11
+ vaddl.u8 q8, d0, d4
+ shrn d5, q10, #2
+ vadd.u16 q10, q8, q9
+ .if \avg
+ vld1.8 {d7}, [r0,:64]
+ vrhadd.u8 d5, d5, d7
+ .endif
+NRND vadd.u16 q10, q10, q11
+ vst1.8 {d5}, [r0,:64], r2
+ shrn d7, q10, #2
+ .if \avg
+ vld1.8 {d5}, [r0,:64]
+ vrhadd.u8 d7, d7, d5
+ .endif
+ vst1.8 {d7}, [r0,:64], r2
+
+ bx lr
+.endm
+
+.macro pixfunc pfx, name, suf, rnd=1, avg=0 /* Emit one exported function ff_<pfx><name><suf>_neon whose body is the macro called <name> */
+ .if \rnd
+ .macro avg rd, rn, rm /* rounding build: avg expands to the rounding halving add */
+ vrhadd.u8 \rd, \rn, \rm
+ .endm
+ .macro shrn rd, rn, rm /* rounding build: shrn expands to the rounding narrowing shift */
+ vrshrn.u16 \rd, \rn, \rm
+ .endm
+ .macro NRND insn:vararg /* rounding build: NRND-prefixed instructions expand to nothing */
+ .endm
+ .else
+ .macro avg rd, rn, rm /* no_rnd build: avg expands to the truncating halving add */
+ vhadd.u8 \rd, \rn, \rm
+ .endm
+ .macro shrn rd, rn, rm /* no_rnd build: shrn expands to the truncating narrowing shift */
+ vshrn.u16 \rd, \rn, \rm
+ .endm
+ .macro NRND insn:vararg /* no_rnd build: NRND-prefixed instructions are emitted as written */
+ \insn
+ .endm
+ .endif
+function ff_\pfx\name\suf\()_neon, export=1
+ \name \rnd, \avg
+endfunc
+ .purgem avg /* drop the helper macros so the next pixfunc expansion can define them again */
+ .purgem shrn
+ .purgem NRND
+.endm
+
+.macro pixfunc2 pfx, name, avg=0 /* Emit both builds of <name>: the rounding one and the _no_rnd one */
+ pixfunc \pfx, \name, rnd=1, avg=\avg /* suf is left empty, giving ff_<pfx><name>_neon */
+ pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg /* gives ff_<pfx><name>_no_rnd_neon */
+.endm
+
+function ff_put_h264_qpel16_mc00_neon, export=1
+ mov r3, #16 /* sets r3 to 16 with no return of its own; NOTE(review): presumably falls through into ff_put_pixels16_neon emitted next - confirm function/endfunc emit no code */
+endfunc
+
+ pixfunc put_, pixels16, avg=0 /* emits ff_put_pixels16_neon */
+ pixfunc2 put_, pixels16_x2, avg=0 /* each pixfunc2 line emits the plain and the _no_rnd function */
+ pixfunc2 put_, pixels16_y2, avg=0
+ pixfunc2 put_, pixels16_xy2, avg=0
+
+function ff_avg_h264_qpel16_mc00_neon, export=1
+ mov r3, #16 /* same pattern as above; the function emitted next is ff_avg_pixels16_neon */
+endfunc
+
+ pixfunc avg_, pixels16, avg=1 /* emits ff_avg_pixels16_neon */
+ pixfunc2 avg_, pixels16_x2, avg=1
+ pixfunc2 avg_, pixels16_y2, avg=1
+ pixfunc2 avg_, pixels16_xy2, avg=1
+
+function ff_put_h264_qpel8_mc00_neon, export=1
+ mov r3, #8 /* same pattern with r3 set to 8; the function emitted next is ff_put_pixels8_neon */
+endfunc
+
+ pixfunc put_, pixels8, avg=0 /* emits ff_put_pixels8_neon */
+ pixfunc2 put_, pixels8_x2, avg=0
+ pixfunc2 put_, pixels8_y2, avg=0
+ pixfunc2 put_, pixels8_xy2, avg=0
+
+function ff_avg_h264_qpel8_mc00_neon, export=1
+ mov r3, #8 /* same pattern with r3 set to 8; the function emitted next is ff_avg_pixels8_neon */
+endfunc
+
+ pixfunc avg_, pixels8, avg=1 /* emits ff_avg_pixels8_neon */
+ pixfunc avg_, pixels8_x2, avg=1 /* 8-wide avg uses pixfunc, so only the rounding build is emitted */
+ pixfunc avg_, pixels8_y2, avg=1
+ pixfunc avg_, pixels8_xy2, avg=1
diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c
index 4bc7a80..a7ac681 100644
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@@ -54,8 +54,8 @@ av_cold void ff_hpeldsp_init(HpelDSPContext* c, int flags)
hpel_funcs(avg_no_rnd,, 16);
if (ARCH_X86) ff_hpeldsp_init_x86 (c, flags);
-#if 0
if (ARCH_ARM) ff_hpeldsp_init_arm (c, flags);
+#if 0
if (HAVE_VIS) ff_hpeldsp_init_vis (c, flags);
if (ARCH_ALPHA) ff_hpeldsp_init_alpha (c, flags);
#endif
--
1.7.11.3
More information about the ffmpeg-devel
mailing list