[FFmpeg-devel] HEVC patch : adding ASM functions
Clément Bœsch
u at pkh.me
Sun Feb 23 19:21:21 CET 2014
On Fri, Feb 21, 2014 at 04:07:53PM +0100, Pierre Edouard Lepere wrote:
[...]
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION_RODATA
> +cextern hevc_epel_filters
> +cextern hevc_qpel_filters
> +
> +epel_extra_before DB 1 ;corresponds to EPEL_EXTRA_BEFORE in hevc.h
> +max_pb_size DB 64 ;corresponds to MAX_PB_SIZE in hevc.h
> +
> +qpel_extra_before DB 0, 3, 3, 2 ;corresponds to the ff_hevc_qpel_extra_before in hevc.c
> +qpel_extra DB 0, 6, 7, 6 ;corresponds to the ff_hevc_qpel_extra in hevc.c
> +qpel_extra_after DB 0, 3, 4, 4
> +
> +
> +single_mask_16 DW 0, 0, 0, 0, 0, 0, 0, -1
> +single_mask_32 DW 0, 0, 0, 0, 0, 0, -1, -1
The common convention is to use ':' after labels, and no caps for
db/dw/etc
> +SECTION .text
> +
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +;;
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
?
> +%macro LOOP_INIT 2
> + pxor m15, m15 ; set register at zero
> + mov r10, 0 ; set height counter
> +%1:
> + mov r9, 0 ; set width counter
> +%2:
make sure these labels are prefixed with a '.'
> +%endmacro
> +
> +%macro LOOP_END 7
> + add r9, %3 ; add 4 for width loop
> + cmp r9, widthq ; cmp width
> + jl %2 ; width loop
> +%ifidn %5, dststride
> + lea %4q, [%4q+2*%5q] ; dst += dststride
> +%else
> + lea %4q, [%4q+ %5 ] ; dst += dststride
> +%endif
> +%ifidn %7, srcstride
> + lea %6q, [%6q+ %7q] ; src += srcstride
> +%else
> + lea %6q, [%6q+ %7 ] ; src += srcstride
> +%endif
Is there a place where it's not srcstride?
If not, you probably want to use the default value system with something
like:
%macro LOOP_END 6-7 srcstrideq
...
> + add r10, 1
> + cmp r10, heightq ; cmp height
> + jl %1 ; height loop
> +%endmacro
> +
> +%macro LOOP_EPEL_HV_INIT 4
> + pxor m15, m15 ; set register at zero
> + mov r9, 0 ; set width counter
> +%1:
> + mov r10, 0 ; set height counter
> +;first 3 H calculus
> +%if %3 == 8
> + EPEL_LOAD %3, mx, 1
> +%else
> + EPEL_LOAD %3, mx, 2
> +%endif
> + EPEL_COMPUTE %3, %4
> + SWAP m4, m0
> + lea mxq, [mxq + srcstrideq]
> +%if %3 == 8
> + EPEL_LOAD %3, mx, 1
> +%else
> + EPEL_LOAD %3, mx, 2
> +%endif
> + EPEL_COMPUTE %3, %4
> + SWAP m5, m0
> + lea mxq, [mxq + srcstrideq]
> +%if %3 == 8
> + EPEL_LOAD %3, mx, 1
> +%else
> + EPEL_LOAD %3, mx, 2
> +%endif
instead of this %if ... %else ..., can you either do something like:
EPEL_LOAD %3, mx, 1 + (%3 == 8)
or simply add the condition on %3 in the macro itself?
> + EPEL_COMPUTE %3, %4
> + SWAP m6, m0
> + lea mxq, [mxq + srcstrideq]
> +%2:
. prefix
> +%endmacro
> +
> +%macro LOOP_HV_END 9
> + lea %8q, [%8q+2*%5q] ; dst += dststride
> + lea %9q, [%9q+ %7q] ; src += srcstride
> + inc r10 ; add 1 for height loop
> + cmp r10, heightq ; cmp height
> + jl %2 ; height loop
> + add r9, %3 ; add width
> + lea %8q,[%4q] ; dst = dst2
> + lea %9q,[%6q] ; src = src2
> + cmp r9, widthq ; cmp width
> + jl %1 ; weight loop
> +%endmacro
> +
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +;;
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +
> +
> +%macro EPEL_FILTER 2
> + movsxd %2q, %2d ; extend sign
> + sub %2q, 1
> + shl %2q, 2 ; multiply by 4
> + lea r11, [hevc_epel_filters]
> + movq m13, [r11 + %2q] ; get 4 first values of filters
> +%if %1 == 8
> + punpcklwd m13, m13
> +%else
> + pmovsxbw m13, m13
> +%endif
> + punpckldq m13, m13
> + movdqa m14, m13
> + punpckhqdq m14, m14 ; contains last 2 filters
> + punpcklqdq m13, m13 ;
> +%endmacro
> +
> +%macro EPEL_HV_FILTER 1
> + movsxd mxq, mxd ; extend sign
> + movsxd myq, myd ; extend sign
> + sub mxq, 1
> + sub myq, 1
> + shl mxq, 2 ; multiply by 4
> + shl myq, 2 ; multiply by 4
> + lea r11, [hevc_epel_filters]
> + movq m13, [r11 + mxq] ; get 4 first values of H filter
> + movq m11, [r11 + myq]
> +%if %1 == 8
> + punpcklwd m13, m13
> + pmovsxbw m11, m11 ; need 16bit filters for V
> +%else
> + pmovsxbw m13, m13
> + pmovsxbw m11, m11
> +%endif
pmovsxbw looks common to both
> + punpckldq m13, m13
> + punpckldq m11, m11
> + movdqa m14, m13
> + movdqa m12, m11
> + punpckhqdq m14, m14 ;H
> + punpckhqdq m12, m12 ;V
> + punpcklqdq m13, m13 ;H
> + punpcklqdq m11, m11 ;V
> +%endmacro
> +
> +
> +%macro QPEL_H_FILTER 2
> + movsxd %2q, %2d ; extend sign
> + sub %2q, 1
> + shl %2q, 4 ; multiply by 16
> + lea r11, [hevc_qpel_filters]
> + movq m11, [r11 + %2q] ; get 4 first values of filters
> +%if %1 == 8
> + punpcklwd m11, m11
> +%else
> + pmovsxbw m11, m11
> +%endif
> + punpckhdq m13, m11, m11
> + punpckldq m11, m11
> + movdqa m14, m13
> + movdqa m12, m11
> + punpckhqdq m14, m14 ; contains last 2 filters
> + punpcklqdq m13, m13 ;
> + punpckhqdq m12, m12 ;
> + punpcklqdq m11, m11 ; contains first 2 filters
> +
> +%endmacro
> +
> +%macro QPEL_V_FILTER 2
> + movdqu m6, [qpel_h_filter%1_10]
> + psrldq m7, m6, 2
> + psrldq m8, m6, 4
> + psrldq m9, m6, 6
> + psrldq m10, m6, 8
> + psrldq m11, m6, 10
> + psrldq m12, m6, 12
> + psrldq m13, m6, 14
> +
> + punpcklwd m6, m6 ; put double values to 32bit.
> + punpcklwd m7, m7
> + punpcklwd m8, m8
> + punpcklwd m9, m9
> + punpcklwd m10, m10
> + punpcklwd m11, m11
> + punpcklwd m12, m12
> + punpcklwd m13, m13
> +
> +%if %2 == 10
> + psrad m6, m6, 16
> + psrad m7, m7, 16
> + psrad m8, m8, 16
> + psrad m9, m9, 16
> + psrad m10, m10, 16
> + psrad m11, m11, 16
> + psrad m12, m12, 16
> + psrad m13, m13, 16
> +%endif
> +
You can remove the first argument here
> + pshufd m6, m6, 0
> + pshufd m7, m7, 0
> + pshufd m8, m8, 0
> + pshufd m9, m9, 0
> + pshufd m10, m10, 0
> + pshufd m11, m11, 0
> + pshufd m12, m12, 0
> + pshufd m13, m13, 0
ditto.
Also maybe SPLAT* macro?
> +
> +%endmacro
> +
> +
> +
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +;;
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
?
[...]
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +;;
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +
> +%if ARCH_X86_64
> +INIT_XMM sse4 ; adds ff_ and _sse4 to function name
> +
> +; ******************************
> +; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
> +; uint8_t *_src, ptrdiff_t _srcstride,
> +; int width, int height, int mx, int my,
> +; int16_t* mcbuffer)
> +;
> +; r0 : *dst
> +; r1 : dststride
> +; r2 : *src
> +; r3 : srcstride
> +; r4 : width
> +; r5 : height
You named them in the cglobal, so this comment is not useful.
Same below
[...]
> +cglobal hevc_put_hevc_pel_pixels2_8, 9, 12, 0 , dst, dststride, src, srcstride,width,height
> + PUT_HEVC_MC_PIXELS 2, 8, epel
> +cglobal hevc_put_hevc_pel_pixels4_8, 9, 12, 0 , dst, dststride, src, srcstride,width,height
> + PUT_HEVC_MC_PIXELS 4, 8, epel
> +cglobal hevc_put_hevc_pel_pixels8_8, 9, 12, 0 , dst, dststride, src, srcstride,width,height
> + PUT_HEVC_MC_PIXELS 8, 8, epel
> +cglobal hevc_put_hevc_pel_pixels16_8, 9, 12, 0 , dst, dststride, src, srcstride,width,height
> + PUT_HEVC_MC_PIXELS 16, 8, epel
> +
> +cglobal hevc_put_hevc_pel_pixels2_10, 9, 12, 0 , dst, dststride, src, srcstride,width,height
> + PUT_HEVC_MC_PIXELS 2, 10, epel
> +cglobal hevc_put_hevc_pel_pixels4_10, 9, 12, 0 , dst, dststride, src, srcstride,width,height
> + PUT_HEVC_MC_PIXELS 4, 10, epel
> +cglobal hevc_put_hevc_pel_pixels8_10, 9, 12, 0 , dst, dststride, src, srcstride,width,height
> + PUT_HEVC_MC_PIXELS 8, 10, epel
Move the cglobal into the macro?
Same for all the others below
[...]
--
Clément B.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 490 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140223/ad2fdb54/attachment.asc>
More information about the ffmpeg-devel
mailing list