[FFmpeg-devel] [patch][OpenHEVC]added ASM functions for epel + qpel

Tue Mar 4 04:20:25 CET 2014

Hi,

OK going to try a full review. For future reference, try to git send-email
your patches. Also, when you make changes, amend existing patches, don't
send them on top, I'm not reviewing patch 3/3 because it should be merged
with 2/3, except for this part:

patch 1:

> +SECTION_RODATA
> +
> +hevc_epel_filters_asm_8 times 8 db -2, 58
> +                        times 8 db 10, -2
> +                        times 8 db -4, 54
> +                        times 8 db 16, -2
> +                        times 8 db -6, 46
> +                        times 8 db 28, -4
> +                        times 8 db -4, 36
> +                        times 8 db 36, -4
> +                        times 8 db -4, 28
> +                        times 8 db 46, -6
> +                        times 8 db -2, 16
> +                        times 8 db 54, -4
> +                        times 8 db -2, 10
> +                        times 8 db 58, -2
> +
> +hevc_epel_filters_asm_10 times 4 dw -2, 58

patch 2:

> +ALIGN 16
>  hevc_epel_filters_asm_8 times 8 db -2, 58

ALIGN_16 is implicit in SECTION_RODATA.

> +                        times 4 dw 10, -2
> +                        times 4 dw -4, 54
> +                        times 4 dw 16, -2
> +                        times 4 dw -6, 46
> +                        times 4 dw 28, -4
> +                        times 4 dw -4, 36
> +                        times 4 dw 36, -4
> +                        times 4 dw -4, 28
> +                        times 4 dw 46, -6
> +                        times 4 dw -2, 16
> +                        times 4 dw 54, -4
> +                        times 4 dw -2, 10
> +                        times 4 dw 58, -2

Consider making this table a macro where the type (db or dw) is an
argument, like this:

%macro TABLE 1
name_%1: times 8 d%1 0, 1
         times 8 d%1 2, 3
%endmacro
TABLE b
TABLE w

> +%macro EPEL_FILTER 2                             ; bit depth, filter
index
> +    movsxd           %2q, %2d                    ; extend sign

BAD! BAD! BAD! BAD!

Whenever you write movsxd, a cute little kitten dies. Make your function
argument ptrdiff_t or intptr_t instead of int to prevent this.

> +    sub              %2q, 1
> +    shl              %2q, 5                      ; multiply by 32
> +    lea         rfilterq, [hevc_epel_filters_asm_%1]
> +    movdqa           m13, [rfilterq + %2q]        ; get 2 first values
of filters

Make lea dependent on %ifdef PIC.

One of the indirections can be removed if you write e.g.:

lea %2q, [%2q*8+1]
%ifdef PIC
lea rfilterq, [stuff]
%else
%define rfilterq stuff
%endif
movdqa m13, [rfilterq+%2q*4]

In most optimal case (no PIC - e.g. Linux64), this is 2 instead of 5
instructions, else (Win64, Mac64) 3.

> +%macro EPEL_HV_FILTER 1
> +    movsxd           mxq, mxd                    ; extend sign
> +    movsxd           myq, myd                    ; extend sign

BAD you just killed 2 kittens.

> +    sub              mxq, 1
> +    sub              myq, 1
> +    shl              mxq, 5                      ; multiply by 32
> +    shl              myq, 5                      ; multiply by 32
> +    lea         rfilterq, [hevc_epel_filters_asm_%1]
> +    movdqa           m13, [rfilterq + mxq]        ; get 2 first values
of filters
> +    movdqa           m14, [rfilterq + mxq+16]     ; get 2 last values of
filters
> +    lea         rfilterq, [hevc_epel_filters_asm_10]
> +    movdqa           m11, [rfilterq + myq]        ; get 2 first values
of filters
> +    movdqa           m12, [rfilterq + myq+16]     ; get 2 last values of
filters

See my earlier comments about making lea conditional on PIC and merging the
3 indirections into 2.

> +%macro QPEL_FILTER 2
> +    movsxd           %2q, %2d                    ; extend sign
> +    sub              %2q, 1
> +    shl              %2q, 6                      ; multiply by 16
> +    lea         rfilterq, [hevc_qpel_filters_asm_%1]
> +    movdqa           m11, [rfilterq + %2q]       ; get 4 first values of
filters

You know what to do here now.

> +%macro EPEL_LOAD 3
> +    lea              r11, [%2q]

Why?

> +    movdqu            m0, [r11     ]            ;load 128bit of x
> +%ifidn %3, srcstride

%ifnum

> +    lea            r3srcq, [%3q*3]

Uhm, assuming you call EPEL_LOAD in a loop (over h, possibly multiple times
per 1 h for larger w), doesn't this re-lea in every loop iteration?
Probably do this outside loop.

> +%if %1 == 8
> +    SBUTTERFLY        bw, 0, 1, 10
> +    SBUTTERFLY        bw, 2, 3, 10
> +%else
> +    SBUTTERFLY        wd, 0, 1, 10
> +    SBUTTERFLY        wd, 2, 3, 10
> +%endif

I see you're calling this macro even if width=small (like 2, 4, 8 for
8bit); in those cases, you don't need movdqu, movd for w=4, movd or pinsrd
for w=2 and movq for w=8 is just fine, and the SBUTTERFLY is overly complex
(it combines a punpcklXX with a punpckhXX), you just need punpcklXX in
those cases.

> +%macro QPEL_H_LOAD 2
> +%if %1 = 8
> +    movdqu            m0, [%2q-3]          ; load data from source
> +    movdqu            m1, [%2q-2]
> > +    movdqu            m2, [%2q-1]
> +    movdqu            m3, [%2q  ]
> +    movdqu            m4, [%2q+1]
> +    movdqu            m5, [%2q+2]
> +    movdqu            m6, [%2q+3]
> +    movdqu            m7, [%2q+4]
> +    SBUTTERFLY        wd, 0, 1, 10
> +    SBUTTERFLY        wd, 2, 3, 10
> +    SBUTTERFLY        wd, 4, 5, 10
> +    SBUTTERFLY        wd, 6, 7, 10
> +%else
> +    movdqu            m0, [%2q-6]          ; load data from source
> +    movdqu            m1, [%2q-4]
> +    movdqu            m2, [%2q-2]
> +    movdqu            m3, [%2q  ]
> +    movdqu            m4, [%2q+2]
> +    movdqu            m5, [%2q+4]
> +    movdqu            m6, [%2q+6]
> +    movdqu            m7, [%2q+8]
> +    SBUTTERFLY        dq, 0, 1, 10
> +    SBUTTERFLY        dq, 2, 3, 10
> +    SBUTTERFLY        dq, 4, 5, 10
> +    SBUTTERFLY        dq, 6, 7, 10
> +%endif
> +%endmacro

%assign %%stride (%1+7)/8, and then use -3*%%stride instead of -6/-3 to
merge these two pieces. Also same comment as above, use punpcklXX oly for
w<=8 if 8bit or w<=4 if 10bit.

> +%macro QPEL_V_LOAD 3
> +    xor              r11, r11
> +%if %1 == 8
> +    lea              r11, [%2q]
> +%else
> +    lea              r11, [%2q]
> +%endif

Merge (mov r11, %2q) - and why? Can't you just use %2q directly?

> +    lea           r3srcq, [%3q*3]

Does not require loop context => move out of loop.

> +%if %1 == 8
> +    SBUTTERFLY        bw, 0, 1, 8
> +    SBUTTERFLY        bw, 2, 3, 8
> +    SBUTTERFLY        bw, 4, 5, 8
> +    SBUTTERFLY        bw, 6, 7, 8
> +%else
> +    SBUTTERFLY        wd, 0, 1, 8
> +    SBUTTERFLY        wd, 2, 3, 8
> +    SBUTTERFLY        wd, 4, 5, 8
> +    SBUTTERFLY        wd, 6, 7, 8
> +%endif

I see a pattern.

OK rest comes later, sleep time.

Ronald