[FFmpeg-devel] [PATCH] x86/vf_w3fdif: 32-bit compatibility for w3fdif_simple_high

Hendrik Leppkes h.leppkes at gmail.com
Thu Jan 7 11:21:27 CET 2016


On Thu, Jan 7, 2016 at 4:18 AM, James Almer <jamrial at gmail.com> wrote:
> On 1/6/2016 11:54 PM, Hendrik Leppkes wrote:
>> ---
>> Based on an idea from Ronald mentioend in an earlier thread about this function.
>>
>> It works and passes FATE, however I'm sure some aspects can be done easier or cleaner, so please let me know.
>>
>>
>>  libavfilter/x86/vf_w3fdif.asm    | 37 ++++++++++++++++++++++++++++++++++---
>>  libavfilter/x86/vf_w3fdif_init.c |  2 +-
>>  2 files changed, 35 insertions(+), 4 deletions(-)
>>
>> diff --git a/libavfilter/x86/vf_w3fdif.asm b/libavfilter/x86/vf_w3fdif.asm
>> index c3c73ea..35768c3 100644
>> --- a/libavfilter/x86/vf_w3fdif.asm
>> +++ b/libavfilter/x86/vf_w3fdif.asm
>> @@ -102,14 +102,22 @@ cglobal w3fdif_complex_low, 4, 7, 8, 0, work_line, in_lines_cur0, coef, linesize
>>  REP_RET
>>
>>  %if ARCH_X86_64
>> -
>>  cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
>> +%else
>> +cglobal w3fdif_simple_high, 4, 7, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
>> +%endif
>>      movq                  m2, [coefq]
>> -    DEFINE_ARGS    work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2
>> +%if ARCH_X86_64
>> +    DEFINE_ARGS    work_line, in_lines_cur0, in_lines_adj0, linesize, offset, in_lines_cur1, in_lines_cur2, in_lines_adj1, in_lines_adj2
>
> This broke x86_64. Leave it as it was above.

The change wasn't intentional, I mangled that when moving things
around apparently.

>
>> +    mov              offsetq, 0
>
> Since you're moving this take the chance to replace it with a xor.

Done

>
>> +%else
>> +    DEFINE_ARGS    work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, in_lines_cur2, in_lines_adj1, in_lines_adj2
>> +    %define linesized dword r4m
>
> Nit: r4mp instead of dword r4m

I stole dword r4m from audio_convert.asm, but apparently r4mp aliases
to dword r4m, so changed anyway.
>
>> +%endif
>> +
>>      pshufd                m0, m2, q0000
>>      SPLATW                m2, m2, 2
>>      pxor                  m7, m7
>> -    mov              offsetq, 0
>>      mov       in_lines_cur2q, [in_lines_cur0q+gprsize*2]
>>      mov       in_lines_cur1q, [in_lines_cur0q+gprsize]
>>      mov       in_lines_cur0q, [in_lines_cur0q]
>> @@ -117,8 +125,21 @@ cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0,
>>      mov       in_lines_adj1q, [in_lines_adj0q+gprsize]
>>      mov       in_lines_adj0q, [in_lines_adj0q]
>>
>> +%if ARCH_X86_32
>> +    sub in_lines_cur1q, in_lines_cur0q
>> +    sub in_lines_cur2q, in_lines_cur0q
>> +    sub in_lines_adj0q, in_lines_cur0q
>> +    sub in_lines_adj1q, in_lines_cur0q
>> +    sub in_lines_adj2q, in_lines_cur0q
>> +    %define offsetq in_lines_cur0q
>> +%endif
>> +
>>  .loop:
>> +%if ARCH_X86_64
>>      movh                                   m3, [in_lines_cur0q+offsetq]
>> +%else
>> +    movh                                   m3, [in_lines_cur0q]
>> +%endif
>>      movh                                   m4, [in_lines_cur1q+offsetq]
>>      punpcklbw                              m3, m7
>>      punpcklbw                              m4, m7
>> @@ -143,15 +164,25 @@ cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0,
>>      pmaddwd                                m6, m2
>>      paddd                                  m3, m5
>>      paddd                                  m4, m6
>> +%if ARCH_X86_64
>>      paddd                                  m3, [work_lineq+offsetq*4]
>>      paddd                                  m4, [work_lineq+offsetq*4+mmsize]
>>      mova               [work_lineq+offsetq*4], m3
>>      mova        [work_lineq+offsetq*4+mmsize], m4
>> +%else
>> +    paddd                                  m3, [work_lineq]
>> +    paddd                                  m4, [work_lineq+mmsize]
>> +    mova                         [work_lineq], m3
>> +    mova                  [work_lineq+mmsize], m4
>> +    add                            work_lineq, mmsize*2
>> +%endif
>>      add                               offsetq, mmsize/2
>>      sub                             linesized, mmsize/2
>>      jg .loop
>>  REP_RET
>>
>> +%if ARCH_X86_64
>> +
>>  cglobal w3fdif_complex_high, 5, 13, 10, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
>>      movq                  m0, [coefq+0]
>>      movd                  m4, [coefq+8]
>> diff --git a/libavfilter/x86/vf_w3fdif_init.c b/libavfilter/x86/vf_w3fdif_init.c
>> index 72ea657..9bf06e8 100644
>> --- a/libavfilter/x86/vf_w3fdif_init.c
>> +++ b/libavfilter/x86/vf_w3fdif_init.c
>> @@ -51,12 +51,12 @@ av_cold void ff_w3fdif_init_x86(W3FDIFDSPContext *dsp)
>>
>>      if (EXTERNAL_SSE2(cpu_flags)) {
>>          dsp->filter_simple_low   = ff_w3fdif_simple_low_sse2;
>> +        dsp->filter_simple_high  = ff_w3fdif_simple_high_sse2;
>>          dsp->filter_complex_low  = ff_w3fdif_complex_low_sse2;
>>          dsp->filter_scale        = ff_w3fdif_scale_sse2;
>>      }
>>
>>      if (ARCH_X86_64 && EXTERNAL_SSE2(cpu_flags)) {
>> -        dsp->filter_simple_high  = ff_w3fdif_simple_high_sse2;
>>          dsp->filter_complex_high = ff_w3fdif_complex_high_sse2;
>>      }
>>  }
>>
>
> Seems to work. Maybe it can be improved but it should be good as is.
>
> And to answer your question, no, i wasn't working on this. I got
> distracted with other filters :P

Will re-test both x86 and x64 in a bit and push in a few hours if no
further comments show up.

Thanks.

- Hendrik


More information about the ffmpeg-devel mailing list