[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD

Sun Dec 3 21:00:21 EET 2017

On 12/3/2017 3:55 PM, Martin Vignali wrote:
> in O2 or O3 : clang -S -O3 test_asm_gen.c
> 
> If I understand correctly, this is the same idea as Paul's patch,
> but processing two xmm registers in the main loop
> 
>     .section    __TEXT,__text,regular,pure_instructions
>     .macosx_version_min 10, 12
>     .section    __TEXT,__literal16,16byte_literals
>     .p2align    4
> LCPI0_0:
>     .byte    15                      ## 0xf
>     .byte    14                      ## 0xe
>     .byte    13                      ## 0xd
>     .byte    12                      ## 0xc
>     .byte    11                      ## 0xb
>     .byte    10                      ## 0xa
>     .byte    9                       ## 0x9
>     .byte    8                       ## 0x8
>     .byte    7                       ## 0x7
>     .byte    6                       ## 0x6
>     .byte    5                       ## 0x5
>     .byte    4                       ## 0x4
>     .byte    3                       ## 0x3
>     .byte    2                       ## 0x2
>     .byte    1                       ## 0x1
>     .byte    0                       ## 0x0
>     .section    __TEXT,__text,regular,pure_instructions
>     .globl    _hflip_byte_c
>     .p2align    4, 0x90
> _hflip_byte_c:                          ## @hflip_byte_c
>     .cfi_startproc
> ## BB#0:
>     pushq    %rbp
> Ltmp0:
>     .cfi_def_cfa_offset 16
> Ltmp1:
>     .cfi_offset %rbp, -16
>     movq    %rsp, %rbp
> Ltmp2:
>     .cfi_def_cfa_register %rbp
>                                         ## kill: %EDX<def> %EDX<kill> %RDX<def>
>     testl    %edx, %edx
>     jle    LBB0_17
> ## BB#1:
>     movl    %edx, %r8d
>     cmpl    $32, %edx
>     jae    LBB0_3
> ## BB#2:
>     xorl    %r11d, %r11d
>     jmp    LBB0_11
> LBB0_3:
>     andl    $31, %edx
>     movq    %r8, %r11
>     subq    %rdx, %r11
>     je    LBB0_7
> ## BB#4:
>     leaq    1(%rdi), %rax
>     cmpq    %rsi, %rax
>     jbe    LBB0_8
> ## BB#5:
>     leaq    (%rsi,%r8), %r9
>     movl    $1, %eax
>     subq    %r8, %rax
>     addq    %rdi, %rax
>     cmpq    %r9, %rax
>     jae    LBB0_8
> LBB0_7:
>     xorl    %r11d, %r11d
>     jmp    LBB0_11
> LBB0_8:
>     leaq    -15(%rdi), %r9
>     leaq    16(%rsi), %rax
>     movdqa    LCPI0_0(%rip), %xmm0    ## xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
>     movq    %r11, %r10
>     .p2align    4, 0x90
> LBB0_9:                                 ## =>This Inner Loop Header: Depth=1
>     movdqu    -16(%r9), %xmm1
>     movdqu    (%r9), %xmm2
>     pshufb    %xmm0, %xmm2
>     pshufb    %xmm0, %xmm1
>     movdqu    %xmm2, -16(%rax)
>     movdqu    %xmm1, (%rax)
>     addq    $-32, %r9
>     addq    $32, %rax
>     addq    $-32, %r10
>     jne    LBB0_9

Huh, so we're not disabling tree vectorization with clang, only with
GCC. I guess clang hasn't generated broken code so far, so there was
nothing to justify disabling it.

In any case, if clang or gcc can generate better code, then the
hand-written version needs to be optimized to be as fast or faster.