[FFmpeg-devel] [PATCH 2/3] x86/hevcdsp: add ff_hevc_sao_edge_filter_8_{ssse3, avx2}
James Almer
jamrial at gmail.com
Fri Feb 6 16:41:51 CET 2015
On 06/02/15 9:49 AM, Christophe Gisquet wrote:
> diff --git a/libavcodec/x86/hevc_sao.asm b/libavcodec/x86/hevc_sao.asm
> index 5136121..8619716 100644
> --- a/libavcodec/x86/hevc_sao.asm
> +++ b/libavcodec/x86/hevc_sao.asm
> @@ -296,14 +296,16 @@ HEVC_SAO_BAND_FILTER_16 12, 64, 2
> %if WIN64
> cglobal hevc_sao_edge_filter_%1_8, 4, 8, 8, dst, src, dststride, offset, a_stride, b_stride, height, tmp
> %define eoq heightq
> - movsxd eoq, dword r4m
> - movsx a_strideq, byte [pb_eo+eoq*4+1]
> - movsx b_strideq, byte [pb_eo+eoq*4+3]
> + movsxd b_strideq, dword r4m
> + lea tmpq, [pb_eo]
> + lea eoq, [tmpq+4*b_strideq]
> + movsx a_strideq, byte [eoq+1]
> + movsx b_strideq, byte [eoq+3]
> imul a_strideq, EDGE_SRCSTRIDE
> imul b_strideq, EDGE_SRCSTRIDE
> - movsx tmpq, byte [pb_eo+eoq*4]
> + movsx tmpq, byte [eoq]
> add a_strideq, tmpq
> - movsx tmpq, byte [pb_eo+eoq*4+2]
> + movsx tmpq, byte [eoq+2]
> add b_strideq, tmpq
> mov heightd, r6m
>
> @@ -442,14 +444,16 @@ INIT_YMM cpuname
> %if WIN64
> cglobal hevc_sao_edge_filter_%2_%1, 4, 8, 16, dst, src, dststride, offset, a_stride, b_stride, height, tmp
> %define eoq heightq
> - movsxd eoq, dword r4m
> - movsx a_strideq, byte [pb_eo+eoq*4+1]
> - movsx b_strideq, byte [pb_eo+eoq*4+3]
> + movsxd b_strideq, dword r4m
> + lea tmpq, [pb_eo]
> + lea eoq, [tmpq+4*b_strideq]
> + movsx a_strideq, byte [eoq+1]
> + movsx b_strideq, byte [eoq+3]
> imul a_strideq, EDGE_SRCSTRIDE>>1
> imul b_strideq, EDGE_SRCSTRIDE>>1
> - movsx tmpq, byte [pb_eo+eoq*4]
> + movsx tmpq, byte [eoq]
> add a_strideq, tmpq
> - movsx tmpq, byte [pb_eo+eoq*4+2]
> + movsx tmpq, byte [eoq+2]
> add b_strideq, tmpq
> mov heightd, r6m
> add a_strideq, a_strideq
Wouldn't it be better to just use the same code as UNIX64 instead? Now that we're going to load
the address of the table into a register, there's no point in having a whole separate init path for
WIN64.
One path for X86_64 and one for X86_32 (where applicable) is much cleaner.
More information about the ffmpeg-devel
mailing list