[FFmpeg-devel] [PATCH] x86/hevc_sao: make sao_edge_filter_{10, 12} work on x86_32
James Almer
jamrial at gmail.com
Tue Feb 10 05:44:26 CET 2015
Signed-off-by: James Almer <jamrial at gmail.com>
---
libavcodec/x86/hevc_sao.asm | 106 ++++++++++++++++++++++++++----------------
libavcodec/x86/hevcdsp_init.c | 22 ++++-----
2 files changed, 74 insertions(+), 54 deletions(-)
diff --git a/libavcodec/x86/hevc_sao.asm b/libavcodec/x86/hevc_sao.asm
index cd95f0b..8c2436b 100644
--- a/libavcodec/x86/hevc_sao.asm
+++ b/libavcodec/x86/hevc_sao.asm
@@ -293,6 +293,25 @@ HEVC_SAO_BAND_FILTER_16 12, 64, 2
%define EDGE_SRCSTRIDE 2 * MAX_PB_SIZE + PADDING_SIZE
+%macro HEVC_SAO_EDGE_FILTER_INIT 1
+%if WIN64
+ movsxd eoq, dword eom
+%elif ARCH_X86_64
+ movsxd eoq, eod
+%else
+ mov eoq, r4m
+%endif
+ lea tmp2q, [pb_eo]
+ movsx a_strideq, byte [tmp2q+eoq*4+1]
+ movsx b_strideq, byte [tmp2q+eoq*4+3]
+ imul a_strideq, EDGE_SRCSTRIDE>>%1
+ imul b_strideq, EDGE_SRCSTRIDE>>%1
+ movsx tmpq, byte [tmp2q+eoq*4]
+ add a_strideq, tmpq
+ movsx tmpq, byte [tmp2q+eoq*4+2]
+ add b_strideq, tmpq
+%endmacro
+
%macro HEVC_SAO_EDGE_FILTER_COMPUTE_8 1
pminub m4, m1, m2
pminub m5, m1, m3
@@ -328,20 +347,7 @@ HEVC_SAO_BAND_FILTER_16 12, 64, 2
%if ARCH_X86_64
cglobal hevc_sao_edge_filter_%1_8, 4, 9, 8, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
%define tmp2q heightq
-%if WIN64
- movsxd eoq, dword r4m
-%else
- movsxd eoq, eod
-%endif
- lea tmp2q, [pb_eo]
- movsx a_strideq, byte [tmp2q+eoq*4+1]
- movsx b_strideq, byte [tmp2q+eoq*4+3]
- imul a_strideq, EDGE_SRCSTRIDE
- imul b_strideq, EDGE_SRCSTRIDE
- movsx tmpq, byte [tmp2q+eoq*4]
- add a_strideq, tmpq
- movsx tmpq, byte [tmp2q+eoq*4+2]
- add b_strideq, tmpq
+ HEVC_SAO_EDGE_FILTER_INIT 0
mov heightd, r6m
%else ; ARCH_X86_32
@@ -350,17 +356,7 @@ cglobal hevc_sao_edge_filter_%1_8, 1, 6, 8, dst, src, dststride, a_stride, b_str
%define tmpq heightq
%define tmp2q dststrideq
%define offsetq heightq
- mov eoq, r4m
- lea tmp2q, [pb_eo]
- movsx a_strideq, byte [tmp2q+eoq*4+1]
- movsx b_strideq, byte [tmp2q+eoq*4+3]
- imul a_strideq, EDGE_SRCSTRIDE
- imul b_strideq, EDGE_SRCSTRIDE
- movsx tmpq, byte [tmp2q+eoq*4]
- add a_strideq, tmpq
- movsx tmpq, byte [tmp2q+eoq*4+2]
- add b_strideq, tmpq
-
+ HEVC_SAO_EDGE_FILTER_INIT 0
mov srcq, srcm
mov offsetq, r3m
mov dststrideq, dststridem
@@ -442,6 +438,7 @@ INIT_YMM cpuname
paddw m4, m5
pcmpeqw m2, m4, [pw_m2]
+%if ARCH_X86_64
pcmpeqw m3, m4, m13
pcmpeqw m5, m4, m0
pcmpeqw m6, m4, m14
@@ -451,6 +448,17 @@ INIT_YMM cpuname
pand m5, m10
pand m6, m11
pand m7, m12
+%else
+ pcmpeqw m3, m4, [pw_m1]
+ pcmpeqw m5, m4, m0
+ pcmpeqw m6, m4, [pw_1]
+ pcmpeqw m7, m4, [pw_2]
+ pand m2, [rsp+MMSIZE*0]
+ pand m3, [rsp+MMSIZE*1]
+ pand m5, [rsp+MMSIZE*2]
+ pand m6, [rsp+MMSIZE*3]
+ pand m7, [rsp+MMSIZE*4]
+%endif
paddw m2, m3
paddw m5, m6
paddw m2, m7
@@ -461,26 +469,35 @@ INIT_YMM cpuname
;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
; int eo, int width, int height);
%macro HEVC_SAO_EDGE_FILTER_16 3
+%if ARCH_X86_64
cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a_stride, b_stride, height, tmp
%define tmp2q heightq
-%if WIN64
- movsxd eoq, dword r4m
-%else
- movsxd eoq, eod
-%endif
- lea tmp2q, [pb_eo]
- movsx a_strideq, byte [tmp2q+eoq*4+1]
- movsx b_strideq, byte [tmp2q+eoq*4+3]
- imul a_strideq, EDGE_SRCSTRIDE>>1
- imul b_strideq, EDGE_SRCSTRIDE>>1
- movsx tmpq, byte [tmp2q+eoq*4]
- add a_strideq, tmpq
- movsx tmpq, byte [tmp2q+eoq*4+2]
- add b_strideq, tmpq
+ HEVC_SAO_EDGE_FILTER_INIT 1
mov heightd, r6m
add a_strideq, a_strideq
add b_strideq, b_strideq
+%else ; ARCH_X86_32
+cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
+%assign MMSIZE mmsize
+%define eoq srcq
+%define tmpq heightq
+%define tmp2q dststrideq
+%define offsetq heightq
+%define m8 m1
+%define m9 m2
+%define m10 m3
+%define m11 m4
+%define m12 m5
+ HEVC_SAO_EDGE_FILTER_INIT 1
+ mov srcq, srcm
+ mov offsetq, r3m
+ mov dststrideq, dststridem
+ add a_strideq, a_strideq
+ add b_strideq, b_strideq
+
+%endif ; ARCH
+
%if cpuflag(avx2)
SPLATW m8, [offsetq+2]
SPLATW m9, [offsetq+4]
@@ -497,9 +514,18 @@ cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a
SPLATW m12, xm12, 1
%endif
pxor m0, m0
+%if ARCH_X86_64
mova m13, [pw_m1]
mova m14, [pw_1]
mova m15, [pw_2]
+%else
+ mov heightd, r6m
+ mova [rsp+mmsize*0], m8
+ mova [rsp+mmsize*1], m9
+ mova [rsp+mmsize*2], m10
+ mova [rsp+mmsize*3], m11
+ mova [rsp+mmsize*4], m12
+%endif
align 16
.loop
@@ -573,7 +599,6 @@ HEVC_SAO_EDGE_FILTER_8 48, 1, u
HEVC_SAO_EDGE_FILTER_8 64, 2, a
%endif
-%if ARCH_X86_64
INIT_XMM sse2
HEVC_SAO_EDGE_FILTER_16 10, 8, 0
HEVC_SAO_EDGE_FILTER_16 10, 16, 1
@@ -597,4 +622,3 @@ HEVC_SAO_EDGE_FILTER_16 12, 32, 1
HEVC_SAO_EDGE_FILTER_16 12, 48, 1
HEVC_SAO_EDGE_FILTER_16 12, 64, 2
%endif
-%endif
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 8f7473d..78569ae 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -865,10 +865,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (ARCH_X86_64) {
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
-
- SAO_EDGE_INIT(10, sse2);
}
SAO_BAND_INIT(10, sse2);
+ SAO_EDGE_INIT(10, sse2);
c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
@@ -907,10 +906,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
if (ARCH_X86_64) {
- c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_10_avx2;
- c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_10_avx2;
- c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_10_avx2;
-
c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
@@ -1055,6 +1050,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
}
SAO_BAND_INIT(10, avx2);
+ c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_10_avx2;
+ c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_10_avx2;
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_10_avx2;
c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
c->transform_add[3] = ff_hevc_transform_add32_10_avx2;
@@ -1071,10 +1069,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (ARCH_X86_64) {
c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
-
- SAO_EDGE_INIT(12, sse2);
}
SAO_BAND_INIT(12, sse2);
+ SAO_EDGE_INIT(12, sse2);
c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2;
c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2;
@@ -1107,12 +1104,11 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
if (EXTERNAL_AVX2(cpu_flags)) {
c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2;
c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2;
- if (ARCH_X86_64) {
- c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_12_avx2;
- c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_12_avx2;
- c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_12_avx2;
- }
+
SAO_BAND_INIT(12, avx2);
+ c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_12_avx2;
+ c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_12_avx2;
+ c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_12_avx2;
}
}
}
--
2.2.2
More information about the ffmpeg-devel
mailing list