[FFmpeg-devel] [PATCH] x86/hevc_sao: make sao_band_filter work on x86_32

James Almer jamrial at gmail.com
Sat Feb 7 23:06:30 CET 2015


Signed-off-by: James Almer <jamrial at gmail.com>
---
 libavcodec/x86/hevc_sao.asm   | 40 ++++++++++++++++++++++++++++++++++++----
 libavcodec/x86/hevcdsp_init.c | 24 ++++++++++++------------
 2 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/libavcodec/x86/hevc_sao.asm b/libavcodec/x86/hevc_sao.asm
index 8202236..e669ef3 100644
--- a/libavcodec/x86/hevc_sao.asm
+++ b/libavcodec/x86/hevc_sao.asm
@@ -45,7 +45,6 @@ SECTION_TEXT
 ;SAO Band Filter
 ;******************************************************************************
 
-%if ARCH_X86_64
 %macro HEVC_SAO_BAND_FILTER_INIT 1
     and            leftq, 31
     movd             xm0, leftd
@@ -76,17 +75,37 @@ SECTION_TEXT
     SPLATW            m7, m7, 3
 %endif
 
+%if ARCH_X86_64
 %if %1 > 8
     mova             m13, [pw_mask %+ %1]
 %endif
     pxor             m14, m14
 
+%else ; ARCH_X86_32
+    mova  [rsp+mmsize*0], m0
+    mova  [rsp+mmsize*1], m1
+    mova  [rsp+mmsize*2], m2
+    mova  [rsp+mmsize*3], m3
+    mova  [rsp+mmsize*4], m4
+    mova  [rsp+mmsize*5], m5
+    mova  [rsp+mmsize*6], m6
+    pxor              m0, m0
+%if %1 > 8
+    mova              m1, [pw_mask %+ %1]
+%endif
+    %assign MMSIZE mmsize
+    %define m14 m0
+    %define m13 m1
+    %define  m9 m2
+    %define  m8 m3
+%endif ; ARCH
 DEFINE_ARGS dst, src, dststride, srcstride, offset, height
     mov          heightd, r7m
 %endmacro
 
 %macro HEVC_SAO_BAND_FILTER_COMPUTE 3
     psraw             %2, %3, %1-5
+%if ARCH_X86_64
     pcmpeqw          m10, %2, m0
     pcmpeqw          m11, %2, m1
     pcmpeqw          m12, %2, m2
@@ -99,12 +118,26 @@ DEFINE_ARGS dst, src, dststride, srcstride, offset, height
     por              m12, %2
     por              m10, m12
     paddw             %3, m10
+%else ; ARCH_X86_32
+    pcmpeqw           m4, %2, [rsp+MMSIZE*0]
+    pcmpeqw           m5, %2, [rsp+MMSIZE*1]
+    pcmpeqw           m6, %2, [rsp+MMSIZE*2]
+    pcmpeqw           %2, [rsp+MMSIZE*3]
+    pand              m4, [rsp+MMSIZE*4]
+    pand              m5, [rsp+MMSIZE*5]
+    pand              m6, [rsp+MMSIZE*6]
+    pand              %2, m7
+    por               m4, m5
+    por               m6, %2
+    por               m4, m6
+    paddw             %3, m4
+%endif ; ARCH
 %endmacro
 
 ;void ff_hevc_sao_band_filter_<width>_8_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
 ;                                             int16_t *sao_offset_val, int sao_left_class, int width, int height);
 %macro HEVC_SAO_BAND_FILTER_8 2
-cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, dst, src, dststride, srcstride, offset, left
+cglobal hevc_sao_band_filter_%1_8, 6, 6, 15, 8*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
     HEVC_SAO_BAND_FILTER_INIT 8
 
 align 16
@@ -154,7 +187,7 @@ INIT_YMM cpuname
 ;void ff_hevc_sao_band_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
 ;                                                   int16_t *sao_offset_val, int sao_left_class, int width, int height);
 %macro HEVC_SAO_BAND_FILTER_16 3
-cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, dst, src, dststride, srcstride, offset, left
+cglobal hevc_sao_band_filter_%2_%1, 6, 6, 15, 8*mmsize*ARCH_X86_32, dst, src, dststride, srcstride, offset, left
     HEVC_SAO_BAND_FILTER_INIT %1
 
 align 16
@@ -253,7 +286,6 @@ HEVC_SAO_BAND_FILTER_16 12, 32, 1
 HEVC_SAO_BAND_FILTER_16 12, 48, 1
 HEVC_SAO_BAND_FILTER_16 12, 64, 2
 %endif
-%endif
 
 ;******************************************************************************
 ;SAO Edge Filter
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index f7b3d0f..8f7473d 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -714,8 +714,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
 
-                SAO_BAND_INIT(8, sse2);
             }
+            SAO_BAND_INIT(8, sse2);
+
             c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
             c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
             c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
@@ -749,9 +750,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             if (ARCH_X86_64) {
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
-
-                SAO_BAND_INIT(8, avx);
             }
+            SAO_BAND_INIT(8, avx);
+
             c->transform_add[1]    = ff_hevc_transform_add8_8_avx;
             c->transform_add[2]    = ff_hevc_transform_add16_8_avx;
             c->transform_add[3]    = ff_hevc_transform_add32_8_avx;
@@ -760,7 +761,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
             c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
             if (ARCH_X86_64) {
-                SAO_BAND_INIT(8, avx2);
                 c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
                 c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
                 c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
@@ -845,6 +845,8 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                 c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
                 c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
             }
+            SAO_BAND_INIT(8, avx2);
+
             c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
             c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
             c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
@@ -864,9 +866,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
 
-                SAO_BAND_INIT(10, sse2);
                 SAO_EDGE_INIT(10, sse2);
             }
+            SAO_BAND_INIT(10, sse2);
 
             c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
             c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
@@ -897,16 +899,14 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             if (ARCH_X86_64) {
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
-
-                SAO_BAND_INIT(10, avx);
             }
+            SAO_BAND_INIT(10, avx);
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
 
             c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
             c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
             if (ARCH_X86_64) {
-                SAO_BAND_INIT(10, avx2);
                 c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_10_avx2;
                 c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_10_avx2;
                 c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_10_avx2;
@@ -1054,6 +1054,7 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                 c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
                 c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
             }
+            SAO_BAND_INIT(10, avx2);
 
             c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
             c->transform_add[3] = ff_hevc_transform_add32_10_avx2;
@@ -1071,9 +1072,9 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
 
-                SAO_BAND_INIT(12, sse2);
                 SAO_EDGE_INIT(12, sse2);
             }
+            SAO_BAND_INIT(12, sse2);
 
             c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2;
             c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2;
@@ -1100,19 +1101,18 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             if (ARCH_X86_64) {
                 c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
-
-                SAO_BAND_INIT(12, avx);
             }
+            SAO_BAND_INIT(12, avx);
         }
         if (EXTERNAL_AVX2(cpu_flags)) {
             c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2;
             c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2;
             if (ARCH_X86_64) {
-                SAO_BAND_INIT(12, avx2);
                 c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_12_avx2;
                 c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_12_avx2;
                 c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_12_avx2;
             }
+            SAO_BAND_INIT(12, avx2);
         }
     }
 }
-- 
2.2.2



More information about the ffmpeg-devel mailing list