[FFmpeg-cvslog] avcodec/x86/hevc: fix luma 12b overflow

Ronald S. Bultje git at videolan.org
Mon Feb 26 16:28:12 EET 2024


ffmpeg | branch: master | Ronald S. Bultje <rsbultje at gmail.com> | Sun Feb 25 10:49:35 2024 -0500| [d6083f503d5bd7f9a2540c3e30d95e7add765d1e] | committer: J. Dekker

avcodec/x86/hevc: fix luma 12b overflow

Signed-off-by: J. Dekker <jdek at itanimul.li>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=d6083f503d5bd7f9a2540c3e30d95e7add765d1e
---

 libavcodec/x86/hevc_deblock.asm | 40 +++++++++++++++++++++++++++++++---------
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm
index 85ee4800bb..61b79f8079 100644
--- a/libavcodec/x86/hevc_deblock.asm
+++ b/libavcodec/x86/hevc_deblock.asm
@@ -541,19 +541,41 @@ ALIGN 16
     add             betaq, r13
     shr             betaq, 3; ((beta + (beta >> 1)) >> 3))
 
-    mova            m13, [pw_8]
     psubw           m12, m4, m3 ; q0 - p0
-    psllw           m10, m12, 3; 8 * (q0 - p0)
-    paddw           m12, m10 ; 9 * (q0 - p0)
-
+    paddw           m10, m12, m12
+    paddw           m12, m10 ; 3 * (q0 - p0)
     psubw           m10, m5, m2 ; q1 - p1
-    psllw            m8, m10, 1; 2 * ( q1 - p1 )
-    paddw           m10, m8; 3 * ( q1 - p1 )
-    psubw           m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
-    paddw           m12, m13; + 8
+    psubw           m12, m10 ; 3 * (q0 - p0) - (q1 - p1)
+%if %1 < 12
+    paddw           m10, m12, m12
+    paddw           m12, [pw_8]; + 8
+    paddw           m12, m10 ; 9 * (q0 - p0) - 3 * ( q1 - p1 )
     psraw           m12, 4; >> 4 , delta0
     PABSW           m13, m12; abs(delta0)
-
+%elif cpuflag(ssse3)
+    pabsw           m13, m12
+    paddw           m10, m13, m13
+    paddw           m13, [pw_8]
+    paddw           m13, m10 ; abs(9 * (q0 - p0) - 3 * ( q1 - p1 )) + 8
+    pxor            m10, m10
+    pcmpgtw         m10, m12
+    paddw           m13, m10
+    psrlw           m13, 4; >> 4, abs(delta0)
+    psignw          m10, m13, m12
+    SWAP             10, 12
+%else
+    pxor            m10, m10
+    pcmpgtw         m10, m12
+    pxor            m12, m10
+    psubw           m12, m10 ; abs()
+    paddw           m13, m12, m12
+    paddw           m12, [pw_8]
+    paddw           m13, m12 ; 3 * abs(3 * (q0 - p0) - (q1 - p1)) + 8
+    paddw           m13, m10
+    psrlw           m13, 4
+    pxor            m12, m13, m10
+    psubw           m12, m10
+%endif
 
     psllw           m10, m9, 2; 8 * tc
     paddw           m10, m9; 10 * tc



More information about the ffmpeg-cvslog mailing list