[FFmpeg-cvslog] [ffmpeg] branch master updated. e6635ada64 avfilter/vf_colordetect: optimize C functions a bit

ffmpeg-git at ffmpeg.org ffmpeg-git at ffmpeg.org
Mon Aug 11 20:39:50 EEST 2025


The branch, master has been updated
       via  e6635ada646b9ca65355a7904000103e1a0bd31e (commit)
      from  85e8e590015e918462031cac21c9c5862a1776b8 (commit)


- Log -----------------------------------------------------------------
commit e6635ada646b9ca65355a7904000103e1a0bd31e
Author:     Kacper Michajłow <kasper93 at gmail.com>
AuthorDate: Wed Aug 6 18:16:08 2025 +0200
Commit:     Kacper Michajłow <kasper93 at gmail.com>
CommitDate: Mon Aug 11 17:39:23 2025 +0000

    avfilter/vf_colordetect: optimize C functions a bit
    
    They are used to process the tail, so it is still worth making them
    faster, even when the AVX versions are used.
    
    GCC 14.2.0 | x86_64 (default config) | Before:
    
    detect_alpha_8_full_c:                                3803.0 ( 1.00x)
    detect_alpha_8_full_avx2:                              166.4 (22.86x)
    detect_alpha_8_full_avx512icl:                         144.2 (26.37x)
    detect_alpha_8_limited_c:                            10454.4 ( 1.00x)
    detect_alpha_8_limited_avx2:                           616.5 (16.96x)
    detect_alpha_8_limited_avx512icl:                      509.4 (20.52x)
    detect_alpha_16_full_c:                               1903.0 ( 1.00x)
    detect_alpha_16_full_avx2:                             172.4 (11.04x)
    detect_alpha_16_full_avx512icl:                        163.4 (11.65x)
    detect_alpha_16_limited_c:                            3703.6 ( 1.00x)
    detect_alpha_16_limited_avx2:                          644.4 ( 5.75x)
    detect_alpha_16_limited_avx512icl:                     558.0 ( 6.64x)
    detect_range_8_c:                                     5855.9 ( 1.00x)
    detect_range_8_avx2:                                   150.4 (38.94x)
    detect_range_8_avx512icl:                              146.7 (39.91x)
    detect_range_16_c:                                    2702.2 ( 1.00x)
    detect_range_16_avx2:                                  256.7 (10.53x)
    detect_range_16_avx512icl:                             116.8 (23.13x)
    
    GCC 14.2.0 | x86_64 (default config) | After:
    
    detect_alpha_8_full_c:                                 376.3 ( 1.00x)
    detect_alpha_8_full_avx2:                              169.2 ( 2.22x)
    detect_alpha_8_full_avx512icl:                         134.6 ( 2.80x)
    detect_alpha_8_limited_c:                             6024.1 ( 1.00x)
    detect_alpha_8_limited_avx2:                           641.8 ( 9.39x)
    detect_alpha_8_limited_avx512icl:                      493.0 (12.22x)
    detect_alpha_16_full_c:                                436.4 ( 1.00x)
    detect_alpha_16_full_avx2:                             156.3 ( 2.79x)
    detect_alpha_16_full_avx512icl:                        151.8 ( 2.87x)
    detect_alpha_16_limited_c:                            3679.9 ( 1.00x)
    detect_alpha_16_limited_avx2:                          642.0 ( 5.73x)
    detect_alpha_16_limited_avx512icl:                     555.2 ( 6.63x)
    detect_range_8_c:                                      655.2 ( 1.00x)
    detect_range_8_avx2:                                   153.9 ( 4.26x)
    detect_range_8_avx512icl:                              147.4 ( 4.45x)
    detect_range_16_c:                                     743.3 ( 1.00x)
    detect_range_16_avx2:                                  258.6 ( 2.87x)
    detect_range_16_avx512icl:                             107.7 ( 6.90x)
    
    Clang 19.1.7 | x86_64 (default config) | Before:
    
    detect_alpha_8_full_c:                                7013.4 ( 1.00x)
    detect_alpha_8_full_avx2:                              141.8 (49.46x)
    detect_alpha_8_full_avx512icl:                         133.8 (52.40x)
    detect_alpha_8_limited_c:                             7038.8 ( 1.00x)
    detect_alpha_8_limited_avx2:                           605.0 (11.63x)
    detect_alpha_8_limited_avx512icl:                      506.5 (13.90x)
    detect_alpha_16_full_c:                               1799.5 ( 1.00x)
    detect_alpha_16_full_avx2:                             143.0 (12.59x)
    detect_alpha_16_full_avx512icl:                        127.5 (14.12x)
    detect_alpha_16_limited_c:                            3499.6 ( 1.00x)
    detect_alpha_16_limited_avx2:                          633.6 ( 5.52x)
    detect_alpha_16_limited_avx512icl:                     551.9 ( 6.34x)
    detect_range_8_c:                                     5253.6 ( 1.00x)
    detect_range_8_avx2:                                   125.0 (42.01x)
    detect_range_8_avx512icl:                              123.2 (42.65x)
    detect_range_16_c:                                    3055.2 ( 1.00x)
    detect_range_16_avx2:                                  230.0 (13.28x)
    detect_range_16_avx512icl:                              95.9 (31.86x)
    
    Clang 19.1.7 | x86_64 (default config) | After:
    
    detect_alpha_8_full_c:                                 323.3 ( 1.00x)
    detect_alpha_8_full_avx2:                              149.7 ( 2.16x)
    detect_alpha_8_full_avx512icl:                         127.7 ( 2.53x)
    detect_alpha_8_limited_c:                             5075.9 ( 1.00x)
    detect_alpha_8_limited_avx2:                           625.4 ( 8.12x)
    detect_alpha_8_limited_avx512icl:                      493.0 (10.30x)
    detect_alpha_16_full_c:                                421.0 ( 1.00x)
    detect_alpha_16_full_avx2:                             238.8 ( 1.76x)
    detect_alpha_16_full_avx512icl:                        126.0 ( 3.34x)
    detect_alpha_16_limited_c:                            3516.8 ( 1.00x)
    detect_alpha_16_limited_avx2:                          624.7 ( 5.63x)
    detect_alpha_16_limited_avx512icl:                     544.7 ( 6.46x)
    detect_range_8_c:                                      609.1 ( 1.00x)
    detect_range_8_avx2:                                   239.4 ( 2.54x)
    detect_range_8_avx512icl:                               89.0 ( 6.84x)
    detect_range_16_c:                                     463.9 ( 1.00x)
    detect_range_16_avx2:                                  127.4 ( 3.64x)
    detect_range_16_avx512icl:                              86.4 ( 5.37x)
    
    Signed-off-by: Kacper Michajłow <kasper93 at gmail.com>

diff --git a/libavfilter/vf_colordetect.h b/libavfilter/vf_colordetect.h
index afd2db9c26..24279643d3 100644
--- a/libavfilter/vf_colordetect.h
+++ b/libavfilter/vf_colordetect.h
@@ -22,6 +22,7 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <libavutil/avassert.h>
 #include <libavutil/macros.h>
 #include <libavutil/pixfmt.h>
 
@@ -44,39 +45,61 @@ void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
 void ff_color_detect_dsp_init_x86(FFColorDetectDSPContext *dsp, int depth,
                                   enum AVColorRange color_range);
 
-static inline int ff_detect_range_c(const uint8_t *data, ptrdiff_t stride,
+static inline int ff_detect_range_impl_c(const uint8_t *data, ptrdiff_t stride,
                                     ptrdiff_t width, ptrdiff_t height,
-                                    int mpeg_min, int mpeg_max)
+                                    uint8_t mpeg_min, uint8_t mpeg_max)
 {
     while (height--) {
+        uint8_t cond = 0;
         for (int x = 0; x < width; x++) {
             const uint8_t val = data[x];
-            if (val < mpeg_min || val > mpeg_max)
-                return 1;
+            cond |= val < mpeg_min || val > mpeg_max;
         }
+        if (cond)
+            return 1;
         data += stride;
     }
 
     return 0;
 }
 
-static inline int ff_detect_range16_c(const uint8_t *data, ptrdiff_t stride,
+static inline int ff_detect_range_c(const uint8_t *data, ptrdiff_t stride,
+                                    ptrdiff_t width, ptrdiff_t height,
+                                    int mpeg_min, int mpeg_max)
+{
+    av_assume(mpeg_min >= 0 && mpeg_min <= UINT8_MAX);
+    av_assume(mpeg_max >= 0 && mpeg_max <= UINT8_MAX);
+    return ff_detect_range_impl_c(data, stride, width, height, mpeg_min, mpeg_max);
+}
+
+static inline int ff_detect_range16_impl_c(const uint8_t *data, ptrdiff_t stride,
                                       ptrdiff_t width, ptrdiff_t height,
-                                      int mpeg_min, int mpeg_max)
+                                      uint16_t mpeg_min, uint16_t mpeg_max)
 {
     while (height--) {
         const uint16_t *data16 = (const uint16_t *) data;
+        uint8_t cond = 0;
         for (int x = 0; x < width; x++) {
             const uint16_t val = data16[x];
-            if (val < mpeg_min || val > mpeg_max)
-                return 1;
+            cond |= val < mpeg_min || val > mpeg_max;
         }
+        if (cond)
+            return 1;
         data += stride;
     }
 
     return 0;
 }
 
+static inline int ff_detect_range16_c(const uint8_t *data, ptrdiff_t stride,
+                                      ptrdiff_t width, ptrdiff_t height,
+                                      int mpeg_min, int mpeg_max)
+{
+    av_assume(mpeg_min >= 0 && mpeg_min <= UINT16_MAX);
+    av_assume(mpeg_max >= 0 && mpeg_max <= UINT16_MAX);
+    return ff_detect_range16_impl_c(data, stride, width, height, mpeg_min, mpeg_max);
+}
+
 static inline int
 ff_detect_alpha_full_c(const uint8_t *color, ptrdiff_t color_stride,
                        const uint8_t *alpha, ptrdiff_t alpha_stride,
@@ -84,10 +107,11 @@ ff_detect_alpha_full_c(const uint8_t *color, ptrdiff_t color_stride,
                        int p, int q, int k)
 {
     while (height--) {
-        for (int x = 0; x < width; x++) {
-            if (color[x] > alpha[x])
-                return 1;
-        }
+        uint8_t cond = 0;
+        for (int x = 0; x < width; x++)
+            cond |= color[x] > alpha[x];
+        if (cond)
+            return 1;
         color += color_stride;
         alpha += alpha_stride;
     }
@@ -101,10 +125,11 @@ ff_detect_alpha_limited_c(const uint8_t *color, ptrdiff_t color_stride,
                           int p, int q, int k)
 {
     while (height--) {
-        for (int x = 0; x < width; x++) {
-            if (p * color[x] - k > q * alpha[x])
-                return 1;
-        }
+        uint8_t cond = 0;
+        for (int x = 0; x < width; x++)
+            cond |= p * color[x] - k > q * alpha[x];
+        if (cond)
+            return 1;
         color += color_stride;
         alpha += alpha_stride;
     }
@@ -120,10 +145,11 @@ ff_detect_alpha16_full_c(const uint8_t *color, ptrdiff_t color_stride,
     while (height--) {
         const uint16_t *color16 = (const uint16_t *) color;
         const uint16_t *alpha16 = (const uint16_t *) alpha;
-        for (int x = 0; x < width; x++) {
-            if (color16[x] > alpha16[x])
-                return 1;
-        }
+        uint8_t cond = 0;
+        for (int x = 0; x < width; x++)
+            cond |= color16[x] > alpha16[x];
+        if (cond)
+            return 1;
         color += color_stride;
         alpha += alpha_stride;
     }

-----------------------------------------------------------------------

Summary of changes:
 libavfilter/vf_colordetect.h | 66 ++++++++++++++++++++++++++++++--------------
 1 file changed, 46 insertions(+), 20 deletions(-)


hooks/post-receive
-- 



More information about the ffmpeg-cvslog mailing list