[MPlayer-dev-eng] scaletempo speedup, listen-test-request

Reimar Döffinger Reimar.Doeffinger at stud.uni-karlsruhe.de
Thu Nov 15 22:15:15 CET 2007


Hello,
since my ears aren't the best, could you please test the attached patch
and check whether it audibly decreases quality?
It seems fine to me, and it provides a speedup of more than 40% on x86 and
more than 54% on x86_64 for extreme settings like
-af scaletempo=stride=30:overlap=0.5:search=40
(Please forgive the extra clutter in the patch; I have not yet separated
out the previous patch.)

Greetings,
Reimar Döffinger
-------------- next part --------------
Index: libaf/af_scaletempo.c
===================================================================
--- libaf/af_scaletempo.c	(revision 25054)
+++ libaf/af_scaletempo.c	(working copy)
@@ -60,16 +60,15 @@
   int     samples_standing;
   int     bytes_overlap;
   int     bytes_standing;
-  int8_t* buf_overlap;
-  int8_t* table_blend;
-  void    (*output_overlap)(struct af_scaletempo_s* s, int8_t* out_buf, int bytes_off);
+  void*   buf_overlap;
+  void*   table_blend;
+  void    (*output_overlap)(struct af_scaletempo_s* s, void* out_buf, int bytes_off);
   // best overlap
   int     frames_search;
   int     num_channels;
-  int8_t* buf_pre_corr;
-  int8_t* table_window;
+  void*   buf_pre_corr;
+  void*   table_window;
   int     (*best_overlap_offset)(struct af_scaletempo_s* s);
-  short   shift_corr;
   // command line
   float   scale_nominal;
   float   ms_stride;
@@ -123,9 +122,10 @@
   int best_off = 0;
   int i, off;
 
-  pw  = (float*)s->table_window;
-  po  = (float*)s->buf_overlap + s->num_channels;
-  ppc = (float*)s->buf_pre_corr;
+  pw  = s->table_window;
+  po  = s->buf_overlap;
+  po += s->num_channels;
+  ppc = s->buf_pre_corr;
   for (i=s->num_channels; i<s->samples_overlap; i++) {
     *ppc++ = *pw++ * *po++;
   }
@@ -134,7 +134,7 @@
   for (off=0; off<s->frames_search; off++) {
     float corr = 0;
     float* ps = search_start;
-    ppc = (float*)s->buf_pre_corr;
+    ppc = s->buf_pre_corr;
     for (i=s->num_channels; i<s->samples_overlap; i++) {
       corr += *ppc++ * *ps++;
     }
@@ -150,27 +150,32 @@
 
 static int best_overlap_offset_s16(af_scaletempo_t* s)
 {
-  int32_t *pw, *ppc;
+  uint16_t *pw;
+  int16_t *ppc;
   int16_t *po, *search_start;
   int32_t best_corr = INT_MIN;
   int best_off = 0;
   int i, off;
 
-  pw  = (int32_t*)s->table_window;
-  po  = (int16_t*)s->buf_overlap + s->num_channels;
-  ppc = (int32_t*)s->buf_pre_corr;
+  pw  = s->table_window;
+  po  = s->buf_overlap;
+  po += s->num_channels;
+  ppc = s->buf_pre_corr;
   for (i=s->num_channels; i<s->samples_overlap; i++) {
-    *ppc++ = ( *pw++ * *po++ ) >> 15;
+    *ppc++ = ( *pw++ * *po++ ) >> 16;
   }
 
   search_start = (int16_t*)s->buf_queue + s->num_channels;
   for (off=0; off<s->frames_search; off++) {
-    int32_t corr = 0;
+    int64_t corr = 0;
     int16_t* ps = search_start;
-    ppc = (int32_t*)s->buf_pre_corr;
-    for (i=s->num_channels; i<s->samples_overlap; i++) {
-      corr += ( *ppc++ * *ps++ ) >> s->shift_corr;
-    }
+    ppc = s->buf_pre_corr;
+    ppc += s->samples_overlap - s->num_channels;
+    ps  += s->samples_overlap - s->num_channels;
+    i  = -(s->samples_overlap - s->num_channels);
+    do {
+      corr += ppc[i] * ps[i];
+    } while (++i < 0);
     if (corr > best_corr) {
       best_corr = corr;
       best_off  = off;
@@ -181,24 +186,24 @@
   return best_off * 2 * s->num_channels;
 }
 
-static void output_overlap_float(af_scaletempo_t* s, int8_t* buf_out,
+static void output_overlap_float(af_scaletempo_t* s, void* buf_out,
 				  int bytes_off)
 {
-  float* pout = (float*)buf_out;
-  float* pb   = (float*)s->table_blend;
-  float* po   = (float*)s->buf_overlap;
+  float* pout = buf_out;
+  float* pb   = s->table_blend;
+  float* po   = s->buf_overlap;
   float* pin  = (float*)(s->buf_queue + bytes_off);
   int i;
   for (i=0; i<s->samples_overlap; i++) {
     *pout++ = *po - *pb++ * ( *po - *pin++ ); po++;
   }
 }
-static void output_overlap_s16(af_scaletempo_t* s, int8_t* buf_out,
+static void output_overlap_s16(af_scaletempo_t* s, void* buf_out,
 			       int bytes_off)
 {
-  int16_t* pout = (int16_t*)buf_out;
-  int32_t* pb   = (int32_t*)s->table_blend;
-  int16_t* po   = (int16_t*)s->buf_overlap;
+  int16_t* pout = buf_out;
+  int32_t* pb   = s->table_blend;
+  int16_t* po   = s->buf_overlap;
   int16_t* pin  = (int16_t*)(s->buf_queue + bytes_off);
   int i;
   for (i=0; i<s->samples_overlap; i++) {
@@ -333,7 +338,7 @@
       }
       bzero(s->buf_overlap, s->bytes_overlap);
       if (use_int) {
-        int32_t* pb = (int32_t*)s->table_blend;
+        int32_t* pb = s->table_blend;
         int64_t blend = 0;
         for (i=0; i<frames_overlap; i++) {
           int32_t v = blend / frames_overlap;
@@ -344,7 +349,7 @@
         }
         s->output_overlap = output_overlap_s16;
       } else {
-        float* pb = (float*)s->table_blend;
+        float* pb = s->table_blend;
         for (i=0; i<frames_overlap; i++) {
           float v = i / (float)frames_overlap;
           for (j=0; j<nch; j++) {
@@ -362,21 +367,20 @@
       if (use_int) {
         int64_t t = frames_overlap;
         int32_t n = 8589934588LL / (t * t);  // 4 * (2^31 - 1) / t^2
-        int32_t* pw;
-        s->buf_pre_corr = realloc(s->buf_pre_corr, s->bytes_overlap * 2);
-        s->table_window = realloc(s->table_window, s->bytes_overlap * 2 - nch * bps * 2);
+        uint16_t* pw;
+        s->buf_pre_corr = realloc(s->buf_pre_corr, s->bytes_overlap);
+        s->table_window = realloc(s->table_window, s->bytes_overlap - nch * bps);
         if(!s->buf_pre_corr && !s->table_window) {
           af_msg(AF_MSG_FATAL, "[scaletempo] Out of memory\n");
           return AF_ERROR;
         }
-        pw = (int32_t*)s->table_window;
+        pw = s->table_window;
         for (i=1; i<frames_overlap; i++) {
-          int32_t v = ( i * (t - i) * n ) >> 15;
+          uint16_t v = ( i * (t - i) * n ) >> 15;
           for (j=0; j<nch; j++) {
             *pw++ = v;
           }
         }
-        s->shift_corr = av_log2( 2*(s->samples_overlap - nch) - 1 );
         s->best_overlap_offset = best_overlap_offset_s16;
       } else {
         float* pw;
@@ -386,7 +390,7 @@
           af_msg(AF_MSG_FATAL, "[scaletempo] Out of memory\n");
           return AF_ERROR;
         }
-        pw = (float*)s->table_window;
+        pw = s->table_window;
         for (i=1; i<frames_overlap; i++) {
           float v = i * (frames_overlap - i);
           for (j=0; j<nch; j++) {


More information about the MPlayer-dev-eng mailing list