[MPlayer-dev-eng] scaletempo speedup, listen-test-request
Reimar Döffinger
Reimar.Doeffinger at stud.uni-karlsruhe.de
Thu Nov 15 22:15:15 CET 2007
Hello,
since my ears aren't the best, could you please test whether the attached
patch audibly decreases quality?
It seems fine to me, and it provides a speedup of more than 40% on x86 and
more than 54% on x86_64 for extreme settings like
-af scaletempo=stride=30:overlap=0.5:search=40
(Please forgive the extra clutter in the patch; I have not yet separated out
the previous patch.)
Greetings,
Reimar Döffinger
-------------- next part --------------
Index: libaf/af_scaletempo.c
===================================================================
--- libaf/af_scaletempo.c (revision 25054)
+++ libaf/af_scaletempo.c (working copy)
@@ -60,16 +60,15 @@
int samples_standing;
int bytes_overlap;
int bytes_standing;
- int8_t* buf_overlap;
- int8_t* table_blend;
- void (*output_overlap)(struct af_scaletempo_s* s, int8_t* out_buf, int bytes_off);
+ void* buf_overlap;
+ void* table_blend;
+ void (*output_overlap)(struct af_scaletempo_s* s, void* out_buf, int bytes_off);
// best overlap
int frames_search;
int num_channels;
- int8_t* buf_pre_corr;
- int8_t* table_window;
+ void* buf_pre_corr;
+ void* table_window;
int (*best_overlap_offset)(struct af_scaletempo_s* s);
- short shift_corr;
// command line
float scale_nominal;
float ms_stride;
@@ -123,9 +122,10 @@
int best_off = 0;
int i, off;
- pw = (float*)s->table_window;
- po = (float*)s->buf_overlap + s->num_channels;
- ppc = (float*)s->buf_pre_corr;
+ pw = s->table_window;
+ po = s->buf_overlap;
+ po += s->num_channels;
+ ppc = s->buf_pre_corr;
for (i=s->num_channels; i<s->samples_overlap; i++) {
*ppc++ = *pw++ * *po++;
}
@@ -134,7 +134,7 @@
for (off=0; off<s->frames_search; off++) {
float corr = 0;
float* ps = search_start;
- ppc = (float*)s->buf_pre_corr;
+ ppc = s->buf_pre_corr;
for (i=s->num_channels; i<s->samples_overlap; i++) {
corr += *ppc++ * *ps++;
}
@@ -150,27 +150,32 @@
static int best_overlap_offset_s16(af_scaletempo_t* s)
{
- int32_t *pw, *ppc;
+ uint16_t *pw;
+ int16_t *ppc;
int16_t *po, *search_start;
int32_t best_corr = INT_MIN;
int best_off = 0;
int i, off;
- pw = (int32_t*)s->table_window;
- po = (int16_t*)s->buf_overlap + s->num_channels;
- ppc = (int32_t*)s->buf_pre_corr;
+ pw = s->table_window;
+ po = s->buf_overlap;
+ po += s->num_channels;
+ ppc = s->buf_pre_corr;
for (i=s->num_channels; i<s->samples_overlap; i++) {
- *ppc++ = ( *pw++ * *po++ ) >> 15;
+ *ppc++ = ( *pw++ * *po++ ) >> 16;
}
search_start = (int16_t*)s->buf_queue + s->num_channels;
for (off=0; off<s->frames_search; off++) {
- int32_t corr = 0;
+ int64_t corr = 0;
int16_t* ps = search_start;
- ppc = (int32_t*)s->buf_pre_corr;
- for (i=s->num_channels; i<s->samples_overlap; i++) {
- corr += ( *ppc++ * *ps++ ) >> s->shift_corr;
- }
+ ppc = s->buf_pre_corr;
+ ppc += s->samples_overlap - s->num_channels;
+ ps += s->samples_overlap - s->num_channels;
+ i = -(s->samples_overlap - s->num_channels);
+ do {
+ corr += ppc[i] * ps[i];
+ } while (++i < 0);
if (corr > best_corr) {
best_corr = corr;
best_off = off;
@@ -181,24 +186,24 @@
return best_off * 2 * s->num_channels;
}
-static void output_overlap_float(af_scaletempo_t* s, int8_t* buf_out,
+static void output_overlap_float(af_scaletempo_t* s, void* buf_out,
int bytes_off)
{
- float* pout = (float*)buf_out;
- float* pb = (float*)s->table_blend;
- float* po = (float*)s->buf_overlap;
+ float* pout = buf_out;
+ float* pb = s->table_blend;
+ float* po = s->buf_overlap;
float* pin = (float*)(s->buf_queue + bytes_off);
int i;
for (i=0; i<s->samples_overlap; i++) {
*pout++ = *po - *pb++ * ( *po - *pin++ ); po++;
}
}
-static void output_overlap_s16(af_scaletempo_t* s, int8_t* buf_out,
+static void output_overlap_s16(af_scaletempo_t* s, void* buf_out,
int bytes_off)
{
- int16_t* pout = (int16_t*)buf_out;
- int32_t* pb = (int32_t*)s->table_blend;
- int16_t* po = (int16_t*)s->buf_overlap;
+ int16_t* pout = buf_out;
+ int32_t* pb = s->table_blend;
+ int16_t* po = s->buf_overlap;
int16_t* pin = (int16_t*)(s->buf_queue + bytes_off);
int i;
for (i=0; i<s->samples_overlap; i++) {
@@ -333,7 +338,7 @@
}
bzero(s->buf_overlap, s->bytes_overlap);
if (use_int) {
- int32_t* pb = (int32_t*)s->table_blend;
+ int32_t* pb = s->table_blend;
int64_t blend = 0;
for (i=0; i<frames_overlap; i++) {
int32_t v = blend / frames_overlap;
@@ -344,7 +349,7 @@
}
s->output_overlap = output_overlap_s16;
} else {
- float* pb = (float*)s->table_blend;
+ float* pb = s->table_blend;
for (i=0; i<frames_overlap; i++) {
float v = i / (float)frames_overlap;
for (j=0; j<nch; j++) {
@@ -362,21 +367,20 @@
if (use_int) {
int64_t t = frames_overlap;
int32_t n = 8589934588LL / (t * t); // 4 * (2^31 - 1) / t^2
- int32_t* pw;
- s->buf_pre_corr = realloc(s->buf_pre_corr, s->bytes_overlap * 2);
- s->table_window = realloc(s->table_window, s->bytes_overlap * 2 - nch * bps * 2);
+ uint16_t* pw;
+ s->buf_pre_corr = realloc(s->buf_pre_corr, s->bytes_overlap);
+ s->table_window = realloc(s->table_window, s->bytes_overlap - nch * bps);
if(!s->buf_pre_corr && !s->table_window) {
af_msg(AF_MSG_FATAL, "[scaletempo] Out of memory\n");
return AF_ERROR;
}
- pw = (int32_t*)s->table_window;
+ pw = s->table_window;
for (i=1; i<frames_overlap; i++) {
- int32_t v = ( i * (t - i) * n ) >> 15;
+ uint16_t v = ( i * (t - i) * n ) >> 15;
for (j=0; j<nch; j++) {
*pw++ = v;
}
}
- s->shift_corr = av_log2( 2*(s->samples_overlap - nch) - 1 );
s->best_overlap_offset = best_overlap_offset_s16;
} else {
float* pw;
@@ -386,7 +390,7 @@
af_msg(AF_MSG_FATAL, "[scaletempo] Out of memory\n");
return AF_ERROR;
}
- pw = (float*)s->table_window;
+ pw = s->table_window;
for (i=1; i<frames_overlap; i++) {
float v = i * (frames_overlap - i);
for (j=0; j<nch; j++) {
More information about the MPlayer-dev-eng
mailing list