VampPluginSDK 2.10
FixedTempoEstimator.cpp
/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
    Vamp

    An API for audio analysis and feature extraction plugins.

    Centre for Digital Music, Queen Mary, University of London.
    Copyright 2006-2009 Chris Cannam and QMUL.

    Permission is hereby granted, free of charge, to any person
    obtaining a copy of this software and associated documentation
    files (the "Software"), to deal in the Software without
    restriction, including without limitation the rights to use, copy,
    modify, merge, publish, distribute, sublicense, and/or sell copies
    of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be
    included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
    ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
    CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
    WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

    Except as contained in this notice, the names of the Centre for
    Digital Music; Queen Mary, University of London; and Chris Cannam
    shall not be used in advertising or otherwise to promote the sale,
    use or other dealings in this Software without prior written
    authorization.
*/

#include "FixedTempoEstimator.h"

using std::string;
using std::vector;
using std::cerr;
using std::endl;

using Vamp::RealTime;

#include <cmath>
#include <cstdio>

class FixedTempoEstimator::D
// this class just avoids us having to declare any data members in the header
{
public:
    D(float inputSampleRate);
    ~D();

    size_t getPreferredStepSize() const { return 64; }
    size_t getPreferredBlockSize() const { return 256; }

    ParameterList getParameterDescriptors() const;
    float getParameter(string id) const;
    void setParameter(string id, float value);

    OutputList getOutputDescriptors() const;

    bool initialise(size_t channels, size_t stepSize, size_t blockSize);
    void reset();
    FeatureSet process(const float *const *, RealTime);
    FeatureSet getRemainingFeatures();

private:
    void calculate();
    FeatureSet assembleFeatures();

    float lag2tempo(int);
    int tempo2lag(float);

    float m_inputSampleRate;
    size_t m_stepSize;
    size_t m_blockSize;

    float m_minbpm;
    float m_maxbpm;
    float m_maxdflen;

    float *m_priorMagnitudes; // squared magnitudes of the previous frame, per bin

    size_t m_dfsize;          // maximum length of the detection function
    float *m_df;              // onset detection function
    float *m_r;               // raw autocorrelation of the detection function
    float *m_fr;              // filtered autocorrelation
    float *m_t;               // averaged tempo estimate for each lag
    size_t m_n;               // number of detection-function samples so far

    RealTime m_start;         // timestamp of the first process() call
    RealTime m_lasttime;      // timestamp of the most recent process() call
};

FixedTempoEstimator::D::D(float inputSampleRate) :
    m_inputSampleRate(inputSampleRate),
    m_stepSize(0),
    m_blockSize(0),
    m_minbpm(50),
    m_maxbpm(190),
    m_maxdflen(10),
    m_priorMagnitudes(0),
    m_df(0),
    m_r(0),
    m_fr(0),
    m_t(0),
    m_n(0)
{
}

FixedTempoEstimator::D::~D()
{
    delete[] m_priorMagnitudes;
    delete[] m_df;
    delete[] m_r;
    delete[] m_fr;
    delete[] m_t;
}

FixedTempoEstimator::ParameterList
FixedTempoEstimator::D::getParameterDescriptors() const
{
    ParameterList list;

    ParameterDescriptor d;
    d.identifier = "minbpm";
    d.name = "Minimum estimated tempo";
    d.description = "Minimum beat-per-minute value which the tempo estimator is able to return";
    d.unit = "bpm";
    d.minValue = 10;
    d.maxValue = 360;
    d.defaultValue = 50;
    d.isQuantized = false;
    list.push_back(d);

    d.identifier = "maxbpm";
    d.name = "Maximum estimated tempo";
    d.description = "Maximum beat-per-minute value which the tempo estimator is able to return";
    d.defaultValue = 190;
    list.push_back(d);

    d.identifier = "maxdflen";
    d.name = "Input duration to study";
    d.description = "Length of audio input, in seconds, which should be taken into account when estimating tempo. There is no need to supply the plugin with any further input once this time has elapsed since the start of the audio. The tempo estimator may use only the first part of this, up to eight times the slowest beat duration: increasing this value further than that is unlikely to improve results.";
    d.unit = "s";
    d.minValue = 2;
    d.maxValue = 40;
    d.defaultValue = 10;
    list.push_back(d);

    return list;
}
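
// (Illustrative note, not from the original file: a host would normally set
//  these parameters before initialise(), e.g. setParameter("minbpm", 80) and
//  setParameter("maxbpm", 160), to narrow the lag range that is searched for
//  candidate peaks in assembleFeatures() below.)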

float
FixedTempoEstimator::D::getParameter(string id) const
{
    if (id == "minbpm") {
        return m_minbpm;
    } else if (id == "maxbpm") {
        return m_maxbpm;
    } else if (id == "maxdflen") {
        return m_maxdflen;
    }
    return 0.f;
}

void
FixedTempoEstimator::D::setParameter(string id, float value)
{
    if (id == "minbpm") {
        m_minbpm = value;
    } else if (id == "maxbpm") {
        m_maxbpm = value;
    } else if (id == "maxdflen") {
        m_maxdflen = value;
    }
}

static int TempoOutput = 0;
static int CandidatesOutput = 1;
static int DFOutput = 2;
static int ACFOutput = 3;
static int FilteredACFOutput = 4;

FixedTempoEstimator::OutputList
FixedTempoEstimator::D::getOutputDescriptors() const
{
    OutputList list;

    OutputDescriptor d;
    d.identifier = "tempo";
    d.name = "Tempo";
    d.description = "Estimated tempo";
    d.unit = "bpm";
    d.hasFixedBinCount = true;
    d.binCount = 1;
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::VariableSampleRate;
    d.sampleRate = 0;
    d.hasDuration = true; // our returned tempo spans a certain range
    list.push_back(d);

    d.identifier = "candidates";
    d.name = "Tempo candidates";
    d.description = "Possible tempo estimates, one per bin with the most likely in the first bin";
    d.unit = "bpm";
    d.hasFixedBinCount = false;
    list.push_back(d);

    d.identifier = "detectionfunction";
    d.name = "Detection Function";
    d.description = "Onset detection function";
    d.unit = "";
    d.hasFixedBinCount = true;
    d.binCount = 1;
    d.hasKnownExtents = true;
    d.minValue = 0.0;
    d.maxValue = 1.0;
    d.isQuantized = false;
    d.quantizeStep = 0.0;
    d.sampleType = OutputDescriptor::FixedSampleRate;
    if (m_stepSize) {
        d.sampleRate = m_inputSampleRate / m_stepSize;
    } else {
        d.sampleRate = m_inputSampleRate / getPreferredStepSize(); // assumed fallback before initialise()
    }
    d.hasDuration = false;
    list.push_back(d);

    d.identifier = "acf";
    d.name = "Autocorrelation Function";
    d.description = "Autocorrelation of onset detection function";
    d.hasKnownExtents = false;
    d.unit = "r";
    list.push_back(d);

    d.identifier = "filtered_acf";
    d.name = "Filtered Autocorrelation";
    d.description = "Filtered autocorrelation of onset detection function";
    d.unit = "r";
    list.push_back(d);

    return list;
}

bool
FixedTempoEstimator::D::initialise(size_t, size_t stepSize, size_t blockSize)
{
    m_stepSize = stepSize;
    m_blockSize = blockSize;

    float dfLengthSecs = m_maxdflen;
    m_dfsize = (dfLengthSecs * m_inputSampleRate) / m_stepSize;

    m_priorMagnitudes = new float[m_blockSize/2];
    m_df = new float[m_dfsize];

    for (size_t i = 0; i < m_blockSize/2; ++i) {
        m_priorMagnitudes[i] = 0.f;
    }
    for (size_t i = 0; i < m_dfsize; ++i) {
        m_df[i] = 0.f;
    }

    m_n = 0;

    return true;
}
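
// Example of the sizing above (illustrative figures, not from the file): with
// inputSampleRate = 44100, stepSize = 64 and the default maxdflen of 10 s,
// m_dfsize = 10 * 44100 / 64, i.e. about 6890 detection-function samples.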

void
FixedTempoEstimator::D::reset()
{
    if (!m_priorMagnitudes) return;

    for (size_t i = 0; i < m_blockSize/2; ++i) {
        m_priorMagnitudes[i] = 0.f;
    }
    for (size_t i = 0; i < m_dfsize; ++i) {
        m_df[i] = 0.f;
    }

    delete[] m_r;
    m_r = 0;

    delete[] m_fr;
    m_fr = 0;

    delete[] m_t;
    m_t = 0;

    m_n = 0;

    m_start = RealTime::zeroTime;
    m_lasttime = RealTime::zeroTime;
}

FixedTempoEstimator::FeatureSet
FixedTempoEstimator::D::process(const float *const *inputBuffers, RealTime ts)
{
    FeatureSet fs;

    if (m_stepSize == 0) {
        cerr << "ERROR: FixedTempoEstimator::process: "
             << "FixedTempoEstimator has not been initialised"
             << endl;
        return fs;
    }

    if (m_n == 0) m_start = ts;
    m_lasttime = ts;

    if (m_n == m_dfsize) {
        // If we have seen enough input, do the estimation and return
        calculate();
        fs = assembleFeatures();
        ++m_n;
        return fs;
    }

    // If we have seen more than enough, just discard and return!
    if (m_n > m_dfsize) return FeatureSet();

    float value = 0.f;

    // m_df will contain an onset detection function based on the rise
    // in overall power from one spectral frame to the next --
    // simplistic but reasonably effective for our purposes.

    for (size_t i = 1; i < m_blockSize/2; ++i) {

        float real = inputBuffers[0][i*2];
        float imag = inputBuffers[0][i*2 + 1];

        float sqrmag = real * real + imag * imag;
        value += fabsf(sqrmag - m_priorMagnitudes[i]);

        m_priorMagnitudes[i] = sqrmag;
    }

    m_df[m_n] = value;

    ++m_n;
    return fs;
}
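
// Illustration of the detection function (hypothetical numbers, not from the
// file): if a bin's squared magnitude rises from 0.2 in the previous frame to
// 0.5 in this one, it adds |0.5 - 0.2| = 0.3 to this frame's df value, so
// frames where many bins gain energy at once (typical of onsets) stand out.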

FixedTempoEstimator::FeatureSet
FixedTempoEstimator::D::getRemainingFeatures()
{
    FeatureSet fs;
    if (m_n > m_dfsize) return fs;
    calculate();
    fs = assembleFeatures();
    ++m_n;
    return fs;
}

float
FixedTempoEstimator::D::lag2tempo(int lag)
{
    return 60.f / ((lag * m_stepSize) / m_inputSampleRate);
}

int
FixedTempoEstimator::D::tempo2lag(float tempo)
{
    return ((60.f / tempo) * m_inputSampleRate) / m_stepSize;
}
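
// Worked example of the conversions above (illustrative figures, assuming a
// 44100 Hz input and the preferred step size of 64): one df step spans
// 64 / 44100, about 1.45 ms, so lag2tempo(325) = 60 / (325 * 64 / 44100),
// roughly 127 bpm, and tempo2lag(127) = (60 / 127) * 44100 / 64, about 325.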

void
FixedTempoEstimator::D::calculate()
{
    if (m_r) {
        cerr << "FixedTempoEstimator::calculate: calculation already happened?" << endl;
        return;
    }

    if (m_n < m_dfsize / 9 &&
        m_n < (1.0 * m_inputSampleRate) / m_stepSize) { // 1 second
        cerr << "FixedTempoEstimator::calculate: Input is too short" << endl;
        return;
    }

    // This function takes m_df (the detection function array filled
    // out in process()) and calculates m_r (the raw autocorrelation)
    // and m_fr (the filtered autocorrelation from whose peaks tempo
    // estimates will be taken).

    int n = m_n; // length of actual df array (m_dfsize is the theoretical max)

    m_r = new float[n/2];  // raw autocorrelation
    m_fr = new float[n/2]; // filtered autocorrelation
    m_t = new float[n/2];  // averaged tempo estimate for each lag value

    for (int i = 0; i < n/2; ++i) {
        m_r[i] = 0.f;
        m_fr[i] = 0.f;
        m_t[i] = lag2tempo(i);
    }

    // Calculate the raw autocorrelation of the detection function

    for (int i = 0; i < n/2; ++i) {

        for (int j = i; j < n; ++j) {
            m_r[i] += m_df[j] * m_df[j - i];
        }

        m_r[i] /= n - i - 1;
    }

    // Filter the autocorrelation and average out the tempo estimates

    float related[] = { 0.5, 2, 4, 8 };

    for (int i = 1; i < n/2-1; ++i) {

        m_fr[i] = m_r[i];

        int div = 1;

        for (int j = 0; j < int(sizeof(related)/sizeof(related[0])); ++j) {

            // Check for an obvious peak at each metrically related lag

            int k0 = int(i * related[j] + 0.5);

            if (k0 >= 0 && k0 < int(n/2)) {

                int kmax = 0;
                float kvmax = 0, kvmin = 0;
                bool have = false;

                for (int k = k0 - 1; k <= k0 + 1; ++k) {

                    if (k < 0 || k >= n/2) continue;

                    if (!have || (m_r[k] > kvmax)) { kvmax = m_r[k]; kmax = k; }
                    if (!have || (m_r[k] < kvmin)) { kvmin = m_r[k]; }

                    have = true;
                }

                // Boost the original lag according to the strongest
                // value found close to this related lag

                m_fr[i] += m_r[kmax] / 5;

                if ((kmax == 0 || m_r[kmax] > m_r[kmax-1]) &&
                    (kmax == n/2-1 || m_r[kmax] > m_r[kmax+1]) &&
                    kvmax > kvmin * 1.05) {

                    // The strongest value close to the related lag is
                    // also a pretty good looking peak, so use it to
                    // improve our tempo estimate for the original lag

                    m_t[i] = m_t[i] + lag2tempo(kmax) * related[j];
                    ++div;
                }
            }
        }

        m_t[i] /= div;

        // Finally apply a primitive perceptual weighting (to prefer
        // tempi of around 120-130)

        float weight = 1.f - fabsf(128.f - lag2tempo(i)) * 0.005;
        if (weight < 0.f) weight = 0.f;
        weight = weight * weight * weight;

        m_fr[i] += m_fr[i] * (weight / 3);
    }
}
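
// Worked example of the weighting above (illustrative figures): for a lag
// whose tempo is 100 bpm, weight = 1 - |128 - 100| * 0.005 = 0.86, which
// cubes to about 0.64, so m_fr[i] is boosted by roughly 21%; a lag near
// 128 bpm gets close to the full one-third boost, while lags whose tempo is
// more than 200 bpm away from 128 get no boost at all.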

FixedTempoEstimator::FeatureSet
FixedTempoEstimator::D::assembleFeatures()
{
    FeatureSet fs;
    if (!m_r) return fs; // No autocorrelation: no results

    Feature feature;
    feature.hasTimestamp = true;
    feature.hasDuration = false;
    feature.label = "";
    feature.values.clear();
    feature.values.push_back(0.f);

    char buffer[40];

    int n = m_n;

    for (int i = 0; i < n; ++i) {

        // Return the detection function in the DF output

        feature.timestamp = m_start +
            RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
        feature.values[0] = m_df[i];
        feature.label = "";
        fs[DFOutput].push_back(feature);
    }

    for (int i = 1; i < n/2; ++i) {

        // Return the raw autocorrelation in the ACF output, each
        // value labelled according to its corresponding tempo

        feature.timestamp = m_start +
            RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
        feature.values[0] = m_r[i];
        sprintf(buffer, "%.1f bpm", lag2tempo(i));
        if (i == n/2-1) feature.label = "";
        else feature.label = buffer;
        fs[ACFOutput].push_back(feature);
    }

    float t0 = m_minbpm; // our minimum detected tempo
    float t1 = m_maxbpm; // our maximum detected tempo

    int p0 = tempo2lag(t1);
    int p1 = tempo2lag(t0);

    std::map<float, int> candidates;

    for (int i = p0; i <= p1 && i+1 < n/2; ++i) {

        if (i < 1) continue;

        if (m_fr[i] > m_fr[i-1] &&
            m_fr[i] > m_fr[i+1]) {

            // This is a peak in the filtered autocorrelation: stick
            // it into the map from filtered autocorrelation to lag
            // index -- this sorts our peaks by filtered acf value

            candidates[m_fr[i]] = i;
        }

        // Also return the filtered autocorrelation in its own output

        feature.timestamp = m_start +
            RealTime::frame2RealTime(i * m_stepSize, m_inputSampleRate);
        feature.values[0] = m_fr[i];
        sprintf(buffer, "%.1f bpm", lag2tempo(i));
        if (i == p1 || i == n/2-2) feature.label = "";
        else feature.label = buffer;
        fs[FilteredACFOutput].push_back(feature);
    }

    if (candidates.empty()) {
        cerr << "No tempo candidates!" << endl;
        return fs;
    }

    feature.hasTimestamp = true;
    feature.timestamp = m_start;

    feature.hasDuration = true;
    feature.duration = m_lasttime - m_start;

    // The map contains only peaks and is sorted by filtered acf
    // value, so the final element in it is our "best" tempo guess

    std::map<float, int>::const_iterator ci = candidates.end();
    --ci;
    int maxpi = ci->second;

    if (m_t[maxpi] > 0) {

        // This lag has an adjusted tempo from the averaging process:
        // use it

        feature.values[0] = m_t[maxpi];

    } else {

        // shouldn't happen -- it would imply that this high value was
        // not a peak!

        feature.values[0] = lag2tempo(maxpi);
        cerr << "WARNING: No stored tempo for index " << maxpi << endl;
    }

    sprintf(buffer, "%.1f bpm", feature.values[0]);
    feature.label = buffer;

    // Return the best tempo in the main output

    fs[TempoOutput].push_back(feature);

    // And return the other estimates (up to the arbitrarily chosen
    // number of 10 of them) in the candidates output

    feature.values.clear();
    feature.label = "";

    while (feature.values.size() < 10) {
        if (m_t[ci->second] > 0) {
            feature.values.push_back(m_t[ci->second]);
        } else {
            feature.values.push_back(lag2tempo(ci->second));
        }
        if (ci == candidates.begin()) break;
        --ci;
    }

    fs[CandidatesOutput].push_back(feature);

    return fs;
}


FixedTempoEstimator::FixedTempoEstimator(float inputSampleRate) :
    Plugin(inputSampleRate),
    m_d(new D(inputSampleRate))
{
}

FixedTempoEstimator::~FixedTempoEstimator()
{
    delete m_d;
}

string
FixedTempoEstimator::getIdentifier() const
{
    return "fixedtempo";
}

string
FixedTempoEstimator::getName() const
{
    return "Simple Fixed Tempo Estimator";
}

string
FixedTempoEstimator::getDescription() const
{
    return "Study a short section of audio and estimate its tempo, assuming the tempo is constant";
}

string
FixedTempoEstimator::getMaker() const
{
    return "Vamp SDK Example Plugins";
}

int
FixedTempoEstimator::getPluginVersion() const
{
    return 1;
}

string
FixedTempoEstimator::getCopyright() const
{
    return "Code copyright 2008 Queen Mary, University of London. Freely redistributable (BSD license)";
}

size_t
FixedTempoEstimator::getPreferredStepSize() const
{
    return m_d->getPreferredStepSize();
}

size_t
FixedTempoEstimator::getPreferredBlockSize() const
{
    return m_d->getPreferredBlockSize();
}

bool
FixedTempoEstimator::initialise(size_t channels, size_t stepSize, size_t blockSize)
{
    if (channels < getMinChannelCount() ||
        channels > getMaxChannelCount()) return false;

    return m_d->initialise(channels, stepSize, blockSize);
}

void
FixedTempoEstimator::reset()
{
    return m_d->reset();
}

FixedTempoEstimator::ParameterList
FixedTempoEstimator::getParameterDescriptors() const
{
    return m_d->getParameterDescriptors();
}

float
FixedTempoEstimator::getParameter(std::string id) const
{
    return m_d->getParameter(id);
}

void
FixedTempoEstimator::setParameter(std::string id, float value)
{
    m_d->setParameter(id, value);
}

FixedTempoEstimator::OutputList
FixedTempoEstimator::getOutputDescriptors() const
{
    return m_d->getOutputDescriptors();
}

FixedTempoEstimator::FeatureSet
FixedTempoEstimator::process(const float *const *inputBuffers, RealTime ts)
{
    return m_d->process(inputBuffers, ts);
}

FixedTempoEstimator::FeatureSet
FixedTempoEstimator::getRemainingFeatures()
{
    return m_d->getRemainingFeatures();
}
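
The listing above is the complete plugin side. As a rough orientation only, the following host-side sketch (not part of the SDK file; the zero-filled spectra are placeholders) shows the call sequence a host uses to drive the plugin: construct it, initialise with the preferred step and block sizes, feed one frequency-domain block per step with a timestamp, then collect the tempo from getRemainingFeatures(). A real host would supply genuine FFT frames, for example via Vamp::HostExt::PluginInputDomainAdapter, rather than the zeros used here.

#include "FixedTempoEstimator.h"

#include <vector>
#include <iostream>

int main()
{
    const float rate = 44100.f;
    FixedTempoEstimator plugin(rate);

    size_t step  = plugin.getPreferredStepSize();   // 64
    size_t block = plugin.getPreferredBlockSize();  // 256

    if (!plugin.initialise(1, step, block)) return 1;

    // The plugin expects frequency-domain input: interleaved real/imaginary
    // pairs for bins 0..blockSize/2. Zeros stand in for real FFT output here.
    std::vector<float> freqData(block + 2, 0.f);
    const float *channels[] = { &freqData[0] };

    size_t frames = size_t(10 * rate); // feed roughly 10 seconds of "audio"
    for (size_t i = 0; i + block <= frames; i += step) {
        Vamp::RealTime ts = Vamp::RealTime::frame2RealTime(i, size_t(rate));
        plugin.process(channels, ts);
    }

    FixedTempoEstimator::FeatureSet fs = plugin.getRemainingFeatures();
    if (!fs[0].empty()) { // output index 0 is the "tempo" output
        std::cout << "Estimated tempo: " << fs[0][0].values[0] << " bpm ("
                  << fs[0][0].label << ")" << std::endl;
    } else {
        std::cout << "No tempo estimate (the placeholder input is silent)" << std::endl;
    }
    return 0;
}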