/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */

/*
    pYIN - A fundamental frequency estimator for monophonic audio
    Centre for Digital Music, Queen Mary, University of London.

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version. See the file
    COPYING included with this distribution for more information.
*/

#include "PYinVamp.h"
#include "MonoNote.h"
#include "MonoPitch.h"

#include "vamp-sdk/FFT.h"

#include <vector>
#include <algorithm>

#include <cstdio>
#include <cmath>
#include <complex>

using std::string;
using std::vector;
using std::pair;
using Vamp::RealTime;

PYinVamp::PYinVamp(float inputSampleRate) :
    Plugin(inputSampleRate),
    m_channels(0),
    m_stepSize(256),
    m_blockSize(2048),
    m_fmin(40),
    m_fmax(1600),
    m_yin(2048, inputSampleRate, 0.0),
    m_oF0Candidates(0),
    m_oF0Probs(0),
    m_oVoicedProb(0),
    m_oCandidateSalience(0),
    m_oSmoothedPitchTrack(0),
    m_oNotes(0),
    m_threshDistr(2.0f),
    m_outputUnvoiced(0.0f),
    m_preciseTime(0.0f),
    m_lowAmp(0.1f),
    m_onsetSensitivity(0.7f),
    m_pruneThresh(0.1f),
    m_pitchProb(0),
    m_timestamp(0),
    m_level(0)
{
}

PYinVamp::~PYinVamp()
{
}

string
PYinVamp::getIdentifier() const
{
    return "pyin";
}

string
PYinVamp::getName() const
{
    return "pYin";
}

string
PYinVamp::getDescription() const
{
    return "Monophonic pitch and note tracking based on a probabilistic Yin extension.";
}

string
PYinVamp::getMaker() const
{
    return "Matthias Mauch";
}

int
PYinVamp::getPluginVersion() const
{
    // Increment this each time you release a version that behaves
    // differently from the previous one
    return 2;
}

string
PYinVamp::getCopyright() const
{
    return "GPL";
}

PYinVamp::InputDomain
PYinVamp::getInputDomain() const
{
    return TimeDomain;
}

size_t
PYinVamp::getPreferredBlockSize() const
{
    return 2048;
}

size_t
PYinVamp::getPreferredStepSize() const
{
    return 256;
}

size_t
PYinVamp::getMinChannelCount() const
{
    return 1;
}

size_t
PYinVamp::getMaxChannelCount() const
{
    return 1;
}

PYinVamp::ParameterList
PYinVamp::getParameterDescriptors() const
{
    ParameterList list;

    ParameterDescriptor d;

    d.identifier = "threshdistr";
    d.name = "Yin threshold distribution";
    d.description = "Prior distribution of Yin thresholds used by the probabilistic Yin stage.";
    d.unit = "";
    d.minValue = 0.0f;
    d.maxValue = 7.0f;
    d.defaultValue = 2.0f;
    d.isQuantized = true;
    d.quantizeStep = 1.0f;
    d.valueNames.push_back("Uniform");
    d.valueNames.push_back("Beta (mean 0.10)");
    d.valueNames.push_back("Beta (mean 0.15)");
    d.valueNames.push_back("Beta (mean 0.20)");
    d.valueNames.push_back("Beta (mean 0.30)");
    d.valueNames.push_back("Single Value 0.10");
    d.valueNames.push_back("Single Value 0.15");
    d.valueNames.push_back("Single Value 0.20");
    list.push_back(d);

    d.identifier = "outputunvoiced";
    d.valueNames.clear();
    d.name = "Output estimates classified as unvoiced?";
    d.description = "Whether to output pitch estimates for frames classified as unvoiced, and if so, whether to mark them as negative frequencies.";
    d.unit = "";
    d.minValue = 0.0f;
    d.maxValue = 2.0f;
    d.defaultValue = 0.0f;
    d.isQuantized = true;
    d.quantizeStep = 1.0f;
    d.valueNames.push_back("No");
    d.valueNames.push_back("Yes");
    d.valueNames.push_back("Yes, as negative frequencies");
    list.push_back(d);

    d.identifier = "precisetime";
    d.valueNames.clear();
    d.name = "Use non-standard precise YIN timing (slow).";
    d.description = "Use the slower, more precise YIN timing calculation instead of the fast default.";
    d.unit = "";
    d.minValue = 0.0f;
    d.maxValue = 1.0f;
    d.defaultValue = 0.0f;
    d.isQuantized = true;
    d.quantizeStep = 1.0f;
    list.push_back(d);

    d.identifier = "lowampsuppression";
    d.valueNames.clear();
    d.name = "Suppress low amplitude pitch estimates.";
    d.description = "RMS level below which pitch candidate probabilities are attenuated.";
    d.unit = "";
    d.minValue = 0.0f;
    d.maxValue = 1.0f;
    d.defaultValue = 0.1f;
    d.isQuantized = false;
    list.push_back(d);

    d.identifier = "onsetsensitivity";
    d.valueNames.clear();
    d.name = "Onset sensitivity";
    d.description = "Adds additional note onsets when RMS increases.";
    d.unit = "";
    d.minValue = 0.0f;
    d.maxValue = 1.0f;
    d.defaultValue = 0.7f;
    d.isQuantized = false;
    list.push_back(d);

    d.identifier = "prunethresh";
    d.valueNames.clear();
    d.name = "Duration pruning threshold.";
    d.description = "Prune notes that are shorter than this value.";
    d.unit = "";
    d.minValue = 0.0f;
    d.maxValue = 0.2f;
    d.defaultValue = 0.1f;
    d.isQuantized = false;
    list.push_back(d);

    return list;
}

float
PYinVamp::getParameter(string identifier) const
{
    if (identifier == "threshdistr") {
        return m_threshDistr;
    }
    if (identifier == "outputunvoiced") {
        return m_outputUnvoiced;
    }
    if (identifier == "precisetime") {
        return m_preciseTime;
    }
    if (identifier == "lowampsuppression") {
        return m_lowAmp;
    }
    if (identifier == "onsetsensitivity") {
        return m_onsetSensitivity;
    }
    if (identifier == "prunethresh") {
        return m_pruneThresh;
    }
    return 0.f;
}

void
PYinVamp::setParameter(string identifier, float value)
{
    if (identifier == "threshdistr")
    {
        m_threshDistr = value;
    }
    if (identifier == "outputunvoiced")
    {
        m_outputUnvoiced = value;
    }
    if (identifier == "precisetime")
    {
        m_preciseTime = value;
    }
    if (identifier == "lowampsuppression")
    {
        m_lowAmp = value;
    }
    if (identifier == "onsetsensitivity")
    {
        m_onsetSensitivity = value;
    }
    if (identifier == "prunethresh")
    {
        m_pruneThresh = value;
    }
}

PYinVamp::ProgramList
PYinVamp::getPrograms() const
{
    ProgramList list;
    return list;
}

string
PYinVamp::getCurrentProgram() const
{
    return ""; // no programs
}

void
PYinVamp::selectProgram(string name)
{
}

PYinVamp::OutputList
PYinVamp::getOutputDescriptors() const
{
    OutputList outputs;

    OutputDescriptor d;

    int outputNumber = 0;

    d.identifier = "f0candidates";
    d.name = "F0 Candidates";
    d.description = "Estimated fundamental frequency candidates.";
    d.unit = "Hz";
    d.hasFixedBinCount = false;
    // d.binCount = 1;
    d.hasKnownExtents = true;
    d.minValue = m_fmin;
    d.maxValue = 500;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::FixedSampleRate;
    d.sampleRate = (m_inputSampleRate / m_stepSize);
    d.hasDuration = false;
    outputs.push_back(d);
    m_oF0Candidates = outputNumber++;

    d.identifier = "f0probs";
    d.name = "Candidate Probabilities";
    d.description = "Probabilities of estimated fundamental frequency candidates.";
    d.unit = "";
    d.hasFixedBinCount = false;
    // d.binCount = 1;
    d.hasKnownExtents = true;
    d.minValue = 0;
    d.maxValue = 1;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::FixedSampleRate;
    d.sampleRate = (m_inputSampleRate / m_stepSize);
    d.hasDuration = false;
    outputs.push_back(d);
    m_oF0Probs = outputNumber++;

    d.identifier = "voicedprob";
    d.name = "Voiced Probability";
    d.description = "Probability that the signal is voiced according to Probabilistic Yin.";
    d.unit = "";
    d.hasFixedBinCount = true;
    d.binCount = 1;
    d.hasKnownExtents = true;
    d.minValue = 0;
    d.maxValue = 1;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::FixedSampleRate;
    d.sampleRate = (m_inputSampleRate / m_stepSize);
    d.hasDuration = false;
    outputs.push_back(d);
    m_oVoicedProb = outputNumber++;

    d.identifier = "candidatesalience";
    d.name = "Candidate Salience";
    d.description = "Candidate Salience";
    d.hasFixedBinCount = true;
    d.binCount = m_blockSize / 2;
    d.hasKnownExtents = true;
    d.minValue = 0;
    d.maxValue = 1;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::FixedSampleRate;
    d.sampleRate = (m_inputSampleRate / m_stepSize);
    d.hasDuration = false;
    outputs.push_back(d);
    m_oCandidateSalience = outputNumber++;

    d.identifier = "smoothedpitchtrack";
    d.name = "Smoothed Pitch Track";
    d.description = "Pitch track smoothed from the f0 candidates and their probabilities.";
    d.unit = "Hz";
    d.hasFixedBinCount = true;
    d.binCount = 1;
    d.hasKnownExtents = false;
    // d.minValue = 0;
    // d.maxValue = 1;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::FixedSampleRate;
    d.sampleRate = (m_inputSampleRate / m_stepSize);
    d.hasDuration = false;
    outputs.push_back(d);
    m_oSmoothedPitchTrack = outputNumber++;

    d.identifier = "notes";
    d.name = "Notes";
    d.description = "Derived fixed-pitch note frequencies";
    // d.unit = "MIDI unit";
    d.unit = "Hz";
    d.hasFixedBinCount = true;
    d.binCount = 1;
    d.hasKnownExtents = false;
    d.isQuantized = false;
    d.sampleType = OutputDescriptor::VariableSampleRate;
    d.sampleRate = (m_inputSampleRate / m_stepSize);
    d.hasDuration = true;
    outputs.push_back(d);
    m_oNotes = outputNumber++;

    return outputs;
}

bool
PYinVamp::initialise(size_t channels, size_t stepSize, size_t blockSize)
{
    if (channels < getMinChannelCount() ||
        channels > getMaxChannelCount()) return false;

    /*
    std::cerr << "PYinVamp::initialise: channels = " << channels
              << ", stepSize = " << stepSize << ", blockSize = " << blockSize
              << std::endl;
    */
    m_channels = channels;
    m_stepSize = stepSize;
    m_blockSize = blockSize;

    reset();

    return true;
}

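// Push the current parameter values into the YIN implementation and
// clear any state accumulated across process() calls.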
void
PYinVamp::reset()
{
    m_yin.setThresholdDistr(m_threshDistr);
    m_yin.setFrameSize(m_blockSize);
    m_yin.setFast(!m_preciseTime);

    m_pitchProb.clear();
    m_timestamp.clear();
    m_level.clear();
    /*
    std::cerr << "PYinVamp::reset"
              << ", blockSize = " << m_blockSize
              << std::endl;
    */
}

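// Process one block of time-domain audio: compute the block's RMS
// level, run probabilistic YIN on it, store the pitch candidates and
// their probabilities for the smoothing pass in getRemainingFeatures(),
// and return the per-frame outputs (f0 candidates, candidate
// probabilities, voiced probability, candidate salience).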
PYinVamp::FeatureSet
PYinVamp::process(const float *const *inputBuffers, RealTime timestamp)
{
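    // Report features part-way into the analysis window rather than at
    // its start: half a block with precise timing, a quarter block with
    // the fast default.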
    int offset = m_preciseTime == 1.0 ? m_blockSize/2 : m_blockSize/4;
    timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset, lrintf(m_inputSampleRate));

    FeatureSet fs;

    float rms = 0;

    double *dInputBuffers = new double[m_blockSize];
    for (size_t i = 0; i < m_blockSize; ++i) {
        dInputBuffers[i] = inputBuffers[0][i];
        rms += inputBuffers[0][i] * inputBuffers[0][i];
    }
    rms /= m_blockSize;
    rms = sqrt(rms);

    bool isLowAmplitude = (rms < m_lowAmp);

    Yin::YinOutput yo = m_yin.processProbabilisticYin(dInputBuffers);
    delete [] dInputBuffers;

    m_level.push_back(yo.rms);

    // First, get the things out of the way that we don't want to output
    // immediately, but instead save for later.
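    // Convert each f0 candidate from Hz to a fractional MIDI note
    // number, midi = 69 + 12 * log2(f / 440). For low-amplitude frames
    // the candidate probabilities are scaled down so that quiet
    // passages are less likely to be tracked as voiced.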
    vector<pair<double, double> > tempPitchProb;
    for (size_t iCandidate = 0; iCandidate < yo.freqProb.size(); ++iCandidate)
    {
        double tempPitch = 12 * std::log(yo.freqProb[iCandidate].first/440)/std::log(2.) + 69;
        if (!isLowAmplitude)
        {
            tempPitchProb.push_back(pair<double, double>
                (tempPitch, yo.freqProb[iCandidate].second));
        } else {
            float factor = ((rms+0.01*m_lowAmp)/(1.01*m_lowAmp));
            tempPitchProb.push_back(pair<double, double>
                (tempPitch, yo.freqProb[iCandidate].second*factor));
        }
    }
    m_pitchProb.push_back(tempPitchProb);
    m_timestamp.push_back(timestamp);

    // F0 CANDIDATES
    Feature f;
    f.hasTimestamp = true;
    f.timestamp = timestamp;
    for (size_t i = 0; i < yo.freqProb.size(); ++i)
    {
        f.values.push_back(yo.freqProb[i].first);
    }
    fs[m_oF0Candidates].push_back(f);

    // VOICEDPROB
    f.values.clear();
    float voicedProb = 0;
    for (size_t i = 0; i < yo.freqProb.size(); ++i)
    {
        f.values.push_back(yo.freqProb[i].second);
        voicedProb += yo.freqProb[i].second;
    }
    fs[m_oF0Probs].push_back(f);

    f.values.clear();
    f.values.push_back(voicedProb);
    fs[m_oVoicedProb].push_back(f);

    // SALIENCE -- maybe this should eventually disappear
    f.values.clear();
    float salienceSum = 0;
    for (size_t iBin = 0; iBin < yo.salience.size(); ++iBin)
    {
        f.values.push_back(yo.salience[iBin]);
        salienceSum += yo.salience[iBin];
    }
    fs[m_oCandidateSalience].push_back(f);

    return fs;
}

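// After all blocks have been processed, turn the collected per-frame
// pitch probabilities into the remaining outputs: the smoothed pitch
// track and the note track derived from it.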
PYinVamp::FeatureSet
PYinVamp::getRemainingFeatures()
{
    FeatureSet fs;
    Feature f;
    f.hasTimestamp = true;
    f.hasDuration = false;

    if (m_pitchProb.empty()) {
        return fs;
    }

    // MONO-PITCH STUFF
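    // Smooth the accumulated per-frame pitch candidates into a single
    // pitch track. Unvoiced frames come back as negative frequencies;
    // m_outputUnvoiced controls whether they are skipped, output as
    // positive values, or output as negative values.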
    MonoPitch mp;
    vector<float> mpOut = mp.process(m_pitchProb);
    for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame)
    {
        if (mpOut[iFrame] < 0 && (m_outputUnvoiced==0)) continue;
        f.timestamp = m_timestamp[iFrame];
        f.values.clear();
        if (m_outputUnvoiced == 1)
        {
            f.values.push_back(fabs(mpOut[iFrame]));
        } else {
            f.values.push_back(mpOut[iFrame]);
        }

        fs[m_oSmoothedPitchTrack].push_back(f);
    }

    // MONO-NOTE STUFF
    // std::cerr << "Mono Note Stuff" << std::endl;
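    // Re-express the smoothed pitch track as (MIDI pitch, probability)
    // pairs, with a fixed probability of 0.9 for voiced frames, and let
    // MonoNote segment it into note events.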
    MonoNote mn;
    std::vector<std::vector<std::pair<double, double> > > smoothedPitch;
    for (size_t iFrame = 0; iFrame < mpOut.size(); ++iFrame) {
        std::vector<std::pair<double, double> > temp;
        if (mpOut[iFrame] > 0)
        {
            double tempPitch = 12 * std::log(mpOut[iFrame]/440)/std::log(2.) + 69;
            temp.push_back(std::pair<double,double>(tempPitch, .9));
        }
        smoothedPitch.push_back(temp);
    }
    // vector<MonoNote::FrameOutput> mnOut = mn.process(m_pitchProb);
    vector<MonoNote::FrameOutput> mnOut = mn.process(smoothedPitch);

    // turning feature into a note feature
    f.hasTimestamp = true;
    f.hasDuration = true;
    f.values.clear();

    int onsetFrame = 0;
    bool isVoiced = false;
    bool oldIsVoiced = false;
    size_t nFrame = m_pitchProb.size();

    float minNoteFrames = (m_inputSampleRate*m_pruneThresh) / m_stepSize;

    std::vector<float> notePitchTrack; // collects pitches for one note at a time
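    // Walk the frames, collecting the pitch track of one note at a time.
    // A frame counts as voiced if MonoNote reports a note state below 3,
    // a smoothed pitch exists, and the RMS level does not rise sharply
    // two frames later (a sharp rise, governed by the onset sensitivity,
    // forces a note boundary so a new onset can be reported). When a
    // note ends, its median pitch is converted back to Hz and emitted,
    // unless the note is shorter than the pruning threshold.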
    for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
    {
        isVoiced = mnOut[iFrame].noteState < 3
            && smoothedPitch[iFrame].size() > 0
            && (iFrame >= nFrame-2
                || ((m_level[iFrame]/m_level[iFrame+2]) > m_onsetSensitivity));
        // std::cerr << m_level[iFrame]/m_level[iFrame-1] << " " << isVoiced << std::endl;
        if (isVoiced && iFrame != nFrame-1)
        {
            if (oldIsVoiced == 0) // beginning of a note
            {
                onsetFrame = iFrame;
            }
            float pitch = smoothedPitch[iFrame][0].first;
            notePitchTrack.push_back(pitch); // add to the note's pitch track
        } else { // not currently voiced
            if (oldIsVoiced == 1) // end of note
            {
                // std::cerr << notePitchTrack.size() << " " << minNoteFrames << std::endl;
                if (notePitchTrack.size() >= minNoteFrames)
                {
                    std::sort(notePitchTrack.begin(), notePitchTrack.end());
                    float medianPitch = notePitchTrack[notePitchTrack.size()/2];
                    float medianFreq = std::pow(2,(medianPitch - 69) / 12) * 440;
                    f.values.clear();
                    f.values.push_back(medianFreq);
                    f.timestamp = m_timestamp[onsetFrame];
                    f.duration = m_timestamp[iFrame] - m_timestamp[onsetFrame];
                    fs[m_oNotes].push_back(f);
                }
                notePitchTrack.clear();
            }
        }
        oldIsVoiced = isVoiced;
    }
    return fs;
}