Krotos Modules 3
Loading...
Searching...
No Matches
AudioEmbedding.cpp
Go to the documentation of this file.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <fstream>
#include <functional>
#include <numeric>
#include <string>
#include <vector>
2
3namespace krotos
4{
5
6//=====================================================================================
8 : m_fftSize(2048), m_hopSize(1024), m_fft(static_cast<int>(std::log2(m_fftSize))),
9 m_window(m_fftSize + 1, dsp::WindowingFunction<float>::hann, false)
10{
11 initialise();
12}
13
15{
16 // Note: if these params change the autoencoder should be retrained
17 const int sr = 48000;
18 const int n_mels = 40;
19 const float fmin = 20.f;
20 const float fmax = 8000.f;
21 const bool htk = false;
22 m_mel = AudioEmbedding::mel(sr, m_fftSize, n_mels, fmin, fmax, htk);
23 m_initialised = true;
24}
25
26std::vector<float> AudioEmbedding::forward(const AudioSampleBuffer& buffer)
27{
28 assert(m_initialised);
29
30 const auto inputSize = buffer.getNumSamples();
31 const auto inputData = buffer.getReadPointer(0);
32
33 std::vector<float> sigFrame(2 * m_fftSize);
34 std::vector<float> embedding(m_mel.size(), 0.f);
35
36 std::size_t numFrames = 0;
37 for (int posin = 0; posin < inputSize; posin += m_hopSize)
38 {
39 // extract a signal frame
40 for (int i = 0; i < m_fftSize; ++i)
41 {
42 if (posin + i < inputSize)
43 sigFrame.at(i) = inputData[posin + i];
44 else
45 sigFrame.at(i) = 0.f;
46 }
47
48 // apply window function signal frame
49 m_window.multiplyWithWindowingTable(sigFrame.data(), m_fftSize);
50
51 // apply FFT on the frame - get back frequency magnitude
52 m_fft.performFrequencyOnlyForwardTransform(sigFrame.data(), false);
53
54 // apply mel filterbank
55 auto melFrame = applyMelFilterbank(sigFrame);
56
57 // accumulate mel frames
58 std::transform(embedding.begin(), embedding.end(), melFrame.begin(), embedding.begin(), std::plus<float>());
59
60 ++numFrames;
61 }
62
63 // average and normalise audio embedding
64 numFrames = numFrames > 0 ? numFrames : 1;
65 std::for_each(embedding.begin(), embedding.end(), [numFrames](float& v) { v = v / numFrames; });
66 L2Normalise(embedding);
67 assert(embedding.size() == 40);
68
69 // apply autoencoder and renormalise
70 embedding = applyAutoEncoder(embedding);
71 L2Normalise(embedding);
72 assert(embedding.size() == 20);
73
74 return embedding;
75}
76
77void AudioEmbedding::L2Normalise(std::vector<float>& x, float eps)
78{
79 const auto norm = std::sqrt(std::inner_product(x.begin(), x.end(), x.begin(), eps));
80 std::for_each(x.begin(), x.end(), [norm](float& v) { v = v / norm; });
81}
82
83std::vector<float> AudioEmbedding::applyMelFilterbank(const std::vector<float>& x)
84{
85 const auto n = 1 + m_fftSize / 2;
86 std::vector<float> melFrame(m_mel.size(), 0.f);
87 for (std::size_t i = 0; i < m_mel.size(); ++i)
88 {
89 melFrame[i] = std::inner_product(x.begin(), x.begin() + n, m_mel[i].begin(), 0.f);
90 }
91 return melFrame;
92}
93
94std::vector<float> AudioEmbedding::applyAutoEncoder(const std::vector<float>& x)
95{
96 std::vector<float> latent(m_weight.size(), 0.f);
97 for (std::size_t i = 0; i < m_weight.size(); ++i)
98 {
99 latent[i] = std::inner_product(x.begin(), x.end(), m_weight[i].begin(), 0.f);
100 }
101 return latent;
102}
103
104std::vector<float> AudioEmbedding::hz_to_mel(std::vector<float> freqs, bool htk)
105{
106 std::vector<float> mels(freqs.size());
107 if (htk)
108 {
109 for (std::size_t i = 0; i < mels.size(); ++i)
110 {
111 mels[i] = 2595.0f * std::log10f(1.0f + freqs[i] / 700.0f);
112 }
113 return mels;
114 }
115
116 const float fmin = 0.0f;
117 const float f_sp = 200.0f / 3.0f;
118
119 for (std::size_t i = 0; i < mels.size(); ++i)
120 {
121 mels[i] = (freqs[i] - fmin) / f_sp;
122 }
123
124 const float min_log_hz = 1000.0f;
125 const float min_log_mel = (min_log_hz - fmin) / f_sp;
126 const float logstep = std::log(6.4f) / 27.0f;
127
128 for (std::size_t i = 0; i < mels.size(); ++i)
129 {
130 if (freqs[i] >= min_log_hz)
131 {
132 mels[i] = min_log_mel + std::log(freqs[i] / min_log_hz) / logstep;
133 }
134 }
135
136 return mels;
137}
138
139std::vector<float> AudioEmbedding::mel_to_hz(std::vector<float> mels, bool htk)
140{
141 std::vector<float> freqs(mels.size());
142 if (htk)
143 {
144 for (std::size_t i = 0; i < mels.size(); ++i)
145 {
146 freqs[i] = 700.0f * (std::pow(10.0f, mels[i] / 2595.0f) - 1.0f);
147 }
148 return freqs;
149 }
150
151 const float f_min = 0.0f;
152 const float f_sp = 200.0f / 3.0f;
153
154 for (std::size_t i = 0; i < mels.size(); ++i)
155 {
156 freqs[i] = f_min + f_sp * mels[i];
157 }
158
159 const float min_log_hz = 1000.0f;
160 const float min_log_mel = (min_log_hz - f_min) / f_sp;
161 const float logstep = std::log(6.4f) / 27.0f;
162
163 for (std::size_t i = 0; i < mels.size(); ++i)
164 {
165 if (mels[i] >= min_log_mel)
166 {
167 freqs[i] = min_log_hz * std::exp(logstep * (mels[i] - min_log_mel));
168 }
169 }
170
171 return freqs;
172}
173
174std::vector<float> AudioEmbedding::mel_frequencies(float fmin, float fmax, int n_mels, bool htk)
175{
176 const auto fmin_v = std::vector<float>(1, fmin);
177 const auto fmax_v = std::vector<float>(1, fmax);
178 const float min_mel = hz_to_mel(fmin_v, htk)[0];
179 const float max_mel = hz_to_mel(fmax_v, htk)[0];
180
181 const auto step = (max_mel - min_mel) / static_cast<float>(n_mels - 1);
182 std::vector<float> mels = std::vector<float>(n_mels);
183 for (int i = 0; i < n_mels; ++i)
184 {
185 mels[i] = min_mel + step * static_cast<float>(i);
186 }
187
188 return mel_to_hz(mels, htk);
189}
190
191std::vector<std::vector<float>> AudioEmbedding::mel(int sr, int n_fft, int n_mels, float fmin, float fmax, bool htk)
192{
193 const int length = 1 + n_fft / 2;
194 if (fmax < 0.0f)
195 {
196 fmax = static_cast<float>(sr) / 2.0f;
197 }
198
199 std::vector<std::vector<float>> weights(n_mels, std::vector<float>(length));
200
201 std::vector<float> fft_freqs(length);
202 for (int i = 0; i < length; ++i)
203 {
204 fft_freqs[i] = static_cast<float>(sr) / static_cast<float>(n_fft) * static_cast<float>(i);
205 }
206
207 auto mel_f = mel_frequencies(fmin, fmax, n_mels + 2, htk);
208
209 std::vector<float> fdiff(mel_f.size() - 1);
210 for (std::size_t i = 0; i < fdiff.size(); ++i)
211 {
212 fdiff[i] = mel_f[i + 1] - mel_f[i];
213 }
214
215 std::vector<std::vector<float>> ramps(mel_f.size(), std::vector<float>(fft_freqs.size()));
216 for (std::size_t i = 0; i < mel_f.size(); ++i)
217 {
218 for (std::size_t j = 0; j < fft_freqs.size(); ++j)
219 {
220 ramps[i][j] = mel_f[i] - fft_freqs[j];
221 }
222 }
223
224 auto lower = std::vector<float>(fft_freqs.size());
225 auto upper = std::vector<float>(fft_freqs.size());
226 for (int i = 0; i < n_mels; ++i)
227 {
228 for (std::size_t j = 0; j < lower.size(); j++)
229 {
230 lower[j] = -1 * ramps[i][j] / fdiff[i];
231 }
232
233 for (std::size_t j = 0; j < lower.size(); ++j)
234 {
235 upper[j] = ramps[i + 2][j] / fdiff[i + 1];
236 }
237
238 for (std::size_t j = 0; j < lower.size(); ++j)
239 {
240 auto lower_upper_minimum = 0.0f;
241 if (lower[j] > upper[j])
242 {
243 lower_upper_minimum = upper[j];
244 }
245 else
246 {
247 lower_upper_minimum = lower[j];
248 }
249
250 if (lower_upper_minimum > 0.0f)
251 {
252 weights[i][j] = lower_upper_minimum;
253 }
254 else
255 {
256 weights[i][j] = 0.0f;
257 }
258 }
259 }
260
261 for (int i = 0; i < n_mels; ++i)
262 {
263 const auto enorm = 2.0f / (mel_f[2 + i] - mel_f[i]);
264 for (int j = 0; j < length; ++j)
265 {
266 weights[i][j] = enorm * weights[i][j];
267 }
268 }
269
270 return weights;
271}
272
273void AudioEmbedding::writeMatrixToFile(const std::vector<std::vector<float>>& matrix, const std::string& filename)
274{
275 std::ofstream outputFile(filename);
276 if (!outputFile.is_open())
277 {
278 DBG("Failed to open file: ");
279 return;
280 }
281
282 for (const auto& row : matrix)
283 {
284 for (const auto& value : row)
285 {
286 outputFile << value << " ";
287 }
288 outputFile << '\n';
289 }
290 outputFile.close();
291}
292
293} // namespace krotos
std::vector< float > forward(const AudioSampleBuffer &buffer)
Definition AudioEmbedding.cpp:26
std::vector< float > applyAutoEncoder(const std::vector< float > &x)
Definition AudioEmbedding.cpp:94
int m_hopSize
Definition AudioEmbedding.h:78
std::vector< float > mel_to_hz(std::vector< float > mels, bool htk=false)
Definition AudioEmbedding.cpp:139
int m_fftSize
Definition AudioEmbedding.h:77
void writeMatrixToFile(const std::vector< std::vector< float > > &matrix, const std::string &filename)
Definition AudioEmbedding.cpp:273
void initialise()
Definition AudioEmbedding.cpp:14
std::vector< std::vector< float > > m_mel
Definition AudioEmbedding.h:81
juce::dsp::FFT m_fft
Definition AudioEmbedding.h:79
bool m_initialised
Definition AudioEmbedding.h:76
void L2Normalise(std::vector< float > &x, float eps=1e-5f)
Definition AudioEmbedding.cpp:77
juce::dsp::WindowingFunction< float > m_window
Definition AudioEmbedding.h:80
std::vector< float > hz_to_mel(std::vector< float > freqs, bool htk=false)
Definition AudioEmbedding.cpp:104
std::vector< float > applyMelFilterbank(const std::vector< float > &x)
Definition AudioEmbedding.cpp:83
AudioEmbedding()
Definition AudioEmbedding.cpp:7
std::vector< std::vector< float > > m_weight
Definition AudioEmbedding.h:84
std::vector< std::vector< float > > mel(int sr, int n_fft, int n_mels, float fmin, float fmax, bool htk=false)
Definition AudioEmbedding.cpp:191
std::vector< float > mel_frequencies(float fmin, float fmax, int n_mels, bool htk=false)
Definition AudioEmbedding.cpp:174
Definition AirAbsorptionFilter.cpp:2