source: src/pitch/pitch_crepe.c @ f5699b9

feature/crepe
Last change on this file since f5699b9 was 57630f6, checked in by Paul Brossier <piem@piem.org>, 3 years ago

[pitch_crepe] first version

  • Property mode set to 100644
File size: 14.9 KB
Line 
1/*
2  Copyright (C) 2018 Paul Brossier <piem@aubio.org>
3
4  This file is part of aubio.
5
6  aubio is free software: you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation, either version 3 of the License, or
9  (at your option) any later version.
10
11  aubio is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  GNU General Public License for more details.
15
16  You should have received a copy of the GNU General Public License
17  along with aubio.  If not, see <http://www.gnu.org/licenses/>.
18
19*/
20
21/* CREPE pitch algorithm
22
23 References
24 ----------
25
26 CREPE: A Convolutional Representation for Pitch Estimation Jong Wook Kim,
27 Justin Salamon, Peter Li, Juan Pablo Bello.  Proceedings of the IEEE
28 International Conference on Acoustics, Speech, and Signal Processing (ICASSP),
29 2018. Available online at https://arxiv.org/abs/1802.06182
30
31 Original implementation available at https://github.com/marl/crepe
32
33*/
34
35#include "aubio_priv.h"
36
37#include "fmat.h"
38#include "ai/tensor.h"
39#include "ai/conv1d.h"
40#include "ai/maxpool1d.h"
41#include "ai/batchnorm.h"
42#include "ai/dense.h"
43#include "io/file_hdf5.h"
44#include "utils/scale.h"
45
46#define HDF5_FILE_PATH "crepe-model-tiny.h5"
47
// public prototypes

// opaque handle on a CREPE pitch detector; the structure is defined further
// down in this file
typedef struct _aubio_pitch_crepe_t aubio_pitch_crepe_t;
// create a detector: builds the network and loads its parameters from
// HDF5_FILE_PATH; returns NULL on failure
aubio_pitch_crepe_t *new_aubio_pitch_crepe(void);
// analyse one frame: `input` must hold as many samples as the network input
// (it is normalised in place), the pitch is written to out->data[0]
void aubio_pitch_crepe_do(aubio_pitch_crepe_t *t, fvec_t *input, fvec_t *out);
// destroy a detector and everything it owns
void del_aubio_pitch_crepe(aubio_pitch_crepe_t *t);
// confidence of the last analysed frame (0 when the frame was skipped)
smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t * o);
// set the confidence threshold under which frames are skipped; returns
// AUBIO_OK on success, AUBIO_FAIL otherwise
uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o, smpl_t
    tolerance);
// current confidence threshold
smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o);

// static prototypes

// read all layer parameters from the HDF5 model file into the layers of `o`;
// called once from new_aubio_pitch_crepe
static uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o);
60
// internal state of a CREPE pitch detector
struct _aubio_pitch_crepe_t
{
  // number of [conv, batchnorm, maxpool] groups; aubio_pitch_crepe_do applies
  // them in that order (conv -> batchnorm -> maxpool)
  uint_t n_layers;
  // layers: each array holds n_layers entries, one per group
  aubio_conv1d_t **conv_layers;
  aubio_maxpool1d_t **maxpool_layers;
  aubio_batchnorm_t **batchnorm_layers;
  // final classifier, fed with the flattened output of the last group
  aubio_dense_t *dense_layer;
  // input/output tensors
  // normalised input frame
  aubio_tensor_t *input_tensor;
  // per-group outputs, n_layers entries each; maxpool_output[i] is the input
  // of group i + 1
  aubio_tensor_t **maxpool_output;
  aubio_tensor_t **batchnorm_output;
  aubio_tensor_t **conv_output;
  // 1-d copy of the last maxpool output, input of the dense layer
  aubio_tensor_t *flattened;
  // activations of the dense layer, one value per output unit
  aubio_tensor_t *dense_output;

  // largest activation found for the last frame, or 0 if the frame was skipped
  smpl_t confidence;
  // frames whose confidence is below this threshold are skipped
  smpl_t tolerance;
  // maps the interpolated output unit index to a pitch value (in cents,
  // according to the comments in aubio_pitch_crepe_do)
  aubio_scale_t *scale;
};
82
83aubio_pitch_crepe_t *new_aubio_pitch_crepe(void)
84{
85  aubio_pitch_crepe_t *o = AUBIO_NEW(aubio_pitch_crepe_t);
86  aubio_tensor_t *block_input;
87  // algorithm constants
88  uint_t input_shape[2] = {1024, 1};
89  uint_t capacity_modes[5] = {4, 8, 16, 24, 32};
90  uint_t n_filters[6] = {32, 4, 4, 4, 8, 16};
91  uint_t widths[6] = {512, 64, 64, 64, 64, 64};
92  uint_t maxpool_stride[1] = {2};
93  uint_t l0_stride[1] = {4};
94  uint_t n_dense = 360;
95
96  // local variables
97  uint_t capacity_mode = 0;
98  uint_t capacity = capacity_modes[capacity_mode];
99  uint_t output_shape[2];
100  uint_t i;
101
102  AUBIO_ASSERT (capacity_mode < 5 && (sint_t)capacity_mode >= 0);
103
104  o->n_layers = 6;
105  // create arrays of layers and tensors
106  o->conv_layers = AUBIO_ARRAY(aubio_conv1d_t*, o->n_layers);
107  o->conv_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
108  o->maxpool_layers = AUBIO_ARRAY(aubio_maxpool1d_t*, o->n_layers);
109  o->maxpool_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
110  o->batchnorm_layers = AUBIO_ARRAY(aubio_batchnorm_t*, o->n_layers);
111  o->batchnorm_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
112
113  if (!o->conv_layers || !o->conv_output
114      || !o->maxpool_layers || !o->maxpool_output
115      || !o->batchnorm_layers || !o->batchnorm_output)
116    goto failure;
117
118  // create layers
119  for (i = 0; i < o->n_layers; i++) {
120    uint_t kern_shape[1] = {widths[i]};
121    // create convolutional layers
122    o->conv_layers[i] = new_aubio_conv1d(n_filters[i] * capacity, kern_shape);
123    if (!o->conv_layers[i]) goto failure;
124    // set padding='same'
125    if (aubio_conv1d_set_padding_mode(o->conv_layers[i], "same") != AUBIO_OK) {
126      goto failure;
127    }
128    // set stride of first layer
129    if ((i == 0) && (aubio_conv1d_set_stride(o->conv_layers[0],
130            l0_stride) != AUBIO_OK) ) {
131      goto failure;
132    }
133
134    // create batchnorm layers
135    o->batchnorm_layers[i] = new_aubio_batchnorm(n_filters[i] * capacity);
136    if (!o->batchnorm_layers[i]) goto failure;
137
138    // create maxpool layers
139    o->maxpool_layers[i] = new_aubio_maxpool1d(maxpool_stride);
140    if (!o->maxpool_layers[i]) goto failure;
141  }
142
143  o->dense_layer = new_aubio_dense(n_dense);
144  if (!o->dense_layer) goto failure;
145
146  // create input/output tensors
147  o->input_tensor = new_aubio_tensor(2, input_shape);
148  if (!o->input_tensor) goto failure;
149  block_input = o->input_tensor;
150  for (i = 0; i < o->n_layers; i++) {
151    // get shape of conv1d output and create its tensor
152    if (aubio_conv1d_get_output_shape(o->conv_layers[i],
153          block_input, output_shape))
154      goto failure;
155    o->conv_output[i] = new_aubio_tensor(2, output_shape);
156    if (!o->conv_output[i]) goto failure;
157
158    // get shape of batchnorm output and create its tensor
159    if (aubio_batchnorm_get_output_shape(o->batchnorm_layers[i],
160          o->conv_output[i], output_shape))
161      goto failure;
162    o->batchnorm_output[i] = new_aubio_tensor(2, output_shape);
163    if (!o->batchnorm_output[i]) goto failure;
164
165    // get shape of maxpool1d output and create its tensor
166    if (aubio_maxpool1d_get_output_shape(o->maxpool_layers[i],
167          o->batchnorm_output[i], output_shape))
168      goto failure;
169    o->maxpool_output[i] = new_aubio_tensor(2, output_shape);
170    if (!o->maxpool_output[i]) goto failure;
171
172    // set input for next block
173    block_input = o->maxpool_output[i];
174  }
175
176  uint_t flattened_dim = o->maxpool_output[5]->shape[0];
177  flattened_dim *= o->maxpool_output[5]->shape[1];
178  uint_t dense_input[1] = {flattened_dim};
179  o->flattened = new_aubio_tensor(1, dense_input);
180  if (!o->flattened) goto failure;
181
182  // permute and flatten
183  aubio_tensor_t *permute_input = o->maxpool_output[5];
184  AUBIO_DBG("permute:           (%d, %d) ->"
185      " (%d, %d) (permutation=(2, 1))\n",
186      permute_input->shape[0], permute_input->shape[1],
187      permute_input->shape[1], permute_input->shape[0]);
188  AUBIO_DBG("flatten:           (%d, %d) -> (%d)\n",
189      permute_input->shape[1], permute_input->shape[0],
190      o->flattened->shape[0]);
191
192  if (aubio_dense_get_output_shape(o->dense_layer, o->flattened, output_shape))
193    goto failure;
194  o->dense_output = new_aubio_tensor(1, output_shape);
195  if (!o->dense_output) goto failure;
196
197  AUBIO_ASSERT(n_dense == output_shape[0]);
198
199  if (aubio_pitch_crepe_load_params(o))
200    goto failure;
201
202  // map output units to midi note
203  smpl_t start = 1997.379408437619;
204  smpl_t end = 7180.;
205  o->scale = new_aubio_scale(0., 359., start, start + end);
206  if (!o->scale) goto failure;
207
208  return o;
209
210failure:
211  del_aubio_pitch_crepe(o);
212  return NULL;
213}
214
215void del_aubio_pitch_crepe(aubio_pitch_crepe_t *o)
216{
217  uint_t i;
218  AUBIO_ASSERT(o);
219
220  if (o->input_tensor) {
221    del_aubio_tensor(o->input_tensor);
222  }
223
224  if (o->batchnorm_output) {
225    for (i = 0; i < o->n_layers; i++) {
226      if (o->batchnorm_output[i])
227        del_aubio_tensor(o->batchnorm_output[i]);
228    }
229    AUBIO_FREE(o->batchnorm_output);
230  }
231
232  if (o->batchnorm_layers) {
233    for (i = 0; i < o->n_layers; i++) {
234      if (o->batchnorm_layers[i])
235        del_aubio_batchnorm(o->batchnorm_layers[i]);
236    }
237    AUBIO_FREE(o->batchnorm_layers);
238  }
239
240  if (o->maxpool_output) {
241    for (i = 0; i < o->n_layers; i++) {
242      if (o->maxpool_output[i])
243        del_aubio_tensor(o->maxpool_output[i]);
244    }
245    AUBIO_FREE(o->maxpool_output);
246  }
247
248  if (o->maxpool_layers) {
249    for (i = 0; i < o->n_layers; i++) {
250      if (o->maxpool_layers[i])
251        del_aubio_maxpool1d(o->maxpool_layers[i]);
252    }
253    AUBIO_FREE(o->maxpool_layers);
254  }
255
256  if (o->conv_output) {
257    for (i = 0; i < o->n_layers; i++) {
258      if (o->conv_output[i])
259        del_aubio_tensor(o->conv_output[i]);
260    }
261    AUBIO_FREE(o->conv_output);
262  }
263
264  if (o->conv_layers) {
265    for (i = 0; i < o->n_layers; i++) {
266      if (o->conv_layers[i])
267        del_aubio_conv1d(o->conv_layers[i]);
268    }
269    AUBIO_FREE(o->conv_layers);
270  }
271
272  if (o->flattened) {
273    del_aubio_tensor(o->flattened);
274  }
275
276  if (o->dense_layer) {
277    del_aubio_dense(o->dense_layer);
278  }
279
280  if (o->dense_output) {
281    del_aubio_tensor(o->dense_output);
282  }
283
284  if (o->scale) {
285    del_aubio_scale(o->scale);
286  }
287
288  AUBIO_FREE(o);
289}
290
291void aubio_pitch_crepe_do(aubio_pitch_crepe_t *o, fvec_t *input, fvec_t *out)
292{
293  uint_t i;
294  AUBIO_ASSERT(o && input);
295  // copy input to input tensor
296  AUBIO_ASSERT(input->length == o->input_tensor->shape[0]);
297  // normalize frame, removing mean and dividing by std
298  smpl_t mean = fvec_mean(input);
299  fvec_add(input, -mean);
300  smpl_t std = 0.;
301  for (i = 0; i < input->length; i++) {
302    std += SQR(input->data[i]);
303  }
304  std = SQRT(std / (smpl_t)input->length);
305  if (std < 1.e-7) std = 1;
306
307  for (i = 0; i < input->length; i++) {
308    o->input_tensor->data[0][i] = input->data[i] / std;
309  }
310
311  aubio_tensor_t *block_input = o->input_tensor;
312  for (i = 0; i < o->n_layers; i++) {
313    aubio_conv1d_do(o->conv_layers[i], block_input,
314        o->conv_output[i]);
315    aubio_batchnorm_do(o->batchnorm_layers[i], o->conv_output[i],
316        o->batchnorm_output[i]);
317    aubio_maxpool1d_do(o->maxpool_layers[i], o->batchnorm_output[i],
318        o->maxpool_output[i]);
319    block_input = o->maxpool_output[i];
320  }
321
322  aubio_tensor_t *permute_input = o->maxpool_output[5];
323  // perform flattening (permutation has no effect here, order unchanged)
324  AUBIO_ASSERT (permute_input->size == o->flattened->size);
325  for (i = 0; i < permute_input->size; i++) {
326    o->flattened->data[0][i] = permute_input->data[0][i];
327  }
328
329  // compute dense layer
330  aubio_dense_do(o->dense_layer, o->flattened, o->dense_output);
331
332#if 0
333  // print debug output
334  for (i = 0; i < o->n_layers; i++) {
335    AUBIO_DBG("pitch_crepe: conv1d[%d]    %f\n", i,
336        aubio_tensor_max(o->conv_output[i]));
337    AUBIO_DBG("pitch_crepe: batchnorm[%d] %f\n", i,
338        aubio_tensor_max(o->batchnorm_output[i]));
339    AUBIO_DBG("pitch_crepe: maxpool1d[%d] %f\n", i,
340        aubio_tensor_max(o->maxpool_output[i]));
341  }
342  AUBIO_DBG("pitch_crepe: dense %f\n", aubio_tensor_max(o->dense_output));
343#endif
344
345  // find maximum activation
346  fvec_t activations;
347  aubio_tensor_as_fvec(o->dense_output, &activations);
348  uint_t argmax = fvec_max_elem(&activations);
349  o->confidence = activations.data[argmax];
350
351  // skip frames with no activation at all (e.g. silence)
352  // or with insufficient confidence
353  if ((argmax == activations.length - 1)
354      || (o->confidence < o->tolerance)) {
355    out->data[0] = -100.;
356    o->confidence = 0;
357    return;
358  }
359
360  // perform interpolation across neighbouring outputs
361  sint_t start = MAX(0, (sint_t)argmax - 4);
362  uint_t end = MIN(argmax + 5, activations.length);
363
364  smpl_t prod = 0;
365  smpl_t weight = 0;
366  smpl_t scaling = 0;
367  for (i = start; i < end; i++) {
368    scaling = (smpl_t)(i);
369    prod += activations.data[i] * scaling;
370    weight += activations.data[i];
371  }
372  out->data[0] = prod / weight;
373
374  // map output units to midi output
375  aubio_scale_do(o->scale, out);
376
377  // convert cents to midi
378  out->data[0] /= 100.;
379
380  // final bias (f_ref = 10Hz -> 3.48 midi)
381  out->data[0] += 3.486821174621582;
382}
383
384smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t* o)
385{
386  return o->confidence;
387}
388
389uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o,
390    smpl_t tolerance)
391{
392  if (o->tolerance < 0 || o->tolerance > 1) return AUBIO_FAIL;
393  o->tolerance = tolerance;
394  return AUBIO_OK;
395}
396
397smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o)
398{
399  return o->tolerance;
400}
401
402uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o)
403{
404  uint_t i;
405  aubio_tensor_t *k = NULL;
406  fvec_t *vec = NULL;
407
408  AUBIO_ASSERT(o);
409
410  aubio_file_hdf5_t *hdf5 = new_aubio_file_hdf5(HDF5_FILE_PATH);
411  if (!hdf5) return AUBIO_FAIL;
412
413  // get kernels
414  for (i = 0; i < o->n_layers; i++) {
415    char_t *fmt_key = "/conv%d/conv%d_3/kernel:0";
416    char_t key[PATH_MAX];
417    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
418    k = aubio_conv1d_get_kernel(o->conv_layers[i]);
419
420    // push dimension
421    k->shape[3] = k->shape[2]; k->shape[2] = k->shape[1]; k->shape[1] = 1;
422    k->ndim += 1;
423    // load params from hdf5 into kernel tensor
424    if (aubio_file_hdf5_load_dataset_into_tensor(hdf5, key, k))
425      return AUBIO_FAIL;
426    // pop dimension
427    k->shape[1] = k->shape[2]; k->shape[2] = k->shape[3]; k->shape[3] = 0;
428    k->ndim -= 1;
429  }
430
431  // get bias vectors
432  for (i = 0; i < o->n_layers; i++) {
433    char_t *fmt_key = "/conv%d/conv%d_3/bias:0";
434    char_t key[PATH_MAX];
435    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
436    vec = aubio_conv1d_get_bias(o->conv_layers[i]);
437    // load params from hdf5 into kernel tensor
438    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
439      return AUBIO_FAIL;
440  }
441
442  // batchnorm
443  for (i = 0; i < o->n_layers; i++) {
444    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/gamma:0";
445    char_t key[PATH_MAX];
446    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
447    // get kernel matrix
448    vec = aubio_batchnorm_get_gamma(o->batchnorm_layers[i]);
449    // load params from hdf5 into kernel tensor
450    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
451      return AUBIO_FAIL;
452  }
453  for (i = 0; i < o->n_layers; i++) {
454    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/beta:0";
455    char_t key[PATH_MAX];
456    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
457    // get kernel matrix
458    vec = aubio_batchnorm_get_beta(o->batchnorm_layers[i]);
459    // load params from hdf5 into kernel tensor
460    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
461      return AUBIO_FAIL;
462  }
463  for (i = 0; i < o->n_layers; i++) {
464    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/moving_mean:0";
465    char_t key[PATH_MAX];
466    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
467    // get kernel matrix
468    vec = aubio_batchnorm_get_moving_mean(o->batchnorm_layers[i]);
469    // load params from hdf5 into kernel tensor
470    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
471      return AUBIO_FAIL;
472  }
473  for (i = 0; i < o->n_layers; i++) {
474    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/moving_variance:0";
475    char_t key[PATH_MAX];
476    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
477    // get kernel matrix
478    vec = aubio_batchnorm_get_moving_variance(o->batchnorm_layers[i]);
479    // load params from hdf5 into kernel tensor
480    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
481      return AUBIO_FAIL;
482  }
483
484  {
485    char_t *key = "/classifier/classifier_3/kernel:0";
486    fmat_t *d = aubio_dense_get_weights(o->dense_layer);
487    if (aubio_file_hdf5_load_dataset_into_matrix(hdf5, key, d))
488      return AUBIO_FAIL;
489
490    key = "/classifier/classifier_3/bias:0";
491    fvec_t *v = aubio_dense_get_bias(o->dense_layer);
492    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, v))
493      return AUBIO_FAIL;
494  }
495
496  if (hdf5) {
497    del_aubio_file_hdf5(hdf5);
498  }
499
500  return AUBIO_OK;
501}
Note: See TracBrowser for help on using the repository browser.