source: src/pitch/pitch_crepe.c @ 75e0b81

feature/crepe
Last change on this file since 75e0b81 was 557e0a7, checked in by Paul Brossier <piem@piem.org>, 3 years ago

[pitch_crepe] update creation

  • Property mode set to 100644
File size: 15.4 KB
Line 
1/*
2  Copyright (C) 2018 Paul Brossier <piem@aubio.org>
3
4  This file is part of aubio.
5
6  aubio is free software: you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation, either version 3 of the License, or
9  (at your option) any later version.
10
11  aubio is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  GNU General Public License for more details.
15
16  You should have received a copy of the GNU General Public License
17  along with aubio.  If not, see <http://www.gnu.org/licenses/>.
18
19*/
20
21/* CREPE pitch algorithm
22
23 References
24 ----------
25
26 CREPE: A Convolutional Representation for Pitch Estimation Jong Wook Kim,
27 Justin Salamon, Peter Li, Juan Pablo Bello.  Proceedings of the IEEE
28 International Conference on Acoustics, Speech, and Signal Processing (ICASSP),
29 2018. Available online at https://arxiv.org/abs/1802.06182
30
31 Original implementation available at https://github.com/marl/crepe
32
33*/
34
35#include "aubio_priv.h"
36
37#include "fmat.h"
38#include "ai/tensor.h"
39#include "ai/activation.h"
40#include "ai/conv1d.h"
41#include "ai/maxpool1d.h"
42#include "ai/batchnorm.h"
43#include "ai/dense.h"
44#include "io/file_hdf5.h"
45#include "utils/scale.h"
46
47#define HDF5_FILE_PATH "crepe-model-tiny.h5"
48
49// public prototypes
50typedef struct _aubio_pitch_crepe_t aubio_pitch_crepe_t;
51aubio_pitch_crepe_t *new_aubio_pitch_crepe(void);
52void aubio_pitch_crepe_do(aubio_pitch_crepe_t *t, fvec_t *input, fvec_t *out);
53void del_aubio_pitch_crepe(aubio_pitch_crepe_t *t);
54smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t * o);
55uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o, smpl_t
56    tolerance);
57smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o);
58
59// static prototypes
60static uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o);
61
62struct _aubio_pitch_crepe_t
63{
64  // number of [conv, maxpool, batchnorm] groups
65  uint_t n_layers;
66  // layers
67  aubio_conv1d_t **conv_layers;
68  aubio_batchnorm_t **batchnorm_layers;
69  aubio_maxpool1d_t **maxpool_layers;
70  aubio_dense_t *dense_layer;
71  // input/output tensors
72  aubio_tensor_t *input_tensor;
73  aubio_tensor_t **conv_output;
74  aubio_tensor_t **batchnorm_output;
75  aubio_tensor_t **maxpool_output;
76  aubio_tensor_t *flattened;
77  aubio_tensor_t *dense_output;
78
79  smpl_t confidence;
80  smpl_t tolerance;
81  aubio_scale_t *scale;
82};
83
84aubio_pitch_crepe_t *new_aubio_pitch_crepe(void)
85{
86  aubio_pitch_crepe_t *o = AUBIO_NEW(aubio_pitch_crepe_t);
87  aubio_tensor_t *block_input;
88  // algorithm constants
89  uint_t input_shape[2] = {1024, 1};
90  uint_t capacity_modes[5] = {4, 8, 16, 24, 32};
91  uint_t n_filters[6] = {32, 4, 4, 4, 8, 16};
92  uint_t widths[6] = {512, 64, 64, 64, 64, 64};
93  uint_t maxpool_stride[1] = {2};
94  uint_t l0_stride[1] = {4};
95  uint_t n_dense = 360;
96
97  // local variables
98  uint_t capacity_mode = 0;
99  uint_t capacity = capacity_modes[capacity_mode];
100  uint_t output_shape[2];
101  uint_t i;
102
103#if defined(HAVE_BLAS) && defined(HAVE_OPENBLAS_CBLAS_H)
104  // workaround to prevent openblas from opening multiple threads, since
105  // the overhead appears to be higher than using a single thread.
106  openblas_set_num_threads(1);
107#endif
108
109  AUBIO_ASSERT (capacity_mode < 5 && (sint_t)capacity_mode >= 0);
110
111  o->n_layers = 6;
112  // create arrays of layers and tensors
113  o->conv_layers = AUBIO_ARRAY(aubio_conv1d_t*, o->n_layers);
114  o->conv_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
115  o->maxpool_layers = AUBIO_ARRAY(aubio_maxpool1d_t*, o->n_layers);
116  o->maxpool_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
117  o->batchnorm_layers = AUBIO_ARRAY(aubio_batchnorm_t*, o->n_layers);
118  o->batchnorm_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
119
120  if (!o->conv_layers || !o->conv_output
121      || !o->maxpool_layers || !o->maxpool_output
122      || !o->batchnorm_layers || !o->batchnorm_output)
123    goto failure;
124
125  // create layers
126  for (i = 0; i < o->n_layers; i++) {
127    uint_t kern_shape[1] = {widths[i]};
128    // create convolutional layers
129    o->conv_layers[i] = new_aubio_conv1d(n_filters[i] * capacity, kern_shape);
130    if (!o->conv_layers[i]) goto failure;
131    // set padding='same'
132    if (aubio_conv1d_set_padding_mode(o->conv_layers[i], "same") != AUBIO_OK) {
133      goto failure;
134    }
135    // set stride of first layer
136    if ((i == 0) && (aubio_conv1d_set_stride(o->conv_layers[0],
137            l0_stride) != AUBIO_OK) ) {
138      goto failure;
139    }
140
141    // create batchnorm layers
142    o->batchnorm_layers[i] = new_aubio_batchnorm();
143    if (!o->batchnorm_layers[i]) goto failure;
144
145    // create maxpool layers
146    o->maxpool_layers[i] = new_aubio_maxpool1d(maxpool_stride);
147    if (!o->maxpool_layers[i]) goto failure;
148  }
149
150  o->dense_layer = new_aubio_dense(n_dense);
151  if (!o->dense_layer) goto failure;
152
153  // create input/output tensors
154  o->input_tensor = new_aubio_tensor(2, input_shape);
155  if (!o->input_tensor) goto failure;
156  block_input = o->input_tensor;
157  for (i = 0; i < o->n_layers; i++) {
158    // get shape of conv1d output and create its tensor
159    if (aubio_conv1d_get_output_shape(o->conv_layers[i],
160          block_input, output_shape))
161      goto failure;
162    o->conv_output[i] = new_aubio_tensor(2, output_shape);
163    if (!o->conv_output[i]) goto failure;
164
165    // get shape of batchnorm output and create its tensor
166    if (aubio_batchnorm_get_output_shape(o->batchnorm_layers[i],
167          o->conv_output[i], output_shape))
168      goto failure;
169    o->batchnorm_output[i] = new_aubio_tensor(2, output_shape);
170    if (!o->batchnorm_output[i]) goto failure;
171
172    // get shape of maxpool1d output and create its tensor
173    if (aubio_maxpool1d_get_output_shape(o->maxpool_layers[i],
174          o->batchnorm_output[i], output_shape))
175      goto failure;
176    o->maxpool_output[i] = new_aubio_tensor(2, output_shape);
177    if (!o->maxpool_output[i]) goto failure;
178
179    // set input for next block
180    block_input = o->maxpool_output[i];
181  }
182
183  uint_t flattened_dim = o->maxpool_output[5]->shape[0];
184  flattened_dim *= o->maxpool_output[5]->shape[1];
185  uint_t dense_input[1] = {flattened_dim};
186  o->flattened = new_aubio_tensor(1, dense_input);
187  if (!o->flattened) goto failure;
188
189  // permute and flatten
190  aubio_tensor_t *permute_input = o->maxpool_output[5];
191  AUBIO_DBG("permute:           (%d, %d) ->"
192      " (%d, %d) (permutation=(2, 1))\n",
193      permute_input->shape[0], permute_input->shape[1],
194      permute_input->shape[1], permute_input->shape[0]);
195  AUBIO_DBG("flatten:           (%d, %d) -> (%d)\n",
196      permute_input->shape[1], permute_input->shape[0],
197      o->flattened->shape[0]);
198
199  if (aubio_dense_get_output_shape(o->dense_layer, o->flattened, output_shape))
200    goto failure;
201  o->dense_output = new_aubio_tensor(1, output_shape);
202  if (!o->dense_output) goto failure;
203
204  AUBIO_ASSERT(n_dense == output_shape[0]);
205
206  if (aubio_pitch_crepe_load_params(o))
207    goto failure;
208
209  // map output units to midi note
210  smpl_t start = 1997.379408437619;
211  smpl_t end = 7180.;
212  o->scale = new_aubio_scale(0., 359., start, start + end);
213  if (!o->scale) goto failure;
214
215  return o;
216
217failure:
218  del_aubio_pitch_crepe(o);
219  return NULL;
220}
221
222void del_aubio_pitch_crepe(aubio_pitch_crepe_t *o)
223{
224  uint_t i;
225  AUBIO_ASSERT(o);
226
227  if (o->input_tensor) {
228    del_aubio_tensor(o->input_tensor);
229  }
230
231  if (o->batchnorm_output) {
232    for (i = 0; i < o->n_layers; i++) {
233      if (o->batchnorm_output[i])
234        del_aubio_tensor(o->batchnorm_output[i]);
235    }
236    AUBIO_FREE(o->batchnorm_output);
237  }
238
239  if (o->batchnorm_layers) {
240    for (i = 0; i < o->n_layers; i++) {
241      if (o->batchnorm_layers[i])
242        del_aubio_batchnorm(o->batchnorm_layers[i]);
243    }
244    AUBIO_FREE(o->batchnorm_layers);
245  }
246
247  if (o->maxpool_output) {
248    for (i = 0; i < o->n_layers; i++) {
249      if (o->maxpool_output[i])
250        del_aubio_tensor(o->maxpool_output[i]);
251    }
252    AUBIO_FREE(o->maxpool_output);
253  }
254
255  if (o->maxpool_layers) {
256    for (i = 0; i < o->n_layers; i++) {
257      if (o->maxpool_layers[i])
258        del_aubio_maxpool1d(o->maxpool_layers[i]);
259    }
260    AUBIO_FREE(o->maxpool_layers);
261  }
262
263  if (o->conv_output) {
264    for (i = 0; i < o->n_layers; i++) {
265      if (o->conv_output[i])
266        del_aubio_tensor(o->conv_output[i]);
267    }
268    AUBIO_FREE(o->conv_output);
269  }
270
271  if (o->conv_layers) {
272    for (i = 0; i < o->n_layers; i++) {
273      if (o->conv_layers[i])
274        del_aubio_conv1d(o->conv_layers[i]);
275    }
276    AUBIO_FREE(o->conv_layers);
277  }
278
279  if (o->flattened) {
280    del_aubio_tensor(o->flattened);
281  }
282
283  if (o->dense_layer) {
284    del_aubio_dense(o->dense_layer);
285  }
286
287  if (o->dense_output) {
288    del_aubio_tensor(o->dense_output);
289  }
290
291  if (o->scale) {
292    del_aubio_scale(o->scale);
293  }
294
295  AUBIO_FREE(o);
296}
297
298void aubio_pitch_crepe_do(aubio_pitch_crepe_t *o, fvec_t *input, fvec_t *out)
299{
300  uint_t i;
301  AUBIO_ASSERT(o && input);
302  // copy input to input tensor
303  AUBIO_ASSERT(input->length == o->input_tensor->shape[0]);
304  // normalize frame, removing mean and dividing by std
305  smpl_t mean = fvec_mean(input);
306  fvec_add(input, -mean);
307  smpl_t std = 0.;
308  for (i = 0; i < input->length; i++) {
309    std += SQR(input->data[i]);
310  }
311  std = SQRT(std / (smpl_t)input->length);
312  if (std < 1.e-7) std = 1;
313
314  for (i = 0; i < input->length; i++) {
315    o->input_tensor->data[0][i] = input->data[i] / std;
316  }
317
318  aubio_tensor_t *block_input = o->input_tensor;
319  for (i = 0; i < o->n_layers; i++) {
320    aubio_conv1d_do(o->conv_layers[i], block_input,
321        o->conv_output[i]);
322    // relu activation
323    aubio_activation_relu(o->conv_output[i]);
324    aubio_batchnorm_do(o->batchnorm_layers[i], o->conv_output[i],
325        o->batchnorm_output[i]);
326    aubio_maxpool1d_do(o->maxpool_layers[i], o->batchnorm_output[i],
327        o->maxpool_output[i]);
328    block_input = o->maxpool_output[i];
329  }
330
331  aubio_tensor_t *permute_input = o->maxpool_output[5];
332  // perform flattening (permutation has no effect here, order unchanged)
333  AUBIO_ASSERT (permute_input->size == o->flattened->size);
334  for (i = 0; i < permute_input->size; i++) {
335    o->flattened->data[0][i] = permute_input->data[0][i];
336  }
337
338  // compute dense layer
339  aubio_dense_do(o->dense_layer, o->flattened, o->dense_output);
340
341  // sigmoid activation
342  aubio_activation_sigmoid(o->dense_output);
343
344#if 0
345  // print debug output
346  for (i = 0; i < o->n_layers; i++) {
347    AUBIO_DBG("pitch_crepe: conv1d[%d]    %f\n", i,
348        aubio_tensor_max(o->conv_output[i]));
349    AUBIO_DBG("pitch_crepe: batchnorm[%d] %f\n", i,
350        aubio_tensor_max(o->batchnorm_output[i]));
351    AUBIO_DBG("pitch_crepe: maxpool1d[%d] %f\n", i,
352        aubio_tensor_max(o->maxpool_output[i]));
353  }
354  AUBIO_DBG("pitch_crepe: dense %f\n", aubio_tensor_max(o->dense_output));
355#endif
356
357  // find maximum activation
358  fvec_t activations;
359  aubio_tensor_as_fvec(o->dense_output, &activations);
360  uint_t argmax = fvec_max_elem(&activations);
361  o->confidence = activations.data[argmax];
362
363  // skip frames with no activation at all (e.g. silence)
364  // or with insufficient confidence
365  if ((argmax == activations.length - 1)
366      || (o->confidence < o->tolerance)) {
367    out->data[0] = -100.;
368    o->confidence = 0;
369    return;
370  }
371
372  // perform interpolation across neighbouring outputs
373  sint_t start = MAX(0, (sint_t)argmax - 4);
374  uint_t end = MIN(argmax + 5, activations.length);
375
376  smpl_t prod = 0;
377  smpl_t weight = 0;
378  smpl_t scaling = 0;
379  for (i = start; i < end; i++) {
380    scaling = (smpl_t)(i);
381    prod += activations.data[i] * scaling;
382    weight += activations.data[i];
383  }
384  out->data[0] = prod / weight;
385
386  // map output units to midi output
387  aubio_scale_do(o->scale, out);
388
389  // convert cents to midi
390  out->data[0] /= 100.;
391
392  // final bias (f_ref = 10Hz -> 3.48 midi)
393  out->data[0] += 3.486821174621582;
394}
395
396smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t* o)
397{
398  return o->confidence;
399}
400
401uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o,
402    smpl_t tolerance)
403{
404  if (o->tolerance < 0 || o->tolerance > 1) return AUBIO_FAIL;
405  o->tolerance = tolerance;
406  return AUBIO_OK;
407}
408
409smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o)
410{
411  return o->tolerance;
412}
413
414uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o)
415{
416#ifdef HAVE_HDF5
417  uint_t i;
418  aubio_tensor_t *k = NULL;
419  fvec_t *vec = NULL;
420
421  AUBIO_ASSERT(o);
422
423  aubio_file_hdf5_t *hdf5 = new_aubio_file_hdf5(HDF5_FILE_PATH);
424  if (!hdf5) return AUBIO_FAIL;
425
426  // get kernels
427  for (i = 0; i < o->n_layers; i++) {
428    char_t *fmt_key = "/conv%d/conv%d_3/kernel:0";
429    char_t key[PATH_MAX];
430    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
431    k = aubio_conv1d_get_kernel(o->conv_layers[i]);
432
433    // push dimension
434    k->shape[3] = k->shape[2]; k->shape[2] = k->shape[1]; k->shape[1] = 1;
435    k->ndim += 1;
436    // load params from hdf5 into kernel tensor
437    if (aubio_file_hdf5_load_dataset_into_tensor(hdf5, key, k))
438      return AUBIO_FAIL;
439    // pop dimension
440    k->shape[1] = k->shape[2]; k->shape[2] = k->shape[3]; k->shape[3] = 0;
441    k->ndim -= 1;
442  }
443
444  // get bias vectors
445  for (i = 0; i < o->n_layers; i++) {
446    char_t *fmt_key = "/conv%d/conv%d_3/bias:0";
447    char_t key[PATH_MAX];
448    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
449    vec = aubio_conv1d_get_bias(o->conv_layers[i]);
450    // load params from hdf5 into kernel tensor
451    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
452      return AUBIO_FAIL;
453  }
454
455  // batchnorm
456  for (i = 0; i < o->n_layers; i++) {
457    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/gamma:0";
458    char_t key[PATH_MAX];
459    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
460    // get kernel matrix
461    vec = aubio_batchnorm_get_gamma(o->batchnorm_layers[i]);
462    // load params from hdf5 into kernel tensor
463    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
464      return AUBIO_FAIL;
465  }
466  for (i = 0; i < o->n_layers; i++) {
467    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/beta:0";
468    char_t key[PATH_MAX];
469    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
470    // get kernel matrix
471    vec = aubio_batchnorm_get_beta(o->batchnorm_layers[i]);
472    // load params from hdf5 into kernel tensor
473    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
474      return AUBIO_FAIL;
475  }
476  for (i = 0; i < o->n_layers; i++) {
477    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/moving_mean:0";
478    char_t key[PATH_MAX];
479    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
480    // get kernel matrix
481    vec = aubio_batchnorm_get_moving_mean(o->batchnorm_layers[i]);
482    // load params from hdf5 into kernel tensor
483    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
484      return AUBIO_FAIL;
485  }
486  for (i = 0; i < o->n_layers; i++) {
487    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/moving_variance:0";
488    char_t key[PATH_MAX];
489    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
490    // get kernel matrix
491    vec = aubio_batchnorm_get_moving_variance(o->batchnorm_layers[i]);
492    // load params from hdf5 into kernel tensor
493    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
494      return AUBIO_FAIL;
495  }
496
497  // dense layer
498  {
499    char_t *key = "/classifier/classifier_3/kernel:0";
500    fmat_t *d = aubio_dense_get_weights(o->dense_layer);
501    if (aubio_file_hdf5_load_dataset_into_matrix(hdf5, key, d))
502      return AUBIO_FAIL;
503
504    key = "/classifier/classifier_3/bias:0";
505    fvec_t *v = aubio_dense_get_bias(o->dense_layer);
506    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, v))
507      return AUBIO_FAIL;
508  }
509
510  if (hdf5) {
511    del_aubio_file_hdf5(hdf5);
512  }
513
514  return AUBIO_OK;
515#else
516  AUBIO_ASSERT(o);
517  AUBIO_ERR("pitch_crepe: hdf5 support was not built in, failed loading"
518      " crepe model\n");
519  return AUBIO_FAIL;
520#endif
521}
Note: See TracBrowser for help on using the repository browser.