source: src/pitch/pitch_crepe.c @ 7ba59f1

feature/crepe
Last change on this file since 7ba59f1 was 986e4b7, checked in by Paul Brossier <piem@piem.org>, 3 years ago

[pitch_crepe] reorder members, add comment

  • Property mode set to 100644
File size: 15.3 KB
Line 
1/*
2  Copyright (C) 2018 Paul Brossier <piem@aubio.org>
3
4  This file is part of aubio.
5
6  aubio is free software: you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation, either version 3 of the License, or
9  (at your option) any later version.
10
11  aubio is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  GNU General Public License for more details.
15
16  You should have received a copy of the GNU General Public License
17  along with aubio.  If not, see <http://www.gnu.org/licenses/>.
18
19*/
20
21/* CREPE pitch algorithm
22
23 References
24 ----------
25
26 CREPE: A Convolutional Representation for Pitch Estimation Jong Wook Kim,
27 Justin Salamon, Peter Li, Juan Pablo Bello.  Proceedings of the IEEE
28 International Conference on Acoustics, Speech, and Signal Processing (ICASSP),
29 2018. Available online at https://arxiv.org/abs/1802.06182
30
31 Original implementation available at https://github.com/marl/crepe
32
33*/
34
35#include "aubio_priv.h"
36
37#include "fmat.h"
38#include "ai/tensor.h"
39#include "ai/activation.h"
40#include "ai/conv1d.h"
41#include "ai/maxpool1d.h"
42#include "ai/batchnorm.h"
43#include "ai/dense.h"
44#include "io/file_hdf5.h"
45#include "utils/scale.h"
46
47#define HDF5_FILE_PATH "crepe-model-tiny.h5"
48
49// public prototypes
50typedef struct _aubio_pitch_crepe_t aubio_pitch_crepe_t;
51aubio_pitch_crepe_t *new_aubio_pitch_crepe(void);
52void aubio_pitch_crepe_do(aubio_pitch_crepe_t *t, fvec_t *input, fvec_t *out);
53void del_aubio_pitch_crepe(aubio_pitch_crepe_t *t);
54smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t * o);
55uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o, smpl_t
56    tolerance);
57smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o);
58
59// static prototypes
60static uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o);
61
62struct _aubio_pitch_crepe_t
63{
64  // number of [conv, maxpool, batchnorm] groups
65  uint_t n_layers;
66  // layers
67  aubio_conv1d_t **conv_layers;
68  aubio_batchnorm_t **batchnorm_layers;
69  aubio_maxpool1d_t **maxpool_layers;
70  aubio_dense_t *dense_layer;
71  // input/output tensors
72  aubio_tensor_t *input_tensor;
73  aubio_tensor_t **conv_output;
74  aubio_tensor_t **batchnorm_output;
75  aubio_tensor_t **maxpool_output;
76  aubio_tensor_t *flattened;
77  aubio_tensor_t *dense_output;
78
79  smpl_t confidence;
80  smpl_t tolerance;
81  aubio_scale_t *scale;
82};
83
84aubio_pitch_crepe_t *new_aubio_pitch_crepe(void)
85{
86  aubio_pitch_crepe_t *o = AUBIO_NEW(aubio_pitch_crepe_t);
87  aubio_tensor_t *block_input;
88  // algorithm constants
89  uint_t input_shape[2] = {1024, 1};
90  uint_t capacity_modes[5] = {4, 8, 16, 24, 32};
91  uint_t n_filters[6] = {32, 4, 4, 4, 8, 16};
92  uint_t widths[6] = {512, 64, 64, 64, 64, 64};
93  uint_t maxpool_stride[1] = {2};
94  uint_t l0_stride[1] = {4};
95  uint_t n_dense = 360;
96
97  // local variables
98  uint_t capacity_mode = 0;
99  uint_t capacity = capacity_modes[capacity_mode];
100  uint_t output_shape[2];
101  uint_t i;
102
103#if defined(HAVE_BLAS) && defined(HAVE_OPENBLAS_CBLAS_H)
104  // workaround to prevent openblas from opening multiple threads, since
105  // the overhead appears to be higher than using a single thread.
106  openblas_set_num_threads(1);
107#endif
108
109  AUBIO_ASSERT (capacity_mode < 5 && (sint_t)capacity_mode >= 0);
110
111  o->n_layers = 6;
112  // create arrays of layers and tensors
113  o->conv_layers = AUBIO_ARRAY(aubio_conv1d_t*, o->n_layers);
114  o->conv_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
115  o->maxpool_layers = AUBIO_ARRAY(aubio_maxpool1d_t*, o->n_layers);
116  o->maxpool_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
117  o->batchnorm_layers = AUBIO_ARRAY(aubio_batchnorm_t*, o->n_layers);
118  o->batchnorm_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
119
120  if (!o->conv_layers || !o->conv_output
121      || !o->maxpool_layers || !o->maxpool_output
122      || !o->batchnorm_layers || !o->batchnorm_output)
123    goto failure;
124
125  // create layers
126  for (i = 0; i < o->n_layers; i++) {
127    uint_t kern_shape[1] = {widths[i]};
128    // create convolutional layers
129    o->conv_layers[i] = new_aubio_conv1d(n_filters[i] * capacity, kern_shape);
130    if (!o->conv_layers[i]) goto failure;
131    // set padding='same'
132    if (aubio_conv1d_set_padding_mode(o->conv_layers[i], "same") != AUBIO_OK) {
133      goto failure;
134    }
135    // set stride of first layer
136    if ((i == 0) && (aubio_conv1d_set_stride(o->conv_layers[0],
137            l0_stride) != AUBIO_OK) ) {
138      goto failure;
139    }
140
141    // create batchnorm layers
142    o->batchnorm_layers[i] = new_aubio_batchnorm(n_filters[i] * capacity);
143    if (!o->batchnorm_layers[i]) goto failure;
144
145    // create maxpool layers
146    o->maxpool_layers[i] = new_aubio_maxpool1d(maxpool_stride);
147    if (!o->maxpool_layers[i]) goto failure;
148  }
149
150  o->dense_layer = new_aubio_dense(n_dense);
151  if (!o->dense_layer) goto failure;
152
153  // create input/output tensors
154  o->input_tensor = new_aubio_tensor(2, input_shape);
155  if (!o->input_tensor) goto failure;
156  block_input = o->input_tensor;
157  for (i = 0; i < o->n_layers; i++) {
158    // get shape of conv1d output and create its tensor
159    if (aubio_conv1d_get_output_shape(o->conv_layers[i],
160          block_input, output_shape))
161      goto failure;
162    o->conv_output[i] = new_aubio_tensor(2, output_shape);
163    if (!o->conv_output[i]) goto failure;
164
165    // get shape of batchnorm output and create its tensor
166    if (aubio_batchnorm_get_output_shape(o->batchnorm_layers[i],
167          o->conv_output[i], output_shape))
168      goto failure;
169    o->batchnorm_output[i] = new_aubio_tensor(2, output_shape);
170    if (!o->batchnorm_output[i]) goto failure;
171
172    // get shape of maxpool1d output and create its tensor
173    if (aubio_maxpool1d_get_output_shape(o->maxpool_layers[i],
174          o->batchnorm_output[i], output_shape))
175      goto failure;
176    o->maxpool_output[i] = new_aubio_tensor(2, output_shape);
177    if (!o->maxpool_output[i]) goto failure;
178
179    // set input for next block
180    block_input = o->maxpool_output[i];
181  }
182
183  uint_t flattened_dim = o->maxpool_output[5]->shape[0];
184  flattened_dim *= o->maxpool_output[5]->shape[1];
185  uint_t dense_input[1] = {flattened_dim};
186  o->flattened = new_aubio_tensor(1, dense_input);
187  if (!o->flattened) goto failure;
188
189  // permute and flatten
190  aubio_tensor_t *permute_input = o->maxpool_output[5];
191  AUBIO_DBG("permute:           (%d, %d) ->"
192      " (%d, %d) (permutation=(2, 1))\n",
193      permute_input->shape[0], permute_input->shape[1],
194      permute_input->shape[1], permute_input->shape[0]);
195  AUBIO_DBG("flatten:           (%d, %d) -> (%d)\n",
196      permute_input->shape[1], permute_input->shape[0],
197      o->flattened->shape[0]);
198
199  if (aubio_dense_get_output_shape(o->dense_layer, o->flattened, output_shape))
200    goto failure;
201  o->dense_output = new_aubio_tensor(1, output_shape);
202  if (!o->dense_output) goto failure;
203
204  AUBIO_ASSERT(n_dense == output_shape[0]);
205
206  if (aubio_pitch_crepe_load_params(o))
207    goto failure;
208
209  // map output units to midi note
210  smpl_t start = 1997.379408437619;
211  smpl_t end = 7180.;
212  o->scale = new_aubio_scale(0., 359., start, start + end);
213  if (!o->scale) goto failure;
214
215  return o;
216
217failure:
218  del_aubio_pitch_crepe(o);
219  return NULL;
220}
221
222void del_aubio_pitch_crepe(aubio_pitch_crepe_t *o)
223{
224  uint_t i;
225  AUBIO_ASSERT(o);
226
227  if (o->input_tensor) {
228    del_aubio_tensor(o->input_tensor);
229  }
230
231  if (o->batchnorm_output) {
232    for (i = 0; i < o->n_layers; i++) {
233      if (o->batchnorm_output[i])
234        del_aubio_tensor(o->batchnorm_output[i]);
235    }
236    AUBIO_FREE(o->batchnorm_output);
237  }
238
239  if (o->batchnorm_layers) {
240    for (i = 0; i < o->n_layers; i++) {
241      if (o->batchnorm_layers[i])
242        del_aubio_batchnorm(o->batchnorm_layers[i]);
243    }
244    AUBIO_FREE(o->batchnorm_layers);
245  }
246
247  if (o->maxpool_output) {
248    for (i = 0; i < o->n_layers; i++) {
249      if (o->maxpool_output[i])
250        del_aubio_tensor(o->maxpool_output[i]);
251    }
252    AUBIO_FREE(o->maxpool_output);
253  }
254
255  if (o->maxpool_layers) {
256    for (i = 0; i < o->n_layers; i++) {
257      if (o->maxpool_layers[i])
258        del_aubio_maxpool1d(o->maxpool_layers[i]);
259    }
260    AUBIO_FREE(o->maxpool_layers);
261  }
262
263  if (o->conv_output) {
264    for (i = 0; i < o->n_layers; i++) {
265      if (o->conv_output[i])
266        del_aubio_tensor(o->conv_output[i]);
267    }
268    AUBIO_FREE(o->conv_output);
269  }
270
271  if (o->conv_layers) {
272    for (i = 0; i < o->n_layers; i++) {
273      if (o->conv_layers[i])
274        del_aubio_conv1d(o->conv_layers[i]);
275    }
276    AUBIO_FREE(o->conv_layers);
277  }
278
279  if (o->flattened) {
280    del_aubio_tensor(o->flattened);
281  }
282
283  if (o->dense_layer) {
284    del_aubio_dense(o->dense_layer);
285  }
286
287  if (o->dense_output) {
288    del_aubio_tensor(o->dense_output);
289  }
290
291  if (o->scale) {
292    del_aubio_scale(o->scale);
293  }
294
295  AUBIO_FREE(o);
296}
297
298void aubio_pitch_crepe_do(aubio_pitch_crepe_t *o, fvec_t *input, fvec_t *out)
299{
300  uint_t i;
301  AUBIO_ASSERT(o && input);
302  // copy input to input tensor
303  AUBIO_ASSERT(input->length == o->input_tensor->shape[0]);
304  // normalize frame, removing mean and dividing by std
305  smpl_t mean = fvec_mean(input);
306  fvec_add(input, -mean);
307  smpl_t std = 0.;
308  for (i = 0; i < input->length; i++) {
309    std += SQR(input->data[i]);
310  }
311  std = SQRT(std / (smpl_t)input->length);
312  if (std < 1.e-7) std = 1;
313
314  for (i = 0; i < input->length; i++) {
315    o->input_tensor->data[0][i] = input->data[i] / std;
316  }
317
318  aubio_tensor_t *block_input = o->input_tensor;
319  for (i = 0; i < o->n_layers; i++) {
320    aubio_conv1d_do(o->conv_layers[i], block_input,
321        o->conv_output[i]);
322    // relu activation
323    aubio_activation_relu(o->conv_output[i]);
324    aubio_batchnorm_do(o->batchnorm_layers[i], o->conv_output[i],
325        o->batchnorm_output[i]);
326    aubio_maxpool1d_do(o->maxpool_layers[i], o->batchnorm_output[i],
327        o->maxpool_output[i]);
328    block_input = o->maxpool_output[i];
329  }
330
331  aubio_tensor_t *permute_input = o->maxpool_output[5];
332  // perform flattening (permutation has no effect here, order unchanged)
333  AUBIO_ASSERT (permute_input->size == o->flattened->size);
334  for (i = 0; i < permute_input->size; i++) {
335    o->flattened->data[0][i] = permute_input->data[0][i];
336  }
337
338  // compute dense layer
339  aubio_dense_do(o->dense_layer, o->flattened, o->dense_output);
340
341  // sigmoid activation
342  aubio_activation_sigmoid(o->dense_output);
343
344#if 0
345  // print debug output
346  for (i = 0; i < o->n_layers; i++) {
347    AUBIO_DBG("pitch_crepe: conv1d[%d]    %f\n", i,
348        aubio_tensor_max(o->conv_output[i]));
349    AUBIO_DBG("pitch_crepe: batchnorm[%d] %f\n", i,
350        aubio_tensor_max(o->batchnorm_output[i]));
351    AUBIO_DBG("pitch_crepe: maxpool1d[%d] %f\n", i,
352        aubio_tensor_max(o->maxpool_output[i]));
353  }
354  AUBIO_DBG("pitch_crepe: dense %f\n", aubio_tensor_max(o->dense_output));
355#endif
356
357  // find maximum activation
358  fvec_t activations;
359  aubio_tensor_as_fvec(o->dense_output, &activations);
360  uint_t argmax = fvec_max_elem(&activations);
361  o->confidence = activations.data[argmax];
362
363  // skip frames with no activation at all (e.g. silence)
364  // or with insufficient confidence
365  if ((argmax == activations.length - 1)
366      || (o->confidence < o->tolerance)) {
367    out->data[0] = -100.;
368    o->confidence = 0;
369    return;
370  }
371
372  // perform interpolation across neighbouring outputs
373  sint_t start = MAX(0, (sint_t)argmax - 4);
374  uint_t end = MIN(argmax + 5, activations.length);
375
376  smpl_t prod = 0;
377  smpl_t weight = 0;
378  smpl_t scaling = 0;
379  for (i = start; i < end; i++) {
380    scaling = (smpl_t)(i);
381    prod += activations.data[i] * scaling;
382    weight += activations.data[i];
383  }
384  out->data[0] = prod / weight;
385
386  // map output units to midi output
387  aubio_scale_do(o->scale, out);
388
389  // convert cents to midi
390  out->data[0] /= 100.;
391
392  // final bias (f_ref = 10Hz -> 3.48 midi)
393  out->data[0] += 3.486821174621582;
394}
395
396smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t* o)
397{
398  return o->confidence;
399}
400
401uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o,
402    smpl_t tolerance)
403{
404  if (o->tolerance < 0 || o->tolerance > 1) return AUBIO_FAIL;
405  o->tolerance = tolerance;
406  return AUBIO_OK;
407}
408
409smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o)
410{
411  return o->tolerance;
412}
413
414uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o)
415{
416  uint_t i;
417  aubio_tensor_t *k = NULL;
418  fvec_t *vec = NULL;
419
420  AUBIO_ASSERT(o);
421
422  aubio_file_hdf5_t *hdf5 = new_aubio_file_hdf5(HDF5_FILE_PATH);
423  if (!hdf5) return AUBIO_FAIL;
424
425  // get kernels
426  for (i = 0; i < o->n_layers; i++) {
427    char_t *fmt_key = "/conv%d/conv%d_3/kernel:0";
428    char_t key[PATH_MAX];
429    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
430    k = aubio_conv1d_get_kernel(o->conv_layers[i]);
431
432    // push dimension
433    k->shape[3] = k->shape[2]; k->shape[2] = k->shape[1]; k->shape[1] = 1;
434    k->ndim += 1;
435    // load params from hdf5 into kernel tensor
436    if (aubio_file_hdf5_load_dataset_into_tensor(hdf5, key, k))
437      return AUBIO_FAIL;
438    // pop dimension
439    k->shape[1] = k->shape[2]; k->shape[2] = k->shape[3]; k->shape[3] = 0;
440    k->ndim -= 1;
441  }
442
443  // get bias vectors
444  for (i = 0; i < o->n_layers; i++) {
445    char_t *fmt_key = "/conv%d/conv%d_3/bias:0";
446    char_t key[PATH_MAX];
447    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
448    vec = aubio_conv1d_get_bias(o->conv_layers[i]);
449    // load params from hdf5 into kernel tensor
450    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
451      return AUBIO_FAIL;
452  }
453
454  // batchnorm
455  for (i = 0; i < o->n_layers; i++) {
456    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/gamma:0";
457    char_t key[PATH_MAX];
458    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
459    // get kernel matrix
460    vec = aubio_batchnorm_get_gamma(o->batchnorm_layers[i]);
461    // load params from hdf5 into kernel tensor
462    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
463      return AUBIO_FAIL;
464  }
465  for (i = 0; i < o->n_layers; i++) {
466    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/beta:0";
467    char_t key[PATH_MAX];
468    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
469    // get kernel matrix
470    vec = aubio_batchnorm_get_beta(o->batchnorm_layers[i]);
471    // load params from hdf5 into kernel tensor
472    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
473      return AUBIO_FAIL;
474  }
475  for (i = 0; i < o->n_layers; i++) {
476    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/moving_mean:0";
477    char_t key[PATH_MAX];
478    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
479    // get kernel matrix
480    vec = aubio_batchnorm_get_moving_mean(o->batchnorm_layers[i]);
481    // load params from hdf5 into kernel tensor
482    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
483      return AUBIO_FAIL;
484  }
485  for (i = 0; i < o->n_layers; i++) {
486    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/moving_variance:0";
487    char_t key[PATH_MAX];
488    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
489    // get kernel matrix
490    vec = aubio_batchnorm_get_moving_variance(o->batchnorm_layers[i]);
491    // load params from hdf5 into kernel tensor
492    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
493      return AUBIO_FAIL;
494  }
495
496  // dense layer
497  {
498    char_t *key = "/classifier/classifier_3/kernel:0";
499    fmat_t *d = aubio_dense_get_weights(o->dense_layer);
500    if (aubio_file_hdf5_load_dataset_into_matrix(hdf5, key, d))
501      return AUBIO_FAIL;
502
503    key = "/classifier/classifier_3/bias:0";
504    fvec_t *v = aubio_dense_get_bias(o->dense_layer);
505    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, v))
506      return AUBIO_FAIL;
507  }
508
509  if (hdf5) {
510    del_aubio_file_hdf5(hdf5);
511  }
512
513  return AUBIO_OK;
514}
Note: See TracBrowser for help on using the repository browser.