source: src/pitch/pitch_crepe.c @ 815a73c

feature/crepe
Last change on this file since 815a73c was 815a73c, checked in by Paul Brossier <piem@piem.org>, 3 years ago

[pitch_crepe] avoid compiler warnings in release mode

  • Property mode set to 100644
File size: 15.5 KB
Line 
1/*
2  Copyright (C) 2018 Paul Brossier <piem@aubio.org>
3
4  This file is part of aubio.
5
6  aubio is free software: you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation, either version 3 of the License, or
9  (at your option) any later version.
10
11  aubio is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  GNU General Public License for more details.
15
16  You should have received a copy of the GNU General Public License
17  along with aubio.  If not, see <http://www.gnu.org/licenses/>.
18
19*/
20
21/* CREPE pitch algorithm
22
23 References
24 ----------
25
26 CREPE: A Convolutional Representation for Pitch Estimation Jong Wook Kim,
27 Justin Salamon, Peter Li, Juan Pablo Bello.  Proceedings of the IEEE
28 International Conference on Acoustics, Speech, and Signal Processing (ICASSP),
29 2018. Available online at https://arxiv.org/abs/1802.06182
30
31 Original implementation available at https://github.com/marl/crepe
32
33*/
34
35#include "aubio_priv.h"
36
37#include "fmat.h"
38#include "ai/tensor.h"
39#include "ai/activation.h"
40#include "ai/conv1d.h"
41#include "ai/maxpool1d.h"
42#include "ai/batchnorm.h"
43#include "ai/dense.h"
44#include "io/file_hdf5.h"
45#include "utils/scale.h"
46
47#define HDF5_FILE_PATH "crepe-model-tiny.h5"
48
49// public prototypes
50typedef struct _aubio_pitch_crepe_t aubio_pitch_crepe_t;
51aubio_pitch_crepe_t *new_aubio_pitch_crepe(void);
52void aubio_pitch_crepe_do(aubio_pitch_crepe_t *t, fvec_t *input, fvec_t *out);
53void del_aubio_pitch_crepe(aubio_pitch_crepe_t *t);
54smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t * o);
55uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o, smpl_t
56    tolerance);
57smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o);
58
59// static prototypes
60static uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o);
61
62struct _aubio_pitch_crepe_t
63{
64  // number of [conv, maxpool, batchnorm] groups
65  uint_t n_layers;
66  // layers
67  aubio_conv1d_t **conv_layers;
68  aubio_batchnorm_t **batchnorm_layers;
69  aubio_maxpool1d_t **maxpool_layers;
70  aubio_dense_t *dense_layer;
71  // input/output tensors
72  aubio_tensor_t *input_tensor;
73  aubio_tensor_t **conv_output;
74  aubio_tensor_t **batchnorm_output;
75  aubio_tensor_t **maxpool_output;
76  aubio_tensor_t *flattened;
77  aubio_tensor_t *dense_output;
78
79  smpl_t confidence;
80  smpl_t tolerance;
81  aubio_scale_t *scale;
82};
83
84aubio_pitch_crepe_t *new_aubio_pitch_crepe(void)
85{
86  aubio_pitch_crepe_t *o = AUBIO_NEW(aubio_pitch_crepe_t);
87  aubio_tensor_t *block_input;
88  // algorithm constants
89  uint_t input_shape[2] = {1024, 1};
90  uint_t capacity_modes[5] = {4, 8, 16, 24, 32};
91  uint_t n_filters[6] = {32, 4, 4, 4, 8, 16};
92  uint_t widths[6] = {512, 64, 64, 64, 64, 64};
93  uint_t maxpool_stride[1] = {2};
94  uint_t l0_stride[1] = {4};
95  uint_t n_dense = 360;
96
97  // local variables
98  uint_t capacity_mode = 0;
99  uint_t capacity = capacity_modes[capacity_mode];
100  uint_t output_shape[2];
101  uint_t i;
102
103#if defined(HAVE_BLAS) && defined(HAVE_OPENBLAS_CBLAS_H)
104  // workaround to prevent openblas from opening multiple threads, since
105  // the overhead appears to be higher than using a single thread.
106  openblas_set_num_threads(1);
107#endif
108
109  AUBIO_ASSERT (capacity_mode < 5 && (sint_t)capacity_mode >= 0);
110
111  o->n_layers = 6;
112  // create arrays of layers and tensors
113  o->conv_layers = AUBIO_ARRAY(aubio_conv1d_t*, o->n_layers);
114  o->conv_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
115  o->maxpool_layers = AUBIO_ARRAY(aubio_maxpool1d_t*, o->n_layers);
116  o->maxpool_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
117  o->batchnorm_layers = AUBIO_ARRAY(aubio_batchnorm_t*, o->n_layers);
118  o->batchnorm_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
119
120  if (!o->conv_layers || !o->conv_output
121      || !o->maxpool_layers || !o->maxpool_output
122      || !o->batchnorm_layers || !o->batchnorm_output)
123    goto failure;
124
125  // create layers
126  for (i = 0; i < o->n_layers; i++) {
127    uint_t kern_shape[1] = {widths[i]};
128    // create convolutional layers
129    o->conv_layers[i] = new_aubio_conv1d(n_filters[i] * capacity, kern_shape);
130    if (!o->conv_layers[i]) goto failure;
131    // set padding='same'
132    if (aubio_conv1d_set_padding_mode(o->conv_layers[i], "same") != AUBIO_OK) {
133      goto failure;
134    }
135    // set stride of first layer
136    if ((i == 0) && (aubio_conv1d_set_stride(o->conv_layers[0],
137            l0_stride) != AUBIO_OK) ) {
138      goto failure;
139    }
140
141    // create batchnorm layers
142    o->batchnorm_layers[i] = new_aubio_batchnorm();
143    if (!o->batchnorm_layers[i]) goto failure;
144
145    // create maxpool layers
146    o->maxpool_layers[i] = new_aubio_maxpool1d(maxpool_stride);
147    if (!o->maxpool_layers[i]) goto failure;
148  }
149
150  o->dense_layer = new_aubio_dense(n_dense);
151  if (!o->dense_layer) goto failure;
152
153  // create input/output tensors
154  o->input_tensor = new_aubio_tensor(2, input_shape);
155  if (!o->input_tensor) goto failure;
156  block_input = o->input_tensor;
157  for (i = 0; i < o->n_layers; i++) {
158    // get shape of conv1d output and create its tensor
159    if (aubio_conv1d_get_output_shape(o->conv_layers[i],
160          block_input, output_shape))
161      goto failure;
162    o->conv_output[i] = new_aubio_tensor(2, output_shape);
163    if (!o->conv_output[i]) goto failure;
164
165    // get shape of batchnorm output and create its tensor
166    if (aubio_batchnorm_get_output_shape(o->batchnorm_layers[i],
167          o->conv_output[i], output_shape))
168      goto failure;
169    o->batchnorm_output[i] = new_aubio_tensor(2, output_shape);
170    if (!o->batchnorm_output[i]) goto failure;
171
172    // get shape of maxpool1d output and create its tensor
173    if (aubio_maxpool1d_get_output_shape(o->maxpool_layers[i],
174          o->batchnorm_output[i], output_shape))
175      goto failure;
176    o->maxpool_output[i] = new_aubio_tensor(2, output_shape);
177    if (!o->maxpool_output[i]) goto failure;
178
179    // set input for next block
180    block_input = o->maxpool_output[i];
181  }
182
183  uint_t flattened_dim = o->maxpool_output[5]->shape[0];
184  flattened_dim *= o->maxpool_output[5]->shape[1];
185  uint_t dense_input[1] = {flattened_dim};
186  o->flattened = new_aubio_tensor(1, dense_input);
187  if (!o->flattened) goto failure;
188
189#if defined(DEBUG)
190  // permute and flatten
191  aubio_tensor_t *permute_input = o->maxpool_output[5];
192  AUBIO_DBG("permute:           (%d, %d) ->"
193      " (%d, %d) (permutation=(2, 1))\n",
194      permute_input->shape[0], permute_input->shape[1],
195      permute_input->shape[1], permute_input->shape[0]);
196  AUBIO_DBG("flatten:           (%d, %d) -> (%d)\n",
197      permute_input->shape[1], permute_input->shape[0],
198      o->flattened->shape[0]);
199#endif
200
201  if (aubio_dense_get_output_shape(o->dense_layer, o->flattened, output_shape))
202    goto failure;
203  o->dense_output = new_aubio_tensor(1, output_shape);
204  if (!o->dense_output) goto failure;
205
206  AUBIO_ASSERT(n_dense == output_shape[0]);
207
208  if (aubio_pitch_crepe_load_params(o))
209    goto failure;
210
211  // map output units to midi note
212  smpl_t start = 1997.379408437619;
213  smpl_t end = 7180.;
214  o->scale = new_aubio_scale(0., 359., start, start + end);
215  if (!o->scale) goto failure;
216
217  return o;
218
219failure:
220  del_aubio_pitch_crepe(o);
221  return NULL;
222}
223
224void del_aubio_pitch_crepe(aubio_pitch_crepe_t *o)
225{
226  uint_t i;
227  AUBIO_ASSERT(o);
228
229  if (o->input_tensor) {
230    del_aubio_tensor(o->input_tensor);
231  }
232
233  if (o->batchnorm_output) {
234    for (i = 0; i < o->n_layers; i++) {
235      if (o->batchnorm_output[i])
236        del_aubio_tensor(o->batchnorm_output[i]);
237    }
238    AUBIO_FREE(o->batchnorm_output);
239  }
240
241  if (o->batchnorm_layers) {
242    for (i = 0; i < o->n_layers; i++) {
243      if (o->batchnorm_layers[i])
244        del_aubio_batchnorm(o->batchnorm_layers[i]);
245    }
246    AUBIO_FREE(o->batchnorm_layers);
247  }
248
249  if (o->maxpool_output) {
250    for (i = 0; i < o->n_layers; i++) {
251      if (o->maxpool_output[i])
252        del_aubio_tensor(o->maxpool_output[i]);
253    }
254    AUBIO_FREE(o->maxpool_output);
255  }
256
257  if (o->maxpool_layers) {
258    for (i = 0; i < o->n_layers; i++) {
259      if (o->maxpool_layers[i])
260        del_aubio_maxpool1d(o->maxpool_layers[i]);
261    }
262    AUBIO_FREE(o->maxpool_layers);
263  }
264
265  if (o->conv_output) {
266    for (i = 0; i < o->n_layers; i++) {
267      if (o->conv_output[i])
268        del_aubio_tensor(o->conv_output[i]);
269    }
270    AUBIO_FREE(o->conv_output);
271  }
272
273  if (o->conv_layers) {
274    for (i = 0; i < o->n_layers; i++) {
275      if (o->conv_layers[i])
276        del_aubio_conv1d(o->conv_layers[i]);
277    }
278    AUBIO_FREE(o->conv_layers);
279  }
280
281  if (o->flattened) {
282    del_aubio_tensor(o->flattened);
283  }
284
285  if (o->dense_layer) {
286    del_aubio_dense(o->dense_layer);
287  }
288
289  if (o->dense_output) {
290    del_aubio_tensor(o->dense_output);
291  }
292
293  if (o->scale) {
294    del_aubio_scale(o->scale);
295  }
296
297  AUBIO_FREE(o);
298}
299
300void aubio_pitch_crepe_do(aubio_pitch_crepe_t *o, fvec_t *input, fvec_t *out)
301{
302  uint_t i;
303  AUBIO_ASSERT(o && input);
304  // copy input to input tensor
305  AUBIO_ASSERT(input->length == o->input_tensor->shape[0]);
306  // normalize frame, removing mean and dividing by std
307  smpl_t mean = fvec_mean(input);
308  fvec_add(input, -mean);
309  smpl_t std = 0.;
310  for (i = 0; i < input->length; i++) {
311    std += SQR(input->data[i]);
312  }
313  std = SQRT(std / (smpl_t)input->length);
314  if (std < 1.e-7) std = 1;
315
316  for (i = 0; i < input->length; i++) {
317    o->input_tensor->data[0][i] = input->data[i] / std;
318  }
319
320  aubio_tensor_t *block_input = o->input_tensor;
321  for (i = 0; i < o->n_layers; i++) {
322    aubio_conv1d_do(o->conv_layers[i], block_input,
323        o->conv_output[i]);
324    // relu activation
325    aubio_activation_relu(o->conv_output[i]);
326    aubio_batchnorm_do(o->batchnorm_layers[i], o->conv_output[i],
327        o->batchnorm_output[i]);
328    aubio_maxpool1d_do(o->maxpool_layers[i], o->batchnorm_output[i],
329        o->maxpool_output[i]);
330    block_input = o->maxpool_output[i];
331  }
332
333  aubio_tensor_t *permute_input = o->maxpool_output[5];
334  // perform flattening (permutation has no effect here, order unchanged)
335  AUBIO_ASSERT (permute_input->size == o->flattened->size);
336  for (i = 0; i < permute_input->size; i++) {
337    o->flattened->data[0][i] = permute_input->data[0][i];
338  }
339
340  // compute dense layer
341  aubio_dense_do(o->dense_layer, o->flattened, o->dense_output);
342
343  // sigmoid activation
344  aubio_activation_sigmoid(o->dense_output);
345
346#if 0
347  // print debug output
348  for (i = 0; i < o->n_layers; i++) {
349    AUBIO_DBG("pitch_crepe: conv1d[%d]    %f\n", i,
350        aubio_tensor_max(o->conv_output[i]));
351    AUBIO_DBG("pitch_crepe: batchnorm[%d] %f\n", i,
352        aubio_tensor_max(o->batchnorm_output[i]));
353    AUBIO_DBG("pitch_crepe: maxpool1d[%d] %f\n", i,
354        aubio_tensor_max(o->maxpool_output[i]));
355  }
356  AUBIO_DBG("pitch_crepe: dense %f\n", aubio_tensor_max(o->dense_output));
357#endif
358
359  // find maximum activation
360  fvec_t activations;
361  aubio_tensor_as_fvec(o->dense_output, &activations);
362  uint_t argmax = fvec_max_elem(&activations);
363  o->confidence = activations.data[argmax];
364
365  // skip frames with no activation at all (e.g. silence)
366  // or with insufficient confidence
367  if ((argmax == activations.length - 1)
368      || (o->confidence < o->tolerance)) {
369    out->data[0] = -100.;
370    o->confidence = 0;
371    return;
372  }
373
374  // perform interpolation across neighbouring outputs
375  sint_t start = MAX(0, (sint_t)argmax - 4);
376  uint_t end = MIN(argmax + 5, activations.length);
377
378  smpl_t prod = 0;
379  smpl_t weight = 0;
380  smpl_t scaling = 0;
381  for (i = start; i < end; i++) {
382    scaling = (smpl_t)(i);
383    prod += activations.data[i] * scaling;
384    weight += activations.data[i];
385  }
386  out->data[0] = prod / weight;
387
388  // map output units to midi output
389  aubio_scale_do(o->scale, out);
390
391  // convert cents to midi
392  out->data[0] /= 100.;
393
394  // final bias (f_ref = 10Hz -> 3.48 midi)
395  out->data[0] += 3.486821174621582;
396}
397
398smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t* o)
399{
400  return o->confidence;
401}
402
403uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o,
404    smpl_t tolerance)
405{
406  if (o->tolerance < 0 || o->tolerance > 1) return AUBIO_FAIL;
407  o->tolerance = tolerance;
408  return AUBIO_OK;
409}
410
411smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o)
412{
413  return o->tolerance;
414}
415
416uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o)
417{
418#ifdef HAVE_HDF5
419  uint_t i;
420  aubio_tensor_t *k = NULL;
421  fvec_t *vec = NULL;
422
423  AUBIO_ASSERT(o);
424
425  aubio_file_hdf5_t *hdf5 = new_aubio_file_hdf5(HDF5_FILE_PATH);
426  if (!hdf5) return AUBIO_FAIL;
427
428  // get kernels
429  for (i = 0; i < o->n_layers; i++) {
430    char_t *fmt_key = "/conv%d/conv%d_3/kernel:0";
431    char_t key[PATH_MAX];
432    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
433    k = aubio_conv1d_get_kernel(o->conv_layers[i]);
434
435    // push dimension
436    k->shape[3] = k->shape[2]; k->shape[2] = k->shape[1]; k->shape[1] = 1;
437    k->ndim += 1;
438    // load params from hdf5 into kernel tensor
439    if (aubio_file_hdf5_load_dataset_into_tensor(hdf5, key, k))
440      return AUBIO_FAIL;
441    // pop dimension
442    k->shape[1] = k->shape[2]; k->shape[2] = k->shape[3]; k->shape[3] = 0;
443    k->ndim -= 1;
444  }
445
446  // get bias vectors
447  for (i = 0; i < o->n_layers; i++) {
448    char_t *fmt_key = "/conv%d/conv%d_3/bias:0";
449    char_t key[PATH_MAX];
450    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
451    vec = aubio_conv1d_get_bias(o->conv_layers[i]);
452    // load params from hdf5 into kernel tensor
453    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
454      return AUBIO_FAIL;
455  }
456
457  // batchnorm
458  for (i = 0; i < o->n_layers; i++) {
459    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/gamma:0";
460    char_t key[PATH_MAX];
461    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
462    // get kernel matrix
463    vec = aubio_batchnorm_get_gamma(o->batchnorm_layers[i]);
464    // load params from hdf5 into kernel tensor
465    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
466      return AUBIO_FAIL;
467  }
468  for (i = 0; i < o->n_layers; i++) {
469    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/beta:0";
470    char_t key[PATH_MAX];
471    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
472    // get kernel matrix
473    vec = aubio_batchnorm_get_beta(o->batchnorm_layers[i]);
474    // load params from hdf5 into kernel tensor
475    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
476      return AUBIO_FAIL;
477  }
478  for (i = 0; i < o->n_layers; i++) {
479    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/moving_mean:0";
480    char_t key[PATH_MAX];
481    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
482    // get kernel matrix
483    vec = aubio_batchnorm_get_moving_mean(o->batchnorm_layers[i]);
484    // load params from hdf5 into kernel tensor
485    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
486      return AUBIO_FAIL;
487  }
488  for (i = 0; i < o->n_layers; i++) {
489    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/moving_variance:0";
490    char_t key[PATH_MAX];
491    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
492    // get kernel matrix
493    vec = aubio_batchnorm_get_moving_variance(o->batchnorm_layers[i]);
494    // load params from hdf5 into kernel tensor
495    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
496      return AUBIO_FAIL;
497  }
498
499  // dense layer
500  {
501    char_t *key = "/classifier/classifier_3/kernel:0";
502    fmat_t *d = aubio_dense_get_weights(o->dense_layer);
503    if (aubio_file_hdf5_load_dataset_into_matrix(hdf5, key, d))
504      return AUBIO_FAIL;
505
506    key = "/classifier/classifier_3/bias:0";
507    fvec_t *v = aubio_dense_get_bias(o->dense_layer);
508    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, v))
509      return AUBIO_FAIL;
510  }
511
512  if (hdf5) {
513    del_aubio_file_hdf5(hdf5);
514  }
515
516  return AUBIO_OK;
517#else
518  AUBIO_ASSERT(o);
519  AUBIO_UNUSED(o);
520  AUBIO_ERR("pitch_crepe: hdf5 support was not built in, failed loading"
521      " crepe model\n");
522  return AUBIO_FAIL;
523#endif
524}
Note: See TracBrowser for help on using the repository browser.