source: src/pitch/pitch_crepe.c @ dd4e5d5

feature/crepe
Last change on this file since dd4e5d5 was dd4e5d5, checked in by Paul Brossier <piem@piem.org>, 2 years ago

[crepe] prevent openblas from opening threads

  • Property mode set to 100644
File size: 15.1 KB
Line 
1/*
2  Copyright (C) 2018 Paul Brossier <piem@aubio.org>
3
4  This file is part of aubio.
5
6  aubio is free software: you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation, either version 3 of the License, or
9  (at your option) any later version.
10
11  aubio is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  GNU General Public License for more details.
15
16  You should have received a copy of the GNU General Public License
17  along with aubio.  If not, see <http://www.gnu.org/licenses/>.
18
19*/
20
21/* CREPE pitch algorithm
22
23 References
24 ----------
25
26 CREPE: A Convolutional Representation for Pitch Estimation Jong Wook Kim,
27 Justin Salamon, Peter Li, Juan Pablo Bello.  Proceedings of the IEEE
28 International Conference on Acoustics, Speech, and Signal Processing (ICASSP),
29 2018. Available online at https://arxiv.org/abs/1802.06182
30
31 Original implementation available at https://github.com/marl/crepe
32
33*/
34
35#include "aubio_priv.h"
36
37#include "fmat.h"
38#include "ai/tensor.h"
39#include "ai/conv1d.h"
40#include "ai/maxpool1d.h"
41#include "ai/batchnorm.h"
42#include "ai/dense.h"
43#include "io/file_hdf5.h"
44#include "utils/scale.h"
45
46#define HDF5_FILE_PATH "crepe-model-tiny.h5"
47
48// public prototypes
49typedef struct _aubio_pitch_crepe_t aubio_pitch_crepe_t;
50aubio_pitch_crepe_t *new_aubio_pitch_crepe(void);
51void aubio_pitch_crepe_do(aubio_pitch_crepe_t *t, fvec_t *input, fvec_t *out);
52void del_aubio_pitch_crepe(aubio_pitch_crepe_t *t);
53smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t * o);
54uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o, smpl_t
55    tolerance);
56smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o);
57
58// static prototypes
59static uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o);
60
61struct _aubio_pitch_crepe_t
62{
63  // number of [conv, maxpool, batchnorm] groups
64  uint_t n_layers;
65  // layers
66  aubio_conv1d_t **conv_layers;
67  aubio_maxpool1d_t **maxpool_layers;
68  aubio_batchnorm_t **batchnorm_layers;
69  aubio_dense_t *dense_layer;
70  // input/output tensors
71  aubio_tensor_t *input_tensor;
72  aubio_tensor_t **maxpool_output;
73  aubio_tensor_t **batchnorm_output;
74  aubio_tensor_t **conv_output;
75  aubio_tensor_t *flattened;
76  aubio_tensor_t *dense_output;
77
78  smpl_t confidence;
79  smpl_t tolerance;
80  aubio_scale_t *scale;
81};
82
83aubio_pitch_crepe_t *new_aubio_pitch_crepe(void)
84{
85  aubio_pitch_crepe_t *o = AUBIO_NEW(aubio_pitch_crepe_t);
86  aubio_tensor_t *block_input;
87  // algorithm constants
88  uint_t input_shape[2] = {1024, 1};
89  uint_t capacity_modes[5] = {4, 8, 16, 24, 32};
90  uint_t n_filters[6] = {32, 4, 4, 4, 8, 16};
91  uint_t widths[6] = {512, 64, 64, 64, 64, 64};
92  uint_t maxpool_stride[1] = {2};
93  uint_t l0_stride[1] = {4};
94  uint_t n_dense = 360;
95
96  // local variables
97  uint_t capacity_mode = 0;
98  uint_t capacity = capacity_modes[capacity_mode];
99  uint_t output_shape[2];
100  uint_t i;
101
102#if defined(HAVE_BLAS) && defined(HAVE_OPENBLAS_CBLAS_H)
103  // workaround to prevent openblas from opening multiple threads, since
104  // the overhead appears to be higher than using a single thread.
105  openblas_set_num_threads(1);
106#endif
107
108  AUBIO_ASSERT (capacity_mode < 5 && (sint_t)capacity_mode >= 0);
109
110  o->n_layers = 6;
111  // create arrays of layers and tensors
112  o->conv_layers = AUBIO_ARRAY(aubio_conv1d_t*, o->n_layers);
113  o->conv_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
114  o->maxpool_layers = AUBIO_ARRAY(aubio_maxpool1d_t*, o->n_layers);
115  o->maxpool_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
116  o->batchnorm_layers = AUBIO_ARRAY(aubio_batchnorm_t*, o->n_layers);
117  o->batchnorm_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
118
119  if (!o->conv_layers || !o->conv_output
120      || !o->maxpool_layers || !o->maxpool_output
121      || !o->batchnorm_layers || !o->batchnorm_output)
122    goto failure;
123
124  // create layers
125  for (i = 0; i < o->n_layers; i++) {
126    uint_t kern_shape[1] = {widths[i]};
127    // create convolutional layers
128    o->conv_layers[i] = new_aubio_conv1d(n_filters[i] * capacity, kern_shape);
129    if (!o->conv_layers[i]) goto failure;
130    // set padding='same'
131    if (aubio_conv1d_set_padding_mode(o->conv_layers[i], "same") != AUBIO_OK) {
132      goto failure;
133    }
134    // set stride of first layer
135    if ((i == 0) && (aubio_conv1d_set_stride(o->conv_layers[0],
136            l0_stride) != AUBIO_OK) ) {
137      goto failure;
138    }
139
140    // create batchnorm layers
141    o->batchnorm_layers[i] = new_aubio_batchnorm(n_filters[i] * capacity);
142    if (!o->batchnorm_layers[i]) goto failure;
143
144    // create maxpool layers
145    o->maxpool_layers[i] = new_aubio_maxpool1d(maxpool_stride);
146    if (!o->maxpool_layers[i]) goto failure;
147  }
148
149  o->dense_layer = new_aubio_dense(n_dense);
150  if (!o->dense_layer) goto failure;
151
152  // create input/output tensors
153  o->input_tensor = new_aubio_tensor(2, input_shape);
154  if (!o->input_tensor) goto failure;
155  block_input = o->input_tensor;
156  for (i = 0; i < o->n_layers; i++) {
157    // get shape of conv1d output and create its tensor
158    if (aubio_conv1d_get_output_shape(o->conv_layers[i],
159          block_input, output_shape))
160      goto failure;
161    o->conv_output[i] = new_aubio_tensor(2, output_shape);
162    if (!o->conv_output[i]) goto failure;
163
164    // get shape of batchnorm output and create its tensor
165    if (aubio_batchnorm_get_output_shape(o->batchnorm_layers[i],
166          o->conv_output[i], output_shape))
167      goto failure;
168    o->batchnorm_output[i] = new_aubio_tensor(2, output_shape);
169    if (!o->batchnorm_output[i]) goto failure;
170
171    // get shape of maxpool1d output and create its tensor
172    if (aubio_maxpool1d_get_output_shape(o->maxpool_layers[i],
173          o->batchnorm_output[i], output_shape))
174      goto failure;
175    o->maxpool_output[i] = new_aubio_tensor(2, output_shape);
176    if (!o->maxpool_output[i]) goto failure;
177
178    // set input for next block
179    block_input = o->maxpool_output[i];
180  }
181
182  uint_t flattened_dim = o->maxpool_output[5]->shape[0];
183  flattened_dim *= o->maxpool_output[5]->shape[1];
184  uint_t dense_input[1] = {flattened_dim};
185  o->flattened = new_aubio_tensor(1, dense_input);
186  if (!o->flattened) goto failure;
187
188  // permute and flatten
189  aubio_tensor_t *permute_input = o->maxpool_output[5];
190  AUBIO_DBG("permute:           (%d, %d) ->"
191      " (%d, %d) (permutation=(2, 1))\n",
192      permute_input->shape[0], permute_input->shape[1],
193      permute_input->shape[1], permute_input->shape[0]);
194  AUBIO_DBG("flatten:           (%d, %d) -> (%d)\n",
195      permute_input->shape[1], permute_input->shape[0],
196      o->flattened->shape[0]);
197
198  if (aubio_dense_get_output_shape(o->dense_layer, o->flattened, output_shape))
199    goto failure;
200  o->dense_output = new_aubio_tensor(1, output_shape);
201  if (!o->dense_output) goto failure;
202
203  AUBIO_ASSERT(n_dense == output_shape[0]);
204
205  if (aubio_pitch_crepe_load_params(o))
206    goto failure;
207
208  // map output units to midi note
209  smpl_t start = 1997.379408437619;
210  smpl_t end = 7180.;
211  o->scale = new_aubio_scale(0., 359., start, start + end);
212  if (!o->scale) goto failure;
213
214  return o;
215
216failure:
217  del_aubio_pitch_crepe(o);
218  return NULL;
219}
220
221void del_aubio_pitch_crepe(aubio_pitch_crepe_t *o)
222{
223  uint_t i;
224  AUBIO_ASSERT(o);
225
226  if (o->input_tensor) {
227    del_aubio_tensor(o->input_tensor);
228  }
229
230  if (o->batchnorm_output) {
231    for (i = 0; i < o->n_layers; i++) {
232      if (o->batchnorm_output[i])
233        del_aubio_tensor(o->batchnorm_output[i]);
234    }
235    AUBIO_FREE(o->batchnorm_output);
236  }
237
238  if (o->batchnorm_layers) {
239    for (i = 0; i < o->n_layers; i++) {
240      if (o->batchnorm_layers[i])
241        del_aubio_batchnorm(o->batchnorm_layers[i]);
242    }
243    AUBIO_FREE(o->batchnorm_layers);
244  }
245
246  if (o->maxpool_output) {
247    for (i = 0; i < o->n_layers; i++) {
248      if (o->maxpool_output[i])
249        del_aubio_tensor(o->maxpool_output[i]);
250    }
251    AUBIO_FREE(o->maxpool_output);
252  }
253
254  if (o->maxpool_layers) {
255    for (i = 0; i < o->n_layers; i++) {
256      if (o->maxpool_layers[i])
257        del_aubio_maxpool1d(o->maxpool_layers[i]);
258    }
259    AUBIO_FREE(o->maxpool_layers);
260  }
261
262  if (o->conv_output) {
263    for (i = 0; i < o->n_layers; i++) {
264      if (o->conv_output[i])
265        del_aubio_tensor(o->conv_output[i]);
266    }
267    AUBIO_FREE(o->conv_output);
268  }
269
270  if (o->conv_layers) {
271    for (i = 0; i < o->n_layers; i++) {
272      if (o->conv_layers[i])
273        del_aubio_conv1d(o->conv_layers[i]);
274    }
275    AUBIO_FREE(o->conv_layers);
276  }
277
278  if (o->flattened) {
279    del_aubio_tensor(o->flattened);
280  }
281
282  if (o->dense_layer) {
283    del_aubio_dense(o->dense_layer);
284  }
285
286  if (o->dense_output) {
287    del_aubio_tensor(o->dense_output);
288  }
289
290  if (o->scale) {
291    del_aubio_scale(o->scale);
292  }
293
294  AUBIO_FREE(o);
295}
296
297void aubio_pitch_crepe_do(aubio_pitch_crepe_t *o, fvec_t *input, fvec_t *out)
298{
299  uint_t i;
300  AUBIO_ASSERT(o && input);
301  // copy input to input tensor
302  AUBIO_ASSERT(input->length == o->input_tensor->shape[0]);
303  // normalize frame, removing mean and dividing by std
304  smpl_t mean = fvec_mean(input);
305  fvec_add(input, -mean);
306  smpl_t std = 0.;
307  for (i = 0; i < input->length; i++) {
308    std += SQR(input->data[i]);
309  }
310  std = SQRT(std / (smpl_t)input->length);
311  if (std < 1.e-7) std = 1;
312
313  for (i = 0; i < input->length; i++) {
314    o->input_tensor->data[0][i] = input->data[i] / std;
315  }
316
317  aubio_tensor_t *block_input = o->input_tensor;
318  for (i = 0; i < o->n_layers; i++) {
319    aubio_conv1d_do(o->conv_layers[i], block_input,
320        o->conv_output[i]);
321    aubio_batchnorm_do(o->batchnorm_layers[i], o->conv_output[i],
322        o->batchnorm_output[i]);
323    aubio_maxpool1d_do(o->maxpool_layers[i], o->batchnorm_output[i],
324        o->maxpool_output[i]);
325    block_input = o->maxpool_output[i];
326  }
327
328  aubio_tensor_t *permute_input = o->maxpool_output[5];
329  // perform flattening (permutation has no effect here, order unchanged)
330  AUBIO_ASSERT (permute_input->size == o->flattened->size);
331  for (i = 0; i < permute_input->size; i++) {
332    o->flattened->data[0][i] = permute_input->data[0][i];
333  }
334
335  // compute dense layer
336  aubio_dense_do(o->dense_layer, o->flattened, o->dense_output);
337
338#if 0
339  // print debug output
340  for (i = 0; i < o->n_layers; i++) {
341    AUBIO_DBG("pitch_crepe: conv1d[%d]    %f\n", i,
342        aubio_tensor_max(o->conv_output[i]));
343    AUBIO_DBG("pitch_crepe: batchnorm[%d] %f\n", i,
344        aubio_tensor_max(o->batchnorm_output[i]));
345    AUBIO_DBG("pitch_crepe: maxpool1d[%d] %f\n", i,
346        aubio_tensor_max(o->maxpool_output[i]));
347  }
348  AUBIO_DBG("pitch_crepe: dense %f\n", aubio_tensor_max(o->dense_output));
349#endif
350
351  // find maximum activation
352  fvec_t activations;
353  aubio_tensor_as_fvec(o->dense_output, &activations);
354  uint_t argmax = fvec_max_elem(&activations);
355  o->confidence = activations.data[argmax];
356
357  // skip frames with no activation at all (e.g. silence)
358  // or with insufficient confidence
359  if ((argmax == activations.length - 1)
360      || (o->confidence < o->tolerance)) {
361    out->data[0] = -100.;
362    o->confidence = 0;
363    return;
364  }
365
366  // perform interpolation across neighbouring outputs
367  sint_t start = MAX(0, (sint_t)argmax - 4);
368  uint_t end = MIN(argmax + 5, activations.length);
369
370  smpl_t prod = 0;
371  smpl_t weight = 0;
372  smpl_t scaling = 0;
373  for (i = start; i < end; i++) {
374    scaling = (smpl_t)(i);
375    prod += activations.data[i] * scaling;
376    weight += activations.data[i];
377  }
378  out->data[0] = prod / weight;
379
380  // map output units to midi output
381  aubio_scale_do(o->scale, out);
382
383  // convert cents to midi
384  out->data[0] /= 100.;
385
386  // final bias (f_ref = 10Hz -> 3.48 midi)
387  out->data[0] += 3.486821174621582;
388}
389
390smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t* o)
391{
392  return o->confidence;
393}
394
395uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o,
396    smpl_t tolerance)
397{
398  if (o->tolerance < 0 || o->tolerance > 1) return AUBIO_FAIL;
399  o->tolerance = tolerance;
400  return AUBIO_OK;
401}
402
403smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o)
404{
405  return o->tolerance;
406}
407
408uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o)
409{
410  uint_t i;
411  aubio_tensor_t *k = NULL;
412  fvec_t *vec = NULL;
413
414  AUBIO_ASSERT(o);
415
416  aubio_file_hdf5_t *hdf5 = new_aubio_file_hdf5(HDF5_FILE_PATH);
417  if (!hdf5) return AUBIO_FAIL;
418
419  // get kernels
420  for (i = 0; i < o->n_layers; i++) {
421    char_t *fmt_key = "/conv%d/conv%d_3/kernel:0";
422    char_t key[PATH_MAX];
423    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
424    k = aubio_conv1d_get_kernel(o->conv_layers[i]);
425
426    // push dimension
427    k->shape[3] = k->shape[2]; k->shape[2] = k->shape[1]; k->shape[1] = 1;
428    k->ndim += 1;
429    // load params from hdf5 into kernel tensor
430    if (aubio_file_hdf5_load_dataset_into_tensor(hdf5, key, k))
431      return AUBIO_FAIL;
432    // pop dimension
433    k->shape[1] = k->shape[2]; k->shape[2] = k->shape[3]; k->shape[3] = 0;
434    k->ndim -= 1;
435  }
436
437  // get bias vectors
438  for (i = 0; i < o->n_layers; i++) {
439    char_t *fmt_key = "/conv%d/conv%d_3/bias:0";
440    char_t key[PATH_MAX];
441    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
442    vec = aubio_conv1d_get_bias(o->conv_layers[i]);
443    // load params from hdf5 into kernel tensor
444    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
445      return AUBIO_FAIL;
446  }
447
448  // batchnorm
449  for (i = 0; i < o->n_layers; i++) {
450    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/gamma:0";
451    char_t key[PATH_MAX];
452    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
453    // get kernel matrix
454    vec = aubio_batchnorm_get_gamma(o->batchnorm_layers[i]);
455    // load params from hdf5 into kernel tensor
456    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
457      return AUBIO_FAIL;
458  }
459  for (i = 0; i < o->n_layers; i++) {
460    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/beta:0";
461    char_t key[PATH_MAX];
462    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
463    // get kernel matrix
464    vec = aubio_batchnorm_get_beta(o->batchnorm_layers[i]);
465    // load params from hdf5 into kernel tensor
466    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
467      return AUBIO_FAIL;
468  }
469  for (i = 0; i < o->n_layers; i++) {
470    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/moving_mean:0";
471    char_t key[PATH_MAX];
472    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
473    // get kernel matrix
474    vec = aubio_batchnorm_get_moving_mean(o->batchnorm_layers[i]);
475    // load params from hdf5 into kernel tensor
476    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
477      return AUBIO_FAIL;
478  }
479  for (i = 0; i < o->n_layers; i++) {
480    char_t *fmt_key = "/conv%d-BN/conv%d-BN_3/moving_variance:0";
481    char_t key[PATH_MAX];
482    snprintf(key, sizeof(key), fmt_key, i+1, i+1);
483    // get kernel matrix
484    vec = aubio_batchnorm_get_moving_variance(o->batchnorm_layers[i]);
485    // load params from hdf5 into kernel tensor
486    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
487      return AUBIO_FAIL;
488  }
489
490  {
491    char_t *key = "/classifier/classifier_3/kernel:0";
492    fmat_t *d = aubio_dense_get_weights(o->dense_layer);
493    if (aubio_file_hdf5_load_dataset_into_matrix(hdf5, key, d))
494      return AUBIO_FAIL;
495
496    key = "/classifier/classifier_3/bias:0";
497    fvec_t *v = aubio_dense_get_bias(o->dense_layer);
498    if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, v))
499      return AUBIO_FAIL;
500  }
501
502  if (hdf5) {
503    del_aubio_file_hdf5(hdf5);
504  }
505
506  return AUBIO_OK;
507}
Note: See TracBrowser for help on using the repository browser.