/*
  Copyright (C) 2018 Paul Brossier

  This file is part of aubio.

  aubio is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  aubio is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with aubio.  If not, see <http://www.gnu.org/licenses/>.

*/

/* CREPE pitch algorithm

 References
 ----------

 CREPE: A Convolutional Representation for Pitch Estimation Jong Wook Kim,
 Justin Salamon, Peter Li, Juan Pablo Bello.  Proceedings of the IEEE
 International Conference on Acoustics, Speech, and Signal Processing
 (ICASSP), 2018. Available online at https://arxiv.org/abs/1802.06182

 Original implementation available at https://github.com/marl/crepe

*/

#include "aubio_priv.h"

#include "fmat.h"
#include "ai/tensor.h"
#include "ai/conv1d.h"
#include "ai/maxpool1d.h"
#include "ai/batchnorm.h"
#include "ai/dense.h"
#include "io/file_hdf5.h"
#include "utils/scale.h"

#define HDF5_FILE_PATH "crepe-model-tiny.h5"

// public prototypes
typedef struct _aubio_pitch_crepe_t aubio_pitch_crepe_t;
aubio_pitch_crepe_t *new_aubio_pitch_crepe(void);
void aubio_pitch_crepe_do(aubio_pitch_crepe_t *t, fvec_t *input, fvec_t *out);
void del_aubio_pitch_crepe(aubio_pitch_crepe_t *t);
smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t * o);
uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o, smpl_t
    tolerance);
smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o);

// static prototypes
static uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o);
static uint_t aubio_pitch_crepe_format_key(char_t *key, uint_t size,
    const char_t *fmt_key, uint_t layer);
static uint_t aubio_pitch_crepe_load_vector(aubio_file_hdf5_t *hdf5,
    const char_t *fmt_key, uint_t layer, fvec_t *vec);
static void aubio_pitch_crepe_del_tensors(aubio_tensor_t **tensors,
    uint_t n_tensors);

/** CREPE pitch detection object

  Holds `n_layers` groups of [conv1d, batchnorm, maxpool1d] layers, a final
  dense layer, and one pre-allocated output tensor per stage.

*/
struct _aubio_pitch_crepe_t
{
  // number of [conv, maxpool, batchnorm] groups
  uint_t n_layers;
  // layers
  aubio_conv1d_t **conv_layers;
  aubio_maxpool1d_t **maxpool_layers;
  aubio_batchnorm_t **batchnorm_layers;
  aubio_dense_t *dense_layer;
  // input/output tensors
  aubio_tensor_t *input_tensor;
  aubio_tensor_t **maxpool_output;
  aubio_tensor_t **batchnorm_output;
  aubio_tensor_t **conv_output;
  aubio_tensor_t *flattened;
  aubio_tensor_t *dense_output;

  // activation of the strongest output unit for the last processed frame
  smpl_t confidence;
  // minimum confidence below which a frame is reported as unpitched
  smpl_t tolerance;
  // maps the interpolated output unit index to cents
  aubio_scale_t *scale;
};

/** create a new CREPE pitch detection object

  Builds the network, allocates all intermediate tensors, and loads the
  weights from HDF5_FILE_PATH.

  \return newly created object, or NULL if any allocation, layer
  configuration, or parameter loading step failed

*/
aubio_pitch_crepe_t *new_aubio_pitch_crepe(void)
{
  // algorithm constants
  uint_t input_shape[2] = {1024, 1};
  uint_t capacity_modes[5] = {4, 8, 16, 24, 32};
  uint_t n_filters[6] = {32, 4, 4, 4, 8, 16};
  uint_t widths[6] = {512, 64, 64, 64, 64, 64};
  uint_t maxpool_stride[1] = {2};
  uint_t l0_stride[1] = {4};
  uint_t n_dense = 360;
  // bounds used to map output units to cents
  smpl_t start = 1997.379408437619;
  smpl_t end = 7180.;

  // local variables
  uint_t capacity_mode = 0;
  uint_t capacity;
  uint_t output_shape[2];
  uint_t dense_input[1];
  uint_t i;
  aubio_tensor_t *block_input;
  aubio_tensor_t *permute_input;
  aubio_pitch_crepe_t *o;

  // validate the index before using it to read capacity_modes
  AUBIO_ASSERT (capacity_mode < 5 && (sint_t)capacity_mode >= 0);
  capacity = capacity_modes[capacity_mode];

  o = AUBIO_NEW(aubio_pitch_crepe_t);
  if (!o) return NULL;

  // n_filters and widths above hold exactly this many entries
  o->n_layers = 6;
  // explicit defaults (a no-op if AUBIO_NEW zero-fills, which the NULL
  // checks in del_aubio_pitch_crepe already presume -- TODO confirm)
  o->confidence = 0.;
  o->tolerance = 0.;

  // create arrays of layers and tensors
  o->conv_layers = AUBIO_ARRAY(aubio_conv1d_t*, o->n_layers);
  o->conv_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
  o->maxpool_layers = AUBIO_ARRAY(aubio_maxpool1d_t*, o->n_layers);
  o->maxpool_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);
  o->batchnorm_layers = AUBIO_ARRAY(aubio_batchnorm_t*, o->n_layers);
  o->batchnorm_output = AUBIO_ARRAY(aubio_tensor_t*, o->n_layers);

  if (!o->conv_layers || !o->conv_output
      || !o->maxpool_layers || !o->maxpool_output
      || !o->batchnorm_layers || !o->batchnorm_output)
    goto failure;

  // create layers
  for (i = 0; i < o->n_layers; i++) {
    uint_t kern_shape[1] = {widths[i]};
    // create convolutional layers
    o->conv_layers[i] = new_aubio_conv1d(n_filters[i] * capacity, kern_shape);
    if (!o->conv_layers[i]) goto failure;
    // set padding='same'
    if (aubio_conv1d_set_padding_mode(o->conv_layers[i], "same") != AUBIO_OK) {
      goto failure;
    }
    // set stride of first layer
    if ((i == 0) && (aubio_conv1d_set_stride(o->conv_layers[0],
            l0_stride) != AUBIO_OK) ) {
      goto failure;
    }

    // create batchnorm layers
    o->batchnorm_layers[i] = new_aubio_batchnorm(n_filters[i] * capacity);
    if (!o->batchnorm_layers[i]) goto failure;

    // create maxpool layers
    o->maxpool_layers[i] = new_aubio_maxpool1d(maxpool_stride);
    if (!o->maxpool_layers[i]) goto failure;
  }

  o->dense_layer = new_aubio_dense(n_dense);
  if (!o->dense_layer) goto failure;

  // create input/output tensors
  o->input_tensor = new_aubio_tensor(2, input_shape);
  if (!o->input_tensor) goto failure;
  block_input = o->input_tensor;
  for (i = 0; i < o->n_layers; i++) {
    // get shape of conv1d output and create its tensor
    if (aubio_conv1d_get_output_shape(o->conv_layers[i],
          block_input, output_shape))
      goto failure;
    o->conv_output[i] = new_aubio_tensor(2, output_shape);
    if (!o->conv_output[i]) goto failure;

    // get shape of batchnorm output and create its tensor
    if (aubio_batchnorm_get_output_shape(o->batchnorm_layers[i],
          o->conv_output[i], output_shape))
      goto failure;
    o->batchnorm_output[i] = new_aubio_tensor(2, output_shape);
    if (!o->batchnorm_output[i]) goto failure;

    // get shape of maxpool1d output and create its tensor
    if (aubio_maxpool1d_get_output_shape(o->maxpool_layers[i],
          o->batchnorm_output[i], output_shape))
      goto failure;
    o->maxpool_output[i] = new_aubio_tensor(2, output_shape);
    if (!o->maxpool_output[i]) goto failure;

    // set input for next block
    block_input = o->maxpool_output[i];
  }

  // the output of the last block feeds the permute/flatten stage
  permute_input = o->maxpool_output[o->n_layers - 1];

  dense_input[0] = permute_input->shape[0] * permute_input->shape[1];
  o->flattened = new_aubio_tensor(1, dense_input);
  if (!o->flattened) goto failure;

  // permute and flatten
  AUBIO_DBG("permute:           (%d, %d) ->"
      " (%d, %d) (permutation=(2, 1))\n",
      permute_input->shape[0], permute_input->shape[1],
      permute_input->shape[1], permute_input->shape[0]);
  AUBIO_DBG("flatten:           (%d, %d) -> (%d)\n",
      permute_input->shape[1], permute_input->shape[0],
      o->flattened->shape[0]);

  if (aubio_dense_get_output_shape(o->dense_layer, o->flattened, output_shape))
    goto failure;
  o->dense_output = new_aubio_tensor(1, output_shape);
  if (!o->dense_output) goto failure;

  AUBIO_ASSERT(n_dense == output_shape[0]);

  if (aubio_pitch_crepe_load_params(o))
    goto failure;

  // map output units to midi note
  o->scale = new_aubio_scale(0., 359., start, start + end);
  if (!o->scale) goto failure;

  return o;

failure:
  del_aubio_pitch_crepe(o);
  return NULL;
}

/** delete an array of tensors along with the tensors it holds

  \param tensors array to delete, may be NULL or contain NULL entries
  \param n_tensors number of entries in the array

*/
static void aubio_pitch_crepe_del_tensors(aubio_tensor_t **tensors,
    uint_t n_tensors)
{
  uint_t i;
  if (!tensors) return;
  for (i = 0; i < n_tensors; i++) {
    if (tensors[i]) del_aubio_tensor(tensors[i]);
  }
  AUBIO_FREE(tensors);
}

/** delete a CREPE pitch detection object

  Safe to call on a partially constructed object: every member is checked
  before being released.

  \param o object to delete

*/
void del_aubio_pitch_crepe(aubio_pitch_crepe_t *o)
{
  uint_t i;
  AUBIO_ASSERT(o);
  // keep release builds (assert compiled out) from dereferencing NULL
  if (!o) return;

  if (o->input_tensor) {
    del_aubio_tensor(o->input_tensor);
  }

  aubio_pitch_crepe_del_tensors(o->batchnorm_output, o->n_layers);

  if (o->batchnorm_layers) {
    for (i = 0; i < o->n_layers; i++) {
      if (o->batchnorm_layers[i])
        del_aubio_batchnorm(o->batchnorm_layers[i]);
    }
    AUBIO_FREE(o->batchnorm_layers);
  }

  aubio_pitch_crepe_del_tensors(o->maxpool_output, o->n_layers);

  if (o->maxpool_layers) {
    for (i = 0; i < o->n_layers; i++) {
      if (o->maxpool_layers[i])
        del_aubio_maxpool1d(o->maxpool_layers[i]);
    }
    AUBIO_FREE(o->maxpool_layers);
  }

  aubio_pitch_crepe_del_tensors(o->conv_output, o->n_layers);

  if (o->conv_layers) {
    for (i = 0; i < o->n_layers; i++) {
      if (o->conv_layers[i])
        del_aubio_conv1d(o->conv_layers[i]);
    }
    AUBIO_FREE(o->conv_layers);
  }

  if (o->flattened) {
    del_aubio_tensor(o->flattened);
  }

  if (o->dense_layer) {
    del_aubio_dense(o->dense_layer);
  }

  if (o->dense_output) {
    del_aubio_tensor(o->dense_output);
  }

  if (o->scale) {
    del_aubio_scale(o->scale);
  }

  AUBIO_FREE(o);
}

/** estimate the pitch of one frame

  \param o pitch detection object
  \param input frame to analyse; its length must equal the first dimension
  of the input tensor. NOTE: the frame is modified in place, its mean is
  subtracted.
  \param out output vector; out->data[0] receives the pitch in midi, or
  -100. when the frame is skipped (in which case confidence is reset to 0)

*/
void aubio_pitch_crepe_do(aubio_pitch_crepe_t *o, fvec_t *input, fvec_t *out)
{
  uint_t i;

  AUBIO_ASSERT(o && input && out);
  // copy input to input tensor
  AUBIO_ASSERT(input->length == o->input_tensor->shape[0]);
  // normalize frame, removing mean and dividing by std
  smpl_t mean = fvec_mean(input);
  fvec_add(input, -mean);
  smpl_t std = 0.;
  for (i = 0; i < input->length; i++) {
    std += SQR(input->data[i]);
  }
  std = SQRT(std / (smpl_t)input->length);
  // avoid dividing by (almost) zero on constant frames
  if (std < 1.e-7) std = 1;

  for (i = 0; i < input->length; i++) {
    o->input_tensor->data[0][i] = input->data[i] / std;
  }

  aubio_tensor_t *block_input = o->input_tensor;
  for (i = 0; i < o->n_layers; i++) {
    aubio_conv1d_do(o->conv_layers[i], block_input,
        o->conv_output[i]);
    aubio_batchnorm_do(o->batchnorm_layers[i], o->conv_output[i],
        o->batchnorm_output[i]);
    aubio_maxpool1d_do(o->maxpool_layers[i], o->batchnorm_output[i],
        o->maxpool_output[i]);
    block_input = o->maxpool_output[i];
  }

  // output of the last block
  aubio_tensor_t *permute_input = o->maxpool_output[o->n_layers - 1];
  // perform flattening (permutation has no effect here, order unchanged)
  AUBIO_ASSERT (permute_input->size == o->flattened->size);
  for (i = 0; i < permute_input->size; i++) {
    o->flattened->data[0][i] = permute_input->data[0][i];
  }

  // compute dense layer
  aubio_dense_do(o->dense_layer, o->flattened, o->dense_output);

#if 0
  // print debug output
  for (i = 0; i < o->n_layers; i++) {
    AUBIO_DBG("pitch_crepe: conv1d[%d]    %f\n", i,
        aubio_tensor_max(o->conv_output[i]));
    AUBIO_DBG("pitch_crepe: batchnorm[%d] %f\n", i,
        aubio_tensor_max(o->batchnorm_output[i]));
    AUBIO_DBG("pitch_crepe: maxpool1d[%d] %f\n", i,
        aubio_tensor_max(o->maxpool_output[i]));
  }
  AUBIO_DBG("pitch_crepe: dense %f\n", aubio_tensor_max(o->dense_output));
#endif

  // find maximum activation
  fvec_t activations;
  aubio_tensor_as_fvec(o->dense_output, &activations);
  uint_t argmax = fvec_max_elem(&activations);
  o->confidence = activations.data[argmax];

  // skip frames with no activation at all (e.g. silence)
  // or with insufficient confidence
  if ((argmax == activations.length - 1)
      || (o->confidence < o->tolerance)) {
    out->data[0] = -100.;
    o->confidence = 0;
    return;
  }

  // perform interpolation across neighbouring outputs
  sint_t start = MAX(0, (sint_t)argmax - 4);
  uint_t end = MIN(argmax + 5, activations.length);

  smpl_t prod = 0;
  smpl_t weight = 0;
  for (i = start; i < end; i++) {
    prod += activations.data[i] * (smpl_t)(i);
    weight += activations.data[i];
  }

  // a null weight would turn the weighted mean into NaN: report no pitch
  if (weight == 0) {
    out->data[0] = -100.;
    o->confidence = 0;
    return;
  }
  out->data[0] = prod / weight;

  // map output units to midi output
  aubio_scale_do(o->scale, out);

  // convert cents to midi
  out->data[0] /= 100.;

  // final bias (f_ref = 10Hz -> 3.48 midi)
  out->data[0] += 3.486821174621582;
}

/** get the confidence of the last processed frame

  \param o pitch detection object
  \return activation of the strongest output unit, or 0 if the frame was
  skipped

*/
smpl_t aubio_pitch_crepe_get_confidence (aubio_pitch_crepe_t* o)
{
  return o->confidence;
}

/** set the confidence threshold under which frames are skipped

  \param o pitch detection object
  \param tolerance new threshold, in the range [0, 1]
  \return AUBIO_OK on success, AUBIO_FAIL if `tolerance` is out of range
  (the stored value is then left untouched)

*/
uint_t aubio_pitch_crepe_set_tolerance(aubio_pitch_crepe_t * o,
    smpl_t tolerance)
{
  // validate the requested value, not the one currently stored
  if (tolerance < 0 || tolerance > 1) return AUBIO_FAIL;
  o->tolerance = tolerance;
  return AUBIO_OK;
}

/** get the confidence threshold under which frames are skipped

  \param o pitch detection object
  \return current threshold

*/
smpl_t aubio_pitch_crepe_get_tolerance (aubio_pitch_crepe_t * o)
{
  return o->tolerance;
}

/** build a dataset key from a format taking the 1-based layer number twice

  \param key output buffer
  \param size size of the output buffer
  \param fmt_key format string containing two `%d` conversions
  \param layer 0-based layer index
  \return AUBIO_OK on success, AUBIO_FAIL if formatting failed or the key
  did not fit in the buffer

*/
static uint_t aubio_pitch_crepe_format_key(char_t *key, uint_t size,
    const char_t *fmt_key, uint_t layer)
{
  // dataset names are numbered from 1
  int number = (int)(layer + 1);
  int written = snprintf(key, size, fmt_key, number, number);
  if (written < 0 || (uint_t)written >= size) return AUBIO_FAIL;
  return AUBIO_OK;
}

/** load one per-layer dataset into a vector

  \param hdf5 opened parameter file
  \param fmt_key format of the dataset key (see aubio_pitch_crepe_format_key)
  \param layer 0-based layer index
  \param vec destination vector
  \return AUBIO_OK on success, AUBIO_FAIL otherwise

*/
static uint_t aubio_pitch_crepe_load_vector(aubio_file_hdf5_t *hdf5,
    const char_t *fmt_key, uint_t layer, fvec_t *vec)
{
  char_t key[PATH_MAX];
  if (!vec) return AUBIO_FAIL;
  if (aubio_pitch_crepe_format_key(key, sizeof(key), fmt_key, layer))
    return AUBIO_FAIL;
  if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, vec))
    return AUBIO_FAIL;
  return AUBIO_OK;
}

/** load all network parameters from HDF5_FILE_PATH

  Fills the kernels and biases of the conv1d layers, the gamma, beta,
  moving mean and moving variance of the batchnorm layers, and the weights
  and bias of the dense layer. The file is closed on every exit path.

  \param o pitch detection object with all its layers already created
  \return AUBIO_OK on success, AUBIO_FAIL if the file could not be opened
  or any dataset could not be loaded

*/
static uint_t aubio_pitch_crepe_load_params(aubio_pitch_crepe_t *o)
{
  uint_t ret = AUBIO_FAIL;
  uint_t i;
  char_t key[PATH_MAX];
  aubio_tensor_t *k = NULL;
  fmat_t *weights = NULL;
  fvec_t *bias = NULL;
  aubio_file_hdf5_t *hdf5 = NULL;

  AUBIO_ASSERT(o);

  hdf5 = new_aubio_file_hdf5(HDF5_FILE_PATH);
  if (!hdf5) return AUBIO_FAIL;

  // get kernels
  for (i = 0; i < o->n_layers; i++) {
    uint_t failed;
    if (aubio_pitch_crepe_format_key(key, sizeof(key),
          "/conv%d/conv%d_3/kernel:0", i))
      goto beach;
    k = aubio_conv1d_get_kernel(o->conv_layers[i]);
    if (!k) goto beach;

    // push dimension
    k->shape[3] = k->shape[2]; k->shape[2] = k->shape[1]; k->shape[1] = 1;
    k->ndim += 1;
    // load params from hdf5 into kernel tensor
    failed = aubio_file_hdf5_load_dataset_into_tensor(hdf5, key, k);
    // pop dimension, even on failure, so the kernel keeps its real shape
    k->shape[1] = k->shape[2]; k->shape[2] = k->shape[3]; k->shape[3] = 0;
    k->ndim -= 1;

    if (failed) goto beach;
  }

  // get bias vectors
  for (i = 0; i < o->n_layers; i++) {
    if (aubio_pitch_crepe_load_vector(hdf5, "/conv%d/conv%d_3/bias:0", i,
          aubio_conv1d_get_bias(o->conv_layers[i])))
      goto beach;
  }

  // batchnorm
  for (i = 0; i < o->n_layers; i++) {
    if (aubio_pitch_crepe_load_vector(hdf5,
          "/conv%d-BN/conv%d-BN_3/gamma:0", i,
          aubio_batchnorm_get_gamma(o->batchnorm_layers[i])))
      goto beach;
  }
  for (i = 0; i < o->n_layers; i++) {
    if (aubio_pitch_crepe_load_vector(hdf5,
          "/conv%d-BN/conv%d-BN_3/beta:0", i,
          aubio_batchnorm_get_beta(o->batchnorm_layers[i])))
      goto beach;
  }
  for (i = 0; i < o->n_layers; i++) {
    if (aubio_pitch_crepe_load_vector(hdf5,
          "/conv%d-BN/conv%d-BN_3/moving_mean:0", i,
          aubio_batchnorm_get_moving_mean(o->batchnorm_layers[i])))
      goto beach;
  }
  for (i = 0; i < o->n_layers; i++) {
    if (aubio_pitch_crepe_load_vector(hdf5,
          "/conv%d-BN/conv%d-BN_3/moving_variance:0", i,
          aubio_batchnorm_get_moving_variance(o->batchnorm_layers[i])))
      goto beach;
  }

  // dense layer
  snprintf(key, sizeof(key), "%s", "/classifier/classifier_3/kernel:0");
  weights = aubio_dense_get_weights(o->dense_layer);
  if (aubio_file_hdf5_load_dataset_into_matrix(hdf5, key, weights))
    goto beach;

  snprintf(key, sizeof(key), "%s", "/classifier/classifier_3/bias:0");
  bias = aubio_dense_get_bias(o->dense_layer);
  if (aubio_file_hdf5_load_dataset_into_vector(hdf5, key, bias))
    goto beach;

  ret = AUBIO_OK;

beach:
  // single exit point: the file is closed on success and on failure
  del_aubio_file_hdf5(hdf5);
  return ret;
}