Source code for spafe.features.mfcc

import numpy as np
from ..utils.spectral import rfft, dct
from ..utils.cepstral import cms, cmvn, lifter_ceps
from ..utils.exceptions import ParameterError, ErrorMsgs
from ..utils.spectral import power_spectrum, audspec, postaud, invpostaud
from ..fbanks.mel_fbanks import inverse_mel_filter_banks, mel_filter_banks
from ..utils.preprocessing import pre_emphasis, framing, windowing, zero_handling


def mfcc(sig, fs=16000, num_ceps=13, pre_emph=0, pre_emph_coeff=0.97,
         win_len=0.025, win_hop=0.01, win_type="hamming", nfilts=26, nfft=512,
         low_freq=None, high_freq=None, scale="constant", dct_type=2,
         use_energy=False, lifter=22, normalize=1):
    """
    Compute MFCC features (Mel-frequency cepstral coefficients) from an audio
    signal. This function offers multiple approaches to features extraction
    depending on the input parameters. Implementation is using FFT and based on
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.63.8029&rep=rep1&type=pdf

          - take the absolute value of the FFT
          - warp to a Mel frequency scale
          - take the DCT of the log-Mel-spectrum
          - return the first <num_ceps> components

    Args:
        sig            (array) : a mono audio signal (Nx1) from which to compute features.
        fs               (int) : the sampling frequency of the signal we are working with.
                                 Default is 16000.
        num_ceps         (int) : number of cepstra to return.
                                 Default is 13.
        pre_emph         (int) : apply pre-emphasis if 1 (any truthy value).
                                 Default is 0.
        pre_emph_coeff (float) : coefficient forwarded to the pre-emphasis filter;
                                 only used when pre_emph is truthy.
                                 Default is 0.97.
        win_len        (float) : window length in sec.
                                 Default is 0.025.
        win_hop        (float) : step between successive windows in sec.
                                 Default is 0.01.
        win_type         (str) : window type to apply for the windowing.
                                 Default is "hamming".
        nfilts           (int) : the number of filters in the filterbank.
                                 Must not be smaller than num_ceps.
                                 Default is 26.
        nfft             (int) : number of FFT points.
                                 Default is 512.
        low_freq         (int) : lowest band edge of mel filters (Hz).
                                 None (or 0) means 0.
        high_freq        (int) : highest band edge of mel filters (Hz).
                                 None (or 0) means fs / 2.
        scale            (str) : choose if max bins amplitudes ascend, descend
                                 or are constant (=1).
                                 Default is "constant".
        dct_type         (int) : type of DCT used - 1 or 2 (or 3 for HTK or 4 for feac).
                                 Default is 2.
        use_energy      (bool) : overwrite C0 with the log of the frame energy.
                                 Default is False.
        lifter           (int) : apply liftering if value > 0.
                                 Default is 22.
        normalize        (int) : apply normalization (cms followed by cmvn) if truthy.
                                 Default is 1.

    Returns:
        (array) : features - the MFCC features: num_frames x num_ceps

    Raises:
        ParameterError : if low_freq < 0, if high_freq > fs / 2 or
                         if nfilts < num_ceps.
    """
    # init freqs: falsy values (None or 0) fall back to the defaults
    high_freq = high_freq or fs / 2
    low_freq = low_freq or 0

    # run checks
    if low_freq < 0:
        raise ParameterError(ErrorMsgs["low_freq"])
    if high_freq > (fs / 2):
        raise ParameterError(ErrorMsgs["high_freq"])
    if nfilts < num_ceps:
        raise ParameterError(ErrorMsgs["nfilts"])

    # pre-emphasis: forward the caller's coefficient rather than a fixed 0.97
    if pre_emph:
        sig = pre_emphasis(sig=sig, pre_emph_coeff=pre_emph_coeff)

    # -> framing
    frames, frame_length = framing(sig=sig,
                                   fs=fs,
                                   win_len=win_len,
                                   win_hop=win_hop)

    # -> windowing
    windows = windowing(frames=frames,
                        frame_len=frame_length,
                        win_type=win_type)

    # -> FFT -> |.|
    fourier_transform = rfft(x=windows, n=nfft)
    abs_fft_values = np.abs(fourier_transform)

    # -> x Mel-fbanks
    mel_fbanks_mat = mel_filter_banks(nfilts=nfilts,
                                      nfft=nfft,
                                      fs=fs,
                                      low_freq=low_freq,
                                      high_freq=high_freq,
                                      scale=scale)
    features = np.dot(abs_fft_values, mel_fbanks_mat.T)

    # -> log(.) -> DCT(.)
    # zero_handling is applied first so that np.log never sees exact zeros
    features_no_zero = zero_handling(features)
    log_features = np.log(features_no_zero)
    mfccs = dct(x=log_features, type=dct_type, axis=1,
                norm='ortho')[:, :num_ceps]

    # use energy for 1st features column
    if use_energy:
        # compute the power
        power_frames = power_spectrum(fourier_transform)

        # compute total energy in each frame
        frame_energies = np.sum(power_frames, 1)

        # handling zero energies before taking the log
        energy = zero_handling(frame_energies)
        mfccs[:, 0] = np.log(energy)

    # liftering
    if lifter > 0:
        mfccs = lifter_ceps(mfccs, lifter)

    # normalization
    if normalize:
        mfccs = cmvn(cms(mfccs))

    return mfccs
def imfcc(sig, fs=16000, num_ceps=13, pre_emph=0, pre_emph_coeff=0.97,
          win_len=0.025, win_hop=0.01, win_type="hamming", nfilts=26, nfft=512,
          low_freq=None, high_freq=None, scale="constant", dct_type=2,
          use_energy=False, lifter=22, normalize=1):
    """
    Compute Inverse MFCC features from an audio signal.

    The processing chain is: optional pre-emphasis -> framing -> windowing
    -> |FFT| -> inverse Mel filter banks -> log -> DCT, keeping the first
    <num_ceps> components.

    Args:
        sig            (array) : a mono audio signal (Nx1) from which to compute features.
        fs               (int) : the sampling frequency of the signal we are working with.
                                 Default is 16000.
        num_ceps         (int) : number of cepstra to return.
                                 Default is 13.
        pre_emph         (int) : apply pre-emphasis if 1 (any truthy value).
                                 Default is 0.
        pre_emph_coeff (float) : coefficient forwarded to the pre-emphasis filter;
                                 only used when pre_emph is truthy.
                                 Default is 0.97.
        win_len        (float) : window length in sec.
                                 Default is 0.025.
        win_hop        (float) : step between successive windows in sec.
                                 Default is 0.01.
        win_type         (str) : window type to apply for the windowing.
                                 Default is "hamming".
        nfilts           (int) : the number of filters in the filterbank.
                                 Must not be smaller than num_ceps.
                                 Default is 26.
        nfft             (int) : number of FFT points.
                                 Default is 512.
        low_freq         (int) : lowest band edge of mel filters (Hz).
                                 None (or 0) means 0.
        high_freq        (int) : highest band edge of mel filters (Hz).
                                 None (or 0) means fs / 2.
        scale            (str) : choose if max bins amplitudes ascend, descend
                                 or are constant (=1).
                                 Default is "constant".
        dct_type         (int) : type of DCT used - 1 or 2 (or 3 for HTK or 4 for feac).
                                 Default is 2.
        use_energy      (bool) : overwrite C0 with the log of the frame energy.
                                 Default is False.
        lifter           (int) : apply liftering if value > 0.
                                 Default is 22.
        normalize        (int) : apply normalization (cms followed by cmvn) if truthy.
                                 Default is 1.

    Returns:
        (array) : features - the IMFCC features: num_frames x num_ceps

    Raises:
        ParameterError : if low_freq < 0, if high_freq > fs / 2 or
                         if nfilts < num_ceps.
    """
    # init freqs: falsy values (None or 0) fall back to the defaults
    high_freq = high_freq or fs / 2
    low_freq = low_freq or 0

    # run checks
    if low_freq < 0:
        raise ParameterError(ErrorMsgs["low_freq"])
    if high_freq > (fs / 2):
        raise ParameterError(ErrorMsgs["high_freq"])
    if nfilts < num_ceps:
        raise ParameterError(ErrorMsgs["nfilts"])

    # pre-emphasis
    if pre_emph:
        sig = pre_emphasis(sig=sig, pre_emph_coeff=pre_emph_coeff)

    # -> framing
    frames, frame_length = framing(sig=sig,
                                   fs=fs,
                                   win_len=win_len,
                                   win_hop=win_hop)

    # -> windowing
    windows = windowing(frames=frames,
                        frame_len=frame_length,
                        win_type=win_type)

    # -> FFT -> |.|
    fourier_transform = rfft(x=windows, n=nfft)
    abs_fft_values = np.abs(fourier_transform)

    # -> x inverse Mel-fbanks
    imel_fbanks_mat = inverse_mel_filter_banks(nfilts=nfilts,
                                               nfft=nfft,
                                               fs=fs,
                                               low_freq=low_freq,
                                               high_freq=high_freq,
                                               scale=scale)
    features = np.dot(abs_fft_values, imel_fbanks_mat.T)

    # -> log(.)
    # zero_handling is applied first so that np.log never sees exact zeros
    features_no_zero = zero_handling(features)
    log_features = np.log(features_no_zero)

    # -> DCT(.): honour the caller's dct_type rather than a fixed type 2
    imfccs = dct(x=log_features, type=dct_type, axis=1,
                 norm='ortho')[:, :num_ceps]

    # use energy for 1st features column
    if use_energy:
        # compute the power
        power_frames = power_spectrum(fourier_transform)

        # compute total energy in each frame
        frame_energies = np.sum(power_frames, 1)

        # handling zero energies before taking the log
        energy = zero_handling(frame_energies)
        imfccs[:, 0] = np.log(energy)

    # liftering
    if lifter > 0:
        imfccs = lifter_ceps(imfccs, lifter)

    # normalization
    if normalize:
        imfccs = cmvn(cms(imfccs))

    return imfccs