Source code for spafe.features.pncc

"""
based on https://github.com/supikiti/PNCC/blob/master/pncc.py
"""
import scipy
import numpy as np
from ..utils.spectral import stft, powspec
from ..utils.preprocessing import pre_emphasis
from ..utils.cepstral import cms, cmvn, lifter_ceps
from ..utils.exceptions import ParameterError, ErrorMsgs
from ..fbanks.gammatone_fbanks import gammatone_filter_banks


[docs]def medium_time_power_calculation(power_stft_signal, M=2): medium_time_power = np.zeros_like(power_stft_signal) power_stft_signal = np.pad(power_stft_signal, [(M, M), (0, 0)], 'constant') for i in range(medium_time_power.shape[0]): medium_time_power[i, :] = sum([ 1 / float(2 * M + 1) * power_stft_signal[i + k - M, :] for k in range(2 * M + 1) ]) return medium_time_power
[docs]def asymmetric_lawpass_filtering(rectified_signal, lm_a=0.999, lm_b=0.5): floor_level = np.zeros_like(rectified_signal) floor_level[0, ] = 0.9 * rectified_signal[0, ] for m in range(floor_level.shape[0]): x = lm_a * floor_level[m - 1, :] + (1 - lm_a) * rectified_signal[m, :] y = lm_b * floor_level[m - 1, :] + (1 - lm_b) * rectified_signal[m, :] floor_level[m, :] = np.where( rectified_signal[m, ] >= floor_level[m - 1, :], x, y) return floor_level
[docs]def temporal_masking(rectified_signal, lam_t=0.85, myu_t=0.2): # rectified_signal[m, l] temporal_masked_signal = np.zeros_like(rectified_signal) online_peak_power = np.zeros_like(rectified_signal) temporal_masked_signal[0, :] = rectified_signal[0, ] online_peak_power[0, :] = rectified_signal[0, :] for m in range(1, rectified_signal.shape[0]): online_peak_power[m, :] = np.maximum( lam_t * online_peak_power[m - 1, :], rectified_signal[m, :]) temporal_masked_signal[m, :] = np.where( rectified_signal[m, :] >= lam_t * online_peak_power[m - 1, :], rectified_signal[m, :], myu_t * online_peak_power[m - 1, :]) return temporal_masked_signal
[docs]def weight_smoothing(final_output, medium_time_power, N=4, L=128): spectral_weight_smoothing = np.zeros_like(final_output) for m in range(final_output.shape[0]): for l in range(final_output.shape[1]): l_1 = max(l - N, 1) l_2 = min(l + N, L) spectral_weight_smoothing[m, l] = (1 / float(l_2 - l_1 + 1)) * \ sum([(final_output[m, l_] / medium_time_power[m, l_]) for l_ in range(l_1, l_2)]) return spectral_weight_smoothing
[docs]def mean_power_normalization(transfer_function, final_output, lam_myu=0.999, L=80, k=1): myu = np.zeros(shape=(transfer_function.shape[0])) myu[0] = 0.0001 normalized_power = np.zeros_like(transfer_function) for m in range(1, transfer_function.shape[0]): myu[m] = lam_myu * myu[m - 1] + \ (1 - lam_myu) / L * \ sum([transfer_function[m, s] for s in range(0, L - 1)]) normalized_power = k * transfer_function / myu[:, None] return normalized_power
[docs]def medium_time_processing(power_stft_signal, nfilts=22): # calculate medium time power medium_time_power = medium_time_power_calculation(power_stft_signal) lower_envelope = asymmetric_lawpass_filtering(medium_time_power, 0.999, 0.5) subtracted_lower_envelope = medium_time_power - lower_envelope # half waverectification threshold = 0 rectified_signal = np.where(subtracted_lower_envelope < threshold, np.zeros_like(subtracted_lower_envelope), subtracted_lower_envelope) floor_level = asymmetric_lawpass_filtering(rectified_signal) temporal_masked_signal = temporal_masking(rectified_signal) # switch excitation or non-excitation c = 2 F = np.where(medium_time_power >= c * lower_envelope, temporal_masked_signal, floor_level) # weight smoothing spectral_weight_smoothing = weight_smoothing(F, medium_time_power, L=nfilts) return spectral_weight_smoothing, F
[docs]def pncc(sig, fs=16000, num_ceps=13, pre_emph=0, pre_emph_coeff=0.97, power=2, win_len=0.025, win_hop=0.01, win_type="hamming", nfilts=26, nfft=512, low_freq=None, high_freq=None, scale="constant", dct_type=2, use_energy=False, dither=1, lifter=22, normalize=1): """ Compute the power-normalized cepstral coefficients (SPNCC features) from an audio signal. Args: sig (array) : a mono audio signal (Nx1) from which to compute features. fs (int) : the sampling frequency of the signal we are working with. Default is 16000. num_ceps (float) : number of cepstra to return. Default is 13. pre_emph (int) : apply pre-emphasis if 1. Default is 1. pre_emph_coeff (float) : apply pre-emphasis filter [1 -pre_emph] (0 = none). Default is 0.97. power (int) : spectrum power. Default is 2. win_len (float) : window length in sec. Default is 0.025. win_hop (float) : step between successive windows in sec. Default is 0.01. win_type (float) : window type to apply for the windowing. Default is "hamming". nfilts (int) : the number of filters in the filterbank. Default is 40. nfft (int) : number of FFT points. Default is 512. low_freq (int) : lowest band edge of mel filters (Hz). Default is 0. high_freq (int) : highest band edge of mel filters (Hz). Default is samplerate / 2 = 8000. scale (str) : choose if max bins amplitudes ascend, descend or are constant (=1). Default is "constant". dct_type (int) : type of DCT used - 1 or 2 (or 3 for HTK or 4 for feac). Default is 2. use_energy (int) : overwrite C0 with true log energy Default is 0. dither (int) : 1 = add offset to spectrum as if dither noise. Default is 0. lifter (int) : apply liftering if value > 0. Default is 22. normalize (int) : apply normalization if 1. Default is 0. Returns: (array) : 2d array of PNCC features (num_frames x num_ceps) """ # init freqs high_freq = high_freq or fs / 2 low_freq = low_freq or 0 # run checks if low_freq < 0: raise ParameterError(ErrorMsgs["low_freq"]) if high_freq > (fs / 2): raise ParameterError(ErrorMsgs["high_freq"]) if nfilts < num_ceps: raise ParameterError(ErrorMsgs["nfilts"]) # pre-emphasis if pre_emph: sig = pre_emphasis(sig=sig, pre_emph_coeff=pre_emph_coeff) # -> STFT() stf_trafo, _ = stft(sig, fs) # -> |.|^2 spectrum_power = np.abs(stf_trafo)**power # -> x Filterbanks gammatone_filter = gammatone_filter_banks(nfilts=nfilts, nfft=nfft, fs=fs, low_freq=low_freq, high_freq=high_freq, scale=scale) P = np.dot(a=spectrum_power[:, :gammatone_filter.shape[1]], b=gammatone_filter.T) # medium_time_processing S, F = medium_time_processing(P, nfilts=nfilts) # time-freq normalization T = P * S # -> mean power normalization U = mean_power_normalization(T, F, L=nfilts) # -> power law non linearity V = U**(1 / 15) # DCT(.) pnccs = scipy.fftpack.dct(V)[:, :num_ceps] # use energy for 1st features column if use_energy: pspectrum, logE = powspec(sig, fs=fs, win_len=win_len, win_hop=win_hop, dither=dither) # bug: pnccs[:, 0] = logE # liftering if lifter > 0: pnccs = lifter_ceps(pnccs, lifter) # normalization if normalize: pnccs = cmvn(cms(pnccs)) return pnccs