"""
based on https://github.com/supikiti/PNCC/blob/master/pncc.py
"""
import scipy
import numpy as np
from ..utils.spectral import stft, powspec
from ..utils.preprocessing import pre_emphasis
from ..utils.cepstral import cms, cmvn, lifter_ceps
from ..utils.exceptions import ParameterError, ErrorMsgs
from ..fbanks.gammatone_fbanks import gammatone_filter_banks
[docs]def medium_time_power_calculation(power_stft_signal, M=2):
medium_time_power = np.zeros_like(power_stft_signal)
power_stft_signal = np.pad(power_stft_signal, [(M, M), (0, 0)], 'constant')
for i in range(medium_time_power.shape[0]):
medium_time_power[i, :] = sum([
1 / float(2 * M + 1) * power_stft_signal[i + k - M, :]
for k in range(2 * M + 1)
])
return medium_time_power
[docs]def asymmetric_lawpass_filtering(rectified_signal, lm_a=0.999, lm_b=0.5):
floor_level = np.zeros_like(rectified_signal)
floor_level[0, ] = 0.9 * rectified_signal[0, ]
for m in range(floor_level.shape[0]):
x = lm_a * floor_level[m - 1, :] + (1 - lm_a) * rectified_signal[m, :]
y = lm_b * floor_level[m - 1, :] + (1 - lm_b) * rectified_signal[m, :]
floor_level[m, :] = np.where(
rectified_signal[m, ] >= floor_level[m - 1, :], x, y)
return floor_level
[docs]def temporal_masking(rectified_signal, lam_t=0.85, myu_t=0.2):
# rectified_signal[m, l]
temporal_masked_signal = np.zeros_like(rectified_signal)
online_peak_power = np.zeros_like(rectified_signal)
temporal_masked_signal[0, :] = rectified_signal[0, ]
online_peak_power[0, :] = rectified_signal[0, :]
for m in range(1, rectified_signal.shape[0]):
online_peak_power[m, :] = np.maximum(
lam_t * online_peak_power[m - 1, :], rectified_signal[m, :])
temporal_masked_signal[m, :] = np.where(
rectified_signal[m, :] >= lam_t * online_peak_power[m - 1, :],
rectified_signal[m, :], myu_t * online_peak_power[m - 1, :])
return temporal_masked_signal
[docs]def weight_smoothing(final_output, medium_time_power, N=4, L=128):
spectral_weight_smoothing = np.zeros_like(final_output)
for m in range(final_output.shape[0]):
for l in range(final_output.shape[1]):
l_1 = max(l - N, 1)
l_2 = min(l + N, L)
spectral_weight_smoothing[m, l] = (1 / float(l_2 - l_1 + 1)) * \
sum([(final_output[m, l_] / medium_time_power[m, l_])
for l_ in range(l_1, l_2)])
return spectral_weight_smoothing
[docs]def mean_power_normalization(transfer_function,
final_output,
lam_myu=0.999,
L=80,
k=1):
myu = np.zeros(shape=(transfer_function.shape[0]))
myu[0] = 0.0001
normalized_power = np.zeros_like(transfer_function)
for m in range(1, transfer_function.shape[0]):
myu[m] = lam_myu * myu[m - 1] + \
(1 - lam_myu) / L * \
sum([transfer_function[m, s] for s in range(0, L - 1)])
normalized_power = k * transfer_function / myu[:, None]
return normalized_power
[docs]def medium_time_processing(power_stft_signal, nfilts=22):
# calculate medium time power
medium_time_power = medium_time_power_calculation(power_stft_signal)
lower_envelope = asymmetric_lawpass_filtering(medium_time_power, 0.999,
0.5)
subtracted_lower_envelope = medium_time_power - lower_envelope
# half waverectification
threshold = 0
rectified_signal = np.where(subtracted_lower_envelope < threshold,
np.zeros_like(subtracted_lower_envelope),
subtracted_lower_envelope)
floor_level = asymmetric_lawpass_filtering(rectified_signal)
temporal_masked_signal = temporal_masking(rectified_signal)
# switch excitation or non-excitation
c = 2
F = np.where(medium_time_power >= c * lower_envelope,
temporal_masked_signal, floor_level)
# weight smoothing
spectral_weight_smoothing = weight_smoothing(F,
medium_time_power,
L=nfilts)
return spectral_weight_smoothing, F
[docs]def pncc(sig,
fs=16000,
num_ceps=13,
pre_emph=0,
pre_emph_coeff=0.97,
power=2,
win_len=0.025,
win_hop=0.01,
win_type="hamming",
nfilts=26,
nfft=512,
low_freq=None,
high_freq=None,
scale="constant",
dct_type=2,
use_energy=False,
dither=1,
lifter=22,
normalize=1):
"""
Compute the power-normalized cepstral coefficients (SPNCC features) from an audio signal.
Args:
sig (array) : a mono audio signal (Nx1) from which to compute features.
fs (int) : the sampling frequency of the signal we are working with.
Default is 16000.
num_ceps (float) : number of cepstra to return.
Default is 13.
pre_emph (int) : apply pre-emphasis if 1.
Default is 1.
pre_emph_coeff (float) : apply pre-emphasis filter [1 -pre_emph] (0 = none).
Default is 0.97.
power (int) : spectrum power.
Default is 2.
win_len (float) : window length in sec.
Default is 0.025.
win_hop (float) : step between successive windows in sec.
Default is 0.01.
win_type (float) : window type to apply for the windowing.
Default is "hamming".
nfilts (int) : the number of filters in the filterbank.
Default is 40.
nfft (int) : number of FFT points.
Default is 512.
low_freq (int) : lowest band edge of mel filters (Hz).
Default is 0.
high_freq (int) : highest band edge of mel filters (Hz).
Default is samplerate / 2 = 8000.
scale (str) : choose if max bins amplitudes ascend, descend or are constant (=1).
Default is "constant".
dct_type (int) : type of DCT used - 1 or 2 (or 3 for HTK or 4 for feac).
Default is 2.
use_energy (int) : overwrite C0 with true log energy
Default is 0.
dither (int) : 1 = add offset to spectrum as if dither noise.
Default is 0.
lifter (int) : apply liftering if value > 0.
Default is 22.
normalize (int) : apply normalization if 1.
Default is 0.
Returns:
(array) : 2d array of PNCC features (num_frames x num_ceps)
"""
# init freqs
high_freq = high_freq or fs / 2
low_freq = low_freq or 0
# run checks
if low_freq < 0:
raise ParameterError(ErrorMsgs["low_freq"])
if high_freq > (fs / 2):
raise ParameterError(ErrorMsgs["high_freq"])
if nfilts < num_ceps:
raise ParameterError(ErrorMsgs["nfilts"])
# pre-emphasis
if pre_emph:
sig = pre_emphasis(sig=sig, pre_emph_coeff=pre_emph_coeff)
# -> STFT()
stf_trafo, _ = stft(sig, fs)
# -> |.|^2
spectrum_power = np.abs(stf_trafo)**power
# -> x Filterbanks
gammatone_filter = gammatone_filter_banks(nfilts=nfilts,
nfft=nfft,
fs=fs,
low_freq=low_freq,
high_freq=high_freq,
scale=scale)
P = np.dot(a=spectrum_power[:, :gammatone_filter.shape[1]],
b=gammatone_filter.T)
# medium_time_processing
S, F = medium_time_processing(P, nfilts=nfilts)
# time-freq normalization
T = P * S
# -> mean power normalization
U = mean_power_normalization(T, F, L=nfilts)
# -> power law non linearity
V = U**(1 / 15)
# DCT(.)
pnccs = scipy.fftpack.dct(V)[:, :num_ceps]
# use energy for 1st features column
if use_energy:
pspectrum, logE = powspec(sig,
fs=fs,
win_len=win_len,
win_hop=win_hop,
dither=dither)
# bug: pnccs[:, 0] = logE
# liftering
if lifter > 0:
pnccs = lifter_ceps(pnccs, lifter)
# normalization
if normalize:
pnccs = cmvn(cms(pnccs))
return pnccs