import numpy as np
import scipy.ndimage
from spafe.utils.spectral import rfft
from .exceptions import ParameterError, ErrorMsgs
[docs]def zero_handling(x):
"""
handle the issue with zero values if they are exposed to become an argument
for any log function.
Args:
x (array): input vector.
Returns:
vector with zeros substituted with epsilon values.
"""
return np.where(x == 0, np.finfo(float).eps, x)
[docs]def pre_emphasis(sig, pre_emph_coeff=0.97):
"""
perform preemphasis on the input signal.
Args:
sig (array) : signal to filter.
coeff (float) : preemphasis coefficient. 0 is no filter, default is 0.95.
Returns:
the filtered signal.
"""
return np.append(sig[0], sig[1:] - pre_emph_coeff * sig[:-1])
[docs]def stride_trick(a, stride_length, stride_step):
"""
apply framing using the stride trick from numpy.
Args:
a (array) : signal array.
stride_length (int) : length of the stride.
stride_step (int) : stride step.
Returns:
blocked/framed array.
"""
nrows = ((a.size - stride_length) // stride_step) + 1
n = a.strides[0]
return np.lib.stride_tricks.as_strided(a,
shape=(nrows, stride_length),
strides=(stride_step*n, n))
[docs]def framing(sig, fs=16000, win_len=0.025, win_hop=0.01):
"""
transform a signal into a series of overlapping frames (=Frame blocking).
Args:
sig (array) : a mono audio signal (Nx1) from which to compute features.
fs (int) : the sampling frequency of the signal we are working with.
Default is 16000.
win_len (float) : window length in sec.
Default is 0.025.
win_hop (float) : step between successive windows in sec.
Default is 0.01.
Returns:
array of frames.
frame length.
Notes:
------
Uses the stride trick to accelerate the processing.
"""
# run checks and assertions
if win_len < win_hop:
raise ParameterError(ErrorMsgs["win_len_win_hop_comparison"])
# compute frame length and frame step (convert from seconds to samples)
frame_length = win_len * fs
frame_step = win_hop * fs
signal_length = len(sig)
frames_overlap = frame_length - frame_step
# make sure to use integers as indices
frames = stride_trick(sig, int(frame_length), int(frame_step))
if len(frames[-1]) < frame_length:
frames[-1] = np.append(frames[-1], np.array([0]*(frame_length - len(frames[0]))))
return frames, frame_length
[docs]def windowing(frames, frame_len, win_type="hamming", beta=14):
"""
generate and apply a window function to avoid spectral leakage.
Args:
frames (array) : array including the overlapping frames.
frame_len (int) : frame length.
win_type (str) : type of window to use.
Default is "hamming"
Returns:
windowed frames.
"""
if win_type == "hamming" : windows = np.hamming(frame_len)
elif win_type == "hanning" : windows = np.hanning(frame_len)
elif win_type == "bartlet" : windows = np.bartlett(frame_len)
elif win_type == "kaiser" : windows = np.kaiser(frame_len, beta)
elif win_type == "blackman": windows = np.blackman(frame_len)
windowed_frames = frames * windows
return windowed_frames
[docs]def remove_silence(sig, fs, win_len=0.25, win_hop=0.25, threshold=-35):
"""
generate and apply a window function to avoid spectral leakage.
Args:
frames (array) : array including the overlapping frames.
frame_len (int) : frame length.
win_type (str) : type of window to use.
Default is "hamming"
Returns:
windowed frames.
"""
# framing
frames, frames_len = framing(sig=sig, fs=fs, win_len=win_len, win_hop=win_hop)
# compute short time energies to get voiced frames
amplitudes = np.abs(rfft(frames, len(frames)))
energy = np.sum(amplitudes, axis=-1) / len(frames)**2
energy = 10 * np.log10(zero_handling(energy))
# normalize energy to 0 dB then filter and format
energy = energy - energy.max()
energy = scipy.ndimage.filters.median_filter(energy, 5)
energy = np.repeat(energy, frames_len)
# compute vad and get speech frames
vad = np.array(energy > threshold, dtype=sig.dtype)
vframes = np.array(frames.flatten()[np.where(vad==1)], dtype=sig.dtype)
return energy, vad, np.array(vframes, dtype=np.float64)