import scipy
import warnings
import numpy as np
import matplotlib.pyplot as plt
from spafe.utils.converters import (hz2bark, bark2hz, hz2mel, mel2hz, fft2barkmx, fft2melmx)
NFFT = 512
[docs]def cqt(sig, fs=16000, low_freq=10, high_freq=3000, b=48):
"""
Compute the constant Q-transform.
- take the absolute value of the FFT
- warp to a Mel frequency scale
- take the DCT of the log-Mel-spectrum
- return the first <num_ceps> components
Args:
sig (array) : a mono audio signal (Nx1) from which to compute features.
fs (int) : the sampling frequency of the signal we are working with.
Default is 16000.
low_freq (int) : lowest band edge of mel filters (Hz).
Default is 10.
high_freq (int) : highest band edge of mel filters (Hz).
Default is 3000.
b (int) : number of bins per octave.
Default is 48.
Returns:
array including the Q-transform coefficients.
"""
# define lambda funcs for clarity
def f(k):
return low_freq * 2**((k - 1) / b)
def w(N):
return np.hamming(N)
def nk(k):
return np.ceil(Q * fs / f(k))
def t(Nk, k):
return (1 / Nk) * w(Nk) * np.exp(
2 * np.pi * 1j * Q * np.arange(Nk) / Nk)
# init vars
Q = 1 / (2**(1 / b) - 1)
K = int(np.ceil(b * np.log2(high_freq / low_freq)))
nfft = int(2**np.ceil(np.log2(Q * fs / low_freq)))
# define temporal kernal and sparse kernal variables
S = [
scipy.sparse.coo_matrix(np.fft.fft(t(nk(k), k), nfft))
for k in range(K, 0, -1)
]
S = scipy.sparse.vstack(S[::-1]).tocsc().transpose().conj() / nfft
# compute the constant Q-transform
xcq = (np.fft.fft(sig, nfft).reshape(1, nfft) * S)[0]
return xcq
[docs]def pre_process_x(sig, fs=16000, win_type="hann", win_len=0.025, win_hop=0.01):
"""
Prepare window and pad signal audio
"""
# convert integer to double
# sig = np.double(sig) / 2.**15
# STFT parameters
# convert win_len and win_hop from seconds to samples
win_length = int(win_len * fs)
hop_length = int(win_hop * fs)
# compute window
window = np.hanning(win_length)
if win_type == "hamm":
window = np.hamming(win_length)
# normalization step to ensure that the STFT is self-inverting (or a Parseval frame)
normalized_window = normalize_window(win=window, hop=hop_length)
# Compute the STFT
# zero pad to ensure that there are no partial overlap windows in the STFT computation
sig = np.pad(sig, (window.size + hop_length, window.size + hop_length),
'constant',
constant_values=(0, 0))
return sig, normalized_window, hop_length
[docs]def stft(sig, fs=16000, win_type="hann", win_len=0.025, win_hop=0.01):
"""
Compute the short time Fourrier transform of an audio signal x.
Args:
x (array) : audio signal in the time domain
win (int) : window to be used for the STFT
hop (int) : hop-size
Returns:
X : 2d array of the STFT coefficients of x
"""
sig, normalized_window, hop_length = pre_process_x(sig,
fs=fs,
win_type=win_type,
win_len=win_len,
win_hop=win_hop)
X = compute_stft(x=sig, win=normalized_window, hop=hop_length)
return X, sig
[docs]def compute_stft(x, win, hop):
"""
Compute the short time Fourrier transform of an audio signal x.
Args:
x (array) : audio signal in the time domain
win (int) : window to be used for the STFT
hop (int) : hop-size
Returns:
X : 2d array of the STFT coefficients of x
"""
# length of the audio signal
sig_len = x.size
# length of the window = fft size
win_len = win.size
# number of steps to take
num_steps = (np.ceil((sig_len - win_len) / hop) + 1).astype(int)
# init STFT coefficients
X = np.zeros((win_len, num_steps), dtype=complex)
# normalizing factor
nf = np.sqrt(win_len)
for k in range(num_steps - 1):
d = x[k * hop:k * hop + win_len] * win
X[:, k] = np.fft.fft(d) / nf
# the last window may partially overlap with the signal
d = x[num_steps * hop:]
X[:, k] = np.fft.fft(d * win[:d.size], n=win_len) / nf
return X
[docs]def istft(X, fs=16000, win_type="hann", win_len=0.025, win_hop=0.01):
"""
Args:
X : STFT coefficients
win : window to be used for the STFT
hop : hop-size
Returns :
x : inverse STFT of X
"""
# STFT parameters
# convert win_len and win_hop from seconds to samples
win_length = int(win_len * fs)
hop_length = int(win_hop * fs)
# compute window
if win_type == "hann":
window = np.hanning(win_length)
elif win_type == "hamm":
window = np.hamming(win_length)
# normalization step to ensure that the STFT is self-inverting (or a Parseval frame)
normalized_window = normalize_window(win=window, hop=hop_length)
# Compute the ISTFT
# win_len: length of the window + num_steps: number of frames
win_len, num_steps = X.shape[0], X.shape[1]
# length of the output signal
sig_len = win_len + (num_steps - 1) * hop_length
# init output variable
x = np.zeros((sig_len), dtype=complex)
# normalizing factor
nf = np.sqrt(win_len)
for k in range(num_steps):
d = nf * np.fft.ifft(X[:, k]) * normalized_window
x[k * hop_length:k * hop_length + win_len] += d
return x
[docs]def normalize_window(win, hop):
"""
Normalize the window according to the provided hop-size so that the STFT is
a tight frame.
Args:
win (int) : window to be used for the STFT
hop (int) : hop-size
"""
N = win.size
K = int(N / hop)
win2 = win * win
z = 1 * win2
k = 1
ind1 = N - hop
ind2 = hop
while (k < K):
z[0:ind1] += win2[ind2:N]
z[ind2:N] += win2[0:ind1]
ind1 -= hop
ind2 += hop
k += 1
win2 = win / np.sqrt(z)
return win2
[docs]def display_stft(X,
fs,
len_sig,
low_freq=0,
high_freq=3000,
min_db=-10,
max_db=0,
normalize=True):
"""
Plot the stft of an audio signal in the time-frequency plane.
Args:
X (array) : STFT coefficients
fs (int) : sampling frequency in Hz (assumed to be integer)
hop (int) : hop-size used in the STFT (for labeling the time axis)
low_freq (int) : minimun frequency to plot in hz.
Default is 0 Hz.
high_freq (int) : maximum frequency tp plot in Hz.
Default is 3000 Hz.
min_db (int) : minimun magnitude to display in dB
Default is 0 dB.
max_db (int) : maximum magnitude to display in dB.
Default is -10 dB.
normalize (bool) : Normalize input.
Default is True.
"""
# normalize : largest coefficient magnitude is unity
X_temp = X.copy()
if normalize:
X_temp /= np.amax(abs(X_temp))
# compute frequencies array
Freqs = np.array([low_freq, high_freq])
Fd = (Freqs * X_temp.shape[0] / fs).astype(int)
# compute values matrix
Z = X_temp[Fd[1]:Fd[0]:-1, :]
Z = np.clip(np.log(np.abs(Z) + 1e-50), min_db, max_db)
Z = 255 * (Z - min_db) / (max_db - min_db)
# compute duration
time = float(len_sig) / float(fs)
# plotting
plt.imshow(Z,
extent=[0, time, low_freq / 1000, high_freq / 1000],
aspect="auto")
plt.ylabel('Frequency (Khz)')
plt.xlabel('Time (sec)')
plt.show()
[docs]def power_spectrum(fourrier_transform, nfft=NFFT):
magnitude_frames = np.absolute(fourrier_transform) # Magnitude of the FFT
power_frames = ((1.0 / nfft) * ((magnitude_frames)**2)) # Power Spectrum
return power_frames
[docs]def rfft(x, n=NFFT):
"""
compute the fourrier transform of a certain signal frames.
"""
return np.fft.rfft(a=x, n=n)
[docs]def dct(x, type=2, axis=1, norm='ortho'):
from scipy.fftpack import dct
return scipy.fftpack.dct(x=x, type=type, axis=axis, norm=norm)
[docs]def powspec(sig,
fs=16000,
nfft=512,
win_type="hann",
win_len=0.025,
win_hop=0.01,
dither=1):
"""
compute the powerspectrum and frame energy of the input signal.
basically outputs a power spectrogram
each column represents a power spectrum for a given frame
each row represents a frequency
default values:
fs = 8000Hz
wintime = 25ms (200 samps)
steptime = 10ms (80 samps)
which means use 256 point fft
hamming window
$Header: /Users/dpwe/matlab/rastamat/RCS/powspec.m,v 1.3 2012/09/03 14:02:01 dpwe Exp dpwe $
for fs = 8000
NFFT = 256;
NOVERLAP = 120;
SAMPRATE = 8000;
WINDOW = hamming(200);
"""
# convert win_len and win_hop from seconds to samples
win_length = int(win_len * fs)
hop_length = int(win_hop * fs)
fft_length = int(np.power(2, np.ceil(np.log2(win_len * fs))))
# compute stft
X, _ = stft(sig=sig,
fs=fs,
win_type=win_type,
win_len=win_len,
win_hop=win_hop)
pow_X = np.abs(X)**2
if dither:
pow_X = pow_X + win_length
e = np.log(np.sum(pow_X, axis=0))
return pow_X, e
[docs]def lifter(x, lift=0.6, invs=False):
"""
apply lifter to matrix of cepstra (one per column)
Args:
lift (float) : exponent of x i^n liftering or, as a negative integer, the
length of HTK-style sin-curve liftering.
inverse (bool) : if inverse == 1 (default 0), undo the liftering.
Returns:
liftered cepstra.
"""
ncep = x.shape[0]
if lift == 0:
y = x
else:
if lift < 0:
warnings.warn(
'HTK liftering does not support yet; default liftering')
lift = 0.6
liftwts = np.arange(1, ncep)**lift
liftwts = np.append(1, liftwts)
if (invs):
liftwts = 1 / liftwts
y = np.matmul(np.diag(liftwts), x)
return y
[docs]def audspec(p_spectrum,
fs=16000,
nfilts=0,
fb_type='bark',
low_freq=0,
high_freq=0,
sumpower=1,
bwidth=1):
"""
perform critical band analysis (see PLP) based on the power spectrogram.
Args:
aspectrum (array) : the power spectrum array.
nfft (int) : the FFT size.
(Default is 512)
fs (int) : sample rate/ sampling frequency of the signal.
(Default 16000 Hz)
nfilts (int) : the number of filters in the filterbank.
(Default 20)
fb_type (str) : type of bins [Mel, Bark, ...].
bwidth (int) : the constant width of each band relative to standard Mel (default 1).
Default is 1.
low_freq (int) : lowest band edge of mel filters.
(Default 0 Hz)
high_freq (int) : highest band edge of mel filters.
(Default samplerate/2)
sumpower (bool) : sum power if True.
Default is True.
Returns:
auditory spectrum array.
"""
if nfilts == 0:
np.ceil(hz2bark(fs / 2)) + 1
if high_freq == 0:
high_freq = fs / 2
nfreqs = p_spectrum.shape[0]
nfft = (int(nfreqs) - 1) * 2
if fb_type == 'bark':
wts = fft2barkmx(nfft, fs, nfilts, bwidth, low_freq, high_freq)
elif fb_type == 'mel':
wts = fft2melmx(nfft, fs, nfilts, bwidth, low_freq, high_freq)
elif fb_type == 'htkmel':
wts = fft2melmx(nfft,
fs,
nfilts,
bwidth,
low_freq,
high_freq,
htk=True,
constamp=True)
elif fb_type == 'fcmel':
wts = fft2melmx(nfft,
fs,
nfilts,
bwidth,
low_freq,
high_freq,
htk=True,
constamp=False)
wts = wts[:, 0:nfreqs]
if sumpower:
aspectrum = np.matmul(wts, p_spectrum)
else:
aspectrum = np.matmul(wts, np.sqrt(p_spectrum))**2
return aspectrum
[docs]def postaud(x, fmax, fb_type='bark', broaden=0):
"""
do loudness equalization and cube root compression
Args:
x : critical band filters
rows : critical bands
cols : frames
"""
nbands, nframes = x.shape
nfpts = int(nbands + 2 * broaden)
if fb_type == 'bark':
bandcfhz = bark2hz(np.linspace(0, hz2bark(fmax), nfpts))
elif fb_type == 'mel':
bandcfhz = mel2hz(np.linspace(0, hz2mel(fmax), nfpts))
elif fb_type == 'htkmel' or fb_type == 'fcmel':
bandcfhz = mel2hz(np.linspace(0, hz2mel(fmax, htk=True), nfpts), htk=True)
bandcfhz = bandcfhz[broaden:(nfpts - broaden)]
fsq = np.power(bandcfhz, 2)
ftmp = np.add(fsq, 1.6e5)
eql = np.multiply((fsq / ftmp)**2, (fsq + 1.44e6) / (fsq + 9.61e6))
z = np.multiply(np.tile(eql, (nframes, 1)).T, x)
z = np.power(z, 0.33)
if broaden:
y = np.zeros((z.shape[0] + 2, z.shape[1]))
y[0, :] = z[0, :]
y[1:nbands + 1, :] = z
y[nbands + 1, :] = z[z.shape[0] - 1, :]
else:
y = np.zeros((z.shape[0], z.shape[1]))
y[0, :] = z[1, :]
y[1:nbands - 1, :] = z[1:z.shape[0] - 1, :]
y[nbands - 1, :] = z[z.shape[0] - 2, :]
return y, eql
[docs]def invpostaud(y, fmax, fb_type='bark', broaden=0):
"""
invert the effects of postaud (loudness equalization and cube
- root compression)
- y = postaud output
- x = reconstructed critical band filters
- rows = critical bands
- cols = frames
"""
nbands, nframes = y.shape
if fb_type == 'bark':
bandcfhz = bark2hz(np.linspace(0, hz2bark(fmax), nbands), fs, nfft)
elif fb_type == 'mel':
bandcfhz = mel2hz(np.linspace(0, hz2mel(fmax), nbands))
elif fb_type == 'htkmel' or fb_type == 'fcmel':
bandcfhz = mel2hz(np.linspace(0, hz2mel(fmax, htk=True), nbands),
htk=True)
bandcfhz = bandcfhz[broaden:(nbands - broaden)]
fsq = bandcfhz**2
ftmp = fsq + 1.6e5
eql = np.multiply((fsq / ftmp)**2, (fsq + 1.44e6) / (fsq + 9.61e6))
x = y**(1 / 0.33)
if eql[0] == 0:
eql[0] = eql[1]
eql[-1] = eql[-2]
x = np.divide(x[broaden:(nbands - broaden + 1), :],
np.add(np.tile(eql.T, (nframes, 1)).T, 1e-8))
return x, eql
[docs]def invpowspec(y, fs, win_len, win_hop, excit=[]):
"""
x = invpowspec(y, fs, wintime, steptime, excit)
Attempt to go back from specgram-like power spectrum to audio waveform by
scaling specgram of white noise
default values:
fs = 8000Hz
wintime = 25ms (200 samps)
steptime = 10ms (80 samps)
which means use 256 point fft
hamming window
excit is input excitation; white noise is used if not specified
for fs = 8000
NFFT = 256;
NOVERLAP = 120;
SAMPRATE = 8000;
WINDOW = hamming(200);
"""
nrow, ncol = y.shape
r = excit
winpts = int(win_len * fs)
steppts = int(win_hop * fs)
nfft = int(np.power(2, np.ceil(np.divide(np.log(winpts), np.log(2)))))
# Can't predict librosa stft length...
tmp = istft(X=y, fs=fs, win_type="hann", win_len=0.025, win_hop=0.01)
# # Can't predict librosa stft length...
# tmp = librosa.istft(y,
# hop_length=steppts,
# win_length=winpts,
# window='hann',
# center=False)
xlen = len(tmp)
# xlen = int(np.add(winpts, np.multiply(steppts, np.subtract(ncol, 1))))
# xlen = int(np.multiply(steppts, np.subtract(ncol, 1)))
if len(r) == 0:
r = np.squeeze(np.random.randn(xlen, 1))
r = r[0:xlen]
R, _ = stft(sig=r,
fs=fs,
win_type=win_type,
win_len=win_len,
win_hop=win_hop)
# R = librosa.stft(np.divide(r, 32768 * 12),
# n_fft=nfft,
# hop_length=steppts,
# win_length=winpts,
# window='hann',
# center=False)
R *= np.sqrt(y)
x = istft(X=R, fs=fs, win_type="hann", win_len=0.025, win_hop=0.01)
# x = librosa.istft(R,
# hop_length=steppts,
# win_length=winpts,
# window='hann',
# center=False)
return x
[docs]def invaudspec(aspectrum,
fs=16000,
nfft=512,
fb_type='bark',
low_freq=0,
high_freq=None,
sumpower=True,
bwidth=1):
"""
Compute the power spectrum from the auditory spectrum.
Invert (~might not be that accurate) the effects of audspec()
Args:
aspectrum (array) : the auditory spectrum array.
nfft (int) : the FFT size.
(Default is 512)
fs (int) : sample rate/ sampling frequency of the signal.
(Default 16000 Hz)
nfilts (int) : the number of filters in the filterbank.
(Default 20)
fb_type (str) : type of bins [Mel, Bark, ...].
bwidth (int) : the constant width of each band relative to standard Mel (default 1).
Default is 1.
low_freq (int) : lowest band edge of mel filters.
(Default 0 Hz)
high_freq (int) : highest band edge of mel filters.
(Default samplerate/2)
sumpower (bool) : sum power if True.
Default is True.
Returns:
power spectrum array.
"""
if high_freq is None:
high_freq = fs / 2
nfilts, nframes = aspectrum.shape
if fb_type == 'bark':
wts = fft2barkmx(nfft, fs, nfilts, bwidth, low_freq, high_freq)
elif fb_type == 'mel':
wts = fft2melmx(nfft, fs, nfilts, bwidth, low_freq, high_freq)
elif fb_type == 'htkmel':
wts = fft2melmx(nfft,
fs,
nfilts,
bwidth,
low_freq,
high_freq,
htk=True,
constamp=True)
elif fb_type == 'fcmel':
wts = fft2melmx(nfft,
fs,
nfilts,
bwidth,
low_freq,
high_freq,
htk=True,
constamp=False)
wts = wts[:, 0:int(nfft / 2 + 1)]
ww = np.matmul(wts.T, wts)
itws = wts.T / np.tile(
np.maximum(np.mean(np.diag(ww)) / 100, np.sum(ww, axis=0)),
(nfilts, 1)).T
if sumpower:
spec = np.matmul(itws, aspectrum)
else:
spec = np.power(np.matmul(itws, np.sqrt(aspectrum)))
return spec, wts, itws