%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd
from scipy.io import wavfile
from mfcc import *
As we discussed, we can obtain a mel-spaced spectrogram by multiplying a spectrogram on the left by a matrix M in which each row is a triangular filter centered on a different frequency band. This matrix M is referred to as a mel filterbank. Below is a plot of a mel filterbank, with one curve per row of M.
# Analysis parameters shared by the rest of the notebook, followed by a plot
# of the mel filterbank built from them.
win = 2048
hop = 512
# Number of frequency bins used for the filterbank and for the spectrogram
K = win//2+1
sr = 44100
# Frequency range (used as Hz in the plot title) and number of mel bands
min_freq = 80
max_freq = 16000
n_bins = 200
# Frequency assigned to bin k is k*sr/win; used as the x axis below
freqs = np.arange(K)*sr/win
# get_mel_filterbank comes from the local mfcc module
M = get_mel_filterbank(K, win, sr, min_freq, max_freq, n_bins)
plt.figure(figsize=(10, 4))
# Plotting M.T against freqs draws each row of M (one band) as its own curve
plt.plot(freqs, M.T);
plt.xlabel("Frequency (Hz)")
plt.title("Normalized Mel Filterbank: {} Bins from {}hz - {}hz".format(n_bins, min_freq, max_freq))
Text(0.5, 1.0, 'Normalized Mel Filterbank: 200 Bins from 80hz - 16000hz')
# Load the audio, compute its magnitude spectrogram, project it onto the mel
# filterbank, and show filterbank / spectrogram / mel spectrogram side by side.
# NOTE(review): this rebinds sr to the file's sample rate, while M above was
# built with sr = 44100 -- confirm that doves.wav is actually 44.1 kHz.
sr, x = wavfile.read("doves.wav")
# stft comes from the local mfcc module
S = stft(x, win, hop)
# Keep only magnitudes of the first K rows so S lines up with the columns of M
S = np.abs(S)[0:K, :]
# Mel spectrogram: each row is one mel band, each column one frame
MS = M.dot(S)
plt.figure(figsize=(12, 4))
plt.subplot(131)
# amplitude_to_db comes from the local mfcc module (presumably a dB conversion)
plt.imshow(amplitude_to_db(M), cmap='magma', aspect='auto')
# Setting ylim from 0 upward puts row 0 at the bottom of the image
plt.ylim([0, M.shape[0]])
plt.title("Mel Filterbank ({} x {})".format(*M.shape))
plt.subplot(132)
plt.imshow(amplitude_to_db(S), cmap='magma', aspect='auto')
plt.ylim([0, S.shape[0]])
plt.title("Spectrogram in dB ({} x {})".format(*S.shape))
plt.subplot(133)
plt.imshow(amplitude_to_db(MS), cmap='magma', aspect='auto')
plt.ylim([0, MS.shape[0]])
# NOTE(review): "Specrogram" in the title below is a typo for "Spectrogram"
plt.title("Mel Specrogram ({} x {})".format(*MS.shape))
Text(0.5, 1.0, 'Mel Specrogram (200 x 859)')
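We can approximately "invert" a mel spectrogram and recover an estimate of the original magnitude spectrogram by multiplying the mel spectrogram on the left by the transpose of the mel matrix M. The mel spectrogram contains no phase information, so to turn the estimate back into audio we assign a random phase to each bin before taking the inverse STFT.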
def invert_MS(M, MS, win, hop):
    """
    Turn a mel spectrogram back into audio using the transpose of the
    mel filterbank and random phases.

    Parameters
    ----------
    M: ndarray
        Mel filterbank; its transpose is multiplied on the left of MS
        (assumes M has win//2+1 columns -- TODO confirm against caller)
    MS: ndarray
        Mel spectrogram, with one column per frame
    win: int
        Number of rows of the full spectrum handed to istft
    hop: int
        Hop value passed through to istft

    Returns
    -------
    y:
        Whatever istft (from the local mfcc module) returns for the
        rebuilt spectrum; presumably the time-domain audio samples
    """
    # Estimated magnitudes.  The row count is taken from this product
    # rather than from a notebook-level spectrogram, so the function
    # depends only on its own arguments.
    SInv = (M.T).dot(MS)
    n_rows = SInv.shape[0]
    # Full-length spectrum: estimated magnitudes in the first n_rows
    # rows, zeros everywhere else
    S2 = np.zeros((win, MS.shape[1]))
    S2[0:n_rows, :] = SInv
    S2 = np.array(S2, dtype=complex)
    # No phase survives the mel projection, so attach a uniformly random
    # phase to every entry
    S2 *= np.exp(1j*2*np.pi*np.random.rand(S2.shape[0], S2.shape[1]))
    # Add to each row j >= 1 the complex conjugate of row win - j, which
    # fills the upper rows with the mirrored conjugate of the lower rows
    S3 = np.zeros_like(S2)
    S3[0:-1, :] = np.conj(S2[1::, :])
    S2 = S2 + S3[::-1, :]
    y = istft(S2, win, hop)
    return y
# Audio player for the samples exactly as loaded from the wav file, as a
# reference point for the reconstructions played further down.
ipd.Audio(x, rate=sr) # Original Audio
We find that with 200 bins, most of the detail of the audio has been preserved, at a compression factor of about 5x (1025 frequency bins per frame reduced to 200).
# Reconstruct from the 200-band mel spectrogram and compare with the original.
# Estimated magnitude spectrogram, kept here only for display
SInv = (M.T).dot(MS)
# Audio rebuilt from the mel spectrogram
y = invert_MS(M, MS, win, hop)
plt.figure(figsize=(12, 4))
# Only two of the three slots in the 1x3 grid are used
plt.subplot(131)
# Original magnitude spectrogram
plt.imshow(amplitude_to_db(S), cmap='magma', aspect='auto')
plt.ylim([0, S.shape[0]])
plt.subplot(132)
# Estimate recovered from the mel spectrogram, on the same y range
plt.imshow(amplitude_to_db(SInv), cmap='magma', aspect='auto')
plt.ylim([0, S.shape[0]])
# Last expression of the cell: shows a player for the reconstructed audio
ipd.Audio(y, rate=sr)
When we go down to 40 bins, we can just barely hear the notes
# Repeat the mel projection with only 40 bands; M and MS are rebound, while S
# is the magnitude spectrogram computed earlier.
M = get_mel_filterbank(K, win, sr, min_freq, max_freq, n_bins=40)
MS = M.dot(S)
plt.figure(figsize=(12, 4))
plt.subplot(131)
plt.imshow(amplitude_to_db(M), cmap='magma', aspect='auto')
# Setting ylim from 0 upward puts row 0 at the bottom of the image
plt.ylim([0, M.shape[0]])
plt.title("Mel Filterbank ({} x {})".format(*M.shape))
plt.subplot(132)
plt.imshow(amplitude_to_db(S), cmap='magma', aspect='auto')
plt.ylim([0, S.shape[0]])
plt.title("Spectrogram in dB ({} x {})".format(*S.shape))
plt.subplot(133)
# interpolation='none' is requested only for this panel, which has few rows
plt.imshow(amplitude_to_db(MS), cmap='magma', aspect='auto', interpolation='none')
plt.ylim([0, MS.shape[0]])
# NOTE(review): "Specrogram" in the title below is a typo for "Spectrogram"
plt.title("Mel Specrogram ({} x {})".format(*MS.shape))
Text(0.5, 1.0, 'Mel Specrogram (40 x 859)')
# Reconstruct from the 40-band mel spectrogram and compare with the original.
# Estimated magnitude spectrogram, kept here only for display
SInv = (M.T).dot(MS)
# Audio rebuilt from the mel spectrogram
y = invert_MS(M, MS, win, hop)
plt.figure(figsize=(12, 4))
# Only two of the three slots in the 1x3 grid are used
plt.subplot(131)
# Original magnitude spectrogram
plt.imshow(amplitude_to_db(S), cmap='magma', aspect='auto')
plt.ylim([0, S.shape[0]])
plt.subplot(132)
# Estimate recovered from the mel spectrogram, on the same y range
plt.imshow(amplitude_to_db(SInv), cmap='magma', aspect='auto')
plt.ylim([0, S.shape[0]])
# Last expression of the cell: shows a player for the reconstructed audio
ipd.Audio(y, rate=sr)
But when we go down to 20 bins, we can't hear the notes anymore. However, the vocals are still perfectly clear! And we can hear where the beats are and roughly what instruments are playing
# Repeat the mel projection with only 20 bands; M and MS are rebound, while S
# is the magnitude spectrogram computed earlier.
M = get_mel_filterbank(K, win, sr, min_freq, max_freq, n_bins=20)
MS = M.dot(S)
plt.figure(figsize=(12, 4))
plt.subplot(131)
plt.imshow(amplitude_to_db(M), cmap='magma', aspect='auto')
# Setting ylim from 0 upward puts row 0 at the bottom of the image
plt.ylim([0, M.shape[0]])
plt.title("Mel Filterbank ({} x {})".format(*M.shape))
plt.subplot(132)
plt.imshow(amplitude_to_db(S), cmap='magma', aspect='auto')
plt.ylim([0, S.shape[0]])
plt.title("Spectrogram in dB ({} x {})".format(*S.shape))
plt.subplot(133)
# interpolation='none' is requested only for this panel, which has few rows
plt.imshow(amplitude_to_db(MS), cmap='magma', aspect='auto', interpolation='none')
plt.ylim([0, MS.shape[0]])
# NOTE(review): "Specrogram" in the title below is a typo for "Spectrogram"
plt.title("Mel Specrogram ({} x {})".format(*MS.shape))
Text(0.5, 1.0, 'Mel Specrogram (20 x 859)')
# Reconstruct from the 20-band mel spectrogram and compare with the original.
# Estimated magnitude spectrogram, kept here only for display
SInv = (M.T).dot(MS)
# Audio rebuilt from the mel spectrogram
y = invert_MS(M, MS, win, hop)
plt.figure(figsize=(12, 4))
# Only two of the three slots in the 1x3 grid are used
plt.subplot(131)
# Original magnitude spectrogram
plt.imshow(amplitude_to_db(S), cmap='magma', aspect='auto')
plt.ylim([0, S.shape[0]])
plt.subplot(132)
# Estimate recovered from the mel spectrogram, on the same y range
plt.imshow(amplitude_to_db(SInv), cmap='magma', aspect='auto')
plt.ylim([0, S.shape[0]])
# Last expression of the cell: shows a player for the reconstructed audio
ipd.Audio(y, rate=sr)