import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from scipy import signal
import librosa
| Ground Truth Value | One-Hot Vector $y$ |
| --- | --- |
| 0 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] |
| 1 | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] |
| 2 | [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] |
| 3 | [0, 0, 0, 1, 0, 0, 0, 0, 0, 0] |
| 4 | [0, 0, 0, 0, 1, 0, 0, 0, 0, 0] |
| 5 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0] |
| 6 | [0, 0, 0, 0, 0, 0, 1, 0, 0, 0] |
| 7 | [0, 0, 0, 0, 0, 0, 0, 1, 0, 0] |
| 8 | [0, 0, 0, 0, 0, 0, 0, 0, 1, 0] |
| 9 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] |
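As a concrete illustration (a small sketch, not the code used later in the `Synth` class), a one-hot vector like those in the table can be built in NumPy by indexing a row of the identity matrix:

import numpy as np

def one_hot(label, num_classes=10):
    # Row `label` of the identity matrix is exactly the one-hot vector for that label
    return np.eye(num_classes)[label]

print(one_hot(3))  # [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]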
Given an array $u$ of inputs, the softmax function is defined as the following vector-valued function, which generalizes the logistic function from two classes to an arbitrary number of classes:

$$\mathrm{softmax}(u)_i = \frac{e^{u_i}}{\sum_j e^{u_j}}$$

Since each component is divided by the sum of all of the numerators, the components sum to 1. Hence, the output of a softmax can be interpreted as a probability distribution over the classes: component $i$ is the probability that a particular input belongs to class $i$.
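To make the formula concrete, here is a minimal NumPy sketch of the softmax (it subtracts the maximum before exponentiating, a standard numerical-stability trick that cancels out in the ratio):

import numpy as np

def softmax(u):
    # Subtracting the max keeps the exponentials from overflowing;
    # it cancels in the numerator and denominator, so the result is unchanged
    e = np.exp(u - np.max(u))
    return e / np.sum(e)

u = np.array([2.0, 1.0, 0.1])
p = softmax(u)
print(p)        # approximately [0.659 0.242 0.099]
print(p.sum())  # 1.0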
LABELS = ['bass_electronic', 'bass_synthetic', 'brass_acoustic',
          'flute_acoustic', 'flute_synthetic', 'guitar_acoustic',
          'guitar_electronic', 'keyboard_acoustic',
          'keyboard_electronic', 'keyboard_synthetic',
          'mallet_acoustic', 'organ_electronic', 'reed_acoustic',
          'string_acoustic', 'vocal_acoustic', 'vocal_synthetic']
class Synth(Dataset):
    def __init__(self, audio_filename, labels_filename, sr=8000, sample_len=4):
        """
        Parameters
        ----------
        audio_filename: string
            Path to audio file
        labels_filename: string
            Path to labels file
        sr: int
            Audio sample rate to use
        sample_len: int
            Length of each sample, in seconds
        """
        self.x, self.sr = librosa.load(audio_filename, sr=sr)
        self.labels = np.loadtxt(labels_filename)
        print("Finished loading audio ", audio_filename)
        self.sample_len = sample_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Return a tuple (x, y)
        """
        # Pull out the idx'th clip of sample_len seconds
        L = self.sr*self.sample_len
        x = self.x[L*idx:L*idx+L]
        # One-hot encode the label
        label = int(self.labels[idx])
        y = np.zeros(len(LABELS))
        y[label] = 1
        # Summarize the clip by its flattened MFCC matrix
        x = librosa.feature.mfcc(y=x, sr=self.sr).flatten()
        x = torch.from_numpy(np.array(x, dtype=np.float32))
        y = torch.from_numpy(np.array(y, dtype=np.float32))
        return x, y
sr = 8000
sample_len = 4
data_train = Synth("data/nsynth_valid.mp3", "data/labels_valid.txt", sr, sample_len)
data_test = Synth("data/nsynth_test.mp3", "data/labels_test.txt", sr, sample_len)
Finished loading audio data/nsynth_valid.mp3
Finished loading audio data/nsynth_test.mp3
dim = len(data_train[0][0])
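Here `dim` is the length of the flattened MFCC matrix for a single clip (with librosa's defaults, 20 MFCC coefficients per analysis frame, times the number of frames in a 4-second clip at 8000 Hz). A quick shape check, reusing `data_train` from above:

x0, y0 = data_train[0]
print(x0.shape)  # flattened MFCC feature vector for one clip; dim is this length
print(y0.shape)  # one-hot label vector of length len(LABELS)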
The code below trains the neural network on one dataset and tests it on another, completely disjoint dataset. It is crucial to test on data that is different from the training data, to make sure the network is not simply "memorizing" the training set; that undesirable scenario is referred to as overfitting.
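In this notebook the separation comes for free because the training and test sets are loaded from two different files. If you only had a single labeled collection, one common way to carve out a disjoint held-out set is `torch.utils.data.random_split`; a small sketch under that assumption (not part of the pipeline above):

from torch.utils.data import random_split

# Hypothetical 80/20 split of one dataset into disjoint train/test subsets
n_test = len(data_train) // 5
train_subset, test_subset = random_split(data_train, [len(data_train) - n_test, n_test])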
# Try to use the GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)
## Step 2: Create sequential neural net model (set up a function space)
## TODO: Create your model
model = nn.Sequential(
    nn.Linear(dim, 100),
    nn.ReLU(),
    nn.Linear(100, 200),
    nn.ReLU(),
    nn.Linear(200, len(LABELS))
) ## TODO: More stuff here
model = model.to(device)
# The output of the final linear layer will be passed through a logistic (sigmoid) function inside the loss function below
## Step 3: Set up the loss function
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
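Note that `BCEWithLogitsLoss` applies the logistic (sigmoid) function to the raw scores internally, which is why the network above can end with a plain `nn.Linear` layer. A small sketch of that equivalence, reusing `LABELS` from above (since the targets are one-hot, `nn.CrossEntropyLoss`, which folds a softmax into the loss instead, would be another common choice):

# BCEWithLogitsLoss(logits, targets) matches BCELoss(sigmoid(logits), targets),
# but is more numerically stable
logits = torch.randn(4, len(LABELS))                             # raw scores from a final Linear layer
targets = torch.zeros(4, len(LABELS))
targets[torch.arange(4), torch.randint(len(LABELS), (4,))] = 1   # random one-hot targets
a = nn.BCEWithLogitsLoss()(logits, targets)
b = nn.BCELoss()(torch.sigmoid(logits), targets)
print(torch.allclose(a, b, atol=1e-6))                           # True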
n_epochs = 50 # Each "epoch" is a loop through the entire dataset
# and we use this to update the parameters
losses = []
accuracy = []
train_losses = []
train_accuracy = []
for epoch in range(n_epochs):
    print(".", end="")
    loader = DataLoader(data_train, batch_size=64, shuffle=True)
    train_loss = 0
    train_correct = 0
    for X, Y in loader: # Go through each mini-batch
        # Move inputs/outputs to GPU
        X = X.to(device)
        Y = Y.to(device)
        # Reset the optimizer's gradients
        optimizer.zero_grad()
        # Run the sequential model on all inputs
        Y_est = model(X)
        # Compute the loss function comparing Y_est to Y
        loss = loss_fn(Y_est, Y)
        # Compute the gradients of the loss function with respect
        # to all of the parameters of the model
        loss.backward()
        # Update the parameters based on the gradient and
        # the optimization scheme
        optimizer.step()
        train_loss += loss.item()
        train_correct += torch.sum(torch.argmax(Y, dim=1) == torch.argmax(Y_est, dim=1))
    train_losses.append(train_loss)
    t = train_correct / len(data_train)
    train_accuracy.append(t.detach().cpu())
    # Look at results on the test set (no gradients needed for evaluation)
    with torch.no_grad():
        test_loader = DataLoader(data_test, batch_size=len(data_test))
        inputs, labels = next(iter(test_loader))
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        total_loss = loss_fn(outputs, labels)
        losses.append(total_loss.item())
        num_correct = torch.sum(torch.argmax(labels, dim=1) == torch.argmax(outputs, dim=1))
        num_correct = num_correct.detach().cpu().numpy()
        accuracy.append(num_correct/len(data_test))
    print("Epoch {}, train accuracy {:.3f}, test accuracy {:.3f}".format(epoch, train_accuracy[-1], accuracy[-1]))
Device: cuda
.Epoch 0, train accuracy 0.431, test accuracy 0.650
.Epoch 1, train accuracy 0.737, test accuracy 0.792
.Epoch 2, train accuracy 0.829, test accuracy 0.843
.Epoch 3, train accuracy 0.876, test accuracy 0.884
.Epoch 4, train accuracy 0.896, test accuracy 0.904
.Epoch 5, train accuracy 0.921, test accuracy 0.924
.Epoch 6, train accuracy 0.935, test accuracy 0.931
.Epoch 7, train accuracy 0.947, test accuracy 0.938
.Epoch 8, train accuracy 0.957, test accuracy 0.945
.Epoch 9, train accuracy 0.965, test accuracy 0.952
.Epoch 10, train accuracy 0.967, test accuracy 0.958
.Epoch 11, train accuracy 0.972, test accuracy 0.968
.Epoch 12, train accuracy 0.978, test accuracy 0.967
.Epoch 13, train accuracy 0.978, test accuracy 0.968
.Epoch 14, train accuracy 0.982, test accuracy 0.969
.Epoch 15, train accuracy 0.985, test accuracy 0.977
.Epoch 16, train accuracy 0.987, test accuracy 0.973
.Epoch 17, train accuracy 0.990, test accuracy 0.977
.Epoch 18, train accuracy 0.991, test accuracy 0.984
.Epoch 19, train accuracy 0.993, test accuracy 0.981
.Epoch 20, train accuracy 0.993, test accuracy 0.977
.Epoch 21, train accuracy 0.993, test accuracy 0.985
.Epoch 22, train accuracy 0.996, test accuracy 0.978
.Epoch 23, train accuracy 0.994, test accuracy 0.986
.Epoch 24, train accuracy 0.997, test accuracy 0.984
.Epoch 25, train accuracy 0.996, test accuracy 0.985
.Epoch 26, train accuracy 0.997, test accuracy 0.986
.Epoch 27, train accuracy 0.997, test accuracy 0.988
.Epoch 28, train accuracy 0.998, test accuracy 0.988
.Epoch 29, train accuracy 0.992, test accuracy 0.989
.Epoch 30, train accuracy 0.997, test accuracy 0.986
.Epoch 31, train accuracy 0.998, test accuracy 0.983
.Epoch 32, train accuracy 0.998, test accuracy 0.993
.Epoch 33, train accuracy 0.998, test accuracy 0.990
.Epoch 34, train accuracy 0.999, test accuracy 0.993
.Epoch 35, train accuracy 0.997, test accuracy 0.984
.Epoch 36, train accuracy 0.996, test accuracy 0.975
.Epoch 37, train accuracy 0.992, test accuracy 0.988
.Epoch 38, train accuracy 0.998, test accuracy 0.990
.Epoch 39, train accuracy 0.999, test accuracy 0.989
.Epoch 40, train accuracy 0.999, test accuracy 0.993
.Epoch 41, train accuracy 0.999, test accuracy 0.990
.Epoch 42, train accuracy 0.998, test accuracy 0.988
.Epoch 43, train accuracy 0.999, test accuracy 0.991
.Epoch 44, train accuracy 0.999, test accuracy 0.981
.Epoch 45, train accuracy 0.988, test accuracy 0.989
.Epoch 46, train accuracy 0.999, test accuracy 0.994
.Epoch 47, train accuracy 0.999, test accuracy 0.990
.Epoch 48, train accuracy 0.999, test accuracy 0.991
.Epoch 49, train accuracy 0.999, test accuracy 0.994
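With training finished, a single clip can be classified by taking the argmax of the network's output scores and mapping it back to a name in `LABELS`; a usage sketch reusing `model`, `device`, and `data_test` from above:

with torch.no_grad():
    x, y = data_test[0]                        # one MFCC feature vector and its one-hot label
    scores = model(x.unsqueeze(0).to(device))  # add a batch dimension of size 1
    pred = int(torch.argmax(scores, dim=1).item())
    truth = int(torch.argmax(y).item())
    print("Predicted:", LABELS[pred], "| Ground truth:", LABELS[truth])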
plt.figure()
plt.subplot(211)
plt.plot(train_losses)
plt.plot(losses)
plt.title("Losses")
plt.xlabel("Epoch")
plt.legend(["Train", "Test"])
plt.subplot(212)
plt.plot(np.array(train_accuracy)*100)
plt.plot(np.array(accuracy)*100)
plt.title("Accuracy")
plt.xlabel("Epoch")
plt.legend(["Train", "Test"])
plt.tight_layout()
plt.savefig("ShallowTrain.svg", bbox_inches='tight')
def plot_confusion_matrix(labels, outputs):
    from scipy import sparse
    x1 = torch.argmax(labels, dim=1).detach().cpu()
    x2 = torch.argmax(outputs, dim=1).detach().cpu()
    I = np.array(x1.numpy(), dtype=int)
    J = np.array(x2.numpy(), dtype=int)
    K = len(LABELS)
    # Accumulate counts: entry (i, j) is how many clips of true class i were predicted as class j
    D = sparse.coo_matrix((np.ones(I.size), (I, J)), shape=(K, K))
    D = D.toarray()
    plt.imshow(D)
    plt.xticks(np.arange(K), LABELS, rotation='vertical')
    # Per-class accuracy: diagonal count divided by the row total
    acc = np.diag(D)/np.sum(D, axis=1)
    plt.yticks(np.arange(K), ["{} {:.3f}".format(LABELS[i], acc[i]) for i in range(len(LABELS))])
    plt.ylabel("Ground Truth Label")
    plt.xlabel("Predicted Label")
    correct = 100*np.sum(np.diag(D))/np.sum(D)
    plt.title("{:.3f}% Correct".format(correct))
plot_confusion_matrix(labels, outputs)
plt.savefig("ShallowConfusion.svg", bbox_inches='tight')
# It's a good idea to save our model when we're finished so we don't
# have to do a long training process over again!
# https://pytorch.org/tutorials/beginner/saving_loading_models.html
torch.save(model.state_dict(), "deep.pkl")
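To reuse the saved weights later, the standard pattern from that tutorial is to rebuild the same architecture and load the state dictionary into it; a brief sketch:

# Rebuild the same architecture, then restore the trained parameters
model2 = nn.Sequential(
    nn.Linear(dim, 100),
    nn.ReLU(),
    nn.Linear(100, 200),
    nn.ReLU(),
    nn.Linear(200, len(LABELS))
)
model2.load_state_dict(torch.load("deep.pkl", map_location=device))
model2 = model2.to(device)
model2.eval()  # switch to evaluation mode before making predictions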