import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from scipy import signal
import librosa
| Ground Truth Value | One-Hot Vector $y$ |
| --- | --- |
| 0 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] |
| 1 | [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] |
| 2 | [0, 0, 1, 0, 0, 0, 0, 0, 0, 0] |
| 3 | [0, 0, 0, 1, 0, 0, 0, 0, 0, 0] |
| 4 | [0, 0, 0, 0, 1, 0, 0, 0, 0, 0] |
| 5 | [0, 0, 0, 0, 0, 1, 0, 0, 0, 0] |
| 6 | [0, 0, 0, 0, 0, 0, 1, 0, 0, 0] |
| 7 | [0, 0, 0, 0, 0, 0, 0, 1, 0, 0] |
| 8 | [0, 0, 0, 0, 0, 0, 0, 0, 1, 0] |
| 9 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 1] |
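As a concrete illustration (a small sketch, not the code used later in the `Synth` class), a one-hot vector like those in the table can be built in NumPy by indexing a row of the identity matrix:

import numpy as np

def one_hot(label, num_classes=10):
    # Row `label` of the identity matrix is exactly the one-hot vector for that label
    return np.eye(num_classes)[label]

print(one_hot(3))  # [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]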
Given an array $u$ of inputs, the softmax function is defined as the following vector-valued function, which generalizes the logistic function from two classes to an arbitrary number of classes:

$$\mathrm{softmax}(u)_i = \frac{e^{u_i}}{\sum_j e^{u_j}}$$

Since each component is divided by the sum of all of the numerators, the components sum to 1. Hence, the output of a softmax can be interpreted as a probability distribution over the classes: component $i$ is the probability that a particular input belongs to class $i$.
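To make the formula concrete, here is a minimal NumPy sketch of the softmax (it subtracts the maximum before exponentiating, a standard numerical-stability trick that cancels out in the ratio):

import numpy as np

def softmax(u):
    # Subtracting the max keeps the exponentials from overflowing;
    # it cancels in the numerator and denominator, so the result is unchanged
    e = np.exp(u - np.max(u))
    return e / np.sum(e)

u = np.array([2.0, 1.0, 0.1])
p = softmax(u)
print(p)        # approximately [0.659 0.242 0.099]
print(p.sum())  # 1.0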
LABELS = ['bass_electronic', 'bass_synthetic', 'brass_acoustic',
          'flute_acoustic', 'flute_synthetic', 'guitar_acoustic',
          'guitar_electronic', 'keyboard_acoustic',
          'keyboard_electronic', 'keyboard_synthetic',
          'mallet_acoustic', 'organ_electronic', 'reed_acoustic',
          'string_acoustic', 'vocal_acoustic', 'vocal_synthetic']
class Synth(Dataset):
    def __init__(self, audio_filename, labels_filename, sr=8000, sample_len=4):
        """
        Parameters
        ----------
        audio_filename: string
            Path to audio file
        labels_filename: string
            Path to labels file
        sr: int
            Audio sample rate to use
        sample_len: int
            Length of each sample, in seconds
        """
        self.x, self.sr = librosa.load(audio_filename, sr=sr)
        self.labels = np.loadtxt(labels_filename)
        print("Finished loading audio ", audio_filename)
        self.sample_len = sample_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Return a tuple (x, y)
        """
        # Pull out the idx'th clip of sample_len seconds
        L = self.sr*self.sample_len
        x = self.x[L*idx:L*idx+L]
        # One-hot encode the label
        label = int(self.labels[idx])
        y = np.zeros(len(LABELS))
        y[label] = 1
        # Summarize the clip by its flattened MFCC matrix
        x = librosa.feature.mfcc(y=x, sr=self.sr).flatten()
        x = torch.from_numpy(np.array(x, dtype=np.float32))
        y = torch.from_numpy(np.array(y, dtype=np.float32))
        return x, y
sr = 8000
sample_len = 4
data_train = Synth("data/nsynth_valid.mp3", "data/labels_valid.txt", sr, sample_len)
data_test = Synth("data/nsynth_test.mp3", "data/labels_test.txt", sr, sample_len)
Finished loading audio data/nsynth_valid.mp3
Finished loading audio data/nsynth_test.mp3
dim = len(data_train[0][0])
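Here `dim` is the length of the flattened MFCC matrix for a single clip (with librosa's defaults, 20 MFCC coefficients per analysis frame, times the number of frames in a 4-second clip at 8000 Hz). A quick shape check, reusing `data_train` from above:

x0, y0 = data_train[0]
print(x0.shape)  # flattened MFCC feature vector for one clip; dim is this length
print(y0.shape)  # one-hot label vector of length len(LABELS)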
The code below trains the neural network on one dataset and tests it on another, completely disjoint dataset. It is crucial to test on data that is different from the training data, to make sure the network is not simply "memorizing" the training set; that undesirable scenario is referred to as overfitting.
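In this notebook the separation comes for free because the training and test sets are loaded from two different files. If you only had a single labeled collection, one common way to carve out a disjoint held-out set is `torch.utils.data.random_split`; a small sketch under that assumption (not part of the pipeline above):

from torch.utils.data import random_split

# Hypothetical 80/20 split of one dataset into disjoint train/test subsets
n_test = len(data_train) // 5
train_subset, test_subset = random_split(data_train, [len(data_train) - n_test, n_test])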
# Try to use the GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)
## Step 2: Create sequential neural net model (set up a function space)
## TODO: Create your model
model = nn.Sequential(
    nn.Linear(dim, 100),
    nn.ReLU(),
    nn.Linear(100, 200),
    nn.ReLU(),
    nn.Linear(200, len(LABELS))
) ## TODO: More stuff here
model = model.to(device)
# The output of the final linear layer will be passed through a logistic (sigmoid) function inside the loss function below
## Step 3: Set up the loss function
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
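Note that `BCEWithLogitsLoss` applies the logistic (sigmoid) function to the raw scores internally, which is why the network above can end with a plain `nn.Linear` layer. A small sketch of that equivalence, reusing `LABELS` from above (since the targets are one-hot, `nn.CrossEntropyLoss`, which folds a softmax into the loss instead, would be another common choice):

# BCEWithLogitsLoss(logits, targets) matches BCELoss(sigmoid(logits), targets),
# but is more numerically stable
logits = torch.randn(4, len(LABELS))                             # raw scores from a final Linear layer
targets = torch.zeros(4, len(LABELS))
targets[torch.arange(4), torch.randint(len(LABELS), (4,))] = 1   # random one-hot targets
a = nn.BCEWithLogitsLoss()(logits, targets)
b = nn.BCELoss()(torch.sigmoid(logits), targets)
print(torch.allclose(a, b, atol=1e-6))                           # True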
n_epochs = 50 # Each "epoch" is a loop through the entire dataset
# and we use this to update the parameters
losses = []
accuracy = []
train_losses = []
train_accuracy = []
for epoch in range(n_epochs):
    print(".", end="")
    loader = DataLoader(data_train, batch_size=64, shuffle=True)
    train_loss = 0
    train_correct = 0
    for X, Y in loader: # Go through each mini-batch
        # Move inputs/outputs to GPU
        X = X.to(device)
        Y = Y.to(device)
        # Reset the optimizer's gradients
        optimizer.zero_grad()
        # Run the sequential model on all inputs
        Y_est = model(X)
        # Compute the loss function comparing Y_est to Y
        loss = loss_fn(Y_est, Y)
        # Compute the gradients of the loss function with respect
        # to all of the parameters of the model
        loss.backward()
        # Update the parameters based on the gradient and
        # the optimization scheme
        optimizer.step()
        train_loss += loss.item()
        train_correct += torch.sum(torch.argmax(Y, dim=1) == torch.argmax(Y_est, dim=1))
    train_losses.append(train_loss)
    t = train_correct / len(data_train)
    train_accuracy.append(t.detach().cpu())
    # Look at results on the test set (no gradients needed for evaluation)
    with torch.no_grad():
        test_loader = DataLoader(data_test, batch_size=len(data_test))
        inputs, labels = next(iter(test_loader))
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        total_loss = loss_fn(outputs, labels)
        losses.append(total_loss.item())
        num_correct = torch.sum(torch.argmax(labels, dim=1) == torch.argmax(outputs, dim=1))
        num_correct = num_correct.detach().cpu().numpy()
        accuracy.append(num_correct/len(data_test))
    print("Epoch {}, train accuracy {:.3f}, test accuracy {:.3f}".format(epoch, train_accuracy[-1], accuracy[-1]))
Device: cuda
.Epoch 0, train accuracy 0.431, test accuracy 0.650
.Epoch 1, train accuracy 0.737, test accuracy 0.792
.Epoch 2, train accuracy 0.829, test accuracy 0.843
.Epoch 3, train accuracy 0.876, test accuracy 0.884
.Epoch 4, train accuracy 0.896, test accuracy 0.904
.Epoch 5, train accuracy 0.921, test accuracy 0.924
.Epoch 6, train accuracy 0.935, test accuracy 0.931
.Epoch 7, train accuracy 0.947, test accuracy 0.938
.Epoch 8, train accuracy 0.957, test accuracy 0.945
.Epoch 9, train accuracy 0.965, test accuracy 0.952
.Epoch 10, train accuracy 0.967, test accuracy 0.958
.Epoch 11, train accuracy 0.972, test accuracy 0.968
.Epoch 12, train accuracy 0.978, test accuracy 0.967
.Epoch 13, train accuracy 0.978, test accuracy 0.968
.Epoch 14, train accuracy 0.982, test accuracy 0.969
.Epoch 15, train accuracy 0.985, test accuracy 0.977
.Epoch 16, train accuracy 0.987, test accuracy 0.973
.Epoch 17, train accuracy 0.990, test accuracy 0.977
.Epoch 18, train accuracy 0.991, test accuracy 0.984
.Epoch 19, train accuracy 0.993, test accuracy 0.981
.Epoch 20, train accuracy 0.993, test accuracy 0.977
.Epoch 21, train accuracy 0.993, test accuracy 0.985
.Epoch 22, train accuracy 0.996, test accuracy 0.978
.Epoch 23, train accuracy 0.994, test accuracy 0.986
.Epoch 24, train accuracy 0.997, test accuracy 0.984
.Epoch 25, train accuracy 0.996, test accuracy 0.985
.Epoch 26, train accuracy 0.997, test accuracy 0.986
.Epoch 27, train accuracy 0.997, test accuracy 0.988
.Epoch 28, train accuracy 0.998, test accuracy 0.988
.Epoch 29, train accuracy 0.992, test accuracy 0.989
.Epoch 30, train accuracy 0.997, test accuracy 0.986
.Epoch 31, train accuracy 0.998, test accuracy 0.983
.Epoch 32, train accuracy 0.998, test accuracy 0.993
.Epoch 33, train accuracy 0.998, test accuracy 0.990
.Epoch 34, train accuracy 0.999, test accuracy 0.993
.Epoch 35, train accuracy 0.997, test accuracy 0.984
.Epoch 36, train accuracy 0.996, test accuracy 0.975
.Epoch 37, train accuracy 0.992, test accuracy 0.988
.Epoch 38, train accuracy 0.998, test accuracy 0.990
.Epoch 39, train accuracy 0.999, test accuracy 0.989
.Epoch 40, train accuracy 0.999, test accuracy 0.993
.Epoch 41, train accuracy 0.999, test accuracy 0.990
.Epoch 42, train accuracy 0.998, test accuracy 0.988
.Epoch 43, train accuracy 0.999, test accuracy 0.991
.Epoch 44, train accuracy 0.999, test accuracy 0.981
.Epoch 45, train accuracy 0.988, test accuracy 0.989
.Epoch 46, train accuracy 0.999, test accuracy 0.994
.Epoch 47, train accuracy 0.999, test accuracy 0.990
.Epoch 48, train accuracy 0.999, test accuracy 0.991
.Epoch 49, train accuracy 0.999, test accuracy 0.994
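With training finished, a single clip can be classified by taking the argmax of the network's output scores and mapping it back to a name in `LABELS`; a usage sketch reusing `model`, `device`, and `data_test` from above:

with torch.no_grad():
    x, y = data_test[0]                        # one MFCC feature vector and its one-hot label
    scores = model(x.unsqueeze(0).to(device))  # add a batch dimension of size 1
    pred = int(torch.argmax(scores, dim=1).item())
    truth = int(torch.argmax(y).item())
    print("Predicted:", LABELS[pred], "| Ground truth:", LABELS[truth])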
plt.figure()
plt.subplot(211)
plt.plot(train_losses)
plt.plot(losses)
plt.title("Losses")
plt.xlabel("Epoch")
plt.legend(["Train", "Test"])
plt.subplot(212)
plt.plot(np.array(train_accuracy)*100)
plt.plot(np.array(accuracy)*100)
plt.title("Accuracy")
plt.xlabel("Epoch")
plt.legend(["Train", "Test"])
plt.tight_layout()
plt.savefig("ShallowTrain.svg", bbox_inches='tight')
def plot_confusion_matrix(labels, outputs):
    from scipy import sparse
    x1 = torch.argmax(labels, dim=1).detach().cpu()
    x2 = torch.argmax(outputs, dim=1).detach().cpu()
    I = np.array(x1.numpy(), dtype=int)
    J = np.array(x2.numpy(), dtype=int)
    K = len(LABELS)
    # Accumulate counts: entry (i, j) is how many clips of true class i were predicted as class j
    D = sparse.coo_matrix((np.ones(I.size), (I, J)), shape=(K, K))
    D = D.toarray()
    plt.imshow(D)
    plt.xticks(np.arange(K), LABELS, rotation='vertical')
    # Per-class accuracy: diagonal count divided by the row total
    acc = np.diag(D)/np.sum(D, axis=1)
    plt.yticks(np.arange(K), ["{} {:.3f}".format(LABELS[i], acc[i]) for i in range(len(LABELS))])
    plt.ylabel("Ground Truth Label")
    plt.xlabel("Predicted Label")
    correct = 100*np.sum(np.diag(D))/np.sum(D)
    plt.title("{:.3f}% Correct".format(correct))
plot_confusion_matrix(labels, outputs)
plt.savefig("ShallowConfusion.svg", bbox_inches='tight')
# It's a good idea to save our model when we're finished so we don't
# have to do a long training process over again!
# https://pytorch.org/tutorials/beginner/saving_loading_models.html
torch.save(model.state_dict(), "deep.pkl")
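To reuse the saved weights later, the standard pattern from that tutorial is to rebuild the same architecture and load the state dictionary into it; a brief sketch:

# Rebuild the same architecture, then restore the trained parameters
model2 = nn.Sequential(
    nn.Linear(dim, 100),
    nn.ReLU(),
    nn.Linear(100, 200),
    nn.ReLU(),
    nn.Linear(200, len(LABELS))
)
model2.load_state_dict(torch.load("deep.pkl", map_location=device))
model2 = model2.to(device)
model2.eval()  # switch to evaluation mode before making predictions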