# imports, and build vocabulary
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
 
# import data
words = open('data/names.txt', 'r').read().splitlines()
 
# build the vocabulary of characters, and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
# build_dataset(): builds training examples X and labels Y from whichever list of names is passed in (so it can be reused per split)
block_size = 3 # context length: how many characters do we take to predict the next one?
 
def build_dataset(words):  
    X, Y = [], [] # X: NN input training examples, Y: labels for each input in X
    
    for w in words:
        #print(w)
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            #print(''.join(itos[i] for i in context), '--->', itos[ix])
            context = context[1:] + [ix] # crop and append
 
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

Overfitting

Lower training loss does not necessarily mean a better model. A neural network (e.g. an MLP) with more parameters has greater capacity, and a larger model can achieve a lower training loss simply by memorising the training data rather than learning its general structure.

If a model's lower training loss is driven by overfitting, it will perform worse on unseen data despite the seemingly better number. It may also only be able to reproduce verbatim examples from the training data instead of generating new outputs.
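To make the capacity point concrete, here is a minimal sketch that counts the parameters of the MLP used below as a function of embedding size and hidden-layer width (the helper name n_params is made up; it assumes the 27-character vocabulary and the block_size of 3 defined above):

# sketch: how parameter count (capacity) grows with embedding size and hidden width
def n_params(emb_dim, n_hidden, vocab_size=27, block_size=3):
    C  = vocab_size * emb_dim              # embedding lookup table
    W1 = (block_size * emb_dim) * n_hidden # hidden layer weights
    b1 = n_hidden                          # hidden layer biases
    W2 = n_hidden * vocab_size             # output layer weights
    b2 = vocab_size                        # output layer biases
    return C + W1 + b1 + W2 + b2

print(n_params(2, 100))   # 3481, the model trained below
print(n_params(10, 300))  # a much larger-capacity variant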

Solution: evaluate on held-out data to make a fair comparison. The following split of the full dataset is common:

  • 80% training split: Training / optimising model parameters via gradient descent
  • 10% dev / validation split: Tune hyperparameters (e.g. hidden layer size 100, embedding size 2, learning rate 0.1, regularisation strength)
  • 10% test split: Evaluate overall model performance

Train, validation, test splits

# randomly shuffle the words dataset, then create train, val, test splits
import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words)) # cut index at 80% of the shuffled words: training split is words[:n1]
n2 = int(0.9*len(words)) # cut index at 90% of the shuffled words: validation split is words[n1:n2]
 
Xtr, Ytr = build_dataset(words[:n1])     # 80% training set (Xtr: training examples, Ytr: training labels)
Xdev, Ydev = build_dataset(words[n1:n2]) # 10% validation set
Xte, Yte = build_dataset(words[n2:])     # 10% test set
 
torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])
# initialise all 3,481 params: C, W1, b1, W2, b2
g = torch.Generator().manual_seed(2147483647) # for reproducibility
 
# define parameters (3,481 in total)
C = torch.randn((27, 2), generator=g)         # embedding matrix (lookup table for input tokens)
W1 = torch.randn((6, 100), generator=g)       # hidden layer's incoming weights: 6 inputs to layer, 100 hidden neurons in layer 
b1 = torch.randn(100, generator=g)            # 100 biases live "in" hidden layer's neurons
W2 = torch.randn((100, 27), generator=g)      # output layer's incoming weights: 100 inputs to layer, 27 output neurons in layer
b2 = torch.randn(27, generator=g)             # 27 biases live "in" output layer's neurons
 
parameters = [C, W1, b1, W2, b2]              # list of all parameters (makes easier to count)
print('num. of parameters:', sum(p.nelement() for p in parameters))  # total parameter count in network: 3,481
 
# ensure all 3,481 parameters have gradient (to enable optimisation)
for p in parameters:
    p.requires_grad = True
num. of parameters: 3481

Run 1: Train for 30,000 iters at lr = 0.1

Compare training data loss (1st code cell) to validation data loss (2nd code cell).

# 30,000 training iters on the training split only (Xtr, Ytr), using mini-batches of 32 examples each
 
lr = 0.1 # initial lr, based on testing for "sweet spot" in previous file
 
for i in range(30000):
    
    # minibatch construct: 32 indices for this batch
    ix = torch.randint(0, Xtr.shape[0], (32,))
    
    # forward pass
    emb = C[Xtr[ix]]                           # mini-batch of embeddings: (32, 3, 2) -> (32, 6) next line via emb.view(-1, 6)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)  # (32, 100)
    logits = h @ W2 + b2                       # (32, 27)
    loss = F.cross_entropy(logits, Ytr[ix])
    # print(loss.item())
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # gradient descent update
    for p in parameters:
        p.data += -lr * p.grad
 
# forward pass on the full train split (Xtr, Ytr): clean loss number showing true model progress
emb = C[Xtr]                               # (182625, 3, 2) -> (182625, 6) next line via emb.view(-1, 6)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)  # (182625, 100)
logits = h @ W2 + b2                       # (182625, 27)
loss = F.cross_entropy(logits, Ytr)
print('training Run 1 (lr = 0.1): 30,000 iters on training split: (Xtr -> Ytr)\ntraining loss:',loss.item())
training Run 1 (lr = 0.1): 30,000 iters on training split: (Xtr -> Ytr)
training loss: 2.4073026180267334
# forward pass on the val split (Xdev, Ydev): clean loss number showing true model progress on unseen data
emb = C[Xdev]                              # (22655, 3, 2) -> (22655, 6) next line via emb.view(-1, 6)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)  # (22655, 100)
logits = h @ W2 + b2                       # (22655, 27)
loss = F.cross_entropy(logits, Ydev)
print('validation (Xdev -> Ydev) loss:', loss.item())
validation (Xdev -> Ydev) loss: 2.4029619693756104

Run 2: Train for 10,000 iters at lr = 0.01

Compare training data loss (1st code cell) to validation data loss (2nd code cell).

# decay lr to 0.01 and continue training: 10,000 more iters on the training split (Xtr, Ytr)
 
lr = 0.01 # lr decayed by a factor of 10
 
for i in range(10000):
    
    # minibatch construct: 32 indices for this batch
    ix = torch.randint(0, Xtr.shape[0], (32,))
    
    # forward pass
    emb = C[Xtr[ix]]                           # mini-batch of embeddings: (32, 3, 2) -> (32, 6) next line via emb.view(-1, 6)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)  # (32, 100)
    logits = h @ W2 + b2                       # (32, 27)
    loss = F.cross_entropy(logits, Ytr[ix])
    # print(loss.item())
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # gradient descent update
    for p in parameters:
        p.data += -lr * p.grad
        
# forward pass on the full train split (Xtr, Ytr): clean loss number showing true model progress
emb = C[Xtr]                               # (182625, 3, 2) -> (182625, 6) next line via emb.view(-1, 6)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)  # (182625, 100)
logits = h @ W2 + b2                       # (182625, 27)
loss = F.cross_entropy(logits, Ytr)
print('training Run 2 (lr = 0.01): 10,000 iters on training split: (Xtr -> Ytr)\ntraining loss:',loss.item())
training Run 2 (lr = 0.01): 10,000 iters on training split: (Xtr -> Ytr)
training loss: 2.3339736461639404
# forward pass on the val split (Xdev, Ydev): clean loss number showing true model progress on unseen data
emb = C[Xdev]                              # (22655, 3, 2) -> (22655, 6) next line via emb.view(-1, 6)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)  # (22655, 100)
logits = h @ W2 + b2                       # (22655, 27)
loss = F.cross_entropy(logits, Ydev)
print('validation (Xdev -> Ydev) loss:', loss.item())
validation (Xdev -> Ydev) loss: 2.332975149154663

Interpreting train vs. validation loss

In both runs above the training and validation losses are nearly identical (2.41 vs. 2.40 after Run 1, 2.33 vs. 2.33 after Run 2), so this small model is not overfitting; if anything, it is underfitting.

| Train loss | Val loss | Gap   | Diagnosis                                   | Action                            |
|------------|----------|-------|---------------------------------------------|-----------------------------------|
| High       | High     | Small | Underfitting: model lacks capacity          | Scale up model, train longer      |
| Low        | High     | Large | Overfitting: model memorising training data | Scale up data, add regularisation |
| Low        | Low      | Small | Good fit                                    |                                   |
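The two evaluation cells above can be folded into a single helper that reports the train/val gap directly. This is a minimal sketch only, assuming the parameters C, W1, b1, W2, b2 and the splits built earlier; the helper name split_loss is made up here.

# sketch: evaluate the clean loss on any split without tracking gradients
@torch.no_grad()
def split_loss(X, Y):
    emb = C[X]                                 # (N, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)  # (N, 100)
    logits = h @ W2 + b2                       # (N, 27)
    return F.cross_entropy(logits, Y).item()

tr, dev = split_loss(Xtr, Ytr), split_loss(Xdev, Ydev)
print(f'train loss {tr:.4f} | val loss {dev:.4f} | gap {dev - tr:.4f}')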

Sources