# imports, build vocabulary, build_dataset function, create train/val/test data split.
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
 
# import data
words = open('data/names.txt', 'r').read().splitlines()
 
# build the vocabulary of characters, and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
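# e.g. stoi['a'] == 1 ... stoi['z'] == 26, and stoi['.'] == 0; itos inverts this (itos[1] == 'a')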
 
# fn: build dataset (training examples X, and labels Y) for an INPUT list of names only 
block_size = 3 # context length: how many characters do we take to predict the next one?
 
def build_dataset(words):  
    X, Y = [], [] # X: NN input training examples, Y: labels for each input in X
    
    for w in words:
        #print(w)
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            #print(''.join(itos[i] for i in context), '--->', itos[ix])
            context = context[1:] + [ix] # crop and append
 
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y
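
# worked example: for w = 'emma' with block_size 3, the loop above emits five
# (context -> label) pairs: ... -> e, ..e -> m, .em -> m, emm -> a, mma -> .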
 
# randomly shuffle words data set, and create train, val, test splits
import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words)) # index at the 80% mark (words[:n1] is the first 80% of words)
n2 = int(0.9*len(words)) # index at the 90% mark (words[n1:n2] is the next 10% of words)
 
Xtr, Ytr = build_dataset(words[:n1])     # 80% training set (Xtr: training examples, Ytr: training labels)
Xdev, Ydev = build_dataset(words[n1:n2]) # 10% validation set
Xte, Yte = build_dataset(words[n2:])     # 10% test set
torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])

Increase embedding dimensions: 2 -> 10 dims

Also reduce hidden layer size slightly: 300 -> 200 neurons

Changes to embedding matrix

  • Still 27 characters, now embedded into 10-dim space, so C is now a (27, 10) matrix

Changes to hidden layer parameters W1 and b1

  • W1 : Hidden layer’s (incoming) weights matrix
    • arg 1: 6 -> 30 inputs to the hidden layer: three embedding vectors, each with ten embedding dims
    • arg 2: 100 -> 300 -> 200 (hidden) neurons in this (hidden) layer: design parameter
  • b1 : Hidden layer’s bias vector (lives “in” the layer’s neurons)
    • gets broadcast across however many examples are in the batch (32 per mini-batch here; 182,625 for a full-training-split forward pass), as sketched below
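
A minimal broadcasting sketch (standalone, reusing the torch import from the top of the notebook; shapes are illustrative):

# broadcasting sketch: a (200,) bias is added to every row of an (N, 200) matrix
x = torch.randn(32, 200)           # 32 examples, 200 hidden pre-activations each
b = torch.randn(200)               # one bias per hidden neuron
assert (x + b).shape == (32, 200)  # b is broadcast across the 32 rows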

Change output layer parameter W2

  • W2 : Output layer’s (incoming) weights matrix
    • arg 1: 100 -> 300 -> 200 neurons coming into this output layer from previous (hidden) layer
    • arg 2: 27 (output) neurons in the this (output) layer: 27 possible next characters
  • b2 : Output layer bias vector (lives “in” the layer’s neurons)

So the total parameter count goes from 3,481 -> 10,281 -> 11,897
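
A quick hand-check of that final count (a standalone sketch using the layer shapes above):

# hand-check: parameter count with 10-dim embeddings and 200 hidden neurons
c_params  = 27 * 10    # C:  270
w1_params = 30 * 200   # W1: 6,000
b1_params = 200        # b1: 200
w2_params = 200 * 27   # W2: 5,400
b2_params = 27         # b2: 27
print(c_params + w1_params + b1_params + w2_params + b2_params)  # 11897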

# increase embedding dimension (2 -> 10 dims) and decrease hidden layer size: 300 -> 200 neurons
g = torch.Generator().manual_seed(2147483647) # for reproducibility
 
# define parameters
C = torch.randn((27, 10), generator=g)        # NOW: 10-dim embeddings
W1 = torch.randn((30, 200), generator=g)      # hidden layer's incoming weights: NOW 30 inputs to layer, NOW 200 hidden neurons in layer 
b1 = torch.randn(200, generator=g)            # NOW 200 biases live "in" hidden layer's neurons
W2 = torch.randn((200, 27), generator=g)      # output layer's incoming weights: NOW 200 inputs to layer, 27 output neurons in layer
b2 = torch.randn(27, generator=g)             # 27 biases live "in" output layer's neurons
 
parameters = [C, W1, b1, W2, b2]              # list of all parameters (makes easier to count)
print('num. of parameters:', sum(p.nelement() for p in parameters))  # total parameter count in network: 11,897
 
# ensure all 11,897 parameters have gradients (to enable optimisation)
for p in parameters:
    p.requires_grad = True
 
lossi = []    # track resulting loss on each iter
loglossi = [] # track resulting log-loss on each iter (better plot)
stepi = []    # track steps
num. of parameters: 11897

Run 1: Train for 50,000 iters at lr = 0.1

Handling the W1 and emb shape (dims) mismatch with PyTorch's emb.view() (calculating h)

  • The embedded training examples emb are now of shape:
    • 32 examples per mini-batch (or 182,625 for a full-training-split forward pass),
    • 3 tokens per example,
    • 10 embedding dimensions (each token is now embedded in 10D, up from 2D)
  • but W1:
    • no longer expects 6 inputs (3 input tokens with 2 embedding dims each)
    • now expects 30 inputs (3 input tokens with 10 embedding dims each)
  • We need to create a view: emb.view(-1, 30),
    • arg 1: -1 infers the size of dimension 0
    • arg 2: we specify the size of dimension 1 (30 inputs coming into this layer, up from 6)
    • Hence, all elements are accounted for (see the shape-check sketch below).
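
A minimal shape-check sketch (standalone, with made-up random data) of what .view(-1, 30) does here:

# .view(-1, 30) flattens each example's 3 ten-dim token embeddings into one row
emb = torch.randn(32, 3, 10)    # a pretend mini-batch of embedded examples
flat = emb.view(-1, 30)         # (32, 30); -1 infers the batch dimension
# equivalent to concatenating the three token embeddings side by side:
alt = torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], dim=1)
assert torch.equal(flat, alt)
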
# Run 1: 50,000 training iters on (Xtr, Ytr), mini-batches of 32 examples each.
 
for i in range(50000):
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (32,))
    
    # forward pass
    emb = C[Xtr[ix]] # (32, 3, 10)
    h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 200)
    logits = h @ W2 + b2 # (32, 27)
    loss = F.cross_entropy(logits, Ytr[ix])
    #print(loss.item())
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update
    lr = 0.1
    for p in parameters:
        p.data += -lr * p.grad
 
    # track stats
    stepi.append(i)
    lossi.append(loss.item())
    loglossi.append(loss.log10().item())

Plotting loss and log-loss (y-axis) vs step / iteration count (x-axis).

  • The raw loss curve is usually an extreme, near-vertical hockey-stick shape.
  • Log-loss gives a nicer visualisation by squashing the curve vertically (e.g. log10 maps a loss of 10 to 1.0 and a loss of 2 to ~0.3).
# plot loss / log-loss (y-axis) vs iteration count (x-axis)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
 
ax1.plot(stepi, lossi)
ax1.set_title('Loss vs Iterations')
ax1.set_xlabel('Iteration')
ax1.set_ylabel('Loss')
 
ax2.plot(stepi, loglossi)
ax2.set_title('Log-Loss vs Iterations')
ax2.set_xlabel('Iteration')
ax2.set_ylabel('Log-Loss')
 
plt.tight_layout()
plt.show()
[figure: loss (left) and log-loss (right) vs iteration]

Compare training loss to val loss

# compare train loss vs val (dev) loss
# forward pass full train split (Xtr, Ytr): clean loss number showing true model progress
emb = C[Xtr]                               # (182625, 3, 10) -> (182625, 30) via emb.view(-1, 30) on the next line
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (182625, 200)
logits = h @ W2 + b2                       # (182625, 27)
loss = F.cross_entropy(logits, Ytr)
print('Training Run 1 (lr = 0.1): 50,000 iters on (Xtr, Ytr)\n\ntraining loss:',loss.item())
 
# forward pass val split (Xdev, Ydev): clean loss number showing true model progress on unseen data
emb = C[Xdev]                              # (22655, 3, 10) -> (22655, 30) via emb.view(-1, 30) on the next line
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (22655, 200)
logits = h @ W2 + b2                       # (22655, 27)
loss = F.cross_entropy(logits, Ydev)
print('validation (Xdev -> Ydev) loss:', loss.item())
Training Run 1 (lr = 0.1): 50,000 iters on (Xtr, Ytr)
 
training loss: 2.363924980163574
validation (Xdev -> Ydev) loss: 2.384514808654785

Continue training

Run 2: Train for another 50,000 iters at lr = 0.01

Dynamically update the learning rate: lr = 0.01 if i < 50000 else 0.005. (Note: since this run spans only 50,000 iters, the else branch never fires and lr stays at 0.01 throughout; the 0.005 rate would kick in if the loop were extended past 50,000 iters.)
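
As a general pattern (a hypothetical helper, not used in the run below), a step-decay schedule maps the iteration count to a learning rate:

# hypothetical step-decay schedule: iteration -> learning rate
def step_lr(i, boundaries=(50000,), rates=(0.01, 0.005)):
    for b, r in zip(boundaries, rates):
        if i < b:
            return r
    return rates[-1]          # past the last boundary, use the final rate

assert step_lr(10000) == 0.01 and step_lr(60000) == 0.005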

# Run 2: 50,000 more training iters on (Xtr, Ytr), mini-batches of 32 examples each.
 
for i in range(50000):
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (32,))
    
    # forward pass
    emb = C[Xtr[ix]] # (32, 3, 10)
    h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 200)
    logits = h @ W2 + b2 # (32, 27)
    loss = F.cross_entropy(logits, Ytr[ix])
    #print(loss.item())
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update
    # lr = 0.01
    lr = 0.01 if i < 50000 else 0.005
    for p in parameters:
        p.data += -lr * p.grad
 
    # track stats
    stepi.append(i)
    lossi.append(loss.item())
    loglossi.append(loss.log10().item())

Compare training loss to val loss

# compare train loss vs val (dev) loss
# forward pass full train split (Xtr, Ytr): clean loss number showing true model progress
emb = C[Xtr]                               # (182625, 3, 10) -> (182625, 30) via emb.view(-1, 30) on the next line
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (182625, 200)
logits = h @ W2 + b2                       # (182625, 27)
loss = F.cross_entropy(logits, Ytr)
print('Training Run 2 (lr = 0.01): 50,000 iters on (Xtr, Ytr)\n\ntraining loss:',loss.item())
 
# forward pass val split (Xdev, Ydev): clean loss number showing true model progress on unseen data
emb = C[Xdev]                              # (22655, 3, 10) -> (22655, 30) via emb.view(-1, 30) on the next line
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (22655, 200)
logits = h @ W2 + b2                       # (22655, 27)
loss = F.cross_entropy(logits, Ydev)
print('validation (Xdev -> Ydev) loss:', loss.item())
Training Run 2 (lr = 0.01): 50,000 iters on (Xtr, Ytr)
 
training loss: 2.1698460578918457
validation (Xdev -> Ydev) loss: 2.19392991065979

Finding: Check for overfitting

Note Run 2 above. Training loss was ~2.17, validation loss was ~2.19. The small train/val gap hints that the network is slightly overfitting the data (i.e. memorising specific training examples rather than general structure), but not massively!

Summary notes

  • In practice, hyperparameter tuning is done systematically via experimentation and automated jobs, not in this haphazard manual way.
  • Given the loss reduced further in this experiment (without evidence of significant overfitting), the original 2-dim embedding vectors were likely too constraining.

Next steps to further improve model performance

  • Adjust hidden layer size (number of neurons)
  • Adjust embedding dimensions
  • Increase context length (i.e. feed more than 3 tokens/characters into the model)
  • Increase number of layers
  • Change hyperparameters:
    • Change number of training iterations
    • Change batch size
    • Change initial learning rate
    • Change learning rate evolution/decay over time

Using the test set

Be very sparing with using the test set. Ideally only use this once, at the very end.
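
For completeness, a sketch of that one-time evaluation, mirroring the train/val forward passes above:

# one-time test-set evaluation: use sparingly!
emb = C[Xte]                               # (22866, 3, 10)
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (22866, 200)
logits = h @ W2 + b2                       # (22866, 27)
loss = F.cross_entropy(logits, Yte)
print('test (Xte -> Yte) loss:', loss.item())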

Sampling from the model

# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)
 
for _ in range(20):
    
    out = []
    context = [0] * block_size # initialise context with all '.' tokens (index 0)
    while True:
      emb = C[torch.tensor([context])] # (1,block_size,d)
      h = torch.tanh(emb.view(1, -1) @ W1 + b1)
      logits = h @ W2 + b2
      probs = F.softmax(logits, dim=1)
      ix = torch.multinomial(probs, num_samples=1, generator=g).item()
      context = context[1:] + [ix]
      out.append(ix)
      if ix == 0:      # break when the '.' end-of-word token is generated
        break
    
    print(''.join(itos[i] for i in out))
carmah.
amori.
kif.
jari.
reh.
caslanden.
jazhet.
faverynt.
kaqui.
nellara.
chaiir.
kaleigh.
ham.
jory.
quintin.
lilea.
jadbi.
wapelo.
dearynix.
kael.

Sources