# imports, build vocabulary, build_dataset function, create train/val/test data split.
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
# import data
words = open('data/names.txt', 'r').read().splitlines()
# build the vocabulary of characters, and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
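# e.g. stoi['a'] == 1, ..., stoi['z'] == 26, stoi['.'] == 0, and itos inverts the mapping
# (itos[1] == 'a'); vocabulary size is 27: 26 letters + the '.' start/end token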
# fn: build dataset (training examples X, and labels Y) for an INPUT list of names only
block_size = 3 # context length: how many characters do we take to predict the next one?
def build_dataset(words):
    X, Y = [], [] # X: NN input training examples, Y: labels for each input in X
    for w in words:
        #print(w)
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            #print(''.join(itos[i] for i in context), '--->', itos[ix])
            context = context[1:] + [ix] # crop and append
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y
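# example trace, for w = 'emma' with block_size = 3 (contexts rendered via itos):
#   ... ---> e, ..e ---> m, .em ---> m, emm ---> a, mma ---> .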
# randomly shuffle words data set, and create train, val, test splits
import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words)) # index at the 80% cutoff: words[:n1] is the train split
n2 = int(0.9*len(words)) # index at the 90% cutoff: words[n1:n2] is the val split
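# with the standard makemore names.txt (32,033 names): n1 = 25626, n2 = 28829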
Xtr, Ytr = build_dataset(words[:n1]) # 80% training set (Xtr: training examples, Ytr: training labels)
Xdev, Ydev = build_dataset(words[n1:n2]) # 10% validation set
Xte, Yte = build_dataset(words[n2:]) # 10% test set
torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])
Recap of findings (End of Experiment 1)
Since we used 2-dimensional embedding vectors (see 01_build_mlp, embedding section), we can visualise the model's trained embedding matrix as a scatter of points on the xy-plane.
Clearly there is some structure in how the model treats certain characters:
- The start/end character `.` is very different to everything else, so it sits apart by itself.
- Vowels `a, e, i, o` have clustered to the bottom left.
- `q` is quite unique and sits out by itself.
- `u` is also unique and out by itself: its uses are clearly dissimilar to most other letters, and maybe more like `q`.
- `y` sits between the vowels and everything else.
- Vague clustering of "hard / closed" consonants like `c, p, k, d, t`.
- Vague axis alignment of "soft / flowy / open" consonants like `f, l, r, n, w, v, h, m`.

It is possible the number of embedding dimensions is another bottleneck holding back model performance. Maybe cramming 27 tokens into 2 dimensions is too ambitious, and loses their semantic meaning.
Increase embedding dimensions: 2 → 10 dims
Also reduce hidden layer size slightly: 300 → 200 neurons
Changes to embedding matrix C
- Still 27 characters → now embedded into 10-dim space, so `C` is now of shape `(27, 10)`

Changes to hidden layer parameters W1 and b1
- `W1`: hidden layer's (incoming) weights matrix, now `(30, 200)`: 30 inputs (3 tokens × 10 embedding dims each) feeding 200 hidden neurons
- `b1`: hidden layer bias vector (lives "in" the layer's neurons), now 200 biases

Changes to output layer parameters W2 and b2
- `W2`: output layer's (incoming) weights matrix
  - arg 1: `100 -> 300 -> 200` neurons coming into this output layer from the previous (hidden) layer
  - arg 2: `27` (output) neurons in this (output) layer: 27 possible next characters
- `b2`: output layer bias vector (lives "in" the layer's neurons)
  - arg 1: `27` biases, one per output neuron
So the total parameter count goes from 3,481 → 10,281 → 11,897
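The totals can be sanity-checked directly from the parameter shapes. A minimal sketch (the first two configurations are the earlier 2-dim-embedding networks with 100 and 300 hidden neurons respectively):

# sanity check: parameter counts of the three configurations
from math import prod
def count_params(shapes):
    return sum(prod(s) for s in shapes)
print(count_params([(27, 2), (6, 100), (100,), (100, 27), (27,)]))   # 3481
print(count_params([(27, 2), (6, 300), (300,), (300, 27), (27,)]))   # 10281
print(count_params([(27, 10), (30, 200), (200,), (200, 27), (27,)])) # 11897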
# increase embedding dimension (2 -> 10 dims) and decrease hidden layer size (300 -> 200 neurons)
g = torch.Generator().manual_seed(2147483647) # for reproducibility
# define parameters
C = torch.randn((27, 10), generator=g) # NOW: 10-dim embeddings
W1 = torch.randn((30, 200), generator=g) # hidden layer's incoming weights: NOW 30 inputs to layer, NOW 200 hidden neurons in layer
b1 = torch.randn(200, generator=g) # NOW 200 biases live "in" hidden layer's neurons
W2 = torch.randn((200, 27), generator=g) # output layer's incoming weights: NOW 200 inputs to layer, 27 output neurons in layer
b2 = torch.randn(27, generator=g) # 27 biases live "in" output layer's neurons
parameters = [C, W1, b1, W2, b2] # list of all parameters (makes easier to count)
print('num. of parameters:', sum(p.nelement() for p in parameters)) # total parameter count in network: 11,897
# ensure all 11,897 parameters have gradient (to enable optimisation)
for p in parameters:
p.requires_grad = True
lossi = [] # track resulting loss on each iter
loglossi = [] # track resulting log-loss on each iter (better plot)
stepi = [] # track steps
num. of parameters: 11897
Run 1: Train for 50,000 iters at lr = 0.1
Handling `W1` and `emb` shape (dims) mismatch with PyTorch `emb.view()` (calculating `h`)
- The embedded training examples `emb` are now of shape `(182625, 3, 10)`:
  - 182,625 training examples,
  - 3 tokens per example,
  - now 10 embedding dimensions per token (each token is now embedded in 10D, not 2D)
- but `W1` is of shape `(30, 200)`: it is now expecting 30 inputs (3 input tokens with 10 embedding dims each), not 6
- We need to create a view: `emb.view(-1, 6)` → `emb.view(-1, 30)`
  - arg 1: `-1` infers the size of dimension 0
  - arg 2: `30` (was `6`): we specify the size of dimension 1 (now 30 inputs coming into this layer)
- Hence, all elements are accounted for (see the quick sketch below).
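A minimal sketch of the reshape (`emb_demo` is a stand-in tensor for illustration, not the real embeddings):

# hypothetical mini-example of the emb.view() reshape (emb_demo stands in for emb)
emb_demo = torch.randn(4, 3, 10)   # 4 examples, 3 tokens, 10 embedding dims each
flat = emb_demo.view(-1, 30)       # -1 infers dim 0 (here 4); 30 = 3 tokens * 10 dims
print(flat.shape)                  # torch.Size([4, 30])
# view() reuses the same underlying storage, so no data is copied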
# Run 1: 50,000 training iters on (Xtr, Ytr), mini-batches (32 examples each)
for i in range(50000):
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (32,))
    # forward pass
    emb = C[Xtr[ix]] # (32, 3, 10)
    h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 200)
    logits = h @ W2 + b2 # (32, 27)
    loss = F.cross_entropy(logits, Ytr[ix])
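    # F.cross_entropy fuses log-softmax and negative log-likelihood in one call;
    # more efficient and numerically stable than computing the probabilities manually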
    #print(loss.item())
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    # update
    lr = 0.1
    for p in parameters:
        p.data += -lr * p.grad
    # track stats
    stepi.append(i)
    lossi.append(loss.item())
    loglossi.append(loss.log10().item())
Plotting loss and log-loss (y-axis) vs step / iteration count (x-axis).
- Raw loss is usually an extremely vertical "hockey stick" shape: a steep initial drop followed by a long noisy tail.
- Log-loss squashes the curve vertically, allowing a nicer visualisation.
# plot loss / log-loss (y-axis) vs iteration count (x-axis)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.plot(stepi, lossi)
ax1.set_title('Loss vs Iterations')
ax1.set_xlabel('Iteration')
ax1.set_ylabel('Loss')
ax2.plot(stepi, loglossi)
ax2.set_title('Log-Loss vs Iterations')
ax2.set_xlabel('Iteration')
ax2.set_ylabel('Log-Loss')
plt.tight_layout()
plt.show()
Compare training loss to val loss
# compare train loss vs val (dev) loss
# forward pass full train split (Xtr, Ytr): clean loss number showing true model progress
emb = C[Xtr] # (182625, 3, 10) -> (182625, 30) via emb.view(-1, 30) on next line
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (182625, 200)
logits = h @ W2 + b2 # (182625, 27)
loss = F.cross_entropy(logits, Ytr)
print('Training Run 1 (lr = 0.1): 50,000 iters on (Xtr, Ytr)\n\ntraining loss:',loss.item())
# forward pass val split (Xdev, Ydev): clean loss number showing true model progress on unseen data
emb = C[Xdev] # (22655, 3, 10) -> (22655, 30) via emb.view(-1, 30) on next line
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (22655, 200)
logits = h @ W2 + b2 # (22655, 27)
loss = F.cross_entropy(logits, Ydev)
print('validation (Xdev -> Ydev) loss:', loss.item())
Training Run 1 (lr = 0.1): 50,000 iters on (Xtr, Ytr)
training loss: 2.363924980163574
validation (Xdev -> Ydev) loss: 2.384514808654785
Continue training
Run 2: Train for another 50,000 iters at lr = 0.01
Dynamically update the learning rate: `lr = 0.01 if i < 50000 else 0.005` (with this run's 50,000 iterations the else branch never fires, so lr stays at 0.01 throughout; the 0.005 rate would only kick in if the run were extended past 50,000 iters).
# Run 2: another 50,000 training iters on (Xtr, Ytr), mini-batches (32 examples each)
for i in range(50000):
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (32,))
    # forward pass
    emb = C[Xtr[ix]] # (32, 3, 10)
    h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 200)
    logits = h @ W2 + b2 # (32, 27)
    loss = F.cross_entropy(logits, Ytr[ix])
    #print(loss.item())
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    # update
    lr = 0.01 if i < 50000 else 0.005
    for p in parameters:
        p.data += -lr * p.grad
    # track stats
    stepi.append(50000 + i) # continue the step count on from Run 1
    lossi.append(loss.item())
    loglossi.append(loss.log10().item())
Compare training loss to val loss
# compare train loss vs val (dev) loss
# forward pass full train split (Xtr, Ytr): clean loss number showing true model progress
emb = C[Xtr] # (182625, 3, 10) -> (182625, 30) via emb.view(-1, 30) on next line
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (182625, 200)
logits = h @ W2 + b2 # (182625, 27)
loss = F.cross_entropy(logits, Ytr)
print('Training Run 2 (lr = 0.01): 50,000 iters on (Xtr, Ytr)\n\ntraining loss:',loss.item())
# forward pass val split (Xdev, Ydev): clean loss number showing true model progress on unseen data
emb = C[Xdev] # (22655, 3, 10) -> (22655, 30) via emb.view(-1, 30) on next line
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (22655, 200)
logits = h @ W2 + b2 # (22655, 27)
loss = F.cross_entropy(logits, Ydev)
print('validation (Xdev -> Ydev) loss:', loss.item())
Training Run 2 (lr = 0.01): 50,000 iters on (Xtr, Ytr)
training loss: 2.1698460578918457
validation (Xdev -> Ydev) loss: 2.19392991065979
Finding: Check for overfitting
Note Run 2 above: training loss was ~2.17, validation loss was ~2.19. The small train/val gap hints the network is slightly overfitting the data (i.e. memorising specific training examples rather than general structure). But not massively!
Recall: Interpretation of train vs. validation loss
| Train loss | Val loss | Gap | Diagnosis | Action |
| --- | --- | --- | --- | --- |
| High | High | Small | Underfitting: model lacks capacity | Scale up model, train longer |
| Low | High | Large | Overfitting: model memorising training data | Scale up data, add regularisation |
| Low | Low | Small | Good fit | - |
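An illustrative translation of this table into code (the thresholds here are made-up assumptions, not canonical values):

# illustrative only: high_loss and gap_tol are made-up thresholds
def diagnose(train_loss, val_loss, high_loss=2.5, gap_tol=0.05):
    gap = val_loss - train_loss
    if gap > gap_tol:
        return 'overfitting: scale up data, add regularisation'
    if train_loss > high_loss:
        return 'underfitting: scale up model, train longer'
    return 'good fit'
print(diagnose(2.1698, 2.1939))  # Run 2 numbers: small gap -> 'good fit'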
Summary notes
- In practice, hyperparameter tuning is done systematically via experimentation and automated jobs, not in this haphazard manual way.
- Given the loss reduced further in Experiment 2 (without evidence of significant overfitting), the 2-dim embedding vectors were likely too constraining.
Next steps to further improve model performance
- Adjust hidden layer size (number of neurons)
- Adjust embedding dimensions
- Increase context length (i.e. input more than 3 tokens (characters) into the model)
- Increase number of layers
- Change hyperparameters:
- Change number of training iterations
- Change batch size
- Change initial learning rate
- Change learning rate evolution/decay over time (a hypothetical sketch follows this list)
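A hypothetical sketch of a stepped decay schedule (breakpoints and values are illustrative, echoing the rates used across Runs 1 and 2, not tuned):

# hypothetical stepped learning-rate schedule (illustrative values only)
def lr_at(i):
    if i < 50000:
        return 0.1     # early: large steps
    elif i < 100000:
        return 0.01    # middle: refine
    return 0.005       # late: fine-tune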
Using the test set
Be very sparing with using the test set. Ideally, use it only once, at the very end.
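When that single use comes, the evaluation mirrors the validation pass above; a sketch:

# final, one-time evaluation on the held-out test split (Xte, Yte)
emb = C[Xte]                                # (22866, 3, 10)
h = torch.tanh(emb.view(-1, 30) @ W1 + b1)  # (22866, 200)
logits = h @ W2 + b2                        # (22866, 27)
loss = F.cross_entropy(logits, Yte)
print('test loss:', loss.item())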
Sampling from the model
# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)
for _ in range(20):
    out = []
    context = [0] * block_size # initialise context with all '.' tokens (index 0)
    while True:
        emb = C[torch.tensor([context])] # (1, block_size, 10)
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1) # normalise logits into next-character probabilities
        ix = torch.multinomial(probs, num_samples=1, generator=g).item() # sample from the distribution
        context = context[1:] + [ix]
        out.append(ix)
        if ix == 0: # break when the '.' end-of-word token is generated
            break
    print(''.join(itos[i] for i in out))
carmah.
amori.
kif.
jari.
reh.
caslanden.
jazhet.
faverynt.
kaqui.
nellara.
chaiir.
kaleigh.
ham.
jory.
quintin.
lilea.
jadbi.
wapelo.
dearynix.
kael.
Sources
- YouTube: The spelled-out intro to language modeling: building makemore
- Bengio et al. 2003: A Neural Probabilistic Language Model (implemented here)
- karpathy/makemore on GitHub
- Google Colab: Exercises
- ezyang’s blog: PyTorch Internals