# imports, build vocabulary, build_dataset function, create train/val/test data split.
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
# import data
words = open('data/names.txt', 'r').read().splitlines()
# build the vocabulary of characters, and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
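As a quick sanity check (illustrative, not part of the original cell), the mappings and vocabulary size can be inspected directly:
vocab_size = len(itos) # 27: 26 lowercase letters plus the '.' boundary token
print(stoi['a'], itos[1], vocab_size) # 1 a 27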
# fn: build dataset (training examples X, and labels Y) for an INPUT list of names only
block_size = 3 # context length: how many characters do we take to predict the next one?
def build_dataset(words):
    X, Y = [], [] # X: NN input training examples, Y: labels for each input in X
    for w in words:
        #print(w)
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            #print(''.join(itos[i] for i in context), '--->', itos[ix])
            context = context[1:] + [ix] # crop and append
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y
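For example, with block_size = 3 the single name "emma" yields five (context → label) pairs, the context rolling forward one character at a time: `... → e`, `..e → m`, `.em → m`, `emm → a`, `mma → .` (the trailing `.` marks the end of the name).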
# randomly shuffle words data set, and create train, val, test splits
import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words)) # index of the 80th-percentile word: words[:n1] is the first 80%
n2 = int(0.9*len(words)) # index of the 90th-percentile word: words[n1:n2] is the next 10%
Xtr, Ytr = build_dataset(words[:n1]) # 80% training set (Xtr: training examples, Ytr: training labels)
Xdev, Ydev = build_dataset(words[n1:n2]) # 10% validation set
Xte, Yte = build_dataset(words[n2:]) # 10% test set
torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])
Redefine hidden layer: 100 → 300 neurons
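The re-initialisation cell itself is not reproduced here; the following is a minimal sketch of what this heading refers to, assuming 2-dimensional embeddings, block_size = 3, and the variable names used in the training cells below (the generator seed is illustrative):
g = torch.Generator().manual_seed(2147483647) # illustrative seed, for reproducibility
C = torch.randn((27, 2), generator=g)         # 27 characters, 2-dim embeddings
W1 = torch.randn((6, 300), generator=g)       # block_size * emb_dim = 6 inputs -> 300 hidden neurons (up from 100)
b1 = torch.randn(300, generator=g)
W2 = torch.randn((300, 27), generator=g)      # hidden layer -> 27 output logits
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True
stepi, lossi = [], [] # stats tracked across training runs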
Thoughts and observations
- The increased parameter count (a larger NN) may necessitate more training iterations
- Mini-batches are noisy, causing gradient thrashing (see the vertical thickness of the loss plot)
- At 32 training examples per batch, there may be too much noise to optimise a larger network well
- Increasing the batch size above 32 examples per training iteration may help (see the sketch below)
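A hedged illustration of that change: only the minibatch construction line inside the training loop would differ (batch_size is a hypothetical name, not in the original cells):
batch_size = 128 # up from 32: larger batches average away more gradient noise per step
ix = torch.randint(0, Xtr.shape[0], (batch_size,))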
Run 2: Train for 60,000 iters at lr = 0.05
# 60,000 training iters on training split only (Xtr, Ytr)! mini-batches (32 examples each).
for i in range(60000):

    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (32,))

    # forward pass
    emb = C[Xtr[ix]] # (32, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 300)
    logits = h @ W2 + b2 # (32, 27)
    loss = F.cross_entropy(logits, Ytr[ix])
    #print(loss.item())

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    lr = 0.05
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    stepi.append(i)
    lossi.append(loss.item())
Compare training loss to val loss
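One hedged aside, not in the original cells: these full-split evaluation passes need no gradients, so they could be wrapped in torch.no_grad() to avoid building an autograd graph over every example:
with torch.no_grad(): # skip gradient tracking for evaluation
    emb = C[Xtr]
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr)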
# compare train loss vs val (dev) loss
# forward pass full train split (Xtr, Ytr): clean loss number showing true model progress
emb = C[Xtr] # (182625, 3, 2) -> (182625, 6) via emb.view(-1, 6) on the next line
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (182625, 300)
logits = h @ W2 + b2 # (182625, 27)
loss = F.cross_entropy(logits, Ytr)
print('Training Run 2 (lr = 0.05): 60,000 iters on (Xtr, Ytr)\n\ntraining loss:', loss.item())
# forward pass val split (Xdev, Ydev): clean loss number showing true model progress on unseen data
emb = C[Xdev] # (22655, 3, 2) -> (22655, 6)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (22655, 300)
logits = h @ W2 + b2 # (22655, 27)
loss = F.cross_entropy(logits, Ydev)
print('validation (Xdev -> Ydev) loss:', loss.item())
Training Run 2 (lr = 0.05): 60,000 iters on (Xtr, Ytr)

training loss: 2.3143296241760254
validation (Xdev -> Ydev) loss: 2.3253331184387207
Run 3: Train for 60,000 iters at lr = 0.01
# 60,000 training iters on training split only (Xtr, Ytr)! mini-batches (32 examples each).
for i in range(60000):

    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (32,))

    # forward pass
    emb = C[Xtr[ix]] # (32, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 300)
    logits = h @ W2 + b2 # (32, 27)
    loss = F.cross_entropy(logits, Ytr[ix])
    #print(loss.item())

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    lr = 0.01
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    stepi.append(i)
    lossi.append(loss.item())
Compare training loss to val loss
# compare train loss vs val (dev) loss
# forward pass full train split (Xtr, Ytr): clean loss number showing true model progress
emb = C[Xtr] # (182625, 3, 2) -> (182625, 6) via emb.view(-1, 6) on the next line
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (182625, 300)
logits = h @ W2 + b2 # (182625, 27)
loss = F.cross_entropy(logits, Ytr)
print('Training Run 3 (lr = 0.01): 60,000 iters on (Xtr, Ytr)\n\ntraining loss:', loss.item())
# forward pass val split (Xdev, Ydev): clean loss number showing true model progress on unseen data
emb = C[Xdev] # (22655, 3, 2) -> (22655, 6)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (22655, 300)
logits = h @ W2 + b2 # (22655, 27)
loss = F.cross_entropy(logits, Ydev)
print('validation (Xdev -> Ydev) loss:', loss.item())
Training Run 3 (lr = 0.01): 60,000 iters on (Xtr, Ytr)

training loss: 2.235992908477783
validation (Xdev -> Ydev) loss: 2.2460684776306152
Visualise character embeddings
Since we used 2-dimensional embedding vectors (see 01_build_mlp, embedding section), we can visualise the model’s trained embedding matrix as a scatter plot on the 2D plane.
Clearly there is some structure in how the model treats certain characters:
- The start/end character `.` is very different to everything else, so sits apart by itself
- Vowels `a`, `e`, `i`, `o` have clustered to the bottom left
- `q` is quite unique and out by itself
- `u` is also unique and out by itself: its uses are clearly dissimilar to most other letters, and maybe more like `q`
- `y` sits between the vowels and everything else
- Vague clustering of “hard / closed” consonants like `c`, `p`, `k`, `d`, `t`
- Vague alignment along one axis of “soft / flowy / open” consonants like `f`, `l`, `r`, `n`, `w`, `v`, `h`, `m`
It is possible the number of embedding dimensions is another bottleneck holding back model performance: maybe cramming 27 tokens into 2 dimensions is too ambitious, and loses too much of their semantic meaning.
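A hedged sketch of how the embedding width could be widened to test this (the 10-dim choice and seed are illustrative, not from the runs above); note that W1’s input width must grow to block_size * emb_dim:
emb_dim = 10 # illustrative: up from 2 dimensions
g = torch.Generator().manual_seed(2147483647) # illustrative seed
C = torch.randn((27, emb_dim), generator=g)
W1 = torch.randn((block_size * emb_dim, 300), generator=g) # input width: 3 * 10 = 30
b1 = torch.randn(300, generator=g)
W2 = torch.randn((300, 27), generator=g)
b2 = torch.randn(27, generator=g)
# the forward pass then uses emb.view(-1, block_size * emb_dim) @ W1 + b1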
# visualize dimensions 0 and 1 of the embedding matrix C for all characters
plt.figure(figsize=(8,8))
plt.scatter(C[:,0].data, C[:,1].data, s=200) # graphing the columns of C. x: C[:,0] and y: C[:,1]
for i in range(C.shape[0]):
    plt.text(C[i,0].item(), C[i,1].item(), itos[i], ha="center", va="center", color='white')
plt.grid('minor')
Sources
- YouTube: The spelled-out intro to language modeling: building makemore
- Bengio et al. (2003): A Neural Probabilistic Language Model (implemented here)
- karpathy/makemore on GitHub
- Google Colab: Exercises
- ezyang’s blog: PyTorch Internals