# import data
words = open('data/names.txt', 'r').read().splitlines()
 
import torch
# initialise 27 x 27 tensor (2D array), and create lookup tables (maps)
N = torch.zeros((27, 27), dtype=torch.int32) # init counts array: 32-bit ints (26 letters + special '.' token)
 
chars = sorted(list(set(''.join(words))))  # sorted list: unique (26) chars in full dataset
stoi = {s:i+1 for i,s in enumerate(chars)} # map (dict type): `str`->`int`. 'a'=1, ..., 'z'=26
stoi['.'] = 0                              # map: '.'=0. Treat '<S>' and '<E>' as the same!
itos = {i:s for s,i in stoi.items()}       # map (reverse): invert `stoi` dict
 
# (re)create freq map, but storing bigram counts in PyTorch tensor: N
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1
 
# visualise array (heatmap)
import matplotlib.pyplot as plt
%matplotlib inline
 
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues') # show entire array image
 
# iterate over each cell in array
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j] # create char strings (e.g. 'ac', 'gb', 'a.')
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')       # write char string
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray') # write count (item)
plt.axis('off');
(Output: 27×27 heatmap of the bigram counts, each cell annotated with its bigram string and count.)

Sampling from the model

N[0] # get first row (starting letter counts)
tensor([   0, 4410, 1306, 1542, 1690, 1531,  417,  669,  874,  591, 2422, 2963,
        1572, 2538, 1146,  394,  515,   92, 1639, 2055, 1308,   78,  376,  307,
         134,  535,  929], dtype=torch.int32)
# i - normalise counts -> creates a probability dist. vector `p` (Σp = 1)
p = N[0].float()
p = p / p.sum() # normalising isn't strictly required: torch.multinomial accepts unnormalised non-negative weights (see docs)
p   # each char's probability of being the 1st char in word
tensor([0.0000, 0.1377, 0.0408, 0.0481, 0.0528, 0.0478, 0.0130, 0.0209, 0.0273,
        0.0184, 0.0756, 0.0925, 0.0491, 0.0792, 0.0358, 0.0123, 0.0161, 0.0029,
        0.0512, 0.0642, 0.0408, 0.0024, 0.0117, 0.0096, 0.0042, 0.0167, 0.0290])

Multinomial distribution

torch.multinomial (docs) returns a tensor of num_samples indices, drawn at random from the probability distribution p (here with replacement between draws).
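As a minimal sketch of how the call behaves (toy distribution and seed, not from the notebook):

# toy sketch: draw 5 indices from a 3-way distribution (seed chosen arbitrarily, for illustration only)
g_demo = torch.Generator().manual_seed(42)
probs = torch.tensor([0.6, 0.3, 0.1])
torch.multinomial(probs, num_samples=5, replacement=True, generator=g_demo)
# -> a length-5 tensor of indices in {0, 1, 2}; index 0 appears most often on average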

Sample the 1st character

In this case, it:

  • draws a single index, ix,
  • from the 1st row of the bigram probability matrix p,
  • which is the index of the 1st char to emit

(Wiki)

# i - sample 1st character from the probability dist `p`
g = torch.Generator().manual_seed(2147483647) # seeded generator for reproducibility
ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
itos[ix] # sampled `c` as 1st char of word
'c'

Sample the 2nd character

Since the 1st character was c, we need the N[3] row of bigram counts (stoi['c'] == 3, i.e. counts of each character following c).

From the heatmap above, the most likely next characters (in order) are: ca, ch, ce, co, ck, ci, cl, and cy; the rest form a long tail.
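As a quick check, a small sketch (assumes N, stoi and itos from the cells above) that reads the most common successors of c straight from the counts matrix with torch.topk:

# sketch: top-8 counts in the 'c' row of N
counts_c = N[stoi['c']]
top = torch.topk(counts_c, k=8)
[(itos[i.item()], c.item()) for c, i in zip(top.values, top.indices)]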

# i - sample 2nd character (from distribution of chars following `c`)
p_2 = N[ix].float()
p_2 = p_2 / p_2.sum()
p_2 
 
ix_2 = torch.multinomial(p_2, num_samples=1, replacement=True, generator=g).item()
 
itos[ix_2] # sampled `e` as 2nd char of word
'e'
print(p)
print(p_2)
tensor([0.0000, 0.1377, 0.0408, 0.0481, 0.0528, 0.0478, 0.0130, 0.0209, 0.0273,
        0.0184, 0.0756, 0.0925, 0.0491, 0.0792, 0.0358, 0.0123, 0.0161, 0.0029,
        0.0512, 0.0642, 0.0408, 0.0024, 0.0117, 0.0096, 0.0042, 0.0167, 0.0290])
tensor([0.0275, 0.2307, 0.0000, 0.0119, 0.0003, 0.1560, 0.0000, 0.0006, 0.1880,
        0.0767, 0.0008, 0.0895, 0.0328, 0.0000, 0.0000, 0.1076, 0.0003, 0.0031,
        0.0215, 0.0014, 0.0099, 0.0099, 0.0000, 0.0000, 0.0008, 0.0294, 0.0011])

Write loop for sampling till EOL

Demonstrates that a “trained” bigram model sucks, but it is still better than a totally untrained (uniform-distribution) model, where every next letter is equally likely.

# i - these names suck!!! a "trained" bigram model sucks
g = torch.Generator().manual_seed(2147483647)
ix = 0 # start at index 0. the 1st (special) token '.' 
 
for i in range(20):
    out = [] # init list to hold name
    while True:
        p = N[ix].float()    # counts row for the current char (index ix)
        p = p / p.sum()      # normalise counts -> probs (Σp=1)
        
        # to compare: comment out the 2 lines above and uncomment the next line
        # p = torch.ones(27) / 27.0 # uniform dist (totally untrained) -- the bigram is still much better
        
        # sample next index
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix]) # append sampled char to list
        
        # break loop if next predicted token is EOL '.'
        if ix == 0:
            break
    print(''.join(out))
cexze.
momasurailezitynn.
konimittain.
llayn.
ka.
da.
staiyaubrtthrigotai.
moliellavo.
ke.
teda.
ka.
emimmsade.
enkaviyny.
ftlspihinivenvorhlasu.
dsor.
br.
jol.
pen.
aisan.
ja.

Efficiency (pre-normalise N)

Broadcasting

Broadcasting lets PyTorch perform operations on tensors of different shapes without explicitly copying data, by “stretching” the smaller tensor to match the larger one.
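A minimal sketch of the idea (toy shapes, not part of the notebook; assumes torch imported above): dividing a [2, 3] tensor by a [2, 1] column vector stretches the column across the row dimension:

# toy sketch of broadcasting
a = torch.arange(6.).reshape(2, 3)  # shape [2, 3]
b = torch.tensor([[10.], [100.]])   # shape [2, 1], a column vector
a / b  # b is broadcast ("stretched") to [2, 3]; each row of a is divided by one scalar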

Goal (broadcasting is genuinely a little confusing)

We want to normalise each row in bigram counts matrix N.

  • This means divide each element on a given row by the sum of all elements in that row.
    • To perform this row-sum, set argument dim = 1 in torch.sum() (docs).
      • dim = 0 for column-sums
      • dim = None reduces all dimensions: it sums ALL tensor elements and returns a scalar
    • Also note:
      • keepdim = True means the output “sum” tensor keeps the same number of dimensions as the input (the reduced dimension is kept with length 1)
        • This prepares it for broadcasting: the “kept” dimension shrinks to length 1 upon summing, but crucially it is KEPT, so it can be stretched back out across that dimension
        • Without keepdim (i.e. False), the [27] row-sums would broadcast as a row vector [1, 27], so each element would be divided by the wrong row’s sum — the code still runs, but the rows no longer sum to 1 (see the sketch after this list)
      • keepdim = False “squeezes out” the reduced dimension, so the output “sum” has one fewer dimension than the input
  • Do all rows in parallel
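A small sketch of the keepdim pitfall on a toy matrix (not part of the notebook):

# toy matrix: normalise rows correctly vs. with keepdim=False
M = torch.tensor([[1., 2.], [3., 5.]])
good = M / M.sum(dim=1, keepdim=True)  # [2, 1] broadcasts down each row
bad  = M / M.sum(dim=1)                # [2] is treated as [1, 2]: divides by the WRONG row's sum
print(good.sum(dim=1))  # tensor([1., 1.])
print(bad.sum(dim=1))   # not all ones -> silently wrong normalisation
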
# i - calculate P_rowsum (a col vec). "denominators" for element-wise division across each row
P  = N.float()
print('P       :', P.shape)
 
P_rowsum = P.sum(dim=1, keepdim=True)
print('P_rowsum:', P_rowsum.shape, ' -> broadcasts to [27, 27]')
print('P = P / P_rowsum: normalisation operation is broadcastable!')
 
# if curious, inspect (for this bigram, row-sum and col-sum happen to be identical)
# print(P.sum(dim=0, keepdim=True).shape)
# P.sum(dim=0, keepdim=True)
P       : torch.Size([27, 27])
P_rowsum: torch.Size([27, 1])  -> broadcasts to [27, 27]
P = P / P_rowsum: normalisation operation is broadcastable!
# i - broadcast P_rowsum (a column vector) to normalise each of row elements in N
P = P / P_rowsum  # P_rowsum (a col vec) is broadcast (copied), then normalise (elementwise division)
P.sum(dim=1)      # check: Normalised rows (probabilities) sum to 1. 
# P.sum(0)        # cols don't sum to 1, as expected!
 
# for memory efficiency, the division could be done in place:
# P /= P.sum(dim=1, keepdim=True)
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

The normalised elements of P are the “parameters” of this “trained” bigram model
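A quick sanity check (assumes N and P from the cells above) that a pre-normalised row of P matches the per-step normalisation used earlier, which is why the samples below come out identical:

# sketch: row 0 of P equals the normalised counts computed on the fly before
p_row0 = N[0].float() / N[0].float().sum()
torch.allclose(P[0], p_row0)  # True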

Rerun sampling loop

# i - Expect same result (pre-normalised the bigram counts matrix `N`)
g = torch.Generator().manual_seed(2147483647)
ix = 0 # start at index 0. the 1st (special) token '.' 
 
for i in range(20):
    out = [] # init list to hold name
    while True:
        p = P[ix] # expect same result!
        
        # sample next index
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix]) # append sampled char to list
        
        # break loop if next predicted token is EOL '.'
        if ix == 0:
            break
    print(''.join(out))
cexze.
momasurailezitynn.
konimittain.
llayn.
ka.
da.
staiyaubrtthrigotai.
moliellavo.
ke.
teda.
ka.
emimmsade.
enkaviyny.
ftlspihinivenvorhlasu.
dsor.
br.
jol.
pen.
aisan.
ja.

Sources