# import data, init bigram counts N (and plot) -> normalise N (now probs) -> compute nll loss over all training data
from IPython.display import HTML
from utils.matmul_viz import show_matmul
 
# Load the training corpus: one lowercase name per line.
# Use a context manager so the file handle is closed even if reading fails
# (the original `open(...)` leaked the handle).
with open('data/names.txt', 'r') as f:
    words = f.read().splitlines()

import torch
# initialise 27 x 27 tensor (2D array), and create lookup tables (maps)
# 27 tokens total: 26 letters + the '.' boundary token (NOT "26 chars + EOL")
N = torch.zeros((27, 27), dtype=torch.int32) # init counts array: 32-bit ints

chars = sorted(list(set(''.join(words))))  # sorted list: unique (26) chars in full dataset
stoi = {s:i+1 for i,s in enumerate(chars)} # map (dict type): `str`->`int`. 'a'=1, ..., 'z'=26
stoi['.'] = 0                              # map: '.'=0. Treat '<S>' and '<E>' as the same!
itos = {i:s for s,i in stoi.items()}       # map (reverse): invert `stoi` dict

# (re)create freq map, but storing bigram counts in PyTorch tensor: N
for w in words:
    chs = ['.'] + list(w) + ['.']       # pad each word with the boundary token on both sides
    for ch1, ch2 in zip(chs, chs[1:]):  # consecutive character pairs = bigrams
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1                # accumulate count for (ch1 -> ch2)
 
# visualise array (heatmap)
import matplotlib.pyplot as plt
%matplotlib inline
 
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues') # show entire array image
 
# iterate over each cell in array
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j] # create char strings (e.g. 'ac', 'gb', 'a.')
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')       # write char string
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray') # write count (item)
plt.axis('off');
 
# broadcast row sums to normalise rows of N -> obtain probability distribution P
smoothed = (N + 1).float()  # add-one smoothing: no zero counts -> no log(0) later
P = smoothed / smoothed.sum(dim=1, keepdim=True)  # each row now sums to 1
 
# compute loss function for entire data set
log_likelihood = 0.0  # running sum of log P(bigram) over every bigram in the corpus
n = 0                 # number of bigrams seen (for averaging)

for w in words:
    padded = ['.'] + list(w) + ['.']
    for first, second in zip(padded, padded[1:]):
        # model's assigned probability for this bigram
        prob = P[stoi[first], stoi[second]]
        log_likelihood = log_likelihood + torch.log(prob)
        n = n + 1

print(f'{log_likelihood = }')
nll = -log_likelihood  # negative log-likelihood: lower is better
print(f'{nll = }')
print(f'{nll / n = }')  # average nll per bigram: the headline loss
print('\na BETTER model will MAXIMISE log_likelihood (i.e. MINIMISE nll or nll/n)')
log_likelihood = tensor(-559951.5625)
nll = tensor(559951.5625)
nll / n = tensor(2.4544)
 
a BETTER model will MAXIMISE log_likelihood (i.e. MINIMISE nll or nll/n)
plot

Rationale for the alternative: Explicit Bigram LM NN

The Bigram character-level Language Model can be cast to a (very similar) Neural Network framework:

  • NN is still character-level, and still a bigram (2-char comparisons)
  • xs: 1-char NN inputs (training examples)
    • ys: 1-char training labels / desired targets / correct next character in sequence
  • NN layers: parameterised by weights W
  • prob: output probability dist., predicting the next character in the sequence (i.e. most likely char to follow the input char)

The role of gradient descent

The previously defined loss function, nll / n (average negative log-likelihood), will be used to evaluate any given configuration of the weights, W. Gradient descent can be used, because we have labelled data.

For each line in the training data, the “next” character is always known. Thus, simply adjust the network’s weights, W, via gradient descent, to maximise the probability of the desired target (label), y, for each given input, x. This will improve the NN’s next-character prediction.

For example, the first name emma already contains 5 distinct bigram training examples:

# create the training set of bigrams (x: training inputs, y: labelled targets)


xs, ys = [], []
example_num = 1  # running example counter (renamed: `iter` shadowed the builtin)

print("first word, 'emma', contains 5 training examples:")
print('eg #', '  input (x)', '-> target (y)')
for w in words[:1]:
    chs = ['.'] + list(w) + ['.']       # pad with boundary token, as for the count table
    for ch1, ch2 in zip(chs, chs[1:]):  # each bigram is one (input, target) pair
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        print(f'{example_num}          {ch1}     ->      {ch2}')
        example_num += 1
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)  # integer-encoded inputs
ys = torch.tensor(ys)  # integer-encoded targets (the "correct next character")

print('\nnn inputs  (xs)    :', xs)
print('targets/labels (ys):', ys)
first word, 'emma', contains 5 training examples:
eg #   input (x) -> target (y)
1          .     ->      e
2          e     ->      m
3          m     ->      m
4          m     ->      a
5          a     ->      .
 
nn inputs  (xs)    : tensor([ 0,  5, 13, 13,  1])
targets/labels (ys): tensor([ 5, 13, 13,  1,  0])

Expand above output

Define Neural Network

Input layer: One-hot encoding

Raw integer indices for characters (ix) cannot be fed into a neural network. Integers imply false ordinal relationships — the network would interpret 'c' = 3 as literally three times 'a' = 1, which is meaningless for categorical data. Downstream multiplicative and non-linear operations would exacerbate the NN’s misunderstanding.

See detailed note, examples, and docs

Encode 5 int training examples (xs) as vectors

One-hot vectors make every character equidistant from every other. Separately, we also must ensure datatype is float (vs. integers) for gradient computation during backprop.

# i - one-hot encode the training inputs `xs` as length-27 vectors
import torch.nn.functional as F
onehot = F.one_hot(xs, num_classes=27)  # int tensor: exactly one bit set per example
xenc = onehot.float()                   # cast to float: required for gradients in backprop
print(xenc)
print(xenc.shape, xenc.dtype)
tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])
torch.Size([5, 27]) torch.float32
# i - the 5 one-hot-encoded input training examples contained in "emma"
# (each row is one example; the single bright pixel marks the set bit)
plt.imshow(xenc)
<matplotlib.image.AxesImage at 0x115fbe990>
plot

Note how each of the 5 training examples in xs, have been one-hot encoded to 5 vectors

  • The appropriate bit set to 1 (corresponding to the letter)
  • All other bits set to 0

These vectors are equivalent to our prior result:

Weights

Construct 1 neuron

  • Randomly initialise weight W (column-vector) for 1 neuron
  • Matrix multiply xenc (matrix of 5 input training examples) with W.
    • Inner dimensions cancel (weighted sum), leaving an output
  • In parallel, PyTorch evaluates the activations (recall w·x + b, but with no biases in this case) on all 5 inputs in xenc that are coming into this 1 neuron (for which W was defined).
# visualise xenc @ W: (5, 27) @ (27, 1) -> (5, 1)

# NOTE(review): show_matmul is a project-local helper (utils.matmul_viz); it
# appears to render an HTML diagram of the matrix product, highlighting one
# output cell and the one-hot rows — confirm its exact API against the helper.
# The bare HTML(...) expression relies on notebook auto-display.
HTML(show_matmul(
    rows_A=5, cols_A=27,
    rows_B=27, cols_B=1,
    highlight_row=3, highlight_col=0,
    onehot_indices=[0, 5, 13, 13, 1],
    label_A="xenc", label_B="W", label_out="xenc @ W",
    title="Constructing 1 neuron   (xenc @ W)[3, 0]"
))
Constructing 1 neuron (xenc @ W)[3, 0]
xenc
ℝ5×27
@
W
ℝ27×1
=
xenc @ W
ℝ5×1
result [3,0]
active row / col
one-hot 1
zero
# i - initialise weight vector W for 1 neuron, and matmul `xenc` with `W`
W = torch.randn((27, 1))  # random init: one weight per input dimension (no seed: demo cell)
activations = xenc @ W    # (5, 27) @ (27, 1) -> (5, 1): one activation per example
print(activations.shape)
print(activations)
torch.Size([5, 1])
tensor([[-0.5871],
        [-0.5964],
        [ 0.0396],
        [ 0.0396],
        [ 0.5090]])

Construct all 27 neurons

  • Randomly initialise weight W (square matrix) for all 27 neurons
  • Matrix multiply xenc (matrix of input training examples) with W.
    • Inner dimensions cancel (weighted sum), leaving an output
  • In parallel, PyTorch evaluates the activations (recall w·x + b, but with no biases in this case) on those 5 inputs in xenc, coming into all 27 neurons (for which W was defined)
# visualise xenc @ W: (5, 27) @ (27, 27) -> (5, 27)

# NOTE(review): same project-local visualiser as above (utils.matmul_viz),
# now with a square weight matrix (27 neurons) and a different highlighted
# output cell [3, 13] — confirm the helper's API. Bare HTML(...) relies on
# notebook auto-display.
HTML(show_matmul(
    rows_A=5, cols_A=27,
    rows_B=27, cols_B=27,
    highlight_row=3, highlight_col=13,
    onehot_indices=[0, 5, 13, 13, 1],
    label_A="xenc", label_B="W", label_out="out = xenc @ W",
    title="Constructing all 27 neurons   (xenc @ W)[3, 13]"
))
Constructing all 27 neurons (xenc @ W)[3, 13]
xenc
ℝ5×27
@
W
ℝ27×27
=
out = xenc @ W
ℝ5×27
result [3,13]
active row / col
one-hot 1
zero
# i - initialise weight matrix W for ALL 27 neurons, and matmul `xenc` with `W`
W = torch.randn((27, 27))  # one 27-dim weight column per neuron
out = xenc @ W             # (5, 27) @ (27, 27) -> (5, 27): 27 activations per example
print(out.shape)
print(out)
torch.Size([5, 27])
tensor([[ 0.9652,  0.7885, -0.1041,  0.6425, -1.8685,  1.5876,  2.0369, -0.2119,
          0.5029,  0.2120, -0.1013, -1.1404, -1.6571, -1.3333, -2.3327, -1.3389,
          1.0471, -1.2016, -1.2877, -0.2382, -0.1731,  2.2464, -0.1492, -0.0184,
          0.8006,  0.2909,  0.0957],
        [ 0.3013, -0.2260,  0.5819,  1.1535,  0.3760,  0.1831, -0.7286,  1.0753,
         -0.4050,  0.1515, -0.3967, -0.0727,  1.2385,  0.0032, -0.0502,  0.5951,
          0.8034,  0.5065, -1.9070,  1.2913,  0.0301,  0.6183,  0.0633,  0.4137,
         -1.4627, -0.2583, -1.7031],
        [ 0.2091,  0.2786, -0.4015, -0.5392,  0.4746, -0.0973, -0.6801,  0.1759,
         -0.2730, -0.3720,  0.6673,  2.5479, -0.0697,  1.2388, -0.1610,  0.1319,
         -1.6991,  0.0076, -0.2641,  0.0806, -1.0993, -1.1639, -0.3803, -1.6768,
          0.5787,  1.6649,  0.3501],
        [ 0.2091,  0.2786, -0.4015, -0.5392,  0.4746, -0.0973, -0.6801,  0.1759,
         -0.2730, -0.3720,  0.6673,  2.5479, -0.0697,  1.2388, -0.1610,  0.1319,
         -1.6991,  0.0076, -0.2641,  0.0806, -1.0993, -1.1639, -0.3803, -1.6768,
          0.5787,  1.6649,  0.3501],
        [-0.1052, -0.4162, -2.3235,  0.0980, -1.7063, -0.2025, -0.1802, -0.7341,
          1.3711, -0.2890,  0.6595,  0.2495, -1.4391,  2.2291, -0.7953, -0.6056,
         -1.5934, -0.0027,  0.1579,  1.2266,  1.1530, -1.0065,  0.2135, -1.0438,
          1.5983, -0.5864,  0.1472]])

Interpretation of xenc @ W (dot product visual above)

Each element of xenc @ W is the activation for one of 27 neurons, for one of the 5 input training examples contained in emma.

For example, consider one element in the output: (xenc @ W)[3, 13] (zero-indexing, so 4th example, 14th neuron). It is the activation of the 14th neuron on the 4th training example (m -> m). It is the dot product between:

  • xenc[3] — the one-hot vector for the 4th input character (1, 27)
  • W[:, 13] — the weight vector of the 14th neuron (27, 1)

For dot product, see: vector-operations, and matrix-operations

The 27 neurons have no character identity — they are learned features. After softmax (exponentiate xenc @ W element-wise, then normalise each row by its sum) the outputs will become predicted probabilities over the next character.

# i - inspecting a single element of the matrix multiply
print((xenc @ W)[3, 13]) # activation (firing rate) of 14th neuron looking at 4th input
 
# equivalent to the following dot product
print(xenc[3])     # 4th row (4th training input example; one-hot vector)
print(W[:, 13])    # 14th col (weight vector of the 14th neuron)
print((xenc[3] * W[:, 13]).sum()) # dot product of 4th row of xenc, with 14th col of W
tensor(1.2388)
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.])
tensor([-1.3333,  2.2291,  0.7696, -0.9281, -0.0282,  0.0032, -0.6234,  0.4487,
         0.0054, -2.0373,  0.9912, -0.6934, -0.5364,  1.2388,  0.3005, -0.4492,
        -1.3713, -0.3236, -0.3239, -0.2134, -0.5936, -0.5198,  0.1939,  0.1692,
         1.7775, -1.3120,  0.5750])
tensor(1.2388)

Convention: Why xenc @ W and not W @ xenc?

Output layer: log-counts probabilities

Currently the elements of the NN’s output activation matrix (xenc @ W , for 5 input examples above) are arbitrary real numbers from a random matrix multiply with W.

The “goal” for the output layer

For each input example (we have 5 above), we WANT the output row xenc[i] @ W to behave like a probability distribution over the next character — positive numbers that sum to 1 (analogous to the normalised frequency counts N from the bigram table).

To achieve this, we apply softmax

  • interpret the raw activation outputs xenc @ W as log-counts (logits), and element-wise exponentiate them (logits.exp()) — guaranteeing positive numbers
  • then normalise each row to sum to 1
# i - apply softmax nonlinearity
logits = xenc @ W # interpret raw activations as log-counts (forward-looking, hoping!)

# apply softmax: impose a contract. grad desc will tune params to enforce log-count-like behaviour
counts = logits.exp() # exp() outputs are positive, count-like numbers (equiv. to N)
# `keepdim` is the documented PyTorch spelling (consistent with the earlier P normalisation)
probs = counts / counts.sum(1, keepdim=True) # rows sum to 1: probabilities for next character
probs
tensor([[0.0557, 0.0467, 0.0191, 0.0404, 0.0033, 0.1039, 0.1628, 0.0172, 0.0351,
         0.0262, 0.0192, 0.0068, 0.0040, 0.0056, 0.0021, 0.0056, 0.0605, 0.0064,
         0.0059, 0.0167, 0.0179, 0.2007, 0.0183, 0.0208, 0.0473, 0.0284, 0.0234],
        [0.0354, 0.0209, 0.0469, 0.0831, 0.0382, 0.0315, 0.0126, 0.0768, 0.0175,
         0.0305, 0.0176, 0.0244, 0.0904, 0.0263, 0.0249, 0.0475, 0.0585, 0.0435,
         0.0039, 0.0953, 0.0270, 0.0486, 0.0279, 0.0396, 0.0061, 0.0202, 0.0048],
        [0.0283, 0.0303, 0.0153, 0.0134, 0.0368, 0.0208, 0.0116, 0.0273, 0.0174,
         0.0158, 0.0447, 0.2930, 0.0214, 0.0791, 0.0195, 0.0262, 0.0042, 0.0231,
         0.0176, 0.0248, 0.0076, 0.0072, 0.0157, 0.0043, 0.0409, 0.1212, 0.0325],
        [0.0283, 0.0303, 0.0153, 0.0134, 0.0368, 0.0208, 0.0116, 0.0273, 0.0174,
         0.0158, 0.0447, 0.2930, 0.0214, 0.0791, 0.0195, 0.0262, 0.0042, 0.0231,
         0.0176, 0.0248, 0.0076, 0.0072, 0.0157, 0.0043, 0.0409, 0.1212, 0.0325],
        [0.0219, 0.0161, 0.0024, 0.0269, 0.0044, 0.0199, 0.0203, 0.0117, 0.0959,
         0.0182, 0.0471, 0.0312, 0.0058, 0.2262, 0.0110, 0.0133, 0.0049, 0.0243,
         0.0285, 0.0830, 0.0771, 0.0089, 0.0301, 0.0086, 0.1204, 0.0135, 0.0282]])

The NN outputs probs (after softmax non-linearity). For each input example vector (5 in emma), we now have the probability distribution for the next character (also as a vector)

But why do elements in xenc @ W “behave like” log-counts?

See activation-function

Neat insight: W ↔ N

See detailed note to visualise.

  • When xenc is a one-hot encoded vector, xenc @ W effectively just selects one row from W. Hence each row of logits is related to a selected row of W.
  • Recall, we decided to interpret logits as log-counts. So counts = logits.exp() is identical to the approach from 02_sampling

The difference:

  • In explicit Bigram, we explicitly counted bigrams to construct N
  • In NN, W is initialised randomly and interpreted as log-counts.
    • Gradient descent minimises the loss by adjusting W (imposing a contract on W),
    • Such that W.exp() produces counts IDENTICAL to N (after enough iterations)

Sources