# imports, `Value` class, reset_graph() to init nn, graphviz: trace() & draw_dot()
 
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
 
# extend `Value` class with the constituent methods listed above:
class Value:
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
        self.label = label
 
    def __repr__(self):
        return f"Value(data={self.data})" 
 
    def __add__(self, other):
        # pre-process `other`. If it is non-`Value`, assume `int`/`float` and wrap in `Value()`
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')
 
        def _backward():
            self.grad += out.grad * 1.0
            other.grad += out.grad * 1.0
        out._backward = _backward
        
        return out
 
    def __mul__(self, other):
        # pre-process `other`. If it is non-`Value`, assume int/float and wrap in `Value()`
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')
 
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        
        return out
 
    def __radd__(self, other):  # fallback for swapped operands: i.e. other + self
        return self + other     # route to `__add__`
    
    def __rmul__(self, other):  # fallback for swapped operands: i.e. other * self
        return self * other     # route to `__mul__`
    
    # ensure `other` is NEVER a `Value` object. Only int/float allowed
    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')
        
        # recall downstream grad = local grad * upstream grad
        # local gradient for x^k: d(x^k)/dx = kx^(k-1)
        def _backward():
            self.grad += other * (self.data ** (other - 1)) * out.grad
        out._backward = _backward
        
        return out
    
    def __truediv__(self, other): # i.e. self / other, but...
        return self * other**-1   # reuse previously defined __mul__() and __pow__() instead of implementing `/` with its own _backward()
    
    def __neg__(self): # -self
        return self * -1        # use previously defined __mul__() to evaluate this `Value` * `int` expression
    
    def __sub__(self, other):   # self - other
        return self + (-other)  # reuse previously defined __add__() and __neg__() instead of implementing `-` with its own _backward()
 
    def tanh(self): 
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self, ), 'tanh')
 
        def _backward():
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        
        return out
    
    # define exponentiation method
    def exp(self):
        x = self.data                               # input data value
        out = Value(math.exp(x), (self, ), 'exp')   # output data value: use builtin math.exp(x)
        
        # recall downstream grad = local grad * upstream grad
        # local gradient for exp: d(e^x)/dx = e^x (i.e. out.data, just calculated!)
        def _backward():
            self.grad += out.data * out.grad
        out._backward = _backward
        
        return out
    
 
    def backward(self):
        topo = []
        visited = set()
        
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)
        
        self.grad = 1.0
        for node in reversed(topo):
            node._backward()
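
# an aside (illustrative sketch, not a cell from the original notebook): the extra ops
# (`exp`, `__pow__`, `__truediv__`, and the swapped-operand fallbacks) are exactly what
# is needed to rebuild tanh from primitives; quick check against the built-in method:
# tanh(x) = (e^(2x) - 1) / (e^(2x) + 1)
xa = Value(0.8814, label='xa')
oa = xa.tanh(); oa.backward()           # built-in tanh
xb = Value(0.8814, label='xb')
e = (2*xb).exp()                        # exercises __rmul__ and exp()
ob = (e - 1) / (e + 1); ob.backward()   # exercises __sub__, __truediv__, __pow__
print(oa.data, ob.data)                 # same forward value (~0.7071)
print(xa.grad, xb.grad)                 # same gradient (~0.5)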
 
# helper: re-initialise graph
def reset_graph(reset_level):
 
    # declare the graph's variables as globals so this helper can rebind/reset them
    global x1, x2, w1, w2, x1w1, x2w2, x1w1x2w2, b, n, o
 
    if reset_level == 'gradients':
        x1.grad = x2.grad = w1.grad = w2.grad = x1w1.grad = x2w2.grad = x1w1x2w2.grad = b.grad = n.grad = o.grad = 0
 
        print("reset_graph(): All gradients have been reset to 0")
 
    # reset all variables
    elif reset_level == 'graph':
        # redefine inputs (x1,x2), weights (w1,w2), and then the graph (n = x1*w1 + x2*w2 + b)
        x1 = Value(2.0, label='x1'); x2 = Value(0.0, label='x2')
        w1 = Value(-3.0, label='w1'); w2 = Value(1.0, label='w2')
        x1w1 = x1 * w1; x1w1.label = 'x1*w1'; x2w2 = x2 * w2; x2w2.label = 'x2*w2'
        x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'
 
        # bias chosen to make the numbers nice for teaching (try b=8 to see tanh squashing; b=6.8813735870195432 gives round gradients)
        b = Value(6.8813735870195432, label='b')
        n = x1w1x2w2 + b; n.label = 'n'
 
        # try re-run the activation function on n (the raw cell body) and draw the output node o
        o = n.tanh(); o.label = 'o'
 
        print("reset_graph(): All vars, initial and intermediate, have been reset. All gradients now 0")
 
    else:
        print("reset_graph(): please specify the level of reset desired: 'gradients' or 'graph'")
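
# usage sketch (hypothetical call order): 'graph' rebuilds everything from scratch,
# 'gradients' only zeroes the accumulated grads between repeated backward() calls
# reset_graph('graph')      # build the toy neuron fresh (x1, w1, ..., n, o become available)
# reset_graph('gradients')  # zero all grads without rebuilding the graph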
 
# graphviz
from graphviz import Digraph
 
def trace(root):
    # recursively builds a set of all nodes and edges in a graph
    nodes, edges = set(), set()
    def build(v):
        if v not in nodes: 
            nodes.add(v)
            for child in v._prev:
                edges.add((child, v))
                build(child)
    build(root)
    return nodes, edges
 
def draw_dot(root):
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'}) # LR = left to right
    nodes, edges = trace(root)
    for n in nodes:
        uid = str(id(n))
        # for any value in the graph, create a rectangular ('record') node for it
        dot.node(name = uid, label = "{ %s | data %.4f | grad %.4f }" % (n.label, n.data, n.grad), shape='record')
        if n._op:
            # if this value is a result of some operation, create an op node for it
            dot.node(name = uid + n._op, label = n._op)
            # and connect this node to it
            dot.edge(uid + n._op, uid)
    for n1, n2 in edges:
        # connect n1 to the op node of n2
        dot.edge(str(id(n1)), str(id(n2)) + n2._op)
    return dot
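
# usage sketch: draw_dot returns a graphviz.Digraph, so the result renders inline in
# Jupyter, or can be written to disk (the filename 'toy_nn' here is arbitrary):
# g = draw_dot(o)
# g.render('toy_nn')   # writes toy_nn.svg (format='svg' was set in the Digraph constructor)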

PyTorch demo

  • Recall: micrograd is a scalar-valued autograd engine; PyTorch operates on tensors by default.
  • Note: Python defaults to double-precision floats (64-bit), but PyTorch defaults to single precision (32-bit)
import torch # PyTorch demo: tensors, not scalars
 
tensor_1 = torch.Tensor([[1, 2, 3], [4, 5, 6]])
print(tensor_1)
print(tensor_1.shape, '\n')
 
# python default is double precision (64-bit), but PyTorch uses single precision (32-bit)
scalar_1 = torch.Tensor([2.0])
print(scalar_1, scalar_1.dtype)
 
# cast the PyTorch tensor to a double-precision float
scalar_2 = torch.Tensor([2.0]).double()
print(scalar_2, scalar_2.dtype)
tensor([[1., 2., 3.],
        [4., 5., 6.]])
torch.Size([2, 3]) 
 
tensor([2.]) torch.float32
tensor([2.], dtype=torch.float64) torch.float64
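
An aside: the lowercase torch.tensor factory (a sketch using the standard PyTorch API, not a cell from the original notebook) infers dtype from its data and also accepts an explicit dtype argument, so the cast can happen at construction time:

# alternative: set dtype at construction instead of casting afterwards
scalar_3 = torch.tensor([2.0], dtype=torch.float64)
print(scalar_3, scalar_3.dtype)   # tensor([2.], dtype=torch.float64) torch.float64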

Define the toy NN in PyTorch

  • .requires_grad = False by default for efficiency: leaf nodes (e.g. raw input data) usually do not need gradients.
  • .item() converts a single-element tensor to a Python scalar, stripping away the tensor wrapper.
# i - redefine toy NN (leaf nodes, network structure) in PyTorch:
x1 = torch.Tensor([2.0]).double()                ; x1.requires_grad = True
x2 = torch.Tensor([0.0]).double()                ; x2.requires_grad = True
w1 = torch.Tensor([-3.0]).double()               ; w1.requires_grad = True
w2 = torch.Tensor([1.0]).double()                ; w2.requires_grad = True
b = torch.Tensor([6.8813735870195432]).double()  ; b.requires_grad = True
n = x1*w1 + x2*w2 + b
o = torch.tanh(n)
 
print('forward pass:\no.data.item():', o.data.item())
o.backward()
 
print('---\nbackward pass (leaf node gradients):')
print('x1.grad.item():', x1.grad.item())
print('w1.grad.item():', w1.grad.item())
print('x2.grad.item():', x2.grad.item())
print('w2.grad.item():', w2.grad.item()) # expand output below
forward pass:
o.data.item(): 0.7071066904050358
---
backward pass (leaf node gradients):
x1.grad.item(): -1.5000003851533106
w1.grad.item(): 1.0000002567688737
x2.grad.item(): 0.5000001283844369
w2.grad.item(): 0.0
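Sanity check by hand: do/dn = 1 - tanh(n)^2 = 1 - 0.7071^2 ≈ 0.5, so do/dx1 = w1 * 0.5 = -1.5, do/dw1 = x1 * 0.5 = 1.0, do/dx2 = w2 * 0.5 = 0.5, and do/dw2 = x2 * 0.5 = 0.0, matching the printed gradients.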
# i - PyTorch & micrograd forward passes produce identical o.data values (0.7071)
#     both backward passes produce identical leaf gradients
reset_graph('graph')
o.backward()
draw_dot(o)
reset_graph(): All vars, initial and intermediate, have been reset. All gradients now 0
[draw_dot output: rendered computation graph of o. The node data/grad values match the PyTorch results above: o grad 1.0, n grad 0.5, b grad 0.5, x1 grad -1.5, w1 grad 1.0, x2 grad 0.5, w2 grad 0.0]

Building a neural net library on top of micrograd (mirroring the PyTorch API)

Neural nets are a specific class of mathematical expression.

# i - build a neural net library on micrograd's `Value` (classes: `Neuron`, `Layer`, and `MLP`)
import random
 
class Neuron:
    # nin: number of inputs to the neuron
    def __init__(self, nin):
        self.w = [Value(random.uniform(-1,1)) for _ in range(nin)]  # create 1 weight per input 
        self.b = Value(random.uniform(-1,1))                        # create 1 bias for the neuron
    
    # a function call [e.g. n(x)] returns the forward pass of this neuron (its post-activation value)
    def __call__(self, x):
        # pre-activation weighted input: act = (w ⋅ x) + b. NB: w * x is their dot product
        act = sum((wi*xi for wi, xi in zip(self.w, x)), self.b)
        out = act.tanh()    # output: after applying the activation function (non-linearity)
        return out
    
    def parameters(self):
        return self.w + [self.b]
 
class Layer:
    # nout: how many (independently evaluated) neurons in this layer
    def __init__(self, nin, nout):
        # define 1 layer as a list of `nout` `Neuron` objects, EACH being a `nin`-dim Neuron
        self.neurons = [Neuron(nin) for _ in range(nout)]
    
    # a function call [e.g. l(x)] independently evaluates the `nout` neurons in this layer
    def __call__(self, x):
        outs = [n(x) for n in self.neurons]
        return outs[0] if len(outs) == 1 else outs  # unwrap a single-element list to a bare Value
    
    def parameters(self):
        return [p for neuron in self.neurons for p in neuron.parameters()]
 
class MLP:
    # nouts: list defining the desired sizes of each layer in the MLP
    def __init__(self, nin, nouts):
        sz = [nin] + nouts
        # define each `Layer` object by iterating consecutive pairs of sizes (i and i+1)
        self.layers = [Layer(sz[i], sz[i+1]) for i in range(len(nouts))]
    
    # a function call forwards `x` through each `Layer` sequentially
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
    
    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

Example calls of each class Neuron, Layer, MLP

Neuron object

# i - `Neuron` object. n(x) returns the activation value of neuron `n`
x = [2.0, 3.0]  # list of inputs into neuron (e.g. raw data): 2-input list (into a 2-dim neuron)
n = Neuron(2)   # initialise empty 2-dim Neuron: nin = 2
n(x)            # feed input data list `x` into Neuron `n` (or "forward 1 neuron"). routes to __call__ method
Value(data=-0.9940070170011006)

Layer object

# i - `Layer` object. l(x) returns a list of all N neurons' activation values in layer `l`
x = [2.0, 3.0]  # list of inputs (e.g. prev. layer's neuron activation values): 2 inputs (into a 2-dim neuron)
l = Layer(2, 3) # initialise empty Layer: 3 neurons (nout = 3), EACH is 2-dim (nin = 2)
l(x)            # feed input data list `x` into Layer `l`. routes to __call__ method
[Value(data=0.5873084104615038),
 Value(data=-0.5857763932198019),
 Value(data=0.9927969183367131)]

MLP object

# i - `MLP` object. recreate #image 2 above (in: 3 -> 4 -> 4 -> out: 1)
x = [2.0, 3.0, -1.0]    # list of 3 input neurons (here, raw data). each neuron in next layer will be 3-dim
m = MLP(3, [4, 4, 1])   # init MLP shape: 3 input neurons -> 4 neuron layer -> 4 neuron layer -> 1 output neuron
m(x)                    # MLP function call: forward pass, returns the output `Value` node
Value(data=0.045746346776616424)
# draw_dot(m(x))        # huge output graph!
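
parameters() was defined on each class but not exercised above; a quick sketch counting them, reusing the n, l, m objects just created:

# i - `parameters()` flattens every weight & bias into one list (useful for gradient descent later)
print(len(n.parameters()))   # Neuron(2): 2 weights + 1 bias = 3
print(len(l.parameters()))   # Layer(2, 3): 3 neurons * (2 + 1) = 9
print(len(m.parameters()))   # MLP(3, [4, 4, 1]): 4*(3+1) + 4*(4+1) + 1*(4+1) = 41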

Takeaways

Sources