# imports
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

A note on graph direction terminology

4. Create NN data structure, and perform forward pass

Define and use the Value object (a data structure to maintain the NN)

Let’s incrementally build the data structures to maintain the massive mathematical expressions that are NNs.

4.1. Define Value object and its basic operations (add, mul)

Define Value object:

# define `Value` class with constructor (`__init__`) and `__repr__` functions
class Value:
    """Wraps a single scalar so we can later grow it into an expression-graph node."""

    # NB: __init__ runs automatically when a new instance is created;
    # it sets up the instance's attributes.
    def __init__(self, data):
        # the raw number carried by this node
        self.data = data

    # NB: __repr__ is the built-in debug/logging representation hook;
    # without it the notebook would show <Value object at 0x...>.
    def __repr__(self):
        return f"Value(data={self.data})"
 
# create two Value objects; the tuple expression shows them via __repr__
a = Value(2.0)
b = Value(-3.0)
a, b
# a + b # Throws error; Python doesn't know how to add two Value objects
(Value(data=2.0), Value(data=-3.0))

Define methods for add and mul:

# extend `Value` class with methods `.__add__(self, other)` and `.__mul__(self, other)`
class Value:
    """Scalar wrapper that supports + and * between Value objects."""

    def __init__(self, data):
        # the raw float held by this node
        self.data = data

    # NB: try commenting __repr__ out to see how useful it is!
    def __repr__(self):
        return f"Value(data={self.data})"

    # Dunder ("__") methods teach Python what the basic operators (+, *)
    # mean for user-defined objects such as Value — it has no default notion
    # of how to "ADD" two Value objects.
    def __add__(self, other):
        # the "+" below is ordinary float addition on the .data payloads,
        # and the sum is wrapped in a fresh Value
        return Value(self.data + other.data)

    def __mul__(self, other):
        # same idea for multiplication
        return Value(self.data * other.data)

Test add and mul methods:

# test `a + b`; `a * b`
a = Value(2.0)
b = Value(-3.0)
print('a.__add__(b):', a.__add__(b)) # i.e. self.__add__(other)
print('a + b:', a + b) # Python invokes a.__add__(b) because it notes a and b are Value objects
 
c = Value(10.0)
 
# check that multiplication works:
print('\na*b + c:', a*b + c)
 
# for learning, here's the python internal call (manual call)
print('a.__mul__(b).__add__(c):', a.__mul__(b).__add__(c))
a.__add__(b): Value(data=-1.0)
a + b: Value(data=-1.0)
 
a*b + c: Value(data=4.0)
a.__mul__(b).__add__(c): Value(data=4.0)

4.2. Create pointers to track the graph

This tells us which values produce which others

  • Create _children variable (default: as empty tuple)
  • Maintain _prev, the set of _children
  • Feed in _children of a given Value as inputs to the __add__ and __mul__ methods.

Confused? See backprop-graph-terminology

# extend `Value` class (and its methods) with attribute(s): `_children` -> `_prev`
class Value:
    """Scalar value that also remembers which Values produced it."""

    # `_children` defaults to an empty tuple (leaf nodes have no producers)
    def __init__(self, data, _children=()):
        self.data = data
        # keep the producers as a set; by default `_prev` is empty
        self._prev = set(_children)

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        # the freshly created node (e.g. "d" in the demo below) records
        # (self, other) as its children
        return Value(self.data + other.data, (self, other))

    def __mul__(self, other):
        return Value(self.data * other.data, (self, other))
 
# build a small expression graph d = a*b + c and inspect the child pointers
a = Value(2.0)
b = Value(-3.0)
c = Value(10.0)
d = a*b + c
print('d:', d)
print('d._prev:', d._prev, '<- Note; these are Value objects [a*b] and [c]; the "children" of [d]')
d: Value(data=4.0)
d._prev: {Value(data=-6.0), Value(data=10.0)} <- Note; these are Value objects [a*b] and [c]; the "children" of [d]
  • Create _op variable (default: empty string for leaves)
  • Update _op as simple string '+' or '*' for __add__ or __mul__ respectively.
# extend `Value` class (and its methods) with string attribute: `_op`
class Value:
    """Scalar graph node that records its children and the op that made it."""

    # `_op` is the empty string for leaf nodes
    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self._prev = set(_children)
        # remember which operation created this Value
        # (e.g. the demo node "d" below is created by a '+')
        self._op = _op

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        # tag the result with '+' so the graph knows what produced it
        return Value(self.data + other.data, (self, other), '+')

    def __mul__(self, other):
        # likewise tag products with '*'
        return Value(self.data * other.data, (self, other), '*')
 
# rebuild the graph and inspect the new `_op` bookkeeping
a = Value(2.0)
b = Value(-3.0)
c = Value(10.0)
d = a*b + c
print('d:', d)
print('d._prev:', d._prev)
print('d._op:', d._op)
d: Value(data=4.0)
d._prev: {Value(data=-6.0), Value(data=10.0)}
d._op: +

5. Visualise the NN data structure

Now that we have the full mathematical expression of this basic NN, let’s visualise it as a graph with nodes and edges using Graphviz.

from graphviz import Digraph
 
def trace(root):
    """Enumerate every node and (child, parent) edge reachable from `root`."""
    nodes, edges = set(), set()
    # iterative depth-first walk over the `_prev` pointers
    pending = [root]
    while pending:
        v = pending.pop()
        if v in nodes:
            continue
        nodes.add(v)
        for child in v._prev:
            edges.add((child, v))
            pending.append(child)
    return nodes, edges
    
def draw_dot(root):
    """Render the expression graph rooted at `root` as a Graphviz SVG diagram."""
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right

    nodes, edges = trace(root)
    for node in nodes:
        node_id = str(id(node))

        # every Value becomes a rectangular ('record') node showing its data
        graph.node(name=node_id, label="{ data %.4f }" % (node.data), shape='record')

        if node._op:
            # a Value produced by an operation also gets a small op node...
            graph.node(name=node_id + node._op, label=node._op)
            # ...wired into it
            graph.edge(node_id + node._op, node_id)

    for src, dst in edges:
        # connect each child to the op node of its parent
        graph.edge(str(id(src)), str(id(dst)) + dst._op)

    return graph
 
# NB: Only the rectangles are real `Value` objects with data (i.e. nodes in the NN)
# The op "nodes" (ovals) are only there to visualise which operation (+ or *) was used
draw_dot(d)
4607736912 data 10.0000 4600719840+ + 4607736912->4600719840+ 4607735952 data -3.0000 4600720144* * 4607735952->4600720144* 4600720144 data -6.0000 4600720144->4600719840+ 4600720144*->4600720144 4608827344 data 2.0000 4608827344->4600720144* 4600719840 data 4.0000 4600719840+->4600719840

Note

  • The op nodes (ovals) are not real objects in the network.
  • They simply visualise which operation (+ or *) was used to create the Value objects
    • Only the rectangles are real Value nodes with data!

5.1. Add labels to nodes

Add label variable to Value object to see which variables are where in the visualised graph:

# extend `Value` class (and its methods) with string attribute: `label`
class Value:
    """Graph node carrying data, child pointers, creating op, and a display label."""

    # `label` defaults to the empty string
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self._prev = set(_children)
        self._op = _op
        # human-readable name shown in the visualised graph
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        return Value(self.data + other.data, (self, other), '+')

    def __mul__(self, other):
        return Value(self.data * other.data, (self, other), '*')

Create a NN with 5 label-ed nodes:

  • The e node [a*b] was previously present only implicitly, as a child of d
    • See above: Sec. 4.2 d= a*b + c, and Sec. 5 digraph draw_dot(d)
  • Here, we explicitly create and label it to cleanly display the “children” of node d
# i - initialise a new NN with 5 labelled nodes (`Value` objects)
a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = Value(10.0, label='c')
# naming the intermediate product `e` makes it a visible node in the graph
e = a*b; e.label = 'e'  # now explicit
d = e + c; d.label = 'd'
d  # return final node
Value(data=4.0)

5.1.1. Update graphviz with labels

# redefine graphviz to add this info:
from graphviz import Digraph
 
def trace(root):
    """Collect all nodes and (child, parent) edges of the graph under `root`."""
    nodes, edges = set(), set()
    # depth-first traversal using an explicit stack
    stack = [root]
    while stack:
        v = stack.pop()
        if v in nodes:
            continue
        nodes.add(v)
        for child in v._prev:
            edges.add((child, v))
            stack.append(child)
    return nodes, edges
    
def draw_dot(root):
    """Render the expression graph rooted at `root`, labelling each node."""
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right

    nodes, edges = trace(root)
    for node in nodes:
        node_id = str(id(node))

        # one rectangular ('record') node per Value: its label plus its data
        graph.node(name=node_id, label="{ %s | data %.4f }" % (node.label, node.data), shape='record')

        if node._op:
            # operation-produced Values get an extra op node...
            graph.node(name=node_id + node._op, label=node._op)
            # ...feeding into the value node
            graph.edge(node_id + node._op, node_id)

    for src, dst in edges:
        # connect each child to the op node of its parent
        graph.edge(str(id(src)), str(id(dst)) + dst._op)

    return graph
 
# NB: The op nodes (ovals) are not real objects;
# they simply visualise which operation (+ or *) created the actual objects (rectangles are the real nodes!)
draw_dot(d)
4609458256 b data -3.0000 4608097200* * 4609458256->4608097200* 4609459856 c data 10.0000 4608097504+ + 4609459856->4608097504+ 4608097504 d data 4.0000 4608097504+->4608097504 4608830704 a data 2.0000 4608830704->4608097200* 4608097200 e data -6.0000 4608097200->4608097504+ 4608097200*->4608097200

5.1.2. Add one more Value object (go 1 layer deeper)

# i - extend the NN by 1 layer
f = Value(-2.0, label='f')
L = d * f; L.label = 'L' # the output of our graph
 
draw_dot(L)
4609458256 b data -3.0000 4608097200* * 4609458256->4608097200* 4609459856 c data 10.0000 4608097504+ + 4609459856->4608097504+ 4608097504 d data 4.0000 4609346288* * 4608097504->4609346288* 4608097504+->4608097504 4609346288 L data -8.0000 4609346288*->4609346288 4608830704 a data 2.0000 4608830704->4608097200* 4495821136 f data -2.0000 4495821136->4609346288* 4608097200 e data -6.0000 4608097200->4608097504+ 4608097200*->4608097200

5.2. Not all “leaf nodes” are actually of interest

  • Leaf nodes a, b, c, and f above may represent either input data OR weights
  • We are only interested in how the weights impact the loss function L.
    • Weights are updated (trained) to minimise L
    • The derivative of L wrt data is uninteresting, because input data is fixed (untrainable)

Takeaways

  • That’s the forward pass! Its output was L=-8
  • Next, let’s do backpropagation:
    • We’ll work backwards from L node; and calculate the gradient at each of the intermediate values (nodes a to f)
    • Specifically, at each node (a to f) compute the derivative of L with respect to that node.

What’s happening?

  • In essence, L is the “loss function” and the prior nodes a to f represent the “weights” of a NN
    • Note: “weights” are different to the “data” held at each prior node.
  • We’re interested in how the weights impact the loss function
    • Hence, the derivative of the loss function wrt the weights

Towards backpropagation

Maintain the gradient of L wrt each prior node

Update Value object to maintain the derivative grad of L with respect to each prior node.

  • Start at the final node L work backwards (recursively)
  • Individually compute derivative of L wrt each prior node, i.e.:
    • d(L)/dL,
    • d(L)/df,
    • d(L)/dd, and so on
  • Initialise grad as 0 at all nodes
    • assume each variable (node) has no effect on the output value (loss function L)
# extend `Value` class with `.grad` attribute:
class Value:
    """Graph node that additionally tracks `grad`, d(L)/d(this node)."""

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        # at initialisation assume the node has no effect on the
        # output value (the loss function L), hence grad = 0.0
        self.grad = 0.0
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        return Value(self.data + other.data, (self, other), '+')

    def __mul__(self, other):
        return Value(self.data * other.data, (self, other), '*')
 
# rebuild the same graph with the grad-aware Value class
a = Value(2.0, label='a')
b = Value(-3.0, label='b')
c = Value(10.0, label='c')
e = a*b; e.label = 'e'
d = e + c; d.label = 'd'
f = Value(-2.0, label='f')
L = d * f; L.label = 'L' # the output of our graph
L
Value(data=-8.0)

Update graphviz with initialised gradient info

# redefine graphviz to add gradients to the visualisation
from graphviz import Digraph
 
def trace(root):
    """Gather every node and (child, parent) edge reachable from `root`."""
    nodes, edges = set(), set()
    # explicit work-list instead of recursion; visits each node once
    to_visit = [root]
    while to_visit:
        v = to_visit.pop()
        if v in nodes:
            continue
        nodes.add(v)
        for child in v._prev:
            edges.add((child, v))
            to_visit.append(child)
    return nodes, edges
    
def draw_dot(root):
    """Render the expression graph rooted at `root`, showing label, data and grad."""
    graph = Digraph(format='svg', graph_attr={'rankdir': 'LR'})  # LR = left to right

    nodes, edges = trace(root)
    for node in nodes:
        node_id = str(id(node))

        # one rectangular ('record') node per Value: label, data and grad
        graph.node(name=node_id, label="{ %s | data %.4f | grad %.4f }" % (node.label, node.data, node.grad), shape='record')

        if node._op:
            # operation-produced Values get an op node...
            graph.node(name=node_id + node._op, label=node._op)
            # ...connected into the value node
            graph.edge(node_id + node._op, node_id)

    for src, dst in edges:
        # connect each child to the op node of its parent
        graph.edge(str(id(src)), str(id(dst)) + dst._op)

    return graph
 
draw_dot(L)  # render the graph; each rectangle now also shows its (still-zero) grad
4495453696 L data -8.0000 grad 0.0000 4495453696* * 4495453696*->4495453696 4608831040 a data 2.0000 grad 0.0000 4608096592* * 4608831040->4608096592* 4609671248 d data 4.0000 grad 0.0000 4609671248->4495453696* 4609671248+ + 4609671248+->4609671248 4609460816 c data 10.0000 grad 0.0000 4609460816->4609671248+ 4609460496 b data -3.0000 grad 0.0000 4609460496->4608096592* 4608096592 e data -6.0000 grad 0.0000 4608096592->4609671248+ 4608096592*->4608096592 4609120240 f data -2.0000 grad 0.0000 4609120240->4495453696*

Sources