Module 2.0 - Neural Networks¶
Our Goal¶
Compute the derivative of a Python function with respect to its inputs.
Example: Function¶
def expression():
    x = Scalar(1.0)
    y = Scalar(1.0)
    z = -y * sum([x, x, x]) * y + 10.0 * x
    h_x_y = z + z
    return h_x_y


SVG(make_graph(expression(), lr=True))
Chain Rule: Simple Case¶
$$
\begin{eqnarray*}
z &=& g(x) \\
d &=& f'(z) \\
f'_x(g(x)) &=& g'(x) \times d \\
\end{eqnarray*}
$$
draw_boxes(["$x$", "$z = g(x)$", "$f(g(x))$"], [1, 1])

draw_boxes([r"$d\cdot g'(x)$", "$f'(z)$", "$1$"], [1, 1], lr=False)
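A quick numeric check of this rule (a sketch, not from the slides, assuming the Module 1 Scalar supports *, backward(), and .derivative):

x = Scalar(3.0)
z = x * 2.0          # z = g(x) = 2x, so g'(x) = 2
out = z * 5.0        # f(z) = 5z, so d = f'(z) = 5
out.backward()
x.derivative         # expected d * g'(x) = 5 * 2 = 10.0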
Chain Rule: Two Arguments¶
$$
\begin{eqnarray*}
z &=& g(x, y) \\
d &=& f'(z) \\
f'_x(g(x, y)) &=& g_x'(x, y) \times d \\
f'_y(g(x, y)) &=& g_y'(x, y) \times d
\end{eqnarray*}
$$
draw_boxes([("$x$", "$y$"), "$z = g(x, y)$", "$h(x,y)$"], [1, 1])

draw_boxes(
    [(r"$d \times g'_x(x, y)$", r"$d \times g'_y(x, y)$"), "$f'(z)$", "$1$"],
    [1, 1],
    lr=False,
)
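The same check for a two-argument g (again a hedged sketch using the Module 1 Scalar):

x, y = Scalar(2.0), Scalar(4.0)
z = x * y                     # g(x, y) = x * y, so g'_x = y and g'_y = x
out = z * 10.0                # d = f'(z) = 10
out.backward()
x.derivative, y.derivative    # expected (40.0, 20.0)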
Chain Rule: Repeated Use¶
$$z = g(x)$$
$$f(z, z)$$
draw_boxes(["$x$", ("$z_1$", "$z_2$"), "$h(x)$"], [1, 1])
Chain Rule: Repeated Use¶
$$
\begin{eqnarray*}
d &=& f'_{z_1}(z_1, z_2) + f'_{z_2}(z_1, z_2) \\
h'_x(x) &=& d \times g'_x(x) \\
\end{eqnarray*}
$$
draw_boxes(["$x$", ("$z_1 = g(x)$", "$z_2 = g(x)$"), "$h(x)$"], [1, 1])

draw_boxes(
    [r"$d \cdot g'_x(x)$", ("$f'_{z_1}(z_1, z_2)$", "$f'_{z_2}(z_1, z_2)$"), "$1$"],
    [1, 1],
    lr=False,
)
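And the repeated-use case, where the two derivatives flowing into z are summed before being multiplied by g'(x) (hedged sketch, same Scalar assumptions):

x = Scalar(2.0)
z = x * 3.0          # z = g(x), so g'(x) = 3
out = z * z          # f(z_1, z_2) = z_1 * z_2, with z used twice
out.backward()
# d = f'_{z_1} + f'_{z_2} = z + z = 12, so h'(x) = d * g'(x) = 36
x.derivative         # expected 36.0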
Algorithm: Outer Loop¶
- Call topological sort
- Create dict of edges and empty $d$ values
- For each edge and $d$ in topological order:
Algorithm: Inner Loop¶
- If the edge goes to a Leaf, done
- Call backward with $d$ on the previous box
- Loop through all its input edges and add the derivative (both loops are sketched below)
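Putting the two loops together, here is a minimal sketch of the algorithm (assuming a minitorch-style Variable with parents, is_leaf(), chain_rule(d), and accumulate_derivative; those names are assumptions of the sketch, not shown on the slides):

def topological_sort(root):
    # Reverse DFS post-order: the output box first, each box after all of its consumers.
    seen, order = set(), []

    def visit(var):
        if id(var) in seen or var.is_leaf():
            return
        seen.add(id(var))
        for parent in var.parents:
            visit(parent)
        order.insert(0, var)

    visit(root)
    return order


def backpropagate(root, d_root=1.0):
    # Outer loop: dict from box to its accumulated d value, processed in topological order.
    derivs = {id(root): d_root}
    for var in topological_sort(root):
        d = derivs.pop(id(var))
        # Inner loop: push d back through this box along each input edge.
        for parent, d_parent in var.chain_rule(d):
            if parent.is_leaf():
                parent.accumulate_derivative(d_parent)   # leaf: store the result, done
            else:
                derivs[id(parent)] = derivs.get(id(parent), 0.0) + d_parent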
Example¶
chalk.set_svg_height(200)
backprop(1)
Example¶
backprop(2)
Example¶
backprop(3)
Example¶
backprop(4)
Example¶
backprop(5)
Example¶
backprop(6)
Example¶
backprop(7)

chalk.set_svg_height(200)
Quiz¶
Outline¶
- Model Training
- Neural Networks
- Modern Models
Model Training¶
Reminder: MiniML¶
- Dataset - Data to fit
- Model - Shape of fit
- Loss - Goodness of fit
Model 1¶
- Linear Model
from minitorch import Parameter, Module
class Linear(Module):
    def __init__(self, w1, w2, b):
        super().__init__()
        self.w1 = Parameter(w1)
        self.w2 = Parameter(w2)
        self.b = Parameter(b)

    def forward(self, x1: float, x2: float) -> float:
        return self.w1.value * x1 + self.w2.value * x2 + self.b.value


model = Linear(1, 1, -0.9)
draw_graph(model)
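Evaluating this model at a couple of points shows how the decision rule works (hand-computed values, not from the slides):

model.forward(1.0, 0.0)   # 1 * 1.0 + 1 * 0.0 - 0.9 =  0.1  -> positive side of the separator
model.forward(0.2, 0.2)   # 1 * 0.2 + 1 * 0.2 - 0.9 = -0.5  -> negative side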
Point Loss¶
def point_loss(x):
    return minitorch.operators.relu(x)


def full_loss(m):
    l = 0
    for x, y in zip(s.X, s.y):
        l += point_loss(-y * m.forward(*x))
    return -l


graph(point_loss, [], [-2, -0.2, 1])
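Since the point loss is ReLU applied to $-y \cdot m(x)$, it is zero whenever the prediction agrees in sign with the label and grows linearly otherwise. A quick check (values follow directly from ReLU):

point_loss(-1.0), point_loss(0.0), point_loss(2.0)   # -> (0.0, 0.0, 2.0)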
Class Goal¶
- Find parameters that minimize loss
chalk.hcat(
    [show(Linear(1, 1, -0.6)), show(Linear(1, 1, -0.7)), show(Linear(1, 1, -0.8))], 0.3
)
Parameter Fitting¶
- (Forward) Compute the loss function, $L(w_1, w_2, b)$
- (Backward) See how small changes to the parameters would change the loss
- Update the parameters to locally reduce the loss (one update step is sketched below)
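One concrete way to do the update step is plain gradient descent. A hedged sketch, assuming the minitorch Parameter/Scalar interface (p.update, .value, .data, .derivative) and an illustrative learning rate:

def sgd_step(model, lr=0.05):
    # Move each parameter a small step against its derivative.
    for p in model.parameters():
        p.update(Scalar(p.value.data - lr * p.value.derivative))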
Update Procedure¶
chalk.set_svg_height(400)

show_loss(full_loss, Linear(1, 1, 0))
chalk.set_svg_height(200)
Module for Linear¶
class LinearModule(minitorch.Module):
    def __init__(self):
        super().__init__()
        # 0.0 is start value for param
        self.w1 = Parameter(Scalar(0.0))
        self.w2 = Parameter(Scalar(0.0))
        self.bias = Parameter(Scalar(0.0))

    def forward(self, x1: Scalar, x2: Scalar) -> Scalar:
        return x1 * self.w1.value + x2 * self.w2.value + self.bias.value
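A small usage check (assuming Scalar arithmetic from Module 1):

lin = LinearModule()
lin.forward(Scalar(1.0), Scalar(2.0))   # Scalar(0.0), since every parameter starts at 0.0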
Training Loop¶
def train_step(optim, model, data):
    # Step 1 - Forward (Loss function)
    x_1, x_2 = Scalar(data[0]), Scalar(data[1])
    loss = model.forward(x_1, x_2).relu()
    # Step 2 - Backward (Compute derivative)
    loss.backward()
    # Step 3 - Update Params
    optim.step()
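Driving this step over a dataset could look like the sketch below; minitorch.optim.SGD, zero_grad(), and the dataset variable are assumptions here, not part of the slide:

model = LinearModule()
optim = minitorch.optim.SGD(model.parameters(), 0.05)
for epoch in range(50):
    for example in dataset:      # assumed: an iterable of (x1, x2) pairs
        optim.zero_grad()        # clear derivatives left over from the previous step
        train_step(optim, model, example)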
More Features: Linear Model¶
$\text{lin}(x; w, b) = x_1 \times w_1 + \ldots + x_n \times w_n + b$
More Features: Linear (Code)¶
class LinearModule(minitorch.Module):
    def __init__(self, in_size):
        super().__init__()
        self.weights = []
        # Need add_parameter so the module tracks each weight
        for i in range(in_size):
            self.weights.append(self.add_parameter(f"weight_{i}", 0.0))
        self.bias = self.add_parameter("bias", 0.0)

    def forward(self, inputs):
        # lin(x; w, b) = x_1 * w_1 + ... + x_n * w_n + b
        out = self.bias.value
        for x, w in zip(inputs, self.weights):
            out = out + x * w.value
        return out
Neural Networks¶
Linear Model Example¶
- Parameters

chalk.set_svg_height(300)
model1 = Linear(1, 1, -1.0)
model2 = Linear(0.5, 1.5, -1.0)
compare(model1, model2)
Harder Datasets¶
split_graph(s1_hard, s2_hard, show_origin=True)
Harder Datasets¶
- The model may not fit well for any choice of parameters.
model = Linear(1, 1, -0.7)
draw_with_hard_points(model)
Neural Networks¶
- A new model
- Uses repeated splits of the data
- The loss function stays the same
Intuition: Neural Networks¶
- Apply many linear separators
- Reshape the data space based on the results
- Apply a linear model on the new space
Notation: Multiple Parameters¶
- Use superscripts $w^0$ and $w^1$ to indicate different parameters.
- Our final model will have many linears.
- These will become Torch sub-modules.
Intuition: Split 1¶
yellow = Linear(-1, 0, 0.25)
ycolor = Color("#fde699")
draw_with_hard_points(yellow, ycolor, Color("white"))
Reshape: ReLU¶
graph(
    minitorch.operators.relu,
    [yellow.forward(*pt) for pt in s2_hard],
    [yellow.forward(*pt) for pt in s1_hard],
    3,
    0.25,
    c=ycolor,
)
Math View¶
$$
\begin{eqnarray*}
h_1 &=& \text{ReLU}(\text{lin}(x; w^0, b^0)) \\
\end{eqnarray*}
$$
Intuition: Split 2¶
green = Linear(1, 0, -0.8)
gcolor = Color("#d1e9c3")
draw_with_hard_points(green, gcolor, Color("white"))
Math View¶
$$
\begin{eqnarray*}
h_2 &=& \text{ReLU}(\text{lin}(x; w^1, b^1)) \\
\end{eqnarray*}
$$
Reshape: ReLU¶
graph(
    minitorch.operators.relu,
    [green.forward(*pt) for pt in s2_hard],
    [green.forward(*pt) for pt in s1_hard],
    3,
    0.25,
    c=gcolor,
)
Reshape: ReLU¶
draw_nn_graph(green, yellow)
Final Layer¶
from dataclasses import dataclass


@dataclass
class MLP:
    lin1: Linear
    lin2: Linear
    final: Linear

    def forward(self, x1, x2):
        x1_1 = minitorch.operators.relu(self.lin1.forward(x1, x2))
        x2_1 = minitorch.operators.relu(self.lin2.forward(x1, x2))
        return self.final.forward(x1_1, x2_1)


mlp = MLP(green, yellow, Linear(3, 3, -0.3))
draw_with_hard_points(mlp)
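Tracing one point through the two units and the final layer (the expected value is a hand computation from the weights above, not from the slides):

# green(0, 0) = -0.8 -> ReLU -> 0.0;  yellow(0, 0) = 0.25 -> ReLU -> 0.25
# final: 3 * 0.0 + 3 * 0.25 - 0.3 = 0.45
mlp.forward(0.0, 0.0)   # expected 0.45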
Math View¶
$$
\begin{eqnarray*}
h_1 &=& \text{ReLU}(x_1 \times w^0_1 + x_2 \times w^0_2 + b^0) \\
h_2 &=& \text{ReLU}(x_1 \times w^1_1 + x_2 \times w^1_2 + b^1) \\
m(x_1, x_2) &=& h_1 \times w_1 + h_2 \times w_2 + b
\end{eqnarray*}
$$

Parameters: $w_1, w_2, w^0_1, w^0_2, w^1_1, w^1_2, b, b^0, b^1$
Math View (Alt)¶
$$
\begin{eqnarray*}
h_1 &=& \text{ReLU}(\text{lin}(x; w^0, b^0)) \\
h_2 &=& \text{ReLU}(\text{lin}(x; w^1, b^1)) \\
m(x_1, x_2) &=& \text{lin}(h; w, b)
\end{eqnarray*}
$$
Code View¶
Linear
class LinearModule(Module):
    def __init__(self):
        super().__init__()
        self.w_1 = Parameter(Scalar(0.0))
        self.w_2 = Parameter(Scalar(0.0))
        self.b = Parameter(Scalar(0.0))

    def forward(self, inputs):
        return inputs[0] * self.w_1.value + inputs[1] * self.w_2.value + self.b.value
Code View¶
Model
class Network(minitorch.Module):
    def __init__(self):
        super().__init__()
        self.unit1 = LinearModule()
        self.unit2 = LinearModule()
        self.classify = LinearModule()

    def forward(self, x):
        h1 = self.unit1.forward(x).relu()
        h2 = self.unit2.forward(x).relu()
        return self.classify.forward((h1, h2))
Training¶
- All the parameters in the model are leaves
- Computing backward on the loss fills in their derivatives
model = Network()
parameters = dict(model.named_parameters())
parameters

{'unit1.w_1': Scalar(0.0),
 'unit1.w_2': Scalar(0.0),
 'unit1.b': Scalar(0.0),
 'unit2.w_1': Scalar(0.0),
 'unit2.w_2': Scalar(0.0),
 'unit2.b': Scalar(0.0),
 'classify.w_1': Scalar(0.0),
 'classify.w_2': Scalar(0.0),
 'classify.b': Scalar(0.0)}
Derivatives¶
- All the parameters in the model are leaf Variables
model = Network()
x1, x2 = Scalar(0.5), Scalar(0.5)
# Step 1 - Forward
out = model.forward((x1, x2))
loss = out.relu()
# Step 2 - Backward
loss.backward()
SVG(make_graph(loss, lr=True))
Derivatives¶
- All the parameters in the model are leaf Scalars
parameters = dict(model.named_parameters())
parameters["unit1.w_1"].value.derivative