```python slideshow={"slide_type": "skip"} +from dataclasses import dataclass
+import chalk +from colour import Color +from IPython.display import SVG +from mt_diagrams.autodiff_draw import backprop, draw_boxes +from mt_diagrams.mlprimer_draw import ( + compare, + draw_graph, + draw_nn_graph, + draw_with_hard_points, + graph, + s, + s1_hard, + s2_hard, + show, + show_loss, + split_graph, +) +from mt_diagrams.show_expression import make_graph
+import minitorch +from minitorch import Module, Parameter, Scalar
+chalk.set_svg_draw_height(150) +chalk.set_svg_height(100) +
<!-- #region slideshow={"slide_type": "slide"} -->
+
+Module 2.0 - Neural Networks
+==============================
+<!-- #endregion -->
+
+<!-- #region slideshow={"slide_type": "slide"} -->
+Our Goal
+-----------
+
+Compute the derivative of a Python function with respect to its inputs.
+<!-- #endregion -->
+
+
+<!-- #region slideshow={"slide_type": "slide"} -->
+Example: Function
+---------------------
+<!-- #endregion -->
+
+
+```python slideshow={"slide_type": "x"}
+def expression():
+ x = Scalar(1.0)
+ y = Scalar(1.0)
+ z = -y * sum([x, x, x]) * y + 10.0 * x
+ h_x_y = z + z
+ return h_x_y
+
```python slideshow={"slide_type": "x"} tags=["hide_inp"] +SVG(make_graph(expression(), lr=True)) +
<!-- #region slideshow={"slide_type": "slide"} -->
+Chain Rule: Simple Case
+-----------
+
+ $$
+\begin{eqnarray*}
+z &=& g(x) \\
+d &=& f'(z) \\
+f'_x(g(x)) &=& g'(x) \times d \\
+\end{eqnarray*}
+ $$
+<!-- #endregion -->
+
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+draw_boxes(["$x$", "$z = g(x)$", "$f(g(x))$"], [1, 1])
+
```python slideshow={"slide_type": "x"} tags=["hide_inp"] +draw_boxes([r"\(d\cdot g'(x)\)", "\(f'(z)\)", "\(1\)"], [1, 1], lr=False) +
<!-- #region slideshow={"slide_type": "slide"} -->
+Chain Rule: Two Arguments
+-------------------------
+
+ $$
+ \begin{eqnarray*}
+ z &=& g(x, y) \\
+ d &=& f'(z) \\
+ f'_x(g(x, y)) &=& g_x'(x, y) \times d \\
+ f'_y(g(x, y)) &=& g_y'(x, y) \times d
+ \end{eqnarray*}
+ $$
+<!-- #endregion -->
+
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+draw_boxes([("$x$", "$y$"), "$z = g(x, y)$", "$h(x,y)$"], [1, 1])
+
```python slideshow={"slide_type": "x"} tags=["hide_inp"] +draw_boxes( + [(r"\(d \times g'_x(x, y)\)", r"\(d \times g'_y(x, y)\)"), "\(f'(z)\)", "\(1\)"], + [1, 1], + lr=False, +) +
<!-- #region slideshow={"slide_type": "slide"} -->
+Chain Rule: Repeated Use
+-------------------------
+ $$z = g(x)$$
+ $$f(z, z)$$
+<!-- #endregion -->
+
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+draw_boxes(["$x$", ("$z_1, z_2$"), "$h(x)$"], [1, 1])
+
Chain Rule: Repeated Use
+ $$
+\begin{eqnarray*}
+d &=& f'_{z_1}(z_1, z_2) + f'_{z_2}(z_1, z_2) \\
+h'_x(x) &=& d \times g'_x(x) \\
+\end{eqnarray*}
+ $$
+```python slideshow={"slide_type": "x"} tags=["hide_inp"] +draw_boxes(["\(x\)", ("\(z_1 = g(x)\)", "\(z_2 = g(x)\)"), "\(h(x)\)"], [1, 1]) +
```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+draw_boxes(
+ [r"$d \cdot g'_x(x)$", ("$f'_{z_1}(z_1, z_2)$", "$f'_{z_2}(z_1, z_2)$"), "$1$"],
+ [1, 1],
+ lr=False,
+)
+
Algorithm: Outer Loop
+-
+
- Call topological sort +
- Create dict of edges and empty $d$ values. +
- For each edge and $d$ in topological order: +
Algorithm: Inner Loop
+-
+
- If edge goes to Leaf, done +
- Call `backward` with $d$ on previous box
+ - Loop through all its input edges and add derivative +
Example
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+chalk.set_svg_height(200) +backprop(1) +
<!-- #region slideshow={"slide_type": "slide"} -->
+Example
+-----------
+<!-- #endregion -->
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+backprop(2)
+
Example
+```python slideshow={"slide_type": "x"} tags=["hide_inp"] +backprop(3) +
<!-- #region slideshow={"slide_type": "slide"} -->
+Example
+-----------
+<!-- #endregion -->
+
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+backprop(4)
+
Example
+```python slideshow={"slide_type": "x"} tags=["hide_inp"] +backprop(5) +
<!-- #region slideshow={"slide_type": "slide"} -->
+Example
+-----------
+<!-- #endregion -->
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+backprop(6)
+
Example
+```python slideshow={"slide_type": "x"} tags=["hide_inp"] +backprop(7)
+chalk.set_svg_height(200) +
<!-- #region slideshow={"slide_type": "slide"} -->
+
+Quiz
+------------
+
+
+<!-- #endregion -->
+
+<!-- #region slideshow={"slide_type": "slide"} -->
+Outline
+---------
+* Model Training
+* Neural Networks
+* Modern Models
+
+<!-- #endregion -->
+<!-- #region slideshow={"slide_type": "slide"} -->
+Model Training
+=================
+
+<!-- #endregion -->
+
+<!-- #region slideshow={"slide_type": "slide"} -->
+Reminder: MiniML
+-----------------
+
+* Dataset - Data to fit
+* Model - Shape of fit
+* Loss - Goodness of fit
+<!-- #endregion -->
+
+
+<!-- #region slideshow={"slide_type": "slide"} -->
+Model 1
+---------------------
+
+* Linear Model
+<!-- #endregion -->
+
+
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+from minitorch import Parameter, Module
+class Linear(Module):
+ def __init__(self, w1, w2, b):
+ super().__init__()
+ self.w1 = Parameter(w1)
+ self.w2 = Parameter(w2)
+ self.b = Parameter(b)
+
+ def forward(self, x1: float, x2: float) -> float:
+ return self.w1.value * x1 + self.w2.value * x2 + self.b.value
+
+
+model = Linear(1, 1, -0.9)
+draw_graph(model)
+
Point Loss
+```python slideshow={"slide_type": "x"} +def point_loss(x): + return minitorch.operators.relu(x)
+def full_loss(m): + l = 0 + for x, y in zip(s.X, s.y): + l += point_loss(-y * m.forward(*x)) + return -l
+graph(point_loss, [], [-2, -0.2, 1]) +
<!-- #region slideshow={"slide_type": "slide"} -->
+Class Goal
+-----------
+
+ * Find parameters that minimize loss
+<!-- #endregion -->
+
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+
+chalk.hcat(
+ [show(Linear(1, 1, -0.6)), show(Linear(1, 1, -0.7)), show(Linear(1, 1, -0.8))], 0.3
+)
+
Parameter Fitting
+-
+
- (Forward) Compute the loss function, $L(w_1, w_2, b)$ +
- (Backward) See how small changes would change the loss +
- Update the parameters to locally reduce the loss +
Update Procedure
+```python slideshow={"slide_type": "x"} tags=["hide_inp"] +chalk.set_svg_height(400)
+show_loss(full_loss, Linear(1, 1, 0)) +chalk.set_svg_height(200) +
<!-- #region slideshow={"slide_type": "slide"} -->
+Module for Linear
+--------------------------------------
+<!-- #endregion -->
+```python slideshow={"slide_type": "x"}
+
+
+class LinearModule(minitorch.Module):
+ def __init__(self):
+ super().__init__()
+ # 0.0 is start value for param
+ self.w1 = Parameter(Scalar(0.0))
+ self.w2 = Parameter(Scalar(0.0))
+ self.bias = Parameter(Scalar(0.0))
+
+ def forward(self, x1: Scalar, x2: Scalar) -> Scalar:
+ return x1 * self.w1.value + x2 * self.w2.value + self.bias.value
+
Training Loop
+```python slideshow={"slide_type": "x"}
+def train_step(optim, model, data): + # Step 1 - Forward (Loss function) + x_1, x_2 = Scalar(data[0]), Scalar(data[1]) + loss = model.forward(x_1, x_2).relu() + # Step 2 - Backward (Compute derivative) + loss.backward() + # Step 3 - Update Params + optim.step() +
<!-- #region slideshow={"slide_type": "slide"} -->
+More Features: Linear Model
+------------------------------
+
+ $\text{lin}(x; w, b) = x_1 \times w_1 + \ldots + x_n \times w_n + b$
+
+
+<!-- #endregion -->
+
+<!-- #region slideshow={"slide_type": "slide"} -->
+More Features: Linear (Code)
+--------------------------------------
+<!-- #endregion -->
+```python slideshow={"slide_type": "x"}
+
+
+class LinearModule(minitorch.Module):
+ def __init__(self, in_size):
+ super().__init__()
+ self.weights = []
+ self.bias = []
+ # Need add parameter
+ for i in range(in_size):
+ self.weights.append(self.add_parameter(f"weight_{i}", 0.0))
+
Neural Networks
+Linear Model Example
+-
+
- Parameters +
```python slideshow={"slide_type": "x"} tags=["hide_inp"] +chalk.set_svg_height(300) +model1 = Linear(1, 1, -1.0) +model2 = Linear(0.5, 1.5, -1.0) +compare(model1, model2) +
<!-- #region slideshow={"slide_type": "slide"} -->
+Harder Datasets
+----------------
+<!-- #endregion -->
+
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+split_graph(s1_hard, s2_hard, show_origin=True)
+
Harder Datasets
+-
+
- Model may not be good with any parameters. +
```python slideshow={"slide_type": "x"} tags=["hide_inp"] +model = Linear(1, 1, -0.7) +draw_with_hard_points(model) +
<!-- #region slideshow={"slide_type": "slide"} -->
+Neural Networks
+------------------
+* New *model*
+* Uses repeated splits of data
+* Loss will not change
+<!-- #endregion -->
+
+<!-- #region slideshow={"slide_type": "slide"} -->
+Intuition: Neural Networks
+--------------------------
+
+1. Apply many linear separators
+2. Reshape the data space based on results
+3. Apply a linear model on new space
+<!-- #endregion -->
+
+<!-- #region slideshow={"slide_type": "slide"} -->
+Notation: Multiple Parameters
+--------------------------
+
+* Use superscript $w^0$ and $w^1$ to indicate different parameters.
+* Our final model will have many linears.
+* These will become Torch sub-modules.
+<!-- #endregion -->
+
+
+<!-- #region slideshow={"slide_type": "slide"} -->
+Intuition: Split 1
+--------------------------
+<!-- #endregion -->
+
+
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+yellow = Linear(-1, 0, 0.25)
+ycolor = Color("#fde699")
+draw_with_hard_points(yellow, ycolor, Color("white"))
+
Reshape: ReLU
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+graph( + minitorch.operators.relu, + [yellow.forward(*pt) for pt in s2_hard], + [yellow.forward(*pt) for pt in s1_hard], + 3, + 0.25, + c=ycolor, +) +
<!-- #region slideshow={"slide_type": "slide"} -->
+Math View
+---------------
+
+ $$
+\begin{eqnarray*}
+h_ 1 &=& \text{ReLU}(\text{lin}(x; w^0, b^0)) \\
+\end{eqnarray*}
+ $$
+<!-- #endregion -->
+
+<!-- #region slideshow={"slide_type": "slide"} -->
+Intuition: Split 2
+-------------------------
+<!-- #endregion -->
+
+
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+green = Linear(1, 0, -0.8)
+gcolor = Color("#d1e9c3")
+draw_with_hard_points(green, gcolor, Color("white"))
+
Math View
+ $$
+\begin{eqnarray*}
+h_ 2 &=& \text{ReLU}(\text{lin}(x; w^1, b^1)) \\
+\end{eqnarray*}
+ $$
+Reshape: ReLU
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+graph( + minitorch.operators.relu, + [green.forward(*pt) for pt in s2_hard], + [green.forward(*pt) for pt in s1_hard], + 3, + 0.25, + c=gcolor, +) +
<!-- #region slideshow={"slide_type": "slide"} -->
+Reshape: ReLU
+--------------
+<!-- #endregion -->
+
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+draw_nn_graph(green, yellow)
+
Final Layer
+```python slideshow={"slide_type": "x"} tags=["hide_inp"]
+@dataclass +class MLP: + lin1: Linear + lin2: Linear + final: Linear
+def forward(self, x1, x2):
+ x1_1 = minitorch.operators.relu(self.lin1.forward(x1, x2))
+ x2_1 = minitorch.operators.relu(self.lin2.forward(x1, x2))
+ return self.final.forward(x1_1, x2_1)
+
mlp = MLP(green, yellow, Linear(3, 3, -0.3)) +draw_with_hard_points(mlp) +
<!-- #region slideshow={"slide_type": "slide"} -->
+Math View
+----------
+ $$
+\begin{eqnarray*}
+h_1 &=& \text{ReLU}(x_1 \times w^0_1 + x_2 \times w^0_2 + b^0) \\
+h_2 &=& \text{ReLU}(x_1 \times w^1_1 + x_2 \times w^1_2 + b^1)\\
+m(x_1, x_2) &=& h_1 \times w_1 + h_2 \times w_2 + b
+\end{eqnarray*}
+ $$
+Parameters:
+ $w_1, w_2, w^0_1, w^0_2, w^1_1, w^1_2, b, b^0, b^1$
+<!-- #endregion -->
+
+<!-- #region slideshow={"slide_type": "slide"} -->
+Math View (Alt)
+---------------
+
+ $$
+ \begin{eqnarray*}
+ h_ 1 &=& \text{ReLU}(\text{lin}(x; w^0, b^0)) \\
+ h_ 2 &=& \text{ReLU}(\text{lin}(x; w^1, b^1))\\
+ m(x_1, x_2) &=& \text{lin}(h; w, b)
+ \end{eqnarray*}
+ $$
+<!-- #endregion -->
+
+<!-- #region slideshow={"slide_type": "slide"} -->
+Code View
+----------
+
+Linear
+<!-- #endregion -->
+
+
+```python slideshow={"slide_type": "x"}
+class LinearModule(Module):
+ def __init__(self):
+ super().__init__()
+ self.w_1 = Parameter(Scalar(0.0))
+ self.w_2 = Parameter(Scalar(0.0))
+ self.b = Parameter(Scalar(0.0))
+
+ def forward(self, inputs):
+ return inputs[0] * self.w_1.value + inputs[1] * self.w_2.value + self.b.value
+
Code View
+Model
+```python slideshow={"slide_type": "x"} +class Network(minitorch.Module): + def init(self): + super().init() + self.unit1 = LinearModule() + self.unit2 = LinearModule() + self.classify = LinearModule()
+def forward(self, x):
+ h1 = self.unit1.forward(x).relu()
+ h2 = self.unit2.forward(x).relu()
+ return self.classify.forward((h1, h2))
+
```
+
+Training
+--------
+
+- All the parameters in model are leaves
+- Computing backward on loss fills their derivative
+
python slideshow={"slide_type": "x"} +model = Network() +parameters = dict(model.named_parameters()) +parameters
+
Derivatives
+-
+
- All the parameters in model are leaf Variables +
```python slideshow={"slide_type": "x"} +model = Network() +x1, x2 = Scalar(0.5), Scalar(0.5)
+Step 1
+out = model.forward((0.5, 0.5)) +loss = out.relu()
+Step 2
+SVG(make_graph(loss, lr=True)) +
<!-- #region slideshow={"slide_type": "slide"} -->
+Derivatives
+----------
+* All the parameters in model are leaf scalars
+<!-- #endregion -->
+
+
+```python slideshow={"slide_type": "x"}
+parameters["unit1.w_1"].value.derivative
+