Part of the goal of this project is for me to learn as I go, so I am going to start at the beginning: with Andrej Karpathy’s PyTorch GPT-2 trainer from llm.c. This is the script that Keller Jordan used for his initial baseline. This trainer is very similar to the NanoGPT trainer, with some minor modifications / simplifications (such as no dropout).
I have upstreamed some QOL improvements and basic tweaks to the training script from Keller’s fork, but have not changed any of the core training / modeling logic. Specifically:
Additionally, I added wandb logging for easy tracking of training progress; optimistically, I may need to remove this one day, as it slightly increases step time.
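For reference, the logging itself is a thin addition to the training loop. A minimal sketch of the pattern, assuming the standard wandb API (the project name, config values, and dummy `train_step` below are illustrative placeholders, not the actual trainer code):

```python
import time
import wandb

# Illustrative sketch: project name and config values are placeholders,
# not the real trainer's settings.
wandb.init(project="gpt2-speedrun", config={"batch_size": 32, "lr": 6e-4})

def train_step() -> float:
    """Stand-in for the real forward/backward/optimizer step; returns a fake loss."""
    time.sleep(0.01)
    return 3.0

for step in range(100):
    t0 = time.time()
    loss = train_step()
    step_time_ms = (time.time() - t0) * 1000
    # Logging every step adds a small amount of host-side overhead per step,
    # which is why this may eventually be removed or throttled.
    wandb.log({"train/loss": loss, "step_time_ms": step_time_ms}, step=step)
```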
Commit with the initial setup is here: b3c32f8.
The baseline run time on my 2xRTX 4090 setup is 8.13 hours.
Waiting 8 hours for a result is too slow, so I’m going to begin by implementing some of the notable improvements from the 8xH100 leaderboard. I’ll start with the most impactful/easiest changes first:
Architectural changes (31.8% speedup, then 24% speedup)
There are some basic architectural changes and modernizations that can be made to the model to speed up training. These are general improvements to the transformer decoder architecture that have been widely adopted since the original GPT-2 paper. The changes are:
In addition, learning rate and batch size have been tuned.
Once again, many of these changes are downstreamed from the modded-nanogpt repository / 8xH100 speedrun. It’s not efficient to reinvent the wheel, and I want to get the training time down as fast as possible in the beginning.
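One of the modernizations mentioned below is RoPE (rotary position embeddings), which replaces GPT-2’s learned absolute position embeddings. Here is a minimal sketch of the idea in PyTorch, with illustrative shapes and no claim to match this repo’s exact implementation:

```python
import torch

def rope(x: torch.Tensor, base: float = 10000.0) -> torch.Tensor:
    """Apply rotary position embeddings to x of shape (batch, heads, seq, head_dim).

    Pairs of channels are rotated by a position-dependent angle, so relative
    positions are encoded directly in the query/key dot products.
    """
    b, h, t, d = x.shape
    half = d // 2
    # Per-pair rotation frequencies and per-position angles.
    freqs = 1.0 / (base ** (torch.arange(0, half, device=x.device).float() / half))
    angles = torch.outer(torch.arange(t, device=x.device).float(), freqs)  # (t, half)
    cos, sin = angles.cos(), angles.sin()
    x1, x2 = x[..., :half], x[..., half:]
    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)

# Usage: rotate queries and keys before attention; values are left untouched.
q = torch.randn(1, 12, 64, 64)
k = torch.randn(1, 12, 64, 64)
q, k = rope(q), rope(k)
```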
After implementing these changes (commit b7bb93f), the new run time is 7.51 hours. This run was more data-efficient than the baseline, requiring only 5.07B tokens. However, the tokens/second decreased, likely due to the larger batch size (more gradient accumulation steps, which tends to translate to lower throughput) and the architectural changes, such as the inclusion of RoPE. Once I have a shorter run time, I will be able to tune more effectively and see if I can remove gradient accumulation.