Data Manipulation in Clojure Compared to R and Python

Published 2024-07-18
I spend a lot of time developing and teaching people about Clojure's open source tools for working with data. Almost everybody who wants to use Clojure for this kind of work is coming from another language ecosystem, usually R or Python. Together with Daniel Slutsky, I'm working on formalizing some of the common teachings into a course. Part of that is providing context for people coming from other ecosystems, including "translations" of how to accomplish data science tasks in Clojure.
As part of this development, I wanted to share an early preview in this blog post. The format is inspired by this great blog post I read a while ago comparing R and Polars side by side (where "R" here refers to the tidyverse, an opinionated collection of R libraries for data science, and realistically mostly dplyr specifically). I'm adding Pandas because it's among the most popular dataset manipulation libraries, and of course Clojure, specifically tablecloth, the primary data manipulation library in our ecosystem.
I'll use the same dataset as the original blog post, the Palmer Penguin dataset. For the sake of simplicity, I saved a copy of the dataset as a CSV file and made it available on this website. I will also refer to the data as a "dataset" throughout this post because that's what Clojure people call a tabular, column-major data structure, but it's the same thing that is variously referred to as a dataframe, data table, or just "data" in other languages. I'm also assuming you know how to install the packages required in the given ecosystems, but any necessary imports or requirements are included in the code snippets the first time they appear. Versions of all languages and libraries used in this post are listed at the end. Here we go!
Reading data
Reading data is straightforward in every language, but as a bonus we want to be able to indicate on the fly which values should be interpreted as "missing", whatever that means in the given libraries. In this dataset, the string "NA" means "missing", so we want to tell the dataset constructor this as soon as possible. Here's the comparison of how to accomplish that in various languages:
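tablecloth
A minimal sketch of reading the CSV with tablecloth (tc/dataset reads directly from a URL; this assumes the conventional tc alias for tablecloth.api):

(require '[tablecloth.api :as tc])

(def ds (tc/dataset "https://codewithkira.com/assets/penguins.csv"))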
Note that tablecloth interprets the string "NA" as missing (nil, in Clojure) by default.
R
In reality, in R you would get the dataset from the R package that contains the dataset. This is a fairly common practice in R. In order to compare apples to apples, though, here I'll show how to initialize the dataset from a remote CSV file, using the readr package's read_csv, which is part of the tidyverse:
library(tidyverse)

ds <- read_csv("https://codewithkira.com/assets/penguins.csv",
               na = "NA")
Pandas
import pandas as pd

ds = pd.read_csv("https://codewithkira.com/assets/penguins.csv")
Note that pandas has a fairly long list of values it considers NaN already, so we don't need to specify what missing values look like in our case, since "NA" is already in that list.
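Polars
For comparison, a sketch of the polars version (pl.read_csv accepts a null_values argument naming strings that should be read as missing):

import polars as pl

ds = pl.read_csv("https://codewithkira.com/assets/penguins.csv",
                 null_values="NA")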
The first thing people usually want to do with their dataset is see it and poke around a bit. Below is a comparison of how to accomplish basic data exploration tasks using each library.
Operation | tablecloth | dplyr
see first 10 rows | (tc/head ds 10) | head(ds, 10)
see all column names | (tc/column-names ds) | colnames(ds)
select column | (tc/select-columns ds "year") | select(ds, year)
select multiple columns | (tc/select-columns ds ["year" "sex"]) | select(ds, year, sex)
select rows | (tc/select-rows ds #(> (% "year") 2008)) | filter(ds, year > 2008)
sort column | (tc/order-by ds "year") | arrange(ds, year)
Operation | pandas | polars
see first 10 rows | ds.head(10) | ds.head(10)
see all column names | ds.columns | ds.columns
select column | ds[["year"]] | ds.select(pl.col("year"))
select multiple columns | ds[["year", "sex"]] | ds.select(pl.col("year", "sex"))
select rows | ds[ds["year"] > 2008] | ds.filter(pl.col("year") > 2008)
sort column | ds.sort_values("year") | ds.sort("year")
Note that the libraries differ in how they sort missing values: tablecloth and polars place them first (at the top when a column is sorted in ascending order, last when descending), while dplyr and pandas place them last regardless of the sort direction.
As you can see, these commands are all pretty similar, with the exception of selecting rows in tablecloth. The #(...) form is Clojure's shorthand syntax for an anonymous function, which is how rows are selected. Being a functional language, functions in Clojure are "first-class", which basically just means they are passed around as arguments willy-nilly, all over the place, all the time. In this case, the second argument to tablecloth's select-rows function is a predicate (a function that returns a boolean) that takes as its argument a dataset row as a map of column names to values. Don't worry, though, tablecloth doesn't process your entire dataset row-wise. Under the hood datasets are highly optimized to perform column-wise operations as fast as possible.
Here's an example of stringing a few of these basic dataset exploration operations together, in this case to get the bill_length_mm of all penguins with body_mass_g below 3800:
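A sketch of that chain in tablecloth (tc/drop-missing removes the rows with missing values in the given column, so the numeric comparison never sees nil):

(-> ds
    (tc/drop-missing "body_mass_g")
    (tc/select-rows #(< (% "body_mass_g") 3800))
    (tc/select-columns "bill_length_mm"))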
Note that in tablecloth we have to explicitly omit rows where the value we're filtering by is missing, unlike in other libraries. This is because tablecloth actually uses nil (as opposed to a library-specific construct) to indicate a missing value, and in Clojure nil is not treated as comparable to numbers. If we were to try to compare nil to a number, we would get an exception telling us that we're trying to compare incomparable types. Clojure is fundamentally dynamically typed, in that it only does type checking at runtime and bindings can refer to values of any type, but it is also strongly typed, as we see here, in the sense that it explicitly avoids implicit type coercion. Deciding whether 0 is greater or smaller than nil requires some assumptions, and these are intentionally not baked into the core of Clojure, or into tablecloth as a library, as is the case in some other languages and libraries.
This example also introduces Clojure's "thread-first" macro. The -> arrow is like R's |> operator or the unix pipe, effectively passing the output of each function in the chain as input to the next. It comes in very handy for data processing code like this.
Here is the equivalent operation in the other libraries:
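These are sketches mirroring the tablecloth chain above; in dplyr, pandas, and polars the rows with missing values are dropped automatically by the comparison itself:

dplyr

ds |>
  filter(body_mass_g < 3800) |>
  select(bill_length_mm)

pandas

ds[ds["body_mass_g"] < 3800][["bill_length_mm"]]

polars

ds.filter(pl.col("body_mass_g") < 3800).select("bill_length_mm")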
Here is what some more complicated data wrangling looks like across the libraries.
Select all columns except for one
Library | Code
tablecloth | (tc/select-columns ds (complement #{"year"}))
dplyr | select(ds, -year)
pandas | ds.drop(columns=["year"])
polars | ds.select(pl.exclude("year"))
Another property of functional languages in general, and Clojure in particular, is that they take advantage of the fact that many things are functions you might not be used to treating as functions. They also leverage function composition to combine multiple functions into a single operation.
For example a set (indicated with the #{} syntax in Clojure) is also a function: called with an argument, it acts as a membership test, returning the element if it's in the set and nil otherwise. And complement is a function in clojure.core that effectively inverts the function given to it, so combined, (complement #{"year"}) means "every value that is not in the set #{"year"}", which we can then use as our predicate column selector function to filter out certain columns.
Selecting columns whose names match a pattern, for example those starting with "bill", looks like this in polars:

import polars.selectors as cs

ds.select(cs.starts_with("bill"))
Select only numeric columns
Library | Code
tablecloth | (tc/select-columns ds :type/numerical)
dplyr | select(ds, where(is.numeric))
pandas | ds.select_dtypes(include='number')
polars | ds.select(cs.numeric())
The keyword :type/numerical here is a magic keyword that tablecloth knows about and can accept as a column selector. The list of magic keywords that tablecloth supports is not (yet) documented anywhere, but it is available in the source code.
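For example, a sketch of the earlier filter rewritten to use a map lookup with a default (get returns 0 instead of nil when the key is missing, so the comparison never sees nil):

(-> ds
    ;; look up body_mass_g with a default of 0 for missing values
    (tc/select-rows #(< (get % "body_mass_g" 0) 3800))
    (tc/select-columns "bill_length_mm"))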
Note here we handle the missing values in the body_mass_g column differently than above, by specifying a default value for the map lookup. We're explicitly telling tablecloth to treat missing values as 0 in this case, which can then be compared to other numbers. This is probably the better way to handle this case, but the method above works, too, plus it gave me the opportunity to soapbox about Clojure types for a moment.
Adding columns based on some other existing columns
There are many reasons you might want to add columns, and often new columns are combinations of other ones. Here's how you'd generate a new column based on the values in some other columns in each library:
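As a sketch, adding a hypothetical bill_ratio column computed from the two bill measurements might look like this in each library (the column name is illustrative, not from the original post):

tablecloth

(tc/map-columns ds "bill_ratio"
                ["bill_length_mm" "bill_depth_mm"]
                ;; guard against missing (nil) measurements
                (fn [length depth]
                  (when (and length depth)
                    (/ length depth))))

dplyr

mutate(ds, bill_ratio = bill_length_mm / bill_depth_mm)

pandas

ds["bill_ratio"] = ds["bill_length_mm"] / ds["bill_depth_mm"]

polars

ds = ds.with_columns(
    (pl.col("bill_length_mm") / pl.col("bill_depth_mm")).alias("bill_ratio")
)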
Note that this is where the wheels start to come off if you're not working in a functional way with immutable data structures. Clojure data structures (including tablecloth datasets) are immutable, which is not the case with Pandas. The Pandas code above mutates the dataset in place, so as soon as you do any mutating operations like these, you have to keep mental track of the state of your dataset, which can quickly lead to high cognitive overhead and lots of incidental complexity.
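Renaming individual columns with an explicit old-to-new mapping looks similar across the libraries; sketches follow (the specific renames are illustrative):

tablecloth

(tc/rename-columns ds {"bill_length_mm" "bill_length"})

dplyr

rename(ds, bill_length = bill_length_mm)

pandas

ds.rename(columns={"bill_length_mm": "bill_length"}, inplace=True)

polars

ds = ds.rename({"bill_length_mm": "bill_length"})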
Again beware, the Pandas implementation shown here mutates the dataset in place. Also, manually specifying every column rename is one way to accomplish the task, but it gets tedious when you want to apply the same transformation to every column name, which is fairly common.
Transforming column names
Here's how you would upper-case all column names (the tablecloth version assumes clojure.string is required as str):
Library | Code
tablecloth | (tc/rename-columns ds :all str/upper-case)
dplyr | rename_with(ds, toupper)
pandas | ds.columns = ds.columns.str.upper()
polars | ds.select(pl.all().name.to_uppercase())
Like the other libraries, tablecloth's rename-columns accepts both types of arguments – a simple mapping of old -> new column names, or any column selector and any transformation function. For example, removing the units from each column name would look like this in each language:
pandas

import re

ds.rename(columns=lambda x: re.sub(r"(.+)_(mm|g)$", r"\1", x))

polars

ds = ds.rename({
    col: col.replace("_mm", "").replace("_g", "")
    for col in ds.columns
})
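The tablecloth and dplyr versions would plausibly look like the following sketches (assuming clojure.string aliased as str, and the stringr package from the tidyverse, respectively):

tablecloth

(tc/rename-columns ds :all #(str/replace % #"(.+)_(mm|g)$" "$1"))

dplyr

rename_with(ds, ~ str_replace(.x, "(.+)_(mm|g)$", "\\1"))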
Grouping and aggregating
Grouping behaves somewhat unconventionally in tablecloth. Datasets can be grouped by a single column name or a sequence of column names like in other libraries, but grouping can also be done using any arbitrary function. Grouping in tablecloth also returns a new dataset, similar to dplyr, rather than an abstract intermediate object (as in pandas and polars). Grouped datasets have three columns: the name of the group, the group id, and a column containing a new dataset of the grouped data. Once a dataset is grouped, the group values can be aggregated in a variety of ways. Here are a few examples, with comparisons between libraries:
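For instance, computing the mean body mass per species might look like these sketches (the tablecloth version assumes tech.v3.datatype.functional is aliased as dfn):

tablecloth

(require '[tech.v3.datatype.functional :as dfn])

(-> ds
    (tc/group-by "species")
    ;; the aggregation fn receives each group's sub-dataset
    (tc/aggregate {"mean-mass" #(dfn/mean (% "body_mass_g"))}))

dplyr

ds |>
  group_by(species) |>
  summarize(mean_mass = mean(body_mass_g, na.rm = TRUE))

pandas

ds.groupby("species")["body_mass_g"].mean()

polars

ds.group_by("species").agg(pl.col("body_mass_g").mean())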
As you can see, all of these libraries are perfectly suitable for accomplishing common data manipulation tasks. Choosing a language and library can impact code readability, maintainability, and performance, though, so understanding the differences between available toolkits can help us make better choices.
Clojure's tablecloth emphasizes functional programming concepts and immutability, which can lead to more predictable and re-usable code, at the cost of adopting a potentially new paradigm. Hopefully this comparison serves not only as a translation guide, but as an intro to the different philosophies underpinning these common data science tools.
Thanks for reading :)
Versions
The code in this post works with the following language and library versions:
This is a summary of the open source work I've spent my time on throughout March and April, 2024. Overall it was a really insightful couple of months for me, with lots of productive discussions and meetings happening among key contributors to Clojure's data science ecosystem and great progress toward some of our most ambitious goals.
Sponsors
This work is made possible by the generous ongoing support of my sponsors. I appreciate all of the support the community has given to my work and would like to give a special thanks to Clojurists Together and Nubank for providing me with lucrative enough grants that I can reduce my client work significantly and afford to spend more time on these projects.
If you find my work valuable, please share it with others and consider supporting it financially. There are details about how to do that on my GitHub sponsors page. On to the updates!
Grammar of graphics in Clojure
With help from Daniel Slutsky and others in the community, I started some concrete work on implementing a grammar of graphics in Clojure. I'm convinced this is the correct long-term solution for dataviz in Clojure, but it is a big project that will take time, including a lot of hammock time. It's still useful to play around with proofs of concept whilst thinking through problems, though, and in the interest of transparency I'm making all of those experiments public.
The discussions around this development are all also happening in public. There were two visual tools meetups focused on this over the last two months (link 1, link 2). And at the London Clojurians talk I just gave today I demonstrated an example of one proposed implementation of a grammar-of-graphics-like API on top of hanami implemented by Daniel.
There are more meetups planned for the coming months and work in this area for the foreseeable future will look like researching and understanding the fundamentals of the grammar of graphics in order to design a simple implementation in Clojure.
Clojure's ML and statistics tools
I spent a lot of time these last couple of months documenting and testing out Clojure's current ML tools, leading to many great conversations and one blog post that generated many more interesting discussions. The takeaway is that the tools themselves in this area are all quite mature and stable, but there are still ongoing discussions around how to best accommodate the different ways that people want to work with them. The overall goal in this area of my work is to stabilize the solutions so we can start advocating for specific ways of using them.
Below are some key takeaways from my research into all this stuff. Note none of these are my decisions to make alone, but represent my current opinions and what I will be advocating for within the community:
Smile will be slowly sunsetted from the ecosystem. The switch to GPL licensing was made in bad faith and many of the common models don't work on Apple chips. Given the abundance of suitable alternatives, the easiest option is to move away from depending on it.
A greater distinction between statistical modelling and machine learning workflows will be helpful. Right now there are many uses of the various models that are available in Clojure, and the wrappers and tools surrounding them are usually designed with a specific type of user in mind. For example machine learning people almost always have separate training and testing datasets, whereas statisticians "train" their models on an entire dataset. The highest-level APIs for these different usages (among others) look quite different, and we would benefit from having APIs that are ergonomic and familiar to our target users of various backgrounds.
We should agree on standards for accomplishing certain very common and basic tasks and propose a recommended usage for users. For example, there are almost a dozen ways to do linear regression in Clojure and it's not obvious which is "the best" way to someone not deeply familiar with the ecosystem.
Everything should work with tablecloth datasets and expect them as inputs. This is mostly the case already, but there is still some progress to be made.
Foundations of Clojure's data science stack
I continue to work on guides and tutorials for the parts of Clojure's data science stack that I feel are ready for prime time, mainly tablecloth and all of the amazing underlying libraries it leverages. Every once in a while this turns up surprises, for example this month I was surprised at how column header processing is handled for nippy files specifically. I also fixed one bug in tablecloth itself, which I discovered in the process of writing a tutorial earlier in March. I have a pile of in-progress guides focusing on some more in-depth topics from developing the London Clojurians talk that I'm going to tidy up and publish in the coming months.
The overarching goal in this area is to create a unified data science stack with libraries for processing, modelling, and visualization that all interoperate seamlessly and work with tablecloth datasets, like the tidyverse in R. Part of achieving that is making sure that tablecloth is rock solid, which just takes a lot of poking and prodding.
London Clojurians talk
This talk was a big inspiration for diving deep into Clojure's data science ecosystem. I experimented with many different datasets for the workshop and discovered plenty of potential areas for future development. Trying to put together a polished data workflow exposed many of the key areas I think we should be focusing on, and exploring so many ways to demonstrate a broad sample of data science tools taught me a lot and gave me plenty of inspiration for future work.
The resources from the talk are all available in this repo and the video will be posted soon.
Summary of future work
I mentioned a few areas of focus above; below is a summary of the ongoing work as I see it. A framework for organizing this work is starting to emerge, and I've been thinking about it in terms of four key areas:
Visualisation
Priority here is to release a stable dataviz API using the tools and wrappers we currently have so that we can start releasing guides and tutorials that follow a consistent style.
The long-term goal is to develop a robust, flexible, and stable data visualization library in Clojure itself based on the grammar of graphics.
Machine learning
Priority is to decide which APIs we will commit to supporting in the long term and stabilize the "glue" libraries that provide the high-level APIs for data-first users.
Long-term goal is to support the full spectrum of libraries and models that are in everyday use by data science professionals.
Statistics
Priority is to document the current options for accomplishing basic statistical modelling tasks, including the Clojure libraries we do have, Java libraries, and Python interop.
Long-term goal is to have tablecloth-compatible stats libraries implemented in pure Clojure.
Foundations
Priority is to build a tidyverse for Clojure. This includes battle-testing tablecloth, fully documenting its capabilities, and fixing the remaining small sharp edges.
Going forward
My overarching goal (personally) is still to write a canonical resource for working with Clojure's data science stack (the Clojure Data Cookbook), and I'm still working on finding the right balance between documenting "work-in-progress" tools and libraries and delaying progress until I feel they are more "ready". Until now I've let the absence of stable or ideal APIs in certain areas hinder development of this book. But I'm starting to feel confident enough in my understanding of the ecosystem's current direction that I would feel good about releasing something a little more formal than a tutorial or guide, recommending usages with the caveat that development is ongoing in some areas. And while it will take a while to get where we want to go, I feel like I can finally see the path to getting there. It just takes a lot of work and a lot of collaboration, but with your support we'll make it happen! Thanks for reading.
Published: 2024-04-30
sort column
(tc/order-by ds "year")
arrange(ds, year)
Operation                 pandas                    polars
see first n rows          ds.head(10)               ds.head(10)
see all column names      ds.columns                ds.columns
select column             ds[["year"]]              ds.select(pl.col("year"))
select multiple columns   ds[["year", "sex"]]       ds.select(pl.col("year", "sex"))
select rows               ds[ds["year"] > 2008]     ds.filter(pl.col("year") > 2008)
sort column               ds.sort_values("year")    ds.sort("year")
Note that there are some differences in how these libraries sort missing values. For example, tablecloth and polars place them at the beginning (so they're at the top when a column is sorted in ascending order and last when descending), whereas dplyr and pandas place them last (regardless of whether ascending or descending order is specified).
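For example, here's a quick sketch of checking this behaviour in tablecloth by sorting in descending order (assuming tablecloth is required as tc, and using tc/order-by's optional order argument):

(require '[tablecloth.api :as tc])

;; sort by year descending; per the note above, missing values
;; would end up last here because they sort as smallest
(tc/order-by ds "year" :desc)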
As you can see, these commands are all pretty similar, with the exception of selecting rows in tablecloth. The #(...) syntax is short-hand for writing an anonymous function in Clojure, and a function is how rows are selected. Clojure is a functional language, which means functions are "first-class": they get passed around as arguments willy-nilly, all over the place, all the time. In this case, the second argument to tablecloth's select-rows function is a predicate (a function that returns a boolean) that takes as its argument a dataset row as a map of column names to values. Don't worry, though, tablecloth doesn't process your entire dataset row-wise. Under the hood datasets are highly optimized to perform column-wise operations as fast as possible.
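For example, the short-hand used in the table could also be written as an explicit anonymous function (a minimal sketch, assuming the ds dataset defined earlier):

;; select rows where the year column is greater than 2008;
;; the row is a map, and Clojure maps can be called as functions
;; to look up a key
(tc/select-rows ds (fn [row] (> (row "year") 2008)))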
Here's an example of what it looks like to string a couple of these basic dataset exploration operations together, for example in this case to get the bill_length_mm of all penguins with body_mass_g below 3800:
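A minimal sketch of that pipeline in tablecloth, explicitly dropping the rows where body_mass_g is missing before filtering:

(-> ds
    (tc/drop-missing "body_mass_g")              ;; omit rows with missing body mass
    (tc/select-rows #(< (% "body_mass_g") 3800)) ;; keep penguins under 3800g
    (tc/select-columns "bill_length_mm"))        ;; keep only the column we want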
Note that in tablecloth we have to explicitly omit rows where the value we're filtering by is missing, unlike in other libraries. This is because tablecloth actually uses nil (as opposed to a library-specific construct) to indicate a missing value, and in Clojure nil is not treated as comparable to numbers. If we were to try to compare nil to a number, we would get an exception telling us that we're trying to compare incomparable types. Clojure is fundamentally dynamically typed, in that it only does type checking at runtime and bindings can refer to values of any type, but it is also strongly typed, as we see here, in the sense that it explicitly avoids implicit type coercion. Deciding whether 0 is greater or smaller than nil requires some assumptions, and these are intentionally not baked into the core of Clojure, or into tablecloth as a library, as is the case in some other languages and libraries.
This example also introduces Clojure's "thread-first" macro. The -> arrow is like R's |> operator or the unix pipe, effectively passing the output of each function in the chain as input to the next. It comes in very handy for data processing code like this.
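To illustrate, the threaded version above is equivalent to nesting the calls, with each result passed along as the first argument to the next function:

;; the pipeline above, rewritten as nested calls
(tc/select-columns
 (tc/select-rows
  (tc/drop-missing ds "body_mass_g")
  #(< (% "body_mass_g") 3800))
 "bill_length_mm")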
The equivalent pipelines in the other libraries chain the same basic operations together, using each library's own idioms. Here is what some more complicated data wrangling looks like across the libraries.
Select all columns except for one

Library     Code
tablecloth  (tc/select-columns ds (complement #{"year"}))
dplyr       select(ds, -year)
pandas      ds.drop(columns=["year"])
polars      ds.select(pl.exclude("year"))
Another property of functional languages in general, and especially Clojure, is that they take advantage of the fact that many things are functions that you might not be used to treating as functions. They also leverage function composition to combine multiple functions into a single operation.
For example, a set (indicated with the #{} syntax in Clojure) can be called as a function that tests membership, returning the element itself if it's in the set (truthy) and nil (falsey) otherwise. And complement is a function in clojure.core that effectively inverts the function given to it, so combined, (complement #{"year"}) means "every value that is not in the set #{"year"}", which we can then use as our predicate column-selector function to filter out certain columns.
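A few REPL sketches to make this concrete:

;; a set called as a function tests membership
(#{"year"} "year") ;; => "year" (truthy)
(#{"year"} "sex")  ;; => nil (falsey)

;; complement wraps a function and negates its result
((complement #{"year"}) "sex")  ;; => true
((complement #{"year"}) "year") ;; => false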
Polars also provides a selectors module for this kind of predicate-based column selection, for example to select all columns whose names start with "bill":

import polars.selectors as cs

ds.select(cs.starts_with("bill"))
Select only numeric columns

Library     Code
tablecloth  (tc/select-columns ds :type/numerical)
dplyr       select(ds, where(is.numeric))
pandas      ds.select_dtypes(include='number')
polars      ds.select(cs.numeric())
The :type/numerical keyword here is a magic keyword that tablecloth knows about and can accept as a column selector. The list of magic keywords that tablecloth knows about is not (yet) documented anywhere, but it is available in the source code.
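Returning to the earlier body_mass_g example, here's a sketch of the default-value approach (the 0 passed to the map lookup is returned whenever the value is missing):

(-> ds
    ;; (% "body_mass_g" 0) yields 0 instead of nil for missing values,
    ;; so the comparison below never sees nil
    (tc/select-rows #(< (% "body_mass_g" 0) 3800))
    (tc/select-columns "bill_length_mm"))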
Note that here we handle the missing values in the body_mass_g column differently than above, by specifying a default value for the map lookup. We're explicitly telling tablecloth to treat missing values as 0 in this case, which can then be compared to other numbers. This is probably the better way to handle this case, but the method above works too, plus it gave me the opportunity to soapbox about Clojure types for a moment.
Adding columns based on some other existing columns
There are many reasons you might want to add columns, and often new columns are combinations of other ones. Here's how you'd generate a new column based on the values in some other columns in each library:
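In tablecloth, a sketch of this could use map-columns to compute a new column from existing ones (bill_ratio here is a made-up column name for illustration):

;; derive a hypothetical bill_ratio column from two existing columns,
;; guarding against missing values
(tc/map-columns ds "bill_ratio"
                ["bill_length_mm" "bill_depth_mm"]
                (fn [length depth]
                  (when (and length depth)
                    (/ length depth))))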
Note that this is where the wheels start to come off if you're not working in a functional way with immutable data structures. Clojure data structures (including tablecloth datasets) are immutable, which is not the case with Pandas. The typical Pandas approach, assigning to a column with something like ds["new_column"] = ..., mutates the dataset in place, so as soon as you do any mutating operations like these, you have to keep mental track of the state of your dataset, which can quickly lead to high cognitive overhead and lots of incidental complexity.
Again, beware: this style of Pandas code mutates the dataset in place. Also, manually specifying every column-name transformation you want is one way to accomplish the task, but that can be tedious when you want to apply the same transformation to every column name, which is fairly common.
Transforming column names
Here's how you would uppercase all column names:

Library     Code
tablecloth  (tc/rename-columns ds :all str/upper-case)
dplyr       rename_with(ds, toupper)
pandas      ds.columns = ds.columns.str.upper()
polars      ds.select(pl.all().name.to_uppercase())
Like the other libraries, tablecloth's rename-columns accepts both types of arguments – a simple mapping of old -> new column names, or any column selector and any transformation function. For example, removing the units from each column name would look like this in each language:
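For tablecloth, a sketch using a regex to strip the _mm and _g unit suffixes might look like this (assuming clojure.string is required as str):

(require '[clojure.string :as str])

;; remove a trailing _mm or _g from every column name
(tc/rename-columns ds :all #(str/replace % #"_(mm|g)$" ""))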
Library  Code
pandas   import re
         ds.rename(columns=lambda x: re.sub(r"(.+)_(mm|g)$", r"\1", x))
polars   ds = ds.rename({col: col.replace("_mm", "").replace("_g", "") for col in ds.columns})
Grouping and aggregating
Grouping behaves somewhat unconventionally in tablecloth. Datasets can be grouped by a single column name or a sequence of column names like in other libraries, but grouping can also be done using any arbitrary function. Grouping in tablecloth also returns a new dataset, similar to dplyr, rather than an abstract intermediate object (as in pandas and polars). Grouped datasets have three columns: the name of the group, the group id, and a column containing a new dataset of the grouped data. Once a dataset is grouped, the group values can be aggregated in a variety of ways. Here are a few examples, with comparisons between libraries:
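For the tablecloth side, here's a minimal sketch of grouping by species and taking the mean body mass (assuming tech.v3.datatype.functional, one of the libraries underlying tablecloth, is required as dfn):

(require '[tech.v3.datatype.functional :as dfn])

(-> ds
    (tc/drop-missing "body_mass_g")
    (tc/group-by "species")
    ;; each aggregation fn receives one group's sub-dataset;
    ;; calling it with a column name returns that column
    (tc/aggregate {"mean-body-mass" #(dfn/mean (% "body_mass_g"))}))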
As you can see, all of these libraries are perfectly suitable for accomplishing common data manipulation tasks. Choosing a language and library can impact code readability, maintainability, and performance, though, so understanding the differences between available toolkits can help us make better choices.
Clojure's tablecloth emphasizes functional programming concepts and immutability, which can lead to more predictable and re-usable code, at the cost of adopting a potentially new paradigm. Hopefully this comparison serves not only as a translation guide, but also as an intro to the different philosophies underpinning these common data science tools.
Thanks for reading :)
Versions
The code in this post works with the following language and library versions: