Merge pull request #103 from scicloj/linreg-20241219
linear regression intro - WIP
daslu authored Dec 29, 2024
2 parents 947facb + 2d4e2d7 commit 8241c3c
Showing 1 changed file with 231 additions and 49 deletions: notebooks/noj_book/linear_regression_intro.clj
;; # Introduction to Linear Regression - DRAFT 🛠

;; **last update:** 2024-12-29

;; In this tutorial, we introduce the fundamentals of [linear regression](https://en.wikipedia.org/wiki/Linear_regression),
;; guided by the
;; [In Depth: Linear Regression](https://jakevdp.github.io/PythonDataScienceHandbook/05.06-linear-regression.html)
;; chapter of the
;; [Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/)
;; by Jake VanderPlas.

(ns noj-book.linear-regression-intro
  (:require [tablecloth.api :as tc]
            [tablecloth.column.api :as tcc]
            [tech.v3.datatype.datetime :as datetime]
            [tech.v3.dataset.modelling :as ds-mod]
            [fastmath.ml.regression :as reg]
            [scicloj.kindly.v4.kind :as kind]
            [fastmath.random :as rand]
            [scicloj.tableplot.v1.plotly :as plotly]))

;; ## Simple Linear Regression

;; We begin with the classic straight-line model: for data points $(x, y)$,
;; we assume there is a linear relationship allowing us to predict $y$ as
;; $$y = ax + b.$$
;; In this formulation, $a$ is the slope and $b$ is the intercept,
;; the point where our line would cross the $y$ axis.
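
;; For example, with $a=2$ and $b=-5$, the point $x=3$ is mapped to
;; $$y = 2 \cdot 3 - 5 = 1.$$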

;; To illustrate, we'll use Fastmath and Tablecloth to create synthetic data
;; in which the relationship is known to hold with $a=2$ and $b=-5$.

;; For each row in the dataset below, we draw $x$ uniformly from 0 to 10
;; and compute $y = ax + b$ plus an extra random noise term
;; (drawn from a standard Gaussian distribution).
;; This noise is added independently for every row.

(def simple-linear-data
  (let [rng (rand/rng 1234)
        n 50
        a 2
        b -5]
    (-> {:x (repeatedly n #(rand/frandom rng 0 10))}
        tc/dataset
        (tc/map-columns :y
                        [:x]
                        (fn [x]
                          (+ (* a x)
                             b
                             (rand/grandom rng)))))))

simple-linear-data
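
;; We can also sanity-check the column ranges with Tablecloth's dataset
;; summary:

(tc/info simple-linear-data)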

;; Let's plot these points using Tableplot's Plotly API.

(-> simple-linear-data
    plotly/layer-point)

;; ### Regression using Fastmath

;; We can now fit a linear model to the data using the Fastmath library.

(def simple-linear-data-model
  (reg/lm
   ;; ys - a "column" sequence of `y` values:
   (simple-linear-data :y)
   ;; xss - a sequence of "rows", each containing `x` values
   ;; (one `x` per row, in our case):
   (-> simple-linear-data
       (tc/select-columns [:x])
       tc/rows)
   ;; options
   {:names ["x"]}))

(type simple-linear-data-model)

simple-linear-data-model

;; Printing the model gives a tabular summary.
;; We'll capture the printed output and display it via Kindly for cleaner formatting:

(kind/code
 (with-out-str
   (println
    simple-linear-data-model)))

;; As you can see, the estimated coefficients match our intercept $b$
;; and slope $a$ (the coefficient of $x$).
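
;; The model is a record, so its fitted values can also be read directly as
;; map entries. Here we assume Fastmath's `LMData` exposes them under the
;; `:intercept` and `:beta` keys:

(:intercept simple-linear-data-model)

(:beta simple-linear-data-model)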

;; ### Dataset ergonomics

;; Below are a couple of helper functions that simplify how we use regression with datasets
;; and display model summaries. We have similar ideas under development in the
;; [Tablemath](https://scicloj.github.io/tablemath) library, but it is still in
;; an experimental stage and not part of Noj yet.

(defn lm
  "Compute a linear regression model for `dataset`.
  The first column marked as target is the target.
  All the columns unmarked as target are the features.
  The resulting model is of type `fastmath.ml.regression.LMData`,
  created via [Fastmath](https://github.com/generateme/fastmath).
  See [fastmath.ml.regression.lm](https://generateme.github.io/fastmath/clay/ml.html#lm)
  for `options`."
  ([dataset]
   (lm dataset nil))
  ([dataset options]
   (let [inference-column-name (-> dataset
                                   ds-mod/inference-target-column-names
                                   first)
         ds-without-target (-> dataset
                               (tc/drop-columns [inference-column-name]))]
     (reg/lm
      ;; ys
      (get dataset inference-column-name)
      ;; xss
      (tc/rows ds-without-target)
      ;; options
      (merge {:names (-> ds-without-target
                         tc/column-names
                         vec)}
             options)))))

(defn summary
  "Generate a summary of a linear model."
  [lmdata]
  (kind/code
   (with-out-str
     (println
      lmdata))))

(-> simple-linear-data
    (ds-mod/set-inference-target :y)
    lm
    summary)

;; ### Prediction

;; Once we have a linear model, we can generate new predictions.
;; For instance, let's predict $y$ when $x=3$:

(simple-linear-data-model [3])
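
;; Because the model object is callable, we can just as easily map it over
;; several inputs at once:

(map simple-linear-data-model
     [[0] [3] [10]])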

;; ### Displaying the regression line

;; We can visualize the fitted line by adding a smooth layer to our scatter plot.
;; Tableplot makes this convenient:

(-> simple-linear-data
    plotly/layer-point
    plotly/layer-smooth)

;; Alternatively, we can build the regression line explicitly.
;; We'll obtain predictions and then plot them:

(-> simple-linear-data
    (tc/map-columns :prediction
                    [:x]
                    simple-linear-data-model)
    plotly/layer-point
    (plotly/layer-smooth {:=y :prediction}))

;; ## Multiple Linear Regression

;; We can easily extend these ideas to multiple linear predictors.
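
;; Concretely, we will use two features, $x_0$ and $x_1$, and generate data from
;; $$y = a_0 x_0 + a_1 x_1 + b$$
;; with $a_0=2$, $a_1=-3$, $b=-5$, plus Gaussian noise as before.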

(def multiple-linear-data
  (let [rng (rand/rng 1234)
        n 50
        a0 2
        a1 -3
        b -5]
    (-> {:x0 (repeatedly n #(rand/frandom rng 0 10))
         :x1 (repeatedly n #(rand/frandom rng 0 10))}
        tc/dataset
        (tc/map-columns :y
                        [:x0 :x1]
                        (fn [x0 x1]
                          (+ (* a0 x0)
                             (* a1 x1)
                             b
                             (rand/grandom rng)))))))

(def multiple-linear-data-model
  (-> multiple-linear-data
      (ds-mod/set-inference-target :y)
      lm))

(summary multiple-linear-data-model)
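
;; As in the simple case, the model can be applied as a function to a new
;; row of features, say $x_0=1$ and $x_1=2$:

(multiple-linear-data-model [1 2])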

;; Visualizing multiple dimensions is more involved. Below is a 3D
;; scatter plot using Tableplot's Plotly API, along with an explicit
;; Plotly surface spec for the regression plane. We expect to simplify this
;; process in the future, so these details become part of higher-level layers. 🛠

(-> multiple-linear-data
    (plotly/layer-point {:=coordinates :3d
                         :=x :x0
                         :=y :x1
                         :=z :y})
    plotly/plot
    (assoc-in [:data 1]
              {:type :surface
               :z (for [i (range 11)]
                    (for [j (range 11)]
                      (multiple-linear-data-model
                       [j i])))
               :opacity 0.5}))

;; ## Coming soon: Polynomial regression 🛠

;; ## Coming soon: One-hot encoding 🛠

;; ## Coming soon: Regularization 🛠

;; ## Example: Predicting Bicycle Traffic

;; As in the Python Data Science Handbook, we'll try predicting the daily number
;; of bicycle trips across the Fremont Bridge in Seattle. The features will include
;; weather, season, day of week, and related factors.

;; ### Reading and parsing data

(def column-name-mapping
  {"Fremont Bridge Sidewalks, south of N 34th St" :total
@@ -43,18 +248,17 @@

weather

;; ### Preprocessing

;; The bike counts come in hourly data, but our weather information is daily.
;; We'll need to aggregate the hourly counts into daily totals before combining the datasets.

;; In the Python handbook, one does:
;; ```python
;; daily = counts.resample('d').sum()
;; ```

;; Since Tablecloth's time series features are still evolving, we'll be a bit more explicit:

(def daily-totals
  (-> counts
@@ -64,28 +268,25 @@
      (tc/aggregate-columns [:total :west :east]
                            tcc/sum)))

daily-totals

;; ### Prediction by day-of-week

;; Next, we'll explore a simple regression by day of week.

(def days-of-week
  [:Mon :Tue :Wed :Thu :Fri :Sat :Sun])

;; We'll convert the numeric day-of-week to the corresponding keyword:

(def idx->day-of-week
  (comp days-of-week dec))

;; For example,
(idx->day-of-week 1)
(idx->day-of-week 7)

;; Now, let's build our dataset:

(def totals-with-day-of-week
  (-> daily-totals
@@ -105,7 +306,7 @@
                    (tc/add-column day-of-week
                                   #(-> (:day-of-week %)
                                        (tcc/eq day-of-week)
                                        ;; convert booleans to 0/1
                                        (tcc/* 1)))))
              totals-with-day-of-week
              days-of-week)
@@ -115,41 +316,22 @@
(-> totals-with-one-hot-days-of-week
    (tc/select-columns ds-mod/inference-column?))

;; Since the binary columns sum to 1 in every row, they are collinear with
;; an intercept term, so we fit the model without one.
;; This way, each coefficient directly reflects the expected bike count
;; for that day of week.
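
;; As a quick sanity check of the encoding (a sketch, assuming the binary
;; columns are exactly the ones named by `days-of-week`), every row of
;; these columns should sum to 1:

(->> (tc/rows
      (tc/select-columns totals-with-one-hot-days-of-week
                         days-of-week))
     (map #(reduce + %))
     distinct)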

(def days-of-week-model
  (lm totals-with-one-hot-days-of-week
      {:intercept? false}))

;; Let's take a look at the results:

(-> days-of-week-model
    println
    with-out-str
    kind/code)

;; We can clearly see weekend versus weekday differences.
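
;; To read the per-day estimates directly, we can pair the day names with
;; the model's coefficient table (assuming, as the printed summary suggests,
;; that `LMData` keeps per-coefficient maps with an `:estimate` key under
;; `:coefficients`, ordered like our feature columns):

(map (fn [day coefficient]
       [day (:estimate coefficient)])
     days-of-week
     (:coefficients days-of-week-model))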

;; ### Coming soon: more predictors for the bike counts 🛠
