From 53d31dd539e3246c56ebde1bcb4ab531f9082801 Mon Sep 17 00:00:00 2001 From: daslu Date: Wed, 11 Dec 2024 20:22:38 +0200 Subject: [PATCH 1/3] index page typo --- notebooks/index.clj | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/notebooks/index.clj b/notebooks/index.clj index c3c973d..edb8304 100644 --- a/notebooks/index.clj +++ b/notebooks/index.clj @@ -67,15 +67,15 @@ directly for tabular data structures or provide high interoperability with it. chapter)) (->> "notebooks/chapters.edn" -slurp -clojure.edn/read-string -(mapcat (fn [{:keys [part chapters]}] - (cons (format "- %s" part) - (->> chapters - (map (fn [chapter] - (prn [chapter (chapter->title chapter)]) - (format "\n - [%s](noj_book.%s.html)\n" - (chapter->title chapter) - chapter))))))) -(str/join "\n") -md) + slurp + clojure.edn/read-string + (mapcat (fn [{:keys [part chapters]}] + (cons (format "- %s" part) + (->> chapters + (map (fn [chapter] + (prn [chapter (chapter->title chapter)]) + (format "\n - [%s](noj_book.%s.html)\n" + (chapter->title chapter) + chapter))))))) + (str/join "\n") + md) From a4512902145ead153e3a0ba9e51a5586e67610b7 Mon Sep 17 00:00:00 2001 From: daslu Date: Wed, 11 Dec 2024 20:22:46 +0200 Subject: [PATCH 2/3] linear-regression WIP --- .../noj_book/linear_regression_intro.clj | 59 +++++++++++++++++-- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/notebooks/noj_book/linear_regression_intro.clj b/notebooks/noj_book/linear_regression_intro.clj index 0a111e7..1d2a92d 100644 --- a/notebooks/noj_book/linear_regression_intro.clj +++ b/notebooks/noj_book/linear_regression_intro.clj @@ -11,9 +11,12 @@ (ns noj-book.linear-regression-intro (:require + [tech.v3.dataset :as ds] [tablecloth.api :as tc] [tablecloth.column.api :as tcc] - [tech.v3.datatype.datetime :as datetime])) + [tech.v3.datatype.datetime :as datetime] + [tech.v3.dataset.modelling :as dsmod] + [scicloj.metamorph.ml :as ml])) ;; ## Reading and parsing data @@ -31,10 
+34,14 @@ {:key-fn column-name-mapping :parser-fn {"Date" [:local-date-time "MM/dd/yyyy hh:mm:ss a"]}})) +counts + (def weather (tc/dataset "data/seattle-bikes-and-weather/BicycleWeather.csv.gz" {:key-fn keyword})) +weather + ;; ## Preprocessing ;; no good support for this in tablecloth @@ -44,7 +51,49 @@ ;; day column, group by, aggregate, sum. -(-> counts - (tc/group-by (fn [{:keys [datetime]}] - {:date (datetime/local-date-time->local-date datetime)})) - (tc/aggregate {:total (comp tcc/sum :total)})) +(def daily-totals + (-> counts + (tc/group-by (fn [{:keys [datetime]}] + {:date (datetime/local-date-time->local-date + datetime)})) + (tc/aggregate-columns [:total :west :east] + tcc/sum))) + + +daily-totals + +(:date daily-totals) + +(datetime/long-temporal-field + :day-of-week + (:date daily-totals)) + +(def idx->day-of-week + (comp [:Mon :Tue :Wed :Thu :Fri :Sat :Sun] + dec)) + +(idx->day-of-week 1) +(idx->day-of-week 7) + +(def data-for-prediction + (-> daily-totals + (tc/select-columns [:date :total]) + (tc/add-column :dow + (fn [ds] + (map idx->day-of-week + (datetime/long-temporal-field + :day-of-week + (:date ds))))) + (ds/categorical->one-hot [:dow]) + (tc/drop-columns [:date :dow-Sun]) + (dsmod/set-inference-target :total))) + +data-for-prediction + +;; C + A0*Mon + A1*Tue + ... 
+ A5*Sat +;; The prediction for Mon: C+A0 +;; The prediction for Sun: C + +(-> data-for-prediction + :total + meta) From 59aa280a03b4787284b8fcbad32b2070af4c3a26 Mon Sep 17 00:00:00 2001 From: daslu Date: Wed, 11 Dec 2024 23:48:53 +0200 Subject: [PATCH 3/3] linear-regression-intro WIP --- .../noj_book/linear_regression_intro.clj | 88 +++++++++++++------ 1 file changed, 63 insertions(+), 25 deletions(-) diff --git a/notebooks/noj_book/linear_regression_intro.clj b/notebooks/noj_book/linear_regression_intro.clj index 1d2a92d..5ee33cd 100644 --- a/notebooks/noj_book/linear_regression_intro.clj +++ b/notebooks/noj_book/linear_regression_intro.clj @@ -16,7 +16,9 @@ [tablecloth.column.api :as tcc] [tech.v3.datatype.datetime :as datetime] [tech.v3.dataset.modelling :as dsmod] - [scicloj.metamorph.ml :as ml])) + [scicloj.metamorph.ml :as ml] + [fastmath.ml.regression :as reg] + [scicloj.kindly.v4.kind :as kind])) ;; ## Reading and parsing data @@ -44,12 +46,16 @@ weather ;; ## Preprocessing -;; no good support for this in tablecloth +;; Our bike counts data are hourly, but the weather data is daily. +;; To join them, we will need to convert the bike hourly counts to daily counts. + +;; In the Python book, this is done as follows in Pandas: ;; ```python ;; daily = counts.resample('d').sum() ;; ``` -;; day column, group by, aggregate, sum. +;; Tablecloth's full support for time series is still under construction. +;; For now, we will have to be a bit more verbose: (def daily-totals (-> counts @@ -62,38 +68,70 @@ weather daily-totals -(:date daily-totals) +;; ## Prediction by weekday + +;; Let us prepare the data for regression on the day of week. 
+ + +(def days-of-week + [:Mon :Tue :Wed :Thu :Fri :Sat :Sun]) + -(datetime/long-temporal-field - :day-of-week - (:date daily-totals)) +;; We will convert numbers to days-of-week keywords: (def idx->day-of-week - (comp [:Mon :Tue :Wed :Thu :Fri :Sat :Sun] - dec)) + (comp days-of-week dec)) +;; E.g., (idx->day-of-week 1) (idx->day-of-week 7) -(def data-for-prediction +;; Now, let us prepare the data: + +(def totals-with-day-of-week (-> daily-totals - (tc/select-columns [:date :total]) - (tc/add-column :dow + (tc/add-column :day-of-week (fn [ds] (map idx->day-of-week (datetime/long-temporal-field :day-of-week (:date ds))))) - (ds/categorical->one-hot [:dow]) - (tc/drop-columns [:date :dow-Sun]) - (dsmod/set-inference-target :total))) - -data-for-prediction - -;; C + A0*Mon + A1*Tue + ... + A5*Sat -;; The prediction for Mon: C+A0 -;; The prediction for Sun: C - -(-> data-for-prediction - :total - meta) + (tc/select-columns [:total :day-of-week]))) + +totals-with-day-of-week + +(def totals-with-one-hot-days-of-week + (-> (reduce (fn [dataset day-of-week] + (-> dataset + (tc/add-column day-of-week + #(-> (:day-of-week %) + (tcc/eq day-of-week) + ;; turn booleans into 0s and 1s + (tcc/* 1))))) + totals-with-day-of-week + days-of-week) + (tc/drop-columns [:day-of-week]))) + +totals-with-one-hot-days-of-week + +;; Let us compute the linear regression model using Fastmath. +;; The binary columns are collinear (sum up to 1), +;; but we will avoid the intercept. +;; This way, the interpretation of each coefficient is the expected +;; bike count for the corresponding day of week. + +(def fit + (reg/lm (:total totals-with-one-hot-days-of-week) + (-> totals-with-one-hot-days-of-week + (tc/drop-columns [:total]) + tc/rows) + {:intercept? false})) + +;; Here are the regression results: + +(-> fit + println + with-out-str + kind/code) + +;; We can see the difference between weekends and weekdays.