Commit

Merge pull request #1268 from slds-lmu/classif-nb
classif NB
giuseppec authored Nov 8, 2024
2 parents 233b864 + a48c95a commit 5a95909
Showing 14 changed files with 195 additions and 61 deletions.
Binary file modified slides/supervised-classification/figure/disc_univariate-1.png
Binary file modified slides/supervised-classification/figure/disc_univariate-2.png
Binary file modified slides/supervised-classification/figure/disc_univariate-3.png
Binary file modified slides/supervised-classification/figure/disc_univariate-4.png
16 changes: 12 additions & 4 deletions slides/supervised-classification/rsrc/fig-disc_univariate.R
@@ -9,18 +9,26 @@ class = as.factor(rep(c("man", "woman"), each = 10))
df = data.frame(x, class)
str(df)

# plot 1
png("../figure/disc_univariate-1.png", width = 3000, height = 1300, res = 300)
par(mar = c(4,4,1,1))

layout(matrix(c(1, 2), nrow = 1, ncol = 2), widths = c(1, 2))
par(mar = c(4, 4, 1, 0.5))
hist(bh.man, breaks = seq(150, 200, by = 2), col = rgb(0, 0, 0, 0.5),
xlim = c(150, 200), ylim = c(0, 3), xlab = "body height (cm)", ylab = "frequency",
main = "")
hist(bh.woman, breaks = seq(150, 200, by = 2), col = rgb(1, 0, 0, 0.5), add = TRUE)
legend("topright", legend = c("men", "women"), fill = c(rgb(0, 0, 0, 0.5), rgb(1, 0, 0, 0.5)))

par(mar = c(4, 4, 1, 1))
x = seq(150, 200, length.out = 100)
plot(x, dnorm(x, mean = 170, sd = 4), type = "l", col = "red", lty = 2,
  xlim = c(150, 200), ylim = c(0, 0.1), ylab = "p(x|y=k)", xlab = "x (body height in cm)")
abline(v = 175, col = "grey", lty = 2)
axis(1, at = seq(155, 195, by = 10))
lines(x, dnorm(x, mean = 180, sd = 4))
legend("topright", legend = c("k = woman ~ N(170, 4)", "k = man ~ N(180, 4)"),
col = c("red", "black"), lty = c(2, 1))

dev.off()

# plot 2
106 changes: 106 additions & 0 deletions slides/supervised-classification/rsrc/fig-nb-db.R
@@ -0,0 +1,106 @@
# PREREQ -----------------------------------------------------------------------

library(knitr)
library(ggplot2)
library(MASS)
library(mlr3)
library(mlr3learners)
library(mlr3viz)
library(mvtnorm)

# common settings
set.seed(1234)
plot_width <- 20
plot_height <- 10
plot_dpi <- 300
line_size <- 5
base_size <- 40
point_size <- 5

options(digits = 3,
width = 65,
str = strOptions(strict.width = "cut", vec.len = 3))

# DATA -------------------------------------------------------------------------

set.seed(123)

n = 300

classa = data.frame(mvrnorm(n = n,
mu = c(2, 2),
Sigma = matrix(c(2, 0, 0, 2),
ncol = 2,
byrow = TRUE)))

classb = data.frame(mvrnorm(n = n,
mu = c(10, 7),
Sigma = matrix(c(8, -6, -6, 8),
ncol = 2,
byrow = TRUE)))

df = cbind(classa, factor(rep("a", nrow(classa))))  # one label per observation
colnames(df) = c("x1", "x2", "y")

foo = cbind(classb, factor(rep("b", nrow(classb))))
colnames(foo) = c("x1", "x2", "y")

df = rbind(df, foo)

task = TaskClassif$new("gauss_task",
backend = df,
target = "y",
positive = "a")

learner = lrn("classif.naive_bayes", predict_type = "prob")
learner$train(task)

tab = learner$model$tables
mus = data.frame(x1 = tab$x1[, 1], x2 = tab$x2[, 1])
mu1 = as.numeric(mus[1,])
mu2 = as.numeric(mus[2,])
sds = data.frame(x1 = tab$x1[, 2], x2 = tab$x2[, 2])
# the naive Bayes tables store standard deviations; dmvnorm() expects a
# covariance matrix, so square the sds and coerce the rows to numeric vectors
S1 = diag(as.numeric(sds[1, ])^2)
S2 = diag(as.numeric(sds[2, ])^2)

x1seq = seq(min(df$x1), max(df$x1), length.out = 100)
x2seq = seq(min(df$x2), max(df$x2), length.out = 100)

# Creating grid for predictions
grid = expand.grid(x1 = x1seq, x2 = x2seq)
grid_preds = as.data.frame(learner$predict_newdata(grid)$prob)
grid_preds$pred_class = factor(apply(grid_preds, 1, function(row) ifelse(row["a"] > row["b"], "a", "b")))
grid_preds$max_prob = apply(grid_preds[, c("a", "b")], 1, max)
grid = cbind(grid, grid_preds)

# Recompute density for visualizing distributions
grid_dens1 = grid
grid_dens1$dens = dmvnorm(grid_dens1[, c("x1", "x2")], mean = mu1, sigma = S1)
grid_dens2 = grid
grid_dens2$dens = dmvnorm(grid_dens2[, c("x1", "x2")], mean = mu2, sigma = S2)

# PLOT -------------------------------------------------------------------------

# Generate the plot
orig_data = as.data.frame(task$data())
pl = ggplot() +
geom_tile(data = grid, aes(x = x1, y = x2, fill = pred_class, alpha = max_prob)) +
geom_contour(data = grid_dens1, aes(x = x1, y = x2, z = dens), color = "#E69F00", alpha = 0.9, lwd = 1.5, bins = 10) +
geom_contour(data = grid_dens2, aes(x = x1, y = x2, z = dens), color = "#56B4E9", alpha = 0.9, lwd = 1.5, bins = 10) +
geom_point(data = orig_data, aes(x = x1, y = x2, color = y), size = point_size) +
guides(shape = FALSE, alpha = FALSE) +
scale_fill_manual(values = c("a" = "#E69F00", "b" = "#56B4E9")) +
scale_color_manual(values = c("a" = "#E69F00", "b" = "#56B4E9")) +
labs(x = expression(x[1]), y = expression(x[2]), color = "class", fill = "class") +
theme_minimal() +
scale_alpha(range = c(0.1, 0.5), guide = 'none') +
theme(
plot.title = element_text(hjust = 0.5, size = base_size, face = "bold"),
axis.title = element_text(size = base_size, face = "bold"),
axis.text = element_text(size = base_size * 0.75, face = "bold"),
legend.title = element_text(size = base_size, face = "bold"),
legend.text = element_text(size = base_size * 0.75, face = "bold")
)

# Save directly to PNG
ggsave(filename = "../figure/nb-db.png", plot = pl, width = plot_width, height = plot_height, dpi = plot_dpi)
35 changes: 35 additions & 0 deletions slides/supervised-classification/rsrc/nb-bench.R
@@ -0,0 +1,35 @@
# goal is to benchmark QDA versus NB versus LDA
library(mlr3)
library(mlr3learners)
library(mlr3pipelines)
library(mlr3viz)
library(ggplot2)
library(data.table)

set.seed(123)

task = tsk("spam")

# PCA pre-processing: without it, QDA fails with a rank-deficiency error
# (singular class covariance estimates)
pca = po("pca", rank. = 56) # we only reduce the dimension by 1

lrn_qda = GraphLearner$new(pca %>>% lrn("classif.qda"))
lrn_nb = GraphLearner$new(pca %>>% lrn("classif.naive_bayes"))
lrn_lda = GraphLearner$new(pca %>>% lrn("classif.lda"))

learners = list(lrn_nb, lrn_lda, lrn_qda)
bmr = benchmark(benchmark_grid(task, learners, rsmp("cv", folds = 5)))

a <- autoplot(bmr, type = "boxplot") +
ylab("CE for 5-fold CV") +
xlab("Learners") +
scale_x_discrete(labels = c("QDA", "NB", "LDA")) +
theme_minimal() +
theme(
axis.title = element_text(size = 22, face = "bold"),
axis.text = element_text(size = 20, face = "bold"),
legend.title = element_text(size = 22, face = "bold"),
legend.text = element_text(size = 20, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1)
)
ggsave("../figure/nb-bench.png", plot = a, width = 12, height = 8, dpi = 300)
@@ -49,7 +49,7 @@
\item LDA models both classes using normal distributions with equal standard deviations (identical shapes).
\end{itemize}
\begin{center}
\includegraphics[width=0.95\textwidth, clip=true, trim={0 0 0 0}]{figure/disc_univariate-1.png}
\end{center}
\centerline{The optimal separation is located at the intersection (= decision boundary)!}
\end{small}
@@ -192,16 +192,16 @@
Parameters $\thetav$ are estimated in a straightforward manner by:\\
\begin{equation*}
\begin{aligned}
\pikh &= \frac{n_k}{n},\text{ where $n_k$ is the number of class-$k$ observations} \\
\mukh &= \frac{1}{n_k}\sum_{i:\yi = k} \xi \\
\Sigmah_k &= \frac{1}{n_k - 1} \sum_{i: \yi = k} (\xi - \mukh) (\xi - \mukh)^T \quad \quad \text{ (QDA)} \\
\Sigmah &= \frac{1}{n - g} \sumkg \sum_{i: \yi = k} (\xi - \mukh) (\xi - \mukh)^T \quad \text{(LDA)} \\
\end{aligned}
\end{equation*}

\lz

As $\Sigmah_k, \Sigmah$ are $p \times p$ matrices (for $p$ features), estimating all $\Sigmah_k$ involves $\frac{p(p+1)}{2} \cdot g$ parameters across $g$ classes (vs. just $\frac{p(p+1)}{2}$ for LDA's $\Sigmah$).
\end{vbframe}
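
For intuition, a minimal R sketch of the estimators above on hypothetical toy data (the data frame `df` and its columns are made up for illustration; this is not code from the slides):

# Minimal sketch of the LDA/QDA estimators on hypothetical toy data.
set.seed(1)
df = data.frame(x1 = rnorm(100), x2 = rnorm(100),
                y = factor(sample(c("a", "b"), 100, replace = TRUE)))
X = as.matrix(df[, c("x1", "x2")])
n = nrow(X)
classes = levels(df$y)
g = length(classes)

pi_hat = table(df$y) / n  # class priors n_k / n
mu_hat = lapply(classes, function(k) colMeans(X[df$y == k, , drop = FALSE]))

# QDA: one covariance estimate per class (cov() divides by n_k - 1)
Sigma_k = lapply(classes, function(k) cov(X[df$y == k, , drop = FALSE]))

# LDA: pooled covariance estimate, divides by n - g
Sigma_pooled = Reduce(`+`, lapply(classes, function(k) {
  Xk = X[df$y == k, , drop = FALSE]
  (nrow(Xk) - 1) * cov(Xk)
})) / (n - g)
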

\begin{vbframe}{QDA parameter estimation example}
@@ -11,7 +11,7 @@
}{% Lecture title
Naive Bayes
}{% Relative path to title page image: Can be empty but must not start with slides/
figure/nb-db
}{% Learning goals, wrapped inside itemize environment
\item Understand the idea of Naive Bayes
\item Understand in which sense Naive Bayes is a special QDA model
@@ -36,89 +36,74 @@

\begin{vbframe}{NB: Numerical Features}

We use a univariate Gaussian for $p(x_j | y=k)$, and estimate $(\mu_{kj}, \sigma^2_{kj})$ in the standard manner. Because of $\pdfxyk = \prodjp p(x_j|y = k)$, the joint conditional density is Gaussian with diagonal but non-isotropic covariance structure, and potentially different across classes.

\begin{center}
\includegraphics[width=0.79\textwidth, clip = true, trim = {0 20 0 20}]{figure/nb-db.png}
\end{center}

$\Rightarrow$ \textbf{NB is a} (specific) \textbf{QDA model}, with a quadratic decision boundary.

\end{vbframe}
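
As a side note, the factorized Gaussian model can be written out in a few lines of R. This is a sketch on made-up data, not the code that produced the figure above:

# Gaussian NB by hand for numeric features (hypothetical toy data):
# fit a univariate normal per class and feature, then multiply the densities.
set.seed(1)
df = data.frame(x1 = rnorm(60), x2 = rnorm(60),
                y = factor(rep(c("a", "b"), each = 30)))
classes = levels(df$y)

params = lapply(classes, function(k) {
  Xk = df[df$y == k, c("x1", "x2")]
  list(mu = colMeans(Xk), sd = apply(Xk, 2, sd), prior = mean(df$y == k))
})
names(params) = classes

# unnormalized posterior for one observation x: pi_k * prod_j N(x_j | mu_kj, sigma_kj)
nb_score = function(x, par) par$prior * prod(dnorm(x, mean = par$mu, sd = par$sd))

x_new = c(x1 = 0.2, x2 = -0.5)
scores = sapply(params, function(par) nb_score(x_new, par))
scores / sum(scores)  # posterior probabilities p(y = k | x)
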
\begin{frame}{NB: Categorical Features}

We use a categorical distribution for $p(x_j | y = k)$ and estimate the probabilities $p_{kjm}$ that, in class $k$, our $j$-th feature has value $m$, $x_j = m$, simply by counting the frequencies:

$$
p(x_j | y = k) = \prod_m p_{kjm}^{[x_j = m]}
$$

Because of the simple conditional independence structure, it is also very easy to deal with mixed numerical / categorical feature spaces.

\begin{flushright}
% SOURCE: https://docs.google.com/presentation/d/1X2FxetT6fewXhoGLZmgJyEHY_pWbKjQ6AHZiZQHvwzA/edit?usp=sharing
\only<1>{\includegraphics[width=\textwidth, clip = true, trim = {50 400 120 350}]{figure_man/nb-categorial_1.png}}
\only<2>{\includegraphics[width=\textwidth, clip = true, trim = {50 410 120 350}]{figure_man/nb-categorial_2.png}}
\only<3>{\includegraphics[width=\textwidth, clip = true, trim = {50 410 120 350}]{figure_man/nb-categorial_3.png}}
\end{flushright}

\end{frame}
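
To make the frequency estimate concrete, a small R sketch on made-up data (the column names `y` and `xj` and the values are hypothetical):

# Frequency-based estimate of p_kjm for one categorical feature x_j:
# within each class k, the share of observations with x_j = m (toy data).
df = data.frame(y  = factor(c("no", "no", "yes", "yes", "yes", "no")),
                xj = factor(c("1st", "2nd", "2nd", "3rd", "1st", "3rd")))
p_kjm = prop.table(table(df$y, df$xj), margin = 1)  # rows = classes k, each row sums to 1
p_kjm
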

\begin{vbframe}{Laplace Smoothing}
\begin{small}
If a given class and feature value never occur together in the training data, then the frequency-based probability estimate will be zero, e.g., $p_{\text{no, class, 1st}} = 0$.

\lz

When computing $\postk$, any zero probability will cause the entire product to be zero, negating information from other features:

$$
\P(\text{no} | \text{class = 1st, sex = male}) = \frac{p_{\text{no, class, 1st}} \cdot p_{\text{no, sex, male}} \cdot \pikh[no]}{\sumjg p(\text{class = 1st, sex = male} | y = j)\pih_j} = 0
$$

This is problematic because it will wipe out all information in the other probabilities when they are multiplied!

\lz

$\Rightarrow$ A simple numerical correction is to set these zero probabilities to a small value to regularize against this case.
\end{small}

\end{vbframe}
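
One common form of this correction is add-alpha (Laplace) pseudo-counts. A minimal R sketch on made-up data, so that an estimate like $p_{\text{no, class, 1st}}$ above cannot be exactly zero:

# Laplace smoothing: add a pseudo-count alpha to every (class, value) cell so
# that no estimated probability is exactly zero (hypothetical toy data).
df = data.frame(y  = factor(c("no", "no", "yes", "yes", "yes")),
                xj = factor(c("2nd", "3rd", "1st", "2nd", "1st"),
                            levels = c("1st", "2nd", "3rd")))
alpha = 1
counts = table(df$y, df$xj)          # raw co-occurrence counts
M = ncol(counts)                     # number of distinct feature values
p_smooth = (counts + alpha) / (rowSums(counts) + alpha * M)
counts["no", "1st"]                  # 0: "no" and "1st" never co-occur
p_smooth["no", "1st"]                # > 0 after smoothing; rows still sum to 1
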

\begin{vbframe}{Naive Bayes: application as spam filter}
\begin{itemize}
\item In the late 90s, Naive Bayes became popular for e-mail spam filter programs
\item Word counts were used as features to detect spam mails (e.g., "Viagra" often occurs in spam mail)
\item The independence assumption implies that the occurrences of two words in a mail are uncorrelated
\item Seems naive ("Viagra" is more likely to occur in the context of "Buy now" than of "flower"), but leads to fewer required parameters and therefore better generalization.
\end{itemize}
\end{vbframe}
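
To illustrate what the independence assumption buys computationally, a log-space scoring sketch with invented word probabilities (not estimated from any real spam corpus):

# NB spam scoring in log space: log prior + sum of count-weighted log word
# probabilities; the probabilities below are made up for illustration.
log_prior = c(spam = log(0.4), ham = log(0.6))
p_word = rbind(spam = c(viagra = 0.050, buy = 0.100, flower = 0.010, meeting = 0.020),
               ham  = c(viagra = 0.001, buy = 0.020, flower = 0.030, meeting = 0.100))
mail_counts = c(viagra = 2, buy = 1, flower = 0, meeting = 0)  # word counts in one mail

log_score = log_prior + as.vector(log(p_word) %*% mail_counts)
names(which.max(log_score))   # predicted class for this mail
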

\begin{vbframe}{Benchmarking SPAM}
Benchmarking QDA, Naive Bayes and LDA on $\texttt{spam}$:

\begin{center}
\includegraphics[clip=true, trim={0 0 0 17}, width=0.80\linewidth]{figure/nb-bench.png}
\end{center}

$\Rightarrow$ In practice, NB often performs well even when the independence assumption is violated!
\end{vbframe}

\endlecture

