Commit

Merge pull request #1268 from slds-lmu/classif-nb
classif NB
giuseppec authored Nov 8, 2024
2 parents 233b864 + a48c95a commit 5a95909
Showing 14 changed files with 195 additions and 61 deletions.
Binary file modified slides/supervised-classification/figure/disc_univariate-1.png
Binary file modified slides/supervised-classification/figure/disc_univariate-2.png
Binary file modified slides/supervised-classification/figure/disc_univariate-3.png
Binary file modified slides/supervised-classification/figure/disc_univariate-4.png
16 changes: 12 additions & 4 deletions slides/supervised-classification/rsrc/fig-disc_univariate.R
@@ -9,18 +9,26 @@ class = as.factor(rep(c("man", "woman"), each = 10))
df = data.frame(x, class)
str(df)

# plot 1
png("../figure/disc_univariate-1.png", width = 3000, height = 1300, res = 300)
par(mar = c(4,4,1,1))

layout(matrix(c(1, 2), nrow = 1, ncol = 2), widths = c(1, 2))
par(mar = c(4, 4, 1, 0.5))
hist(bh.man, breaks = seq(150, 200, by = 2), col = rgb(0, 0, 0, 0.5),
xlim = c(150, 200), ylim = c(0, 3), xlab = "body height (cm)", ylab = "frequency",
main = "")
hist(bh.woman, breaks = seq(150, 200, by = 2), col = rgb(1, 0, 0, 0.5), add = TRUE)
legend("topright", legend = c("men", "women"), fill = c(rgb(0, 0, 0, 0.5), rgb(1, 0, 0, 0.5)))

par(mar = c(4, 4, 1, 1))
x = seq(150, 200, length.out = 100)
plot(x, dnorm(x, mean = 170, sd = 4), type = "l", col = "red", lty = 2,
  xlim = c(150, 200), ylim = c(0, 0.1), ylab = "p(x|y=k)", xlab = "x (body height in cm)")
abline(v = 175, col = "grey", lty = 2)
axis(1, at = seq(155, 195, by = 10))
lines(x, dnorm(x, mean = 180, sd = 4))
legend("topright", legend = c("k = woman ~ N(170, 4)", "k = man ~ N(180, 4)"),
col = c("red", "black"), lty = c(2, 1))

dev.off()

# plot 2
106 changes: 106 additions & 0 deletions slides/supervised-classification/rsrc/fig-nb-db.R
@@ -0,0 +1,106 @@
# PREREQ -----------------------------------------------------------------------

library(knitr)
library(ggplot2)
library(MASS)
library(mlr3)
library(mlr3learners)
library(mlr3viz)
library(mvtnorm)

# common settings
set.seed(1234)
plot_width <- 20
plot_height <- 10
plot_dpi <- 300
line_size <- 5
base_size <- 40
point_size <- 5

options(digits = 3,
width = 65,
str = strOptions(strict.width = "cut", vec.len = 3))

# DATA -------------------------------------------------------------------------

set.seed(123)

n = 300

classa = data.frame(mvrnorm(n = n,
mu = c(2, 2),
Sigma = matrix(c(2, 0, 0, 2),
ncol = 2,
byrow = TRUE)))

classb = data.frame(mvrnorm(n = n,
mu = c(10, 7),
Sigma = matrix(c(8, -6, -6, 8),
ncol = 2,
byrow = TRUE)))

df = cbind(classa, factor(rep("a", nrow(classa))))  # one label per observation
colnames(df) = c("x1", "x2", "y")

foo = cbind(classb, factor(rep("b", nrow(classb))))
colnames(foo) = c("x1", "x2", "y")

df = rbind(df, foo)

task = TaskClassif$new("gauss_task",
backend = df,
target = "y",
positive = "a")

learner = lrn("classif.naive_bayes", predict_type = "prob")
learner$train(task)

tab = learner$model$tables
mus = data.frame(x1 = tab$x1[, 1], x2 = tab$x2[, 1])
mu1 = as.numeric(mus[1,])
mu2 = as.numeric(mus[2,])
sds = data.frame(x1 = tab$x1[, 2], x2 = tab$x2[, 2])
# the naive Bayes tables store standard deviations; dmvnorm() expects a
# covariance matrix, so square the sds and coerce the rows to numeric vectors
S1 = diag(as.numeric(sds[1, ])^2)
S2 = diag(as.numeric(sds[2, ])^2)

x1seq = seq(min(df$x1), max(df$x1), length.out = 100)
x2seq = seq(min(df$x2), max(df$x2), length.out = 100)

# Creating grid for predictions
grid = expand.grid(x1 = x1seq, x2 = x2seq)
grid_preds = as.data.frame(learner$predict_newdata(grid)$prob)
grid_preds$pred_class = factor(apply(grid_preds, 1, function(row) ifelse(row["a"] > row["b"], "a", "b")))
grid_preds$max_prob = apply(grid_preds[, c("a", "b")], 1, max)
grid = cbind(grid, grid_preds)

# Recompute density for visualizing distributions
grid_dens1 = grid
grid_dens1$dens = dmvnorm(grid_dens1[, c("x1", "x2")], mean = mu1, sigma = S1)
grid_dens2 = grid
grid_dens2$dens = dmvnorm(grid_dens2[, c("x1", "x2")], mean = mu2, sigma = S2)

# PLOT -------------------------------------------------------------------------

# Generate the plot
orig_data = as.data.frame(task$data())
pl = ggplot() +
geom_tile(data = grid, aes(x = x1, y = x2, fill = pred_class, alpha = max_prob)) +
geom_contour(data = grid_dens1, aes(x = x1, y = x2, z = dens), color = "#E69F00", alpha = 0.9, lwd = 1.5, bins = 10) +
geom_contour(data = grid_dens2, aes(x = x1, y = x2, z = dens), color = "#56B4E9", alpha = 0.9, lwd = 1.5, bins = 10) +
geom_point(data = orig_data, aes(x = x1, y = x2, color = y), size = point_size) +
guides(shape = FALSE, alpha = FALSE) +
scale_fill_manual(values = c("a" = "#E69F00", "b" = "#56B4E9")) +
scale_color_manual(values = c("a" = "#E69F00", "b" = "#56B4E9")) +
labs(x = expression(x[1]), y = expression(x[2]), color = "class", fill = "class") +
theme_minimal() +
scale_alpha(range = c(0.1, 0.5), guide = 'none') +
theme(
plot.title = element_text(hjust = 0.5, size = base_size, face = "bold"),
axis.title = element_text(size = base_size, face = "bold"),
axis.text = element_text(size = base_size * 0.75, face = "bold"),
legend.title = element_text(size = base_size, face = "bold"),
legend.text = element_text(size = base_size * 0.75, face = "bold")
)

# Save directly to PNG
ggsave(filename = "../figure/nb-db.png", plot = pl, width = plot_width, height = plot_height, dpi = plot_dpi)
35 changes: 35 additions & 0 deletions slides/supervised-classification/rsrc/nb-bench.R
@@ -0,0 +1,35 @@
# goal is to benchmark QDA versus NB versus LDA
library(mlr3)
library(mlr3learners)
library(mlr3pipelines)
library(mlr3viz)
library(ggplot2)
library(data.table)

set.seed(123)

task = tsk("spam")

# PCA pre-processing: without it, QDA fails with a rank-deficiency error
# (singular class covariance estimates)
pca = po("pca", rank. = 56) # we only reduce the dimension by 1

lrn_qda = GraphLearner$new(pca %>>% lrn("classif.qda"))
lrn_nb = GraphLearner$new(pca %>>% lrn("classif.naive_bayes"))
lrn_lda = GraphLearner$new(pca %>>% lrn("classif.lda"))

learners = list(lrn_nb, lrn_lda, lrn_qda)
bmr = benchmark(benchmark_grid(task, learners, rsmp("cv", folds = 5)))

a <- autoplot(bmr, type = "boxplot") +
ylab("CE for 5-fold CV") +
xlab("Learners") +
scale_x_discrete(labels = c("QDA", "NB", "LDA")) +
theme_minimal() +
theme(
axis.title = element_text(size = 22, face = "bold"),
axis.text = element_text(size = 20, face = "bold"),
legend.title = element_text(size = 22, face = "bold"),
legend.text = element_text(size = 20, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1)
)
ggsave("../figure/nb-bench.png", plot = a, width = 12, height = 8, dpi = 300)
@@ -49,7 +49,7 @@
\item LDA models both classes using normal distributions with equal standard deviations (identical shapes).
\end{itemize}
\begin{center}
\includegraphics[width=0.95\textwidth, clip=true, trim={0 0 0 0}]{figure/disc_univariate-1.png}
\end{center}
\centerline{The optimal separation is located at the intersection (= decision boundary)!}
\end{small}
@@ -192,16 +192,16 @@
Parameters $\thetav$ are estimated in a straightforward manner by:\\
\begin{equation*}
\begin{aligned}
\pikh &= \frac{n_k}{n},\text{ where $n_k$ is the number of class-$k$ observations} \\
\mukh &= \frac{1}{n_k}\sum_{i:\yi = k} \xi \\
\Sigmah_k &= \frac{1}{n_k - 1} \sum_{i: \yi = k} (\xi - \mukh) (\xi - \mukh)^T \quad \quad \text{ (QDA)} \\
\Sigmah &= \frac{1}{n - g} \sumkg \sum_{i: \yi = k} (\xi - \mukh) (\xi - \mukh)^T \quad \text{(LDA)} \\
\end{aligned}
\end{equation*}

\lz

As $\Sigmah_k, \Sigmah$ are $p \times p$ matrices (for $p$ features), estimating all $\Sigmah_k$ involves $\frac{p(p+1)}{2} \cdot g$ parameters across $g$ classes (vs. just $\frac{p(p+1)}{2}$ for LDA's $\Sigmah$).
\end{vbframe}
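
For intuition, a minimal R sketch of the estimators above on hypothetical toy data (the data frame `df` and its columns are made up for illustration; this is not code from the slides):

# Minimal sketch of the LDA/QDA estimators on hypothetical toy data.
set.seed(1)
df = data.frame(x1 = rnorm(100), x2 = rnorm(100),
                y = factor(sample(c("a", "b"), 100, replace = TRUE)))
X = as.matrix(df[, c("x1", "x2")])
n = nrow(X)
classes = levels(df$y)
g = length(classes)

pi_hat = table(df$y) / n  # class priors n_k / n
mu_hat = lapply(classes, function(k) colMeans(X[df$y == k, , drop = FALSE]))

# QDA: one covariance estimate per class (cov() divides by n_k - 1)
Sigma_k = lapply(classes, function(k) cov(X[df$y == k, , drop = FALSE]))

# LDA: pooled covariance estimate, divides by n - g
Sigma_pooled = Reduce(`+`, lapply(classes, function(k) {
  Xk = X[df$y == k, , drop = FALSE]
  (nrow(Xk) - 1) * cov(Xk)
})) / (n - g)
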

\begin{vbframe}{QDA parameter estimation example}
@@ -11,7 +11,7 @@
}{% Lecture title
Naive Bayes
}{% Relative path to title page image: Can be empty but must not start with slides/
figure/nb-db
}{% Learning goals, wrapped inside itemize environment
\item Understand the idea of Naive Bayes
\item Understand in which sense Naive Bayes is a special QDA model
@@ -36,89 +36,74 @@

\begin{vbframe}{NB: Numerical Features}

We use a univariate Gaussian for $p(x_j | y=k)$, and estimate $(\mu_{kj}, \sigma^2_{kj})$ in the standard manner. Because of $\pdfxyk = \prodjp p(x_j|y = k)$, the joint conditional density is Gaussian with diagonal but non-isotropic covariance structure, and potentially different across classes.

\begin{center}
\includegraphics[width=0.79\textwidth, clip = true, trim = {0 20 0 20}]{figure/nb-db.png}
\end{center}

$\Rightarrow$ \textbf{NB is a} (specific) \textbf{QDA model}, with a quadratic decision boundary.

\end{vbframe}
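
As a side note, the factorized Gaussian model can be written out in a few lines of R. This is a sketch on made-up data, not the code that produced the figure above:

# Gaussian NB by hand for numeric features (hypothetical toy data):
# fit a univariate normal per class and feature, then multiply the densities.
set.seed(1)
df = data.frame(x1 = rnorm(60), x2 = rnorm(60),
                y = factor(rep(c("a", "b"), each = 30)))
classes = levels(df$y)

params = lapply(classes, function(k) {
  Xk = df[df$y == k, c("x1", "x2")]
  list(mu = colMeans(Xk), sd = apply(Xk, 2, sd), prior = mean(df$y == k))
})
names(params) = classes

# unnormalized posterior for one observation x: pi_k * prod_j N(x_j | mu_kj, sigma_kj)
nb_score = function(x, par) par$prior * prod(dnorm(x, mean = par$mu, sd = par$sd))

x_new = c(x1 = 0.2, x2 = -0.5)
scores = sapply(params, function(par) nb_score(x_new, par))
scores / sum(scores)  # posterior probabilities p(y = k | x)
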
\begin{frame}{NB: Categorical Features}

We use a categorical distribution for $p(x_j | y = k)$ and estimate the probabilities $p_{kjm}$ that, in class $k$, our $j$-th feature has value $m$, $x_j = m$, simply by counting the frequencies:

$$
p(x_j | y = k) = \prod_m p_{kjm}^{[x_j = m]}
$$

Because of the simple conditional independence structure, it is also very easy to deal with mixed numerical / categorical feature spaces.

\begin{flushright}
% SOURCE: https://docs.google.com/presentation/d/1X2FxetT6fewXhoGLZmgJyEHY_pWbKjQ6AHZiZQHvwzA/edit?usp=sharing
\only<1>{\includegraphics[width=\textwidth, clip = true, trim = {50 400 120 350}]{figure_man/nb-categorial_1.png}}
\only<2>{\includegraphics[width=\textwidth, clip = true, trim = {50 410 120 350}]{figure_man/nb-categorial_2.png}}
\only<3>{\includegraphics[width=\textwidth, clip = true, trim = {50 410 120 350}]{figure_man/nb-categorial_3.png}}
\end{flushright}

\end{frame}
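
To make the frequency estimate concrete, a small R sketch on made-up data (the column names `y` and `xj` and the values are hypothetical):

# Frequency-based estimate of p_kjm for one categorical feature x_j:
# within each class k, the share of observations with x_j = m (toy data).
df = data.frame(y  = factor(c("no", "no", "yes", "yes", "yes", "no")),
                xj = factor(c("1st", "2nd", "2nd", "3rd", "1st", "3rd")))
p_kjm = prop.table(table(df$y, df$xj), margin = 1)  # rows = classes k, each row sums to 1
p_kjm
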

\begin{vbframe}{Laplace Smoothing}
\begin{small}
If a given class and feature value never occur together in the training data, then the frequency-based probability estimate will be zero, e.g., $p_{\text{no, class, 1st}} = 0$.

\lz

When computing $\postk$, any zero probability will cause the entire product to be zero, negating information from other features:

$$
\P(\text{no} | \text{class = 1st, sex = male}) = \frac{p_{\text{no, class, 1st}} \cdot p_{\text{no, sex, male}} \cdot \pikh[no]}{\sumjg p(\text{class = 1st, sex = male} | y = j)\pih_j} = 0
$$

This is problematic because it will wipe out all information in the other probabilities when they are multiplied!

\lz

$\Rightarrow$ A simple numerical correction is to set these zero probabilities to a small value to regularize against this case.
\end{small}

\end{vbframe}
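
One common form of this correction is add-alpha (Laplace) pseudo-counts. A minimal R sketch on made-up data, so that an estimate like $p_{\text{no, class, 1st}}$ above cannot be exactly zero:

# Laplace smoothing: add a pseudo-count alpha to every (class, value) cell so
# that no estimated probability is exactly zero (hypothetical toy data).
df = data.frame(y  = factor(c("no", "no", "yes", "yes", "yes")),
                xj = factor(c("2nd", "3rd", "1st", "2nd", "1st"),
                            levels = c("1st", "2nd", "3rd")))
alpha = 1
counts = table(df$y, df$xj)          # raw co-occurrence counts
M = ncol(counts)                     # number of distinct feature values
p_smooth = (counts + alpha) / (rowSums(counts) + alpha * M)
counts["no", "1st"]                  # 0: "no" and "1st" never co-occur
p_smooth["no", "1st"]                # > 0 after smoothing; rows still sum to 1
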

\begin{vbframe}{Naive Bayes: application as spam filter}
\begin{itemize}
\item In the late 90s, Naive Bayes became popular for e-mail spam filter programs
\item Word counts were used as features to detect spam mails (e.g., "Viagra" often occurs in spam mail)
\item The independence assumption implies that the occurrences of two words in a mail are uncorrelated
\item Seems naive ("Viagra" is more likely to occur in the context of "Buy now" than of "flower"), but leads to fewer required parameters and therefore better generalization.
\end{itemize}
\end{vbframe}
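
To illustrate what the independence assumption buys computationally, a log-space scoring sketch with invented word probabilities (not estimated from any real spam corpus):

# NB spam scoring in log space: log prior + sum of count-weighted log word
# probabilities; the probabilities below are made up for illustration.
log_prior = c(spam = log(0.4), ham = log(0.6))
p_word = rbind(spam = c(viagra = 0.050, buy = 0.100, flower = 0.010, meeting = 0.020),
               ham  = c(viagra = 0.001, buy = 0.020, flower = 0.030, meeting = 0.100))
mail_counts = c(viagra = 2, buy = 1, flower = 0, meeting = 0)  # word counts in one mail

log_score = log_prior + as.vector(log(p_word) %*% mail_counts)
names(which.max(log_score))   # predicted class for this mail
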

\begin{vbframe}{Benchmarking SPAM}
Benchmarking QDA, Naive Bayes and LDA on $\texttt{spam}$:

\begin{center}
\includegraphics[clip=true, trim={0 0 0 17}, width=0.80\linewidth]{figure/nb-bench.png}
\end{center}

$\Rightarrow$ In practice, NB often performs well even when the independence assumption is violated!
\end{vbframe}

\endlecture

