
Commit

recorded classif 1,2,4
ludwigbothmann committed Nov 3, 2024
1 parent d9433a1 commit 15423f7
Showing 9 changed files with 222 additions and 157 deletions.
Binary file modified slides-pdf/slides-classification-basicdefs.pdf
Binary file not shown.
Binary file modified slides-pdf/slides-classification-logistic.pdf
Binary file not shown.
Binary file modified slides-pdf/slides-classification-tasks.pdf
Binary file not shown.
Binary file modified slides/supervised-classification/figure_man/threshold_1.png
122 changes: 63 additions & 59 deletions slides/supervised-classification/slides-classification-basicdefs.tex
@@ -14,27 +14,27 @@
}{% Relative path to title page image: Can be empty but must not start with slides/
figure/db_examples
}{% Learning goals, wrapped inside itemize environment
\item Understand why some classifiers output a score / probability % and not a class
\item Understand the difference between scoring and probabilistic classifiers
\item Know the concept of decision regions and boundaries
\item Know the difference between generative and discriminant approach
\item Basic notation
\item Hard labels vs. probabilities vs. scores
\item Decision regions and boundaries
\item Generative vs. discriminant approaches
}


\begin{vbframe}{Classification Tasks}
\begin{vbframe}{Notation and Target Encoding}

\begin{itemize}
\item In classification, we aim to predict a discrete output

$$
y \in \Yspace = \{C_1, ..., C_g\}
$$
with $2 \le g < \infty$, given data $\D$

with $2 \le g < \infty$, given data $\D$ and $\Yspace = \gset$.
\item For convenience, we often encode these classes differently

\item \textbf{Binary case, $g = 2$}: We assume the classes to be encoded as $\Yspace = \setzo$ or $\Yspace = \setmp$.
\item \textbf{Binary case, $g = 2$}: Usually use $\Yspace = \setzo$ or $\Yspace = \setmp$

\item \textbf{Multiclass case, $g \ge 3$}: We use \textbf{one-hot encoding} $o(y)$, i.e., a $g$-length encoding vector with $o_k(y) = \I(y = k) \in \{0,1\}$ to represent multiple classes numerically:
\item \textbf{Multiclass case, $g \ge 3$}: Could use $\Yspace = \gset$, but often use \textbf{one-hot encoding} $o(y)$, i.e., $g$-length vector with $o_k(y) = \I(y = k) \in \{0,1\}$:

\end{itemize}
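As a concrete illustration of the one-hot encoding $o(y)$, here is a minimal Python sketch (the function name and the 1-based class labels are assumptions for illustration, not part of the slide source):

# One-hot encoding o(y) for g classes; assumes classes are labeled 1, ..., g
def one_hot(y, g):
    # o_k(y) = I(y = k) for k = 1, ..., g
    return [1 if y == k else 0 for k in range(1, g + 1)]

print(one_hot(2, 4))  # [0, 1, 0, 0]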

@@ -46,29 +46,34 @@


\begin{vbframe}{Classification Models}
We defined models $f: \Xspace \to \R^g$ as functions that output (continuous) \textbf{scores} / \textbf{probabilities} and \textbf{not} (discrete) classes. Why?

\begin{itemize}
\item From an optimization perspective, it is \textbf{much} (!) easier to optimize costs for continuous-valued functions
\item Scores / probabilities (for classes) contain more information than the class labels alone
\item As we will see later, scores can easily be transformed into class labels; but class labels cannot be transformed into scores

\item While for regression the model $f: \Xspace \to \R$ simply maps to the label space $\Yspace=\R$, classification is slightly more complicated.
\item We sometimes like our models to output (hard) classes,
sometimes probabilities, sometimes class scores. The latter two are vectors.
\item The most basic / common form is the score-based classifier, which is why we already defined models as $f: \Xspace \to \R^g$.
\item To minimize confusion, we distinguish between all 3 in notation: $\hx$ for hard labels, $\pix$ for probabilities and $\fx$ for scores

\item Why all of that and not only hard labels? a) Scores / probabilities are more informative than hard class predictions; b) from an optimization perspective, it is much (!) easier to work with continuous values.

\end{itemize}

We distinguish \textbf{scoring} and \textbf{probabilistic} classifiers.
\end{vbframe}


\begin{vbframe}{Scoring Classifiers}
\begin{itemize}
% \item Scoring classifiers assume the output variable to be -1/+1-encoded, i. e. $\Yspace = \{-1, 1\}$
\item Construct $g$ \textbf{discriminant} / \textbf{scoring functions} $f_1, ..., f_g: \Xspace \to \R$
\item Scores $f_1(\xv), \ldots, \fkx[g]$ are transformed into classes by choosing the class with the maximum score
\item Predicted class is usually the one with max score
$$
h(\xv) = \argmax_{k \in \gset} \fkx[k].
h(\xv) = \argmax_{k \in \gset} \fkx[k]
$$

\item For $g = 2$, a single discriminant function $\fx = f_{1}(\xv) - f_{-1}(\xv)$ is sufficient (note that it would be natural here to label the classes with $\setmp$), class labels are constructed by $\hx = \text{sgn}(\fx)$
\item $|\fx|$ is called \enquote{confidence}
\item For $g = 2$, a single discriminant function $\fx = f_{1}(\xv) - f_{-1}(\xv)$ is sufficient (here, it's natural to label classes with $\setmp$; we use a slight abuse of notation for the subscripts); \\
class labels are then constructed by $\hx = \text{sgn}(\fx)$
\item $|\fx|$ or $|\fkx|$ is loosely called \enquote{confidence}
\end{itemize}
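A small Python sketch of the scoring rule above, with made-up score values (argmax over classes; sign of $f = f_1 - f_{-1}$ in the binary case):

# Hypothetical scores f_1(x), ..., f_g(x) for one input x, g = 3
scores = [0.4, 2.1, -0.3]

# Predicted class = argmax of the scores (1-based class index)
h = max(range(1, len(scores) + 1), key=lambda k: scores[k - 1])
print(h)  # 2

# Binary case: single discriminant f(x) = f_{+1}(x) - f_{-1}(x), label via sign
f_pos, f_neg = 1.3, 0.2
f = f_pos - f_neg
h_bin = 1 if f >= 0 else -1
print(h_bin, abs(f))  # hard label and (loosely) the "confidence" |f(x)|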

\vspace{-0.3cm}
@@ -82,15 +87,14 @@
\begin{vbframe}{Probabilistic Classifiers}
\begin{itemize}
% \item Probabilistic classifiers assume the output variable to be 0/1-encoded, i. e. $\Yspace = \{0, 1\}$
\item Construct $g$ \textbf{probability functions} $\pi_1, ..., \pi_g: \Xspace \to [0, 1],~\sum_{k = 1}^g \pi_k = 1$
\item Probabilities $\pi_1(\xv), \ldots, \pikx[g]$ are transformed into labels by predicting the class with the maximum probability
\item Construct $g$ \textbf{probability functions} $\pi_1, ..., \pi_g: \Xspace \to [0, 1],~\sumkg \pikx = 1$
\item Predicted class is usually the one with max probability
$$
\hx = \argmax_{k \in \gset} \pikx
$$
\item For $g = 2$ a single $\pix$ is constructed (note that it would be natural here to label the classes with $\setzo$)
\item For $g = 2$, a single $\pix$ is constructed, which models the predicted probability for the positive class (natural to encode $\Yspace = \setzo$)
\end{itemize}


\begin{center}
\includegraphics{figure_man/probabilities.png}
\end{center}
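A sketch of a probabilistic output: made-up scores are mapped to probabilities with softmax (one common choice, used here only for illustration); the resulting $\pi_k(\xv)$ lie in $[0, 1]$, sum to 1, and the predicted class is the argmax:

import math

scores = [0.4, 2.1, -0.3]                 # hypothetical scores for one x
exps = [math.exp(s) for s in scores]
pi = [e / sum(exps) for e in exps]        # pi_k(x) in [0, 1]

print([round(p, 3) for p in pi])          # probabilities, sum to 1 (up to rounding)
h = max(range(1, len(pi) + 1), key=lambda k: pi[k - 1])
print(h)                                  # class with max probability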
@@ -100,10 +104,12 @@
\begin{frame}{Thresholding}

\begin{itemize}
\item For imbalanced cases or classes with unequal costs, we might want to deviate from the standard conversion of scores to classes
\item Introduce basic concept (for binary case) and add details later
\item Convert scores or probabilities to class outputs by thresholding: \\[0.5ex]
$\hx:= [\pix \ge c]$ or $\hx = [\fx \ge c]$ for some threshold $c$
$\hx:= [\pix \ge c]$ or $\hx := [\fx \ge c]$ for some threshold $c$
\item Standard thresholds: $c = 0.5$ for probabilities, $c = 0$ for scores
\item There are also versions of thresholding for the multiclass case
%\item There are also versions of thresholding for the multiclass case

\end{itemize}
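A tiny sketch of thresholding with a non-default cutoff (hypothetical probabilities; e.g., lowering $c$ below 0.5 to catch more of a rare positive class):

probs = [0.12, 0.41, 0.55, 0.93]   # hypothetical pi(x) for four observations

def threshold(pi_x, c=0.5):
    # h(x) = [pi(x) >= c]
    return int(pi_x >= c)

print([threshold(p) for p in probs])         # default c = 0.5 -> [0, 0, 1, 1]
print([threshold(p, c=0.3) for p in probs])  # lower cutoff    -> [0, 1, 1, 1]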

@@ -123,35 +129,33 @@

\end{frame}

\begin{vbframe}{Decision regions and boundaries}
\begin{itemize}
\item A \textbf{decision region} for class $k$ is the set of input points $\xv$ where class $k$ is assigned as prediction of our model:
\begin{vbframe}{Decision regions}

Set of points $\xv$ where class $k$ is predicted:
$$
\Xspace_k = \{\xv \in \Xspace : \hx = k\}
$$

\item Points in space where the classes with maximal score are tied and the corresponding hypersurfaces are called \textbf{decision boundaries}
\end{itemize}

\begin{center}
% SOURCE: https://docs.google.com/presentation/d/1X2FxetT6fewXhoGLZmgJyEHY_pWbKjQ6AHZiZQHvwzA/edit#slide=id.p
\includegraphics{figure_man/decision_regions.png}
\end{center}
\end{vbframe}
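To make $\Xspace_k$ concrete, a small sketch that assigns grid points to decision regions under two hypothetical linear scoring functions (all values invented for illustration):

# Two made-up scoring functions: f_1(x) = x1 + x2, f_2(x) = 1 - x1
def scores(x1, x2):
    return [x1 + x2, 1.0 - x1]

def h(x1, x2):
    s = scores(x1, x2)
    return 1 + s.index(max(s))   # predicted class (1-based)

# Decision region X_k = set of (grid) points with h(x) = k
grid = [(i / 2, j / 2) for i in range(-4, 5) for j in range(-4, 5)]
region_1 = [x for x in grid if h(*x) == 1]
region_2 = [x for x in grid if h(*x) == 2]
print(len(region_1), len(region_2))   # how many grid points fall into each region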

\begin{vbframe}{Decision Boundaries}
Formally:
\textbf{Decision boundaries} are the hypersurfaces formed by the points in space where the classes with maximal score are tied

%Formally:
\begin{eqnarray*}
\{ \xv \in \Xspace: \exists~i \ne j \text{ s.t. } \fkx[i] = \fkx[j] \land \fkx[i], \fkx[j] \ge \fkx[k] ~ \forall k \ne i, j\}
\end{eqnarray*}

In the binary case we can simplify and generalize to the decision boundary for general threshold $c$:

In the binary case we can simply use the threshold:
$$
\{ \xv \in \Xspace : \fx = c \}
$$

If we set $c=0$ for scores and $c=0.5$ for probabilities, this is consistent with the definition above.
Setting $c=0$ for scores and $c=0.5$ for probs is consistent with the definition above.

\begin{center}
% SOURCE: https://docs.google.com/presentation/d/1X2FxetT6fewXhoGLZmgJyEHY_pWbKjQ6AHZiZQHvwzA/edit#slide=id.p
@@ -169,59 +173,59 @@

\end{vbframe}

\begin{vbframe}{Classification Approaches}
%\begin{vbframe}{Classification Approaches}

Two fundamental approaches exist to construct classifiers:\\
The \textbf{generative approach} and the \textbf{discriminant approach}.
%Two fundamental approaches exist to construct classifiers:\\
%The \textbf{generative approach} and the \textbf{discriminant approach}.

\lz
They tackle the classification problem from different angles:
%\lz
%They tackle the classification problem from different angles:

\begin{itemize}
\item \textbf{Generative} classification approaches assume a data-generating process in which the distribution of the features $\xv$ is different for the various classes of the output $y$, and try to learn these conditional distributions:\\ \enquote{Which $y$ tends to have $\xv$ like these?}
\lz
\item \textbf{Discriminant} approaches use \textbf{empirical risk minimization} based on a suitable loss function:\\ \enquote{What is the best prediction for $y$ given these $\xv$?}
\end{itemize}
\end{vbframe}
%\begin{itemize}
%\item \textbf{Generative} classification approaches assume a data-generating process in which the distribution of the features $\xv$ is different for the various classes of the output $y$, and try to learn these conditional distributions:\\ \enquote{Which $y$ tends to have $\xv$ like these?}
%\lz
%\item \textbf{Discriminant} approaches use \textbf{empirical risk minimization} based on a suitable loss function:\\ \enquote{What is the best prediction for $y$ given these $\xv$?}
%\end{itemize}
%\end{vbframe}

\begin{vbframe}{Generative approach}

The \textbf{generative approach}
models $\pdfxyk$ by making assumptions about these distributions, and employs Bayes' theorem:
$$\pikx = \postk = \frac{\P(\xv | y = k) \P(y = k)}{\P(\xv)} = \frac{\pdfxyk \pik}{\sumjg \pdfxyk[j] \pi_j}$$
Models the class-conditional $\pdfxyk$ and employs Bayes' theorem:
$$\pikx \approx \postk = \frac{\P(\xv | y = k) \P(y = k)}{\P(\xv)} = \frac{\pdfxyk \pik}{\sumjg \pdfxyk[j] \pi_j}$$

The prior probabilities, $\pi_k = \P(y = k)$, for each class $k$ can be estimated from the training data as the relative frequency of each class:
Prior probs $\pi_k = \P(y = k)$ can easily be estimated from the training data as the relative frequency of each class:

%\vspace{-1.2cm}

\begin{center}
% SOURCE: https://docs.google.com/presentation/d/1X2FxetT6fewXhoGLZmgJyEHY_pWbKjQ6AHZiZQHvwzA/edit#slide=id.p
\includegraphics[width=\textwidth]{figure_man/prior_probabilities.png}
\includegraphics[width=0.8\textwidth]{figure_man/prior_probabilities.png}
\end{center}

\end{vbframe}
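A sketch of the generative recipe on made-up 1D data: priors are estimated as relative class frequencies, the class-conditionals $p(\xv | y = k)$ are assumed Gaussian (an assumption of this sketch, as in LDA/QDA), and both are combined via Bayes' theorem:

import math

# Made-up 1D training data: (x, y) with classes 1 and 2
data = [(1.0, 1), (1.5, 1), (2.0, 1), (4.0, 2), (4.5, 2), (5.0, 2), (5.5, 2)]

def gaussian_pdf(x, mu, var):
    return math.exp(-(x - mu) ** 2 / (2 * var)) / math.sqrt(2 * math.pi * var)

classes = sorted(set(y for _, y in data))
prior, mu, var = {}, {}, {}
for k in classes:
    xs = [x for x, y in data if y == k]
    prior[k] = len(xs) / len(data)                        # pi_k = relative frequency
    mu[k] = sum(xs) / len(xs)                             # Gaussian fit of p(x | y = k)
    var[k] = sum((x - mu[k]) ** 2 for x in xs) / len(xs)

def posterior(x):
    # Bayes: pi_k(x) = p(x | k) pi_k / sum_j p(x | j) pi_j
    joint = {k: gaussian_pdf(x, mu[k], var[k]) * prior[k] for k in classes}
    z = sum(joint.values())
    return {k: joint[k] / z for k in classes}

print(posterior(3.0))   # posterior class probabilities at x = 3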

\begin{vbframe}{Visualizing the generative approach}
The decision boundary between classes is implicitly defined by their probability distributions.
\begin{vbframe}{Generative approach}
Decision boundary implicitly defined via the conditional distributions

\begin{center}
\includegraphics[width=0.5\textwidth]{figure/approach_generative.png}
\end{center}

\small{
Examples are Naive Bayes, Linear Discriminant Analysis and Quadratic Discriminant Analysis (non-linear). Note: LDA and QDA have 'discriminant' in their name, but are generative models!}

Examples are Naive Bayes, LDA and QDA. \\
NB: LDA and QDA have 'discriminant' in their name, but are generative!
\end{vbframe}

\begin{vbframe}{Discriminant approach}

Here we optimize the discriminant functions directly, usually via empirical risk minimization, resulting in explicit decision boundaries:
$$ \fh = \argmin_{f \in \Hspace} \riskef = \argmin_{f \in \Hspace} \sumin \Lxyi.$$
Here we optimize the discriminant functions (or better: their parameters) directly, usually via ERM:
$$ \fh = \argmin_{f \in \Hspace} \riskef = \argmin_{f \in \Hspace} \sumin \Lxyi$$

\begin{center}
\includegraphics[width=1.1\textwidth]{figure_man/disc_approach.png}
\end{center}

\small{
Examples are neural networks, logistic regression and support vector machines.
}
Examples are neural networks, logistic regression and SVMs

\end{vbframe}
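A sketch of the discriminant route on made-up 1D data: the parameters of a linear score $f(x) = w x + b$ are fitted by approximate empirical risk minimization with the logistic loss, using plain gradient descent (learning rate, iteration count and data are arbitrary illustrative choices):

import math

# Made-up 1D data with labels y in {-1, +1}
data = [(1.0, -1), (1.5, -1), (2.0, -1), (4.0, 1), (4.5, 1), (5.0, 1)]

w, b, lr = 0.0, 0.0, 0.1
for _ in range(2000):
    gw, gb = 0.0, 0.0
    for x, y in data:
        f = w * x + b
        # logistic loss L(y, f) = log(1 + exp(-y f)); dL/df = -y / (1 + exp(y f))
        g = -y / (1.0 + math.exp(y * f))
        gw += g * x / len(data)
        gb += g / len(data)
    w -= lr * gw
    b -= lr * gb

print(w, b)                                              # fitted parameters of the score
print([1 if w * x + b >= 0 else -1 for x, _ in data])    # hard labels via sign of f(x)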

77 changes: 48 additions & 29 deletions slides/supervised-classification/slides-classification-linear.tex
@@ -14,9 +14,9 @@
}{% Relative path to title page image: Can be empty but must not start with slides/
figure_man/linear_boundary.png
}{% Learning goals, wrapped inside itemize environment
\item Know definition of linear classifier
\item Understand the connection between linear classifiers and linear decision boundaries
\item Grasp linear separability
\item Linear classifier
\item Linear decision boundaries
\item Linear separability
}

\framebreak
@@ -25,7 +25,7 @@
\begin{vbframe}{Linear Classifiers}

Linear classifiers are an important subclass of classification models.
If the discriminant function(s) $\fkx$ can be specified as linear function(s) (possibly through a rank-preserving,
If the discriminant function(s) $\fkx$ can be written as affine linear function(s) (possibly through a rank-preserving,
monotone transformation $g: \R \to \R$), i.e.,

$$
@@ -36,56 +36,75 @@

\vfill

NB: $\bm{w}_k$ and $b_k$ do not directly refer to the parameters $\thetav_k$
of $k$-th scoring function $f_k$ but the transformed version.
NB: $\bm{w}_k$ and $b_k$ do not necessarily refer to the parameters $\thetav_k$, although they often coincide.

\end{vbframe}

\begin{vbframe}{Example}
We classify a point $\xv$ by assigning it to the class $k$ whose centroid $\muk$ is closest (least distance $d$), i.e., we assign $\xv$ to class 1 if $d_1 < d_2$:

\begin{vbframe}{linear decision boundaries}

We can also easily show that the decision boundary between classes $i$ and $j$ is a hyperplane. For every $\xv$ where there is a tie in scores:

\begin{eqnarray*}
\fkx[i] &=& \fkx[j] \\
g(\fkx[i]) &=& g(\fkx[j]) \\
\bm{w}_i^\top \xv + b_i &=& \bm{w}_j^\top \xv + b_j \\
\left(\bm{w}_i - \bm{w}_j\right)^\top \xv + \left(b_i - b_j\right) &=& 0
\end{eqnarray*}

This represents a \textbf{hyperplane} separating two classes:

\begin{center}
\includegraphics[width=0.33\textwidth]{figure_man/linear_boundary.png}
\end{center}
\end{vbframe}
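A quick numeric check of the hyperplane formula above with two hypothetical linear scoring functions: a point satisfying $(\bm{w}_i - \bm{w}_j)^\top \xv + (b_i - b_j) = 0$ gets tied scores:

# Hypothetical linear scores f_i(x) = w_i^T x + b_i in 2D
w_i, b_i = [1.0, 2.0], 0.5
w_j, b_j = [3.0, -1.0], -0.5

# Boundary: (w_i - w_j)^T x + (b_i - b_j) = 0, here -2 x1 + 3 x2 + 1 = 0
x = [2.0, 1.0]   # one point on that hyperplane

f_i = w_i[0] * x[0] + w_i[1] * x[1] + b_i
f_j = w_j[0] * x[0] + w_j[1] * x[1] + b_j
print(f_i, f_j)  # equal scores -> x lies on the decision boundary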

\begin{frame}{Example: 2 Classes with Centroids}


\only<1> {
\begin{itemize}
\item Let's model a binary problem by using a centroid $\muk$ per class as \enquote{parameters}.

\item We don't really care how the centroids are estimated. We could estimate them by using class means, but the following doesn't depend on it.

\item Classify a point $\xv$ by assigning it to the class $k$ with the nearest centroid $\muk$.

\end{itemize}
}

\only<2> {

%We classify a point $\xv$ by assigning it to the class $k$ whose centroid $\muk$ is closest (least distance $d$), i.e., we assign $\xv$ to class 1 if $d_1 < d_2$:
Let's calculate the boundary:
$$
d_1 = ||\xv - \muk[1]||^2 = \xv^\top \xv - 2 \xv^\top \muk[1] + \muk[1]^\top \muk[1]
< \xv^\top \xv - 2 \xv^\top \muk[2] + \muk[2]^\top \muk[2] = ||\xv - \muk[2]||^2 = d_2
= \xv^\top \xv - 2 \xv^\top \muk[2] + \muk[2]^\top \muk[2] = ||\xv - \muk[2]||^2 = d_2
$$

where $d_k$ denotes the squared Euclidean distance to centroid $\muk$. This implies:
$$
-2 \xv^\top \muk[1] + \muk[1]^\top \muk[1]
< -2 \xv^\top \muk[2] + \muk[2]^\top \muk[2]
= -2 \xv^\top \muk[2] + \muk[2]^\top \muk[2]
$$

Which simplifies to:
$$
2 \xv^\top (\muk[2] - \muk[1]) < \muk[2]^\top \muk[2] - \muk[1]^\top \muk[1]
2 \xv^\top (\muk[2] - \muk[1]) =\muk[2]^\top \muk[2] - \muk[1]^\top \muk[1]
$$

Thus, it's a linear classifier!
}

\vspace{-0.85em}
\begin{center}
\includegraphics[width=0.9\textwidth]{figure/nearest_centroid_classifier.png}
\end{center}

\end{vbframe}

\begin{vbframe}{linear decision boundaries}

We can also easily show that the decision boundary between classes $i$ and $j$ is a hyperplane. For every $\xv$ where there is a tie in scores:
\end{frame}
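A sketch of the centroid example: classify by the nearest centroid and check, for a few made-up points, that the equivalent linear rule $2 \xv^\top (\muk[2] - \muk[1]) < \muk[2]^\top \muk[2] - \muk[1]^\top \muk[1]$ gives the same decision (centroids and test points are arbitrary):

# Hypothetical 2D centroids for classes 1 and 2
mu1, mu2 = [0.0, 0.0], [3.0, 1.0]

def sq_dist(x, mu):
    return sum((xi - mi) ** 2 for xi, mi in zip(x, mu))

def h_centroid(x):
    return 1 if sq_dist(x, mu1) < sq_dist(x, mu2) else 2

def h_linear(x):
    # derived rule: 2 x^T (mu2 - mu1) < mu2^T mu2 - mu1^T mu1  =>  class 1
    lhs = 2 * sum(xi * (m2 - m1) for xi, m1, m2 in zip(x, mu1, mu2))
    rhs = sum(m ** 2 for m in mu2) - sum(m ** 2 for m in mu1)
    return 1 if lhs < rhs else 2

for x in ([0.5, 0.5], [2.0, 2.0], [3.0, 0.0]):
    print(x, h_centroid(x), h_linear(x))   # both rules agree on every point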

\begin{eqnarray*}
\fkx[i] &=& \fkx[j] \\
g(\fkx[i]) &=& g(\fkx[j]) \\
\bm{w}_i^\top \xv + b_i &=& \bm{w}_j^\top \xv + b_j \\
\left(\bm{w}_i - \bm{w}_j\right)^\top \xv + \left(b_i - b_j\right) &=& 0
\end{eqnarray*}

This represents a \textbf{hyperplane} separating two classes:

\begin{center}
\includegraphics[width=0.33\textwidth]{figure_man/linear_boundary.png}
\end{center}


\end{vbframe}

\begin{vbframe}{linear separability}
If there exists a linear classifier that perfectly separates the classes of some dataset, the data are called \textbf{linearly separable}.
