Edits Classification Ludwig
ludwigbothmann committed Oct 31, 2024
1 parent a7ce7ca commit d9433a1
Showing 4 changed files with 24 additions and 24 deletions.
@@ -14,7 +14,7 @@
}{% Relative path to title page image: Can be empty but must not start with slides/
figure/db_examples
}{% Learning goals, wrapped inside itemize environment
\item Understand why classifiers output a score / probability and not a class
\item Understand why some classifiers output a score / probability % and not a class
\item Understand the difference between scoring and probabilistic classifiers
\item Know the concept of decision regions and boundaries
\item Know the difference between generative and discriminant approach
@@ -187,7 +187,7 @@
\begin{vbframe}{Generative approach}

The \textbf{generative approach}
models $\pdfxyk$, usually by making some assumptions about the structure of these distributions, and employs the Bayes' theorem:
models $\pdfxyk$ by making assumptions about these distributions, and employs Bayes' theorem:
$$\pikx = \postk = \frac{\P(\xv | y = k) \P(y = k)}{\P(\xv)} = \frac{\pdfxyk \pik}{\sumjg \pdfxyk[j] \pi_j}$$

The prior probabilities, $\pi_k = \P(y = k)$, for each class $k$ can be estimated from the training data as the relative frequency of each class:
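To make the estimation step concrete, here is a minimal Python sketch (not part of the slides; it assumes Gaussian class-conditional densities as one possible structural assumption, and all names are illustrative). It estimates the priors as relative class frequencies and evaluates the posterior via Bayes' theorem:

import numpy as np
from scipy.stats import multivariate_normal

def fit_generative(X, y):
    """Estimate priors pi_k and Gaussian class-conditionals p(x | y = k)."""
    classes = np.unique(y)
    priors = {k: np.mean(y == k) for k in classes}   # relative frequencies
    densities = {
        k: multivariate_normal(X[y == k].mean(axis=0),
                               np.cov(X[y == k], rowvar=False))
        for k in classes
    }
    return classes, priors, densities

def posterior(x, classes, priors, densities):
    """Bayes' theorem: pi_k(x) = p(x | k) pi_k / sum_j p(x | j) pi_j."""
    joint = np.array([densities[k].pdf(x) * priors[k] for k in classes])
    return joint / joint.sum()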
@@ -200,7 +200,7 @@
\end{vbframe}

\begin{vbframe}{Visualizing the generative approach}
The decision boundary between classes is implicitly defined by the overlapping regions of their probability distributions.
The decision boundary between classes is implicitly defined by their probability distributions.

\begin{center}
\includegraphics[width=0.5\textwidth]{figure/approach_generative.png}
@@ -14,7 +14,7 @@
}{% Relative path to title page image: Can be empty but must not start with slides/
figure_man/linear_boundary.png
}{% Learning goals, wrapped inside itemize environment
\item Know the definition of a linear classifier
\item Know definition of linear classifier
\item Understand the connection between linear classifiers and linear decision boundaries
\item Grasp linear separability
}
@@ -26,7 +26,7 @@

Linear classifiers are an important subclass of classification models.
If the discriminant function(s) $\fkx$ can be specified as linear function(s) (possibly through a rank-preserving,
monotone transformation $g: \R \to \R$), i. e.
monotone transformation $g: \R \to \R$), i.e.,

$$
g(\fkx) = \bm{w}_k^\top \xv + b_k,
@@ -42,7 +42,7 @@
\end{vbframe}

\begin{vbframe}{Example}
We want to classify a point $\xv$ by assigning it to the class whose centroid $\muk$ is closest (least distance $d$), i.e., we assign $\xv$ to class 1 if $d_1 < d_2$:
We classify a point $\xv$ by assigning it to the class $k$ whose centroid $\muk$ is closest (least distance $d$), i.e., we assign $\xv$ to class 1 if $d_1 < d_2$:
$$
d_1 = ||\xv - \muk[1]||^2 = \xv^\top \xv - 2 \xv^\top \muk[1] + \muk[1]^\top \muk[1]
< \xv^\top \xv - 2 \xv^\top \muk[2] + \muk[2]^\top \muk[2] = ||\xv - \muk[2]||^2 = d_2
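One way to spell out the step this leads to (the $\xv^\top \xv$ terms cancel, so the rule is linear):
$$
d_1 < d_2 \;\Leftrightarrow\; 2 \xv^\top (\muk[1] - \muk[2]) > \muk[1]^\top \muk[1] - \muk[2]^\top \muk[2]
\;\Leftrightarrow\; \bm{w}^\top \xv + b > 0,
$$
with $\bm{w} = 2(\muk[1] - \muk[2])$ and $b = \muk[2]^\top \muk[2] - \muk[1]^\top \muk[1]$, i.e., a linear classifier.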
34 changes: 17 additions & 17 deletions slides/supervised-classification/slides-classification-logistic.tex
@@ -14,7 +14,7 @@
}{% Relative path to title page image: Can be empty but must not start with slides/
figure/log_reg-scores
}{% Learning goals, wrapped inside itemize environment
\item Understand the definition of the logit model
\item Understand definition of logit model
\item Understand how a reasonable loss function for binary classification can be derived
\item Know the hypothesis space that belongs to the logit model
}
@@ -24,22 +24,21 @@

\begin{vbframe}{Motivation}

A \textbf{discriminant} approach for directly modeling the posterior probabilities $\pixt$ of the labels is \textbf{logistic regression}.

\lz

For now, let's focus on the binary case $y \in \setzo$ and use empirical risk minimization.
We are looking for a \textbf{discriminant} approach for modeling the posterior probabilities $\pixt$. % of the labels is \textbf{logistic regression}.
%\lz
We focus on binary $y \in \setzo$ and use ERM:

$$ \argmin_{\thetav \in \Theta} \risket = \argmin_{\thetav \in \Theta} \sumin \Lpixyit.$$

\lz
For this, we have to define a loss function $L$ and a model $\pixt$.
A naive approach would be to model $\pixt$ as a simple linear model $\thx$
(NB: We will often suppress the intercept in notation).

\end{vbframe}
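As a quick illustration of this naive approach (a sketch with made-up toy data, not from the slides), fitting $\pixt = \thx$ by least squares shows why a plain linear model is not a proper probability model:

import numpy as np

# Toy 1D binary data: one feature, labels in {0, 1}
rng = np.random.default_rng(0)
x = np.concatenate([rng.normal(-2, 1, 50), rng.normal(2, 1, 50)])
y = np.concatenate([np.zeros(50), np.ones(50)])
X = np.column_stack([np.ones_like(x), x])        # explicit intercept column

# Naive model: pi(x) = theta^T x, fitted with L2 loss (ordinary least squares)
theta, *_ = np.linalg.lstsq(X, y, rcond=None)
fitted = X @ theta
print(fitted.min(), fitted.max())   # typically falls outside [0, 1], so not a probability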

\begin{vbframe}{Linear models of probabilities}
E.g., fitting a simple linear model $\pixt = \thx$ on some example data with one feature $\xv \in \R$ and L2 loss:
Fitting a simple linear model $\pixt = \thx$ on data with one feature $x \in \R$ and L2 loss:

\lz

@@ -105,25 +104,25 @@

\begin{vbframe}{Deriving a loss function}

We need to find a suitable loss function to use \textbf{ERM}. Starting from the likelihood function $\LL$ for the binary case:
We need to find a suitable loss function to use \textbf{ERM}. Starting from the likelihood function $\LL$ (for one observation):
\begin{small}
\begin{align*}
\LLt &= \prod_{i \text{ with } \yi = 1} \pixit \prod_{i \text{ with } \yi = 0} (1-\pixit) \\
&= \pixit^{\yi} (1-\pixit)^{1-\yi}
\LLt %&= \prod_{i \text{ with } \yi = 1} \pixit \prod_{i \text{ with } \yi = 0} (1-\pixit) \\
&= \pixit^{\yi} \left(1-\pixit\right)^{1-\yi}
\end{align*}
\end{small}
Taking the log to convert products into sums:
\begin{small}
\begin{align*}
\loglt &= \log \LLt = \log(\pixit^{\yi} (1-\pixit)^{1-\yi}) \\
&= \yi \log(\pixit) + (1-\yi)\log(1-\pixit)
\loglt &= \log \LLt = \log\left(\pixit^{\yi} \left(1-\pixit\right)^{1-\yi}\right) \\
&= \yi \log\left(\pixit\right) + \left(1-\yi\right)\log\left(1-\pixit\right)
\end{align*}
\end{small}

Since we want to minimize our risk, we work with the negative $\loglt$:
Since we want to minimize the risk, we work with the negative $\loglt$:
\begin{small}
\begin{align*}
- \loglt = - \yi \log(\pixit) - (1-\yi)\log(1-\pixit)
- \loglt = - \yi \log\left(\pixit\right) - \left(1-\yi\right)\log\left(1-\pixit\right)
\end{align*}
\end{small}
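A tiny numeric check (a Python sketch, not part of the slides) shows how this loss behaves for a single observation with $\yi = 1$:

import numpy as np

def bernoulli_nll(y, pi):
    """Negative log-likelihood (log loss) for one observation."""
    return -(y * np.log(pi) + (1 - y) * np.log(1 - pi))

print(bernoulli_nll(1, 0.9))    # ~0.105: confident and correct, small loss
print(bernoulli_nll(1, 0.5))    # ~0.693: maximally uncertain
print(bernoulli_nll(1, 0.01))   # ~4.605: confidently wrong, large loss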

@@ -143,14 +142,15 @@
\begin{itemize}
\item penalizes confidently wrong predictions heavily
\item is used for many other classifiers, e.g., in NNs or boosting
\item has no analytical solution for \textbf{optimization} (non-linear, non-convex)! Thus, we use \textbf{numerical optimization}, typically gradient-based methods, to fit a logistic regression model.
\item has no analytical solution for \textbf{optimization} (non-linear%, non-convex
)! Thus, we use \textbf{numerical optimization}, typically gradient-based methods, to fit a logistic regression model.
\end{itemize}


\end{vbframe}
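The last bullet points to gradient-based fitting. A minimal sketch of plain gradient descent on the mean negative log-likelihood, assuming a design matrix X that already contains an intercept column (function and parameter names are illustrative; real implementations use more robust optimizers):

import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def fit_logreg_gd(X, y, lr=0.1, n_iter=5000):
    """Plain gradient descent on the mean negative log-likelihood (log loss)."""
    theta = np.zeros(X.shape[1])
    for _ in range(n_iter):
        pi = sigmoid(X @ theta)          # pi(x | theta) = s(theta^T x)
        grad = X.T @ (pi - y) / len(y)   # gradient of the mean log loss
        theta -= lr * grad
    return theta

# e.g., theta_hat = fit_logreg_gd(X, y) with the toy X, y from the earlier sketch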

\begin{vbframe}{Logistic Regression in 1D}
Using logistic regression on our example data with one feature $\xv \in \R$, we again see $\xv \mapsto \pix$, with $\pix \in [0,1]$.
Using logistic regression on our example data with one feature $x \in \R$, we again see $\xv \mapsto \pix$, with $\pix \in [0,1]$.

\lz

Expand All @@ -161,7 +161,7 @@

\begin{vbframe}{Logistic Regression in 2D}

Obviously, logistic regression is a linear classifier, as $\pixt = s\left( \thx \right)$
Logistic regression is a linear classifier, as $\pixt = s\left( \thx \right)$
and $s$ is isotonic.
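One way to make this step explicit (using that the logistic function satisfies $s(0) = 0.5$ and is strictly increasing):
$$
\pixt \ge 0.5 \;\Leftrightarrow\; s(\thx) \ge s(0) \;\Leftrightarrow\; \thx \ge 0,
$$
so the decision boundary $\{\xv : \thx = 0\}$ is a hyperplane.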

\lz
@@ -18,7 +18,7 @@
}

\begin{vbframe}{Classification}
Learn functions that assign class labels to observation / feature vectors. Each observation belongs to exactly one class. The main difference to regression is the scale of the output / label.
Learn functions that assign class labels to observation / feature vectors. Each observation belongs to exactly one class. The main difference to regression is the scale of the target.
{\centering \includegraphics[width= .7\textwidth]{figure_man/classifier.pdf}}

\end{vbframe}
