
Commit

Final version of ML2 summary
phlippe committed Oct 22, 2019
1 parent 4dac7ea commit 06d3029
Showing 8 changed files with 9 additions and 9 deletions.
Binary file added Final_versions/Machine_Learning_2.pdf
Binary file not shown.
4 changes: 2 additions & 2 deletions Machine_Learning_2/ml2_appendix.tex
@@ -52,7 +52,7 @@ \subsubsection{Gaussian}
x & x^2\\
\end{bmatrix}^T\\
& h(\bm{x}) = \frac{1}{\sqrt{2\pi}}\\
- & g(\bm{\eta}) = \frac{1}{\sigma}\exp\left(-\frac{\eta_1^2}{4\cdot \eta_2}\right)\\
+ & g(\bm{\eta}) = (-2\eta_2)^{\frac{1}{2}}\exp\left(\frac{\eta_1^2}{4\cdot \eta_2}\right)\\
\end{split}
\end{equation*}
\end{fleqn}
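
A quick numerical sanity check of this univariate form, as a minimal sketch with arbitrary values for mu, sigma and x: evaluate h(x) g(eta) exp(eta^T u(x)) with the natural parameters above and compare it against the standard Gaussian density.

import numpy as np

mu, sigma, x = 1.3, 0.7, 0.25
eta1, eta2 = mu / sigma**2, -1.0 / (2 * sigma**2)      # natural parameters

h = 1.0 / np.sqrt(2 * np.pi)                            # h(x)
g = np.sqrt(-2 * eta2) * np.exp(eta1**2 / (4 * eta2))   # normaliser g(eta)
p_expfam = h * g * np.exp(eta1 * x + eta2 * x**2)       # h(x) g(eta) exp(eta^T u(x))

p_gauss = np.exp(-(x - mu)**2 / (2 * sigma**2)) / (sigma * np.sqrt(2 * np.pi))
assert np.isclose(p_expfam, p_gauss)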
@@ -68,7 +68,7 @@ \subsubsection{Gaussian}
\bm{x} & \bm{x}\bm{x}^T\\
\end{bmatrix}^T\\
& h(\bm{x}) = (2\pi)^{-D/2}\\
- & g(\bm{\eta}) = |-2\bm{\eta}_1| \cdot \exp\left(\frac{1}{4}\bm{\eta}_{1}^T\bm{\eta}_{2}^{-1}\bm{\eta}_{1}\right)\\
+ & g(\bm{\eta}) = |-2\bm{\eta}_2|^{\frac{1}{2}} \cdot \exp\left(\frac{1}{4}\bm{\eta}_{1}^T\bm{\eta}_{2}^{-1}\bm{\eta}_{1}\right)\\
\end{split}
\end{equation*}
\end{fleqn}
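
The analogous check for the multivariate form, again a sketch: a randomly generated SPD matrix stands in for Sigma, and the standard multivariate Gaussian density is recomputed with numpy as the reference.

import numpy as np

rng = np.random.default_rng(0)
D = 3
mu = rng.normal(size=D)
A = rng.normal(size=(D, D))
Sigma = A @ A.T + np.eye(D)              # random SPD covariance (illustrative)
x = rng.normal(size=D)

Lam = np.linalg.inv(Sigma)               # precision matrix Sigma^{-1}
eta1, eta2 = Lam @ mu, -0.5 * Lam        # natural parameters as above

h = (2 * np.pi) ** (-D / 2)
g = np.sqrt(np.linalg.det(-2 * eta2)) * np.exp(0.25 * eta1 @ np.linalg.inv(eta2) @ eta1)
p_expfam = h * g * np.exp(eta1 @ x + x @ eta2 @ x)

p_ref = np.exp(-0.5 * (x - mu) @ Lam @ (x - mu)) / np.sqrt((2 * np.pi) ** D * np.linalg.det(Sigma))
assert np.isclose(p_expfam, p_ref)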
2 changes: 1 addition & 1 deletion Machine_Learning_2/ml2_exponential_family.tex
@@ -253,7 +253,7 @@ \subsection{Information theory}
\item We can also define conditional entropy, which is as follows:
\begin{equation*}
\tcbox[nobeforeafter]{\(
- H(\bm{x}|\bm{y}) = -\int p(\bm{x})\left[\int p(\bm{y}|\bm{x})\ln p(\bm{y}|\bm{x})d\bm{y}\right]d\bm{x}
+ H(\bm{y}|\bm{x}) = -\int p(\bm{x})\left[\int p(\bm{y}|\bm{x})\ln p(\bm{y}|\bm{x})d\bm{y}\right]d\bm{x}
\)}
\end{equation*}
with the property $H(\bm{x},\bm{y})=H(\bm{x})+H(\bm{y}|\bm{x})=H(\bm{y})+H(\bm{x}|\bm{y})$
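
A small discrete illustration of the conditional entropy and the chain-rule property above, using a made-up 2x2 joint table p(x, y) and natural logarithms as in the notes.

import numpy as np

p_xy = np.array([[0.30, 0.10],
                 [0.15, 0.45]])          # joint p(x, y); rows: x, columns: y
p_x = p_xy.sum(axis=1)
p_y_given_x = p_xy / p_x[:, None]

H_joint = -np.sum(p_xy * np.log(p_xy))                                   # H(x, y)
H_x = -np.sum(p_x * np.log(p_x))                                         # H(x)
H_y_given_x = -np.sum(p_x[:, None] * p_y_given_x * np.log(p_y_given_x))  # H(y|x)

assert np.isclose(H_joint, H_x + H_y_given_x)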
4 changes: 2 additions & 2 deletions Machine_Learning_2/ml2_graphical_models.tex
@@ -559,7 +559,7 @@ \subsubsection{Sum-product algorithm}
\begin{equation*}
\tcbox[nobeforeafter]{\(
\begin{split}
- \textbf{Factor$\to$Variable:} & \hspace{2mm} \mu_{\alpha\to i}(x_i)=\sum_{\bm{x}_{\alpha}} f_{\alpha}(\bm{x}_{\alpha})\prod_{j\in \alpha\setminus i}\mu_{j\to\alpha}(x_j)\\
+ \textbf{Factor$\to$Variable:} & \hspace{2mm} \mu_{\alpha\to i}(x_i)=\sum_{\bm{x}_{\alpha\setminus i}} f_{\alpha}(\bm{x}_{\alpha})\prod_{j\in \alpha\setminus i}\mu_{j\to\alpha}(x_j)\\
& \text{If $\alpha$ leaf node:} \hspace{2mm}\mu_{\alpha\to i}(x_i)=\sum_{\bm{x}_{\alpha}} f_{\alpha}(\bm{x}_{\alpha})\\
\textbf{Variable$\to$Factor:} & \hspace{2mm} \mu_{j\to \alpha}(x_j)=\prod_{\beta\in \text{ne}(j)\setminus \alpha}\mu_{\beta\to j}(x_j)\\
&\text{If $j$ leaf node:} \hspace{2mm}\mu_{j\to \alpha}(x_j)=1
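
A minimal sketch of how the two sum-product message types combine on a tiny chain factor graph x1 -- f_a -- x2 -- f_b -- x3 with binary variables; the factor tables are made up, and the marginal of x2 is checked against brute-force enumeration.

import numpy as np

f_a = np.array([[1.0, 2.0], [0.5, 1.5]])   # f_a[x1, x2]
f_b = np.array([[2.0, 1.0], [1.0, 3.0]])   # f_b[x2, x3]

# Leaf variable->factor messages from x1 and x3 are all ones.
mu_x1_to_a = np.ones(2)
mu_x3_to_b = np.ones(2)

# Factor->variable: sum out all variables of the factor except the target.
mu_a_to_x2 = (f_a * mu_x1_to_a[:, None]).sum(axis=0)   # sum over x1
mu_b_to_x2 = (f_b * mu_x3_to_b[None, :]).sum(axis=1)   # sum over x3

# Marginal of x2 is the product of incoming messages, up to normalisation.
p_x2 = mu_a_to_x2 * mu_b_to_x2
p_x2 /= p_x2.sum()

# Brute-force check: joint proportional to f_a(x1,x2) * f_b(x2,x3).
joint = f_a[:, :, None] * f_b[None, :, :]
assert np.allclose(p_x2, joint.sum(axis=(0, 2)) / joint.sum())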
@@ -634,7 +634,7 @@ \subsubsection{Max-sum algorithm}
\tcbox[nobeforeafter]{\(
\begin{split}
\textbf{Factor$\to$Variable:} & \hspace{2mm} \nu_{\alpha\to i}(x_i)=\max_{\bm{x}_{\alpha\setminus i}} \log f_{\alpha}(\bm{x}_{\alpha}) + \sum_{j\in \alpha\setminus i}\nu_{j\to\alpha}(x_j)\\
- & \text{If $\alpha$ leaf node:} \hspace{2mm}\nu_{\alpha\to i}(x_i)=\max_{\bm{x}_{\alpha\setminus i}} f_{\alpha}(\bm{x}_{\alpha})\\
+ & \text{If $\alpha$ leaf node:} \hspace{2mm}\nu_{\alpha\to i}(x_i)=\max_{\bm{x}_{\alpha\setminus i}}\log f_{\alpha}(\bm{x}_{\alpha})\\
\textbf{Variable$\to$Factor:} & \hspace{2mm} \nu_{j\to \alpha}(x_j)=\sum_{\beta\in \text{ne}(j)\setminus \alpha}\nu_{\beta\to j}(x_j)\\
&\text{If $j$ leaf node:} \hspace{2mm}\nu_{j\to \alpha}(x_j)=0
\end{split}
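
A companion sketch for max-sum on the same made-up chain as in the previous hunk: sums become maxima, factors enter in log space, and the result is the maximum of the unnormalised log joint.

import numpy as np

f_a = np.array([[1.0, 2.0], [0.5, 1.5]])
f_b = np.array([[2.0, 1.0], [1.0, 3.0]])
log_fa, log_fb = np.log(f_a), np.log(f_b)

nu_x1_to_a = np.zeros(2)                    # leaf variable messages are 0
nu_x3_to_b = np.zeros(2)

nu_a_to_x2 = (log_fa + nu_x1_to_a[:, None]).max(axis=0)   # max over x1
nu_b_to_x2 = (log_fb + nu_x3_to_b[None, :]).max(axis=1)   # max over x3

max_log_p = (nu_a_to_x2 + nu_b_to_x2).max()               # max of unnormalised log joint
joint = f_a[:, :, None] * f_b[None, :, :]
assert np.isclose(max_log_p, np.log(joint).max())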
2 changes: 1 addition & 1 deletion Machine_Learning_2/ml2_sampling_methods.tex
@@ -51,7 +51,7 @@ \subsection{Rejection sampling}
\label{fig:sampling_rejection_sampling}
\end{figure}
\item To show that we actually generate samples from $p(z)$, we can write down the probability for a value $z_i$ to be picked. First, the chance of $z_i$ being generated in the first place is $q(z_i)$. Next, the chance of $z_i$ being accepted is $\frac{\tilde{p}(z_i)}{\tilde{q}(z_i)}$. Together, we get the probability of $z_i$:
- $$\hat{p}(z_i) = q_(z_i)\frac{\tilde{p}(z_i)}{\tilde{q}(z_i)} \propto p(z_i)$$
+ $$\hat{p}(z_i) = q(z_i)\frac{\tilde{p}(z_i)}{\tilde{q}(z_i)} \propto p(z_i)$$
Hence, we actually generate samples from $p(z)$, although we initially sample from $q(z)$
\item One requirement for rejection sampling to work well is that the area between $\tilde{p}(z)$ and $\tilde{q}(z)$ is small. The efficiency of this sampler can be measured by the acceptance rate, which is $\E_{z_i\sim q}\left[\frac{\tilde{p}(z_i)}{\tilde{q}(z_i)} \right]$. If this value is low, it means that a lot of samples are rejected, and hence the sampling process takes longer. This becomes especially critical in higher dimensions, as we need to make sure that \textit{for all} $z_i$, $\tilde{q}(z_i)$ is greater than $\tilde{p}(z_i)$. Finding a simple distribution in high dimensions that fulfills this requirement is often not trivial
\end{itemize}
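
A minimal rejection-sampling sketch, assuming a hand-picked Gaussian proposal q(z) and a constant k chosen so that the envelope q_tilde(z) = k * q(z) dominates the made-up unnormalised target p_tilde(z).

import numpy as np

rng = np.random.default_rng(0)

def p_tilde(z):                          # unnormalised bimodal target (made up)
    return np.exp(-0.5 * (z - 2.0) ** 2) + 0.5 * np.exp(-0.5 * (z + 2.0) ** 2)

def q_pdf(z):                            # proposal q(z) = N(0, 3^2)
    return np.exp(-z ** 2 / 18.0) / (3.0 * np.sqrt(2 * np.pi))

k = 10.0                                 # large enough that k * q(z) >= p_tilde(z)

accepted = []
while len(accepted) < 5000:
    z = rng.normal(0.0, 3.0)             # draw from the proposal
    u = rng.uniform(0.0, k * q_pdf(z))   # uniform height under the envelope
    if u <= p_tilde(z):                  # accept with probability p_tilde / (k q)
        accepted.append(z)
samples = np.array(accepted)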
4 changes: 2 additions & 2 deletions Machine_Learning_2/ml2_sequential_data.tex
@@ -93,7 +93,7 @@ \subsection{Hidden Markov Models}

The mapping between latent and observed variables is described by the emission probabilities, which we can write as $p(\bm{x}_n|\bm{z}_n,\bm{\phi})=\prod_{k=1}^{K} p(\bm{x}_n|\bm{\phi}_k)^{z_{nk}}$

- \item For optimizing the parameters, we again use the EM algorithm
+ \item For optimizing the parameters, we again use the EM algorithm because otherwise we would need to calculate $p(\bm{X}|\bm{\theta})=\sum_Z p(\bm{X},\bm{Z}|\bm{\theta})$. The number of terms in this sum grows exponentially with the number of hidden variables, which makes a direct evaluation intractable for large $N$.

\end{itemize}
\subsubsection{Maximum Likelihood for HMM}
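
To make the cost of that direct sum concrete, here is a toy sketch with a small, made-up HMM: it enumerates all K**N latent paths to evaluate p(X|theta) by brute force, then checks the result against the standard forward recursion, which needs only O(N K^2) operations.

import itertools
import numpy as np

K, N = 3, 6
rng = np.random.default_rng(0)
pi = np.full(K, 1.0 / K)                         # initial state distribution
A = rng.dirichlet(np.ones(K), size=K)            # transition matrix A[j, k]
B = rng.dirichlet(np.ones(4), size=K)            # emission probs over 4 symbols
x = rng.integers(0, 4, size=N)                   # an observed sequence

p_X = 0.0
for z in itertools.product(range(K), repeat=N):  # K**N = 729 terms already
    p = pi[z[0]] * B[z[0], x[0]]
    for n in range(1, N):
        p *= A[z[n - 1], z[n]] * B[z[n], x[n]]
    p_X += p

# The forward recursion gives the same value in O(N K^2).
alpha = pi * B[:, x[0]]
for n in range(1, N):
    alpha = (alpha @ A) * B[:, x[n]]
assert np.isclose(p_X, alpha.sum())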
@@ -170,7 +170,7 @@ \subsubsection{Maximum Likelihood for HMM}
Performing the maximization, we get:
\begin{equation*}
\begin{split}
- \pi_k^{\text{new}} & =\frac{\gamma(z_{1k})}{\sum_{k=1}^{K}\gamma(z_{1j})}, \hspace{5mm} A_{jk} = \frac{\sum_{n=2}^{N} \zeta(z_{n-1,j}, z_{nk})}{\sum_{l=1}^{K}\sum_{n=2}^{N} \zeta(z_{n-1,j}, z_{nl})}
+ \pi_k^{\text{new}} & =\frac{\gamma(z_{1k})}{\sum_{j=1}^{K}\gamma(z_{1j})}, \hspace{5mm} A_{jk} = \frac{\sum_{n=2}^{N} \zeta(z_{n-1,j}, z_{nk})}{\sum_{l=1}^{K}\sum_{n=2}^{N} \zeta(z_{n-1,j}, z_{nl})}
\end{split}
\end{equation*}
Solving the same for the parameters $\bm{\phi}$ depends on the form of the emission probability that was chosen. For example, if we have a Gaussian density $p(\bm{x}|\bm{\phi}_k)$, the optimized parameters are:
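
A short sketch of these M-step updates for pi and A, assuming the E-step quantities are stored as arrays gamma[n, k] for gamma(z_nk) and xi[m, j, k] for zeta(z_{n-1,j}, z_{nk}) with n = 2..N; these array names and shapes are conventions of the sketch, not from the notes.

import numpy as np

def m_step(gamma, xi):
    # gamma has shape (N, K); xi has shape (N - 1, K, K).
    pi_new = gamma[0] / gamma[0].sum()
    A_new = xi.sum(axis=0)                       # numerator: sum over n = 2..N
    A_new /= A_new.sum(axis=1, keepdims=True)    # denominator: sum over l for each row j
    return pi_new, A_new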
Binary file modified Machine_Learning_2/ml2_summary.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion Machine_Learning_2/ml2_variational_EM.tex
@@ -28,7 +28,7 @@ \section{Variational Expectation Maximization}
\begin{description}
\item[E-step] Find the posterior distribution $p(\bm{Z}|\bm{X},\bm{\theta}^{\text{old}})$ where $\bm{\theta}^{\text{old}}$ means that the parameters are kept fixed at their current values
\item[M-step] Optimize the log-likelihood with respect to parameters $\bm{\theta}$ while keeping the posterior fixed
- $$\bm{\theta}^{\text{new}} = \arg\max_{\bm{\theta}} \mathcal{Q}(\bm{\theta}, \bm{\theta}^{\text{old}}) = \arg\max_{\bm{\theta}} \sum_{\bm{Z}} p(\bm{Z}|\bm{X},\bm{\theta}) \ln p(\bm{X}, \bm{Z}|\bm{\theta})$$
+ $$\bm{\theta}^{\text{new}} = \arg\max_{\bm{\theta}} \mathcal{Q}(\bm{\theta}, \bm{\theta}^{\text{old}}) = \arg\max_{\bm{\theta}} \sum_{\bm{Z}} p(\bm{Z}|\bm{X},\bm{\theta}^{\text{old}}) \ln p(\bm{X}, \bm{Z}|\bm{\theta})$$
\end{description}
\item In case we want to find the MAP instead of the MLE, we simply have to add the prior term $\ln p(\bm{\theta})$ to $\mathcal{Q}(\bm{\theta}, \bm{\theta}^{\text{old}})$ in the M-step
\end{itemize}
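
A concrete instance of these two steps, sketched for a toy two-component 1-D Gaussian mixture with fixed unit variances; the data, the initialisation and the iteration count are made up.

import numpy as np

rng = np.random.default_rng(0)
x = np.concatenate([rng.normal(-2, 1, 200), rng.normal(3, 1, 300)])

w, mu = np.array([0.5, 0.5]), np.array([-1.0, 1.0])        # initial theta
for _ in range(50):
    # E-step: responsibilities p(z = k | x, theta_old)
    dens = w * np.exp(-0.5 * (x[:, None] - mu) ** 2) / np.sqrt(2 * np.pi)
    resp = dens / dens.sum(axis=1, keepdims=True)
    # M-step: maximise Q w.r.t. theta with the responsibilities held fixed
    Nk = resp.sum(axis=0)
    w, mu = Nk / len(x), (resp * x[:, None]).sum(axis=0) / Nk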
