% unit10_cnn/prob/prob_cnn.tex

\documentclass[11pt]{article}

\usepackage{fullpage}
\usepackage{amsmath, amssymb, bm, cite, epsfig, psfrag}
\usepackage{graphicx}
\usepackage{float}
\usepackage{amsthm}
\usepackage{amsfonts}
\usepackage{listings}
\usepackage{cite}
\usepackage{hyperref}
\usepackage{tikz}
\usepackage{enumerate}
\usepackage{listings}
\usepackage{mathtools}
\usepackage{mdframed}

\lstloadlanguages{Python}
\usetikzlibrary{shapes,arrows}
%\usetikzlibrary{dsp,chains}

% Fixed-size 9pt typewriter fonts (T1 encoding, txtt family) used by the
% Python listings style defined further down in this preamble.
\DeclareFixedFont{\ttb}{T1}{txtt}{bx}{n}{9} % for bold
\DeclareFixedFont{\ttm}{T1}{txtt}{m}{n}{9}  % for normal
% Defining colors
\usepackage{color}
% Named colors referenced by the Python listings style below:
% deepblue (keywords), deepred (emphasized identifiers),
% deepgreen (comments and strings), backcolour (listing background).
\definecolor{deepblue}{rgb}{0,0,0.5}
\definecolor{deepred}{rgb}{0.6,0,0}
\definecolor{deepgreen}{rgb}{0,0.5,0}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}

%\restylefloat{figure}
%\theoremstyle{plain}      \newtheorem{theorem}{Theorem}
%\theoremstyle{definition} \newtheorem{definition}{Definition}

% General math shorthands.  All of these use \def, which overwrites any
% existing definition of the same name without warning.

% Partial-derivative symbol.
\def\del{\partial}
% Style switches: force display style / text style inside a formula.
\def\ds{\displaystyle}
\def\ts{\textstyle}
% Begin/end shortcuts for equation, eqnarray and eqnarray* environments,
% plus a short name for \nonumber.
\def\beq{\begin{equation}}
\def\eeq{\end{equation}}
\def\beqa{\begin{eqnarray}}
\def\eeqa{\end{eqnarray}}
\def\beqan{\begin{eqnarray*}}
\def\eeqan{\end{eqnarray*}}
\def\nn{\nonumber}
% Upright operator name "binomial".
\def\binomial{\mathop{\mathrm{binomial}}}
% One half: \half is forced to text style, \Half follows the current style.
\def\half{{\ts\frac{1}{2}}}
\def\Half{{\frac{1}{2}}}
% Blackboard-bold number sets.  The extra inner braces let the macro be used
% directly as a sub/superscript (e.g. x^\R).
\def\N{{\mathbb{N}}}
\def\Z{{\mathbb{Z}}}
\def\Q{{\mathbb{Q}}}
\def\R{{\mathbb{R}}}
\def\C{{\mathbb{C}}}
% Upright operators.  Being \mathop without \nolimits, their subscripts are
% placed underneath in display style.
\def\argmin{\mathop{\mathrm{arg\,min}}}
\def\argmax{\mathop{\mathrm{arg\,max}}}
%\def\span{\mathop{\mathrm{span}}}
\def\diag{\mathop{\mathrm{diag}}}
% Multiplication cross.
\def\x{\times}
% Limits as n tends to infinity.
\def\limn{\lim_{n \rightarrow \infty}}
\def\liminfn{\liminf_{n \rightarrow \infty}}
\def\limsupn{\limsup_{n \rightarrow \infty}}
% Separators with thin spaces on both sides: a vertical bar and a semicolon.
\def\MID{\,|\,}
\def\MIDD{\,;\,}

% Theorem-like environments.  None of them shares a counter with another, so
% each kind is numbered independently.
\newtheorem{proposition}{Proposition}
\newtheorem{definition}{Definition}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{assumption}{Assumption}
\newtheorem{claim}{Claim}
% End-of-proof mark: a box pushed to the right end of the line.
% NOTE(review): amsthm (loaded above) also provides \qed; this \def replaces
% it without warning -- confirm that is intended.
\def\qed{\mbox{} \hfill $\Box$}
% Unit length for picture-mode coordinates.
\setlength{\unitlength}{1mm}

% Scalar symbols with a wide hat (\widehat) or an overline (\overline).
\def\bhat{\widehat{b}}
\def\ehat{\widehat{e}}
\def\phat{\widehat{p}}
\def\qhat{\widehat{q}}
\def\rhat{\widehat{r}}
\def\shat{\widehat{s}}
\def\uhat{\widehat{u}}
\def\ubar{\overline{u}}
\def\vhat{\widehat{v}}
\def\xhat{\widehat{x}}
\def\xbar{\overline{x}}
\def\zhat{\widehat{z}}
\def\zbar{\overline{z}}
% Short names for the left and right arrows.
\def\la{\leftarrow}
\def\ra{\rightarrow}
% Metric names set as small sans-serif text inside an \mbox.
\def\MSE{\mbox{\small \sffamily MSE}}
\def\SNR{\mbox{\small \sffamily SNR}}
\def\SINR{\mbox{\small \sffamily SINR}}
% Second alias for \rightarrow (same expansion as \ra).
\def\arr{\rightarrow}
% Expectation symbol and the variance / trace operators (math mode).
% \var and \Tr are typeset as upright operator names via \mathop ... \nolimits,
% in the same way as \diag and \argmin earlier in this preamble.  Unlike an
% \mbox, this gives them operator spacing and lets them shrink correctly
% inside sub- and superscripts; \nolimits keeps any subscript to the side.
\def\Exp{\mathbb{E}}
\def\var{\mathop{\mathrm{var}}\nolimits}
\def\Tr{\mathop{\mathrm{Tr}}\nolimits}
% Compact time-index shorthands for t-1, t+1, T-1 and T+1.
% TeX parsing note: "\def\tm1{...}" defines the macro \tm with the delimiter
% "1" (a digit cannot be part of a control-word name), so it must always be
% written as \tm1; \tm followed by anything other than 1 is an error.  The
% same holds for \tp1, \Tm1 and \Tp1.
% \! is a negative thin space that tightens the gap around the sign.
\def\tm1{t\! - \! 1}
\def\tp1{t\! + \! 1}
\def\Tm1{T\! - \! 1}
\def\Tp1{T\! + \! 1}


% Calligraphic X.  Uses \mathcal, like \Pset and \Norm later in this
% preamble, instead of the obsolete \cal font switch.
\def\Xset{{\mathcal X}}

% Bold (\mathbf) symbols for vectors and matrices, together with their
% hatted, tilde and overlined variants.  Naming scheme: <letter>bf is the
% bold letter; a "hat" / "bar" / "tilde" in the name adds \widehat /
% \overline / \tilde.  Names without "bf" (e.g. \Ahat) are non-bold.

% Bold digit one.
\newcommand*{\one}{\mathbf{1}}

% Lowercase (vectors).
\newcommand*{\abf}{\mathbf{a}}
\newcommand*{\bbf}{\mathbf{b}}
\newcommand*{\dbf}{\mathbf{d}}
\newcommand*{\ebf}{\mathbf{e}}
\newcommand*{\gbf}{\mathbf{g}}
\newcommand*{\hbf}{\mathbf{h}}
\newcommand*{\pbf}{\mathbf{p}}
\newcommand*{\pbfhat}{\widehat{\mathbf{p}}}
\newcommand*{\qbf}{\mathbf{q}}
\newcommand*{\qbfhat}{\widehat{\mathbf{q}}}
\newcommand*{\rbf}{\mathbf{r}}
\newcommand*{\rbfhat}{\widehat{\mathbf{r}}}
\newcommand*{\sbf}{\mathbf{s}}
\newcommand*{\sbfhat}{\widehat{\mathbf{s}}}
\newcommand*{\ubf}{\mathbf{u}}
\newcommand*{\ubfhat}{\widehat{\mathbf{u}}}
\newcommand*{\utildebf}{\tilde{\mathbf{u}}}
\newcommand*{\vbf}{\mathbf{v}}
\newcommand*{\vbfhat}{\widehat{\mathbf{v}}}
\newcommand*{\wbf}{\mathbf{w}}
\newcommand*{\wbfhat}{\widehat{\mathbf{w}}}
\newcommand*{\xbf}{\mathbf{x}}
\newcommand*{\xbfhat}{\widehat{\mathbf{x}}}
\newcommand*{\xbfbar}{\overline{\mathbf{x}}}
\newcommand*{\ybf}{\mathbf{y}}
\newcommand*{\zbf}{\mathbf{z}}
\newcommand*{\zbfbar}{\overline{\mathbf{z}}}
\newcommand*{\zbfhat}{\widehat{\mathbf{z}}}

% Uppercase (matrices) and hatted uppercase scalars.
\newcommand*{\Ahat}{\widehat{A}}
\newcommand*{\Abf}{\mathbf{A}}
\newcommand*{\Bbf}{\mathbf{B}}
\newcommand*{\Cbf}{\mathbf{C}}
\newcommand*{\Bbfhat}{\widehat{\mathbf{B}}}
\newcommand*{\Dbf}{\mathbf{D}}
\newcommand*{\Gbf}{\mathbf{G}}
\newcommand*{\Hbf}{\mathbf{H}}
\newcommand*{\Ibf}{\mathbf{I}}
\newcommand*{\Kbf}{\mathbf{K}}
\newcommand*{\Pbf}{\mathbf{P}}
\newcommand*{\Phat}{\widehat{P}}
\newcommand*{\Qbf}{\mathbf{Q}}
\newcommand*{\Rbf}{\mathbf{R}}
\newcommand*{\Rhat}{\widehat{R}}
\newcommand*{\Sbf}{\mathbf{S}}
\newcommand*{\Ubf}{\mathbf{U}}
\newcommand*{\Vbf}{\mathbf{V}}
\newcommand*{\Wbf}{\mathbf{W}}
\newcommand*{\Xhat}{\widehat{X}}
\newcommand*{\Xbf}{\mathbf{X}}
\newcommand*{\Ybf}{\mathbf{Y}}
\newcommand*{\Zbf}{\mathbf{Z}}
\newcommand*{\Zhat}{\widehat{Z}}
\newcommand*{\Zbfhat}{\widehat{\mathbf{Z}}}
% Bold and hatted Greek symbols.  Bold is produced with either \boldsymbol
% or \bm, depending on the macro.
\def\alphabf{{\boldsymbol \alpha}}
\def\betabf{{\boldsymbol \beta}}
\def\betabfhat{{\widehat{\bm{\beta}}}}
\def\epsilonbf{{\boldsymbol \epsilon}}
\def\mubf{{\boldsymbol \mu}}
\def\lambdabf{{\boldsymbol \lambda}}
\def\etabf{{\boldsymbol \eta}}
\def\xibf{{\boldsymbol \xi}}
\def\taubf{{\boldsymbol \tau}}
\def\sigmahat{{\widehat{\sigma}}}
\def\thetabf{{\bm{\theta}}}
\def\thetabfhat{{\widehat{\bm{\theta}}}}
\def\thetahat{{\widehat{\theta}}}
\def\mubar{\overline{\mu}}
% \muavg currently expands to a plain \mu.
\def\muavg{\mu}
\def\sigbf{\bm{\sigma}}
% Emphasized "et al." for use in running text.
\def\etal{\emph{et al.}}
% Fraktur G and calligraphic P.
\def\Ggothic{\mathfrak{G}}
\def\Pset{{\mathcal P}}
% Conditioning delimiters: typeset ( #1 | #2 ) with \big- or \Big-sized
% parentheses and bar; \! pulls the bar slightly closer to both arguments.
\newcommand{\bigCond}[2]{\bigl({#1} \!\bigm\vert\! {#2} \bigr)}
\newcommand{\BigCond}[2]{\Bigl({#1} \!\Bigm\vert\! {#2} \Bigr)}
% Transpose and Hermitian-transpose superscripts: an upright sans-serif T / H.
% \normalfont\sffamily is what the obsolete \sf switch expands to in the
% standard classes, so the output is unchanged, but the macros no longer
% depend on a class that still provides \sf.
\newcommand{\tran}{^{\text{\normalfont\sffamily T}}}
\newcommand{\herm}{^{\text{\normalfont\sffamily H}}}
% Angle brackets around the argument.
\newcommand{\bkt}[1]{{\langle #1 \rangle}}
% Calligraphic N.
\def\Norm{{\mathcal N}}
% Element-wise operation symbols.
% NOTE(review): \vmult is a bare "." while \vdiv is "./" -- confirm that
% ".*" was not intended for \vmult.
\newcommand{\vmult}{.}
\newcommand{\vdiv}{./}


% Python style for highlighting
% \pythonstyle applies the listings settings below via \lstset; it is invoked
% by the python environment, \pythonexternal and \pycode defined afterwards.
% It relies on the fonts \ttm / \ttb and on the colors deepblue, deepred,
% deepgreen and backcolour declared earlier in this preamble.
\newcommand\pythonstyle{\lstset{
language=Python,
backgroundcolor=\color{backcolour},
commentstyle=\color{deepgreen},
basicstyle=\ttm,
otherkeywords={self},             % Add keywords here
keywordstyle=\ttb\color{deepblue},
emph={MyClass,__init__},          % Custom highlighting
emphstyle=\ttb\color{deepred},    % Custom highlighting style
stringstyle=\color{deepgreen},
%frame=tb,                         % Any extra options here
showstringspaces=false            %
}}

% Python environment
% Usage: \begin{python}[<listings options>] ... \end{python}
% Applies \pythonstyle first, then the options from the optional argument,
% so those options can override the defaults set by \pythonstyle.
\lstnewenvironment{python}[1][]
{
\pythonstyle
\lstset{#1}
}
{}

% Python for external files
% Usage: \pythonexternal[<listings options>]{<file>}
% The inner pair of braces forms a group around \pythonstyle and
% \lstinputlisting.
\newcommand\pythonexternal[2][]{{
\pythonstyle
\lstinputlisting[#1]{#2}}}

% Python for inline
% Usage: \pycode{<code>}
% The code is passed to \lstinline with "!" as the delimiter, so the argument
% must not itself contain a "!" character.
\newcommand\pycode[1]{{\pythonstyle\lstinline!#1!}}

% Solution environment
% Usage: \begin{solution} ... \end{solution}
% An mdframed box with a white frame line, the background color defined on
% the next line, and the frame title "Solution:".
% NOTE(review): xcolor (pulled in by TikZ) already provides a color named
% lightgray; the \definecolor below replaces it for the rest of the
% document -- confirm nothing else relies on the default shade.
\definecolor{lightgray}{gray}{0.95}
\newmdenv[linecolor=white,backgroundcolor=lightgray,frametitle=Solution:]{solution}

\begin{document}

\title{Introduction to Machine Learning\\
Problems:  Convolutional Neural Networks}
\author{Prof. Sundeep Rangan}
\date{}

\maketitle

\begin{enumerate}

\item \emph{Tensors.}  For each of the following datasets, describe how you
would represent them as tensors.  Specifically, give the shape of the tensors.
\begin{enumerate}[(a)]
\item A batch of 100 color images, each of size $256 \times 256$.

\item A batch of 40 EEG recordings.  Each EEG recording has 80 channels of output
at a sample rate of 240 Hz for 10 seconds.

\item A batch of 32 videos.  Each video has a frame rate of 30 frames per second
and is 10 seconds long.  The video is color with a resolution of 512 $\times$ 512.
\end{enumerate}


\item \emph{2D convolutions.}  Let $X$ and $W$ be arrays,
\[
    X = \left[ \begin{array}{ccccc}
        0 & 0 & 0 & 0 & 0 \\
        0 & 3 & 3 & 3 & 0 \\
        0 & 3 & 3 & 3 & 0 \\
        0 & 3 & 2 & 3 & 0 \\
        0 & 3 & 2 & 3 & 0 \\
        0 & 0 & 0 & 0 & 0
        \end{array} \right], \quad
    W = \left[ \begin{array}{cc}
        1 & -1 \\
        1 & -1
        \end{array} \right].
\]
Let $Z$ be the 2D convolution (without reversal):
\beq \label{eq:Zconv}
    Z[i,j] = \sum_{k_1,k_2} W[k_1,k_2]X[i+k_1,j+k_2].
\eeq
Assume that the arrays are indexed starting at $(0,0)$.
\begin{enumerate}[(a)]
\item What are the limits of the summations over $k_1$ and $k_2$ in \eqref{eq:Zconv}?
\item What is the size of the output $Z[i,j]$ if the convolution is computed only on the \emph{valid} pixels (i.e.\ the pixel locations $(i,j)$
where the summation in \eqref{eq:Zconv} does not exceed the boundaries of $W$ or $X$)?
\item What is the largest positive value of $Z[i,j]$?  State one pixel location $(i,j)$ where that value occurs.
\item What is the most negative value of $Z[i,j]$?  State one pixel location $(i,j)$ where that value occurs.
\item Find one pixel location where $Z[i,j]=0$.
\end{enumerate}


\item \emph{Complexity and number of parameters.}
Suppose that a convolutional layer of a neural network has an input tensor $X[i,j,k]$ and computes
an output via a convolution and ReLU activation,
\begin{align*}
    Z[i,j,m] &= \sum_{k_1} \sum_{k_2} \sum_n W[k_1,k_2,n,m]X[i+k_1,j+k_2,n] + b[m], \\
    U[i,j,m] &= \max\{0, Z[i,j,m] \},
\end{align*}
for some weight kernel $W[k_1,k_2,n,m]$ and bias $b[m]$.  Suppose that $X$ has shape $(48,64,10)$ and $W$ has shape $(3,3,10,20)$.
Assume the convolution is computed on the \emph{valid} pixels.
\begin{enumerate}[(a)]
\item What are the shapes of $Z$ and $U$?
\item What are the numbers of input channels and output channels?
\item How many multiplications must be performed to compute the convolution in that layer?
\item If $W$ and $b$ are to be learned, what is the total number of trainable parameters in the layer?
\end{enumerate}


\item \emph{Back-propagation.}
Suppose that a convolutional layer in some neural network
is described as a linear convolution followed by a sigmoid activation,
\begin{align*}
    Z[i,j_1,j_2,m] &= \sum_{k_1} \sum_{k_2} \sum_n W[k_1,k_2,n,m]X[i,j_1+k_1,j_2+k_2,n] + b[m], \\
    U[i,j_1,j_2,m] &= 1/(1+\exp(-Z[i,j_1,j_2,m])),
\end{align*}
where $X[i,j_1,j_2,n]$ is the input of the layer and $U[i,j_1,j_2,m]$ is the output.
Suppose that during back-propagation, we have computed the gradient $\partial J/\partial U$ for some loss function $J$.
That is, we have computed  the components $\partial J/\partial U[i,j_1,j_2,m]$.
Show how to compute the following:
\begin{enumerate}[(a)]
\item The gradient components $\partial J/\partial Z[i,j_1,j_2,m]$.
\item The gradient components $\partial J/\partial W[k_1,k_2,n,m]$.
\item The gradient components $\partial J/\partial X[i,j_1,j_2,n]$.
\end{enumerate}

\item \emph{Sub-sampling and pooling}.
In CNNs, convolution operations are often followed
by a data reduction step, typically either via \emph{sub-sampling} or \emph{max pooling}.
The methods can be described as follows:
Let $x[j]$, $j=0,1,\ldots,N-1$ be a 1D input (say in one channel in one sample).
The outputs $y[k]$ for sub-sampling and max-pooling are given by:
\begin{itemize}
\item \emph{Sub-sampling} with \emph{stride} $s$ selects every $s$-th sample:
\[
    y[k] = x[sk], \quad k=0,1,\ldots, \left\lfloor \frac{N-1}{s} \right\rfloor.
\]
\item \emph{Max pooling} with \emph{pool size} $p$ and \emph{stride} $s$ computes,
\[
    y[k] = \max_{j=0,1,\ldots,p-1} x[sk+j], \quad  k=0,1,\ldots, \left\lfloor \frac{N-p}{s} \right\rfloor.
\]
\end{itemize}
\begin{enumerate}[(a)]
\item Let $\xbf$ be the vector,
\[
    \xbf = [1,2,3,2,0,10,1,0].
\]
Find the output $\ybf$ when sub-sampling with stride $s=2$.

\item For the same vector $\xbf$ as in part (a), find the output of max pooling with
stride $s=2$ and pool size $p=2$.

\item Let $X[i,j,n]$ be a tensor of shape $(B,N,C)$ where $B$ is the batch size,
$N$ is the number of samples per channel and $C$ is the number of channels.
Write equations for sub-sampling and max pooling of $X$ if the operations are to be
performed along the $j$ axis, separately for each batch item $i$ and each channel $n$.  What are the output shapes?

\end{enumerate}

\end{enumerate}

\end{document}