Neural Networks: Basic to Application
Author
Kim Woo Hyun
Last Updated
5 years ago
License
Creative Commons CC BY 4.0
Abstract
An explanation of neural networks through painting style transfer.
\documentclass{beamer}
% For more themes, color themes and font themes, see:
% http://deic.uab.es/~iblanes/beamer_gallery/index_by_theme.html
%
\mode<presentation>
{
\usetheme{Madrid} % or try default, Darmstadt, Warsaw, ...
\usecolortheme{default} % or try albatross, beaver, crane, ...
\usefonttheme{serif} % or try default, structurebold, ...
\setbeamertemplate{navigation symbols}{}
\setbeamertemplate{caption}[numbered]
}
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\usepackage{chemfig}
\usepackage[version=3]{mhchem}
\usepackage{wrapfig}
% On Overleaf, these lines give you sharper preview images.
% You might want to comment them out before you export, though.
\usepackage{pgfpages}
\pgfpagesuselayout{resize to}[%
physical paper width=8in, physical paper height=6in]
% Here's where the presentation starts, with the info for the title slide
\title[Seminar]{Neural Networks: Basic to Application}
\subtitle{(painting style transfer)}
\author{Kim Woo Hyun}
\date{\today}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
% These three lines create an automatically generated table of contents.
\begin{frame}{Outline}
\tableofcontents
\end{frame}
\section{Neural Network}
\subsection{First Generation (ANN, Perceptron)}
\begin{frame}{First Generation}
\begin{block}{Artificial Neural Network : ANN}
In 1943, \textbf{\textit{Warren S. McCulloch}} and \textbf{\textit{Walter Pitts}} proposed the idea.
\end{block}
\centering
\includegraphics[scale=0.5]{ANN.PNG}
\begin{itemize}
\item Mimics the human neural structure by connecting switch-like units.
\end{itemize}
\end{frame}
\begin{frame}{First Generation}
\begin{block}{Perceptron}
In 1958, \textbf{\textit{Frank Rosenblatt}} proposed the perceptron, a linear classifier.
\end{block}
\includegraphics[scale=0.2]{1_neuron.png}
\includegraphics[scale=0.2]{1_neuron_model.jpeg}
\begin{itemize}
\item At the time, it raised the expectation that computers could do things humans do well.
\item The basic structure has not changed since then.
\item Uses the sigmoid as the \textbf{activation function}.
(Maps the output into $[0,1]$)
\end{itemize}
\end{frame}
\begin{frame}{First Generation}
\begin{block}{Problem}
In 1969, \textbf{\textit{Marvin Minsky}} and \textbf{\textit{Seymour Papert}} proved the limitations of the perceptron.
\end{block}
\includegraphics[scale = 0.3]{1_minsky_book.jpg}
\includegraphics[scale = 0.6]{1_xor_unsolve.PNG}
It cannot even solve the XOR problem.
\end{frame}
\subsection{Second Generation (MLP, Back-propagation)}
\begin{frame}{Second Generation}
\begin{block}{Multi-Layer Perceptron : MLP}
Make the network deeper by stacking perceptrons into \textbf{hidden layers}.
\end{block}
\includegraphics[scale = 0.3]{1_xor_solve.PNG}
\includegraphics[scale = 0.2]{1_MLP}
\begin{itemize}
\item Solves non-linear problems with multiple linear classifiers.
\item \textbf{Too many parameters!!}
\item Needs an algorithm to control the parameters.
\end{itemize}
\end{frame}
\begin{frame}{Second Generation}
\begin{block}{Back-propagation}
A feedback algorithm that adjusts the weights of the neural network.
\end{block}
\centering
\includegraphics[scale = 0.5]{1_BackP.png}
\begin{itemize}
\item $i$ : input layer
\item $h$ : hidden layer
\item $o$ : output layer
\item $w_{ij}$ : weight of the connection from neuron $i$ to neuron $j$.
\end{itemize}
\end{frame}
\begin{frame}{Second Generation}
\includegraphics[scale = 0.4]{1_BackP.png}
\includegraphics[scale = 0.3]{1_sigmoid.png}
\begin{itemize}
\item $out$ : output value of a neuron.
\item $in$ : weighted sum of the outputs of the connected neurons. ($in = \sum w*out$)
\item $t$ : target value (chosen by you)
\item \textbf{Sigmoid} activation function. Ex) $out_{h3} = \sigma(in_{h3}) = \frac{1}{1+e^{-in_{h3}}}$
\end{itemize}
\end{frame}
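\begin{frame}[fragile]{Second Generation}
A minimal NumPy sketch of the forward pass for the small 2--2--2 network above; the input and weight values are made-up placeholders, not taken from the slides.
\begin{verbatim}
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

out_i = np.array([0.1, 0.5])      # outputs of input neurons i1, i2
W_ih = np.array([[0.2, 0.3],      # w13, w14
                 [0.4, 0.5]])     # w23, w24
W_ho = np.array([[0.6, 0.7],      # w35, w36
                 [0.8, 0.9]])     # w45, w46

in_h = out_i @ W_ih               # in = sum of w * out
out_h = sigmoid(in_h)             # out_h3, out_h4
in_o = out_h @ W_ho
out_o = sigmoid(in_o)             # out_o5, out_o6
\end{verbatim}
\end{frame}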
\begin{frame}{Second Generation}
The error is the sum of squares (squared Euclidean distance):
\[E = \frac{1}{2}(t_5-out_{o5})^2 + \frac{1}{2}(t_6-out_{o6})^2\]
We want to know how much each weight influences $E$ $\Rightarrow $ calculate $\frac{\partial E}{\partial w_{ij}}$
Example) Calculate $\frac{\partial E}{\partial w_{35}}$ with the \textbf{chain rule}:
\[\frac{\partial E}{\partial w_{35}} = \frac{\partial E}{\partial out_{o5}}*\frac{\partial out_{o5}}{\partial in_{o5}}*\frac{\partial in_{o5}}{\partial w_{35}}\]
\centering
\includegraphics[scale = 0.4]{1_BackP.png}
\end{frame}
\begin{frame}{Second Generation}
First, \[\frac{\partial E}{\partial out_{o5}} = \frac{\partial }{\partial out_{o5}}\left [ \frac{1}{2}(t_5-out_{o5})^2 + \frac{1}{2}(t_6-out_{o6})^2 \right ] = out_{o5}-t_5\]
Second, \[\frac{\partial out_{o5}}{\partial in_{o5}}= \frac{\partial \sigma(in_{o5}) }{\partial in_{o5}}\]
\end{frame}
\begin{frame}{Second Generation}
The sigmoid function $\sigma(x)$ is
\[\sigma(x) = \frac{1}{1+e^{-ax}}\]
The differential of sigmoid $\sigma(x)$
\begin{align*}
\sigma'(x) &= \frac{ae^{-ax}}{(1+e^{-ax})^2} \\
&= a\frac{1}{(1+e^{-ax})}\frac{e^{-ax}}{(1+e^{-ax})} \\
&= a\frac{1}{(1+e^{-ax})}\left( 1- \frac{1}{(1+e^{-ax})} \right ) \\
&= a\sigma(x)(1-\sigma(x))
\end{align*}
\end{frame}
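\begin{frame}[fragile]{Second Generation}
A quick numerical check of the derivative above (a side note, using an arbitrary test point $x=0.3$ and $a=1$):
\begin{verbatim}
import numpy as np

a, x, eps = 1.0, 0.3, 1e-6
sig = lambda t: 1.0 / (1.0 + np.exp(-a * t))

numeric = (sig(x + eps) - sig(x - eps)) / (2 * eps)
analytic = a * sig(x) * (1 - sig(x))
print(numeric, analytic)   # both approximately 0.2445
\end{verbatim}
\end{frame}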
\begin{frame}{Second Generation}
First, \[\frac{\partial E}{\partial out_{o5}} = \frac{\partial }{\partial out_{o5}}\left [ \frac{1}{2}(t_5-out_{o5})^2 + \frac{1}{2}(t_6-out_{o6})^2 \right ] = out_{o5}-t_5\]
Second, \[\frac{\partial out_{o5}}{\partial in_{o5}}= \frac{\partial \sigma(in_{o5}) }{\partial in_{o5}} = \sigma(in_{o5})(1-\sigma(in_{o5})) = out_{o5}(1-out_{o5})\]
\end{frame}
\begin{frame}{Second Generation}
First, \[\frac{\partial E}{\partial out_{o5}} = \frac{\partial }{\partial out_{o5}}\left [ \frac{1}{2}(t_5-out_{o5})^2 + \frac{1}{2}(t_6-out_{o6})^2 \right ] = out_{o5}-t_5\]
Second, \[\frac{\partial out_{o5}}{\partial in_{o5}}= \frac{\partial \sigma(in_{o5}) }{\partial in_{o5}} = \sigma(in_{o5})(1-\sigma(in_{o5})) = out_{o5}(1-out_{o5})\]
Third,
\[\frac{\partial in_{o5}}{\partial w_{35}} = \frac{\partial (out_{h3}*w_{35})}{\partial w_{35}} = out_{h3}\]
Finally,
\[\frac{\partial E}{\partial w_{35}} = (out_{o5}-t_5)(1-out_{o5})out_{o5}out_{h3}\]
\begin{block}{}
Conveniently, all of these quantities are already computed in the forward pass, so what remains is simple arithmetic.
\end{block}
\end{frame}
\begin{frame}{Second Generation}
Then, how do we update the weights?
\[w := w - r\frac{\partial E}{\partial w} \text{, where r is a constant called the learning rate.}\]
So the updated $w_{35}$ is
\[w_{35} := w_{35} - r(out_{o5}-t_5)(1-out_{o5})out_{o5}out_{h3}\]
This method is called \textbf{gradient descent}.
\centering
\includegraphics[scale = 0.16]{1_Gradient_descent.png}
\end{frame}
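\begin{frame}[fragile]{Second Generation}
A sketch of the gradient-descent update of $w_{35}$ derived above; the forward-pass values, target, and learning rate are hypothetical numbers chosen for illustration.
\begin{verbatim}
# dE/dw35 = (out_o5 - t5) * out_o5 * (1 - out_o5) * out_h3
out_o5, out_h3, t5 = 0.75, 0.60, 1.0   # assumed forward-pass values
w35, r = 0.40, 0.5                     # current weight, learning rate

grad_w35 = (out_o5 - t5) * out_o5 * (1 - out_o5) * out_h3
w35 = w35 - r * grad_w35               # gradient-descent step
print(grad_w35, w35)                   # about -0.0281 and 0.414
\end{verbatim}
\end{frame}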
\begin{frame}{Second Generation}
\begin{block}{Gradient descent}
Simply put: move in the direction orthogonal to the contour lines.
\end{block}
\textit{Why the orthogonal direction?}
At a minimum point of $f(x,y)$,
\[\nabla f(x,y) = \left( \frac{\partial f}{\partial x}, \frac{\partial f}{\partial y} \right ) = 0\]
Assume the direction along the contour line is $(a,b)$. Then, using the \textbf{Taylor series}, linearize $f$ to find the orthogonal direction.
\[f(x_1+a,y_1+b) \simeq f(x_1,y_1) + \frac{\partial f}{\partial x}a + \frac{\partial f}{\partial y}b + \dots\]
Since $f$ does not change along the contour line, $(a,b)$ must satisfy
\[\frac{\partial f}{\partial x}a + \frac{\partial f}{\partial y}b = 0 \]
\end{frame}
\begin{frame}{Second Generation}
One such choice is $a = \frac{\partial f}{\partial y}$ and $b = -\frac{\partial f}{\partial x}$:
\[\frac{\partial f}{\partial x}a + \frac{\partial f}{\partial y}b = \frac{\partial f}{\partial x}\frac{\partial f}{\partial y} + \frac{\partial f}{\partial y}(-\frac{\partial f}{\partial x}) = 0\]
In addition, the inner product of gradient and (a,b) is
\[(\nabla f(x,y))\cdot (a,b) = \left (\frac{\partial f}{\partial x} ,\frac{\partial f}{\partial y} \right )\cdot \left ( \frac{\partial f}{\partial y} ,-\frac{\partial f}{\partial x} \right ) = 0\]
\begin{block}{}
This means the vector orthogonal to the contour line is the gradient itself. So if we follow the gradient until it becomes 0, we can find a minimum point.
\end{block}
*Caution: the point found can be a saddle point rather than a minimum; we do not discuss this case here.
\end{frame}
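\begin{frame}[fragile]{Second Generation}
A small numerical illustration (not from the slides) with the toy function $f(x,y)=x^2+2y^2$: the contour direction $(a,b)=(\partial f/\partial y, -\partial f/\partial x)$ is orthogonal to the gradient.
\begin{verbatim}
import numpy as np

def grad_f(x, y):              # gradient of f(x, y) = x^2 + 2y^2
    return np.array([2 * x, 4 * y])

g = grad_f(1.0, 2.0)           # gradient at the point (1, 2)
ab = np.array([g[1], -g[0]])   # contour direction (df/dy, -df/dx)
print(np.dot(g, ab))           # 0.0
\end{verbatim}
\end{frame}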
\begin{frame}{Second Generation}
Problems
\begin{itemize}
\item Gradient descent struggles with non-convex functions, and the sigmoid is non-convex:
\[\sigma''(x) = a^{2}\sigma(x)(1-\sigma(x))(1-2\sigma(x))\]
\[a^{2}\sigma(x)(1-\sigma(x)) \geq 0 \text{ but } -1 \leq 1-2\sigma(x) \leq 1\]
\item The cost of back-propagation is large.
\item The vanishing gradient problem.
\end{itemize}
\end{frame}
\begin{frame}{Second Generation}
\begin{block}{Cost of back-propagation.}
The cost grows large for the shallow (early) layers.
\end{block}
For example,
\[\frac{\partial E}{\partial w_{13}} = \frac{\partial E}{\partial out_{h3}}*\frac{\partial out_{h3}}{\partial in_{h3}}*\frac{\partial in_{h3}}{\partial w_{13}}\]
\[\vdots\]
\[= \left [(out_{o5}-t_5)\{out_{o5}(1-out_{o5})\}w_{35} + (out_{o6}-t_6)\{out_{o6}(1-out_{o6})\}w_{36}\right ]\]
\[*(1-out_{h3})*out_{h3}*out_{i1}\]
Of course, since the chain rule reuses terms that are already computed, it is easier than it looks. But what if the network is very large?
\end{frame}
\begin{frame}{Second Generation}
\begin{block}{Vanishing Gradient Problem}
Because of the sigmoid function, the gradient shrinks toward 0 as back-propagation is repeated through the layers.
\end{block}
\centering
\includegraphics[scale = 0.5]{1_sigmoid.png}
\end{frame}
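\begin{frame}[fragile]{Second Generation}
An illustration of the effect (a toy setup that ignores the weights): the sigmoid derivative is at most $0.25$, so the back-propagated gradient shrinks with every additional sigmoid layer.
\begin{verbatim}
import numpy as np

sig = lambda x: 1.0 / (1.0 + np.exp(-x))
dsig = lambda x: sig(x) * (1 - sig(x))   # <= 0.25 everywhere

grad = 1.0
for layer in range(10):       # 10 sigmoid layers, evaluated at x = 0
    grad *= dsig(0.0)         # multiply by 0.25 at each layer
print(grad)                   # about 9.5e-07
\end{verbatim}
\end{frame}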
\subsection{Third Generation (ReLU)}
\begin{frame}{Third Generation}
\centering
\includegraphics[scale = 0.25]{1_relu.png}
\begin{block}{Rectified Linear Unit : ReLU}
\begin{itemize}
\item Convex: works well with gradient descent.
\item The cost of back-propagation decreases. (since $f'(x)$ is always 1 or 0)
\item Safe from the vanishing gradient problem.
\end{itemize}
\end{block}
All of the previous problems come from a poor choice of activation function.
\end{frame}
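\begin{frame}[fragile]{Third Generation}
A minimal sketch of ReLU and its derivative, matching the points above; the test values are arbitrary.
\begin{verbatim}
import numpy as np

def relu(x):
    return np.maximum(0.0, x)        # zero OR itself

def relu_grad(x):
    return (x > 0).astype(float)     # always 1 or 0

x = np.array([-2.0, -0.5, 0.5, 3.0])
print(relu(x))                       # [0.  0.  0.5 3. ]
print(relu_grad(x))                  # [0. 0. 1. 1.]
\end{verbatim}
\end{frame}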
\begin{frame}{Third Generation}
\centering
\includegraphics[scale = 0.5]{1_Bad_Act.PNG}
Notice the gap between tanh and ReLU.
\end{frame}
\section{Convolutional Neural Network}
\begin{frame}
Section 2. Convolutional Neural Network
\begin{itemize}
\item Convolution layer
\item ReLU layer
\item Pooling layer
\item Fully Connected layer
\end{itemize}
\end{frame}
\subsection{Convolution layer}
\begin{frame}{Convolution layer}
\begin{block}{2D Convolution}
Nothing fundamentally different from 1D convolution.
\end{block}
\begin{columns}
\begin{column}{0.4\textwidth}
\includegraphics[scale = 0.5]{2_simple_conv.JPG}
\end{column}
\begin{column}{0.6\textwidth}
\begin{itemize}
\item Input size = 7x7x1
\item Filter size = 3x3
\item Number of filters = 1
\item Stride = 1
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Convolution layer}
\begin{block}{What does the filter do?}
Assume weights are already trained.
\end{block}
\centering
\includegraphics[scale = 0.5]{2_line_filter.png}
Curve detection filter and its visualization.
\end{frame}
\begin{frame}{Filter}
\centering
\includegraphics[scale = 0.5]{2_Mice.png}
\includegraphics[scale = 0.5]{2_mice_hip.png}
\begin{block}{}
If a part of the original image has a similar shape, the result of the multiply-and-sum is a large number.
\end{block}
\end{frame}
\begin{frame}{Filter}
\centering
\includegraphics[scale = 0.5]{2_mice_conv.png}
\begin{block}{}
In contrast, if not, the result is a small number.
\end{block}
\begin{block}{}
A trained filter can \textbf{give a score} for whether a feature exists or not!!
\end{block}
\end{frame}
\begin{frame}{Filter}
\centering
\includegraphics[scale = 0.5]{2_ActMap.png}
The scores are collected together and form an activation map through convolution.
\end{frame}
\begin{frame}{Padding}
\centering
\includegraphics[scale = 0.4]{3_padding.png}
\begin{block}{}
\begin{itemize}
\item Attach zeros around the layer. \ \ (Zero-padding)
\item Prevents the size from shrinking during convolution.
\item Captures features at the edges in more detail.
\end{itemize}
\end{block}
\end{frame}
\begin{frame}{Convolution layer}
\begin{block}{Convolution}
W = width, H = height, D = depth, P = padding, S = stride,
F = filter width and height, N = number of filters.
\end{block}
\begin{columns}[onlytextwidth]
\begin{column}{0.5\textwidth}
\centering
\includegraphics[scale=0.3]{2_conv.JPG}% Place your graphic here
\end{column}
\begin{column}{0.5\textwidth}
$5\times5\times3$ input with padding 1 ($7\times7\times3$ after padding)
Two $3\times3\times3$ filters, stride 2
$\Rightarrow$ output of size $3\times3\times2$
\begin{itemize}
\item $W_{2} = \frac{W-F+2P}{S}+1 = \frac{5-3+2*1}{2}+1 = 3$
\item $H_{2} = \frac{H-F+2P}{S}+1 = \frac{5-3+2*1}{2}+1 = 3$
\item $D_{2} = N = 2$ \ \ \ (Depth equals the number of filters)
\end{itemize}
\end{column}
\end{columns}
\end{frame}
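\begin{frame}[fragile]{Convolution layer}
A naive single-channel sketch (not a library implementation) that applies one filter and reproduces the size formula $W_2 = \frac{W-F+2P}{S}+1$; the random input and filter are placeholders.
\begin{verbatim}
import numpy as np

def conv2d(x, w, stride=1, pad=0):
    x = np.pad(x, pad)                   # zero-padding on every side
    W, F = x.shape[0], w.shape[0]
    W2 = (W - F) // stride + 1           # output width and height
    out = np.zeros((W2, W2))
    for i in range(W2):
        for j in range(W2):
            patch = x[i*stride:i*stride+F, j*stride:j*stride+F]
            out[i, j] = np.sum(patch * w)    # multiply and sum
    return out

x = np.random.rand(5, 5)                     # 5x5 input (one channel)
w = np.random.rand(3, 3)                     # 3x3 filter
print(conv2d(x, w, stride=2, pad=1).shape)   # (3, 3)
\end{verbatim}
\end{frame}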
\subsection{ReLU layer}
\begin{frame}{ReLU layer}
\begin{columns}[onlytextwidth]
\begin{column}{0.5\textwidth}
\centering
\includegraphics[scale = 0.35]{1_relu.png}
\end{column}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item Zero OR itself.
\item Used to introduce non-linearity and a threshold.
\item No parameters. No size change.
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{ReLU layer}
\begin{block}{Why do we need non-linearity?}
An experimental result is shown below.
\end{block}
\centering
\includegraphics[scale=0.35]{2_why_NonL}
On the ImageNet classification test.
\end{frame}
\subsection{Pooling layer}
\begin{frame}{Pooling layer}
\includegraphics[scale = 0.25]{2_maxpool.jpeg}
\includegraphics[scale = 0.25]{2_dawnsampling.jpeg}
\begin{itemize}
\item Usually \textbf{max-pooling} is used. (when larger values matter more)
\item No depth change.
\item \textbf{\textit{Reduces complexity (down-sampling):}} only $\frac{1}{4}$ of the values remain, a 75\% reduction.
\item Not necessary. (but recommended)
\end{itemize}
\[W_{2} = \frac{W-F}{S}+1 = \frac{224-2}{2}+1 = 112\]
\end{frame}
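\begin{frame}[fragile]{Pooling layer}
A sketch of $2\times2$ max-pooling with stride 2 on a toy $4\times4$ input (made-up values):
\begin{verbatim}
import numpy as np

def max_pool(x, f=2, stride=2):
    W2 = (x.shape[0] - f) // stride + 1      # (W - F)/S + 1
    out = np.zeros((W2, W2))
    for i in range(W2):
        for j in range(W2):
            out[i, j] = x[i*stride:i*stride+f,
                          j*stride:j*stride+f].max()
    return out

x = np.arange(16.0).reshape(4, 4)
print(max_pool(x))   # [[ 5.  7.]
                     #  [13. 15.]]
\end{verbatim}
\end{frame}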
\subsection{Fully Connected layer}
\begin{frame}{Fully Connected layer}
\centering
\includegraphics[scale = 0.6]{2_Fully.png}
\begin{itemize}
\item Flattens the 2D layer into a 1D vector.
\item Used to compare against the target.
\item There is more than one way to construct it.
\end{itemize}
\end{frame}
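\begin{frame}[fragile]{Fully Connected layer}
A minimal sketch of flattening followed by a fully connected layer; the sizes (a $7\times7\times512$ feature map, 10 classes) and random values are assumptions for illustration.
\begin{verbatim}
import numpy as np

feat = np.random.rand(7, 7, 512)      # last conv/pool output
vec = feat.reshape(-1)                # flatten to a 1D vector
W = np.random.rand(vec.size, 10)      # weights to 10 output classes
scores = vec @ W                      # one score per class
print(vec.shape, scores.shape)        # (25088,) (10,)
\end{verbatim}
\end{frame}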
\section{Painting Style Transfer}
\begin{frame}
Section 3. Painting Style Transfer
\begin{itemize}
\item VGGnet
\item Algorithm and Loss function
\item Result
\end{itemize}
\end{frame}
\subsection{VGGnet}
\begin{frame}{VGGnet}
\centering
\includegraphics[scale = 1.8]{3_VGG_19.png}
\begin{block}{}
\begin{itemize}
\item $F_{conv} = 3 \ (3*3*D), S_{conv} = 1, Padding = 1$
\item $F_{Pool} = 2 \ (2*2*D), S_{pool} = 2$
\end{itemize}
\end{block}
\[\frac{W-F_{conv}+2P}{S_{conv}}+1 = \frac{224-3+2*1}{1}+1 = 224\]
\[\frac{W-F_{pool}}{S_{pool}} + 1 = \frac{224-2}{2} + 1 = 112\]
\end{frame}
\begin{frame}{Painting style transfer}
\centering
\includegraphics[scale = 0.125]{3_Structure.jpg}
\begin{block}{}
\begin{itemize}
\item The weights must already be trained.
\item $a = $ style image, $p = $ content image
\item $x = $ generated image.
\end{itemize}
\end{block}
\end{frame}
\subsection{Algorithm and Loss function}
\begin{frame}{Painting style transfer}
\begin{block}{}
\begin{itemize}
\item $N_l = $ number of feature maps in the $l$th layer
\item $M_l = $ size of each feature map in the $l$th layer
\item $F^l \in \mathcal{R}^{N_l\times M_l}$
\item $F^{l}_{ij}$ is the activation of the $i^{th}$ filter at position $j$ in layer $l$
\item $P^{l}_{ij}$ is the same as $F^{l}_{ij}$ but computed from the content image. (conv4\_2)
\end{itemize}
\end{block}
\[\mathcal{L}_{\text{content}}(\vec{p},\vec{x}, l)=\frac{1}{2}\sum_{i, j}(F_{ij}^{l}-P_{ij}^{l})^{2}.\]
\begin{block}{}
So this loss function minimizes the distance between the values at the same position in the content features and in the generated-image features.
\end{block}
\end{frame}
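\begin{frame}[fragile]{Painting style transfer}
A NumPy sketch of the content loss above; the feature maps $F$ and $P$ are random placeholders with toy sizes instead of real VGG activations.
\begin{verbatim}
import numpy as np

N_l, M_l = 4, 9               # number of maps, map size (toy values)
F = np.random.rand(N_l, M_l)  # features of the generated image x
P = np.random.rand(N_l, M_l)  # features of the content image p

L_content = 0.5 * np.sum((F - P) ** 2)
print(L_content)
\end{verbatim}
\end{frame}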
\begin{frame}
\begin{block}{}
\begin{itemize}
\item $G^l \in \mathcal{R}^{N_l\times N_l}$
\item $G^{l}_{ij}$ is the inner product between the vectorized feature maps $i$ and $j$ in layer $l$ (the Gram matrix of the style layer)
\item \[G_{ij}^{l}= \sum_{k}F_{ik}^{l}F_{jk}^{l}\]
\item $A^{l}_{ij}$ is the same as $G^{l}_{ij}$ but computed from the style image.
\end{itemize}
\end{block}
\[E_{l}= \frac{1}{4N_{l}^{2}M_{l}^{2}}\sum_{i, j}(G_{ij}^{l}-A_{ij}^{l})^{2}\]
\[\mathcal{L}_{\text{style}}(\vec{a},\vec{x})=\sum_{l=0}^{L}w_{l}E_{l}\]
\begin{block}{}
The authors' idea is that the style information is captured by these correlations between feature maps, though the intuition is not obvious.
\end{block}
\end{frame}
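\begin{frame}[fragile]{Painting style transfer}
A sketch of the Gram matrix and the per-layer style error $E_l$; again the feature maps are random toy placeholders.
\begin{verbatim}
import numpy as np

N_l, M_l = 4, 9
F = np.random.rand(N_l, M_l)   # features of the generated image x
S = np.random.rand(N_l, M_l)   # features of the style image a

G = F @ F.T                    # G_ij = sum_k F_ik * F_jk
A = S @ S.T                    # Gram matrix of the style image
E_l = np.sum((G - A) ** 2) / (4 * N_l**2 * M_l**2)
print(E_l)
\end{verbatim}
\end{frame}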
\begin{frame}{Painting style transfer}
The derivatives of the loss functions are
\[\frac{\partial \mathcal{L}_{\text{content}}}{\partial F_{ij}^{l}}=\begin{cases} (F^{l}-P^{l})_{ij} & \text{if}\ F_{ij}^{l} > 0\\ 0 & \text{if}\ F_{ij}^{l} < 0, \end{cases}\]
\[\frac{\partial E_{l}}{\partial F_{ij}^{l}}=\begin{cases} \frac{1}{N_{l}^{2}M_{l}^{2}}((F^{l})^{\mathrm{T}}(G^{l}-A^{l}))_{ji}& \text{if}\ F_{ij}^{l} > 0\\ 0& \text{if}\ F_{ij}^{l} < 0. \end{cases}\]
And the total loss is
\[\mathcal{L}_{\text{total}}(\vec{p},\vec{a},\vec{x})=\alpha \mathcal{L}_{\text{content}}(\vec{p},\vec{x})+\beta \mathcal{L}_{\text{style}}(\vec{a},\vec{x})\]
\begin{itemize}
\item $\alpha$ and $\beta$ are weighting factors for the content and style losses.
\end{itemize}
\end{frame}
\begin{frame}
\centering
\includegraphics[scale = 0.125]{3_Structure.jpg}
\[\vec{x} := \vec{x} - \lambda \frac{\partial \mathcal{L}_{total}}{\partial \vec{x}}\]
\begin{itemize}
\item $\lambda$ is the learning rate.
\item Initially, $\vec{x}$ is a white-noise image.
\item \textbf{We are not learning the weights; we are learning $\vec{x}$!!}
\end{itemize}
\end{frame}
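\begin{frame}[fragile]{Painting style transfer}
A toy sketch of the key idea (``learn the image, not the weights''): gradient descent on $\vec{x}$ for a content-only loss, with a fixed random linear map standing in for the frozen VGG features. This is an illustrative assumption, not the paper's network.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
W = rng.standard_normal((64, 16))   # frozen "network" weights
p = rng.random(16)                  # content image (flattened, toy size)
x = rng.standard_normal(16)         # start from white noise
lam = 0.005                         # learning rate

for step in range(500):
    F, P = W @ x, W @ p             # features of x and of the content
    grad = W.T @ (F - P)            # d/dx of 0.5 * sum((F - P)**2)
    x = x - lam * grad              # update the image; W never changes
print(np.abs(F - P).max())          # features of x converge toward P
\end{verbatim}
\end{frame}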
\subsection{Result}
\begin{frame}{Result}
\centering
\begin{columns}[onlytextwidth]
\begin{column}{0.5\textwidth}
\centering
\includegraphics[scale = 0.16]{starry_night.jpg}
\centering
\[+\]
\centering
\includegraphics[scale = 0.16]{in4.JPG}
\end{column}
\begin{column}{0.5\textwidth}
\includegraphics[scale = 0.18]{2800.png}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Bonus}
\begin{block}{}
Thank you!
\end{block}
\begin{columns}[onlytextwidth]
\begin{column}{0.5\textwidth}
\centering
\includegraphics[scale = 0.2]{in3.jpg}
\end{column}
\begin{column}{0.5\textwidth}
\includegraphics[scale = 0.2]{2100.png}
\end{column}
\end{columns}
\end{frame}
\end{document}