%% -*- mode: latex; mode: reftex; mode: flyspell; coding: utf-8; tex-command: "pdflatex.sh" -*-

%% Any copyright is dedicated to the Public Domain.
%% https://creativecommons.org/publicdomain/zero/1.0/
%% Written by Francois Fleuret <francois@fleuret.org>

\documentclass[11pt,a4paper,oneside]{article}
\usepackage[paperheight=15cm,paperwidth=8cm,top=2mm,bottom=15mm,right=2mm,left=2mm]{geometry}
%\usepackage[a4paper,top=2.5cm,bottom=2cm,left=2.5cm,right=2.5cm]{geometry}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath,amssymb,dsfont}
\usepackage[pdftex]{graphicx}
\usepackage[colorlinks=true,linkcolor=blue,urlcolor=blue,citecolor=blue]{hyperref}
\usepackage{tikz}
\usetikzlibrary{arrows,arrows.meta,calc}
\usetikzlibrary{patterns,backgrounds}
\usetikzlibrary{positioning,fit}
\usetikzlibrary{shapes.geometric,shapes.multipart}
\usetikzlibrary{patterns.meta,decorations.pathreplacing,calligraphy}
\usetikzlibrary{tikzmark}
\usetikzlibrary{decorations.pathmorphing}
\usepackage[round]{natbib}
\usepackage[osf]{libertine}
\usepackage{microtype}
\usepackage{fancyvrb}

\usepackage{mleftright}

\newcommand{\setmuskip}[2]{#1=#2\relax}
\setmuskip{\thinmuskip}{1.5mu} % by default it is equal to 3 mu
\setmuskip{\medmuskip}{2mu} % by default it is equal to 4 mu
\setmuskip{\thickmuskip}{3.5mu} % by default it is equal to 5 mu

\setlength{\parindent}{0cm}
\setlength{\parskip}{1ex}
%\renewcommand{\baselinestretch}{1.3}
%\setlength{\tabcolsep}{0pt}
%\renewcommand{\arraystretch}{1.0}

\def\argmax{\operatornamewithlimits{argmax}}
\def\argmin{\operatornamewithlimits{argmin}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\def\given{\,\middle\vert\,}
\def\proba{\operatorname{P}}
\newcommand{\seq}{{S}}
\newcommand{\expect}{\mathds{E}}
\newcommand{\variance}{\mathds{V}}
\newcommand{\empexpect}{\hat{\mathds{E}}}
\newcommand{\mutinf}{\mathds{I}}
\newcommand{\empmutinf}{\hat{\mathds{I}}}
\newcommand{\entropy}{\mathds{H}}
\newcommand{\empentropy}{\hat{\mathds{H}}}
\newcommand{\ganG}{\mathbf{G}}
\newcommand{\ganD}{\mathbf{D}}
\newcommand{\ganF}{\mathbf{F}}

\newcommand{\dkl}{\mathds{D}_{\mathsf{KL}}}
\newcommand{\djs}{\mathds{D}_{\mathsf{JS}}}

\allowdisplaybreaks[2]

\newcommand*{\vertbar}{\rule[-1ex]{0.5pt}{2.5ex}}
\newcommand*{\horzbar}{\rule[.5ex]{2.5ex}{0.5pt}}

\def\positionalencoding{\operatorname{pos-enc}}
\def\concat{\operatorname{concat}}
\def\crossentropy{\LL_{\operatorname{ce}}}

\begin{document}

\setlength{\abovedisplayskip}{2ex}
\setlength{\belowdisplayskip}{2ex}
\setlength{\abovedisplayshortskip}{2ex}
\setlength{\belowdisplayshortskip}{2ex}

\vspace*{-3ex}

\begin{center}
{\Large The Evidence Lower Bound}

\vspace*{2ex}

Fran\c cois Fleuret

%% \vspace*{2ex}

\today

%% \vspace*{-1ex}

\end{center}
Given i.i.d.\ training samples $x_1, \dots, x_N$, we want to fit a
latent-variable model $p_\theta(x,z)$ to them by maximizing
%
\[
\sum_n \log \, p_\theta(x_n).
\]
%
If we do not have an analytical form of the marginal $p_\theta(x_n)$
but only the expression of the joint $p_\theta(x_n,z)$, we can still
estimate the marginal by sampling $z$ from any distribution $q$ whose
support contains that of $p_\theta(x_n,\cdot)$, since
%
\begin{align*}
p_\theta(x_n) & = \int_z p_\theta(x_n,z) dz                   \\
              & = \int_z \frac{p_\theta(x_n,z)}{q(z)} q(z) dz \\
              & = \expect_{Z \sim q(z)} \left[\frac{p_\theta(x_n,Z)}{q(Z)}\right].
\end{align*}
%
So if we sample a $Z$ from $q$ and maximize
%
\begin{equation*}
\frac{p_\theta(x_n,Z)}{q(Z)},
\end{equation*}
%
we do maximize $p_\theta(x_n)$ on average.
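
As a sanity check, here is a minimal numerical sketch of this
estimator, with a toy joint $Z \sim \mathcal{N}(0,1)$, $X \mid Z \sim
\mathcal{N}(Z,1)$, and the proposal $q = \mathcal{N}(0,1)$, all
chosen only for the illustration:
%
\begin{Verbatim}[fontsize=\footnotesize]
import math, random

# Toy joint: Z ~ N(0,1), X|Z ~ N(Z,1),
# hence the exact marginal is X ~ N(0,2).
def log_n(x, mu, var):
    return -0.5 * ((x - mu)**2 / var
                   + math.log(2*math.pi*var))

def p_joint(x, z):
    return math.exp(log_n(z, 0, 1)
                    + log_n(x, z, 1))

def q_dens(z):   # proposal q = N(0,1)
    return math.exp(log_n(z, 0, 1))

x, K = 0.5, 100000
est = sum(
    p_joint(x, z) / q_dens(z)
    for z in (random.gauss(0, 1)
              for _ in range(K))
) / K

print(est)                       # ~ 0.265
print(math.exp(log_n(x, 0, 2)))  # exact
\end{Verbatim}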

But we want to maximize $\sum_n \log \, p_\theta(x_n)$. If we take the
$\log$ of the previous expression instead, its average value
decomposes as
%
\begin{align*}
 & \expect_{Z \sim q(z)} \left[ \log \frac{p_\theta(x_n,Z)}{q(Z)} \right]                                \\
 & = \expect_{Z \sim q(z)} \left[ \log \frac{p_\theta(Z \mid x_n) \, p_\theta(x_n)}{q(Z)} \right]        \\
 & = \expect_{Z \sim q(z)} \left[ \log \frac{p_\theta(Z \mid x_n)}{q(Z)} \right] + \log \, p_\theta(x_n) \\
 & = - \dkl(q(z) \, \| \, p_\theta(z \mid x_n)) + \log \, p_\theta(x_n).
\end{align*}
%
Hence maximizing this quantity does not maximize $\log \,
p_\theta(x_n)$ on average, but a \emph{lower bound} of it, since the
KL divergence is non-negative. And since the maximization pushes that
KL term down, it also aligns $p_\theta(z \mid x_n)$ and $q(z)$: we may
end up with a worse $p_\theta(x_n)$ in exchange for bringing
$p_\theta(z \mid x_n)$ closer to $q(z)$.
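
This decomposition is easy to check numerically, for instance with a
latent variable taking three values, and a joint table and a $q$ both
made up for the illustration:
%
\begin{Verbatim}[fontsize=\footnotesize]
import math

# Made-up joint p(x0, z), z in {0,1,2},
# and a made-up proposal q(z).
p_xz = [0.10, 0.05, 0.15]
q    = [0.50, 0.25, 0.25]

p_x  = sum(p_xz)                 # p(x0)
post = [v / p_x for v in p_xz]   # p(z|x0)

elbo = sum(a * math.log(b / a)
           for a, b in zip(q, p_xz))
kl   = sum(a * math.log(a / b)
           for a, b in zip(q, post))

# Both print the same value, ~ -1.335
print(elbo)
print(math.log(p_x) - kl)
\end{Verbatim}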

\medskip

However, all this analysis remains valid if $q$ is a parameterized
distribution $q_\alpha(z \mid x_n)$ that depends on $x_n$. In that
case, if we optimize $\theta$ and $\alpha$ to maximize
%
\[
\expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n,Z)}{q_\alpha(Z \mid x_n)} \right],
\]
%
it both pushes $\log \, p_\theta(x_n)$ up and brings $q_\alpha(z \mid
x_n)$ close to $p_\theta(z \mid x_n)$.
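
Note that a single $\alpha$ is shared across all the samples, so the
complete training objective, maximized jointly over $\theta$ and
$\alpha$, is the sum of these per-sample bounds
%
\[
\sum_n \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n,Z)}{q_\alpha(Z \mid x_n)} \right].
\]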

\medskip

A rewriting that may be important in practice is
%
\begin{align*}
 & \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n,Z)}{q_\alpha(Z \mid x_n)} \right]                      \\
 & = \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \frac{p_\theta(x_n \mid Z) \, p_\theta(Z)}{q_\alpha(Z \mid x_n)} \right] \\
 & = \expect_{Z \sim q_\alpha(z \mid x_n)} \left[ \log \, p_\theta(x_n \mid Z) \right]                                            \\
 & \hspace*{7em} - \dkl(q_\alpha(z \mid x_n) \, \| \, p_\theta(z)).
\end{align*}
%
This form is useful because for certain choices of the prior
$p_\theta(z)$ and of $q_\alpha$, for instance if both are Gaussian,
the KL term can be computed in closed form instead of through
sampling, which removes one source of noise from the optimization
process.
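
For instance, with the common choice (not specified here) of a prior
$p_\theta(z) = \mathcal{N}(0, I)$ and $q_\alpha(z \mid x_n) =
\mathcal{N}(\mu, \operatorname{diag}(\sigma^2))$, where $\mu$ and
$\sigma$ are computed from $x_n$, the KL term has the closed form
%
\[
\frac{1}{2} \sum_d \left( \sigma_d^2 + \mu_d^2 - 1 - \log \sigma_d^2 \right).
\]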

\end{document}