X-Git-Url: https://fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=blobdiff_plain;f=elbo.tex;h=4c6cb24cc1dd7475f2f8c1a24054f5d0b3e707e8;hb=4b8c58903baa9ff8c508bda798492e10dde9cb7f;hp=6875ddf7bd096d28c78a7dcaa857354f2315e62a;hpb=119ad14a2072217edf3e2315154614815b72ccbd;p=tex.git diff --git a/elbo.tex b/elbo.tex index 6875ddf..4c6cb24 100644 --- a/elbo.tex +++ b/elbo.tex @@ -71,28 +71,36 @@ \begin{document} -\vspace*{0ex} +\setlength{\abovedisplayskip}{2ex} +\setlength{\belowdisplayskip}{2ex} +\setlength{\abovedisplayshortskip}{2ex} +\setlength{\belowdisplayshortskip}{2ex} + +\vspace*{-3ex} \begin{center} {\Large The Evidence Lower Bound} +\vspace*{2ex} + Fran\c cois Fleuret +%% \vspace*{2ex} + \today -\vspace*{1ex} +%% \vspace*{-1ex} \end{center} -Given a training set $x_1, \dots, x_N$ that follows an unknown -distribution $\mu_X$, we want to fit a model $p_\theta(x,z)$ to it, -maximizing +Given i.i.d.\ training samples $x_1, \dots, x_N$, we want to fit a model +$p_\theta(x,z)$ to them, maximizing % \[ \sum_n \log \, p_\theta(x_n). \] % -If we do not have a analytical form of the marginal $p_\theta(x_n)$ +If we do not have an analytical form of the marginal $p_\theta(x_n)$ but only the expression of $p_\theta(x_n,z)$, we can get an estimate of the marginal by sampling $z$ with any distribution $q$ % @@ -102,12 +110,14 @@ p_\theta(x_n) & = \int_z p_\theta(x_n,z) dz \\ & = \expect_{Z \sim q(z)} \left[\frac{p_\theta(x_n,Z)}{q(Z)}\right]. \end{align*} % -So if we wanted to maximize $p_\theta(x_n)$ alone, we could sample a +So if we sample a $Z$ with $q$ and maximize % \begin{equation*} -\frac{p_\theta(x_n,Z)}{q(Z)}.\label{eq:estimator} +\frac{p_\theta(x_n,Z)}{q(Z)}, \end{equation*} +% +we do maximize $p_\theta(x_n)$ on average. But we want to maximize $\sum_n \log \, p_\theta(x_n)$. 
If we use the $\log$ of the previous expression, we can decompose its average value @@ -125,6 +135,8 @@ since this maximization pushes that KL term down, it also aligns $p_\theta(z \mid x_n)$ and $q(z)$, and we may get a worse $p_\theta(x_n)$ to bring $p_\theta(z \mid x_n)$ closer to $q(z)$. +\medskip + However, all this analysis is still valid if $q$ is a parameterized function $q_\alpha(z \mid x_n)$ of $x_n$. In that case, if we optimize $\theta$ and $\alpha$ to maximize @@ -136,5 +148,4 @@ $\theta$ and $\alpha$ to maximize it maximizes $\log \, p_\theta(x_n)$ and brings $q_\alpha(z \mid x_n)$ close to $p_\theta(z \mid x_n)$. - \end{document}