From: François Fleuret
Date: Mon, 12 Feb 2024 20:25:59 +0000 (+0100)
Subject: Update.
X-Git-Url: https://fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=commitdiff_plain;h=74cdd5e14b65ac1ff03725173eb941dc7a455edf;p=tex.git

Update.
---

diff --git a/inftheory.tex b/inftheory.tex
index 7c34fce..954fd06 100644
--- a/inftheory.tex
+++ b/inftheory.tex
@@ -4,8 +4,8 @@
 %% https://creativecommons.org/publicdomain/zero/1.0/
 %% Written by Francois Fleuret
 
-\documentclass[10pt,a4paper,twoside]{article}
-\usepackage[paperheight=18cm,paperwidth=10cm,top=5mm,bottom=20mm,right=5mm,left=5mm]{geometry}
+\documentclass[11pt,a4paper,oneside]{article}
+\usepackage[paperheight=15cm,paperwidth=8cm,top=2mm,bottom=15mm,right=2mm,left=2mm]{geometry}
 %\usepackage[a4paper,top=2.5cm,bottom=2cm,left=2.5cm,right=2.5cm]{geometry}
 \usepackage[utf8]{inputenc}
 \usepackage{amsmath,amssymb,dsfont}
@@ -20,6 +20,8 @@
 \usetikzlibrary{tikzmark}
 \usetikzlibrary{decorations.pathmorphing}
 \usepackage[round]{natbib}
+\usepackage[osf]{libertine}
+\usepackage{microtype}
 
 \usepackage{mleftright}
 
@@ -29,7 +31,7 @@
 \setmuskip{\thickmuskip}{3.5mu} % by default it is equal to 5 mu
 
 \setlength{\parindent}{0cm}
-\setlength{\parskip}{12pt}
+\setlength{\parskip}{1ex}
 %\renewcommand{\baselinestretch}{1.3}
 %\setlength{\tabcolsep}{0pt}
 %\renewcommand{\arraystretch}{1.0}
@@ -65,12 +67,17 @@
 
 \begin{document}
 
-\vspace*{1ex}
+\vspace*{-3ex}
 
 \begin{center}
 {\Large Some bits of Information Theory}
 
-\today
+Fran\c cois Fleuret
+
+January 19, 2024
+
+\vspace*{1ex}
+
 \end{center}
 
 Information Theory is awesome, so here is a TL;DR about Shannon's entropy.
@@ -79,9 +86,9 @@ The field is originally about quantifying the amount of ``information''
 contained in a signal and how much can be transmitted under certain
 conditions.
 
-What makes it awesome IMO is that it is very intuitive, and like
-thermodynamics in Physics, it gives exact bounds about what is possible
-or not.
+What makes it awesome is that it is very intuitive, and like
+thermodynamics in Physics, it gives exact bounds on what is
+possible or not.
 
 \section{Shannon's Entropy}
 
@@ -96,8 +103,8 @@ line, you can design a coding that takes into account that the symbols
 are not all equally probable, and decode on the other side.
 
 For instance if $\proba('\!\!A')=1/2$, $\proba('\!\!B')=1/4$, and
-$\proba('\!\!C')=1/4$ you would transmit ``0'' for a ``A'' and ``10'' for a
-``B'' and ``11'' for a ``C'', 1.5 bits on average.
+$\proba('\!\!C')=1/4$ you would transmit ``$0$'' for an ``A'', ``$10$'' for a
+``B'', and ``$11$'' for a ``C'', 1.5 bits on average.
 
 If the symbol is always the same, you transmit nothing; if they are
 equiprobable you need $\log_2$(nb symbols), etc.
@@ -153,11 +160,16 @@ that quantifies the amount of information shared by the two variables.
 Conditional entropy is the average of the entropy of the conditional
 distribution:
 %
-\begin{align*}
- & \entropy(X \mid Y) \\
- & = \sum_y \proba(Y=y) \entropy(X \mid Y=y) \\
- & = \sum_y \proba(Y=y) \sum_x \proba(X=x \mid Y=y) \log \proba(X=x \mid Y=y)
-\end{align*}
+\begin{equation*}
+\entropy(X \mid Y) = \sum_y \proba(Y=y) \entropy(X \mid Y=y)
+\end{equation*}
+%
+with
+%
+\begin{eqnarray*}
+\entropy(X \mid Y=y) \hspace*{13.5em} \\
+ = -\sum_x \proba(X=x \mid Y=y) \log \proba(X=x \mid Y=y)
+\end{eqnarray*}
 
 Intuitively it is the [minimum average] number of bits required to
 describe $X$ given that $Y$ is known.
@@ -175,13 +187,13 @@ and if $X$ is a deterministic function of $Y$ then
 \entropy(X \mid Y)=0.
\] -And if you send the bits for $Y$ and then the bits to describe $X$ given -that $Y$, you have sent $(X, Y)$. Hence we have the chain rule: +And if you send the bits for $Y$ and then the bits to describe $X$ +given that $Y$, you have sent $(X, Y)$, hence the chain rule: % \[ \entropy(X, Y) = \entropy(Y) + \entropy(X \mid Y). \] - +% And then we get % \begin{align*} diff --git a/randvar.tex b/randvar.tex new file mode 100644 index 0000000..6d3aae5 --- /dev/null +++ b/randvar.tex @@ -0,0 +1,284 @@ +%% -*- mode: latex; mode: reftex; mode: flyspell; coding: utf-8; tex-command: "pdflatex.sh" -*- + +%% Any copyright is dedicated to the Public Domain. +%% https://creativecommons.org/publicdomain/zero/1.0/ +%% Written by Francois Fleuret + +\documentclass[11pt,a4paper,oneside]{article} +\usepackage[paperheight=15cm,paperwidth=8cm,top=2mm,bottom=15mm,right=2mm,left=2mm]{geometry} +%\usepackage[a4paper,top=2.5cm,bottom=2cm,left=2.5cm,right=2.5cm]{geometry} +\usepackage[utf8]{inputenc} +\usepackage{amsmath,amssymb,dsfont} +\usepackage[pdftex]{graphicx} +\usepackage[colorlinks=true,linkcolor=blue,urlcolor=blue,citecolor=blue]{hyperref} +\usepackage{tikz} +\usetikzlibrary{arrows,arrows.meta,calc} +\usetikzlibrary{patterns,backgrounds} +\usetikzlibrary{positioning,fit} +\usetikzlibrary{shapes.geometric,shapes.multipart} +\usetikzlibrary{patterns.meta,decorations.pathreplacing,calligraphy} +\usetikzlibrary{tikzmark} +\usetikzlibrary{decorations.pathmorphing} +\usepackage[round]{natbib} +\usepackage[osf]{libertine} +\usepackage{microtype} + +\usepackage{mleftright} + +\newcommand{\setmuskip}[2]{#1=#2\relax} +\setmuskip{\thinmuskip}{1.5mu} % by default it is equal to 3 mu +\setmuskip{\medmuskip}{2mu} % by default it is equal to 4 mu +\setmuskip{\thickmuskip}{3.5mu} % by default it is equal to 5 mu + +\setlength{\parindent}{0cm} +\setlength{\parskip}{1ex} +%\renewcommand{\baselinestretch}{1.3} +%\setlength{\tabcolsep}{0pt} +%\renewcommand{\arraystretch}{1.0} + +\def\argmax{\operatornamewithlimits{argmax}} +\def\argmin{\operatornamewithlimits{argmin}} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\def\given{\,\middle\vert\,} +\def\proba{\operatorname{P}} +\newcommand{\seq}{{S}} +\newcommand{\expect}{\mathds{E}} +\newcommand{\variance}{\mathds{V}} +\newcommand{\empexpect}{\hat{\mathds{E}}} +\newcommand{\mutinf}{\mathds{I}} +\newcommand{\empmutinf}{\hat{\mathds{I}}} +\newcommand{\entropy}{\mathds{H}} +\newcommand{\empentropy}{\hat{\mathds{H}}} +\newcommand{\ganG}{\mathbf{G}} +\newcommand{\ganD}{\mathbf{D}} +\newcommand{\ganF}{\mathbf{F}} + +\newcommand{\dkl}{\mathds{D}_{\mathsf{KL}}} +\newcommand{\djs}{\mathds{D}_{\mathsf{JS}}} + +\newcommand*{\vertbar}{\rule[-1ex]{0.5pt}{2.5ex}} +\newcommand*{\horzbar}{\rule[.5ex]{2.5ex}{0.5pt}} + +\def\positionalencoding{\operatorname{pos-enc}} +\def\concat{\operatorname{concat}} +\def\crossentropy{\LL_{\operatorname{ce}}} + +\begin{document} + +\vspace*{0ex} + +\begin{center} +{\Large On Random Variables} + +Fran\c cois Fleuret + +\today + +\vspace*{1ex} + +\end{center} + +\underline{Random variables} are central to any model of a random +process, but their mathematical definition is unclear to most. This is +an attempt at giving an intuitive understanding of their definition +and utility. + +\section{Modeling randomness} + +To formalize something ``random'', the natural strategy is to define a +distribution, that is, in the finite case, a list of values / +probabilities. 
For instance, the head / tail result of a coin flip
+would be
+%
+\[
+\{(H, 0.5), (T, 0.5)\}.
+\]
+
+This is perfectly fine, until you have several such objects. To model
+two coins $A$ and $B$, it seems intuitively okay: they have nothing to
+do with each other, they are ``independent'', so defining how they
+behave individually is sufficient.
+
+\section{Non-independent variables}
+
+The process to generate two random values can be such that they are
+related. Consider for instance that $A$ is the result of flipping a
+coin, and that $B$ is \emph{the inverse value of $A$}.
+
+Both $A$ and $B$ are legitimate RVs, and both have the same
+distribution $\{(H, 0.5), (T, 0.5)\}$. So where is the information
+that they are related?
+
+With models of the individual distributions of $A$ and $B$ alone, it
+is nowhere. This can be fixed by specifying the distribution of the
+pair $(A, B)$, which here would be
+%
+\[
+\{(H/H, 0.0), (H/T, 0.5), (T/H, 0.5), (T/T, 0.0)\}.
+\]
+
+The distributions of $A$ and $B$ individually are called the
+\underline{marginal} distributions, and that of the pair is the
+\underline{joint} distribution.
+
+Note that the joint is a far richer object than the two marginals, and
+in general many different joints are consistent with given marginals.
+Here for instance, the marginals are the same as if $A$ and $B$ were
+two independent coins, even though they are not.
+
+Even though this could somehow work, the notion of a RV here is very
+unclear: it is not simply a distribution, and every time a new one is
+defined, it requires the specification of its joint with all the
+variables already defined.
+
+\section{Random Variables}
+
+The actual definition of a RV is a bit technical. Intuitively, it
+consists of first defining ``the source of all randomness'', and then
+making every RV a deterministic function of it.
+
+Formally, it relies first on the definition of a set $\Omega$ such
+that its subsets can be measured, with all the desirable properties,
+such as $\mu(\Omega)=1$, $\mu(\emptyset)=0$, and $A \cap B = \emptyset
+\Rightarrow \mu(A \cup B) = \mu(A) + \mu(B)$.
+
+There is a technical point: for some $\Omega$ it may be impossible to
+define such a measure on all its subsets due to tricky
+infinity-related pathologies. So the set $\Sigma$ of
+\underline{measurable} subsets is explicitly specified and called a
+$\sigma$-algebra. In any practical situation this technicality does
+not matter, since $\Sigma$ contains everything needed.
+
+The triplet $(\Omega, \Sigma, \mu)$ is a \underline{measure space},
+and since $\mu(\Omega)=1$ here, it is in fact a probability space.
+
+Given such a measure space, a \underline{random variable} $X$ is a
+mapping from $\Omega$ into another set, and the
+\underline{probability} that $X$ takes the value $x$ is the measure of
+the subset of $\Omega$ where $X$ takes the value $x$:
+%
+\[
+\proba(X=x) = \mu(X^{-1}(x)).
+\]
+
+You can imagine $\Omega$ as the square $[0,1]^2$ in $\mathbb{R}^2$
+with the usual geometrical area for $\mu$.
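+
+To make this concrete, here is one arbitrary choice of mapping on
+that square (any measurable one would do, this one is only for
+illustration): write $\omega = (u, v)$ and define
+%
+\[
+A(\omega) =
+\begin{cases}
+H & \text{if } v < 1/2\\
+T & \text{otherwise,}
+\end{cases}
+\quad
+B(\omega) =
+\begin{cases}
+H & \text{if } u < 1/2\\
+T & \text{otherwise.}
+\end{cases}
+\]
+%
+Then for instance
+%
+\[
+\proba(A=H) = \mu([0,1] \times [0,1/2)) = 1/2,
+\]
+%
+and $\proba(A=H, B=H) = \mu([0,1/2)^2) = 1/4$, so this particular
+choice makes $A$ and $B$ independent.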
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+For instance if the two coins $A$ and $B$ are flipped independently, we
+could picture possible random variables with the proper distributions
+as follows:
+
+\nopagebreak
+
+\begin{tikzpicture}[scale=0.8]
+\draw[pattern=north east lines] (0,0) rectangle ++(0.5,0.5);
+\draw (0,0) rectangle ++(1,0.5);
+\node at (2.5,0.2) {$A=\text{head}/\text{tail}$};
+
+\draw[fill=red!50] (4.5, 0) rectangle ++(0.5,0.5);
+\draw (4.5,0) rectangle ++(1,0.5);
+\node at (7.0,0.2) {$B=\text{head}/\text{tail}$};
+\end{tikzpicture}
+%
+
+\nopagebreak
+
+\begin{tikzpicture}[scale=0.600]
+\draw[fill=red!50,draw=none] (0, 0) rectangle (2, 4);
+\draw[draw=none,pattern=north east lines] (0, 0) rectangle (4,2);
+\draw (0,0) rectangle (4,4);
+
+%% \draw[draw=green,thick] (0,0) rectangle ++(2,2);
+%% \draw[draw=green,thick] (0.1,2.1) rectangle ++(1.8257,1.8257);
+%% \draw[draw=green,thick] (2.1,0.1) rectangle ++(0.8165,0.8165);
+
+\end{tikzpicture}
+%
+\hspace*{\stretch{1}}
+%
+\begin{tikzpicture}[scale=0.600]
+\draw[fill=red!50,draw=none] (0, 0) rectangle ++(1, 4);
+\draw[fill=red!50,draw=none] (1.5, 0) rectangle ++(1, 4);
+\draw[draw=none,pattern=north east lines] (0, 0.25) rectangle ++(4,0.5);
+\draw[draw=none,pattern=north east lines] (0, 1.25) rectangle ++(4,0.5);
+\draw[draw=none,pattern=north east lines] (0, 2.) rectangle ++(4,0.5);
+\draw[draw=none,pattern=north east lines] (0, 2.5) rectangle ++(4,0.5);
+\draw (0,0) rectangle (4,4);
+\end{tikzpicture}
+%
+\hspace*{\stretch{1}}
+%
+\begin{tikzpicture}[scale=0.600]
+\draw[fill=red!50,draw=none] (0, 0) rectangle (2, 2);
+\draw[fill=red!50,draw=none] (0, 4)--(2,4)--(4,2)--(2,2)--cycle;
+\draw[draw=none,pattern=north east lines] (0.5, 4)--(1.5,4)--(3.5,2)--(2.5,2)--cycle;
+\draw[draw=none,pattern=north east lines] (3, 3) rectangle (4,4);
+\draw[draw=none,pattern=north east lines] (0,4)--(1,3)--(0,2)--cycle;
+\draw[draw=none,pattern=north east lines] (2.25,0) rectangle (3.25,2);
+\draw[draw=none,pattern=north east lines] (0, 0) rectangle (2,1);
+\draw (0,0) rectangle (4,4);
+\end{tikzpicture}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+And if $A$ is flipped and $B$ is the inverse of $A$, possible RVs
+would be
+
+\nopagebreak
+
+\begin{tikzpicture}[scale=0.8]
+%% \node at (3.2, 1) {Flip A and B = inverse(A)};
+
+\draw[pattern=north east lines] (0,0) rectangle ++(0.5,0.5);
+\draw (0,0) rectangle ++(1,0.5);
+\node at (2.5,0.2) {$A=\text{head}/\text{tail}$};
+
+\draw[fill=red!50] (4.5, 0) rectangle ++(0.5,0.5);
+\draw (4.5,0) rectangle ++(1,0.5);
+\node at (7.0,0.2) {$B=\text{head}/\text{tail}$};
+\end{tikzpicture}
+
+\nopagebreak
+
+\begin{tikzpicture}[scale=0.600]
+\draw[fill=red!50] (0,0) rectangle (4,4);
+\draw[preaction={fill=white},draw=none,pattern=north east lines] (0, 0) rectangle (2,4);
+\draw (0,0) rectangle (4,4);
+\end{tikzpicture}
+%
+\hspace*{\stretch{1}}
+%
+\begin{tikzpicture}[scale=0.600]
+\draw[fill=red!50] (0,0) rectangle (4,4);
+\draw[preaction={fill=white},draw=none,pattern=north east lines] (0, 0) rectangle ++(1,1);
+\draw[preaction={fill=white},draw=none,pattern=north east lines] (1, 0) rectangle ++(1,1);
+\draw[preaction={fill=white},draw=none,pattern=north east lines] (3, 0) rectangle ++(1,1);
+\draw[preaction={fill=white},draw=none,pattern=north east lines] (0, 1) rectangle ++(1,1);
+\draw[preaction={fill=white},draw=none,pattern=north east lines] (2, 1) rectangle ++(1,1);
+\draw[preaction={fill=white},draw=none,pattern=north east lines] (0, 2)
rectangle ++(1,1);
+\draw[preaction={fill=white},draw=none,pattern=north east lines] (1, 3) rectangle ++(1,1);
+\draw[preaction={fill=white},draw=none,pattern=north east lines] (2, 3) rectangle ++(1,1);
+\draw (0,0) rectangle (4,4);
+\end{tikzpicture}
+%
+\hspace*{\stretch{1}}
+%
+\begin{tikzpicture}[scale=0.600]
+\draw[fill=red!50] (0,0) rectangle (4,4);
+\draw[preaction={fill=white},draw=none,pattern=north east lines] (0, 0)--(1,1)--(3,1)--(3,4)--(0,1)--cycle;
+\draw[preaction={fill=white},draw=none,pattern=north east lines] (0, 3) rectangle ++(2,1);
+\draw[preaction={fill=white},draw=none,pattern=north east lines] (3,0) rectangle ++(1,1);
+%% \draw (0,0) grid (4,4);
+\draw (0,0) rectangle (4,4);
+\end{tikzpicture}
+
+%% Thanks to this definition, additional random variables can be defined
+%% with dependency structures. For instance, if $A$ and $B$ are two
+%% separate coin flips, a third variable $C$ can be defined by
+%% rolling a die and taking the value of $A$ if it gives $1$ and the
+%% value of $B$ otherwise.
+
+\end{document}