1 % -*- mode: latex; mode: reftex; mode: auto-fill; mode: flyspell; coding: utf-8; tex-command: "pdflatex.sh" -*-
3 \documentclass[oneside,11pt]{memoir}
14 pdfproducer={LaTeX and TikZ},
15 pdfcreator={pdflatex},
18 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
28 {\huge The Little Book\\[0.75ex] of\\[1.75ex] Something}
40 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
42 \vspace*{\stretch{1.25}}
44 The author blah blah blah
49 \footnotesize beta-\dotdate\today
56 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
58 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
61 \everymath{\color{black}}
62 \tableofcontents* % Prints the table of contents
63 %\addcontentsline{toc}{chapter}{Contents}
69 \addcontentsline{toc}{chapter}{List of figures}
71 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
74 \addcontentsline{toc}{chapter}{Foreword}
76 \keyterm[lorem]{Lorem}
80 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
85 %% This first part provides a minimal background about machine
86 %% learning, issues and techniques for efficient computation, and the
87 %% strategies to train a parametric model.
89 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
92 \cite{arxiv-1907.07174}
96 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
99 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
103 \begin{tikzpicture}[deepnet]
%% Encoder chain, in order: start coordinate, embedding node, `+` node,
%% self-attention node, end coordinate. The keys `next`, `layer`,
%% `layer small`, `layer large` and `param` are styles defined outside this
%% excerpt; `next` presumably places each element after the previous one
%% (TODO confirm against the `deepnet` style).
106 coordinate[next=4,layer] (encoder start)
107 node[next=4,layer,param] (encoder embed) {$\embedding$}
108 node[next,layer small] (encoder sum) {$+$}
109 node[next=4,layer large,param] (encoder sa) {$\selfattention$}
110 coordinate[next=4,layer] (encoder end)
%% Positional-encoding variable, placed 10pt to the left of the encoder's
%% `+` node.
113 \node[var,left=10pt of encoder sum] (encoder pe) {$\positionalencoding$};
%% Decoder chain. It starts with the variable node Z, positioned 8mm above
%% `encoder sa`, then continues with: start coordinate, embedding node, `+`
%% node, causal self-attention, cross-attention, fully connected readout, end
%% coordinate, and the variable node Y.
%% The causal self-attention label is split over two lines with \\, which is
%% why align=center is set on the node's text part. The `inputs` key on the
%% cross-attention node is defined outside this excerpt; it is given the
%% labels Q and KV, spaced by hand with \hspace* (presumably printed at the
%% node's input side -- TODO confirm).
116 node[var,above=8mm of encoder sa] (Z) {$Z$}
117 coordinate[next=4,layer] (decoder start)
118 node[next=4,layer,param] (decoder embed) {$\embedding$}
119 node[next,layer small] (decoder sum) {$+$}
120 node[next=4,layer large,param,text height=3.5ex,every text node part/.style={align=center}] (decoder sa) {$\operatorname{causal}$\\[-1pt]$\selfattention$}
121 node[next=4,param,layer large,inputs={$\hspace*{3pt}Q\hspace*{38pt}KV$}] (decoder ca) {$\crossattention$}
122 node[next=4,layer,param] (decoder readout) {$\fullyconnected$}
123 coordinate[next=4,layer] (end)
124 node[next=4,var] (Y) {$Y$}
%% Positional-encoding variable, placed 10pt to the left of the decoder's
%% `+` node.
127 \node[var,left=10pt of decoder sum] (decoder pe) {$\positionalencoding$};
%% Nodes in this scope go on TikZ's background layer, i.e. they are painted
%% behind the main picture content.
129 \begin{scope}[on background layer]
%% Empty node sized (via `fit`) to enclose the encoder self-attention node.
%% The `replicated=N` style is defined outside this excerpt; presumably it
%% marks the enclosed block as repeated N times (TODO confirm).
130 \node[replicated=N,fit=(encoder sa)] {};
%% Same style, enclosing the decoder's causal self-attention and
%% cross-attention nodes together.
131 \node[replicated=N,fit=(decoder sa) (decoder ca)] {};
134 (X)--(encoder start)--(encoder embed)--(encoder sum)--(encoder sa)
138 (Z)--(decoder start)--(decoder embed)--(decoder sum)--(decoder sa)
%% Wire between the encoder's positional-encoding node and its `+` node.
141 \draw[halo] (encoder pe)--(encoder sum);
%% Cross-attention -> readout -> Y; of these three wires this is the only
%% one drawn with an arrow tip (`->`).
142 \draw[halo,->] (decoder ca) -- (decoder readout) -- (Y);
%% Wire between the decoder's positional-encoding node and its `+` node.
144 \draw[halo] (decoder pe)--(decoder sum);
%% Attachment points on the south side of the cross-attention node:
%% `decoder ca q` is halfway between its south-west and south anchors,
%% `decoder ca kv` is halfway between its south-east and south anchors.
146 \coordinate (decoder ca q) at ($(decoder ca.south west)!0.5!(decoder ca.south)$);
147 \coordinate (decoder ca kv) at ($(decoder ca.south east)!0.5!(decoder ca.south)$);
%% Corner point for the KV wire: halfway between `decoder ca kv` and the
%% point with the same x at the height of `decoder sa.north`.
148 \coordinate (decoder ca kv d) at ($(decoder ca kv)!0.5!(decoder ca kv|-decoder sa.north)$);
%% Q wire: straight vertical segment from `decoder ca q` to the height of
%% `decoder sa.north`.
150 \draw[halo] (decoder ca q)--(decoder ca q|-decoder sa.north);
%% Disabled alternative for the KV wire: a single vertical-then-horizontal
%% corner straight to `encoder end`.
151 %% \draw[halo] (decoder ca kv)|-(encoder end)--(encoder sa.north);
%% KV wire: from `decoder ca kv` to the corner point, then shifted by
%% 3.25\layerthickness along x, then vertical-then-horizontal (`|-`) to
%% `encoder end`, and finally to `encoder sa.north`.
152 \draw[halo] (decoder ca kv)--(decoder ca kv d)--++(3.25\layerthickness,0)|-(encoder end)--(encoder sa.north);
%% Everything inside this environment is drawn but does not enlarge the
%% picture's bounding box.
155 \begin{pgfinterruptboundingbox}
%% Empty fit node enclosing the encoder's embedding, positional-encoding and
%% self-attention nodes, shifted by -5pt along x. The path along its west
%% side (south west -- north west) carries the "Encoder" label, placed
%% midway and to the left of that path.
157 \node[fit=(encoder embed) (encoder pe) (encoder sa),xshift=-5pt] (encoder) {};
161 (encoder.south west) -- (encoder.north west)
163 node[midway,left,every text node part/.style={align=center},xshift=-0.5em,yshift=2pt] {\likecaption Encoder};
%% Same construction for the decoder: the fit node encloses the embedding,
%% positional-encoding and readout nodes, and its west side carries the
%% "Decoder" label.
165 \node[fit=(decoder embed) (decoder pe) (decoder readout),xshift=-5pt] (decoder) {};
169 (decoder.south west) -- (decoder.north west)
171 node[midway,left,every text node part/.style={align=center},xshift=-0.5em,yshift=2pt] {\likecaption Decoder};
%% \diminfoshift and \diminfo are defined outside this excerpt; the length
%% is presumably the offset at which the annotations below are placed
%% (TODO confirm).
173 \setlength{\diminfoshift}{85pt}
%% Size annotations: each call takes a node name, the name `X`, and the size
%% text to print.
175 \diminfo{encoder embed}{X}{$T \times D$}
176 \diminfo{encoder sa}{X}{$T \times D$}
178 \diminfo{decoder embed}{X}{$S \times D$}
179 \diminfo{decoder ca}{X}{$S \times D$}
180 \diminfo{decoder readout}{X}{$S \times V$}
181 \end{pgfinterruptboundingbox}
185 \caption[Transformer]{Original encoder-decoder
186 \keyterm[Transformer]{Transformer model} for sequence-to-sequence
187 translation~\citep{arxiv-1706.03762}.}\label{fig:transformer}
192 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
196 %% If there is an index, it should follow the bibliography (see the
197 %% Manual, 14.62). Bibliography entries are listed in one alphabetical
198 %% sequence arranged by the surname of the first author or by title if
199 %% there is no author. (25 Nov 2022)
201 %% Bibliography - Chicago Citation Style, 17th Edition - Library
205 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
209 \vspace*{\stretch{1}}
213 {\color{red} (draft, do not circulate)}
216 This book is licensed under the
217 \href{https://creativecommons.org/licenses/by-nc-sa/4.0/}{Creative
218 Commons BY-NC-SA 4.0 International License}.
221 \vspace*{\stretch{1}}
223 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%