13 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
18 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
19 % Cover for the phone version
28 {\huge The Little Book\\[0.75ex] of\\[1.75ex] Things}
42 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
43 % First page for the pocket book version
54 {The Little Book of Things}
63 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
65 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
67 \vspace*{\stretch{1.25}}
69 The author blah blah blah
74 \footnotesize beta-\dotdate\today
81 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
83 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
86 \everymath{\color{black}}
87 \tableofcontents* % Prints the table of contents
88 %\addcontentsline{toc}{chapter}{Contents}
94 \addcontentsline{toc}{chapter}{List of figures}
96 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
99 \addcontentsline{toc}{chapter}{Foreword}
101 \keyterm[lorem]{Lorem}
105 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
110 %% This first part provides a minimal background about machine
111 %% learning, issues and techniques for efficient computation, and the
112 %% strategies to train a parametric model.
114 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
117 \cite{arxiv-1907.07174}
121 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
124 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Transformer figure. The styles used below (deepnet, next, layer, layer small,
% layer large, param, var) are defined outside this excerpt.
128 \begin{tikzpicture}[deepnet]
% Encoder chain: embedding node, "+" node, self-attention node, bracketed by
% the (encoder start) and (encoder end) coordinates.
131 coordinate[next=4,layer] (encoder start)
132 node[next=4,layer,param] (encoder embed) {$\embedding$}
133 node[next,layer small] (encoder sum) {$+$}
134 node[next=4,layer large,param] (encoder sa) {$\selfattention$}
135 coordinate[next=4,layer] (encoder end)
% Positional-encoding variable, placed 10pt to the left of the encoder "+" node.
138 \node[var,left=10pt of encoder sum] (encoder pe) {$\positionalencoding$};
% Decoder chain. It starts from the variable Z, positioned 8mm above the
% encoder self-attention node, then goes through embedding, "+", causal
% self-attention, cross-attention and the fully connected readout, up to Y.
141 node[var,above=8mm of encoder sa] (Z) {$Z$}
142 coordinate[next=4,layer] (decoder start)
143 node[next=4,layer,param] (decoder embed) {$\embedding$}
144 node[next,layer small] (decoder sum) {$+$}
% Two-line label ("causal" over the self-attention symbol), hence the explicit
% text height and the centered text alignment.
145 node[next=4,layer large,param,text height=3.5ex,every text node part/.style={align=center}] (decoder sa) {$\operatorname{causal}$\\[-1pt]$\selfattention$}
% The inputs key carries the Q and KV labels, spaced apart with \hspace*.
% NOTE(review): presumably these labels sit under the node, above the
% (decoder ca q) / (decoder ca kv) entry points -- confirm in the style definition.
146 node[next=4,param,layer large,inputs={$\hspace*{3pt}Q\hspace*{38pt}KV$}] (decoder ca) {$\crossattention$}
147 node[next=4,layer,param] (decoder readout) {$\fullyconnected$}
148 coordinate[next=4,layer] (end)
149 node[next=4,var] (Y) {$Y$}
% Positional-encoding variable, placed 10pt to the left of the decoder "+" node.
152 \node[var,left=10pt of decoder sum] (decoder pe) {$\positionalencoding$};
% Nodes with the replicated=N style, fitted around the encoder self-attention
% node and around the decoder self-attention + cross-attention pair, put on
% the background layer.
154 \begin{scope}[on background layer]
155 \node[replicated=N,fit=(encoder sa)] {};
156 \node[replicated=N,fit=(decoder sa) (decoder ca)] {};
% Connection through the encoder chain, starting at (X). The command that
% opens this path is not visible in this excerpt.
159 (X)--(encoder start)--(encoder embed)--(encoder sum)--(encoder sa)
% Connection through the decoder chain, starting at (Z). The command that
% opens this path is not visible in this excerpt.
163 (Z)--(decoder start)--(decoder embed)--(decoder sum)--(decoder sa)
% Positional encoding into the encoder "+" node.
166 \draw[halo] (encoder pe)--(encoder sum);
% Arrow from the cross-attention node through the readout to the output Y.
167 \draw[halo,->] (decoder ca) -- (decoder readout) -- (Y);
% Positional encoding into the decoder "+" node.
169 \draw[halo] (decoder pe)--(decoder sum);
% Entry points on the bottom edge of the cross-attention node:
% q is halfway between the south west and south anchors,
% kv is halfway between the south east and south anchors.
171 \coordinate (decoder ca q) at ($(decoder ca.south west)!0.5!(decoder ca.south)$);
172 \coordinate (decoder ca kv) at ($(decoder ca.south east)!0.5!(decoder ca.south)$);
% kv d is halfway between kv and the point directly below it at the height of
% the top of the decoder self-attention node.
173 \coordinate (decoder ca kv d) at ($(decoder ca kv)!0.5!(decoder ca kv|-decoder sa.north)$);
% Vertical segment between the q entry point and the height of decoder sa.north.
175 \draw[halo] (decoder ca q)--(decoder ca q|-decoder sa.north);
% Disabled alternative routing for the kv connection:
176 %% \draw[halo] (decoder ca kv)|-(encoder end)--(encoder sa.north);
% kv connection: from kv to kv d, sideways by 3.25\layerthickness, then
% vertically and horizontally to (encoder end), and on to the top of the
% encoder self-attention node.
177 \draw[halo] (decoder ca kv)--(decoder ca kv d)--++(3.25\layerthickness,0)|-(encoder end)--(encoder sa.north);
% Everything inside pgfinterruptboundingbox is left out of the picture's
% bounding box computation.
180 \begin{pgfinterruptboundingbox}
% Empty node fitted around the encoder nodes and shifted 5pt to the left; its
% west anchors are used for the "Encoder" side label below.
182 \node[fit=(encoder embed) (encoder pe) (encoder sa),xshift=-5pt] (encoder) {};
% Segment along the west side of the (encoder) node, with the "Encoder" label
% placed at its midpoint, on the left. The command that opens this path is
% not visible in this excerpt.
186 (encoder.south west) -- (encoder.north west)
188 node[midway,left,every text node part/.style={align=center},xshift=-0.5em,yshift=2pt] {\likecaption Encoder};
% Same construction for the decoder: fitted empty node, then the "Decoder" label.
190 \node[fit=(decoder embed) (decoder pe) (decoder readout),xshift=-5pt] (decoder) {};
194 (decoder.south west) -- (decoder.north west)
196 node[midway,left,every text node part/.style={align=center},xshift=-0.5em,yshift=2pt] {\likecaption Decoder};
% \diminfoshift is set to 85pt before the \diminfo calls below.
% NOTE(review): presumably the horizontal offset used by \diminfo -- confirm
% in its definition.
198 \setlength{\diminfoshift}{85pt}
% \diminfo is defined outside this excerpt. Each call names a node, passes X,
% and gives a size label (T x D in the encoder, S x D and S x V in the decoder).
% NOTE(review): presumably prints the label next to the named node -- confirm.
200 \diminfo{encoder embed}{X}{$T \times D$}
201 \diminfo{encoder sa}{X}{$T \times D$}
203 \diminfo{decoder embed}{X}{$S \times D$}
204 \diminfo{decoder ca}{X}{$S \times D$}
205 \diminfo{decoder readout}{X}{$S \times V$}
206 \end{pgfinterruptboundingbox}
210 \caption[Transformer]{Original encoder-decoder
211 \keyterm[Transformer]{Transformer model} for sequence-to-sequence
212 translation \citep{arxiv-1706.03762}.}\label{fig:transformer}
217 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
221 %% If there is an index, it should follow the bibliography (see the
222 %% Manual, 14.62). Bibliography entries are listed in one alphabetical
223 %% sequence arranged by the surname of the first author or by title if
224 %% there is no author. (25 Nov 2022)
226 %% Bibliography - Chicago Citation Style, 17th Edition - Library
230 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
234 \vspace*{\stretch{1}}
238 {\color{red} (draft, do not circulate)}
241 This book is licensed under the
242 \href{https://creativecommons.org/licenses/by-nc-sa/4.0/}{Creative
243 Commons BY-NC-SA 4.0 International License}.
246 \vspace*{\stretch{1}}
248 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%