1 % -*- mode: latex; mode: reftex; mode: auto-fill; mode: flyspell; -*-
3 \documentclass[c,8pt]{beamer}
6 \newcommand{\transpose}{^{\top}} % transpose superscript: K\transpose expands to K^{\top}
7 \newcommand*{\softmax}{\operatorname{softmax}} % upright "softmax" operator; \newcommand* (unlike \def) errors on an accidental redefinition
9 \setbeamertemplate{navigation symbols}{}
13 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
15 \begin{frame}[fragile]
17 Given a query sequence $Q$, a key sequence $K$, and a value sequence
18 $V$, compute an attention matrix $A$ by matching $Q$s to $K$s, and
19 weight $V$ with it to get $Y$.
25 A_i = \softmax \left( \frac{Q_i \, K\transpose}{\sqrt{d}} \right)
37 \makebox[\textwidth][c]{
40 \node[cm={0.5, 0.5, 0.0, 1.0, (0.0, 0.0)}] (V) at (-2, 2.35) {
42 \draw[fill=green!20] (0, 0) rectangle (4, 1.4);
43 \uncover<3,5>{\draw[fill=yellow] (0, 0) rectangle (4, 1.4);}
44 \foreach \x in { 0.2, 0.4, ..., 3.8 } \draw (\x, 0) -- ++(0, 1.4);
48 \node[cm={1.0, 0.0, 0.5, 0.5, (0.0, 0.0)}] (A) at (0.5, 1.6) {
50 \draw (0, 0) rectangle ++(3, 4);
55 \node[cm={0.5, 0.5, 0.0, 1.0, (0.0, 0.0)}] (a1) at (-0.9, 2.1) {
57 \draw[draw=none] (0, 0) rectangle (4, 1);
59 0.00/0.03, 0.20/0.04, 0.40/0.07, 0.60/0.35, 0.80/0.52,
60 1.00/1.00, 1.20/0.82, 1.40/0.25, 1.60/0.08, 1.80/0.03,
61 2.00/0.15, 2.20/0.24, 2.40/0.70, 2.60/0.05, 2.80/0.03,
62 3.00/0.03, 3.20/0.03, 3.40/0.00, 3.60/0.03, 3.80/0.00 }{
63 \uncover<2>{\draw[black,fill=red] (\x, 0) rectangle ++(0.2, \y);}
64 \uncover<3>{\draw[black,fill=yellow] (\x, 0) rectangle ++(0.2, \y);}
71 \node[cm={0.5, 0.5, 0.0, 1.0, (0.0, 0.0)}] (a2) at (-0.7, 2.1) {
73 \draw[draw=none] (0, 0) rectangle (4, 1);
75 0.00/0.03, 0.20/0.04, 0.40/0.07, 0.60/0.03, 0.80/0.03,
76 1.00/0.05, 1.20/0.02, 1.40/0.08, 1.60/0.35, 1.80/0.85,
77 2.00/0.05, 2.20/0.04, 2.40/0.03, 2.60/0.05, 2.80/0.03,
78 3.00/0.03, 3.20/0.03, 3.40/0.00, 3.60/0.03, 3.80/0.00 }{
79 \uncover<4>{\draw[black,fill=red] (\x, 0) rectangle ++(0.2, \y);}
80 \uncover<5>{\draw[black,fill=yellow] (\x, 0) rectangle ++(0.2, \y);}
86 \node[cm={1.0, 0.0, 0.0, 1.0, (0.0, 0.0)}] (Q) at (-0.5, -0.05) {
88 \draw[fill=green!20] (0, 0) rectangle (3, 1.0);
89 \foreach \x in { 0.2, 0.4, ..., 2.8 } \draw (\x, 0) -- ++(0, 1.0);
90 \uncover<2>{\draw[fill=yellow] (0.0, 0) rectangle ++(0.2, 1);}
91 \uncover<4>{\draw[fill=yellow] (0.2, 0) rectangle ++(0.2, 1);}
95 \node[cm={1.0, 0.0, 0.0, 1.0, (0.0, 0.0)}] (Y) at (1.5, 3.45) {
97 \uncover<3>{\draw[fill=red] (0.0, 0) rectangle ++(0.2, 1.4);}
98 \uncover<4->{\draw[fill=green!20] (0.0, 0) rectangle ++(0.2, 1.4);}
99 \uncover<6->{\draw[fill=green!20] (0.0, 0) rectangle ++(3, 1.4);}
100 \uncover<5>{\draw[fill=red] (0.2, 0) rectangle ++(0.2, 1.4);}
101 \draw (0, 0) rectangle (3, 1.4);
102 \foreach \x in { 0.2, 0.4, ..., 2.8 } \draw (\x, 0) -- ++(0, 1.4);
106 \node[cm={0.5, 0.5, 0.0, 1.0, (0.0, 0.0)}] (K) at (3, 1.1) {
108 \draw[fill=green!20] (0, 0) rectangle (4, 1);
109 \uncover<2,4>{\draw[fill=yellow] (0, 0) rectangle (4, 1);}
110 \foreach \x in { 0.2, 0.4, ..., 3.8 } \draw (\x, 0) -- ++(0, 1);
114 \node[left of=V,xshift=0.5cm,yshift=0.7cm] (Vl) {V};
115 \node[left of=Q,xshift=-0.8cm] (Ql) {Q};
116 \node (Al) at (A) {A};
117 \node[right of=K,xshift=-0.6cm,yshift=-0.6cm] (Kl) {K};
118 \node[right of=Y,xshift=0.8cm] (Yl) {Y};
125 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
127 \begin{frame}[fragile]
129 A standard attention layer takes as input two sequences $X$ and $X'$
136 Y & = \underbrace{\softmax_{\mathrm{row}} \left( \frac{Q K\transpose}{\sqrt{d}} \right)}_{A} V
139 When $X = X'$, this is \textbf{self attention}, otherwise \textbf{cross
146 Several such processes can be combined, in which case $Y$ is the
147 concatenation of the separate results. This is \textbf{multi-head