report/culture.tex

   1 %% -*- mode: latex; mode: reftex; mode: flyspell; coding: utf-8; tex-command: "pdflatex.sh" -*-
   2
   3 %% Any copyright is dedicated to the Public Domain.
   4 %% https://creativecommons.org/publicdomain/zero/1.0/
   5 %% Written by Francois Fleuret <francois@fleuret.org>
   6
   7 \documentclass[11pt,a4paper,oneside]{article}
   8 \usepackage[paperheight=15cm,paperwidth=8cm,top=2mm,bottom=15mm,right=5mm,left=5mm]{geometry}
   9 %\usepackage[a4paper,top=2.5cm,bottom=2cm,left=2.5cm,right=2.5cm]{geometry}
  10 \usepackage[utf8]{inputenc}
  11 \usepackage{amsmath,amssymb,dsfont}
  12 \usepackage[pdftex]{graphicx}
  13 \usepackage[colorlinks=true,linkcolor=blue,urlcolor=blue,citecolor=blue]{hyperref}
  14 \urlstyle{same}
  15 \usepackage{tikz}
  16 \usetikzlibrary{arrows,arrows.meta,calc}
  17 \usetikzlibrary{patterns,backgrounds}
  18 \usetikzlibrary{positioning,fit}
  19 \usetikzlibrary{shapes.geometric,shapes.multipart}
  20 \usetikzlibrary{patterns.meta,decorations.pathreplacing,calligraphy}
  21 \usetikzlibrary{tikzmark}
  22 \usetikzlibrary{decorations.pathmorphing}
  23 \usepackage[round]{natbib}
  24 \usepackage[osf]{libertine}
  25 \usepackage{microtype}
  26
  27 \usepackage{mleftright}
  28
  29 \usepackage{enumitem}
  30 \setlist[itemize]{leftmargin=0pt,itemindent=1em,itemsep=2ex}
  31 \setlist{nosep} % or \setlist{noitemsep} to leave space around whole list
  32
  33 \newcommand{\setmuskip}[2]{#1=#2\relax}
  34 \setmuskip{\thinmuskip}{1.5mu} % by default it is equal to 3 mu
  35 \setmuskip{\medmuskip}{2mu} % by default it is equal to 4 mu
  36 \setmuskip{\thickmuskip}{3.5mu} % by default it is equal to 5 mu
  37
  38 \setlength{\parindent}{0cm}
  39 \setlength{\parskip}{1ex}
  40 %\renewcommand{\baselinestretch}{1.3}
  41 %\setlength{\tabcolsep}{0pt}
  42 %\renewcommand{\arraystretch}{1.0}
  43
  44 \def\argmax{\operatornamewithlimits{argmax}}
  45 \def\argmin{\operatornamewithlimits{argmin}}
  46
  47 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  48
  49 \def\given{\,\middle\vert\,}
  50 \def\proba{\operatorname{P}}
  51 \newcommand{\seq}{{S}}
  52 \newcommand{\expect}{\mathds{E}}
  53 \newcommand{\variance}{\mathds{V}}
  54 \newcommand{\empexpect}{\hat{\mathds{E}}}
  55 \newcommand{\mutinf}{\mathds{I}}
  56 \newcommand{\empmutinf}{\hat{\mathds{I}}}
  57 \newcommand{\entropy}{\mathds{H}}
  58 \newcommand{\empentropy}{\hat{\mathds{H}}}
  59 \newcommand{\ganG}{\mathbf{G}}
  60 \newcommand{\ganD}{\mathbf{D}}
  61 \newcommand{\ganF}{\mathbf{F}}
  62
  63 \newcommand{\dkl}{\mathds{D}_{\mathsf{KL}}}
  64 \newcommand{\djs}{\mathds{D}_{\mathsf{JS}}}
  65
  66 \newcommand*{\vertbar}{\rule[-1ex]{0.5pt}{2.5ex}}
  67 \newcommand*{\horzbar}{\rule[.5ex]{2.5ex}{0.5pt}}
  68
  69 \def\positionalencoding{\operatorname{pos-enc}}
  70 \def\concat{\operatorname{concat}}
  71 \def\crossentropy{\LL_{\operatorname{ce}}}
  72
  73 \newcommand{\separator}{\begin{center}
  74 *
  75 \end{center}}
  76
  77 \newcommand{\pic}[2]{%
  78 \hspace*{\stretch{1}}
  79 %
  80 \includegraphics[scale=0.25]{#1}
  81 %
  82 \hspace*{\stretch{1}}%
  83 }
  84
  85 \newcommand{\birdpic}[2]{%
  86 \hspace*{\stretch{1}}
  87 %
  88 \includegraphics[scale=0.35]{#1}
  89 %
  90 \hspace*{\stretch{1}}%
  91 }
  92
  93 \newenvironment{example}{%
  94
  95 \vspace*{2ex}
  96
  97 \begin{minipage}{\textwidth}
  98
  99 \setlength{\parindent}{0cm}
 100 \setlength{\parskip}{1ex}
 101 }{%
 102 \end{minipage}
 103 }
 104
 105 \begin{document}
 106
 107 \vspace*{-3ex}
 108
 109 \begin{center}
 110
 111 {\Large Self-Generated Culture}
 112
 113 Fran\c cois Fleuret
 114
 115 \today
 116
 117 \vspace*{2ex}
 118
 119 \centerline{\color{red}(work in progress, to be updated)}
 120
 121 \medskip
 122
 123 \centerline{\url{https://fleuret.org/public/culture/culture.pdf}}
 124
 125 \end{center}
 126
 127 \section{Introduction}
 128
 129 The hypothesis behind this experiment is that high-level abstract
 130 thinking is fueled by social competition.
 131
 132 A group of communicating agents that try to demonstrate their
 133 cognitive superiority would end up developing a rich and consistent
 134 culture.
 135
 136 \subsection{Setup}
 137
 138 The experiment is designed with a group of GPTs that alternately
 139 learn to solve quizzes and generate new ones.
 140
 141 A ``quiz'' is a pair composed of a prompt and a solution, both being
 142 sequences of tokens.
 143
 144 We differentiate \textbf{world quizzes} that follow pre-defined and
 145 fixed regularities, and mimic the world's physical and environmental
 146 patterns that an organism has to grasp to survive, and \textbf{culture
 147   quizzes} that are generated by the GPTs, and mimic the knowledge one
 148 has to master to perform socially.
 149
 150
 151 We train five GPTs on a very large set of ``world quizzes''
 152 generated randomly. These models are trained to generate both the
 153 solution given the prompt, and the prompt given the solution.
 154
 155 This is achieved by using for training both ``forward sequences'',
 156 composed of a token \texttt{[fwd]}, followed by the prompt's tokens,
 157 followed by another token \texttt{[fwd]}, followed by the solution's
 158 tokens, and ``backward sequences'', composed of a token \texttt{[bck]},
 159 followed by the solution's tokens, followed by another token
 160 \texttt{[bck]}, followed by the prompt's tokens.
 161
 162 \subsection{Generating Culture Quizzes}
 163
 164 When their accuracy gets above $95\%$ we generate new quizzes as follows:
 165 %
 166 \begin{enumerate}
 167
 168 \item generate a solution (without conditioning) at temperature $T=2$,
 169   then generate a prompt for that solution at temperature $T=1/2$, and
 170   then generate a solution for that prompt at temperature $T=1/2$.
 171
 172 \item generate one solution for that prompt with each of the $5$ GPTs
 173   at temperature $T=1$, if $4$ of them generate the correct solution,
 174   validate that quiz and include it in the training data.
 175
 176 \end{enumerate}
 177
 178 This criterion ensures that the new quizzes are both solvable and
 179 sophisticated, and incrementally complexify the culture. Imposing both
 180 directions prevents the generation of quizzes which are not trivial
 181 only because the prompt has been randomly degraded.
 182
 183 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 184 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 185 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 186 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 187
 188 \section{Grid Quizzes}
 189
 190 \subsection{World Quizzes}
 191
 192 We define several types of quizzes and implement algorithmic
 193 procedures to randomly generate examples of each.
 194
 195 In these quizzes, the prompt is made of three grids $A, f(A), B$ and
 196 the solution is a single grid $f(B)$.
 197
 198 \subsubsection{Half Fill}
 199
 200 \pic{pics/task_color_grow.png}{``half fill''}
 201
 202 The first grid contains three rectangles, each with a vertical or a
 203 horizontal line of another color in its middle. The second grid is
 204 identical with one of the rectangles having one half filled. The third
 205 grid contains three rectangles of the same colors as in the first grid,
 206 of different sizes and locations. The solution is obtained by filling
 207 similarly one half of a rectangle of the third image.
 208
 209 \subsubsection{Detect}
 210
 211 \pic{pics/task_detect.png}{``detect''}
 212
 213 The first grid contains three rectangles, the second has two pixels of
 214 the same colors located in the top-left corners of two of them. The
 215 solution is obtained by marking in the fourth image the top-left
 216 corners of the rectangles of the same colors in the third.
 217
 218 \subsubsection{Frame}
 219
 220 \pic{pics/task_frame.png}{``frame''}
 221
 222 The first grid contains three rectangles, and the second is identical
 223 except that one rectangle has been replaced by its frame. The same
 224 should be done to the similarly colored rectangles of the third grid
 225 to obtain the solution.
 226
 227 \subsubsection{Grow}
 228
 229 \pic{pics/task_grow.png}{``grow''}
 230
 231 The first grid contains three rectangles, one of them getting one
 232 pixel thicker or thinner in the second. The same should be done to the
 233 similarly colored rectangles of the third grid to get the solution.
 234
 235 \subsubsection{Replace color}
 236
 237 \pic{pics/task_replace_color.png}{``replace color''}
 238
 239 The first grid contains three rectangles, the second is obtained by
 240 changing one of the colors. The same should be done to the third grid
 241 to obtain the solution.
 242
 243 \subsubsection{Translate}
 244
 245 \pic{pics/task_translate.png}{``translate''}
 246
 247 The first grid contains three rectangles. The second is obtained by
 248 displacing one of them by one pixel in both directions. The solution is
 249 obtained by applying the same motion to the similarly colored
 250 rectangle in the third grid.
 251
 252 %% \subsubsection{Bounce}
 253
 254 %% \pic{pics/task_bounce.png}{``bounce''}
 255
 256 %% The solution should join the two pixels of same color, with a path of
 257 %% another color, starting in the direction indicated by a pixel of that
 258 %% color, and changing direction only when colliding with a pixel of a
 259 %% third color or one of the lattice border.
 260
 261 %% \subsubsection{count}
 262
 263 %% \pic{pics/task_count.png}{``count''}
 264
 265 %% \subsubsection{scale}
 266
 267 %% \pic{pics/task_scale.png}{``scale''}
 268
 269 %% \subsubsection{trajectory}
 270
 271 %% \pic{pics/task_trajectory.png}{``trajectory''}
 272
 273 \subsection{Culture Quizzes}
 274
 275 We list here some generated quizzes that exhibit features that were not present in the ``world quizzes'' used for training.
 276
 277 \bigskip
 278
 279 \begin{example}
 280
 281 \pic{pics/culture_c_quiz_0078_N4_validated/quiz_01.png}{0078/01}
 282
 283 \pic{pics/culture_c_quiz_0078_N4_validated/quiz_02.png}{0078/02}
 284
 285 Added distractors.
 286
 287 \end{example}
 288
 289 \separator
 290
 291 \begin{example}
 292
 293 \pic{pics/culture_c_quiz_0110_N4_validated/quiz_63.png}{0110/63}
 294
 295 The quizzes ``frame'' and ``half fill'' have been combined in a single
 296 quiz.
 297
 298 \end{example}
 299
 300 \separator
 301
 302 \begin{example}
 303
 304 \pic{pics/culture_c_quiz_0087_N4_validated/quiz_62.png}{0087/62}
 305
 306 \pic{pics/culture_c_quiz_0102_N4_validated/quiz_04.png}{0102/04}
 307
 308 \pic{pics/culture_c_quiz_0102_N4_validated/quiz_11.png}{0102/11}
 309
 310 \pic{pics/culture_c_quiz_0108_N4_validated/quiz_31.png}{0108/31}
 311
 312 Variation of ``Detect'' with location markers colored according to the
 313 color of the rectangle they mark.
 314
 315 \end{example}
 316
 317 \separator
 318
 319 \begin{example}
 320
 321 \pic{pics/culture_c_quiz_0078_N4_validated/quiz_16.png}{0078/16}
 322
 323 \pic{pics/culture_c_quiz_0084_N4_validated/quiz_21.png}{0084/21}
 324
 325 \pic{pics/culture_c_quiz_0078_N4_validated/quiz_42.png}{0078/42}
 326
 327 \pic{pics/culture_c_quiz_0089_N4_validated/quiz_28.png}{0089/28}
 328
 329 \pic{pics/culture_c_quiz_0084_N4_validated/quiz_00.png}{0084/00}
 330
 331 Variations of ``Half Fill'', ``Detect'', ``Translate'', ``Grow'', and
 332 ``Frame'' with a number of rectangles not equal to three.
 333
 334 \end{example}
 335
 336 \separator
 337
 338 \begin{example}
 339
 340 \pic{pics/culture_c_quiz_0078_N4_validated/quiz_27.png}{0078/27}
 341
 342 \pic{pics/culture_c_quiz_0078_N4_validated/quiz_18.png}{0078/18}
 343
 344 \pic{pics/culture_c_quiz_0086_N4_validated/quiz_45.png}{0086/45}
 345
 346 \pic{pics/culture_c_quiz_0078_N4_validated/quiz_37.png}{0078/37}
 347
 348 Variations of ``Half Fill'' where the shapes to change have more
 349 complex coloring.
 350
 351 \end{example}
 352
 353 \separator
 354
 355 \begin{example}
 356
 357 \pic{pics/culture_c_quiz_0078_N4_validated/quiz_30.png}{0078/30}
 358
 359 Variation of ``Translate'' where the moving part is occluded, which
 360 was never the case.
 361
 362 \end{example}
 363
 364 \separator
 365
 366 \begin{example}
 367
 368 \pic{pics/culture_c_quiz_0078_N4_validated/quiz_31.png}{0078/31}
 369
 370 \pic{pics/culture_c_quiz_0084_N4_validated/quiz_10.png}{0084/10}
 371
 372 \pic{pics/culture_c_quiz_0084_N4_validated/quiz_12.png}{0084/12}
 373
 374 \pic{pics/culture_c_quiz_0086_N4_validated/quiz_23.png}{0086/23}
 375
 376 \pic{pics/culture_c_quiz_0086_N4_validated/quiz_28.png}{0086/28}
 377
 378 Variations of ``Half Fill'' with non-rectangular shapes.
 379
 380 \end{example}
 381
 382 \separator
 383
 384 \begin{example}
 385
 386 \pic{pics/culture_c_quiz_0078_N4_validated/quiz_60.png}{0078/60}
 387
 388 \pic{pics/culture_c_quiz_0084_N4_validated/quiz_41.png}{0084/41}
 389
 390 \pic{pics/culture_c_quiz_0084_N4_validated/quiz_49.png}{0084/49}
 391
 392 \pic{pics/culture_c_quiz_0086_N4_validated/quiz_04.png}{0086/04}
 393
 394 Variations of ``Half Fill'' where two colors or two rectangles have to
 395 be modified.
 396
 397 \end{example}
 398
 399 \separator
 400
 401 \begin{example}
 402
 403 \pic{pics/culture_c_quiz_0111_N4_validated/quiz_23.png}{0111/23}
 404
 405 Variation of ``Frame'' with no rectangle of adequate size to be
 406 modified.
 407
 408 \end{example}
 409
 410 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 411 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 412 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 413 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 414
 415 \section{Bird World}
 416
 417 These results were obtained with a slightly different procedure. In
 418 particular the quizzes were validated if the models could predict both
 419 the solution from the prompt and the prompt from the solution. We
 420 report them since they exhibit the same patterns of generalization
 421 although they are quite different.
 422
 423 \subsection{World Quizzes}
 424
 425 The initial set of quizzes consists of predicting the dynamics of a
 426 very simple world: A $6 \times 8$ grid with three colored ``birds'' moving in
 427 a straight line, possibly bouncing on the grid's borders. There are
 428 ten different colors.
 429 %
 430 \birdpic{pics/examples_train.png}{}
 431 %
 432
 433 In each of these quizzes, $A$ is the left image serialized in
 434 raster-scan order as a sequence of $6 \times 8 = 48$ tokens, $d$ is
 435 either the token ``forward'' or the token ``backward'', and $B$ is the
 436 right image, also serialized. The direction of prediction is chosen at
 437 random.
 438
 439 \subsection{Culture quizzes}
 440
 441 This procedure results in the discovery of patterns which are not
 442 present in the original quizzes:
 443
 444 \begin{example}
 445
 446 \birdpic{pics/4_birds_1.png}{}
 447
 448 \birdpic{pics/5_birds_1.png}{}
 449
 450 \birdpic{pics/6_birds_1.png}{}
 451
 452 More birds.
 453
 454 \end{example}
 455
 456 \separator
 457
 458 \begin{example}
 459
 460 \birdpic{pics/other_shapes_2.png}{}
 461
 462 \birdpic{pics/other_shapes_3.png}{}
 463
 464 New bird shapes.
 465
 466 \end{example}
 467
 468 \separator
 469
 470 \begin{example}
 471
 472 \birdpic{pics/other_shapes_1.png}{}
 473
 474 \birdpic{pics/occlusions_1.png}{}
 475
 476 Occlusions.
 477
 478 \end{example}
 479
 480 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 481 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 482 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 483 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 484
 485 \section{Various thoughts}
 486
 487 \begin{itemize}
 488
 489 \item The whole process can be envisioned as natural selection of
 490   quizzes in the representation landscape of GPTs. There probably is a
 491   subtle relation between the temperature (mutation rate) and the
 492   number of models used to validate with the ``all but one'' criterion
 493   (survival criterion).
 494
 495 \item The ``all but one'' could be ``all but K'', and there may be
 496   some information-theoretical thing, where the goal is to maximize
 497   mutual information, with $K=N$ being total randomness, so high
 498   entropy but no structure, and $K=0$ being total determinism, so no
 499   information to share.
 500
 501 \item The setup does not push toward any specific invariance or
 502   property in the generated quizzes; their consistency is entirely due
 503   to the statistics of the ``world quizzes'' that remain in the
 504   training set, and to the GPTs' inductive biases.
 505
 506 \item The GPTs obviously get a sense of objectness and 2d topology
 507   early on, since they rapidly increase the number of birds and
 508   ``discover'' occlusion even though it never was in the world
 509   quizzes.
 510
 511 \item There may not be so many problems that can be cast as pairs of
 512   patterns that are each a deterministic function of the other, which
 513   is probably critical here.
 514
 515 \item This overall process probably fights the ``simplicity bias'': If
 516   a model is lacking a ``cue'' that the others have, there will
 517   rapidly be quizzes that require this cue, they will be added to the
 518   training data, and that model will catch up.
 519
 520 \item The randomness of the process probably allows going even beyond
 521   just synchronizing the abilities of the models. There may be some
 522   additional complexification of quizzes that get accepted by chance.
 523
 524 \item It can be parallelized by dispatching the GPTs across multiple
 525   nodes, and avoiding a quadratic cost by limiting the validation of
 526   the quizzes to a subset of them.
 527
 528 \item The current process to generate new quizzes, which simply
 529   samples them at random, is very rudimentary and probably not
 530   sufficient in a real-data setup. It can probably be supplemented
 531   with an MCTS-type search.
 532
 533 \item There may be already in the generated quizzes some structure
 534   that \emph{we} do not pick up (e.g.\ certain color or motion
 535   patterns).
 536
 537 \end{itemize}
 538
 539 \section*{Appendix}
 540
 541 The code is available at
 542
 543 \medskip
 544
 545 \centerline{\url{https://fleuret.org/git/culture}}
 546
 547 The experiments are done with a RTX 4090.
 548
 549 The GPT used has 37M parameters and the following structure:
 550
 551 \begin{center}
 552 \begin{tabular}{lc}
 553     \texttt{dim\_model}  & 512  \\
 554     \texttt{dim\_keys}   & 64   \\
 555     \texttt{dim\_hidden} & 2048 \\
 556     \texttt{nb\_heads}   & 8    \\
 557     \texttt{nb\_blocks}  & 12
 558 \end{tabular}
 559 \end{center}
 560
 561 Adam, $\eta = 10^{-4}$, no scheduling.
 562
 563 There are $N_{\text{train}}=250'000$ original quizzes for training and
 564 $N_{\text{test}} = 10'000$ for test.
 565
 566 At each epoch, for both train and test samples, we mix original
 567 quizzes and the generated ones.
 568
 569 For training for instance, if there are fewer than $N_{\text{train}}/2$
 570 new quizzes, we take all of them, otherwise we sample
 571 $N_{\text{train}}/2$ of them without replacement, and then we sample
 572 without replacement enough original quizzes to get $N_{\text{train}}$
 573 samples in total.
 574
 575 We proceed similarly to get $N_{\text{test}}$ samples for test.
 576
 577 \end{document}