picocrafter.py

   1 #!/usr/bin/env python
   2
   3 #########################################################################
   4 # This program is free software: you can redistribute it and/or modify  #
   5 # it under the terms of the version 3 of the GNU General Public License #
   6 # as published by the Free Software Foundation.                         #
   7 #                                                                       #
   8 # This program is distributed in the hope that it will be useful, but   #
   9 # WITHOUT ANY WARRANTY; without even the implied warranty of            #
  10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU      #
  11 # General Public License for more details.                              #
  12 #                                                                       #
  13 # You should have received a copy of the GNU General Public License     #
  14 # along with this program. If not, see <http://www.gnu.org/licenses/>.  #
  15 #                                                                       #
  16 # Written by and Copyright (C) Francois Fleuret                         #
  17 # Contact <francois.fleuret@unige.ch> for comments & bug reports        #
  18 #########################################################################
  19
  20 # This is a tiny rogue-like environment implemented with tensor
  21 # operations, that runs in batches efficiently on a GPU. On a RTX4090
  22 # it can initialize ~20k environments per second and run ~40k
  23 # iterations.
  24 #
  25 # The environment is a rectangular area with walls "#" dispatched
  26 # randomly. The agent "@" can perform five actions: move "NESW" or be
  27 # immobile "I".
  28 #
  29 # There are monsters "$" moving randomly. The agent gets hit by every
  30 # monster present in one of the 4 direct neighborhoods at the end of
  31 # the moves; each hit results in a reward of -1.
  32 #
  33 # The agent starts with 5 life points, each hit costs it 1pt, when it
  34 # gets to 0 it dies, gets a reward of -10 and the episode is over. At
  35 # every step it recovers 1/20th of a life point, with a maximum of
  36 # 5pt.
  37 #
  38 # The agent can carry "keys" ("a", "b", "c") that open "vaults" ("A",
  39 # "B", "C"). The keys and vaults can only be used in sequence:
  40 # initially the agent can move only to free spaces, or to the "a", in
  41 # which case the key is removed from the environment and the agent now
  42 # carries it, it appears in the inventory at the bottom of the frame,
  43 # and the agent can now move to free spaces or the "A". When it moves
  44 # to the "A", it gets a reward, loses the "a", the "A" is removed from
  45 # the environment, but the agent can now move to the "b", etc. Rewards
  46 # are 1 for "A" and "B" and 10 for "C".
  47
  48 ######################################################################
  49
  50 import torch
  51
  52 from torch.nn.functional import conv2d
  53
  54 ######################################################################
  55
  56
  57 def to_ansi(s):
  58     if type(s) is list:
  59         return [to_ansi(x) for x in s]
  60
  61     for u, c in [("$", 31), ("@", 32)] + [(x, 36) for x in "aAbBcC"]:
  62         s = s.replace(u, f"\u001b[{c}m{u}\u001b[0m")
  63
  64     return s
  65
  66
  67 def to_unicode(s):
  68     if type(s) is list:
  69         return [to_unicode(x) for x in s]
  70
  71     for u, c in [("#", "█"), ("+", "░"), ("|", "│")]:
  72         s = s.replace(u, c)
  73
  74     return s
  75
  76
  77 def fusion_multi_lines(l, width_min=0):
  78     l = [x if type(x) is str else str(x) for x in l]
  79
  80     l = [x.split("\n") for x in l]
  81
  82     def center(r, w):
  83         k = w - len(r)
  84         return " " * (k // 2) + r + " " * (k - k // 2)
  85
  86     def f(o, h):
  87         w = max(width_min, max([len(r) for r in o]))
  88         return [" " * w] * (h - len(o)) + [center(r, w) for r in o]
  89
  90     h = max([len(x) for x in l])
  91     l = [f(o, h) for o in l]
  92
  93     return "\n".join(["|".join([o[k] for o in l]) for k in range(h)])
  94
  95
  96 class PicroCrafterEnvironment:
  97     def __init__(
  98         self,
  99         world_height=27,
 100         world_width=27,
 101         nb_walls=27,
 102         world_margin=2,
 103         view_height=5,
 104         view_width=5,
 105         device=torch.device("cpu"),
 106     ):
 107         assert (world_height - 2 * world_margin) % (view_height - 2 * world_margin) == 0
 108         assert (world_width - 2 * world_margin) % (view_width - 2 * world_margin) == 0
 109
 110         self.device = device
 111
 112         self.world_height = world_height
 113         self.world_width = world_width
 114         self.world_margin = world_margin
 115         self.view_height = view_height
 116         self.view_width = view_width
 117         self.nb_walls = nb_walls
 118         self.life_level_max = 5
 119         self.life_level_gain_100th = 5
 120         self.reward_per_hit = -1
 121         self.reward_death = -10
 122
 123         self.tiles = " +#@$aAbBcC-" + "".join(
 124             [str(n) for n in range(self.life_level_max + 1)]
 125         )
 126         self.tile2id = dict([(t, n) for n, t in enumerate(self.tiles)])
 127         self.id2tile = dict([(n, t) for n, t in enumerate(self.tiles)])
 128
 129         self.next_object = dict(
 130             [
 131                 (self.tile2id[s], self.tile2id[t])
 132                 for (s, t) in [
 133                     ("a", "A"),
 134                     ("A", "b"),
 135                     ("b", "B"),
 136                     ("B", "c"),
 137                     ("c", "C"),
 138                     ("C", "-"),
 139                 ]
 140             ]
 141         )
 142
 143         self.object_reward = dict(
 144             [
 145                 (self.tile2id[t], r)
 146                 for (t, r) in [
 147                     ("a", 0),
 148                     ("A", 1),
 149                     ("b", 0),
 150                     ("B", 1),
 151                     ("c", 0),
 152                     ("C", 10),
 153                 ]
 154             ]
 155         )
 156
 157         self.accessible_object_to_inventory = dict(
 158             [
 159                 (self.tile2id[s], self.tile2id[t])
 160                 for (s, t) in [
 161                     ("a", " "),
 162                     ("A", "a"),
 163                     ("b", " "),
 164                     ("B", "b"),
 165                     ("c", " "),
 166                     ("C", "c"),
 167                     ("-", " "),
 168                 ]
 169             ]
 170         )
 171
 172     def reset(self, nb_agents):
 173         self.worlds = self.create_worlds(
 174             nb_agents,
 175             self.world_height,
 176             self.world_width,
 177             self.nb_walls,
 178             self.world_margin,
 179         ).to(self.device)
 180         self.life_level_in_100th = torch.full(
 181             (nb_agents,), self.life_level_max * 100 + 99, device=self.device
 182         )
 183         self.accessible_object = torch.full(
 184             (nb_agents,), self.tile2id["a"], device=self.device
 185         )
 186
 187     def create_mazes(self, nb, height, width, nb_walls):
 188         m = torch.zeros(nb, height, width, dtype=torch.int64, device=self.device)
 189         m[:, 0, :] = 1
 190         m[:, -1, :] = 1
 191         m[:, :, 0] = 1
 192         m[:, :, -1] = 1
 193
 194         i = torch.arange(height, device=m.device)[None, :, None]
 195         j = torch.arange(width, device=m.device)[None, None, :]
 196
 197         for _ in range(nb_walls):
 198             q = torch.rand(m.size(), device=m.device).flatten(1).sort(-1).indices * (
 199                 (1 - m) * (i % 2 == 0) * (j % 2 == 0)
 200             ).flatten(1)
 201             q = (q == q.max(dim=-1, keepdim=True).values).long().view(m.size())
 202             a = q[:, None].expand(-1, 4, -1, -1).clone()
 203             a[:, 0, :-1, :] += q[:, 1:, :]
 204             a[:, 0, :-2, :] += q[:, 2:, :]
 205             a[:, 1, 1:, :] += q[:, :-1, :]
 206             a[:, 1, 2:, :] += q[:, :-2, :]
 207             a[:, 2, :, :-1] += q[:, :, 1:]
 208             a[:, 2, :, :-2] += q[:, :, 2:]
 209             a[:, 3, :, 1:] += q[:, :, :-1]
 210             a[:, 3, :, 2:] += q[:, :, :-2]
 211             a = a[
 212                 torch.arange(a.size(0), device=a.device),
 213                 torch.randint(4, (a.size(0),), device=a.device),
 214             ]
 215             m = (m + q + a).clamp(max=1)
 216
 217         return m
 218
 219     def create_worlds(self, nb, height, width, nb_walls, world_margin=2):
 220         world_margin -= 1  # The maze adds a wall all around
 221         m = self.create_mazes(
 222             nb, height - 2 * world_margin, width - 2 * world_margin, nb_walls
 223         )
 224         q = m.flatten(1)
 225         z = "@aAbBcC$$$$$"  # What to add to the maze
 226         u = torch.rand(q.size(), device=q.device) * (1 - q)
 227         r = u.sort(dim=-1, descending=True).indices[:, : len(z)]
 228
 229         q *= self.tile2id["#"]
 230         q[
 231             torch.arange(q.size(0), device=q.device)[:, None].expand_as(r), r
 232         ] = torch.tensor([self.tile2id[c] for c in z], device=q.device)[None, :]
 233
 234         if world_margin > 0:
 235             r = m.new_full(
 236                 (m.size(0), m.size(1) + world_margin * 2, m.size(2) + world_margin * 2),
 237                 self.tile2id["+"],
 238             )
 239             r[:, world_margin:-world_margin, world_margin:-world_margin] = m
 240             m = r
 241         return m
 242
 243     def nb_actions(self):
 244         return 5
 245
 246     def action2str(self, n):
 247         if n >= 0 and n < 5:
 248             return "INESW"[n]
 249         else:
 250             return "?"
 251
 252     def nb_state_token_values(self):
 253         return len(self.tiles)
 254
 255     def min_max_reward(self):
 256         return (
 257             min(4 * self.reward_per_hit, self.reward_death),
 258             max(self.object_reward.values()),
 259         )
 260
 261     def step(self, actions):
 262         a = (self.worlds == self.tile2id["@"]).nonzero()
 263         self.worlds[a[:, 0], a[:, 1], a[:, 2]] = self.tile2id[" "]
 264         s = torch.tensor([[0, 0], [-1, 0], [0, 1], [1, 0], [0, -1]], device=self.device)
 265         b = a.clone()
 266         b[:, 1:] = b[:, 1:] + s[actions[b[:, 0]]]
 267         # position is empty
 268         o = (self.worlds[b[:, 0], b[:, 1], b[:, 2]] == self.tile2id[" "]).long()
 269         # or it is the next accessible object
 270         q = (
 271             self.worlds[b[:, 0], b[:, 1], b[:, 2]] == self.accessible_object[b[:, 0]]
 272         ).long()
 273         o = (o + q).clamp(max=1)[:, None]
 274         b = (1 - o) * a + o * b
 275         self.worlds[b[:, 0], b[:, 1], b[:, 2]] = self.tile2id["@"]
 276
 277         qq = q
 278         q = qq.new_zeros((self.worlds.size(0),) + qq.size()[1:])
 279         q[b[:, 0]] = qq
 280
 281         nb_hits = self.monster_moves()
 282
 283         alive_before = self.life_level_in_100th >= 100
 284
 285         self.life_level_in_100th[alive_before] = (
 286             self.life_level_in_100th[alive_before]
 287             + self.life_level_gain_100th
 288             - nb_hits[alive_before] * 100
 289         ).clamp(max=self.life_level_max * 100 + 99)
 290
 291         alive_after = self.life_level_in_100th >= 100
 292
 293         self.worlds[torch.logical_not(alive_after)] = self.tile2id["#"]
 294
 295         reward = nb_hits * self.reward_per_hit
 296
 297         for i in range(q.size(0)):
 298             if q[i] == 1:
 299                 reward[i] += self.object_reward[self.accessible_object[i].item()]
 300                 self.accessible_object[i] = self.next_object[
 301                     self.accessible_object[i].item()
 302                 ]
 303
 304         reward = (
 305             alive_after.long() * reward
 306             + alive_before.long() * (1 - alive_after.long()) * self.reward_death
 307         )
 308         inventory = torch.tensor(
 309             [
 310                 self.accessible_object_to_inventory[s.item()]
 311                 for s in self.accessible_object
 312             ]
 313         )
 314
 315         self.life_level_in_100th = (
 316             self.life_level_in_100th
 317             * (self.accessible_object != self.tile2id["-"]).long()
 318         )
 319
 320         reward[torch.logical_not(alive_before)] = 0
 321
 322         return reward, inventory, self.life_level_in_100th // 100
 323
 324     def monster_moves(self):
 325         # Current positions of the monsters
 326         m = (self.worlds == self.tile2id["$"]).long().flatten(1)
 327
 328         # Total number of monsters
 329         n = m.sum(-1).max()
 330
 331         # Create a tensor with one channel per monster
 332         r = (
 333             (torch.rand(m.size(), device=m.device) * m)
 334             .sort(dim=-1, descending=True)
 335             .indices[:, :n]
 336         )
 337         o = m.new_zeros((m.size(0), n) + m.size()[1:])
 338         i = torch.arange(o.size(0), device=o.device)[:, None].expand(-1, o.size(1))
 339         j = torch.arange(o.size(1), device=o.device)[None, :].expand(o.size(0), -1)
 340         o[i, j, r] = 1
 341         o = o * m[:, None]
 342
 343         # Create the tensor of possible motions
 344         o = o.view((self.worlds.size(0), n) + self.worlds.flatten(1).size()[1:])
 345         move_kernel = torch.tensor(
 346             [[[[0.0, 1.0, 0.0], [1.0, 1.0, 1.0], [0.0, 1.0, 0.0]]]], device=o.device
 347         )
 348
 349         p = (
 350             conv2d(
 351                 o.view(
 352                     o.size(0) * o.size(1), 1, self.worlds.size(-2), self.worlds.size(-1)
 353                 ).float(),
 354                 move_kernel,
 355                 padding=1,
 356             ).view(o.size())
 357             == 1.0
 358         ).long()
 359
 360         # Let's do the moves per say
 361         i = torch.arange(self.worlds.size(0), device=self.worlds.device)[
 362             :, None
 363         ].expand_as(r)
 364
 365         for n in range(p.size(1)):
 366             u = o[:, n].sort(dim=-1, descending=True).indices[:, :1]
 367             q = p[:, n] * (self.worlds.flatten(1) == self.tile2id[" "]) + o[:, n]
 368             r = (
 369                 (q * torch.rand(q.size(), device=q.device))
 370                 .sort(dim=-1, descending=True)
 371                 .indices[:, :1]
 372             )
 373             self.worlds.flatten(1)[i, u] = self.tile2id[" "]
 374             self.worlds.flatten(1)[i, r] = self.tile2id["$"]
 375
 376         nb_hits = (
 377             (
 378                 conv2d(
 379                     (self.worlds == self.tile2id["$"]).float()[:, None],
 380                     move_kernel,
 381                     padding=1,
 382                 )
 383                 .long()
 384                 .squeeze(1)
 385                 * (self.worlds == self.tile2id["@"]).long()
 386             )
 387             .flatten(1)
 388             .sum(-1)
 389         )
 390
 391         return nb_hits
 392
 393     def state_size(self):
 394         return (self.view_height + 1) * self.view_width
 395
 396     def state(self):
 397         i_height, i_width = (
 398             self.view_height - 2 * self.world_margin,
 399             self.view_width - 2 * self.world_margin,
 400         )
 401         a = (self.worlds == self.tile2id["@"]).nonzero()
 402         y = i_height * ((a[:, 1] - self.world_margin) // i_height)
 403         x = i_width * ((a[:, 2] - self.world_margin) // i_width)
 404         n = a[:, 0][:, None, None].expand(-1, self.view_height, self.view_width)
 405         i = (
 406             torch.arange(self.view_height, device=a.device)[None, :, None]
 407             + y[:, None, None]
 408         ).expand_as(n)
 409         j = (
 410             torch.arange(self.view_width, device=a.device)[None, None, :]
 411             + x[:, None, None]
 412         ).expand_as(n)
 413         v = self.worlds.new_full(
 414             (self.worlds.size(0), self.view_height + 1, self.view_width),
 415             self.tile2id["#"],
 416         )
 417
 418         v[a[:, 0], : self.view_height] = self.worlds[n, i, j]
 419
 420         v[:, self.view_height] = self.tile2id["-"]
 421         v[:, self.view_height, 0] = self.tile2id["0"] + (
 422             self.life_level_in_100th // 100
 423         ).clamp(min=0, max=self.life_level_max)
 424         v[:, self.view_height, 1] = torch.tensor(
 425             [
 426                 self.accessible_object_to_inventory[o.item()]
 427                 for o in self.accessible_object
 428             ],
 429             device=v.device,
 430         )
 431
 432         return v.flatten(1), self.life_level_in_100th >= 100
 433
 434     def state2str(self, t, width=None):
 435         def tile(n):
 436             n = n.item()
 437             if n in self.id2tile:
 438                 return self.id2tile[n]
 439             else:
 440                 return "?"
 441
 442         if t.dim() == 2:
 443             return [self.state2str(r, width) for r in t]
 444
 445         if width is None:
 446             width = self.view_width
 447
 448         t = t.reshape(-1, width)
 449
 450         t = "\n".join(["".join([tile(n) for n in r]) for r in t])
 451
 452         return t
 453
 454
 455 ######################################################################
 456
 457 if __name__ == "__main__":
 458     import os, time, sys
 459
 460     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 461
 462     # char_conv = lambda x: x
 463     char_conv = to_unicode
 464
 465     # nb_agents, nb_iter, display = 1000, 1000, False
 466     # ansi_term = False
 467
 468     nb_agents, nb_iter, display = 4, 10000, True
 469     ansi_term = True
 470
 471     if ansi_term:
 472         char_conv = lambda x: to_ansi(to_unicode(x))
 473
 474     start_time = time.perf_counter()
 475     environment = PicroCrafterEnvironment(
 476         world_height=27,
 477         world_width=27,
 478         nb_walls=35,
 479         view_height=9,
 480         view_width=9,
 481         world_margin=4,
 482         device=device,
 483     )
 484
 485     environment.reset(nb_agents)
 486
 487     print(f"timing {nb_agents/(time.perf_counter() - start_time)} init per s")
 488
 489     start_time = time.perf_counter()
 490
 491     stop = 0
 492     for k in range(nb_iter):
 493         if display:
 494             if ansi_term:
 495                 to_print = "\u001bc"
 496                 # print("\u001b[2J")
 497             else:
 498                 to_print = ""
 499                 os.system("clear")
 500
 501             l = environment.state2str(
 502                 environment.worlds.flatten(1), width=environment.world_width
 503             )
 504
 505             to_print += char_conv(fusion_multi_lines(l)) + "\n\n"
 506
 507         state, alive = environment.state()
 508         action = alive * torch.randint(
 509             environment.nb_actions(), (nb_agents,), device=device
 510         )
 511
 512         rewards, inventories, life_levels = environment.step(action)
 513
 514         if display:
 515             l = environment.state2str(state)
 516             l = [
 517                 v + f"\n{environment.action2str(a.item())}/{r: 3d}"
 518                 for (v, a, r) in zip(l, action, rewards)
 519             ]
 520
 521             to_print += (
 522                 char_conv(fusion_multi_lines(l, width_min=environment.world_width))
 523                 + "\n"
 524             )
 525
 526             print(to_print)
 527             sys.stdout.flush()
 528             time.sleep(0.25)
 529
 530         if (life_levels > 0).long().sum() == 0:
 531             stop += 1
 532             if stop == 10:
 533                 break
 534
 535     print(f"timing {(nb_agents*k)/(time.perf_counter() - start_time)} iteration per s")