--- /dev/null
+#!/usr/bin/env python
+
+import time, torch
+
+# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+# Benchmark size: nb_runs products of a (d1 x d2) matrix by a (d2 x d3) one.
+nb_runs = 10000
+d1, d2, d3 = 50000, 256, 512
+
+a, b = torch.rand(d1, d2).to(device), torch.rand(d2, d3).to(device)
+
+# One untimed product so that one-off start-up costs (e.g. lazy library
+# initialisation on the first call) are not charged to the measurement.
+c = a @ b
+
+# CUDA kernels are queued asynchronously: without an explicit
+# synchronization the clock would only measure the time needed to launch
+# the kernels, not the time needed to execute them.
+if device.type == 'cuda':
+    torch.cuda.synchronize(device)
+
+start_time = time.perf_counter()
+for k in range(nb_runs):
+    c = a @ b
+
+# Wait for every queued product to actually finish before stopping the clock.
+if device.type == 'cuda':
+    torch.cuda.synchronize(device)
+duration = time.perf_counter() - start_time
+
+# Operation count: d1 * d2 * d3 multiply-adds per product, each
+# multiply-add being counted as a single operation.
+nb_flop = float(nb_runs * d1 * d2 * d3)
+speed = nb_flop / duration
+
+# Scale the rate down by factors of 1000 until it is below 1000 or the
+# largest prefix is reached. The prefix is advanced together with each
+# division, so the printed value and unit always agree, even when the rate
+# exceeds the range covered by the prefix list.
+prefixes = ['', 'K', 'M', 'G', 'T', 'P']
+u = prefixes[0]
+for next_prefix in prefixes[1:]:
+    if speed < 1e3:
+        break
+    speed /= 1e3
+    u = next_prefix
+
+print(f'{speed:.02f} {u}flops on {device}')
+