evo_batch.py
'''
Launch a batch of experiments on a SLURM cluster.
WARNING: This will kill all ray processes running on the current node after each experiment, to avoid memory issues from dead processes.
'''
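# Example invocations (a sketch; the script name and flag spellings mirror the argparse options
# defined at the bottom of this file):
#   python evo_batch.py -ex my_batch --local           # quick local test (minimal number of generations)
#   python evo_batch.py -ex my_batch                   # launch the full batch of training runs on SLURM
#   python evo_batch.py -ex my_batch --evaluate        # cross-evaluate trained models on evolved maps
#   python evo_batch.py -ex my_batch --vis_cross_eval  # re-plot heatmaps from saved eval data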
from pdb import set_trace as TT
import argparse
import copy
import itertools
import json
import matplotlib
import scipy.stats
from matplotlib import pyplot as plt
import numpy as np
import pickle
import os
import re
import time
from collections import Counter
from forge.blade.core.terrain import MapGenerator, Save
from evolution.plot_diversity import heatmap, annotate_heatmap
from projekt import config
from projekt.config import get_experiment_name
from evolution.diversity import get_div_calc, get_pop_stats
from evolution.utils import get_exp_shorthand, get_eval_map_inds
##### HYPER-PARAMETERS #####
genomes = [
'Baseline',
'RiverBottleneckBaseline',
'ResourceNichesBaseline',
'BottleneckedResourceNichesBaseline',
'LabyrinthBaseline',
'Simplex',
'NCA',
'TileFlip',
'CPPN',
'Primitives',
'L-System',
'All',
]
generator_objectives = [
'Lifespans',
'Differential',
'FarNearestNeighbor',
'AdversityDiversityTrgs',
# 'L2',
# 'Hull',
# 'Sum',
# 'Discrete',
# 'CloseNearestNeighbor',
# 'InvL2',
# 'AdversityDiversity',
# 'MapTestText',
]
skills = [
'ALL',
# 'HARVEST',
# 'COMBAT',
# 'EXPLORATION',
]
algos = [
'MAP-Elites',
# 'Simple',
# 'CMAES',
# 'CMAME',
# 'NEAT',
]
me_bin_sizes = [
# [1,1],
[50, 50],
# [100,100],
]
# Are we running a PAIRED-type algorithm? If so, we use two policies, and reward the generator for maximizing the
# difference in terms of the generator_objective between the "protagonist" and "antagonist" policies.
PAIRED_bools = [
# True,
False
]
adv_div_ratios = [.5]
# adv_div_ratios = np.arange(0, 1.01, 1/6) # this gets stretched to [-1, 1] and used to shrink one objective or the other
# For "AdversityDiversityTrgs" -- how long should agents live, how diverse should they be
adv_trgs = [
0,
1/5,
2/5,
3/5,
4/5,
1,
]
div_trgs = [
0,
1/5,
2/5,
3/5,
4/5,
1,
]
adv_div_trgs = [i for i in itertools.product(adv_trgs, div_trgs)]
##########################
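# Note: the experiment grid launched in launch_batch() below is (roughly) the Cartesian product of
# the lists above, i.e. itertools.product(genomes, generator_objectives, skills, algos,
# me_bin_sizes, PAIRED_bools), with the AdversityDiversity(Trgs) settings further expanded over
# adv_div_ratios / adv_div_trgs, and Baseline genomes restricted to the 'Lifespans' objective.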
# TODO: use this variable in the eval command string. Formatting might be weird.
SKILLS = ['constitution', 'fishing', 'hunting', 'range', 'mage', 'melee', 'defense', 'woodcutting', 'mining', 'exploration',]
DIV_CALCS = ['L2', 'Differential', 'Hull',
#'Discrete',
'FarNearestNeighbor',
'Sum']
global eval_args
global EVALUATION_HORIZON
global TERRAIN_BORDER # Assuming this is the same for all experiments!
global MAP_GENERATOR # Also tile-set
global N_EVAL_MAPS
global N_MAP_EVALS
TERRAIN_BORDER = None
MAP_GENERATOR = None
def launch_cmd(new_cmd, i):
with open(sbatch_file, 'r') as f:
content = f.read()
job_name = 'nmmo_'
if EVALUATE:
job_name += 'eval_'
job_name += str(i)
content = re.sub(r'nmmo_(eval_)?\d+', job_name, content)
content = re.sub(r'#SBATCH --time=\d+:', '#SBATCH --time={}:'.format(JOB_TIME), content)
content = re.sub(r'#SBATCH --cpus-per-task=\d+:', '#SBATCH --cpus-per-task={}:'.format(JOB_CPUS), content)
new_cmd = '\n' + new_cmd
new_content = re.sub('\n.*python Forge.*', new_cmd, content)
with open(sbatch_file, 'w') as f:
f.write(new_content)
if LOCAL:
os.system(new_cmd)
if not (opts.vis_maps or opts.vis_cross_eval):
os.system('ray stop')
else:
os.system('sbatch {}'.format(sbatch_file))
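# For reference, the regex substitutions in launch_cmd assume the sbatch template (evo_train.sh or
# evo_train_cpu.sh) contains lines roughly of the form below (a sketch, not the actual template;
# note in particular that the cpus-per-task pattern above only matches if the template writes a
# trailing ':' after the CPU count):
#   #SBATCH --job-name=nmmo_0
#   #SBATCH --time=48:00:00
#   #SBATCH --cpus-per-task=12:
#   ...
#   python Forge.py ...   <- this line is replaced wholesale by the new command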
def launch_batch(exp_name, get_exp_info_only=False):
exp_names = []
exp_configs = []
global TERRAIN_BORDER
global MAP_GENERATOR
global N_EVAL_MAPS
global N_MAP_EVALS
if LOCAL:
default_config['n_generations'] = 1
if EVALUATE or opts.render:
NENT = 16
else:
NENT = 3
#FIXME: we're overwriting a variable from original NMMO here. Will this be a problem?
N_EVAL_MAPS = 4 # How many maps to evaluate on. This must always be divisible by 2
N_MAP_EVALS = 5 # How many times to evaluate on each map
else:
NENT = 16
N_EVAL_MAPS = 4
N_MAP_EVALS = 5
N_PROC = opts.n_cpu
N_EVO_MAPS = 12
global EVALUATION_HORIZON
if opts.multi_policy:
EVALUATION_HORIZON = 500
else:
EVALUATION_HORIZON = 100
launched_baseline = False
i = 0
global eval_args
eval_args = "--EVALUATION_HORIZON {} --N_EVAL {} --N_EVAL_MAPS {} --NEW_EVAL --SKILLS \"['constitution', 'fishing', 'hunting', " \
"'range', 'mage', 'melee', 'defense', 'woodcutting', 'mining', 'exploration',]\" --NENT {} " \
"--FITNESS_METRIC {} ".format(
EVALUATION_HORIZON, N_MAP_EVALS, N_EVAL_MAPS, NENT, generator_objectives[0])
settings_tpls = [i for i in itertools.product(genomes, generator_objectives, skills, algos, me_bin_sizes,
PAIRED_bools)]
settings_tpls = [st for st in settings_tpls if not ('Baseline' in st[0] and st[1] != 'Lifespans')]
# Adversity/diversity-target experiments can have different hyperparameters
new_settings_tpls = []
for i, st in enumerate(settings_tpls):
adv_div_trg = (1, 1) # dummy value
adv_div_ratio = 0.5 # dummy value
gen_obj = st[1]
if gen_obj in ['AdversityDiversity', 'AdversityDiversityTrgs']:
for adv_div_ratio in adv_div_ratios:
if gen_obj == 'AdversityDiversityTrgs':
for adv_div_trg in adv_div_trgs:
new_st = copy.deepcopy(st)
new_st += (adv_div_ratio, adv_div_trg)
new_settings_tpls.append(new_st)
else:
new_st = copy.deepcopy(st)
new_st += (adv_div_ratio, adv_div_trg)
new_settings_tpls.append(new_st)
else:
st += (adv_div_ratio, adv_div_trg)
new_settings_tpls.append(st)
settings_tpls = new_settings_tpls
for (gene, gen_obj, skillset, algo, me_bins, PAIRED_bool, adv_div_ratio, adv_div_trg) in settings_tpls:
if gen_obj in ['Lifespans', 'Sum']:
if skillset != 'ALL':
continue
skillset = 'NONE'
if gene == 'Baseline':
if gen_obj != 'Lifespans':
continue
# if launched_baseline:
# # Only launch one baseline, these other settings are irrelevant
# # FIXME: but now you're going to get redundant baselines with different names across batch runs if you're
# # not careful (and I am not careful)
# continue
# else:
# launched_baseline = True
if algo != 'MAP-Elites' and not (np.array(me_bins) == 1).all():
# If not using MAP-Elites, the ME bin sizes are irrelevant, so only keep the trivial binning
continue
if (np.array(me_bins) == 1).all():
# If we're doing a simple evolutionary strategy (lazily, through qdpy ME), then set 12 individuals per bin
items_per_bin = 12
feature_calc = None
else:
items_per_bin = 1
feature_calc = 'map_entropy'
if LOCAL:
if gen_obj == 'MapTestText':
N_GENERATIONS = 100000
if gene == 'All':
EVO_SAVE_INTERVAL = 100
else:
EVO_SAVE_INTERVAL = 100
else:
N_GENERATIONS = 10000
EVO_SAVE_INTERVAL = 10
else:
EVO_SAVE_INTERVAL = 500
N_GENERATIONS = 10000
def launch_experiment(i):
# Write the config file with the desired settings
exp_config = copy.deepcopy(default_config)
root = os.path.dirname(os.path.abspath(__file__)) + "/evo_experiment/experiment-name_0/maps/map"
exp_config.update({
'ROOT': root,
'N_GENERATIONS': N_GENERATIONS,
'TERRAIN_SIZE': 70,
'NENT': NENT,
'GENOME': gene,
'FITNESS_METRIC': gen_obj,
'EVO_ALGO': algo,
'EVO_DIR': exp_name,
'SKILLS': skillset,
'ME_BIN_SIZES': me_bins,
'ME_BOUNDS': [(0, 100), (0, 100)],
'FEATURE_CALC': feature_calc,
'ITEMS_PER_BIN': items_per_bin,
'N_EVO_MAPS': N_EVO_MAPS,
'N_PROC': N_PROC,
'TERRAIN_RENDER': False,
'EVO_SAVE_INTERVAL': EVO_SAVE_INTERVAL,
'VIS_MAPS': opts.vis_maps,
'RENDER': RENDER,
'EVALUATE': EVALUATE,
'PAIRED': PAIRED_bool,
'NUM_GPUS': 1 if CUDA else 0,
'ADVERSITY_DIVERSITY_RATIO': adv_div_ratio,
'ADVERSITY_DIVERSITY_TRGS': adv_div_trg,
'COMPETITIVE_EVAL': opts.multi_policy,
})
# if gene == 'Baseline':
# exp_config.update({
# 'PRETRAIN': True,
# })
# print('Saving experiment config:\n{}'.format(exp_config))
with open('configs/settings_{}.json'.format(i), 'w') as f:
json.dump(exp_config, f, ensure_ascii=False, indent=4)
# Edit the sbatch file to load the correct config file
# Launch the experiment. It should load the saved settings
new_cmd = 'python ForgeEvo.py --load_arguments {}'.format(i)
exp_configs.append(exp_config)
if not get_exp_info_only:
launch_cmd(new_cmd, i)
launch_experiment(i)
i += 1
return exp_configs
def launch_cross_eval(experiment_names, experiment_configs, vis_only=False, render=False, vis_cross_eval=False):
"""Launch a batch of evaluations, evaluating player models on generated maps from different experiments.
If not just visualizing, run each evaluation (cartesian product of set of experiments with itself), then return.
Otherwise, load data from past evaluations to generate visualizations of individual evaluations and/or of comparisons
between them."""
# FIXME: Hey why is one of these experiment_names just [None,...,None] :-D !
global MAP_GENERATOR
model_names_configs = [(model_name, model_config) for model_name, model_config in
zip(experiment_names, experiment_configs) if os.path.isdir(os.path.join('evo_experiment',
model_name, 'models'))
]
model_exp_names, model_exp_configs = [i[0] for i in model_names_configs], [i[1] for i in model_names_configs]
row_labels = [get_exp_shorthand(r) for r in model_exp_names]
map_names_configs = [(map_name, map_config) for map_name, map_config in zip(experiment_names, experiment_configs) if
os.path.isfile(os.path.join('evo_experiment', map_name, 'ME_archive.p'))]
map_exp_names, map_exp_configs = [i[0] for i in map_names_configs], [i[1] for i in map_names_configs]
col_labels = [get_exp_shorthand(c) for c in map_exp_names]
# TODO: Make it more difficult to mangle the dimensions of these arrays. Attach them to an enum type class or something
# We will use these heatmaps to visualize performance between generator-agent pairs over the set of experiments
mean_lifespans = np.zeros((1, len(model_exp_names), len(map_exp_names), N_MAP_EVALS, N_EVAL_MAPS))
# std_lifespans = np.zeros((len(model_exp_names), len(map_exp_names) + 1, N_MAP_EVALS, N_EVAL_MAPS)) # also take std of each model's average performance
mean_skills = np.zeros((len(SKILLS), len(model_exp_names), len(map_exp_names), N_MAP_EVALS, N_EVAL_MAPS))
div_scores = np.zeros((len(DIV_CALCS), len(model_exp_names), len(map_exp_names), N_MAP_EVALS, N_EVAL_MAPS))
div_scores[:] = np.nan
mean_skills[:] = np.nan
mean_lifespans[:] = np.nan
if opts.multi_policy:
mean_survivors = np.empty((len(model_exp_names), len(map_exp_names), N_MAP_EVALS, N_EVAL_MAPS), dtype=float) # np.float is removed in recent NumPy; plain float is equivalent
if vis_only:
txt_verb = 'Visualizing past inference'
elif vis_cross_eval:
txt_verb = 'Collecting data for cross-eval visualization'
else:
txt_verb = 'Inferring'
def collect_eval_data():
n = 0
for (gen_i, (map_exp_name, map_exp_config)) in enumerate(map_names_configs):
if opts.eval_baseline_maps_only:
if 'Baseline' not in map_exp_config.GENOME:
continue
TERRAIN_BORDER = map_exp_config.TERRAIN_BORDER
# For each experiment from which we are evaluating generated maps, load up its map archive in order to select
# these evaluation maps
print(f'{txt_verb} from evaluation on map generator: {map_exp_name}')
mapgen_archive_path = os.path.join('evo_experiment', map_exp_name, 'ME_archive.p')
if not os.path.isfile(mapgen_archive_path):
print(f'Missing map archive at {mapgen_archive_path}')
continue
mapgen_exp_folder = map_exp_name
mapgen_eval_path = os.path.join('eval_experiment', mapgen_exp_folder)
if not os.path.isdir(mapgen_eval_path):
print(f'Missing map-generator eval folder for map {mapgen_eval_path}')
continue
else:
map_archive = pickle.load(open(mapgen_archive_path, "rb"))
# best_ind = archive['container'].best
eval_inds = get_eval_map_inds(map_archive, n_inds=N_EVAL_MAPS)
# Evaluate on a handful of elite maps
# for map_i, eval_map in enumerate(eval_inds):
# infer_idx, best_fitness = eval_map.idx, eval_map.fitness
infer_idxs, best_fitnesses = [map.idx for map in eval_inds], [map.fitness for map in eval_inds]
map_eval_paths = []
for infer_idx in infer_idxs:
map_eval_path = os.path.join(mapgen_eval_path, str(infer_idx))
infer_idx = map_eval_path.split('/')[-1]
if not os.path.isdir(map_eval_path):
print(f' Cannot find map eval folder for map {infer_idx}')
else:
print(f' Found map eval folder for map {infer_idx}')
map_eval_paths.append(map_eval_path)
for eval_map in eval_inds:
map_path = os.path.join('evo_experiment', map_exp_name, 'maps')
# Ad-hoc fix in case I delete all the "maps" folders on HPC
if not os.path.isdir(map_path):
os.mkdir(map_path)
map_path = os.path.join(map_path, 'map' + str(eval_map.idx), '')
if not os.path.isdir(map_path):
os.mkdir(map_path)
map_arr = eval_map.chromosome.map_arr
# Saving just in case we haven't already
Save.np(map_arr, map_path)
# png_path = os.path.join('evo_experiment', map_exp_name, 'maps', 'map' + str(infer_idx) + '.png')
# Save.render(map_arr[TERRAIN_BORDER:-TERRAIN_BORDER, TERRAIN_BORDER:-TERRAIN_BORDER], MAP_GENERATOR.textures, png_path)
print('{} on maps {}, with fitness scores {}, and ages {}.'.format(txt_verb, infer_idxs, best_fitnesses,
[map.age for map in eval_inds]))
# for (mdl_i, (model_exp_name, model_config)) in enumerate(zip(model_exp_names, experiment_configs)):
l_eval_args = '--config TreeOrerock --MAP {} '.format(map_exp_name)
if opts.multi_policy:
NPOLICIES = len(experiment_names)
l_eval_args += '--MODELS {} '.format(str(model_exp_names).replace(' ', ''))
else:
NPOLICIES = 1
NPOP = NPOLICIES
# FIXME: below may be trouble if not all experiments are (not) PAIRED
l_eval_args += '--NPOLICIES {} --NPOP {} --PAIRED {}'.format(NPOLICIES, NPOP, experiment_configs[0].PAIRED)
# Do eval
if render:
for infer_idx in infer_idxs:
l_eval_args_i = l_eval_args + ' --INFER_IDXS \"{}\" '.format(infer_idx)
render_cmd = 'python Forge.py render {} {}'.format(l_eval_args_i, eval_args)
assert LOCAL # cannot render on SLURM
assert not vis_only
# Launch the client as a background process
client_cmd = './neural-mmo-client/UnityClient/neural-mmo-resources.x86_64&'
os.system(client_cmd)
print(render_cmd)
os.system(render_cmd)
elif not (vis_only or vis_cross_eval):
l_eval_args_i = l_eval_args + ' --INFER_IDXS \"{}\" '.format(infer_idxs)
if not opts.multi_policy:
eval_cmd = ''
for mdl_i in range(len(model_exp_names)):
# TODO: cpu overheats but would be nice to have option of evaluating multiple models on each map in
# sequence
l_eval_args_i_j = l_eval_args_i + '--MODELS {} '.format(str([model_exp_names[mdl_i:mdl_i+2]]).replace(' ', ''))
eval_cmd_i = 'python Forge.py evaluate {} {} --EVO_DIR {}'.format(l_eval_args_i_j, eval_args, EXP_NAME)
eval_cmd += eval_cmd_i + ' ; '
else:
eval_cmd = 'python Forge.py evaluate {} {} --EVO_DIR {} --COMPETITIVE_EVAL True'.format(l_eval_args_i, eval_args, EXP_NAME)
print(eval_cmd)
launch_cmd(eval_cmd, n)
# print(eval_cmd)
n += 1
# Do stuff with data after eval
else:
for (mdl_i, (model_exp_name, model_exp_config)) in enumerate(model_names_configs):
# std_lifespans[i, j+1] =
print(' Collecting data from model {}.'.format(model_exp_name))
global EVALUATION_HORIZON
if opts.multi_policy:
model_exp_folder_name = 'multi_policy'
model_name = str([get_exp_shorthand(m) for m in model_exp_names])
else:
model_name = get_exp_shorthand(model_exp_name)
model_exp_folder_name = model_exp_name
eval_data_paths = []
for map_eval_path in map_eval_paths:
eval_data_path = os.path.join(
map_eval_path,
model_exp_folder_name,
'{}-steps eval.npy'.format(
# model_name,
# get_exp_shorthand(map_exp_name),
# infer_idx,
EVALUATION_HORIZON
),
)
infer_idx = map_eval_path.split('/')[-1]
if os.path.isfile(eval_data_path):
eval_data_paths.append(eval_data_path)
print(f" Found eval data for map {infer_idx}.")
# print(f" Found model eval data at {eval_data_path}")
else:
print(f" Cannot find eval data for map {infer_idx}")
# print(f" Cannot find eval data at {eval_data_path}")
map_eval_data = []
for eval_data_path in eval_data_paths:
data = dict(np.load(eval_data_path, allow_pickle=True))
if opts.multi_policy:
data['survivors'] = np.load(eval_data_path.replace('eval.npy', 'multi_eval.npy'), allow_pickle=True)
map_eval_data.append(data)
# except FileNotFoundError as fnf:
# # print(fnf)
# print('Skipping. Missing eval data at: {}'.format(eval_data_path))
# continue
print(' Compiling data from map-generator.')
# FIXME: this is a tad gnarly. Could we do this more cleanly over different maps?
# t0 = time.time()
# for map_i, data in enumerate(map_eval_data):
for map_i in range(min(N_EVAL_MAPS, len(map_eval_data))):
data = map_eval_data[map_i]
final_stats, div_mat, heatmaps = data['final_stats'], data['div_mat'], data['heatmaps']
# how many eval episodes will we use for data collection? can collect fewer than saved for fast iteration
n_evals_data = min(N_MAP_EVALS, len(final_stats))
# get the mean lifespan of each eval episode
evals_mean_lifespans = [np.mean(get_pop_stats(final_stats[i]['lifespans'], pop=None))
for i in range(n_evals_data)]
# take the mean lifespan over these episodes
mean_lifespans[0, mdl_i, gen_i, :, map_i] = evals_mean_lifespans
# std over episodes
# std_lifespans[mdl_i, gen_i, map_i] = np.std(evals_mean_lifespans)
# get the mean agent skill vector of each eval episode
evals_mean_skills = np.vstack([get_pop_stats(data_i['skills'],pop=None).mean(axis=0)
for data_i in final_stats])
for s_i in range(len(SKILLS)):
mean_skills[s_i, mdl_i, gen_i, :, map_i] = evals_mean_skills[0:n_evals_data, s_i]
for (s_i, div_calc_name) in enumerate(DIV_CALCS):
# Last dimension of div_mat is time-steps (at some interval, only last time-step by default). We'll
# take the latest.
evals_div_scores = div_mat[:, s_i, -1]
div_scores[s_i, mdl_i, gen_i, 0:n_evals_data, map_i] = evals_div_scores
if opts.multi_policy:
model_name_idxs = {get_exp_shorthand(r): i for (i, r) in enumerate(model_exp_names)}
multi_eval_data_path = eval_data_path.replace('eval.npy', 'multi_eval.npy')
survivors = np.load(multi_eval_data_path, allow_pickle=True)
for map_i, map_survivors in survivors.item().items():
for model_name, n_survivors in map_survivors.items():
model_idx = model_name_idxs[model_name]
mean_survivors[model_idx, gen_i, 0:n_evals_data, map_i] = n_survivors
# t1 = time.time()
# print(f" {t1-t0} to compile data for model on map-generator")
if opts.multi_policy: # don't need to iterate through models since we pit them against each other during the same episode
break
ret = (row_labels, model_exp_configs, col_labels, mean_lifespans, mean_skills, div_scores)
if opts.multi_policy:
ret = (*ret, mean_survivors)
return ret
if opts.multi_policy:
cross_eval_data_path = os.path.join('eval_experiment', 'competitive_cross-eval_data.npy')
else:
cross_eval_data_path = os.path.join('eval_experiment', 'cross-eval_data.npy')
if not opts.re_render_cross_vis:
data_tpl = collect_eval_data()
np.save(cross_eval_data_path, data_tpl)
else:
data_tpl = np.load(cross_eval_data_path, allow_pickle=True)
if vis_cross_eval or vis_only: # might as well do cross-eval vis if visualizing individual evals I guess
print("Visualizing cross-evaluation.")
# NOTE: this is placeholder code, valid only for the current batch of experiments, which varies exclusively along the "genome", "generator_objective", and "PAIRED" dimensions. Expand the (crude) get_exp_shorthand function if we need more.
# TODO: annotate the heatmap with labels more fancily, i.e. use the lists of hyperparams to create concise (hierarchical?) axis labels.
def get_mannwhitney(data):
data = data.reshape(data.shape[0], -1)
u_stats = np.empty(shape=(data.shape[0], data.shape[0]))
u_stats[:] = np.nan
data = [[v for v in dr if not np.isnan(v)] for dr in data]
for i in range(u_stats.shape[0]):
for j in range(u_stats.shape[1]):
u_stat = scipy.stats.mannwhitneyu(data[i], data[j])
u_stats[i, j] = u_stat.pvalue
return u_stats
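# (Here u_stats[i, j] holds the Mann-Whitney U p-value comparing the pooled, NaN-dropped eval
# samples of row i against those of row j; the diagonal compares each sample against itself.)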
def get_meanstd(data, get_pvals=False):
'''Funky function for getting mean, standard deviation of our data'''
# TODO: these indices should be global variables or something like that
# This gets the mean over evaluations (-2) and maps (-1)
mean_model_mapgen = np.nanmean(data, axis=(-2, -1))
# We want the standard deviation over evaluations (-2). So we get the mean on maps (-1) first
std_model_mapgen = np.nanstd(np.nanmean(data, axis=-1), axis=-1)
# add a column looking at the mean performance of each model over all maps
mean_model = np.nanmean(data, axis=(-3, -1)) # work around missing generators/maps
mean_model = np.nanmean(mean_model, axis=-1, keepdims=True) # and evals (careful though)
# standard deviation in this column is calculated a little differently: by getting the aggregate score of each
# model over all maps, then looking at *this* random variable's standard deviation over evals
# TODO: is this a bad way to do it??? Should take std over evals, generators, and maps... or...?
# this is the mean over generators and maps (not evals!)
aggr_model = np.nanmean(data, axis=(-3, -1))
# std over evals
std_model = np.nanstd(aggr_model, axis=-1, keepdims=True)
# add column including mean performance of each model over all map generators
means = np.concatenate((mean_model_mapgen, mean_model), axis=-1)
stds = np.concatenate((std_model_mapgen, std_model), axis=-1)
# Now we add the same kind of mean column, but for maps, and calculate standard deviation in the same way
mean_map = np.nanmean(data, axis=(-4, -1))
mean_map = np.nanmean(mean_map, axis=-1, keepdims=True)
extra_cell_shape = list(mean_map.shape)
extra_cell_shape[-2] = 1
extra_cell = np.empty(extra_cell_shape)
extra_cell[:] = np.nan
mean_map = np.concatenate((mean_map, extra_cell), axis=-2)
aggr_map = np.nanmean(data, axis=(-4, -1))
std_map = np.nanstd(aggr_map, axis=-1, keepdims=True)
extra_cell_shape = list(std_map.shape)
extra_cell_shape[-2] = 1
extra_cell = np.empty(extra_cell_shape)
extra_cell[:] = np.nan
std_map = np.concatenate((std_map, extra_cell), axis=-2)
means = np.concatenate((means, np.swapaxes(mean_map, -2, -1)), -2)
stds = np.concatenate((stds, np.swapaxes(std_map, -2, -1)), -2)
pvals = None
if get_pvals:
pvals = get_mannwhitney(data)
return means, stds, pvals
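# Shape sketch for get_meanstd (assuming data has trailing axes (models, map-generators, evals, maps)):
# means and stds come back with trailing shape (n_models + 1, n_mapgens + 1); the extra last column
# is each model's aggregate over all map generators, the extra last row is each map generator's
# aggregate over all models, and the corner cell is left as NaN.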
def plot_histogram(data, row_labels, col_labels, name=""):
map_means = data[-1]
col_idxs =[(i, m) for i, m in enumerate(map_means)]
col_idx_vals = sorted(col_idxs, key=lambda tpl: tpl[1])
col_idxs = [tpl[0] for tpl in col_idx_vals]
xtick_labels = [col_labels[i] for i in col_idxs if not np.isnan(col_idx_vals[i][1])]
data = data[:, col_idxs]
col_idxs = np.arange(data.shape[1])
linestyles = ['-', '--', '-.', ':']
fig = plt.figure(figsize=(16, 20))
for i, model in enumerate(row_labels):
plt.plot(col_idxs, data[i], label=model, linestyle=linestyles[i % len(linestyles)])
plt.legend()
# locs, _ = plt.xticks()
plt.xticks(np.arange(len(xtick_labels)), xtick_labels, rotation=90)
plt.ylabel(name)
plt.tight_layout()
plt.savefig(os.path.join('eval_experiment', f'histogram_{name}.png'))
plt.close(fig)
def plot_prosp_div(data, row_labels, col_labels):
pass
def heatmaps_from_data(row_labels, col_labels, data_tpl, squash=False, figshape=(30,30)):
name = ''
if squash:
name = 'squash'
row_labels_m = copy.copy(row_labels)
col_labels_m = copy.copy(col_labels)
col_labels_m.append('mean')
row_labels_m.append('mean')
# FIXME: messy
if opts.multi_policy:
mean_lifespans, mean_skills, div_scores, mean_survivors = data_tpl
else:
mean_lifespans, mean_skills, div_scores, = data_tpl
mean_survivors = None
# mean and standard deviation of lifespans over maps and evals
mean_mapgen_lifespans, std_mapgen_lifespans, pvals_lifespans = get_meanstd(mean_lifespans[0], get_pvals=True)
if not squash: # otherwise different pros/div experiments will be collapsed into one
prosp_div_heatmap(row_labels, model_configs, mean_mapgen_lifespans[:, -1], std_mapgen_lifespans[:, -1],
title='Lifespans')
if squash: # otherwise this is an uninterpretable mess of lines
plot_histogram(mean_mapgen_lifespans, row_labels, col_labels, name='lifespans')
# Repeat this averaging logic for other stats
mean_mapgen_div_scores, std_mapgen_divscores, pvals_div_scores = get_meanstd(div_scores)
mean_mapgen_skills, std_mapgen_skills, pvals_div_scores = get_meanstd(mean_skills)
pval_figshape = figshape
if squash:
pval_figshape = (15, 15)
if opts.multi_policy:
mean_mapgen_survivors, std_mapgen_survivors, pvals_div_survivors = get_meanstd(mean_survivors)
cross_eval_heatmap(pvals_div_survivors, row_labels, row_labels, f"survivors_{name}_pvals", cbarlabel="p value",
figshape=pval_figshape)
cross_eval_heatmap(pvals_lifespans, row_labels, row_labels, f"lifespans_{name}_pvals", cbarlabel="p value",
pvals=True, figshape=pval_figshape)
if opts.multi_policy:
cross_eval_heatmap(mean_mapgen_survivors, row_labels_m, col_labels_m, f"mean survivors_{name}", "",
errors=std_mapgen_survivors, figshape=figshape)
cross_eval_heatmap(mean_mapgen_lifespans, row_labels_m, col_labels_m, f"lifespans_{name}", "mean lifespan [ticks]",
errors=std_mapgen_lifespans, figshape=figshape)
# for (s_i, skill_name) in enumerate(SKILLS):
# cross_eval_heatmap(mean_mapgen_skills[s_i], row_labels_m, col_labels_m, f"{skill_name}_{name}",
# "mean {} [xp]".format(skill_name), errors=std_mapgen_skills[s_i], figshape=figshape)
# for (d_i, div_calc_name) in enumerate(DIV_CALCS):
# cross_eval_heatmap(mean_mapgen_div_scores[d_i], row_labels_m, col_labels_m,
# f"{div_calc_name}_diversity_{name}", f"{div_calc_name} diversity",
# errors=std_mapgen_divscores[d_i], figshape=figshape)
def squash_exp_shorthand(shorthand):
'''Collapse an experiment shorthand down to its generator objective (or a Baseline / adversity-diversity quadrant label).'''
if 'Baseline' in shorthand:
# return shorthand
return 'Baseline'
for gen_obj_name in generator_objectives:
if gen_obj_name in shorthand:
if gen_obj_name == 'AdversityDiversityTrgs':
sp = shorthand.split(' ')
adv, div = float(sp[-3].strip(',')), float(sp[-1].strip(','))
if adv <= 0.5:
adv_name = "Adverse"
else:
adv_name = "Prosperous"
if div <= 0.5:
div_name = "Homogeneous"
else:
div_name = "Diverse"
return f"{adv_name} & {div_name}"
exp_name = gen_obj_name
return exp_name
else: return shorthand
def del_nan_rows(row_labels, col_labels, data):
#TODO
pass
# TODO: squash data, mann-whitney that shit
def squash_data(row_labels, col_labels, data):
''' Combine data from models from similar experiments by squashing them together, row-wise, and stacking them
along the "n_evals" dimension.'''
n_evals = data.shape[-2]
new_row_labels = []
for rl in row_labels: # excluding the "mean" row
if rl not in new_row_labels:
new_row_labels.append(rl)
row_label_idxs = {r: i for i, r in enumerate(new_row_labels)}
row_exp_counts = {r: 0 for r in new_row_labels} # so that we can stack experiments in new array
most_common, max_redundant_experiments = Counter(row_labels).most_common(1)[0]
n_net_evals = n_evals * max_redundant_experiments
sqsh_data = np.empty(shape=(data.shape[0], len(new_row_labels), data.shape[-3], n_net_evals, data.shape[-1]))
sqsh_data[:] = np.nan
print(data.shape, row_labels)
for i, rl in enumerate(row_labels):
rd = data[:,i:i+1]
n_eval_start = row_exp_counts[rl] * n_evals
row_exp_counts[rl] += 1
n_eval_end = row_exp_counts[rl] * n_evals
model_idx = row_label_idxs[rl]
# print(n_eval_end, sqsh_data.shape)
sqsh_data[:, model_idx:model_idx+1, :, n_eval_start:n_eval_end, :] = rd
new_row_labels = [None] * len(new_row_labels)
for k, v in row_label_idxs.items():
new_row_labels[v] = k
return new_row_labels, sqsh_data
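# e.g. with row_labels ['Lifespans', 'Lifespans', 'FarNearestNeighbor'] and n_evals = 5, the two
# 'Lifespans' rows are merged into a single row whose eval axis holds 2 * 5 = 10 slots, while
# labels with fewer repeats keep their unused slots as NaN.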
row_labels, model_configs, col_labels = data_tpl[0:3]
data_tpl = data_tpl[3:]
# Visualize performance of all player-policies on all map-generators
heatmaps_from_data(row_labels, col_labels, data_tpl, figshape=(70,70))
# squash experiments with different objectives together
# TODO: use actual configs for this squashing!
row_labels = [squash_exp_shorthand(l) for l in row_labels]
# col_labels = [squash_exp_shorthand(l) for l in col_labels]
new_data_tpl = []
for d in data_tpl:
new_row_labels, sqsh_data = squash_data(row_labels, col_labels, d)
new_data_tpl.append(sqsh_data)
heatmaps_from_data(new_row_labels, col_labels, new_data_tpl, squash=True, figshape=(70, 30))
def prosp_div_heatmap(row_labels, model_configs, vals, errs, title):
# TODO: throw this in a function
# Visualize mean performance (over all maps) of experiments controlling for prosperity and diversity in a 2D grid
# along dimensions of prosperity and diversity.
idxs = []
divs_prosps = []
prosp_vals = set()
div_vals = set()
for i, (model_name, model_config) in enumerate(zip(row_labels, model_configs)):
if model_config.FITNESS_METRIC == 'AdversityDiversityTrgs':
idxs.append(i)
prosp, div = model_config.ADVERSITY_DIVERSITY_TRGS
divs_prosps.append((div, prosp))
prosp_vals.add(prosp)
div_vals.add(div)
prosps = sorted(list(prosp_vals))
prosps_to_pos = {p: i for i, p in enumerate(prosps)}
divs = sorted(list(div_vals))
divs_to_pos = {d: i for i, d in enumerate(divs)}
vals = vals[idxs]
errs = errs[idxs]
data = np.empty(shape=(len(prosps), len(divs)))
errors = data.copy()
for val, err, (div, prosp) in zip(vals, errs, divs_prosps):
data[prosps_to_pos[prosp], divs_to_pos[div]] = val
errors[prosps_to_pos[prosp], divs_to_pos[div]] = err
cross_eval_heatmap(np.flip(data.T, 0), prosps[::-1], divs, title, '', errors, figshape=(10, 10), xlabel='prosperity',
ylabel='diversity', filename=f'{title} (prosperity X diversity)', swap_xticks=False)
def cross_eval_heatmap(data, row_labels, col_labels, title, cbarlabel, errors=None, pvals=False, figshape=(30,30),
xlabel='maps', ylabel='models', filename=None, swap_xticks=True):
if filename is None:
filename = title
fig, ax = plt.subplots()
# Remove empty rows and columns
i = 0
for data_row in data:
if np.isnan(data_row).all():
data = np.vstack((data[:i], data[i+1:]))
assert np.isnan(errors[i]).all()
errors = np.vstack((errors[:i], errors[i+1:]))
row_labels = row_labels[:i] + row_labels[i+1:]
continue
i += 1
i = 0
for data_col in data.T:
if np.isnan(data_col).all():
data = (np.vstack((data.T[:i], data.T[i + 1:]))).T
assert np.isnan(errors.T[i]).all()
errors = (np.vstack((errors.T[:i], errors.T[i+1:]))).T
col_labels = col_labels[:i] + col_labels[i+1:]
continue
i += 1
# fig.set_figheight(1.5*len(col_labels))
# fig.set_figwidth(1.0*len(row_labels))
fig.set_figwidth(figshape[0])
fig.set_figheight(figshape[1])
if pvals:
cmap="viridis"
else:
cmap="magma"
im, cbar = heatmap(data, row_labels, col_labels, ax=ax,
cmap=cmap, cbarlabel=cbarlabel)
if not swap_xticks:
im.axes.xaxis.tick_bottom()
class CellFormatter(object):
def __init__(self, errors):
self.errors = errors
def func(self, x, pos):
#if np.isnan(x) or np.isnan(errors[pos]):
# # print(x, errors[pos])
# # Turns out the data entry is "masked" while the error entry is nan
# # assert np.isnan(x) and np.isnan(errors[pos])
# # if not np.isnan(x) and np.isnan(errors[pos]):
# return '--'
if not pvals:
x_str = "{:.1f}".format(x)
else:
x_str = "{:.1e}".format(x)
# x_str = "{:.3f}".format(x)
if errors is None:
return x_str
err = errors[pos]
x_str = x_str + " ± {:.1f}".format(err)
return x_str
cf = CellFormatter(errors)
textcolors = ("white", "black")
texts = annotate_heatmap(im, valfmt=matplotlib.ticker.FuncFormatter(cf.func), textcolors=textcolors)
ax.set_title(title)
# fig.tight_layout(rect=[1,0,1,0])
fig.tight_layout(pad=3)
# plt.show()
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
plt.savefig(os.path.join(
'eval_experiment',
'{}.png'.format(filename),
))
plt.close()
if __name__ == '__main__':
opts = argparse.ArgumentParser(
description='Launch a batch of experiments/evaluations for evo-pcgrl')
opts.add_argument(
'-ex',
'--experiment_name',
help='A name to be shared by the batch of experiments.',
default='0',
)
opts.add_argument(
'-ev',
'--evaluate',
help='Cross-evaluate a batch of joint map-evolution, agent-learning experiments, looking at the behavior of all '
'agent models on all ("best") maps.',
action='store_true',
)
opts.add_argument(
'-l',
'--local',
help='Run the batch script on a local machine (evolving for a minimal number of generations, or running full evaluations sequentially).',
action='store_true',
)
opts.add_argument(
'-bl',
'--train_baseline',
help='Train a baseline on Perlin noise-generated maps.',
action='store_true',
)
opts.add_argument(
'--cpu',
help='Do not use GPU (only applies to SLURM, not recommended for default, big neural networks).',
action='store_true',
)
opts.add_argument(
'--n_cpu',
help='How many parallel processes ray should use.',
type=int,
default=12,
)
opts.add_argument(
'--vis_cross_eval',
help='Visualize the results of cross-evaluation. (No new evaluations.)',
action='store_true',
)
opts.add_argument(
'--vis_evals',
help='Visualize the results of individual evaluations and cross-evaluation. (No new evaluations.)',
action='store_true',
)
opts.add_argument(
'--vis_maps',
help='Save and visualize evolved maps, and plot their fitness.',
action='store_true'
)
opts.add_argument(
'--render',
help='Render an episode in unity.',
action='store_true'
)
opts.add_argument(
'-mp',
'--multi-policy',
help='Evaluate all policies on each map simultaneously, to allow for inter-policy competition.',
action='store_true',
)
opts.add_argument(
'--eval_baseline_maps_only',
help='Only use baseline experiments for evaluation maps.',
action='store_true',
)
opts.add_argument(
'--re-render_cross_vis',
help='Re-render the heatmaps resulting from the last cross-visualization. For iterating on the way we render '
'these cross-vis graphics.',
action='store_true',
)
opts = opts.parse_args()
EXP_NAME = opts.experiment_name
EVALUATE = opts.evaluate
LOCAL = opts.local
TRAIN_BASELINE = opts.train_baseline
CUDA = not opts.cpu and not opts.vis_maps and not EVALUATE
VIS_CROSS_EVAL = opts.vis_cross_eval
VIS_EVALS = opts.vis_evals
RENDER = opts.render
if EVALUATE or opts.vis_maps:
JOB_TIME = 24
elif CUDA:
JOB_TIME = 48 # NYU HPC Greene limits number of gpu jobs otherwise
else:
pass # NOTE: JOB_TIME is left undefined here, so a CPU-only training job would fail in launch_cmd
# JOB_TIME = 120 # never use CPU-only for training anyway
if EVALUATE and opts.multi_policy:
JOB_CPUS = 48
else:
JOB_CPUS = 12
if CUDA:
sbatch_file = 'evo_train.sh'
else:
sbatch_file = 'evo_train_cpu.sh'
if LOCAL:
print('Testing locally.')
else:
print('Launching batch of experiments on SLURM.')
with open('configs/default_settings.json', 'r') as f:
default_config = json.load(f)
print('Loaded default config:\n{}'.format(default_config))
if (EVALUATE or RENDER or VIS_EVALS or VIS_CROSS_EVAL) and not opts.vis_maps:
# just get the names and configs of experiments in which we are interested (no actual evaluations are run)
exp_dicts = launch_batch(EXP_NAME, get_exp_info_only=True)
experiment_configs = [config.EvoNMMO() for ec in exp_dicts]
[ec.set(*i) for ec, ecd in zip(experiment_configs, exp_dicts) for i in ecd.items()]
experiment_names = [get_experiment_name(ec) for ec in experiment_configs]
if RENDER:
print('rendering experiments: {}\n KeyboardInterrupt (Ctrl+c) to render next.'.format(experiment_names))
launch_cross_eval(experiment_names, vis_only=False, render=True, experiment_configs=experiment_configs)
else:
if not (VIS_CROSS_EVAL or VIS_EVALS):
print('cross evaluating experiments: {}'.format(experiment_names))
# only launch these cross evaluations if we need to
launch_cross_eval(experiment_names, experiment_configs=experiment_configs, vis_only=False)
# otherwise just load up old data to visualize results
if VIS_EVALS:
# visualize individual evaluations.
launch_cross_eval(experiment_names, experiment_configs=experiment_configs, vis_only=True)
elif VIS_CROSS_EVAL or LOCAL: # elif since vis_only also prompts cross-eval visualization
# visualize cross-evaluation tables
launch_cross_eval(experiment_names, experiment_configs=experiment_configs, vis_only=False, vis_cross_eval=True)
else:
# Launch a batch of joint map-evolution and agent-training experiments (maybe also a baseline agent-training experiment on a fixed set of maps).
launch_batch(EXP_NAME)