evo_batch.py
'''
Launch a batch of experiments on a SLURM cluster.
WARNING: This will kill all ray processes running on the current node after each experiment, to avoid memory issues from dead processes.
'''
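# Example invocations (a sketch; the script name and flag spellings mirror the argparse options
# defined at the bottom of this file):
#   python evo_batch.py -ex my_batch --local           # quick local test (minimal number of generations)
#   python evo_batch.py -ex my_batch                   # launch the full batch of training runs on SLURM
#   python evo_batch.py -ex my_batch --evaluate        # cross-evaluate trained models on evolved maps
#   python evo_batch.py -ex my_batch --vis_cross_eval  # re-plot heatmaps from saved eval data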
from pdb import set_trace as TT
import argparse
import copy
import itertools
import json
import matplotlib
import scipy.stats
from matplotlib import pyplot as plt
import numpy as np
import pickle
import os
import re
import time
from collections import Counter
from forge.blade.core.terrain import MapGenerator, Save
from evolution.plot_diversity import heatmap, annotate_heatmap
from projekt import config
from projekt.config import get_experiment_name
from evolution.diversity import get_div_calc, get_pop_stats
from evolution.utils import get_exp_shorthand, get_eval_map_inds
##### HYPER-PARAMETERS #####
genomes = [
'Baseline',
'RiverBottleneckBaseline',
'ResourceNichesBaseline',
'BottleneckedResourceNichesBaseline',
'LabyrinthBaseline',
'Simplex',
'NCA',
'TileFlip',
'CPPN',
'Primitives',
'L-System',
'All',
]
generator_objectives = [
'Lifespans',
'Differential',
'FarNearestNeighbor',
'AdversityDiversityTrgs',
# 'L2',
# 'Hull',
# 'Sum',
# 'Discrete',
# 'CloseNearestNeighbor',
# 'InvL2',
# 'AdversityDiversity',
# 'MapTestText',
]
skills = [
'ALL',
# 'HARVEST',
# 'COMBAT',
# 'EXPLORATION',
]
algos = [
'MAP-Elites',
# 'Simple',
# 'CMAES',
# 'CMAME',
# 'NEAT',
]
me_bin_sizes = [
# [1,1],
[50, 50],
# [100,100],
]
# Are we running a PAIRED-type algorithm? If so, we use two policies, and reward the generator for maximizing the
# difference in terms of the generator_objective between the "protagonist" and "antagonist" policies.
PAIRED_bools = [
# True,
False
]
adv_div_ratios = [.5]
# adv_div_ratios = np.arange(0, 1.01, 1/6) # this gets stretched to [-1, 1] and used to shrink one objective or the other
# For "AdversityDiversityTrgs" -- how long should agents live, how diverse should they be
adv_trgs = [
0,
1/5,
2/5,
3/5,
4/5,
1,
]
div_trgs = [
0,
1/5,
2/5,
3/5,
4/5,
1,
]
adv_div_trgs = [i for i in itertools.product(adv_trgs, div_trgs)]
##########################
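# Note: the experiment grid launched in launch_batch() below is (roughly) the Cartesian product of
# the lists above, i.e. itertools.product(genomes, generator_objectives, skills, algos,
# me_bin_sizes, PAIRED_bools), with the AdversityDiversity(Trgs) settings further expanded over
# adv_div_ratios / adv_div_trgs, and Baseline genomes restricted to the 'Lifespans' objective.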
# TODO: use this variable in the eval command string. Formatting might be weird.
SKILLS = ['constitution', 'fishing', 'hunting', 'range', 'mage', 'melee', 'defense', 'woodcutting', 'mining', 'exploration',]
DIV_CALCS = ['L2', 'Differential', 'Hull',
#'Discrete',
'FarNearestNeighbor',
'Sum']
global eval_args
global EVALUATION_HORIZON
global TERRAIN_BORDER # Assuming this is the same for all experiments!
global MAP_GENERATOR # Also tile-set
global N_EVAL_MAPS
global N_MAP_EVALS
TERRAIN_BORDER = None
MAP_GENERATOR = None
def launch_cmd(new_cmd, i):
with open(sbatch_file, 'r') as f:
content = f.read()
job_name = 'nmmo_'
if EVALUATE:
job_name += 'eval_'
job_name += str(i)
content = re.sub(r'nmmo_(eval_)?\d+', job_name, content)
content = re.sub(r'#SBATCH --time=\d+:', '#SBATCH --time={}:'.format(JOB_TIME), content)
content = re.sub(r'#SBATCH --cpus-per-task=\d+:', '#SBATCH --cpus-per-task={}:'.format(JOB_CPUS), content)
new_cmd = '\n' + new_cmd
new_content = re.sub('\n.*python Forge.*', new_cmd, content)
with open(sbatch_file, 'w') as f:
f.write(new_content)
if LOCAL:
os.system(new_cmd)
if not (opts.vis_maps or opts.vis_cross_eval):
os.system('ray stop')
else:
os.system('sbatch {}'.format(sbatch_file))
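# For reference, the regex substitutions in launch_cmd assume the sbatch template (evo_train.sh or
# evo_train_cpu.sh) contains lines roughly of the form below (a sketch, not the actual template;
# note in particular that the cpus-per-task pattern above only matches if the template writes a
# trailing ':' after the CPU count):
#   #SBATCH --job-name=nmmo_0
#   #SBATCH --time=48:00:00
#   #SBATCH --cpus-per-task=12:
#   ...
#   python Forge.py ...   <- this line is replaced wholesale by the new command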
def launch_batch(exp_name, get_exp_info_only=False):
exp_names = []
exp_configs = []
global TERRAIN_BORDER
global MAP_GENERATOR
global N_EVAL_MAPS
global N_MAP_EVALS
if LOCAL:
default_config['n_generations'] = 1
if EVALUATE or opts.render:
NENT = 16
else:
NENT = 3
#FIXME: we're overwriting a variable from original NMMO here. Will this be a problem?
N_EVAL_MAPS = 4 # How many maps to evaluate on. This must always be divisible by 2
N_MAP_EVALS = 5 # How many times to evaluate on each map
else:
NENT = 16
N_EVAL_MAPS = 4
N_MAP_EVALS = 5
N_PROC = opts.n_cpu
N_EVO_MAPS = 12
global EVALUATION_HORIZON
if opts.multi_policy:
EVALUATION_HORIZON = 500
else:
EVALUATION_HORIZON = 100
launched_baseline = False
i = 0
global eval_args
eval_args = "--EVALUATION_HORIZON {} --N_EVAL {} --N_EVAL_MAPS {} --NEW_EVAL --SKILLS \"['constitution', 'fishing', 'hunting', " \
"'range', 'mage', 'melee', 'defense', 'woodcutting', 'mining', 'exploration',]\" --NENT {} " \
"--FITNESS_METRIC {} ".format(
EVALUATION_HORIZON, N_MAP_EVALS, N_EVAL_MAPS, NENT, generator_objectives[0])
settings_tpls = [i for i in itertools.product(genomes, generator_objectives, skills, algos, me_bin_sizes,
PAIRED_bools)]
settings_tpls = [st for st in settings_tpls if not ('Baseline' in st[0] and st[1] != 'Lifespans')]
# Adversity/diversity-target experiments can have different hyperparameters
new_settings_tpls = []
for i, st in enumerate(settings_tpls):
adv_div_trg = (1, 1) # dummy value
adv_div_ratio = 0.5 # dummy value
gen_obj = st[1]
if gen_obj in ['AdversityDiversity', 'AdversityDiversityTrgs']:
for adv_div_ratio in adv_div_ratios:
if gen_obj == 'AdversityDiversityTrgs':
for adv_div_trg in adv_div_trgs:
new_st = copy.deepcopy(st)
new_st += (adv_div_ratio, adv_div_trg)
new_settings_tpls.append(new_st)
else:
new_st = copy.deepcopy(st)
new_st += (adv_div_ratio, adv_div_trg)
new_settings_tpls.append(new_st)
else:
st += (adv_div_ratio, adv_div_trg)
new_settings_tpls.append(st)
settings_tpls = new_settings_tpls
for (gene, gen_obj, skillset, algo, me_bins, PAIRED_bool, adv_div_ratio, adv_div_trg) in settings_tpls:
if gen_obj in ['Lifespans', 'Sum']:
if skillset != 'ALL':
continue
skillset = 'NONE'
if gene == 'Baseline':
if gen_obj != 'Lifespans':
continue
# if launched_baseline:
# # Only launch one baseline, these other settings are irrelevant
# # FIXME: but now you're going to get redundant baselines with different names across batch runs if you're
# # not careful (and I am not careful)
# continue
# else:
# launched_baseline = True
if algo != 'MAP-Elites' and not (np.array(me_bins) == 1).all():
# If not using MAP-Elites, the ME bin sizes are irrelevant, so only keep the trivial binning
continue
if (np.array(me_bins) == 1).all():
# If we're doing a simple evolutionary strategy (lazily, through qdpy ME), then set 12 individuals per bin
items_per_bin = 12
feature_calc = None
else:
items_per_bin = 1
feature_calc = 'map_entropy'
if LOCAL:
if gen_obj == 'MapTestText':
N_GENERATIONS = 100000
if gene == 'All':
EVO_SAVE_INTERVAL = 100
else:
EVO_SAVE_INTERVAL = 100
else:
N_GENERATIONS = 10000
EVO_SAVE_INTERVAL = 10
else:
EVO_SAVE_INTERVAL = 500
N_GENERATIONS = 10000
def launch_experiment(i):
# Write the config file with the desired settings
exp_config = copy.deepcopy(default_config)
root = os.path.dirname(os.path.abspath(__file__)) + "/evo_experiment/experiment-name_0/maps/map"
exp_config.update({
'ROOT': root,
'N_GENERATIONS': N_GENERATIONS,
'TERRAIN_SIZE': 70,
'NENT': NENT,
'GENOME': gene,
'FITNESS_METRIC': gen_obj,
'EVO_ALGO': algo,
'EVO_DIR': exp_name,
'SKILLS': skillset,
'ME_BIN_SIZES': me_bins,
'ME_BOUNDS': [(0, 100), (0, 100)],
'FEATURE_CALC': feature_calc,
'ITEMS_PER_BIN': items_per_bin,
'N_EVO_MAPS': N_EVO_MAPS,
'N_PROC': N_PROC,
'TERRAIN_RENDER': False,
'EVO_SAVE_INTERVAL': EVO_SAVE_INTERVAL,
'VIS_MAPS': opts.vis_maps,
'RENDER': RENDER,
'EVALUATE': EVALUATE,
'PAIRED': PAIRED_bool,
'NUM_GPUS': 1 if CUDA else 0,
'ADVERSITY_DIVERSITY_RATIO': adv_div_ratio,
'ADVERSITY_DIVERSITY_TRGS': adv_div_trg,
'COMPETITIVE_EVAL': opts.multi_policy,
})
# if gene == 'Baseline':
# exp_config.update({
# 'PRETRAIN': True,
# })
# print('Saving experiment config:\n{}'.format(exp_config))
with open('configs/settings_{}.json'.format(i), 'w') as f:
json.dump(exp_config, f, ensure_ascii=False, indent=4)
# Edit the sbatch file to load the correct config file
# Launch the experiment. It should load the saved settings
new_cmd = 'python ForgeEvo.py --load_arguments {}'.format(i)
exp_configs.append(exp_config)
if not get_exp_info_only:
launch_cmd(new_cmd, i)
launch_experiment(i)
i += 1
return exp_configs
def launch_cross_eval(experiment_names, experiment_configs, vis_only=False, render=False, vis_cross_eval=False):
"""Launch a batch of evaluations, evaluating player models on generated maps from different experiments.
If not just visualizing, run each evaluation (cartesian product of set of experiments with itself), then return.
Otherwise, load data from past evaluations to generate visualizations of individual evaluations and/or of comparisons
between them."""
# FIXME: Hey why is one of these experiment_names just [None,...,None] :-D !
global MAP_GENERATOR
model_names_configs = [(model_name, model_config) for model_name, model_config in
zip(experiment_names, experiment_configs) if os.path.isdir(os.path.join('evo_experiment',
model_name, 'models'))
]
model_exp_names, model_exp_configs = [i[0] for i in model_names_configs], [i[1] for i in model_names_configs]
row_labels = [get_exp_shorthand(r) for r in model_exp_names]
map_names_configs = [(map_name, map_config) for map_name, map_config in zip(experiment_names, experiment_configs) if
os.path.isfile(os.path.join('evo_experiment', map_name, 'ME_archive.p'))]
map_exp_names, map_exp_configs = [i[0] for i in map_names_configs], [i[1] for i in map_names_configs]
col_labels = [get_exp_shorthand(c) for c in map_exp_names]
# TODO: Make it more difficult to mangle the dimensions of these arrays. Attach them to an enum type class or something
# We will use these heatmaps to visualize performance between generator-agent pairs over the set of experiments
mean_lifespans = np.zeros((1, len(model_exp_names), len(map_exp_names), N_MAP_EVALS, N_EVAL_MAPS))
# std_lifespans = np.zeros((len(model_exp_names), len(map_exp_names) + 1, N_MAP_EVALS, N_EVAL_MAPS)) # also take std of each model's average performance
mean_skills = np.zeros((len(SKILLS), len(model_exp_names), len(map_exp_names), N_MAP_EVALS, N_EVAL_MAPS))
div_scores = np.zeros((len(DIV_CALCS), len(model_exp_names), len(map_exp_names), N_MAP_EVALS, N_EVAL_MAPS))
div_scores[:] = np.nan
mean_skills[:] = np.nan
mean_lifespans[:] = np.nan
if opts.multi_policy:
mean_survivors = np.empty((len(model_exp_names), len(map_exp_names), N_MAP_EVALS, N_EVAL_MAPS), dtype=float) # np.float is removed in recent NumPy; plain float is equivalent
if vis_only:
txt_verb = 'Visualizing past inference'
elif vis_cross_eval:
txt_verb = 'Collecting data for cross-eval visualization'
else:
txt_verb = 'Inferring'
def collect_eval_data():
n = 0
for (gen_i, (map_exp_name, map_exp_config)) in enumerate(map_names_configs):
if opts.eval_baseline_maps_only:
if 'Baseline' not in map_exp_config.GENOME:
continue
TERRAIN_BORDER = map_exp_config.TERRAIN_BORDER
# For each experiment from which we are evaluating generated maps, load up its map archive in order to select
# these evaluation maps
print(f'{txt_verb} from evaluation on map generator: {map_exp_name}')
mapgen_archive_path = os.path.join('evo_experiment', map_exp_name, 'ME_archive.p')
if not os.path.isfile(mapgen_archive_path):
print(f'Missing map archive at {mapgen_archive_path}')
continue
mapgen_exp_folder = map_exp_name
mapgen_eval_path = os.path.join('eval_experiment', mapgen_exp_folder)
if not os.path.isdir(mapgen_eval_path):
print(f'Missing map-generator eval folder for map {mapgen_eval_path}')
continue
else:
map_archive = pickle.load(open(mapgen_archive_path, "rb"))
# best_ind = archive['container'].best
eval_inds = get_eval_map_inds(map_archive, n_inds=N_EVAL_MAPS)
# Evaluate on a handful of elite maps
# for map_i, eval_map in enumerate(eval_inds):
# infer_idx, best_fitness = eval_map.idx, eval_map.fitness
infer_idxs, best_fitnesses = [map.idx for map in eval_inds], [map.fitness for map in eval_inds]
map_eval_paths = []
for infer_idx in infer_idxs:
map_eval_path = os.path.join(mapgen_eval_path, str(infer_idx))
infer_idx = map_eval_path.split('/')[-1]
if not os.path.isdir(map_eval_path):
print(f' Cannot find map eval folder for map {infer_idx}')
else:
print(f' Found map eval folder for map {infer_idx}')
map_eval_paths.append(map_eval_path)
for eval_map in eval_inds:
map_path = os.path.join('evo_experiment', map_exp_name, 'maps')
# Ad-hoc fix in case I delete all the "maps" folders on HPC
if not os.path.isdir(map_path):
os.mkdir(map_path)
map_path = os.path.join(map_path, 'map' + str(eval_map.idx), '')
if not os.path.isdir(map_path):
os.mkdir(map_path)
map_arr = eval_map.chromosome.map_arr
# Saving just in case we haven't already
Save.np(map_arr, map_path)
# png_path = os.path.join('evo_experiment', map_exp_name, 'maps', 'map' + str(infer_idx) + '.png')
# Save.render(map_arr[TERRAIN_BORDER:-TERRAIN_BORDER, TERRAIN_BORDER:-TERRAIN_BORDER], MAP_GENERATOR.textures, png_path)
print('{} on maps {}, with fitness scores {}, and ages {}.'.format(txt_verb, infer_idxs, best_fitnesses,
[map.age for map in eval_inds]))
# for (mdl_i, (model_exp_name, model_config)) in enumerate(zip(model_exp_names, experiment_configs)):
l_eval_args = '--config TreeOrerock --MAP {} '.format(map_exp_name)
if opts.multi_policy:
NPOLICIES = len(experiment_names)
l_eval_args += '--MODELS {} '.format(str(model_exp_names).replace(' ', ''))
else:
NPOLICIES = 1
NPOP = NPOLICIES
# FIXME: below may be trouble if not all experiments are (not) PAIRED
l_eval_args += '--NPOLICIES {} --NPOP {} --PAIRED {}'.format(NPOLICIES, NPOP, experiment_configs[0].PAIRED)
# Do eval
if render:
for infer_idx in infer_idxs:
l_eval_args_i = l_eval_args + ' --INFER_IDXS \"{}\" '.format(infer_idx)
render_cmd = 'python Forge.py render {} {}'.format(l_eval_args_i, eval_args)
assert LOCAL # cannot render on SLURM
assert not vis_only
# Launch the client as a background process
client_cmd = './neural-mmo-client/UnityClient/neural-mmo-resources.x86_64&'
os.system(client_cmd)
print(render_cmd)
os.system(render_cmd)
elif not (vis_only or vis_cross_eval):
l_eval_args_i = l_eval_args + ' --INFER_IDXS \"{}\" '.format(infer_idxs)
if not opts.multi_policy:
eval_cmd = ''
for mdl_i in range(len(model_exp_names)):
# TODO: cpu overheats but would be nice to have option of evaluating multiple models on each map in
# sequence
l_eval_args_i_j = l_eval_args_i + '--MODELS {} '.format(str([model_exp_names[mdl_i:mdl_i+2]]).replace(' ', ''))
eval_cmd_i = 'python Forge.py evaluate {} {} --EVO_DIR {}'.format(l_eval_args_i_j, eval_args, EXP_NAME)
eval_cmd += eval_cmd_i + ' ; '
else:
eval_cmd = 'python Forge.py evaluate {} {} --EVO_DIR {} --COMPETITIVE_EVAL True'.format(l_eval_args_i, eval_args, EXP_NAME)
print(eval_cmd)
launch_cmd(eval_cmd, n)
# print(eval_cmd)
n += 1
# Do stuff with data after eval
else:
for (mdl_i, (model_exp_name, model_exp_config)) in enumerate(model_names_configs):
# std_lifespans[i, j+1] =
print(' Collecting data from model {}.'.format(model_exp_name))
global EVALUATION_HORIZON
if opts.multi_policy:
model_exp_folder_name = 'multi_policy'
model_name = str([get_exp_shorthand(m) for m in model_exp_names])
else:
model_name = get_exp_shorthand(model_exp_name)
model_exp_folder_name = model_exp_name
eval_data_paths = []
for map_eval_path in map_eval_paths:
eval_data_path = os.path.join(
map_eval_path,
model_exp_folder_name,
'{}-steps eval.npy'.format(
# model_name,
# get_exp_shorthand(map_exp_name),
# infer_idx,
EVALUATION_HORIZON
),
)
infer_idx = map_eval_path.split('/')[-1]
if os.path.isfile(eval_data_path):
eval_data_paths.append(eval_data_path)
print(f" Found eval data for map {infer_idx}.")
# print(f" Found model eval data at {eval_data_path}")
else:
print(f" Cannot find eval data for map {infer_idx}")
# print(f" Cannot find eval data at {eval_data_path}")
map_eval_data = []
for eval_data_path in eval_data_paths:
data = dict(np.load(eval_data_path, allow_pickle=True))
if opts.multi_policy:
data['survivors'] = np.load(eval_data_path.replace('eval.npy', 'multi_eval.npy'), allow_pickle=True)
map_eval_data.append(data)
# except FileNotFoundError as fnf:
# # print(fnf)
# print('Skipping. Missing eval data at: {}'.format(eval_data_path))
# continue
print(' Compiling data from map-generator.')
# FIXME: this is a tad gnarly. Could we do this more cleanly over different maps?
# t0 = time.time()
# for map_i, data in enumerate(map_eval_data):
for map_i in range(min(N_EVAL_MAPS, len(map_eval_data))):
data = map_eval_data[map_i]
final_stats, div_mat, heatmaps = data['final_stats'], data['div_mat'], data['heatmaps']
# how many eval episodes will we use for data collection? can collect fewer than saved for fast iteration
n_evals_data = min(N_MAP_EVALS, len(final_stats))
# get the mean lifespan of each eval episode
evals_mean_lifespans = [np.mean(get_pop_stats(final_stats[i]['lifespans'], pop=None))
for i in range(n_evals_data)]
# take the mean lifespan over these episodes
mean_lifespans[0, mdl_i, gen_i, :, map_i] = evals_mean_lifespans
# std over episodes
# std_lifespans[mdl_i, gen_i, map_i] = np.std(evals_mean_lifespans)
# get the mean agent skill vector of each eval episode
evals_mean_skills = np.vstack([get_pop_stats(data_i['skills'],pop=None).mean(axis=0)
for data_i in final_stats])
for s_i in range(len(SKILLS)):
mean_skills[s_i, mdl_i, gen_i, :, map_i] = evals_mean_skills[0:n_evals_data, s_i]
for (s_i, div_calc_name) in enumerate(DIV_CALCS):
# Last dimension of div_mat is time-steps (at some interval, only last time-step by default). We'll
# take the latest.
evals_div_scores = div_mat[:, s_i, -1]
div_scores[s_i, mdl_i, gen_i, 0:n_evals_data, map_i] = evals_div_scores
if opts.multi_policy:
model_name_idxs = {get_exp_shorthand(r): i for (i, r) in enumerate(model_exp_names)}
multi_eval_data_path = eval_data_path.replace('eval.npy', 'multi_eval.npy')
survivors = np.load(multi_eval_data_path, allow_pickle=True)
for map_i, map_survivors in survivors.item().items():
for model_name, n_survivors in map_survivors.items():
model_idx = model_name_idxs[model_name]
mean_survivors[model_idx, gen_i, 0:n_evals_data, map_i] = n_survivors
# t1 = time.time()
# print(f" {t1-t0} to compile data for model on map-generator")
if opts.multi_policy: # don't need to iterate through models since we pit them against each other during the same episode
break
ret = (row_labels, model_exp_configs, col_labels, mean_lifespans, mean_skills, div_scores)
if opts.multi_policy:
ret = (*ret, mean_survivors)
return ret
if opts.multi_policy:
cross_eval_data_path = os.path.join('eval_experiment', 'competitive_cross-eval_data.npy')
else:
cross_eval_data_path = os.path.join('eval_experiment', 'cross-eval_data.npy')
if not opts.re_render_cross_vis:
data_tpl = collect_eval_data()
np.save(cross_eval_data_path, data_tpl)
else:
data_tpl = np.load(cross_eval_data_path, allow_pickle=True)
if vis_cross_eval or vis_only: # might as well do cross-eval vis if visualizing individual evals I guess
print("Visualizing cross-evaluation.")
# NOTE: this is placeholder code, valid only for the current batch of experiments, which varies exclusively along the "genome", "generator_objective", and "PAIRED" dimensions. Expand the (crude) get_exp_shorthand function if we need more.
# TODO: annotate the heatmap with labels more fancily, i.e. use the lists of hyperparams to create concise (hierarchical?) axis labels.
def get_mannwhitney(data):
data = data.reshape(data.shape[0], -1)
u_stats = np.empty(shape=(data.shape[0], data.shape[0]))
u_stats[:] = np.nan
data = [[v for v in dr if not np.isnan(v)] for dr in data]
for i in range(u_stats.shape[0]):
for j in range(u_stats.shape[1]):
u_stat = scipy.stats.mannwhitneyu(data[i], data[j])
u_stats[i, j] = u_stat.pvalue
return u_stats
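# (Here u_stats[i, j] holds the Mann-Whitney U p-value comparing the pooled, NaN-dropped eval
# samples of row i against those of row j; the diagonal compares each sample against itself.)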
def get_meanstd(data, get_pvals=False):
'''Funky function for getting mean, standard deviation of our data'''
# TODO: these indices should be global variables or something like that
# This gets the mean over evaluations (-2) and maps (-1)
mean_model_mapgen = np.nanmean(data, axis=(-2, -1))
# We want the standard deviation over evaluations (-2). So we get the mean on maps (-1) first
std_model_mapgen = np.nanstd(np.nanmean(data, axis=-1), axis=-1)
# add a column looking at the mean performance of each model over all maps
mean_model = np.nanmean(data, axis=(-3, -1)) # work around missing generators/maps
mean_model = np.nanmean(mean_model, axis=-1, keepdims=True) # and evals (careful though)
# standard deviation in this column is calculated a little differently: by getting the aggregate score of each
# model over all maps, then looking at *this* random variable's standard deviation over evals
# TODO: is this a bad way to do it??? Should take std over evals, generators, and maps... or...?
# this is the mean over generators and maps (not evals!)
aggr_model = np.nanmean(data, axis=(-3, -1))
# std over evals
std_model = np.nanstd(aggr_model, axis=-1, keepdims=True)
# add column including mean performance of each model over all map generators
means = np.concatenate((mean_model_mapgen, mean_model), axis=-1)
stds = np.concatenate((std_model_mapgen, std_model), axis=-1)
# Now we add the same kind of mean column, but for maps, and calculate standard deviation in the same way
mean_map = np.nanmean(data, axis=(-4, -1))
mean_map = np.nanmean(mean_map, axis=-1, keepdims=True)
extra_cell_shape = list(mean_map.shape)
extra_cell_shape[-2] = 1
extra_cell = np.empty(extra_cell_shape)
extra_cell[:] = np.nan
mean_map = np.concatenate((mean_map, extra_cell), axis=-2)
aggr_map = np.nanmean(data, axis=(-4, -1))
std_map = np.nanstd(aggr_map, axis=-1, keepdims=True)
extra_cell_shape = list(std_map.shape)
extra_cell_shape[-2] = 1
extra_cell = np.empty(extra_cell_shape)
extra_cell[:] = np.nan
std_map = np.concatenate((std_map, extra_cell), axis=-2)
means = np.concatenate((means, np.swapaxes(mean_map, -2, -1)), -2)
stds = np.concatenate((stds, np.swapaxes(std_map, -2, -1)), -2)
pvals = None
if get_pvals:
pvals = get_mannwhitney(data)
return means, stds, pvals
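# Shape sketch for get_meanstd (assuming data has trailing axes (models, map-generators, evals, maps)):
# means and stds come back with trailing shape (n_models + 1, n_mapgens + 1); the extra last column
# is each model's aggregate over all map generators, the extra last row is each map generator's
# aggregate over all models, and the corner cell is left as NaN.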
def plot_histogram(data, row_labels, col_labels, name=""):
map_means = data[-1]
col_idxs =[(i, m) for i, m in enumerate(map_means)]
col_idx_vals = sorted(col_idxs, key=lambda tpl: tpl[1])
col_idxs = [tpl[0] for tpl in col_idx_vals]
xtick_labels = [col_labels[i] for i in col_idxs if not np.isnan(col_idx_vals[i][1])]
data = data[:, col_idxs]
col_idxs = np.arange(data.shape[1])
linestyles = ['-', '--', '-.', ':']
fig = plt.figure(figsize=(16, 20))
for i, model in enumerate(row_labels):
plt.plot(col_idxs, data[i], label=model, linestyle=linestyles[i % len(linestyles)])
plt.legend()
# locs, _ = plt.xticks()
plt.xticks(np.arange(len(xtick_labels)), xtick_labels, rotation=90)
plt.ylabel(name)
plt.tight_layout()
plt.savefig(os.path.join('eval_experiment', f'histogram_{name}.png'))
plt.close(fig)
def plot_prosp_div(data, row_labels, col_labels):
pass
def heatmaps_from_data(row_labels, col_labels, data_tpl, squash=False, figshape=(30,30)):
name = ''
if squash:
name = 'squash'
row_labels_m = copy.copy(row_labels)
col_labels_m = copy.copy(col_labels)
col_labels_m.append('mean')
row_labels_m.append('mean')
# FIXME: messy
if opts.multi_policy:
mean_lifespans, mean_skills, div_scores, mean_survivors = data_tpl
else:
mean_lifespans, mean_skills, div_scores, = data_tpl
mean_survivors = None
# mean and standard deviation of lifespans over maps and evals
mean_mapgen_lifespans, std_mapgen_lifespans, pvals_lifespans = get_meanstd(mean_lifespans[0], get_pvals=True)
if not squash: # otherwise different pros/div experiments will be collapsed into one
prosp_div_heatmap(row_labels, model_configs, mean_mapgen_lifespans[:, -1], std_mapgen_lifespans[:, -1],
title='Lifespans')
if squash: # otherwise this is an uninterpretable mess of lines
plot_histogram(mean_mapgen_lifespans, row_labels, col_labels, name='lifespans')
# Repeat this averaging logic for other stats
mean_mapgen_div_scores, std_mapgen_divscores, pvals_div_scores = get_meanstd(div_scores)
mean_mapgen_skills, std_mapgen_skills, pvals_div_scores = get_meanstd(mean_skills)
pval_figshape = figshape
if squash:
pval_figshape = (15, 15)
if opts.multi_policy:
mean_mapgen_survivors, std_mapgen_survivors, pvals_div_survivors = get_meanstd(mean_survivors)
cross_eval_heatmap(pvals_div_survivors, row_labels, row_labels, f"survivors_{name}_pvals", cbarlabel="p value",
figshape=pval_figshape)
cross_eval_heatmap(pvals_lifespans, row_labels, row_labels, f"lifespans_{name}_pvals", cbarlabel="p value",
pvals=True, figshape=pval_figshape)
if opts.multi_policy:
cross_eval_heatmap(mean_mapgen_survivors, row_labels_m, col_labels_m, f"mean survivors_{name}", "",
errors=std_mapgen_survivors, figshape=figshape)
cross_eval_heatmap(mean_mapgen_lifespans, row_labels_m, col_labels_m, f"lifespans_{name}", "mean lifespan [ticks]",
errors=std_mapgen_lifespans, figshape=figshape)
# for (s_i, skill_name) in enumerate(SKILLS):
# cross_eval_heatmap(mean_mapgen_skills[s_i], row_labels_m, col_labels_m, f"{skill_name}_{name}",
# "mean {} [xp]".format(skill_name), errors=std_mapgen_skills[s_i], figshape=figshape)
# for (d_i, div_calc_name) in enumerate(DIV_CALCS):
# cross_eval_heatmap(mean_mapgen_div_scores[d_i], row_labels_m, col_labels_m,
# f"{div_calc_name}_diversity_{name}", f"{div_calc_name} diversity",
# errors=std_mapgen_divscores[d_i], figshape=figshape)
def squash_exp_shorthand(shorthand):
'''Collapse an experiment shorthand down to its generator objective (or a Baseline / adversity-diversity quadrant label).'''
if 'Baseline' in shorthand:
# return shorthand
return 'Baseline'
for gen_obj_name in generator_objectives:
if gen_obj_name in shorthand:
if gen_obj_name == 'AdversityDiversityTrgs':
sp = shorthand.split(' ')
adv, div = float(sp[-3].strip(',')), float(sp[-1].strip(','))
if adv <= 0.5:
adv_name = "Adverse"
else:
adv_name = "Prosperous"
if div <= 0.5:
div_name = "Homogeneous"
else:
div_name = "Diverse"
return f"{adv_name} & {div_name}"
exp_name = gen_obj_name
return exp_name
else: return shorthand
def del_nan_rows(row_labels, col_labels, data):
#TODO
pass
# TODO: squash data, mann-whitney that shit
def squash_data(row_labels, col_labels, data):
''' Combine data from models from similar experiments by squashing them together, row-wise, and stacking them
along the "n_evals" dimension.'''
n_evals = data.shape[-2]
new_row_labels = []
for rl in row_labels: # excluding the "mean" row
if rl not in new_row_labels:
new_row_labels.append(rl)
row_label_idxs = {r: i for i, r in enumerate(new_row_labels)}
row_exp_counts = {r: 0 for r in new_row_labels} # so that we can stack experiments in new array
most_common, max_redundant_experiments = Counter(row_labels).most_common(1)[0]
n_net_evals = n_evals * max_redundant_experiments
sqsh_data = np.empty(shape=(data.shape[0], len(new_row_labels), data.shape[-3], n_net_evals, data.shape[-1]))
sqsh_data[:] = np.nan
print(data.shape, row_labels)
for i, rl in enumerate(row_labels):
rd = data[:,i:i+1]
n_eval_start = row_exp_counts[rl] * n_evals
row_exp_counts[rl] += 1
n_eval_end = row_exp_counts[rl] * n_evals
model_idx = row_label_idxs[rl]
# print(n_eval_end, sqsh_data.shape)
sqsh_data[:, model_idx:model_idx+1, :, n_eval_start:n_eval_end, :] = rd
new_row_labels = [None] * len(new_row_labels)
for k, v in row_label_idxs.items():
new_row_labels[v] = k
return new_row_labels, sqsh_data
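# e.g. with row_labels ['Lifespans', 'Lifespans', 'FarNearestNeighbor'] and n_evals = 5, the two
# 'Lifespans' rows are merged into a single row whose eval axis holds 2 * 5 = 10 slots, while
# labels with fewer repeats keep their unused slots as NaN.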
row_labels, model_configs, col_labels = data_tpl[0:3]
data_tpl = data_tpl[3:]
# Visualize performance of all player-policies on all map-generators
heatmaps_from_data(row_labels, col_labels, data_tpl, figshape=(70,70))
# squash experiments with different objectives together
# TODO: use actual configs for this squashing!
row_labels = [squash_exp_shorthand(l) for l in row_labels]
# col_labels = [squash_exp_shorthand(l) for l in col_labels]
new_data_tpl = []
for d in data_tpl:
new_row_labels, sqsh_data = squash_data(row_labels, col_labels, d)
new_data_tpl.append(sqsh_data)
heatmaps_from_data(new_row_labels, col_labels, new_data_tpl, squash=True, figshape=(70, 30))
def prosp_div_heatmap(row_labels, model_configs, vals, errs, title):
# TODO: throw this in a function
# Visualize mean performance (over all maps) of experiments controlling for prosperity and diversity in a 2D grid
# along dimensions of prosperity and diversity.
idxs = []
divs_prosps = []
prosp_vals = set()
div_vals = set()
for i, (model_name, model_config) in enumerate(zip(row_labels, model_configs)):
if model_config.FITNESS_METRIC == 'AdversityDiversityTrgs':
idxs.append(i)
prosp, div = model_config.ADVERSITY_DIVERSITY_TRGS
divs_prosps.append((div, prosp))
prosp_vals.add(prosp)
div_vals.add(div)
prosps = sorted(list(prosp_vals))
prosps_to_pos = {p: i for i, p in enumerate(prosps)}
divs = sorted(list(div_vals))
divs_to_pos = {d: i for i, d in enumerate(divs)}
vals = vals[idxs]
errs = errs[idxs]
data = np.empty(shape=(len(prosps), len(divs)))
errors = data.copy()
for val, err, (div, prosp) in zip(vals, errs, divs_prosps):
data[prosps_to_pos[prosp], divs_to_pos[div]] = val
errors[prosps_to_pos[prosp], divs_to_pos[div]] = err
cross_eval_heatmap(np.flip(data.T, 0), prosps[::-1], divs, title, '', errors, figshape=(10, 10), xlabel='prosperity',
ylabel='diversity', filename=f'{title} (prosperity X diversity)', swap_xticks=False)
def cross_eval_heatmap(data, row_labels, col_labels, title, cbarlabel, errors=None, pvals=False, figshape=(30,30),
xlabel='maps', ylabel='models', filename=None, swap_xticks=True):
if filename is None:
filename = title
fig, ax = plt.subplots()
# Remove empty rows and columns
i = 0
for data_row in data:
if np.isnan(data_row).all():
data = np.vstack((data[:i], data[i+1:]))
assert np.isnan(errors[i]).all()
errors = np.vstack((errors[:i], errors[i+1:]))
row_labels = row_labels[:i] + row_labels[i+1:]
continue
i += 1
i = 0
for data_col in data.T:
if np.isnan(data_col).all():
data = (np.vstack((data.T[:i], data.T[i + 1:]))).T
assert np.isnan(errors.T[i]).all()
errors = (np.vstack((errors.T[:i], errors.T[i+1:]))).T
col_labels = col_labels[:i] + col_labels[i+1:]
continue
i += 1
# fig.set_figheight(1.5*len(col_labels))
# fig.set_figwidth(1.0*len(row_labels))
fig.set_figwidth(figshape[0])
fig.set_figheight(figshape[1])
if pvals:
cmap="viridis"
else:
cmap="magma"
im, cbar = heatmap(data, row_labels, col_labels, ax=ax,
cmap=cmap, cbarlabel=cbarlabel)
if not swap_xticks:
im.axes.xaxis.tick_bottom()
class CellFormatter(object):
def __init__(self, errors):
self.errors = errors
def func(self, x, pos):
#if np.isnan(x) or np.isnan(errors[pos]):
# # print(x, errors[pos])
# # Turns out the data entry is "masked" while the error entry is nan
# # assert np.isnan(x) and np.isnan(errors[pos])
# # if not np.isnan(x) and np.isnan(errors[pos]):
# return '--'
if not pvals:
x_str = "{:.1f}".format(x)
else:
x_str = "{:.1e}".format(x)
# x_str = "{:.3f}".format(x)
if errors is None:
return x_str
err = errors[pos]
x_str = x_str + " ± {:.1f}".format(err)
return x_str
cf = CellFormatter(errors)
textcolors = ("white", "black")
texts = annotate_heatmap(im, valfmt=matplotlib.ticker.FuncFormatter(cf.func), textcolors=textcolors)
ax.set_title(title)
# fig.tight_layout(rect=[1,0,1,0])
fig.tight_layout(pad=3)
# plt.show()
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
plt.savefig(os.path.join(
'eval_experiment',
'{}.png'.format(filename),
))
plt.close()
if __name__ == '__main__':
opts = argparse.ArgumentParser(
description='Launch a batch of experiments/evaluations for evo-pcgrl')
opts.add_argument(
'-ex',
'--experiment_name',
help='A name to be shared by the batch of experiments.',
default='0',
)
opts.add_argument(
'-ev',
'--evaluate',
help='Cross-evaluate a batch of joint map-evolution, agent-learning experiments, looking at the behavior of all '
'agent models on all ("best") maps.',
action='store_true',
)
opts.add_argument(
'-l',
'--local',
help='Run the batch script on a local machine (evolving for a minimal number of generations, or running full evaluations sequentially).',
action='store_true',
)
opts.add_argument(
'-bl',
'--train_baseline',
help='Train a baseline on Perlin noise-generated maps.',
action='store_true',
)
opts.add_argument(
'--cpu',
help='Do not use GPU (only applies to SLURM, not recommended for default, big neural networks).',
action='store_true',
)
opts.add_argument(
'--n_cpu',
help='How many parallel processes ray should use.',
type=int,
default=12,
)
opts.add_argument(
'--vis_cross_eval',
help='Visualize the results of cross-evaluation. (No new evaluations.)',
action='store_true',
)
opts.add_argument(
'--vis_evals',
help='Visualize the results of individual evaluations and cross-evaluation. (No new evaluations.)',
action='store_true',
)
opts.add_argument(
'--vis_maps',
help='Save and visualize evolved maps, and plot their fitness.',
action='store_true'
)
opts.add_argument(
'--render',
help='Render an episode in unity.',
action='store_true'
)
opts.add_argument(
'-mp',
'--multi-policy',
help='Evaluate all policies on each map simultaneously, to allow for inter-policy competition.',
action='store_true',
)
opts.add_argument(
'--eval_baseline_maps_only',
help='Only use baseline experiments for evaluation maps.',
action='store_true',
)
opts.add_argument(
'--re-render_cross_vis',
help='Re-render the heatmaps resulting from the last cross-visualization. For iterating on the way we render '
'these cross-vis graphics.',
action='store_true',
)
opts = opts.parse_args()
EXP_NAME = opts.experiment_name
EVALUATE = opts.evaluate
LOCAL = opts.local
TRAIN_BASELINE = opts.train_baseline
CUDA = not opts.cpu and not opts.vis_maps and not EVALUATE
VIS_CROSS_EVAL = opts.vis_cross_eval
VIS_EVALS = opts.vis_evals
RENDER = opts.render
if EVALUATE or opts.vis_maps:
JOB_TIME = 24
elif CUDA:
JOB_TIME = 48 # NYU HPC Greene limits number of gpu jobs otherwise
else:
pass # NOTE: JOB_TIME is left undefined here, so a CPU-only training job would fail in launch_cmd
# JOB_TIME = 120 # never use CPU-only for training anyway
if EVALUATE and opts.multi_policy:
JOB_CPUS = 48
else:
JOB_CPUS = 12
if CUDA:
sbatch_file = 'evo_train.sh'
else:
sbatch_file = 'evo_train_cpu.sh'
if LOCAL:
print('Testing locally.')
else:
print('Launching batch of experiments on SLURM.')
with open('configs/default_settings.json', 'r') as f:
default_config = json.load(f)
print('Loaded default config:\n{}'.format(default_config))
if (EVALUATE or RENDER or VIS_EVALS or VIS_CROSS_EVAL) and not opts.vis_maps:
# just get the names and configs of experiments in which we are interested (no actual evaluations are run)
exp_dicts = launch_batch(EXP_NAME, get_exp_info_only=True)
experiment_configs = [config.EvoNMMO() for ec in exp_dicts]
[ec.set(*i) for ec, ecd in zip(experiment_configs, exp_dicts) for i in ecd.items()]
experiment_names = [get_experiment_name(ec) for ec in experiment_configs]
if RENDER:
print('rendering experiments: {}\n KeyboardInterrupt (Ctrl+c) to render next.'.format(experiment_names))
launch_cross_eval(experiment_names, vis_only=False, render=True, experiment_configs=experiment_configs)
else:
if not (VIS_CROSS_EVAL or VIS_EVALS):
print('cross evaluating experiments: {}'.format(experiment_names))
# only launch these cross evaluations if we need to
launch_cross_eval(experiment_names, experiment_configs=experiment_configs, vis_only=False)
# otherwise just load up old data to visualize results
if VIS_EVALS:
# visualize individual evaluations.
launch_cross_eval(experiment_names, experiment_configs=experiment_configs, vis_only=True)
elif VIS_CROSS_EVAL or LOCAL: # elif since vis_only also prompts cross-eval visualization
# visualize cross-evaluation tables
launch_cross_eval(experiment_names, experiment_configs=experiment_configs, vis_only=False, vis_cross_eval=True)
else:
# Launch a batch of joint map-evolution and agent-training experiments (maybe also a baseline agent-training experiment on a fixed set of maps).
launch_batch(EXP_NAME)