# Patch against comp.py. NOTE: in this copy the line breaks of the patch are
# collapsed, so several diff lines share one physical line.
#
# What the visible hunks change:
#   * Whitespace-only cleanups: removed/added line pairs that read identically
#     here, extra blank lines before top-level defs, re-wrapped long calls
#     (earnt.plot, ax.xaxis.grid, the group_thousands lambda), and a space
#     added after the comma in xls.parse('Costs', index_col=[0]).
#   * New deduplicate_legend(handles, labels): keeps the first handle seen for
#     each distinct label, then returns (handles, labels) sorted by label.
#   * New group_hbar_plots(ax, group_size, inner_sep=None): takes the legend
#     handles of `ax` and calls patch.set_y() on every patch so that each run
#     of `group_size` consecutive rows is placed together. Bar height is read
#     from the first patch of the first handle only.
#   * compare_scenarios(), per result file: instead of appending `esum`
#     directly, builds `com_sums` with one column per commodity (the part
#     after '.' in the first-level column labels), each being the row-wise sum
#     of the matching columns, and appends that.
#   * compare_scenarios(), analysis: the 'Created' rows are no longer summed
#     with .sum(axis=1, level=0); they are used as selected.
#   * compare_scenarios(), plot: the energy plot gets width=.5, the bars are
#     regrouped with group_hbar_plots(ax1, len(coms)), the y tick labels are
#     set from level 1 of esums.index, and the x label gains an
#     'Emitted CO2 (kt)' line when 'CO2' is in coms.
#
# NOTE(review) - points to confirm before applying:
#   * deduplicate_legend: for empty handles/labels, zip(*sorted(...)) yields
#     nothing, so the two-name unpacking raises ValueError.
#   * deduplicate_legend is not called anywhere in the visible hunks.
#   * group_hbar_plots: `if not inner_sep` also replaces an explicit 0 with
#     the default; `inner_sep is None` would honor 0. The loop variable
#     `column` is unused.
#   * compare_scenarios: `sit_com.str.contains(com)` is a substring/regex
#     match, so a commodity whose name occurs inside another column label
#     would pull in that column too. `[com for sit, com in sitcom]` needs
#     exactly one '.' in every label.
#   * compare_scenarios: `coms` is read after the loop, so it holds the value
#     from the last result file only - presumably all scenarios share the
#     same commodities; verify.
#   * The context comment "# remove scenario names from second plot" now
#     precedes code that sets the y tick labels instead of clearing them.
diff --git a/comp.py b/comp.py index f953dddf..32eea52c 100644 --- a/comp.py +++ b/comp.py @@ -10,12 +10,13 @@ # INIT + def get_most_recent_entry(search_dir): """ Return most recently modified entry from given directory. - + Args: search_dir: an absolute or relative path to a directory - + Returns: The file/folder in search_dir that has the most recent 'modified' datetime. @@ -24,12 +25,13 @@ def get_most_recent_entry(search_dir): entries.sort(key=lambda x: os.path.getmtime(x)) return entries[-1] + def glob_result_files(folder_name): - """ Glob result spreadsheets from specified folder. - + """ Glob result spreadsheets from specified folder. + Args: folder_name: an absolute or relative path to a directory - + Returns: list of filenames that match the pattern 'scenario_*.xlsx' """ @@ -37,28 +39,76 @@ def glob_result_files(folder_name): result_files = sorted(glob.glob(glob_pattern)) return result_files + +def deduplicate_legend(handles, labels): + """ Remove double entries from figure legend. 
+ + Args: + handles: list of legend entry handles + labels: list of legend entry labels + + Returns: + (handles, labels) tuple of lists with duplicate labels removed + """ + new_handles = [] + new_labels = [] + for hdl, lbl in zip(handles, labels): + if lbl not in new_labels: + new_handles.append(hdl) + new_labels.append(lbl) + # also, sort both lists accordingly + new_labels, new_handles = (list(t) for t + in zip(*sorted(zip(new_labels, new_handles)))) + return (new_handles, new_labels) + + +def group_hbar_plots(ax, group_size, inner_sep=None): + """ + Args: + ax: matplotlib axis + group_size (int): how many bars to group together + inner_sep (float): vertical spacing within group (optional) + """ + handles, labels = ax.get_legend_handles_labels() + bar_height = handles[0][0].get_height() # assumption: all bars identical + + if not inner_sep: + inner_sep = 0.5 * (1 - bar_height) + + for column, handle in enumerate(handles): + for row, patch in enumerate(handle.patches): + group_number, row_within_group = divmod(row, group_size) + + group_offset = (group_number * group_size + + 0.5 * (group_size - 1) * (1 - inner_sep) + - 0.5 * (group_size * bar_height)) + + patch.set_y(row_within_group * (bar_height + inner_sep) + + group_offset) + + def compare_scenarios(result_files, output_filename): """ Create report sheet and plots for given report spreadsheets. - + Args: result_files: a list of spreadsheet filenames generated by urbs.report - output_filename: a spreadsheet filename that the comparison is to be + output_filename: a spreadsheet filename that the comparison is to be written to - + Returns: Nothing - - To do: - Don't use report spreadsheets, instead load pickled problem + + To do: + Don't use report spreadsheets, instead load pickled problem instances. This would make this function less fragile and dependent on the output format of urbs.report(). 
""" - + # derive list of scenario names for column labels/figure captions - scenario_names = [os.path.basename(rf) # drop folder names, keep filename - .replace('_', ' ') # replace _ with spaces - .replace('.xlsx', '') # drop file extension - .replace('scenario ', '') # drop 'scenario ' prefix + scenario_names = [os.path.basename(rf) # drop folder names, keep filename + .replace('_', ' ') # replace _ with spaces + .replace('.xlsx', '') # drop file extension + .replace('scenario ', '') # drop 'scenario ' prefix for rf in result_files] # find base scenario and put at first position @@ -67,32 +117,45 @@ def compare_scenarios(result_files, output_filename): result_files.append(result_files.pop(base_scenario)) scenario_names.append(scenario_names.pop(base_scenario)) except ValueError: - pass # do nothing if no base scenario is found - + pass # do nothing if no base scenario is found + costs = [] # total costs by type and scenario esums = [] # sum of energy produced by scenario - + # READ - + for rf in result_files: with pd.ExcelFile(rf) as xls: - cost = xls.parse('Costs',index_col=[0]) + cost = xls.parse('Costs', index_col=[0]) esum = xls.parse('Commodity sums') - + # repair broken MultiIndex in the first column esum.reset_index(inplace=True) esum.fillna(method='ffill', inplace=True) esum.set_index(['level_0', 'level_1'], inplace=True) costs.append(cost) - esums.append(esum) - + + # extract sites and commodities from scenario + sitcom = [value.split('.') for value + in esum.columns.get_level_values(0)] + coms = set([com for sit, com in sitcom]) + com_sums = pd.DataFrame() + # get site.commodity names + sit_com = esum.columns.get_level_values(0) + # sum each commodity (e.g. 
Elec, CO2) + for com in coms: + com_sum = pd.DataFrame(esum.loc[:, sit_com.str.contains(com)] + .sum(axis=1), columns=[com]) + com_sums = pd.concat([com_sums, com_sum], axis=1) + esums.append(com_sums) + # merge everything into one DataFrame each costs = pd.concat(costs, axis=1, keys=scenario_names) esums = pd.concat(esums, axis=1, keys=scenario_names) - + # ANALYSE - + # drop redundant 'costs' column label # make index name nicer for plot # sort/transpose frame @@ -103,87 +166,94 @@ def compare_scenarios(result_files, output_filename): costs = costs / 1e9 spent = costs.loc[:, costs.sum() > 0] earnt = costs.loc[:, costs.sum() < 0] - - # sum up created energy over all locations, but keeping scenarios (level=0) + + # extract created + # per commodity (e.g. 'Elec', 'CO2', 'Heat'...) # make index name 'Commodity' nicer for plot # drop all unused commodities and sort/transpose # convert MWh to GWh - esums = esums.loc['Created'].sum(axis=1, level=0) + esums = esums.loc['Created'] esums.index.name = 'Commodity' used_commodities = (esums.sum(axis=1) > 0) esums = esums[used_commodities].sort_index().transpose() esums = esums / 1e3 - + # PLOT - + fig = plt.figure(figsize=(20, 8)) gs = gridspec.GridSpec(1, 2, width_ratios=[2, 3]) - + ax0 = plt.subplot(gs[0]) spent_colors = [urbs.to_color(ct) for ct in spent.columns] bp0 = spent.plot(ax=ax0, kind='barh', stacked=True, color=spent_colors, linewidth=0) if not earnt.empty: - earnt_colors = [urbs.to_color(ct) for ct in earnt.columns] - bp0a = earnt.plot(ax=ax0, kind='barh', stacked=True, color=earnt_colors, - linewidth=0) - + earnt_colors = [urbs.to_color(ct) for ct in earnt.columns] + bp0a = earnt.plot(ax=ax0, kind='barh', stacked=True, + color=earnt_colors, linewidth=0) + ax1 = plt.subplot(gs[1]) esums_colors = [urbs.to_color(commodity) for commodity in esums.columns] bp1 = esums.plot(ax=ax1, kind='barh', stacked=True, color=esums_colors, - linewidth=0) - + linewidth=0, width=.5) + # remove scenario names from second plot - 
ax1.set_yticklabels('') - + group_hbar_plots(ax1, len(coms)) + ax1.set_yticklabels(esums.index.get_level_values(1)) + # make bar plot edges lighter for bp in [bp0, bp1]: for patch in bp.patches: patch.set_edgecolor(urbs.to_color('Decoration')) - + # set limits and ticks for both axes for ax in [ax0, ax1]: plt.setp(list(ax.spines.values()), color=urbs.to_color('Grid')) ax.yaxis.grid(False) - ax.xaxis.grid(True, 'major', color=urbs.to_color('Grid'), linestyle='-') + ax.xaxis.grid(True, 'major', color=urbs.to_color('Grid'), + linestyle='-') ax.xaxis.set_ticks_position('none') ax.yaxis.set_ticks_position('none') - + # group 1,000,000 with commas - group_thousands = tkr.FuncFormatter(lambda x, pos: '{:0,d}'.format(int(x))) + group_thousands = tkr.FuncFormatter(lambda x, + pos: '{:0,d}'.format(int(x))) ax.xaxis.set_major_formatter(group_thousands) - + # legend lg = ax.legend(frameon=False, loc='upper center', ncol=4, bbox_to_anchor=(0.5, 1.11)) plt.setp(lg.get_patches(), edgecolor=urbs.to_color('Decoration'), linewidth=0) - + ax0.set_xlabel('Total costs (billion EUR/a)') - ax1.set_xlabel('Total energy produced (GWh)') - + if 'CO2' in coms: + ax1.set_xlabel('Total energy produced (GWh)\n Emitted CO2 (kt)') + else: + ax1.set_xlabel('Total energy produced (GWh)') + for ext in ['png', 'pdf']: fig.savefig('{}.{}'.format(output_filename, ext), bbox_inches='tight') - + # REPORT with pd.ExcelWriter('{}.{}'.format(output_filename, 'xlsx')) as writer: costs.to_excel(writer, 'Costs') esums.to_excel(writer, 'Energy sums') - + if __name__ == '__main__': - + directories = sys.argv[1:] if not directories: # get the directory of the supposedly last run # and retrieve (glob) a list of all result spreadsheets from there directories = [get_most_recent_entry('result')] - + for directory in directories: result_files = glob_result_files(directory) - - # specify comparison result filename + + # specify comparison result filename # and run the comparison function comp_filename = 
os.path.join(directory, 'comparison') compare_scenarios(list(reversed(result_files)), comp_filename)