Skip to content

Latest commit

 

History

History
440 lines (341 loc) · 12.7 KB

make_supp_data.md

File metadata and controls

440 lines (341 loc) · 12.7 KB

Make supplementary data files

This Python Jupyter notebook makes supplementary files for the antibody sets for which this is specified in escape_profiles_config:

  • raw data files
  • dms-view input files
  • correlations between libraries

Set up

Import Python modules:

import itertools
import math
import os

from IPython.display import display, HTML

import matplotlib.pyplot as plt

import pandas as pd

from plotnine import *

import yaml

Read the configuration file:

# Load the top-level pipeline configuration from its YAML file.
with open('config.yaml') as config_file:
    config = yaml.safe_load(config_file)

Create output directory:

# Make sure the supplementary-data output directory exists (no-op if it does).
supp_data_dir = config['supp_data_dir']
os.makedirs(supp_data_dir, exist_ok=True)

Extract from configuration what we will use as the site- and mutation-level metrics:

# Pull the site- and mutation-level selection metrics from the configuration.
site_metric, mut_metric = config['site_metric'], config['mut_metric']

print(f"At site level, quantifying selection by {site_metric}")
print(f"At mutation level, quantify selection by {mut_metric}")
At site level, quantifying selection by site_total_escape_frac_epistasis_model
At mutation level, quantify selection by mut_escape_frac_epistasis_model

Read escape profiles configuration and get sets for which we will be making supp data:

# Read the escape-profiles configuration and keep only the condition sets
# whose specs request supplementary data via a truthy `make_supp_data` entry.
with open(config['escape_profiles_config']) as profiles_file:
    escape_profiles_config = yaml.safe_load(profiles_file)

condition_sets = {}
for set_name, specs in escape_profiles_config.items():
    if specs.get('make_supp_data'):
        condition_sets[set_name] = specs['conditions']

print('Making supplementary data for the following condition sets:\n  ' + '\n  '.join(condition_sets))
Making supplementary data for the following condition sets:
  karolinska

Read the escape fractions

# Load the per-mutation escape fractions computed upstream in the pipeline.
print(f"Reading escape fractions from {config['escape_fracs']}")
escape_fracs_file = config['escape_fracs']
escape_fracs_all = pd.read_csv(escape_fracs_file)
Reading escape fractions from results/escape_scores/escape_fracs.csv

Read the configuration about what data gets output on what PDBs:

# Configuration mapping condition sets onto the PDB structures they are
# visualized on.
with open(config['output_pdbs_config']) as pdbs_file:
    output_pdbs_config = yaml.safe_load(pdbs_file)

Correlations between libraries

Plot correlations between libraries for mutation- and site-level escape:

# Identify the individual (non-averaged) replicate libraries; the correlation
# plots below assume there are exactly two of them.
libraries = [lib for lib in escape_fracs_all['library'].unique() if lib != "average"]
assert len(libraries) == 2, 'plot only makes sense if 2 libraries'

# For every condition set, plot library-vs-library correlations at both the
# mutation level and the site level.
for metric_type, (set_name, condition_set) in itertools.product(['mutation', 'site'],
                                                                condition_sets.items()):
    
    print(f"\nPlotting correlations of {metric_type} escape for {set_name}")
    
    # choose the escape metric and pivot keys appropriate for this level
    if metric_type == 'mutation':
        metric = mut_metric
        pivot_index = ['selection', 'site', 'mutation']
    elif metric_type == 'site':
        metric = site_metric
        pivot_index = ['selection', 'site']
    else:
        raise ValueError(f"invalid `metric_type` {metric_type}")

    # wide data frame with each library's score in a different column
    df_wide = (
        escape_fracs_all
        .query('library != "average"')
        .query('selection in @condition_set')
        # map selection names to the shorter display names in the config
        .assign(selection=lambda x: x['selection'].map(condition_set))
        # need to drop duplicate columns if using site metric
        [['library', metric] + pivot_index]
        .drop_duplicates()
        # now make columns for each library, only keep mutants with scores for both libs
        .pivot_table(index=pivot_index,
                     columns='library',
                     values=metric)
        .reset_index()
        .dropna(axis=0)
        )

    # correlations between libraries
    corrs = (
        df_wide
        .groupby('selection')
        [libraries]
        .corr(method='pearson')
        .reset_index()
        # the 2x2 corr matrix yields two rows per selection; keep one
        .query('library == @libraries[0]')
        .assign(correlation=lambda x: 'R=' + x[libraries[1]].round(2).astype(str))
        [['selection', 'correlation']]
        # add number of mutations measured
        .merge(df_wide
               .groupby('selection')
               .size()
               .rename('n')
               .reset_index()
               )
        .assign(correlation=lambda x: x['correlation'] + ', N=' + x['n'].astype(str))
        )

    # plot correlations; the R/N annotation is anchored near the top-left
    # corner of the data range in each facet
    nfacets = df_wide['selection'].nunique()
    ncol = min(nfacets, 5)
    nrow = math.ceil(nfacets / ncol)
    xmin = df_wide[libraries[0]].min()
    xspan = df_wide[libraries[0]].max() - xmin
    ymin = df_wide[libraries[1]].min()
    yspan = df_wide[libraries[1]].max() - ymin
    p = (ggplot(df_wide) +
         aes(libraries[0], libraries[1]) +
         geom_point(alpha=0.2, color='black') +
         geom_text(mapping=aes(label='correlation'),
                   data=corrs,
                   x=0.01 * xspan + xmin,
                   y=0.99 * yspan + ymin,
                   size=10,
                   ha='left',
                   va='top',
                   ) +
         facet_wrap('~ selection', ncol=ncol) +
         theme_classic() +
         theme(figure_size=(2.5 * ncol, 2.5 * nrow),
               plot_title=element_text(size=14)) +
         ggtitle(f"{metric_type}-level escape")
         )

    # render inline, save to PDF, then close the figure to free memory
    fig = p.draw()
    display(fig)
    plotfile = os.path.join(config['supp_data_dir'],
                            f"{set_name}-{metric_type}-corr.pdf")
    print(f"Saving plot to {plotfile}")
    p.save(plotfile, verbose=False)
    plt.close(fig)
Plotting correlations of mutation escape for karolinska

png

Saving plot to results/supp_data/karolinska-mutation-corr.pdf

Plotting correlations of site escape for karolinska

png

Saving plot to results/supp_data/karolinska-site-corr.pdf

Raw data files

Get the "raw data" for the effects of each mutation on antibody escape. This has just the columns of most interest for the average of the two libraries, renamed in a sensible way:

# "Raw data": the library-averaged escape scores with sensibly named columns,
# plus a per-(condition, site) maximum mutation escape column.
_rename_map = {mut_metric: 'mut_escape',
               site_metric: 'site_total_escape'}
_keep_cols = ['condition', 'site', 'label_site', 'wildtype', 'mutation',
              'protein_chain', 'protein_site', 'mut_escape', 'site_total_escape']

raw_data = (escape_fracs_all
            .query('library == "average"')
            .drop(columns=['library'])
            .rename(columns=_rename_map)
            [_keep_cols]
            )
raw_data = raw_data.assign(
    site_max_escape=lambda x: x.groupby(['condition', 'site'])['mut_escape'].transform('max'))

Now write the raw data for each condition set:

# Write one raw-data CSV per condition set, with conditions mapped to their
# short display names and `label_site` exposed simply as `site`.
for set_name, condition_set in condition_sets.items():
    print(f"\nRaw data for {set_name}:")
    set_df = raw_data.query('condition in @condition_set')
    set_df = set_df.assign(condition=lambda x: x['condition'].map(condition_set))
    set_df = set_df[['condition', 'label_site', 'wildtype', 'mutation', 'mut_escape',
                     'site_total_escape', 'site_max_escape']]
    set_df = set_df.rename(columns={'label_site': 'site'})
    csv_file = os.path.join(config['supp_data_dir'], f"{set_name}_raw_data.csv")
    print(f"Writing to {csv_file}")
    set_df.to_csv(csv_file, index=False, float_format='%.4g')
Raw data for karolinska:
Writing to results/supp_data/karolinska_raw_data.csv

Data for dms-view:

Now we write the data in a format for display by dms-view with several different possible mutation-level escape color schemes, and also with options to directly visualize the deep mutational scanning data. We write such a file for each condition set for the PDBs specified in output_pdbs_config.

First, create the different color schemes for the mutation-level escape. The first step is to read these color schemes:

# Read the DMS color schemes, then add two synthetic schemes: uniform gray
# and coloring by amino-acid functional group.
print(f"Reading DMS color schemes from {config['escape_profiles_dms_colors']}")

dms_colors = pd.read_csv(config['escape_profiles_dms_colors'])
dms_colors = (dms_colors
              .drop(columns=['bind', 'expr'])
              .rename(columns={'site': 'label_site',
                               'bind_color': 'color ACE2 bind',
                               'expr_color': 'color RBD expr'})
              )
dms_colors['color gray'] = '#696969'
dms_colors['color func group'] = 'functional'
Reading DMS color schemes from results/escape_profiles/escape_profiles_dms_colors.csv

Now write a dms-view input file that allows different mutation-level coloring schemes:

# Build the dms-view input frame: one stacked copy of the raw data per color
# scheme, each left-merged with that scheme's per-mutation colors.
color_schemes = dms_colors.drop(columns=['label_site', 'mutation']).columns.tolist()

per_scheme_dfs = []
for color in color_schemes:
    scheme_df = raw_data.merge(dms_colors[['label_site', 'mutation', color]],
                               how='left',
                               validate='many_to_one')
    scheme_df = scheme_df.rename(columns={color: 'color_for_mutation',
                                          'mut_escape': 'mut_escape ' + color})
    per_scheme_dfs.append(scheme_df)

dms_view_df = (pd.concat(per_scheme_dfs)
               .rename(columns={'site_max_escape': 'site_max escape',
                                'site_total_escape': 'site_total escape'})
               .drop(columns='protein_chain')
               )

display(HTML(dms_view_df.head().to_html(index=False)))
condition site label_site wildtype mutation protein_site mut_escape color ACE2 bind site_total escape site_max escape color_for_mutation mut_escape color RBD expr mut_escape color gray mut_escape color func group
CAB-A17_139 1 331 N A 331 0.001006 0.01628 0.00109 #692505 NaN NaN NaN
CAB-A17_139 1 331 N D 331 0.001013 0.01628 0.00109 #662505 NaN NaN NaN
CAB-A17_139 1 331 N E 331 0.000979 0.01628 0.00109 #662505 NaN NaN NaN
CAB-A17_139 1 331 N F 331 0.001090 0.01628 0.00109 #722805 NaN NaN NaN
CAB-A17_139 1 331 N G 331 0.001025 0.01628 0.00109 #6a2605 NaN NaN NaN

Finally, write dms-view input files for each condition set:

# Write a dms-view input CSV for each (condition set, PDB) pairing that has
# at least one condition mapped onto that PDB.
for set_name, condition_set in condition_sets.items():

    for pdb_name, pdb_specs in output_pdbs_config.items():

        # resolve which conditions of this set are shown on this PDB: either
        # the special string 'all' or an explicit list of conditions
        specified = pdb_specs['conditions']
        if isinstance(specified, str) and specified.upper() == 'ALL':
            pdb_conditions = condition_set
        else:
            assert isinstance(specified, list)
            pdb_conditions = [c for c in condition_set if c in specified]
        if not pdb_conditions:
            continue

        df = (dms_view_df
              .query('condition in @pdb_conditions')
              .assign(condition=lambda x: x['condition'].map(condition_set))  # re-name to shorter names
              )

        # insert the protein-chain column immediately before `protein_site`
        chains = ' '.join(pdb_specs['chains'])
        df.insert(df.columns.tolist().index('protein_site'), 'protein_chain', chains)

        csv_file = os.path.join(config['supp_data_dir'], f"{set_name}_{pdb_name}_dms-view_data.csv")
        print(f"Writing `dms-view` input file for {set_name} mapped to PDB {pdb_name} to {csv_file}")
        df.to_csv(csv_file, index=False, float_format='%.4g')
Writing `dms-view` input file for karolinska mapped to PDB 6m0j to results/supp_data/karolinska_6m0j_dms-view_data.csv
Writing `dms-view` input file for karolinska mapped to PDB BA1_A17 to results/supp_data/karolinska_BA1_A17_dms-view_data.csv