From e461ef06e9e9f23b39a9ecf145b1118947b975bc Mon Sep 17 00:00:00 2001 From: jnasser Date: Fri, 17 Apr 2020 14:17:18 -0400 Subject: [PATCH] Make --H3K27ac an optional parameter. Allow running of ATAC only model --- src/neighborhoods.py | 29 ++++++++++++++++++++++------- src/run.neighborhoods.py | 2 +- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/neighborhoods.py b/src/neighborhoods.py index 49d5bdbe7..acf86833e 100644 --- a/src/neighborhoods.py +++ b/src/neighborhoods.py @@ -93,7 +93,12 @@ def annotate_genes_with_features(genes, merged = genes.merge(tsscounts, on="name", suffixes=['','.TSS1Kb']) access_col = default_accessibility_feature + ".RPKM.quantile.TSS1Kb" - merged['PromoterActivityQuantile'] = ((0.0001+merged['H3K27ac.RPKM.quantile.TSS1Kb'])*(0.0001+merged[access_col])).rank(method='average', na_option="top", ascending=True, pct=True) + + if 'H3K27ac.RPKM.quantile.TSS1Kb' in merged.columns: + merged['PromoterActivityQuantile'] = ((0.0001+merged['H3K27ac.RPKM.quantile.TSS1Kb'])*(0.0001+merged[access_col])).rank(method='average', na_option="top", ascending=True, pct=True) + else: + merged['PromoterActivityQuantile'] = ((0.0001+merged[access_col])).rank(method='average', na_option="top", ascending=True, pct=True) + merged.to_csv(os.path.join(outdir, "GeneList.txt"), sep='\t', index=False, header=True, float_format="%.6f") @@ -544,7 +549,9 @@ def parse_params_file(args): def get_features(args): features = {} - features['H3K27ac'] = args.H3K27ac.split(",") + + if args.H3K27ac: + features['H3K27ac'] = args.H3K27ac.split(",") if args.ATAC: features['ATAC'] = args.ATAC.split(",") @@ -573,11 +580,19 @@ def determine_accessibility_feature(args): def compute_activity(df, access_col): if access_col == "DHS": - df['activity_base'] = np.sqrt(df['normalized_h3K27ac'] * df['normalized_dhs']) - df['activity_base_no_qnorm'] = np.sqrt(df['H3K27ac.RPM'] * df['DHS.RPM']) + if 'H3K27ac.RPM' in df.columns: + df['activity_base'] = np.sqrt(df['normalized_h3K27ac'] * df['normalized_dhs']) + df['activity_base_no_qnorm'] = np.sqrt(df['H3K27ac.RPM'] * df['DHS.RPM']) + else: + df['activity_base'] = df['normalized_dhs'] + df['activity_base_no_qnorm'] = df['DHS.RPM'] elif access_col == "ATAC": - df['activity_base'] = np.sqrt(df['normalized_h3K27ac'] * df['normalized_atac']) - df['activity_base_no_qnorm'] = np.sqrt(df['H3K27ac.RPM'] * df['ATAC.RPM']) + if 'H3K27ac.RPM' in df.columns: + df['activity_base'] = np.sqrt(df['normalized_h3K27ac'] * df['normalized_atac']) + df['activity_base_no_qnorm'] = np.sqrt(df['H3K27ac.RPM'] * df['ATAC.RPM']) + else: + df['activity_base'] = df['normalized_atac'] + df['activity_base_no_qnorm'] = df['ATAC.RPM'] else: raise RuntimeError("At least one of ATAC or DHS must be provided!") @@ -589,7 +604,7 @@ def run_qnorm(df, qnorm, qnorm_method = "rank", separate_promoters = True): # Option to qnorm promoters and nonpromoters separately if qnorm is None: - df['normalized_h3K27ac'] = df['H3K27ac.RPM'] + if 'H3K27ac.RPM' in df.columns: df['normalized_h3K27ac'] = df['H3K27ac.RPM'] if 'DHS.RPM' in df.columns: df['normalized_dhs'] = df['DHS.RPM'] if 'ATAC.RPM' in df.columns: df['normalized_atac'] = df['ATAC.RPM'] else: diff --git a/src/run.neighborhoods.py b/src/run.neighborhoods.py index d9f4c1dae..f379e4fef 100644 --- a/src/run.neighborhoods.py +++ b/src/run.neighborhoods.py @@ -25,7 +25,7 @@ class formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpForm parser.add_argument('--skip_gene_counts', action="store_true", help="Do not count over genes or gene bodies. Will not produce GeneList.txt. Do not use switch if intending to run Predictions") #epi - parser.add_argument('--H3K27ac', required=required_args, default=None, help="Comma delimited string of H3K27ac .bam files") + parser.add_argument('--H3K27ac', default="", nargs='?', help="Comma delimited string of H3K27ac .bam files") parser.add_argument('--DHS', default="", nargs='?', help="Comma delimited string of DHS .bam files. Either ATAC or DHS must be provided") parser.add_argument('--ATAC', default="", nargs='?', help="Comma delimited string of ATAC .bam files. Either ATAC or DHS must be provided") parser.add_argument('--default_accessibility_feature', default=None, nargs='?', help="If both ATAC and DHS are provided, this flag must be set to either 'DHS' or 'ATAC' signifying which datatype to use in computing activity")