TORCH-Consortium · abhi18av · Jan 19, 2023 · Jan 15, 2023 · Jan 15, 2023 · Jan 15, 2023
diff --git a/.gitignore b/.gitignore
@@ -7,7 +7,7 @@ data/test_data/
 
 resources/exit_rif/
 
-containers/xbs-nf-container-1/resistance_db_who
+containers/magma-container-1/resistance_db_who
 
 */**/*fastq.gz
 */**/*fasta
@@ -22,9 +22,9 @@ data/full_data
 results*
 *.nextflow*
 
-conda_envs/xbs-nf-env*
+conda_envs/magma-env*
 containers/**/*yml
 
 samplesheet.csv
-xbs-nf.sh
+magma.sh
 .Rproj.user
diff --git a/Makefile b/Makefile
@@ -1,8 +1,5 @@
 # https://makefiletutorial.com/
 
-run_stub:
-	bash ./data/mock_data/generate_mock_files.sh && nextflow run main.nf -entry TEST -profile dev -stub-run -process.cpus 1 -process.memory 1.GB -resume
-
 run_dev:
 	nextflow run main.nf -profile conda,dev -entry TEST  -resume -with-tower
 

diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
-# XBS-nf
+# MAGMA
 
-XBS-nf (compleX Bacterial Samples) is a pipeline for comprehensive genomic analyses of Mycobacterium tuberculosis with a focus on clinical decision making as well as research.
+MAGMA (**M**aximum **A**ccessible **G**enome for **M**tb **A**nalysis) is a pipeline for comprehensive genomic analyses of Mycobacterium tuberculosis with a focus on clinical decision making as well as research.
 
 # Salient features of the implementation
 
@@ -9,25 +9,23 @@ XBS-nf (compleX Bacterial Samples) is a pipeline for comprehensive genomic analy
 - Ease of use on a range of infrastructure (cloud/on-prem HPC clusters/ servers (or local machines))
 - Resumability for failed processes
 - Centralized locations for specifying analysis parameters and hardware requirements
-  - XBS-nf parameters (`default_parameters.config`)
+  - MAGMA parameters (`default_parameters.config`)
   - Hardware requirements (`conf/standard.config`)
   - Execution (software) requirements (`conf/docker.config` or `conf/conda.config`)
-- A GVCF reference dataset for ~600 samples
+- An (optional) GVCF reference dataset for ~600 samples is provided for augmenting smaller datasets
 
-# Usage and Tutorial
+> **Note**
+> Downloading the reference EXIT_RIF GVCF files from FIXME
 
-For the usage and tutorials please refer the XBS-nf website
+# Tutorials and Presentations
 
-## Prerequisites
-
-### Git tooling
-
-- `git` and `git-lfs`
+For the [tutorials](./docs/tutorials.md) and [presentations](./docs/presentations.md), please refer to the [docs](./docs) folder.
 
-> NOTE: Without the `git-lfs` tool the optional bundled wouldn't be downloaded correctly.
+## Prerequisites
 
 ### Nextflow
 
+- `git`: The version control tool used for the pipeline.
 - `Java-11` or `Java-17` (preferred)
 
 **NOTE**: The `java` version should NOT be an `internal jdk` release! You can check the release via `java -version`
@@ -44,7 +42,7 @@ $ curl -s https://get.nextflow.io | bash
 $ chmod +x nextflow
 ```
 
-- Add `nextflow` to your `path` (perhaps `/usr/local/bin/`)
+- Add `nextflow` to your `path` (for example `/usr/local/bin/`)
 
 ```sh
 $ mv nextflow /usr/local/bin
@@ -64,52 +62,22 @@ $ nextflow info
 
 ```
 
-### Local Conda environments for XBS-nf
-
-> **NOTE**: The conda environments are expected by the `conda_local` profile to be created within `xbs-nf/conda_envs` directory
-
-- Clone the pipeline locally and `cd` into it
-
-```sh
-$ git clone https://github.com/TORCH-Consortium/xbs-nf
-
-$ cd xbs-nf
-
-```
-
-- `cd` in the `conda_envs` folder and execute the following commands
 
-```sh
-$ conda env create -p xbs-nf-env-1 --file xbs-nf-env-1.yml
-
-$ conda env create -p xbs-nf-env-2 --file xbs-nf-env-2.yml
-```
-
-> TIP: For faster installation process, please download [mamba](https://github.com/mamba-org/mamba) tool and replace `conda` with `mamba` in the above commands.
-
-### Run the pipeline
-
-- Customize the pipeline and process level settings in the [default_params](./default_params.config) file
+### Running MAGMA on different environments
 
-- From inside the `xbs-nf` folder, invoke the pipeline
-
-```sh
-$ nextflow run main.nf -profile conda
-```
-- use the ```-resume``` flag to continue from previously generated output files, rather than starting from scratch.
-
-```sh
-$ nextflow run main.nf -profile conda -resume
-```
+1. Local Conda environments for MAGMA
+2. Docker based execution for MAGMA
+3. HPC based execution for MAGMA
+4. Cloud batch (AWS/Google/Azure) based execution for MAGMA
 
-<!-- # Citation -->
+# Citation 
 
-<!-- TODO: Update this section and add a citation.cff file -->
+TODO: Update this section and add a citation.cff file 
 
 # Contributions
 
 Contributions are warmly accepted!
 
 # License
 
-Please refer the [LICENSE](./LICENSE) file.
+Please refer to the [GPL 3.0 LICENSE](./LICENSE) file.
diff --git a/bin/multiple_infection_filter.py b/bin/multiple_infection_filter.py
@@ -7,7 +7,7 @@
 import argparse
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Analyse resistance output from XBS Pipeline')
+    parser = argparse.ArgumentParser(description='Analyse resistance output from MAGMA pipeline')
     parser.add_argument('indir', metavar='indir', type=str, help='The directory containing the LoFreq TBProfiler output')
     parser.add_argument('relative_abundance_threshold', metavar='relative_abundance_threshold', type=float, help='Minimum relative abundance of the majority strain required to process the sample')
 

diff --git a/bin/reformat_lofreq.py b/bin/reformat_lofreq.py
@@ -32,7 +32,7 @@ def write_vcf(filename, df, header):
         df.to_csv(vcf, sep='\t', index=False)
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Analyse resistance output from XBS Pipeline')
+    parser = argparse.ArgumentParser(description='Analyse resistance output from the MAGMA pipeline')
     parser.add_argument('lofreq_vcf_file', metavar='lofreq_vcf_file', type=str, help='The input lofreq vcf file')
     parser.add_argument('lofreq_sample_name', metavar='lofreq_sample_name', type=str, help='The sample name')
     parser.add_argument('outfile', metavar='outfile', type=str, help='The name of the output VCF file')

diff --git a/bin/samplesheet_validation.py b/bin/samplesheet_validation.py
@@ -7,7 +7,7 @@
 
 from sys import exit
 
-parser = argparse.ArgumentParser(description='Run the XBS Pipeline')
+parser = argparse.ArgumentParser(description='Run the MAGMA pipeline samplesheet validation')
 parser.add_argument('input_file', metavar='input_file', type=str, help='The input sample file')
 args = vars(parser.parse_args())
 

diff --git a/bin/summarize_resistance.py b/bin/summarize_resistance.py
@@ -36,7 +36,7 @@ def add_var_to_df(df, pt_id, drug, var, freq):
             df.loc[pt_id, drug] += ' & {} ({:.0%})'.format(var_repr, freq)
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Analyse resistance output from XBS Pipeline')
+    parser = argparse.ArgumentParser(description='Analyse resistance output from magma Pipeline')
     parser.add_argument('major_res_var_dir', metavar='major_res_var_dir', type=str, help='The directory containing the major variants TBProfiler output files')
     parser.add_argument('minor_res_var_dir', metavar='minor_res_var_dir', type=str, help='The directory containing the minor variants TBProfiler output files')
     parser.add_argument('summary_output_dir', metavar='summary_output_dir', type=str, help='The directory where the resulting excel sheets should be placed')
@@ -51,7 +51,7 @@ def add_var_to_df(df, pt_id, drug, var, freq):
         keys = file_name.split('.')
         samples[keys[1]] = {}
         with open(os.path.join(os.path.join(args['major_res_var_dir'], 'results', file_name))) as json_file:
-            samples[keys[1]]['xbs'] = json.load(json_file)
+            samples[keys[1]]['magma'] = json.load(json_file)
 
     if os.path.exists(os.path.join(os.path.join(args['minor_res_var_dir'], 'results'))):
         for file_name in os.listdir(os.path.join(os.path.join(args['minor_res_var_dir'], 'results'))):
@@ -69,46 +69,46 @@ def add_var_to_df(df, pt_id, drug, var, freq):
     for patient, sample in tqdm(samples_df.iterrows(), total=samples_df.shape[0]):
         sample_res = samples[patient]
 
-        pt_df_xbs = pd.DataFrame(columns=['Drug', 'Variant', 'Interpretation', 'Source'] + ['Conclusion {}'.format(patient)] + list([patient])).set_index(['Drug', 'Variant'])
+        pt_df_magma = pd.DataFrame(columns=['Drug', 'Variant', 'Interpretation', 'Source'] + ['Conclusion {}'.format(patient)] + list([patient])).set_index(['Drug', 'Variant'])
         pt_df_lof = pd.DataFrame(columns=['Drug', 'Variant', 'Interpretation', 'Source'] + ['Conclusion {}'.format(patient)] + list([patient])).set_index(['Drug', 'Variant'])
 
         """
-        Add the DR variants from the XBS analysis to the xbs variant dataframe.
-        Do this after adding the lofreq dr variants to show the XBS variant frequencies.
+        Add the DR variants from the magma analysis to the magma variant dataframe.
+        Do this after adding the lofreq dr variants to show the magma variant frequencies.
         """
-        for var in sample_res['xbs']['dr_variants']:
+        for var in sample_res['magma']['dr_variants']:
             gene = var['gene']
             if gene == '.':
                 gene = var['locus_tag']
             var_repr = '{}_{}'.format(gene, var['change'])
             for drug in var['drugs']:
                 drug = drug['drug'].lower().replace(' ', '_')
-                pt_df_xbs.loc[(drug, var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 0, 'WHO Catalogue']
+                pt_df_magma.loc[(drug, var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 0, 'WHO Catalogue']
 
         """
-        Add the other variants from the XBS analysis to the xbs variant dataframe.
+        Add the other variants from the magma analysis to the magma variant dataframe.
         """
-        for var in sample_res['xbs']['other_variants']:
+        for var in sample_res['magma']['other_variants']:
             gene = var['gene']
             if gene == '.':
                 gene = var['locus_tag']
             var_repr = '{}_{}'.format(gene, var['change'])
             # Add all the other variants as unknown classification and overwrite their classification later if necessary
             for drug in var['gene_associated_drugs']:
                 drug = drug.lower().replace(' ', '_')
-                pt_df_xbs.loc[(drug, var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 1, 'Tier 1 or 2 gene']
+                pt_df_magma.loc[(drug, var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 1, 'Tier 1 or 2 gene']
             # Overwrite the variant classification for drugs which have a WHO sens classification last as to overrule all other classifications
             if 'annotation' in var:
                 for annotation in var['annotation']:
                     if annotation['type'] == 'resistance_association_confidence' and (int(annotation['confidence']) == 4 or int(annotation['confidence']) == 5):
-                        pt_df_xbs.loc[(annotation['drug'].lower().replace(' ', '_'), var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 2, 'WHO Catalogue']
+                        pt_df_magma.loc[(annotation['drug'].lower().replace(' ', '_'), var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 2, 'WHO Catalogue']
                     elif annotation['type'] == 'resistance_association_confidence' and (int(annotation['confidence']) == 3):
-                        pt_df_xbs.loc[(annotation['drug'].lower().replace(' ', '_'), var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 1, 'WHO Catalogue']
+                        pt_df_magma.loc[(annotation['drug'].lower().replace(' ', '_'), var_repr), (patient, 'Interpretation', 'Source')] = ['{:.0%}'.format(var['freq']), 1, 'WHO Catalogue']
                     else:
                         display(annotation)
 
         """
-        Add the DR variants from the lofreq analysis to the xbs variant dataframe.
+        Add the DR variants from the lofreq analysis to the magma variant dataframe.
         """
         if 'lofreq' in sample_res:
             for var in sample_res['lofreq']['dr_variants']:
@@ -143,10 +143,10 @@ def add_var_to_df(df, pt_id, drug, var, freq):
                         else:
                             display(annotation)
 
-        # Remove all variants in the XBS summary from the lofreq summary
-        pt_df_lof = pt_df_lof.drop([i for i in pt_df_xbs.index if i in pt_df_lof.index])
-        for drug in list(drugs - set([i[0] for i in pt_df_xbs.index.values])):
-            pt_df_xbs.loc[(drug, 'No variants found'), ('Interpretation')] = 2
+        # Remove all variants in the magma summary from the lofreq summary
+        pt_df_lof = pt_df_lof.drop([i for i in pt_df_magma.index if i in pt_df_lof.index])
+        for drug in list(drugs - set([i[0] for i in pt_df_magma.index.values])):
+            pt_df_magma.loc[(drug, 'No variants found'), ('Interpretation')] = 2
         for drug in list(drugs - set([i[0] for i in pt_df_lof.index.values])):
             pt_df_lof.loc[(drug, 'No variants found'), ('Interpretation')] = 2
 
@@ -155,16 +155,16 @@ def add_var_to_df(df, pt_id, drug, var, freq):
         Calculate the conclusion for all samples
         """
         for _, sample in samples_df.loc[[patient]].iterrows():
-            work_df = pt_df_xbs[pd.notna(pt_df_xbs[patient])]
-            for drug in pt_df_xbs.index.levels[0]:
+            work_df = pt_df_magma[pd.notna(pt_df_magma[patient])]
+            for drug in pt_df_magma.index.levels[0]:
                 if drug not in set([i[0] for i in work_df.index.values]):
-                    pt_df_xbs.loc[drug, 'Conclusion {}'.format(patient)] = 2
+                    pt_df_magma.loc[drug, 'Conclusion {}'.format(patient)] = 2
                 elif 0 in work_df.loc[drug, 'Interpretation'].value_counts():
-                    pt_df_xbs.loc[drug, 'Conclusion {}'.format(patient)] = 0
+                    pt_df_magma.loc[drug, 'Conclusion {}'.format(patient)] = 0
                 elif 1 in work_df.loc[drug, 'Interpretation'].value_counts():
-                    pt_df_xbs.loc[drug, 'Conclusion {}'.format(patient)] = 1
+                    pt_df_magma.loc[drug, 'Conclusion {}'.format(patient)] = 1
                 else:
-                    pt_df_xbs.loc[drug, 'Conclusion {}'.format(patient)] = 2
+                    pt_df_magma.loc[drug, 'Conclusion {}'.format(patient)] = 2
         for _, sample in samples_df.loc[[patient]].iterrows():
             work_df = pt_df_lof[pd.notna(pt_df_lof[patient])]
             for drug in pt_df_lof.index.levels[0]:
@@ -178,17 +178,17 @@ def add_var_to_df(df, pt_id, drug, var, freq):
                     pt_df_lof.loc[drug, 'Conclusion {}'.format(patient)] = 2
 
         x2 = pt_df_lof.copy(deep=True)
-        pt_df_xbs = pt_df_xbs.reset_index().sort_values(['Conclusion {}'.format(patient)] + ['Drug', 'Interpretation', 'Variant'])
-        for column in [i for i in pt_df_xbs if 'Conclusion' in i or 'Interpretation' == i]:
-            pt_df_xbs[column] = pt_df_xbs[column].apply(lambda c: class_map[c])
+        pt_df_magma = pt_df_magma.reset_index().sort_values(['Conclusion {}'.format(patient)] + ['Drug', 'Interpretation', 'Variant'])
+        for column in [i for i in pt_df_magma if 'Conclusion' in i or 'Interpretation' == i]:
+            pt_df_magma[column] = pt_df_magma[column].apply(lambda c: class_map[c])
         pt_df_lof = pt_df_lof.reset_index().sort_values(['Conclusion {}'.format(patient)] + ['Drug', 'Interpretation', 'Variant'])
         for column in [i for i in pt_df_lof if 'Conclusion' in i or 'Interpretation' == i]:
             pt_df_lof[column] = pt_df_lof[column].apply(lambda c: class_map[c])
 
         """
         Write both sheets to excel with formatting"""
         with pd.ExcelWriter(os.path.join(summary_dir, '{}.xlsx'.format(patient)), engine='xlsxwriter') as writer:
-            pt_df_xbs.set_index(['Drug'] + ['Conclusion {}'.format(patient)] + ['Variant']).to_excel(writer, sheet_name=major_variants_sheet_name)
+            pt_df_magma.set_index(['Drug'] + ['Conclusion {}'.format(patient)] + ['Variant']).to_excel(writer, sheet_name=major_variants_sheet_name)
             pt_df_lof.set_index(['Drug'] + ['Conclusion {}'.format(patient)] + ['Variant']).to_excel(writer, sheet_name=minor_variants_sheet_name)
             #unclassified.reset_index().sort_values(by=['Conclusion', 'Drug', 'Interpretation', 'Variant']).set_index(['Drug', 'Conclusion', 'Variant']).to_excel(writer, sheet_name='Unclassified Variants')
 
@@ -200,10 +200,10 @@ def add_var_to_df(df, pt_id, drug, var, freq):
 
             # Add formatting to Variants sheet
             for i in range(samples_df.loc[[patient]].shape[0]):
-                writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[1+i], alphabet[1+i], pt_df_xbs.shape[0]+1),  cond_res)
-                writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[1+i], alphabet[1+i], pt_df_xbs.shape[0]+1),  cond_sens)
-            writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[2+samples_df.loc[[patient]].shape[0]], alphabet[2+samples_df.loc[[patient]].shape[0]], pt_df_xbs.shape[0]+1),  cond_res)
-            writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[2+samples_df.loc[[patient]].shape[0]], alphabet[2+samples_df.loc[[patient]].shape[0]], pt_df_xbs.shape[0]+1),  cond_sens)
+                writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[1+i], alphabet[1+i], pt_df_magma.shape[0]+1),  cond_res)
+                writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[1+i], alphabet[1+i], pt_df_magma.shape[0]+1),  cond_sens)
+            writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[2+samples_df.loc[[patient]].shape[0]], alphabet[2+samples_df.loc[[patient]].shape[0]], pt_df_magma.shape[0]+1),  cond_res)
+            writer.sheets[major_variants_sheet_name].conditional_format('{}1:{}{}'.format(alphabet[2+samples_df.loc[[patient]].shape[0]], alphabet[2+samples_df.loc[[patient]].shape[0]], pt_df_magma.shape[0]+1),  cond_sens)
 
             # Add formatting to Lofreq variants sheet
             for i in range(samples_df.loc[[patient]].shape[0]):

diff --git a/conda_envs/xbs-nf-env-1.yml → conda_envs/magma-env-1.yml b/conda_envs/xbs-nf-env-1.yml → conda_envs/magma-env-1.yml
@@ -1,4 +1,4 @@
-name: xbs-nf-env-1
+name: magma-env-1
 channels:
   - conda-forge
   - bioconda

diff --git a/conda_envs/xbs-nf-env-2.yml → conda_envs/magma-env-2.yml b/conda_envs/xbs-nf-env-2.yml → conda_envs/magma-env-2.yml
@@ -1,4 +1,4 @@
-name: xbs-nf-env-2
+name: magma-env-2
 channels:
   - conda-forge
   - bioconda

diff --git a/conda_envs/setup_conda_envs.sh b/conda_envs/setup_conda_envs.sh
@@ -5,15 +5,15 @@ set -e
 # NOTE: Please replace `conda` with `mamba` if it is installed for faster installs.
 resolverCondaBinary="conda" # pick either conda OR mamba
 
-# NOTE: By default, the conda environments are expected by the `conda_local` profile to be created within `xbs-nf/conda_envs` directory
+# NOTE: By default, the `conda_local` profile expects the conda environments to be created within the `magma/conda_envs` directory
 
-$resolverCondaBinary env create -p xbs-nf-env-1 --file xbs-nf-env-1.yml 
+$resolverCondaBinary env create -p magma-env-1 --file magma-env-1.yml 
 
-$resolverCondaBinary env create -p xbs-nf-env-2 --file xbs-nf-env-2.yml
+$resolverCondaBinary env create -p magma-env-2 --file magma-env-2.yml
 
-echo "INFO: Activate conda env with tb-profiler and setup the WHO database within the xbs-nf-env-1"
+echo "INFO: Activate conda env with tb-profiler and setup the WHO database within the magma-env-1"
 eval "$(conda shell.bash hook)"
-conda activate "./xbs-nf-env-1"
+conda activate "./magma-env-1"
 
 echo "INFO: Make a local copy and cd inside it"
 cp -r ../resources/resistance_db_who ./
@@ -26,5 +26,5 @@ echo "INFO: Remove the local copy of the database folder"
 cd ..
 rm -rf resistance_db_who
 
-echo "INFO: Deactivate the xbs-nf-env-1 env"
+echo "INFO: Deactivate the magma-env-1 env"
 conda deactivate