From fb2659d1a2ddf722a40ec160f9198224c0a03455 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 22 Mar 2023 23:37:44 +0000 Subject: [PATCH 1/7] docs: sync with development repo version --- .../GL-DPPD-7101-F.md | 170 ++++++++++-------- 1 file changed, 100 insertions(+), 70 deletions(-) diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md index b6476054..7ac8487b 100644 --- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md +++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md @@ -37,6 +37,12 @@ The DESeq2 Normalization and DGE step, [step 9](#9-normalize-read-counts-perform - Fixed edge case where `contrasts.csv` and `ERCCnorm_contrasts.csv` table header and rows could become out of sync with each other in [step 9c](#9c-configure-metadata-sample-grouping-and-group-comparisons) and [step 9e](#9e-perform-dge-on-datasets-with-ercc-spike-in) by generating rows from header rather than generating both separately. +- Updated R version from 4.1.2 to 4.1.3. + +- Fixed edge case where certain scripts would crash if sample names were prefixes of other sample names. This had affected [step 4c](#4c-tablulate-star-counts-in-r), [step 8c](#8c-calculate-total-number-of-genes-expressed-per-sample-in-r), and [step 9d](#9d-import-rsem-genecounts) + +- Fixed rare edge case where groupwise mean and standard deviations could become misassociated to incorrect groups. This had affected [step 9f](#9f-prepare-genelab-dge-tables-with-annotations-on-datasets-with-ercc-spike-in) and [step 9i](#9i-prepare-genelab-dge-tables-with-annotations-on-datasets-without-ercc-spike-in) + --- # Table of contents @@ -110,20 +116,20 @@ The DESeq2 Normalization and DGE step, [step 9](#9-normalize-read-counts-perform |geneBody_coverage|4.0.0|[http://rseqc.sourceforge.net/#genebody-coverage-py](http://rseqc.sourceforge.net/#genebody-coverage-py)| |inner_distance|4.0.0|[http://rseqc.sourceforge.net/#inner-distance-py](http://rseqc.sourceforge.net/#inner-distance-py)| |read_distribution|4.0.0|[http://rseqc.sourceforge.net/#read-distribution-py](http://rseqc.sourceforge.net/#read-distribution-py)| -|R|4.1.2|[https://www.r-project.org/](https://www.r-project.org/)| +|R|4.1.3|[https://www.r-project.org/](https://www.r-project.org/)| |Bioconductor|3.14.0|[https://bioconductor.org](https://bioconductor.org)| |DESeq2|1.34|[https://bioconductor.org/packages/release/bioc/html/DESeq2.html](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)| -|tximport|1.22|[https://bioconductor.org/packages/release/bioc/html/tximport.html](https://bioconductor.org/packages/release/bioc/html/tximport.html)| +|tximport|1.27.1|[https://github.com/mikelove/tximport](https://github.com/mikelove/tximport)| |tidyverse|1.3.1|[https://www.tidyverse.org](https://www.tidyverse.org)| -|dp_tools|1.1.5|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)| -|singularity|3.9|[https://sylabs.io/](https://sylabs.io/)| |stringr|1.4.1|[https://github.com/tidyverse/stringr](https://github.com/tidyverse/stringr)| +|dp_tools|1.1.8|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)| |pandas|1.5.0|[https://github.com/pandas-dev/pandas](https://github.com/pandas-dev/pandas)| |seaborn|0.12.0|[https://seaborn.pydata.org/](https://seaborn.pydata.org/)| |matplotlib|3.6.0|[https://matplotlib.org/stable](https://matplotlib.org/stable)| |jupyter notebook|6.4.12|[https://jupyter-notebook.readthedocs.io/](https://jupyter-notebook.readthedocs.io/)| 
|numpy|1.23.3|[https://numpy.org/](https://numpy.org/)| |scipy|1.9.1|[https://scipy.org/](https://scipy.org/)| +|singularity|3.9|[https://sylabs.io/](https://sylabs.io/)| --- @@ -455,7 +461,7 @@ study <- read.csv(Sys.glob(file.path(work_dir,"samples.txt")), header = FALSE, r ff <- list.files(file.path(align_dir), pattern = "ReadsPerGene.out.tab", recursive=TRUE, full.names = TRUE) ## Reorder the *genes.results files to match the ordering of the ISA samples -ff <- ff[sapply(rownames(study), function(x)grep(paste0(x,'_ReadsPerGene.out.tab$'), ff, value=FALSE))] +ff <- ff[sapply(rownames(study), function(x)grep(paste0(align_dir, '/', x,'_ReadsPerGene.out.tab$'), ff, value=FALSE))] # Remove the first 4 lines counts.files <- lapply( ff, read.table, skip = 4 ) @@ -942,7 +948,7 @@ samples <- read.csv(Sys.glob(file.path(work_dir,"samples.txt")), header = FALSE, files <- list.files(file.path(counts_dir),pattern = ".genes.results", full.names = TRUE) ### reorder the genes.results files to match the ordering of the samples in the metadata file -files <- files[sapply(rownames(samples), function(x)grep(paste0(x,'.genes.results$'), files, value=FALSE))] +files <- files[sapply(rownames(samples), function(x)grep(paste0(counts_dir, '/', x,'.genes.results$'), files, value=FALSE))] names(files) <- rownames(samples) txi.rsem <- tximport(files, type = "rsem", txIn = FALSE, txOut = FALSE) @@ -981,7 +987,7 @@ sessionInfo() ### 9a. Create Sample RunSheet -> Note: Rather than running the command below to create the runsheet needed for processing, the runsheet may also be created manually by following the [file specification](../Workflow_Documentation/NF_RCP-F/examples/README.md). +> Note: Rather than running the command below to create the runsheet needed for processing, the runsheet may also be created manually by following the [file specification](../Workflow_Documentation/NF_RCP-F/examples/runsheet/README.md). ```bash ### Download the *ISA.zip file from the GeneLab Repository ### @@ -1137,7 +1143,7 @@ files <- list.files(file.path(counts_dir),pattern = ".genes.results", full.names ### Reorder the *genes.results files to match the ordering of the ISA samples ### -files <- files[sapply(rownames(study), function(x)grep(paste0(x,".genes.results$"), files, value=FALSE))] +files <- files[sapply(rownames(study), function(x)grep(paste0(counts_dir, '/', x,".genes.results$"), files, value=FALSE))] names(files) <- rownames(study) txi.rsem <- tximport(files, type = "rsem", txIn = FALSE, txOut = FALSE) @@ -1386,19 +1392,18 @@ reduced_output_table_1$LRT.p.value <- res_1_lrt@listData$padj ### Generate and add group mean and stdev columns to the (non-ERCC) DGE table ### tcounts <- as.data.frame(t(normCounts)) -tcounts$group <- group -group_means <- as.data.frame(t(aggregate(. ~ group,data = tcounts,mean))) -group_means <- group_means[-c(1),] -colnames(group_means) <- paste0("Group.Mean_",levels(factor(names(group)))) -group_stdev <- as.data.frame(t(aggregate(. ~ group,data = tcounts,sd))) -group_stdev <- group_stdev[-c(1),] -colnames(group_stdev) <- paste0("Group.Stdev_",levels(factor(names(group)))) +tcounts$group <- names(group) # Used final table group name formatting (e.g. '( Space Flight & Blue Light )' ) + +group_means <- as.data.frame(t(aggregate(. 
~ group,data = tcounts,mean))) # Compute group name group-wise means +colnames(group_means) <- paste0("Group.Mean_", group_means['group',]) # assign group name as column names -output_table_1 <- cbind(output_table_1,group_means) -reduced_output_table_1 <- cbind(reduced_output_table_1,group_means) +group_stdev <- as.data.frame(t(aggregate(. ~ group,data = tcounts,sd))) # Compute group name group-wise standard deviation +colnames(group_stdev) <- paste0("Group.Stdev_", group_stdev['group',]) # assign group name as column names -output_table_1 <- cbind(output_table_1,group_stdev) -reduced_output_table_1 <- cbind(reduced_output_table_1,group_stdev) +group_means <- group_means[-c(1),] # Drop group name row from data rows (now present as column names) +group_stdev <- group_stdev[-c(1),] # Drop group name row from data rows (now present as column names) +output_table_1 <- cbind(output_table_1,group_means, group_stdev) # Column bind the group-wise data +reduced_output_table_1 <- cbind(reduced_output_table_1,group_means, group_stdev) # Column bind the group-wise data rm(group_stdev,group_means,tcounts) @@ -1515,19 +1520,18 @@ reduced_output_table_2$LRT.p.value <- res_2_lrt@listData$padj ### Generate and add group mean and stdev columns to the ERCC-normalized DGE table ### tcounts <- as.data.frame(t(ERCCnormCounts)) -tcounts$group_sub <- group_sub -group_means <- as.data.frame(t(aggregate(. ~ group_sub,data = tcounts,mean))) -group_means <- group_means[-c(1),] -colnames(group_means) <- paste0("Group.Mean_",levels(factor(names(group_sub)))) -group_stdev <- as.data.frame(t(aggregate(. ~ group_sub,data = tcounts,sd))) -group_stdev <- group_stdev[-c(1),] -colnames(group_stdev) <- paste0("Group.Stdev_",levels(factor(names(group_sub)))) +tcounts$group_sub <- names(group_sub) # Used final table group name formatting (e.g. '( Space Flight & Blue Light )' ) -output_table_2 <- cbind(output_table_2,group_means) -reduced_output_table_2 <- cbind(reduced_output_table_2,group_means) +group_means <- as.data.frame(t(aggregate(. ~ group_sub,data = tcounts,mean))) # Compute group name group-wise means +colnames(group_means) <- paste0("Group.Mean_", group_means['group_sub',]) # assign group name as column names -output_table_2 <- cbind(output_table_2,group_stdev) -reduced_output_table_2 <- cbind(reduced_output_table_2,group_stdev) +group_stdev <- as.data.frame(t(aggregate(. ~ group_sub,data = tcounts,sd))) # Compute group name group-wise standard deviation +colnames(group_stdev) <- paste0("Group.Stdev_", group_stdev['group_sub',]) # assign group name as column names + +group_means <- group_means[-c(1),] # Drop group name row from data rows (now present as column names) +group_stdev <- group_stdev[-c(1),] # Drop group name row from data rows (now present as column names) +output_table_2 <- cbind(output_table_2,group_means, group_stdev) # Column bind the group-wise data +reduced_output_table_2 <- cbind(reduced_output_table_2,group_means, group_stdev) # Column bind the group-wise data rm(group_stdev,group_means,tcounts) @@ -1734,19 +1738,18 @@ reduced_output_table_1$LRT.p.value <- res_1_lrt@listData$padj ### Generate and add group mean and stdev columns to the DGE table ### tcounts <- as.data.frame(t(normCounts)) -tcounts$group <- group -group_means <- as.data.frame(t(aggregate(. ~ group,data = tcounts,mean))) -group_means <- group_means[-c(1),] -colnames(group_means) <- paste0("Group.Mean_",levels(factor(names(group)))) -group_stdev <- as.data.frame(t(aggregate(. 
~ group,data = tcounts,sd))) -group_stdev <- group_stdev[-c(1),] -colnames(group_stdev) <- paste0("Group.Stdev_",levels(factor(names(group)))) +tcounts$group <- names(group) # Used final table group name formatting (e.g. '( Space Flight & Blue Light )' ) + +group_means <- as.data.frame(t(aggregate(. ~ group,data = tcounts,mean))) # Compute group name group-wise means +colnames(group_means) <- paste0("Group.Mean_", group_means['group',]) # assign group name as column names -output_table_1 <- cbind(output_table_1,group_means) -reduced_output_table_1 <- cbind(reduced_output_table_1,group_means) +group_stdev <- as.data.frame(t(aggregate(. ~ group,data = tcounts,sd))) # Compute group name group-wise standard deviation +colnames(group_stdev) <- paste0("Group.Stdev_", group_stdev['group',]) # assign group name as column names -output_table_1 <- cbind(output_table_1,group_stdev) -reduced_output_table_1 <- cbind(reduced_output_table_1,group_stdev) +group_means <- group_means[-c(1),] # Drop group name row from data rows (now present as column names) +group_stdev <- group_stdev[-c(1),] # Drop group name row from data rows (now present as column names) +output_table_1 <- cbind(output_table_1,group_means, group_stdev) # Column bind the group-wise data +reduced_output_table_1 <- cbind(reduced_output_table_1,group_means, group_stdev) # Column bind the group-wise data rm(group_stdev,group_means,tcounts) @@ -1933,7 +1936,8 @@ import matplotlib.pyplot as plt accession = 'GLDS-NNN' # Replace Ns with GLDS number isaPath = 'path/to/GLDS-NNN-ISA.zip' # Replace with path to ISA archive file zip_file_object = zipfile.ZipFile(isaPath, "r") -list_of_ISA_files = zip_file_object.namelist() +list_of_ISA_files = zip_file_object.namelist() +UnnormalizedCountsPath = '/path/to/GLDS-NNN_rna_seq_RSEM_Unnormalized_Counts.csv' # Print contents of ISA zip file to view file order list_of_ISA_files @@ -1970,7 +1974,7 @@ assay_table.head(n=3) # Get raw counts table -raw_counts_table = pd.read_csv('/path/to/RSEM_Unnormalized_Counts.csv', index_col=0) +raw_counts_table = pd.read_csv(UnnormalizedCountsPath, index_col=0) raw_counts_table.index.rename('Gene_ID', inplace=True) print(raw_counts_table.head(n=3)) @@ -2048,8 +2052,8 @@ print(merged_ercc) ### Filter and calculate mean counts per million of Mix1 and Mix2 spiked samples in each of the 4 groups ## Check the 'Parameter Value[Spike-in Mix Number]' column of the assay table -## If the values do not have a space, change 'Mix 1' and 'Mix 2' to 'Mix1' and 'Mix2', respectively +## If the values do not have a space, change 'Mix 1' and 'Mix 2' to 'Mix1' and 'Mix2', respectively # Filter Mix1 CPM and Mix2 CPM in group A Adf = merged_ercc.loc[merged_ercc['ERCC group'] == 'A'] @@ -2189,45 +2193,34 @@ print('Number of ERCC detected in group D (out of 23) =', ddf['Avg Mix1 CPM/ Avg #Calculate and plot ERCC metrics from individual samples, including limit of detection, dynamic range, and R^2 of counts vs. concentration. #View the ERCC table -print(ercc_table.head(n=3)) - -# Check assay_table header to identify the 'Sample Name' column and the column title indicating the 'Spike-in Mix Nmber' if it's indicated in the metadata. 
- -pd.set_option('display.max_columns', None) -print(assay_table.head(n=3)) - -# Get ERCC counts for all samples - -ercc_counts = raw_counts_table[raw_counts_table.index.str.contains('^ERCC-')] -ercc_counts = ercc_counts.sort_values(by=list(ercc_counts), ascending=False) -print(ercc_counts.head()) +print(ercc_table.head(n=3)) # Make a dictionary for ERCC concentrations for Mix 1 + mix1_conc_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['concentration in Mix 1 (attomoles/ul)'])) -# Get samples spiked with Mix 1 -mix1_samples = assay_table[assay_table['Parameter Value[Spike-in Mix Number]'] == 'Mix 1']['Sample Name'] -# Get ERCC counts for Mix 1 spiked samples -ercc_counts_mix_1 = ercc_counts[mix1_samples] -ercc_counts_mix_1['ERCC conc (attomoles/ul)'] = ercc_counts_mix_1.index.map(mix1_conc_dict) -print(ercc_counts_mix_1.head(n=3)) +# Get samples spiked with Mix 1 +mix1_samples = assay_table[assay_table['Parameter Value[Spike-in Mix Number]'] == 'Mix 1']['Sample Name'] # Make a dictionary for ERCC concentrations for Mix 2 + mix2_conc_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['concentration in Mix 2 (attomoles/ul)'])) + # Get samples spiked with Mix 2 + mix2_samples = assay_table[assay_table['Parameter Value[Spike-in Mix Number]'] == 'Mix 2']['Sample Name'] + # Get ERCC counts for Mix 2 spiked samples ercc_counts_mix_2 = ercc_counts[mix2_samples] ercc_counts_mix_2['ERCC conc (attomoles/ul)'] = ercc_counts_mix_2.index.map(mix2_conc_dict) print(ercc_counts_mix_2.head(n=3)) - # Create a scatter plot of log(2) ERCC counts versus log(2) ERCC concentration for each sample columns_mix_1 = ercc_counts_mix_1.columns.drop(['ERCC conc (attomoles/ul)']) @@ -2261,6 +2254,40 @@ for ax in axs.flat: counter = counter + 1 + +# Create a scatter plot of log(2) ERCC counts versus log(2) ERCC concentration for each sample + +columns_mix_1 = ercc_counts_mix_1.columns.drop(['ERCC conc (attomoles/ul)']) +columns_mix_2 = ercc_counts_mix_2.columns.drop(['ERCC conc (attomoles/ul)']) +all_columns = columns_mix_1.to_list() + columns_mix_2.to_list() +total_columns = len(columns_mix_1) + len(columns_mix_2) +side_size = np.int32(np.ceil(np.sqrt(total_columns)))# calculate grid side size. take sqrt of total plots and round up. +fig, axs = plt.subplots(side_size, side_size, figsize=(15,15), sharex='all', sharey='all'); #change figsize x,y labels if needed. 
+fig.tight_layout(pad=1, w_pad=2.5, h_pad=3.5) + +counter = 0 +for ax in axs.flat: + + if(counter < len(columns_mix_1)): + ax.scatter(x=np.log2(ercc_counts_mix_1['ERCC conc (attomoles/ul)']), y=np.log2(ercc_counts_mix_1[all_columns[counter]]+1), s=7); + ax.set_title(all_columns[counter][-45:], fontsize=9); + ax.set_xlabel('log2 ERCC conc (attomoles/ ul)', fontsize=9); + ax.set_ylabel('log2 Counts per million', fontsize=9); + ax.tick_params(direction='in', axis='both', labelsize=8, labelleft=True, labelbottom=True); + + elif(counter >= len(columns_mix_1) and counter < total_columns): + ax.scatter(x=np.log2(ercc_counts_mix_2['ERCC conc (attomoles/ul)']), y=np.log2(ercc_counts_mix_2[all_columns[counter]]+1), s=7); + ax.set_title(all_columns[counter][-45:], fontsize=9); + ax.set_xlabel('log2 ERCC conc (attomoles/ ul)', fontsize=9); + ax.set_ylabel('log2 Counts per million', fontsize=9); + ax.tick_params(direction='in', axis='both', labelsize=8, labelleft=True, labelbottom=True); + + else: + pass + + counter = counter + 1 + + ## Calculate and plot linear regression of log(2) ERCC counts versus log(2) ERCC concentration for each sample # Filter counts > 0 @@ -2288,7 +2315,8 @@ for i in range(0, len(ercc_counts_mix_2.columns)-1): nonzero_counts_list_2.append(nonzero_counts_sorted) -# Plot each sample using linear regression of scatter plot with x = log2 Conc and y = log2 Counts. Return min, max, R^2 and dynamic range (max / min) values. +# Plot each sample using linear regression of scatter plot with x = log2 Conc and y = log2 Counts. +# Return min, max, R^2 and dynamic range (max / min) values. samples = [] mins = [] @@ -2435,15 +2463,17 @@ os.makedirs(name="ERCC_analysis", exist_ok=True) stats = pd.DataFrame(list(zip(samples, mins, maxs, dyranges, rs))) stats.columns = ['Samples', 'Min', 'Max', 'Dynamic range', 'R'] -stats.to_csv('ERCC_analysis/ERCC_stats_GLDS-NNN.csv', index = False) -stats.filter(items = ['Samples', 'Dynamic range']).to_csv('ERCC_analysis/ERCC_dynrange_GLDS-NNN_mqc.csv', index = False) -stats.filter(items = ['Samples', 'R']).to_csv('ERCC_analysis/ERCC_rsq_GLDS-NNN_mqc.csv', index = False) +stats.to_csv('ERCC_analysis/ERCC_stats_GLDS-NNN.csv', index = False) +stats.filter(items = ['Samples', 'Dynamic range']).to_csv('ERCC_analysis/ERCC_dynrange_GLDS-NNN_mqc.csv', index = False) +stats.filter(items = ['Samples', 'R']).to_csv('ERCC_analysis/ERCC_rsq_GLDS-NNN_mqc.csv', index = False) ### Generate data and metadata files needed for ERCC DESeq2 analysis -# ERCC Mix 1 and Mix 2 are distributed so that half the samples receive Mix 1 spike-in and half receive Mix 2 spike-in. Transcripts in Mix 1 and Mix 2 are present at a known ratio, so we can determine how well these patterns are revealed in the dataset. +# ERCC Mix 1 and Mix 2 are distributed so that half the samples receive Mix 1 spike-in and half receive Mix 2 spike-in. + +# Transcripts in Mix 1 and Mix 2 are present at a known ratio, so we can determine how well these patterns are revealed in the dataset. 
# Get sample table @@ -2453,7 +2483,7 @@ pd.set_option('display.max_columns', None) print(combined) # Create metadata table containing samples and their respective ERCC spike-in Mix number -# Sometimes Number in [Spike-in Mix Number] is spelled 'number' and this could cause error in mismatch search +# Note: Sometimes "Number" in [Spike-in Mix Number] is spelled "number" and this could cause error in mismatch search ERCCmetadata = combined[['Parameter Value[Spike-in Mix Number]']] ERCCmetadata.index = ERCCmetadata.index.str.replace('-','_') @@ -2647,4 +2677,4 @@ ax.set_yscale("log"); - ERCC_analysis/ERCC_lodr_*.csv (ERCC Gene Table including mean counts, adjusted p-value and p-value, and filtered to genes with both adj. p-value and p-value < 0.001) -> All steps of the ERCC Spike-In Data Analysis are performed in a Jupyter Notebook (JN) and the completed JN is exported as an html file and published in the [GLDS repository](https://genelab-data.ndc.nasa.gov/genelab/projects) for the respective dataset. +> All steps of the ERCC Spike-In Data Analysis are performed in a Jupyter Notebook (JN) and the completed JN is exported as an html file and published in the [GLDS repository](https://genelab-data.ndc.nasa.gov/genelab/projects) for the respective dataset. \ No newline at end of file From 0bf7e876a53378a2dd5c6e4a6729d02856ee8c39 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 22 Mar 2023 23:38:43 +0000 Subject: [PATCH 2/7] docs: remove duplicated chunk Confirmed this was not duplicated in the associated notebook --- .../GL-DPPD-7101-F.md | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md index 7ac8487b..196a42e7 100644 --- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md +++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md @@ -2253,41 +2253,6 @@ for ax in axs.flat: counter = counter + 1 - - -# Create a scatter plot of log(2) ERCC counts versus log(2) ERCC concentration for each sample - -columns_mix_1 = ercc_counts_mix_1.columns.drop(['ERCC conc (attomoles/ul)']) -columns_mix_2 = ercc_counts_mix_2.columns.drop(['ERCC conc (attomoles/ul)']) -all_columns = columns_mix_1.to_list() + columns_mix_2.to_list() -total_columns = len(columns_mix_1) + len(columns_mix_2) -side_size = np.int32(np.ceil(np.sqrt(total_columns)))# calculate grid side size. take sqrt of total plots and round up. -fig, axs = plt.subplots(side_size, side_size, figsize=(15,15), sharex='all', sharey='all'); #change figsize x,y labels if needed. 
-fig.tight_layout(pad=1, w_pad=2.5, h_pad=3.5) - -counter = 0 -for ax in axs.flat: - - if(counter < len(columns_mix_1)): - ax.scatter(x=np.log2(ercc_counts_mix_1['ERCC conc (attomoles/ul)']), y=np.log2(ercc_counts_mix_1[all_columns[counter]]+1), s=7); - ax.set_title(all_columns[counter][-45:], fontsize=9); - ax.set_xlabel('log2 ERCC conc (attomoles/ ul)', fontsize=9); - ax.set_ylabel('log2 Counts per million', fontsize=9); - ax.tick_params(direction='in', axis='both', labelsize=8, labelleft=True, labelbottom=True); - - elif(counter >= len(columns_mix_1) and counter < total_columns): - ax.scatter(x=np.log2(ercc_counts_mix_2['ERCC conc (attomoles/ul)']), y=np.log2(ercc_counts_mix_2[all_columns[counter]]+1), s=7); - ax.set_title(all_columns[counter][-45:], fontsize=9); - ax.set_xlabel('log2 ERCC conc (attomoles/ ul)', fontsize=9); - ax.set_ylabel('log2 Counts per million', fontsize=9); - ax.tick_params(direction='in', axis='both', labelsize=8, labelleft=True, labelbottom=True); - - else: - pass - - counter = counter + 1 - - ## Calculate and plot linear regression of log(2) ERCC counts versus log(2) ERCC concentration for each sample # Filter counts > 0 From 2d07f0f459c735f2b328e64adcc909eb0fa9cb7d Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 22 Mar 2023 23:40:47 +0000 Subject: [PATCH 3/7] docs: remove params$ Inadvertently included from notebook version which uses a params object instead of directly supplying by user (as in DPPD) --- RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md index 196a42e7..466042d7 100644 --- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md +++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md @@ -1097,7 +1097,7 @@ compare_csv_from_runsheet <- function(runsheet_path) { ### Load metadata from runsheet csv file ### -compare_csv <- compare_csv_from_runsheet(params$runsheet_path) +compare_csv <- compare_csv_from_runsheet(runsheet_path) ### Create data frame containing all samples and respective factors ### From 4316f4da52fe1a5cd18fac7a82d79f64e7a3f86c Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 22 Mar 2023 16:52:28 -0700 Subject: [PATCH 4/7] docs: remove duplicate software lines --- RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md index e03a0dfe..360c6f25 100644 --- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md +++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md @@ -121,8 +121,6 @@ The DESeq2 Normalization and DGE step, [step 9](#9-normalize-read-counts-perform |DESeq2|1.34|[https://bioconductor.org/packages/release/bioc/html/DESeq2.html](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)| |tximport|1.27.1|[https://github.com/mikelove/tximport](https://github.com/mikelove/tximport)| |tidyverse|1.3.1|[https://www.tidyverse.org](https://www.tidyverse.org)| -|dp_tools|1.1.5|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)| -|singularity|3.9|[https://sylabs.io/](https://sylabs.io/)| |stringr|1.4.1|[https://github.com/tidyverse/stringr](https://github.com/tidyverse/stringr)| |dp_tools|1.1.8|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)| 
|pandas|1.5.0|[https://github.com/pandas-dev/pandas](https://github.com/pandas-dev/pandas)| @@ -2644,4 +2642,4 @@ ax.set_yscale("log"); - ERCC_analysis/ERCC_lodr_*.csv (ERCC Gene Table including mean counts, adjusted p-value and p-value, and filtered to genes with both adj. p-value and p-value < 0.001) -> All steps of the ERCC Spike-In Data Analysis are performed in a Jupyter Notebook (JN) and the completed JN is exported as an html file and published in the [GLDS repository](https://genelab-data.ndc.nasa.gov/genelab/projects) for the respective dataset. \ No newline at end of file +> All steps of the ERCC Spike-In Data Analysis are performed in a Jupyter Notebook (JN) and the completed JN is exported as an html file and published in the [GLDS repository](https://genelab-data.ndc.nasa.gov/genelab/projects) for the respective dataset. From 6216c243fec8579429d33668513871b8a63f8593 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Thu, 23 Mar 2023 18:41:33 +0000 Subject: [PATCH 5/7] docs: update notebook related code using 1.0.3 notebook Converted using: ``` bash INPUT_IPYNB=$1 jupyter-nbconvert $INPUT_IPYNB --to python --stdout | # Convert using nbconvert sed '/# In\[ \]:/d' | # remove notebook artifacts sed '/# coding: utf-8/d' | # remove notebook artifacts sed '/# $/d' | # remove notebook artifacts sed '/#!\/usr\/bin\/env python/d' | # remove notebook artifacts sed 's/# #/#/' | # remove notebook artifact where markdown gets an extra '# ' added on cat -s > DPPD_OUT.txt # # Remove empty lines that are consecutive and output ``` --- .../GL-DPPD-7101-F.md | 139 ++++++------------ 1 file changed, 49 insertions(+), 90 deletions(-) diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md index 360c6f25..c5c190a3 100644 --- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md +++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md @@ -1915,8 +1915,6 @@ Output data with considering ERCC spike-in genes: ### 10a. Evaluate ERCC Count Data in Python ```python -### Setting up the notebook - # import python packages import pandas as pd pd.set_option('mode.chained_assignment', None) # suppress chained indexing warnings @@ -1928,51 +1926,40 @@ import seaborn as sns from scipy.stats import linregress import matplotlib.pyplot as plt - ### Get and parse data and metadata # Get and unzip ISA.zip to extract metadata. accession = 'GLDS-NNN' # Replace Ns with GLDS number -isaPath = 'path/to/GLDS-NNN-ISA.zip' # Replace with path to ISA archive file -zip_file_object = zipfile.ZipFile(isaPath, "r") -list_of_ISA_files = zip_file_object.namelist() +isaPath = '/path/to/GLDS-NNN_metadata_GLDS-NNN-ISA.zip' # Replace with path to ISA archive file +zip_file_object = zipfile.ZipFile(isaPath, "r") +list_of_ISA_files = zip_file_object.namelist() # Print contents of zip file. Pick relevant one from list UnnormalizedCountsPath = '/path/to/GLDS-NNN_rna_seq_RSEM_Unnormalized_Counts.csv' -# Print contents of ISA zip file to view file order list_of_ISA_files # There are datasets that have multiple assays (including microarray), so the RNAseq ISA files from the above output must be selected. # Txt files outputted above are indexed as 0, 1, 2, etc. Fill in the indexed number corresponding to the sample (s_*txt) and assay files for RNAseq (a_*_(RNA-Seq).txt) in the code block below. 
# Extract metadata from the sample file (s_*txt) - sample_file = list_of_ISA_files[1] # replace [1] with index corresponding to the (s_*txt) file file = zip_file_object.open(sample_file) sample_table = pd.read_csv(zip_file_object.open(sample_file), sep='\t') - # Extract metadata from the assay (a_*_(RNA-Seq).txt) file - assay_file = list_of_ISA_files[0] # replace [0] with index corresponding to the (a_*_(RNA-Seq).txt) file file = zip_file_object.open(assay_file) assay_table = pd.read_csv(zip_file_object.open(assay_file), sep='\t') - # Check the sample table - pd.set_option('display.max_columns', None) print(sample_table.head(n=3)) - # Check the assay table - pd.set_option('display.max_columns', None) assay_table.head(n=3) - - -# Get raw counts table +### Get raw counts table raw_counts_table = pd.read_csv(UnnormalizedCountsPath, index_col=0) raw_counts_table.index.rename('Gene_ID', inplace=True) @@ -1982,8 +1969,7 @@ raw_counts_transcripts = raw_counts_table[raw_counts_table.index.str.contains('^ raw_counts_transcripts = raw_counts_transcripts.sort_values(by=list(raw_counts_transcripts), ascending=False) print(raw_counts_transcripts) - -# Get ERCC counts +### Get ERCC counts ercc_counts = raw_counts_table[raw_counts_table.index.str.contains('^ERCC-')] ercc_counts.reset_index(inplace=True) @@ -1991,17 +1977,14 @@ ercc_counts = ercc_counts.rename(columns={'Gene_ID':'ERCC ID'}) ercc_counts = ercc_counts.sort_values(by=list(ercc_counts), ascending=False) print(ercc_counts.head()) -# Get files containing ERCC gene concentrations and metadata +### Get files containing ERCC gene concentrations and metadata ercc_url = 'https://assets.thermofisher.com/TFS-Assets/LSG/manuals/cms_095046.txt' ercc_table = pd.read_csv(ercc_url, '\t') print(ercc_table.head(n=3)) - - -### Calculate the number of ERCC genes detected in each of the 4 (A, B, C and D) groups for each sample - -# Extract ERCC counts and calculate the log(2) +## Calculate the number of ERCC genes detected in each of the 4 (A, B, C and D) groups for each sample +### Extract ERCC counts and calculate the log(2) meltERCC = ercc_counts.melt(id_vars=['ERCC ID']) meltERCC['log2 Count'] = meltERCC['value']+1 @@ -2009,7 +1992,7 @@ meltERCC['log2 Count'] = np.log2(meltERCC['log2 Count']) meltERCC = meltERCC.rename(columns={'variable':'Sample Name', 'value':'Count'}) print(meltERCC.head(n=3)) -# Build Mix dictionary to link sample name to mix added and read depth using the assay table +### Build Mix dictionary to link sample name to mix added and read depth using the assay table mix_dict = assay_table.filter(['Sample Name','Parameter Value[Spike-in Mix Number]', 'Parameter Value[Read Depth]']) @@ -2018,43 +2001,35 @@ mix_dict = mix_dict.rename(columns={'Parameter Value[Spike-in Mix Number]':'Mix' 'Total Reads'}) print(mix_dict.head(n=3)) - -# Make combined ercc counts and assay table +### Make combined ercc counts and assay table merged_ercc = meltERCC.merge(mix_dict, on='Sample Name') print(merged_ercc) -# Read ERCC info including concentrations from merged_ercc table +### Read ERCC info including concentrations from merged_ercc table groupA = ercc_table.loc[ercc_table['subgroup'] == 'A']['ERCC ID'] groupB = ercc_table.loc[ercc_table['subgroup'] == 'B']['ERCC ID'] groupC = ercc_table.loc[ercc_table['subgroup'] == 'C']['ERCC ID'] groupD = ercc_table.loc[ercc_table['subgroup'] == 'D']['ERCC ID'] - -# Make a dictionary for ERCC groups +### Make a dictionary for ERCC groups group_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['subgroup'])) - -# 
Calculate ERCC counts per million and log(2) counts per million +### Calculate ERCC counts per million and log(2) counts per million merged_ercc['Count per million'] = merged_ercc['Count'] / (merged_ercc['Total Reads'] / 1000000.0) merged_ercc['log2 Count per million'] = np.log2(merged_ercc['Count per million']+1) - -# Add ERCC group column +### Add ERCC group column merged_ercc['ERCC group'] = merged_ercc['ERCC ID'].map(group_dict) merged_ercc = merged_ercc.sort_values(by=['Mix'], ascending=True) print(merged_ercc) - -### Filter and calculate mean counts per million of Mix1 and Mix2 spiked samples in each of the 4 groups -## Check the 'Parameter Value[Spike-in Mix Number]' column of the assay table - -## If the values do not have a space, change 'Mix 1' and 'Mix 2' to 'Mix1' and 'Mix2', respectively -# Filter Mix1 CPM and Mix2 CPM in group A +## Filter and calculate mean counts per million of Mix1 and Mix2 spiked samples in each of the 4 groups +### Filter Mix1 CPM and Mix2 CPM in group A Adf = merged_ercc.loc[merged_ercc['ERCC group'] == 'A'] Amix1df = Adf.loc[Adf['Mix']=='Mix 1'] @@ -2070,8 +2045,7 @@ adf = Amix1df.merge(Amix2df, on='ERCC ID', suffixes=('', '_2')) adf = adf.reset_index() adf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (adf['Avg Mix1 CPM'] / adf['Avg Mix2 CPM']) - -# Filter Mix1 CPM and Mix2 CPM in group B +### Filter Mix1 CPM and Mix2 CPM in group B Bdf = merged_ercc.loc[merged_ercc['ERCC group'] == 'B'] Bmix1df = Bdf.loc[Bdf['Mix']=='Mix 1'] @@ -2087,8 +2061,7 @@ bdf = Bmix1df.merge(Bmix2df, on='ERCC ID') bdf = bdf.reset_index() bdf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (bdf['Avg Mix1 CPM'] / bdf['Avg Mix2 CPM']) - -# Filter Mix1 CPM and Mix2 CPM in group C +### Filter Mix1 CPM and Mix2 CPM in group C Cdf = merged_ercc.loc[merged_ercc['ERCC group'] == 'C'] Cmix1df = Cdf.loc[Cdf['Mix']=='Mix 1'] @@ -2104,8 +2077,7 @@ cdf = Cmix1df.merge(Cmix2df, on='ERCC ID') cdf = cdf.reset_index() cdf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (cdf['Avg Mix1 CPM'] / cdf['Avg Mix2 CPM']) - -# Filter Mix1 CPM and Mix2 CPM in group D +### Filter Mix1 CPM and Mix2 CPM in group D Ddf = merged_ercc.loc[merged_ercc['ERCC group'] == 'D'] Dmix1df = Ddf.loc[Ddf['Mix']=='Mix 1'] @@ -2121,9 +2093,7 @@ ddf = Dmix1df.merge(Dmix2df, on='ERCC ID') ddf = ddf.reset_index() ddf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (ddf['Avg Mix1 CPM'] / ddf['Avg Mix2 CPM']) - -##### Multi-sample ERCC analyses - +## Multi-sample ERCC analyses ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group A in Mix 1 and Mix 2 spiked samples a = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupA, hue="Mix",data=merged_ercc[merged_ercc['ERCC ID'].isin(groupA)], kind="box", col="ERCC group", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2138,7 +2108,6 @@ plt.title("ERCC Group A") a1.set(ylim=(0, 6)) print('Number of ERCC detected in group A (out of 23) =', adf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) - ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group B in Mix 1 and Mix 2 spiked samples b = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupB, hue="Mix", data=merged_ercc[merged_ercc['ERCC ID'].isin(groupB)], kind="box", col="ERCC group", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2154,7 +2123,6 @@ plt.title("ERCC Group B") b.set(ylim=(0, 2)) print('Number of ERCC detected in group B (out of 23) =', bdf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) - ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group C in 
Mix 1 and Mix 2 spiked samples c = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupC, hue="Mix", data=merged_ercc[merged_ercc['ERCC ID'].isin(groupC)], kind="box", col="ERCC group", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2170,7 +2138,6 @@ plt.title("ERCC Group C") c.set(ylim=(0, 2)) print('Number of ERCC detected in group C (out of 23) =', cdf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) - ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group D in Mix 1 and Mix 2 spiked samples d = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupD, hue="Mix", data=merged_ercc[merged_ercc['ERCC ID'].isin(groupD)], col="ERCC group", kind="box", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2186,35 +2153,46 @@ plt.title("ERCC Group D") d.set(ylim=(0, 1)) print('Number of ERCC detected in group D (out of 23) =', ddf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) +## Individual sample ERCC analyses +# Calculate and plot ERCC metrics from individual samples, including limit of detection, dynamic range, and R^2 of counts vs. concentration. + +#View the ERCC table +print(ercc_table.head(n=3)) -##### Individual sample ERCC analyses +# Check assay_table header to identify the 'Sample Name' column and the column title indicating the 'Spike-in Mix Nmber' if it's indicated in the metadata. -#Calculate and plot ERCC metrics from individual samples, including limit of detection, dynamic range, and R^2 of counts vs. concentration. +pd.set_option('display.max_columns', None) +print(assay_table.head(n=3)) -#View the ERCC table +# Get ERCC counts for all samples -print(ercc_table.head(n=3)) +ercc_counts = raw_counts_table[raw_counts_table.index.str.contains('^ERCC-')] +ercc_counts = ercc_counts.sort_values(by=list(ercc_counts), ascending=False) +print(ercc_counts.head()) # Make a dictionary for ERCC concentrations for Mix 1 mix1_conc_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['concentration in Mix 1 (attomoles/ul)'])) - # Get samples spiked with Mix 1 mix1_samples = assay_table[assay_table['Parameter Value[Spike-in Mix Number]'] == 'Mix 1']['Sample Name'] +# Get ERCC counts for Mix 1 spiked samples + +ercc_counts_mix_1 = ercc_counts[mix1_samples] +ercc_counts_mix_1['ERCC conc (attomoles/ul)'] = ercc_counts_mix_1.index.map(mix1_conc_dict) +print(ercc_counts_mix_1.head(n=3)) + # Make a dictionary for ERCC concentrations for Mix 2 mix2_conc_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['concentration in Mix 2 (attomoles/ul)'])) - # Get samples spiked with Mix 2 mix2_samples = assay_table[assay_table['Parameter Value[Spike-in Mix Number]'] == 'Mix 2']['Sample Name'] - # Get ERCC counts for Mix 2 spiked samples ercc_counts_mix_2 = ercc_counts[mix2_samples] @@ -2279,7 +2257,6 @@ for i in range(0, len(ercc_counts_mix_2.columns)-1): nonzero_counts_sorted = nonzero_counts.sort_values('Conc') nonzero_counts_list_2.append(nonzero_counts_sorted) - # Plot each sample using linear regression of scatter plot with x = log2 Conc and y = log2 Counts. # Return min, max, R^2 and dynamic range (max / min) values. 
@@ -2366,7 +2343,6 @@ for ax in axs.flat: ax.tick_params(direction='in', axis='both', labelsize=9, labelleft=True, labelbottom=True); samples.append(all_columns[counter]) - if(len(xvalues) == 0): mins.append('NaN') maxs.append('NaN') @@ -2417,7 +2393,6 @@ for ax in axs.flat: counter = counter + 1 - # Create directory for saved files import os @@ -2432,12 +2407,8 @@ stats.to_csv('ERCC_analysis/ERCC_stats_GLDS-NNN.csv', index = False) stats.filter(items = ['Samples', 'Dynamic range']).to_csv('ERCC_analysis/ERCC_dynrange_GLDS-NNN_mqc.csv', index = False) stats.filter(items = ['Samples', 'R']).to_csv('ERCC_analysis/ERCC_rsq_GLDS-NNN_mqc.csv', index = False) - - -### Generate data and metadata files needed for ERCC DESeq2 analysis - -# ERCC Mix 1 and Mix 2 are distributed so that half the samples receive Mix 1 spike-in and half receive Mix 2 spike-in. - +## Generate data and metadata files needed for ERCC DESeq2 analysis +# ERCC Mix 1 and Mix 2 are distributed so that ~half the samples receive Mix 1 spike-in and ~half receive Mix 2 spike-in. # Transcripts in Mix 1 and Mix 2 are present at a known ratio, so we can determine how well these patterns are revealed in the dataset. # Get sample table @@ -2488,7 +2459,6 @@ ERCCcounts.to_csv('ERCC_analysis/ERCCcounts.csv') ### 10b. Perform DESeq2 Analysis of ERCC Counts in R ```R - ## Install R packages if not already installed if (!requireNamespace("BiocManager", quietly = TRUE)) @@ -2508,7 +2478,6 @@ coldata <- read.csv('ERCC_analysis/ERCCmetadata.csv', row.names=1) #INPUT coldata$Mix <- factor(coldata$Mix) all(rownames(coldata) == colnames(cts)) - ## Make DESeqDataSet object dds <- DESeqDataSetFromMatrix(countData = cts, @@ -2516,7 +2485,6 @@ dds <- DESeqDataSetFromMatrix(countData = cts, design = ~ Mix) dds - ## Filter out ERCC genes with counts of less than 10 in all samples ##### keepGenes <- rowSums(counts(dds)) > 10 @@ -2524,7 +2492,6 @@ dds <- dds[keepGenes,] dds - ## Run DESeq2 analysis and calculate results dds <- DESeq(dds) @@ -2553,37 +2520,32 @@ write.csv(normcounts, 'ERCC_analysis/ERCC_normcounts.csv') #OUTPUT ### 10c. Analyze ERCC DESeq2 Results in Python ```python - -# Import python packages +## Import python packages import pandas as pd from urllib.request import urlopen, quote, urlretrieve import seaborn as sns import matplotlib.pyplot as plt - -# Import ERCC DESeq2 results +## Import ERCC DESeq2 results deseq2out = pd.read_csv('ERCC_analysis/ERCC_DESeq2.csv', index_col=0) # INPUT #deseq2out.index = deseq2out.index.str.replace('_','-') deseq2out.rename(columns ={'baseMean' : 'meanNormCounts'}, inplace = True) print(deseq2out.head()) - -# Get files containing ERCC gene concentrations and metadata +## Get files containing ERCC gene concentrations and metadata ercc_url = 'https://assets.thermofisher.com/TFS-Assets/LSG/manuals/cms_095046.txt' ercc_table = pd.read_csv(ercc_url, '\t', index_col='ERCC ID') print(ercc_table.head(n=3)) - -# Combine ERCC DESeq2 results and ercc_table +## Combine ERCC DESeq2 results and ercc_table combined = deseq2out.merge(ercc_table, left_index=True, right_index=True) print(combined.head()) - -# Filter p-value and adj. p-value cutoff at 10^-3 +## Filter p-value and adj. 
p-value cutoff at 10^-3 combined['cleaned_padj'] = combined['padj'] combined.loc[(combined.cleaned_padj < 0.001),'cleaned_padj']=0.001 @@ -2593,14 +2555,12 @@ combined.loc[(combined.cleaned_pvalue < 0.001),'cleaned_pvalue']=0.001 print(combined.head()) - -# Export the filtered combined ERCC DESeq2 results and ercc_table -# Remember to change file name to GLDS# analyzing +## Export the filtered combined ERCC DESeq2 results and ercc_table +### Remember to change file name to GLDS# analyzing combined.filter(items = ['ERCC ID', 'meanNormCounts', 'cleaned_pvalue','cleaned_padj']).to_csv('ERCC_analysis/ERCC_lodr_GLDS-NNN_mqc.csv') - -# Plot p-value vs. mean normalized ERCC counts +## Plot p-value vs. mean normalized ERCC counts fig, ax = plt.subplots(figsize=(10, 7)) @@ -2616,8 +2576,7 @@ sns.lineplot(data=combined, x="meanNormCounts", y="cleaned_pvalue", ax.set_xscale("linear"); ax.set_yscale("log"); - -# Plot Adjp-value vs. mean normalized ERCC counts +## Plot Adjp-value vs. mean normalized ERCC counts fig, ax = plt.subplots(figsize=(10, 7)) From 72a1c85aabe40c5dbac4885df8f520ba9db8a50a Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Thu, 23 Mar 2023 13:17:49 -0700 Subject: [PATCH 6/7] Update GL-DPPD-7101-F.md code / revert comment spacing changes --- .../GL-DPPD-7101-F.md | 138 ++++++++++++------ 1 file changed, 94 insertions(+), 44 deletions(-) diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md index c5c190a3..80cb8b98 100644 --- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md +++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md @@ -1915,6 +1915,8 @@ Output data with considering ERCC spike-in genes: ### 10a. Evaluate ERCC Count Data in Python ```python +### Setting up the notebook + # import python packages import pandas as pd pd.set_option('mode.chained_assignment', None) # suppress chained indexing warnings @@ -1926,6 +1928,7 @@ import seaborn as sns from scipy.stats import linregress import matplotlib.pyplot as plt + ### Get and parse data and metadata # Get and unzip ISA.zip to extract metadata. @@ -1933,33 +1936,43 @@ import matplotlib.pyplot as plt accession = 'GLDS-NNN' # Replace Ns with GLDS number isaPath = '/path/to/GLDS-NNN_metadata_GLDS-NNN-ISA.zip' # Replace with path to ISA archive file zip_file_object = zipfile.ZipFile(isaPath, "r") -list_of_ISA_files = zip_file_object.namelist() # Print contents of zip file. Pick relevant one from list +list_of_ISA_files = zip_file_object.namelist() UnnormalizedCountsPath = '/path/to/GLDS-NNN_rna_seq_RSEM_Unnormalized_Counts.csv' +# Print contents of ISA zip file to view file order list_of_ISA_files # There are datasets that have multiple assays (including microarray), so the RNAseq ISA files from the above output must be selected. # Txt files outputted above are indexed as 0, 1, 2, etc. Fill in the indexed number corresponding to the sample (s_*txt) and assay files for RNAseq (a_*_(RNA-Seq).txt) in the code block below. 
# Extract metadata from the sample file (s_*txt) + sample_file = list_of_ISA_files[1] # replace [1] with index corresponding to the (s_*txt) file file = zip_file_object.open(sample_file) sample_table = pd.read_csv(zip_file_object.open(sample_file), sep='\t') + # Extract metadata from the assay (a_*_(RNA-Seq).txt) file + assay_file = list_of_ISA_files[0] # replace [0] with index corresponding to the (a_*_(RNA-Seq).txt) file file = zip_file_object.open(assay_file) assay_table = pd.read_csv(zip_file_object.open(assay_file), sep='\t') + # Check the sample table + pd.set_option('display.max_columns', None) print(sample_table.head(n=3)) + # Check the assay table + pd.set_option('display.max_columns', None) assay_table.head(n=3) -### Get raw counts table + + +# Get raw counts table raw_counts_table = pd.read_csv(UnnormalizedCountsPath, index_col=0) raw_counts_table.index.rename('Gene_ID', inplace=True) @@ -1969,7 +1982,8 @@ raw_counts_transcripts = raw_counts_table[raw_counts_table.index.str.contains('^ raw_counts_transcripts = raw_counts_transcripts.sort_values(by=list(raw_counts_transcripts), ascending=False) print(raw_counts_transcripts) -### Get ERCC counts + +# Get ERCC counts ercc_counts = raw_counts_table[raw_counts_table.index.str.contains('^ERCC-')] ercc_counts.reset_index(inplace=True) @@ -1977,14 +1991,17 @@ ercc_counts = ercc_counts.rename(columns={'Gene_ID':'ERCC ID'}) ercc_counts = ercc_counts.sort_values(by=list(ercc_counts), ascending=False) print(ercc_counts.head()) -### Get files containing ERCC gene concentrations and metadata +# Get files containing ERCC gene concentrations and metadata ercc_url = 'https://assets.thermofisher.com/TFS-Assets/LSG/manuals/cms_095046.txt' ercc_table = pd.read_csv(ercc_url, '\t') print(ercc_table.head(n=3)) -## Calculate the number of ERCC genes detected in each of the 4 (A, B, C and D) groups for each sample -### Extract ERCC counts and calculate the log(2) + + +### Calculate the number of ERCC genes detected in each of the 4 (A, B, C and D) groups for each sample + +# Extract ERCC counts and calculate the log(2) meltERCC = ercc_counts.melt(id_vars=['ERCC ID']) meltERCC['log2 Count'] = meltERCC['value']+1 @@ -1992,7 +2009,7 @@ meltERCC['log2 Count'] = np.log2(meltERCC['log2 Count']) meltERCC = meltERCC.rename(columns={'variable':'Sample Name', 'value':'Count'}) print(meltERCC.head(n=3)) -### Build Mix dictionary to link sample name to mix added and read depth using the assay table +# Build Mix dictionary to link sample name to mix added and read depth using the assay table mix_dict = assay_table.filter(['Sample Name','Parameter Value[Spike-in Mix Number]', 'Parameter Value[Read Depth]']) @@ -2001,35 +2018,43 @@ mix_dict = mix_dict.rename(columns={'Parameter Value[Spike-in Mix Number]':'Mix' 'Total Reads'}) print(mix_dict.head(n=3)) -### Make combined ercc counts and assay table + +# Make combined ercc counts and assay table merged_ercc = meltERCC.merge(mix_dict, on='Sample Name') print(merged_ercc) -### Read ERCC info including concentrations from merged_ercc table +# Read ERCC info including concentrations from merged_ercc table groupA = ercc_table.loc[ercc_table['subgroup'] == 'A']['ERCC ID'] groupB = ercc_table.loc[ercc_table['subgroup'] == 'B']['ERCC ID'] groupC = ercc_table.loc[ercc_table['subgroup'] == 'C']['ERCC ID'] groupD = ercc_table.loc[ercc_table['subgroup'] == 'D']['ERCC ID'] -### Make a dictionary for ERCC groups + +# Make a dictionary for ERCC groups group_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['subgroup'])) -### 
Calculate ERCC counts per million and log(2) counts per million + +# Calculate ERCC counts per million and log(2) counts per million merged_ercc['Count per million'] = merged_ercc['Count'] / (merged_ercc['Total Reads'] / 1000000.0) merged_ercc['log2 Count per million'] = np.log2(merged_ercc['Count per million']+1) -### Add ERCC group column + +# Add ERCC group column merged_ercc['ERCC group'] = merged_ercc['ERCC ID'].map(group_dict) merged_ercc = merged_ercc.sort_values(by=['Mix'], ascending=True) print(merged_ercc) -## Filter and calculate mean counts per million of Mix1 and Mix2 spiked samples in each of the 4 groups -### Filter Mix1 CPM and Mix2 CPM in group A + +### Filter and calculate mean counts per million of Mix1 and Mix2 spiked samples in each of the 4 groups +## Check the 'Parameter Value[Spike-in Mix Number]' column of the assay table +## If the values do not have a space, change 'Mix 1' and 'Mix 2' to 'Mix1' and 'Mix2', respectively + +# Filter Mix1 CPM and Mix2 CPM in group A Adf = merged_ercc.loc[merged_ercc['ERCC group'] == 'A'] Amix1df = Adf.loc[Adf['Mix']=='Mix 1'] @@ -2045,7 +2070,8 @@ adf = Amix1df.merge(Amix2df, on='ERCC ID', suffixes=('', '_2')) adf = adf.reset_index() adf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (adf['Avg Mix1 CPM'] / adf['Avg Mix2 CPM']) -### Filter Mix1 CPM and Mix2 CPM in group B + +# Filter Mix1 CPM and Mix2 CPM in group B Bdf = merged_ercc.loc[merged_ercc['ERCC group'] == 'B'] Bmix1df = Bdf.loc[Bdf['Mix']=='Mix 1'] @@ -2061,7 +2087,8 @@ bdf = Bmix1df.merge(Bmix2df, on='ERCC ID') bdf = bdf.reset_index() bdf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (bdf['Avg Mix1 CPM'] / bdf['Avg Mix2 CPM']) -### Filter Mix1 CPM and Mix2 CPM in group C + +# Filter Mix1 CPM and Mix2 CPM in group C Cdf = merged_ercc.loc[merged_ercc['ERCC group'] == 'C'] Cmix1df = Cdf.loc[Cdf['Mix']=='Mix 1'] @@ -2077,7 +2104,8 @@ cdf = Cmix1df.merge(Cmix2df, on='ERCC ID') cdf = cdf.reset_index() cdf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (cdf['Avg Mix1 CPM'] / cdf['Avg Mix2 CPM']) -### Filter Mix1 CPM and Mix2 CPM in group D + +# Filter Mix1 CPM and Mix2 CPM in group D Ddf = merged_ercc.loc[merged_ercc['ERCC group'] == 'D'] Dmix1df = Ddf.loc[Ddf['Mix']=='Mix 1'] @@ -2093,7 +2121,9 @@ ddf = Dmix1df.merge(Dmix2df, on='ERCC ID') ddf = ddf.reset_index() ddf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (ddf['Avg Mix1 CPM'] / ddf['Avg Mix2 CPM']) -## Multi-sample ERCC analyses + +##### Multi-sample ERCC analyses + ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group A in Mix 1 and Mix 2 spiked samples a = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupA, hue="Mix",data=merged_ercc[merged_ercc['ERCC ID'].isin(groupA)], kind="box", col="ERCC group", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2108,6 +2138,7 @@ plt.title("ERCC Group A") a1.set(ylim=(0, 6)) print('Number of ERCC detected in group A (out of 23) =', adf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) + ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group B in Mix 1 and Mix 2 spiked samples b = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupB, hue="Mix", data=merged_ercc[merged_ercc['ERCC ID'].isin(groupB)], kind="box", col="ERCC group", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2123,6 +2154,7 @@ plt.title("ERCC Group B") b.set(ylim=(0, 2)) print('Number of ERCC detected in group B (out of 23) =', bdf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) + ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group C in 
Mix 1 and Mix 2 spiked samples c = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupC, hue="Mix", data=merged_ercc[merged_ercc['ERCC ID'].isin(groupC)], kind="box", col="ERCC group", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2138,6 +2170,7 @@ plt.title("ERCC Group C") c.set(ylim=(0, 2)) print('Number of ERCC detected in group C (out of 23) =', cdf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) + ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group D in Mix 1 and Mix 2 spiked samples d = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupD, hue="Mix", data=merged_ercc[merged_ercc['ERCC ID'].isin(groupD)], col="ERCC group", kind="box", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2153,11 +2186,13 @@ plt.title("ERCC Group D") d.set(ylim=(0, 1)) print('Number of ERCC detected in group D (out of 23) =', ddf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) -## Individual sample ERCC analyses -# Calculate and plot ERCC metrics from individual samples, including limit of detection, dynamic range, and R^2 of counts vs. concentration. -#View the ERCC table +##### Individual sample ERCC analyses + +#Calculate and plot ERCC metrics from individual samples, including limit of detection, dynamic range, and R^2 of counts vs. concentration. + +#View the ERCC table print(ercc_table.head(n=3)) # Check assay_table header to identify the 'Sample Name' column and the column title indicating the 'Spike-in Mix Nmber' if it's indicated in the metadata. @@ -2171,26 +2206,23 @@ ercc_counts = raw_counts_table[raw_counts_table.index.str.contains('^ERCC-')] ercc_counts = ercc_counts.sort_values(by=list(ercc_counts), ascending=False) print(ercc_counts.head()) -# Make a dictionary for ERCC concentrations for Mix 1 +# Make a dictionary for ERCC concentrations for Mix 1 mix1_conc_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['concentration in Mix 1 (attomoles/ul)'])) # Get samples spiked with Mix 1 - mix1_samples = assay_table[assay_table['Parameter Value[Spike-in Mix Number]'] == 'Mix 1']['Sample Name'] # Get ERCC counts for Mix 1 spiked samples - ercc_counts_mix_1 = ercc_counts[mix1_samples] ercc_counts_mix_1['ERCC conc (attomoles/ul)'] = ercc_counts_mix_1.index.map(mix1_conc_dict) print(ercc_counts_mix_1.head(n=3)) -# Make a dictionary for ERCC concentrations for Mix 2 +# Make a dictionary for ERCC concentrations for Mix 2 mix2_conc_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['concentration in Mix 2 (attomoles/ul)'])) # Get samples spiked with Mix 2 - mix2_samples = assay_table[assay_table['Parameter Value[Spike-in Mix Number]'] == 'Mix 2']['Sample Name'] # Get ERCC counts for Mix 2 spiked samples @@ -2199,6 +2231,7 @@ ercc_counts_mix_2 = ercc_counts[mix2_samples] ercc_counts_mix_2['ERCC conc (attomoles/ul)'] = ercc_counts_mix_2.index.map(mix2_conc_dict) print(ercc_counts_mix_2.head(n=3)) + # Create a scatter plot of log(2) ERCC counts versus log(2) ERCC concentration for each sample columns_mix_1 = ercc_counts_mix_1.columns.drop(['ERCC conc (attomoles/ul)']) @@ -2231,6 +2264,7 @@ for ax in axs.flat: counter = counter + 1 + ## Calculate and plot linear regression of log(2) ERCC counts versus log(2) ERCC concentration for each sample # Filter counts > 0 @@ -2257,8 +2291,8 @@ for i in range(0, len(ercc_counts_mix_2.columns)-1): nonzero_counts_sorted = nonzero_counts.sort_values('Conc') nonzero_counts_list_2.append(nonzero_counts_sorted) -# Plot each sample using linear regression of scatter plot with x = log2 Conc and 
y = log2 Counts. -# Return min, max, R^2 and dynamic range (max / min) values. + +# Plot each sample using linear regression of scatter plot with x = log2 Conc and y = log2 Counts. Return min, max, R^2 and dynamic range (max / min) values. samples = [] mins = [] @@ -2343,6 +2377,7 @@ for ax in axs.flat: ax.tick_params(direction='in', axis='both', labelsize=9, labelleft=True, labelbottom=True); samples.append(all_columns[counter]) + if(len(xvalues) == 0): mins.append('NaN') maxs.append('NaN') @@ -2393,6 +2428,7 @@ for ax in axs.flat: counter = counter + 1 + # Create directory for saved files import os @@ -2403,13 +2439,15 @@ os.makedirs(name="ERCC_analysis", exist_ok=True) stats = pd.DataFrame(list(zip(samples, mins, maxs, dyranges, rs))) stats.columns = ['Samples', 'Min', 'Max', 'Dynamic range', 'R'] -stats.to_csv('ERCC_analysis/ERCC_stats_GLDS-NNN.csv', index = False) -stats.filter(items = ['Samples', 'Dynamic range']).to_csv('ERCC_analysis/ERCC_dynrange_GLDS-NNN_mqc.csv', index = False) -stats.filter(items = ['Samples', 'R']).to_csv('ERCC_analysis/ERCC_rsq_GLDS-NNN_mqc.csv', index = False) +stats.to_csv('ERCC_analysis/ERCC_stats_GLDS-NNN.csv', index = False) +stats.filter(items = ['Samples', 'Dynamic range']).to_csv('ERCC_analysis/ERCC_dynrange_GLDS-NNN_mqc.csv', index = False) +stats.filter(items = ['Samples', 'R']).to_csv('ERCC_analysis/ERCC_rsq_GLDS-NNN_mqc.csv', index = False) -## Generate data and metadata files needed for ERCC DESeq2 analysis -# ERCC Mix 1 and Mix 2 are distributed so that ~half the samples receive Mix 1 spike-in and ~half receive Mix 2 spike-in. -# Transcripts in Mix 1 and Mix 2 are present at a known ratio, so we can determine how well these patterns are revealed in the dataset. + + +### Generate data and metadata files needed for ERCC DESeq2 analysis + +# ERCC Mix 1 and Mix 2 are distributed so that half the samples receive Mix 1 spike-in and half receive Mix 2 spike-in. Transcripts in Mix 1 and Mix 2 are present at a known ratio, so we can determine how well these patterns are revealed in the dataset. # Get sample table @@ -2419,7 +2457,7 @@ pd.set_option('display.max_columns', None) print(combined) # Create metadata table containing samples and their respective ERCC spike-in Mix number -# Note: Sometimes "Number" in [Spike-in Mix Number] is spelled "number" and this could cause error in mismatch search +# Sometimes Number in [Spike-in Mix Number] is spelled 'number' and this could cause error in mismatch search ERCCmetadata = combined[['Parameter Value[Spike-in Mix Number]']] ERCCmetadata.index = ERCCmetadata.index.str.replace('-','_') @@ -2459,6 +2497,7 @@ ERCCcounts.to_csv('ERCC_analysis/ERCCcounts.csv') ### 10b. Perform DESeq2 Analysis of ERCC Counts in R ```R + ## Install R packages if not already installed if (!requireNamespace("BiocManager", quietly = TRUE)) @@ -2478,6 +2517,7 @@ coldata <- read.csv('ERCC_analysis/ERCCmetadata.csv', row.names=1) #INPUT coldata$Mix <- factor(coldata$Mix) all(rownames(coldata) == colnames(cts)) + ## Make DESeqDataSet object dds <- DESeqDataSetFromMatrix(countData = cts, @@ -2485,6 +2525,7 @@ dds <- DESeqDataSetFromMatrix(countData = cts, design = ~ Mix) dds + ## Filter out ERCC genes with counts of less than 10 in all samples ##### keepGenes <- rowSums(counts(dds)) > 10 @@ -2492,6 +2533,7 @@ dds <- dds[keepGenes,] dds + ## Run DESeq2 analysis and calculate results dds <- DESeq(dds) @@ -2520,32 +2562,37 @@ write.csv(normcounts, 'ERCC_analysis/ERCC_normcounts.csv') #OUTPUT ### 10c. 
Analyze ERCC DESeq2 Results in Python ```python -## Import python packages + +# Import python packages import pandas as pd from urllib.request import urlopen, quote, urlretrieve import seaborn as sns import matplotlib.pyplot as plt -## Import ERCC DESeq2 results + +# Import ERCC DESeq2 results deseq2out = pd.read_csv('ERCC_analysis/ERCC_DESeq2.csv', index_col=0) # INPUT #deseq2out.index = deseq2out.index.str.replace('_','-') deseq2out.rename(columns ={'baseMean' : 'meanNormCounts'}, inplace = True) print(deseq2out.head()) -## Get files containing ERCC gene concentrations and metadata + +# Get files containing ERCC gene concentrations and metadata ercc_url = 'https://assets.thermofisher.com/TFS-Assets/LSG/manuals/cms_095046.txt' ercc_table = pd.read_csv(ercc_url, '\t', index_col='ERCC ID') print(ercc_table.head(n=3)) -## Combine ERCC DESeq2 results and ercc_table + +# Combine ERCC DESeq2 results and ercc_table combined = deseq2out.merge(ercc_table, left_index=True, right_index=True) print(combined.head()) -## Filter p-value and adj. p-value cutoff at 10^-3 + +# Filter p-value and adj. p-value cutoff at 10^-3 combined['cleaned_padj'] = combined['padj'] combined.loc[(combined.cleaned_padj < 0.001),'cleaned_padj']=0.001 @@ -2555,12 +2602,14 @@ combined.loc[(combined.cleaned_pvalue < 0.001),'cleaned_pvalue']=0.001 print(combined.head()) -## Export the filtered combined ERCC DESeq2 results and ercc_table -### Remember to change file name to GLDS# analyzing + +# Export the filtered combined ERCC DESeq2 results and ercc_table +# Remember to change file name to GLDS# analyzing combined.filter(items = ['ERCC ID', 'meanNormCounts', 'cleaned_pvalue','cleaned_padj']).to_csv('ERCC_analysis/ERCC_lodr_GLDS-NNN_mqc.csv') -## Plot p-value vs. mean normalized ERCC counts + +# Plot p-value vs. mean normalized ERCC counts fig, ax = plt.subplots(figsize=(10, 7)) @@ -2576,7 +2625,8 @@ sns.lineplot(data=combined, x="meanNormCounts", y="cleaned_pvalue", ax.set_xscale("linear"); ax.set_yscale("log"); -## Plot Adjp-value vs. mean normalized ERCC counts + +# Plot Adjp-value vs. mean normalized ERCC counts fig, ax = plt.subplots(figsize=(10, 7)) From 1b93859baf62ac1ef30de294c08fd60cd2c4bd9e Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Thu, 23 Mar 2023 20:37:10 +0000 Subject: [PATCH 7/7] docs: update workflow instructions README.md --- RNAseq/Workflow_Documentation/NF_RCP-F/README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/README.md b/RNAseq/Workflow_Documentation/NF_RCP-F/README.md index 1e5515a1..7dfb3f32 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/README.md +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/README.md @@ -24,8 +24,8 @@ document](../../Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md): 1. **Analysis Staging Subworkflow** - Description: - - This subworkflow extracts the metadata parameters (e.g. organism, library layout) needed for processing from the GLDS ISA archive and retrieves the raw reads files hosted on the [GeneLab Data Repository](https://genelab-data.ndc.nasa.gov/genelab/projects). - > *GLDS ISA archive*: ISA directory containing Investigation, Study, and Assay (ISA) metadata files for a respective GLDS dataset - the *ISA.zip file is located in the [GLDS repository](https://genelab-data.ndc.nasa.gov/genelab/projects) under 'STUDY FILES' -> 'Study Metadata Files' for any GLDS dataset in the GeneLab Data Repository. + - This subworkflow extracts the metadata parameters (e.g. 
organism, library layout) needed for processing from the OSD/GLDS ISA archive and retrieves the raw reads files hosted on the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/). + > *OSD/GLDS ISA archive*: ISA directory containing Investigation, Study, and Assay (ISA) metadata files for a respective GLDS dataset - the *ISA.zip file is located in the [OSDR](https://osdr.nasa.gov/bio/repo/) under 'Files' -> 'Study Metadata Files' for any GeneLab Data Set (GLDS) in the OSDR. 2. **RNASeq Consensus Pipeline Subworkflow** @@ -149,7 +149,9 @@ nextflow run NF_RCP-F_1.0.3/main.nf \
-#### 4b. Approach 2: Run the workflow on a GeneLab RNAseq dataset using local Ensembl reference fasta and gtf files +#### 4b. Approach 2: Run the workflow on a GeneLab RNAseq dataset using local reference fasta and gtf files + +> Note: The `--ref_source` and `--ensemblVersion` parameters should match the reference source and version number of the local reference fasta and gtf files used ```bash nextflow run NF_RCP-F_1.0.3/main.nf \ @@ -235,13 +237,13 @@ See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nex ### 5. Additional Output Files The outputs from the Analysis Staging and V&V Pipeline Subworkflows are described below: -> Note: The outputs from the RNASeq Consensus Pipeline Subworkflow are documented in the [GL-DPPD-7101-F.md](../../Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md) processing protocol. +> Note: The outputs from the RNASeq Consensus Pipeline Subworkflow are documented in the [GL-DPPD-7101-F](../../Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md) processing protocol. **Analysis Staging Subworkflow** - Output: - \*_bulkRNASeq_v1_runsheet.csv (table containing metadata required for processing, including the raw reads files location) - - \*-ISA.zip (the ISA archive of the GLDS datasets to be processed, downloaded from the GeneLab Data Repository) + - \*-ISA.zip (the ISA archive of the GLDS datasets to be processed, downloaded from the OSDR) - \*_metadata_table.txt (table that includes additional information about the GLDS dataset, not used for processing) @@ -270,4 +272,3 @@ Standard Nextflow resource usage logs are also produced as follows: - Resource_Usage/execution_trace_{timestamp}.txt (an execution tracing file that contains information about each process executed in the workflow, including: submission time, start time, completion time, cpu and memory used, machine-readable output)
-
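
> Note: The `Resource_Usage/execution_trace_{timestamp}.txt` file listed above is a plain tab-separated table, so per-process runtime can be summarized directly. The sketch below is illustrative only and is not part of the workflow: it assumes Nextflow's default trace columns (`name`, `status`, `realtime`) and the `Resource_Usage/execution_trace_*.txt` naming described above; adjust the path and column names if the trace configuration of the run differs.

```python
# Illustrative sketch: summarize per-process wall-clock time from a Nextflow execution trace.
# Assumes the default trace columns ('name', 'status', 'realtime'); durations in the trace are
# humanized strings such as '1m 30s', so they are converted to seconds before aggregating.

import glob
import re

import pandas as pd


def duration_to_seconds(value):
    """Convert a humanized Nextflow duration (e.g. '1h 2m 3.5s' or '850ms') to seconds."""
    if not isinstance(value, str) or value.strip() in ("-", ""):
        return float("nan")
    units = {"ms": 0.001, "s": 1, "m": 60, "h": 3600, "d": 86400}
    return sum(float(n) * units[u] for n, u in re.findall(r"([\d.]+)\s*(ms|s|m|h|d)", value))


# Pick up the most recent trace file written to the Resource_Usage directory
trace_file = sorted(glob.glob("Resource_Usage/execution_trace_*.txt"))[-1]
trace = pd.read_csv(trace_file, sep="\t")

# Collapse tagged task names like 'SOME_PROCESS (sample-1)' down to the bare process name
trace["process"] = trace["name"].str.replace(r"\s*\(.*\)$", "", regex=True)
trace["realtime_s"] = trace["realtime"].map(duration_to_seconds)

# Task status overview and the slowest processes by wall-clock time
print(trace["status"].value_counts())
summary = (
    trace.groupby("process")["realtime_s"]
    .agg(tasks="count", mean_s="mean", max_s="max")
    .sort_values("max_s", ascending=False)
)
print(summary.head(10))
```

Grouping on the bare process name keeps the summary readable when the same process runs once per sample, since all of its per-sample tasks collapse into a single row.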