From fb2659d1a2ddf722a40ec160f9198224c0a03455 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 22 Mar 2023 23:37:44 +0000 Subject: [PATCH 1/7] docs: sync with development repo version --- .../GL-DPPD-7101-F.md | 170 ++++++++++-------- 1 file changed, 100 insertions(+), 70 deletions(-) diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md index b6476054..7ac8487b 100644 --- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md +++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md @@ -37,6 +37,12 @@ The DESeq2 Normalization and DGE step, [step 9](#9-normalize-read-counts-perform - Fixed edge case where `contrasts.csv` and `ERCCnorm_contrasts.csv` table header and rows could become out of sync with each other in [step 9c](#9c-configure-metadata-sample-grouping-and-group-comparisons) and [step 9e](#9e-perform-dge-on-datasets-with-ercc-spike-in) by generating rows from header rather than generating both separately. +- Updated R version from 4.1.2 to 4.1.3. + +- Fixed edge case where certain scripts would crash if sample names were prefixes of other sample names. This had affected [step 4c](#4c-tablulate-star-counts-in-r), [step 8c](#8c-calculate-total-number-of-genes-expressed-per-sample-in-r), and [step 9d](#9d-import-rsem-genecounts) + +- Fixed rare edge case where groupwise mean and standard deviations could become misassociated to incorrect groups. This had affected [step 9f](#9f-prepare-genelab-dge-tables-with-annotations-on-datasets-with-ercc-spike-in) and [step 9i](#9i-prepare-genelab-dge-tables-with-annotations-on-datasets-without-ercc-spike-in) + --- # Table of contents @@ -110,20 +116,20 @@ The DESeq2 Normalization and DGE step, [step 9](#9-normalize-read-counts-perform |geneBody_coverage|4.0.0|[http://rseqc.sourceforge.net/#genebody-coverage-py](http://rseqc.sourceforge.net/#genebody-coverage-py)| |inner_distance|4.0.0|[http://rseqc.sourceforge.net/#inner-distance-py](http://rseqc.sourceforge.net/#inner-distance-py)| |read_distribution|4.0.0|[http://rseqc.sourceforge.net/#read-distribution-py](http://rseqc.sourceforge.net/#read-distribution-py)| -|R|4.1.2|[https://www.r-project.org/](https://www.r-project.org/)| +|R|4.1.3|[https://www.r-project.org/](https://www.r-project.org/)| |Bioconductor|3.14.0|[https://bioconductor.org](https://bioconductor.org)| |DESeq2|1.34|[https://bioconductor.org/packages/release/bioc/html/DESeq2.html](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)| -|tximport|1.22|[https://bioconductor.org/packages/release/bioc/html/tximport.html](https://bioconductor.org/packages/release/bioc/html/tximport.html)| +|tximport|1.27.1|[https://github.com/mikelove/tximport](https://github.com/mikelove/tximport)| |tidyverse|1.3.1|[https://www.tidyverse.org](https://www.tidyverse.org)| -|dp_tools|1.1.5|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)| -|singularity|3.9|[https://sylabs.io/](https://sylabs.io/)| |stringr|1.4.1|[https://github.com/tidyverse/stringr](https://github.com/tidyverse/stringr)| +|dp_tools|1.1.8|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)| |pandas|1.5.0|[https://github.com/pandas-dev/pandas](https://github.com/pandas-dev/pandas)| |seaborn|0.12.0|[https://seaborn.pydata.org/](https://seaborn.pydata.org/)| |matplotlib|3.6.0|[https://matplotlib.org/stable](https://matplotlib.org/stable)| |jupyter notebook|6.4.12|[https://jupyter-notebook.readthedocs.io/](https://jupyter-notebook.readthedocs.io/)| 
|numpy|1.23.3|[https://numpy.org/](https://numpy.org/)| |scipy|1.9.1|[https://scipy.org/](https://scipy.org/)| +|singularity|3.9|[https://sylabs.io/](https://sylabs.io/)| --- @@ -455,7 +461,7 @@ study <- read.csv(Sys.glob(file.path(work_dir,"samples.txt")), header = FALSE, r ff <- list.files(file.path(align_dir), pattern = "ReadsPerGene.out.tab", recursive=TRUE, full.names = TRUE) ## Reorder the *genes.results files to match the ordering of the ISA samples -ff <- ff[sapply(rownames(study), function(x)grep(paste0(x,'_ReadsPerGene.out.tab$'), ff, value=FALSE))] +ff <- ff[sapply(rownames(study), function(x)grep(paste0(align_dir, '/', x,'_ReadsPerGene.out.tab$'), ff, value=FALSE))] # Remove the first 4 lines counts.files <- lapply( ff, read.table, skip = 4 ) @@ -942,7 +948,7 @@ samples <- read.csv(Sys.glob(file.path(work_dir,"samples.txt")), header = FALSE, files <- list.files(file.path(counts_dir),pattern = ".genes.results", full.names = TRUE) ### reorder the genes.results files to match the ordering of the samples in the metadata file -files <- files[sapply(rownames(samples), function(x)grep(paste0(x,'.genes.results$'), files, value=FALSE))] +files <- files[sapply(rownames(samples), function(x)grep(paste0(counts_dir, '/', x,'.genes.results$'), files, value=FALSE))] names(files) <- rownames(samples) txi.rsem <- tximport(files, type = "rsem", txIn = FALSE, txOut = FALSE) @@ -981,7 +987,7 @@ sessionInfo() ### 9a. Create Sample RunSheet -> Note: Rather than running the command below to create the runsheet needed for processing, the runsheet may also be created manually by following the [file specification](../Workflow_Documentation/NF_RCP-F/examples/README.md). +> Note: Rather than running the command below to create the runsheet needed for processing, the runsheet may also be created manually by following the [file specification](../Workflow_Documentation/NF_RCP-F/examples/runsheet/README.md). ```bash ### Download the *ISA.zip file from the GeneLab Repository ### @@ -1137,7 +1143,7 @@ files <- list.files(file.path(counts_dir),pattern = ".genes.results", full.names ### Reorder the *genes.results files to match the ordering of the ISA samples ### -files <- files[sapply(rownames(study), function(x)grep(paste0(x,".genes.results$"), files, value=FALSE))] +files <- files[sapply(rownames(study), function(x)grep(paste0(counts_dir, '/', x,".genes.results$"), files, value=FALSE))] names(files) <- rownames(study) txi.rsem <- tximport(files, type = "rsem", txIn = FALSE, txOut = FALSE) @@ -1386,19 +1392,18 @@ reduced_output_table_1$LRT.p.value <- res_1_lrt@listData$padj ### Generate and add group mean and stdev columns to the (non-ERCC) DGE table ### tcounts <- as.data.frame(t(normCounts)) -tcounts$group <- group -group_means <- as.data.frame(t(aggregate(. ~ group,data = tcounts,mean))) -group_means <- group_means[-c(1),] -colnames(group_means) <- paste0("Group.Mean_",levels(factor(names(group)))) -group_stdev <- as.data.frame(t(aggregate(. ~ group,data = tcounts,sd))) -group_stdev <- group_stdev[-c(1),] -colnames(group_stdev) <- paste0("Group.Stdev_",levels(factor(names(group)))) +tcounts$group <- names(group) # Used final table group name formatting (e.g. '( Space Flight & Blue Light )' ) + +group_means <- as.data.frame(t(aggregate(. 
~ group,data = tcounts,mean))) # Compute group name group-wise means +colnames(group_means) <- paste0("Group.Mean_", group_means['group',]) # assign group name as column names -output_table_1 <- cbind(output_table_1,group_means) -reduced_output_table_1 <- cbind(reduced_output_table_1,group_means) +group_stdev <- as.data.frame(t(aggregate(. ~ group,data = tcounts,sd))) # Compute group name group-wise standard deviation +colnames(group_stdev) <- paste0("Group.Stdev_", group_stdev['group',]) # assign group name as column names -output_table_1 <- cbind(output_table_1,group_stdev) -reduced_output_table_1 <- cbind(reduced_output_table_1,group_stdev) +group_means <- group_means[-c(1),] # Drop group name row from data rows (now present as column names) +group_stdev <- group_stdev[-c(1),] # Drop group name row from data rows (now present as column names) +output_table_1 <- cbind(output_table_1,group_means, group_stdev) # Column bind the group-wise data +reduced_output_table_1 <- cbind(reduced_output_table_1,group_means, group_stdev) # Column bind the group-wise data rm(group_stdev,group_means,tcounts) @@ -1515,19 +1520,18 @@ reduced_output_table_2$LRT.p.value <- res_2_lrt@listData$padj ### Generate and add group mean and stdev columns to the ERCC-normalized DGE table ### tcounts <- as.data.frame(t(ERCCnormCounts)) -tcounts$group_sub <- group_sub -group_means <- as.data.frame(t(aggregate(. ~ group_sub,data = tcounts,mean))) -group_means <- group_means[-c(1),] -colnames(group_means) <- paste0("Group.Mean_",levels(factor(names(group_sub)))) -group_stdev <- as.data.frame(t(aggregate(. ~ group_sub,data = tcounts,sd))) -group_stdev <- group_stdev[-c(1),] -colnames(group_stdev) <- paste0("Group.Stdev_",levels(factor(names(group_sub)))) +tcounts$group_sub <- names(group_sub) # Used final table group name formatting (e.g. '( Space Flight & Blue Light )' ) -output_table_2 <- cbind(output_table_2,group_means) -reduced_output_table_2 <- cbind(reduced_output_table_2,group_means) +group_means <- as.data.frame(t(aggregate(. ~ group_sub,data = tcounts,mean))) # Compute group name group-wise means +colnames(group_means) <- paste0("Group.Mean_", group_means['group_sub',]) # assign group name as column names -output_table_2 <- cbind(output_table_2,group_stdev) -reduced_output_table_2 <- cbind(reduced_output_table_2,group_stdev) +group_stdev <- as.data.frame(t(aggregate(. ~ group_sub,data = tcounts,sd))) # Compute group name group-wise standard deviation +colnames(group_stdev) <- paste0("Group.Stdev_", group_stdev['group_sub',]) # assign group name as column names + +group_means <- group_means[-c(1),] # Drop group name row from data rows (now present as column names) +group_stdev <- group_stdev[-c(1),] # Drop group name row from data rows (now present as column names) +output_table_2 <- cbind(output_table_2,group_means, group_stdev) # Column bind the group-wise data +reduced_output_table_2 <- cbind(reduced_output_table_2,group_means, group_stdev) # Column bind the group-wise data rm(group_stdev,group_means,tcounts) @@ -1734,19 +1738,18 @@ reduced_output_table_1$LRT.p.value <- res_1_lrt@listData$padj ### Generate and add group mean and stdev columns to the DGE table ### tcounts <- as.data.frame(t(normCounts)) -tcounts$group <- group -group_means <- as.data.frame(t(aggregate(. ~ group,data = tcounts,mean))) -group_means <- group_means[-c(1),] -colnames(group_means) <- paste0("Group.Mean_",levels(factor(names(group)))) -group_stdev <- as.data.frame(t(aggregate(. 
~ group,data = tcounts,sd))) -group_stdev <- group_stdev[-c(1),] -colnames(group_stdev) <- paste0("Group.Stdev_",levels(factor(names(group)))) +tcounts$group <- names(group) # Used final table group name formatting (e.g. '( Space Flight & Blue Light )' ) + +group_means <- as.data.frame(t(aggregate(. ~ group,data = tcounts,mean))) # Compute group name group-wise means +colnames(group_means) <- paste0("Group.Mean_", group_means['group',]) # assign group name as column names -output_table_1 <- cbind(output_table_1,group_means) -reduced_output_table_1 <- cbind(reduced_output_table_1,group_means) +group_stdev <- as.data.frame(t(aggregate(. ~ group,data = tcounts,sd))) # Compute group name group-wise standard deviation +colnames(group_stdev) <- paste0("Group.Stdev_", group_stdev['group',]) # assign group name as column names -output_table_1 <- cbind(output_table_1,group_stdev) -reduced_output_table_1 <- cbind(reduced_output_table_1,group_stdev) +group_means <- group_means[-c(1),] # Drop group name row from data rows (now present as column names) +group_stdev <- group_stdev[-c(1),] # Drop group name row from data rows (now present as column names) +output_table_1 <- cbind(output_table_1,group_means, group_stdev) # Column bind the group-wise data +reduced_output_table_1 <- cbind(reduced_output_table_1,group_means, group_stdev) # Column bind the group-wise data rm(group_stdev,group_means,tcounts) @@ -1933,7 +1936,8 @@ import matplotlib.pyplot as plt accession = 'GLDS-NNN' # Replace Ns with GLDS number isaPath = 'path/to/GLDS-NNN-ISA.zip' # Replace with path to ISA archive file zip_file_object = zipfile.ZipFile(isaPath, "r") -list_of_ISA_files = zip_file_object.namelist() +list_of_ISA_files = zip_file_object.namelist() +UnnormalizedCountsPath = '/path/to/GLDS-NNN_rna_seq_RSEM_Unnormalized_Counts.csv' # Print contents of ISA zip file to view file order list_of_ISA_files @@ -1970,7 +1974,7 @@ assay_table.head(n=3) # Get raw counts table -raw_counts_table = pd.read_csv('/path/to/RSEM_Unnormalized_Counts.csv', index_col=0) +raw_counts_table = pd.read_csv(UnnormalizedCountsPath, index_col=0) raw_counts_table.index.rename('Gene_ID', inplace=True) print(raw_counts_table.head(n=3)) @@ -2048,8 +2052,8 @@ print(merged_ercc) ### Filter and calculate mean counts per million of Mix1 and Mix2 spiked samples in each of the 4 groups ## Check the 'Parameter Value[Spike-in Mix Number]' column of the assay table -## If the values do not have a space, change 'Mix 1' and 'Mix 2' to 'Mix1' and 'Mix2', respectively +## If the values do not have a space, change 'Mix 1' and 'Mix 2' to 'Mix1' and 'Mix2', respectively # Filter Mix1 CPM and Mix2 CPM in group A Adf = merged_ercc.loc[merged_ercc['ERCC group'] == 'A'] @@ -2189,45 +2193,34 @@ print('Number of ERCC detected in group D (out of 23) =', ddf['Avg Mix1 CPM/ Avg #Calculate and plot ERCC metrics from individual samples, including limit of detection, dynamic range, and R^2 of counts vs. concentration. #View the ERCC table -print(ercc_table.head(n=3)) - -# Check assay_table header to identify the 'Sample Name' column and the column title indicating the 'Spike-in Mix Nmber' if it's indicated in the metadata. 
- -pd.set_option('display.max_columns', None) -print(assay_table.head(n=3)) - -# Get ERCC counts for all samples - -ercc_counts = raw_counts_table[raw_counts_table.index.str.contains('^ERCC-')] -ercc_counts = ercc_counts.sort_values(by=list(ercc_counts), ascending=False) -print(ercc_counts.head()) +print(ercc_table.head(n=3)) # Make a dictionary for ERCC concentrations for Mix 1 + mix1_conc_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['concentration in Mix 1 (attomoles/ul)'])) -# Get samples spiked with Mix 1 -mix1_samples = assay_table[assay_table['Parameter Value[Spike-in Mix Number]'] == 'Mix 1']['Sample Name'] -# Get ERCC counts for Mix 1 spiked samples -ercc_counts_mix_1 = ercc_counts[mix1_samples] -ercc_counts_mix_1['ERCC conc (attomoles/ul)'] = ercc_counts_mix_1.index.map(mix1_conc_dict) -print(ercc_counts_mix_1.head(n=3)) +# Get samples spiked with Mix 1 +mix1_samples = assay_table[assay_table['Parameter Value[Spike-in Mix Number]'] == 'Mix 1']['Sample Name'] # Make a dictionary for ERCC concentrations for Mix 2 + mix2_conc_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['concentration in Mix 2 (attomoles/ul)'])) + # Get samples spiked with Mix 2 + mix2_samples = assay_table[assay_table['Parameter Value[Spike-in Mix Number]'] == 'Mix 2']['Sample Name'] + # Get ERCC counts for Mix 2 spiked samples ercc_counts_mix_2 = ercc_counts[mix2_samples] ercc_counts_mix_2['ERCC conc (attomoles/ul)'] = ercc_counts_mix_2.index.map(mix2_conc_dict) print(ercc_counts_mix_2.head(n=3)) - # Create a scatter plot of log(2) ERCC counts versus log(2) ERCC concentration for each sample columns_mix_1 = ercc_counts_mix_1.columns.drop(['ERCC conc (attomoles/ul)']) @@ -2261,6 +2254,40 @@ for ax in axs.flat: counter = counter + 1 + +# Create a scatter plot of log(2) ERCC counts versus log(2) ERCC concentration for each sample + +columns_mix_1 = ercc_counts_mix_1.columns.drop(['ERCC conc (attomoles/ul)']) +columns_mix_2 = ercc_counts_mix_2.columns.drop(['ERCC conc (attomoles/ul)']) +all_columns = columns_mix_1.to_list() + columns_mix_2.to_list() +total_columns = len(columns_mix_1) + len(columns_mix_2) +side_size = np.int32(np.ceil(np.sqrt(total_columns)))# calculate grid side size. take sqrt of total plots and round up. +fig, axs = plt.subplots(side_size, side_size, figsize=(15,15), sharex='all', sharey='all'); #change figsize x,y labels if needed. 
+fig.tight_layout(pad=1, w_pad=2.5, h_pad=3.5) + +counter = 0 +for ax in axs.flat: + + if(counter < len(columns_mix_1)): + ax.scatter(x=np.log2(ercc_counts_mix_1['ERCC conc (attomoles/ul)']), y=np.log2(ercc_counts_mix_1[all_columns[counter]]+1), s=7); + ax.set_title(all_columns[counter][-45:], fontsize=9); + ax.set_xlabel('log2 ERCC conc (attomoles/ ul)', fontsize=9); + ax.set_ylabel('log2 Counts per million', fontsize=9); + ax.tick_params(direction='in', axis='both', labelsize=8, labelleft=True, labelbottom=True); + + elif(counter >= len(columns_mix_1) and counter < total_columns): + ax.scatter(x=np.log2(ercc_counts_mix_2['ERCC conc (attomoles/ul)']), y=np.log2(ercc_counts_mix_2[all_columns[counter]]+1), s=7); + ax.set_title(all_columns[counter][-45:], fontsize=9); + ax.set_xlabel('log2 ERCC conc (attomoles/ ul)', fontsize=9); + ax.set_ylabel('log2 Counts per million', fontsize=9); + ax.tick_params(direction='in', axis='both', labelsize=8, labelleft=True, labelbottom=True); + + else: + pass + + counter = counter + 1 + + ## Calculate and plot linear regression of log(2) ERCC counts versus log(2) ERCC concentration for each sample # Filter counts > 0 @@ -2288,7 +2315,8 @@ for i in range(0, len(ercc_counts_mix_2.columns)-1): nonzero_counts_list_2.append(nonzero_counts_sorted) -# Plot each sample using linear regression of scatter plot with x = log2 Conc and y = log2 Counts. Return min, max, R^2 and dynamic range (max / min) values. +# Plot each sample using linear regression of scatter plot with x = log2 Conc and y = log2 Counts. +# Return min, max, R^2 and dynamic range (max / min) values. samples = [] mins = [] @@ -2435,15 +2463,17 @@ os.makedirs(name="ERCC_analysis", exist_ok=True) stats = pd.DataFrame(list(zip(samples, mins, maxs, dyranges, rs))) stats.columns = ['Samples', 'Min', 'Max', 'Dynamic range', 'R'] -stats.to_csv('ERCC_analysis/ERCC_stats_GLDS-NNN.csv', index = False) -stats.filter(items = ['Samples', 'Dynamic range']).to_csv('ERCC_analysis/ERCC_dynrange_GLDS-NNN_mqc.csv', index = False) -stats.filter(items = ['Samples', 'R']).to_csv('ERCC_analysis/ERCC_rsq_GLDS-NNN_mqc.csv', index = False) +stats.to_csv('ERCC_analysis/ERCC_stats_GLDS-NNN.csv', index = False) +stats.filter(items = ['Samples', 'Dynamic range']).to_csv('ERCC_analysis/ERCC_dynrange_GLDS-NNN_mqc.csv', index = False) +stats.filter(items = ['Samples', 'R']).to_csv('ERCC_analysis/ERCC_rsq_GLDS-NNN_mqc.csv', index = False) ### Generate data and metadata files needed for ERCC DESeq2 analysis -# ERCC Mix 1 and Mix 2 are distributed so that half the samples receive Mix 1 spike-in and half receive Mix 2 spike-in. Transcripts in Mix 1 and Mix 2 are present at a known ratio, so we can determine how well these patterns are revealed in the dataset. +# ERCC Mix 1 and Mix 2 are distributed so that half the samples receive Mix 1 spike-in and half receive Mix 2 spike-in. + +# Transcripts in Mix 1 and Mix 2 are present at a known ratio, so we can determine how well these patterns are revealed in the dataset. 
# Get sample table @@ -2453,7 +2483,7 @@ pd.set_option('display.max_columns', None) print(combined) # Create metadata table containing samples and their respective ERCC spike-in Mix number -# Sometimes Number in [Spike-in Mix Number] is spelled 'number' and this could cause error in mismatch search +# Note: Sometimes "Number" in [Spike-in Mix Number] is spelled "number" and this could cause error in mismatch search ERCCmetadata = combined[['Parameter Value[Spike-in Mix Number]']] ERCCmetadata.index = ERCCmetadata.index.str.replace('-','_') @@ -2647,4 +2677,4 @@ ax.set_yscale("log"); - ERCC_analysis/ERCC_lodr_*.csv (ERCC Gene Table including mean counts, adjusted p-value and p-value, and filtered to genes with both adj. p-value and p-value < 0.001) -> All steps of the ERCC Spike-In Data Analysis are performed in a Jupyter Notebook (JN) and the completed JN is exported as an html file and published in the [GLDS repository](https://genelab-data.ndc.nasa.gov/genelab/projects) for the respective dataset. +> All steps of the ERCC Spike-In Data Analysis are performed in a Jupyter Notebook (JN) and the completed JN is exported as an html file and published in the [GLDS repository](https://genelab-data.ndc.nasa.gov/genelab/projects) for the respective dataset. \ No newline at end of file From 0bf7e876a53378a2dd5c6e4a6729d02856ee8c39 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 22 Mar 2023 23:38:43 +0000 Subject: [PATCH 2/7] docs: remove duplicated chunk Confirmed this was not duplicated in the associated notebook --- .../GL-DPPD-7101-F.md | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md index 7ac8487b..196a42e7 100644 --- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md +++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md @@ -2253,41 +2253,6 @@ for ax in axs.flat: counter = counter + 1 - - -# Create a scatter plot of log(2) ERCC counts versus log(2) ERCC concentration for each sample - -columns_mix_1 = ercc_counts_mix_1.columns.drop(['ERCC conc (attomoles/ul)']) -columns_mix_2 = ercc_counts_mix_2.columns.drop(['ERCC conc (attomoles/ul)']) -all_columns = columns_mix_1.to_list() + columns_mix_2.to_list() -total_columns = len(columns_mix_1) + len(columns_mix_2) -side_size = np.int32(np.ceil(np.sqrt(total_columns)))# calculate grid side size. take sqrt of total plots and round up. -fig, axs = plt.subplots(side_size, side_size, figsize=(15,15), sharex='all', sharey='all'); #change figsize x,y labels if needed. 
-fig.tight_layout(pad=1, w_pad=2.5, h_pad=3.5) - -counter = 0 -for ax in axs.flat: - - if(counter < len(columns_mix_1)): - ax.scatter(x=np.log2(ercc_counts_mix_1['ERCC conc (attomoles/ul)']), y=np.log2(ercc_counts_mix_1[all_columns[counter]]+1), s=7); - ax.set_title(all_columns[counter][-45:], fontsize=9); - ax.set_xlabel('log2 ERCC conc (attomoles/ ul)', fontsize=9); - ax.set_ylabel('log2 Counts per million', fontsize=9); - ax.tick_params(direction='in', axis='both', labelsize=8, labelleft=True, labelbottom=True); - - elif(counter >= len(columns_mix_1) and counter < total_columns): - ax.scatter(x=np.log2(ercc_counts_mix_2['ERCC conc (attomoles/ul)']), y=np.log2(ercc_counts_mix_2[all_columns[counter]]+1), s=7); - ax.set_title(all_columns[counter][-45:], fontsize=9); - ax.set_xlabel('log2 ERCC conc (attomoles/ ul)', fontsize=9); - ax.set_ylabel('log2 Counts per million', fontsize=9); - ax.tick_params(direction='in', axis='both', labelsize=8, labelleft=True, labelbottom=True); - - else: - pass - - counter = counter + 1 - - ## Calculate and plot linear regression of log(2) ERCC counts versus log(2) ERCC concentration for each sample # Filter counts > 0 From 2d07f0f459c735f2b328e64adcc909eb0fa9cb7d Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 22 Mar 2023 23:40:47 +0000 Subject: [PATCH 3/7] docs: remove params$ Inadvertently included from notebook version which uses a params object instead of directly supplying by user (as in DPPD) --- RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md index 196a42e7..466042d7 100644 --- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md +++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md @@ -1097,7 +1097,7 @@ compare_csv_from_runsheet <- function(runsheet_path) { ### Load metadata from runsheet csv file ### -compare_csv <- compare_csv_from_runsheet(params$runsheet_path) +compare_csv <- compare_csv_from_runsheet(runsheet_path) ### Create data frame containing all samples and respective factors ### From 4316f4da52fe1a5cd18fac7a82d79f64e7a3f86c Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Wed, 22 Mar 2023 16:52:28 -0700 Subject: [PATCH 4/7] docs: remove duplicate software lines --- RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md index e03a0dfe..360c6f25 100644 --- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md +++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md @@ -121,8 +121,6 @@ The DESeq2 Normalization and DGE step, [step 9](#9-normalize-read-counts-perform |DESeq2|1.34|[https://bioconductor.org/packages/release/bioc/html/DESeq2.html](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)| |tximport|1.27.1|[https://github.com/mikelove/tximport](https://github.com/mikelove/tximport)| |tidyverse|1.3.1|[https://www.tidyverse.org](https://www.tidyverse.org)| -|dp_tools|1.1.5|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)| -|singularity|3.9|[https://sylabs.io/](https://sylabs.io/)| |stringr|1.4.1|[https://github.com/tidyverse/stringr](https://github.com/tidyverse/stringr)| |dp_tools|1.1.8|[https://github.com/J-81/dp_tools](https://github.com/J-81/dp_tools)| 
|pandas|1.5.0|[https://github.com/pandas-dev/pandas](https://github.com/pandas-dev/pandas)| @@ -2644,4 +2642,4 @@ ax.set_yscale("log"); - ERCC_analysis/ERCC_lodr_*.csv (ERCC Gene Table including mean counts, adjusted p-value and p-value, and filtered to genes with both adj. p-value and p-value < 0.001) -> All steps of the ERCC Spike-In Data Analysis are performed in a Jupyter Notebook (JN) and the completed JN is exported as an html file and published in the [GLDS repository](https://genelab-data.ndc.nasa.gov/genelab/projects) for the respective dataset. \ No newline at end of file +> All steps of the ERCC Spike-In Data Analysis are performed in a Jupyter Notebook (JN) and the completed JN is exported as an html file and published in the [GLDS repository](https://genelab-data.ndc.nasa.gov/genelab/projects) for the respective dataset. From 6216c243fec8579429d33668513871b8a63f8593 Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Thu, 23 Mar 2023 18:41:33 +0000 Subject: [PATCH 5/7] docs: update notebook related code using 1.0.3 notebook Converted using: ``` bash INPUT_IPYNB=$1 jupyter-nbconvert $INPUT_IPYNB --to python --stdout | # Convert using nbconvert sed '/# In\[ \]:/d' | # remove notebook artifacts sed '/# coding: utf-8/d' | # remove notebook artifacts sed '/# $/d' | # remove notebook artifacts sed '/#!\/usr\/bin\/env python/d' | # remove notebook artifacts sed 's/# #/#/' | # remove notebook artifact where markdown gets an extra '# ' added on cat -s > DPPD_OUT.txt # # Remove empty lines that are consecutive and output ``` --- .../GL-DPPD-7101-F.md | 139 ++++++------------ 1 file changed, 49 insertions(+), 90 deletions(-) diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md index 360c6f25..c5c190a3 100644 --- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md +++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md @@ -1915,8 +1915,6 @@ Output data with considering ERCC spike-in genes: ### 10a. Evaluate ERCC Count Data in Python ```python -### Setting up the notebook - # import python packages import pandas as pd pd.set_option('mode.chained_assignment', None) # suppress chained indexing warnings @@ -1928,51 +1926,40 @@ import seaborn as sns from scipy.stats import linregress import matplotlib.pyplot as plt - ### Get and parse data and metadata # Get and unzip ISA.zip to extract metadata. accession = 'GLDS-NNN' # Replace Ns with GLDS number -isaPath = 'path/to/GLDS-NNN-ISA.zip' # Replace with path to ISA archive file -zip_file_object = zipfile.ZipFile(isaPath, "r") -list_of_ISA_files = zip_file_object.namelist() +isaPath = '/path/to/GLDS-NNN_metadata_GLDS-NNN-ISA.zip' # Replace with path to ISA archive file +zip_file_object = zipfile.ZipFile(isaPath, "r") +list_of_ISA_files = zip_file_object.namelist() # Print contents of zip file. Pick relevant one from list UnnormalizedCountsPath = '/path/to/GLDS-NNN_rna_seq_RSEM_Unnormalized_Counts.csv' -# Print contents of ISA zip file to view file order list_of_ISA_files # There are datasets that have multiple assays (including microarray), so the RNAseq ISA files from the above output must be selected. # Txt files outputted above are indexed as 0, 1, 2, etc. Fill in the indexed number corresponding to the sample (s_*txt) and assay files for RNAseq (a_*_(RNA-Seq).txt) in the code block below. 
# Extract metadata from the sample file (s_*txt) - sample_file = list_of_ISA_files[1] # replace [1] with index corresponding to the (s_*txt) file file = zip_file_object.open(sample_file) sample_table = pd.read_csv(zip_file_object.open(sample_file), sep='\t') - # Extract metadata from the assay (a_*_(RNA-Seq).txt) file - assay_file = list_of_ISA_files[0] # replace [0] with index corresponding to the (a_*_(RNA-Seq).txt) file file = zip_file_object.open(assay_file) assay_table = pd.read_csv(zip_file_object.open(assay_file), sep='\t') - # Check the sample table - pd.set_option('display.max_columns', None) print(sample_table.head(n=3)) - # Check the assay table - pd.set_option('display.max_columns', None) assay_table.head(n=3) - - -# Get raw counts table +### Get raw counts table raw_counts_table = pd.read_csv(UnnormalizedCountsPath, index_col=0) raw_counts_table.index.rename('Gene_ID', inplace=True) @@ -1982,8 +1969,7 @@ raw_counts_transcripts = raw_counts_table[raw_counts_table.index.str.contains('^ raw_counts_transcripts = raw_counts_transcripts.sort_values(by=list(raw_counts_transcripts), ascending=False) print(raw_counts_transcripts) - -# Get ERCC counts +### Get ERCC counts ercc_counts = raw_counts_table[raw_counts_table.index.str.contains('^ERCC-')] ercc_counts.reset_index(inplace=True) @@ -1991,17 +1977,14 @@ ercc_counts = ercc_counts.rename(columns={'Gene_ID':'ERCC ID'}) ercc_counts = ercc_counts.sort_values(by=list(ercc_counts), ascending=False) print(ercc_counts.head()) -# Get files containing ERCC gene concentrations and metadata +### Get files containing ERCC gene concentrations and metadata ercc_url = 'https://assets.thermofisher.com/TFS-Assets/LSG/manuals/cms_095046.txt' ercc_table = pd.read_csv(ercc_url, '\t') print(ercc_table.head(n=3)) - - -### Calculate the number of ERCC genes detected in each of the 4 (A, B, C and D) groups for each sample - -# Extract ERCC counts and calculate the log(2) +## Calculate the number of ERCC genes detected in each of the 4 (A, B, C and D) groups for each sample +### Extract ERCC counts and calculate the log(2) meltERCC = ercc_counts.melt(id_vars=['ERCC ID']) meltERCC['log2 Count'] = meltERCC['value']+1 @@ -2009,7 +1992,7 @@ meltERCC['log2 Count'] = np.log2(meltERCC['log2 Count']) meltERCC = meltERCC.rename(columns={'variable':'Sample Name', 'value':'Count'}) print(meltERCC.head(n=3)) -# Build Mix dictionary to link sample name to mix added and read depth using the assay table +### Build Mix dictionary to link sample name to mix added and read depth using the assay table mix_dict = assay_table.filter(['Sample Name','Parameter Value[Spike-in Mix Number]', 'Parameter Value[Read Depth]']) @@ -2018,43 +2001,35 @@ mix_dict = mix_dict.rename(columns={'Parameter Value[Spike-in Mix Number]':'Mix' 'Total Reads'}) print(mix_dict.head(n=3)) - -# Make combined ercc counts and assay table +### Make combined ercc counts and assay table merged_ercc = meltERCC.merge(mix_dict, on='Sample Name') print(merged_ercc) -# Read ERCC info including concentrations from merged_ercc table +### Read ERCC info including concentrations from merged_ercc table groupA = ercc_table.loc[ercc_table['subgroup'] == 'A']['ERCC ID'] groupB = ercc_table.loc[ercc_table['subgroup'] == 'B']['ERCC ID'] groupC = ercc_table.loc[ercc_table['subgroup'] == 'C']['ERCC ID'] groupD = ercc_table.loc[ercc_table['subgroup'] == 'D']['ERCC ID'] - -# Make a dictionary for ERCC groups +### Make a dictionary for ERCC groups group_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['subgroup'])) - -# 
Calculate ERCC counts per million and log(2) counts per million +### Calculate ERCC counts per million and log(2) counts per million merged_ercc['Count per million'] = merged_ercc['Count'] / (merged_ercc['Total Reads'] / 1000000.0) merged_ercc['log2 Count per million'] = np.log2(merged_ercc['Count per million']+1) - -# Add ERCC group column +### Add ERCC group column merged_ercc['ERCC group'] = merged_ercc['ERCC ID'].map(group_dict) merged_ercc = merged_ercc.sort_values(by=['Mix'], ascending=True) print(merged_ercc) - -### Filter and calculate mean counts per million of Mix1 and Mix2 spiked samples in each of the 4 groups -## Check the 'Parameter Value[Spike-in Mix Number]' column of the assay table - -## If the values do not have a space, change 'Mix 1' and 'Mix 2' to 'Mix1' and 'Mix2', respectively -# Filter Mix1 CPM and Mix2 CPM in group A +## Filter and calculate mean counts per million of Mix1 and Mix2 spiked samples in each of the 4 groups +### Filter Mix1 CPM and Mix2 CPM in group A Adf = merged_ercc.loc[merged_ercc['ERCC group'] == 'A'] Amix1df = Adf.loc[Adf['Mix']=='Mix 1'] @@ -2070,8 +2045,7 @@ adf = Amix1df.merge(Amix2df, on='ERCC ID', suffixes=('', '_2')) adf = adf.reset_index() adf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (adf['Avg Mix1 CPM'] / adf['Avg Mix2 CPM']) - -# Filter Mix1 CPM and Mix2 CPM in group B +### Filter Mix1 CPM and Mix2 CPM in group B Bdf = merged_ercc.loc[merged_ercc['ERCC group'] == 'B'] Bmix1df = Bdf.loc[Bdf['Mix']=='Mix 1'] @@ -2087,8 +2061,7 @@ bdf = Bmix1df.merge(Bmix2df, on='ERCC ID') bdf = bdf.reset_index() bdf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (bdf['Avg Mix1 CPM'] / bdf['Avg Mix2 CPM']) - -# Filter Mix1 CPM and Mix2 CPM in group C +### Filter Mix1 CPM and Mix2 CPM in group C Cdf = merged_ercc.loc[merged_ercc['ERCC group'] == 'C'] Cmix1df = Cdf.loc[Cdf['Mix']=='Mix 1'] @@ -2104,8 +2077,7 @@ cdf = Cmix1df.merge(Cmix2df, on='ERCC ID') cdf = cdf.reset_index() cdf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (cdf['Avg Mix1 CPM'] / cdf['Avg Mix2 CPM']) - -# Filter Mix1 CPM and Mix2 CPM in group D +### Filter Mix1 CPM and Mix2 CPM in group D Ddf = merged_ercc.loc[merged_ercc['ERCC group'] == 'D'] Dmix1df = Ddf.loc[Ddf['Mix']=='Mix 1'] @@ -2121,9 +2093,7 @@ ddf = Dmix1df.merge(Dmix2df, on='ERCC ID') ddf = ddf.reset_index() ddf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (ddf['Avg Mix1 CPM'] / ddf['Avg Mix2 CPM']) - -##### Multi-sample ERCC analyses - +## Multi-sample ERCC analyses ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group A in Mix 1 and Mix 2 spiked samples a = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupA, hue="Mix",data=merged_ercc[merged_ercc['ERCC ID'].isin(groupA)], kind="box", col="ERCC group", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2138,7 +2108,6 @@ plt.title("ERCC Group A") a1.set(ylim=(0, 6)) print('Number of ERCC detected in group A (out of 23) =', adf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) - ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group B in Mix 1 and Mix 2 spiked samples b = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupB, hue="Mix", data=merged_ercc[merged_ercc['ERCC ID'].isin(groupB)], kind="box", col="ERCC group", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2154,7 +2123,6 @@ plt.title("ERCC Group B") b.set(ylim=(0, 2)) print('Number of ERCC detected in group B (out of 23) =', bdf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) - ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group C in 
Mix 1 and Mix 2 spiked samples c = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupC, hue="Mix", data=merged_ercc[merged_ercc['ERCC ID'].isin(groupC)], kind="box", col="ERCC group", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2170,7 +2138,6 @@ plt.title("ERCC Group C") c.set(ylim=(0, 2)) print('Number of ERCC detected in group C (out of 23) =', cdf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) - ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group D in Mix 1 and Mix 2 spiked samples d = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupD, hue="Mix", data=merged_ercc[merged_ercc['ERCC ID'].isin(groupD)], col="ERCC group", kind="box", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2186,35 +2153,46 @@ plt.title("ERCC Group D") d.set(ylim=(0, 1)) print('Number of ERCC detected in group D (out of 23) =', ddf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) +## Individual sample ERCC analyses +# Calculate and plot ERCC metrics from individual samples, including limit of detection, dynamic range, and R^2 of counts vs. concentration. + +#View the ERCC table +print(ercc_table.head(n=3)) -##### Individual sample ERCC analyses +# Check assay_table header to identify the 'Sample Name' column and the column title indicating the 'Spike-in Mix Nmber' if it's indicated in the metadata. -#Calculate and plot ERCC metrics from individual samples, including limit of detection, dynamic range, and R^2 of counts vs. concentration. +pd.set_option('display.max_columns', None) +print(assay_table.head(n=3)) -#View the ERCC table +# Get ERCC counts for all samples -print(ercc_table.head(n=3)) +ercc_counts = raw_counts_table[raw_counts_table.index.str.contains('^ERCC-')] +ercc_counts = ercc_counts.sort_values(by=list(ercc_counts), ascending=False) +print(ercc_counts.head()) # Make a dictionary for ERCC concentrations for Mix 1 mix1_conc_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['concentration in Mix 1 (attomoles/ul)'])) - # Get samples spiked with Mix 1 mix1_samples = assay_table[assay_table['Parameter Value[Spike-in Mix Number]'] == 'Mix 1']['Sample Name'] +# Get ERCC counts for Mix 1 spiked samples + +ercc_counts_mix_1 = ercc_counts[mix1_samples] +ercc_counts_mix_1['ERCC conc (attomoles/ul)'] = ercc_counts_mix_1.index.map(mix1_conc_dict) +print(ercc_counts_mix_1.head(n=3)) + # Make a dictionary for ERCC concentrations for Mix 2 mix2_conc_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['concentration in Mix 2 (attomoles/ul)'])) - # Get samples spiked with Mix 2 mix2_samples = assay_table[assay_table['Parameter Value[Spike-in Mix Number]'] == 'Mix 2']['Sample Name'] - # Get ERCC counts for Mix 2 spiked samples ercc_counts_mix_2 = ercc_counts[mix2_samples] @@ -2279,7 +2257,6 @@ for i in range(0, len(ercc_counts_mix_2.columns)-1): nonzero_counts_sorted = nonzero_counts.sort_values('Conc') nonzero_counts_list_2.append(nonzero_counts_sorted) - # Plot each sample using linear regression of scatter plot with x = log2 Conc and y = log2 Counts. # Return min, max, R^2 and dynamic range (max / min) values. 
@@ -2366,7 +2343,6 @@ for ax in axs.flat: ax.tick_params(direction='in', axis='both', labelsize=9, labelleft=True, labelbottom=True); samples.append(all_columns[counter]) - if(len(xvalues) == 0): mins.append('NaN') maxs.append('NaN') @@ -2417,7 +2393,6 @@ for ax in axs.flat: counter = counter + 1 - # Create directory for saved files import os @@ -2432,12 +2407,8 @@ stats.to_csv('ERCC_analysis/ERCC_stats_GLDS-NNN.csv', index = False) stats.filter(items = ['Samples', 'Dynamic range']).to_csv('ERCC_analysis/ERCC_dynrange_GLDS-NNN_mqc.csv', index = False) stats.filter(items = ['Samples', 'R']).to_csv('ERCC_analysis/ERCC_rsq_GLDS-NNN_mqc.csv', index = False) - - -### Generate data and metadata files needed for ERCC DESeq2 analysis - -# ERCC Mix 1 and Mix 2 are distributed so that half the samples receive Mix 1 spike-in and half receive Mix 2 spike-in. - +## Generate data and metadata files needed for ERCC DESeq2 analysis +# ERCC Mix 1 and Mix 2 are distributed so that ~half the samples receive Mix 1 spike-in and ~half receive Mix 2 spike-in. # Transcripts in Mix 1 and Mix 2 are present at a known ratio, so we can determine how well these patterns are revealed in the dataset. # Get sample table @@ -2488,7 +2459,6 @@ ERCCcounts.to_csv('ERCC_analysis/ERCCcounts.csv') ### 10b. Perform DESeq2 Analysis of ERCC Counts in R ```R - ## Install R packages if not already installed if (!requireNamespace("BiocManager", quietly = TRUE)) @@ -2508,7 +2478,6 @@ coldata <- read.csv('ERCC_analysis/ERCCmetadata.csv', row.names=1) #INPUT coldata$Mix <- factor(coldata$Mix) all(rownames(coldata) == colnames(cts)) - ## Make DESeqDataSet object dds <- DESeqDataSetFromMatrix(countData = cts, @@ -2516,7 +2485,6 @@ dds <- DESeqDataSetFromMatrix(countData = cts, design = ~ Mix) dds - ## Filter out ERCC genes with counts of less than 10 in all samples ##### keepGenes <- rowSums(counts(dds)) > 10 @@ -2524,7 +2492,6 @@ dds <- dds[keepGenes,] dds - ## Run DESeq2 analysis and calculate results dds <- DESeq(dds) @@ -2553,37 +2520,32 @@ write.csv(normcounts, 'ERCC_analysis/ERCC_normcounts.csv') #OUTPUT ### 10c. Analyze ERCC DESeq2 Results in Python ```python - -# Import python packages +## Import python packages import pandas as pd from urllib.request import urlopen, quote, urlretrieve import seaborn as sns import matplotlib.pyplot as plt - -# Import ERCC DESeq2 results +## Import ERCC DESeq2 results deseq2out = pd.read_csv('ERCC_analysis/ERCC_DESeq2.csv', index_col=0) # INPUT #deseq2out.index = deseq2out.index.str.replace('_','-') deseq2out.rename(columns ={'baseMean' : 'meanNormCounts'}, inplace = True) print(deseq2out.head()) - -# Get files containing ERCC gene concentrations and metadata +## Get files containing ERCC gene concentrations and metadata ercc_url = 'https://assets.thermofisher.com/TFS-Assets/LSG/manuals/cms_095046.txt' ercc_table = pd.read_csv(ercc_url, '\t', index_col='ERCC ID') print(ercc_table.head(n=3)) - -# Combine ERCC DESeq2 results and ercc_table +## Combine ERCC DESeq2 results and ercc_table combined = deseq2out.merge(ercc_table, left_index=True, right_index=True) print(combined.head()) - -# Filter p-value and adj. p-value cutoff at 10^-3 +## Filter p-value and adj. 
p-value cutoff at 10^-3 combined['cleaned_padj'] = combined['padj'] combined.loc[(combined.cleaned_padj < 0.001),'cleaned_padj']=0.001 @@ -2593,14 +2555,12 @@ combined.loc[(combined.cleaned_pvalue < 0.001),'cleaned_pvalue']=0.001 print(combined.head()) - -# Export the filtered combined ERCC DESeq2 results and ercc_table -# Remember to change file name to GLDS# analyzing +## Export the filtered combined ERCC DESeq2 results and ercc_table +### Remember to change file name to GLDS# analyzing combined.filter(items = ['ERCC ID', 'meanNormCounts', 'cleaned_pvalue','cleaned_padj']).to_csv('ERCC_analysis/ERCC_lodr_GLDS-NNN_mqc.csv') - -# Plot p-value vs. mean normalized ERCC counts +## Plot p-value vs. mean normalized ERCC counts fig, ax = plt.subplots(figsize=(10, 7)) @@ -2616,8 +2576,7 @@ sns.lineplot(data=combined, x="meanNormCounts", y="cleaned_pvalue", ax.set_xscale("linear"); ax.set_yscale("log"); - -# Plot Adjp-value vs. mean normalized ERCC counts +## Plot Adjp-value vs. mean normalized ERCC counts fig, ax = plt.subplots(figsize=(10, 7)) From 72a1c85aabe40c5dbac4885df8f520ba9db8a50a Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Thu, 23 Mar 2023 13:17:49 -0700 Subject: [PATCH 6/7] Update GL-DPPD-7101-F.md code / revert comment spacing changes --- .../GL-DPPD-7101-F.md | 138 ++++++++++++------ 1 file changed, 94 insertions(+), 44 deletions(-) diff --git a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md index c5c190a3..80cb8b98 100644 --- a/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md +++ b/RNAseq/Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md @@ -1915,6 +1915,8 @@ Output data with considering ERCC spike-in genes: ### 10a. Evaluate ERCC Count Data in Python ```python +### Setting up the notebook + # import python packages import pandas as pd pd.set_option('mode.chained_assignment', None) # suppress chained indexing warnings @@ -1926,6 +1928,7 @@ import seaborn as sns from scipy.stats import linregress import matplotlib.pyplot as plt + ### Get and parse data and metadata # Get and unzip ISA.zip to extract metadata. @@ -1933,33 +1936,43 @@ import matplotlib.pyplot as plt accession = 'GLDS-NNN' # Replace Ns with GLDS number isaPath = '/path/to/GLDS-NNN_metadata_GLDS-NNN-ISA.zip' # Replace with path to ISA archive file zip_file_object = zipfile.ZipFile(isaPath, "r") -list_of_ISA_files = zip_file_object.namelist() # Print contents of zip file. Pick relevant one from list +list_of_ISA_files = zip_file_object.namelist() UnnormalizedCountsPath = '/path/to/GLDS-NNN_rna_seq_RSEM_Unnormalized_Counts.csv' +# Print contents of ISA zip file to view file order list_of_ISA_files # There are datasets that have multiple assays (including microarray), so the RNAseq ISA files from the above output must be selected. # Txt files outputted above are indexed as 0, 1, 2, etc. Fill in the indexed number corresponding to the sample (s_*txt) and assay files for RNAseq (a_*_(RNA-Seq).txt) in the code block below. 
# Extract metadata from the sample file (s_*txt) + sample_file = list_of_ISA_files[1] # replace [1] with index corresponding to the (s_*txt) file file = zip_file_object.open(sample_file) sample_table = pd.read_csv(zip_file_object.open(sample_file), sep='\t') + # Extract metadata from the assay (a_*_(RNA-Seq).txt) file + assay_file = list_of_ISA_files[0] # replace [0] with index corresponding to the (a_*_(RNA-Seq).txt) file file = zip_file_object.open(assay_file) assay_table = pd.read_csv(zip_file_object.open(assay_file), sep='\t') + # Check the sample table + pd.set_option('display.max_columns', None) print(sample_table.head(n=3)) + # Check the assay table + pd.set_option('display.max_columns', None) assay_table.head(n=3) -### Get raw counts table + + +# Get raw counts table raw_counts_table = pd.read_csv(UnnormalizedCountsPath, index_col=0) raw_counts_table.index.rename('Gene_ID', inplace=True) @@ -1969,7 +1982,8 @@ raw_counts_transcripts = raw_counts_table[raw_counts_table.index.str.contains('^ raw_counts_transcripts = raw_counts_transcripts.sort_values(by=list(raw_counts_transcripts), ascending=False) print(raw_counts_transcripts) -### Get ERCC counts + +# Get ERCC counts ercc_counts = raw_counts_table[raw_counts_table.index.str.contains('^ERCC-')] ercc_counts.reset_index(inplace=True) @@ -1977,14 +1991,17 @@ ercc_counts = ercc_counts.rename(columns={'Gene_ID':'ERCC ID'}) ercc_counts = ercc_counts.sort_values(by=list(ercc_counts), ascending=False) print(ercc_counts.head()) -### Get files containing ERCC gene concentrations and metadata +# Get files containing ERCC gene concentrations and metadata ercc_url = 'https://assets.thermofisher.com/TFS-Assets/LSG/manuals/cms_095046.txt' ercc_table = pd.read_csv(ercc_url, '\t') print(ercc_table.head(n=3)) -## Calculate the number of ERCC genes detected in each of the 4 (A, B, C and D) groups for each sample -### Extract ERCC counts and calculate the log(2) + + +### Calculate the number of ERCC genes detected in each of the 4 (A, B, C and D) groups for each sample + +# Extract ERCC counts and calculate the log(2) meltERCC = ercc_counts.melt(id_vars=['ERCC ID']) meltERCC['log2 Count'] = meltERCC['value']+1 @@ -1992,7 +2009,7 @@ meltERCC['log2 Count'] = np.log2(meltERCC['log2 Count']) meltERCC = meltERCC.rename(columns={'variable':'Sample Name', 'value':'Count'}) print(meltERCC.head(n=3)) -### Build Mix dictionary to link sample name to mix added and read depth using the assay table +# Build Mix dictionary to link sample name to mix added and read depth using the assay table mix_dict = assay_table.filter(['Sample Name','Parameter Value[Spike-in Mix Number]', 'Parameter Value[Read Depth]']) @@ -2001,35 +2018,43 @@ mix_dict = mix_dict.rename(columns={'Parameter Value[Spike-in Mix Number]':'Mix' 'Total Reads'}) print(mix_dict.head(n=3)) -### Make combined ercc counts and assay table + +# Make combined ercc counts and assay table merged_ercc = meltERCC.merge(mix_dict, on='Sample Name') print(merged_ercc) -### Read ERCC info including concentrations from merged_ercc table +# Read ERCC info including concentrations from merged_ercc table groupA = ercc_table.loc[ercc_table['subgroup'] == 'A']['ERCC ID'] groupB = ercc_table.loc[ercc_table['subgroup'] == 'B']['ERCC ID'] groupC = ercc_table.loc[ercc_table['subgroup'] == 'C']['ERCC ID'] groupD = ercc_table.loc[ercc_table['subgroup'] == 'D']['ERCC ID'] -### Make a dictionary for ERCC groups + +# Make a dictionary for ERCC groups group_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['subgroup'])) -### 
Calculate ERCC counts per million and log(2) counts per million + +# Calculate ERCC counts per million and log(2) counts per million merged_ercc['Count per million'] = merged_ercc['Count'] / (merged_ercc['Total Reads'] / 1000000.0) merged_ercc['log2 Count per million'] = np.log2(merged_ercc['Count per million']+1) -### Add ERCC group column + +# Add ERCC group column merged_ercc['ERCC group'] = merged_ercc['ERCC ID'].map(group_dict) merged_ercc = merged_ercc.sort_values(by=['Mix'], ascending=True) print(merged_ercc) -## Filter and calculate mean counts per million of Mix1 and Mix2 spiked samples in each of the 4 groups -### Filter Mix1 CPM and Mix2 CPM in group A + +### Filter and calculate mean counts per million of Mix1 and Mix2 spiked samples in each of the 4 groups +## Check the 'Parameter Value[Spike-in Mix Number]' column of the assay table +## If the values do not have a space, change 'Mix 1' and 'Mix 2' to 'Mix1' and 'Mix2', respectively + +# Filter Mix1 CPM and Mix2 CPM in group A Adf = merged_ercc.loc[merged_ercc['ERCC group'] == 'A'] Amix1df = Adf.loc[Adf['Mix']=='Mix 1'] @@ -2045,7 +2070,8 @@ adf = Amix1df.merge(Amix2df, on='ERCC ID', suffixes=('', '_2')) adf = adf.reset_index() adf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (adf['Avg Mix1 CPM'] / adf['Avg Mix2 CPM']) -### Filter Mix1 CPM and Mix2 CPM in group B + +# Filter Mix1 CPM and Mix2 CPM in group B Bdf = merged_ercc.loc[merged_ercc['ERCC group'] == 'B'] Bmix1df = Bdf.loc[Bdf['Mix']=='Mix 1'] @@ -2061,7 +2087,8 @@ bdf = Bmix1df.merge(Bmix2df, on='ERCC ID') bdf = bdf.reset_index() bdf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (bdf['Avg Mix1 CPM'] / bdf['Avg Mix2 CPM']) -### Filter Mix1 CPM and Mix2 CPM in group C + +# Filter Mix1 CPM and Mix2 CPM in group C Cdf = merged_ercc.loc[merged_ercc['ERCC group'] == 'C'] Cmix1df = Cdf.loc[Cdf['Mix']=='Mix 1'] @@ -2077,7 +2104,8 @@ cdf = Cmix1df.merge(Cmix2df, on='ERCC ID') cdf = cdf.reset_index() cdf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (cdf['Avg Mix1 CPM'] / cdf['Avg Mix2 CPM']) -### Filter Mix1 CPM and Mix2 CPM in group D + +# Filter Mix1 CPM and Mix2 CPM in group D Ddf = merged_ercc.loc[merged_ercc['ERCC group'] == 'D'] Dmix1df = Ddf.loc[Ddf['Mix']=='Mix 1'] @@ -2093,7 +2121,9 @@ ddf = Dmix1df.merge(Dmix2df, on='ERCC ID') ddf = ddf.reset_index() ddf['Avg Mix1 CPM/ Avg Mix2 CPM'] = (ddf['Avg Mix1 CPM'] / ddf['Avg Mix2 CPM']) -## Multi-sample ERCC analyses + +##### Multi-sample ERCC analyses + ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group A in Mix 1 and Mix 2 spiked samples a = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupA, hue="Mix",data=merged_ercc[merged_ercc['ERCC ID'].isin(groupA)], kind="box", col="ERCC group", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2108,6 +2138,7 @@ plt.title("ERCC Group A") a1.set(ylim=(0, 6)) print('Number of ERCC detected in group A (out of 23) =', adf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) + ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group B in Mix 1 and Mix 2 spiked samples b = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupB, hue="Mix", data=merged_ercc[merged_ercc['ERCC ID'].isin(groupB)], kind="box", col="ERCC group", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2123,6 +2154,7 @@ plt.title("ERCC Group B") b.set(ylim=(0, 2)) print('Number of ERCC detected in group B (out of 23) =', bdf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) + ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group C in 
Mix 1 and Mix 2 spiked samples c = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupC, hue="Mix", data=merged_ercc[merged_ercc['ERCC ID'].isin(groupC)], kind="box", col="ERCC group", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2138,6 +2170,7 @@ plt.title("ERCC Group C") c.set(ylim=(0, 2)) print('Number of ERCC detected in group C (out of 23) =', cdf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) + ### Create box and whisker plots of the log(2) CPM for each ERCC detected in group D in Mix 1 and Mix 2 spiked samples d = sns.catplot(x="ERCC ID", y="log2 Count per million", order=groupD, hue="Mix", data=merged_ercc[merged_ercc['ERCC ID'].isin(groupD)], col="ERCC group", kind="box", height=5, aspect=1, palette=sns.color_palette(['blue', 'orange'])) @@ -2153,11 +2186,13 @@ plt.title("ERCC Group D") d.set(ylim=(0, 1)) print('Number of ERCC detected in group D (out of 23) =', ddf['Avg Mix1 CPM/ Avg Mix2 CPM'].count()) -## Individual sample ERCC analyses -# Calculate and plot ERCC metrics from individual samples, including limit of detection, dynamic range, and R^2 of counts vs. concentration. -#View the ERCC table +##### Individual sample ERCC analyses + +#Calculate and plot ERCC metrics from individual samples, including limit of detection, dynamic range, and R^2 of counts vs. concentration. + +#View the ERCC table print(ercc_table.head(n=3)) # Check assay_table header to identify the 'Sample Name' column and the column title indicating the 'Spike-in Mix Nmber' if it's indicated in the metadata. @@ -2171,26 +2206,23 @@ ercc_counts = raw_counts_table[raw_counts_table.index.str.contains('^ERCC-')] ercc_counts = ercc_counts.sort_values(by=list(ercc_counts), ascending=False) print(ercc_counts.head()) -# Make a dictionary for ERCC concentrations for Mix 1 +# Make a dictionary for ERCC concentrations for Mix 1 mix1_conc_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['concentration in Mix 1 (attomoles/ul)'])) # Get samples spiked with Mix 1 - mix1_samples = assay_table[assay_table['Parameter Value[Spike-in Mix Number]'] == 'Mix 1']['Sample Name'] # Get ERCC counts for Mix 1 spiked samples - ercc_counts_mix_1 = ercc_counts[mix1_samples] ercc_counts_mix_1['ERCC conc (attomoles/ul)'] = ercc_counts_mix_1.index.map(mix1_conc_dict) print(ercc_counts_mix_1.head(n=3)) -# Make a dictionary for ERCC concentrations for Mix 2 +# Make a dictionary for ERCC concentrations for Mix 2 mix2_conc_dict = dict(zip(ercc_table['ERCC ID'], ercc_table['concentration in Mix 2 (attomoles/ul)'])) # Get samples spiked with Mix 2 - mix2_samples = assay_table[assay_table['Parameter Value[Spike-in Mix Number]'] == 'Mix 2']['Sample Name'] # Get ERCC counts for Mix 2 spiked samples @@ -2199,6 +2231,7 @@ ercc_counts_mix_2 = ercc_counts[mix2_samples] ercc_counts_mix_2['ERCC conc (attomoles/ul)'] = ercc_counts_mix_2.index.map(mix2_conc_dict) print(ercc_counts_mix_2.head(n=3)) + # Create a scatter plot of log(2) ERCC counts versus log(2) ERCC concentration for each sample columns_mix_1 = ercc_counts_mix_1.columns.drop(['ERCC conc (attomoles/ul)']) @@ -2231,6 +2264,7 @@ for ax in axs.flat: counter = counter + 1 + ## Calculate and plot linear regression of log(2) ERCC counts versus log(2) ERCC concentration for each sample # Filter counts > 0 @@ -2257,8 +2291,8 @@ for i in range(0, len(ercc_counts_mix_2.columns)-1): nonzero_counts_sorted = nonzero_counts.sort_values('Conc') nonzero_counts_list_2.append(nonzero_counts_sorted) -# Plot each sample using linear regression of scatter plot with x = log2 Conc and 
y = log2 Counts. -# Return min, max, R^2 and dynamic range (max / min) values. + +# Plot each sample using linear regression of scatter plot with x = log2 Conc and y = log2 Counts. Return min, max, R^2 and dynamic range (max / min) values. samples = [] mins = [] @@ -2343,6 +2377,7 @@ for ax in axs.flat: ax.tick_params(direction='in', axis='both', labelsize=9, labelleft=True, labelbottom=True); samples.append(all_columns[counter]) + if(len(xvalues) == 0): mins.append('NaN') maxs.append('NaN') @@ -2393,6 +2428,7 @@ for ax in axs.flat: counter = counter + 1 + # Create directory for saved files import os @@ -2403,13 +2439,15 @@ os.makedirs(name="ERCC_analysis", exist_ok=True) stats = pd.DataFrame(list(zip(samples, mins, maxs, dyranges, rs))) stats.columns = ['Samples', 'Min', 'Max', 'Dynamic range', 'R'] -stats.to_csv('ERCC_analysis/ERCC_stats_GLDS-NNN.csv', index = False) -stats.filter(items = ['Samples', 'Dynamic range']).to_csv('ERCC_analysis/ERCC_dynrange_GLDS-NNN_mqc.csv', index = False) -stats.filter(items = ['Samples', 'R']).to_csv('ERCC_analysis/ERCC_rsq_GLDS-NNN_mqc.csv', index = False) +stats.to_csv('ERCC_analysis/ERCC_stats_GLDS-NNN.csv', index = False) +stats.filter(items = ['Samples', 'Dynamic range']).to_csv('ERCC_analysis/ERCC_dynrange_GLDS-NNN_mqc.csv', index = False) +stats.filter(items = ['Samples', 'R']).to_csv('ERCC_analysis/ERCC_rsq_GLDS-NNN_mqc.csv', index = False) -## Generate data and metadata files needed for ERCC DESeq2 analysis -# ERCC Mix 1 and Mix 2 are distributed so that ~half the samples receive Mix 1 spike-in and ~half receive Mix 2 spike-in. -# Transcripts in Mix 1 and Mix 2 are present at a known ratio, so we can determine how well these patterns are revealed in the dataset. + + +### Generate data and metadata files needed for ERCC DESeq2 analysis + +# ERCC Mix 1 and Mix 2 are distributed so that half the samples receive Mix 1 spike-in and half receive Mix 2 spike-in. Transcripts in Mix 1 and Mix 2 are present at a known ratio, so we can determine how well these patterns are revealed in the dataset. # Get sample table @@ -2419,7 +2457,7 @@ pd.set_option('display.max_columns', None) print(combined) # Create metadata table containing samples and their respective ERCC spike-in Mix number -# Note: Sometimes "Number" in [Spike-in Mix Number] is spelled "number" and this could cause error in mismatch search +# Sometimes Number in [Spike-in Mix Number] is spelled 'number' and this could cause error in mismatch search ERCCmetadata = combined[['Parameter Value[Spike-in Mix Number]']] ERCCmetadata.index = ERCCmetadata.index.str.replace('-','_') @@ -2459,6 +2497,7 @@ ERCCcounts.to_csv('ERCC_analysis/ERCCcounts.csv') ### 10b. Perform DESeq2 Analysis of ERCC Counts in R ```R + ## Install R packages if not already installed if (!requireNamespace("BiocManager", quietly = TRUE)) @@ -2478,6 +2517,7 @@ coldata <- read.csv('ERCC_analysis/ERCCmetadata.csv', row.names=1) #INPUT coldata$Mix <- factor(coldata$Mix) all(rownames(coldata) == colnames(cts)) + ## Make DESeqDataSet object dds <- DESeqDataSetFromMatrix(countData = cts, @@ -2485,6 +2525,7 @@ dds <- DESeqDataSetFromMatrix(countData = cts, design = ~ Mix) dds + ## Filter out ERCC genes with counts of less than 10 in all samples ##### keepGenes <- rowSums(counts(dds)) > 10 @@ -2492,6 +2533,7 @@ dds <- dds[keepGenes,] dds + ## Run DESeq2 analysis and calculate results dds <- DESeq(dds) @@ -2520,32 +2562,37 @@ write.csv(normcounts, 'ERCC_analysis/ERCC_normcounts.csv') #OUTPUT ### 10c. 
Analyze ERCC DESeq2 Results in Python ```python -## Import python packages + +# Import python packages import pandas as pd from urllib.request import urlopen, quote, urlretrieve import seaborn as sns import matplotlib.pyplot as plt -## Import ERCC DESeq2 results + +# Import ERCC DESeq2 results deseq2out = pd.read_csv('ERCC_analysis/ERCC_DESeq2.csv', index_col=0) # INPUT #deseq2out.index = deseq2out.index.str.replace('_','-') deseq2out.rename(columns ={'baseMean' : 'meanNormCounts'}, inplace = True) print(deseq2out.head()) -## Get files containing ERCC gene concentrations and metadata + +# Get files containing ERCC gene concentrations and metadata ercc_url = 'https://assets.thermofisher.com/TFS-Assets/LSG/manuals/cms_095046.txt' ercc_table = pd.read_csv(ercc_url, '\t', index_col='ERCC ID') print(ercc_table.head(n=3)) -## Combine ERCC DESeq2 results and ercc_table + +# Combine ERCC DESeq2 results and ercc_table combined = deseq2out.merge(ercc_table, left_index=True, right_index=True) print(combined.head()) -## Filter p-value and adj. p-value cutoff at 10^-3 + +# Filter p-value and adj. p-value cutoff at 10^-3 combined['cleaned_padj'] = combined['padj'] combined.loc[(combined.cleaned_padj < 0.001),'cleaned_padj']=0.001 @@ -2555,12 +2602,14 @@ combined.loc[(combined.cleaned_pvalue < 0.001),'cleaned_pvalue']=0.001 print(combined.head()) -## Export the filtered combined ERCC DESeq2 results and ercc_table -### Remember to change file name to GLDS# analyzing + +# Export the filtered combined ERCC DESeq2 results and ercc_table +# Remember to change file name to GLDS# analyzing combined.filter(items = ['ERCC ID', 'meanNormCounts', 'cleaned_pvalue','cleaned_padj']).to_csv('ERCC_analysis/ERCC_lodr_GLDS-NNN_mqc.csv') -## Plot p-value vs. mean normalized ERCC counts + +# Plot p-value vs. mean normalized ERCC counts fig, ax = plt.subplots(figsize=(10, 7)) @@ -2576,7 +2625,8 @@ sns.lineplot(data=combined, x="meanNormCounts", y="cleaned_pvalue", ax.set_xscale("linear"); ax.set_yscale("log"); -## Plot Adjp-value vs. mean normalized ERCC counts + +# Plot Adjp-value vs. mean normalized ERCC counts fig, ax = plt.subplots(figsize=(10, 7)) From 1b93859baf62ac1ef30de294c08fd60cd2c4bd9e Mon Sep 17 00:00:00 2001 From: Jonathan Oribello Date: Thu, 23 Mar 2023 20:37:10 +0000 Subject: [PATCH 7/7] docs: update workflow instructions README.md --- RNAseq/Workflow_Documentation/NF_RCP-F/README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/README.md b/RNAseq/Workflow_Documentation/NF_RCP-F/README.md index 1e5515a1..7dfb3f32 100644 --- a/RNAseq/Workflow_Documentation/NF_RCP-F/README.md +++ b/RNAseq/Workflow_Documentation/NF_RCP-F/README.md @@ -24,8 +24,8 @@ document](../../Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md): 1. **Analysis Staging Subworkflow** - Description: - - This subworkflow extracts the metadata parameters (e.g. organism, library layout) needed for processing from the GLDS ISA archive and retrieves the raw reads files hosted on the [GeneLab Data Repository](https://genelab-data.ndc.nasa.gov/genelab/projects). - > *GLDS ISA archive*: ISA directory containing Investigation, Study, and Assay (ISA) metadata files for a respective GLDS dataset - the *ISA.zip file is located in the [GLDS repository](https://genelab-data.ndc.nasa.gov/genelab/projects) under 'STUDY FILES' -> 'Study Metadata Files' for any GLDS dataset in the GeneLab Data Repository. + - This subworkflow extracts the metadata parameters (e.g. 
organism, library layout) needed for processing from the OSD/GLDS ISA archive and retrieves the raw reads files hosted on the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/). + > *OSD/GLDS ISA archive*: ISA directory containing Investigation, Study, and Assay (ISA) metadata files for a respective GLDS dataset - the *ISA.zip file is located in the [OSDR](https://osdr.nasa.gov/bio/repo/) under 'Files' -> 'Study Metadata Files' for any GeneLab Data Set (GLDS) in the OSDR. 2. **RNASeq Consensus Pipeline Subworkflow** @@ -149,7 +149,9 @@ nextflow run NF_RCP-F_1.0.3/main.nf \
-#### 4b. Approach 2: Run the workflow on a GeneLab RNAseq dataset using local Ensembl reference fasta and gtf files +#### 4b. Approach 2: Run the workflow on a GeneLab RNAseq dataset using local reference fasta and gtf files + +> Note: The `--ref_source` and `--ensemblVersion` parameters should match the reference source and version number of the local reference fasta and gtf files used ```bash nextflow run NF_RCP-F_1.0.3/main.nf \ @@ -235,13 +237,13 @@ See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nex ### 5. Additional Output Files The outputs from the Analysis Staging and V&V Pipeline Subworkflows are described below: -> Note: The outputs from the RNASeq Consensus Pipeline Subworkflow are documented in the [GL-DPPD-7101-F.md](../../Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md) processing protocol. +> Note: The outputs from the RNASeq Consensus Pipeline Subworkflow are documented in the [GL-DPPD-7101-F](../../Pipeline_GL-DPPD-7101_Versions/GL-DPPD-7101-F.md) processing protocol. **Analysis Staging Subworkflow** - Output: - \*_bulkRNASeq_v1_runsheet.csv (table containing metadata required for processing, including the raw reads files location) - - \*-ISA.zip (the ISA archive of the GLDS datasets to be processed, downloaded from the GeneLab Data Repository) + - \*-ISA.zip (the ISA archive of the GLDS datasets to be processed, downloaded from the OSDR) - \*_metadata_table.txt (table that includes additional information about the GLDS dataset, not used for processing) @@ -270,4 +272,3 @@ Standard Nextflow resource usage logs are also produced as follows: - Resource_Usage/execution_trace_{timestamp}.txt (an execution tracing file that contains information about each process executed in the workflow, including: submission time, start time, completion time, cpu and memory used, machine-readable output)
-
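
> Note: The `Resource_Usage/execution_trace_{timestamp}.txt` file listed above is a plain tab-separated table, so per-process runtime can be summarized directly. The sketch below is illustrative only and is not part of the workflow: it assumes Nextflow's default trace columns (`name`, `status`, `realtime`) and the `Resource_Usage/execution_trace_*.txt` naming described above; adjust the path and column names if the trace configuration of the run differs.

```python
# Illustrative sketch: summarize per-process wall-clock time from a Nextflow execution trace.
# Assumes the default trace columns ('name', 'status', 'realtime'); durations in the trace are
# humanized strings such as '1m 30s', so they are converted to seconds before aggregating.

import glob
import re

import pandas as pd


def duration_to_seconds(value):
    """Convert a humanized Nextflow duration (e.g. '1h 2m 3.5s' or '850ms') to seconds."""
    if not isinstance(value, str) or value.strip() in ("-", ""):
        return float("nan")
    units = {"ms": 0.001, "s": 1, "m": 60, "h": 3600, "d": 86400}
    return sum(float(n) * units[u] for n, u in re.findall(r"([\d.]+)\s*(ms|s|m|h|d)", value))


# Pick up the most recent trace file written to the Resource_Usage directory
trace_file = sorted(glob.glob("Resource_Usage/execution_trace_*.txt"))[-1]
trace = pd.read_csv(trace_file, sep="\t")

# Collapse tagged task names like 'SOME_PROCESS (sample-1)' down to the bare process name
trace["process"] = trace["name"].str.replace(r"\s*\(.*\)$", "", regex=True)
trace["realtime_s"] = trace["realtime"].map(duration_to_seconds)

# Task status overview and the slowest processes by wall-clock time
print(trace["status"].value_counts())
summary = (
    trace.groupby("process")["realtime_s"]
    .agg(tasks="count", mean_s="mean", max_s="max")
    .sort_values("max_s", ascending=False)
)
print(summary.head(10))
```

Grouping on the bare process name keeps the summary readable when the same process runs once per sample, since all of its per-sample tasks collapse into a single row.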