From 6826f05f6afd3fbc132d4a4c1778268e7a1d0406 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Thu, 5 Nov 2020 08:03:34 -0500 Subject: [PATCH 01/12] update BiocProject usage --- BiocProject/PEPATAC_BiocProject.Rmd | 4 +- BiocProject/gold_atac_annotation.csv | 6 -- BiocProject/gold_config.yaml | 40 ++++++++++++ BiocProject/gold_hg19.yaml | 19 ------ BiocProject/gold_sample_table.csv | 6 ++ BiocProject/readConsensusPeak.R | 44 +++++++++++++ BiocProject/readCoverage.R | 80 ++++++++++++++++++++++++ BiocProject/readNarrowPeak.R | 46 ++++++++++++++ BiocProject/readPepatacPeakBeds.R | 18 ------ BiocProject/runCOCOA.R | 93 ++++++++++++++++++++++++++++ pepatac_output_schema.yaml | 20 +++--- 11 files changed, 323 insertions(+), 53 deletions(-) delete mode 100644 BiocProject/gold_atac_annotation.csv create mode 100644 BiocProject/gold_config.yaml delete mode 100644 BiocProject/gold_hg19.yaml create mode 100644 BiocProject/gold_sample_table.csv create mode 100644 BiocProject/readConsensusPeak.R create mode 100644 BiocProject/readCoverage.R create mode 100644 BiocProject/readNarrowPeak.R delete mode 100644 BiocProject/readPepatacPeakBeds.R create mode 100644 BiocProject/runCOCOA.R diff --git a/BiocProject/PEPATAC_BiocProject.Rmd b/BiocProject/PEPATAC_BiocProject.Rmd index c25cd3a8..ca4df839 100644 --- a/BiocProject/PEPATAC_BiocProject.Rmd +++ b/BiocProject/PEPATAC_BiocProject.Rmd @@ -1,6 +1,6 @@ --- title: "PEPATAC BiocProject" -author: "Michal Stolarczyk" +author: c("Michal Stolarczyk", "Jason Smith") date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > @@ -35,7 +35,7 @@ The way the output files are read is defined in a [function](https://github.com/ ```{r echo=T, message=FALSE} library(BiocProject) -ProjectConfig = "gold_hg19.yaml" +ProjectConfig = "gold_config.yaml" ``` ## Run the `BiocProject` function diff --git a/BiocProject/gold_atac_annotation.csv b/BiocProject/gold_atac_annotation.csv deleted file mode 100644 index 3683e246..00000000 --- a/BiocProject/gold_atac_annotation.csv +++ /dev/null @@ -1,6 +0,0 @@ -sample_name,sample_description,treatment_description,organism,protocol,data_source,SRR,SRX,Sample_geo_accession,Sample_series_id,read_type,Sample_instrument_model,read1,read2 -gold1,ATAC-seq from dendritic cell (ENCLB065VMV),Homo sapiens dendritic in vitro differentiated cells treated with 0 ng/mL Lipopolysaccharide for 0 hours,human,ATAC-seq,SRA,SRR5210416,SRX2523872,GSM2471255,GSE94182,PAIRED,Illumina HiSeq 2000,SRA_1,SRA_2 -gold2,ATAC-seq from dendritic cell (ENCLB811FLK),Homo sapiens dendritic in vitro differentiated cells treated with 0 ng/mL Lipopolysaccharide for 0 hours,human,ATAC-seq,SRA,SRR5210450,SRX2523906,GSM2471300,GSE94222,PAIRED,Illumina HiSeq 2000,SRA_1,SRA_2 -gold3,ATAC-seq from dendritic cell (ENCLB887PKE),Homo sapiens dendritic in vitro differentiated cells treated with 0 ng/mL Lipopolysaccharide for 0 hours,human,ATAC-seq,SRA,SRR5210398,SRX2523862,GSM2471249,GSE94177,PAIRED,Illumina NextSeq 500,SRA_1,SRA_2 -gold4,ATAC-seq from dendritic cell (ENCLB586KIS),Homo sapiens dendritic in vitro differentiated cells treated with 0 ng/mL Lipopolysaccharide for 0 hours,human,ATAC-seq,SRA,SRR5210428,SRX2523884,GSM2471269,GSE94196,PAIRED,Illumina HiSeq 2000,SRA_1,SRA_2 -gold5,ATAC-seq from dendritic cell (ENCLB384NOX),Homo sapiens dendritic in vitro differentiated cells treated with 0 ng/mL Lipopolysaccharide for 0 hours,human,ATAC-seq,SRA,SRR5210390,SRX2523854,GSM2471245,GSE94173,PAIRED,Illumina HiSeq 2000,SRA_1,SRA_2 diff --git a/BiocProject/gold_config.yaml b/BiocProject/gold_config.yaml new file mode 100644 index 00000000..150d8550 --- /dev/null +++ b/BiocProject/gold_config.yaml @@ -0,0 +1,40 @@ +# This project config file describes your project. See looper docs for details. +name: BiocProject # The name that summary files will be prefaced with + +pep_version: 2.0.0 +sample_table: gold_sample_table.csv # sheet listing all samples in the project + +looper: # relative paths are relative to this config file + output_dir: "$PROCESSED/pepatac/gold" # ABSOLUTE PATH to the parent, shared space where project results go + pipeline_interfaces: ["$CODE/pepatac/project_pipeline_interface.yaml"] # ABSOLUTE PATH to the directory where looper will find the pipeline repository + +sample_modifiers: + append: + pipeline_interfaces: ["$CODE/pepatac/sample_pipeline_interface.yaml"] + derive: + attributes: [read1, read2] + sources: + SRA_1: "${SRAFQ}{SRR}_1.fastq.gz" + SRA_2: "${SRAFQ}{SRR}_2.fastq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + macs_genome_size: hs + prealignments: rCRSd + aligner: bowtie2 # Default. [options: bwa] + deduplicator: samblaster # Default. [options: picard] + trimmer: skewer # Default. [options: pyadapt, trimmomatic] + peak_caller: macs2 # Default. [options: fseq, genrich, hmmratac, homer] + peak_type: fixed # Default. [options: variable] + extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. + frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run + blacklist: $GENOMES/hg38/blacklist/default/hg38_blacklist.bed.gz + +bioconductor: + readFunName: readNarrowPeak + readFunPath: readNarrowPeak.R + funcArgs: + output_dir: "$PROCESSED/pepatac/gold" + results_subdir: "$PROCESSED/pepatac/gold/results_pipeline/" \ No newline at end of file diff --git a/BiocProject/gold_hg19.yaml b/BiocProject/gold_hg19.yaml deleted file mode 100644 index 32e071ed..00000000 --- a/BiocProject/gold_hg19.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Run gold standard samples through ATACseq pipeline. -name: gold_hg19 - -metadata: - sample_annotation: "$PROCESSED/gold/pepatac/hg19/gold_atac_annotation.csv" - output_dir: "$PROCESSED/gold/pepatac/hg19/10_08_18_wo" - pipeline_interfaces: "$CODE/pepatac/pipeline_interface.yaml" - -derived_columns: [read1, read2] - -data_sources: - SRA_1: "${SRAFQ}{SRR}_1.fastq.gz" - SRA_2: "${SRAFQ}{SRR}_2.fastq.gz" - -implied_columns: - organism: - human: - genome: hg19 - macs_genome_size: hs diff --git a/BiocProject/gold_sample_table.csv b/BiocProject/gold_sample_table.csv new file mode 100644 index 00000000..adb33194 --- /dev/null +++ b/BiocProject/gold_sample_table.csv @@ -0,0 +1,6 @@ +sample_name,organism,protocol,SRR,read_type,read1,read2 +gold1,human,ATAC-seq,SRR5210416,PAIRED,SRA_1,SRA_2 +gold2,human,ATAC-seq,SRR5210450,PAIRED,SRA_1,SRA_2 +gold3,human,ATAC-seq,SRR5210398,PAIRED,SRA_1,SRA_2 +gold4,human,ATAC-seq,SRR5210428,PAIRED,SRA_1,SRA_2 +gold5,human,ATAC-seq,SRR5210390,PAIRED,SRA_1,SRA_2 diff --git a/BiocProject/readConsensusPeak.R b/BiocProject/readConsensusPeak.R new file mode 100644 index 00000000..81165e86 --- /dev/null +++ b/BiocProject/readConsensusPeak.R @@ -0,0 +1,44 @@ +readConsensusPeak = function(project, summary_dir) { + cwd = getwd() + # Construct sample table + sample_table <- data.table( + genome=unique(pepr::sampleTable(project)$genome) + ) + sample_table[,peak_file_path:=.(file.path( + summary_dir, + paste0(config(project)$name, "_", sample_table$genome, + "_consensusPeaks.narrowPeak") + ))] + # Only keep samples with valid peak files + file_list <- sample_table$peak_file_path + file_exists <- character() + for (i in 1:length(file_list)) { + if(file.exists(file.path(file_list[i]))) { + file_exists <- append(file_exists, file.path(file_list[i])) + } + } + files <- data.table(peak_file_path=file_exists) + consensus_peak_files = list() + if (nrow(files) == 0) { + warning("No valid peak files exist.") + return(GenomicRanges::GRangesList()) + } + # Ensure duplicates do not exist + sample_table <- unique( + sample_table[sample_table$peak_file_path %in% files$peak_file_path,]) + # Load peak files as Granges objects + sample_table[, peak_file := .(lapply( + peak_file_path, + function(x) { + GenomicRanges::GRanges( + fread(x, col.names=c("chr", "start", "end", "name", "score", + "strand", "signalValue", "pValue", "qValue", + "peak"))) + }))] + # Convert to GRangesList and name the individual Granges + peak_files <- GenomicRanges::GRangesList(sample_table$peak_file) + names(peak_files) <- sample_table$genome + setwd(cwd) + return(peak_files) +} + diff --git a/BiocProject/readCoverage.R b/BiocProject/readCoverage.R new file mode 100644 index 00000000..7e8f2173 --- /dev/null +++ b/BiocProject/readCoverage.R @@ -0,0 +1,80 @@ +readCoverage = function(project, results_subdir) { + cwd = getwd() + # Construct sample table + sample_table <- data.table( + sample_name=pepr::sampleTable(project)$sample_name, + genome=pepr::sampleTable(project)$genome + ) + # Check if coverage files are compressed + if (any(file.exists(file.path(results_subdir, + sample_table$sample_name, + paste0("peak_calling_", sample_table$genome), + paste0(sample_table$sample_name, + "_ref_peaks_coverage.bed.gz"))))) { + ext <- ".bed.gz" + } else if (any(file.exists(file.path(results_subdir, + sample_table$sample_name, + paste0("peak_calling_", sample_table$genome), + paste0(sample_table$sample_name, + "_peaks_coverage.bed.gz"))))) { + ext <- ".bed.gz" + } else { + ext <- ".bed" + } + # Use reference peak coverage file if available + if (any(file.exists(file.path(results_subdir, + sample_table$sample_name, + paste0("peak_calling_", sample_table$genome), + paste0(sample_table$sample_name, + "_ref_peaks_coverage", ext))))) { + peak_file_name = paste0("_ref_peaks_coverage", ext) + reference = TRUE + } else { + warning("Peak coverage files are not derived from a singular reference peak set.") + reference = FALSE + peak_file_name = paste0("_peaks_coverage", ext) + } + # Construct paths to peak coverage files + sample_table[,cov_file_path:=.((file.path( + results_subdir, + sample_table$sample_name, + paste0("peak_calling_", sample_table$genome), + paste0(sample_table$sample_name, peak_file_name))))] + # Only keep samples with valid peak files + file_list <- sample_table$cov_file_path + file_exists <- character() + for (i in 1:length(file_list)) { + if(file.exists(file.path(file_list[i]))) { + file_exists <- append(file_exists, file.path(file_list[i])) + } + } + files <- data.table(cov_file_path=file_exists) + if (nrow(files) == 0) { + warning("No valid peak files exist.") + return(data.table()) + } + # Ensure duplicates do not exist + sample_table <- unique( + sample_table[sample_table$cov_file_path %in% files$cov_file_path,]) + # Column names are dependent on source file + if (reference) { + column_names <- c("chr", "start", "end", "name", "score", "strand", + "signalValue", "pValue", "qValue", "peak", + "read_count", "base_count", "width", "frac") + } else { + column_names <- c("chr", "start", "end", "read_count", + "base_count", "width", "frac", "norm") + } + # Load coverage files and return list of data.tables + sample_table[, cov_file := .(lapply(cov_file_path, + fread, col.names=column_names))] + # sample_table[, cov_file := .(lapply( + # cov_file_path, function(x) { + # x <- fread(x, col.names=column_names) + # data.table(x) + # }))] + cov_files <- sample_table$cov_file + names(cov_files) <- sample_table$sample_name + setwd(cwd) + return(cov_files) +} diff --git a/BiocProject/readNarrowPeak.R b/BiocProject/readNarrowPeak.R new file mode 100644 index 00000000..ff8a3ce8 --- /dev/null +++ b/BiocProject/readNarrowPeak.R @@ -0,0 +1,46 @@ +readNarrowPeak = function(project, results_subdir) { + cwd = getwd() + # Construct sample table + sample_table <- data.table( + sample_name=pepr::sampleTable(project)$sample_name, + genome=pepr::sampleTable(project)$genome + ) + # Identify peak file paths + sample_table[,peak_file_path:=.((file.path( + results_subdir, + sample_table$sample_name, + paste0("peak_calling_", sample_table$genome), + paste0(sample_table$sample_name, + "_peaks_normalized.narrowPeak"))))] + # Only keep samples with valid peak files + file_list <- sample_table$peak_file_path + file_exists <- character() + for (i in 1:length(file_list)) { + if(file.exists(file.path(file_list[i]))) { + file_exists <- append(file_exists, file.path(file_list[i])) + } + } + files <- data.table(peak_file_path=file_exists) + consensus_peak_files = list() + if (nrow(files) == 0) { + warning("No valid peak files exist.") + return(GenomicRanges::GRangesList()) + } + # Ensure duplicates do not exist + sample_table <- unique( + sample_table[sample_table$peak_file_path %in% files$peak_file_path,]) + # Load peak files as Granges objects + sample_table[, peak_file := .(lapply( + peak_file_path, + function(x) { + GenomicRanges::GRanges( + fread(x, col.names=c("chr", "start", "end", "name", "score", "strand", + "signalValue", "pValue", "qValue", "peak"))) + }))] + # Convert to GRangesList and name the individual Granges + peak_files <- GenomicRanges::GRangesList(sample_table$peak_file) + names(peak_files) <- sample_table$sample_name + setwd(cwd) + return(peak_files) +} + diff --git a/BiocProject/readPepatacPeakBeds.R b/BiocProject/readPepatacPeakBeds.R deleted file mode 100644 index 10791ceb..00000000 --- a/BiocProject/readPepatacPeakBeds.R +++ /dev/null @@ -1,18 +0,0 @@ -readPepatacPeakBeds = function(p, pipName="pepatac.py") { - readBed = function(fp) { - message("Reading: ", basename(fp)) - df = read.table(s, stringsAsFactors=F) - colnames(df)[1:3] = c('chr', 'start', 'end') - GenomicRanges::GRanges(df) - } - lapply(outputsByPipeline(p, pipName), function(f) { - lapply(f, function(s){ - if (!file.exists(s)) { - warning("File '", s, "' not found") - NULL - } else { - readBed(s) - } - }) - }) -} diff --git a/BiocProject/runCOCOA.R b/BiocProject/runCOCOA.R new file mode 100644 index 00000000..10dcb785 --- /dev/null +++ b/BiocProject/runCOCOA.R @@ -0,0 +1,93 @@ +runCOCOA = function(project, results_subdir, summary_dir, + genome, conditions, lolaPath) { + # verify condition table + if(!all(c("sample_name", "condition") %in% tolower(colnames(conditions)))) { + quit("Conditions must include 'sample_name' and 'condition' columns.") + } + # load required packages + required_packages <- c("COCOA", "LOLA", "PEPATACr", "tidyr") + for (package in required_packages) { + loadLibrary <- tryCatch ( + { + suppressPackageStartupMessages( + suppressWarnings(library(package, character.only=TRUE))) + }, + error=function(e) { + message("Error: Install the \"", package, + "\" library before proceeding.") + return(NULL) + }, + warning=function(e) { + message(e) + return(1) + } + ) + if (length(loadLibrary)!=0) { + suppressWarnings(library(package, character.only=TRUE)) + } else { + quit() + } + } + # Load peak count table + count_table_path = file.path(summary_dir, + paste0(config(project)$name, "_", genome, + "_peaks_coverage.tsv")) + if(file.exists(count_table_path)) { + ct <- fread(count_table_path) + } else { + quit(paste0("Could not load ", count_table_path)) + } + # Subset count table by condition table + cols <- colnames(ct)[c(TRUE ,colnames(ct) %in% conditions$sample_name)] + ct <- ct[,..cols] + # Generate matrix + m <- as.matrix(ct[, 2:ncol(ct)]) + rownames(m) <- ct$name + mat <- t(m) + mat <- mat[, -grep("random", colnames(mat))] + mat <- mat[, -grep("GL", colnames(mat))] + mat <- mat[, -grep("chrUn", colnames(mat))] + pca <- prcomp(mat) + row.names(pca$x) <- rownames(mat) + loadings <- pca$rotation + scores <- pca$x + rownames(scores) <- rownames(mat) + # Load region database + regionSetDB <- loadRegionDB(lolaPath) + # Load the ENCODE transcription factor binding sites + regionAnno <- regionSetDB$regionAnno + collectionInd <- regionAnno$collection %in% "encode_tfbs" + TFGR <- GRangesList(regionSetDB$regionGRL[collectionInd]) + regionAnno <- regionAnno[collectionInd, ] + regionSetName <- regionAnno$filename + regionSetDescription <- regionAnno$description + + # Remove full DB to release memory + rm("regionSetDB") + + # Generate signal coordinates + pos <- data.table(rownames(m)) + pos <- pos[- grep("random", pos$V1),] + pos <- pos[- grep("GL", pos$V1),] + pos <- pos[- grep("chrUn", pos$V1),] + peaks <- separate(pos, col="V1", sep="_", into=c("chr", "start", "end")) + peaks$start <- as.numeric(peaks$start) + peaks$end <- as.numeric(peaks$end) + + # Subset principal components to annotate + PCsToAnnotate <- paste0("PC", 1:6) + correctSampleOrder <- 1:nrow(mat) + + tfRSS <- runCOCOA(genomicSignal=loadings, + signalCoord=peaks, + GRList=TFGR, + signalCol=PCsToAnnotate, + sampleOrder=correctSampleOrder, + targetVar=scores, + scoringMetric="proportionWeightedMean", + absVal=TRUE) + tfRSS$regionSetName <- regionSetName + tfRSS$regionSetDescription <- regionSetDescription + return(tfRSS) +} + diff --git a/pepatac_output_schema.yaml b/pepatac_output_schema.yaml index edd2dc1e..8914c950 100644 --- a/pepatac_output_schema.yaml +++ b/pepatac_output_schema.yaml @@ -8,23 +8,27 @@ properties: smooth_bw: path: "aligned_{genome}/{sample_name}_smooth.bw" type: string - description: "Test sample property" + description: "Smoothed signal track" exact_bw: path: "aligned_{genome}_exact/{sample_name}_exact.bw" type: string - description: "Test sample property" + description: "Nucleotide-resolution signal track" aligned_bam: - path: "aligned_{genome}/{sample_name}_sort.bam" + path: "aligned_{genome}/{sample_name}_sort_dedup.bam" type: string - description: "Test sample property" - peaks_bed: - path: "peak_calling_{genome}/{sample_name}_peaks.bed" + description: "Coordinate sorted deduplicated, aligned BAM file" + peak_file: + path: "peak_calling_{genome}/{sample_name}_peaks_normalized.narrowPeak" type: string - description: "Test sample property" + description: "Sample peak file" + coverage_file: + path: "peak_calling_{genome}/{sample_name}_peaks_coverage.bed.gz" + type: string + description: "Sample peak coverage table" summits_bed: path: "peak_calling_{genome}/{sample_name}_summits.bed" type: string - description: "Test sample property" + description: "Peak summit file" alignment_percent_file: title: "Alignment percent file" description: "Plots percent of total alignment to all pre-alignments and primary genome." From f6c0bfde55bab449d57457b4046f0e117bf8bd47 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 6 Nov 2020 07:55:03 -0500 Subject: [PATCH 02/12] update return value --- BiocProject/runCOCOA.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/BiocProject/runCOCOA.R b/BiocProject/runCOCOA.R index 10dcb785..686331df 100644 --- a/BiocProject/runCOCOA.R +++ b/BiocProject/runCOCOA.R @@ -25,7 +25,8 @@ runCOCOA = function(project, results_subdir, summary_dir, if (length(loadLibrary)!=0) { suppressWarnings(library(package, character.only=TRUE)) } else { - quit() + warning() + return(1) } } # Load peak count table @@ -35,7 +36,8 @@ runCOCOA = function(project, results_subdir, summary_dir, if(file.exists(count_table_path)) { ct <- fread(count_table_path) } else { - quit(paste0("Could not load ", count_table_path)) + warning(paste0("Could not load ", count_table_path)) + return(1) } # Subset count table by condition table cols <- colnames(ct)[c(TRUE ,colnames(ct) %in% conditions$sample_name)] From 7ee5f225ffd8c20c894dd38f9d836750fd9ff386 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 6 Nov 2020 08:16:09 -0500 Subject: [PATCH 03/12] fix return value --- BiocProject/runCOCOA.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/BiocProject/runCOCOA.R b/BiocProject/runCOCOA.R index 686331df..24ca5286 100644 --- a/BiocProject/runCOCOA.R +++ b/BiocProject/runCOCOA.R @@ -2,7 +2,8 @@ runCOCOA = function(project, results_subdir, summary_dir, genome, conditions, lolaPath) { # verify condition table if(!all(c("sample_name", "condition") %in% tolower(colnames(conditions)))) { - quit("Conditions must include 'sample_name' and 'condition' columns.") + warning("Conditions must include 'sample_name' and 'condition' columns.") + return(1) } # load required packages required_packages <- c("COCOA", "LOLA", "PEPATACr", "tidyr") @@ -15,7 +16,7 @@ runCOCOA = function(project, results_subdir, summary_dir, error=function(e) { message("Error: Install the \"", package, "\" library before proceeding.") - return(NULL) + return(1) }, warning=function(e) { message(e) From 02700642f6d157df6a3dfe6ed4b2cf7a525583c9 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 6 Nov 2020 13:41:26 -0500 Subject: [PATCH 04/12] update --- BiocProject/runCOCOA.R | 79 +++++++++++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 24 deletions(-) diff --git a/BiocProject/runCOCOA.R b/BiocProject/runCOCOA.R index 24ca5286..11814fa9 100644 --- a/BiocProject/runCOCOA.R +++ b/BiocProject/runCOCOA.R @@ -6,7 +6,7 @@ runCOCOA = function(project, results_subdir, summary_dir, return(1) } # load required packages - required_packages <- c("COCOA", "LOLA", "PEPATACr", "tidyr") + required_packages <- c("COCOA", "LOLA", "data.table") for (package in required_packages) { loadLibrary <- tryCatch ( { @@ -16,7 +16,7 @@ runCOCOA = function(project, results_subdir, summary_dir, error=function(e) { message("Error: Install the \"", package, "\" library before proceeding.") - return(1) + return(NULL) }, warning=function(e) { message(e) @@ -35,14 +35,14 @@ runCOCOA = function(project, results_subdir, summary_dir, paste0(config(project)$name, "_", genome, "_peaks_coverage.tsv")) if(file.exists(count_table_path)) { - ct <- fread(count_table_path) + ct <- data.table::fread(count_table_path) } else { warning(paste0("Could not load ", count_table_path)) return(1) } # Subset count table by condition table cols <- colnames(ct)[c(TRUE ,colnames(ct) %in% conditions$sample_name)] - ct <- ct[,..cols] + ct <- ct[, cols, with=FALSE] # Generate matrix m <- as.matrix(ct[, 2:ncol(ct)]) rownames(m) <- ct$name @@ -55,40 +55,71 @@ runCOCOA = function(project, results_subdir, summary_dir, loadings <- pca$rotation scores <- pca$x rownames(scores) <- rownames(mat) + + #message("Produce signal coordinates (peaks)") + # Generate signal coordinates + # coords <- data.frame(coordinates=rownames(m)) + # message("coords") + # print(coords[1:4,1]) + # message("rownames(m)") + # print(head(rownames(m),4)) + # message("remove matching rows") + delIdx <- c(grep("random", rownames(m)), + grep("GL", rownames(m)), + grep("chrUn", rownames(m))) + #print(head(delIdx)) + # coords2 <- coords[!delIdx,] + # message("cleaned up coords") + # #print(coords2[1:4,1]) + # message("split column") + # eval(setDT(coords)[, + # c("chr", "start", "end") := tstrsplit(coords$coordinates, "_", + # type.convert = TRUE, fixed = TRUE)], envir=globalenv()) + #peaks <- coords[,-1] + coord_list <- data.table::tstrsplit(rownames(m)[-delIdx], "_", + type.convert = TRUE, fixed = TRUE) + + #print(data.table::tstrsplit(coords$coordinates, "_", type.convert = TRUE, fixed = TRUE)) + #message("check class") + #message(class(coord_list)) + #message(length(coord_list)) + #message("build peaks data.table") + peaks <- data.table::data.table(chr=coord_list[[1]], + start=coord_list[[2]], + end=coord_list[[3]]) + #message("peak example:") + #print(peaks[1:3,]) + #peaks <- eval(separate(coords, col="coordinates", sep="_", into=c("chr", "start", "end")), envir=globalenv()) + #message("set column type") + #peaks$start <- as.numeric(peaks$start) + #peaks$end <- as.numeric(peaks$end) + # Load region database - regionSetDB <- loadRegionDB(lolaPath) + regionSetDB <- suppressWarnings(loadRegionDB(lolaPath)) # Load the ENCODE transcription factor binding sites regionAnno <- regionSetDB$regionAnno collectionInd <- regionAnno$collection %in% "encode_tfbs" + #message("load just TFBS") TFGR <- GRangesList(regionSetDB$regionGRL[collectionInd]) regionAnno <- regionAnno[collectionInd, ] regionSetName <- regionAnno$filename regionSetDescription <- regionAnno$description - + #message("remove regionSetDB") # Remove full DB to release memory rm("regionSetDB") - # Generate signal coordinates - pos <- data.table(rownames(m)) - pos <- pos[- grep("random", pos$V1),] - pos <- pos[- grep("GL", pos$V1),] - pos <- pos[- grep("chrUn", pos$V1),] - peaks <- separate(pos, col="V1", sep="_", into=c("chr", "start", "end")) - peaks$start <- as.numeric(peaks$start) - peaks$end <- as.numeric(peaks$end) - # Subset principal components to annotate PCsToAnnotate <- paste0("PC", 1:6) correctSampleOrder <- 1:nrow(mat) - - tfRSS <- runCOCOA(genomicSignal=loadings, - signalCoord=peaks, - GRList=TFGR, - signalCol=PCsToAnnotate, - sampleOrder=correctSampleOrder, - targetVar=scores, - scoringMetric="proportionWeightedMean", - absVal=TRUE) + message("run COCOA") + tfRSS <- local(COCOA::runCOCOA(genomicSignal=loadings, + signalCoord=peaks, + GRList=TFGR, + signalCol=PCsToAnnotate, + sampleOrder=correctSampleOrder, + targetVar=scores, + scoringMetric="proportionWeightedMean", + absVal=TRUE)) tfRSS$regionSetName <- regionSetName tfRSS$regionSetDescription <- regionSetDescription return(tfRSS) From ee3a7d8f4b9844d47d95eea3639955c4a899fbba Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Fri, 6 Nov 2020 14:29:28 -0500 Subject: [PATCH 05/12] clean up debug notes --- BiocProject/runCOCOA.R | 46 ++++++++---------------------------------- 1 file changed, 8 insertions(+), 38 deletions(-) diff --git a/BiocProject/runCOCOA.R b/BiocProject/runCOCOA.R index 11814fa9..3e7d4435 100644 --- a/BiocProject/runCOCOA.R +++ b/BiocProject/runCOCOA.R @@ -55,45 +55,15 @@ runCOCOA = function(project, results_subdir, summary_dir, loadings <- pca$rotation scores <- pca$x rownames(scores) <- rownames(mat) - - #message("Produce signal coordinates (peaks)") - # Generate signal coordinates - # coords <- data.frame(coordinates=rownames(m)) - # message("coords") - # print(coords[1:4,1]) - # message("rownames(m)") - # print(head(rownames(m),4)) - # message("remove matching rows") + delIdx <- c(grep("random", rownames(m)), grep("GL", rownames(m)), grep("chrUn", rownames(m))) - #print(head(delIdx)) - # coords2 <- coords[!delIdx,] - # message("cleaned up coords") - # #print(coords2[1:4,1]) - # message("split column") - # eval(setDT(coords)[, - # c("chr", "start", "end") := tstrsplit(coords$coordinates, "_", - # type.convert = TRUE, fixed = TRUE)], envir=globalenv()) - #peaks <- coords[,-1] coord_list <- data.table::tstrsplit(rownames(m)[-delIdx], "_", type.convert = TRUE, fixed = TRUE) - - #print(data.table::tstrsplit(coords$coordinates, "_", type.convert = TRUE, fixed = TRUE)) - #message("check class") - #message(class(coord_list)) - #message(length(coord_list)) - #message("build peaks data.table") peaks <- data.table::data.table(chr=coord_list[[1]], start=coord_list[[2]], end=coord_list[[3]]) - #message("peak example:") - #print(peaks[1:3,]) - #peaks <- eval(separate(coords, col="coordinates", sep="_", into=c("chr", "start", "end")), envir=globalenv()) - #message("set column type") - #peaks$start <- as.numeric(peaks$start) - #peaks$end <- as.numeric(peaks$end) - # Load region database regionSetDB <- suppressWarnings(loadRegionDB(lolaPath)) # Load the ENCODE transcription factor binding sites @@ -113,13 +83,13 @@ runCOCOA = function(project, results_subdir, summary_dir, correctSampleOrder <- 1:nrow(mat) message("run COCOA") tfRSS <- local(COCOA::runCOCOA(genomicSignal=loadings, - signalCoord=peaks, - GRList=TFGR, - signalCol=PCsToAnnotate, - sampleOrder=correctSampleOrder, - targetVar=scores, - scoringMetric="proportionWeightedMean", - absVal=TRUE)) + signalCoord=peaks, + GRList=TFGR, + signalCol=PCsToAnnotate, + sampleOrder=correctSampleOrder, + targetVar=scores, + scoringMetric="proportionWeightedMean", + absVal=TRUE)) tfRSS$regionSetName <- regionSetName tfRSS$regionSetDescription <- regionSetDescription return(tfRSS) From 8d82f03ffd83a6edb75280dd8d8941a028725775 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 11 Nov 2020 11:47:00 -0500 Subject: [PATCH 06/12] improve descriptions --- pepatac_output_schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pepatac_output_schema.yaml b/pepatac_output_schema.yaml index 8914c950..19840b00 100644 --- a/pepatac_output_schema.yaml +++ b/pepatac_output_schema.yaml @@ -1,4 +1,4 @@ -description: objects produced by PEPPRO pipeline. +description: objects produced by PEPATAC pipeline. properties: samples: type: array From 0bfecfd31337bd89fa3ecdabc29f44fc13faf8ea Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 11 Nov 2020 11:47:25 -0500 Subject: [PATCH 07/12] update arguments and bioconductor section --- sample_pipeline_interface.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index c84d2baa..238b624e 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -13,9 +13,9 @@ command_template: > -P {compute.cores} -M {compute.mem} {% if sample.read2 is defined %} --input2 {sample.read2} {% endif %} + {% if sample.aligner is defined %} --aligner {sample.aligner} {% endif %} {% if sample.peak_caller is defined %} --peak-caller {sample.peak_caller} {% endif %} {% if sample.macs_genome_size is defined %} --genome-size {sample.macs_genome_size} {% endif %} - {% if sample.aligner is defined %} --aligner {sample.aligner} {% endif %} {% if sample.trimmer is defined %} --trimmer {sample.trimmer} {% endif %} {% if sample.prealignments is defined %} --prealignments {sample.prealignments} {% endif %} {% if sample.deduplicator is defined %} --deduplicator {sample.deduplicator} {% endif %} @@ -26,12 +26,13 @@ command_template: > {% if sample.extend is defined %} --extend {sample.extend} {% endif %} {% if sample.frip_ref_peaks is defined %} --frip-ref-peaks {sample.frip_ref_peaks} {% endif %} {% if sample.motif is defined %} --motif {% endif %} - {% if sample.no_scale is defined %} --no-scale {% endif %} {% if sample.sob is defined %} --sob {% endif %} + {% if sample.no_scale is defined %} --no-scale {% endif %} {% if sample.prioritize is defined %} --prioritize {% endif %} {% if sample.keep is defined %} --keep {% endif %} {% if sample.no_fifo is defined %} --noFIFO {% endif %} {% if sample.lite is defined %} --lite {% endif %} + {% if sample.skipqc is defined %} --skipqc {% endif %} compute: singularity_image: ${SIMAGES}pepatac docker_image: databio/pepatac @@ -39,5 +40,5 @@ compute: size_dependent_variables: resources-sample.tsv bioconductor: - readFunName: readPepatacPeakBeds - readFunPath: BiocProject/readPepatacPeakBeds.R + readFunName: runCOCOA + readFunPath: BiocProject/runCOCOA.R From d322fbe48bdb1a35944d7ab78edda5b0978e5ad3 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Wed, 11 Nov 2020 11:47:53 -0500 Subject: [PATCH 08/12] fix calcFRiF bug on zero coverage file --- PEPATACr/R/PEPATACr.R | 51 ++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index b0c612be..51f8fa35 100644 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -631,6 +631,7 @@ calcFRiF <- function(bedFile, total, reads) { } bedFile <- bedFile[apply(bedFile != 0, 1, all),] + # if no regions have coverage, file is now empty if (reads) { bedFile <- cbind(bedFile, cumsum=cumsum(bedFile$count)) @@ -640,8 +641,11 @@ calcFRiF <- function(bedFile, total, reads) { bedFile <- cbind(bedFile, cumSize=cumsum(bedFile$size)) bedFile <- cbind(bedFile, frip=bedFile$cumsum/as.numeric(total)) - bedFile <- cbind(bedFile, numfeats=as.numeric(1:nrow(bedFile))) - return(bedFile) + if (is.empty(bedFile)) { + return(cbind(bedFile, numfeats=as.numeric())) + } else { + return(cbind(bedFile, numfeats=as.numeric(1:nrow(bedFile)))) + } } @@ -692,12 +696,18 @@ plotFRiF <- function(sample_name, num_reads, genome_size, bed <- get(bedFile[1]) bedCov <- calcFRiF(bed, num_reads, reads) name <- bedFile[1] - labels[1,] <- c(0.95*max(log10(bedCov$cumSize)), max(bedCov$frip)+0.001, - name, round(max(bedCov$frip),2), "#FF0703") - feature_dist[1,] <- c(name, nrow(bed), - as.numeric(sum(abs(bed$V3-bed$V2))), - as.numeric((sum(abs(bed$V3-bed$V2))/genome_size))) - bedCov$feature <- name + if (is.empty(bedCov)) { + labels[1,] <- c(0, 0, name,0, "#FF0703") + bedCov$feature <- as.character() + } else { + labels[1,] <- c(0.95*max(log10(bedCov$cumSize)), + max(bedCov$frip)+0.001, name, + round(max(bedCov$frip),2), "#FF0703") + bedCov$feature <- name + } + feature_dist[1,] <- c(name, + nrow(bed), as.numeric(sum(abs(bed$V3-bed$V2))), + as.numeric((sum(abs(bed$V3-bed$V2))/genome_size))) } else if (file.exists(file.path(bedFile[1])) && info$size != 0) { bed <- read.table(file.path(bedFile[1])) if (nrow(bed[which(bed$V5 != 0),]) == 0) { @@ -709,12 +719,18 @@ plotFRiF <- function(sample_name, num_reads, genome_size, name <- gsub("^.*?_", "", name) numFields <- 1 for(i in 1:numFields) name <- gsub("_[^_]*$", "", name) - labels[1,] <- c(0.95*max(log10(bedCov$cumSize)), max(bedCov$frip)+0.001, - name, round(max(bedCov$frip),2), "#FF0703") + if (is.empty(bedCov)) { + labels[1,] <- c(0, 0, name, 0, "#FF0703") + bedCov$feature <- as.character() + } else { + labels[1,] <- c(0.95*max(log10(bedCov$cumSize)), + max(bedCov$frip)+0.001, name, + round(max(bedCov$frip),2), "#FF0703") + bedCov$feature <- name + } feature_dist[1,] <- c(name, nrow(bed), - as.numeric(sum(abs(bed$V3-bed$V2))), - as.numeric((sum(abs(bed$V3-bed$V2))/genome_size))) - bedCov$feature <- name + as.numeric(sum(abs(bed$V3-bed$V2))), + as.numeric((sum(abs(bed$V3-bed$V2))/genome_size))) } } else { if (is.na(info[1])) { @@ -1235,6 +1251,15 @@ plotFLD <- function(fragL, } +#' Check if a data frame is empty. +#' +#' Return TRUE if provided data frame is NULL, has no rows, or has no columns. +#' +#' @param df A data frame to check for emptiness +is.empty <- function(df) { + (is.null(df) || nrow(df) == 0 || ncol(df) == 0) +} + #' Calculate mode(s) of data #' From ec14f5f9386c72096bf8ad978139488c6afe4ae5 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Thu, 12 Nov 2020 14:00:07 -0500 Subject: [PATCH 09/12] add function to confirm filter_paired_fq.py has completed before compression --- pipelines/pepatac.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index c5db7b23..f07c9c0e 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -981,6 +981,24 @@ def check_trim(): to_compress.append(unmap_fq2) pm.timestamp("### Compress all unmapped read files") + # Confirm pairing is complete + def check_pairing(fq1, fq2): + wc1 = ngstk.count_reads(fq1, args.paired_end) + wc2 = ngstk.count_reads(fq2, args.paired_end) + pm.debug("wc1: {}".format(str(wc1))) + pm.debug("wc2: {}".format(str(wc2))) + pm.debug("Return value: {}".format(str(wc1 == wc2))) + return wc1 == wc2 + + if args.paired_end: + checks = 1 + while not check_pairing(unmap_fq1, unmap_fq2) and checks < 100: + checks += 1 + pm.debug("Check count: {}".format(str(checks))) + if checks > 100 and not check_pairing(unmap_fq1, unmap_fq2): + err_msg = ("Fastq filter_paired_fq.pl function did not complete " + "successfully. Try running the pipeline with `--keep`.") + pm.fail_pipeline(IOError(err_msg)) for unmapped_fq in to_compress: # Compress unmapped fastq reads if not pypiper.is_gzipped_fastq(unmapped_fq) and not unmapped_fq == '': From 40238cda85d6520f8e37a18237dbbacbb89cdb15 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 16 Nov 2020 09:37:08 -0500 Subject: [PATCH 10/12] update function for confirming fixed pairs is compelte --- pipelines/pepatac.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index f07c9c0e..35dd61a9 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -5,7 +5,7 @@ __author__ = ["Jin Xu", "Nathan Sheffield", "Jason Smith"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.9.9" +__version__ = "0.9.10" from argparse import ArgumentParser @@ -991,14 +991,17 @@ def check_pairing(fq1, fq2): return wc1 == wc2 if args.paired_end: - checks = 1 - while not check_pairing(unmap_fq1, unmap_fq2) and checks < 100: - checks += 1 - pm.debug("Check count: {}".format(str(checks))) - if checks > 100 and not check_pairing(unmap_fq1, unmap_fq2): - err_msg = ("Fastq filter_paired_fq.pl function did not complete " - "successfully. Try running the pipeline with `--keep`.") - pm.fail_pipeline(IOError(err_msg)) + if (not pypiper.is_gzipped_fastq(unmap_fq1) and + not pypiper.is_gzipped_fastq(unmap_fq2)): + checks = 1 + while not check_pairing(unmap_fq1, unmap_fq2) and checks < 100: + checks += 1 + pm.debug("Check count: {}".format(str(checks))) + if checks > 100 and not check_pairing(unmap_fq1, unmap_fq2): + err_msg = ("Fastq filter_paired_fq.pl function did not " + "complete successfully. Try running the pipeline " + "with `--keep`.") + pm.fail_pipeline(IOError(err_msg)) for unmapped_fq in to_compress: # Compress unmapped fastq reads if not pypiper.is_gzipped_fastq(unmapped_fq) and not unmapped_fq == '': From 6fe176028830e8df2f3b7cfab173dba1517ea9fa Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 16 Nov 2020 21:29:08 -0500 Subject: [PATCH 11/12] update version and docs --- docs/changelog.md | 8 ++++++++ docs/usage.md | 2 +- pipelines/pepatac.py | 4 ++-- usage.txt | 2 +- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 932e74a8..1e4431a7 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,14 @@ # Change log All notable changes to this project will be documented in this file. + +## [0.9.10] -- 2020-11-16 + +### Changed + - Added check for rare lagging filter_paired_fq.pl command + - Fixed calcFRiF bug on zero coverage files + - Updated BiocProject examples + ## [0.9.9] -- 2020-11-02 ### Changed diff --git a/docs/usage.md b/docs/usage.md index e9040a1d..7b19ecd8 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -22,7 +22,7 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--motif] [--sob] [--no-scale] [--prioritize] [--keep] [--noFIFO] [--lite] [--skipqc] [-V] -PEPATAC version 0.9.9 +PEPATAC version 0.9.10 optional arguments: -h, --help show this help message and exit diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 35dd61a9..f4ca857b 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -565,7 +565,7 @@ def main(): ############################################################################ # Confirm required tools are all callable # ############################################################################ - opt_tools = ["fseq", "genrich", "${HMMRATAC}", "${PICARD}", + opt_tools = ["fseq", "Genrich", "${HMMRATAC}", "${PICARD}", "${TRIMMOMATIC}", "pyadapt", "findMotifsGenome.pl", "findPeaks", "seqOutBias", "bigWigMerge", "bedGraphToBigWig", "pigz", "bwa"] @@ -605,7 +605,7 @@ def main(): if 'fseq' in opt_tools: opt_tools.remove('fseq') if args.peak_caller == "genrich": - if 'genrich' in opt_tools: opt_tools.remove('genrich') + if 'Genrich' in opt_tools: opt_tools.remove('Genrich') if args.peak_caller == "hmmratac": if '${HMMRATAC}' in opt_tools: opt_tools.remove('${HMMRATAC}') diff --git a/usage.txt b/usage.txt index 60d598ad..c7b6ba4c 100644 --- a/usage.txt +++ b/usage.txt @@ -14,7 +14,7 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--motif] [--sob] [--no-scale] [--prioritize] [--keep] [--noFIFO] [--lite] [--skipqc] [-V] -PEPATAC version 0.9.9 +PEPATAC version 0.9.10 optional arguments: -h, --help show this help message and exit From 089f9a2c9cfe3b3434f28b9165dcea51b839c4a0 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Mon, 16 Nov 2020 21:29:39 -0500 Subject: [PATCH 12/12] update bulker crate --- sample_pipeline_interface.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 238b624e..e67213e1 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -36,7 +36,7 @@ command_template: > compute: singularity_image: ${SIMAGES}pepatac docker_image: databio/pepatac - bulker_crate: databio/pepatac:1.0.4 + bulker_crate: databio/pepatac:1.0.5 size_dependent_variables: resources-sample.tsv bioconductor: