Skip to content

Commit

Permalink
add script to find overlapping credible set snps #29
Browse files Browse the repository at this point in the history
fix definition of input and output files
  • Loading branch information
dariushghasemi committed Dec 5, 2024
1 parent 3bc1149 commit a3ec8a4
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 6 deletions.
37 changes: 31 additions & 6 deletions workflow/rules/coloc.smk
Original file line number Diff line number Diff line change
@@ -1,17 +1,42 @@

rule make_file:
rule master_file:
input:
expand(ws_path("cojo/{seqid}.sentinel"), seqid=analytes.seqid),
ws_path("cojo/{seqid}.sentinel"),
output:
sentinel = ws_path("coloc/{seqid}_master_file.sentinel"),
params:
info = ws_path("cojo/{seqid}/finemaping/*_coloc_info_table.tsv"),
ofile = ws_path("coloc/master_coloc.txt"),
resources:
runtime=lambda wc, attempt: 999 + attempt * 60,
shell:
"""
cat {params.info} >> {params.ofile}
touch {output.sentinel}
"""


rule merry_go_round:
input:
sentinel = expand(ws_path("coloc/{seqid}_master_file.sentinel"), seqid=analytes.seqid),
output:
sentinel = ws_path("coloc/chr{chr_cs}.sentinel"),
params:
outdir = config.get("workspace_path")
pairs = ws_path("coloc/chr{chr_cs}_coloc_pairwise_guide_table.tsv"),
info = ws_path("coloc/master_coloc.txt"),
chr = "{chr_cs}",
conda:
"../envs/coloc.yml"
log:
ws_path("logs/coloc/master_file.log"),
ws_path("logs/coloc/chr{chr_cs}_coloc_pairwise.log"),
resources:
runtime=lambda wc, attempt: 999 + attempt * 60,
shell:
"""
cat {params.outdir}/cojo/{seqid}/finemaping/*_coloc_info_table.tsv >> {output.ofile}
Rscript workflow/scripts/s05_find_overlapping_cs.R \
--coloc_info_table {params.info} \
--chr_cs {params.chr} \
--ofile {params.pairs}
touch {output.sentinel}
"""

118 changes: 118 additions & 0 deletions workflow/scripts/s05_find_overlapping_cs.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
suppressMessages(library(optparse))
suppressMessages(library(tidyverse))
suppressMessages(library(data.table))
suppressMessages(library(Matrix))

# Get arguments specified in the sbatch
option_list <- list(
#make_option("--pipeline_path", default=NULL, help="Path where Rscript lives"),
make_option("--coloc_info_table", default=NULL, help="Path and filename of master coloc table produced by individual traits pre-processing"),
# make_option("--coloc_id", default=NULL, help="Id code to univocally identify the colocalisation analysis"),
make_option("--chr_cs", default=NULL, help="Chromosomes to split the analysis in"),
# make_option("--chunk_size", default=NULL, help="Row number of each chunk the output file will be broken into")
make_option("--ofile", default=NULL, help="Output filename")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);

# column names of coloc info table files
headers = c("phenotype_id", "credible_set", "path_rds", "path_ind_snps")

# List all cs variants
tb <- data.table::fread(opt$coloc_info_table, header = F, col.names = headers) %>%
dplyr::mutate(chr = stringr::str_extract(credible_set, "(\\d+)")) %>%
dplyr::filter(chr==opt$chr_cs)

nrows_tb <- nrow(tb)

# Filter by chr
tb <- tb %>% dplyr::mutate(cs_name=paste0("cs", seq(1,nrows_tb)))

cs_list <- strsplit(tb$credible_set, ",", fixed=TRUE) # If TRUE match split exactly, otherwise use regular expressions - hoping is more memory efficient! https://stackoverflow.com/questions/55919893/more-memory-efficient-way-than-strsplit-to-split-a-string-into-two-in-r
names(cs_list) <- tb$cs_name

# Get unique elements present in all text vectors
all_elements <- unique(unlist(cs_list))

# Check that there's at list a case of overlapping! Otherwise no point in performing all this - FOR THE MOMENT. Still useful to add it to the massive matrix? But will decide later

if(length(unlist(cs_list)) > length(all_elements)){

element_indices <- setNames(seq_along(all_elements), all_elements)

# Create a sparse matrix with zeros
matrix_sparse <- Matrix::Matrix(
0,
nrow = length(cs_list),
ncol = length(all_elements),
sparse = TRUE,
dimnames = list(names(cs_list), all_elements)
)


# Create a matrix of indices for each row
row_indices <- lapply(cs_list, function(vec) {
element_indices[vec]
})


matrix_sparse[cbind(rep(seq_along(cs_list), lengths(row_indices)), unlist(row_indices))] <- 1

# Fill in the matrix with 1 where the text vector contains the element

# Multiply the transpose of the matrix with itself
start <- Sys.time()
matrix_product <- matrix_sparse %*% Matrix::t(matrix_sparse)


# Set the diagonal to zero (as each vector will have 1s on diagonal)
diag(matrix_product) <- 0

shared_elements <- Matrix::which(matrix_product > 0, arr.ind = TRUE)
shared_elements_df <- as.data.frame(shared_elements)

# Map row and column indices to credible set names
shared_elements_df$row <- rownames(shared_elements)
shared_elements_df$col <- colnames(matrix_product)[shared_elements_df$col]

# Keep unique pairs
shared_elements_unique <- unique(Matrix::t(apply(shared_elements_df, 1, sort)))
colnames(shared_elements_unique) <- c("t1", "t2")

# Retrieve trait info
coloc_combo <- merge(
shared_elements_unique,
tb %>% dplyr::select(-credible_set),
by.x = "t1",
by.y = "cs_name"
)

coloc_combo <- merge(
coloc_combo,
tb %>% dplyr::select(-credible_set),
by.x = "t2",
by.y = "cs_name",
suffixes = c("_t1", "_t2")
)

# Remove pair testing different conditional dataset for the same trait (study_id + phenotype_id)
coloc_combo <- coloc_combo %>%
dplyr::filter(phenotype_id_t1 != phenotype_id_t2) %>%
dplyr::select(-t1, -t2, -chr_t1, -chr_t2) %>%
dplyr::rename(
t1_phenotype_id = phenotype_id_t1,
t2_phenotype_id = phenotype_id_t2,
t1_path_rds = path_rds_t1,
t2_path_rds = path_rds_t2,
t1_path_ind_snps = path_ind_snps_t1,
t2_path_ind_snps = path_ind_snps_t2
)

end <- Sys.time()
cat("\nAnalysis time: ",end-start)

cat("\nsaving results to: ", paste0(opt$ofile))

# Save the table as input of next rule
data.table::fwrite(coloc_combo, file = paste0(opt$ofile), quote=F, na=NA, sep="\t")
}

0 comments on commit a3ec8a4

Please sign in to comment.