Get rid of Hardcoded paths #23

Open · wants to merge 8 commits into base: master
Changes from 7 commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,5 +1,7 @@
generated_datasets
pipeline/convert_step2/liftover/annotations.pkl
pipeline/convert_step2/liftover/Homo_sapiens.GRCh38.113.gff3
utils/__pycache__/
.*/
../downloads
*.log
4 changes: 3 additions & 1 deletion pipeline/config.json
Contributor:
"/data/shared/repos/biomuta-old/downloads" is a symlink to "/data/shared/biomuta/downloads", and "/data/shared/repos/biomuta-old/generated_datasets" is a symlink to "/data/shared/biomuta/generated/datasets".

@@ -1,10 +1,12 @@
{
"relevant_paths": {
"repos_generated_datasets": "/data/shared/repos/biomuta-old/generated_datasets",
"repos_downloads": "/data/shared/repos/biomuta-old/downloads",
"downloads": "/data/shared/biomuta/downloads",
"generated_datasets": "/data/shared/biomuta/generated/datasets",
"mapping": "/data/shared/biomuta/pipeline/convert_step2/mapping"
},
"resource_init": {
"cbioportal": ["subfolder1", "subfolder2"]
}
}
}
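Since the scripts below all read these keys at startup, a small sanity check that every configured path exists on the current machine might look like this sketch (the config_path shown here is illustrative; adjust it to wherever the check lives relative to pipeline/config.json):

```python
import json
from pathlib import Path

# Hypothetical location: assumes this check sits next to pipeline/config.json
config_path = Path(__file__).resolve().parent / "config.json"

with open(config_path) as config_file:
    config = json.load(config_file)

# Report any configured path that does not exist
for name, path in config["relevant_paths"].items():
    marker = "found" if Path(path).exists() else "MISSING"
    print(f"{name}: {path} ({marker})")
```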
@@ -5,12 +5,12 @@
from pathlib import Path
from typing import Optional

# Configure logging
logging.basicConfig(filename="cancer_mapping.log",
filemode='a',
format='%(asctime)s %(levelname)s %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
level=logging.INFO)

# Logging levels
# 1. error
# 2. warning
@@ -27,24 +27,27 @@
#from utils import ROOT_DIR #__init__.py, ROOT_DIR is a global var

# Define paths

# Get the directory of this script
script_dir = Path(__file__).resolve().parent
# Navigate to config.json location relative to script
config_dir = script_dir.parent.parent
# Load config
with open(config_dir/'config.json') as config_file:

# Load config.json
with open(config_dir / 'config.json') as config_file:
config = json.load(config_file)

# Access paths from config

mapping_dir = Path(config["relevant_paths"]["mapping"])
doid_mapping = mapping_dir / "combined_do_mapping.json"
fallback_do_map = mapping_dir / "fallback_cbio_doid_mapping.json"

# Input and output file names
# Get the latest directory
directory_path = Path(config["relevant_paths"]["generated_datasets"])
latest_dir = max([d for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))], key=lambda d: os.path.getctime(os.path.join(directory_path, d)))
latest_dir = Path(directory_path) / latest_dir
# Get the latest directory in generated_datasets
generated_datasets_dir = Path(config["relevant_paths"]["generated_datasets"])
latest_dir = max([d for d in os.listdir(generated_datasets_dir) if os.path.isdir(os.path.join(generated_datasets_dir, d))], key=lambda d: os.path.getctime(os.path.join(generated_datasets_dir, d)))
latest_dir = Path(generated_datasets_dir) / latest_dir

def ask_confirmation(prompt):
while True:
user_input = input(f"{prompt} (y/n): ").strip().lower()
@@ -54,11 +57,12 @@ def ask_confirmation(prompt):
return False
else:
print(f"Invalid input. Please enter 'y' for yes or 'n' for no.")

if ask_confirmation(f"The latest created directory is: {latest_dir}. Proceed?"):
input_file = Path(latest_dir) / "unique_cancer_names.txt"
cancer_types_with_do = Path(latest_dir) / "cancer_types_with_do.json"
cancer_type_per_study = Path(latest_dir) / "cancer_type_per_study.json"
study_ids_with_do = Path(latest_dir) / "study_ids_with_do.json"
input_file = latest_dir / "unique_cancer_names.txt"
cancer_types_with_do = latest_dir / "cancer_types_with_do.json"
cancer_type_per_study = latest_dir / "cancer_type_per_study.json"
study_ids_with_do = latest_dir / "study_ids_with_do.json"
print(f"Using {latest_dir}/unique_cancer_names.txt and writing out to {latest_dir}/cancer_types_with_do.json")
else:
sys.exit("Aborted by user.")
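The "latest directory" lookup above (and the "Write a util to get latest dir" note later in this PR) could eventually live in a shared helper. A minimal pathlib-based sketch, with an illustrative name, might be:

```python
from pathlib import Path

def get_latest_dir(parent: Path) -> Path:
    """Return the most recently created subdirectory of `parent`."""
    subdirs = [d for d in parent.iterdir() if d.is_dir()]
    if not subdirs:
        raise FileNotFoundError(f"No subdirectories found in {parent}")
    # st_ctime mirrors the os.path.getctime criterion used above
    return max(subdirs, key=lambda d: d.stat().st_ctime)

# Example (path taken from config.json above):
# latest_dir = get_latest_dir(Path(config["relevant_paths"]["generated_datasets"]))
```

Sorting by the dated directory names (e.g. 2024_10_22) would be an alternative if creation times turn out to be unreliable on the shared filesystem.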
@@ -1,8 +1,12 @@
#!/bin/bash

# Load config.json and extract paths
config_file="$(dirname "$(dirname "$(realpath "$0")")")/config.json"
repos_generated_datasets=$(jq -r '.relevant_paths.repos_generated_datasets' "$config_file")

# Input and output file paths
input_csv="/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/chr_pos_to_ensp.csv" # Input CSV file
output_json="/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings.json" # Output JSON file
input_csv="$repos_generated_datasets/2024_10_22/mapping_ids/chr_pos_to_ensp.csv" # Input CSV file
output_json="$repos_generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings.json" # Output JSON file

batch_size=5000 # Number of ENSP IDs per batch (adjustable)

@@ -1,12 +1,25 @@
import json
import pandas as pd
from pathlib import Path

# Load config.json
config_path = Path(__file__).resolve().parent.parent / "config.json"
with open(config_path, "r") as config_file:
config = json.load(config_file)

# Retrieve paths from updated config
repos_generated_datasets = Path(config["relevant_paths"]["repos_generated_datasets"])
repos_downloads = Path(config["relevant_paths"]["repos_downloads"])
isoform_data_path = repos_downloads / "glygen/human_protein_masterlist.csv"
ensp_to_uniprot_path = repos_generated_datasets / "2024_10_22/mapping_ids/ensp_to_uniprot_mappings_toy.json"
canonical_toy_output_path = repos_generated_datasets / "2024_10_22/mapping_ids/canonical_toy.json"

# Load the ENSP to UniProt mapping JSON
with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/ensp_to_uniprot_mappings.json", "r") as f:
with open(ensp_to_uniprot_path, "r") as f:
ensp_to_uniprot = json.load(f)

# Load the isoform data CSV
isoform_data = pd.read_csv("/data/shared/repos/biomuta-old/downloads/glygen/human_protein_masterlist.csv")
isoform_data = pd.read_csv(isoform_data_path)

# Prepare a dictionary to store the results
result = {}
@@ -19,6 +32,9 @@ def strip_suffix(identifier):

# Iterate over each ENSP and its corresponding UniProt ID
for ensp, uniprot in ensp_to_uniprot.items():
# Default to "no" for canonical until proven otherwise
is_canonical = "no"

# Check for matching UniProt IDs in either reviewed_isoforms or unreviewed_isoforms
for _, entry in isoform_data.iterrows():
# Strip suffixes from isoform IDs before comparison
@@ -29,10 +45,15 @@ def strip_suffix(identifier):
if uniprot == reviewed_isoforms or uniprot == unreviewed_isoforms:
# If a match is found, add the uniprotkb_canonical_ac to the result
uniprotkb_ac = strip_suffix(entry.get("uniprotkb_canonical_ac"))
# Store the first match found for each ENSP identifier
result[ensp] = uniprotkb_ac

# Check if the UniProt ID matches the canonical version
if uniprot == uniprotkb_ac:
is_canonical = "yes"

# Store the result with canonical status
result[ensp] = {"uniprotkb_canonical_ac": uniprotkb_ac, "canonical": is_canonical}
break # Exit inner loop once the first match is found

# Write the result to a JSON file
with open("/data/shared/repos/biomuta-old/generated_datasets/2024_10_22/mapping_ids/canonical.json", "w") as json_file:
with open(canonical_toy_output_path, "w") as json_file:
json.dump(result, json_file, indent=4)
47 changes: 47 additions & 0 deletions pipeline/convert_step2/cbioportal/4_compare_fasta.py
@@ -0,0 +1,47 @@
import requests
import json
from pathlib import Path

# Load config.json
config_path = Path(__file__).resolve().parent.parent.parent / "config.json" # pipeline/config.json, three levels up from cbioportal/
with open(config_path, "r") as config_file:
config = json.load(config_file)

# Retrieve paths from config.json
repos_generated_datasets = Path(config["relevant_paths"]["repos_generated_datasets"])
ensembl_uniprot_map_path = repos_generated_datasets / "2024_10_22/mapping_ids/canonical_toy.json"

# Load your JSON file containing ENSEMBL to UniProt mappings
with open(ensembl_uniprot_map_path, "r") as file:
ensembl_uniprot_map = json.load(file)

def fetch_ensembl_sequence(ensembl_id): # Fetch ENSEMBL sequence
url = f"https://rest.ensembl.org/sequence/id/{ensembl_id}?content-type=text/plain"
response = requests.get(url)
if response.status_code == 200:
return response.text.strip()
else:
print(f"Failed to fetch ENSEMBL sequence for {ensembl_id}")
return None

def fetch_uniprot_sequence(uniprot_id): # Fetch UniProt sequence
url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
response = requests.get(url)
if response.status_code == 200:
return response.text.split('\n', 1)[1].replace('\n', '')
else:
print(f"Failed to fetch UniProt sequence for {uniprot_id}")
return None

# Compare sequences
for ensembl_id, entry in ensembl_uniprot_map.items():
uniprot_id = entry["uniprotkb_canonical_ac"] if isinstance(entry, dict) else entry # canonical_toy.json now stores a dict per ENSP (see the canonical-mapping step above)
ensembl_sequence = fetch_ensembl_sequence(ensembl_id)
uniprot_sequence = fetch_uniprot_sequence(uniprot_id)

if ensembl_sequence and uniprot_sequence:
if ensembl_sequence == uniprot_sequence:
print(f"Sequences match for {ensembl_id} and {uniprot_id}")
else:
print(f"Sequences do not match for {ensembl_id} and {uniprot_id}")
else:
print(f"Could not compare sequences for {ensembl_id} and {uniprot_id}")
12 changes: 12 additions & 0 deletions pipeline/convert_step2/cbioportal/compare_fasta_test.py
@@ -0,0 +1,12 @@
import requests

def fetch_uniprot_sequence(uniprot_id):
url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
response = requests.get(url)
if response.status_code == 200:
return response.text.split('\n', 1)[1].replace('\n', '')
else:
print(f"Failed to fetch UniProt sequence for {uniprot_id}")
return None

print(fetch_uniprot_sequence('Q9NXB0'))
21 changes: 12 additions & 9 deletions pipeline/convert_step2/liftover/2_liftover.sh
mariacuria marked this conversation as resolved.
@@ -1,25 +1,28 @@
#!/bin/bash

wd="/data/shared/biomuta/generated/datasets/2024_10_22/liftover"
# Load paths from config.json using jq
config_file="/path/to/config.json" # Replace with the actual path to your config.json
generated_datasets=$(jq -r '.relevant_paths.generated_datasets' $config_file)
liftover_dir="${generated_datasets}/2024_10_22/liftover"

# Extract rows with GRCh38 and save as tab-separated:
awk '$5 == "GRCh38"' ${wd}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${wd}/cbio_hg38.bed
awk '$5 == "GRCh38"' ${liftover_dir}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${liftover_dir}/cbio_hg38.bed
# Check if the extraction and modification were successful
if [ $? -ne 0 ]; then
echo "Error extracting or modifying rows for GRCh38."
exit 1
fi

# Extract all other rows (where the 5th column is not GRCh38) and save as tab-separated:
awk '$5 != "GRCh38"' ${wd}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${wd}/hg19entrez_protChange.bed
awk '$5 != "GRCh38"' ${liftover_dir}/hg19entrez_build_protChange.bed | awk '{OFS="\t"; $5=""; $1=$1; print}' > ${liftover_dir}/hg19entrez_protChange.bed
# Check if the extraction and modification were successful
if [ $? -ne 0 ]; then
echo "Error extracting or modifying non-GRCh38 rows."
exit 1
fi

# Run liftOver to convert coordinates (first chain)
./liftOver ${wd}/hg19entrez_protChange.bed ucscHg19ToHg38.over.chain ${wd}/ucsc_hg38entrez_protChange.bed ${wd}/ucsc_unmapped_entrez_protChange.bed
./liftOver ${liftover_dir}/hg19entrez_protChange.bed ucscHg19ToHg38.over.chain ${liftover_dir}/ucsc_hg38entrez_protChange.bed ${liftover_dir}/ucsc_unmapped_entrez_protChange.bed

# Check if the first liftOver was successful
if [ $? -ne 0 ]; then
@@ -28,7 +31,7 @@ if [ $? -ne 0 ]; then
fi

# Run liftOver to convert coordinates (second chain)
./liftOver ${wd}/ucsc_unmapped_entrez_protChange.bed ensembl_GRCh37_to_GRCh38.chain ${wd}/ensembl_hg38entrez_protChange.bed ${wd}/ensembl_unmapped_entrez_protChange.bed
./liftOver ${liftover_dir}/ucsc_unmapped_entrez_protChange.bed ensembl_GRCh37_to_GRCh38.chain ${liftover_dir}/ensembl_hg38entrez_protChange.bed ${liftover_dir}/ensembl_unmapped_entrez_protChange.bed

# Check if the second liftOver was successful
if [ $? -ne 0 ]; then
@@ -37,14 +40,14 @@ if [ $? -ne 0 ]; then
fi

# Prepend 'chr' to the 1st column of ensembl_hg38entrez_protChange.bed
sed 's/^\([a-zA-Z0-9]*\)/chr\1/' ${wd}/ensembl_hg38entrez_protChange.bed > ${wd}/temp && mv ${wd}/temp ${wd}/ensembl_hg38entrez_protChange.bed
sed 's/^\([a-zA-Z0-9]*\)/chr\1/' ${liftover_dir}/ensembl_hg38entrez_protChange.bed > ${liftover_dir}/temp && mv ${liftover_dir}/temp ${liftover_dir}/ensembl_hg38entrez_protChange.bed

# Combine all hg38 files
cat ${wd}/cbio_hg38.bed ${wd}/ucsc_hg38entrez_protChange.bed ${wd}/ensembl_hg38entrez_protChange.bed > ${wd}/hg38_combined.bed
cat ${liftover_dir}/cbio_hg38.bed ${liftover_dir}/ucsc_hg38entrez_protChange.bed ${liftover_dir}/ensembl_hg38entrez_protChange.bed > ${liftover_dir}/hg38_combined.bed
# Remove duplicate rows taking into account extra tabs
awk -v OFS='\t' '{$1=$1; print}' ${wd}/hg38_combined.bed | sort -u > ${wd}/temp && mv ${wd}/temp ${wd}/hg38_combined.bed
awk -v OFS='\t' '{$1=$1; print}' ${liftover_dir}/hg38_combined.bed | sort -u > ${liftover_dir}/temp && mv ${liftover_dir}/temp ${liftover_dir}/hg38_combined.bed
# Add headers with tab separation
sed -i '1i chr_id\tstart_pos\tend_pos\tentrez_gene_id\tprot_change' ${wd}/hg38_combined.bed
sed -i '1i chr_id\tstart_pos\tend_pos\tentrez_gene_id\tprot_change' ${liftover_dir}/hg38_combined.bed

echo "Script completed successfully."

112 changes: 112 additions & 0 deletions pipeline/convert_step2/liftover/3_get_ensp_ver2_toy.py
@@ -0,0 +1,112 @@
import csv
import pickle
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parent.parent.parent.parent))
from utils import ROOT_DIR
from utils.config import get_config

# Load input_bed and ann
# Set input directory for JSON files and output file for BED format
config_obj = get_config()
wd = Path(config_obj["relevant_paths"]["generated_datasets"])
input_bed = wd / '2024_10_22' / 'liftover' / 'hg38_combined_toy.bed' # Write a util to get latest dir
ann_dir = Path(config_obj["relevant_paths"]["downloads"])
ann = ann_dir / 'ensembl' / 'Homo_sapiens.GRCh38.113.gff3' # GFF file with genomic features
output_file = wd / '2024_10_22' / 'mapping_ids' / 'chr_pos_to_ensp_toy.csv'


# Step 1: Load and Serialize 'ann' Annotations
def parse_and_serialize_ann():
annotations = {}
print("Parsing annotations from ann...")
with open(ann, 'r') as f:
for line in f:
if not line.startswith('#'):
fields = line.strip().split('\t')
chrom = 'chr' + fields[0] # Add 'chr' prefix to match format in input_bed
feature_start = int(fields[3])
feature_end = int(fields[4])
if chrom not in annotations:
annotations[chrom] = []
annotations[chrom].append((feature_start, feature_end, line.strip()))
with open('annotations.pkl', 'wb') as f:
pickle.dump(annotations, f)
print("Serialized annotations to 'annotations.pkl'")

# Step 2: Load 'ann' from Serialized File
def load_annotations():
with open('annotations.pkl', 'rb') as f:
return pickle.load(f)

# Step 3: Process 'input_bed' and Write Results in Batches
def process_large_input_bed():
# Load serialized annotations
annotations = load_annotations()

# Open output CSV file
with open(output_file, 'w', newline='') as csvfile:
# Define the new headers for the output file
writer = csv.writer(csvfile)
writer.writerow(['chr_id', 'start_pos', 'end_pos', 'entrez_gene_id', 'prot_change', 'ENSP'])

batch = []
batch_size = 10000 # Define batch size for writing

print("Processing SNP positions from input_bed and writing to CSV...")

with open(input_bed, 'r') as f:
# Skip the header line (if the first line is a header)
header_skipped = False

for i, line in enumerate(f, start=1):
# Skip the header line
if not header_skipped:
header_skipped = True
continue # Skip the header

fields = line.strip().split('\t')

# Check that the necessary fields are numeric before proceeding
try:
start = int(fields[1]) # start_pos
end = int(fields[2]) # end_pos
except ValueError:
print(f"Skipping invalid line {i}: {line.strip()}")
continue # Skip lines where start or end position is not numeric

chrom = fields[0] # chr_id
entrez = fields[3] # entrez_gene_id
prot_change = fields[4] # prot_change

# Find matching annotations
if chrom in annotations:
for feature_start, feature_end, annotation in annotations[chrom]:
if start >= feature_start and end <= feature_end:
ensp = None
for field in annotation.split(';'):
if 'protein_id=' in field:
ensp = field.split('=')[1]
break
if ensp:
# Add match to batch with renamed fields
batch.append([chrom, start, end, entrez, prot_change, ensp])

# Write batch to file every 'batch_size' records
if len(batch) >= batch_size:
writer.writerows(batch)
batch.clear() # Clear batch after writing to file
print(f"Processed {i} lines so far...") # Status update

# Write remaining entries in the batch
if batch:
writer.writerows(batch)
print("Wrote remaining records to file.")

print(f"Process completed. Results written to {output_file}")


# Run the workflow
if not Path('annotations.pkl').exists():
    parse_and_serialize_ann() # Create the serialized annotations file only when it does not already exist
process_large_input_bed() # Process large 'input_bed' and write results