Merge pull request #4323 from vgteam/lr-giraffe
Long Read Giraffe
adamnovak authored Jan 13, 2025
2 parents 50952bb + 6eaa72b commit dc847a9
Showing 125 changed files with 26,979 additions and 5,394 deletions.
2 changes: 1 addition & 1 deletion deps/dozeu
Submodule dozeu updated 2 files
+87 −8 dozeu.h
+65 −58 example.c
2 changes: 1 addition & 1 deletion deps/gbwtgraph
2 changes: 1 addition & 1 deletion deps/sublinear-Li-Stephens
4 changes: 4 additions & 0 deletions doc/publish-docs.sh
@@ -22,6 +22,10 @@ COMMIT_AUTHOR_EMAIL="[email protected]"
# We expect GITLAB_SECRET_FILE_DOCS_SSH_KEY to come in from the environment,
# specifying the private deploy key we will use to get at the docs repo.

# Make sure no submodules have untracked files from caching
# See <https://gist.github.com/nicktoumpelis/11214362#file-repo-rinse-sh-L2>
git submodule foreach --recursive git clean -xfd

# Find all the submodules that Doxygen wants to look at and make sure we have
# those.
cat Doxyfile | grep "^INPUT *=" | cut -f2 -d'=' | tr ' ' '\n' | grep "^ *deps" | sed 's_ *\(deps/[^/]*\).*_\1_' | sort | uniq | xargs -n 1 git submodule update --init --recursive
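
For readability, here is a rough Python sketch of what that pipeline does (an illustration only; the shell line above is the real logic, and the Doxyfile path and INPUT format are assumptions taken from it):

import re

# Sketch of the pipeline above: find the INPUT = line in the Doxyfile,
# keep the entries under deps/, and reduce each one to its top-level
# submodule directory before updating it.
submodules = set()
with open("Doxyfile") as doxyfile:
    for line in doxyfile:
        if re.match(r"^INPUT *=", line):
            for token in line.split("=", 1)[1].split():
                match = re.match(r"(deps/[^/]*)", token)
                if match:
                    submodules.add(match.group(1))
for submodule in sorted(submodules):
    print("git submodule update --init --recursive " + submodule)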
2 changes: 1 addition & 1 deletion doc/wiki
Submodule wiki updated from 9c76a6 to 0e573e
60 changes: 52 additions & 8 deletions scripts/giraffe-facts.py
@@ -98,8 +98,10 @@ def parse_args(args):
parser = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)

parser.add_argument("--input", type=argparse.FileType('r'), default=sys.stdin,
help="line-oriented JSON GAM to process")
parser.add_argument("input", type=str,
help="GAM to process")
parser.add_argument("--vg", type=str, default="vg",
help="vg binary to use")
parser.add_argument("outdir",
help="directory to place output in")

@@ -286,11 +288,44 @@ def add_in_stats(destination, addend):

def read_line_oriented_json(lines):
"""
For each line in the given stream, yield it as a parsed JSON object.
For each line in the given iterable of lines (such as a stream), yield it as a parsed JSON object.
"""

for line in lines:
yield json.loads(line)
line = line.strip()
if len(line) > 0:
yield json.loads(line)


def read_read_views(vg, filename):
"""
Given a vg binary and a filename, iterate over subsets of the parsed read dicts for each read in the file.
The subsets will have the annotation and time_used fields.
"""

# Extract just the annotations and times of reads, as a TSV (with the annotation column as JSON) with a # header
# We don't know all the annotation field names in advance so we have to dump them all.
filter_process = subprocess.Popen([vg, "filter", "--tsv-out", "annotation;time_used", filename], stdout=subprocess.PIPE)

lines = iter(filter_process.stdout)
# Drop header line
next(lines)

for line in lines:
# Parse the TSV and reconstruct a view of the full read dict.
line = line.decode('utf-8')
line = line.strip()
if len(line) == 0:
continue
parts = line.split("\t")
assert len(parts) == 2
read = {"annotation": json.loads(parts[0]), "time_used": float(parts[1])}

yield read

return_code = filter_process.wait()
assert return_code == 0
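
For illustration, a minimal self-contained sketch of the parsing read_read_views does, run on a hypothetical two-column TSV of the kind vg filter --tsv-out "annotation;time_used" is asked for above (the field names and values here are made up):

import io
import json

# Hypothetical stand-in for filter_process.stdout: a # header line, then
# one tab-separated annotation-JSON / time_used pair per read.
fake_stdout = io.BytesIO(
    b"#annotation\ttime_used\n"
    b'{"fragment_length": 295}\t0.000331\n'
    b'{"fragment_length": 301}\t0.000287\n'
)

lines = iter(fake_stdout)
next(lines)  # drop the header line, as read_read_views() does
for line in lines:
    parts = line.decode("utf-8").strip().split("\t")
    read = {"annotation": json.loads(parts[0]), "time_used": float(parts[1])}
    print(read)
# {'annotation': {'fragment_length': 295}, 'time_used': 0.000331}
# {'annotation': {'fragment_length': 301}, 'time_used': 0.000287}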

class Table(object):
"""
@@ -916,11 +951,20 @@ def main(args):
# Count all the reads
read_count = 0

# Record mapping parameters from at least one read
# Record mapping parameters from special magic GAM chunk, if any, or a read
params = None

for read in read_line_oriented_json(options.input):


# Get the params from a magic chunk.
# TODO: This is a whole pass through a possibly big file!
params_json = subprocess.check_output([options.vg, "view", "--extract-tag", "PARAMS_JSON", "--first", options.input]).decode('utf-8')
lines = params_json.split("\n")
for parsed_params in read_line_oriented_json(lines):
if params is None:
params = parsed_params

for read in read_read_views(options.vg, options.input):
# For the data we need on each read

if params is None:
# Go get the mapping parameters
params = sniff_params(read)
2 changes: 1 addition & 1 deletion scripts/giraffe-wrangler.sh
@@ -195,7 +195,7 @@ if [[ ! -z "${SIM_GAM}" ]] ; then

# Compute loss stages
# Let giraffe-facts.py error out on failure
vg view -aj "${WORK}/mapped.gam" | scripts/giraffe-facts.py "${WORK}/facts" >"${WORK}/facts.txt"
scripts/giraffe-facts.py "${WORK}/mapped.gam" "${WORK}/facts" >"${WORK}/facts.txt"
fi

if [[ ! -z "${REAL_FASTQ}" ]] ; then
93 changes: 57 additions & 36 deletions scripts/make_pbsim_reads.sh
@@ -1,7 +1,6 @@
#!/usr/bin/env bash
# make_pbsim_reads.sh: script to simulate reads with pbsim2.
# Mostly theoretical; records commands that would have worked better than what was actually run
# Intended to run on UCSC Courtyard/Plaza systems
# Intended to run on UCSC behind-the-firewall systems
# You may also need to CFLAGS=-fPIC pip3 install --user bioconvert

set -ex
@@ -10,27 +9,34 @@ set -ex
# You can set these in the environment to override them and I don't have to write a CLI option parser.
# See https://stackoverflow.com/a/28085062

# Graph to simulate from. Can be S3 URLs or local file paths.
# Graph to simulate from. Can be S3 URLs or local file paths. If GRAPH_GBZ_URL
# is set, GRAPH_XG_URL and GRAPH_GBWT_URL are not used.
: "${GRAPH_XG_URL:=s3://human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.0-mc-grch38.xg}"
: "${GRAPH_GBWT_URL:=s3://human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.0-mc-grch38.gbwt}"
: "${GRAPH_GBZ_URL:=""}"
# Name to use for graph when downloaded
: "${GRAPH_NAME:=hprc-v1.0-mc-grch38}"
# Sample to simulate from
: "${SAMPLE_NAME:=HG00741}"
# Technology name to use in output filenames
: "${TECH_NAME:=hifi}"
# FASTQ to use as a template, or "/dev/null"
: "${SAMPLE_FASTQ:=/public/groups/vg/sjhwang/data/reads/real_HiFi/tmp/HiFi_reads_100k_real.fq}"
: "${SAMPLE_FASTQ:=/private/groups/patenlab/anovak/projects/hprc/lr-giraffe/reads/real/hifi/HiFi_reads_100k.fq}"
# HMM model to use instead of a FASTQ, or "/dev/null"
: "${PBSIM_HMM:=/dev/null}"
# This needs to be the pbsim2 command, which isn't assumed to be in $PATH
: "${PBSIM:=/public/groups/vg/sjhwang/tools/bin/pbsim}"
# This needs to be the pbsim2 binary, which might not be in $PATH.
# It can be installed with
# git clone https://github.com/yukiteruono/pbsim2.git
# cd pbsim2
# git checkout eeb5a19420534a0f672c81db2670117e62a9ee38
# autoupdate
# automake --add-missing
# autoreconf
# ./configure --prefix=$HOME/.local && make
# The binary will be in src/pbsim
: "${PBSIM:=pbsim}"
# Parameters to use with pbsim for simulating reads for each contig. Parameters are space-separated and internal spaces must be escaped.
: "${PBSIM_PARAMS:=--depth 1 --accuracy-min 0.00 --length-min 10000 --difference-ratio 6:50:54}"
# This needs to be a command line which can execute Stephen's script that adds qualities from a FASTQ back into a SAM that is missing them.
# Arguments are space-separated and internal spaces must be escaped.
# This script is at https://gist.github.com/adamnovak/45ae4f500a8ec63ce12ace4ca77afc21
: "${ADD_QUALITIES:=python3 /public/groups/vg/sjhwang/vg_scripts/bin/readers/sam_reader.py}"
: "${PBSIM_PARAMS:=--depth 4 --accuracy-min 0.00 --length-min 10000 --difference-ratio 6:50:54}"
# Directory to save results in
: "${OUT_DIR:=./reads/sim/${TECH_NAME}/${SAMPLE_NAME}}"
# Number of MAFs to convert at once
@@ -49,33 +55,48 @@ fi
# Make sure scratch directory exists
mkdir -p "${WORK_DIR}"

# Fetch graph
if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.xg" ]] ; then
# This comparison requires Bash 3 or later. See <https://stackoverflow.com/a/2172365>
if [[ ${GRAPH_XG_URL} =~ ^s3:.* ]]; then
# Download from S3
aws s3 cp "${GRAPH_XG_URL}" "${WORK_DIR}/${GRAPH_NAME}.xg.tmp"
mv "${WORK_DIR}/${GRAPH_NAME}.xg.tmp" "${WORK_DIR}/${GRAPH_NAME}.xg"
else
# Use local symlink
ln -s "$(realpath "${GRAPH_XG_URL}")" "${WORK_DIR}/${GRAPH_NAME}.xg"
if [[ -z "${GRAPH_GBZ_URL}" ]] ; then

# Fetch graph
if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.xg" ]] ; then
# This comparison requires Bash 3 or later. See <https://stackoverflow.com/a/2172365>
if [[ ${GRAPH_XG_URL} =~ ^s3:.* ]]; then
# Download from S3
aws s3 cp "${GRAPH_XG_URL}" "${WORK_DIR}/${GRAPH_NAME}.xg.tmp"
mv "${WORK_DIR}/${GRAPH_NAME}.xg.tmp" "${WORK_DIR}/${GRAPH_NAME}.xg"
else
# Use local symlink
ln -s "$(realpath "${GRAPH_XG_URL}")" "${WORK_DIR}/${GRAPH_NAME}.xg"
fi
fi
fi
if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.gbwt" ]] ; then
if [[ ${GRAPH_GBWT_URL} =~ ^s3:.* ]]; then
if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.gbwt" ]] ; then
if [[ ${GRAPH_GBWT_URL} =~ ^s3:.* ]]; then
# Download from S3
aws s3 cp "${GRAPH_GBWT_URL}" "${WORK_DIR}/${GRAPH_NAME}.gbwt.tmp"
mv "${WORK_DIR}/${GRAPH_NAME}.gbwt.tmp" "${WORK_DIR}/${GRAPH_NAME}.gbwt"
else
# Use local symlink
ln -s "$(realpath "${GRAPH_GBWT_URL}")" "${WORK_DIR}/${GRAPH_NAME}.gbwt"
fi
fi

if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.gbz" ]] ; then
# Make it one file
time vg gbwt -x "${WORK_DIR}/${GRAPH_NAME}.xg" "${WORK_DIR}/${GRAPH_NAME}.gbwt" --gbz-format -g "${WORK_DIR}/${GRAPH_NAME}.gbz.tmp"
mv "${WORK_DIR}/${GRAPH_NAME}.gbz.tmp" "${WORK_DIR}/${GRAPH_NAME}.gbz"
fi

elif [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.gbz" ]] ; then
# Fetch the GBZ
if [[ ${GRAPH_GBZ_URL} =~ ^s3:.* ]]; then
# Download from S3
aws s3 cp "${GRAPH_GBWT_URL}" "${WORK_DIR}/${GRAPH_NAME}.gbwt.tmp"
mv "${WORK_DIR}/${GRAPH_NAME}.gbwt.tmp" "${WORK_DIR}/${GRAPH_NAME}.gbwt"
aws s3 cp "${GRAPH_GBZ_URL}" "${WORK_DIR}/${GRAPH_NAME}.gbz.tmp"
mv "${WORK_DIR}/${GRAPH_NAME}.gbz.tmp" "${WORK_DIR}/${GRAPH_NAME}.gbz"
else
# Use local symlink
ln -s "$(realpath "${GRAPH_GBWT_URL}")" "${WORK_DIR}/${GRAPH_NAME}.gbwt"
ln -s "$(realpath "${GRAPH_GBZ_URL}")" "${WORK_DIR}/${GRAPH_NAME}.gbz"
fi
fi

if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.gbz" ]] ; then
# Make it one file
time vg gbwt -x "${WORK_DIR}/${GRAPH_NAME}.xg" "${WORK_DIR}/${GRAPH_NAME}.gbwt" --gbz-format -g "${WORK_DIR}/${GRAPH_NAME}.gbz.tmp"
mv "${WORK_DIR}/${GRAPH_NAME}.gbz.tmp" "${WORK_DIR}/${GRAPH_NAME}.gbz"
fi

if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}-${SAMPLE_NAME}-as-ref.gbz" ]] ; then
@@ -150,7 +171,7 @@ function do_job() {
mv "${SAM_NAME}.tmp" "${SAM_NAME}"
fi
set -o pipefail
${ADD_QUALITIES} -s "${SAM_NAME}" -f "${FASTQ_NAME}" | sed "s/ref/${CONTIG_NAME}/g" | samtools view -b - > "${RENAMED_BAM_NAME}.tmp"
python3 "$(dirname -- "${BASH_SOURCE[0]}")/reinsert_qualities.py" -s "${SAM_NAME}" -f "${FASTQ_NAME}" | sed "s/ref/${CONTIG_NAME}/g" | samtools view -b - > "${RENAMED_BAM_NAME}.tmp"
set +o pipefail
mv "${RENAMED_BAM_NAME}.tmp" "${RENAMED_BAM_NAME}"
else
@@ -207,14 +228,14 @@ fi
# Work out how many reads there are
TOTAL_READS="$(vg stats -a "${WORK_DIR}/${SAMPLE_NAME}-reads/${SAMPLE_NAME}-sim-${TECH_NAME}.gam" | grep "^Total alignments:" | cut -f2 -d':' | tr -d ' ')"

if [[ "${TOTAL_READS}" -lt 10500 ]] ; then
echo "Only ${TOTAL_READS} reads were simulated. Cannot subset to 10000 reads with buffer!"
if [[ "${TOTAL_READS}" -lt 1000500 ]] ; then
echo "Only ${TOTAL_READS} reads were simulated. Cannot subset to 1000000 reads with buffer!"
exit 1
fi
echo "Simulated ${TOTAL_READS} reads overall"

SUBSAMPLE_SEED=1
for READ_COUNT in 100 1000 10000 ; do
for READ_COUNT in 100 1000 10000 100000 1000000 ; do
# Subset to manageable sizes (always)
# Get the fraction of reads to keep, overestimated, with no leading 0, to paste onto subsample seed.
FRACTION="$(echo "(${READ_COUNT} + 500)/${TOTAL_READS}" | bc -l | sed 's/^[0-9]*//g')"
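
As a worked sketch of that line (numbers hypothetical): bc -l prints fractions below 1 with no leading 0, and the sed strips any leading integer digits, so what remains is just the .digits part to paste onto the subsample seed. The same arithmetic in Python:

# Worked sketch of the FRACTION computation above; READ_COUNT and
# TOTAL_READS values here are hypothetical.
read_count = 10000
total_reads = 1000123
subsample_seed = 1
fraction = (read_count + 500) / total_reads        # about 0.0104987
suffix = f"{fraction:.10f}".lstrip("0123456789")   # ".0104987087", integer part dropped
print(f"{subsample_seed}{suffix}")                 # "1.0104987087": seed.fraction to subsample with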
39 changes: 39 additions & 0 deletions scripts/mark_secondaries.py
@@ -0,0 +1,39 @@
#!/usr/bin/python3
# mark_secondaries.py: Mark all but the first alignment with a given name as secondary
"""
Mark duplicate alignments for a given read name as secondary. Useful for GraphAligner output which does not mark its secondaries. Assumes that the first alignment is the primary alignment, ignoring score.
vg view -a unmarked.gam | mark_secondaries.py | vg view -JaG - > marked.gam
"""
import sys
import json


def filter_json_gam(infile):
"""
Process GAM JSON made with vg view -a my.gam
"""

seen_names = set()

for line in infile:
gam = json.loads(line)

if gam['name'] in seen_names:
gam['is_secondary'] = True
else:
gam['is_secondary'] = False
seen_names.add(gam['name'])

print(json.dumps(gam))

def main():
"""
Main entry point for the program.
"""
filter_json_gam(sys.stdin)

if __name__ == "__main__":
main()
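
A quick sketch of the script's effect, feeding filter_json_gam two hypothetical alignment records that share a name (abbreviated to the fields that matter here):

import io

# Two hypothetical alignments with the same read name, as vg view -a
# would emit them; the second should come out marked secondary.
example = io.StringIO(
    '{"name": "read1", "sequence": "ACGT"}\n'
    '{"name": "read1", "sequence": "ACGT"}\n'
)
filter_json_gam(example)
# prints:
# {"name": "read1", "sequence": "ACGT", "is_secondary": false}
# {"name": "read1", "sequence": "ACGT", "is_secondary": true}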

2 changes: 1 addition & 1 deletion scripts/plot-qq.R
@@ -106,7 +106,7 @@ print(x$ci)

# Now plot the points as different sizes, but the error bar line ranges as a consistent size
dat.plot <- ggplot(x, aes(1-mapprob+1e-7, 1-observed+1e-7, color=aligner, size=N, weight=N, label=round(mapq,2))) +
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) +
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) +
scale_y_log10("measured error", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
scale_x_log10("error estimate", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) +

1 comment on commit dc847a9

@adamnovak (Member, Author)

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17359 seconds
