Merge pull request #4323 from vgteam/lr-giraffe
Long Read Giraffe
adamnovak authored Jan 13, 2025
2 parents 50952bb + 6eaa72b commit dc847a9
Showing 125 changed files with 26,979 additions and 5,394 deletions.
2 changes: 1 addition & 1 deletion deps/dozeu
Submodule dozeu updated 2 files
+87 −8 dozeu.h
+65 −58 example.c
2 changes: 1 addition & 1 deletion deps/gbwtgraph
2 changes: 1 addition & 1 deletion deps/sublinear-Li-Stephens
4 changes: 4 additions & 0 deletions doc/publish-docs.sh
@@ -22,6 +22,10 @@ COMMIT_AUTHOR_EMAIL="[email protected]"
# We expect GITLAB_SECRET_FILE_DOCS_SSH_KEY to come in from the environment,
# specifying the private deploy key we will use to get at the docs repo.

# Make sure no submodules have untracked files from caching
# See <https://gist.github.com/nicktoumpelis/11214362#file-repo-rinse-sh-L2>
git submodule foreach --recursive git clean -xfd

# Find all the submodules that Doxygen wants to look at and make sure we have
# those.
cat Doxyfile | grep "^INPUT *=" | cut -f2 -d'=' | tr ' ' '\n' | grep "^ *deps" | sed 's_ *\(deps/[^/]*\).*_\1_' | sort | uniq | xargs -n 1 git submodule update --init --recursive
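
For readability, here is a rough Python sketch of what that pipeline does (an illustration only; the shell line above is the real logic, and the Doxyfile path and INPUT format are assumptions taken from it):

import re

# Sketch of the pipeline above: find the INPUT = line in the Doxyfile,
# keep the entries under deps/, and reduce each one to its top-level
# submodule directory before updating it.
submodules = set()
with open("Doxyfile") as doxyfile:
    for line in doxyfile:
        if re.match(r"^INPUT *=", line):
            for token in line.split("=", 1)[1].split():
                match = re.match(r"(deps/[^/]*)", token)
                if match:
                    submodules.add(match.group(1))
for submodule in sorted(submodules):
    print("git submodule update --init --recursive " + submodule)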
2 changes: 1 addition & 1 deletion doc/wiki
Submodule wiki updated from 9c76a6 to 0e573e
60 changes: 52 additions & 8 deletions scripts/giraffe-facts.py
@@ -98,8 +98,10 @@ def parse_args(args):
parser = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)

parser.add_argument("--input", type=argparse.FileType('r'), default=sys.stdin,
help="line-oriented JSON GAM to process")
parser.add_argument("input", type=str,
help="GAM to process")
parser.add_argument("--vg", type=str, default="vg",
help="vg binary to use")
parser.add_argument("outdir",
help="directory to place output in")

@@ -286,11 +288,44 @@ def add_in_stats(destination, addend):

def read_line_oriented_json(lines):
"""
For each line in the given stream, yield it as a parsed JSON object.
For each line in the given iterable of lines (such as a stream), yield it as a parsed JSON object.
"""

for line in lines:
yield json.loads(line)
line = line.strip()
if len(line) > 0:
yield json.loads(line)


def read_read_views(vg, filename):
"""
Given a vg binary and a filename, iterate over subsets of the parsed read dicts for each read in the file.
The subsets will have the annotation and time_used fields.
"""

# Extract just the annotations and times of reads, as a TSV (with the annotation column as JSON) with a # header
# We don't know all the annotation field names in advance so we have to dump them all.
filter_process = subprocess.Popen([vg, "filter", "--tsv-out", "annotation;time_used", filename], stdout=subprocess.PIPE)

lines = iter(filter_process.stdout)
# Drop header line
next(lines)

for line in lines:
# Parse the TSV and reconstruct a view of the full read dict.
line = line.decode('utf-8')
line = line.strip()
if len(line) == 0:
continue
parts = line.split("\t")
assert len(parts) == 2
read = {"annotation": json.loads(parts[0]), "time_used": float(parts[1])}

yield read

return_code = filter_process.wait()
assert return_code == 0
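
For illustration, a minimal self-contained sketch of the parsing read_read_views does, run on a hypothetical two-column TSV of the kind vg filter --tsv-out "annotation;time_used" is asked for above (the field names and values here are made up):

import io
import json

# Hypothetical stand-in for filter_process.stdout: a # header line, then
# one tab-separated annotation-JSON / time_used pair per read.
fake_stdout = io.BytesIO(
    b"#annotation\ttime_used\n"
    b'{"fragment_length": 295}\t0.000331\n'
    b'{"fragment_length": 301}\t0.000287\n'
)

lines = iter(fake_stdout)
next(lines)  # drop the header line, as read_read_views() does
for line in lines:
    parts = line.decode("utf-8").strip().split("\t")
    read = {"annotation": json.loads(parts[0]), "time_used": float(parts[1])}
    print(read)
# {'annotation': {'fragment_length': 295}, 'time_used': 0.000331}
# {'annotation': {'fragment_length': 301}, 'time_used': 0.000287}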

class Table(object):
"""
@@ -916,11 +951,20 @@ def main(args):
# Count all the reads
read_count = 0

# Record mapping parameters from at least one read
# Record mapping parameters from special magic GAM chunk, if any, or a read
params = None

for read in read_line_oriented_json(options.input):


# Get the params from a magic chunk.
# TODO: This is a whole pass through a possibly big file!
params_json = subprocess.check_output([options.vg, "view", "--extract-tag", "PARAMS_JSON", "--first", options.input]).decode('utf-8')
lines = params_json.split("\n")
for parsed_params in read_line_oriented_json(lines):
if params is None:
params = parsed_params

for read in read_read_views(options.vg, options.input):
# For the data we need on each read

if params is None:
# Go get the mapping parameters
params = sniff_params(read)
2 changes: 1 addition & 1 deletion scripts/giraffe-wrangler.sh
@@ -195,7 +195,7 @@ if [[ ! -z "${SIM_GAM}" ]] ; then

# Compute loss stages
# Let giraffe-facts.py error out on failure
vg view -aj "${WORK}/mapped.gam" | scripts/giraffe-facts.py "${WORK}/facts" >"${WORK}/facts.txt"
scripts/giraffe-facts.py "${WORK}/mapped.gam" "${WORK}/facts" >"${WORK}/facts.txt"
fi

if [[ ! -z "${REAL_FASTQ}" ]] ; then
93 changes: 57 additions & 36 deletions scripts/make_pbsim_reads.sh
@@ -1,7 +1,6 @@
#!/usr/bin/env bash
# make_pbsim_reads.sh: script to simulate reads with pbsim2.
# Mostly theoretical; records commands that would have worked better than what was actually run
# Intended to run on UCSC Courtyard/Plaza systems
# Intended to run on UCSC behind-the-firewall systems
# You may also need to CFLAGS=-fPIC pip3 install --user bioconvert

set -ex
@@ -10,27 +9,34 @@ set -ex
# You can set these in the environment to override them and I don't have to write a CLI option parser.
# See https://stackoverflow.com/a/28085062

# Graph to simulate from. Can be S3 URLs or local file paths.
# Graph to simulate from. Can be S3 URLs or local file paths. If GRAPH_GBZ_URL
# is set, GRAPH_XG_URL and GRAPH_GBWT_URL are not used.
: "${GRAPH_XG_URL:=s3://human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.0-mc-grch38.xg}"
: "${GRAPH_GBWT_URL:=s3://human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.0-mc-grch38.gbwt}"
: "${GRAPH_GBZ_URL:=""}"
# Name to use for graph when downloaded
: "${GRAPH_NAME:=hprc-v1.0-mc-grch38}"
# Sample to simulate from
: "${SAMPLE_NAME:=HG00741}"
# Technology name to use in output filenames
: "${TECH_NAME:=hifi}"
# FASTQ to use as a template, or "/dev/null"
: "${SAMPLE_FASTQ:=/public/groups/vg/sjhwang/data/reads/real_HiFi/tmp/HiFi_reads_100k_real.fq}"
: "${SAMPLE_FASTQ:=/private/groups/patenlab/anovak/projects/hprc/lr-giraffe/reads/real/hifi/HiFi_reads_100k.fq}"
# HMM model to use instead of a FASTQ, or "/dev/null"
: "${PBSIM_HMM:=/dev/null}"
# This needs to be the pbsim2 command, which isn't assumed to be in $PATH
: "${PBSIM:=/public/groups/vg/sjhwang/tools/bin/pbsim}"
# This needs to be the pbsim2 binary, which might not be in $PATH.
# It can be installed with
# git clone https://github.com/yukiteruono/pbsim2.git
# cd pbsim2
# git checkout eeb5a19420534a0f672c81db2670117e62a9ee38
# autoupdate
# automake --add-missing
# autoreconf
# ./configure --prefix=$HOME/.local && make
# The binary will be in src/pbsim
: "${PBSIM:=pbsim}"
# Parameters to use with pbsim for simulating reads for each contig. Parameters are space-separated and internal spaces must be escaped.
: "${PBSIM_PARAMS:=--depth 1 --accuracy-min 0.00 --length-min 10000 --difference-ratio 6:50:54}"
# This needs to be a command line which can execute Stephen's script that adds qualities from a FASTQ back into a SAM that is missing them.
# Arguments are space-separated and internal spaces must be escaped.
# This script is at https://gist.github.com/adamnovak/45ae4f500a8ec63ce12ace4ca77afc21
: "${ADD_QUALITIES:=python3 /public/groups/vg/sjhwang/vg_scripts/bin/readers/sam_reader.py}"
: "${PBSIM_PARAMS:=--depth 4 --accuracy-min 0.00 --length-min 10000 --difference-ratio 6:50:54}"
# Directory to save results in
: "${OUT_DIR:=./reads/sim/${TECH_NAME}/${SAMPLE_NAME}}"
# Number of MAFs to convert at once
@@ -49,33 +55,48 @@ fi
# Make sure scratch directory exists
mkdir -p "${WORK_DIR}"

# Fetch graph
if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.xg" ]] ; then
# This comparison requires Bash 3 or later. See <https://stackoverflow.com/a/2172365>
if [[ ${GRAPH_XG_URL} =~ ^s3:.* ]]; then
# Download from S3
aws s3 cp "${GRAPH_XG_URL}" "${WORK_DIR}/${GRAPH_NAME}.xg.tmp"
mv "${WORK_DIR}/${GRAPH_NAME}.xg.tmp" "${WORK_DIR}/${GRAPH_NAME}.xg"
else
# Use local symlink
ln -s "$(realpath "${GRAPH_XG_URL}")" "${WORK_DIR}/${GRAPH_NAME}.xg"
if [[ -z "${GRAPH_GBZ_URL}" ]] ; then

# Fetch graph
if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.xg" ]] ; then
# This comparison requires Bash 3 or later. See <https://stackoverflow.com/a/2172365>
if [[ ${GRAPH_XG_URL} =~ ^s3:.* ]]; then
# Download from S3
aws s3 cp "${GRAPH_XG_URL}" "${WORK_DIR}/${GRAPH_NAME}.xg.tmp"
mv "${WORK_DIR}/${GRAPH_NAME}.xg.tmp" "${WORK_DIR}/${GRAPH_NAME}.xg"
else
# Use local symlink
ln -s "$(realpath "${GRAPH_XG_URL}")" "${WORK_DIR}/${GRAPH_NAME}.xg"
fi
fi
fi
if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.gbwt" ]] ; then
if [[ ${GRAPH_GBWT_URL} =~ ^s3:.* ]]; then
if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.gbwt" ]] ; then
if [[ ${GRAPH_GBWT_URL} =~ ^s3:.* ]]; then
# Download from S3
aws s3 cp "${GRAPH_GBWT_URL}" "${WORK_DIR}/${GRAPH_NAME}.gbwt.tmp"
mv "${WORK_DIR}/${GRAPH_NAME}.gbwt.tmp" "${WORK_DIR}/${GRAPH_NAME}.gbwt"
else
# Use local symlink
ln -s "$(realpath "${GRAPH_GBWT_URL}")" "${WORK_DIR}/${GRAPH_NAME}.gbwt"
fi
fi

if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.gbz" ]] ; then
# Make it one file
time vg gbwt -x "${WORK_DIR}/${GRAPH_NAME}.xg" "${WORK_DIR}/${GRAPH_NAME}.gbwt" --gbz-format -g "${WORK_DIR}/${GRAPH_NAME}.gbz.tmp"
mv "${WORK_DIR}/${GRAPH_NAME}.gbz.tmp" "${WORK_DIR}/${GRAPH_NAME}.gbz"
fi

elif [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.gbz" ]] ; then
# Fetch the GBZ
if [[ ${GRAPH_GBZ_URL} =~ ^s3:.* ]]; then
# Download from S3
aws s3 cp "${GRAPH_GBWT_URL}" "${WORK_DIR}/${GRAPH_NAME}.gbwt.tmp"
mv "${WORK_DIR}/${GRAPH_NAME}.gbwt.tmp" "${WORK_DIR}/${GRAPH_NAME}.gbwt"
aws s3 cp "${GRAPH_GBZ_URL}" "${WORK_DIR}/${GRAPH_NAME}.gbz.tmp"
mv "${WORK_DIR}/${GRAPH_NAME}.gbz.tmp" "${WORK_DIR}/${GRAPH_NAME}.gbz"
else
# Use local symlink
ln -s "$(realpath "${GRAPH_GBWT_URL}")" "${WORK_DIR}/${GRAPH_NAME}.gbwt"
ln -s "$(realpath "${GRAPH_GBZ_URL}")" "${WORK_DIR}/${GRAPH_NAME}.gbz"
fi
fi

if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}.gbz" ]] ; then
# Make it one file
time vg gbwt -x "${WORK_DIR}/${GRAPH_NAME}.xg" "${WORK_DIR}/${GRAPH_NAME}.gbwt" --gbz-format -g "${WORK_DIR}/${GRAPH_NAME}.gbz.tmp"
mv "${WORK_DIR}/${GRAPH_NAME}.gbz.tmp" "${WORK_DIR}/${GRAPH_NAME}.gbz"
fi

if [[ ! -e "${WORK_DIR}/${GRAPH_NAME}-${SAMPLE_NAME}-as-ref.gbz" ]] ; then
@@ -150,7 +171,7 @@ function do_job() {
mv "${SAM_NAME}.tmp" "${SAM_NAME}"
fi
set -o pipefail
${ADD_QUALITIES} -s "${SAM_NAME}" -f "${FASTQ_NAME}" | sed "s/ref/${CONTIG_NAME}/g" | samtools view -b - > "${RENAMED_BAM_NAME}.tmp"
python3 "$(dirname -- "${BASH_SOURCE[0]}")/reinsert_qualities.py" -s "${SAM_NAME}" -f "${FASTQ_NAME}" | sed "s/ref/${CONTIG_NAME}/g" | samtools view -b - > "${RENAMED_BAM_NAME}.tmp"
set +o pipefail
mv "${RENAMED_BAM_NAME}.tmp" "${RENAMED_BAM_NAME}"
else
@@ -207,14 +228,14 @@ fi
# Work out how many reads there are
TOTAL_READS="$(vg stats -a "${WORK_DIR}/${SAMPLE_NAME}-reads/${SAMPLE_NAME}-sim-${TECH_NAME}.gam" | grep "^Total alignments:" | cut -f2 -d':' | tr -d ' ')"

if [[ "${TOTAL_READS}" -lt 10500 ]] ; then
echo "Only ${TOTAL_READS} reads were simulated. Cannot subset to 10000 reads with buffer!"
if [[ "${TOTAL_READS}" -lt 1000500 ]] ; then
echo "Only ${TOTAL_READS} reads were simulated. Cannot subset to 1000000 reads with buffer!"
exit 1
fi
echo "Simulated ${TOTAL_READS} reads overall"

SUBSAMPLE_SEED=1
for READ_COUNT in 100 1000 10000 ; do
for READ_COUNT in 100 1000 10000 100000 1000000 ; do
# Subset to manageable sizes (always)
# Get the fraction of reads to keep, overestimated, with no leading 0, to paste onto subsample seed.
FRACTION="$(echo "(${READ_COUNT} + 500)/${TOTAL_READS}" | bc -l | sed 's/^[0-9]*//g')"
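
As a worked sketch of that line (numbers hypothetical): bc -l prints fractions below 1 with no leading 0, and the sed strips any leading integer digits, so what remains is just the .digits part to paste onto the subsample seed. The same arithmetic in Python:

# Worked sketch of the FRACTION computation above; READ_COUNT and
# TOTAL_READS values here are hypothetical.
read_count = 10000
total_reads = 1000123
subsample_seed = 1
fraction = (read_count + 500) / total_reads        # about 0.0104987
suffix = f"{fraction:.10f}".lstrip("0123456789")   # ".0104987087", integer part dropped
print(f"{subsample_seed}{suffix}")                 # "1.0104987087": seed.fraction to subsample with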
39 changes: 39 additions & 0 deletions scripts/mark_secondaries.py
@@ -0,0 +1,39 @@
#!/usr/bin/python3
# mark_secondaries.py: Mark all but the first alignment with a given name as secondary
"""
Mark duplicate alignments for a given read name as secondary. Useful for GraphAligner output which does not mark its secondaries. Assumes that the first alignment is the primary alignment, ignoring score.
vg view -a unmarked.gam | mark_secondaries.py | vg view -JaG - > marked.gam
"""
import sys
import json


def filter_json_gam(infile):
"""
Process GAM JSON made with vg view -a my.gam
"""

seen_names = set()

for line in infile:
gam = json.loads(line)

if gam['name'] in seen_names:
gam['is_secondary'] = True
else:
gam['is_secondary'] = False
seen_names.add(gam['name'])

print(json.dumps(gam))

def main():
"""
Main entry point for the program.
"""
filter_json_gam(sys.stdin)

if __name__ == "__main__":
main()
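
A quick sketch of the script's effect, feeding filter_json_gam two hypothetical alignment records that share a name (abbreviated to the fields that matter here):

import io

# Two hypothetical alignments with the same read name, as vg view -a
# would emit them; the second should come out marked secondary.
example = io.StringIO(
    '{"name": "read1", "sequence": "ACGT"}\n'
    '{"name": "read1", "sequence": "ACGT"}\n'
)
filter_json_gam(example)
# prints:
# {"name": "read1", "sequence": "ACGT", "is_secondary": false}
# {"name": "read1", "sequence": "ACGT", "is_secondary": true}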

2 changes: 1 addition & 1 deletion scripts/plot-qq.R
@@ -106,7 +106,7 @@ print(x$ci)

# Now plot the points as different sizes, but the error bar line ranges as a consistent size
dat.plot <- ggplot(x, aes(1-mapprob+1e-7, 1-observed+1e-7, color=aligner, size=N, weight=N, label=round(mapq,2))) +
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) +
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) +
scale_y_log10("measured error", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
scale_x_log10("error estimate", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) +

1 comment on commit dc847a9

@adamnovak (Member, Author)

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17359 seconds
