diff --git a/Makefile b/Makefile
index b59f8db..5b9cb20 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ rootPath = ./
 include ./include.mk
 
-all : hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions
+all : hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions count-vg-hap-cov
 
 # Note: hdf5 from apt doesn't seem to work for static builds. It should be installed
 # from source and configured with "--enable-static --disable-shared", then have its
@@ -43,10 +43,14 @@ ifeq ($(shell ldd filter-paf-deletions | grep "not a dynamic" | wc -l), $(shell
 else
 	$(error ldd found dnymaic linked dependency in filter-paf-deletions)
 endif
-
+ifeq ($(shell ldd count-vg-hap-cov | grep "not a dynamic" | wc -l), $(shell ls count-vg-hap-cov | wc -l))
+	$(info ldd verified that count-vg-hap-cov is static)
+else
+	$(error ldd found dynamic linked dependency in count-vg-hap-cov)
+endif
 
 cleanFast : 
-	rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o halUnclip halUnclip.o filter-paf-deletions filter-paf-deletions.o
+	rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o halUnclip halUnclip.o filter-paf-deletions filter-paf-deletions.o count-vg-hap-cov.o count-vg-hap-cov
 
 clean :
 	rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o halUnclip halUnclip.o filter-paf-deletions filter-paf-deletions.o
@@ -103,6 +107,12 @@ filter-paf-deletions.o : filter-paf-deletions.cpp subpaths.h paf.hpp ${basicLibs
 filter-paf-deletions : filter-paf-deletions.o ${basicLibsDependencies}
 	${cpp} ${CXXFLAGS} -fopenmp -pthread filter-paf-deletions.o ${basicLibs} -o filter-paf-deletions
 
+count-vg-hap-cov.o : count-vg-hap-cov.cpp ${basicLibsDependencies}
+	${cpp} ${CXXFLAGS} -I . count-vg-hap-cov.cpp -c
+
+count-vg-hap-cov : count-vg-hap-cov.o ${basicLibsDependencies}
+	${cpp} ${CXXFLAGS} -fopenmp -pthread count-vg-hap-cov.o ${basicLibs} -o count-vg-hap-cov
+
 test : 
 	make
 	cd tests && prove -v t
diff --git a/build-tools/makeBinRelease b/build-tools/makeBinRelease
index eb313da..d5b71e3 100755
--- a/build-tools/makeBinRelease
+++ b/build-tools/makeBinRelease
@@ -39,5 +39,5 @@ else
     make check-static
 fi
 
-cp hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions ${buildDir}/
+cp hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions count-vg-hap-cov ${buildDir}/
diff --git a/count-vg-hap-cov.cpp b/count-vg-hap-cov.cpp
new file mode 100644
index 0000000..bcef027
--- /dev/null
+++ b/count-vg-hap-cov.cpp
@@ -0,0 +1,316 @@
+// Count the number of bases that aren't in a given reference sample.
+// Print the table of results stratified by number of covering samples.
+// Assumes the current cactus convention of Sample.Haplotype.Contig
+
+//#define debug
+
+#include <getopt.h>
+#include <omp.h>
+#include <arpa/inet.h>
+#include <cassert>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <set>
+#include <unordered_map>
+#include <functional>
+#include <memory>
+#include <fstream>
+#include <iostream>
+
+#include "bdsg/packed_graph.hpp"
+#include "bdsg/hash_graph.hpp"
+#include "bdsg/odgi.hpp"
+
+using namespace std;
+using namespace handlegraph;
+using namespace bdsg;
+
+// Sniff the serialization format from the leading magic number and load the
+// graph into the matching bdsg implementation (PackedGraph / HashGraph / ODGI).
+// Exits with an error if the magic number matches none of them.
+static unique_ptr<PathHandleGraph> load_graph(istream& graph_stream) {
+
+    // peek the first 4 bytes (big-endian magic number) then rewind so
+    // deserialize() sees the whole stream
+    char magic_bytes[4];
+    graph_stream.read(magic_bytes, 4);
+    uint32_t magic_number = ntohl(*((uint32_t*) magic_bytes));
+    graph_stream.clear();
+    graph_stream.seekg(0, ios::beg);
+
+    PathHandleGraph* graph;
+    if (magic_number == PackedGraph().get_magic_number()) {
+        graph = new PackedGraph();
+    } else if (magic_number == HashGraph().get_magic_number()) {
+        graph = new HashGraph();
+    } else if (magic_number == ODGI().get_magic_number()) {
+        graph = new ODGI();
+    } else {
+        cerr << "Unable to parse input graph with magic number " << magic_number << endl;
+        exit(1);
+    }
+    dynamic_cast<SerializableHandleGraph*>(graph)->deserialize(graph_stream);
+
+    return unique_ptr<PathHandleGraph>(graph);
+}
+
+void help(char** argv) {
+    cerr << "usage: " << argv[0] << " [options] [graph] [graph] [...]" << endl
+         << "Count nodes and bp in graph covered by different sample counts\n"
+         << "Assumes SAMPLE.HAPLOTYPE.CONTIG path name format" << endl
+         << endl
+         << "options: " << endl
+         << "    -r, --ref-sample      Include counts of nodes that are not present in the given reference sample prefix" << endl
+         << "    -i, --ignore          Completely ignore all paths with given prefix [default: _MINIGRAPH_]" << endl
+         << "    -t, --threads         Number of threads [default: all]" << endl
+         << "    -s, --separator       Use this separator for tokenizing path name. Haplotype key will be first 2 tokens (or all tokens if fewer than 2) [default=.]" << endl
+         << "    -p, --progress        Print progress" << endl
+         << endl;
+}
+
+// returns SAMPLE.HAPLOTYPE: everything up to (but not including) the second
+// separator of the path name (or the whole name if there are fewer than two).
+// Results are memoized in name_map, keyed on the path handle.
+// todo: vg/bdsg in progress of adopting conventions / api
+// to manage stuff like this -- should switch to using that
+const string& get_sample_name(const PathHandleGraph* graph, path_handle_t path_handle,
+                              unordered_map<path_handle_t, string>& name_map,
+                              char separator) {
+    if (!name_map.count(path_handle)) {
+        string path_name = graph->get_path_name(path_handle);
+        string sample;
+        int dots = 0;
+        for (int64_t i = 0; i < (int64_t)path_name.length(); ++i) {
+            if (path_name[i] == separator) {
+                ++dots;
+            }
+            if (dots == 2) {
+                break;
+            }
+            sample.push_back(path_name[i]);
+        }
+        name_map[path_handle] = sample;
+    }
+    return name_map.at(path_handle);
+}
+
+int main(int argc, char** argv) {
+
+    string ref_sample;                     // -r: sample prefix used to flag "nonref" nodes
+    string ignore_sample = "_MINIGRAPH_";  // -i: paths with this prefix are skipped entirely
+    char separator = '.';                  // -s: token separator in path names
+    bool progress = false;                 // -p: log progress to stderr
+
+    int c;
+    optind = 1;
+    while (true) {
+
+        static const struct option long_options[] = {
+            {"help", no_argument, 0, 'h'},
+            {"ref-sample", required_argument, 0, 'r'},
+            {"ignore", required_argument, 0, 'i'},
+            {"separator", required_argument, 0, 's'},
+            {"threads", required_argument, 0, 't'},
+            {"progress", no_argument, 0, 'p'},
+            {0, 0, 0, 0}
+        };
+
+        int option_index = 0;
+
+        c = getopt_long (argc, argv, "hr:s:i:t:p",
+                         long_options, &option_index);
+
+        // Detect the end of the options.
+        if (c == -1)
+            break;
+
+        switch (c)
+        {
+        case 'r':
+            ref_sample = optarg;
+            break;
+        case 'i':
+            ignore_sample = optarg;
+            break;
+        case 's':
+            // separator must be a single character
+            assert(strlen(optarg) == 1);
+            separator = optarg[0];
+            break;
+        case 't':
+        {
+            int num_threads = stoi(optarg);
+            if (num_threads <= 0) {
+                cerr << "error:[count-vg-hap-depth] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl;
+                exit(1);
+            }
+            omp_set_num_threads(num_threads);
+            break;
+        }
+        case 'p':
+            progress = true;
+            break;
+        case 'h':
+        case '?':
+            /* getopt_long already printed an error message. */
+            help(argv);
+            exit(1);
+            break;
+        default:
+            abort ();
+        }
+    }
+
+    if (argc <= 1) {
+        help(argv);
+        return 1;
+    }
+
+    // Parse the positional argument
+    if (optind >= argc) {
+        cerr << "[count-vg-hap-depth] error: too few arguments" << endl;
+        help(argv);
+        return 1;
+    }
+
+    // depth stats, indexed by sample coverage (one vector per thread, merged below)
+    vector<vector<int64_t>> depth_base_counts(get_thread_count());
+    vector<vector<int64_t>> depth_node_counts(get_thread_count());
+    vector<vector<int64_t>> depth_base_counts_nonref(get_thread_count());
+    vector<vector<int64_t>> depth_node_counts_nonref(get_thread_count());
+
+    // do counts for each graph arg; stats accumulate across graphs
+    while (optind < argc) {
+
+        string graph_path = argv[optind++];
+        ifstream graph_stream(graph_path);
+        if (!graph_stream) {
+            cerr << "[count-vg-hap-depth] error: Unable to open input graph " << graph_path << endl;
+            return 1;
+        }
+        unique_ptr<PathHandleGraph> graph = load_graph(graph_stream);
+        graph_stream.close();
+        if (progress) {
+            cerr << "[count-vg-hap-depth]: Loaded graph" << endl;
+        }
+
+        // path handle to sample key (one per thread)
+        vector<unordered_map<path_handle_t, string>> name_maps(get_thread_count());
+
+        if (progress) {
+            cerr << "[count-vg-hap-depth]: Calculating coverage with " << depth_base_counts.size() << " threads" << endl;
+        }
+
+        graph->for_each_handle([&](handle_t handle) {
+            int64_t t = omp_get_thread_num();
+            // collect all the samples that step on the node
+            set<string> sample_set;
+            bool ref = false;
+            graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle) {
+                const string& sample_name = get_sample_name(graph.get(), graph->get_path_handle_of_step(step_handle), name_maps[t], separator);
+                if (ignore_sample.empty() || sample_name.compare(0, ignore_sample.length(), ignore_sample) != 0) {
+                    // prefix match against the reference sample (if any)
+                    if (!ref && sample_name.compare(0, ref_sample.length(), ref_sample) == 0) {
+                        ref = true;
+                    }
+                    sample_set.insert(sample_name);
+                }
+            });
+            // update the total coverage
+            int64_t coverage = sample_set.size();
+            if ((int64_t)depth_base_counts[t].size() <= coverage) {
+                depth_base_counts[t].resize(coverage + 1, 0);
+                depth_node_counts[t].resize(coverage + 1, 0);
+            }
+            depth_base_counts[t][coverage] += graph->get_length(handle);
+            depth_node_counts[t][coverage] += 1;
+
+            if (!ref && !ref_sample.empty()) {
+                // update the nonref coverage (same coverage value, separate tallies)
+                if ((int64_t)depth_base_counts_nonref[t].size() <= coverage) {
+                    depth_base_counts_nonref[t].resize(coverage + 1, 0);
+                    depth_node_counts_nonref[t].resize(coverage + 1, 0);
+                }
+                depth_base_counts_nonref[t][coverage] += graph->get_length(handle);
+                depth_node_counts_nonref[t][coverage] += 1;
+            }
+        },
+        true);
+    }
+
+    if (progress) {
+        cerr << "[count-vg-hap-depth]: Merging data from different threads" << endl;
+    }
+
+    // merge up the threads into slot 0
+    for (int64_t t = 1; t < (int64_t)get_thread_count(); ++t) {
+        for (int64_t coverage = 0; coverage < (int64_t)depth_base_counts[t].size(); ++coverage) {
+            if ((int64_t)depth_base_counts[0].size() <= coverage) {
+                depth_base_counts[0].resize(coverage + 1, 0);
+                depth_node_counts[0].resize(coverage + 1, 0);
+            }
+            depth_base_counts[0][coverage] += depth_base_counts[t][coverage];
+            depth_node_counts[0][coverage] += depth_node_counts[t][coverage];
+
+            if (!ref_sample.empty()) {
+                if ((int64_t)depth_base_counts_nonref[0].size() <= coverage) {
+                    depth_base_counts_nonref[0].resize(coverage + 1, 0);
+                    depth_node_counts_nonref[0].resize(coverage + 1, 0);
+                }
+                if (coverage < (int64_t)depth_base_counts_nonref[t].size()) {
+                    depth_base_counts_nonref[0][coverage] += depth_base_counts_nonref[t][coverage];
+                    depth_node_counts_nonref[0][coverage] += depth_node_counts_nonref[t][coverage];
+                }
+            }
+        }
+    }
+
+    // The nonref tallies can end up shorter than the totals (e.g. a
+    // single-threaded run never enters the merge loop above). Pad them so the
+    // table printing below can index every coverage value without going out
+    // of bounds.
+    if (!ref_sample.empty() && depth_base_counts_nonref[0].size() < depth_base_counts[0].size()) {
+        depth_base_counts_nonref[0].resize(depth_base_counts[0].size(), 0);
+        depth_node_counts_nonref[0].resize(depth_node_counts[0].size(), 0);
+    }
+
+    // there's almost certainly an stl one-liner for this.. oh well
+    // running total from the left: counts with depth <= i
+    function<vector<int64_t>(const vector<int64_t>&)> get_cumul = [](const vector<int64_t>& v) {
+        int64_t tot = 0;
+        vector<int64_t> cumul(v.size(), 0);
+        for (int64_t i = 0; i < (int64_t)v.size(); ++i) {
+            tot += v[i];
+            cumul[i] = tot;
+        }
+        return cumul;
+    };
+    // running total from the right: counts with depth >= i
+    function<vector<int64_t>(const vector<int64_t>&)> get_lumuc = [](const vector<int64_t>& v) {
+        int64_t tot = 0;
+        vector<int64_t> cumul(v.size(), 0);
+        for (int64_t i = (int64_t)v.size() - 1; i >= 0; --i) {
+            tot += v[i];
+            cumul[i] = tot;
+        }
+        return cumul;
+    };
+
+    // keep cumulative counts while we're at it
+    // cumulate from 0
+    vector<int64_t> node_counts_cumul = get_cumul(depth_node_counts[0]);
+    vector<int64_t> base_counts_cumul = get_cumul(depth_base_counts[0]);
+    vector<int64_t> node_counts_nonref_cumul = get_cumul(depth_node_counts_nonref[0]);
+    vector<int64_t> base_counts_nonref_cumul = get_cumul(depth_base_counts_nonref[0]);
+
+    // cumulate from end
+    vector<int64_t> node_counts_lumuc = get_lumuc(depth_node_counts[0]);
+    vector<int64_t> base_counts_lumuc = get_lumuc(depth_base_counts[0]);
+    vector<int64_t> node_counts_nonref_lumuc = get_lumuc(depth_node_counts_nonref[0]);
+    vector<int64_t> base_counts_nonref_lumuc = get_lumuc(depth_base_counts_nonref[0]);
+
+    // print the results as a TSV table, one row per coverage value
+    cout << "hap-depth"
+         << "\t" << "nodes" << "\t" << "bases"
+         << "\t" << "nodes-cumul" << "\t" << "bases-cumul"
+         << "\t" << "nodes-cumul-rev" << "\t" << "bases-cumul-rev";
+    if (!ref_sample.empty()) {
+        cout << "\t" << "nodes-nonref" << "\t" << "bases-nonref"
+             << "\t" << "nodes-cumul-nonref" << "\t" << "bases-cumul-nonref"
+             << "\t" << "nodes-cumul-rev-nonref" << "\t" << "bases-cumul-rev-nonref";
+    }
+    cout << endl;
+
+    for (int64_t coverage = 0; coverage < (int64_t)depth_base_counts[0].size(); ++coverage) {
+        cout << coverage
+             << "\t" << depth_node_counts[0][coverage] << "\t" << depth_base_counts[0][coverage]
+             << "\t" << node_counts_cumul[coverage] << "\t" << base_counts_cumul[coverage]
+             << "\t" << node_counts_lumuc[coverage] << "\t" << base_counts_lumuc[coverage];
+        if (!ref_sample.empty()) {
+            cout << "\t" << depth_node_counts_nonref[0][coverage] << "\t" << depth_base_counts_nonref[0][coverage]
+                 << "\t" << node_counts_nonref_cumul[coverage] << "\t" << base_counts_nonref_cumul[coverage]
+                 << "\t" << node_counts_nonref_lumuc[coverage] << "\t" << base_counts_nonref_lumuc[coverage];
+        }
+        cout << "\n";
+    }
+
+    return 0;
+}