Skip to content

Commit

Permalink
Merge pull request #55 from ComparativeGenomicsToolkit/bigdel
Browse files Browse the repository at this point in the history
Tool to compute some basic coverage stats
  • Loading branch information
glennhickey authored May 25, 2022
2 parents 81784b5 + 755e539 commit f3d9a18
Show file tree
Hide file tree
Showing 3 changed files with 330 additions and 4 deletions.
16 changes: 13 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
rootPath = ./
include ./include.mk

all : hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions
all : hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions count-vg-hap-cov

# Note: hdf5 from apt doesn't seem to work for static builds. It should be installed
# from source and configured with "--enable-static --disable-shared", then have its
Expand Down Expand Up @@ -43,10 +43,14 @@ ifeq ($(shell ldd filter-paf-deletions | grep "not a dynamic" | wc -l), $(shell
else
$(error ldd found dynamic linked dependency in filter-paf-deletions)
endif

ifeq ($(shell ldd count-vg-hap-cov | grep "not a dynamic" | wc -l), $(shell ls count-vg-hap-cov | wc -l))
$(info ldd verified that count-vg-hap-cov is static)
else
$(error ldd found dynamic linked dependency in count-vg-hap-cov)
endif

cleanFast :
rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o halUnclip halUnclip.o filter-paf-deletions filter-paf-deletions.o
rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o halUnclip halUnclip.o filter-paf-deletions filter-paf-deletions.o count-vg-hap-cov.o count-vg-hap-cov

clean :
rm -f hal2vg hal2vg.o clip-vg clip-vg.o halRemoveDupes halRemoveDupes.o halMergeChroms halMergeChroms.o halUnclip halUnclip.o filter-paf-deletions filter-paf-deletions.o
Expand Down Expand Up @@ -103,6 +107,12 @@ filter-paf-deletions.o : filter-paf-deletions.cpp subpaths.h paf.hpp ${basicLibs
filter-paf-deletions : filter-paf-deletions.o ${basicLibsDependencies}
${cpp} ${CXXFLAGS} -fopenmp -pthread filter-paf-deletions.o ${basicLibs} -o filter-paf-deletions

count-vg-hap-cov.o : count-vg-hap-cov.cpp ${basicLibsDependencies}
${cpp} ${CXXFLAGS} -I . count-vg-hap-cov.cpp -c

count-vg-hap-cov : count-vg-hap-cov.o ${basicLibsDependencies}
${cpp} ${CXXFLAGS} -fopenmp -pthread count-vg-hap-cov.o ${basicLibs} -o count-vg-hap-cov

test :
make
cd tests && prove -v t
2 changes: 1 addition & 1 deletion build-tools/makeBinRelease
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,5 @@ else
make check-static
fi

cp hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions ${buildDir}/
cp hal2vg clip-vg halRemoveDupes halMergeChroms halUnclip filter-paf-deletions count-vg-hap-cov ${buildDir}/

316 changes: 316 additions & 0 deletions count-vg-hap-cov.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,316 @@
// Count the number of bases that aren't in a given reference sample.
// Print the table of results stratified by number of covering samples
// Assumes current cactus convention of Sample.Haplotype.Contig

//#define debug

#include <cstdlib>
#include <cstring>
#include <iostream>
#include <cassert>
#include <fstream>
#include <deque>
#include <set>
#include <string>
#include <vector>
#include <functional>
#include <memory>
#include <utility>
#include <unordered_map>
#include <unistd.h>
#include <getopt.h>
#include <arpa/inet.h>
#include <omp.h>

#include "bdsg/packed_graph.hpp"
#include "bdsg/hash_graph.hpp"
#include "bdsg/odgi.hpp"

using namespace std;
using namespace handlegraph;
using namespace bdsg;

// Sniff the serialization magic number at the front of graph_stream and
// deserialize the stream into the matching bdsg graph implementation
// (PackedGraph, HashGraph or ODGI).  Exits the process with an error if
// the stream cannot be read or the magic number is not recognized.
static unique_ptr<PathHandleGraph> load_graph(istream& graph_stream) {

    // peek at the first 4 bytes to identify the on-disk format
    char magic_bytes[4];
    graph_stream.read(magic_bytes, 4);
    if (!graph_stream) {
        // bugfix: previously a short/unreadable stream fell through and the
        // garbage bytes were compared against the magic numbers below
        cerr << "Unable to read magic number from input graph" << endl;
        exit(1);
    }
    // use memcpy instead of a type-punning pointer cast (strict aliasing)
    uint32_t magic_word;
    memcpy(&magic_word, magic_bytes, sizeof(magic_word));
    uint32_t magic_number = ntohl(magic_word);
    // rewind so deserialize() sees the stream from the beginning
    graph_stream.clear();
    graph_stream.seekg(0, ios::beg);

    PathHandleGraph* graph;
    if (magic_number == PackedGraph().get_magic_number()) {
        graph = new PackedGraph();
    } else if (magic_number == HashGraph().get_magic_number()) {
        graph = new HashGraph();
    } else if (magic_number == ODGI().get_magic_number()) {
        graph = new ODGI();
    } else {
        cerr << "Unable to parse input graph with magic number " << magic_number << endl;
        exit(1);
    }
    // all three bdsg implementations are SerializableHandleGraphs
    dynamic_cast<SerializableHandleGraph*>(graph)->deserialize(graph_stream);

    return unique_ptr<PathHandleGraph>(graph);
}

// Print the command-line usage message for this tool to stderr.
// argv is the program's argument vector; argv[0] supplies the program name.
void help(char** argv) {
    cerr << "usage: " << argv[0] << " [options] <graph> [graph] [graph] [...]" << endl;
    cerr << "Count nodes and bp in graph covered by different sample counts\n";
    cerr << "Assumes SAMPLE.HAPLOTYPE.CONTIG path name format" << endl << endl;
    cerr << "options: " << endl;
    cerr << " -r, --reference Include counts of nodes that are not present in the given reference sample prefix" << endl;
    cerr << " -i, --ignore Completely ignore all paths with given prefix [default: _MINIGRAPH_]" << endl;
    cerr << " -t, --threads Number of threads [default: all]" << endl;
    cerr << " -s, --separator Use this separator for tokenizing path name. Haplotype key will be first 2 tokens (or all tokens if fewer than 2) [default=.]" << endl;
    cerr << " -p, --progress Print progress" << endl;
    cerr << endl;
}

// returns SAMPLE.HAPLOTYPE
// todo: vg/bdsg in progress of adpoting conventions / api
// to manage stuff like this -- should switch to using that
const string& get_sample_name(const PathHandleGraph* graph, path_handle_t path_handle,
unordered_map<path_handle_t, string>& name_map,
char separator) {
if (!name_map.count(path_handle)) {
string path_name = graph->get_path_name(path_handle);
string sample;
int dots = 0;
for (int64_t i = 0; i < path_name.length(); ++i) {
if (path_name[i] == separator) {
++dots;
}
if (dots == 2) {
break;
}
sample.push_back(path_name[i]);
}
name_map[path_handle] = sample;
}
return name_map.at(path_handle);
}

// Entry point: parse options, count per-node haplotype coverage over one or
// more input graphs, merge per-thread tallies, and print a TSV depth table.
int main(int argc, char** argv) {

    // sample prefix whose absence defines the "nonref" columns (off unless -r)
    string ref_sample;
    // paths with this prefix are excluded from all counts
    string ignore_sample = "_MINIGRAPH_";
    // token separator in SAMPLE.HAPLOTYPE.CONTIG path names
    char separator = '.';
    bool progress = false;

    int c;
    optind = 1;
    while (true) {

        static const struct option long_options[] = {
            {"help", no_argument, 0, 'h'},
            {"ref-sample", required_argument, 0, 'r'},
            {"ignore", required_argument, 0, 'i'},
            {"separator", required_argument, 0, 's'},
            {"threads", required_argument, 0, 't'},
            {"progress", no_argument, 0, 'p'},
            {0, 0, 0, 0}
        };

        int option_index = 0;

        c = getopt_long (argc, argv, "hr:s:i:t:p",
                         long_options, &option_index);

        // Detect the end of the options.
        if (c == -1)
            break;

        switch (c)
        {
        case 'r':
            ref_sample = optarg;
            break;
        case 'i':
            ignore_sample = optarg;
            break;
        case 's':
            // the separator must be exactly one character
            assert(strlen(optarg) == 1);
            separator = optarg[0];
            break;
        case 't':
        {
            int num_threads = stoi(optarg);
            if (num_threads <= 0) {
                cerr << "error:[count-vg-hap-depth] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl;
                exit(1);
            }
            omp_set_num_threads(num_threads);
            break;
        }
        case 'p':
            progress = true;
            break;
        case 'h':
        case '?':
            /* getopt_long already printed an error message. */
            help(argv);
            exit(1);
            break;
        default:
            abort ();
        }
    }

    if (argc <= 1) {
        help(argv);
        return 1;
    }

    // Parse the positional argument
    if (optind >= argc) {
        cerr << "[count-vg-hap-depth] error: too few arguments" << endl;
        help(argv);
        return 1;
    }

    // depth stats (one per thread): index = number of distinct sample keys
    // covering a node, value = total bases / node count at that depth
    vector<vector<int64_t>> depth_base_counts(get_thread_count());
    vector<vector<int64_t>> depth_node_counts(get_thread_count());
    // same tables restricted to nodes the reference sample does not touch
    vector<vector<int64_t>> depth_base_counts_nonref(get_thread_count());
    vector<vector<int64_t>> depth_node_counts_nonref(get_thread_count());

    // do counts for each graph arg, accumulating into the tables above
    while(optind < argc) {

        string graph_path = argv[optind++];
        ifstream graph_stream(graph_path);
        if (!graph_stream) {
            cerr << "[count-vg-hap-depth] error: Unable to open input graph " << graph_path << endl;
            return 1;
        }
        unique_ptr<PathHandleGraph> graph = load_graph(graph_stream);
        graph_stream.close();
        if (progress) {
            cerr << "[count-vg-hap-depth]: Loaded graph" << endl;
        }

        // path handle to sample key (one per thread)
        vector<unordered_map<path_handle_t, string>> name_maps(get_thread_count());

        if (progress) {
            cerr << "[count-vg-hap-depth]: Calculating coverage with " << depth_base_counts.size() << " threads" << endl;
        }

        // parallel scan over all nodes of the graph
        graph->for_each_handle([&](handle_t handle) {
            int64_t t = omp_get_thread_num();
            // collect all the samples that step on the node
            set<string> sample_set;
            bool ref = false;
            graph->for_each_step_on_handle(handle, [&](step_handle_t step_handle) {
                const string& sample_name = get_sample_name(graph.get(), graph->get_path_handle_of_step(step_handle), name_maps[t], separator);
                if (ignore_sample.empty() || sample_name.compare(0, ignore_sample.length(), ignore_sample) != 0) {
                    if (!ref && sample_name.compare(0, ref_sample.length(), ref_sample) == 0) {
                        ref = true;
                    }
                    sample_set.insert(sample_name);
                }
            });
            // update the total coverage
            int64_t coverage = sample_set.size();
            if ((int64_t)depth_base_counts[t].size() <= coverage) {
                depth_base_counts[t].resize(coverage + 1, 0);
                depth_node_counts[t].resize(coverage + 1, 0);
            }
            depth_base_counts[t][coverage] += graph->get_length(handle);
            depth_node_counts[t][coverage] += 1;

            if (!ref && !ref_sample.empty()) {
                // update the nonref coverage at the same depth
                if ((int64_t)depth_base_counts_nonref[t].size() <= coverage) {
                    depth_base_counts_nonref[t].resize(coverage + 1, 0);
                    depth_node_counts_nonref[t].resize(coverage + 1, 0);
                }
                depth_base_counts_nonref[t][coverage] += graph->get_length(handle);
                depth_node_counts_nonref[t][coverage] += 1;
            }
        },
        true);
    }

    if (progress) {
        cerr << "[count-vg-hap-depth]: Merging data from different threads" << endl;
    }

    // merge up the threads into slot 0
    for (int64_t t = 1; t < (int64_t)get_thread_count(); ++t) {
        for (int64_t coverage = 0; coverage < (int64_t)depth_base_counts[t].size(); ++coverage) {
            if ((int64_t)depth_base_counts[0].size() <= coverage) {
                depth_base_counts[0].resize(coverage + 1, 0);
                depth_node_counts[0].resize(coverage + 1, 0);
            }
            depth_base_counts[0][coverage] += depth_base_counts[t][coverage];
            depth_node_counts[0][coverage] += depth_node_counts[t][coverage];

            if (!ref_sample.empty()) {
                if ((int64_t)depth_base_counts_nonref[0].size() <= coverage) {
                    depth_base_counts_nonref[0].resize(coverage + 1, 0);
                    depth_node_counts_nonref[0].resize(coverage + 1, 0);
                }
                if (coverage < (int64_t)depth_base_counts_nonref[t].size()) {
                    depth_base_counts_nonref[0][coverage] += depth_base_counts_nonref[t][coverage];
                    depth_node_counts_nonref[0][coverage] += depth_node_counts_nonref[t][coverage];
                }
            }
        }
    }

    // bugfix: pad the nonref tables out to the size of the main tables.
    // the merge loop above starts at t=1, so thread 0's own counts never
    // forced a resize of the nonref[0] vectors; if thread 0 saw the highest
    // coverage the print loop below would read past the end of them.
    if (!ref_sample.empty() && depth_base_counts_nonref[0].size() < depth_base_counts[0].size()) {
        depth_base_counts_nonref[0].resize(depth_base_counts[0].size(), 0);
        depth_node_counts_nonref[0].resize(depth_node_counts[0].size(), 0);
    }

    // there's almost certainly an stl one-line for this.. oh well
    // prefix (cumulative-from-0) sums
    function<vector<int64_t>(const vector<int64_t>&)> get_cumul = [](const vector<int64_t>& v) {
        int64_t tot = 0;
        vector<int64_t> cumul(v.size(), 0);
        for (int64_t i = 0; i < (int64_t)v.size(); ++i) {
            tot += v[i];
            cumul[i] = tot;
        }
        return cumul;
    };
    // suffix (cumulative-from-the-end) sums
    function<vector<int64_t>(const vector<int64_t>&)> get_lumuc = [](const vector<int64_t>& v) {
        int64_t tot = 0;
        vector<int64_t> cumul(v.size(), 0);
        for (int64_t i = v.size() - 1; i >= 0; --i) {
            tot += v[i];
            cumul[i] = tot;
        }
        return cumul;
    };

    // keep cumulative counts while we're at it
    // cumulate from 0
    vector<int64_t> node_counts_cumul = get_cumul(depth_node_counts[0]);
    vector<int64_t> base_counts_cumul = get_cumul(depth_base_counts[0]);
    vector<int64_t> node_counts_nonref_cumul = get_cumul(depth_node_counts_nonref[0]);
    vector<int64_t> base_counts_nonref_cumul = get_cumul(depth_base_counts_nonref[0]);

    //cumulate from end
    vector<int64_t> node_counts_lumuc = get_lumuc(depth_node_counts[0]);
    vector<int64_t> base_counts_lumuc = get_lumuc(depth_base_counts[0]);
    vector<int64_t> node_counts_nonref_lumuc = get_lumuc(depth_node_counts_nonref[0]);
    vector<int64_t> base_counts_nonref_lumuc = get_lumuc(depth_base_counts_nonref[0]);

    // print the results as a TSV table, one row per coverage depth
    cout << "hap-depth"
         << "\t" << "nodes" << "\t" << "bases"
         << "\t" << "nodes-cumul" << "\t" <<"bases-cumul"
         << "\t" << "nodes-cumul-rev" << "\t" << "bases-cumul-rev";
    if (!ref_sample.empty()) {
        cout << "\t" << "nodes-nonref" << "\t" << "bases-nonref"
             << "\t" << "nodes-cumul-nonref" << "\t" << "bases-cumul-nonref"
             << "\t" << "nodes-cumul-rev-nonref" << "\t" << "bases-cumul-rev-nonref";
    }
    cout << endl;

    for (int64_t coverage = 0; coverage < (int64_t)depth_base_counts[0].size(); ++coverage) {
        cout << coverage
             << "\t" << depth_node_counts[0][coverage] << "\t" << depth_base_counts[0][coverage]
             << "\t" << node_counts_cumul[coverage] << "\t" << base_counts_cumul[coverage]
             << "\t" << node_counts_lumuc[coverage] << "\t" << base_counts_lumuc[coverage];
        if (!ref_sample.empty()) {
            cout << "\t" << depth_node_counts_nonref[0][coverage] << "\t" << depth_base_counts_nonref[0][coverage]
                 << "\t" << node_counts_nonref_cumul[coverage] << "\t" << base_counts_nonref_cumul[coverage]
                 << "\t" << node_counts_nonref_lumuc[coverage] << "\t" << base_counts_nonref_lumuc[coverage];
        }
        cout << "\n";
    }

    return 0;
}

0 comments on commit f3d9a18

Please sign in to comment.