Skip to content

Commit

Permalink
Merge pull request #16 from ComparativeGenomicsToolkit/update-deps
Browse files Browse the repository at this point in the history
Switch output format from protobuf to HandleGraph
  • Loading branch information
glennhickey authored May 12, 2020
2 parents fb3b437 + 1f1d966 commit 40507b2
Show file tree
Hide file tree
Showing 18 changed files with 1,051 additions and 413 deletions.
7 changes: 5 additions & 2 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
[submodule "deps/hal"]
path = deps/hal
url = https://github.com/glennhickey/hal.git
url = https://github.com/ComparativeGenomicsToolkit/hal.git
[submodule "deps/sonLib"]
path = deps/sonLib
url = https://github.com/benedictpaten/sonLib.git
url = https://github.com/ComparativeGenomicsToolkit/sonLib.git
[submodule "deps/hal2sg"]
path = deps/hal2sg
url = https://github.com/glennhickey/hal2sg.git
[submodule "deps/libbdsg-easy"]
path = deps/libbdsg-easy
url = https://github.com/vgteam/libbdsg-easy.git
31 changes: 31 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Control file for continuous integration testing at http://travis-ci.org/

language: cpp
compiler: gcc

before_install:
- git submodule update --init --recursive
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libomp; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo apt-get -qq update; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo apt-get install -y libhdf5-serial-dev python3 python3-pip libpython3-dev wget; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install hdf5 python3.6 python3-pip || echo "a brew error code when installing gcc is expected"; fi

install:
- sudo pip3 install setuptools --upgrade
- wget https://github.com/vgteam/vg/releases/download/v1.24.0/vg && chmod u+x vg

script:
- export PATH=$(pwd):$PATH
- export PATH=$(pwd)/deps/hal/bin:$PATH
- make test

dist: bionic
osx_image: xcode10.1

matrix:
include:
- os: linux
compiler: gcc
#- os: osx
# compiler: clang

6 changes: 3 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# creates an image containing vg and hal2vg

# build on compatible vg image
FROM quay.io/vgteam/vg:v1.17.0-0-gaa0b37860-t315-build
FROM quay.io/vgteam/vg:v1.24.0

# update system and install dependencies not present in vg image
RUN apt-get -qq update && apt-get -qq install -y libhdf5-serial-dev
RUN apt-get -qq update && apt-get -qq install -y libhdf5-dev build-essential python3-dev python3-pip

# copy current directory to docker
ADD . /hal2vg
Expand All @@ -16,4 +16,4 @@ WORKDIR /hal2vg
RUN make

# add hal2vg to the PATH
ENV PATH /hal2vg:$PATH
ENV PATH /hal2vg:/hal2vg/deps/hal/bin:$PATH
26 changes: 19 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,19 @@ sidegraphInc = ${sgExportPath}/sidegraph.h ${sgExportPath}/sgcommon.h ${sgExport
all : hal2vg

cleanFast :
rm -f hal2vg hal2vg.o sg2vgproto.o
rm -f hal2vg hal2vg.o sg2vghandle.o

# Full clean: remove every object/binary this Makefile builds (the same set
# cleanFast removes), then clean each bundled dependency tree.
# Every recipe line runs in its own shell, so each `cd` is chained to its
# command with &&. ${MAKE} is used instead of a literal `make` so that
# command-line flags (-n, -j, ...) are propagated to the sub-makes.
clean :
	rm -f hal2vg hal2vg.o sg2vghandle.o
	cd deps/sonLib && ${MAKE} clean
	cd deps/hal && ${MAKE} clean
	cd deps/hal2sg && ${MAKE} clean
	cd deps/libbdsg-easy && ${MAKE} clean

sg2vgproto.o : sg2vgproto.cpp sg2vgproto.h ${sidegraphInc} ${basicLibsDependencies}
${cpp} ${cppflags} -I . sg2vgproto.cpp -c
sg2vghandle.o : sg2vghandle.cpp sg2vghandle.h ${sidegraphInc} ${basicLibsDependencies}
${cpp} ${cppflags} -I . sg2vghandle.cpp -c

hal2vg.o : hal2vg.cpp sg2vgproto.h ${sidegraphInc} ${basicLibsDependencies}
hal2vg.o : hal2vg.cpp sg2vghandle.h ${sidegraphInc} ${basicLibsDependencies}
${cpp} ${cppflags} -I . hal2vg.cpp -c

${sonLibPath}/sonLib.a :
Expand All @@ -30,9 +31,20 @@ ${halPath}/halLib.a : ${sonLibPath}/sonLib.a
${hal2sgPath}/libhal2sg.a : ${halPath}/halLib.a
cd deps/hal2sg && make

hal2vg : hal2vg.o sg2vgproto.o ${basicLibsDependencies}
# Build the bundled libbdsg-easy submodule. ${MAKE} (not a literal `make`)
# propagates command-line flags such as -n and -j to the sub-make.
${libbdsgPath}/lib/libbdsg.a :
	cd deps/libbdsg-easy && ${MAKE}

# Companion archives that basicLibs (include.mk) links against. They have no
# build command of their own here; they are declared as depending on
# libbdsg.a so that the libbdsg-easy build above runs before anything needs
# them (presumably that build is what produces them -- confirm against
# deps/libbdsg-easy). The paths must be the ones listed in basicLibs, i.e.
# directly under ${libbdsgPath}/lib.
# The trailing `;` is an intentionally empty recipe: with it, make re-checks
# the archive's real timestamp instead of assuming the file was just rebuilt,
# which would otherwise force everything depending on it to rebuild each run.
${libbdsgPath}/lib/libhandlegraph.a : ${libbdsgPath}/lib/libbdsg.a ;

${libbdsgPath}/lib/libsdsl.a : ${libbdsgPath}/lib/libbdsg.a ;

${libbdsgPath}/lib/libdivsufsort.a : ${libbdsgPath}/lib/libbdsg.a ;

${libbdsgPath}/lib/libdivsufsort64.a : ${libbdsgPath}/lib/libbdsg.a ;

hal2vg : hal2vg.o sg2vghandle.o ${basicLibsDependencies}
cd deps/hal2sg && make
${cpp} ${cppflags} -pthread hal2vg.o sg2vgproto.o ${basicLibs} -o hal2vg
${cpp} ${cppflags} -fopenmp -pthread hal2vg.o sg2vghandle.o ${basicLibs} -o hal2vg

test : hal2vg
cd tests && VGDIR=${PWD}/${VGDIR} prove -v small.t
cd tests && prove -v small.t
63 changes: 34 additions & 29 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# hal2vg
Prototype code for converting [HAL](https://github.com/glennhickey/hal) to [vg](https://github.com/vgteam/vg).
[![Build Status](https://travis-ci.org/ComparativeGenomicsToolkit/hal2vg.svg?branch=master)](https://travis-ci.org/ComparativeGenomicsToolkit/hal2vg)

(c) 2016 Glenn Hickey. See [LICENSE](https://github.com/glennhickey/hal2vg/blob/master/LICENSE) for details.
Prototype code for converting [HAL](https://github.com/glennhickey/hal) to [vg](https://github.com/vgteam/vg).

See also:
* [hal2sg](https://github.com/glennhickey/hal2sg): Convert [HAL](https://github.com/glennhickey/hal) (output by [Cactus](https://github.com/glennhickey/progressiveCactus) and [CAMEL](https://github.com/adamnovak/sequence-graphs)) to [Side Graph SQL](https://github.com/ga4gh/schemas/wiki/Human-Genome-Variation-Reference-(HGVR)-Pilot-Project#graph-format)
Expand All @@ -19,20 +19,29 @@ This tool is a composition of `hal2sg` and `sg2vg`. It converts HAL into an in-

## Installing Dependencies

#### vg
#### HDF5 1.10.1 with C++ API enabled

* [vg](https://github.com/vgteam/vg) must be downloaded and built before hal2vg
* Edit hal2vg/include.mk and make sure that VGDIR points to the correct vg directory
* Using apt (Ubuntu 18.04)

#### HDF5 1.10.1 with C++ API enabled
`sudo apt install libhdf5-dev`

* Using [MacPorts](http://www.macports.org/):

* Use build from [Progressive Cactus](https://github.com/glennhickey/progressiveCactus) by downloading and building Progressive Cactus *then* running `. environment` in the progressive cactus directory before building hal2vg.
`sudo port install hdf5 @1.10.1 +cxx`

* Or Local install from source into DIR (do not need root password)
* From [Source](http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.1/src/):

`mkdir DIR/hdf5`
`wget http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.1/src/hdf5-1.10.1.tar.gz`
`tar xzf hdf5-1.10.1.tar.gz`
`tar xzf hdf5-1.10.1.tar.gz`
`cd hdf5-1.10.1`
`./configure --enable-cxx`
`make && make install`

* Local install from source into DIR (do not need root password)

`mkdir DIR/hdf5`
`wget http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.1/src/hdf5-1.10.1.tar.gz`
`tar xzf hdf5-1.10.1.tar.gz`
`cd hdf5-1.10.1`
`./configure --enable-cxx --prefix DIR/hdf5`
`make && make install`
Expand All @@ -41,39 +50,35 @@ This tool is a composition of `hal2sg` and `sg2vg`. It converts HAL into an in-

`export PATH=DIR/hdf5/bin:${PATH}`
`export h5prefix=-prefix=DIR/hdf5`

or set these in include.local.mk.

* Or From [Source](http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.1/src/):

`wget http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.1/src/hdf5-1.10.1.tar.gz`
`tar xzf hdf5-1.10.1.tar.gz`
`cd hdf5-1.10.1`
`./configure --enable-cxx`
`make && make install`

* Or Using [MacPorts](http://www.macports.org/):

sudo port install hdf5 @1.10.1 +cxx
If you are using an older version of HDF5, such as the one installed on CentOS,
you may need to set

`export CXX_ABI_DEF='-D_GLIBCXX_USE_CXX11_ABI=1'`

if you get undefined-function errors based on the string type, with messages about
`std::__cxx11::basic_string` vs `std::basic_string`.

## Instructions

**Cloning:** Don't forget to clone submodules with the `--recursive` option:

git clone https://github.com/glennhickey/hal2vg.git --recursive

**Setting your VG path:**

* Compile `vg` with `make static`
* Edit `include.mk` so that `VGDIR` points to where you've built [vg](https://github.com/vgteam/vg). By default it will be `../vg`
* Change `LIBPROTOBUF=$(VGLIBDIR)/libprotobuf.a` to the system library that was used to build vg. For example: `LIBPROTOBUF=/usr/lib/x86_64-linux-gnu/libprotobuf.a` in `include.mk`. You can find it on Ubuntu with `dpkg -L libprotobuf-dev`

**Compiling:**

make

To run the converter:

hal2vg input.hal > output.vg
hal2vg input.hal > output.pg

To see all the options, run with no args or use `--help`.

Note: The output vg may have nodes with sequence length up to 1MB, and will need to be chopped (ex `vg mod -X 32`) before indexing with `vg index`.
Note: The output graph may have nodes with sequence length up to 1MB, and will need to be chopped (ex `vg mod -X 32`) before indexing with `vg index`.

Note: The output graph is only readable by vg version 1.24.0 and greater.

(c) 2016 Glenn Hickey. See [LICENSE](https://github.com/glennhickey/hal2vg/blob/master/LICENSE) for details.
1 change: 1 addition & 0 deletions deps/libbdsg-easy
Submodule libbdsg-easy added at 537567
60 changes: 34 additions & 26 deletions hal2vg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@

#include "sgbuilder.h"
#include "side2seq.h"
#include "sg2vgproto.h"
#include "vg.pb.h"
#include "sg2vghandle.h"
#include "bdsg/packed_graph.hpp"
#include "bdsg/hash_graph.hpp"
#include "bdsg/odgi.hpp"

using namespace std;
using namespace hal;
using namespace handlegraph;
using namespace bdsg;

static bool isCamelHal(AlignmentConstPtr aligment);
static void breadthFirstGenomeSearch(const Genome* reference,
Expand Down Expand Up @@ -64,15 +68,15 @@ static void initParser(CLParser* optionsParser)
optionsParser->addOptionFlag("keepCase",
"don't convert all nucleotides to upper case",
false);
optionsParser->addOption("protoChunk",
"maximum size (approx) of output protobuf chunks (bytes)",
30000000);
optionsParser->addOption("refSequenceFile",
"white-space delimited list of sequence names in the "
"reference genome which will *not* be collapsed by duplications."
" Overrides --refDupes", "\"\"");
optionsParser->addOption("outputFormat",
"output graph format in {pg, hg, odgi} [default=pg]",
"pg");

optionsParser->setDescription("Convert HAL alignment to vg protobuf");
optionsParser->setDescription("Convert HAL alignment to handle graph");

}

Expand All @@ -93,8 +97,8 @@ int main(int argc, char** argv)
// larger graphs. So we only use to make sure we don't overflow protobuf.
// Todo: tune down?
const int chop = 1000000;
int protoChunk;
string refSequenceFile;
string outputFormat;
try
{
optionsParser.parseOptions(argc, argv);
Expand All @@ -106,22 +110,21 @@ int main(int argc, char** argv)
refDupes = optionsParser.getFlag("refDupes");
onlySequenceNames = optionsParser.getFlag("onlySequenceNames");
keepCase = optionsParser.getFlag("keepCase");
protoChunk = optionsParser.getOption<int>("protoChunk");
outputFormat = optionsParser.getOption<string>("outputFormat");
refSequenceFile = optionsParser.getOption<string>("refSequenceFile");
if (rootGenomeName != "\"\"" && targetGenomes != "\"\"")
{
throw hal_exception("--rootGenome and --targetGenomes options are "
"mutually exclusive");
}
if (protoChunk > 60000000)
{
cerr << "Warning: --protoChunk parameter set dangerously high." << endl;
}
if (refSequenceFile != "\"\"" && refGenomeName == "\"\"")
{
throw hal_exception("--refSequenceFile must be used in conjunction "
" with --refGenome");
}
if (outputFormat != "pg" && outputFormat != "hg" && outputFormat != "odgi") {
throw hal_exception("--outputFormat must be one of {pg, hg, odgi}");
}
}
catch(exception& e)
{
Expand Down Expand Up @@ -308,21 +311,26 @@ int main(int argc, char** argv)
const vector<SGNamedPath>& outPaths = converter.getOutPaths();


// write to vg proto
cerr << "Writing VG protobuf to stdout" << endl;
SG2VGProto vgWriter;
vgWriter.init(&cout);
//vgWriter.writeGraph(outGraph, outBases, outPaths);

// chunking parameters designed to keep well under protobuf limit
int nodeCount = max(1UL, protoChunk / (sizeof(vg::Node) + chop));
int edgeCount = max(1UL, protoChunk / sizeof(vg::Edge));
// very conservative here assuming avg path size of 1
int segmentCount = max(1UL, protoChunk /
(sizeof(vg::Mapping) + sizeof(vg::Path)));
// convert to vg handle
cerr << "Converting SideGraph to HandleGraph" << endl;

vgWriter.writeChunkedGraph(outGraph, outBases, outPaths,
nodeCount, edgeCount, segmentCount);
unique_ptr<MutablePathMutableHandleGraph> graph;
if (outputFormat == "pg") {
graph = unique_ptr<MutablePathMutableHandleGraph>(new PackedGraph());
} else if (outputFormat == "hg") {
graph = unique_ptr<MutablePathMutableHandleGraph>(new HashGraph());
} else if (outputFormat == "odgi") {
graph = unique_ptr<MutablePathMutableHandleGraph>(new ODGI());
} else {
assert(false);
}

SG2VGHandle vgConverter;
vgConverter.convert(outGraph, outBases, outPaths, graph.get());

// write tot stdout
cerr << "Writing HandleGraph to stdout" << endl;
dynamic_cast<SerializableHandleGraph*>(graph.get())->serialize(cout);

//cout << *sgbuild.getSideGraph() << endl;

Expand Down
16 changes: 3 additions & 13 deletions include.mk
Original file line number Diff line number Diff line change
@@ -1,17 +1,6 @@
binPath=${rootPath}
libPath=${rootPath}

#IMPORTANT: must change this to where you built vg
VGDIR=../vg
# Since we're writing protobuf directly for now (to avoid making a whole-graph in memory index),
# we only link against the bare minimum to write the proto objects.
VGLIBDIR=$(VGDIR)/lib
LIBPROTOBUF=$(VGLIBDIR)/libprotobuf.a
LIBVG=$(VGLIBDIR)/libvg.a
LIBHTS=$(VGLIBDIR)/libhts.a
LIBDEFLATE=$(VGLIBDIR)/libdeflate.a
VGLIBS=$(LIBVG) $(VGLIBDIR)/libvgio.a $(LIBPROTOBUF) $(LIBHTS) $(LIBDEFLATE) -llzma -lbz2

sonLibRootPath=deps/sonLib
sonLibPath=${sonLibRootPath}/lib

Expand All @@ -24,12 +13,13 @@ hal2sgPath=${rootPath}/deps/hal2sg
sg2vgPath=${rootPath}/deps/sg2vg
rapidJsonPath=${sg2vgPath}/rapidjson
sgExportPath=${hal2sgPath}/sgExport
libbdsgPath=${rootPath}/deps/libbdsg-easy

include ${sonLibRootPath}/include.mk

cflags += -I ${sonLibPath} -I ${halPath} -I ${halIncPath} -I ${halLIIncPath} -I ${sgExportPath} -I ${hal2sgPath}
cppflags += -std=c++11 -I ${sonLibPath} -I ${halPath} -I ${halIncPath} -I ${halLIIncPath} -I ${sgExportPath} -I ${hal2sgPath} -I ${VGDIR}/include -I ${VGDIR}/include/vg -I ${VGDIR}/include/vg/io -I ${VGDIR}/src -UNDEBUG
basicLibs = ${hal2sgPath}/libhal2sg.a ${sgExportPath}/sgExport.a ${halPath}/libHalLiftover.a ${halPath}/libHal.a ${VGLIBS} ${sonLibPath}/sonLib.a ${sonLibPath}/cuTest.a
cppflags += -std=c++11 -I ${sonLibPath} -I ${halPath} -I ${halIncPath} -I ${halLIIncPath} -I ${sgExportPath} -I ${hal2sgPath} -I ${libbdsgPath}/include -UNDEBUG
basicLibs = ${hal2sgPath}/libhal2sg.a ${sgExportPath}/sgExport.a ${halPath}/libHalLiftover.a ${halPath}/libHal.a ${VGLIBS} ${sonLibPath}/sonLib.a ${sonLibPath}/cuTest.a ${libbdsgPath}/lib/libbdsg.a ${libbdsgPath}/lib/libhandlegraph.a ${libbdsgPath}/lib/libsdsl.a ${libbdsgPath}/lib/libdivsufsort.a ${libbdsgPath}/lib/libdivsufsort64.a
basicLibsDependencies = ${basicLibs}

# hdf5 compilation is done through its wrappers.
Expand Down
Loading

0 comments on commit 40507b2

Please sign in to comment.