-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #89 from jaebeom-kim/master
add extract module to extract reads classified into a certain taxon
- Loading branch information
Showing
9 changed files
with
226 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -69,10 +69,19 @@ std::vector<Command> commands = { | |
"Jaebeom Kim <[email protected]>", | ||
"<i:query file(s)> <i:database directory> <o:output directory> <job ID> ", | ||
CITATION_SPACEPHARER, | ||
{{"FASTA", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile}, | ||
{{"FASTA/Q", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile}, | ||
{"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}, | ||
{"out dir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}, | ||
{"job ID", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, | ||
{"extract", extract, &localPar.extract, COMMAND_MAIN, | ||
"It extracts reads classified into a certain taxon. It should be used after classification.", | ||
nullptr, | ||
"Jaebeom Kim <[email protected]>", | ||
"<i:query file(s)> <i:read-by-read result> <i:database directory>", | ||
CITATION_SPACEPHARER, | ||
{{"FASTA/Q", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile}, | ||
{"read-by-read result", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile}, | ||
{"DB dir", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::directory}}}, | ||
{"grade", grade, &localPar.grade, COMMAND_EXPERT, | ||
"Grade the classification result (only for benchmarking)", | ||
nullptr, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
#include "LocalParameters.h" | ||
#include "FileUtil.h" | ||
#include "common.h" | ||
#include "Reporter.h" | ||
|
||
void setExtractDefaults(LocalParameters & par){ | ||
par.taxonomyPath = "" ; | ||
par.seqMode = 2; | ||
par.targetTaxId = 0; | ||
} | ||
|
||
int extract(int argc, const char **argv, const Command& command) | ||
{ | ||
LocalParameters & par = LocalParameters::getLocalInstance(); | ||
setExtractDefaults(par); | ||
par.parseParameters(argc, argv, command, true, Parameters::PARSE_ALLOW_EMPTY, 0); | ||
|
||
if (par.seqMode == 2) { | ||
// Check if the second argument is a directory | ||
if (FileUtil::directoryExists(par.filenames[2].c_str())) { | ||
cerr << "For '--seq-mode 2', please provide two query files." << endl; | ||
exit(1); | ||
} | ||
} else { | ||
// Check if the second argument is file | ||
if (!FileUtil::directoryExists(par.filenames[2].c_str())) { | ||
cerr << "For '--seq-mode 1' and '--seq-mode 3', please provide one query file." << endl; | ||
exit(1); | ||
} | ||
} | ||
|
||
if (par.targetTaxId == 0) { | ||
cerr << "Please provide a target taxon ID with --tax-id parameter." << endl; | ||
exit(1); | ||
} | ||
|
||
string classificationFileName = par.filenames[1 + (par.seqMode == 2)]; | ||
string dbDir = par.filenames[2 + (par.seqMode == 2)]; | ||
TaxID targetTaxID = par.targetTaxId; | ||
|
||
cout << "Loading taxonomy ... " << endl; | ||
NcbiTaxonomy *taxonomy = loadTaxonomy(dbDir, par.taxonomyPath); | ||
Reporter reporter(par, taxonomy); | ||
|
||
vector<size_t> readIdxs; | ||
|
||
cout << "Extracting reads classified to taxon " << targetTaxID << " ... " << flush; | ||
reporter.getReadsClassifiedToClade(targetTaxID, classificationFileName, readIdxs); | ||
cout << "done." << endl; | ||
|
||
string queryFileName = par.filenames[0]; | ||
size_t lastDotPos = queryFileName.find_last_of('.'); | ||
string baseName = ""; | ||
string extension = ""; | ||
|
||
if (queryFileName.substr(lastDotPos) == ".gz") { | ||
lastDotPos = queryFileName.substr(0, lastDotPos).find_last_of('.'); | ||
baseName = queryFileName.substr(0, lastDotPos); | ||
extension = queryFileName.substr(lastDotPos); | ||
// Remove the last ".gz" from the extension | ||
extension = extension.substr(0, extension.length() - 3); | ||
} else { | ||
baseName = queryFileName.substr(0, lastDotPos); | ||
extension = queryFileName.substr(lastDotPos); | ||
} | ||
string outFileName = baseName + "_" + to_string(targetTaxID) + extension; | ||
|
||
cout << "Writing extracted reads to " << outFileName << " ... " << flush; | ||
reporter.printSpecifiedReads(readIdxs, queryFileName, outFileName); | ||
cout << "done." << endl; | ||
|
||
if (par.seqMode == 2) { | ||
queryFileName = par.filenames[1]; | ||
lastDotPos = queryFileName.find_last_of('.'); | ||
|
||
if (queryFileName.substr(lastDotPos) == ".gz") { | ||
lastDotPos = queryFileName.substr(0, lastDotPos).find_last_of('.'); | ||
baseName = queryFileName.substr(0, lastDotPos); | ||
extension = queryFileName.substr(lastDotPos); | ||
extension = extension.substr(0, extension.length() - 3); | ||
} else { | ||
baseName = queryFileName.substr(0, lastDotPos); | ||
extension = queryFileName.substr(lastDotPos); | ||
} | ||
outFileName = baseName + "_" + to_string(targetTaxID) + extension; | ||
cout << "Writing extracted reads to " << outFileName << " ... " << flush; | ||
reporter.printSpecifiedReads(readIdxs, queryFileName, outFileName); | ||
cout << "done." << endl; | ||
} | ||
|
||
delete taxonomy; | ||
return 0; | ||
} |