Skip to content

Commit

Permalink
Merge pull request #90 from jaebeom-kim/master
Browse files Browse the repository at this point in the history
Redundancy reduction optimization + new DB
  • Loading branch information
jaebeom-kim authored Sep 29, 2024
2 parents 8ddb1c0 + be0cc44 commit d36b96d
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 37 deletions.
7 changes: 3 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,7 @@ Usage:
metabuli databases DB_NAME OUTDIR tmp
# NOTE
- A human genome (T2T-CHM13v2.0) is included in all databases except RefSeq_release.
- A human genome (GRCh38.p14) is included in RefSeq_release.
- A human genome (T2T-CHM13v2.0) is included in all databases below.
1. RefSeq Virus (8.1 GiB)
- NCBI RefSeq release 223 virus genomes
Expand All @@ -117,8 +116,8 @@ metabuli databases RefSeq OUTDIR tmp
- Database will be in OUT_DIR/gtdb
metabuli databases GTDB OUTDIR tmp
4. RefSeq Releases 217 (480.5 GiB) (OLD)
- Viral and prokaryotic genomes of RefSeq release 217 and human genome (GRCh38.p14)
4. RefSeq Releases 224 (619 GiB)
- Viral and prokaryotic genomes of RefSeq release 224.
metabuli databases RefSeq_release OUTDIR tmp
```
Downloaded files are stored in `OUTDIR/DB_NAME` directory, which can be provided for `classify` module as `DBDIR`.
Expand Down
12 changes: 11 additions & 1 deletion src/commons/FileMerger.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,17 @@ size_t FileMerger::smallest(const uint64_t lookingKmers[],
{
size_t idxOfMin = 0;
uint64_t min = lookingKmers[0];
int minTaxIdAtRank = taxId2speciesId.at((int) lookingInfos[0].sequenceID);
int minTaxIdAtRank;
if (min == UINT64_MAX) {
minTaxIdAtRank = INT_MAX;
} else {
// if (taxId2speciesId.find((int) lookingInfos[0].sequenceID) == taxId2speciesId.end()) {
// cerr << lookingKmers[0] << endl;
// cerr << "TaxID not found 1: " << lookingInfos[0].sequenceID << endl;
// exit(1);
// }
minTaxIdAtRank = taxId2speciesId.at((int) lookingInfos[0].sequenceID);
}
for(size_t i = 1; i < fileCnt; i++)
{
if(lookingKmers[i] < min ||
Expand Down
28 changes: 6 additions & 22 deletions src/commons/IndexCreator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,6 @@ void IndexCreator::makeBlocksForParallelProcessing() {
}

void IndexCreator::makeBlocksForParallelProcessing_accession_level() {

unordered_map<string, TaxID> acc2taxid;
TaxID maxTaxID = load_accession2taxid(acc2taxidFileName, acc2taxid);
newTaxID = std::max(getMaxTaxID() + 1, maxTaxID + 1);
Expand Down Expand Up @@ -498,9 +497,9 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq
// Make splits
vector<Split> splits;
size_t splitWidth = (kmerBuffer.startIndexOfReserve - startIdx) / par.threads;
for (size_t i = 0; i < par.threads - 1; i++) {
for (int i = 0; i < par.threads - 1; i++) {
for (size_t j = startIdx + splitWidth; j + 1 < kmerBuffer.startIndexOfReserve; j++) {
if (kmerBuffer.buffer[j].taxIdAtRank != kmerBuffer.buffer[j + 1].taxIdAtRank) {
if (kmerBuffer.buffer[j].ADkmer != kmerBuffer.buffer[j + 1].ADkmer) {
splits.emplace_back(startIdx, j);
startIdx = j + 1;
break;
Expand All @@ -520,46 +519,32 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq
TargetKmer * lookingKmer;
size_t lookingIndex;
int endFlag;
int hasSeenOtherStrains;
vector<TaxID> taxIds;
#pragma omp for schedule(dynamic, 1)
for(size_t split = 0; split < splits.size(); split ++){
lookingKmer = & kmerBuffer.buffer[splits[split].offset];
lookingIndex = splits[split].offset;
endFlag = 0;
for(size_t i = 1 + splits[split].offset; i < splits[split].end + 1 ; i++) {
hasSeenOtherStrains = 0;
taxIds.clear();
taxIds.push_back(taxIdList[lookingKmer->info.sequenceID]);

// Scan redundancy
while(lookingKmer->taxIdAtRank == kmerBuffer.buffer[i].taxIdAtRank){
if (lookingKmer->ADkmer != kmerBuffer.buffer[i].ADkmer) {
break;
}
while(lookingKmer->taxIdAtRank == kmerBuffer.buffer[i].taxIdAtRank &&
lookingKmer->ADkmer == kmerBuffer.buffer[i].ADkmer){
taxIds.push_back(taxIdList[kmerBuffer.buffer[i].info.sequenceID]);
if (par.accessionLevel) {
hasSeenOtherStrains += (taxonomy->taxonNode(taxIdList[lookingKmer->info.sequenceID])->parentTaxId
!= taxonomy->taxonNode(taxIdList[kmerBuffer.buffer[i].info.sequenceID]) -> parentTaxId);
} else {
hasSeenOtherStrains += (taxIdList[lookingKmer->info.sequenceID] != taxIdList[kmerBuffer.buffer[i].info.sequenceID]);
}
i++;
if(i == splits[split].end + 1){
endFlag = 1;
break;
}
}

lookingKmer->info.redundancy = (hasSeenOtherStrains > 0);
if(taxIds.size() > 1){
lookingKmer->info.sequenceID = taxonomy->LCA(taxIds)->taxId;
} else {
lookingKmer->info.sequenceID = taxIds[0];
}

idxOfEachSplit[split][cntOfEachSplit[split]] = lookingIndex;
cntOfEachSplit[split] ++;
idxOfEachSplit[split][cntOfEachSplit[split]++] = lookingIndex;
if(endFlag == 1) break;
lookingKmer = & kmerBuffer.buffer[i];
lookingIndex = i;
Expand All @@ -569,8 +554,7 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq
if(!((kmerBuffer.buffer[splits[split].end - 1].ADkmer == kmerBuffer.buffer[splits[split].end].ADkmer) &&
(kmerBuffer.buffer[splits[split].end - 1].taxIdAtRank == kmerBuffer.buffer[splits[split].end].taxIdAtRank))){
kmerBuffer.buffer[splits[split].end].info.sequenceID = taxIdList[kmerBuffer.buffer[splits[split].end].info.sequenceID];
idxOfEachSplit[split][cntOfEachSplit[split]] = splits[split].end;
cntOfEachSplit[split] ++;
idxOfEachSplit[split][cntOfEachSplit[split]++] = splits[split].end;
}
}
}
Expand Down
1 change: 1 addition & 0 deletions src/commons/IndexCreator.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <algorithm>
#include <ctime>
#include <fstream>
#include <unordered_set>
#include "printBinary.h"
#include "Mmap.h"
#include "Kmer.h"
Expand Down
4 changes: 0 additions & 4 deletions src/commons/SeqIterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -450,10 +450,6 @@ SeqIterator::fillBufferWithKmerFromBlock(const PredictedBlock &block, const char
kmerBuffer.buffer[posToWrite] = {UINT64_MAX, -1, 0, false};
} else {
addDNAInfo_TargetKmer(tempKmer, seq, block, kmerCnt);
// if(posToWrite >= kmerBuffer.bufferSize - 2) {
// cout << "HERE " << posToWrite << endl;
// return -1;
// }
kmerBuffer.buffer[posToWrite] = {tempKmer, taxIdAtRank, seqID, false};
}
posToWrite++;
Expand Down
2 changes: 1 addition & 1 deletion src/metabuli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ std::vector<DatabaseDownload> externalDownloads = {
},
{
"RefSeq_release",
"NCBI release 217 (Prokaryote & Virus) and a human genome (GRCh38.p14)",
"NCBI release 224 (Prokaryote & Virus) and a human genome (T2T-CHM13v2.0)",
"O'Leary et al. Reference sequence (RefSeq) database at NCBI: current status, taxonomic expansion, and functional annotation. Nucleic Acids Res. (2016)",
"https://www.ncbi.nlm.nih.gov/refseq/",
true, LocalParameters::DBTYPE_INDEX_DB, metabulidatabases_sh, metabulidatabases_sh_len,
Expand Down
44 changes: 39 additions & 5 deletions src/util/database-report.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ int databaseReport(int argc, const char **argv, const Command &command) {
// Load taxonomy
if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/";
NcbiTaxonomy * taxonomy = loadTaxonomy(dbDir, par.taxonomyPath);
// NcbiTaxonomy * taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp",
// par.taxonomyPath + "/nodes.dmp",
// par.taxonomyPath + "/merged.dmp");

// if (!FileUtil::directoryExists(par.taxonomyPath.c_str())) {
// cerr << "Error: taxonomy path " << par.taxonomyPath << " does not exist." << endl;
Expand All @@ -38,11 +41,42 @@ int databaseReport(int argc, const char **argv, const Command &command) {
return 1;
}

// // Load taxonomy
// const string names = par.taxonomyPath + "/names.dmp";
// const string nodes = par.taxonomyPath + "/nodes.dmp";
// const string merged = par.taxonomyPath + "/merged.dmp";
// auto * taxonomy = new NcbiTaxonomy(names, nodes, merged);
// string taxIDList = dbDir + "/taxID_list";
// if (!FileUtil::fileExists(taxIDList.c_str())) {
// cerr << "Error: taxID_list file " << taxIDList << " does not exist in the provided DBDIR." << endl;
// return 1;
// }

// FILE *taxIdFile;
// if ((taxIdFile = fopen((dbDir + "/taxID_list").c_str(), "r")) == NULL) {
// std::cout << "Cannot open the taxID list file." << std::endl;
// return 1;
// }
// char taxID[100];
// vector<TaxID> taxids2;
// const TaxonNode *node = taxonomy->taxonNode(562);
// cout << taxonomy->taxLineage(node) << endl;

// while (feof(taxIdFile) == 0) {
// fscanf(taxIdFile, "%s", taxID);
// TaxID taxId = atol(taxID);
// taxids2.push_back(taxId);
// if (!taxonomy->IsAncestor(562, taxId)) {
// cout << "wtf: " << taxId << endl;
// cout << taxonomy->getTaxIdAtRank(taxId, "species") << endl;
// const TaxonNode *node = taxonomy->taxonNode(taxId);
// cout << taxonomy->taxLineage(node) << endl;
// cout << "Parent: " << node -> parentTaxId << endl;
// }
// if (taxonomy->LCA(562, taxId) == 543) {
// cout << "wtf2: " << taxId << endl;
// cout << taxonomy->getTaxIdAtRank(taxId, "species") << endl;
// }
// }
// fclose(taxIdFile);
// cout << "LCA: " << taxonomy->LCA(taxids2)->taxId << endl;
// // exit(1);


// Load only the second column of acc2taxid.map as integers
vector<int> taxids;
Expand Down

0 comments on commit d36b96d

Please sign in to comment.