From f44a6a8acc4261756db7c09bef9a2c63707f286a Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Sun, 29 Sep 2024 15:13:19 +0900 Subject: [PATCH 1/2] redundancy reduction became faster for species with thousands of genomes --- src/commons/FileMerger.cpp | 12 +++++++++- src/commons/IndexCreator.cpp | 28 +++++------------------ src/commons/IndexCreator.h | 1 + src/commons/SeqIterator.cpp | 4 ---- src/metabuli.cpp | 2 +- src/util/database-report.cpp | 44 ++++++++++++++++++++++++++++++++---- 6 files changed, 58 insertions(+), 33 deletions(-) diff --git a/src/commons/FileMerger.cpp b/src/commons/FileMerger.cpp index f2b7a947..c53e9e0f 100644 --- a/src/commons/FileMerger.cpp +++ b/src/commons/FileMerger.cpp @@ -263,7 +263,17 @@ size_t FileMerger::smallest(const uint64_t lookingKmers[], { size_t idxOfMin = 0; uint64_t min = lookingKmers[0]; - int minTaxIdAtRank = taxId2speciesId.at((int) lookingInfos[0].sequenceID); + int minTaxIdAtRank; + if (min == UINT64_MAX) { + minTaxIdAtRank = INT_MAX; + } else { + // if (taxId2speciesId.find((int) lookingInfos[0].sequenceID) == taxId2speciesId.end()) { + // cerr << lookingKmers[0] << endl; + // cerr << "TaxID not found 1: " << lookingInfos[0].sequenceID << endl; + // exit(1); + // } + minTaxIdAtRank = taxId2speciesId.at((int) lookingInfos[0].sequenceID); + } for(size_t i = 1; i < fileCnt; i++) { if(lookingKmers[i] < min || diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp index c41c65fb..61175006 100644 --- a/src/commons/IndexCreator.cpp +++ b/src/commons/IndexCreator.cpp @@ -220,7 +220,6 @@ void IndexCreator::makeBlocksForParallelProcessing() { } void IndexCreator::makeBlocksForParallelProcessing_accession_level() { - unordered_map acc2taxid; TaxID maxTaxID = load_accession2taxid(acc2taxidFileName, acc2taxid); newTaxID = std::max(getMaxTaxID() + 1, maxTaxID + 1); @@ -498,9 +497,9 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq // Make splits vector splits; size_t splitWidth = (kmerBuffer.startIndexOfReserve - startIdx) / par.threads; - for (size_t i = 0; i < par.threads - 1; i++) { + for (int i = 0; i < par.threads - 1; i++) { for (size_t j = startIdx + splitWidth; j + 1 < kmerBuffer.startIndexOfReserve; j++) { - if (kmerBuffer.buffer[j].taxIdAtRank != kmerBuffer.buffer[j + 1].taxIdAtRank) { + if (kmerBuffer.buffer[j].ADkmer != kmerBuffer.buffer[j + 1].ADkmer) { splits.emplace_back(startIdx, j); startIdx = j + 1; break; @@ -520,7 +519,6 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq TargetKmer * lookingKmer; size_t lookingIndex; int endFlag; - int hasSeenOtherStrains; vector taxIds; #pragma omp for schedule(dynamic, 1) for(size_t split = 0; split < splits.size(); split ++){ @@ -528,38 +526,25 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq lookingIndex = splits[split].offset; endFlag = 0; for(size_t i = 1 + splits[split].offset; i < splits[split].end + 1 ; i++) { - hasSeenOtherStrains = 0; taxIds.clear(); taxIds.push_back(taxIdList[lookingKmer->info.sequenceID]); - // Scan redundancy - while(lookingKmer->taxIdAtRank == kmerBuffer.buffer[i].taxIdAtRank){ - if (lookingKmer->ADkmer != kmerBuffer.buffer[i].ADkmer) { - break; - } + while(lookingKmer->taxIdAtRank == kmerBuffer.buffer[i].taxIdAtRank && + lookingKmer->ADkmer == kmerBuffer.buffer[i].ADkmer){ taxIds.push_back(taxIdList[kmerBuffer.buffer[i].info.sequenceID]); - if (par.accessionLevel) { - hasSeenOtherStrains += (taxonomy->taxonNode(taxIdList[lookingKmer->info.sequenceID])->parentTaxId - != taxonomy->taxonNode(taxIdList[kmerBuffer.buffer[i].info.sequenceID]) -> parentTaxId); - } else { - hasSeenOtherStrains += (taxIdList[lookingKmer->info.sequenceID] != taxIdList[kmerBuffer.buffer[i].info.sequenceID]); - } i++; if(i == splits[split].end + 1){ endFlag = 1; break; } } - - lookingKmer->info.redundancy = (hasSeenOtherStrains > 0); if(taxIds.size() > 1){ lookingKmer->info.sequenceID = taxonomy->LCA(taxIds)->taxId; } else { lookingKmer->info.sequenceID = taxIds[0]; } - idxOfEachSplit[split][cntOfEachSplit[split]] = lookingIndex; - cntOfEachSplit[split] ++; + idxOfEachSplit[split][cntOfEachSplit[split]++] = lookingIndex; if(endFlag == 1) break; lookingKmer = & kmerBuffer.buffer[i]; lookingIndex = i; @@ -569,8 +554,7 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq if(!((kmerBuffer.buffer[splits[split].end - 1].ADkmer == kmerBuffer.buffer[splits[split].end].ADkmer) && (kmerBuffer.buffer[splits[split].end - 1].taxIdAtRank == kmerBuffer.buffer[splits[split].end].taxIdAtRank))){ kmerBuffer.buffer[splits[split].end].info.sequenceID = taxIdList[kmerBuffer.buffer[splits[split].end].info.sequenceID]; - idxOfEachSplit[split][cntOfEachSplit[split]] = splits[split].end; - cntOfEachSplit[split] ++; + idxOfEachSplit[split][cntOfEachSplit[split]++] = splits[split].end; } } } diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h index c491cb20..8b1911c7 100644 --- a/src/commons/IndexCreator.h +++ b/src/commons/IndexCreator.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "printBinary.h" #include "Mmap.h" #include "Kmer.h" diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp index 58f610ff..46725f99 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -450,10 +450,6 @@ SeqIterator::fillBufferWithKmerFromBlock(const PredictedBlock &block, const char kmerBuffer.buffer[posToWrite] = {UINT64_MAX, -1, 0, false}; } else { addDNAInfo_TargetKmer(tempKmer, seq, block, kmerCnt); - // if(posToWrite >= kmerBuffer.bufferSize - 2) { - // cout << "HERE " << posToWrite << endl; - // return -1; - // } kmerBuffer.buffer[posToWrite] = {tempKmer, taxIdAtRank, seqID, false}; } posToWrite++; diff --git a/src/metabuli.cpp b/src/metabuli.cpp index 4326ecfb..4b137e7e 100644 --- a/src/metabuli.cpp +++ b/src/metabuli.cpp @@ -159,7 +159,7 @@ std::vector externalDownloads = { }, { "RefSeq_release", - "NCBI release 217 (Prokaryote & Virus) and a human genome (GRCh38.p14)", + "NCBI release 224 (Prokaryote & Virus) and a human genome (T2T-CHM13v2.0)", "O'Leary et al. Reference sequence (RefSeq) database at NCBI: current status, taxonomic expansion, and functional annotation. Nucleic Acids Res. (2016)", "https://www.ncbi.nlm.nih.gov/refseq/", true, LocalParameters::DBTYPE_INDEX_DB, metabulidatabases_sh, metabulidatabases_sh_len, diff --git a/src/util/database-report.cpp b/src/util/database-report.cpp index 4aee73e4..cbf6c067 100644 --- a/src/util/database-report.cpp +++ b/src/util/database-report.cpp @@ -24,6 +24,9 @@ int databaseReport(int argc, const char **argv, const Command &command) { // Load taxonomy if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/"; NcbiTaxonomy * taxonomy = loadTaxonomy(dbDir, par.taxonomyPath); + // NcbiTaxonomy * taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp", + // par.taxonomyPath + "/nodes.dmp", + // par.taxonomyPath + "/merged.dmp"); // if (!FileUtil::directoryExists(par.taxonomyPath.c_str())) { // cerr << "Error: taxonomy path " << par.taxonomyPath << " does not exist." << endl; @@ -38,11 +41,42 @@ int databaseReport(int argc, const char **argv, const Command &command) { return 1; } - // // Load taxonomy - // const string names = par.taxonomyPath + "/names.dmp"; - // const string nodes = par.taxonomyPath + "/nodes.dmp"; - // const string merged = par.taxonomyPath + "/merged.dmp"; - // auto * taxonomy = new NcbiTaxonomy(names, nodes, merged); + // string taxIDList = dbDir + "/taxID_list"; + // if (!FileUtil::fileExists(taxIDList.c_str())) { + // cerr << "Error: taxID_list file " << taxIDList << " does not exist in the provided DBDIR." << endl; + // return 1; + // } + + // FILE *taxIdFile; + // if ((taxIdFile = fopen((dbDir + "/taxID_list").c_str(), "r")) == NULL) { + // std::cout << "Cannot open the taxID list file." << std::endl; + // return 1; + // } + // char taxID[100]; + // vector taxids2; + // const TaxonNode *node = taxonomy->taxonNode(562); + // cout << taxonomy->taxLineage(node) << endl; + + // while (feof(taxIdFile) == 0) { + // fscanf(taxIdFile, "%s", taxID); + // TaxID taxId = atol(taxID); + // taxids2.push_back(taxId); + // if (!taxonomy->IsAncestor(562, taxId)) { + // cout << "wtf: " << taxId << endl; + // cout << taxonomy->getTaxIdAtRank(taxId, "species") << endl; + // const TaxonNode *node = taxonomy->taxonNode(taxId); + // cout << taxonomy->taxLineage(node) << endl; + // cout << "Parent: " << node -> parentTaxId << endl; + // } + // if (taxonomy->LCA(562, taxId) == 543) { + // cout << "wtf2: " << taxId << endl; + // cout << taxonomy->getTaxIdAtRank(taxId, "species") << endl; + // } + // } + // fclose(taxIdFile); + // cout << "LCA: " << taxonomy->LCA(taxids2)->taxId << endl; + // // exit(1); + // Load only the second column of acc2taxid.map as integers vector taxids; From 49c4f1e29c427663da56496eafefa32490640f3e Mon Sep 17 00:00:00 2001 From: Jaebeom Kim Date: Sun, 29 Sep 2024 15:16:50 +0900 Subject: [PATCH 2/2] change DB discription --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0def9a03..71f26b4f 100644 --- a/README.md +++ b/README.md @@ -96,8 +96,7 @@ Usage: metabuli databases DB_NAME OUTDIR tmp # NOTE -- A human genome (T2T-CHM13v2.0) is included in all databases except RefSeq_release. -- A human genome (GRCh38.p14) is included in RefSeq_release. +- A human genome (T2T-CHM13v2.0) is included in all databases below. 1. RefSeq Virus (8.1 GiB) - NCBI RefSeq release 223 virus genomes @@ -114,8 +113,8 @@ metabuli databases RefSeq OUTDIR tmp - Database will be in OUT_DIR/gtdb metabuli databases GTDB OUTDIR tmp -4. RefSeq Releases 217 (480.5 GiB) (OLD) -- Viral and prokaryotic genomes of RefSeq release 217 and human genome (GRCh38.p14) +4. RefSeq Releases 224 (619 GiB) +- Viral and prokaryotic genomes of RefSeq release 224. metabuli databases RefSeq_release OUTDIR tmp ``` Downloaded files are stored in `OUTDIR/DB_NAME` directory, which can be provided for `classify` module as `DBDIR`.