Merge pull request #90 from jaebeom-kim/master

Redundancy reduction optimization + new DB
steineggerlab · Sep 29, 2024 · d36b96d · d36b96d
2 parents 8ddb1c0 + be0cc44
commit d36b96d
Show file tree

Hide file tree

Showing 7 changed files with 61 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -99,8 +99,7 @@ Usage:
 metabuli databases DB_NAME OUTDIR tmp
 
 # NOTE
-- A human genome (T2T-CHM13v2.0) is included in all databases except RefSeq_release.
-- A human genome (GRCh38.p14) is included in RefSeq_release.  
+- A human genome (T2T-CHM13v2.0) is included in all databases below.
 
 1. RefSeq Virus (8.1 GiB)
 - NCBI RefSeq release 223 virus genomes
@@ -117,8 +116,8 @@ metabuli databases RefSeq OUTDIR tmp
 - Database will be in OUT_DIR/gtdb 
 metabuli databases GTDB OUTDIR tmp
 
-4. RefSeq Releases 217 (480.5 GiB) (OLD)
-- Viral and prokaryotic genomes of RefSeq release 217 and human genome (GRCh38.p14)
+4. RefSeq Releases 224 (619 GiB)
+- Viral and prokaryotic genomes of RefSeq release 224.
 metabuli databases RefSeq_release OUTDIR tmp
 ```
 Downloaded files are stored in `OUTDIR/DB_NAME` directory, which can be provided for `classify` module as `DBDIR`.

diff --git a/src/commons/FileMerger.cpp b/src/commons/FileMerger.cpp
@@ -263,7 +263,17 @@ size_t FileMerger::smallest(const uint64_t lookingKmers[],
 {
     size_t idxOfMin = 0;
     uint64_t min = lookingKmers[0];
-    int minTaxIdAtRank = taxId2speciesId.at((int) lookingInfos[0].sequenceID);
+    int minTaxIdAtRank;
+    if (min == UINT64_MAX) {
+        minTaxIdAtRank = INT_MAX;
+    } else {
+        // if (taxId2speciesId.find((int) lookingInfos[0].sequenceID) == taxId2speciesId.end()) {
+        //     cerr << lookingKmers[0] << endl;
+        //     cerr << "TaxID not found 1: " << lookingInfos[0].sequenceID << endl;
+        //     exit(1);
+        // }
+        minTaxIdAtRank = taxId2speciesId.at((int) lookingInfos[0].sequenceID);
+    }
     for(size_t i = 1; i < fileCnt; i++)
     {
         if(lookingKmers[i] < min ||

diff --git a/src/commons/IndexCreator.cpp b/src/commons/IndexCreator.cpp
@@ -220,7 +220,6 @@ void IndexCreator::makeBlocksForParallelProcessing() {
 }
 
 void IndexCreator::makeBlocksForParallelProcessing_accession_level() {
-
     unordered_map<string, TaxID> acc2taxid;
     TaxID maxTaxID = load_accession2taxid(acc2taxidFileName, acc2taxid);
     newTaxID = std::max(getMaxTaxID() + 1, maxTaxID + 1);
@@ -498,9 +497,9 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq
     // Make splits
     vector<Split> splits;
     size_t splitWidth = (kmerBuffer.startIndexOfReserve - startIdx) / par.threads;
-    for (size_t i = 0; i < par.threads - 1; i++) {
+    for (int i = 0; i < par.threads - 1; i++) {
         for (size_t j = startIdx + splitWidth; j + 1 < kmerBuffer.startIndexOfReserve; j++) {
-            if (kmerBuffer.buffer[j].taxIdAtRank != kmerBuffer.buffer[j + 1].taxIdAtRank) {
+            if (kmerBuffer.buffer[j].ADkmer != kmerBuffer.buffer[j + 1].ADkmer) {
                 splits.emplace_back(startIdx, j);
                 startIdx = j + 1;
                 break;
@@ -520,46 +519,32 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq
         TargetKmer * lookingKmer;
         size_t lookingIndex;
         int endFlag;
-        int hasSeenOtherStrains;
         vector<TaxID> taxIds;
 #pragma omp for schedule(dynamic, 1)
         for(size_t split = 0; split < splits.size(); split ++){
             lookingKmer = & kmerBuffer.buffer[splits[split].offset];
             lookingIndex = splits[split].offset;
             endFlag = 0;
             for(size_t i = 1 + splits[split].offset; i < splits[split].end + 1 ; i++) {
-                hasSeenOtherStrains = 0;
                 taxIds.clear();
                 taxIds.push_back(taxIdList[lookingKmer->info.sequenceID]);
-
                 // Scan redundancy
-                while(lookingKmer->taxIdAtRank == kmerBuffer.buffer[i].taxIdAtRank){
-                    if (lookingKmer->ADkmer != kmerBuffer.buffer[i].ADkmer) {
-                        break;
-                    }
+                while(lookingKmer->taxIdAtRank == kmerBuffer.buffer[i].taxIdAtRank &&
+                      lookingKmer->ADkmer == kmerBuffer.buffer[i].ADkmer){
                     taxIds.push_back(taxIdList[kmerBuffer.buffer[i].info.sequenceID]);
-                    if (par.accessionLevel) {
-                        hasSeenOtherStrains += (taxonomy->taxonNode(taxIdList[lookingKmer->info.sequenceID])->parentTaxId 
-                                                != taxonomy->taxonNode(taxIdList[kmerBuffer.buffer[i].info.sequenceID]) -> parentTaxId);
-                    } else {
-                        hasSeenOtherStrains += (taxIdList[lookingKmer->info.sequenceID] != taxIdList[kmerBuffer.buffer[i].info.sequenceID]);
-                    }
                     i++;
                     if(i == splits[split].end + 1){
                         endFlag = 1;
                         break;
                     }
                 }
-
-                lookingKmer->info.redundancy = (hasSeenOtherStrains > 0);
                 if(taxIds.size() > 1){
                     lookingKmer->info.sequenceID = taxonomy->LCA(taxIds)->taxId;
                 } else {
                     lookingKmer->info.sequenceID = taxIds[0];
                 }
 
-                idxOfEachSplit[split][cntOfEachSplit[split]] = lookingIndex;
-                cntOfEachSplit[split] ++;
+                idxOfEachSplit[split][cntOfEachSplit[split]++] = lookingIndex;
                 if(endFlag == 1) break;
                 lookingKmer = & kmerBuffer.buffer[i];
                 lookingIndex = i;
@@ -569,8 +554,7 @@ void IndexCreator::reduceRedundancy(TargetKmerBuffer & kmerBuffer, size_t * uniq
             if(!((kmerBuffer.buffer[splits[split].end - 1].ADkmer == kmerBuffer.buffer[splits[split].end].ADkmer) &&
                  (kmerBuffer.buffer[splits[split].end - 1].taxIdAtRank == kmerBuffer.buffer[splits[split].end].taxIdAtRank))){
                 kmerBuffer.buffer[splits[split].end].info.sequenceID = taxIdList[kmerBuffer.buffer[splits[split].end].info.sequenceID];
-                idxOfEachSplit[split][cntOfEachSplit[split]] = splits[split].end;
-                cntOfEachSplit[split] ++;
+                idxOfEachSplit[split][cntOfEachSplit[split]++] = splits[split].end;
             }
         }
     }

diff --git a/src/commons/IndexCreator.h b/src/commons/IndexCreator.h
@@ -6,6 +6,7 @@
 #include <algorithm>
 #include <ctime>
 #include <fstream>
+#include <unordered_set>
 #include "printBinary.h"
 #include "Mmap.h"
 #include "Kmer.h"

diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp
@@ -450,10 +450,6 @@ SeqIterator::fillBufferWithKmerFromBlock(const PredictedBlock &block, const char
             kmerBuffer.buffer[posToWrite] = {UINT64_MAX, -1, 0, false};
         } else {
             addDNAInfo_TargetKmer(tempKmer, seq, block, kmerCnt);
-            // if(posToWrite >= kmerBuffer.bufferSize - 2) {
-            //     cout << "HERE " << posToWrite << endl;
-            //     return -1;
-            // }
             kmerBuffer.buffer[posToWrite] = {tempKmer, taxIdAtRank, seqID, false};
         }
         posToWrite++;

diff --git a/src/metabuli.cpp b/src/metabuli.cpp
@@ -159,7 +159,7 @@ std::vector<DatabaseDownload> externalDownloads = {
         },
         {
                 "RefSeq_release",
-                "NCBI release 217 (Prokaryote & Virus) and a human genome (GRCh38.p14)",
+                "NCBI release 224 (Prokaryote & Virus) and a human genome (T2T-CHM13v2.0)",
                 "O'Leary et al. Reference sequence (RefSeq) database at NCBI: current status, taxonomic expansion, and functional annotation. Nucleic Acids Res. (2016)",
                 "https://www.ncbi.nlm.nih.gov/refseq/",
                 true, LocalParameters::DBTYPE_INDEX_DB, metabulidatabases_sh, metabulidatabases_sh_len,

diff --git a/src/util/database-report.cpp b/src/util/database-report.cpp
@@ -24,6 +24,9 @@ int databaseReport(int argc, const char **argv, const Command &command) {
     // Load taxonomy
     if (par.taxonomyPath == "DBDIR/taxonomy/") par.taxonomyPath = dbDir + "/taxonomy/";
     NcbiTaxonomy * taxonomy = loadTaxonomy(dbDir, par.taxonomyPath);
+    // NcbiTaxonomy * taxonomy = new NcbiTaxonomy(par.taxonomyPath + "/names.dmp",
+    //                         par.taxonomyPath + "/nodes.dmp",
+    //                         par.taxonomyPath + "/merged.dmp");
 
     // if (!FileUtil::directoryExists(par.taxonomyPath.c_str())) {
     //     cerr << "Error: taxonomy path " << par.taxonomyPath << " does not exist." << endl;
@@ -38,11 +41,42 @@ int databaseReport(int argc, const char **argv, const Command &command) {
         return 1;
     }
 
-    // // Load taxonomy
-    // const string names = par.taxonomyPath + "/names.dmp";
-    // const string nodes = par.taxonomyPath + "/nodes.dmp";
-    // const string merged = par.taxonomyPath + "/merged.dmp";
-    // auto * taxonomy = new NcbiTaxonomy(names, nodes, merged);
+    // string taxIDList = dbDir + "/taxID_list";
+    // if (!FileUtil::fileExists(taxIDList.c_str())) {
+    //     cerr << "Error: taxID_list file " << taxIDList << " does not exist in the provided DBDIR." << endl;
+    //     return 1;
+    // }
+
+    // FILE *taxIdFile;
+    // if ((taxIdFile = fopen((dbDir + "/taxID_list").c_str(), "r")) == NULL) {
+    //     std::cout << "Cannot open the taxID list file." << std::endl;
+    //     return 1;
+    // }
+    // char taxID[100];
+    // vector<TaxID> taxids2;
+    // const TaxonNode *node = taxonomy->taxonNode(562);
+    // cout << taxonomy->taxLineage(node) << endl;
+
+    // while (feof(taxIdFile) == 0) {
+    //     fscanf(taxIdFile, "%s", taxID);
+    //     TaxID taxId = atol(taxID);
+    //     taxids2.push_back(taxId);
+    //     if (!taxonomy->IsAncestor(562, taxId)) {
+    //         cout << "wtf: " << taxId << endl;
+    //         cout << taxonomy->getTaxIdAtRank(taxId, "species") << endl;
+    //         const TaxonNode *node = taxonomy->taxonNode(taxId);
+    //         cout << taxonomy->taxLineage(node) << endl;
+    //         cout << "Parent: " << node -> parentTaxId << endl;
+    //     }
+    //     if (taxonomy->LCA(562, taxId) == 543) {
+    //         cout << "wtf2: " << taxId << endl;
+    //         cout << taxonomy->getTaxIdAtRank(taxId, "species") << endl;
+    //     }
+    // }
+    // fclose(taxIdFile);
+    // cout << "LCA: " << taxonomy->LCA(taxids2)->taxId << endl;
+    // // exit(1);
+
 
     // Load only the second column of acc2taxid.map as integers
     vector<int> taxids;