From f5b77bf0b7d413c785dc912e190c1ca9ff008239 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Sun, 2 Jun 2024 10:17:55 +0200 Subject: [PATCH] fix fuzzy matching cases --- .../dictionary/matching/fuzzy_matching.h | 221 ++++++++++-------- .../keyvi/index/internal/base_index_reader.h | 17 +- 2 files changed, 130 insertions(+), 108 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/matching/fuzzy_matching.h b/keyvi/include/keyvi/dictionary/matching/fuzzy_matching.h index a7d7bbff6..d488f14fc 100644 --- a/keyvi/include/keyvi/dictionary/matching/fuzzy_matching.h +++ b/keyvi/include/keyvi/dictionary/matching/fuzzy_matching.h @@ -47,6 +47,8 @@ namespace keyvi { namespace index { namespace internal { +template +class BaseIndexReader; template keyvi::dictionary::Match NextFilteredMatchSingle(const MatcherT&, const DeletedT&); template @@ -70,7 +72,21 @@ class FuzzyMatching final { template static FuzzyMatching FromSingleFsa(const fsa::automata_t& fsa, const std::string& query, const int32_t max_edit_distance, const size_t minimum_exact_prefix = 2) { - uint64_t state = fsa->GetStartState(); + return FromSingleFsa(fsa, fsa->GetStartState(), query, max_edit_distance, minimum_exact_prefix); + } + /** + * Create a fuzzy matcher from a single Fsa, starting at the given state instead of the fsa's start state + * + * @param fsa the fsa + * @param start_state the state to start from + * @param query the query + * @param max_edit_distance the maximum allowed edit distance + * @param minimum_exact_prefix the minimum exact prefix to match before matching approximate + */ + template + static FuzzyMatching FromSingleFsa(const fsa::automata_t& fsa, const uint64_t start_state, const std::string& query, + const int32_t max_edit_distance, const size_t minimum_exact_prefix = 2) { + uint64_t state = start_state; size_t depth = 0; size_t utf8_depth = 0; @@ -89,54 +105,8 @@ class FuzzyMatching final { return FuzzyMatching(); } - return FromSingleFsa(fsa, state, query, max_edit_distance, minimum_exact_prefix); - } - - /** - * Create a fuzzy matcher 
from a single Fsa - * - * @param fsa the fsa - * @param start_state the state to start from - * @param query the query - * @param max_edit_distance the maximum allowed edit distance - * @param exact_prefix the exact prefix that already matched - */ - template - static FuzzyMatching FromSingleFsa(const fsa::automata_t& fsa, const uint64_t start_state, const std::string& query, - const int32_t max_edit_distance, const size_t exact_prefix) { - if (start_state == 0) { - return FuzzyMatching(); - } - - std::unique_ptr metric; - std::unique_ptr> traverser; - Match first_match; - - std::vector codepoints; - utf8::unchecked::utf8to32(query.begin(), query.end(), back_inserter(codepoints)); - - if (start_state == 0) { - TRACE("query lengh < minimum exact prefix, returning empty iterator"); - return FuzzyMatching(); - } - - // initialize the distance metric with the exact prefix - metric.reset(new stringdistance::Levenshtein(codepoints, 20, max_edit_distance)); - for (size_t i = 0; i < exact_prefix; ++i) { - metric->Put(codepoints[i], i); - } - - traverser.reset(new fsa::CodePointStateTraverser(fsa, start_state)); - - if (fsa->IsFinalState(start_state) && metric->GetScore() <= max_edit_distance) { - TRACE("exact prefix matched"); - first_match = - Match(0, exact_prefix, metric->GetCandidate(), metric->GetScore(), fsa, fsa->GetStateValue(start_state)); - } - - TRACE("create iterator"); - return FuzzyMatching(std::move(traverser), std::move(metric), std::move(first_match), - max_edit_distance, exact_prefix); + return FromSingleFsaWithMatchedExactPrefix(fsa, state, query, max_edit_distance, + minimum_exact_prefix); } /** @@ -154,57 +124,7 @@ class FuzzyMatching final { std::vector> fsa_start_state_pairs = FilterWithExactPrefix(fsas, query, minimum_exact_prefix); - return FromMulipleFsas(fsa_start_state_pairs, query, max_edit_distance, minimum_exact_prefix); - } - - /** - * Create a fuzzy matcher with already matched exact prefix. 
- * - * @param fsa_start_state_pairs pairs of fsa and current state - * @param query the query - * @param max_edit_distance the maximum allowed edit distance - * @param exact_prefix the exact prefix that already matched - */ - template - static FuzzyMatching> FromMulipleFsas( - const std::vector>& fsa_start_state_pairs, const std::string& query, - const int32_t max_edit_distance, const size_t exact_prefix) { - // if the list of fsa's is empty return an empty matcher - if (fsa_start_state_pairs.size() == 0) { - return FuzzyMatching>(); - } - - std::unique_ptr metric; - std::unique_ptr>> traverser; - Match first_match; - - // decode the utf8 query into single codepoints - std::vector codepoints; - utf8::unchecked::utf8to32(query.begin(), query.end(), back_inserter(codepoints)); - - // initialize the distance metric with the exact prefix - metric.reset(new stringdistance::Levenshtein(codepoints, 20, max_edit_distance)); - for (size_t i = 0; i < exact_prefix; ++i) { - metric->Put(codepoints[i], i); - } - - // check for a match given the exact prefix - for (const auto& fsa_state : fsa_start_state_pairs) { - if (fsa_state.first->IsFinalState(fsa_state.second) && metric->GetScore() <= max_edit_distance) { - first_match = Match(0, exact_prefix, metric->GetCandidate(), metric->GetScore(), fsa_state.first, - fsa_state.first->GetStateValue(fsa_state.second)); - break; - } - } - - TRACE("create zip traverser with %ul inner traversers", fsa_start_state_pairs.size()); - fsa::ZipStateTraverser zip_state_traverser(fsa_start_state_pairs, false); - traverser.reset( - new fsa::CodePointStateTraverser>(std::move(zip_state_traverser))); - - TRACE("create iterator"); - return FuzzyMatching>( - std::move(traverser), std::move(metric), std::move(first_match), max_edit_distance, exact_prefix); + return FromMulipleFsasWithMatchedExactPrefix(fsa_start_state_pairs, query, max_edit_distance, minimum_exact_prefix); } static inline std::vector> FilterWithExactPrefix( @@ -285,12 +205,113 @@ class 
FuzzyMatching final { const size_t exact_prefix_; const Match first_match_; + template + friend class index::internal::BaseIndexReader; + // reset method for the index in the special case the match is deleted template friend Match index::internal::NextFilteredMatchSingle(const MatcherT&, const DeletedT&); template friend Match index::internal::NextFilteredMatch(const MatcherT&, const DeletedT&); + /** + * Create a fuzzy matcher from a single Fsa with already matched exact prefix. + * + * @param fsa the fsa + * @param start_state the state to start from + * @param query the query + * @param max_edit_distance the maximum allowed edit distance + * @param exact_prefix the exact prefix that already matched + */ + template + static FuzzyMatching FromSingleFsaWithMatchedExactPrefix(const fsa::automata_t& fsa, const uint64_t start_state, + const std::string& query, const int32_t max_edit_distance, + const size_t exact_prefix) { + if (start_state == 0) { + return FuzzyMatching(); + } + + std::unique_ptr metric; + std::unique_ptr> traverser; + Match first_match; + + std::vector codepoints; + utf8::unchecked::utf8to32(query.begin(), query.end(), back_inserter(codepoints)); + + if (codepoints.size() < exact_prefix) { /* guards the codepoints[i] reads below */ + TRACE("query lengh < minimum exact prefix, returning empty iterator"); + return FuzzyMatching(); + } + + // initialize the distance metric with the exact prefix + metric.reset(new stringdistance::Levenshtein(codepoints, 20, max_edit_distance)); + for (size_t i = 0; i < exact_prefix; ++i) { + metric->Put(codepoints[i], i); + } + + traverser.reset(new fsa::CodePointStateTraverser(fsa, start_state)); + + if (fsa->IsFinalState(start_state) && metric->GetScore() <= max_edit_distance) { + TRACE("exact prefix matched"); + first_match = + Match(0, exact_prefix, metric->GetCandidate(), metric->GetScore(), fsa, fsa->GetStateValue(start_state)); + } + + TRACE("create iterator"); + return FuzzyMatching(std::move(traverser), std::move(metric), std::move(first_match), + max_edit_distance, exact_prefix); + } + + /** + * Create 
a fuzzy matcher with already matched exact prefix. + * + * @param fsa_start_state_pairs pairs of fsa and current state + * @param query the query + * @param max_edit_distance the maximum allowed edit distance + * @param exact_prefix the exact prefix that already matched + */ + template + static FuzzyMatching> FromMulipleFsasWithMatchedExactPrefix( + const std::vector>& fsa_start_state_pairs, const std::string& query, + const int32_t max_edit_distance, const size_t exact_prefix) { + // if the list of fsa's is empty return an empty matcher + if (fsa_start_state_pairs.size() == 0) { + return FuzzyMatching>(); + } + + std::unique_ptr metric; + std::unique_ptr>> traverser; + Match first_match; + + // decode the utf8 query into single codepoints + std::vector codepoints; + utf8::unchecked::utf8to32(query.begin(), query.end(), back_inserter(codepoints)); + + // initialize the distance metric with the exact prefix + metric.reset(new stringdistance::Levenshtein(codepoints, 20, max_edit_distance)); + for (size_t i = 0; i < exact_prefix; ++i) { + metric->Put(codepoints[i], i); + } + + // check for a match given the exact prefix, the first fsa with a final state provides the match + for (const auto& fsa_state : fsa_start_state_pairs) { + if (fsa_state.first->IsFinalState(fsa_state.second) && metric->GetScore() <= max_edit_distance) { + first_match = Match(0, exact_prefix, metric->GetCandidate(), metric->GetScore(), fsa_state.first, + fsa_state.first->GetStateValue(fsa_state.second)); + break; + } + } + + TRACE("create zip traverser with %zu inner traversers", fsa_start_state_pairs.size()); + fsa::ZipStateTraverser zip_state_traverser(fsa_start_state_pairs, false); + traverser.reset( + new fsa::CodePointStateTraverser>(std::move(zip_state_traverser))); + + TRACE("create iterator"); + return FuzzyMatching>( + std::move(traverser), std::move(metric), std::move(first_match), max_edit_distance, exact_prefix); + } + void ResetLastMatch() {} }; diff --git a/keyvi/include/keyvi/index/internal/base_index_reader.h 
b/keyvi/include/keyvi/index/internal/base_index_reader.h index 78634bae9..4dbda79e3 100644 --- a/keyvi/include/keyvi/index/internal/base_index_reader.h +++ b/keyvi/include/keyvi/index/internal/base_index_reader.h @@ -130,8 +130,8 @@ class BaseIndexReader { } if (fsa_start_state_payloads.size() == 1) { - auto near_matcher = - std::make_shared>(dictionary::matching::NearMatching<>::FromSingleFsaWithMatchedExactPrefix( + auto near_matcher = std::make_shared>( + dictionary::matching::NearMatching<>::FromSingleFsaWithMatchedExactPrefix( std::get<0>(fsa_start_state_payloads[0]), std::get<1>(fsa_start_state_payloads[0]), query, minimum_exact_prefix, greedy)); @@ -157,7 +157,8 @@ class BaseIndexReader { auto near_matcher = std::make_shared< dictionary::matching::NearMatching>>( dictionary::matching::NearMatching>:: - FromMulipleFsasWithMatchedExactPrefix(std::move(fsa_start_state_payloads), query, minimum_exact_prefix, greedy)); + FromMulipleFsasWithMatchedExactPrefix(std::move(fsa_start_state_payloads), query, minimum_exact_prefix, + greedy)); if (deleted_keys_map.size() == 0) { auto func = [near_matcher]() { return near_matcher->NextMatch(); }; @@ -200,9 +201,9 @@ class BaseIndexReader { if (fsa_start_state_pairs.size() == 1) { auto fuzzy_matcher = std::make_shared>( - dictionary::matching::FuzzyMatching<>::FromSingleFsa<>(fsa_start_state_pairs[0].first, - fsa_start_state_pairs[0].second, query, - max_edit_distance, minimum_exact_prefix)); + dictionary::matching::FuzzyMatching<>::FromSingleFsaWithMatchedExactPrefix<>( + fsa_start_state_pairs[0].first, fsa_start_state_pairs[0].second, query, max_edit_distance, + minimum_exact_prefix)); for (auto it = segments->crbegin(); it != segments->crend(); it++) { if ((*it)->GetDictionary()->GetFsa() == fsa_start_state_pairs[0].first) { @@ -233,8 +234,8 @@ class BaseIndexReader { auto fuzzy_matcher = std::make_shared< dictionary::matching::FuzzyMatching>>>( dictionary::matching::FuzzyMatching>>:: - 
FromMulipleFsas>(fsa_start_state_pairs, query, max_edit_distance, - minimum_exact_prefix)); + FromMulipleFsasWithMatchedExactPrefix>( + fsa_start_state_pairs, query, max_edit_distance, minimum_exact_prefix)); if (deleted_keys_map.size() == 0) { auto func = [fuzzy_matcher]() { return fuzzy_matcher->NextMatch(); };