From 7dd138923655617743f23ffc9ad130eb006bdc8a Mon Sep 17 00:00:00 2001 From: Alexander Myltsev Date: Mon, 3 Dec 2018 20:42:56 +0300 Subject: [PATCH] Matcher (#67): add fuzzy matching without middling --- .../globalnames/index/matcher/Matcher.scala | 125 ++++++++++++------ 1 file changed, 88 insertions(+), 37 deletions(-) diff --git a/matcher/src/main/scala/org/globalnames/index/matcher/Matcher.scala b/matcher/src/main/scala/org/globalnames/index/matcher/Matcher.scala index a8c08e7..129b072 100644 --- a/matcher/src/main/scala/org/globalnames/index/matcher/Matcher.scala +++ b/matcher/src/main/scala/org/globalnames/index/matcher/Matcher.scala @@ -12,22 +12,24 @@ import parser.{ScientificNameParser => snp} import org.apache.commons.lang3.StringUtils import util.UuidEnhanced.javaUuid2thriftUuid import scalaz.syntax.std.boolean._ +import scalaz.syntax.std.option._ import scala.collection.parallel.ParSeq import scala.concurrent.Future import scala.util.Success import scala.concurrent.ExecutionContext.Implicits.global +import akka.http.impl.util._ @Singleton final case class CanonicalNames(private val namesRaw: Map[String, Set[Int]]) { - val names: Map[String, Set[Int]] = namesRaw.withDefaultValue(Set()) + val nameToDatasources: Map[String, Set[Int]] = namesRaw.withDefaultValue(Set()) } @Singleton class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Logging { val matcherLibFut: Future[matcherlib.Matcher] = - canonicalNamesFut.map { canonicalNames => matcherlib.Matcher(canonicalNames.names) } + canonicalNamesFut.map { canonicalNames => matcherlib.Matcher(canonicalNames.nameToDatasources) } private def matcherLib: matcherlib.Matcher = matcherLibFut.value match { case Some(Success(m)) => m @@ -39,9 +41,10 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi case _ => CanonicalNames(Map.empty) } - private[Matcher] case class CanonicalNameSplit(name: snp.Result, + private[Matcher] case class CanonicalNameSplit(nameProvidedUuid: Uuid, namePartialStr: String, - isOriginalCanonical: Boolean) { + isOriginalCanonical: Boolean, + advancedResolutionAllowed: Boolean) { val size: Int = namePartialStr.isEmpty ? 0 | (StringUtils.countMatches(namePartialStr, ' ') + 1) @@ -56,24 +59,47 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi this.copy(namePartialStr = "", isOriginalCanonical = false) } - def nameProvidedUuid: Uuid = name.preprocessorResult.id - def namePartial: Name = Name(uuid = UuidGenerator.generate(namePartialStr), value = namePartialStr) } private object CanonicalNameSplit { - def apply(name: snp.Result): CanonicalNameSplit = - CanonicalNameSplit(name, name.canonized().getOrElse(""), isOriginalCanonical = true) + def create(name: snp.Result): Option[CanonicalNameSplit] = { + for (can <- name.canonized()) yield { + CanonicalNameSplit( + nameProvidedUuid = name.preprocessorResult.id, + namePartialStr = can, + isOriginalCanonical = true, + advancedResolutionAllowed = true + ) + } + } + + def genusAndInfraspeciesOnly(name: snp.Result): Option[CanonicalNameSplit] = { + name.canonized().flatMap { can => + val canSplit = can.fastSplit(' ') + (canSplit.headOption, canSplit.lastOption) match { + case (Some(hd), Some(tl)) => + val can1 = s"$hd $tl" + val cns = CanonicalNameSplit( + nameProvidedUuid = name.preprocessorResult.id, + namePartialStr = can1, + isOriginalCanonical = false, + advancedResolutionAllowed = false + ) + cns.some + case _ => None + } + } + } } private case class FuzzyMatch(canonicalNameSplit: CanonicalNameSplit, candidates: Vector[Candidate]) - private - def resolveFromPartials(canonicalNameSplits: ParSeq[CanonicalNameSplit], - dataSourceIds: Set[Int], - advancedResolution: Boolean): ParSeq[Response] = { + private def resolveFromPartials(canonicalNameSplits: ParSeq[CanonicalNameSplit], + dataSourceIds: Set[Int], + advancedResolution: Boolean): ParSeq[Response] = { logger.info(s"Matcher service start for ${canonicalNameSplits.size} records") if (canonicalNameSplits.isEmpty) { ParSeq() @@ -82,11 +108,13 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi canonicalNameSplits.partition { cnp => (cnp.size == 1 && cnp.isOriginalCanonical) || cnp.size > 1 } + logger.info(s"Split: nonGenusOrUninomialSplits - ${nonGenusOrUninomialSplits.size}" + + s" and genusOnlyMatchesSplits - ${genusOnlyMatchesSplits.size}") - val noFuzzyMatches = + val noFuzzyMatchesResponses = for (goms <- genusOnlyMatchesSplits) yield { val matchKind = MK.CanonicalMatch(thrift.CanonicalMatch(partial = true)) - val dsIds = canonicalNames.names(goms.namePartialStr) + val dsIds = canonicalNames.nameToDatasources(goms.namePartialStr) val results = (dataSourceIds.isEmpty ? dsIds | dataSourceIds.intersect(dsIds)).nonEmpty.option { Result(nameMatched = goms.namePartial, matchKind = matchKind) @@ -96,10 +124,12 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi val (exactPartialCanonicalMatches, possibleFuzzyCanonicalMatches) = nonGenusOrUninomialSplits - .map { cns => (cns, canonicalNames.names(cns.namePartialStr)) } + .map { cns => (cns, canonicalNames.nameToDatasources(cns.namePartialStr)) } .partition { case (_, dsids) => (dataSourceIds.isEmpty ? dsids | dataSourceIds.intersect(dsids)).nonEmpty } + logger.info(s"Split: exactPartialCanonicalMatches - ${exactPartialCanonicalMatches.size}" + + s" and possibleFuzzyCanonicalMatches - ${possibleFuzzyCanonicalMatches.size}") val partialByGenusFuzzyResponses = for ((canonicalNameSplit, _) <- exactPartialCanonicalMatches) yield { @@ -114,7 +144,7 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi logger.info(s"Matcher library call for ${possibleFuzzyCanonicalMatches.size} records") val possibleFuzzyCanonicalsResponses = - for ((canNmSplit, _) <- possibleFuzzyCanonicalMatches.par) yield { + for ((canNmSplit, _) <- possibleFuzzyCanonicalMatches) yield { val matches = matcherLib.findMatches(canNmSplit.namePartialStr, dataSourceIds) FuzzyMatch(canNmSplit, matches) } @@ -126,17 +156,25 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi dataSourceIds.isEmpty ? fuzzyMatch.candidates.nonEmpty | fuzzyMatch.candidates.exists { candidate => - canonicalNames.names(candidate.term).intersect(dataSourceIds).nonEmpty + canonicalNames.nameToDatasources(candidate.term).intersect(dataSourceIds).nonEmpty } } val responsesYetEmpty = if (advancedResolution) { - resolveFromPartials(oonucrEmpty.map { _.canonicalNameSplit.shorten }, - dataSourceIds, - advancedResolution) + val (oonucrEmptyAllowed, oonucrEmptyRest) = + oonucrEmpty.partition { n => n.canonicalNameSplit.advancedResolutionAllowed } + + val partials = resolveFromPartials( + canonicalNameSplits = oonucrEmptyAllowed.map { _.canonicalNameSplit.shorten }, + dataSourceIds = dataSourceIds, + advancedResolution = advancedResolution) + val rest = for (fm <- oonucrEmptyRest) yield { + Response(inputUuid = fm.canonicalNameSplit.nameProvidedUuid, results = Seq()) + } + partials ++ rest } else { - for { fm <- oonucrEmpty } yield { + for (fm <- oonucrEmpty) yield { Response(inputUuid = fm.canonicalNameSplit.nameProvidedUuid, results = Seq()) } } @@ -146,7 +184,7 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi val results = fuzzyMatch.candidates .filter { candidate => dataSourceIds.isEmpty || - canonicalNames.names(candidate.term).intersect(dataSourceIds).nonEmpty + canonicalNames.nameToDatasources(candidate.term).intersect(dataSourceIds).nonEmpty } .map { candidate => val matchKind = @@ -164,8 +202,8 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi } logger.info(s"Matcher service completed for ${canonicalNameSplits.size} records") - val responses = - partialByGenusFuzzyResponses ++ responsesNonEmpty ++ responsesYetEmpty ++ noFuzzyMatches + val responses = partialByGenusFuzzyResponses ++ responsesNonEmpty ++ + responsesYetEmpty ++ noFuzzyMatchesResponses assert(responses.size == canonicalNameSplits.size) responses } @@ -182,23 +220,36 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi val responsesRest = namesParsedRest.map { np => Response(inputUuid = np.preprocessorResult.id, results = Seq()) } - val namesParsedSuccessfullySplits = namesParsedSuccessfully.map { np => CanonicalNameSplit(np) } + val namesParsedSuccessfullySplits = namesParsedSuccessfully.flatMap { np => + val cns = CanonicalNameSplit.create(name = np) + val cnsNoMiddle = CanonicalNameSplit.genusAndInfraspeciesOnly(name = np) + List(cns, cnsNoMiddle).flatten + } logger.info("Recursive fuzzy match started") - val responses = - resolveFromPartials( - namesParsedSuccessfullySplits, dataSourceIds.toSet, advancedResolution) ++ responsesRest + val responsesFromPartitials = resolveFromPartials( + canonicalNameSplits = namesParsedSuccessfullySplits, + dataSourceIds = dataSourceIds.toSet, + advancedResolution = advancedResolution) + val responses: Vector[Response] = + (responsesFromPartitials ++ responsesRest) + .groupBy { _.inputUuid } + .mapValues { vs => vs.head.copy(results = vs.flatMap { _.results }.seq) } + .values.toVector if (advancedResolution) { - responses.seq + responses } else { - for (response <- responses.seq) yield { - response.copy(results = response.results.filter { res => - res.matchKind match { - case MK.CanonicalMatch(cm) => cm.stemEditDistance > 0 || cm.verbatimEditDistance > 0 - case _ => false - } - }) - } + val result = + for (response <- responses) yield { + response.copy(results = response.results.filter { res => + res.matchKind match { + case MK.CanonicalMatch(cm) => + cm.partial || cm.stemEditDistance > 0 || cm.verbatimEditDistance > 0 + case _ => false + } + }) + } + result } } }