Skip to content
This repository has been archived by the owner on May 22, 2021. It is now read-only.

Commit

Permalink
Matcher (#67): add fuzzy matching without middling
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-myltsev committed Dec 5, 2018
1 parent 0bd9b1a commit 7dd1389
Showing 1 changed file with 88 additions and 37 deletions.
125 changes: 88 additions & 37 deletions matcher/src/main/scala/org/globalnames/index/matcher/Matcher.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,24 @@ import parser.{ScientificNameParser => snp}
import org.apache.commons.lang3.StringUtils
import util.UuidEnhanced.javaUuid2thriftUuid
import scalaz.syntax.std.boolean._
import scalaz.syntax.std.option._

import scala.collection.parallel.ParSeq
import scala.concurrent.Future
import scala.util.Success
import scala.concurrent.ExecutionContext.Implicits.global
import akka.http.impl.util._

@Singleton
final case class CanonicalNames(private val namesRaw: Map[String, Set[Int]]) {
val names: Map[String, Set[Int]] = namesRaw.withDefaultValue(Set())
val nameToDatasources: Map[String, Set[Int]] = namesRaw.withDefaultValue(Set())
}

@Singleton
class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Logging {

val matcherLibFut: Future[matcherlib.Matcher] =
canonicalNamesFut.map { canonicalNames => matcherlib.Matcher(canonicalNames.names) }
canonicalNamesFut.map { canonicalNames => matcherlib.Matcher(canonicalNames.nameToDatasources) }

private def matcherLib: matcherlib.Matcher = matcherLibFut.value match {
case Some(Success(m)) => m
Expand All @@ -39,9 +41,10 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi
case _ => CanonicalNames(Map.empty)
}

private[Matcher] case class CanonicalNameSplit(name: snp.Result,
private[Matcher] case class CanonicalNameSplit(nameProvidedUuid: Uuid,
namePartialStr: String,
isOriginalCanonical: Boolean) {
isOriginalCanonical: Boolean,
advancedResolutionAllowed: Boolean) {

val size: Int =
namePartialStr.isEmpty ? 0 | (StringUtils.countMatches(namePartialStr, ' ') + 1)
Expand All @@ -56,24 +59,47 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi
this.copy(namePartialStr = "", isOriginalCanonical = false)
}

def nameProvidedUuid: Uuid = name.preprocessorResult.id

def namePartial: Name =
Name(uuid = UuidGenerator.generate(namePartialStr), value = namePartialStr)
}

private object CanonicalNameSplit {
def apply(name: snp.Result): CanonicalNameSplit =
CanonicalNameSplit(name, name.canonized().getOrElse(""), isOriginalCanonical = true)
def create(name: snp.Result): Option[CanonicalNameSplit] = {
for (can <- name.canonized()) yield {
CanonicalNameSplit(
nameProvidedUuid = name.preprocessorResult.id,
namePartialStr = can,
isOriginalCanonical = true,
advancedResolutionAllowed = true
)
}
}

def genusAndInfraspeciesOnly(name: snp.Result): Option[CanonicalNameSplit] = {
name.canonized().flatMap { can =>
val canSplit = can.fastSplit(' ')
(canSplit.headOption, canSplit.lastOption) match {
case (Some(hd), Some(tl)) =>
val can1 = s"$hd $tl"
val cns = CanonicalNameSplit(
nameProvidedUuid = name.preprocessorResult.id,
namePartialStr = can1,
isOriginalCanonical = false,
advancedResolutionAllowed = false
)
cns.some
case _ => None
}
}
}
}

private case class FuzzyMatch(canonicalNameSplit: CanonicalNameSplit,
candidates: Vector[Candidate])

private
def resolveFromPartials(canonicalNameSplits: ParSeq[CanonicalNameSplit],
dataSourceIds: Set[Int],
advancedResolution: Boolean): ParSeq[Response] = {
private def resolveFromPartials(canonicalNameSplits: ParSeq[CanonicalNameSplit],
dataSourceIds: Set[Int],
advancedResolution: Boolean): ParSeq[Response] = {
logger.info(s"Matcher service start for ${canonicalNameSplits.size} records")
if (canonicalNameSplits.isEmpty) {
ParSeq()
Expand All @@ -82,11 +108,13 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi
canonicalNameSplits.partition { cnp =>
(cnp.size == 1 && cnp.isOriginalCanonical) || cnp.size > 1
}
logger.info(s"Split: nonGenusOrUninomialSplits - ${nonGenusOrUninomialSplits.size}" +
s" and genusOnlyMatchesSplits - ${genusOnlyMatchesSplits.size}")

val noFuzzyMatches =
val noFuzzyMatchesResponses =
for (goms <- genusOnlyMatchesSplits) yield {
val matchKind = MK.CanonicalMatch(thrift.CanonicalMatch(partial = true))
val dsIds = canonicalNames.names(goms.namePartialStr)
val dsIds = canonicalNames.nameToDatasources(goms.namePartialStr)
val results =
(dataSourceIds.isEmpty ? dsIds | dataSourceIds.intersect(dsIds)).nonEmpty.option {
Result(nameMatched = goms.namePartial, matchKind = matchKind)
Expand All @@ -96,10 +124,12 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi

val (exactPartialCanonicalMatches, possibleFuzzyCanonicalMatches) =
nonGenusOrUninomialSplits
.map { cns => (cns, canonicalNames.names(cns.namePartialStr)) }
.map { cns => (cns, canonicalNames.nameToDatasources(cns.namePartialStr)) }
.partition { case (_, dsids) =>
(dataSourceIds.isEmpty ? dsids | dataSourceIds.intersect(dsids)).nonEmpty
}
logger.info(s"Split: exactPartialCanonicalMatches - ${exactPartialCanonicalMatches.size}" +
s" and possibleFuzzyCanonicalMatches - ${possibleFuzzyCanonicalMatches.size}")

val partialByGenusFuzzyResponses =
for ((canonicalNameSplit, _) <- exactPartialCanonicalMatches) yield {
Expand All @@ -114,7 +144,7 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi

logger.info(s"Matcher library call for ${possibleFuzzyCanonicalMatches.size} records")
val possibleFuzzyCanonicalsResponses =
for ((canNmSplit, _) <- possibleFuzzyCanonicalMatches.par) yield {
for ((canNmSplit, _) <- possibleFuzzyCanonicalMatches) yield {
val matches = matcherLib.findMatches(canNmSplit.namePartialStr, dataSourceIds)
FuzzyMatch(canNmSplit, matches)
}
Expand All @@ -126,17 +156,25 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi
dataSourceIds.isEmpty ?
fuzzyMatch.candidates.nonEmpty |
fuzzyMatch.candidates.exists { candidate =>
canonicalNames.names(candidate.term).intersect(dataSourceIds).nonEmpty
canonicalNames.nameToDatasources(candidate.term).intersect(dataSourceIds).nonEmpty
}
}

val responsesYetEmpty =
if (advancedResolution) {
resolveFromPartials(oonucrEmpty.map { _.canonicalNameSplit.shorten },
dataSourceIds,
advancedResolution)
val (oonucrEmptyAllowed, oonucrEmptyRest) =
oonucrEmpty.partition { n => n.canonicalNameSplit.advancedResolutionAllowed }

val partials = resolveFromPartials(
canonicalNameSplits = oonucrEmptyAllowed.map { _.canonicalNameSplit.shorten },
dataSourceIds = dataSourceIds,
advancedResolution = advancedResolution)
val rest = for (fm <- oonucrEmptyRest) yield {
Response(inputUuid = fm.canonicalNameSplit.nameProvidedUuid, results = Seq())
}
partials ++ rest
} else {
for { fm <- oonucrEmpty } yield {
for (fm <- oonucrEmpty) yield {
Response(inputUuid = fm.canonicalNameSplit.nameProvidedUuid, results = Seq())
}
}
Expand All @@ -146,7 +184,7 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi
val results = fuzzyMatch.candidates
.filter { candidate =>
dataSourceIds.isEmpty ||
canonicalNames.names(candidate.term).intersect(dataSourceIds).nonEmpty
canonicalNames.nameToDatasources(candidate.term).intersect(dataSourceIds).nonEmpty
}
.map { candidate =>
val matchKind =
Expand All @@ -164,8 +202,8 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi
}

logger.info(s"Matcher service completed for ${canonicalNameSplits.size} records")
val responses =
partialByGenusFuzzyResponses ++ responsesNonEmpty ++ responsesYetEmpty ++ noFuzzyMatches
val responses = partialByGenusFuzzyResponses ++ responsesNonEmpty ++
responsesYetEmpty ++ noFuzzyMatchesResponses
assert(responses.size == canonicalNameSplits.size)
responses
}
Expand All @@ -182,23 +220,36 @@ class Matcher @Inject()(canonicalNamesFut: Future[CanonicalNames]) extends Loggi
val responsesRest = namesParsedRest.map { np =>
Response(inputUuid = np.preprocessorResult.id, results = Seq())
}
val namesParsedSuccessfullySplits = namesParsedSuccessfully.map { np => CanonicalNameSplit(np) }
val namesParsedSuccessfullySplits = namesParsedSuccessfully.flatMap { np =>
val cns = CanonicalNameSplit.create(name = np)
val cnsNoMiddle = CanonicalNameSplit.genusAndInfraspeciesOnly(name = np)
List(cns, cnsNoMiddle).flatten
}
logger.info("Recursive fuzzy match started")
val responses =
resolveFromPartials(
namesParsedSuccessfullySplits, dataSourceIds.toSet, advancedResolution) ++ responsesRest
val responsesFromPartitials = resolveFromPartials(
canonicalNameSplits = namesParsedSuccessfullySplits,
dataSourceIds = dataSourceIds.toSet,
advancedResolution = advancedResolution)
val responses: Vector[Response] =
(responsesFromPartitials ++ responsesRest)
.groupBy { _.inputUuid }
.mapValues { vs => vs.head.copy(results = vs.flatMap { _.results }.seq) }
.values.toVector

if (advancedResolution) {
responses.seq
responses
} else {
for (response <- responses.seq) yield {
response.copy(results = response.results.filter { res =>
res.matchKind match {
case MK.CanonicalMatch(cm) => cm.stemEditDistance > 0 || cm.verbatimEditDistance > 0
case _ => false
}
})
}
val result =
for (response <- responses) yield {
response.copy(results = response.results.filter { res =>
res.matchKind match {
case MK.CanonicalMatch(cm) =>
cm.partial || cm.stemEditDistance > 0 || cm.verbatimEditDistance > 0
case _ => false
}
})
}
result
}
}
}

0 comments on commit 7dd1389

Please sign in to comment.