Skip to content

Commit

Permalink
merged
Browse files Browse the repository at this point in the history
  • Loading branch information
detain committed Dec 12, 2019
1 parent 12ba78d commit 4646540
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 22 deletions.
45 changes: 32 additions & 13 deletions src/Indexer/TNTIndexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -202,9 +202,15 @@ public function createIndex($indexName)
term_id INTEGER,
doc_id INTEGER,
field_id INTEGER,
field_len INTEGER,
position INTEGER,
hit_count INTEGER)");

$this->index->exec("CREATE TABLE IF NOT EXISTS docinfo (
doc_id INTEGER,
field_id INTEGER,
num_terms INTEGER)");

$this->index->exec("CREATE TABLE IF NOT EXISTS info (
key TEXT,
value INTEGER)");
Expand Down Expand Up @@ -457,6 +463,7 @@ public function saveToIndex($stems, $docId)
$terms = $this->saveWordlist($stems);
$this->saveDoclist($terms, $docId);
$this->saveHitList($stems, $docId, $terms);
$this->saveDocInfo($stems, $docId);
}

/**
Expand Down Expand Up @@ -540,33 +547,45 @@ public function saveDoclist($terms, $docId)

/**
 * Inserts rows into the hitlist table for the terms of each field of a document.
 *
 * NOTE(review): this span comes from a diff rendering, so it appears to show lines
 * from both before and after the change (two $insert assignments, two term loops).
 * Confirm against the real file which of them are actually present.
 *
 * @param $stems     iterated as field => list of terms
 * @param $docId     bound to the doc_id column of every inserted row
 * @param $termsList indexed by term; each entry is read through its 'id' key
 */
public function saveHitList($stems, $docId, $termsList)
{
// NOTE(review): as displayed, this unconditional return is the first statement, so
// nothing below it would execute and no hitlist rows would be written. Confirm
// whether the commit keeps or removes this line.
return;
// Fields are identified by a running counter that starts at 0, not by their name.
$fieldCounter = 0;
// Filled with counter => field name below, but never read inside this function.
$fields = [];

// This first INSERT text (with a position column) is overwritten by the assignment
// that follows it before anything is prepared.
$insert = "INSERT INTO hitlist (term_id, doc_id, field_id, position, hit_count)
VALUES (:term_id, :doc_id, :field_id, :position, :hit_count)";
// The statement that actually gets prepared stores field_len instead of position.
$insert = "INSERT INTO hitlist (term_id, doc_id, field_id, field_len, hit_count)
VALUES (:term_id, :doc_id, :field_id, :field_len, :hit_count)";
$stmt = $this->index->prepare($insert);

foreach ($stems as $field => $terms) {
$fields[$fieldCounter] = $field;
$positionCounter = 0;
// Number of occurrences of each distinct term within this field.
$termCounts = array_count_values($terms);
// Loop over every term occurrence, binding its position within the field.
foreach ($terms as $term) {
// Only terms present in $termsList are written.
if (isset($termsList[$term])) {
$stmt->bindValue(':term_id', $termsList[$term]['id']);
$stmt->bindValue(':doc_id', $docId);
$stmt->bindValue(':field_id', $fieldCounter);
$stmt->bindValue(':position', $positionCounter);
$stmt->bindValue(':hit_count', $termCounts[$term]);
$stmt->execute();
}
$positionCounter++;
// Total number of terms in this field; bound as field_len below.
$field_len = count($terms);
// Loop over distinct terms, one execute per term with its occurrence count.
foreach ($termCounts as $term => $hitCount) {
// NOTE(review): unlike the isset()-guarded loop shown above, this lookup is not
// guarded; confirm every term of $stems is guaranteed to exist in $termsList.
$stmt->bindValue(':term_id', $termsList[$term]['id']);
$stmt->bindValue(':doc_id', $docId);
$stmt->bindValue(':field_id', $fieldCounter);
$stmt->bindValue(':field_len', $field_len);
// $termCounts[$term] is the same value as $hitCount in this loop.
$stmt->bindValue(':hit_count', $termCounts[$term]);
$stmt->execute();
}
$fieldCounter++;
}
}

/**
 * Records, for every field of a document, how many terms that field contains.
 *
 * One docinfo row is written per field. Fields are identified by a counter that
 * starts at 0 and follows the iteration order of $stems, the same numbering
 * saveHitList uses for field_id.
 *
 * @param $stems iterated as field => list of terms; only the lists are used here
 * @param $docId value stored in the doc_id column of every row
 *
 * @return void
 */
public function saveDocInfo($stems, $docId)
{
    $insert = "INSERT INTO docinfo (doc_id, field_id, num_terms) VALUES (:doc, :field_id, :num_terms)";

    // The SQL text never changes, so prepare it once and re-execute it per field
    // instead of re-preparing on every iteration.
    $stmt = $this->index->prepare($insert);

    // The document id is the same for every row; values bound with bindValue stay
    // bound across repeated execute() calls.
    $stmt->bindValue(':doc', $docId);

    $fieldCounter = 0;
    foreach ($stems as $terms) {
        $stmt->bindValue(':field_id', $fieldCounter);
        $stmt->bindValue(':num_terms', count($terms));
        $stmt->execute();
        $fieldCounter++;
    }
}

public function getWordFromWordList($word)
{
$selectStmt = $this->index->prepare("SELECT * FROM wordlist WHERE term like :keyword LIMIT 1");
Expand Down
83 changes: 74 additions & 9 deletions src/TNTSearch.php
Original file line number Diff line number Diff line change
Expand Up @@ -101,25 +101,35 @@ public function search($phrase, $numOfResults = 100)
return $this->stemmer->stem($keyword);
});

$tfWeight = 1;
$dlWeight = 0.5;
$tfWeight = 1.2;
$dlWeight = 0.75;
$docScores = [];
$count = $this->totalDocumentsInCollection();
$avgFlen = $this->getAverageFieldLength();
$docTerms = array();

foreach ($keywords as $index => $term) {
$isLastKeyword = ($keywords->count() - 1) == $index;
$df = $this->totalMatchingDocuments($term, $isLastKeyword);
$idf = log($count / max(1, $df));
foreach ($this->getAllDocumentsForKeyword($term, false, $isLastKeyword) as $document) {
$docID = $document['doc_id'];
$tf = $document['hit_count'];
$idf = log(1 + ($count - $df + 0.5) / ($df + 0.5));
foreach ($this->getAllHitsForKeyword($term, true, $isLastKeyword) as $hit) {
$docID = $hit['doc_id'];
$tf = $hit['hit_count'];
$dlen = $hit['field_len'];
$fnorm = 1/sqrt($hit['field_len']);
$num = ($tfWeight + 1) * $tf;
$avgDlen = $avgFlen[$hit['field_id']];
$denom = $tfWeight
* ((1 - $dlWeight) + $dlWeight)
* ((1 - $dlWeight) + $dlWeight * $dlen / $avgDlen)
+ $tf;
$score = $idf * ($num / $denom);
$score = $fnorm * $idf * ($num / $denom);
$docScores[$docID] = isset($docScores[$docID]) ?
$docScores[$docID] + $score : $score;

if (!isset($docTerms[$docID])) {
$docTerms[$docID] = array();
}
$docTerms[$docID][$term] = 1;
}
}

Expand All @@ -128,7 +138,9 @@ public function search($phrase, $numOfResults = 100)
$docs = new Collection($docScores);

$totalHits = $docs->count();
$docs = $docs->map(function ($doc, $key) {
$docs = $docs->filter(function ($score, $docID) use ($docTerms, $keywords) {
return (count($docTerms[$docID]) == $keywords->count());
})->map(function ($doc, $key) {
return $key;
})->take($numOfResults);
$stopTimer = microtime(true);
Expand Down Expand Up @@ -262,6 +274,23 @@ public function getAllDocumentsForKeyword($keyword, $noLimit = false, $isLastKey
return $this->getAllDocumentsForStrictKeyword($word, $noLimit);
}

/**
 * Looks up the hitlist rows for a keyword.
 *
 * The keyword is first resolved through getWordlistByKeyword(); when that lookup
 * has no entry at index 0 an empty Collection is returned, otherwise the result is
 * handed to getAllHitsForStrictKeyword().
 *
 * @param $keyword
 * @param bool $noLimit       passed through to getAllHitsForStrictKeyword()
 * @param bool $isLastKeyword passed through to getWordlistByKeyword()
 *
 * @return Collection
 */
public function getAllHitsForKeyword($keyword, $noLimit = false, $isLastKeyword = false)
{
    $matches = $this->getWordlistByKeyword($keyword, $isLastKeyword);

    if (isset($matches[0])) {
        // TODO: Fuzzy
        return $this->getAllHitsForStrictKeyword($matches, $noLimit);
    }

    return new Collection([]);
}

/**
* @param $keyword
* @param bool $noLimit
Expand Down Expand Up @@ -518,4 +547,40 @@ private function getAllDocumentsForStrictKeyword($word, $noLimit)
$stmtDoc->execute();
return new Collection($stmtDoc->fetchAll(PDO::FETCH_ASSOC));
}

/**
 * Fetches every hitlist row for the first word of a wordlist lookup, ordered by
 * hit_count from highest to lowest.
 *
 * @param $word    wordlist result; only the 'id' of its first entry is used
 * @param $noLimit currently unused: the query is never limited
 *
 * @return Collection rows fetched as associative arrays
 */
private function getAllHitsForStrictKeyword($word, $noLimit)
{
    // TODO: limit?
    $statement = $this->index->prepare(
        "SELECT * FROM hitlist WHERE term_id = :id ORDER BY hit_count DESC"
    );

    $termId = $word[0]['id'];
    $statement->bindValue(':id', $termId);
    $statement->execute();

    $rows = $statement->fetchAll(PDO::FETCH_ASSOC);

    return new Collection($rows);
}

/**
 * Computes the average number of terms per field from the docinfo table.
 *
 * A single grouped query is used instead of one MAX(field_id) lookup followed by
 * one AVG query per field id.
 *
 * @return array average num_terms as float, keyed by integer field_id. Only field
 *               ids that have at least one docinfo row are present, so the result
 *               is an empty array when docinfo has no rows.
 */
private function getAverageFieldLength()
{
    // Rows with a NULL field_id are skipped; they could never be matched by a
    // "field_id = :field_id" comparison either.
    $query = "SELECT field_id, AVG(num_terms) FROM docinfo"
        . " WHERE field_id IS NOT NULL GROUP BY field_id";
    $stmtDoc = $this->index->prepare($query);
    $stmtDoc->execute();

    // FETCH_KEY_PAIR maps the first selected column (field_id) to the second
    // (the average) for each result row.
    $avgFlen = [];
    foreach ($stmtDoc->fetchAll(PDO::FETCH_KEY_PAIR) as $fieldId => $average) {
        // Normalise types: some PDO drivers/versions return numbers as strings.
        $avgFlen[(int) $fieldId] = (float) $average;
    }

    return $avgFlen;
}
}

0 comments on commit 4646540

Please sign in to comment.