Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tec: Improve performance to retrieve links to fetch #701

Merged
merged 2 commits into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/jobs/scheduled/LinksSync.php
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ public function perform(): void
{
$fetch_service = new services\LinkFetcher();

$links = models\Link::listToFetch(25);
$links = models\Link::listToFetch(max: 25);
foreach ($links as $link) {
$has_lock = $link->lock();
if (!$has_lock) {
Expand Down
48 changes: 48 additions & 0 deletions src/migrations/Migration202410020001AddToBeFetchedToLinks.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
<?php

namespace App\migrations;

class Migration202410020001AddToBeFetchedToLinks
{
public function migrate(): bool
{
$database = \Minz\Database::get();

$database->exec(<<<'SQL'
ALTER TABLE links
ADD COLUMN to_be_fetched BOOLEAN NOT NULL DEFAULT true;

UPDATE links
SET to_be_fetched = false
WHERE fetched_at IS NOT NULL
AND (
(fetched_code >= 200 AND fetched_code < 300)
OR fetched_count > 25
);

CREATE INDEX idx_links_to_be_fetched ON links(to_be_fetched) WHERE to_be_fetched = true;

DROP INDEX idx_links_fetched_at;
DROP INDEX idx_links_fetched_code;
SQL);

return true;
}

public function rollback(): bool
{
$database = \Minz\Database::get();

$database->exec(<<<'SQL'
CREATE INDEX idx_links_fetched_at ON links(fetched_at) WHERE fetched_at IS NULL;
CREATE INDEX idx_links_fetched_code ON links(fetched_code) WHERE fetched_code < 200 OR fetched_code >= 300;

DROP INDEX idx_links_to_be_fetched;

ALTER TABLE links
DROP COLUMN to_be_fetched;
SQL);

return true;
}
}
3 changes: 3 additions & 0 deletions src/models/Link.php
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ class Link
#[Database\Column]
public ?string $image_filename = null;

#[Database\Column]
public bool $to_be_fetched = true;

#[Database\Column]
public ?\DateTimeImmutable $fetched_at = null;

Expand Down
33 changes: 16 additions & 17 deletions src/models/dao/links/FetcherQueries.php
Original file line number Diff line number Diff line change
Expand Up @@ -67,23 +67,23 @@ public static function listEntryIdsToUrlsByCollectionId(string $collection_id):
}

/**
* Return a list of links to fetch (fetched_at is null, or fetched_code is in error).
* Return a list of links to fetch.
*
* Links in error are not returned if their fetched_count is greater than
* 25 or if fetched_at is too close (a number of seconds depending on the
* fetched_count value).
* The links with a fetched_at too close (a number of seconds depending on
* the fetched_count value) are not returned.
*
* The list is limited by the $max parameter, and is randomly ordered.
*
* @return self[]
*/
public static function listToFetch(int $max_number): array
public static function listToFetch(int $max): array
{
$sql = <<<SQL
$sql = <<<'SQL'
SELECT * FROM links
WHERE fetched_at IS NULL
OR (
(fetched_code < 200 OR fetched_code >= 300)
AND fetched_count <= 25
AND fetched_at < (?::timestamptz - interval '1 second' * (5 + pow(fetched_count, 4)))
WHERE to_be_fetched = true
AND (
fetched_at IS NULL
OR fetched_at < (?::timestamptz - interval '1 second' * (5 + pow(fetched_count, 4)))
)
ORDER BY random()
LIMIT ?
Expand All @@ -95,7 +95,7 @@ public static function listToFetch(int $max_number): array
$statement = $database->prepare($sql);
$statement->execute([
$now->format(Database\Column::DATETIME_FORMAT),
$max_number,
$max,
]);

return self::fromDatabaseRows($statement->fetchAll());
Expand All @@ -108,11 +108,10 @@ public static function countToFetch(): int
{
$sql = <<<'SQL'
SELECT COUNT(*) FROM links
WHERE fetched_at IS NULL
OR (
(fetched_code < 200 OR fetched_code >= 300)
AND fetched_count <= 25
AND fetched_at < (?::timestamptz - interval '1 second' * (5 + pow(fetched_count, 4)))
WHERE to_be_fetched = true
AND (
fetched_at IS NULL
OR fetched_at < (?::timestamptz - interval '1 second' * (5 + pow(fetched_count, 4)))
)
SQL;

Expand Down
4 changes: 2 additions & 2 deletions src/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ CREATE TABLE links (
image_filename TEXT,
tags JSON NOT NULL DEFAULT '[]',

to_be_fetched BOOLEAN NOT NULL DEFAULT true,
fetched_at TIMESTAMPTZ,
fetched_code INTEGER NOT NULL DEFAULT 0,
fetched_error TEXT,
Expand All @@ -167,8 +168,7 @@ CREATE TABLE links (

CREATE INDEX idx_links_user_id_url_hash ON links USING btree(user_id, url_hash);
CREATE INDEX idx_links_url_hash ON links USING hash(url_hash);
CREATE INDEX idx_links_fetched_at ON links(fetched_at) WHERE fetched_at IS NULL;
CREATE INDEX idx_links_fetched_code ON links(fetched_code) WHERE fetched_code < 200 OR fetched_code >= 300;
CREATE INDEX idx_links_to_be_fetched ON links(to_be_fetched) WHERE to_be_fetched = true;
CREATE INDEX idx_links_image_filename ON links(image_filename) WHERE image_filename IS NOT NULL;
CREATE INDEX idx_links_search ON links USING GIN (search_index);

Expand Down
21 changes: 21 additions & 0 deletions src/services/LinkFetcher.php
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ public function fetch(models\Link $link): void
$link->fetched_error = $info['error'];
}

$link->to_be_fetched = self::shouldBeFetchedAgain($link);

// we set the title only if it wasn't changed yet
$title_never_changed = $link->title === $link->url;
if (
Expand Down Expand Up @@ -292,6 +294,25 @@ public function fetchUrl(string $url): array
return $info;
}

public static function shouldBeFetchedAgain(models\Link $link): bool
{
if (!$link->to_be_fetched) {
return false;
}

$never_fetched = $link->fetched_at === null;

if ($never_fetched) {
return true;
}

$in_error = $link->fetched_code < 200 || $link->fetched_code >= 300;
$reached_max_retries = $link->fetched_count > 25;
$should_retry = $in_error && !$reached_max_retries;

return $should_retry;
}

/**
* Return true if the url is pointing to Twitter
*
Expand Down
Loading
Loading