diff --git a/src/jobs/scheduled/LinksSync.php b/src/jobs/scheduled/LinksSync.php index 09fb0be6..9d2a1706 100644 --- a/src/jobs/scheduled/LinksSync.php +++ b/src/jobs/scheduled/LinksSync.php @@ -57,7 +57,7 @@ public function perform(): void { $fetch_service = new services\LinkFetcher(); - $links = models\Link::listToFetch(25); + $links = models\Link::listToFetch(max: 25); foreach ($links as $link) { $has_lock = $link->lock(); if (!$has_lock) { diff --git a/src/migrations/Migration202410020001AddToBeFetchedToLinks.php b/src/migrations/Migration202410020001AddToBeFetchedToLinks.php new file mode 100644 index 00000000..66f2324c --- /dev/null +++ b/src/migrations/Migration202410020001AddToBeFetchedToLinks.php @@ -0,0 +1,48 @@ +exec(<<<'SQL' + ALTER TABLE links + ADD COLUMN to_be_fetched BOOLEAN NOT NULL DEFAULT true; + + UPDATE links + SET to_be_fetched = false + WHERE fetched_at IS NOT NULL + AND ( + (fetched_code >= 200 AND fetched_code < 300) + OR fetched_count > 25 + ); + + CREATE INDEX idx_links_to_be_fetched ON links(to_be_fetched) WHERE to_be_fetched = true; + + DROP INDEX idx_links_fetched_at; + DROP INDEX idx_links_fetched_code; + SQL); + + return true; + } + + public function rollback(): bool + { + $database = \Minz\Database::get(); + + $database->exec(<<<'SQL' + CREATE INDEX idx_links_fetched_at ON links(fetched_at) WHERE fetched_at IS NULL; + CREATE INDEX idx_links_fetched_code ON links(fetched_code) WHERE fetched_code < 200 OR fetched_code >= 300; + + DROP INDEX idx_links_to_be_fetched; + + ALTER TABLE links + DROP COLUMN to_be_fetched; + SQL); + + return true; + } +} diff --git a/src/models/Link.php b/src/models/Link.php index 4840c703..99467492 100644 --- a/src/models/Link.php +++ b/src/models/Link.php @@ -58,6 +58,9 @@ class Link #[Database\Column] public ?string $image_filename = null; + #[Database\Column] + public bool $to_be_fetched = true; + #[Database\Column] public ?\DateTimeImmutable $fetched_at = null; diff --git a/src/models/dao/links/FetcherQueries.php b/src/models/dao/links/FetcherQueries.php index 7b762678..a6a72c21 100644 --- a/src/models/dao/links/FetcherQueries.php +++ b/src/models/dao/links/FetcherQueries.php @@ -67,23 +67,23 @@ public static function listEntryIdsToUrlsByCollectionId(string $collection_id): } /** - * Return a list of links to fetch (fetched_at is null, or fetched_code is in error). + * Return a list of links to fetch. * - * Links in error are not returned if their fetched_count is greater than - * 25 or if fetched_at is too close (a number of seconds depending on the - * fetched_count value). + * The links with a fetched_at too close (a number of seconds depending on + * the fetched_count value) are not returned. + * + * The list is limited by the $max parameter, and is randomly ordered. * * @return self[] */ - public static function listToFetch(int $max_number): array + public static function listToFetch(int $max): array { - $sql = <<= 300) - AND fetched_count <= 25 - AND fetched_at < (?::timestamptz - interval '1 second' * (5 + pow(fetched_count, 4))) + WHERE to_be_fetched = true + AND ( + fetched_at IS NULL + OR fetched_at < (?::timestamptz - interval '1 second' * (5 + pow(fetched_count, 4))) ) ORDER BY random() LIMIT ? @@ -95,7 +95,7 @@ public static function listToFetch(int $max_number): array $statement = $database->prepare($sql); $statement->execute([ $now->format(Database\Column::DATETIME_FORMAT), - $max_number, + $max, ]); return self::fromDatabaseRows($statement->fetchAll()); @@ -108,11 +108,10 @@ public static function countToFetch(): int { $sql = <<<'SQL' SELECT COUNT(*) FROM links - WHERE fetched_at IS NULL - OR ( - (fetched_code < 200 OR fetched_code >= 300) - AND fetched_count <= 25 - AND fetched_at < (?::timestamptz - interval '1 second' * (5 + pow(fetched_count, 4))) + WHERE to_be_fetched = true + AND ( + fetched_at IS NULL + OR fetched_at < (?::timestamptz - interval '1 second' * (5 + pow(fetched_count, 4))) ) SQL; diff --git a/src/schema.sql b/src/schema.sql index 354565bb..f6a1b965 100644 --- a/src/schema.sql +++ b/src/schema.sql @@ -148,6 +148,7 @@ CREATE TABLE links ( image_filename TEXT, tags JSON NOT NULL DEFAULT '[]', + to_be_fetched BOOLEAN NOT NULL DEFAULT true, fetched_at TIMESTAMPTZ, fetched_code INTEGER NOT NULL DEFAULT 0, fetched_error TEXT, @@ -167,8 +168,7 @@ CREATE TABLE links ( CREATE INDEX idx_links_user_id_url_hash ON links USING btree(user_id, url_hash); CREATE INDEX idx_links_url_hash ON links USING hash(url_hash); -CREATE INDEX idx_links_fetched_at ON links(fetched_at) WHERE fetched_at IS NULL; -CREATE INDEX idx_links_fetched_code ON links(fetched_code) WHERE fetched_code < 200 OR fetched_code >= 300; +CREATE INDEX idx_links_to_be_fetched ON links(to_be_fetched) WHERE to_be_fetched = true; CREATE INDEX idx_links_image_filename ON links(image_filename) WHERE image_filename IS NOT NULL; CREATE INDEX idx_links_search ON links USING GIN (search_index); diff --git a/src/services/LinkFetcher.php b/src/services/LinkFetcher.php index d6dea7b6..a3a47a34 100644 --- a/src/services/LinkFetcher.php +++ b/src/services/LinkFetcher.php @@ -80,6 +80,8 @@ public function fetch(models\Link $link): void $link->fetched_error = $info['error']; } + $link->to_be_fetched = self::shouldBeFetchedAgain($link); + // we set the title only if it wasn't changed yet $title_never_changed = $link->title === $link->url; if ( @@ -292,6 +294,25 @@ public function fetchUrl(string $url): array return $info; } + public static function shouldBeFetchedAgain(models\Link $link): bool + { + if (!$link->to_be_fetched) { + return false; + } + + $never_fetched = $link->fetched_at === null; + + if ($never_fetched) { + return true; + } + + $in_error = $link->fetched_code < 200 || $link->fetched_code >= 300; + $reached_max_retries = $link->fetched_count > 25; + $should_retry = $in_error && !$reached_max_retries; + + return $should_retry; + } + /** * Return true if the url is pointing to Twitter * diff --git a/tests/jobs/scheduled/LinksSyncTest.php b/tests/jobs/scheduled/LinksSyncTest.php index 3956bfe5..e27b9f8f 100644 --- a/tests/jobs/scheduled/LinksSyncTest.php +++ b/tests/jobs/scheduled/LinksSyncTest.php @@ -83,9 +83,8 @@ public function testPerform(): void $link = LinkFactory::create([ 'url' => $url, 'title' => $url, + 'to_be_fetched' => true, 'fetched_at' => null, - 'fetched_code' => 0, - 'fetched_count' => 0, ]); $links_fetcher_job = new LinksSync(); @@ -101,9 +100,12 @@ public function testPerform(): void public function testPerformLogsFetch(): void { + $url = 'https://flus.fr/carnet/'; + $this->mockHttpWithFixture($url, 'responses/flus.fr_carnet_index.html'); $link = LinkFactory::create([ - 'url' => 'https://github.com/flusio/Flus', - 'title' => 'https://github.com/flusio/Flus', + 'url' => $url, + 'title' => $url, + 'to_be_fetched' => true, 'fetched_at' => null, ]); $links_fetcher_job = new LinksSync(); @@ -115,8 +117,8 @@ public function testPerformLogsFetch(): void $this->assertGreaterThanOrEqual(1, models\FetchLog::count()); $fetch_log = models\FetchLog::take(); $this->assertNotNull($fetch_log); - $this->assertSame('https://github.com/flusio/Flus', $fetch_log->url); - $this->assertSame('github.com', $fetch_log->host); + $this->assertSame($url, $fetch_log->url); + $this->assertSame('flus.fr', $fetch_log->host); } public function testPerformSavesResponseInCache(): void @@ -126,6 +128,7 @@ public function testPerformSavesResponseInCache(): void $link = LinkFactory::create([ 'url' => $url, 'title' => $url, + 'to_be_fetched' => true, 'fetched_at' => null, ]); $links_fetcher_job = new LinksSync(); @@ -141,10 +144,12 @@ public function testPerformSavesResponseInCache(): void public function testPerformUsesCache(): void { - $url = 'https://github.com/flusio/Flus'; + $url = 'https://flus.fr/carnet/'; + $this->mockHttpWithFixture($url, 'responses/flus.fr_carnet_index.html'); $link = LinkFactory::create([ 'url' => $url, 'title' => $url, + 'to_be_fetched' => true, 'fetched_at' => null, ]); $links_fetcher_job = new LinksSync(); @@ -174,11 +179,12 @@ public function testPerformUsesCache(): void public function testPerformSkipsFetchIfReachedRateLimit(): void { + $url = 'https://flus.fr/carnet/'; + $this->mockHttpWithFixture($url, 'responses/flus.fr_carnet_index.html'); + $host = 'flus.fr'; /** @var \DateTimeImmutable */ $now = $this->fake('dateTime'); $this->freeze($now); - $url = 'https://github.com/flusio/Flus'; - $host = 'github.com'; foreach (range(1, 25) as $i) { /** @var int */ $seconds = $this->fake('numberBetween', 0, 60); @@ -192,6 +198,7 @@ public function testPerformSkipsFetchIfReachedRateLimit(): void $link = LinkFactory::create([ 'url' => $url, 'title' => $url, + 'to_be_fetched' => true, 'fetched_at' => null, ]); $links_fetcher_job = new LinksSync(); @@ -204,71 +211,10 @@ public function testPerformSkipsFetchIfReachedRateLimit(): void $this->assertNull($link->fetched_at); } - public function testPerformFetchesLinksInError(): void + public function testPerformDoesNotFetchLinksIfFetchedAtIsWithinIntervalToWait(): void { - /** @var \DateTimeImmutable */ - $now = $this->fake('dateTime'); - $this->freeze($now); - /** @var int */ - $fetched_count = $this->fake('numberBetween', 1, 25); - $interval_to_wait = 5 + pow($fetched_count, 4); - /** @var int */ - $seconds = $this->fake('numberBetween', $interval_to_wait + 1, $interval_to_wait + 9000); - $fetched_at = \Minz\Time::ago($seconds, 'seconds'); $url = 'https://flus.fr/carnet/'; $this->mockHttpWithFixture($url, 'responses/flus.fr_carnet_index.html'); - $link = LinkFactory::create([ - 'url' => $url, - 'title' => $url, - 'fetched_at' => $fetched_at, - 'fetched_code' => 404, - 'fetched_error' => 'not found', - 'fetched_count' => $fetched_count, - ]); - $links_fetcher_job = new LinksSync(); - - $links_fetcher_job->perform(); - - $link = $link->reload(); - $this->assertSame('Carnet de Flus', $link->title); - $this->assertNotEquals($fetched_at, $link->fetched_at); - $this->assertSame(200, $link->fetched_code); - $this->assertNull($link->fetched_error); - $this->assertSame($fetched_count + 1, $link->fetched_count); - } - - public function testPerformDoesNotFetchLinksInErrorIfFetchedCountIsGreaterThan25(): void - { - /** @var \DateTimeImmutable */ - $now = $this->fake('dateTime'); - $this->freeze($now); - /** @var int */ - $fetched_count = $this->fake('numberBetween', 26, 42); - $interval_to_wait = 5 + pow($fetched_count, 4); - /** @var int */ - $seconds = $this->fake('numberBetween', $interval_to_wait + 1, $interval_to_wait + 9000); - $fetched_at = \Minz\Time::ago($seconds, 'seconds'); - $link = LinkFactory::create([ - 'url' => 'https://github.com/flusio/Flus', - 'title' => 'https://github.com/flusio/Flus', - 'fetched_at' => $fetched_at, - 'fetched_code' => 404, - 'fetched_error' => 'not found', - 'fetched_count' => $fetched_count, - ]); - $links_fetcher_job = new LinksSync(); - - $links_fetcher_job->perform(); - - $link = $link->reload(); - $this->assertSame('https://github.com/flusio/Flus', $link->title); - $this->assertEquals($fetched_at, $link->fetched_at); - $this->assertSame(404, $link->fetched_code); - $this->assertSame($fetched_count, $link->fetched_count); - } - - public function testPerformDoesNotFetchLinksInErrorIfFetchedAtIsWithinIntervalToWait(): void - { /** @var \DateTimeImmutable */ $now = $this->fake('dateTime'); $this->freeze($now); @@ -281,11 +227,10 @@ public function testPerformDoesNotFetchLinksInErrorIfFetchedAtIsWithinIntervalTo $seconds = $this->fake('numberBetween', 0, $interval_to_wait); $fetched_at = \Minz\Time::ago($seconds, 'seconds'); $link = LinkFactory::create([ - 'url' => 'https://github.com/flusio/Flus', - 'title' => 'https://github.com/flusio/Flus', + 'url' => $url, + 'title' => $url, + 'to_be_fetched' => true, 'fetched_at' => $fetched_at, - 'fetched_code' => 404, - 'fetched_error' => 'not found', 'fetched_count' => $fetched_count, ]); $links_fetcher_job = new LinksSync(); @@ -293,52 +238,34 @@ public function testPerformDoesNotFetchLinksInErrorIfFetchedAtIsWithinIntervalTo $links_fetcher_job->perform(); $link = $link->reload(); - $this->assertSame('https://github.com/flusio/Flus', $link->title); + $this->assertSame($url, $link->title); $this->assertEquals($fetched_at, $link->fetched_at); - $this->assertSame(404, $link->fetched_code); $this->assertSame($fetched_count, $link->fetched_count); } public function testPerformDoesNotFetchLinkIfLockedDuringLastHour(): void { + $url = 'https://flus.fr/carnet/'; + $this->mockHttpWithFixture($url, 'responses/flus.fr_carnet_index.html'); /** @var \DateTimeImmutable */ $now = $this->fake('dateTime'); $this->freeze($now); /** @var int */ $minutes = $this->fake('numberBetween', 0, 59); $locked_at = \Minz\Time::ago($minutes, 'minutes'); - $url = 'https://github.com/flusio/Flus'; $link = LinkFactory::create([ 'url' => $url, 'title' => $url, + 'to_be_fetched' => true, 'fetched_at' => null, - 'fetched_code' => 0, - 'fetched_count' => 0, 'locked_at' => $locked_at, ]); $links_fetcher_job = new LinksSync(); - /** @var string */ - $title = $this->fake('sentence'); - $hash = \SpiderBits\Cache::hash($url); - $raw_response = << - - {$title} - - - TEXT; - /** @var string */ - $cache_path = \Minz\Configuration::$application['cache_path']; - $cache = new \SpiderBits\Cache($cache_path); - $cache->save($hash, $raw_response); $links_fetcher_job->perform(); $link = $link->reload(); - $this->assertNotSame($title, $link->title); + $this->assertSame($url, $link->title); $this->assertNull($link->fetched_at); $this->assertSame(0, $link->fetched_code); $this->assertSame(0, $link->fetched_count); @@ -347,44 +274,27 @@ public function testPerformDoesNotFetchLinkIfLockedDuringLastHour(): void public function testPerformFetchesLinkIfLockedAfterAnHour(): void { + $url = 'https://flus.fr/carnet/'; + $this->mockHttpWithFixture($url, 'responses/flus.fr_carnet_index.html'); /** @var \DateTimeImmutable */ $now = $this->fake('dateTime'); $this->freeze($now); /** @var int */ $minutes = $this->fake('numberBetween', 60, 1000); $locked_at = \Minz\Time::ago($minutes, 'minutes'); - $url = 'https://github.com/flusio/Flus'; $link = LinkFactory::create([ 'url' => $url, 'title' => $url, + 'to_be_fetched' => true, 'fetched_at' => null, - 'fetched_code' => 0, - 'fetched_count' => 0, 'locked_at' => $locked_at, ]); $links_fetcher_job = new LinksSync(); - /** @var string */ - $title = $this->fake('sentence'); - $hash = \SpiderBits\Cache::hash($url); - $raw_response = << - - {$title} - - - TEXT; - /** @var string */ - $cache_path = \Minz\Configuration::$application['cache_path']; - $cache = new \SpiderBits\Cache($cache_path); - $cache->save($hash, $raw_response); $links_fetcher_job->perform(); $link = $link->reload(); - $this->assertSame($title, $link->title); + $this->assertSame('Carnet de Flus', $link->title); $this->assertNotNull($link->fetched_at); $this->assertSame(200, $link->fetched_code); $this->assertSame(1, $link->fetched_count); diff --git a/tests/services/LinkFetcherTest.php b/tests/services/LinkFetcherTest.php index 6e86cf3d..2493ae1a 100644 --- a/tests/services/LinkFetcherTest.php +++ b/tests/services/LinkFetcherTest.php @@ -375,4 +375,77 @@ public function testFetchDoesNotChangeIllustrationIfAlreadySet(): void $link = $link->reload(); $this->assertSame('old.png', $link->image_filename); } + + public function testShouldBeFetchedAgainReturnsTrueIfNeverFetched(): void + { + $link = new models\Link('https://example.com', 'foo', true); + $link->to_be_fetched = true; + $link->fetched_at = null; + + $to_be_fetched = LinkFetcher::shouldBeFetchedAgain($link); + + $this->assertTrue($to_be_fetched); + } + + public function testShouldBeFetchedAgainReturnsTrueIfInErrorAndCanRetry(): void + { + $link = new models\Link('https://example.com', 'foo', true); + $link->to_be_fetched = true; + $link->fetched_code = 404; + /** @var int */ + $fetched_count = $this->fake('numberBetween', 1, 25); + $link->fetched_count = $fetched_count; + /** @var \DateTimeImmutable */ + $fetched_at = $this->fake('dateTime'); + $link->fetched_at = $fetched_at; + + $to_be_fetched = LinkFetcher::shouldBeFetchedAgain($link); + + $this->assertTrue($to_be_fetched); + } + + public function testShouldBeFetchedAgainReturnsFalseIfCanRetryButNotInError(): void + { + $link = new models\Link('https://example.com', 'foo', true); + $link->to_be_fetched = true; + $link->fetched_code = 200; + /** @var int */ + $fetched_count = $this->fake('numberBetween', 1, 25); + $link->fetched_count = $fetched_count; + /** @var \DateTimeImmutable */ + $fetched_at = $this->fake('dateTime'); + $link->fetched_at = $fetched_at; + + $to_be_fetched = LinkFetcher::shouldBeFetchedAgain($link); + + $this->assertFalse($to_be_fetched); + } + + public function testShouldBeFetchedAgainReturnsFalseIfInErrorButCannotRetry(): void + { + $link = new models\Link('https://example.com', 'foo', true); + $link->to_be_fetched = true; + $link->fetched_code = 404; + /** @var int */ + $fetched_count = $this->fake('numberBetween', 26, 42); + $link->fetched_count = $fetched_count; + /** @var \DateTimeImmutable */ + $fetched_at = $this->fake('dateTime'); + $link->fetched_at = $fetched_at; + + $to_be_fetched = LinkFetcher::shouldBeFetchedAgain($link); + + $this->assertFalse($to_be_fetched); + } + + public function testShouldBeFetchedAgainReturnsFalseIfToBeFetchedIsFalse(): void + { + $link = new models\Link('https://example.com', 'foo', true); + $link->to_be_fetched = false; + $link->fetched_at = null; + + $to_be_fetched = LinkFetcher::shouldBeFetchedAgain($link); + + $this->assertFalse($to_be_fetched); + } }