Skip to content

Commit

Permalink
imp: Allow getting the correct duration of YouTube videos
Browse files Browse the repository at this point in the history
  • Loading branch information
marienfressinaud committed May 29, 2024
1 parent bf0fdb6 commit e132baa
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 4 deletions.
40 changes: 40 additions & 0 deletions lib/SpiderBits/src/DomExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,46 @@ public static function content(Dom $dom): string
return $main_node->text();
}

/**
 * Return the duration in minutes of the element in the DOM.
 *
 * If the DOM declares an element with itemprop="duration" whose content
 * attribute is a valid ISO 8601 duration (e.g. "PT41M35S"), return that
 * duration, rounded to the nearest minute (30 seconds or more round up).
 * Years and months are approximated as 12 months of 30 days each.
 *
 * Otherwise (no such element, or its content can't be parsed), estimate the
 * duration from the content itself by counting the number of words in the
 * content and dividing by an average reading speed (i.e. 200 words per
 * minute). The result is truncated to a whole number of minutes.
 */
public static function duration(Dom $dom): int
{
    // Search for a node having the attribute itemprop="duration"
    // @see https://schema.org/docs/gs.html
    $duration_node = $dom->select('//*[@itemprop = "duration"]/attribute::content');

    if ($duration_node) {
        try {
            // Surrounding whitespace would make an otherwise valid ISO 8601
            // duration unparseable, so strip it first.
            $interval = new \DateInterval(trim($duration_node->text()));

            // Convert the interval to minutes
            $duration = $interval->y * 12 * 30 * 24 * 60;
            $duration += $interval->m * 30 * 24 * 60;
            $duration += $interval->d * 24 * 60;
            $duration += $interval->h * 60;
            $duration += $interval->i;
            if ($interval->s >= 30) {
                // Round to the nearest minute
                $duration += 1;
            }
            return $duration;
        } catch (\Exception $e) {
            // Do nothing and fallback to the other mode
        }
    }

    // If there is no duration node (or if its content can't be parsed),
    // roughly estimate the duration from the DOM content.
    $content = self::content($dom);

    // Split on any run of whitespace (spaces, tabs, newlines) so that words
    // separated by line breaks are counted individually. PREG_SPLIT_NO_EMPTY
    // drops empty tokens only, so a literal "0" word is still counted.
    $words = preg_split('/\s+/u', $content, -1, PREG_SPLIT_NO_EMPTY);
    if ($words === false) {
        // preg_split fails with the "u" modifier on invalid UTF-8: fall back
        // to a byte-wise split on plain spaces.
        $words = array_filter(explode(' ', $content), function (string $word): bool {
            return $word !== '';
        });
    }

    $average_reading_speed = 200;
    return intdiv(count($words), $average_reading_speed);
}

/**
* Return the autodiscovered feeds URLs (RSS and Atom).
*
Expand Down
5 changes: 1 addition & 4 deletions src/services/LinkFetcher.php
Original file line number Diff line number Diff line change
Expand Up @@ -269,10 +269,7 @@ public function fetchUrl(string $url): array
}
}

// And roughly estimate the reading time
$content = \SpiderBits\DomExtractor::content($dom);
$words = array_filter(explode(' ', $content));
$info['reading_time'] = intval(count($words) / 200);
$info['reading_time'] = \SpiderBits\DomExtractor::duration($dom);

// Get the illustration URL if any
$url_illustration = \SpiderBits\DomExtractor::illustration($dom);
Expand Down
63 changes: 63 additions & 0 deletions tests/lib/SpiderBits/DomExtractorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

class DomExtractorTest extends \PHPUnit\Framework\TestCase
{
use \tests\FakerHelper;

public function testTitle(): void
{
$dom = Dom::fromText(<<<HTML
Expand Down Expand Up @@ -280,6 +282,67 @@ public function testContentStripsScripts(): void
$this->assertSame('This is main', $content);
}

/**
 * Without any itemprop="duration" element, the duration is estimated from
 * the content of the page: the fake content generated below is expected to
 * yield 2 minutes.
 */
public function testDuration(): void
{
    /** @var string */
    $words = $this->fake('words', 400, true);
    $html = <<<HTML
    <html>
    <body>
    <main>
    {$words}
    </main>
    </body>
    </html>
    HTML;
    $dom = Dom::fromText($html);

    $minutes = DomExtractor::duration($dom);

    $this->assertSame(2, $minutes);
}

/**
 * When the page declares itemprop="duration" with the ISO 8601 value
 * "PT41M35S", that value is expected to win over the content-based
 * estimation and to give 42 minutes.
 */
public function testDurationWithItempropAttribute(): void
{
    /** @var string */
    $words = $this->fake('words', 400, true);
    $html = <<<HTML
    <html>
    <body>
    <meta itemprop="duration" content="PT41M35S" />
    <main>
    {$words}
    </main>
    </body>
    </html>
    HTML;
    $dom = Dom::fromText($html);

    $minutes = DomExtractor::duration($dom);

    $this->assertSame(42, $minutes);
}

/**
 * When the itemprop="duration" content is not a parseable duration
 * ("NotIso8601"), the result is expected to match the estimation made from
 * the page content (2 minutes for the fake content generated below).
 */
public function testDurationWithInvalidItempropAttribute(): void
{
    /** @var string */
    $words = $this->fake('words', 400, true);
    $html = <<<HTML
    <html>
    <body>
    <meta itemprop="duration" content="NotIso8601" />
    <main>
    {$words}
    </main>
    </body>
    </html>
    HTML;
    $dom = Dom::fromText($html);

    $minutes = DomExtractor::duration($dom);

    $this->assertSame(2, $minutes);
}

public function testFeeds(): void
{
$dom = Dom::fromText(<<<HTML
Expand Down

0 comments on commit e132baa

Please sign in to comment.