Skip to content

Commit

Permalink
rewrite service URL checker using curl instead of get_headers
Browse files Browse the repository at this point in the history
- removed ability to check links from multi render
- checked links have data attribute indicating if check occurred or not
  • Loading branch information
vincent-peugnet committed Nov 26, 2024
1 parent abf3add commit 7feea33
Show file tree
Hide file tree
Showing 6 changed files with 172 additions and 70 deletions.
4 changes: 1 addition & 3 deletions app/class/Controllerhome.php
Original file line number Diff line number Diff line change
Expand Up @@ -328,14 +328,12 @@ public function multiedit()
public function multirender(): void
{
$pagelist = $_POST['pagesid'] ?? [];
$checkurl = Config::urlchecker() && boolval($_POST['checkurl']);
$total = count($pagelist);
$pagelist = $this->pagemanager->pagelistbyid($pagelist);
$count = 0;
$urlchecker = $checkurl ? new Serviceurlchecker(12) : null; // time to check URLs is limited to 12s
foreach ($pagelist as $page) {
try {
$page = $this->pagemanager->renderpage($page, $this->router, $urlchecker);
$page = $this->pagemanager->renderpage($page, $this->router, Config::urlchecker() ? new Serviceurlchecker(0) : 0);

Check warning on line 336 in app/class/Controllerhome.php

View workflow job for this annotation

GitHub Actions / lint php

Line exceeds 120 characters; contains 130 characters

Check warning on line 336 in app/class/Controllerhome.php

View workflow job for this annotation

GitHub Actions / lint php

Line exceeds 120 characters; contains 130 characters
if ($this->pagemanager->update($page)) {
$count++;
}
Expand Down
4 changes: 2 additions & 2 deletions app/class/Controllerpage.php
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ public function render(string $page): void

if ($this->importpage() && $this->user->iseditor()) {
try {
$urlchecker = Config::urlchecker() ? new Serviceurlchecker(8) : null;
$urlchecker = Config::urlchecker() ? new Serviceurlchecker(6) : null;
$this->page = $this->pagemanager->renderpage($this->page, $this->router, $urlchecker);
} catch (RuntimeException $e) {
Logger::errorex($e);
Expand Down Expand Up @@ -187,7 +187,7 @@ public function read(string $page): void
$oldlinkto = $this->page->linkto();
}
try {
$urlchecker = Config::urlchecker() ? new Serviceurlchecker(8) : null;
$urlchecker = Config::urlchecker() ? new Serviceurlchecker(3) : null;
$this->page = $this->pagemanager->renderpage($this->page, $this->router, $urlchecker);
} catch (RuntimeException $e) {
Logger::errorex($e);
Expand Down
26 changes: 14 additions & 12 deletions app/class/Page.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
use DateTimeImmutable;
use DateTimeZone;

use function Clue\StreamFilter\fun;

abstract class Page extends Item
{
protected $id;
Expand Down Expand Up @@ -813,21 +811,25 @@ public function deadlinkcount(): int
}

/**
 * Used in the title of external links column in home view
 *
 * When the URL checker is enabled in the configuration, each link is followed
 * by a space and a status symbol: 🔍️ when the link has not been checked yet
 * (stored value is null), 💀 when it is dead, and nothing when it is alive.
 * When the URL checker is disabled, only the bare links are listed.
 *
 * @return string All links separated by new lines
 */
public function externallinkstitle(): string
{
    // Without URL checking there is no status to show: list the URLs only
    if (!Config::urlchecker()) {
        return implode("\n", array_keys($this->externallinks));
    }

    $lines = [];
    foreach ($this->externallinks as $url => $alive) {
        if (is_null($alive)) {
            $symbol = '🔍️';
        } else {
            $symbol = $alive ? '' : '💀';
        }
        $lines[] = $url . ' ' . $symbol;
    }
    return implode("\n", $lines);
}

Expand Down
38 changes: 32 additions & 6 deletions app/class/Servicerender.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
use DOMDocument;
use DOMElement;
use DOMNodeList;
use DOMXPath;
use Exception;
use InvalidArgumentException;
use LogicException;
Expand Down Expand Up @@ -392,15 +393,15 @@ protected function htmlparser(string $html): string
if (!$link->hasAttribute('target') && $this->externallinkblank) {
$link->setAttribute('target', '_blank');
}
$url = filter_var($href, FILTER_SANITIZE_URL);
$this->urls[$url] = null;
$this->urls[$href] = null;
if ($this->urlchecker !== null) {
try {
$dead = $this->urlchecker->isdead($url);
$classes[] = $dead ? 'dead' : 'ok';
$this->urls[$url] = !$dead;
$response = $this->urlchecker->check($href);
$classes[] = $response ? 'ok' : 'dead';
$link->setAttribute('data-urlcheck', '1');
$this->urls[$href] = $response;
} catch (RuntimeException $e) {
// Web search limit reached
$link->setAttribute('data-urlcheck', '0');
}
}
} elseif (preg_match('~^([a-z0-9-_]+)((\/?#[a-z0-9-_]+)|(\/([\w\-\%\[\]\=\?\&]*)))?$~', $href, $out)) {
Expand Down Expand Up @@ -443,6 +444,31 @@ protected function htmlparser(string $html): string
$link->setAttribute('class', implode(' ', array_unique($classes)));
}
}

// check for URLs that were not cached
try {
if ($this->urlchecker !== null && $this->urlchecker->processqueue()) {
$selector = new DOMXPath($dom);
$links = $selector->query('//a[ @data-urlcheck = 0 ]');
foreach ($links as $link) {
assert($link instanceof DOMElement);
$href = $link->getAttribute('href');
$class = $link->getAttribute('class');
$classes = explode(' ', $class);
try {
$response = $this->urlchecker->check($href);
$classes[] = $response ? 'ok' : 'dead';
$link->setAttribute('class', implode(' ', array_unique($classes)));
$link->setAttribute('data-urlcheck', '1');
$this->urls[$href] = $response;
} catch (RuntimeException $e) {
}
}
}
} catch (RuntimeException $e) {
Logger::errorex($e);
}

$images = $dom->getElementsByTagName('img');
$this->sourceparser($images);
$sources = $dom->getElementsByTagName('source');
Expand Down
166 changes: 121 additions & 45 deletions app/class/Serviceurlchecker.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,21 @@

use RuntimeException;
use Wcms\Exception\Filesystemexception;
use Wcms\Exception\Missingextensionexception;

/**
* Check URL
*/
class Serviceurlchecker
{
/** @var array[] $urls */
/** @var array[] $urls cached URLs */
protected array $urls = [];

/** @var int $starttimestamp timestamp lauched when object is build (in seconds) */
protected int $starttimestamp;
/** @var string[] $queue URLs that need to be checked */
protected array $queue = [];

/** @var int $webchecktime time before stopping Web check (in seconds) */
protected int $webchecktime;
/** @var int $timeout before stopping Web check (in seconds) */
protected int $timeout;

/** @var bool $cacheonly Limit URL checking to cache */
protected bool $cacheonly = false;
Expand All @@ -38,70 +39,52 @@ class Serviceurlchecker
/**
* Tool that checks the status of URLs, first in the cache, then on the Web
* The cache expires according to CACHE_EXPIRE_TIME constant
* A time limite have to be set to limit Web checking time
* A timeout has to be set to limit Web checking time
*
* @param int $webchecktime allocated time for looking URL on the Web (in seconds)
* @param int $timeout allocated time for looking URL on the Web (in seconds)
* if set to `0`, Check on the Web is disabled: only the cache is used
*/
public function __construct(int $webchecktime)
public function __construct(int $timeout = 0)
{
$this->webchecktime = $webchecktime;
if ($webchecktime === 0) {
$this->timeout = $timeout;
if ($timeout === 0) {
$this->cacheonly = true;
}
$this->starttimestamp = time();
try {
$urlfile = Fs::readfile(Model::URLS_FILE);
$this->urls = json_decode($urlfile, true);
} catch (Filesystemexception $e) {
// This mean the tag file does not exist
// This means the URL cache file does not exist
}
}

/**
* Check if URL is dead according to ACCEPTED_RESPONSE_CODES
* Check status of URL
*
* @throws RuntimeException If time limit is reached and URL status is expired or not stored in cache
*/
public function isdead(string $url): bool
{
if ($this->iscachedandvalid($url)) {
return !key_exists($this->urls[$url]['response'], self::ACCEPTED_RESPONSE_CODES);
}
if (!$this->cacheonly && time() < ($this->starttimestamp + $this->webchecktime)) {
$this->urls[$url]['response'] = $this->getresponse($url);
$this->urls[$url]['timestamp'] = time();
return !key_exists($this->urls[$url]['response'], self::ACCEPTED_RESPONSE_CODES);
}
throw new RuntimeException('Impossible to give a status about this URL');
}

/**
* read HTTP response headers
* @param string $url The URL to verify
*
* @return bool True if the url is alive, false if it's dead
*
* @return int HTTP response code, or `0` if no response
* @throws RuntimeException If the status of the URL is not cached
*/
protected function getresponse(string $url): int
public function check(string $url): bool
{
$scheme = parse_url($url, PHP_URL_SCHEME);
$context = stream_context_create([$scheme => ['method' => "HEAD",'header' => 'User-Agent: Mozilla/5.0']]);
$headers = @get_headers($url, 1, $context); // `@` avoid throwing PHP error
if ($headers === false) {
return 0;
}
for ($i = 0; $i < self::MAX_BOUNCE; $i++) {
if (!isset($headers[$i])) {
$id = $i - 1;
$http = $headers[$id];
return intval(substr($http, 9, 3));
}
// $url = filter_var($url, FILTER_SANITIZE_URL);
if ($this->iscachedandvalid($url)) {
return key_exists($this->urls[$url]['response'], self::ACCEPTED_RESPONSE_CODES);
} else {
$this->queue[] = $url;
throw new RuntimeException('unchecked URL');
}
return 0;
}

/**
* Check if the status of URL is cached and has not expired
* If cache is expired, the entry is deleted
*
* @param string $url The URL to verify
*
* @return bool Indicate if the URL status is cached and has not expired
*/
protected function iscachedandvalid(string $url): bool
{
Expand All @@ -115,6 +98,99 @@ protected function iscachedandvalid(string $url): bool
return true;
}

/**
 * Check on the Web every URL waiting in the queue and cache the results.
 *
 * Queued URLs are de-duplicated, then requested in parallel through a curl
 * multi handle. Each transfer is limited to $this->timeout seconds (set during
 * object creation), so some URLs may remain without a cached status.
 *
 * Nothing is requested when the object is in cache-only mode.
 * The queue is emptied once it has been processed.
 *
 * @return int Number of newly analysed URLs (independent of their status)
 *
 * @throws Missingextensionexception If curl is not installed
 * @throws RuntimeException If curl failed
 */
public function processqueue(): int
{
    // Cache-only mode promises that no Web request is made. It is also the mode
    // used with a timeout of 0, which curl would read as "never time out".
    if ($this->cacheonly) {
        return 0;
    }

    if (!extension_loaded('curl')) {
        throw new Missingextensionexception("PHP Curl extension is not installed");
    }

    if (empty($this->queue)) {
        return 0;
    }

    $this->queue = array_unique($this->queue);

    $multihandle = curl_multi_init();
    curl_multi_setopt($multihandle, CURLMOPT_MAX_TOTAL_CONNECTIONS, 10);

    $curlhandles = [];
    $newurls = [];

    try {
        foreach ($this->queue as $url) {
            $curlhandle = curl_init($url);
            if ($curlhandle === false) {
                // The handle could not be created: leave this URL unchecked
                continue;
            }
            // NOTE(review): per libcurl documentation, enabling CURLOPT_HTTPGET resets
            // CURLOPT_NOBODY, so a GET (not a HEAD) is presumably sent - confirm this is intended
            curl_setopt($curlhandle, CURLOPT_NOBODY, true);
            curl_setopt($curlhandle, CURLOPT_HEADER, true);
            curl_setopt($curlhandle, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($curlhandle, CURLOPT_HTTPGET, true);
            curl_setopt($curlhandle, CURLOPT_TIMEOUT, $this->timeout);
            curl_setopt($curlhandle, CURLOPT_FOLLOWLOCATION, true);
            curl_setopt($curlhandle, CURLOPT_MAXREDIRS, self::MAX_BOUNCE);

            curl_multi_add_handle($multihandle, $curlhandle);
            $curlhandles[$url] = $curlhandle;
        }

        // Drive all the transfers until none of them is still running
        $unfinishedHandles = 0;
        do {
            $status = curl_multi_exec($multihandle, $unfinishedHandles);
            if ($status !== CURLM_OK) {
                throw new RuntimeException(curl_multi_strerror(curl_multi_errno($multihandle)));
            }

            // Detach every finished transfer from the multi handle
            while (($info = curl_multi_info_read($multihandle)) !== false) {
                if ($info['msg'] === CURLMSG_DONE) {
                    curl_multi_remove_handle($multihandle, $info['handle']);
                }
            }

            if ($unfinishedHandles) {
                // Wait for network activity instead of busy-looping
                if ((curl_multi_select($multihandle)) === -1) {
                    throw new RuntimeException(curl_multi_strerror(curl_multi_errno($multihandle)));
                }
            }
        } while ($unfinishedHandles);

        foreach ($curlhandles as $url => $curlhandle) {
            switch (curl_errno($curlhandle)) {
                case CURLE_OK:
                    $newurls[$url] = [
                        'response' => curl_getinfo($curlhandle, CURLINFO_HTTP_CODE),
                        'timestamp' => time()
                    ];
                    break;
                case CURLE_OPERATION_TIMEDOUT:
                    // A time out is only stored as "no response" when the queue was small
                    // and the timeout was at least 3s. The timestamp is back-dated
                    // (presumably CACHE_EXPIRE_TIME is in days, leaving one day of validity).
                    // Otherwise the URL is left out of the cache.
                    if (count($this->queue) < 5 && $this->timeout >= 3) {
                        $newurls[$url] = [
                            'response' => 0,
                            'timestamp' => time() - ( self::CACHE_EXPIRE_TIME - 1 ) * 24 * 3600,
                        ];
                    }
                    break;
                case CURLE_COULDNT_RESOLVE_HOST:
                default:
                    // Any other failure is stored as "no response"
                    $newurls[$url] = [
                        'response' => 0,
                        'timestamp' => time(),
                    ];
            }
        }
    } finally {
        // Release the multi handle even when curl failed
        curl_multi_close($multihandle);
    }

    $this->urls = array_merge($this->urls, $newurls);

    // Everything queued has been handled: avoid requesting the same URLs again on a later call
    $this->queue = [];

    return count($newurls);
}

/**
* Save the cache
*
Expand Down
4 changes: 2 additions & 2 deletions app/view/templates/home.php
Original file line number Diff line number Diff line change
Expand Up @@ -328,10 +328,10 @@ class="redirection"
<?php if ($columns['externallinks']) : ?>
<td class="externallinks" title="<?= $item->externallinkstitle() ?>">
<?= $item->externallinks('sort') ?>
<?php if (!empty($deadlinks = $item->deadlinkcount())) : ?>
<?php if (Config::urlchecker() && !empty($deadlinks = $item->deadlinkcount())) : ?>
<span class="deadlinkcount"><?= $deadlinks ?></span>
<?php endif ?>
<?php if (!empty($uncheckedlinkcount = $item->uncheckedlinkcount())) : ?>
<?php if (Config::urlchecker() && !empty($uncheckedlinkcount = $item->uncheckedlinkcount())) : ?>
<span class="uncheckedlinkcount"><?= $uncheckedlinkcount ?></span>
<?php endif ?>
</td>
Expand Down

0 comments on commit 7feea33

Please sign in to comment.