diff --git a/app/class/Controllerhome.php b/app/class/Controllerhome.php index 20ae34d..6e307d7 100644 --- a/app/class/Controllerhome.php +++ b/app/class/Controllerhome.php @@ -328,14 +328,12 @@ public function multiedit() public function multirender(): void { $pagelist = $_POST['pagesid'] ?? []; - $checkurl = Config::urlchecker() && boolval($_POST['checkurl']); $total = count($pagelist); $pagelist = $this->pagemanager->pagelistbyid($pagelist); $count = 0; - $urlchecker = $checkurl ? new Serviceurlchecker(12) : null; // time to check URLs is limited to 12s foreach ($pagelist as $page) { try { - $page = $this->pagemanager->renderpage($page, $this->router, $urlchecker); + $page = $this->pagemanager->renderpage($page, $this->router, Config::urlchecker() ? new Serviceurlchecker(0) : 0); if ($this->pagemanager->update($page)) { $count++; } diff --git a/app/class/Controllerpage.php b/app/class/Controllerpage.php index c48bc67..56a6022 100644 --- a/app/class/Controllerpage.php +++ b/app/class/Controllerpage.php @@ -75,7 +75,7 @@ public function render(string $page): void if ($this->importpage() && $this->user->iseditor()) { try { - $urlchecker = Config::urlchecker() ? new Serviceurlchecker(8) : null; + $urlchecker = Config::urlchecker() ? new Serviceurlchecker(6) : null; $this->page = $this->pagemanager->renderpage($this->page, $this->router, $urlchecker); } catch (RuntimeException $e) { Logger::errorex($e); @@ -187,7 +187,7 @@ public function read(string $page): void $oldlinkto = $this->page->linkto(); } try { - $urlchecker = Config::urlchecker() ? new Serviceurlchecker(8) : null; + $urlchecker = Config::urlchecker() ? new Serviceurlchecker(3) : null; $this->page = $this->pagemanager->renderpage($this->page, $this->router, $urlchecker); } catch (RuntimeException $e) { Logger::errorex($e); diff --git a/app/class/Page.php b/app/class/Page.php index 49dc10b..7e7ed64 100644 --- a/app/class/Page.php +++ b/app/class/Page.php @@ -6,8 +6,6 @@ use DateTimeImmutable; use DateTimeZone; -use function Clue\StreamFilter\fun; - abstract class Page extends Item { protected $id; @@ -813,21 +811,25 @@ public function deadlinkcount(): int } /** - * Used in the title of external links column in hme view + * Used in the title of external links column in home view * * @return string All links separated by new lines followed by a emoji ✅ or 💀 */ public function externallinkstitle(): string { - $links = $this->externallinks; - array_walk($links, function (&$value, string $key) { - if (is_null($value)) { - $symbol = '🔍️'; - } else { - $symbol = $value ? '✅' : '💀'; - } - $value = $key . ' ' . $symbol; - }); + if (Config::urlchecker()) { + $links = $this->externallinks; + array_walk($links, function (&$value, string $key) { + if (is_null($value)) { + $symbol = '🔍️'; + } else { + $symbol = $value ? '✅' : '💀'; + } + $value = $key . ' ' . $symbol; + }); + } else { + $links = array_keys($this->externallinks); + } return implode("\n", $links); } diff --git a/app/class/Servicerender.php b/app/class/Servicerender.php index 700370a..df52a48 100644 --- a/app/class/Servicerender.php +++ b/app/class/Servicerender.php @@ -6,6 +6,7 @@ use DOMDocument; use DOMElement; use DOMNodeList; +use DOMXPath; use Exception; use InvalidArgumentException; use LogicException; @@ -392,15 +393,15 @@ protected function htmlparser(string $html): string if (!$link->hasAttribute('target') && $this->externallinkblank) { $link->setAttribute('target', '_blank'); } - $url = filter_var($href, FILTER_SANITIZE_URL); - $this->urls[$url] = null; + $this->urls[$href] = null; if ($this->urlchecker !== null) { try { - $dead = $this->urlchecker->isdead($url); - $classes[] = $dead ? 'dead' : 'ok'; - $this->urls[$url] = !$dead; + $response = $this->urlchecker->check($href); + $classes[] = $response ? 'ok' : 'dead'; + $link->setAttribute('data-urlcheck', '1'); + $this->urls[$href] = $response; } catch (RuntimeException $e) { - // Web search limit reached + $link->setAttribute('data-urlcheck', '0'); } } } elseif (preg_match('~^([a-z0-9-_]+)((\/?#[a-z0-9-_]+)|(\/([\w\-\%\[\]\=\?\&]*)))?$~', $href, $out)) { @@ -443,6 +444,31 @@ protected function htmlparser(string $html): string $link->setAttribute('class', implode(' ', array_unique($classes))); } } + + // check for URLs that where not cached + try { + if ($this->urlchecker !== null && $this->urlchecker->processqueue()) { + $selector = new DOMXPath($dom); + $links = $selector->query('//a[ @data-urlcheck = 0 ]'); + foreach ($links as $link) { + assert($link instanceof DOMElement); + $href = $link->getAttribute('href'); + $class = $link->getAttribute('class'); + $classes = explode(' ', $class); + try { + $response = $this->urlchecker->check($href); + $classes[] = $response ? 'ok' : 'dead'; + $link->setAttribute('class', implode(' ', array_unique($classes))); + $link->setAttribute('data-urlcheck', '1'); + $this->urls[$href] = $response; + } catch (RuntimeException $e) { + } + } + } + } catch (RuntimeException $e) { + Logger::errorex($e); + } + $images = $dom->getElementsByTagName('img'); $this->sourceparser($images); $sources = $dom->getElementsByTagName('source'); diff --git a/app/class/Serviceurlchecker.php b/app/class/Serviceurlchecker.php index 052e66e..4f9f3eb 100644 --- a/app/class/Serviceurlchecker.php +++ b/app/class/Serviceurlchecker.php @@ -4,20 +4,21 @@ use RuntimeException; use Wcms\Exception\Filesystemexception; +use Wcms\Exception\Missingextensionexception; /** * Check URL */ class Serviceurlchecker { - /** @var array[] $urls */ + /** @var array[] $urls cached URLs */ protected array $urls = []; - /** @var int $starttimestamp timestamp lauched when object is build (in seconds) */ - protected int $starttimestamp; + /** @var string[] $queue ULRs that need to be checked */ + protected array $queue = []; - /** @var int $webchecktime time before stopping Web check (in seconds) */ - protected int $webchecktime; + /** @var int $timeout before stopping Web check (in seconds) */ + protected int $timeout; /** @var bool $cacheonly Limit URL checking to cache */ protected bool $cacheonly = false; @@ -38,70 +39,52 @@ class Serviceurlchecker /** * Tool that check for urls status, first in the cache, then on the Web * The cache expires according to CACHE_EXPIRE_TIME constant - * A time limite have to be set to limit Web checking time + * A timeout have to be set to limit Web checking time * - * @param int $webchecktime allocated time for looking URL on the Web (in seconds) + * @param int $timeout allocated time for looking URL on the Web (in seconds) * if set to `0`, Check on the Web is disabled: only the cache is used */ - public function __construct(int $webchecktime) + public function __construct(int $timeout = 0) { - $this->webchecktime = $webchecktime; - if ($webchecktime === 0) { + $this->timeout = $timeout; + if ($timeout === 0) { $this->cacheonly = true; } - $this->starttimestamp = time(); try { $urlfile = Fs::readfile(Model::URLS_FILE); $this->urls = json_decode($urlfile, true); } catch (Filesystemexception $e) { - // This mean the tag file does not exist + // This mean the url cache file does not exist } } /** - * Check if URL is dead according to ACCEPTED_RESPONSE_CODES + * Check status of URL * - * @throws RuntimeException If time limit is reached and URL status is expired or not stored in cache - */ - public function isdead(string $url): bool - { - if ($this->iscachedandvalid($url)) { - return !key_exists($this->urls[$url]['response'], self::ACCEPTED_RESPONSE_CODES); - } - if (!$this->cacheonly && time() < ($this->starttimestamp + $this->webchecktime)) { - $this->urls[$url]['response'] = $this->getresponse($url); - $this->urls[$url]['timestamp'] = time(); - return !key_exists($this->urls[$url]['response'], self::ACCEPTED_RESPONSE_CODES); - } - throw new RuntimeException('Impossible to give a status about this URL'); - } - - /** - * read HTTP response headers + * @param string $url The URL to verify + * + * @return bool True if the url is alive, false if it's dead * - * @return int HTTP response code, or `0` if no response + * @throws RuntimeException If the status of the URL is not cached */ - protected function getresponse(string $url): int + public function check(string $url): bool { - $scheme = parse_url($url, PHP_URL_SCHEME); - $context = stream_context_create([$scheme => ['method' => "HEAD",'header' => 'User-Agent: Mozilla/5.0']]); - $headers = @get_headers($url, 1, $context); // `@` avoid throwing PHP error - if ($headers === false) { - return 0; - } - for ($i = 0; $i < self::MAX_BOUNCE; $i++) { - if (!isset($headers[$i])) { - $id = $i - 1; - $http = $headers[$id]; - return intval(substr($http, 9, 3)); - } + // $url = filter_var($url, FILTER_SANITIZE_URL); + if ($this->iscachedandvalid($url)) { + return key_exists($this->urls[$url]['response'], self::ACCEPTED_RESPONSE_CODES); + } else { + $this->queue[] = $url; + throw new RuntimeException('unchecked URL'); } - return 0; } /** * Check if the status of URL is cached and has not expired * If cache is expired, the entry is deleted + * + * @param string $url The URL to verify + * + * @return bool Indicate if the URL status is cached and has not expired */ protected function iscachedandvalid(string $url): bool { @@ -115,6 +98,99 @@ protected function iscachedandvalid(string $url): bool return true; } + /** + * If queue contains URLs, process it ! + * All the que may not be processed, it depend on $this->timeout, + * Which is set during object creation. + * + * @return int Number of new URL analysed (iundependent from status) + * + * @throws Missingextensionexception If curl is not installed + * @throws RuntimeException If curl failed + */ + public function processqueue(): int + { + if (!extension_loaded('curl')) { + throw new Missingextensionexception("PHP Curl extension is not installed"); + } + + if (empty($this->queue)) { + return 0; + } + + $this->queue = array_unique($this->queue); + + $multihandle = curl_multi_init(); + curl_multi_setopt($multihandle, CURLMOPT_MAX_TOTAL_CONNECTIONS, 10); + + foreach ($this->queue as $url) { + $curlhandles[$url] = curl_init($url); + curl_setopt($curlhandles[$url], CURLOPT_NOBODY, true); + curl_setopt($curlhandles[$url], CURLOPT_HEADER, true); + curl_setopt($curlhandles[$url], CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlhandles[$url], CURLOPT_HTTPGET, true); + curl_setopt($curlhandles[$url], CURLOPT_TIMEOUT, $this->timeout); + curl_setopt($curlhandles[$url], CURLOPT_FOLLOWLOCATION, true); + curl_setopt($curlhandles[$url], CURLOPT_MAXREDIRS, self::MAX_BOUNCE); + + curl_multi_add_handle($multihandle, $curlhandles[$url]); + } + + + do { + $status = curl_multi_exec($multihandle, $unfinishedHandles); + if ($status !== CURLM_OK) { + throw new RuntimeException(curl_multi_strerror(curl_multi_errno($multihandle))); + } + + while (($info = curl_multi_info_read($multihandle)) !== false) { + if ($info['msg'] === CURLMSG_DONE) { + $handle = $info['handle']; + curl_multi_remove_handle($multihandle, $handle); + } + } + + if ($unfinishedHandles) { + if ((curl_multi_select($multihandle)) === -1) { + throw new RuntimeException(curl_multi_strerror(curl_multi_errno($multihandle))); + } + } + } while ($unfinishedHandles); + + $newurls = []; + + foreach ($curlhandles as $url => $curlhandle) { + switch (curl_errno($curlhandle)) { + case CURLE_OK: + $newurls[$url] = [ + 'response' => curl_getinfo($curlhandle, CURLINFO_HTTP_CODE), + 'timestamp' => time() + ]; + break; + case CURLE_OPERATION_TIMEDOUT: + if (count($this->queue) < 5 && $this->timeout >= 3) { + $newurls[$url] = [ + 'response' => 0, + 'timestamp' => time() - ( self::CACHE_EXPIRE_TIME - 1 ) * 24 * 3600, + ]; + } + break; + case CURLE_COULDNT_RESOLVE_HOST: + default: + $newurls[$url] = [ + 'response' => 0, + 'timestamp' => time(), + ]; + } + } + + curl_multi_close($multihandle); + + $this->urls = array_merge($this->urls, $newurls); + + return count($newurls); + } + /** * Save the cache * diff --git a/app/view/templates/home.php b/app/view/templates/home.php index 140d225..2dad795 100644 --- a/app/view/templates/home.php +++ b/app/view/templates/home.php @@ -328,10 +328,10 @@ class="redirection"