Skip to content

Commit

Permalink
incremental dead link expire date
Browse files Browse the repository at this point in the history
+ URL cache stores timestamp and expire date
- remove cases where page should be rendered because of external links
  • Loading branch information
vincent-peugnet committed Nov 26, 2024
1 parent 5aff7ee commit 45d9c61
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 42 deletions.
15 changes: 3 additions & 12 deletions app/class/Modelpage.php
Original file line number Diff line number Diff line change
Expand Up @@ -403,10 +403,7 @@ protected function reset(Page $page, array $reset): Page
*
* 1. This will compare edit and render dates
* 2. then if render file exists
* 3. then if page have external links and
* - if some haven't been checked yet
* - or if it's been a long time
* - or if url cache is deleted
* 3. then if page has external links and some haven't been checked yet
* 4. then if the templatebody is set and has been updated
*
* @param Page $page Page to be checked
Expand All @@ -425,14 +422,8 @@ public function needtoberendered(Page $page): bool
) {
return true;
}
if (count($page->externallinks()) > 0) {
$now = new DateTimeImmutable("now", timezone_open("Europe/Paris"));
if (
$page->daterender()->diff($now)->days > Serviceurlchecker::CACHE_EXPIRE_TIME
|| $page->uncheckedlinkcount() > 0
) {
return true;
}
if (!empty(($page->externallinks()))) {
return $page->uncheckedlinkcount() > 0;
}
if (!empty($page->templatebody())) {
try {
Expand Down
72 changes: 42 additions & 30 deletions app/class/Serviceurlchecker.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,11 @@ class Serviceurlchecker
/** @var int MAX_BOUNCE limit of redirections to follow */
public const MAX_BOUNCE = 8;

/** @var int CACHE_EXPIRE_TIME in days */
public const CACHE_EXPIRE_TIME = 90;
/** @var int OK_CACHE_EXPIRE_TIME in days */
public const OK_CACHE_EXPIRE_TIME = 90;

/** @var int DEAD_CACHE_EXPIRE_TIME in minutes */
public const DEAD_CACHE_EXPIRE_TIME = 1;

/** @var null[] URL response code considered as not dead */
public const ACCEPTED_RESPONSE_CODES = [
Expand All @@ -38,10 +41,9 @@ class Serviceurlchecker

/**
* Tool that checks the status of URLs, first in the cache, then on the Web
* The cache expires according to CACHE_EXPIRE_TIME constant
* A timeout has to be set to limit Web checking time
*
* @param int $timeout allocated time for looking URL on the Web (in seconds)
* @param int $timeout allocated time for looking URL on the Web (in seconds)
* if set to `0`, Check on the Web is disabled: only the cache is used
*/
public function __construct(int $timeout = 0)
Expand All @@ -60,6 +62,7 @@ public function __construct(int $timeout = 0)

/**
* Check status of URL
* If the URL status is not cached and valid, it's added to the queue.
*
* @param string $url The URL to verify
*
Expand All @@ -69,12 +72,11 @@ public function __construct(int $timeout = 0)
*/
public function check(string $url): bool
{
// $url = filter_var($url, FILTER_SANITIZE_URL);
if ($this->iscachedandvalid($url)) {
return key_exists($this->urls[$url]['response'], self::ACCEPTED_RESPONSE_CODES);
return $this->isalive($this->urls[$url]['response']);
} else {
$this->queue[] = $url;
throw new RuntimeException('unchecked URL');
throw new RuntimeException('no status about this URL');
}
}

Expand All @@ -91,8 +93,7 @@ protected function iscachedandvalid(string $url): bool
if (!key_exists($url, $this->urls)) {
return false;
}
if (($this->urls[$url]['timestamp'] + self::CACHE_EXPIRE_TIME * 3600 * 24) < time()) {
unset($this->urls[$url]);
if ($this->urls[$url]['expire'] < time()) {
return false;
}
return true;
Expand Down Expand Up @@ -160,27 +161,28 @@ public function processqueue(): int
$newurls = [];

foreach ($curlhandles as $url => $curlhandle) {
switch (curl_errno($curlhandle)) {
case CURLE_OK:
$newurls[$url] = [
'response' => curl_getinfo($curlhandle, CURLINFO_HTTP_CODE),
'timestamp' => time()
];
break;
case CURLE_OPERATION_TIMEDOUT:
if (count($this->queue) < 5 && $this->timeout >= 3) {
$newurls[$url] = [
'response' => 0,
'timestamp' => time() - ( self::CACHE_EXPIRE_TIME - 1 ) * 24 * 3600,
];
}
break;
case CURLE_COULDNT_RESOLVE_HOST:
default:
$newurls[$url] = [
'response' => 0,
'timestamp' => time(),
];
$curlerror = curl_errno($curlhandle);

if ($curlerror === CURLE_OK) {
$newurls[$url] = [
'response' => curl_getinfo($curlhandle, CURLINFO_HTTP_CODE),
'timestamp' => time(),
'expire' => time() + self::OK_CACHE_EXPIRE_TIME * 24 * 3600,
];
} elseif ($curlerror === CURLE_OPERATION_TIMEDOUT && count($this->queue) > 10) {
// if queue was big, there is chances that timeout is due to curl saturation
// consider the link as unchecked
} else {
if (key_exists($url, $this->urls) && $this->isalive($this->urls[$url]['response'])) {
$expire = time() + (time() - $this->urls[$url]['timestamp']) * 2;
} else {
$expire = time() + self::DEAD_CACHE_EXPIRE_TIME * 60;
}
$newurls[$url] = [
'response' => $curlerror,
'timestamp' => time(),
'expire' => $expire,
];
}
}

Expand All @@ -191,6 +193,16 @@ public function processqueue(): int
return count($newurls);
}

/**
 * Tell whether a response code counts as "alive".
 *
 * The code is looked up among the keys of ACCEPTED_RESPONSE_CODES; the values
 * stored in that constant are not inspected, only key presence matters.
 *
 * @param int $response         HTTP response code (e.g. the value stored under the
 *                              'response' key of a cached URL entry)
 *
 * @return bool                 True when the code is one of the accepted response codes,
 *                              false otherwise.
 */
public static function isalive(int $response): bool
{
    $acceptedcodes = self::ACCEPTED_RESPONSE_CODES;

    // Key lookup on purpose: array_key_exists() also matches keys whose value is null.
    return array_key_exists($response, $acceptedcodes);
}

/**
* Save the cache
*
Expand Down

0 comments on commit 45d9c61

Please sign in to comment.