-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
enhancement: add priority to direct child nodes #108
This enhancement aims to flow priority down to direct child nodes only. By introducing node levels and a level check when marking a node to be crawled, we only assign a parent's priority to a child node if it is a direct descendant (i.e. a direct child) of the original node. This prevents priority from being passed down recursively; otherwise, if for example a child node is a top-level node, the priority would filter through to effectively all nodes, which is undesirable behaviour.
- Loading branch information
Tom Dickman
committed
Jan 23, 2020
1 parent
a7e809e
commit 54026eb
Showing
5 changed files
with
52 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -306,12 +306,20 @@ public function get_queue_size() { | |
* @return object|boolean The node record if the resource pointed to by the URL can and should be considered; or `false` if the | ||
* URL is invalid or excluded. | ||
*/ | ||
public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT) { | ||
public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT, | ||
$level = TOOL_CRAWLER_NODE_LEVEL_PARENT) { | ||
|
||
global $DB, $CFG; | ||
|
||
$url = $this->absolute_url($baseurl, $url); | ||
|
||
// Strip priority from indirect child nodes. Only parent and direct children | ||
// of parent nodes have priority applied to avoid recursively applying priority | ||
// to all descendants of a parent node. | ||
if ($level == TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD) { | ||
$priority = TOOL_CRAWLER_PRIORITY_DEFAULT; | ||
} | ||
|
||
// Filter out non http protocols like mailto:user@example.com etc. | ||
$bits = parse_url($url); | ||
if (array_key_exists('scheme', $bits) | ||
|
@@ -420,6 +428,7 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO | |
$node->external = self::is_external($url); | ||
$node->needscrawl = time(); | ||
$node->priority = $priority; | ||
$node->level = $level; | ||
|
||
if (isset($courseid)) { | ||
$node->courseid = $courseid; | ||
|
@@ -438,6 +447,11 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO | |
$node->priority = $priority; | ||
$needsupdating = true; | ||
} | ||
if ($node->level != $level) { | ||
// Set the level again, in case this node has been seen again at a different | ||
// level, to avoid reprocessing. | ||
$node->level = $level; | ||
} | ||
if (isset($courseid)) { | ||
$node->courseid = $courseid; | ||
$needsupdating = true; | ||
|
@@ -901,8 +915,15 @@ private function link_from_node_to_url($from, $url, $text, $idattr) { | |
|
||
global $DB; | ||
|
||
// Ascertain the correct node level based on parent node level. | ||
if ($from->level == TOOL_CRAWLER_NODE_LEVEL_PARENT) { | ||
$level = TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD; | ||
} else { | ||
$level = TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD; | ||
} | ||
|
||
// Add the node URL to the queue. | ||
$to = $this->mark_for_crawl($from->url, $url, null, $from->priority); | ||
$to = $this->mark_for_crawl($from->url, $url, $from->courseid, $from->priority, $level); | ||
if ($to === false) { | ||
return false; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters