-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
issue108: Priority does not flow down to child nodes (#109)
* issue108: Priority does not flow down to child nodes #108 Add parent priority to child nodes when marking for crawl * enhancement: add priority to direct child nodes #108 This enhancement aims to flow priority down to direct child nodes only. Through the implementation of node levels and a level check when marking a node to be crawled, we only assign a parent priority to a child node if it is a direct descendant of the original node. This prevents priority from being passed down recursively; otherwise, if for example a child node is a top-level node, the priority would propagate to effectively all nodes, which is undesirable behaviour. * fix: Add priority check to node * fix: Remove extra table closing tag in install.xml * style: remove additional line in upgrade script * tests: Add unit tests for issue #108 * tests: Add priority provider to test all possible parent priorities
- Loading branch information
1 parent
97b3799
commit e7f557e
Showing
6 changed files
with
132 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -306,12 +306,20 @@ public function get_queue_size() { | |
* @return object|boolean The node record if the resource pointed to by the URL can and should be considered; or `false` if the | ||
* URL is invalid or excluded. | ||
*/ | ||
public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT) { | ||
public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT, | ||
$level = TOOL_CRAWLER_NODE_LEVEL_PARENT) { | ||
|
||
global $DB, $CFG; | ||
|
||
$url = $this->absolute_url($baseurl, $url); | ||
|
||
// Strip priority from indirect child nodes. Only parent and direct children | ||
// of parent nodes have priority applied to avoid recursively applying priority | ||
// to all descendants of a parent node. | ||
if ($level == TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD) { | ||
$priority = TOOL_CRAWLER_PRIORITY_DEFAULT; | ||
} | ||
|
||
// Filter out non http protocols like mailto:user@example.com etc. | ||
$bits = parse_url($url); | ||
if (array_key_exists('scheme', $bits) | ||
|
@@ -420,6 +428,7 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO | |
$node->external = self::is_external($url); | ||
$node->needscrawl = time(); | ||
$node->priority = $priority; | ||
$node->level = $level; | ||
|
||
if (isset($courseid)) { | ||
$node->courseid = $courseid; | ||
|
@@ -438,6 +447,11 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO | |
$node->priority = $priority; | ||
$needsupdating = true; | ||
} | ||
if ($node->level != $level) { | ||
// Set the level again, in case this node has been seen again at a different | ||
// level, to avoid reprocessing. | ||
$node->level = $level; | ||
} | ||
if (isset($courseid)) { | ||
$node->courseid = $courseid; | ||
$needsupdating = true; | ||
|
@@ -901,8 +915,18 @@ private function link_from_node_to_url($from, $url, $text, $idattr) { | |
|
||
global $DB; | ||
|
||
// Ascertain the correct node level based on parent node level. | ||
if (!empty($from->level) && $from->level == TOOL_CRAWLER_NODE_LEVEL_PARENT) { | ||
$level = TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD; | ||
} else { | ||
$level = TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD; | ||
} | ||
|
||
$priority = isset($from->priority) ? $from->priority : TOOL_CRAWLER_PRIORITY_DEFAULT; | ||
$courseid = isset($from->courseid) ? $from->courseid : null; | ||
|
||
// Add the node URL to the queue. | ||
$to = $this->mark_for_crawl($from->url, $url); | ||
$to = $this->mark_for_crawl($from->url, $url, $courseid, $priority, $level); | ||
if ($to === false) { | ||
return false; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters