From 54026eb56c090c111783c9cf48167749cd6fcba0 Mon Sep 17 00:00:00 2001 From: Tom Dickman Date: Thu, 23 Jan 2020 13:01:06 +1100 Subject: [PATCH] enhancement: add priority to direct child nodes #108 This enhancement aims to flow priority down to direct child nodes only. Through the implementation of node levels and a level check when marking a node to be crawled, we only assign a parent priority to a child node if it is a direct ancestor of the original node. This will prevent passing priority recursively and if, for example, a child node is a top level node, filtering the priority to effectively all nodes, which is undesirable behaviour. --- classes/robot/crawler.php | 25 +++++++++++++++++++++++-- constants.php | 9 +++++++++ db/install.xml | 2 ++ db/upgrade.php | 16 ++++++++++++++++ version.php | 4 ++-- 5 files changed, 52 insertions(+), 4 deletions(-) diff --git a/classes/robot/crawler.php b/classes/robot/crawler.php index a6a34416..9605bd6b 100644 --- a/classes/robot/crawler.php +++ b/classes/robot/crawler.php @@ -306,12 +306,20 @@ public function get_queue_size() { * @return object|boolean The node record if the resource pointed to by the URL can and should be considered; or `false` if the * URL is invalid or excluded. */ - public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT) { + public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT, + $level = TOOL_CRAWLER_NODE_LEVEL_PARENT) { global $DB, $CFG; $url = $this->absolute_url($baseurl, $url); + // Strip priority from indirect child nodes. Only parent and direct children + // of parent nodes have priority applied to avoid recursively applying priority + // to all ancestors of a parent node. + if ($level == TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD) { + $priority = TOOL_CRAWLER_PRIORITY_DEFAULT; + } + // Filter out non http protocols like mailto:cqulibrary@cqu.edu.au etc. $bits = parse_url($url); if (array_key_exists('scheme', $bits) @@ -420,6 +428,7 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO $node->external = self::is_external($url); $node->needscrawl = time(); $node->priority = $priority; + $node->level = $level; if (isset($courseid)) { $node->courseid = $courseid; @@ -438,6 +447,11 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO $node->priority = $priority; $needsupdating = true; } + if ($node->level != $level) { + // Set the level again, in case this node has been seen again at a different + // level, to avoid reprocessing. + $node->level = $level; + } if (isset($courseid)) { $node->courseid = $courseid; $needsupdating = true; @@ -901,8 +915,15 @@ private function link_from_node_to_url($from, $url, $text, $idattr) { global $DB; + // Ascertain the correct node level based on parent node level. + if ($from->level == TOOL_CRAWLER_NODE_LEVEL_PARENT) { + $level = TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD; + } else { + $level = TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD; + } + // Add the node URL to the queue. - $to = $this->mark_for_crawl($from->url, $url, null, $from->priority); + $to = $this->mark_for_crawl($from->url, $url, $from->courseid, $from->priority, $level); if ($to === false) { return false; } diff --git a/constants.php b/constants.php index 860819e9..b0649738 100644 --- a/constants.php +++ b/constants.php @@ -93,3 +93,12 @@ define('TOOL_CRAWLER_PRIORITY_DEFAULT', 0); define('TOOL_CRAWLER_PRIORITY_NORMAL', 50); define('TOOL_CRAWLER_PRIORITY_HIGH', 100); + +/** + * Node level assigned to each node based on whether it is the parent node, or + * a child node discovered within a parent when crawling, or any child of a child + * node (or even further removed). + */ +define('TOOL_CRAWLER_NODE_LEVEL_PARENT', 2); +define('TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD', 1); +define('TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD', 0); diff --git a/db/install.xml b/db/install.xml index d2510ded..34277e4e 100644 --- a/db/install.xml +++ b/db/install.xml @@ -27,6 +27,8 @@ + + diff --git a/db/upgrade.php b/db/upgrade.php index 3138ffa5..01dc8f24 100644 --- a/db/upgrade.php +++ b/db/upgrade.php @@ -94,5 +94,21 @@ function xmldb_tool_crawler_upgrade($oldversion) { upgrade_plugin_savepoint(true, 2019100300, 'tool', 'crawler'); } + if ($oldversion < 2020012300) { + + // Define field level to be added to tool_crawler_url. + $table = new xmldb_table('tool_crawler_url'); + $field = new xmldb_field('level', XMLDB_TYPE_INTEGER, '1', null, null, null, '2', 'priority'); + + // Conditionally launch add field level. + if (!$dbman->field_exists($table, $field)) { + $dbman->add_field($table, $field); + } + + // Crawler savepoint reached. + upgrade_plugin_savepoint(true, 2020012300, 'tool', 'crawler'); + } + + return true; } diff --git a/version.php b/version.php index fc112523..0e6dfe3b 100644 --- a/version.php +++ b/version.php @@ -27,8 +27,8 @@ defined('MOODLE_INTERNAL') || die(); -$plugin->version = 2020010600; // The current plugin version (Date: YYYYMMDDXX) -$plugin->release = 2020010600; // The current plugin version (Date: YYYYMMDDXX) +$plugin->version = 2020012300; // The current plugin version (Date: YYYYMMDDXX) +$plugin->release = 2020012300; // The current plugin version (Date: YYYYMMDDXX) $plugin->requires = 2016021800; // Requires this Moodle version. $plugin->component = 'tool_crawler'; // To check on upgrade, that module sits in correct place. $plugin->maturity = MATURITY_STABLE;