From e7f557e8e152946b9d36ec59547fd4555c864ee9 Mon Sep 17 00:00:00 2001 From: Tom Date: Tue, 28 Jan 2020 12:25:47 +1100 Subject: [PATCH] issue108: Priority does not flow down to child nodes (#109) * issue108: Priority does not flow down to child nodes #108 Add parent priority to child nodes when marking for crawl * enhancement: add priority to direct child nodes #108 This enhancement aims to flow priority down to direct child nodes only. Through the implementation of node levels and a level check when marking a node to be crawled, we only assign a parent priority to a child node if it is a direct ancestor of the original node. This will prevent passing priority recursively and if, for example, a child node is a top level node, filtering the priority to effectively all nodes, which is undesirable behaviour. * fix: Add priority check to node * fix: Remove extra table closing tag in install.xml * style: remove addition line in upgrade script * tests: Add unit tests for issue #108 * tests: Add priority provider to test all possible parent priorities --- classes/robot/crawler.php | 28 +++++++++- constants.php | 9 ++++ db/install.xml | 1 + db/upgrade.php | 15 ++++++ tests/phpunit/robot_crawler_test.php | 81 +++++++++++++++++++++++++++- version.php | 4 +- 6 files changed, 132 insertions(+), 6 deletions(-) diff --git a/classes/robot/crawler.php b/classes/robot/crawler.php index f051604e..a7b329d5 100644 --- a/classes/robot/crawler.php +++ b/classes/robot/crawler.php @@ -306,12 +306,20 @@ public function get_queue_size() { * @return object|boolean The node record if the resource pointed to by the URL can and should be considered; or `false` if the * URL is invalid or excluded. */ - public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT) { + public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT, + $level = TOOL_CRAWLER_NODE_LEVEL_PARENT) { global $DB, $CFG; $url = $this->absolute_url($baseurl, $url); + // Strip priority from indirect child nodes. Only parent and direct children + // of parent nodes have priority applied to avoid recursively applying priority + // to all ancestors of a parent node. + if ($level == TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD) { + $priority = TOOL_CRAWLER_PRIORITY_DEFAULT; + } + // Filter out non http protocols like mailto:cqulibrary@cqu.edu.au etc. $bits = parse_url($url); if (array_key_exists('scheme', $bits) @@ -420,6 +428,7 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO $node->external = self::is_external($url); $node->needscrawl = time(); $node->priority = $priority; + $node->level = $level; if (isset($courseid)) { $node->courseid = $courseid; @@ -438,6 +447,11 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO $node->priority = $priority; $needsupdating = true; } + if ($node->level != $level) { + // Set the level again, in case this node has been seen again at a different + // level, to avoid reprocessing. + $node->level = $level; + } if (isset($courseid)) { $node->courseid = $courseid; $needsupdating = true; @@ -901,8 +915,18 @@ private function link_from_node_to_url($from, $url, $text, $idattr) { global $DB; + // Ascertain the correct node level based on parent node level. + if (!empty($from->level) && $from->level == TOOL_CRAWLER_NODE_LEVEL_PARENT) { + $level = TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD; + } else { + $level = TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD; + } + + $priority = isset($from->priority) ? $from->priority : TOOL_CRAWLER_PRIORITY_DEFAULT; + $courseid = isset($from->courseid) ? $from->courseid : null; + // Add the node URL to the queue. - $to = $this->mark_for_crawl($from->url, $url); + $to = $this->mark_for_crawl($from->url, $url, $courseid, $priority, $level); if ($to === false) { return false; } diff --git a/constants.php b/constants.php index 860819e9..b0649738 100644 --- a/constants.php +++ b/constants.php @@ -93,3 +93,12 @@ define('TOOL_CRAWLER_PRIORITY_DEFAULT', 0); define('TOOL_CRAWLER_PRIORITY_NORMAL', 50); define('TOOL_CRAWLER_PRIORITY_HIGH', 100); + +/** + * Node level assigned to each node based on whether it is the parent node, or + * a child node discovered within a parent when crawling, or any child of a child + * node (or even further removed). + */ +define('TOOL_CRAWLER_NODE_LEVEL_PARENT', 2); +define('TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD', 1); +define('TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD', 0); diff --git a/db/install.xml b/db/install.xml index d2510ded..fb249872 100644 --- a/db/install.xml +++ b/db/install.xml @@ -27,6 +27,7 @@ + diff --git a/db/upgrade.php b/db/upgrade.php index 3138ffa5..6466e153 100644 --- a/db/upgrade.php +++ b/db/upgrade.php @@ -94,5 +94,20 @@ function xmldb_tool_crawler_upgrade($oldversion) { upgrade_plugin_savepoint(true, 2019100300, 'tool', 'crawler'); } + if ($oldversion < 2020012300) { + + // Define field level to be added to tool_crawler_url. + $table = new xmldb_table('tool_crawler_url'); + $field = new xmldb_field('level', XMLDB_TYPE_INTEGER, '1', null, null, null, '2', 'priority'); + + // Conditionally launch add field level. + if (!$dbman->field_exists($table, $field)) { + $dbman->add_field($table, $field); + } + + // Crawler savepoint reached. + upgrade_plugin_savepoint(true, 2020012300, 'tool', 'crawler'); + } + return true; } diff --git a/tests/phpunit/robot_crawler_test.php b/tests/phpunit/robot_crawler_test.php index 84a2dc46..29f29b30 100644 --- a/tests/phpunit/robot_crawler_test.php +++ b/tests/phpunit/robot_crawler_test.php @@ -27,6 +27,7 @@ defined('MOODLE_INTERNAL') || die('Direct access to this script is forbidden'); require_once(__DIR__ . '/../../locallib.php'); +require_once(__DIR__ . '/../../constants.php'); /** * Unit tests for link crawler robot @@ -295,6 +296,7 @@ public function test_should_be_excluded() { $node->contents = $page . $linktoexclude; $node->url = $url; $node->id = $insertid; + $node->level = TOOL_CRAWLER_NODE_LEVEL_PARENT; $this->resetAfterTest(true); @@ -310,6 +312,81 @@ public function test_should_be_excluded() { self::assertFalse($found); } -} - + /** + * Priority provider. + * + * @return array of potential crawler priority codes. + */ + public function priority_provider() { + return [ + ['high' => TOOL_CRAWLER_PRIORITY_HIGH], + ['normal' => TOOL_CRAWLER_PRIORITY_NORMAL], + ['default' => TOOL_CRAWLER_PRIORITY_DEFAULT] + ]; + } + /** + * @dataProvider priority_provider + * + * Test for issue #108 - passing node crawl priority to child nodes when parsing html. + */ + public function test_parse_html_priority_inheritance($parentpriority) { + global $CFG, $DB; + + $parentlocalurl = 'course/view.php?id=1§ion=2'; + $directchildlocalurl = 'mod/book/view.php?id=7'; + $indirectchildexternalurl = 'http://someexternalsite.net.au'; + + // Internal parent node. + $node = $this->robot->mark_for_crawl($CFG->wwwroot, $parentlocalurl, 1, $parentpriority); + $node->httpcode = 200; + $node->mimetype = 'text/html'; + $node->external = 0; + $node->contents = << + + + + Test title + + + Direct child node + + +HTML; + // Parse the parent node, to create the direct child node. + $parentnode = $this->robot->parse_html($node, $node->external); + + // Internal node direct child. + $url = new moodle_url('/' . $directchildlocalurl); + $node = $DB->get_record('tool_crawler_url', array('url' => $url->raw_out()) ); + $node->url = $CFG->wwwroot.'/'.$directchildlocalurl; + $node->httpcode = 200; + $node->mimetype = 'text/html'; + $node->external = 0; + $node->contents = << + + + + Test title + + + Indirect child node + + +HTML; + // Parse the direct child, to create the indirect child node. + $directchildnode = $this->robot->parse_html($node, $node->external); + $indirectchildnode = $DB->get_record('tool_crawler_url', ['url' => $indirectchildexternalurl]); + + // Direct child nodes should inherit priority from parent node (super node). + $this->assertEquals($parentnode->priority, $directchildnode->priority); + // Indirect child nodes should not inherit priority from parent node (super node). + $this->assertGreaterThanOrEqual($indirectchildnode->priority, $parentnode->priority); + // Indirect child nodes should not inherit priority from direct child node. + $this->assertGreaterThanOrEqual($indirectchildnode->priority, $directchildnode->priority); + // Indirect child nodes should not be able to have a high priority. + $this->assertLessThan(TOOL_CRAWLER_PRIORITY_HIGH, $indirectchildnode->priority); + } +} diff --git a/version.php b/version.php index fc112523..0e6dfe3b 100644 --- a/version.php +++ b/version.php @@ -27,8 +27,8 @@ defined('MOODLE_INTERNAL') || die(); -$plugin->version = 2020010600; // The current plugin version (Date: YYYYMMDDXX) -$plugin->release = 2020010600; // The current plugin version (Date: YYYYMMDDXX) +$plugin->version = 2020012300; // The current plugin version (Date: YYYYMMDDXX) +$plugin->release = 2020012300; // The current plugin version (Date: YYYYMMDDXX) $plugin->requires = 2016021800; // Requires this Moodle version. $plugin->component = 'tool_crawler'; // To check on upgrade, that module sits in correct place. $plugin->maturity = MATURITY_STABLE;