Skip to content

Commit

Permalink
issue108: Priority does not flow down to child nodes (#109)
Browse files Browse the repository at this point in the history
* issue108: Priority does not flow down to child nodes #108

Add parent priority to child nodes when marking for crawl

* enhancement: add priority to direct child nodes #108

This enhancement aims to flow priority down to direct child nodes only.
Through the implementation of node levels and a level check when
marking a node to be crawled, we only assign a parent priority to a
child node if it is a direct ancestor of the original node.
This will prevent passing priority recursively and if, for example, a child
node is a top level node, filtering the priority to effectively all nodes,
which is undesirable behaviour.

* fix: Add priority check to node

* fix: Remove extra table closing tag in install.xml

* style: remove addition line in upgrade script

* tests: Add unit tests for issue #108

* tests: Add priority provider to test all possible parent priorities
  • Loading branch information
Tom authored and brendanheywood committed Jan 28, 2020
1 parent 97b3799 commit e7f557e
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 6 deletions.
28 changes: 26 additions & 2 deletions classes/robot/crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -306,12 +306,20 @@ public function get_queue_size() {
* @return object|boolean The node record if the resource pointed to by the URL can and should be considered; or `false` if the
* URL is invalid or excluded.
*/
public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT) {
public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT,
$level = TOOL_CRAWLER_NODE_LEVEL_PARENT) {

global $DB, $CFG;

$url = $this->absolute_url($baseurl, $url);

// Strip priority from indirect child nodes. Only parent and direct children
// of parent nodes have priority applied to avoid recursively applying priority
// to all ancestors of a parent node.
if ($level == TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD) {
$priority = TOOL_CRAWLER_PRIORITY_DEFAULT;
}

// Filter out non http protocols like mailto:[email protected] etc.
$bits = parse_url($url);
if (array_key_exists('scheme', $bits)
Expand Down Expand Up @@ -420,6 +428,7 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO
$node->external = self::is_external($url);
$node->needscrawl = time();
$node->priority = $priority;
$node->level = $level;

if (isset($courseid)) {
$node->courseid = $courseid;
Expand All @@ -438,6 +447,11 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO
$node->priority = $priority;
$needsupdating = true;
}
if ($node->level != $level) {
// Set the level again, in case this node has been seen again at a different
// level, to avoid reprocessing.
$node->level = $level;
}
if (isset($courseid)) {
$node->courseid = $courseid;
$needsupdating = true;
Expand Down Expand Up @@ -901,8 +915,18 @@ private function link_from_node_to_url($from, $url, $text, $idattr) {

global $DB;

// Ascertain the correct node level based on parent node level.
if (!empty($from->level) && $from->level == TOOL_CRAWLER_NODE_LEVEL_PARENT) {
$level = TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD;
} else {
$level = TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD;
}

$priority = isset($from->priority) ? $from->priority : TOOL_CRAWLER_PRIORITY_DEFAULT;
$courseid = isset($from->courseid) ? $from->courseid : null;

// Add the node URL to the queue.
$to = $this->mark_for_crawl($from->url, $url);
$to = $this->mark_for_crawl($from->url, $url, $courseid, $priority, $level);
if ($to === false) {
return false;
}
Expand Down
9 changes: 9 additions & 0 deletions constants.php
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,12 @@
define('TOOL_CRAWLER_PRIORITY_DEFAULT', 0);
define('TOOL_CRAWLER_PRIORITY_NORMAL', 50);
define('TOOL_CRAWLER_PRIORITY_HIGH', 100);

/**
* Node level assigned to each node based on whether it is the parent node, or
* a child node discovered within a parent when crawling, or any child of a child
* node (or even further removed).
*/
define('TOOL_CRAWLER_NODE_LEVEL_PARENT', 2);
define('TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD', 1);
define('TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD', 0);
1 change: 1 addition & 0 deletions db/install.xml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
<FIELD NAME="httpmsg" TYPE="text" NOTNULL="false" SEQUENCE="false"/>
<FIELD NAME="errormsg" TYPE="text" NOTNULL="false" SEQUENCE="false"/>
<FIELD NAME="priority" TYPE="int" LENGTH="10" NOTNULL="false" SEQUENCE="false" DEFAULT="0"/>
<FIELD NAME="level" TYPE="int" LENGTH="1" NOTNULL="false" DEFAULT="2" SEQUENCE="false" COMMENT="Node level, 2 for parent node, 1 for direct child and 0 for subsequent children."/>
</FIELDS>
<KEYS>
<KEY NAME="primary" TYPE="primary" FIELDS="id"/>
Expand Down
15 changes: 15 additions & 0 deletions db/upgrade.php
Original file line number Diff line number Diff line change
Expand Up @@ -94,5 +94,20 @@ function xmldb_tool_crawler_upgrade($oldversion) {
upgrade_plugin_savepoint(true, 2019100300, 'tool', 'crawler');
}

if ($oldversion < 2020012300) {

// Define field level to be added to tool_crawler_url.
$table = new xmldb_table('tool_crawler_url');
$field = new xmldb_field('level', XMLDB_TYPE_INTEGER, '1', null, null, null, '2', 'priority');

// Conditionally launch add field level.
if (!$dbman->field_exists($table, $field)) {
$dbman->add_field($table, $field);
}

// Crawler savepoint reached.
upgrade_plugin_savepoint(true, 2020012300, 'tool', 'crawler');
}

return true;
}
81 changes: 79 additions & 2 deletions tests/phpunit/robot_crawler_test.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
defined('MOODLE_INTERNAL') || die('Direct access to this script is forbidden');

require_once(__DIR__ . '/../../locallib.php');
require_once(__DIR__ . '/../../constants.php');

/**
* Unit tests for link crawler robot
Expand Down Expand Up @@ -295,6 +296,7 @@ public function test_should_be_excluded() {
$node->contents = $page . $linktoexclude;
$node->url = $url;
$node->id = $insertid;
$node->level = TOOL_CRAWLER_NODE_LEVEL_PARENT;

$this->resetAfterTest(true);

Expand All @@ -310,6 +312,81 @@ public function test_should_be_excluded() {
self::assertFalse($found);
}

}

/**
* Priority provider.
*
* @return array of potential crawler priority codes.
*/
public function priority_provider() {
return [
['high' => TOOL_CRAWLER_PRIORITY_HIGH],
['normal' => TOOL_CRAWLER_PRIORITY_NORMAL],
['default' => TOOL_CRAWLER_PRIORITY_DEFAULT]
];
}

/**
* @dataProvider priority_provider
*
* Test for issue #108 - passing node crawl priority to child nodes when parsing html.
*/
public function test_parse_html_priority_inheritance($parentpriority) {
global $CFG, $DB;

$parentlocalurl = 'course/view.php?id=1&section=2';
$directchildlocalurl = 'mod/book/view.php?id=7';
$indirectchildexternalurl = 'http://someexternalsite.net.au';

// Internal parent node.
$node = $this->robot->mark_for_crawl($CFG->wwwroot, $parentlocalurl, 1, $parentpriority);
$node->httpcode = 200;
$node->mimetype = 'text/html';
$node->external = 0;
$node->contents = <<<HTML
<!doctype html>
<html>
<head>
<meta charset="utf-8"/>
<title>Test title</title>
</head>
<body class="course-1">
<a href="$CFG->wwwroot/$directchildlocalurl">Direct child node</a>
</body>
</html>
HTML;
// Parse the parent node, to create the direct child node.
$parentnode = $this->robot->parse_html($node, $node->external);

// Internal node direct child.
$url = new moodle_url('/' . $directchildlocalurl);
$node = $DB->get_record('tool_crawler_url', array('url' => $url->raw_out()) );
$node->url = $CFG->wwwroot.'/'.$directchildlocalurl;
$node->httpcode = 200;
$node->mimetype = 'text/html';
$node->external = 0;
$node->contents = <<<HTML
<!doctype html>
<html>
<head>
<meta charset="utf-8"/>
<title>Test title</title>
</head>
<body class="course-1">
<a href="$indirectchildexternalurl">Indirect child node</a>
</body>
</html>
HTML;
// Parse the direct child, to create the indirect child node.
$directchildnode = $this->robot->parse_html($node, $node->external);
$indirectchildnode = $DB->get_record('tool_crawler_url', ['url' => $indirectchildexternalurl]);

// Direct child nodes should inherit priority from parent node (super node).
$this->assertEquals($parentnode->priority, $directchildnode->priority);
// Indirect child nodes should not inherit priority from parent node (super node).
$this->assertGreaterThanOrEqual($indirectchildnode->priority, $parentnode->priority);
// Indirect child nodes should not inherit priority from direct child node.
$this->assertGreaterThanOrEqual($indirectchildnode->priority, $directchildnode->priority);
// Indirect child nodes should not be able to have a high priority.
$this->assertLessThan(TOOL_CRAWLER_PRIORITY_HIGH, $indirectchildnode->priority);
}
}
4 changes: 2 additions & 2 deletions version.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
defined('MOODLE_INTERNAL') || die();


$plugin->version = 2020010600; // The current plugin version (Date: YYYYMMDDXX)
$plugin->release = 2020010600; // The current plugin version (Date: YYYYMMDDXX)
$plugin->version = 2020012300; // The current plugin version (Date: YYYYMMDDXX)
$plugin->release = 2020012300; // The current plugin version (Date: YYYYMMDDXX)
$plugin->requires = 2016021800; // Requires this Moodle version.
$plugin->component = 'tool_crawler'; // To check on upgrade, that module sits in correct place.
$plugin->maturity = MATURITY_STABLE;
Expand Down

0 comments on commit e7f557e

Please sign in to comment.