Skip to content

Commit

Permalink
enhancement: add priority to direct child nodes #108
Browse files Browse the repository at this point in the history
This enhancement aims to flow priority down to direct child nodes only.
Through the implementation of node levels and a level check when
marking a node to be crawled, we only assign a parent priority to a
child node if it is a direct ancestor of the original node.
This will prevent passing priority recursively and if, for example, a child
node is a top level node, filtering the priority to effectively all nodes,
which is undesirable behaviour.
  • Loading branch information
Tom Dickman committed Jan 23, 2020
1 parent a7e809e commit 54026eb
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 4 deletions.
25 changes: 23 additions & 2 deletions classes/robot/crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -306,12 +306,20 @@ public function get_queue_size() {
* @return object|boolean The node record if the resource pointed to by the URL can and should be considered; or `false` if the
* URL is invalid or excluded.
*/
public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT) {
public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOOL_CRAWLER_PRIORITY_DEFAULT,
$level = TOOL_CRAWLER_NODE_LEVEL_PARENT) {

global $DB, $CFG;

$url = $this->absolute_url($baseurl, $url);

// Strip priority from indirect child nodes. Only parent and direct children
// of parent nodes have priority applied to avoid recursively applying priority
// to all ancestors of a parent node.
if ($level == TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD) {
$priority = TOOL_CRAWLER_PRIORITY_DEFAULT;
}

// Filter out non http protocols like mailto:[email protected] etc.
$bits = parse_url($url);
if (array_key_exists('scheme', $bits)
Expand Down Expand Up @@ -420,6 +428,7 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO
$node->external = self::is_external($url);
$node->needscrawl = time();
$node->priority = $priority;
$node->level = $level;

if (isset($courseid)) {
$node->courseid = $courseid;
Expand All @@ -438,6 +447,11 @@ public function mark_for_crawl($baseurl, $url, $courseid = null, $priority = TOO
$node->priority = $priority;
$needsupdating = true;
}
if ($node->level != $level) {
// Set the level again, in case this node has been seen again at a different
// level, to avoid reprocessing.
$node->level = $level;
}
if (isset($courseid)) {
$node->courseid = $courseid;
$needsupdating = true;
Expand Down Expand Up @@ -901,8 +915,15 @@ private function link_from_node_to_url($from, $url, $text, $idattr) {

global $DB;

// Ascertain the correct node level based on parent node level.
if ($from->level == TOOL_CRAWLER_NODE_LEVEL_PARENT) {
$level = TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD;
} else {
$level = TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD;
}

// Add the node URL to the queue.
$to = $this->mark_for_crawl($from->url, $url, null, $from->priority);
$to = $this->mark_for_crawl($from->url, $url, $from->courseid, $from->priority, $level);
if ($to === false) {
return false;
}
Expand Down
9 changes: 9 additions & 0 deletions constants.php
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,12 @@
define('TOOL_CRAWLER_PRIORITY_DEFAULT', 0);
define('TOOL_CRAWLER_PRIORITY_NORMAL', 50);
define('TOOL_CRAWLER_PRIORITY_HIGH', 100);

/**
* Node level assigned to each node based on whether it is the parent node, or
* a child node discovered within a parent when crawling, or any child of a child
* node (or even further removed).
*/
define('TOOL_CRAWLER_NODE_LEVEL_PARENT', 2);
define('TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD', 1);
define('TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD', 0);
2 changes: 2 additions & 0 deletions db/install.xml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
<FIELD NAME="httpmsg" TYPE="text" NOTNULL="false" SEQUENCE="false"/>
<FIELD NAME="errormsg" TYPE="text" NOTNULL="false" SEQUENCE="false"/>
<FIELD NAME="priority" TYPE="int" LENGTH="10" NOTNULL="false" SEQUENCE="false" DEFAULT="0"/>
<FIELD NAME="level" TYPE="int" LENGTH="1" NOTNULL="false" DEFAULT="2" SEQUENCE="false" COMMENT="Node level, 2 for parent node, 1 for direct child and 0 for subsequent children."/>
</FIELDS>
</FIELDS>
<KEYS>
<KEY NAME="primary" TYPE="primary" FIELDS="id"/>
Expand Down
16 changes: 16 additions & 0 deletions db/upgrade.php
Original file line number Diff line number Diff line change
Expand Up @@ -94,5 +94,21 @@ function xmldb_tool_crawler_upgrade($oldversion) {
upgrade_plugin_savepoint(true, 2019100300, 'tool', 'crawler');
}

if ($oldversion < 2020012300) {

// Define field level to be added to tool_crawler_url.
$table = new xmldb_table('tool_crawler_url');
$field = new xmldb_field('level', XMLDB_TYPE_INTEGER, '1', null, null, null, '2', 'priority');

// Conditionally launch add field level.
if (!$dbman->field_exists($table, $field)) {
$dbman->add_field($table, $field);
}

// Crawler savepoint reached.
upgrade_plugin_savepoint(true, 2020012300, 'tool', 'crawler');
}


return true;
}
4 changes: 2 additions & 2 deletions version.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
defined('MOODLE_INTERNAL') || die();


$plugin->version = 2020010600; // The current plugin version (Date: YYYYMMDDXX)
$plugin->release = 2020010600; // The current plugin version (Date: YYYYMMDDXX)
$plugin->version = 2020012300; // The current plugin version (Date: YYYYMMDDXX)
$plugin->release = 2020012300; // The current plugin version (Date: YYYYMMDDXX)
$plugin->requires = 2016021800; // Requires this Moodle version.
$plugin->component = 'tool_crawler'; // To check on upgrade, that module sits in correct place.
$plugin->maturity = MATURITY_STABLE;
Expand Down

0 comments on commit 54026eb

Please sign in to comment.