diff --git a/classes/robot/crawler.php b/classes/robot/crawler.php index 497519b2..a7b329d5 100644 --- a/classes/robot/crawler.php +++ b/classes/robot/crawler.php @@ -916,16 +916,17 @@ private function link_from_node_to_url($from, $url, $text, $idattr) { global $DB; // Ascertain the correct node level based on parent node level. - if ($from->level == TOOL_CRAWLER_NODE_LEVEL_PARENT) { + if (!empty($from->level) && $from->level == TOOL_CRAWLER_NODE_LEVEL_PARENT) { $level = TOOL_CRAWLER_NODE_LEVEL_DIRECT_CHILD; } else { $level = TOOL_CRAWLER_NODE_LEVEL_INDIRECT_CHILD; } - $priority = $from->priority ? $from->priority : TOOL_CRAWLER_PRIORITY_DEFAULT; + $priority = isset($from->priority) ? $from->priority : TOOL_CRAWLER_PRIORITY_DEFAULT; + $courseid = isset($from->courseid) ? $from->courseid : null; // Add the node URL to the queue. - $to = $this->mark_for_crawl($from->url, $url, $from->courseid, $priority, $level); + $to = $this->mark_for_crawl($from->url, $url, $courseid, $priority, $level); if ($to === false) { return false; } diff --git a/tests/phpunit/robot_crawler_test.php b/tests/phpunit/robot_crawler_test.php index 84a2dc46..08c80d28 100644 --- a/tests/phpunit/robot_crawler_test.php +++ b/tests/phpunit/robot_crawler_test.php @@ -295,6 +295,7 @@ public function test_should_be_excluded() { $node->contents = $page . $linktoexclude; $node->url = $url; $node->id = $insertid; + $node->level = TOOL_CRAWLER_NODE_LEVEL_PARENT; $this->resetAfterTest(true); @@ -310,6 +311,67 @@ public function test_should_be_excluded() { self::assertFalse($found); } + /** + * Test for issue #108 - passing node crawl priority to child nodes when parsing html. + */ + public function test_parse_html_priority_inheritance() { + global $CFG, $DB; + + $parentlocalurl = 'course/view.php?id=1&section=2'; + $directchildlocalurl = 'mod/book/view.php?id=7'; + $indirectchildexternalurl = 'http://someexternalsite.net.au'; + $nodes = []; + + // Internal parent node. 
+ $node = $this->robot->mark_for_crawl($CFG->wwwroot, $parentlocalurl, 1, TOOL_CRAWLER_PRIORITY_HIGH); + $node->httpcode = 200; + $node->mimetype = 'text/html'; + $node->external = 0; + $node->contents = << + + + + Test title + + + Direct child node + + +HTML; + // Parse the parent node, to create the direct child node. + $parentnode = $this->robot->parse_html($node, $node->external); + + // Internal node direct child. + $url = new moodle_url('/' . $directchildlocalurl); + $node = $DB->get_record('tool_crawler_url', array('url' => $url->raw_out()) ); + $node->url = $CFG->wwwroot.'/'.$directchildlocalurl; + $node->httpcode = 200; + $node->mimetype = 'text/html'; + $node->external = 0; + $node->contents = << + + + + Test title + + + Indirect child node + + +HTML; + // Parse the direct child, to create the indirect child node. + $directchildnode = $this->robot->parse_html($node, $node->external); + $indirectchildnode = $DB->get_record('tool_crawler_url', ['url' => $indirectchildexternalurl]); + + // Direct child nodes should inherit priority from parent node (super node). + $this->assertEquals($parentnode->priority, $directchildnode->priority); + // Indirect child nodes should not inherit a high priority from parent node (super node). + $this->assertGreaterThanOrEqual($indirectchildnode->priority, $parentnode->priority); + // Indirect child nodes should not have a higher priority than the direct child node they were found on. + $this->assertGreaterThanOrEqual($indirectchildnode->priority, $directchildnode->priority); + // Indirect child nodes should not be able to have a high priority. + $this->assertLessThan(TOOL_CRAWLER_PRIORITY_HIGH, $indirectchildnode->priority); + } } - -