From 2fd85107b3a221d6c0dac92452feb96f8edcf224 Mon Sep 17 00:00:00 2001 From: st143971 Date: Mon, 12 Feb 2024 05:43:58 +0100 Subject: [PATCH] Added pre and postprocessing to Search results to make them more readable and neat. --- block_slidefinder.php | 3 +- externallib.php | 58 +++++---- locallib.php | 238 +++++++++++++++++++++++++++------- templates/lrf_search.mustache | 28 ++-- version.php | 2 +- 5 files changed, 251 insertions(+), 78 deletions(-) diff --git a/block_slidefinder.php b/block_slidefinder.php index fbb3b08..d6e3133 100644 --- a/block_slidefinder.php +++ b/block_slidefinder.php @@ -103,9 +103,10 @@ public function get_content() { } } + // data[0] = array([section, filename, page, bookurl, size, content]). $data = [[], []]; if (!is_null($course)) { - $data = block_slidefinder_get_content_as_chapters_for_all_book_pdf_matches_from_course($course->id, $USER->id); + $data = block_slidefinder_get_all_content_of_course_as_sections_with_metadata($course->id, $USER->id); if (!empty($data[1])) { $footer .= get_string('misconfigured_info', get_class($this)); foreach ($data[1] as $key => $value) { diff --git a/externallib.php b/externallib.php index a901273..5fb735a 100644 --- a/externallib.php +++ b/externallib.php @@ -109,19 +109,24 @@ public static function get_searched_locations($userid, $courseid, $searchstring, $coursecontext = context_course::instance($course->id); self::validate_context($coursecontext); - [$chapters, $misconfiguredchapters] = - block_slidefinder_get_content_as_chapters_for_all_book_pdf_matches_from_course($courseid, $userid); + // Get all searchable content. + [$sections, $_] = block_slidefinder_get_all_content_of_course_as_sections_with_metadata($courseid, $userid); // Get Search Results & Context for PDFs. + $data = []; + foreach ($sections as $section) { + $data = self::search_content($data, $section, $searchstring, $contextlength); + } + + // Format results. 
$results = []; - foreach ($chapters as $chapter) { - $result = self::search_content($chapter, $searchstring, $contextlength); - if ($result) { + foreach ($data as $file) { + foreach ($file as $chapter) { $results[] = [ - 'filename' => $result->filename, - 'page_number' => $result->page, - 'book_chapter_url' => $result->bookurl, - 'context_snippet' => $result->context, + 'filename' => $chapter->filename, + 'page_number' => $chapter->page, + 'book_chapter_url' => $chapter->bookurl, + 'context_snippet' => $chapter->context, ]; } } @@ -131,7 +136,7 @@ public static function get_searched_locations($userid, $courseid, $searchstring, } /** - * Returns description of the method return values + * Returns description of the method return values. * @return external_value */ public static function get_searched_locations_returns() { @@ -139,21 +144,21 @@ public static function get_searched_locations_returns() { } /** - * Searches for the $searchterm in the given $page->content and - * returns the page with a $page->context context snippet if it was found. returns null if not. + * Searches for the $searchterm in the given $section->content and populates the given $results array. * - * @param stdClass $page object that holds the $page->content and gets returned containing the $page->context - * @param string $searchterm the string to seach for in the $page->content - * @param int $contextlength word count returned as context snippet on each side of the found $searchterm + * @param array $results the results so far. + * @param stdClass $section object that holds the $section->content. + * @param string $searchterm the string to search for in the $section->content. + * @param int $contextlength word count returned as context snippet on each side of the found $searchterm. * - * @return stdClass|null the given $page object with the additional $page->context or null if nothing was found + * @return array $results returns the updated $results array with the new data. 
*/ - private static function search_content($page, $searchterm, $contextlength) { - $content = $page->content; + private static function search_content($results, $section, $searchterm, $contextlength) { + $content = $section->content; - // Is the searched word in this page? + // Is the searched word in this section? if (!stristr($content, $searchterm)) { - return; + return $results; } // Split the text into words. @@ -201,9 +206,18 @@ private static function search_content($page, $searchterm, $contextlength) { // Create a String with all occurences & context. $context = implode(' ... ', $snippets); + $section->context = $context; + + if (!array_key_exists($section->filename, $results)) { + $results[$section->filename] = []; + } + if (!array_key_exists($section->page, $results[$section->filename])) { + $results[$section->filename][$section->page] = $section; + } else { + $results[$section->filename][$section->page]->context .= " ... " . $section->context; + } - $page->context = $context; - return $page; + return $results; } /** diff --git a/locallib.php b/locallib.php index d8369ec..061be03 100644 --- a/locallib.php +++ b/locallib.php @@ -21,26 +21,43 @@ * @copyright 2022 Universtity of Stuttgart * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later */ + +use core_reportbuilder\external\reports\retrieve; + defined('MOODLE_INTERNAL') || die(); require_once(__DIR__ . '/pdfparser/alt_autoload.php-dist'); /** - * Return the content & link of all chapters that are part of an eliganble book-pdf match in the given course. + * Return the content for all eligible Book to Pdf matches. + * + * Return[0]: + * The content is returned in sections. + * A section is a sentence or part of the Pdf that fits together. + * Each section contains the content as text and some metadata. + * The metadata is: + * - section: The moodle course section this pdf/book match appears on. + * - filename: The name of the Pdf this section appears on. 
+ * - page: The page number this section appears on. + * - bookurl: The url linking to the matching book-chapter this section appears on. + * - text: The text content of this section. + * + * Return[1]: + * Additionally returns a list of filenames that are intended to match to a book but have an error in the setup. * * @param int $courseid ID of the course to be searched * @param int $userid ID of the user initiating the search * - * @return array [0] list of chapters (content, link, other metadata). One chapter for each eligable book chaper in course. + * @return array [0] list of logical sections of content (section, filename, page, bookurl, text). * @return array [1] list of filenames of intended eligable pairs that have a problem */ -function block_slidefinder_get_content_as_chapters_for_all_book_pdf_matches_from_course($courseid, $userid) { +function block_slidefinder_get_all_content_of_course_as_sections_with_metadata($courseid, $userid) { global $DB; // Array of pdf_chapter metadata and content of all book to pdf matches in the given course. - $coursechapters = []; + $sections = []; // Array of pdf_chapter metadata of all book to pdf matches with some misconfigurations in the given course. - $misconfiguredcoursechapters = []; + $misconfiguredmatches = []; try { // Course. @@ -53,23 +70,35 @@ function block_slidefinder_get_content_as_chapters_for_all_book_pdf_matches_from } } catch (\Throwable $th) { debugging($th); - return [$coursechapters, $misconfiguredcoursechapters]; + return [$sections, $misconfiguredmatches]; } - // Get the Book to Pdf matches that exist. Array of metadata for each match. - $matches = block_slidefinder_get_all_book_pdf_matches_from_course($course); + try { + // Get the Book to Pdf matches that exist. Array of metadata for each match. 
+ $matches = block_slidefinder_get_all_book_pdf_matches_from_course($course); + } catch (\Throwable $th) { + debugging($th); + gc_collect_cycles(); + return [$sections, $misconfiguredmatches]; + } foreach ($matches as $match) { - // Split each pdf metadata into pdf_chapter metadata. Add the chapter content. - $matchedchapters = block_slidefinder_get_content_as_chapters($match); - if (!is_null($matchedchapters) && !empty($matchedchapters)) { - $coursechapters = array_merge($coursechapters, $matchedchapters); - } else { - $misconfiguredcoursechapters[] = $match->filename; + try { + // Split each pdf content into logical sections containing text and metadata. + $pagesections = block_slidefinder_get_content_as_sections($match); + if (!is_null($pagesections) && !empty($pagesections)) { + $sections = array_merge($sections, $pagesections); + } else { + $misconfiguredmatches[] = $match->filename; + } + } catch (\Throwable $th) { + debugging($th); + $misconfiguredmatches[] = $match->filename; + gc_collect_cycles(); } } - return [$coursechapters, $misconfiguredcoursechapters]; + return [$sections, $misconfiguredmatches]; } /** @@ -140,48 +169,158 @@ function block_slidefinder_get_all_book_pdf_matches_from_course($course) { } /** - * Return an array of objects each containing the content and some metadata of one PDF page of a given pdf-book match. + * Return an array of logical sections based on each page of the given pdf/book match. + * A section is a sentence or part of the Pdf page that fits together. + * Each section contains the content as text and some metadata. + * The metadata is: + * - section: The moodle course section this pdf/book match appears on. + * - filename: The name of the Pdf this section appears on. + * - page: The page number this section appears on. + * - bookurl: The url linking to the matching book-chapter this section appears on. + * - text: The text content of this section. * * @param mixed $match an object containing metadata of one pdf-book match. 
* - * @return array list of objects containing the content and some metadata of one PDF page. + * @return array list of logical sections of content (section, filename, page, bookurl, text). */ -function block_slidefinder_get_content_as_chapters($match) { - $chapters = []; +function block_slidefinder_get_content_as_sections($match) { + $sections = []; - try { - $fs = get_file_storage(); + $fs = get_file_storage(); - $config = new \Smalot\PdfParser\Config(); - $config->setHorizontalOffset(''); - $pdfparser = new \Smalot\PdfParser\Parser([], $config); + $config = new \Smalot\PdfParser\Config(); + $config->setRetainImageContent(false); + $config->setHorizontalOffset(''); + $config->setFontSpaceLimit(-600); + $pdfparser = new \Smalot\PdfParser\Parser([], $config); - $file = $fs->get_file_by_hash($match->pathnamehash); - if ($file->get_mimetype() != 'application/pdf') { - return $chapters; + $file = $fs->get_file_by_hash($match->pathnamehash); + if ($file->get_mimetype() != 'application/pdf') { + return $sections; + } + + $pdf = $pdfparser->parseContent($file->get_content()); + gc_collect_cycles(); + + // Create a list of pages, where each page is a combination of match and pdf metadata for one pdf page. + $pages = block_slidefinder_get_pdf_metadata_as_pages($pdf, $match); + + // Split the list of pages (with metadata) into smaller logical sections containing metadata and text content. + foreach ($pages as $page) { + $sections = array_merge($sections, block_slidefinder_get_page_as_sections_with_content($page)); + } + + $test = []; + foreach ($sections as $value) { + $test[] = $value->content; + } + // debugging(print_r($test, true)); + + gc_collect_cycles(); + return $sections; +} + +/** + * Create a list of pages with metadata from a given match and parsed pdf. + * + * @param mixed $pdf object containing the parsed information (content and metadata) of the pdf. + * @param mixed $match object containing metadata of the book/pdf match. 
+ * + * @return array of pages, each with metadata combined from match and parsed pdf and representing one pdf page. + */ +function block_slidefinder_get_pdf_metadata_as_pages($pdf, $match) { + $pages = []; + $pdfdetails = $pdf->getDetails(); + + for ($i = 0; $i < $pdfdetails['Pages']; $i++) { + $page = new stdClass(); + $page->section = $match->section; + $page->filename = str_replace('.pdf', get_string('pdf_replace', 'block_slidefinder'), $match->filename); + $page->page = $i + 1; + $page->bookurl = block_slidefinder_get_book_chapter_url($match->bookid, $i + 1); + $page->content = $pdf->getPages()[$i]; + $pages[] = $page; + } + + return $pages; +} + +/** + * Split a page (with metadata) into smaller logical sections containing metadata and text content. + */ +function block_slidefinder_get_page_as_sections_with_content($page) { + $sections = []; + + // List of subsections/subsentences of text with metadata like size. + $subsections = block_slidefinder_get_sub_sections_from_page($page); + gc_collect_cycles(); + + $currentsection = null; + + foreach ($subsections as $subsection) { + $isseperator = block_slidefinder_text_is_seperator($subsection->content); + if (is_null($currentsection)) { + if (!$isseperator) { + $currentsection = $subsection; + } + continue; + } + if ($isseperator) { + $sections[] = $currentsection; + $currentsection = null; + continue; + } + if ($currentsection->size !== $subsection->size) { + $sections[] = $currentsection; + $currentsection = $subsection; + continue; } + // The current section and current subsection belong together. + $currentsection->content .= " " . 
$subsection->content; + } + if (!is_null($currentsection)) { + $sections[] = $currentsection; + $currentsection = null; + } - $pdf = $pdfparser->parseContent($file->get_content()); - $pdfdetails = $pdf->getDetails(); - $pages = $pdf->getPages(); - - for ($i = 0; $i < $pdfdetails['Pages']; $i++) { - $chapter = new stdClass(); - $chapter->filename = str_replace('.pdf', get_string('pdf_replace', 'block_slidefinder'), $match->filename); - $chapter->section = $match->section; - $chapter->page = $i + 1; - $chapter->content = $pages[$i]->getText(); - $chapter->bookurl = block_slidefinder_get_book_chapter_url($match->bookid, $i + 1); - $chapters[] = $chapter; + gc_collect_cycles(); + return $sections; +} + +/** + * For a given parsed pdf document, create a list of subsections/subsentences of text with metadata like size. + * + * @param mixed $page document of a parsed pdf. + * + * @return array list of subsections/subsentences of text with metadata like size for the given page. + */ +function block_slidefinder_get_sub_sections_from_page($page) { + // Subsections no longer contain the end of a sentence inside the text. + $subsections = []; + $endofsentencepattern = '/(?<=[.?!;:])\s+/'; + + // Get pdf content as lines with metadata: [0]: Metadata, [1]: Text. + $lines = $page->content->getDataTm(); + + foreach ($lines as $line) { + $subsection = new stdClass(); + $subsection->section = $page->section; + $subsection->filename = $page->filename; + $subsection->page = $page->page; + $subsection->bookurl = $page->bookurl; + $subsection->size = array_slice($line[0], 0, 4); + + // Split the line into subsections. 
+ $subtexts = preg_split($endofsentencepattern, $line[1], -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE); + + foreach ($subtexts as $text) { + $subsection->content = $text; + $subsections[] = $subsection; } - } catch (\Throwable $th) { - gc_collect_cycles(); - debugging($th); - return null; } gc_collect_cycles(); - return $chapters; + return $subsections; } /** @@ -204,6 +343,17 @@ function block_slidefinder_get_book_chapter_url($bookid, $pagenum) { return $url->out(false); } +/** + * Checks if the given text counts as a separator. + * + * @param string $text given text to check. + * + * @return bool true if it is a separator. + */ +function block_slidefinder_text_is_seperator($text) { + return strlen($text) <= 2; +} + /** * Create and Return an (id => fullname) array for all courses the current user can access. * @param int $cid ID of a course. The selected course is at the beginning of the array, else a selection method. diff --git a/templates/lrf_search.mustache b/templates/lrf_search.mustache index c12bc55..a71bf33 100644 --- a/templates/lrf_search.mustache +++ b/templates/lrf_search.mustache @@ -110,17 +110,18 @@ } /** - * searches one pdf-page/book-chapter using the lrf_search_term and if found ads it with context to lrf_searched_content - * @param {*} page + * searches one section of pdf-page/book-chapter. + * using the lrf_search_term and if found adds it with context to lrf_searched_content. + * @param {*} section */ - function lrfSearchContent(page) { - // Is the searched word in this page? - if (!page.content.toLowerCase().includes(lrf_search_term.toLowerCase())) { + function lrfSearchContent(section) { + // Is the searched word in this section? + if (!section.content.toLowerCase().includes(lrf_search_term.toLowerCase())) { return; } // Split the text into words. 
- let words = page.content.split(/\s+/); + let words = section.content.split(/\s+/); let snippets = []; let snippetIndex = 0; @@ -164,12 +165,19 @@ // Create a string with all occurrences & context. let context = snippets.join(' ... '); + section.context = context; + + // Create new file entry in results if it does not exist. + if (!(section.filename in lrf_searched_content)) { + lrf_searched_content[section.filename] = []; + } - page.context = context; - if (!(page.filename in lrf_searched_content)) { - lrf_searched_content[page.filename] = []; + // Set chapter entry as section or add section context to existing chapter entry. + if (!(section.page in lrf_searched_content[section.filename])) { + lrf_searched_content[section.filename][section.page] = section; + } else { + lrf_searched_content[section.filename][section.page].context += " ... " + section.context; } - lrf_searched_content[page.filename][page.page] = page; } /** diff --git a/version.php b/version.php index c90932c..222a554 100644 --- a/version.php +++ b/version.php @@ -23,7 +23,7 @@ */ defined('MOODLE_INTERNAL') || die(); -$plugin->version = 2024020700; // The current plugin version (Date: YYYYMMDDHH). +$plugin->version = 2024021200; // The current plugin version (Date: YYYYMMDDHH). $plugin->requires = 2020061510; // Requires this Moodle version. $plugin->component = 'block_slidefinder'; // Full name of the plugin (used for diagnostics). $plugin->release = '1.1.1';