From 2fd85107b3a221d6c0dac92452feb96f8edcf224 Mon Sep 17 00:00:00 2001
From: st143971 <st143971@stud.uni-stuttgart.de>
Date: Mon, 12 Feb 2024 05:43:58 +0100
Subject: [PATCH] Added pre and postprocessing to Search results to make them
 more readable and neat.

---
 block_slidefinder.php         |   3 +-
 externallib.php               |  58 +++++----
 locallib.php                  | 238 +++++++++++++++++++++++++++-------
 templates/lrf_search.mustache |  28 ++--
 version.php                   |   2 +-
 5 files changed, 251 insertions(+), 78 deletions(-)

diff --git a/block_slidefinder.php b/block_slidefinder.php
index fbb3b08..d6e3133 100644
--- a/block_slidefinder.php
+++ b/block_slidefinder.php
@@ -103,9 +103,10 @@ public function get_content() {
                 }
             }
 
+            // data[0] = array([section, filename, page, bookurl, size, content]).
             $data = [[], []];
             if (!is_null($course)) {
-                $data = block_slidefinder_get_content_as_chapters_for_all_book_pdf_matches_from_course($course->id, $USER->id);
+                $data = block_slidefinder_get_all_content_of_course_as_sections_with_metadata($course->id, $USER->id);
                 if (!empty($data[1])) {
                     $footer .= get_string('misconfigured_info', get_class($this));
                     foreach ($data[1] as $key => $value) {
diff --git a/externallib.php b/externallib.php
index a901273..5fb735a 100644
--- a/externallib.php
+++ b/externallib.php
@@ -109,19 +109,24 @@ public static function get_searched_locations($userid, $courseid, $searchstring,
         $coursecontext = context_course::instance($course->id);
         self::validate_context($coursecontext);
 
-        [$chapters, $misconfiguredchapters] =
-            block_slidefinder_get_content_as_chapters_for_all_book_pdf_matches_from_course($courseid, $userid);
+        // Get all searchable content.
+        [$sections, $_] = block_slidefinder_get_all_content_of_course_as_sections_with_metadata($courseid, $userid);
 
         // Get Search Results & Context for PDFs.
+        $data = [];
+        foreach ($sections as $section) {
+            $data = self::search_content($data, $section, $searchstring, $contextlength);
+        }
+
+        // Format results.
         $results = [];
-        foreach ($chapters as $chapter) {
-            $result = self::search_content($chapter, $searchstring, $contextlength);
-            if ($result) {
+        foreach ($data as $file) {
+            foreach ($file as $chapter) {
                 $results[] = [
-                    'filename' => $result->filename,
-                    'page_number' => $result->page,
-                    'book_chapter_url' => $result->bookurl,
-                    'context_snippet' => $result->context,
+                    'filename' => $chapter->filename,
+                    'page_number' => $chapter->page,
+                    'book_chapter_url' => $chapter->bookurl,
+                    'context_snippet' => $chapter->context,
                 ];
             }
         }
@@ -131,7 +136,7 @@ public static function get_searched_locations($userid, $courseid, $searchstring,
     }
 
     /**
-     * Returns description of the method return values
+     * Returns description of the method return values.
      * @return external_value
      */
     public static function get_searched_locations_returns() {
@@ -139,21 +144,21 @@ public static function get_searched_locations_returns() {
     }
 
     /**
-     * Searches for the $searchterm in the given $page->content and
-     * returns the page with a $page->context context snippet if it was found. returns null if not.
+     * Searches for the $searchterm in the given $section->content and populates the given $results array.
      *
-     * @param stdClass $page object that holds the $page->content and gets returned containing the $page->context
-     * @param string $searchterm the string to seach for in the $page->content
-     * @param int $contextlength word count returned as context snippet on each side of the found $searchterm
+     * @param array $results the results so far.
+     * @param stdClass $section object that holds the $section->content.
+     * @param string $searchterm the string to search for in the $section->content.
+     * @param int $contextlength word count returned as context snippet on each side of the found $searchterm.
      *
-     * @return stdClass|null the given $page object with the additional $page->context or null if nothing was found
+     * @return array $results returns the updated $results array with the new data.
      */
-    private static function search_content($page, $searchterm, $contextlength) {
-        $content = $page->content;
+    private static function search_content($results, $section, $searchterm, $contextlength) {
+        $content = $section->content;
 
-        // Is the searched word in this page?
+        // Is the searched word in this section?
         if (!stristr($content, $searchterm)) {
-            return;
+            return $results;
         }
 
         // Split the text into words.
@@ -201,9 +206,18 @@ private static function search_content($page, $searchterm, $contextlength) {
 
         // Create a String with all occurences & context.
         $context = implode(' ... ', $snippets);
+        $section->context = $context;
+
+        if (!array_key_exists($section->filename, $results)) {
+            $results[$section->filename] = [];
+        }
+        if (!array_key_exists($section->page, $results[$section->filename])) {
+            $results[$section->filename][$section->page] = $section;
+        } else {
+            $results[$section->filename][$section->page]->context .= " ... " . $section->context;
+        }
 
-        $page->context = $context;
-        return $page;
+        return $results;
     }
 
     /**
diff --git a/locallib.php b/locallib.php
index d8369ec..061be03 100644
--- a/locallib.php
+++ b/locallib.php
@@ -21,26 +21,43 @@
  * @copyright  2022 Universtity of Stuttgart <kasra.habib@iste.uni-stuttgart.de>
  * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later
  */
+
+use core_reportbuilder\external\reports\retrieve;
+
 defined('MOODLE_INTERNAL') || die();
 
 require_once(__DIR__ . '/pdfparser/alt_autoload.php-dist');
 
 /**
- * Return the content & link of all chapters that are part of an eliganble book-pdf match in the given course.
+ * Return the content for all eligible Book to Pdf matches.
+ *
+ * Return[0]:
+ * The content is returned in sections.
+ * A section is a sentence or part of the Pdf that fits together.
+ * Each section contains the content as text and some metadata.
+ * The metadata is:
+ *  - section: The moodle course section this pdf/book match appears on.
+ *  - filename: The name of the Pdf this section appears on.
+ *  - page: The page number this section appears on.
+ *  - bookurl: The url linking to the matching book-chapter this section appears on.
+ *  - content: The text content of this section.
+ * 
+ * Return[1]:
+ * Additionally returns a list of filenames that are intended to match to a book but have an error in the setup.
  *
  * @param int $courseid ID of the course to be searched
  * @param int $userid ID of the user initiating the search
  *
- * @return array [0] list of chapters (content, link, other metadata). One chapter for each eligable book chaper in course.
+ * @return array [0] list of logical sections of content (section, filename, page, bookurl, content).
  * @return array [1] list of filenames of intended eligable pairs that have a problem
  */
-function block_slidefinder_get_content_as_chapters_for_all_book_pdf_matches_from_course($courseid, $userid) {
+function block_slidefinder_get_all_content_of_course_as_sections_with_metadata($courseid, $userid) {
     global $DB;
 
     // Array of pdf_chapter metadata and content of all book to pdf matches in the given course.
-    $coursechapters = [];
+    $sections = [];
     // Array of pdf_chapter metadata of all book to pdf matches with some misconfigurations in the given course.
-    $misconfiguredcoursechapters = [];
+    $misconfiguredmatches = [];
 
     try {
         // Course.
@@ -53,23 +70,35 @@ function block_slidefinder_get_content_as_chapters_for_all_book_pdf_matches_from
         }
     } catch (\Throwable $th) {
         debugging($th);
-        return [$coursechapters, $misconfiguredcoursechapters];
+        return [$sections, $misconfiguredmatches];
     }
 
-    // Get the Book to Pdf matches that exist. Array of metadata for each match.
-    $matches = block_slidefinder_get_all_book_pdf_matches_from_course($course);
+    try {
+        // Get the Book to Pdf matches that exist. Array of metadata for each match.
+        $matches = block_slidefinder_get_all_book_pdf_matches_from_course($course);
+    } catch (\Throwable $th) {
+        debugging($th);
+        gc_collect_cycles();
+        return [$sections, $misconfiguredmatches];
+    }
 
     foreach ($matches as $match) {
-        // Split each pdf metadata into pdf_chapter metadata. Add the chapter content.
-        $matchedchapters = block_slidefinder_get_content_as_chapters($match);
-        if (!is_null($matchedchapters) && !empty($matchedchapters)) {
-            $coursechapters = array_merge($coursechapters, $matchedchapters);
-        } else {
-            $misconfiguredcoursechapters[] = $match->filename;
+        try {
+            // Split each pdf content into logical sections containing text and metadata.
+            $pagesections = block_slidefinder_get_content_as_sections($match);
+            if (!is_null($pagesections) && !empty($pagesections)) {
+                $sections = array_merge($sections, $pagesections);
+            } else {
+                $misconfiguredmatches[] = $match->filename;
+            }
+        } catch (\Throwable $th) {
+            debugging($th);
+            $misconfiguredmatches[] = $match->filename;
+            gc_collect_cycles();
         }
     }
 
-    return [$coursechapters, $misconfiguredcoursechapters];
+    return [$sections, $misconfiguredmatches];
 }
 
 /**
@@ -140,48 +169,158 @@ function block_slidefinder_get_all_book_pdf_matches_from_course($course) {
 }
 
 /**
- * Return an array of objects each containing the content and some metadata of one PDF page of a given pdf-book match.
+ * Return an array of logical sections based on each page of the given pdf/book match.
+ * A section is a sentence or part of the Pdf page that fits together.
+ * Each section contains the content as text and some metadata.
+ * The metadata is:
+ *  - section: The moodle course section this pdf/book match appears on.
+ *  - filename: The name of the Pdf this section appears on.
+ *  - page: The page number this section appears on.
+ *  - bookurl: The url linking to the matching book-chapter this section appears on.
+ *  - content: The text content of this section.
  *
  * @param mixed $match an object containing metadata of one pdf-book match.
  *
- * @return array list of objects containing the content and some metadata of one PDF page.
+ * @return array list of logical sections of content (section, filename, page, bookurl, content).
  */
-function block_slidefinder_get_content_as_chapters($match) {
-    $chapters = [];
+function block_slidefinder_get_content_as_sections($match) {
+    $sections = [];
 
-    try {
-        $fs = get_file_storage();
+    $fs = get_file_storage();
 
-        $config = new \Smalot\PdfParser\Config();
-        $config->setHorizontalOffset('');
-        $pdfparser = new \Smalot\PdfParser\Parser([], $config);
+    $config = new \Smalot\PdfParser\Config();
+    $config->setRetainImageContent(false);
+    $config->setHorizontalOffset('');
+    $config->setFontSpaceLimit(-600);
+    $pdfparser = new \Smalot\PdfParser\Parser([], $config);
 
-        $file = $fs->get_file_by_hash($match->pathnamehash);
-        if ($file->get_mimetype() != 'application/pdf') {
-            return $chapters;
+    $file = $fs->get_file_by_hash($match->pathnamehash);
+    if (!$file) {
+        // get_file_by_hash() found nothing for this hash; an empty result marks the match as misconfigured.
+        return $sections;
+    }
+    if ($file->get_mimetype() != 'application/pdf') {
+        return $sections;
+    }
+
+    // Parse the stored file content; parser errors are not caught here and propagate to the caller.
+    $pdf = $pdfparser->parseContent($file->get_content());
+    // NOTE(review): explicit collection presumably limits peak memory after parsing - confirm it is needed.
+    gc_collect_cycles();
+
+    // Create a list of pages, where each page is a combination of match and pdf metadata for one pdf page.
+    $pages = block_slidefinder_get_pdf_metadata_as_pages($pdf, $match);
+
+    // Split the list of pages (with metadata) into smaller logical sections containing metadata and text content.
+    foreach ($pages as $page) {
+        $sections = array_merge($sections, block_slidefinder_get_page_as_sections_with_content($page));
+    }
+
+    gc_collect_cycles();
+    return $sections;
+}
+
+/**
+ * Create a list of pages with metadata from a given match and parsed pdf.
+ *
+ * @param mixed $pdf parsed pdf document; must provide getDetails() (with a 'Pages' entry) and getPages().
+ * @param mixed $match object containing metadata of the book/pdf match (section, filename, bookid).
+ *
+ * @return array list of stdClass pages (section, filename, page, bookurl, content), one per pdf page.
+ */
+function block_slidefinder_get_pdf_metadata_as_pages($pdf, $match) {
+    $pages = [];
+    $pagecount = $pdf->getDetails()['Pages'];
+    // Fetch the page list once instead of once per loop iteration.
+    $pdfpages = $pdf->getPages();
+    for ($i = 0; $i < $pagecount; $i++) {
+        $page = new stdClass();
+        $page->section = $match->section;
+        $page->filename = str_replace('.pdf', get_string('pdf_replace', 'block_slidefinder'), $match->filename);
+        $page->page = $i + 1;
+        $page->bookurl = block_slidefinder_get_book_chapter_url($match->bookid, $i + 1);
+        $page->content = $pdfpages[$i];
+        $pages[] = $page;
+    }
+    return $pages;
+}
+
+/**
+ * Group the sentence-level subsections of one page (object with metadata) into an array of logical sections.
+ */
+function block_slidefinder_get_page_as_sections_with_content($page) {
+    $sections = [];
+
+    // List of subsections/subsentences of text with metadata like size.
+    $subsections = block_slidefinder_get_sub_sections_from_page($page);
+    gc_collect_cycles();
+    // Section under construction; null until a non-separator subsection starts a new one.
+    $currentsection = null;
+    // A separator closes the open section, a different size starts a new one, anything else is appended.
+    foreach ($subsections as $subsection) {
+        $isseperator = block_slidefinder_text_is_seperator($subsection->content);
+        if (is_null($currentsection)) {
+            if (!$isseperator) {
+                $currentsection = $subsection;
+            }
+            continue;
+        }
+        if ($isseperator) {
+            $sections[] = $currentsection;
+            $currentsection = null;
+            continue;
+        }
+        if ($currentsection->size !== $subsection->size) {
+            $sections[] = $currentsection;
+            $currentsection = $subsection;
+            continue;
         }
+        // Same size and no separator in between: the subsection continues the current section.
+        $currentsection->content .= " " . $subsection->content;
+    }
+    if (!is_null($currentsection)) {
+        $sections[] = $currentsection;
+        $currentsection = null;
+    }
 
-        $pdf = $pdfparser->parseContent($file->get_content());
-        $pdfdetails = $pdf->getDetails();
-        $pages = $pdf->getPages();
-
-        for ($i = 0; $i < $pdfdetails['Pages']; $i++) {
-            $chapter = new stdClass();
-            $chapter->filename = str_replace('.pdf', get_string('pdf_replace', 'block_slidefinder'), $match->filename);
-            $chapter->section = $match->section;
-            $chapter->page = $i + 1;
-            $chapter->content = $pages[$i]->getText();
-            $chapter->bookurl = block_slidefinder_get_book_chapter_url($match->bookid, $i + 1);
-            $chapters[] = $chapter;
+    gc_collect_cycles();
+    return $sections;
+}
+
+/**
+ * Split the text lines of one pdf page into sentence-level subsections that each carry the page metadata.
+ *
+ * @param stdClass $page page object (section, filename, page, bookurl); its content must provide getDataTm().
+ *
+ * @return array list of stdClass subsections (section, filename, page, bookurl, size, content).
+ */
+function block_slidefinder_get_sub_sections_from_page($page) {
+    // Subsections no longer contain the end of a sentence inside the text.
+    $subsections = [];
+    $endofsentencepattern = '/(?<=[.?!;:])\s+/';
+
+    // Get pdf content as lines with metadata: [0]: Metadata, [1]: Text.
+    $lines = $page->content->getDataTm();
+
+    foreach ($lines as $line) {
+        $subsection = new stdClass();
+        $subsection->section = $page->section;
+        $subsection->filename = $page->filename;
+        $subsection->page = $page->page;
+        $subsection->bookurl = $page->bookurl;
+        $subsection->size = array_slice($line[0], 0, 4);
+
+        // Split the line into subsections.
+        $subtexts = preg_split($endofsentencepattern, $line[1], -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
+        // Store a copy per subtext: appending the same object would make every entry show the last text.
+        foreach ($subtexts as $text) {
+            $subsection->content = $text;
+            $subsections[] = clone $subsection;
         }
-    } catch (\Throwable $th) {
-        gc_collect_cycles();
-        debugging($th);
-        return null;
     }
 
     gc_collect_cycles();
-    return $chapters;
+    return $subsections;
 }
 
 /**
@@ -204,6 +343,17 @@ function block_slidefinder_get_book_chapter_url($bookid, $pagenum) {
     return $url->out(false);
 }
 
+/**
+ * Checks if the given text counts as a separator, i.e. it is at most two characters long.
+ * Characters are counted instead of bytes so that a single multibyte bullet such as "•" still qualifies.
+ *
+ * @param string $text given text to check; treated as UTF-8.
+ * @return bool true if it is a separator.
+ */
+function block_slidefinder_text_is_seperator($text) {
+    return mb_strlen($text, 'UTF-8') <= 2;
+}
+
 /**
  * Create and Return an (id => fullname) array for all courses the current user can access.
  * @param int $cid ID of a course. The selected course is at the beginning of the array, else a selection method.
diff --git a/templates/lrf_search.mustache b/templates/lrf_search.mustache
index c12bc55..a71bf33 100644
--- a/templates/lrf_search.mustache
+++ b/templates/lrf_search.mustache
@@ -110,17 +110,18 @@
         }
 
         /**
-        * searches one pdf-page/book-chapter using the lrf_search_term and if found ads it with context to lrf_searched_content
-        * @param {*} page
+        * Searches one section of a pdf-page/book-chapter using the lrf_search_term
+        * and, if found, adds it with context to lrf_searched_content.
+        * @param {*} section
         */
-        function lrfSearchContent(page) {
-            // Is the searched word in this page?
-            if (!page.content.toLowerCase().includes(lrf_search_term.toLowerCase())) {
+        function lrfSearchContent(section) {
+            // Is the searched word in this section?
+            if (!section.content.toLowerCase().includes(lrf_search_term.toLowerCase())) {
                 return;
             }
 
             // Split the text into words.
-            let words = page.content.split(/\s+/);
+            let words = section.content.split(/\s+/);
 
             let snippets = [];
             let snippetIndex = 0;
@@ -164,12 +165,19 @@
             
             // Create a string with all occurrences & context.
             let context = snippets.join(' ... ');
+            section.context = context;
+            
+            // Create new file entry in results if it does not exist.
+            if (!(section.filename in lrf_searched_content)) {
+                lrf_searched_content[section.filename] = [];
+            }
 
-            page.context = context;
-            if (!(page.filename in lrf_searched_content)) {
-                lrf_searched_content[page.filename] = [];
+            // Set chapter entry as section or add section context to existing chapter entry.
+            if (!(section.page in lrf_searched_content[section.filename])) {
+                lrf_searched_content[section.filename][section.page] = section;
+            } else {
+                lrf_searched_content[section.filename][section.page].context += " ... " + section.context; 
             }
-            lrf_searched_content[page.filename][page.page] = page;
         }
 
         /**
diff --git a/version.php b/version.php
index c90932c..222a554 100644
--- a/version.php
+++ b/version.php
@@ -23,7 +23,7 @@
  */
 defined('MOODLE_INTERNAL') || die();
 
-$plugin->version   = 2024020700;  // The current plugin version (Date: YYYYMMDDHH).
+$plugin->version   = 2024021200;  // The current plugin version (Date: YYYYMMDDHH).
 $plugin->requires  = 2020061510;  // Requires this Moodle version.
 $plugin->component = 'block_slidefinder';  // Full name of the plugin (used for diagnostics).
 $plugin->release = '1.1.1';