From 062efac2142390e1e874b0600dc5769bd449f089 Mon Sep 17 00:00:00 2001
From: martin
Date: Wed, 15 Jun 2016 09:57:09 -0500
Subject: [PATCH] PDF text extraction enhancement

---
 pdfclass.php | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/pdfclass.php b/pdfclass.php
index cab04f3..e6f4ecc 100644
--- pdfclass.php
+++ b/pdfclass.php
@@ -292,9 +292,14 @@ public function extractBookmarks() {
 
         // Try to repair some malformed files.
         $string = file_get_contents($temp_xml . '.xml');
+        // Bad UTF-8 encoding.
+        $string = utf8_encode($string);
+        // Remove invalid XML characters.
         $string = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]+/u', ' ', $string);
         $string = preg_replace('/\s{2,}/ui', ' ', $string);
-        $string = str_ireplace('', '', $string);
+        // Remove unneeded tags. They are often malformed.
+        $string = strip_tags(strstr($string, '');
+        $string = ' ' . $string;
 
         // Load XML file into object.
         $xml = @simplexml_load_string($string);
@@ -401,9 +406,14 @@ public function extractXMLText() {
 
         // Try to repair some malformed files.
         $string = file_get_contents($temp_xml . '.xml');
+        // Bad UTF-8 encoding.
+        $string = utf8_encode($string);
+        // Remove invalid XML characters.
         $string = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]+/u', ' ', $string);
         $string = preg_replace('/\s{2,}/ui', ' ', $string);
-        $string = str_ireplace('', '', $string);
+        // Remove unneeded tags. They are often malformed.
+        $string = strip_tags(strstr($string, '');
+        $string = ' ' . $string;
 
         // Load XML file into object.
         $xml = @simplexml_load_string($string);