Skip to content
This repository has been archived by the owner on Jan 27, 2023. It is now read-only.

Commit

Permalink
PDF text extraction enhancement
Browse files Browse the repository at this point in the history
  • Loading branch information
martin committed Jun 15, 2016
1 parent e81f08b commit 062efac
Showing 1 changed file with 12 additions and 2 deletions.
14 changes: 12 additions & 2 deletions pdfclass.php
Original file line number Diff line number Diff line change
Expand Up @@ -292,9 +292,14 @@ public function extractBookmarks() {

// Try to repair some malformed files.
// Cast so that a failed read (false) is handled as an empty string below.
$string = (string) file_get_contents($temp_xml . '.xml');
// Bad UTF-8 encoding. Re-encode only when the content is NOT already valid
// UTF-8: utf8_encode() treats its input as ISO-8859-1, so running it on a
// valid UTF-8 file would double-encode every non-ASCII character.
// preg_match() with the /u modifier fails on a subject that is not valid UTF-8.
if (preg_match('//u', $string) !== 1) {
    $string = utf8_encode($string);
}
// Remove invalid XML characters (anything outside the XML 1.0 Char ranges).
$string = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]+/u', ' ', $string);
// Collapse runs of whitespace into a single space.
$string = preg_replace('/\s{2,}/ui', ' ', $string);
// Remove unneeded tags. They are often malformed.
// strstr() drops everything before the first '<pdf2xml', including the
// original XML prolog and DOCTYPE; a clean prolog is prepended below.
$string = strip_tags(strstr($string, '<pdf2xml'), '<pdf2xml><page><fontspec><text><a><outline><item>');
$string = '<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE pdf2xml SYSTEM "pdf2xml.dtd"> ' . $string;

// Load XML file into object. Parser warnings are suppressed with @.
$xml = @simplexml_load_string($string);
Expand Down Expand Up @@ -401,9 +406,14 @@ public function extractXMLText() {

// Try to repair some malformed files.
// Cast so that a failed read (false) is handled as an empty string below.
$string = (string) file_get_contents($temp_xml . '.xml');
// Bad UTF-8 encoding. Re-encode only when the content is NOT already valid
// UTF-8: utf8_encode() treats its input as ISO-8859-1, so running it on a
// valid UTF-8 file would double-encode every non-ASCII character.
// preg_match() with the /u modifier fails on a subject that is not valid UTF-8.
if (preg_match('//u', $string) !== 1) {
    $string = utf8_encode($string);
}
// Remove invalid XML characters (anything outside the XML 1.0 Char ranges).
$string = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]+/u', ' ', $string);
// Collapse runs of whitespace into a single space.
$string = preg_replace('/\s{2,}/ui', ' ', $string);
// Remove unneeded tags. They are often malformed.
// strstr() drops everything before the first '<pdf2xml', including the
// original XML prolog and DOCTYPE; a clean prolog is prepended below.
$string = strip_tags(strstr($string, '<pdf2xml'), '<pdf2xml><page><fontspec><text><a><outline><item>');
$string = '<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE pdf2xml SYSTEM "pdf2xml.dtd"> ' . $string;

// Load XML file into object. Parser warnings are suppressed with @.
$xml = @simplexml_load_string($string);
Expand Down

0 comments on commit 062efac

Please sign in to comment.