forked from Grandt/PHPePub
-
Notifications
You must be signed in to change notification settings - Fork 0
/
EPubChapterSplitter.php
184 lines (157 loc) · 5.75 KB
/
EPubChapterSplitter.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
<?php
/**
* Split an HTML file into smaller HTML files, retaining the formatting and structure of the individual parts.
* The splitter uses DOM to retain the formatting of the file, including rebuilding the path of open elements in the DOM tree for each subsequent part.
* The split size is treated as the maximum target size; the actual size is the result of an even split across the resulting files.
*
* @author A. Grandt <[email protected]>
* @copyright 2009-2012 A. Grandt
* @license GNU LGPL, Attribution required for commercial implementations, requested for everything else.
* @link http://www.phpclasses.org/package/6115
* @link https://github.com/Grandt/PHPePub
* @version 2.02
*/
class EPubChapterSplitter {
    /** Version of the splitter. */
    const VERSION = 2.02;

    /** Smallest split size accepted by setSplitSize(), in bytes. */
    const MIN_SPLIT_SIZE = 10240;

    /** Target (maximum) size of one generated part, in bytes. */
    private $splitDefaultSize = 250000;

    /**
     * Set default chapter target size.
     * Default is 250000 bytes, and minimum is 10240 bytes.
     *
     * The value is cast to int before the minimum is applied, so non-numeric
     * input ends up at the minimum instead of leaving a size of 0 behind.
     *
     * @param int $size segment size in bytes
     * @return void
     */
    public function setSplitSize($size) {
        $size = (int)$size;
        // Making the file smaller than 10k is not a good idea.
        $this->splitDefaultSize = ($size < self::MIN_SPLIT_SIZE) ? self::MIN_SPLIT_SIZE : $size;
    }

    /**
     * Get the chapter target size.
     *
     * @return int segment size in bytes
     */
    public function getSplitSize() {
        return $this->splitDefaultSize;
    }

    /**
     * Split $chapter into multiple parts.
     *
     * The search string can either be a regular string or a PCRE regular expression pattern
     * as defined here: http://www.php.net/manual/en/pcre.pattern.php
     * If the search string is a regular string, the matching will be for nodes whose markup
     * starts with a tag directly followed by the string given.
     *
     * Without a search string, a chapter that is not larger than the split size is returned
     * unchanged as the only element of the result. A chapter that is empty, or in which the
     * HTML parser finds no body element, is returned the same way.
     *
     * @param String $chapter             XHTML file
     * @param Bool   $splitOnSearchString Split on chapter boundaries. Splitting on search strings disables the split size check.
     * @param String $searchString        Chapter string to search for; can be fixed text, or a regular expression pattern.
     *
     * @return array with 1 or more parts. Keys are the part index, or, when splitting on a
     *               search string, the trimmed markup of the node that started the part.
     *               A part that was not started by a matching node (content before the
     *               first match) keeps its numeric index as key.
     */
    public function splitChapter($chapter, $splitOnSearchString = false, $searchString = '/^Chapter\\ /i') {
        $chapterData = array();

        // A search string that looks like "<delimiter>...<delimiter><modifiers>" is used as a
        // pattern as is; anything else is quoted and anchored behind the node's opening tag.
        $isSearchRegexp = $splitOnSearchString && (preg_match('#^(\D|\S|\W).+\1[imsxeADSUXJu]*$#m', $searchString) == 1);
        if ($splitOnSearchString && !$isSearchRegexp) {
            $searchString = '#^<.+?>' . preg_quote($searchString, '#') . "#";
        }

        if (!$splitOnSearchString && strlen($chapter) <= $this->splitDefaultSize) {
            return array($chapter);
        }

        // DOMDocument::loadHTML() rejects an empty string, and there is nothing to split anyway.
        if ($chapter === '') {
            return array($chapter);
        }

        $xmlDoc = new DOMDocument();
        // Real world chapter markup is rarely clean; parser warnings are deliberately muted.
        @$xmlDoc->loadHTML($chapter);

        $headNode = $xmlDoc->getElementsByTagName("head")->item(0);
        $bodyNode = $xmlDoc->getElementsByTagName("body")->item(0);
        if ($bodyNode === null) {
            // Without a body there is no content to distribute over parts.
            return array($chapter);
        }

        // Skeleton for every part: everything up to and including the opening <html ...> tag
        // of the source (keeps XML declaration, doctype and namespace attributes), closed again.
        // NOTE(review): the skeleton must be well-formed XML for the output loop below to work.
        $htmlPos = stripos($chapter, "<html");
        $htmlEndPos = ($htmlPos === false) ? false : strpos($chapter, ">", $htmlPos);
        if ($htmlEndPos === false) {
            $newXML = "<html>\n</html>";
        } else {
            $newXML = substr($chapter, 0, $htmlEndPos + 1) . "\n</html>";
        }
        if (strpos(trim($newXML), "<?xml ") === FALSE) {
            $newXML = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" . $newXML;
        }
        $headerLength = strlen($newXML);

        $files = array();          // One document fragment per generated part.
        $chapterNames = array();   // Part index => markup of the node that started the part.
        $domDepth = 0;             // Number of elements currently descended into.
        $domPath = array();        // The original elements descended into, outermost first.
        $domClonedPath = array();  // Shallow clones of $domPath, used to reopen them in a new part.

        $curFile = $xmlDoc->createDocumentFragment();
        $files[] = $curFile;
        $curParent = $curFile;
        $curSize = 0;

        $bodyLen = strlen($xmlDoc->saveXML($bodyNode));
        $headLen = $headerLength;
        if ($headNode !== null) {
            $headLen += strlen($xmlDoc->saveXML($headNode));
        }

        // Every part repeats the head, so only the remainder is available for body content.
        // A head larger than the split size would make this zero or negative; clamp it so
        // the division below cannot fail.
        $partSize = max(1, $this->splitDefaultSize - $headLen);
        if ($bodyLen > $partSize) {
            $parts = ceil($bodyLen / $partSize);
            // NOTE(review): the head length is subtracted a second time here, as in the
            // original implementation; kept so existing split points do not move.
            $partSize = max(1, ($bodyLen / $parts) - $headLen);
        }

        $node = $bodyNode->firstChild;
        while ($node !== null) {
            $nodeData = $xmlDoc->saveXML($node);
            $nodeLen = strlen($nodeData);

            // A node too large for one part is descended into, so its children can be
            // distributed over several parts. $nodeData and $nodeLen intentionally keep
            // describing the element that was entered (unchanged from the original).
            if ($nodeLen > $partSize && $node->hasChildNodes()) {
                $domPath[] = $node;
                $domClonedPath[] = $node->cloneNode(false);
                $domDepth++;
                $node = $node->firstChild;
            }

            // Remember the successor before the node is processed.
            $node2 = $node->nextSibling;

            // Text nodes at this level are not copied into the parts.
            if ($node->nodeName != "#text") {
                $doSplit = false;
                if ($splitOnSearchString) {
                    $doSplit = preg_match($searchString, $nodeData) == 1;
                }

                if ($curSize > 0 && ($doSplit || (!$splitOnSearchString && $curSize + $nodeLen > $partSize))) {
                    // Start a new part and reopen the elements we are currently inside of.
                    $curFile = $xmlDoc->createDocumentFragment();
                    $files[] = $curFile;
                    $curParent = $curFile;

                    if ($domDepth > 0) {
                        foreach ($domClonedPath as $ancestor) {
                            $newParent = $ancestor->cloneNode(false);
                            $curParent->appendChild($newParent);
                            $curParent = $newParent;
                        }
                    }
                    $curSize = strlen($xmlDoc->saveXML($curFile));
                }

                if ($doSplit) {
                    // Record the name against the part the node actually ends up in, so
                    // content preceding the first match cannot shift names onto wrong parts.
                    $chapterNames[count($files) - 1] = trim($nodeData);
                }

                $curParent->appendChild($node->cloneNode(true));
                $curSize += $nodeLen;
            }

            $node = $node2;
            // End of a child list: climb back up until an element with a successor is found.
            while ($node === null && $domDepth > 0) {
                $domDepth--;
                $finished = array_pop($domPath);
                array_pop($domClonedPath);
                $node = $finished->nextSibling;
                // The fragment root has no parent; stay on it instead of dropping to null,
                // which would make the next appendChild() call fail.
                if ($curParent->parentNode !== null) {
                    $curParent = $curParent->parentNode;
                }
            }
        }

        // loadHTML() may leave the encoding unset; the constructor expects a string.
        $encoding = (string)$xmlDoc->xmlEncoding;

        $xml = new DOMDocument('1.0', $encoding);
        $xml->preserveWhiteSpace = false;
        $xml->formatOutput = true;

        foreach ($files as $idx => $fragment) {
            $xml2Doc = new DOMDocument('1.0', $encoding);
            $xml2Doc->loadXML($newXML);
            $html = $xml2Doc->getElementsByTagName("html")->item(0);

            if ($headNode !== null) {
                $html->appendChild($xml2Doc->importNode($headNode, true));
            }
            $partBody = $xml2Doc->createElement("body");
            $html->appendChild($partBody);
            if ($fragment->hasChildNodes()) {
                $partBody->appendChild($xml2Doc->importNode($fragment, true));
            }

            // force pretty printing and correct formatting, should not be needed, but it is.
            $xml->loadXML($xml2Doc->saveXML());

            // NOTE(review): two parts started by identical markup share a key, the later one wins.
            $key = ($splitOnSearchString && isset($chapterNames[$idx])) ? $chapterNames[$idx] : $idx;
            $chapterData[$key] = $xml->saveXML();
        }

        return $chapterData;
    }
}
?>