Skip to content

Commit

Permalink
Add .docx reader
Browse files Browse the repository at this point in the history
  • Loading branch information
mantas-done committed Apr 9, 2024
1 parent 7e75387 commit b1058a7
Show file tree
Hide file tree
Showing 6 changed files with 222 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
| [SAMI](https://en.wikipedia.org/wiki/SAMI) | .smi | smi |
| QuickTime | .qt.txt | txt_quicktime |
| Rich text format (only reader) | .rtf | rtf |
| DOCX (only reader) | .docx | docx |
| [LyRiCs](https://en.wikipedia.org/wiki/LRC_(file_format)) | .lrc | lrc |
| Comma separated values | .csv | csv |
| Plaintext | .txt | txt |
Expand Down
27 changes: 27 additions & 0 deletions src/Code/Converters/DocxReader.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<?php

namespace Done\Subtitles\Code\Converters;

use Done\Subtitles\Code\Other\DocxToText;
use Done\Subtitles\Subtitles;

/**
 * Read-only converter for .docx files.
 *
 * The document text is extracted with DocxToText and then handed back to
 * Subtitles::loadFromString(), so the actual subtitle parsing is done by
 * whichever converter recognises the extracted text.
 */
class DocxReader implements ConverterContract
{
    /**
     * Tells whether the content looks like a .docx file.
     *
     * @param string $file_content content to inspect
     * @return bool true when the content starts with "PK" and contains "[Content_Types].xml"
     */
    public function canParseFileContent($file_content)
    {
        // the "PK" marker has to be at the very beginning of the content
        if (strpos($file_content, 'PK') !== 0) {
            return false;
        }

        $content_types_position = strpos($file_content, '[Content_Types].xml');

        return $content_types_position !== false;
    }

    /**
     * Converts the .docx content to the internal subtitle format.
     *
     * @param string $file_content not used by this reader
     * @param string $original_file_content content that is passed to DocxToText
     * @return mixed whatever getInternalFormat() returns for the extracted text
     */
    public function fileContentToInternalFormat($file_content, $original_file_content)
    {
        $plain_text = DocxToText::text($original_file_content);
        $subtitles = Subtitles::loadFromString($plain_text);

        return $subtitles->getInternalFormat();
    }

    /**
     * Writing .docx files is not supported.
     *
     * @throws \Exception always
     */
    public function internalFormatToFileContent(array $internal_format, array $options)
    {
        throw new \Exception('not implemented');
    }
}
168 changes: 168 additions & 0 deletions src/Code/Other/DocxToText.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
<?php

namespace Done\Subtitles\Code\Other;

/**
 * Extracts plain text from the content of a .docx file.
 *
 * ZipArchive can only open real files, so the content is written to a temporary
 * file that is removed again when the instance is destroyed.
 */
class DocxToText
{
    private \ZipArchive $zip;

    // declared explicitly, creating properties dynamically is deprecated since PHP 8.2
    private string $tmp_path;

    /**
     * Returns the text of word/document.xml with the markup removed.
     *
     * Text might not be correctly ordered.
     *
     * @param string $path raw content of the .docx file (despite the name it is not a filesystem path,
     *                     the value is written to a temporary file as is)
     * @return string extracted text, every "</w:r></w:p>" sequence is turned into "\r\n"
     * @throws \Exception when the content can't be stored or opened as a zip archive
     */
    public static function text($path): string
    {
        $that = new self($path);

        // headers, footnotes and footers are intentionally left out, only the document body is used
        $content = '';
        // $content .= $that->getAllHeadersXml();
        $content .= $that->getFileContent('word/document.xml');
        // $content .= $that->getFileContent('word/footnotes.xml');
        // $content .= $that->getAllFootersXml();

        // $content = str_replace('</w:r></w:p></w:tc><w:tc>', " ", $content); // original code poster had this
        $content = str_replace('<w:tab/>', " ", $content); // tab
        $content = str_replace('<w:pStyle w:val="ListParagraph"/>', '1. ', $content); // numbering is not correct, it is just for word count
        $content = preg_replace('/<w:drawing>.*<\/w:drawing>/Um', '', $content);
        $content = preg_replace('/<w:instrText.*<\/w:instrText>/Um', '', $content);
        $content = str_replace('</w:r></w:p>', "\r\n", $content); // end of paragraph
        $striped_content = strip_tags($content);
        $striped_content = html_entity_decode($striped_content, ENT_QUOTES | ENT_SUBSTITUTE | ENT_XML1, 'UTF-8');

        return $striped_content;
    }

    /**
     * Stores the content in a temporary file and opens it as a read-only zip archive.
     *
     * @param string $file_content raw content of the .docx file
     * @throws \Exception when the temporary file can't be created/written or it is not a zip archive
     */
    private function __construct($file_content)
    {
        $tmp_file = tempnam(sys_get_temp_dir(), 'prefix_');
        if ($tmp_file === false) {
            throw new \Exception("Can't create a temporary file for the docx content");
        }

        // __destruct() is not called when the constructor throws,
        // so the temporary file has to be removed here on every failure
        if (file_put_contents($tmp_file, $file_content) === false) {
            unlink($tmp_file);
            throw new \Exception("Can't write the docx content to a temporary file");
        }

        $zip = new \ZipArchive();
        $opened = $zip->open($tmp_file, \ZipArchive::RDONLY); // zip archive can only open real file
        if ($opened !== true) {
            unlink($tmp_file);
            throw new \Exception("Can't open the docx content as a zip archive, ZipArchive error: " . var_export($opened, true));
        }

        $this->zip = $zip;
        $this->tmp_path = $tmp_file;
    }

    /**
     * Closes the archive and removes the temporary file created by the constructor.
     */
    public function __destruct()
    {
        $this->zip->close();
        unlink($this->tmp_path);
    }

    /**
     * Reads the page count from docProps/app.xml.
     *
     * @return int value of the Pages element, or 1 when docProps/app.xml can't be read
     */
    private function pageCount(): int
    {
        $content = $this->getFileContent('docProps/app.xml');
        if ($content === '') {
            return 1; // file docProps/app.xml doesn't exist, guess that we have 1 page
        }
        $xml = new \SimpleXMLElement($content);
        return (int)$xml->Pages;
    }

    /**
     * Returns the content of a file stored inside the archive.
     *
     * @param string $internal_path path inside the archive, e.g. "word/document.xml"
     * @return string file content, or an empty string when the entry can't be read
     */
    private function getFileContent($internal_path): string
    {
        $content = $this->zip->getFromName($internal_path);

        // getFromName() returns false on failure, make the conversion explicit instead of relying on type coercion
        return $content === false ? '' : $content;
    }

    /**
     * Returns the header XML of every page, pages are joined with "\n".
     */
    private function getAllHeadersXml(): string
    {
        return $this->getReferencedPartsXml('headerReference');
    }

    /**
     * Returns the footer XML of every page (joined with "\n") with the page number fields removed.
     */
    private function getAllFootersXml(): string
    {
        $xml = $this->getReferencedPartsXml('footerReference');

        // remove page numbers from the footer
        $xml = preg_replace('/<w:fldChar w:fldCharType="begin">.*w:fldCharType="end".*<\/w:r>/Um', '', $xml);

        return $xml;
    }

    /**
     * Shared implementation for headers and footers: builds the XML that applies to each page.
     *
     * For every page the "default" part is used, "even"/"odd" parts replace it on matching pages
     * and the "first" part always replaces page 1.
     *
     * @param string $reference_tag tag name without the "w:" prefix ("headerReference" or "footerReference")
     * @return string XML of all pages joined with "\n"
     * @throws \Exception when a referenced relationship id can't be resolved
     */
    private function getReferencedPartsXml(string $reference_tag): string
    {
        $document = $this->getFileContent('word/document.xml');
        $pattern = '/<w:' . $reference_tag . ' w:type="(?<type>.+)" r:id="(?<id>.+)"\/>/U';
        preg_match_all($pattern, $document, $matches, PREG_SET_ORDER);

        $parts = [];
        foreach ($matches as $match) {
            $internal_filename = $this->idToFileName($match['id']);
            $parts[$match['type']] = $this->getFileContent($internal_filename);
        }

        $page_count = $this->pageCount();
        $pages = [];
        for ($page_number = 1; $page_number <= $page_count; $page_number++) {
            $page_xml = $parts['default'] ?? '';
            $is_even = $page_number % 2 === 0;
            if ($is_even && isset($parts['even'])) {
                $page_xml = $parts['even'];
            }
            if (!$is_even && isset($parts['odd'])) {
                $page_xml = $parts['odd'];
            }
            $pages[$page_number] = $page_xml;
        }
        if (isset($parts['first'])) {
            $pages[1] = $parts['first'];
        }

        return implode("\n", $pages);
    }

    /**
     * Resolves a relationship id from word/_rels/document.xml.rels to a path inside the archive.
     *
     * @param string $xml_id relationship id, the value of an r:id attribute
     * @return string relationship target prefixed with "word/"
     * @throws \Exception when the id is not present in the relationships file
     */
    private function idToFileName(string $xml_id): string
    {
        $content = $this->getFileContent('word/_rels/document.xml.rels');
        preg_match_all('/Relationship.+Id="(?<id>.+)".+Target="(?<filename>.+)"/U', $content, $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            if ($match['id'] === $xml_id) {
                return 'word/' . $match['filename'];
            }
        }
        throw new \Exception("Can't find: " . $xml_id);
    }
}
2 changes: 2 additions & 0 deletions src/Subtitles.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
use Done\Subtitles\Code\Converters\BinaryFinder;
use Done\Subtitles\Code\Converters\CsvConverter;
use Done\Subtitles\Code\Converters\DfxpConverter;
use Done\Subtitles\Code\Converters\DocxReader;
use Done\Subtitles\Code\Converters\EbuStlReader;
use Done\Subtitles\Code\Converters\LrcConverter;
use Done\Subtitles\Code\Converters\RtfReader;
Expand Down Expand Up @@ -49,6 +50,7 @@ class Subtitles
['extension' => 'txt', 'format' => 'txt_quicktime', 'name' => 'Quick Time Text', 'class' => TxtQuickTimeConverter::class],
['extension' => 'scc', 'format' => 'scc', 'name' => 'Scenarist', 'class' => SccConverter::class],
['extension' => 'lrc', 'format' => 'lrc', 'name' => 'LyRiCs', 'class' => LrcConverter::class],
['extension' => 'docx', 'format' => 'docx', 'name' => 'DOCX', 'class' => DocxReader::class],
['extension' => 'rtf', 'format' => 'rtf', 'name' => 'Rich text format', 'class' => RtfReader::class], // libraryies eather throws exception, not parses, or takes long to parse 2h file
['extension' => 'csv', 'format' => 'csv', 'name' => 'Coma Separated Values', 'class' => CsvConverter::class], // must be last from bottom
['extension' => 'bin', 'format' => 'bin', 'name' => 'Binary', 'class' => BinaryFinder::class],
Expand Down
Binary file added tests/files/docx.docx
Binary file not shown.
24 changes: 24 additions & 0 deletions tests/formats/DocxTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?php

namespace Formats;

use Done\Subtitles\Subtitles;
use PHPUnit\Framework\TestCase;
use Helpers\AdditionalAssertionsTrait;

/**
 * Checks that the docx fixture loaded through Subtitles::loadFromString()
 * produces the expected internal format.
 */
class DocxTest extends TestCase
{
    use AdditionalAssertionsTrait;

    /**
     * tests/files/docx.docx has to be parsed into the two expected cues.
     */
    public function testParsesDocxFile()
    {
        // resolve the fixture relative to this file, so the test doesn't depend on the current working directory
        $fixture_path = __DIR__ . '/../files/docx.docx';
        $this->assertFileExists($fixture_path);

        $content = file_get_contents($fixture_path);
        $actual = Subtitles::loadFromString($content)->getInternalFormat();
        $expected = (new Subtitles())
            ->add(137.4, 140.4, ["Senator, we're making", 'our final approach into Coruscant.'])
            ->add(3740.5, 3742.5, ['Very good, Lieutenant.'])
            ->getInternalFormat();
        $this->assertInternalFormatsEqual($expected, $actual);
    }
}

0 comments on commit b1058a7

Please sign in to comment.