From 38403ded135a927d740a0af1becd5ba18f1fa462 Mon Sep 17 00:00:00 2001 From: Mantas Date: Fri, 5 Apr 2024 17:01:16 +0300 Subject: [PATCH] Improve txt converter --- src/Code/Converters/CsvConverter.php | 8 +++ src/Code/Converters/SrtConverter.php | 3 +- src/Code/Converters/TxtConverter.php | 27 +++++++++- src/Code/Helpers.php | 79 ++++++++++++++++++++++++++++ tests/Helpers/RemoveHtmlTest.php | 27 ++++++++++ tests/formats/TxtTest.php | 14 +++++ 6 files changed, 155 insertions(+), 3 deletions(-) create mode 100644 tests/Helpers/RemoveHtmlTest.php diff --git a/src/Code/Converters/CsvConverter.php b/src/Code/Converters/CsvConverter.php index ff5d3de..61ff5dc 100644 --- a/src/Code/Converters/CsvConverter.php +++ b/src/Code/Converters/CsvConverter.php @@ -23,6 +23,14 @@ public function canParseFileContent($file_content) } $last_row = $csv[$count - 1]; + // check if each row has the same column count + $last_row_count = count($last_row); + foreach ($csv as $row) { + if (count($row) !== $last_row_count) { + return false; // this is not a csv file + } + } + $has_timestamp = false; $has_text = false; foreach ($last_row as $cell) { diff --git a/src/Code/Converters/SrtConverter.php b/src/Code/Converters/SrtConverter.php index 6f9abd7..d9f6c5c 100644 --- a/src/Code/Converters/SrtConverter.php +++ b/src/Code/Converters/SrtConverter.php @@ -2,6 +2,7 @@ namespace Done\Subtitles\Code\Converters; +use Done\Subtitles\Code\Helpers; use Done\Subtitles\Code\UserException; class SrtConverter implements ConverterContract @@ -52,7 +53,7 @@ public function fileContentToInternalFormat($file_content, $original_file_conten throw new UserException("Arrow should looks like this --> for srt format on line: " . $line . 
' (SrtConverter)'); */ } elseif ($parts['text'] !== null) { - $internal_format[$i]['lines'][] = strip_tags($line); + $internal_format[$i]['lines'][] = Helpers::removeOnlyHtmlTags($line); } if (!$saw_start) { diff --git a/src/Code/Converters/TxtConverter.php b/src/Code/Converters/TxtConverter.php index f25275a..40229a2 100644 --- a/src/Code/Converters/TxtConverter.php +++ b/src/Code/Converters/TxtConverter.php @@ -2,6 +2,7 @@ namespace Done\Subtitles\Code\Converters; +use Done\Subtitles\Code\Helpers; use Done\Subtitles\Code\UserException; class TxtConverter implements ConverterContract @@ -122,7 +123,10 @@ public function fileContentToInternalFormat($file_content, $original_file_conten // strip html foreach ($internal_format as &$row) { - $row['lines'] = array_map('strip_tags', $row['lines']); + foreach ($row['lines'] as &$line) { + $line = Helpers::removeOnlyHtmlTags($line); + } + unset($line); } unset($row); @@ -369,7 +373,7 @@ public static function doesFileUseTimestamps(array $lines) if (isset($timestamps[0][0])) { $start = $timestamps[0][0]; $before = self::strBefore($line, $start); - if (self::hasText($before) || self::hasDigit($before)) { + if (self::hasText($before)) { continue; } $lines_with_timestamp_count++; @@ -422,6 +426,16 @@ public static function withoutTimestampsInternalFormat(array $lines) $internal_format[] = ['lines' => [$line]]; } $internal_format = self::fillStartAndEndTimes($internal_format); + + // strip html + foreach ($internal_format as &$row) { + foreach ($row['lines'] as &$line) { + $line = Helpers::removeOnlyHtmlTags($line); + } + unset($line); + } + unset($row); + return $internal_format; } @@ -492,6 +506,15 @@ private static function twoLinesSeparatedByEmptyLine(string $file_content) } } + // strip html + foreach ($internal_format as &$row) { + foreach ($row['lines'] as &$line) { + $line = Helpers::removeOnlyHtmlTags($line); + } + unset($line); + } + unset($row); + return self::fillStartAndEndTimes($internal_format); } diff --git 
a/src/Code/Helpers.php b/src/Code/Helpers.php
index e26fde4..85b1df5 100644
--- a/src/Code/Helpers.php
+++ b/src/Code/Helpers.php
@@ -253,4 +253,85 @@ public static function strAfterLast($subject, $search)
 
         return substr($subject, $position + strlen($search));
     }
+
+    /**
+     * Returns the part of $subject located before the first occurrence of $search.
+     *
+     * @param string $subject Text to cut.
+     * @param string $search  Delimiter; an empty delimiter leaves $subject untouched.
+     * @return string $subject unchanged when $search is empty or not found.
+     */
+    public static function strBefore($subject, $search)
+    {
+        if ($search === '') {
+            return $subject;
+        }
+
+        $result = strstr($subject, (string) $search, true);
+
+        return $result === false ? $subject : $result;
+    }
+
+    /**
+     * Removes well-known HTML tags from a subtitle line, leaving any other
+     * angle-bracket text (e.g. "<not a tag>" or "1 < 2") in place.
+     *
+     * Runs of whitespace in the result are collapsed into a single space.
+     *
+     * @param string $string Text that may contain HTML markup.
+     * @return string Text without the recognised tags.
+     */
+    public static function removeOnlyHtmlTags($string)
+    {
+        $string = (string) $string;
+
+        // Cut before every "<" and after every ">". Both are single-byte ASCII
+        // and never occur inside a multi-byte UTF-8 sequence, so a byte-wise
+        // split is safe and, unlike a "/u" split, does not fail on input that
+        // is not valid UTF-8.
+        $parts = preg_split('/(?=<)|(?<=>)/', $string, -1, PREG_SPLIT_NO_EMPTY);
+        if ($parts === false) {
+            $parts = [$string]; // PCRE error: treat the whole line as plain text
+        }
+
+        $text = '';
+        foreach ($parts as $part) {
+            if (!self::isRealHtmlTag($part)) {
+                $text .= $part;
+            }
+        }
+
+        return preg_replace('/\s+/', ' ', $text);
+    }
+
+    /**
+     * Tells whether $tag looks like a real HTML tag: either a bare known tag
+     * ("<br>", "</b>", "<br />") or a known tag carrying a known attribute
+     * ("<font color=...>"). Matching is case-insensitive.
+     *
+     * @param string $tag Fragment running from "<" to the first ">", or plain text.
+     * @return bool True when the fragment should be stripped as HTML.
+     */
+    private static function isRealHtmlTag($tag)
+    {
+        $names = 'div|p|a|b|i|u|strong|img|ul|li|h1|h2|h3|h4|h5|h6|span|input|br|font';
+        $attributes = 'id|class|href|src|alt|title|style|target|rel|type|color|size';
+
+        // bare opening, closing or self-closing tag without attributes
+        if (preg_match("/^<\s*\/?\s*(?:$names)\s*\/?\s*>/i", $tag)) {
+            return true;
+        }
+
+        // otherwise the tag name must directly follow "<" and be followed by a space
+        if (!preg_match("/^<(?:$names) /i", $tag)) {
+            return false;
+        }
+
+        if (strpos($tag, '>') === false) {
+            return false; // tag is never closed
+        }
+
+        // only treat it as html when at least one known attribute is present
+        return (bool) preg_match("/ (?:$attributes)\s*=/i", $tag);
+    }
 }
diff --git a/tests/Helpers/RemoveHtmlTest.php b/tests/Helpers/RemoveHtmlTest.php
new file mode 100644
index 0000000..8237a42
--- /dev/null
+++ b/tests/Helpers/RemoveHtmlTest.php
@@ -0,0 +1,27 @@
+assertEquals('a', Helpers::removeOnlyHtmlTags('a'), 1);
+ $this->assertEquals('a', Helpers::removeOnlyHtmlTags('a'), 2);
+ $this->assertEquals('a', Helpers::removeOnlyHtmlTags('a
'), 3); + $this->assertEquals('a', Helpers::removeOnlyHtmlTags('a
'), 4); + $this->assertEquals('a', Helpers::removeOnlyHtmlTags('a
'), 5); + $this->assertEquals('a', Helpers::removeOnlyHtmlTags('a
'), 6); + $this->assertEquals('a b', Helpers::removeOnlyHtmlTags('a b'), 7); + $this->assertEquals('a ', Helpers::removeOnlyHtmlTags('a '), 8); + $this->assertEquals('a b', Helpers::removeOnlyHtmlTags('a b')); + $this->assertEquals('a ', Helpers::removeOnlyHtmlTags('a ')); + $this->assertEquals(' www.url.net ', Helpers::removeOnlyHtmlTags(' www.url.net ')); + $this->assertEquals('word', Helpers::removeOnlyHtmlTags('word')); + $this->assertEquals('assertEquals('getInternalFormat(); } + public function testNumberBeforeTimestamp() + { + $actual = Subtitles::loadFromString('1 00:00:01:00 00:00:02:00 a')->getInternalFormat(); + $expected = (new Subtitles())->add(1, 2, 'a')->getInternalFormat(); + $this->assertInternalFormatsEqual($expected, $actual); + } + + public function testDoesNotRemoveNotHtmlTag() + { + $actual = Subtitles::loadFromString('text