Skip to content
This repository has been archived by the owner on Oct 1, 2022. It is now read-only.

Commit

Permalink
Merge branch 'bugfix-utf8-lexer'
Browse files Browse the repository at this point in the history
  • Loading branch information
sanmai committed Sep 16, 2019
2 parents c07ce53 + b0c8e3f commit 50c329f
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 3 deletions.
23 changes: 23 additions & 0 deletions atoum/Unit/Llk/Lexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -498,4 +498,27 @@ public function case_unicode_disabled()
''
);
}

/**
 * With "lexer.unicode" enabled, lexing text that is not valid UTF-8 must
 * raise a Lexer exception once the returned token stream is advanced.
 */
public function case_invalid_utf8_with_unicode_mode()
{
    // Fixture: the single byte 0xFF, used both as the input and as the token pattern.
    $invalidText   = "\xFF";
    $grammarTokens = [
        'default' => [
            'foo' => "\xFF",
        ],
    ];
    $expectedMessage = 'Text is not valid utf-8 string, you probably need to switch "lexer.unicode" setting off.';

    $this
        ->given($sut = new SUT(['lexer.unicode' => true]))
        ->when($result = $sut->lexMe($invalidText, $grammarTokens))
        ->then
            // The failure is expected when the result is advanced, so the
            // call to next() is what gets wrapped by the exception asserter.
            ->exception(function () use ($result) {
                $result->next();
            })
                ->isInstanceOf(LUT\Exception\Lexer::class)
                ->hasMessage($expectedMessage);
}
}
48 changes: 48 additions & 0 deletions src/Exception/InternalError.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
<?php
/**
* Hoa
*
*
* @license
*
* BSD 3-Clause License
*
* Copyright © 2007-2017, Hoa community. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/

namespace Hoa\Compiler\Exception;

/**
 * Signals an unexpected failure inside the Hoa Compiler library itself.
 *
 * For example, the lexer throws it when preg_match() fails with a PCRE error
 * while matching a lexeme. This exception should never occur in normal
 * operation: if you encounter it, please report it to the library maintainers,
 * even if the root cause turns out to be in your own code.
 */
final class InternalError extends Exception
{
}
31 changes: 28 additions & 3 deletions src/Llk/Lexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ public function __construct(array $pragmas = [])
*/
public function lexMe($text, array $tokens): \Generator
{
$this->validateInputInUnicodeMode($text);

$this->_text = $text;
$this->_tokens = $tokens;
$this->_nsStack = null;
Expand Down Expand Up @@ -287,9 +289,9 @@ protected function nextToken($offset)
*/
protected function matchLexeme($lexeme, $regex, $offset)
{
$_regex = str_replace('#', '\#', $regex);
$preg = preg_match(
'#\G(?|' . $_regex . ')#' . $this->_pcreOptions,
$_regex = '#\G(?|' . str_replace('#', '\#', $regex) . ')#' . $this->_pcreOptions;
$preg = @preg_match(
$_regex,
$this->_text,
$matches,
0,
Expand All @@ -300,6 +302,16 @@ protected function matchLexeme($lexeme, $regex, $offset)
return null;
}

if (false === $preg) {
throw new Compiler\Exception\InternalError(
sprintf(
'Lexer encountered a PCRE error (code: %d), full regex: "%s".',
preg_last_error(),
$_regex
)
);
}

if ('' === $matches[0]) {
throw new Compiler\Exception\Lexer(
'A lexeme must not match an empty value, which is the ' .
Expand All @@ -315,4 +327,17 @@ protected function matchLexeme($lexeme, $regex, $offset)
'length' => mb_strlen($matches[0])
];
}

/**
 * Rejects text that is not well-formed UTF-8 when the lexer runs in unicode mode.
 *
 * The check is skipped entirely unless the PCRE options contain the "u"
 * modifier. The method returns nothing; it either completes silently or throws.
 *
 * @param string $text Text about to be lexed.
 * @return void
 * @throws Compiler\Exception\Lexer If the "u" modifier is active and
 *                                  preg_match() fails on the given text.
 */
private function validateInputInUnicodeMode($text)
{
    // Without the "u" modifier the text is treated as raw bytes: nothing to validate.
    if (false === strpos($this->_pcreOptions, 'u')) {
        return;
    }

    // An empty pattern matches any subject, so with the "u" modifier a `false`
    // result means PCRE refused the subject itself rather than failed to match.
    if (false !== preg_match('##u', $text)) {
        return;
    }

    throw new Compiler\Exception\Lexer(
        'Text is not valid utf-8 string, you probably need to switch "lexer.unicode" setting off.'
    );
}
}

0 comments on commit 50c329f

Please sign in to comment.