Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: catastrophic backtracking in Core.AggressivelyFixLt #440

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 81 additions & 10 deletions library/HTMLPurifier/Lexer/DOMLex.php
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,7 @@ public function tokenizeHTML($html, $config, $context)
// attempt to armor stray angled brackets that cannot possibly
// form tags and thus are probably being used as emoticons
if ($config->get('Core.AggressivelyFixLt')) {
$char = '[^a-z!\/]';
$comment = "/<!--(.*?)(-->|\z)/is";
$html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
do {
$old = $html;
$html = preg_replace("/<($char)/i", '&lt;\\1', $html);
} while ($html !== $old);
$html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
$html = $this->aggressivelyFixLt($html);
}

// preprocess html, essential for UTF-8
Expand Down Expand Up @@ -288,7 +281,7 @@ public function muteErrorHandler($errno, $errstr)
*/
public function callbackUndoCommentSubst($matches)
{
return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
return '<!--' . $this->undoCommentSubstr($matches[1]) . $matches[2];
}

/**
Expand All @@ -299,7 +292,25 @@ public function callbackUndoCommentSubst($matches)
*/
public function callbackArmorCommentEntities($matches)
{
return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
return '<!--' . $this->armorEntities($matches[1]) . $matches[2];
}

/**
* @param string $string
* @return string
*/
protected function armorEntities($string)
{
return str_replace('&', '&amp;', $string);
}

/**
* @param string $string
* @return string
*/
protected function undoCommentSubstr($string)
{
return strtr($string, array('&amp;' => '&', '&lt;' => '<'));
}

/**
Expand Down Expand Up @@ -335,6 +346,66 @@ protected function wrapHTML($html, $config, $context, $use_div = true)
$ret .= '</body></html>';
return $ret;
}

/**
* @param string $html
* @return string
*/
protected function aggressivelyFixLt($html)
{
$char = '[^a-z!\/]';
$html = $this->manipulateHtmlComments($html, array($this, 'armorEntities'));

do {
$old = $html;
$html = preg_replace("/<($char)/i", '&lt;\\1', $html);
} while ($html !== $old);

return $this->manipulateHtmlComments($html, array($this, 'undoCommentSubstr'));
}

/**
* Modify HTML comments in the given HTML content using a callback.
*
* @param string $html
* @param callable $callback
* @return string
*/
protected function manipulateHtmlComments($html, callable $callback)
{
$offset = 0;
$startTag = '<!--';
$endTag = '-->';

while (($startPos = strpos($html, $startTag, $offset)) !== false) {
$startPos += strlen($startTag); // Move past `<!--`
$endPos = strpos($html, $endTag, $startPos);

if ($endPos === false) {
// No matching ending comment tag found
break;
}

// Extract the original comment content
$commentContent = substr($html, $startPos, $endPos - $startPos);

// Apply the callback to the comment content
$newCommentContent = $callback($commentContent);

// Reconstruct the entire comment with the new content
$newComment = $startTag . $newCommentContent . $endTag;

// Replace the old comment in the HTML content with the new one
$html = substr($html, 0, $startPos - strlen($startTag)) .
$newComment .
substr($html, $endPos + strlen($endTag));

// Move offset to the end of the new comment for the next iteration
$offset = strpos($html, $newComment, $offset) + strlen($newComment);
}

return $html;
}
}

// vim: et sw=4 sts=4
40 changes: 40 additions & 0 deletions tests/HTMLPurifier/Lexer/DomLexTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
<?php

class HTMLPurifier_Lexer_DomLexTest extends HTMLPurifier_Harness
{

protected $domLex;

public function setUp()
{
$this->domLex = new HTMLPurifier_Lexer_DOMLex();
}

public function testCoreAggressivelyFixLtEmojis()
{
$context = new HTMLPurifier_Context();
$config = HTMLPurifier_Config::createDefault();
$output = $this->domLex->tokenizeHTML('<b><3</b>', $config, $context);

$this->assertIdentical($output, array(
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('<3'),
new HTMLPurifier_Token_End('b')
));
}

public function testCoreAggressivelyFixLtComments()
{
$context = new HTMLPurifier_Context();
$config = HTMLPurifier_Config::createDefault();
$output = $this->domLex->tokenizeHTML('<!-- Nested <!-- Not to be included --> comment -->', $config, $context);

$this->assertIdentical($output, array(
new HTMLPurifier_Token_Comment(' Nested <!-- Not to be included '),
new HTMLPurifier_Token_Text(' comment -->')
));
}

}

// vim: et sw=4 sts=4