From ca15515c3e1e05eff6d73ad2716bbcdc97850d5a Mon Sep 17 00:00:00 2001 From: vincent-peugnet Date: Sun, 22 Dec 2024 15:17:04 +0100 Subject: [PATCH] WIP try to escape in render see #232 --- RENDER.md | 49 +++++++++++++++----------- app/class/Servicerender.php | 66 +++++++++++++++++++++++++++++++++-- app/class/Servicerenderv2.php | 12 +++++-- 3 files changed, 101 insertions(+), 26 deletions(-) diff --git a/RENDER.md b/RENDER.md index 15b3888a..89323ba6 100644 --- a/RENDER.md +++ b/RENDER.md @@ -10,17 +10,25 @@ flowchart TD 0A(Head generation) --> 0rss(RSS feed declaration) --> 3B + 1A[[Element]] --> + md?{markdown ?} -->|no| wi + md? --> |yes| md(Markdown to HTML) --> + ec[extract code tags] --> + wi(W inclusion) --> + el(every link*) --> + hi(header ID) --> + 1F(URL linker) --> + 1G(HTML tag*) --> 2C + 2A[[Body]] --> - 2B(W inclusion) -------> - 2C((Element inclusion)) --> 2D - subgraph "post inclusion parser" - 2D(Summary) --> - 2rss(RSS detection) --> - 2H(Wiki links) --> - 2I(Link and media analysis) --> - 2pp(check for post render actions) - end - 2pp --> + 2B(W inclusion) ---------> + 2C((Element inclusion)) --> + 2pp(check for post render actions) --> + 2D(Summary inclusion) --> + 2rss(RSS detection) --> + 2H(Wiki links) --> + lma(Link and media analysis) --> + ic(insert code tags) --> 3B((Head and Body gathering)) --> 3C[[Rendered HTML]] --> 4c subgraph "post render actions" @@ -30,18 +38,10 @@ flowchart TD 4j --> 5[\served web page/] - 1A[[Element]] --> - 1B(W inclusion) --> - 1C(every link*) --> - 1D(Markdown) --> 1E - subgraph "post MD parser" - 1E(header ID) --> - 1F(URL linker) --> - 1G(HTML tag*) - end - 1G --> 2C - 1E -. "send TOC structure" .-> 2D + + ec -. code tags content .-> ic + hi -. "send TOC structure" .-> 2D 2rss -. "send rss links" .-> 0rss 2pp -. trigger post render action .-> 4c ``` @@ -71,3 +71,10 @@ List of W inclusions 1. replace `%CONNECT%` code The point of doing those inclusions early is to be before __Header ID__ parser. 
That way, when they are used inside HTML headings, they will generate nicer IDs. + + +Code tag extraction has to be done before W inclusions in order to prevent those inclusions from occurring inside code tags. + + + + diff --git a/app/class/Servicerender.php b/app/class/Servicerender.php index 9ee0a0e1..cb27dca1 100644 --- a/app/class/Servicerender.php +++ b/app/class/Servicerender.php @@ -34,6 +34,9 @@ abstract class Servicerender /** @var array $urls */ protected $urls = []; + /** @var array $codetags */ + protected $codetags = []; + protected $sum = []; /** @var bool If true, internal links target a new tab */ @@ -130,8 +133,8 @@ protected function gethmtl() { $body = $this->bodyconstructor($this->readbody()); + $this->postprocessaction = $this->checkpostprocessaction($body); $parsebody = $this->bodyparser($body); - $this->postprocessaction = $this->checkpostprocessaction($parsebody); $head = $this->gethead(); $lang = !empty($this->page->lang()) ? $this->page->lang() : Config::lang(); @@ -477,6 +480,21 @@ protected function htmlparser(string $html): string $this->sourceparser($audios); $videos = $dom->getElementsByTagName('video'); $this->sourceparser($videos); + + # replace tags contents + if (!empty($this->codetags)) { + $codes = $dom->getElementsByTagName('code'); + foreach ($codes as $code) { + assert($code instanceof DOMElement); + $value = $code->nodeValue; + if ($value !== null) { + if (key_exists($value, $this->codetags)) { + $code->nodeValue = $this->codetags[$value]; + } + } + } + } + // By passing the documentElement to saveHTML, special chars are not converted to entities return $dom->saveHTML($dom->documentElement); } @@ -511,6 +529,35 @@ protected function sourceparser(DOMNodeList $sourcables): void } } + /** + * Replace every code tag content with a hash of the previous content. 
+ * The original content is stored in $this->codetags, keyed by that hash, so it can be put back later + */ + protected function extractcodetag(string $html): string + { + $dom = new DOMDocument('1.0', 'UTF-8'); + /** Force UTF-8 encoding for loadHTML by prepending an XML declaration to the content; it becomes the first child node and is removed right after loading */ + $xhtml = '<?xml encoding="utf-8" ?>' . $html; + /** @phpstan-ignore-next-line Error supposed to be thrown here but isn't */ + $dom->loadHTML($xhtml, LIBXML_NOERROR | LIBXML_HTML_NODEFDTD | LIBXML_HTML_NOIMPLIED); + $dom->removeChild($dom->firstChild); + $codes = $dom->getElementsByTagName('code'); + if ($codes->count() === 0) { + return $html; + } + foreach ($codes as $code) { + assert($code instanceof DOMElement); + $value = $code->nodeValue; + if ($value !== null) { + $hash = strval(crc32($value)); + $this->codetags[$hash] = $value; + $code->nodeValue = $hash; + } + } + // Passing a node to saveHTML (here the whole document) is meant to keep special chars from being converted to entities + return $dom->saveHTML($dom); + } + /** * Replace wiki links [[page_id]] with HTML link */ @@ -596,7 +643,22 @@ protected function markdown($text) // $fortin->header_id_func = function ($header) { // return preg_replace('/[^\w]/', '', strtolower($header)); // }; - $fortin->hard_wrap = Config::markdownhardwrap(); + // $fortin->code_block_content_func = function ($code) { + // // $code = trim($code); + // $str = mb_convert_encoding($code , 'UTF-32', 'UTF-8'); + // $t = unpack("N*", $str); + // $t = array_map(function($n) { return "&#$n;"; }, $t); + // $t = implode("", $t); + // return $t; + // }; + // $fortin->code_span_content_func = function ($code) { + // $code = rtrim($code, ' '); + // $str = mb_convert_encoding($code , 'UTF-32', 'UTF-8'); + // $t = unpack("N*", $str); + // $t = array_map(function($n) { return "&#$n;"; }, $t); + // return implode("", $t); + // }; + // $fortin->hard_wrap = Config::markdownhardwrap(); $text = $fortin->transform($text); return $text; } diff --git a/app/class/Servicerenderv2.php b/app/class/Servicerenderv2.php 
index ed458891..dd35caab 100644 --- a/app/class/Servicerenderv2.php +++ b/app/class/Servicerenderv2.php @@ -65,13 +65,18 @@ protected function bodyconstructor(string $body): string protected function elementparser(Elementv2 $element) { $content = $element->content(); + + if ($element->markdown()) { + $content = $this->markdown($content); + // $content = $this->extractcodetag($content); + } + $content = $this->winclusions($content); + if ($element->everylink() > 0) { $content = $this->everylink($content, $element->everylink()); } - if ($element->markdown()) { - $content = $this->markdown($content); - } + if ($element->headerid()) { $content = $this->headerid( $content, @@ -80,6 +85,7 @@ protected function elementparser(Elementv2 $element) $element->headeranchor(), ); } + if ($element->urllinker()) { $content = $this->autourl($content); }