Merge pull request #692 from contour-terminal/improvement/vt-text-processing

improve VT text processing efficiency
contour-terminal · Oct 21, 2022 · 92efc84 · 92efc84
2 parents 464d280 + ff927f8
commit 92efc84
Show file tree

Hide file tree

Showing 14 changed files with 239 additions and 103 deletions.
diff --git a/cmake/PedanticCompiler.cmake b/cmake/PedanticCompiler.cmake
@@ -56,7 +56,11 @@ else()
 endif()
 
 if(${PEDANTIC_COMPILER_WERROR})
-    try_add_compile_options(-Werror) # XXX Not yet, but hopefully soon.
+    try_add_compile_options(-Werror)
+
+    # Don't complain about C++20 extensions; they are needed for bit-packing (codepoint_properties) in the libunicode dependency.
+    try_add_compile_options(-Wno-error=c++20-extensions)
+    try_add_compile_options(-Wno-c++20-extensions)
 
     # Not sure how to work around these.
     try_add_compile_options(-Wno-error=class-memaccess)

diff --git a/metainfo.xml b/metainfo.xml
@@ -111,6 +111,7 @@
           <li>Removes `images.sixel_cursor_conformance` config option.</li>
           <li>Adds VT sequence DECSCA, DECSEL, DECSED and DECSERA to support protected grid areas during erase operations (#29, #30, #31).</li>
           <li>Improve Input Method (IME) handling, visualizing preedit-text.</li>
+          <li>Improve throughput performance of arbitrarily complex Unicode.</li>
           <li>Update Unicode data to version 15.0.0 (release). See Announcing The Unicode® Standard, Version 15.0.0.</li>
           <li>Fixes cursor highlight in VI mode</li>
         </ul>

diff --git a/scripts/install-deps.ps1 b/scripts/install-deps.ps1
@@ -26,9 +26,9 @@ $ThirdParties =
         Macro   = ""
     };
     [ThirdParty]@{
-        Folder  = "libunicode-44969c6d80e44a4731b584d047ba108769926835";
-        Archive = "libunicode-44969c6d80e44a4731b584d047ba108769926835.zip";
-        URI     = "https://github.com/contour-terminal/libunicode/archive/44969c6d80e44a4731b584d047ba108769926835.zip";
+        Folder  = "libunicode-be83e5052f6e7590c244faad59be16af210db90b";
+        Archive = "libunicode-be83e5052f6e7590c244faad59be16af210db90b.zip";
+        URI     = "https://github.com/contour-terminal/libunicode/archive/be83e5052f6e7590c244faad59be16af210db90b.zip";
         Macro   = "libunicode"
     };
     [ThirdParty]@{

diff --git a/scripts/install-deps.sh b/scripts/install-deps.sh
@@ -102,12 +102,20 @@ fetch_and_unpack_embeds()
         https://github.com/contour-terminal/termbench-pro/archive/$termbench_pro_git_sha.tar.gz \
         termbench_pro
 
-    local libunicode_git_sha="44969c6d80e44a4731b584d047ba108769926835"
-    fetch_and_unpack \
-        libunicode-$libunicode_git_sha \
-        libunicode-$libunicode_git_sha.tar.gz \
-        https://github.com/contour-terminal/libunicode/archive/$libunicode_git_sha.tar.gz \
-        libunicode
+    if test x$LIBUNICODE_SRC_DIR = x; then
+        local libunicode_git_sha="be83e5052f6e7590c244faad59be16af210db90b"
+        fetch_and_unpack \
+            libunicode-$libunicode_git_sha \
+            libunicode-$libunicode_git_sha.tar.gz \
+            https://github.com/contour-terminal/libunicode/archive/$libunicode_git_sha.tar.gz \
+            libunicode
+    else
+        echo "Hard linking external libunicode source directory to: $LIBUNICODE_SRC_DIR"
+        MACRO="libunicode"
+        echo "macro(ContourThirdParties_Embed_$MACRO)" >> $SYSDEPS_CMAKE_FILE
+        echo "    add_subdirectory($LIBUNICODE_SRC_DIR libunicode EXCLUDE_FROM_ALL)" >> $SYSDEPS_CMAKE_FILE
+        echo "endmacro()" >> $SYSDEPS_CMAKE_FILE
+    fi
 }
 
 fetch_and_unpack_yaml_cpp()

diff --git a/src/terminal/Line.cpp b/src/terminal/Line.cpp
@@ -154,10 +154,10 @@ InflatedLineBuffer<Cell> inflate(TrivialLineBuffer const& input)
 
     auto columns = InflatedLineBuffer<Cell> {};
     columns.reserve(unbox<size_t>(input.displayWidth));
-    // fmt::print("Inflating {}/{}\n", input.text.size(), input.displayWidth);
 
     auto lastChar = char32_t { 0 };
     auto utf8DecoderState = unicode::utf8_decoder_state {};
+    auto gapPending = 0;
 
     for (char const ch: input.text.view())
     {
@@ -173,10 +173,16 @@ InflatedLineBuffer<Cell> inflate(TrivialLineBuffer const& input)
 
         if (!lastChar || isAsciiBreakable || unicode::grapheme_segmenter::breakable(lastChar, nextChar))
         {
+            while (gapPending > 0)
+            {
+                columns.emplace_back(Cell { input.textAttributes, input.hyperlink });
+                --gapPending;
+            }
+            auto const charWidth = unicode::width(nextChar);
             columns.emplace_back(Cell {});
             columns.back().setHyperlink(input.hyperlink);
-            columns.back().write(
-                input.textAttributes, nextChar, static_cast<uint8_t>(unicode::width(nextChar)));
+            columns.back().write(input.textAttributes, nextChar, static_cast<uint8_t>(charWidth));
+            gapPending = charWidth - 1;
         }
         else
         {
@@ -193,7 +199,15 @@ InflatedLineBuffer<Cell> inflate(TrivialLineBuffer const& input)
                 }
             }
         }
+        lastChar = nextChar;
+    }
+
+    while (gapPending > 0)
+    {
+        columns.emplace_back(Cell { input.textAttributes, input.hyperlink });
+        --gapPending;
     }
+
     assert(columns.size() == unbox<size_t>(input.usedColumns));
 
     while (columns.size() < unbox<size_t>(input.displayWidth))

diff --git a/src/terminal/Line_test.cpp b/src/terminal/Line_test.cpp
@@ -66,3 +66,110 @@ TEST_CASE("Line.inflate", "[Line]")
         CHECK(char(cell.codepoint(0)) == testText[i]);
     }
 }
+
+TEST_CASE("Line.inflate.Unicode", "[Line]")
+{
+    auto constexpr DisplayWidth = ColumnCount(10);
+    auto constexpr testTextUtf32 = U"0\u2705123456789ABCDEF"sv;
+    auto const testTextUtf8 = unicode::convert_to<char>(testTextUtf32);
+
+    auto pool = BufferObjectPool(32);
+    auto bufferObject = pool.allocateBufferObject();
+    bufferObject->writeAtEnd(testTextUtf8);
+
+    // Fragment of the first 11 bytes = 9 codepoints ('0', U+2705, '1'..'7'); U+2705 has display width 2, so 10 columns.
+    auto const bufferFragment = bufferObject->ref(0, 11);
+
+    auto sgr = GraphicsAttributes {};
+    sgr.foregroundColor = RGBColor(0x123456);
+    sgr.backgroundColor = Color::Indexed(IndexedColor::Yellow);
+    sgr.underlineColor = Color::Indexed(IndexedColor::Red);
+    sgr.flags |= CellFlags::CurlyUnderlined;
+    auto const trivial =
+        TrivialLineBuffer { DisplayWidth, sgr, sgr, HyperlinkId {}, DisplayWidth, bufferFragment };
+
+    auto const inflated = inflate<Cell>(trivial);
+
+    CHECK(inflated.size() == unbox<size_t>(DisplayWidth));
+    for (size_t i = 0, k = 0; i < inflated.size();)
+    {
+        auto const& cell = inflated[i];
+        INFO(fmt::format("column {}, k {}, codepoint U+{:X}", i, k, (unsigned) cell.codepoint(0)));
+        REQUIRE(cell.codepointCount() == 1);
+        REQUIRE(cell.codepoint(0) == testTextUtf32[k]);
+        REQUIRE(cell.foregroundColor() == sgr.foregroundColor);
+        REQUIRE(cell.backgroundColor() == sgr.backgroundColor);
+        REQUIRE(cell.underlineColor() == sgr.underlineColor);
+        for (int n = 1; n < cell.width(); ++n)
+        {
+            INFO(fmt::format("column.sub: {}\n", n));
+            auto const& fillCell = inflated.at(i + static_cast<size_t>(n));
+            REQUIRE(fillCell.codepointCount() == 0);
+            REQUIRE(fillCell.foregroundColor() == sgr.foregroundColor);
+            REQUIRE(fillCell.backgroundColor() == sgr.backgroundColor);
+            REQUIRE(fillCell.underlineColor() == sgr.underlineColor);
+        }
+        i += cell.width();
+        k++;
+    }
+}
+
+TEST_CASE("Line.inflate.Unicode.FamilyEmoji", "[Line]")
+{
+    // Ensures inflate() keeps a multi-codepoint grapheme cluster (a ZWJ-joined family emoji) in a single cell.
+
+    auto constexpr DisplayWidth = ColumnCount(5);
+    auto constexpr UsedColumnCount = ColumnCount(4);
+    auto constexpr testTextUtf32 = U"A\U0001F468\u200D\U0001F468\u200D\U0001F467B"sv;
+    auto const testTextUtf8 = unicode::convert_to<char>(testTextUtf32);
+    auto const familyEmojiUtf8 = unicode::convert_to<char>(U"\U0001F468\u200D\U0001F468\u200D\U0001F467"sv);
+
+    auto pool = BufferObjectPool(32);
+    auto bufferObject = pool.allocateBufferObject();
+    bufferObject->writeAtEnd(testTextUtf8);
+
+    auto const bufferFragment = bufferObject->ref(0, testTextUtf8.size());
+
+    auto sgr = GraphicsAttributes {};
+    sgr.foregroundColor = RGBColor(0x123456);
+    sgr.backgroundColor = Color::Indexed(IndexedColor::Yellow);
+    sgr.underlineColor = Color::Indexed(IndexedColor::Red);
+    sgr.flags |= CellFlags::CurlyUnderlined;
+
+    auto fillSGR = GraphicsAttributes {};
+    fillSGR.foregroundColor = RGBColor(0x123456);
+    fillSGR.backgroundColor = Color::Indexed(IndexedColor::Yellow);
+    fillSGR.underlineColor = Color::Indexed(IndexedColor::Red);
+    fillSGR.flags |= CellFlags::CurlyUnderlined;
+
+    auto const trivial =
+        TrivialLineBuffer { DisplayWidth, sgr, fillSGR, HyperlinkId {}, UsedColumnCount, bufferFragment };
+
+    auto const inflated = inflate<Cell>(trivial);
+
+    CHECK(inflated.size() == unbox<size_t>(DisplayWidth));
+
+    // Columns 0..3 must hold: "A", the family emoji, an empty cell following the emoji, and "B".
+    // Column 4 lies beyond the used columns and must hold no text.
+    // Columns 0..3 must carry the colors of `sgr`.
+    // Column 4 must carry the colors of `fillSGR`. NOTE(review): fillSGR equals sgr here, so a mix-up would go unnoticed.
+
+    REQUIRE(inflated[0].toUtf8() == "A");
+    REQUIRE(inflated[1].toUtf8() == familyEmojiUtf8);
+    REQUIRE(inflated[2].toUtf8() == "");
+    REQUIRE(inflated[3].toUtf8() == "B");
+    REQUIRE(inflated[4].toUtf8() == "");
+
+    for (auto const i: { 0u, 2u, 1u, 3u })
+    {
+        auto const& cell = inflated[i];
+        REQUIRE(cell.foregroundColor() == sgr.foregroundColor);
+        REQUIRE(cell.backgroundColor() == sgr.backgroundColor);
+        REQUIRE(cell.underlineColor() == sgr.underlineColor);
+    }
+
+    auto const& cell = inflated[4];
+    REQUIRE(cell.foregroundColor() == fillSGR.foregroundColor);
+    REQUIRE(cell.backgroundColor() == fillSGR.backgroundColor);
+    REQUIRE(cell.underlineColor() == fillSGR.underlineColor);
+}
diff --git a/src/terminal/Parser.cpp b/src/terminal/Parser.cpp
@@ -354,43 +354,49 @@ void Parser<EventListener, TraceStateChanges>::parseFragment(std::string_view co
     auto input = _data.data();
     auto const end = _data.data() + _data.size();
 
-    do
+    while (input != end)
     {
-        if (state_ == State::Ground)
+        if (state_ == State::Ground && eventListener_.acceptsBulkText())
         {
             auto const chunk = std::string_view(input, static_cast<size_t>(std::distance(input, end)));
+            auto const [cellCount, next, subStart, subEnd] =
+                unicode::scan_for_text(scanState_, chunk, maxCharCount);
 
-            if (auto const cellCount = unicode::scan_for_text_ascii(chunk, maxCharCount); cellCount > 0)
+            if (next != input)
             {
-                auto const next = input + cellCount;
-                auto const byteCount = static_cast<size_t>(std::distance(input, next));
-                precedingGraphicCharacter = static_cast<char32_t>(input[cellCount - 1]);
-                assert(byteCount <= chunk.size());
+                // We do not test on cellCount>0 because the scan could contain only a ZWJ (zero width
+                // joiner), and that would be misleading.
+
+                assert(subStart <= subEnd);
+                auto const byteCount = static_cast<size_t>(std::distance(subStart, subEnd));
                 assert(cellCount <= maxCharCount);
+                assert(subEnd <= chunk.data() + chunk.size());
                 assert(next <= chunk.data() + chunk.size());
 
 #if defined(LIBTERMINAL_LOG_TRACE)
                 if (VTTraceParserLog)
-                    VTTraceParserLog()(
-                        "[{}] Scanned text: cap {}; available cells {}; chars {}; bytes {}; \"{}\"",
-                        "US-ASCII",
-                        chunk.size(),
-                        maxCharCount,
-                        cellCount,
-                        byteCount,
-                        crispy::escape(std::string_view { input, byteCount }));
+                    VTTraceParserLog()("[Unicode] Scanned text: {}/{} cells; \"{}\"",
+                                       cellCount,
+                                       maxCharCount,
+                                       crispy::escape(std::string_view { input, byteCount }));
 #endif
 
-                auto const text = std::string_view { input, byteCount };
-                if (utf8DecoderState_.expectedLength == 0)
+                auto const text = std::string_view { subStart, byteCount };
+                if (scanState_.utf8.expectedLength == 0)
                 {
-                    maxCharCount = eventListener_.print(text, cellCount);
-                    precedingGraphicCharacter = static_cast<char32_t>(text.back());
+                    if (!text.empty())
+                    {
+                        maxCharCount = eventListener_.print(text, cellCount);
+                    }
                 }
                 else
                 {
-                    for (char const ch: text)
-                        printUtf8Byte(ch);
+                    // fmt::print("Parser.text: incomplete UTF-8 sequence at end: {}/{}\n",
+                    //            scanState_.utf8.currentLength,
+                    //            scanState_.utf8.expectedLength);
+
+                    // for (char const ch: text)
+                    //     printUtf8Byte(ch);
                 }
 
                 input = next;
@@ -429,21 +435,21 @@ void Parser<EventListener, TraceStateChanges>::parseFragment(std::string_view co
                             state_,
                             ch,
                             static_cast<unsigned>(ch)));
-    } while (input != end);
+    }
 }
 
 template <typename EventListener, bool TraceStateChanges>
 void Parser<EventListener, TraceStateChanges>::printUtf8Byte(char ch)
 {
-    unicode::ConvertResult const r = unicode::from_utf8(utf8DecoderState_, (uint8_t) ch);
+    unicode::ConvertResult const r = unicode::from_utf8(scanState_.utf8, (uint8_t) ch);
     if (std::holds_alternative<unicode::Incomplete>(r))
         return;
 
     auto constexpr ReplacementCharacter = char32_t { 0xFFFD };
     auto const codepoint = std::holds_alternative<unicode::Success>(r) ? std::get<unicode::Success>(r).value
                                                                        : ReplacementCharacter;
     eventListener_.print(codepoint);
-    precedingGraphicCharacter = codepoint;
+    scanState_.lastCodepointHint = codepoint;
 }
 
 template <typename EventListener, bool TraceStateChanges>
@@ -466,7 +472,7 @@ void Parser<EventListener, TraceStateChanges>::handle(ActionClass _actionClass,
 
     switch (_action)
     {
-        case Action::GroundStart: precedingGraphicCharacter = 0; break;
+        case Action::GroundStart: scanState_.lastCodepointHint = 0; break;
         case Action::Clear: eventListener_.clear(); break;
         case Action::CollectLeader: eventListener_.collectLeader(ch); break;
         case Action::Collect: eventListener_.collect(ch); break;

diff --git a/src/terminal/Parser.h b/src/terminal/Parser.h
@@ -19,6 +19,7 @@
 #include <crispy/range.h>
 
 #include <unicode/convert.h>
+#include <unicode/scan.h>
 #include <unicode/utf8.h>
 
 #include <fmt/format.h>
@@ -554,17 +555,18 @@ class Parser
 
     [[nodiscard]] State state() const noexcept { return state_; }
 
-    char32_t precedingGraphicCharacter = 0;
+    [[nodiscard]] char32_t precedingGraphicCharacter() const noexcept { return scanState_.lastCodepointHint; }
+
+    void printUtf8Byte(char ch);
 
   private:
     void handle(ActionClass _actionClass, Action _action, uint8_t _char);
-    void printUtf8Byte(char ch);
 
     // private properties
     //
     State state_ = State::Ground;
     EventListener& eventListener_;
-    unicode::utf8_decoder_state utf8DecoderState_ = {};
+    unicode::scan_state scanState_ {};
 };
 
 /// @returns parsed tuple with OSC code and offset to first data parameter byte.

diff --git a/src/terminal/ParserEvents.h b/src/terminal/ParserEvents.h
@@ -48,6 +48,14 @@ class ParserEvents
      */
     virtual size_t print(std::string_view _chars, size_t cellCount) = 0;
 
+    /**
+     * Used to indicate whether or not the print() overload may be used to process bulk text.
+     *
+     * There may be situations where it would not be efficient to process bulk text. In such situations,
+     * simply calling print() per codepoint is sufficient (potentially being more performant).
+     */
+    [[nodiscard]] virtual bool acceptsBulkText() const noexcept = 0;
+
     /**
      * The C0 or C1 control function should be executed, which may have any one of a variety of
      * effects, including changing the cursor position, suspending or resuming communications or
@@ -168,6 +176,7 @@ class NullParserEvents: public ParserEvents
     void error(std::string_view const&) override {}
     void print(char32_t) override {}
     size_t print(std::string_view, size_t) override { return 0; }
+    [[nodiscard]] bool acceptsBulkText() const noexcept override { return true; }
     void execute(char) override {}
     void clear() override {}
     void collect(char) override {}