Skip to content

Commit

Permalink
Merge pull request #692 from contour-terminal/improvement/vt-text-pro…
Browse files Browse the repository at this point in the history
…cessing

improve VT text processing efficiency
  • Loading branch information
christianparpart authored Oct 21, 2022
2 parents 464d280 + ff927f8 commit 92efc84
Show file tree
Hide file tree
Showing 14 changed files with 239 additions and 103 deletions.
6 changes: 5 additions & 1 deletion cmake/PedanticCompiler.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,11 @@ else()
endif()

if(${PEDANTIC_COMPILER_WERROR})
try_add_compile_options(-Werror) # XXX Not yet, but hopefully soon.
try_add_compile_options(-Werror)

# Don't warn about C++20 extensions. The libunicode dependency needs them for bit-packing (codepoint_properties).
try_add_compile_options(-Wno-error=c++20-extensions)
try_add_compile_options(-Wno-c++20-extensions)

# Not sure how to work around these.
try_add_compile_options(-Wno-error=class-memaccess)
Expand Down
1 change: 1 addition & 0 deletions metainfo.xml
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@
<li>Removes `images.sixel_cursor_conformance` config option.</li>
<li>Adds VT sequence DECSCA, DECSEL, DECSED and DECSERA to support protected grid areas during erase operations (#29, #30, #31).</li>
<li>Improve Input Method (IME) handling, visualizing preedit-text.</li>
<li>Improve throughput performance of arbitrarily complex Unicode text.</li>
<li>Update Unicode data to version 15.0.0 (release). See Announcing The Unicode® Standard, Version 15.0.0.</li>
<li>Fixes cursor highlight in VI mode</li>
</ul>
Expand Down
6 changes: 3 additions & 3 deletions scripts/install-deps.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ $ThirdParties =
Macro = ""
};
[ThirdParty]@{
Folder = "libunicode-44969c6d80e44a4731b584d047ba108769926835";
Archive = "libunicode-44969c6d80e44a4731b584d047ba108769926835.zip";
URI = "https://github.com/contour-terminal/libunicode/archive/44969c6d80e44a4731b584d047ba108769926835.zip";
Folder = "libunicode-be83e5052f6e7590c244faad59be16af210db90b";
Archive = "libunicode-be83e5052f6e7590c244faad59be16af210db90b.zip";
URI = "https://github.com/contour-terminal/libunicode/archive/be83e5052f6e7590c244faad59be16af210db90b.zip";
Macro = "libunicode"
};
[ThirdParty]@{
Expand Down
20 changes: 14 additions & 6 deletions scripts/install-deps.sh
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,20 @@ fetch_and_unpack_embeds()
https://github.com/contour-terminal/termbench-pro/archive/$termbench_pro_git_sha.tar.gz \
termbench_pro

local libunicode_git_sha="44969c6d80e44a4731b584d047ba108769926835"
fetch_and_unpack \
libunicode-$libunicode_git_sha \
libunicode-$libunicode_git_sha.tar.gz \
https://github.com/contour-terminal/libunicode/archive/$libunicode_git_sha.tar.gz \
libunicode
if test x$LIBUNICODE_SRC_DIR = x; then
local libunicode_git_sha="be83e5052f6e7590c244faad59be16af210db90b"
fetch_and_unpack \
libunicode-$libunicode_git_sha \
libunicode-$libunicode_git_sha.tar.gz \
https://github.com/contour-terminal/libunicode/archive/$libunicode_git_sha.tar.gz \
libunicode
else
echo "Hard linking external libunicode source directory to: $LIBUNICODE_SRC_DIR"
MACRO="libunicode"
echo "macro(ContourThirdParties_Embed_$MACRO)" >> $SYSDEPS_CMAKE_FILE
echo " add_subdirectory($LIBUNICODE_SRC_DIR libunicode EXCLUDE_FROM_ALL)" >> $SYSDEPS_CMAKE_FILE
echo "endmacro()" >> $SYSDEPS_CMAKE_FILE
fi
}

fetch_and_unpack_yaml_cpp()
Expand Down
20 changes: 17 additions & 3 deletions src/terminal/Line.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,10 @@ InflatedLineBuffer<Cell> inflate(TrivialLineBuffer const& input)

auto columns = InflatedLineBuffer<Cell> {};
columns.reserve(unbox<size_t>(input.displayWidth));
// fmt::print("Inflating {}/{}\n", input.text.size(), input.displayWidth);

auto lastChar = char32_t { 0 };
auto utf8DecoderState = unicode::utf8_decoder_state {};
auto gapPending = 0;

for (char const ch: input.text.view())
{
Expand All @@ -173,10 +173,16 @@ InflatedLineBuffer<Cell> inflate(TrivialLineBuffer const& input)

if (!lastChar || isAsciiBreakable || unicode::grapheme_segmenter::breakable(lastChar, nextChar))
{
while (gapPending > 0)
{
columns.emplace_back(Cell { input.textAttributes, input.hyperlink });
--gapPending;
}
auto const charWidth = unicode::width(nextChar);
columns.emplace_back(Cell {});
columns.back().setHyperlink(input.hyperlink);
columns.back().write(
input.textAttributes, nextChar, static_cast<uint8_t>(unicode::width(nextChar)));
columns.back().write(input.textAttributes, nextChar, static_cast<uint8_t>(charWidth));
gapPending = charWidth - 1;
}
else
{
Expand All @@ -193,7 +199,15 @@ InflatedLineBuffer<Cell> inflate(TrivialLineBuffer const& input)
}
}
}
lastChar = nextChar;
}

while (gapPending > 0)
{
columns.emplace_back(Cell { input.textAttributes, input.hyperlink });
--gapPending;
}

assert(columns.size() == unbox<size_t>(input.usedColumns));

while (columns.size() < unbox<size_t>(input.displayWidth))
Expand Down
107 changes: 107 additions & 0 deletions src/terminal/Line_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,110 @@ TEST_CASE("Line.inflate", "[Line]")
CHECK(char(cell.codepoint(0)) == testText[i]);
}
}

TEST_CASE("Line.inflate.Unicode", "[Line]")
{
    // Verifies that inflate() turns a trivial line containing one wide codepoint into a text cell
    // followed by an empty continuation cell, with every cell carrying the line's SGR colors.

    auto constexpr DisplayWidth = ColumnCount(10);
    auto constexpr testTextUtf32 = U"0\u2705123456789ABCDEF"sv;
    auto const testTextUtf8 = unicode::convert_to<char>(testTextUtf32);

    auto pool = BufferObjectPool(32);
    auto bufferObject = pool.allocateBufferObject();
    bufferObject->writeAtEnd(testTextUtf8);

    // Buffer fragment containing 9 codepoints, with one of them using display width of 2.
    // In UTF-8 that is "0" (1 byte) + U+2705 (3 bytes) + "1234567" (7 bytes) = 11 bytes.
    auto const bufferFragment = bufferObject->ref(0, 11);

    auto sgr = GraphicsAttributes {};
    sgr.foregroundColor = RGBColor(0x123456);
    sgr.backgroundColor = Color::Indexed(IndexedColor::Yellow);
    sgr.underlineColor = Color::Indexed(IndexedColor::Red);
    sgr.flags |= CellFlags::CurlyUnderlined;

    // The same attributes are passed twice and the used column count equals the display width,
    // so every inflated cell is expected to carry `sgr`.
    auto const trivial =
        TrivialLineBuffer { DisplayWidth, sgr, sgr, HyperlinkId {}, DisplayWidth, bufferFragment };

    auto const inflated = inflate<Cell>(trivial);

    CHECK(inflated.size() == unbox<size_t>(DisplayWidth));

    // i walks the columns, k walks the source codepoints; a wide cell advances i by its width.
    for (size_t i = 0, k = 0; i < inflated.size();)
    {
        auto const& cell = inflated[i];
        INFO(fmt::format("column {}, k {}, codepoint U+{:X}", i, k, (unsigned) cell.codepoint(0)));
        REQUIRE(cell.codepointCount() == 1);
        REQUIRE(cell.codepoint(0) == testTextUtf32[k]);
        REQUIRE(cell.foregroundColor() == sgr.foregroundColor);
        REQUIRE(cell.backgroundColor() == sgr.backgroundColor);
        REQUIRE(cell.underlineColor() == sgr.underlineColor);

        // A text cell must occupy at least one column. Without this guard a zero-width cell
        // would leave i unchanged below and the test would spin forever instead of failing.
        REQUIRE(cell.width() >= 1);

        // Columns covered by a wide character hold no text but share its attributes.
        for (int n = 1; n < cell.width(); ++n)
        {
            INFO(fmt::format("column.sub: {}\n", n));
            auto const& fillCell = inflated.at(i + static_cast<size_t>(n));
            REQUIRE(fillCell.codepointCount() == 0);
            REQUIRE(fillCell.foregroundColor() == sgr.foregroundColor);
            REQUIRE(fillCell.backgroundColor() == sgr.backgroundColor);
            REQUIRE(fillCell.underlineColor() == sgr.underlineColor);
        }
        i += cell.width();
        k++;
    }
}

TEST_CASE("Line.inflate.Unicode.FamilyEmoji", "[Line]")
{
    // inflate() must also cope with very complex Unicode grapheme clusters: the ZWJ-joined
    // family emoji below has to end up in a single cell rather than one cell per codepoint.

    auto constexpr DisplayWidth = ColumnCount(5);
    auto constexpr UsedColumnCount = ColumnCount(4);

    auto const familyEmojiUtf8 = unicode::convert_to<char>(U"\U0001F468\u200D\U0001F468\u200D\U0001F467"sv);
    auto constexpr lineUtf32 = U"A\U0001F468\u200D\U0001F468\u200D\U0001F467B"sv;
    auto const lineUtf8 = unicode::convert_to<char>(lineUtf32);

    auto pool = BufferObjectPool(32);
    auto bufferObject = pool.allocateBufferObject();
    bufferObject->writeAtEnd(lineUtf8);
    auto const wholeText = bufferObject->ref(0, lineUtf8.size());

    auto textSGR = GraphicsAttributes {};
    textSGR.foregroundColor = RGBColor(0x123456);
    textSGR.backgroundColor = Color::Indexed(IndexedColor::Yellow);
    textSGR.underlineColor = Color::Indexed(IndexedColor::Red);
    textSGR.flags |= CellFlags::CurlyUnderlined;

    // NOTE(review): the fill attributes are identical to the text attributes, so the check on
    // column 4 cannot tell which of the two inflate() applied there. Consider distinct values.
    auto const fillSGR = textSGR;

    auto const trivialLine =
        TrivialLineBuffer { DisplayWidth, textSGR, fillSGR, HyperlinkId {}, UsedColumnCount, wholeText };

    auto const cells = inflate<Cell>(trivialLine);

    CHECK(cells.size() == unbox<size_t>(DisplayWidth));

    // Expected layout:
    //   column 0    "A"
    //   column 1    the whole family emoji cluster
    //   column 2    empty (second column of the wide emoji)
    //   column 3    "B"
    //   column 4    empty (padding beyond the used columns)
    REQUIRE(cells[0].toUtf8() == "A");
    REQUIRE(cells[1].toUtf8() == familyEmojiUtf8);
    REQUIRE(cells[2].toUtf8() == "");
    REQUIRE(cells[3].toUtf8() == "B");
    REQUIRE(cells[4].toUtf8() == "");

    // Compares the three colors of one cell against the given attribute set.
    auto const requireColors = [](auto const& cell, GraphicsAttributes const& expected) {
        REQUIRE(cell.foregroundColor() == expected.foregroundColor);
        REQUIRE(cell.backgroundColor() == expected.backgroundColor);
        REQUIRE(cell.underlineColor() == expected.underlineColor);
    };

    // Columns 0..3 are covered by text and carry the text attributes.
    for (auto const column: { 0u, 2u, 1u, 3u })
        requireColors(cells[column], textSGR);

    // Column 4 lies beyond the used columns and is compared against the fill attributes.
    requireColors(cells[4], fillSGR);
}
56 changes: 31 additions & 25 deletions src/terminal/Parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,43 +354,49 @@ void Parser<EventListener, TraceStateChanges>::parseFragment(std::string_view co
auto input = _data.data();
auto const end = _data.data() + _data.size();

do
while (input != end)
{
if (state_ == State::Ground)
if (state_ == State::Ground && eventListener_.acceptsBulkText())
{
auto const chunk = std::string_view(input, static_cast<size_t>(std::distance(input, end)));
auto const [cellCount, next, subStart, subEnd] =
unicode::scan_for_text(scanState_, chunk, maxCharCount);

if (auto const cellCount = unicode::scan_for_text_ascii(chunk, maxCharCount); cellCount > 0)
if (next != input)
{
auto const next = input + cellCount;
auto const byteCount = static_cast<size_t>(std::distance(input, next));
precedingGraphicCharacter = static_cast<char32_t>(input[cellCount - 1]);
assert(byteCount <= chunk.size());
// We do not test on cellCount>0 because the scan could contain only a ZWJ (zero width
// joiner), and that would be misleading.

assert(subStart <= subEnd);
auto const byteCount = static_cast<size_t>(std::distance(subStart, subEnd));
assert(cellCount <= maxCharCount);
assert(subEnd <= chunk.data() + chunk.size());
assert(next <= chunk.data() + chunk.size());

#if defined(LIBTERMINAL_LOG_TRACE)
if (VTTraceParserLog)
VTTraceParserLog()(
"[{}] Scanned text: cap {}; available cells {}; chars {}; bytes {}; \"{}\"",
"US-ASCII",
chunk.size(),
maxCharCount,
cellCount,
byteCount,
crispy::escape(std::string_view { input, byteCount }));
VTTraceParserLog()("[Unicode] Scanned text: {}/{} cells; \"{}\"",
cellCount,
maxCharCount,
crispy::escape(std::string_view { input, byteCount }));
#endif

auto const text = std::string_view { input, byteCount };
if (utf8DecoderState_.expectedLength == 0)
auto const text = std::string_view { subStart, byteCount };
if (scanState_.utf8.expectedLength == 0)
{
maxCharCount = eventListener_.print(text, cellCount);
precedingGraphicCharacter = static_cast<char32_t>(text.back());
if (!text.empty())
{
maxCharCount = eventListener_.print(text, cellCount);
}
}
else
{
for (char const ch: text)
printUtf8Byte(ch);
// fmt::print("Parser.text: incomplete UTF-8 sequence at end: {}/{}\n",
// scanState_.utf8.currentLength,
// scanState_.utf8.expectedLength);

// for (char const ch: text)
// printUtf8Byte(ch);
}

input = next;
Expand Down Expand Up @@ -429,21 +435,21 @@ void Parser<EventListener, TraceStateChanges>::parseFragment(std::string_view co
state_,
ch,
static_cast<unsigned>(ch)));
} while (input != end);
}
}

/// Feeds one byte into the parser's incremental UTF-8 decoder.
///
/// Returns early while the decoder reports the sequence as incomplete. Once a result is
/// available, the decoded codepoint (or U+FFFD if decoding did not succeed) is passed to
/// eventListener_.print() and recorded as the most recently printed codepoint.
///
/// NOTE(review): this block comes from a rendered diff; the two consecutive `r` declarations and
/// the two trailing assignments look like the pre-/post-change variants of the same statement.
///
/// @param ch next raw input byte of a (possibly multi-byte) UTF-8 sequence.
template <typename EventListener, bool TraceStateChanges>
void Parser<EventListener, TraceStateChanges>::printUtf8Byte(char ch)
{
unicode::ConvertResult const r = unicode::from_utf8(utf8DecoderState_, (uint8_t) ch);
unicode::ConvertResult const r = unicode::from_utf8(scanState_.utf8, (uint8_t) ch);
// More bytes are needed before a codepoint can be produced.
if (std::holds_alternative<unicode::Incomplete>(r))
return;

// Anything other than a successful decode is replaced by U+FFFD.
auto constexpr ReplacementCharacter = char32_t { 0xFFFD };
auto const codepoint = std::holds_alternative<unicode::Success>(r) ? std::get<unicode::Success>(r).value
: ReplacementCharacter;
eventListener_.print(codepoint);
// Remember the codepoint that was just printed.
precedingGraphicCharacter = codepoint;
scanState_.lastCodepointHint = codepoint;
}

template <typename EventListener, bool TraceStateChanges>
Expand All @@ -466,7 +472,7 @@ void Parser<EventListener, TraceStateChanges>::handle(ActionClass _actionClass,

switch (_action)
{
case Action::GroundStart: precedingGraphicCharacter = 0; break;
case Action::GroundStart: scanState_.lastCodepointHint = 0; break;
case Action::Clear: eventListener_.clear(); break;
case Action::CollectLeader: eventListener_.collectLeader(ch); break;
case Action::Collect: eventListener_.collect(ch); break;
Expand Down
8 changes: 5 additions & 3 deletions src/terminal/Parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <crispy/range.h>

#include <unicode/convert.h>
#include <unicode/scan.h>
#include <unicode/utf8.h>

#include <fmt/format.h>
Expand Down Expand Up @@ -554,17 +555,18 @@ class Parser

[[nodiscard]] State state() const noexcept { return state_; }

char32_t precedingGraphicCharacter = 0;
[[nodiscard]] char32_t precedingGraphicCharacter() const noexcept { return scanState_.lastCodepointHint; }

void printUtf8Byte(char ch);

private:
void handle(ActionClass _actionClass, Action _action, uint8_t _char);
void printUtf8Byte(char ch);

// private properties
//
State state_ = State::Ground;
EventListener& eventListener_;
unicode::utf8_decoder_state utf8DecoderState_ = {};
unicode::scan_state scanState_ {};
};

/// @returns parsed tuple with OSC code and offset to first data parameter byte.
Expand Down
9 changes: 9 additions & 0 deletions src/terminal/ParserEvents.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@ class ParserEvents
*/
virtual size_t print(std::string_view _chars, size_t cellCount) = 0;

/**
* Used to indicate whether or not the print() overload may be used to process bulk text.
*
* There may be situations where it would not be efficient to process bulk text. In such situations,
* simply calling print() per codepoint is sufficient (potentially being more performant).
*/
[[nodiscard]] virtual bool acceptsBulkText() const noexcept = 0;

/**
* The C0 or C1 control function should be executed, which may have any one of a variety of
* effects, including changing the cursor position, suspending or resuming communications or
Expand Down Expand Up @@ -168,6 +176,7 @@ class NullParserEvents: public ParserEvents
void error(std::string_view const&) override {}
void print(char32_t) override {}
size_t print(std::string_view, size_t) override { return 0; }
[[nodiscard]] bool acceptsBulkText() const noexcept override { return true; }
void execute(char) override {}
void clear() override {}
void collect(char) override {}
Expand Down
Loading

0 comments on commit 92efc84

Please sign in to comment.