Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into fcitx
Browse files Browse the repository at this point in the history
  • Loading branch information
Fcitx Bot committed Oct 10, 2024
2 parents 53f12ea + 1f4119d commit ad5fe7b
Show file tree
Hide file tree
Showing 8 changed files with 1,206 additions and 984 deletions.
30 changes: 25 additions & 5 deletions src/base/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,26 +96,46 @@ void ConstChar32ReverseIterator::Next() {

bool ConstChar32ReverseIterator::Done() const { return done_; }

void Util::SplitStringToUtf8Chars(absl::string_view str,
std::vector<std::string> *output) {
namespace {

template <typename T>
void AppendUtf8CharsImpl(absl::string_view str, std::vector<T> &output) {
const char *begin = str.data();
const char *const end = str.data() + str.size();
while (begin < end) {
const size_t mblen = strings::OneCharLen(begin);
output->emplace_back(begin, mblen);
output.emplace_back(begin, mblen);
begin += mblen;
}
DCHECK_EQ(begin, end);
}

} // namespace

// Splits `str` into its individual UTF-8 characters and returns them as a
// freshly built vector, one element per character, in input order.
std::vector<std::string> Util::SplitStringToUtf8Chars(absl::string_view str) {
  std::vector<std::string> chars;
  Util::AppendUtf8Chars(str, chars);
  return chars;
}

// Appends every UTF-8 character of `str` to the end of `output` as an owning
// std::string. Elements already present in `output` are kept.
void Util::AppendUtf8Chars(absl::string_view str,
                           std::vector<std::string> &output) {
  AppendUtf8CharsImpl<std::string>(str, output);
}

// Appends every UTF-8 character of `str` to the end of `output` as a
// non-owning view. Each appended element points into the buffer behind `str`,
// so that buffer must stay alive for as long as the views are used.
void Util::AppendUtf8Chars(absl::string_view str,
                           std::vector<absl::string_view> &output) {
  AppendUtf8CharsImpl<absl::string_view>(str, output);
}

// Grapheme is user-perceived character. It may contain multiple codepoints
// such as modifiers and variation sequences (e.g. 神︀ = U+795E,U+FE00 [SVS]).
// Note, this function does not support full requirements of the grapheme
// specifications defined by Unicode.
// * https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
void Util::SplitStringToUtf8Graphemes(absl::string_view str,
std::vector<std::string> *graphemes) {
Util::SplitStringToUtf8Chars(str, graphemes);
*graphemes = SplitStringToUtf8Chars(str);
if (graphemes->size() <= 1) {
return;
}
Expand Down Expand Up @@ -908,7 +928,7 @@ Util::ScriptType Util::GetFirstScriptType(absl::string_view str,
}
const Utf8AsChars32 utf8_as_char32(str);
if (mblen) {
*mblen = utf8_as_char32.begin().ok()? utf8_as_char32.begin().size() : 0;
*mblen = utf8_as_char32.begin().ok() ? utf8_as_char32.begin().size() : 0;
}
return GetScriptType(utf8_as_char32.front());
}
Expand Down
16 changes: 13 additions & 3 deletions src/base/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,19 @@ class Util {
Util() = delete;
~Util() = delete;

// String utils
static void SplitStringToUtf8Chars(absl::string_view str,
std::vector<std::string> *output);
// Splits a string into UTF8 chars.
static std::vector<std::string> SplitStringToUtf8Chars(absl::string_view str);

// Splits a string into UTF8 chars and appends them to `output`. For example:
//
// std::string str = "あa1";
// std::vector<std::string> output = {"漢"};
// Util::AppendUtf8Chars(str, output);
// EXPECT_THAT(output, ElementsAre("漢", "あ", "a", "1"));
static void AppendUtf8Chars(absl::string_view str,
std::vector<std::string> &output);
static void AppendUtf8Chars(absl::string_view str,
std::vector<absl::string_view> &output);

// Split `str` to graphemes.
// A grapheme may contain multiple characters such as modifiers and variation
Expand Down
40 changes: 29 additions & 11 deletions src/base/util_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,34 +51,52 @@
namespace mozc {
namespace {

using ::testing::ElementsAre;
using ::testing::ElementsAreArray;

TEST(UtilTest, SplitStringToUtf8Chars) {
template <typename T>
class TypedUtilTest : public ::testing::Test {};

using StrTypes = ::testing::Types<std::string, absl::string_view>;
TYPED_TEST_SUITE(TypedUtilTest, StrTypes);

TYPED_TEST(TypedUtilTest, AppendUtf8Chars) {
using StrType = TypeParam;
{
std::vector<std::string> output;
Util::SplitStringToUtf8Chars("", &output);
std::vector<StrType> output;
Util::AppendUtf8Chars("", output);
EXPECT_EQ(output.size(), 0);
}

{
const std::string kInputs[] = {
constexpr absl::string_view kInputs[] = {
"a", "", "", "\n", "a",
};
const std::string joined_string = absl::StrJoin(kInputs, "");

std::vector<std::string> output;
Util::SplitStringToUtf8Chars(joined_string, &output);
EXPECT_THAT(output, ElementsAreArray(kInputs));
std::vector<StrType> output = {"x", "y", "z"};
Util::AppendUtf8Chars(joined_string, output);
EXPECT_THAT(output, ElementsAre("x", "y", "z", "a", "", "", "\n", "a"));
}
}

TEST(UtilTest, SplitStringToUtf8Graphemes) {
TEST(UtilTest, SplitStringToUtf8Chars) {
{
std::vector<std::string> output;
Util::SplitStringToUtf8Chars("", &output);
const std::vector<std::string> output = Util::SplitStringToUtf8Chars("");
EXPECT_EQ(output.size(), 0);
}
{
const std::string kInputs[] = {
"a", "", "", "\n", "a",
};
const std::string joined_string = absl::StrJoin(kInputs, "");

const std::vector<std::string> output =
Util::SplitStringToUtf8Chars(joined_string);
EXPECT_THAT(output, ElementsAreArray(kInputs));
}
}

TEST(UtilTest, SplitStringToUtf8Graphemes) {
{ // Single codepoint characters.
const std::string kInputs[] = {
"a", "", "", "\n", "a",
Expand Down
6 changes: 2 additions & 4 deletions src/composer/composer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2549,10 +2549,8 @@ TEST_F(ComposerTest, InsertCharacterPreedit) {
{
std::string base;
std::set<std::string> expanded;
std::vector<std::string> chars;
Util::SplitStringToUtf8Chars(kTestStr, &chars);
for (size_t i = 0; i < chars.size(); ++i) {
composer_->InsertCharacterPreedit(chars[i]);
for (const std::string &c : Util::SplitStringToUtf8Chars(kTestStr)) {
composer_->InsertCharacterPreedit(c);
}
std::string preedit = composer_->GetStringForPreedit();
std::string conversion_query = composer_->GetQueryForConversion();
Expand Down
4 changes: 2 additions & 2 deletions src/converter/immutable_converter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1333,8 +1333,8 @@ void ImmutableConverter::MakeLatticeNodesForPredictiveNodes(
conversion_key += segment.key();
}
DCHECK_NE(std::string::npos, key.find(conversion_key));
std::vector<std::string> conversion_key_chars;
Util::SplitStringToUtf8Chars(conversion_key, &conversion_key_chars);
const std::vector<std::string> conversion_key_chars =
Util::SplitStringToUtf8Chars(conversion_key);

// *** Current behaviors ***
// - Starts suggestion from 6 characters, which is conservative.
Expand Down
Loading

0 comments on commit ad5fe7b

Please sign in to comment.