Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into fcitx
Browse files Browse the repository at this point in the history
  • Loading branch information
Fcitx Bot committed Sep 26, 2024
2 parents 13bdc20 + dd7b112 commit a47ae1c
Show file tree
Hide file tree
Showing 36 changed files with 255 additions and 281 deletions.
30 changes: 30 additions & 0 deletions docs/configurations.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,36 @@ Any category is OK so far.
| {DATE} | 2 digits date (01) |
| {{} | single { character |

## Keymap (shortcut keys)

Shortcut keys are configurable from the preferences dialog GUI.

* Preferences GUI > General tab > Keymap > Keymap style.

There are three predefined keymaps

* ATOK
* MS-IME (default for Windows)
* Kotoeri (default for macOS)

Full customization is also available.

### Input modes

* Direct: Mode to type Latin characters without IME.
* Precomposition: Mode to type Hiragana characters, but no character is typed
yet.
* Composition: Mode to type Hiragana characters.
* Suggestion: Sub-mode of Composition. Some word suggestions have appeared,
but no candidate is focused.
* Conversion: Mode to select words (in Kanji). Candidate words are generated
by exact-match (by Space key).
* Prediction: Sub-mode of Conversion. Candidate words are generated by
prefix-match (by Tab key).

![Diagram of input modes](input_modes.svg)
![Screenshots of input modes](input_modes.png)

## Configuration path

Mozc creates configuration files under `$XDG_CONFIG_HOME/mozc` (default:
Expand Down
Binary file added docs/input_modes.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/input_modes.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 0 additions & 1 deletion src/android/gen_touch_event_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
import struct
import urllib.parse

__author__ = "matsuzakit"


def ReadCollectedKeyboards(stream):
Expand Down
1 change: 1 addition & 0 deletions src/base/strings/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ mozc_cc_library(
deps = [
"//base/strings/internal:utf8_internal",
"@com_google_absl//absl/base:core_headers",
"@com_google_absl//absl/base:nullability",
"@com_google_absl//absl/log:check",
"@com_google_absl//absl/strings",
],
Expand Down
52 changes: 28 additions & 24 deletions src/base/strings/internal/utf8_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "base/strings/internal/utf8_internal.h"

#include <array>
#include <cstdint>
#include <iterator>
#include <type_traits>

Expand Down Expand Up @@ -99,18 +100,6 @@ constexpr bool IsValidSecondByte<2>(const char leading_byte,
return IsTrailingByte(second_byte);
}

// Writes a UTF-8 sequence of Count + 1 bytes for cp through the output
// iterator it.
//
// The first byte written is the bits of cp above the Count trailing groups,
// added to offset (the leading-byte marker). Each of the Count following bytes
// is 0x80 combined with the low six bits of cp shifted right by the matching
// multiple of kShift, from the most significant group to the least.
template <int Count>
inline void EncodeSequence(char32_t cp, const char offset,
                           EncodeResult::Buffer::iterator it) {
  // Leading byte.
  *it++ = static_cast<char>((cp >> (kShift * Count)) + offset);
  // Trailing bytes, most significant group first.
  for (int remaining = Count; remaining > 0; --remaining) {
    const char bits = static_cast<char>(cp >> (kShift * (remaining - 1)));
    *it++ = 0x80 | (bits & 0x3f);
  }
}

DecodeResult HandleBufferTooShort(const char* ptr, const char* last,
const int needed) {
// If the buffer is not long enough, stop processing and return error.
Expand All @@ -131,15 +120,15 @@ DecodeResult HandleBufferTooShort(const char* ptr, const char* last,
return DecodeResult::Error(seen);
}

constexpr char32_t AppendTrailingByte(char32_t base, char byte) {
inline char32_t AppendTrailingByte(char32_t base, char byte) {
return (base << kShift) + (byte & kTrailingMask);
}

template <int Needed>
DecodeResult DecodeSequence(const char* ptr, const char mask) {
// By using a template parameter, we force the compiler to check the value for
// Needed and optimize each case at compile time.
static_assert(Needed <= kMaxByteSize);
static_assert(1 < Needed && Needed <= kMaxByteSize);

const char leading_byte = *ptr++;
// Handle the leading byte.
Expand All @@ -161,30 +150,45 @@ DecodeResult DecodeSequence(const char* ptr, const char mask) {

} // namespace

// Builds the one-byte result for an ASCII code point: the first byte is cp
// narrowed to char and the reported size is 1. The value of cp is not checked
// here; callers are expected to pass an ASCII code point.
EncodeResult EncodeResult::Ascii(const char32_t cp) {
  EncodeResult ascii;
  ascii.bytes_[0] = static_cast<char>(cp);
  ascii.count_ = 1;
  return ascii;
}

// Encodes cp as a multi-byte UTF-8 sequence made of one leading byte followed
// by `count` trailing bytes.
//
// The leading byte is the bits of cp above the trailing groups, added to
// offset. Each trailing byte is 0x80 combined with the low six bits of cp
// shifted right by the matching multiple of kShift, written from the most
// significant group to the least.
EncodeResult EncodeResult::EncodeSequence(char32_t cp, uint_fast8_t count,
                                          char offset) {
  EncodeResult encoded;
  // The stored size is the total byte length: the leading byte plus `count`
  // trailing bytes.
  encoded.count_ = count + 1;
  auto out = encoded.bytes_.begin();
  // Leading byte.
  *out++ = static_cast<char>((cp >> (kShift * count)) + offset);
  // Trailing bytes, most significant group first.
  for (; count > 0; --count) {
    const char bits = static_cast<char>(cp >> (kShift * (count - 1)));
    *out++ = 0x80 | (bits & 0x3f);
  }
  return encoded;
}

EncodeResult Encode(const char32_t cp) {
// This is a naive UTF-8 encoder based on the WHATWG Encoding standard.
// https://encoding.spec.whatwg.org/#utf-8-encoder
EncodeResult result;
if (cp <= 0x7f) {
result.bytes_[0] = static_cast<char>(cp);
result.count_ = 1;
return EncodeResult::Ascii(cp);
} else if (cp <= 0x7ff) {
EncodeSequence<1>(cp, 0xc0, result.bytes_.begin());
result.count_ = 2;
return EncodeResult::EncodeSequence(cp, 1, 0xc0);
} else if (cp <= 0xffff) {
EncodeSequence<2>(cp, 0xe0, result.bytes_.begin());
result.count_ = 3;
return EncodeResult::EncodeSequence(cp, 2, 0xe0);
} else if (cp <= 0x10ffff) {
EncodeSequence<3>(cp, 0xf0, result.bytes_.begin());
result.count_ = 4;
return EncodeResult::EncodeSequence(cp, 3, 0xf0);
} else {
// Unicode 15.0 §3.4 Characters and Encoding D9
// "Unicode codespace: A range of integers from 0 to 0x10FFFF."
// §3.9 UTF-32 D90
// "Any UTF-32 code unit greater than 0010FFFF₁₆ is ill-formed."
return Encode(kReplacementCharacter);
}
return result;
}

DecodeResult Decode(const char* ptr, const char* last) {
Expand Down
24 changes: 18 additions & 6 deletions src/base/strings/internal/utf8_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,22 @@ class EncodeResult {
public:
using Buffer = std::array<char, kMaxByteSize>;

// Returns an EncodeResult for a single ASCII code point cp.
static EncodeResult Ascii(char32_t cp);

// Encodes a Unicode character cp in UTF-8 according to the count and offset
// parameters as in https://encoding.spec.whatwg.org/#utf-8-encoder.
static EncodeResult EncodeSequence(char32_t cp, uint_fast8_t count,
char offset);

EncodeResult(const EncodeResult&) = default;
EncodeResult& operator=(const EncodeResult&) = default;

constexpr const char* data() const { return bytes_.data(); }
constexpr uint_fast8_t size() const { return count_; }

private:
friend EncodeResult Encode(char32_t);
EncodeResult() = default;

uint_fast8_t count_;
Buffer bytes_;
Expand All @@ -81,18 +92,21 @@ class DecodeResult {
public:
DecodeResult() = default;

DecodeResult(const DecodeResult&) = default;
DecodeResult& operator=(const DecodeResult&) = default;

static constexpr DecodeResult Continue(const char32_t cp,
const uint_fast8_t bytes_seen) {
return DecodeResult{cp, true, bytes_seen};
}

static inline DecodeResult Error(const uint_fast8_t bytes_seen) {
static constexpr DecodeResult Error(const uint_fast8_t bytes_seen) {
return DecodeResult{kReplacementCharacter, false, bytes_seen};
}

// Indicates that the decoded position is the `end` sentinel.
static inline DecodeResult Sentinel() { return DecodeResult{0, false, 0}; }
bool IsSentinel() const { return bytes_seen_ == 0; }
static constexpr DecodeResult Sentinel() { return DecodeResult{0, false, 0}; }
constexpr bool IsSentinel() const { return bytes_seen_ == 0; }

constexpr char32_t code_point() const { return code_point_; }
constexpr bool ok() const { return ok_; }
Expand Down Expand Up @@ -124,8 +138,6 @@ EncodeResult Encode(char32_t cp);
// REQUIRES: [ptr, last) to be a valid range.
DecodeResult Decode(const char* ptr, const char* last);

// Implementations

} // namespace mozc::utf8_internal

#endif // MOZC_BASE_STRINGS_INTERNAL_UTF8_INTERNAL_H_
23 changes: 12 additions & 11 deletions src/base/strings/unicode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,29 +61,30 @@ std::u32string Utf8ToUtf32(const absl::string_view sv) {
std::string Utf32ToUtf8(const std::u32string_view sv) {
std::string result;
// Same, most strings are fairly short, so it's faster to just append.
for (auto it = sv.begin(); it != sv.end(); ++it) {
StrAppendChar32(&result, *it);
for (const char32_t c : sv) {
StrAppendChar32(&result, c);
}
return result;
}

absl::string_view Utf8Substring(absl::string_view sv, size_t pos) {
while (pos > 0) {
sv.remove_prefix(OneCharLen(sv.front()));
--pos;
const Utf8AsChars usv(sv);
auto first = usv.begin();
while (pos-- > 0) {
++first;
}
return sv;
return usv.Substring(first);
}

absl::string_view Utf8Substring(absl::string_view sv, const size_t pos,
size_t count) {
sv = Utf8Substring(sv, pos);
size_t i = 0;
while (i < sv.size() && count > 0) {
i += OneCharLen(sv[i]);
--count;
const Utf8AsChars usv(sv);
auto last = usv.begin();
while (last != usv.end() && count-- > 0) {
++last;
}
return sv.substr(0, i);
return usv.Substring(usv.begin(), last);
}

} // namespace strings
Expand Down
44 changes: 30 additions & 14 deletions src/base/strings/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include <utility>

#include "absl/base/attributes.h"
#include "absl/base/nullability.h"
#include "absl/base/optimization.h"
#include "absl/log/check.h"
#include "absl/strings/string_view.h"
Expand Down Expand Up @@ -124,22 +125,30 @@ std::string Utf32ToUtf8(std::u32string_view sv);

// Appends a single Unicode character represented by a char32_t code point to
// dest.
inline void StrAppendChar32(std::string* dest, const char32_t cp) {
inline void StrAppendChar32(absl::Nonnull<std::string*> dest,
const char32_t cp) {
const utf8_internal::EncodeResult ec = utf8_internal::Encode(cp);
// basic_string::append() is faster than absl::StrAppend() here.
dest->append(ec.data(), ec.size());
}

// Returns a new string holding the UTF-8 encoding of the single Unicode
// character given by the code point cp. The encoding is produced by
// StrAppendChar32 on an initially empty string.
inline std::string Char32ToUtf8(const char32_t cp) {
  std::string utf8;
  StrAppendChar32(&utf8, cp);
  return utf8;
}

// Returns a substring of the UTF-8 string sv [pos, pos + count), or [pos,
// sv.end()) if count is not provided, by the number of Unicode characters. The
// result is clipped if pos + count > CharsLen().
// result is clipped if pos + count > [number of Unicode characters in sv].
//
// Note that this function is linear and slower than Utf8AsChars::Substring as
// it needs to traverse through each character. Use Utf8AsChars::Substring if
// you already have the character iterators.
//
// REQUIRES: The UTF-8 string is valid. pos <= CharsLen(sv).
// Complexity: linear to pos + count or pos if count is not provided.
// REQUIRES: pos <= [number of Unicode characters in sv].
// Complexity: linear to pos + count, or pos if count is not provided.
absl::string_view Utf8Substring(absl::string_view sv, size_t pos);
absl::string_view Utf8Substring(absl::string_view sv, size_t pos, size_t count);

Expand All @@ -155,6 +164,9 @@ class UnicodeChar {
char32_t codepoint)
: mozc::UnicodeChar(utf8, /*ok=*/true, bytes_seen, codepoint) {}

UnicodeChar(const UnicodeChar&) = default;
UnicodeChar& operator=(const UnicodeChar&) = default;

char32_t char32() const { return dr_.code_point(); }
absl::string_view utf8() const {
return absl::string_view(utf8_, dr_.bytes_seen());
Expand Down Expand Up @@ -212,16 +224,7 @@ class Utf8CharIterator {
Utf8CharIterator& operator=(const Utf8CharIterator&) = default;

// Returns the current character.
reference operator*() const {
DCHECK(!dr_.IsSentinel());
if constexpr (std::is_same_v<ValueType, char32_t>) {
return char32();
} else if constexpr (std::is_same_v<ValueType, absl::string_view>) {
return view();
} else if constexpr (std::is_same_v<ValueType, UnicodeChar>) {
return ValueType{ptr_, dr_.ok(), dr_.bytes_seen(), dr_.code_point()};
}
}
reference operator*() const;

// Moves the iterator to the next Unicode character.
Utf8CharIterator& operator++() {
Expand Down Expand Up @@ -525,6 +528,19 @@ constexpr std::pair<absl::string_view, absl::string_view> FrontChar(

} // namespace strings

// Dereferences the iterator and returns the current character in the
// representation selected by ValueType:
//   - char32_t:          the result of char32().
//   - absl::string_view: the result of view().
//   - UnicodeChar:       a UnicodeChar built from ptr_ and the fields of the
//                        current decode result (ok, bytes seen, code point).
// Any other ValueType is rejected at compile time instead of silently flowing
// off the end of a non-void function.
//
// REQUIRES: the iterator does not point at the end sentinel (DCHECKed).
template <typename ValueType>
typename Utf8CharIterator<ValueType>::reference
Utf8CharIterator<ValueType>::operator*() const {
  DCHECK(!dr_.IsSentinel());
  if constexpr (std::is_same_v<ValueType, char32_t>) {
    return char32();
  } else if constexpr (std::is_same_v<ValueType, absl::string_view>) {
    return view();
  } else {
    // The condition depends on ValueType, so it is only evaluated when this
    // branch is instantiated, i.e. for a ValueType not handled above.
    static_assert(std::is_same_v<ValueType, UnicodeChar>,
                  "Utf8CharIterator supports only char32_t, "
                  "absl::string_view, and UnicodeChar as ValueType.");
    return ValueType{ptr_, dr_.ok(), dr_.bytes_seen(), dr_.code_point()};
  }
}

template <typename ValueType>
typename Utf8AsCharsBase<ValueType>::value_type
Utf8AsCharsBase<ValueType>::back() const {
Expand Down
19 changes: 19 additions & 0 deletions src/base/strings/unicode_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,22 @@ TEST(UnicodeTest, Utf32ToUtf8) {
EXPECT_EQ(Utf32ToUtf8(kU32Str), kExpected);
}

// Checks that StrAppendChar32 appends the UTF-8 encoding of each code point to
// the existing contents of the destination string. Covers an ASCII character,
// a multi-byte character, and the out-of-range value 0x110000, which is
// expected to be appended as U+FFFD.
TEST(UnicodeTest, StrAppendChar32) {
  std::string result;
  StrAppendChar32(&result, 'A');
  EXPECT_EQ(result, "A");
  // The appended character must match the expectation on the next line.
  StrAppendChar32(&result, U'あ');
  EXPECT_EQ(result, "Aあ");
  StrAppendChar32(&result, 0x110000);
  EXPECT_EQ(result, "Aあ\uFFFD");
}

// Checks that Char32ToUtf8 returns the UTF-8 encoding of a single code point.
// Covers an ASCII character, a multi-byte character, and the out-of-range
// value 0x110000, which is expected to come back as U+FFFD.
TEST(UnicodeTest, Char32ToUtf8) {
  EXPECT_EQ(Char32ToUtf8('A'), "A");
  EXPECT_EQ(Char32ToUtf8(U'あ'), "あ");
  EXPECT_EQ(Char32ToUtf8(0x110000), "\uFFFD");
}

TEST(UnicodeTest, IsValidUtf8) {
EXPECT_TRUE(IsValidUtf8(""));
EXPECT_TRUE(IsValidUtf8("abc"));
Expand Down Expand Up @@ -157,6 +173,9 @@ TEST(UnicodeTest, Utf8Substring) {
EXPECT_EQ(Utf8Substring("五十音ABC", 2, 2), "音A");
EXPECT_EQ(Utf8Substring("Mozc は便利", 6, 100), "便利");
EXPECT_EQ(Utf8Substring("日本語", 2, 0), "");

// Invalid sequence.
EXPECT_EQ(Utf8Substring("\xF0\x80\x80\xAF", 1, 2), "\x80\x80");
}

struct Utf8AsCharsTestParam {
Expand Down
Loading

0 comments on commit a47ae1c

Please sign in to comment.