Merge remote-tracking branch 'origin/master' into fcitx

fcitx · Sep 26, 2024 · a47ae1c · a47ae1c
2 parents 13bdc20 + dd7b112
commit a47ae1c
Show file tree

Hide file tree

Showing 36 changed files with 255 additions and 281 deletions.
diff --git a/docs/configurations.md b/docs/configurations.md
@@ -24,6 +24,36 @@ Any category is OK so far.
 | {DATE}  | 2 digits date (01)   |
 | {{}     | single { character   |
 
+## Keymap (shortcut keys)
+
+Shortcut keys are configurable from the preferences dialog GUI.
+
+*   Preferences GUI > General tab > Keymap > Keymap style.
+
+There are three predefined keymaps:
+
+*   ATOK
+*   MS-IME (default for Windows)
+*   Kotoeri (default for macOS)
+
+Full customization is also available.
+
+### Input modes
+
+*   Direct: Mode to type Latin characters without IME.
+*   Precomposition: Mode to type Hiragana characters, but no character is typed
+    yet.
+*   Composition: Mode to type Hiragana characters.
+*   Suggestion: Sub-mode of Composition. Some word suggestions have appeared,
+    but no candidate is focused.
+*   Conversion: Mode to select words (in Kanji). Candidate words are generated
+    by exact-match (by Space key).
+*   Prediction: Sub-mode of Conversion. Candidate words are generated by
+    prefix-match (by Tab key).
+
+![Diagram of input modes](input_modes.svg)
+![Screenshots of input modes](input_modes.png)
+
 ## Configuration path
 
 Mozc creates configuration files under `$XDG_CONFIG_HOME/mozc` (default:

diff --git a/docs/input_modes.png b/docs/input_modes.png
diff --git a/docs/input_modes.svg b/docs/input_modes.svg
diff --git a/src/android/gen_touch_event_stats.py b/src/android/gen_touch_event_stats.py
@@ -43,7 +43,6 @@
 import struct
 import urllib.parse
 
-__author__ = "matsuzakit"
 
 
 def ReadCollectedKeyboards(stream):

diff --git a/src/base/strings/BUILD.bazel b/src/base/strings/BUILD.bazel
@@ -138,6 +138,7 @@ mozc_cc_library(
     deps = [
         "//base/strings/internal:utf8_internal",
         "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/base:nullability",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
     ],

diff --git a/src/base/strings/internal/utf8_internal.cc b/src/base/strings/internal/utf8_internal.cc
@@ -30,6 +30,7 @@
 #include "base/strings/internal/utf8_internal.h"
 
 #include <array>
+#include <cstdint>
 #include <iterator>
 #include <type_traits>
 
@@ -99,18 +100,6 @@ constexpr bool IsValidSecondByte<2>(const char leading_byte,
   return IsTrailingByte(second_byte);
 }
 
-template <int Count>
-inline void EncodeSequence(char32_t cp, const char offset,
-                           EncodeResult::Buffer::iterator it) {
-  int count = Count;
-  *it++ = static_cast<char>((cp >> (kShift * count)) + offset);
-  while (count > 0) {
-    const char temp = static_cast<char>(cp >> (kShift * (count - 1)));
-    *it++ = 0x80 | (temp & 0x3f);
-    --count;
-  }
-}
-
 DecodeResult HandleBufferTooShort(const char* ptr, const char* last,
                                   const int needed) {
   // If the buffer is not long enough, stop processing and return error.
@@ -131,15 +120,15 @@ DecodeResult HandleBufferTooShort(const char* ptr, const char* last,
   return DecodeResult::Error(seen);
 }
 
-constexpr char32_t AppendTrailingByte(char32_t base, char byte) {
+inline char32_t AppendTrailingByte(char32_t base, char byte) {
   return (base << kShift) + (byte & kTrailingMask);
 }
 
 template <int Needed>
 DecodeResult DecodeSequence(const char* ptr, const char mask) {
   // By using a template parameter, we force the compiler to check the value for
   // Needed and optimize each case at compile time.
-  static_assert(Needed <= kMaxByteSize);
+  static_assert(1 < Needed && Needed <= kMaxByteSize);
 
   const char leading_byte = *ptr++;
   // Handle the leading byte.
@@ -161,30 +150,45 @@ DecodeResult DecodeSequence(const char* ptr, const char mask) {
 
 }  // namespace
 
+EncodeResult EncodeResult::Ascii(const char32_t cp) {
+  EncodeResult result;
+  result.count_ = 1;
+  result.bytes_[0] = static_cast<char>(cp);
+  return result;
+}
+
+EncodeResult EncodeResult::EncodeSequence(char32_t cp, uint_fast8_t count,
+                                          char offset) {
+  EncodeResult result;
+  result.count_ = count + 1;  // count_ in the result is the byte length.
+  auto it = result.bytes_.begin();
+  *it++ = static_cast<char>((cp >> (kShift * count)) + offset);
+  while (count > 0) {
+    const char temp = static_cast<char>(cp >> (kShift * (count - 1)));
+    *it++ = 0x80 | (temp & 0x3f);
+    --count;
+  }
+  return result;
+}
+
 EncodeResult Encode(const char32_t cp) {
   // This is a naive UTF-8 encoder based on the WHATWG Encoding standard.
   // https://encoding.spec.whatwg.org/#utf-8-encoder
-  EncodeResult result;
   if (cp <= 0x7f) {
-    result.bytes_[0] = static_cast<char>(cp);
-    result.count_ = 1;
+    return EncodeResult::Ascii(cp);
   } else if (cp <= 0x7ff) {
-    EncodeSequence<1>(cp, 0xc0, result.bytes_.begin());
-    result.count_ = 2;
+    return EncodeResult::EncodeSequence(cp, 1, 0xc0);
   } else if (cp <= 0xffff) {
-    EncodeSequence<2>(cp, 0xe0, result.bytes_.begin());
-    result.count_ = 3;
+    return EncodeResult::EncodeSequence(cp, 2, 0xe0);
   } else if (cp <= 0x10ffff) {
-    EncodeSequence<3>(cp, 0xf0, result.bytes_.begin());
-    result.count_ = 4;
+    return EncodeResult::EncodeSequence(cp, 3, 0xf0);
   } else {
     // Unicode 15.0 §3.4 Characters and Encoding D9
     // "Unicode codespace: A range of integers from 0 to 0x10FFFF."
     // §3.9 UTF-32 D90
     // "Any UTF-32 code unit greater than 0010FFFF16 is ill-formed."
     return Encode(kReplacementCharacter);
   }
-  return result;
 }
 
 DecodeResult Decode(const char* ptr, const char* last) {

diff --git a/src/base/strings/internal/utf8_internal.h b/src/base/strings/internal/utf8_internal.h
@@ -67,11 +67,22 @@ class EncodeResult {
  public:
   using Buffer = std::array<char, kMaxByteSize>;
 
+  // Returns an EncodeResult for a single ASCII code point cp.
+  static EncodeResult Ascii(char32_t cp);
+
+  // Encodes a Unicode character cp in UTF-8 according to the count and offset
+  // parameters as in https://encoding.spec.whatwg.org/#utf-8-encoder.
+  static EncodeResult EncodeSequence(char32_t cp, uint_fast8_t count,
+                                     char offset);
+
+  EncodeResult(const EncodeResult&) = default;
+  EncodeResult& operator=(const EncodeResult&) = default;
+
   constexpr const char* data() const { return bytes_.data(); }
   constexpr uint_fast8_t size() const { return count_; }
 
  private:
-  friend EncodeResult Encode(char32_t);
+  EncodeResult() = default;
 
   uint_fast8_t count_;
   Buffer bytes_;
@@ -81,18 +92,21 @@ class DecodeResult {
  public:
   DecodeResult() = default;
 
+  DecodeResult(const DecodeResult&) = default;
+  DecodeResult& operator=(const DecodeResult&) = default;
+
   static constexpr DecodeResult Continue(const char32_t cp,
                                          const uint_fast8_t bytes_seen) {
     return DecodeResult{cp, true, bytes_seen};
   }
 
-  static inline DecodeResult Error(const uint_fast8_t bytes_seen) {
+  static constexpr DecodeResult Error(const uint_fast8_t bytes_seen) {
     return DecodeResult{kReplacementCharacter, false, bytes_seen};
   }
 
   // Indicates that the decoded position is the `end` sentinel.
-  static inline DecodeResult Sentinel() { return DecodeResult{0, false, 0}; }
-  bool IsSentinel() const { return bytes_seen_ == 0; }
+  static constexpr DecodeResult Sentinel() { return DecodeResult{0, false, 0}; }
+  constexpr bool IsSentinel() const { return bytes_seen_ == 0; }
 
   constexpr char32_t code_point() const { return code_point_; }
   constexpr bool ok() const { return ok_; }
@@ -124,8 +138,6 @@ EncodeResult Encode(char32_t cp);
 // REQUIRES: [ptr, last) to be a valid range.
 DecodeResult Decode(const char* ptr, const char* last);
 
-// Implementations
-
 }  // namespace mozc::utf8_internal
 
 #endif  // MOZC_BASE_STRINGS_INTERNAL_UTF8_INTERNAL_H_
diff --git a/src/base/strings/unicode.cc b/src/base/strings/unicode.cc
@@ -61,29 +61,30 @@ std::u32string Utf8ToUtf32(const absl::string_view sv) {
 std::string Utf32ToUtf8(const std::u32string_view sv) {
   std::string result;
   // Same, most strings are fairly short, so it's faster to just append.
-  for (auto it = sv.begin(); it != sv.end(); ++it) {
-    StrAppendChar32(&result, *it);
+  for (const char32_t c : sv) {
+    StrAppendChar32(&result, c);
   }
   return result;
 }
 
 absl::string_view Utf8Substring(absl::string_view sv, size_t pos) {
-  while (pos > 0) {
-    sv.remove_prefix(OneCharLen(sv.front()));
-    --pos;
+  const Utf8AsChars usv(sv);
+  auto first = usv.begin();
+  while (pos-- > 0) {
+    ++first;
   }
-  return sv;
+  return usv.Substring(first);
 }
 
 absl::string_view Utf8Substring(absl::string_view sv, const size_t pos,
                                 size_t count) {
   sv = Utf8Substring(sv, pos);
-  size_t i = 0;
-  while (i < sv.size() && count > 0) {
-    i += OneCharLen(sv[i]);
-    --count;
+  const Utf8AsChars usv(sv);
+  auto last = usv.begin();
+  while (last != usv.end() && count-- > 0) {
+    ++last;
   }
-  return sv.substr(0, i);
+  return usv.Substring(usv.begin(), last);
 }
 
 }  // namespace strings

diff --git a/src/base/strings/unicode.h b/src/base/strings/unicode.h
@@ -39,6 +39,7 @@
 #include <utility>
 
 #include "absl/base/attributes.h"
+#include "absl/base/nullability.h"
 #include "absl/base/optimization.h"
 #include "absl/log/check.h"
 #include "absl/strings/string_view.h"
@@ -124,22 +125,30 @@ std::string Utf32ToUtf8(std::u32string_view sv);
 
 // Appends a single Unicode character represented by a char32_t code point to
 // dest.
-inline void StrAppendChar32(std::string* dest, const char32_t cp) {
+inline void StrAppendChar32(absl::Nonnull<std::string*> dest,
+                            const char32_t cp) {
   const utf8_internal::EncodeResult ec = utf8_internal::Encode(cp);
   // basic_string::append() is faster than absl::StrAppend() here.
   dest->append(ec.data(), ec.size());
 }
 
+// Converts a single Unicode character given as a char32_t code point to UTF-8.
+inline std::string Char32ToUtf8(const char32_t cp) {
+  std::string result;
+  StrAppendChar32(&result, cp);
+  return result;
+}
+
 // Returns a substring of the UTF-8 string sv [pos, pos + count), or [pos,
 // sv.end()) if count is not provided, by the number of Unicode characters. The
-// result is clipped if pos + count > CharsLen().
+// result is clipped if pos + count > [number of Unicode characters in sv].
 //
 // Note that this function is linear and slower than Utf8AsChars::Substring as
 // it needs to traverse through each character. Use Utf8AsChars::Substring if
 // you already have the character iterators.
 //
-// REQUIRES: The UTF-8 string is valid. pos <= CharsLen(sv).
-// Complexity: linear to pos + count or pos if count it not provided.
+// REQUIRES: pos <= [number of Unicode characters in sv].
+// Complexity: linear in pos + count, or in pos if count is not provided.
 absl::string_view Utf8Substring(absl::string_view sv, size_t pos);
 absl::string_view Utf8Substring(absl::string_view sv, size_t pos, size_t count);
 
@@ -155,6 +164,9 @@ class UnicodeChar {
               char32_t codepoint)
       : mozc::UnicodeChar(utf8, /*ok=*/true, bytes_seen, codepoint) {}
 
+  UnicodeChar(const UnicodeChar&) = default;
+  UnicodeChar& operator=(const UnicodeChar&) = default;
+
   char32_t char32() const { return dr_.code_point(); }
   absl::string_view utf8() const {
     return absl::string_view(utf8_, dr_.bytes_seen());
@@ -212,16 +224,7 @@ class Utf8CharIterator {
   Utf8CharIterator& operator=(const Utf8CharIterator&) = default;
 
   // Returns the current character.
-  reference operator*() const {
-    DCHECK(!dr_.IsSentinel());
-    if constexpr (std::is_same_v<ValueType, char32_t>) {
-      return char32();
-    } else if constexpr (std::is_same_v<ValueType, absl::string_view>) {
-      return view();
-    } else if constexpr (std::is_same_v<ValueType, UnicodeChar>) {
-      return ValueType{ptr_, dr_.ok(), dr_.bytes_seen(), dr_.code_point()};
-    }
-  }
+  reference operator*() const;
 
   // Moves the iterator to the next Unicode character.
   Utf8CharIterator& operator++() {
@@ -525,6 +528,19 @@ constexpr std::pair<absl::string_view, absl::string_view> FrontChar(
 
 }  // namespace strings
 
+template <typename ValueType>
+typename Utf8CharIterator<ValueType>::reference
+Utf8CharIterator<ValueType>::operator*() const {
+  DCHECK(!dr_.IsSentinel());
+  if constexpr (std::is_same_v<ValueType, char32_t>) {
+    return char32();
+  } else if constexpr (std::is_same_v<ValueType, absl::string_view>) {
+    return view();
+  } else if constexpr (std::is_same_v<ValueType, UnicodeChar>) {
+    return ValueType{ptr_, dr_.ok(), dr_.bytes_seen(), dr_.code_point()};
+  }
+}
+
 template <typename ValueType>
 typename Utf8AsCharsBase<ValueType>::value_type
 Utf8AsCharsBase<ValueType>::back() const {

diff --git a/src/base/strings/unicode_test.cc b/src/base/strings/unicode_test.cc
@@ -112,6 +112,22 @@ TEST(UnicodeTest, Utf32ToUtf8) {
   EXPECT_EQ(Utf32ToUtf8(kU32Str), kExpected);
 }
 
+TEST(UnicodeTest, StrAppendChar32) {
+  std::string result;
+  StrAppendChar32(&result, 'A');
+  EXPECT_EQ(result, "A");
+  StrAppendChar32(&result, U'あ');
+  EXPECT_EQ(result, "Aあ");
+  StrAppendChar32(&result, 0x110000);
+  EXPECT_EQ(result, "Aあ\uFFFD");
+}
+
+TEST(UnicodeTest, Char32ToUtf8) {
+  EXPECT_EQ(Char32ToUtf8('A'), "A");
+  EXPECT_EQ(Char32ToUtf8(U'あ'), "あ");
+  EXPECT_EQ(Char32ToUtf8(0x110000), "\uFFFD");
+}
+
 TEST(UnicodeTest, IsValidUtf8) {
   EXPECT_TRUE(IsValidUtf8(""));
   EXPECT_TRUE(IsValidUtf8("abc"));
@@ -157,6 +173,9 @@ TEST(UnicodeTest, Utf8Substring) {
   EXPECT_EQ(Utf8Substring("五十音ABC", 2, 2), "音A");
   EXPECT_EQ(Utf8Substring("Mozc は便利", 6, 100), "便利");
   EXPECT_EQ(Utf8Substring("日本語", 2, 0), "");
+
+  // Invalid sequence.
+  EXPECT_EQ(Utf8Substring("\xF0\x80\x80\xAF", 1, 2), "\x80\x80");
 }
 
 struct Utf8AsCharsTestParam {