forked from goldendict/goldendict
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfolding.hh
84 lines (62 loc) · 2.98 KB
/
folding.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* This file is (c) 2008-2012 Konstantin Isakov <[email protected]>
* Part of GoldenDict. Licensed under GPLv3 or later, see the LICENSE file */
#ifndef __FOLDING_HH_INCLUDED__
#define __FOLDING_HH_INCLUDED__
#include "wstring.hh"
/// Folding provides means to translate several possible ways to write a
/// symbol into one. This facilitates searching. Here we currently perform
/// full case folding (everything gets translated to lowercase, ligatures
/// and complex letters are decomposed), diacritics folding (all diacritic
/// marks get removed) and whitespace/punctuation marks removal. These
/// transforms are done according to the Unicode standard and/or drafts. The
/// exact algorithms, lists and types of folding performed might get changed
/// in the future -- in this case, the Version field will be bumped up.
namespace Folding {
// Make the project's wide string and wide character types usable unqualified
// in the declarations below.
using gd::wstring;
using gd::wchar;
/// The algorithm's version. It is bumped up whenever the exact algorithms,
/// lists or types of folding performed get changed.
/// NOTE(review): presumably any stored data derived from folded strings has to
/// be regenerated when this value changes -- confirm against the callers.
enum
{
Version = 5
};
/// Applies the folding algorithm to each character in the given string,
/// making another one as a result. The argument is taken by const reference;
/// the folded text is returned as a new string.
wstring apply( wstring const & );
/// Applies only the simple case folding algorithm and returns the result as a
/// new string. Since many dictionaries have different case style, we interpret
/// words differing only by case as synonyms.
wstring applySimpleCaseOnly( wstring const & );
/// Applies only the full case folding algorithm and returns the result as a
/// new string. This includes simple case, but also decomposing ligatures and
/// complex letters.
wstring applyFullCaseOnly( wstring const & );
/// Applies only the diacritics folding algorithm and returns the result as a
/// new string.
wstring applyDiacriticsOnly( wstring const & );
/// Applies only the punctuation folding algorithm and returns the result as a
/// new string.
wstring applyPunctOnly( wstring const & );
/// Applies only the whitespace folding algorithm and returns the result as a
/// new string.
wstring applyWhitespaceOnly( wstring const & );
/// Applies only the combined whitespace & punctuation folding algorithm and
/// returns the result as a new string.
wstring applyWhitespaceAndPunctOnly( wstring const & );
/// Returns true if the given character is any form of whitespace, false
/// otherwise. Whitespace corresponds to the Zl/Zp/Zs Unicode classes, and also
/// includes \n, \r and \t.
bool isWhitespace( wchar ch );
/// Returns true if the given character is any form of punctuation, false
/// otherwise. Punctuation corresponds to the Pc/Pd/Pe/Pf/Pi/Po/Ps Unicode
/// classes.
bool isPunct( wchar ch );
/// Removes any whitespace or punctuation from the beginning and the end of
/// the word. The trimmed word is returned as a new string.
wstring trimWhitespaceOrPunct( wstring const & );
/// Removes any whitespace from the beginning and the end of the word. The
/// trimmed word is returned as a new string.
wstring trimWhitespace( wstring const & );
/// Turns any sequences of consecutive whitespace into a single basic space.
/// The string is passed by non-const reference and nothing is returned, so
/// the normalized text is written back into the argument itself.
void normalizeWhitespace( wstring & );
/// Same as apply( wstring ), but without any heap operations, therefore
/// preferable when there are many strings to process. Returns -1 if the
/// operation succeeded, or otherwise the minimum value of outSize required
/// to succeed.
/// Currently commented out, consider implementing it in case indices'
/// generation would be too slow.
//ssize_t apply( wchar const * in, wchar * out, size_t outSize );
} // namespace Folding
#endif