-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathJXLDStringTokenUtilities.m
93 lines (73 loc) · 3.61 KB
/
JXLDStringTokenUtilities.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
//
// JXLDStringTokenUtilities.m
// Damerau-Levenshtein
//
// Created by Jan on 04.05.12.
// Copyright (c) 2012-2015 geheimwerk.de. All rights reserved.
//
#import "JXLDStringTokenUtilities.h"
// Marker bit that is OR-ed into a token type to flag an entry as a "gap",
// i.e. a stretch of the string that the tokenizer did not report as a token.
// Only the most significant bit of CFOptionFlags is set
// (presumably to stay clear of the token type bits CoreFoundation itself defines — TODO confirm).
CFOptionFlags jxst_kCFStringTokenizerTokenIsGap = (CFOptionFlags)1 << ((sizeof(CFOptionFlags) * CHAR_BIT) -1);
// Minimal growable array of token ranges with an optional parallel array of token types.
typedef struct {
// Token ranges; the first `used` entries are valid.
CFRange *array;
// Token type for each entry in `array`, or NULL when token types are not being recorded.
CFStringTokenizerTokenType *types;
// Number of entries currently stored.
size_t used;
// Number of entries the buffers currently have room for.
size_t capacity;
} TokenRangesArray;
// Makes sure there is room for at least one more entry.
//
// When the array is full, the capacity is doubled and the ranges buffer is reallocated;
// the types buffer is reallocated as well if type recording is enabled (`types` != NULL).
//
// Returns TRUE if a free slot is available.
// Returns FALSE if the array is unusable because this or an earlier allocation failed
// (or the new size would overflow size_t). In that case both buffers have been released
// and `array`, `types`, `used` and `capacity` are reset to NULL/0, so the struct describes
// an empty result instead of holding dangling pointers. The failed state is sticky.
CF_INLINE Boolean assureTokenRangesArrayCapacity(TokenRangesArray *tokenRanges_p) {
    if ((tokenRanges_p->array != NULL) && (tokenRanges_p->used < tokenRanges_p->capacity)) {
        return TRUE;
    }

    Boolean ok = FALSE;

    if (tokenRanges_p->array != NULL) {
        const size_t oldCapacity = tokenRanges_p->capacity;
        const Boolean recordsTypes = (tokenRanges_p->types != NULL);

        // Refuse to grow if doubling the capacity would overflow either byte count.
        const Boolean fits = (oldCapacity <= (SIZE_MAX / 2) / sizeof(CFRange))
            && (oldCapacity <= (SIZE_MAX / 2) / sizeof(CFStringTokenizerTokenType));

        if (fits) {
            // A capacity of 0 can never grow by doubling; fall back to a small default.
            const size_t newCapacity = (oldCapacity > 0) ? (oldCapacity * 2) : 4;

            // reallocf() frees the original buffer when it fails,
            // so a NULL result never leaves a stale pointer behind.
            tokenRanges_p->array = reallocf(tokenRanges_p->array, (newCapacity * sizeof(CFRange)));
            if (recordsTypes) {
                tokenRanges_p->types = reallocf(tokenRanges_p->types, (newCapacity * sizeof(CFStringTokenizerTokenType)));
            }

            ok = (tokenRanges_p->array != NULL) && (!recordsTypes || (tokenRanges_p->types != NULL));
            if (ok) {
                tokenRanges_p->capacity = newCapacity;
            }
        }
    }

    if (!ok) {
        // Enter (or stay in) the failed state: release whatever is left and report an empty array.
        free(tokenRanges_p->array);
        free(tokenRanges_p->types);
        tokenRanges_p->array = NULL;
        tokenRanges_p->types = NULL;
        tokenRanges_p->used = 0;
        tokenRanges_p->capacity = 0;
    }

    return ok;
}

// Appends one token range (and its type, if types are being recorded) to the array,
// growing the buffers when necessary.
//
// Returns TRUE if the entry was stored. Returns FALSE, without storing anything,
// if the array is in the failed state described above.
CF_INLINE Boolean addToTokenRangesArray(TokenRangesArray *tokenRanges_p, CFRange tokenRange, CFStringTokenizerTokenType tokenType) {
    if (!assureTokenRangesArrayCapacity(tokenRanges_p)) {
        return FALSE;
    }

    const size_t slot = tokenRanges_p->used;
    if (tokenRanges_p->types != NULL) {
        tokenRanges_p->types[slot] = tokenType;
    }
    tokenRanges_p->array[slot] = tokenRange;
    tokenRanges_p->used = slot + 1;

    return TRUE;
}
// Tokenizes `tokenizerRange` of `string` and returns the token ranges as a malloc()ed array.
//
// Parameters:
//   string            The string to tokenize.
//   tokenizerRange    The range of `string` to tokenize.
//   tokenizerOptions  Options passed on to CFStringTokenizerCreate().
//   ranges            Out: receives the array of token ranges. Must not be NULL.
//   types             Out, optional: receives a parallel array of token types. Pass NULL to skip.
//
// Returns the number of entries stored in `*ranges` (and `*types`).
//
// Unless `tokenizerOptions` is exactly kCFStringTokenizerUnitWord, any stretch of
// `tokenizerRange` that the tokenizer skipped is reported as an additional entry
// whose type is (kCFStringTokenizerTokenNormal | jxst_kCFStringTokenizerTokenIsGap).
//
// Ownership: the caller must free() both `*ranges` and, if requested, `*types`.
// If memory allocation or tokenizer creation fails, 0 is returned and the out
// pointers are set to NULL.
size_t jxst_CFStringPrepareTokenRangesArray(CFStringRef string, CFRange tokenizerRange, CFOptionFlags tokenizerOptions, CFRange **ranges, CFStringTokenizerTokenType **types) {
    // This function contains a very crude pseudo-dynamic array implementation as it is a pain to work with CFRange structs and CFArray objects.
    const Boolean wantsTypes = (types != NULL);

    // Start out with an empty result so every early return leaves the out parameters defined.
    *ranges = NULL;
    if (wantsTypes) {
        *types = NULL;
    }

    TokenRangesArray tokenRanges = {
        .used = 0,
        .capacity = 4,
    };
    tokenRanges.array = malloc(tokenRanges.capacity * sizeof(CFRange));
    tokenRanges.types = wantsTypes ? malloc(tokenRanges.capacity * sizeof(CFStringTokenizerTokenType)) : NULL;

    CFStringTokenizerRef tokenizer = NULL;
    if ((tokenRanges.array != NULL) && (!wantsTypes || (tokenRanges.types != NULL))) {
        tokenizer = CFStringTokenizerCreate(kCFAllocatorDefault, string, tokenizerRange, tokenizerOptions, NULL);
    }

    if (tokenizer == NULL) {
        // Allocation or tokenizer creation failed. CFRelease(NULL) would crash, so bail out here.
        free(tokenRanges.array);
        free(tokenRanges.types);
        return 0;
    }

    Boolean detectGaps = (tokenizerOptions != kCFStringTokenizerUnitWord);

    // All indexes are in string coordinates, so tokenization and gap tracking
    // have to start at the beginning of the tokenized range, not at index 0.
    const CFIndex rangeStart = tokenizerRange.location;
    const CFIndex rangeEnd = rangeStart + tokenizerRange.length;

    // Set tokenizer to the start of the tokenized range.
    CFStringTokenizerTokenType tokenType = CFStringTokenizerGoToTokenAtIndex(tokenizer, rangeStart);
    CFStringTokenizerTokenType gapTokenType = (kCFStringTokenizerTokenNormal | jxst_kCFStringTokenizerTokenIsGap);

    CFIndex prevTokenRangeMax = rangeStart;

    while (tokenType != kCFStringTokenizerTokenNone) {
        CFRange tokenRange = CFStringTokenizerGetCurrentTokenRange(tokenizer);

        if (detectGaps && (tokenRange.location > prevTokenRangeMax)) {
            // Gaps are expected behaviour when using kCFStringTokenizerUnitWord,
            // but for some reason, gaps in other tokenizations can appear.
            // One particular example is the tokenizer skipping a line feed ('\n') directly after a string of Chinese characters when using kCFStringTokenizerUnitWordBoundary.
            CFRange gapRange = CFRangeMake(prevTokenRangeMax, (tokenRange.location - prevTokenRangeMax));
            addToTokenRangesArray(&tokenRanges, gapRange, gapTokenType);
        }

        addToTokenRangesArray(&tokenRanges, tokenRange, tokenType);
        prevTokenRangeMax = (tokenRange.location + tokenRange.length);

        tokenType = CFStringTokenizerAdvanceToNextToken(tokenizer);
    }

    // Report whatever is left between the last token and the end of the tokenized range.
    if (detectGaps && (rangeEnd > prevTokenRangeMax)) {
        CFRange gapRange = CFRangeMake(prevTokenRangeMax, (rangeEnd - prevTokenRangeMax));
        addToTokenRangesArray(&tokenRanges, gapRange, gapTokenType);
    }

    CFRelease(tokenizer);

    if ((tokenRanges.array == NULL) || (wantsTypes && (tokenRanges.types == NULL))) {
        // A buffer was lost while growing (reallocf() releases the buffer when it fails).
        // Never hand out a non-zero count together with a NULL array.
        free(tokenRanges.array);
        free(tokenRanges.types);
        return 0;
    }

    *ranges = tokenRanges.array;
    if (wantsTypes) {
        *types = tokenRanges.types;
    }

    return tokenRanges.used;
}