-
Notifications
You must be signed in to change notification settings - Fork 46
/
Copy pathStemmer.pyx
221 lines (182 loc) · 8.32 KB
/
Stemmer.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# Stemmer.pyx: Copyright (c) 2006, Richard Boulton
# Pyrex interface to the snowball "libstemmer_c" library.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
r"""Stemmer: stemming algorithms from the snowball project.
"""
__docformat__ = "restructuredtext en"
# Alias that Cython treats as ``char **`` but that is spelled ``const char **``
# in the generated C, matching the type used for sb_stemmer_list() below.
cdef extern from *:
    ctypedef char** const_char_ptr_ptr "const char **"


# CPython C-API helper used to build the stemmed text from a (pointer, length)
# pair.
cdef extern from "Python.h":
    object PyUnicode_FromStringAndSize(const char * s, Py_ssize_t len)


# Declarations taken from the snowball "libstemmer_c" header.
cdef extern from "libstemmer.h":
    # Opaque stemmer handle; only ever used through a pointer in this file.
    struct sb_stemmer
    ctypedef unsigned char sb_symbol

    # Array of algorithm names; callers in this file read it up to the first
    # NULL entry.
    const_char_ptr_ptr sb_stemmer_list()

    # Create / destroy a stemmer handle.  This file treats a NULL result from
    # sb_stemmer_new() as "algorithm not found".
    sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc)
    void sb_stemmer_delete(sb_stemmer * stemmer)

    # Stem ``size`` symbols of ``word``; sb_stemmer_length() is queried right
    # after each call to obtain the length of the returned buffer.
    const sb_symbol * sb_stemmer_stem(sb_stemmer * stemmer, const sb_symbol * word, int size)
    int sb_stemmer_length(sb_stemmer * stemmer)
def algorithms(aliases=False):
    """Get a list of the names of the available stemming algorithms.

    Note that there are also aliases for these algorithm names, which are not
    included in this list by default.  If the 'aliases' keyword parameter is
    False, this list is guaranteed to contain precisely one entry for each
    available stemming algorithm.  Otherwise, all known aliases for algorithms
    will be included in the list.

    NOTE(review): the body below never reads ``aliases``; the returned list is
    exactly what ``sb_stemmer_list()`` reports, whatever value is passed.

    Note that the classic Porter stemming algorithm for English is
    available by default: although this has been superseded by an improved
    algorithm, the original algorithm may be of interest to information
    retrieval researchers wishing to reproduce results of earlier
    experiments.  Most users will want to use the "english" algorithm,
    instead of the "porter" algorithm.

    Returns a new list of algorithm names decoded from ASCII.
    """
    cdef const_char_ptr_ptr names = sb_stemmer_list()
    cdef int idx = 0

    collected = []
    # The C array is walked until its NULL entry is reached.
    while names[idx] != NULL:
        collected.append((<bytes>names[idx]).decode(u"ascii"))
        idx += 1
    return collected
def version():
    """Return the version string of the Stemmer module.

    The value identifies the module as a whole, not any individual stemming
    algorithm.
    """
    return '2.0.1'
cdef class Stemmer:
    """An instance of a stemming algorithm.

    The algorithm has internal state, so must not be called concurrently.
    ie, only a single thread should access the instance at any given time.

    When creating a `Stemmer` object, there is one required argument: the
    name of the algorithm to use in the new stemmer.  A list of the valid
    algorithm names may be obtained by calling the `algorithms()` function
    in this module.  In addition, the appropriate stemming algorithm for a
    given language may be obtained by using the 2 or 3 letter ISO 639
    language codes.

    A second optional argument to the constructor for `Stemmer` is the size
    of cache to use.  The cache implemented in this module is not terribly
    efficient, but benchmarks show that it approximately doubles
    performance for typical text processing operations, without too much
    memory overhead.  The cache may be disabled by passing a size of 0.
    The default size (10000 words) is probably appropriate in most
    situations.  In pathological cases (for example, when no word is
    presented to the stemming algorithm more than once, so the cache is
    useless), the cache can severely damage performance.

    The "benchmark.py" script supplied with the PyStemmer distribution can
    be used to test the performance of the stemming algorithms with various
    cache sizes.
    """

    # Handle returned by sb_stemmer_new(); NULL until __init__ has succeeded.
    cdef sb_stemmer * cobj
    # dict mapping the UTF-8 encoded word to [stemmed text, counter at last use].
    cdef object cache
    # Incremented on every cache hit and every cache insertion.
    cdef object counter
    # Caching is only used while this is greater than zero.
    cdef int max_cache_size

    def __init__ (self, algorithm, int maxCacheSize = 10000):
        """Initialise a stemmer.

        See the class documentation for details.

        Raises ``KeyError`` when ``sb_stemmer_new()`` returns NULL for
        ``algorithm``.
        """
        cdef sb_stemmer * fresh
        alg = algorithm.encode(u'ascii')
        # A bytes literal converts to ``const char *`` regardless of the
        # language level the module is compiled with.
        fresh = sb_stemmer_new(alg, b'UTF_8')
        if fresh == NULL:
            raise KeyError("Stemming algorithm '%s' not found" % algorithm)
        # __init__ can be invoked again on a live object; release the handle
        # from the previous run only once the new one exists, so a failed
        # re-initialisation leaves the instance usable and nothing is leaked.
        if self.cobj != NULL:
            sb_stemmer_delete(self.cobj)
        self.cobj = fresh
        self.max_cache_size = maxCacheSize
        self.counter = 0
        self.cache = {}

    def __dealloc__ (self):
        # cobj stays NULL when __init__ never ran or raised.
        if self.cobj != NULL:
            sb_stemmer_delete(self.cobj)
            self.cobj = NULL

    property maxCacheSize:
        """Maximum number of entries to allow in the cache.

        This may be set to zero to disable the cache entirely.

        The maximum cache size may be set at any point - setting the
        maximum size will purge entries from the cache if the new maximum
        size is smaller than the current size.
        """
        def __set__(self, int size):
            self.max_cache_size = size
            if size <= 0:
                # stemWord() ignores the cache for any non-positive size, so
                # drop the stored entries for negative values as well as zero.
                self.cache = {}
                self.counter = 0
            else:
                self.__purgeCache()

        def __get__(self):
            return self.max_cache_size

    def __purgeCache (self):
        """Drop the least recently used entries once the cache is full.

        Does nothing while the cache holds fewer than ``max_cache_size``
        entries.  Otherwise only entries whose last-use counter is greater
        than ``counter - 80% of max_cache_size`` are kept.
        """
        if len(self.cache) < self.max_cache_size:
            return
        # Floor division gives the same value as the former
        # ``int(size * 8 / 10)`` for every positive size, independent of how
        # ``/`` is interpreted for C integers.
        mincounter = self.counter - (self.max_cache_size * 8) // 10
        self.cache = {
            word: cacheditem
            for word, cacheditem in self.cache.items()
            if cacheditem[1] > mincounter
        }

    cdef object _stem(self, object word):
        """Run the C stemmer on UTF-8 encoded ``word`` and return the stem as text.

        The caller must have checked that ``self.cobj`` is not NULL.
        Raises ``MemoryError`` when ``sb_stemmer_stem()`` returns NULL.
        """
        cdef const char * c_word
        cdef const sb_symbol * stemmed
        c_word = word
        stemmed = sb_stemmer_stem(self.cobj, <const sb_symbol *>c_word, len(word))
        if stemmed == NULL:
            # Never hand a NULL buffer to PyUnicode_FromStringAndSize().
            raise MemoryError()
        return PyUnicode_FromStringAndSize(<const char *>stemmed,
                                           sb_stemmer_length(self.cobj))

    def stemWord (self, word):
        """Stem a word.

        This takes a single argument, ``word``, which should either be a UTF-8
        encoded string, or a unicode object.

        The result is the stemmed form of the word.  If the word supplied
        was a unicode object, the result will be a unicode object: if the
        word supplied was a string, the result will be a UTF-8 encoded
        string.

        Raises ``ValueError`` if the stemmer was never successfully
        initialised, and ``MemoryError`` if the C library returns no result.
        """
        if self.cobj == NULL:
            # Reached when __init__ was bypassed or failed; calling into the
            # C library with a NULL handle would crash the interpreter.
            raise ValueError("Stemmer has not been initialised")

        was_unicode = isinstance(word, unicode)
        if was_unicode:
            word = word.encode(u'utf-8')

        if self.max_cache_size > 0:
            cacheditem = self.cache.get(word)
            if cacheditem is not None:
                result = cacheditem[0]
                # Record when this entry was last used.
                cacheditem[1] = self.counter
                self.counter = self.counter + 1
            else:
                result = self._stem(word)
                self.cache[word] = [result, self.counter]
                self.counter = self.counter + 1
                self.__purgeCache()
        else:
            result = self._stem(word)

        if not was_unicode:
            return result.encode(u'utf-8')
        return result

    def stemWords (self, words):
        """Stem a list of words.

        This takes a single argument, ``words``, which must be a sequence,
        iterator, generator or similar.

        The entries in ``words`` should either be UTF-8 encoded strings, or
        unicode objects.

        The result is a list of the stemmed forms of the words.  If the
        word supplied was a unicode object, the stemmed form will be a
        unicode object: if the word supplied was a string, the stemmed form
        will be a UTF-8 encoded string.
        """
        return [self.stemWord(word) for word in words]