-
Notifications
You must be signed in to change notification settings - Fork 1
/
spec.ctr
113 lines (97 loc) · 3.32 KB
/
spec.ctr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Configure Broom before the imports below run.
# The limit is the product 1024 * 1024 * 1024 * 8 (presumably a byte count, i.e. 8 GiB -- TODO confirm the unit).
Broom memoryLimit: 1024 * 1024 * 1024 * 8.
# NOTE(review): autoAlloc semantics are not visible in this file -- confirm against the Broom documentation.
Broom autoAlloc: True.
import main: \*.
import
Library/Test: 'describe'
.
describe Lexer do: {:it:let
# Spec group for Lexer: checks that the object is defined and that split: breaks
# Persian text into the expected word and punctuation tokens.
# The block receives the two DSL helpers it (declares a test) and let (declares a fixture).

# Referencing Lexer must not raise a String error.
it exists do: {
{ Lexer. } should not raiseError: String.
}.
# Fixture lexer: the block yields the Lexer object itself (no new instance is created here).
let[Reflect thisContext] lexer { Lexer }.
# Fixture collist: reads the file named collation, splits it on newlines, and
# splits each line on a space with max: 1.
# NOTE(review): presumably max: 1 limits the split to one cut, giving pairs -- confirm.
let[Reflect thisContext] collist {
File new: 'collation', read split: '\n',
fmap: {:x ^x split: ' ' max: 1.}
}.
# Asserts lexer is not Nil, then sends collate: with the collation list.
# NOTE(review): the collate: call looks like setup that the split: tests below
# rely on, which would make them depend on this test running first -- confirm.
it ~ 'is not Nil' do: {
lexer should not = Nil.
lexer collate: collist.
}.
# Two words separated by one space yield two tokens.
it can lex simple persian sentences do: {
(lexer split: 'کلمات فارسی')
should = ['کلمات', 'فارسی'].
}.
# Parentheses and each exclamation mark are expected as separate tokens.
it can lex persian sentences with half-width spaces do: {
(lexer split: 'این جمله (خیلی) پیچیده نیست!!!')
should = (['این' ,'جمله' ,'(' ,'خیلی' ,')' ,'پیچیده' ,'نیست', '!', '!', '!']).
}.
# Longer sentence; the question mark is expected as its own token.
# The commented-out Pen writeln: wrapper and its commented-out terminator below
# are leftover debugging output; the assertion is the last expression in the block.
it can lex arbitrary persian text do: {
# Pen writeln:
(lexer split: 'میشدید اون پارسر ترم رو دیدی؟ یه تیکه هست مچ رو درست پیدا میکنه ولی توشو خالی میده')
should = [
'میشدید', 'اون', 'پارسر',
'ترم', 'رو', 'دیدی', '؟', 'یه',
'تیکه', 'هست', 'مچ', 'رو', 'درست',
'پیدا', 'میکنه', 'ولی', 'توشو',
'خالی', 'میده'
].
# .
}.
}.
#:language XFrozen
describe TokenAnalyser do: {:it:let
# Spec group for TokenAnalyser: tokenises sample documents, runs the TF-IDF
# step, then runs keyword detection and writes the result to a file.
# The three pipeline tests share state through the fixtures declared below and
# only work in the order written: consolidateAll, appearsIn and allTargetTexts
# stay Nil until the tokenise test assigns them.

# Referencing TokenAnalyser must not raise a String error.
it exists do: {
{ TokenAnalyser. } should not raiseError: String.
}.
# Fixtures. analyser is built from the Lexer object via with:.
let[Reflect thisContext] lexer { Lexer }.
let[Reflect thisContext] analyser { TokenAnalyser with: lexer }.
# glblctx is not referenced anywhere else in this spec.
let[Reflect thisContext] glblctx { Reflect thisContext }.
# Placeholders, assigned by the tokenise test below.
let[Reflect thisContext] consolidateAll { Nil }.
let[Reflect thisContext] appearsIn { Nil }.
let[Reflect thisContext] allTargetTexts { Nil }.
# Builds the document list and runs consolidate:pre:post: over it.
# The list is made from the entries of the samples directory whose file name
# starts with sample, sliced with from: 0 length: 60; each entry becomes a pair
# of a File under out/ and the result of read on the File under samples/.
# NOTE(review): fmap!: is presumably the in-place variant of fmap: -- confirm.
# NOTE(review): tests:, progressesUpTo:, progress: and show look like progress
# reporting on the test object -- confirm against Library/Test.
# The test makes no should assertion; it stores element 0 of the result in
# consolidateAll and element 1 in appearsIn.
it can tokenise do: {:thisTest
allTargetTexts is (
File
list: 'samples',
filter: (\:_:descr (descr @ 'file' startsWith: 'sample')),
from: 0 length: 60,
fmap!: \:descr [(File new: 'out/' + (descr @ 'file')), (File new: 'samples/' + (descr @ 'file'), read)]
).
thisTest tests: 'Tokeniser'.
thisTest progressesUpTo: allTargetTexts count.
var res is analyser
consolidate: allTargetTexts
pre: {:i thisTest progress: i. }
post: { thisTest show. }.
consolidateAll is res @ 0.
appearsIn is res @ 1.
}.
# Runs calculateTF-IDF:withMap:pre:post: on the state left by the tokenise test.
# The return value is discarded and nothing is asserted.
# NOTE(review): presumably the call updates consolidateAll in place, since the
# next test passes the same object to detectKeywords: -- confirm.
# The commented-out lines at the end would dump each key and value to a file named tfidf.
it can calculate TF-IDF do: {:thisTest
thisTest tests: 'TF-IDF'.
thisTest progressesUpTo: consolidateAll count.
analyser
calculateTF-IDF: consolidateAll
withMap: appearsIn
pre: {:i thisTest progress: i. }
post: { thisTest show. }
.
# var outf is File new: 'tfidf', open: 'w+'.
# consolidateAll each: {:k:v
# outf write: '%s -- %:L\n' % [k, ',\t', v].
# }.
# outf close.
}.
# Runs detectKeywords:forDocuments:pre:post: and writes one formatted line per
# document to the file named keywords, opened with mode w+; nothing is asserted.
# NOTE(review): the pre: callback reports i + 1 here, while the two tests above
# report i -- confirm which is intended.
it can detect keywords do: {:thisTest
thisTest tests: 'Keyword detection'.
thisTest progressesUpTo: allTargetTexts count.
var keywords-for-docs is analyser
detectKeywords: consolidateAll
forDocuments: allTargetTexts
pre: {:i thisTest progress: i + 1. }
post: { thisTest show. }.
# Pen writeln: keywords-for-docs.
var kfs is File new: 'keywords', open: 'w+'.
# imap: passes index i and element x; the formatted lines are combined with sum before the write.
kfs write: (keywords-for-docs imap: \:i:x 'document $$i - %L (frq=%L)\n' % x, sum).
kfs close.
}.
}.