This repository has been archived by the owner on Nov 13, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMITIE.cpp
194 lines (159 loc) · 5.39 KB
/
MITIE.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
/**
* MITIE.cpp (MITIE-PHP Project)
*
* An extension that bridges the MITIE information extraction library with PHP
* Most of the functionality here is based on the C++ examples in: MITIE/examples/cpp/
*
* @author: https://github.com/rjjakes/
*/
#include <cstdlib>
#include <exception>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#include <phpcpp.h>
#include <mitie.h>
#include <mitie/named_entity_extractor.h>
#include <mitie/conll_tokenizer.h>
// Namespaces to use
using namespace std;
using namespace mitie;
// ----------------------------------------------------------------------------------------
// NER helper function.
std::vector<string> tokenize_file (
const string& filename
)
{
ifstream fin(filename.c_str());
if (!fin)
{
cout << "Unable to load input text file" << endl;
exit(EXIT_FAILURE);
}
// The conll_tokenizer splits the contents of an istream into a bunch of words and is
// MITIE's default tokenization method.
conll_tokenizer tok(fin);
std::vector<string> tokens;
string token;
// Read the tokens out of the file one at a time and store into tokens.
while(tok(token))
tokens.push_back(token);
return tokens;
}
// ----------------------------------------------------------------------------------------
// Define the NER class
//
class MITIENer : public Php::Base
{
private:
string _classname;
named_entity_extractor _ner;
std::vector<string> _tagstr;
std::vector<string> _tokens;
std::vector<pair<unsigned long, unsigned long> > _chunks;
std::vector<unsigned long> _chunk_tags;
std::vector<double> _chunk_scores;
public:
MITIENer()
{
// cout << this << endl;
// cout << _x << endl;
}
virtual ~MITIENer()
{
}
virtual void __construct()
{
}
virtual void __destruct()
{
}
void loadModel(Php::Parameters ¶ms)
{
// Check the corretc number of parameters have been passed
if (params.size() != 1)
{
cout << "loadModel must contain the path to a NER model file." << endl;
return;
}
// Load the model file and setup the class ready for extraction
// Load MITIE's named entity extractor from disk. Each file in the MITIE-models
// folder begins with a string containing the name of the serialized class. In
// this case classname contains "mitie::named_entity_extractor". It can be used to
// identify what is in any particular file. However, in this example we don't need
// it so it is just ignored.
dlib::deserialize(params[0]) >> _classname >> _ner;
// Print out what kind of tags this tagger can predict.
_tagstr = _ner.get_tag_name_strings();
}
Php::Value getTags()
{
// @todo - check this array is set
return _tagstr;
}
void extraction(Php::Parameters ¶ms)
{
if (params.size() != 1)
{
cout << "extraction() must contain the path to a text file." << endl;
return;
}
// Before we can try out the tagger we need to load some data.
_tokens = tokenize_file(params[0]);
// Now detect all the entities in the text file we loaded and print them to the screen.
// The output of this function is a set of "chunks" of tokens, each a named entity.
// Additionally, if it is useful for your application a confidence score for each "chunk"
// is available by using the predict() method. The larger the score the more
// confident MITIE is in the tag.
_ner.predict(_tokens, _chunks, _chunk_tags, _chunk_scores);
}
Php::Value getEntities()
{
Php::Array entities;
// @todo - check input variables exists
for (unsigned int i = 0; i < _chunks.size(); ++i)
{
Php::Array entity;
std::string tokens = "";
entity["tag_id"] = (int)_chunk_tags[i];
entity["score"] = (float)_chunk_scores[i];
entity["tag"] = _tagstr[_chunk_tags[i]];
// _chunks[i] defines a half open range in tokens that contains the entity.
for (unsigned long j = _chunks[i].first; j < _chunks[i].second; ++j)
{
if (j != _chunks[i].first)
tokens += " ";
tokens += (string)_tokens[j];
}
entity["tokens"] = tokens;
// Add this entity to the enitites array
entities[i] = entity;
}
return entities;
}
};
/**
* Export classes via PHP-CPP
*/
// symbols are exported according to the "C" language
extern "C"
{
    /**
     * Module entry point, exported with C linkage for the Zend engine to call.
     *
     * Registers the MITIENer class and its methods with the extension and
     * returns the resulting module entry.
     */
    PHPCPP_EXPORT void *get_module()
    {
        // the extension object is static so it persists after this call returns
        static Php::Extension mitieExtension("MITIE","0.1");

        // describe the PHP-visible class and the methods it exposes
        Php::Class<MITIENer> nerClass("MITIENer");
        nerClass.method<&MITIENer::getEntities>("getEntities");
        nerClass.method<&MITIENer::getTags>("getTags");
        nerClass.method<&MITIENer::loadModel>("loadModel");
        nerClass.method<&MITIENer::extraction>("extraction");
        nerClass.method<&MITIENer::__construct>("__construct");

        // attach the class to the extension
        mitieExtension.add(nerClass);

        // hand the module entry back to the caller
        return mitieExtension.module();
    }
}