-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathVocabulary.hpp
32 lines (23 loc) · 983 Bytes
/
Vocabulary.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#pragma once
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <string>
#include "Matrix.hpp"
/// Holds token-level and character-n-gram-level lookup tables together with
/// their occurrence counts, plus two auxiliary tables (noiseDistribution and
/// discardProb).
///
/// Only declarations are visible in this header; the construction logic lives
/// in the out-of-line definitions.
/// NOTE(review): the semantic descriptions below are inferred from member names
/// and types -- confirm against the implementation file.
class Vocabulary{
public:
  /// Unsigned type used for token and n-gram indices.
  using INDEX = unsigned int;
  /// Unsigned type used for occurrence counts.
  using COUNT = unsigned long int;

  /// Creates an empty vocabulary: every container is empty and unkIndex is 0.
  Vocabulary() {}

  /// Builds the vocabulary (defined out of line).
  /// @param inputFile          presumably the path of the corpus to read -- TODO confirm.
  /// @param wordFreqThreshold  presumably the minimum count for a word to be kept -- TODO confirm.
  /// @param nGramFreqThreshold presumably the minimum count for a character n-gram to be kept -- TODO confirm.
  Vocabulary(const std::string& inputFile, const Vocabulary::COUNT wordFreqThreshold, const Vocabulary::COUNT nGramFreqThreshold);

  /// Maps a token string to an INDEX (presumably a position in tokenListCount).
  std::unordered_map<std::string, Vocabulary::INDEX> tokenIndex;
  /// (token string, count) pairs.
  std::vector<std::pair<std::string, Vocabulary::COUNT> > tokenListCount;
  /// Index presumably reserved for unknown tokens. Explicitly zero-initialised
  /// so that a default-constructed Vocabulary never exposes an indeterminate value.
  Vocabulary::INDEX unkIndex{0};

  /// Maps an n-gram string to an INDEX (presumably a position in ngramListCount).
  std::unordered_map<std::string, Vocabulary::INDEX> ngramIndex;
  /// (n-gram string, count) pairs.
  std::vector<std::pair<std::string, Vocabulary::COUNT> > ngramListCount;

  /// Table of indices; presumably sampled from to draw noise (negative) examples -- TODO confirm.
  std::vector<Vocabulary::INDEX> noiseDistribution;
  /// One Real per entry; presumably a per-token discard (subsampling) probability -- TODO confirm.
  /// `Real` is not declared in this header; presumably it comes from Matrix.hpp.
  std::vector<Real> discardProb;

  /// Character n-gram extraction for `str`, reporting results as a set of
  /// INDEX values through the out-parameter `ngram` (defined out of line).
  /// NOTE(review): whether `ngram` is cleared first or appended to is not visible here.
  void extractCharNgram(const std::string& str, std::unordered_set<Vocabulary::INDEX>& ngram);

private:
  /// Private overload that reports into a string -> COUNT map instead of an
  /// index set (defined out of line); presumably used while counting n-grams
  /// during construction -- TODO confirm.
  void extractCharNgram(const std::string& str, std::unordered_map<std::string, Vocabulary::COUNT>& ngramCount);
};