-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgram_export_wf.py
56 lines (40 loc) · 2.04 KB
/
gram_export_wf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import os
import pickle
from technews_nlp_aggregator.nlp_model.publish import GramFacade
from technews_nlp_aggregator.common import load_config
from datetime import datetime
import yaml
import sys
def _repoint_symlink(target, link_path):
    """Make ``link_path`` a symbolic link to ``target``.

    An existing symlink at ``link_path`` is removed first. Anything else that
    already occupies ``link_path`` (e.g. a regular file) is deliberately left
    alone, in which case ``os.symlink`` fails instead of data being deleted.
    """
    if os.path.islink(link_path):
        os.unlink(link_path)
    os.symlink(target, link_path)


def export_to_phrases(config):
    """Export the pickled texts as bigrams and trigrams and relink the results.

    Loads the pickled texts, runs them through the phrase models held by
    ``GramFacade``, writes the bigram and trigram outputs to timestamped
    pickle files, and finally points the configured bigram/trigram symlinks
    at the freshly written files.

    Args:
        config: Mapping providing the keys ``root_dir``, ``pickle_dir``,
            ``text_pickle_file``, ``bigrams_pickle_file``,
            ``trigrams_pickle_file`` and ``phrases_model_dir_link``. Every
            path is built by plain string concatenation onto ``root_dir``,
            so ``root_dir`` and ``pickle_dir`` must carry their own trailing
            separators.

    Raises:
        KeyError: If one of the configuration keys above is missing.
        OSError: If a pickle file cannot be read or written, or a symlink
            cannot be created.
    """
    root_dir = config["root_dir"]
    pickle_dir = root_dir + config["pickle_dir"]
    pickle_file = root_dir + config["text_pickle_file"]
    bigrams_link = root_dir + config["bigrams_pickle_file"]
    trigrams_link = root_dir + config["trigrams_pickle_file"]
    phrase_model_dir = root_dir + config["phrases_model_dir_link"]

    # NOTE(review): pickle.load executes arbitrary code from the file; this
    # assumes the configured pickle is produced by a trusted pipeline step.
    with open(pickle_file, 'rb') as f:
        texts = pickle.load(f)
    logging.info("Loaded {} texts".format(len(texts)))

    gram_facade = GramFacade(phrase_model_dir)
    gram_facade.load_models()
    bigrams = gram_facade.export_bigrams(texts)
    # Drop the raw texts before the trigram pass to keep peak memory down.
    del texts
    trigrams = gram_facade.export_trigrams(bigrams)
    logging.info("Saving {} texts as trigrams".format(len(trigrams)))

    # Take one timestamp for both outputs so the paired files share a suffix.
    timestamp = datetime.now().isoformat()
    bigrams_file = pickle_dir + 'bigrams_' + timestamp + '.p'
    trigrams_file = pickle_dir + 'trigrams_' + timestamp + '.p'

    with open(bigrams_file, 'wb') as f:
        pickle.dump(bigrams, f)
    with open(trigrams_file, 'wb') as f:
        pickle.dump(trigrams, f)

    # Relink only after both dumps succeeded, so the links never point at a
    # half-written export.
    _repoint_symlink(bigrams_file, bigrams_link)
    _repoint_symlink(trigrams_file, trigrams_link)
if __name__ == '__main__':
    # Script entry point: build the configuration from the command-line
    # arguments and run the bigram/trigram export with it.
    export_to_phrases(load_config(sys.argv))