-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtoken_model_wf.py
95 lines (74 loc) · 3.49 KB
/
token_model_wf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import pickle
import os
from technews_nlp_aggregator.common import load_config
import sys
from datetime import datetime
import yaml
from technews_nlp_aggregator.persistence import ArticleDatasetRepo
from technews_nlp_aggregator.nlp_model.common import ArticleLoader, defaultTokenizer
from technews_nlp_aggregator.common import load_config
import argparse
def create_pickle( config , articleLoader, tokenizer, limit=None):
    """Tokenize the loaded articles and persist them to a fresh pickle file.

    :param config: configuration dict; must contain the pickle path keys used
        by save_picke_file ("root_dir", "pickle_dir", "text_pickle_file").
    :param articleLoader: object exposing an ``articlesDF`` of loaded articles.
    :param tokenizer: object exposing ``tokenize_ddf(df)`` returning a series
        with ``tolist()``.
    :param limit: optional int; when given, only the first ``limit`` articles
        are tokenized (a dry run) and nothing is written to disk.
    :return: the list of tokenized texts (previously discarded on dry runs).
    """
    logging.info("Articles loaded : {} ".format(len(articleLoader.articlesDF)))
    # A limit selects a prefix of the dataset for a quick dry run.
    articleFilterDF = articleLoader.articlesDF[:limit] if limit else articleLoader.articlesDF
    texts_df = tokenizer.tokenize_ddf(articleFilterDF )
    texts = texts_df.tolist()
    # Only full (unlimited) runs are persisted; limited runs are dry runs.
    if (not limit):
        save_picke_file(config, texts)
    # Bug fix: dry runs used to compute the texts and throw them away.
    # Returning them lets callers inspect the result; full-run callers that
    # ignored the old None return are unaffected.
    return texts
def save_picke_file(config, texts):
    """Pickle ``texts`` to a timestamped file and repoint the 'latest' symlink.

    :param config: dict with "root_dir", "pickle_dir" (destination directory)
        and "text_pickle_file" (relative path of the stable 'latest' symlink).
    :param texts: list of tokenized texts to serialize.
    """
    core_name = 'texts_'
    # Timestamped filename so every run keeps its own snapshot.
    pickle_file = config["root_dir"]+config["pickle_dir"] + core_name + datetime.now().isoformat() + '.p'
    with open(pickle_file, 'wb') as f:
        pickle.dump(texts, f)
    # Log after the dump so the message reflects an actual successful save.
    logging.info("Articles saved in pickle file : {} ".format(len(texts)))
    link_path = config["root_dir"] + config["text_pickle_file"]
    # Bug fix: islink() is False for a stray regular file at the link path,
    # which made os.symlink raise FileExistsError. lexists() is True for
    # regular files, live symlinks and dangling symlinks alike.
    if os.path.lexists(link_path):
        os.unlink(link_path)
    os.symlink(pickle_file, link_path)
def update_pickle(config, articleLoader, tokenizer):
    """Append freshly tokenized articles to the existing texts pickle.

    Loads the current 'latest' pickle, logs its tail as a sanity check,
    tokenizes the newly loaded articles and saves the combined list via
    save_picke_file. Exits the process if no pickle exists yet.

    :param config: dict with "root_dir" and "text_pickle_file" keys.
    :param articleLoader: object exposing an ``articlesDF`` of new articles.
    :param tokenizer: object exposing ``tokenize_ddf(df)`` with ``tolist()``.
    """
    pickle_file = config["root_dir"]+config["text_pickle_file"]
    if not os.path.isfile(pickle_file):
        logging.error("File {} not found".format(pickle_file))
        logging.error("Change action to create")
        # Bug fix: use sys.exit instead of the site-provided exit() builtin,
        # which is not guaranteed outside interactive sessions.
        sys.exit(1)
    with open(pickle_file, 'rb') as f:
        texts = pickle.load(f)
    logging.info("Loaded {} texts".format(len(texts)))
    logging.info("Articles loaded : {} ".format(len(articleLoader.articlesDF)))
    # Show the tail of the existing corpus as a quick visual sanity check.
    last_texts = texts[-10:]
    for index, last_text in enumerate(last_texts):
        # 1-based position in the full corpus. Bug fix: the old
        # len(texts)-9+index produced wrong (even negative) positions when
        # fewer than 10 texts were stored.
        logging.info("=============== {} ===================".format(len(texts) - len(last_texts) + index + 1))
        logging.info(last_text)
    articlesNewDF = articleLoader.articlesDF
    new_textsDF = tokenizer.tokenize_ddf(articlesNewDF )
    new_texts = new_textsDF.tolist()
    texts = texts + new_texts
    save_picke_file(config, texts)
if __name__ == '__main__':
    # Entry point: builds ('create') or extends ('append') the tokenized-texts
    # pickle, depending on the 'tok_action' config key.
    config = load_config(sys.argv)
    action = config['tok_action']
    # Bug fix: the key file handle was opened and never closed; use a context
    # manager. Also drop the unused db_url local.
    with open(config["key_file"]) as key_file:
        db_config = yaml.safe_load(key_file)
    articleDatasetRepo = ArticleDatasetRepo(db_config.get("db_url"))
    logging.info("DB_URL: {}".format(db_config.get("db_url")))
    articleLoader = ArticleLoader(articleDatasetRepo)
    logging.info("Loading articles....")
    # Ensure the pickle output directory exists before any save is attempted.
    if not os.path.isdir(config["root_dir"] + config["pickle_dir"]):
        os.mkdir(config["root_dir"] + config["pickle_dir"])
    if (action == 'append'):
        logging.info("Appending....")
        # Only articles not yet present in the pickle are loaded.
        articleLoader.load_all_articles(load_text=True, load_only_unsaved=True)
        logging.info("Finished loading articles....")
        update_pickle(config, articleLoader, defaultTokenizer)
        articleDatasetRepo.update_to_saved()
    elif (action == 'create'):
        logging.info("Creating new pickle file....")
        # Full rebuild: load everything regardless of saved state.
        articleLoader.load_all_articles(load_text=True, load_only_unsaved=False)
        logging.info("Finished loading articles....")
        create_pickle(config, articleLoader, defaultTokenizer)
        articleDatasetRepo.update_to_saved()
    else:
        print("Please choose create or append for tok_action")
        sys.exit(1)