Skip to content

Commit

Permalink
externally call each script for each cross-validation iteration
Browse files Browse the repository at this point in the history
  • Loading branch information
emrekgn committed Apr 4, 2018
1 parent 73109a2 commit 354e771
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 24 deletions.
4 changes: 2 additions & 2 deletions build_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
export_trimmed_word2vec_vectors, get_processing_word


def main(**kwargs):
def main():
"""Procedure to build data
You MUST RUN this procedure. It iterates over the whole dataset (train,
Expand All @@ -21,7 +21,7 @@ def main(**kwargs):
"""
# get config and processing of words

config = Config(**dict(kwargs, load=False))
config = Config(load=False)
processing_word = get_processing_word(lowercase=False)

# Generators
Expand Down
45 changes: 34 additions & 11 deletions cross-validation.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import codecs
import numpy as np
from pip._vendor.progress import counter
from sklearn.model_selection import ShuffleSplit
from build_data import main as build
from train import main as train
from evaluate import main as eval
from shutil import copyfile
import subprocess
import sys

"""
Expand All @@ -15,6 +16,7 @@
sentences = []
sentence = []


def write(path, sts):
f = open(path, 'w')
for s in sts:
Expand All @@ -24,6 +26,7 @@ def write(path, sts):
f.write('\n')
f.close()


for line in codecs.open('data/celikkaya2013/input.txt', 'r', 'utf8'):
line = line.rstrip()
if not line:
Expand Down Expand Up @@ -74,21 +77,41 @@ def write(path, sts):

print("Created train, dev and test sets of iteration: %i" % count)

# Stage this fold's train/dev/test splits at fixed paths. The build/train/
# evaluate steps below are run as fresh `python3` subprocesses with no CLI
# arguments, so presumably they load these default locations — TODO confirm
# against the scripts' Config defaults.
copyfile(filename_train, 'data/celikkaya2013/tr.train.iobes')
copyfile(filename_dev, 'data/celikkaya2013/tr.testa.iobes')
copyfile(filename_test, 'data/celikkaya2013/tr.testb.iobes')

# Build
kwargs = {
"filename_train": filename_train,
"filename_dev": filename_dev,
"filename_test": filename_test
}
build(**kwargs)
# Build the data/vocab for this fold in a fresh interpreter so each CV
# iteration starts from clean module state; append its output to the log.
with open('output.log', 'a+') as out:
    # Log the fold index `count` (the original logged `counter`, which is
    # the pip._vendor.progress module — a garbage repr, not the iteration).
    out.write("Beginning building for CV iteration:{}\n".format(count))
    # universal_newlines=True makes stdout yield str lines (PIPE is bytes
    # by default, and bytes + '\n' would raise TypeError under Python 3).
    p = subprocess.Popen('python3 build_data.py', shell=True,
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                         universal_newlines=True)
    # Use a distinct loop variable: `line` is already used by the
    # input-reading loop earlier in this script.
    for out_line in p.stdout.readlines():
        out.write(out_line)
    retval = p.wait()
    out.write("Finished building. exit code:{}\n".format(retval))
    out.flush()
print("Built model.")

# Train
train(**kwargs)
# Train the model for this fold in a fresh interpreter (clean module
# state per CV iteration); append its output to the log.
with open('output.log', 'a+') as out:
    # Log the fold index `count` (the original logged `counter`, which is
    # the pip._vendor.progress module — a garbage repr, not the iteration).
    out.write("Beginning training for CV iteration:{}\n".format(count))
    # universal_newlines=True makes stdout yield str lines (PIPE is bytes
    # by default, and bytes + '\n' would raise TypeError under Python 3).
    p = subprocess.Popen('python3 train.py', shell=True,
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                         universal_newlines=True)
    # Use a distinct loop variable: `line` is already used by the
    # input-reading loop earlier in this script.
    for out_line in p.stdout.readlines():
        out.write(out_line)
    retval = p.wait()
    out.write("Finished training. exit code:{}\n".format(retval))
    out.flush()
print("Trained model.")

# Evaluate
eval(interactive=False, **kwargs)
# Evaluate the trained model for this fold in a fresh interpreter;
# append its output to the log.
with open('output.log', 'a+') as out:
    # Log the fold index `count` (the original logged `counter`, which is
    # the pip._vendor.progress module — a garbage repr, not the iteration).
    out.write("Beginning eval for CV iteration:{}\n".format(count))
    # universal_newlines=True makes stdout yield str lines (PIPE is bytes
    # by default, and bytes + '\n' would raise TypeError under Python 3).
    p = subprocess.Popen('python3 evaluate.py', shell=True,
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                         universal_newlines=True)
    # Use a distinct loop variable: `line` is already used by the
    # input-reading loop earlier in this script.
    for out_line in p.stdout.readlines():
        out.write(out_line)
    retval = p.wait()
    out.write("Finished eval. exit code:{}\n".format(retval))
    out.flush()
print("Evaluated model.")

count -= 1
Expand Down
18 changes: 16 additions & 2 deletions data/extract_word_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,26 @@
# https://groups.google.com/forum/#!topic/gensim/JRYhCt10AMw
# https://radimrehurek.com/gensim/models/keyedvectors.html
# https://stackoverflow.com/questions/44693241/how-to-extract-a-word-vector-from-the-google-pre-trained-model-for-word2vec

model = KeyedVectors.load_word2vec_format('../other/TweetNER/TweetNER/newfile.txt', binary=True, limit=200000)
"""
model = KeyedVectors.load_word2vec_format('../other/TweetNER/TweetNER/newfile.txt', binary=True, limit=200000, encoding='utf-8', unicode_errors='ignore')
print(model['*UNKNOWN*'])
with open('../other/TweetNER/TweetNER/word-list.txt', 'w+', encoding='utf-8') as f:
for key in model.vocab.keys():
f.write(key + "\n")
"""

# Extract a plain word list from the word2vec-format text file: write the
# first whitespace-separated token of each row, stopping after 200001 rows
# (presumably the format's header line plus 200000 vectors — TODO confirm).
print("Building vocab...")
count = 0
with open('../other/TweetNER/TweetNER/newfile.txt', encoding='utf-8') as vec_file, \
        open('../other/TweetNER/TweetNER/word-list.txt', 'w+', encoding='utf-8') as list_file:
    for row in vec_file:
        if count == 200001:
            break
        token = row.strip().split(' ')[0]
        list_file.write(token + "\n")
        count += 1



"""
with open('../other/TweetNER/TweetNER/pretrained.txt', 'r', encoding='utf-8') as f:
Expand Down
4 changes: 2 additions & 2 deletions evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,9 @@ def interactive_shell(model):
model.logger.info(seq)


def main(interactive=True, **kwargs):
def main(interactive=False):
# create instance of config
config = Config(**kwargs)
config = Config()

# build model
model = NERModel(config)
Expand Down
6 changes: 1 addition & 5 deletions model/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,14 @@


class Config():
def __init__(self, load=True, **kwargs):
def __init__(self, load=True):
"""Initialize hyperparameters and load vocabs
Args:
load_embeddings: (bool) if True, load embeddings into
np array, else None
"""
# update default parameters!
for key, value in kwargs.items():
setattr(self, key, value)

# directory for training outputs
if not os.path.exists(self.dir_output):
os.makedirs(self.dir_output)
Expand Down
5 changes: 3 additions & 2 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from model.config import Config


def main(**kwargs):
def main():
# create instance of config
config = Config(**kwargs)
config = Config()

# build model
model = NERModel(config)
Expand All @@ -22,5 +22,6 @@ def main(**kwargs):
# train model
model.train(train, dev)


if __name__ == "__main__":
main()

0 comments on commit 354e771

Please sign in to comment.