
Merge pull request #73 from rsepassi/push
v1.0.9
lukaszkaiser authored Jun 29, 2017
2 parents a2a6178 + e4fe66c commit 7087807
Showing 26 changed files with 395 additions and 245 deletions.
2 changes: 0 additions & 2 deletions .gitignore
@@ -1,7 +1,5 @@
# Compiled python modules.
*.pyc
# Byte-compiled
__pycache__/

# Python egg metadata, regenerated from source files by setuptools.
/*.egg-info
4 changes: 2 additions & 2 deletions README.md
@@ -57,7 +57,7 @@ t2t-trainer --registry_help
PROBLEM=wmt_ende_tokens_32k
MODEL=transformer
HPARAMS=transformer_base
HPARAMS=transformer_base_single_gpu
DATA_DIR=$HOME/t2t_data
TMP_DIR=/tmp/t2t_datagen
@@ -209,7 +209,7 @@ and hyperparameter set functions can compose other hyperparameter set functions.
The **trainer** binary is the main entrypoint for training, evaluation, and
inference. Users can easily switch between problems, models, and hyperparameter
sets by using the `--model`, `--problems`, and `--hparams_set` flags. Specific
hyperparameters can be overriden with the `--hparams` flag. `--schedule` and
hyperparameters can be overridden with the `--hparams` flag. `--schedule` and
related flags control local and distributed training/evaluation
([distributed training documentation](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/docs/distributed_training.md)).

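To make the `--hparams_set` / `--hparams` interaction described above concrete, here is a minimal Python sketch using the TF 1.x `tf.contrib.training.HParams` API. The hyperparameter names and values are invented for illustration; the real `transformer_base_single_gpu` set defines many more fields and lives in the registry.

import tensorflow as tf

def transformer_base_single_gpu_sketch():
  # Stand-in for a registered hyperparameter set function (--hparams_set).
  return tf.contrib.training.HParams(
      batch_size=2048, learning_rate=0.2, num_hidden_layers=6)

hparams = transformer_base_single_gpu_sketch()
hparams.parse("batch_size=1024,learning_rate=0.05")  # what --hparams does
print(hparams.batch_size)     # 1024
print(hparams.learning_rate)  # 0.05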
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setup(
name='tensor2tensor',
version='1.0.8',
version='1.0.9',
description='Tensor2Tensor',
author='Google Inc.',
author_email='[email protected]',
5 changes: 2 additions & 3 deletions tensor2tensor/bin/make_tf_configs.py
@@ -32,7 +32,6 @@

# Dependency imports

import six
import tensorflow as tf

flags = tf.flags
@@ -51,7 +50,7 @@ def main(_):

cluster = {"ps": ps, "worker": workers}

for task_type, jobs in six.iteritems(cluster):
for task_type, jobs in (("worker", workers), ("ps", ps)):
for idx, job in enumerate(jobs):
if task_type == "worker":
cmd_line_flags = " ".join([
@@ -77,7 +76,7 @@ def main(_):
"index": idx
}
})
print(tf_config + "\t" + cmd_line_flags)
print("'%s'\t%s" % (tf_config, cmd_line_flags))


if __name__ == "__main__":
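As a hedged illustration of what the make_tf_configs.py loop above prints after this change, here is a self-contained sketch of the TF_CONFIG strings. The host addresses are made up for illustration; the real script reads the worker and ps addresses from its flags.

import json

ps = ["10.0.0.1:2222"]                        # assumed ps host
workers = ["10.0.0.2:2222", "10.0.0.3:2222"]  # assumed worker hosts
cluster = {"ps": ps, "worker": workers}

for task_type, jobs in (("worker", workers), ("ps", ps)):
  for idx, _ in enumerate(jobs):
    tf_config = json.dumps(
        {"cluster": cluster, "task": {"type": task_type, "index": idx}})
    # The real script prints the quoted TF_CONFIG followed by a tab and the
    # command-line flags for that job; only the TF_CONFIG part is shown here.
    print("'%s'" % tf_config)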
6 changes: 3 additions & 3 deletions tensor2tensor/bin/t2t-datagen
100755 → 100644
@@ -37,10 +37,10 @@ from tensor2tensor.data_generators import algorithmic_math
from tensor2tensor.data_generators import audio
from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import image
from tensor2tensor.data_generators import ptb
from tensor2tensor.data_generators import snli
from tensor2tensor.data_generators import wmt
from tensor2tensor.data_generators import wsj_parsing
from tensor2tensor.data_generators import ptb

import tensorflow as tf

@@ -319,11 +319,11 @@ _SUPPORTED_PROBLEM_GENERATORS = {
vocab_filename="tokens.vocab.%d" % 2**15,
vocab_size=2**15)),
"lmptb_10k": (
lambda: ptb.train_generator(
lambda: ptb.train_generator(
FLAGS.tmp_dir,
FLAGS.data_dir,
False),
lambda: ptb.valid_generator()),
ptb.valid_generator),
}

# pylint: enable=g-long-lambda
Empty file modified tensor2tensor/bin/t2t-trainer
100755 → 100644
Empty file.
9 changes: 5 additions & 4 deletions tensor2tensor/data_generators/algorithmic.py
@@ -102,7 +102,7 @@ def zipf_distribution(nbr_symbols, alpha):
Usually for modelling natural text distribution is in
the range [1.1-1.6].
Return:
Returns:
distr_map: list of float, Zipf's distribution over nbr_symbols.
"""
@@ -118,7 +118,7 @@ def zipf_random_sample(distr_map, sample_len):
distr_map: list of float, Zipf's distribution over nbr_symbols.
sample_len: integer, length of sequence to generate.
Return:
Returns:
sample: list of integer, Zipf's random sample over nbr_symbols.
"""
@@ -131,8 +131,8 @@ def zipf_random_sample(distr_map, sample_len):
return [t+1 if t > 0 else t+2 for t in np.searchsorted(distr_map, u)]


def reverse_generator_nlplike(nbr_symbols, max_length, nbr_cases, \
scale_std_dev=100, alpha=1.5):
def reverse_generator_nlplike(nbr_symbols, max_length, nbr_cases,
scale_std_dev=100, alpha=1.5):
"""Generator for the reversing nlp-like task on sequences of symbols.
The length of the sequence is drawn from a Gaussian(Normal) distribution
@@ -141,6 +141,7 @@ def reverse_generator_nlplike(nbr_symbols, max_length, nbr_cases, \
nbr_cases sequences have been produced.
Args:
nbr_symbols: integer, number of symbols.
max_length: integer, maximum length of sequences to generate.
nbr_cases: the number of cases to generate.
scale_std_dev: float, Normal distribution's standard deviation scale factor
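Since the docstring above only sketches the data format, here is a rough, self-contained illustration of a generator with the documented behaviour: sequence lengths drawn from a Gaussian, symbols from a Zipf distribution, and targets equal to the reversed inputs followed by the EOS id 1 (the reversal property asserted in algorithmic_test.py below). The mean of max_length/2 and the use of scale_std_dev as a divisor are assumptions; this is not the repository's implementation.

import numpy as np

def zipf_cdf(nbr_symbols, alpha):
  # Cumulative Zipf probabilities over ranks 1..nbr_symbols.
  weights = 1.0 / np.arange(1, nbr_symbols + 1) ** alpha
  return np.cumsum(weights / weights.sum())

def reverse_generator_nlplike_sketch(nbr_symbols, max_length, nbr_cases,
                                     scale_std_dev=100, alpha=1.5):
  distr_map = zipf_cdf(nbr_symbols, alpha)
  std_dev = max_length / float(scale_std_dev)  # assumed interpretation
  for _ in range(nbr_cases):
    length = min(max_length,
                 int(abs(np.random.normal(max_length / 2.0, std_dev))) + 1)
    u = np.random.random(length)
    # Shift sampled ranks so ids 0 (PAD) and 1 (EOS) stay reserved, mirroring
    # the zipf_random_sample return statement shown above.
    inputs = [t + 1 if t > 0 else t + 2 for t in np.searchsorted(distr_map, u)]
    yield {"inputs": inputs, "targets": list(reversed(inputs)) + [1]}

for case in reverse_generator_nlplike_sketch(10, 30, 2):
  print(case)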
11 changes: 5 additions & 6 deletions tensor2tensor/data_generators/algorithmic_test.py
@@ -41,14 +41,13 @@ def testReverseGenerator(self):
self.assertEqual(list(reversed(d["inputs"])) + [1], d["targets"])
self.assertEqual(counter, 10)

def testZipfDistribution(self):
# Following Zipf's Law with alpha equals 1: the first in rank is two times
# more probable/frequent that the second in rank, three times more prob/freq
# that the third in rank and so on.
def testZipfDistribution(self):
# Following Zipf's Law with alpha equals 1: the first in rank is two times
# more probable/frequent that the second in rank, three times more prob/freq
# that the third in rank and so on.
d = algorithmic.zipf_distribution(10, 1.0001)
for i in xrange(len(d[1:])-1):
self.assertEqual("%.4f" % (abs(d[i+1]-d[i+2])*(i+2)), \
"%.4f" % d[1])
self.assertEqual("%.4f" % (abs(d[i+1]-d[i+2])*(i+2)), "%.4f" % d[1])

def testReverseGeneratorNlpLike(self):
counter = 0
3 changes: 2 additions & 1 deletion tensor2tensor/data_generators/generator_utils.py
100755 → 100644
@@ -244,7 +244,8 @@ def get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
if ".gz" in lang_file:
new_filepath = os.path.join(tmp_dir, lang_file[:-3])
if os.path.exists(new_filepath):
tf.logging.info("Subdirectory %s already exists, skipping unpacking" % filepath)
tf.logging.info("Subdirectory %s already exists, skipping unpacking"
% filepath)
else:
tf.logging.info("Unpacking subdirectory %s" % filepath)
gunzip_file(filepath, new_filepath)
34 changes: 16 additions & 18 deletions tensor2tensor/data_generators/problem_hparams.py
@@ -340,24 +340,6 @@ def lm1b_16k(model_hparams):
p.target_space_id = 3
return p

def lmptb_10k(model_hparams):
"""Penn Tree Bank language-modeling benchmark, 10k token vocabulary."""
p = default_problem_hparams()
p.input_modality = {}
p.target_modality = (registry.Modalities.SYMBOL, 10000)

vocabulary = text_encoder.TokenTextEncoder(
os.path.join(model_hparams.data_dir,
"lmptb_10k.vocab"))

p.vocabulary = {
"inputs": vocabulary,
"targets": vocabulary,
}

p.input_space_id = 3
p.target_space_id = 3
return p

def lm1b_64k(model_hparams):
"""Billion-word language-modeling benchmark, 64k subtoken vocabulary."""
@@ -374,6 +356,22 @@ def lm1b_64k(model_hparams):
p.target_space_id = 3
return p


def lmptb_10k(model_hparams):
"""Penn Tree Bank language-modeling benchmark, 10k token vocabulary."""
p = default_problem_hparams()
p.input_modality = {}
p.target_modality = (registry.Modalities.SYMBOL, 10000)
vocabulary = text_encoder.TokenTextEncoder(
os.path.join(model_hparams.data_dir, "lmptb_10k.vocab"))
p.vocabulary = {
"targets": vocabulary,
}
p.input_space_id = 3
p.target_space_id = 3
return p


def wmt_enfr_characters(unused_model_hparams):
"""English to French translation benchmark."""
p = default_problem_hparams()
75 changes: 33 additions & 42 deletions tensor2tensor/data_generators/ptb.py
@@ -18,10 +18,10 @@
from __future__ import division
from __future__ import print_function

import collections
import os
import sys
import tarfile
import collections

# Dependency imports

@@ -34,68 +34,62 @@
EOS = text_encoder.EOS
PTB_URL = "http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz"


def _read_words(filename):
"""Reads words from a file.
It returns a list of words without '\n'
Originally from:
https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb/reader.py
"""
"""Reads words from a file."""
with tf.gfile.GFile(filename, "r") as f:
if sys.version_info[0] >= 3:
return f.read().replace("\n", " ").split()
else:
return f.read().decode("utf-8").replace("\n", " ").split()




def _build_vocab(filename, vocab_path, vocab_size):
"""Reads a file a build a vocabulary of `vocab_size` words to
as a list of words to `filename`
The vocabulary is sorted by occurence count and has one word per line
Originally from:
https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb/reader.py
"""Reads a file to build a vocabulary of `vocab_size` most common words.
The vocabulary is sorted by occurence count and has one word per line.
Originally from:
https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb/reader.py
Args:
filename: file to read list of words from.
vocab_path: path where to save the vocabulary.
vocab_size: size of the vocablulary to generate.
"""
data = _read_words(filename)

counter = collections.Counter(data)
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*count_pairs))
words, _ = list(zip(*count_pairs))
words = words[:vocab_size]

with open(vocab_path, 'w') as f:
with open(vocab_path, "w") as f:
f.write("\n".join(words))


def _get_token_encoder(vocab_dir, filename):
"""Reads from file and returns a `TokenTextEncoder` based on the vocabulary
"""
"""Reads from file and returns a `TokenTextEncoder` for the vocabulary."""
vocab_name = "lmptb_10k.vocab"
vocab_path = os.path.join(vocab_dir, vocab_name)


_build_vocab(filename, vocab_path, 10000)

return text_encoder.TokenTextEncoder(vocab_path)


class PTB(object):
"""A class for generating PTB data."""

def __init__(self, tmp_dir, data_dir, char=False):
assert not char, "char mode for PTB is not yet implemented"
self.char = char
self.data_dir = data_dir
#self.num_steps = num_steps

url = PTB_URL

filename = os.path.basename(url)
compressed_filepath = generator_utils.maybe_download(tmp_dir,
filename,
url)

compressed_filepath = generator_utils.maybe_download(
tmp_dir, filename, url)
ptb_files = []
ptb_char_files = []
with tarfile.open(compressed_filepath, "r:gz") as tgz:
files = []
# selecting only relevant files
# Selecting only relevant files.
for m in tgz.getmembers():
if "ptb" in m.name and ".txt" in m.name:
if "char" in m.name:
@@ -120,7 +114,6 @@ def __init__(self, tmp_dir, data_dir, char=False):

assert hasattr(self, "train"), "Training file not found"
assert hasattr(self, "valid"), "Validation file not found"

self.encoder = _get_token_encoder(data_dir, self.train)

def train_generator(self):
@@ -132,27 +125,25 @@ def valid_generator(self):
def _generator(self, filename):
with tf.gfile.GFile(filename, "r") as f:
for line in f:
line = " ".join(line.replace('\n', EOS).split())
line = " ".join(line.replace("\n", EOS).split())
tok = self.encoder.encode(line)
x = tok[:-1]
y = tok[1:]

yield {"inputs": x,
"targets": y}
yield {"inputs": tok[:-1], "targets": tok[1:]}


# Using a object "singleton"
# `train_generator` must be called before
# `valid_generator` in order to work
_ptb = {}


def train_generator(*args, **kwargs):
"""The train data generator to be called
"""
"""The train data generator to be called."""
global _ptb
_ptb = PTB(*args, **kwargs)
return _ptb.train_generator()


def valid_generator():
"""Validation (aka. dev) data generator
"""
global _ptb
"""Validation (aka. dev) data generator."""
global _ptb # pylint:disable=global-variable-not-assigned
return _ptb.valid_generator()
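A hedged usage sketch of the module-level singleton above, mirroring how t2t-datagen wires up the lmptb_10k problem: train_generator must run first so the PTB object and its lmptb_10k.vocab file exist before valid_generator reuses the same encoder. The directory paths are assumptions, and running this really downloads and unpacks the PTB archive.

from tensor2tensor.data_generators import ptb

train_gen = ptb.train_generator("/tmp/t2t_datagen", "/tmp/t2t_data", False)
print(next(train_gen))             # {"inputs": tok[:-1], "targets": tok[1:]}

valid_gen = ptb.valid_generator()  # only valid after train_generator was called
print(next(valid_gen))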
1 change: 1 addition & 0 deletions tensor2tensor/data_generators/snli.py
100755 → 100644
@@ -130,6 +130,7 @@ def _parse_dataset(file_path, tmp_dir, train):


def _get_or_generate_vocab(tmp_dir, vocab_filename, vocab_size):
"""Read or create vocabulary."""
vocab_filepath = os.path.join(tmp_dir, vocab_filename)
print('Vocab file written to: ' + vocab_filepath)
