Skip to content
This repository has been archived by the owner on Jul 7, 2023. It is now read-only.

Commit

Permalink
Merge pull request #185 from rsepassi/push
Browse files Browse the repository at this point in the history
v1.1.2
  • Loading branch information
lukaszkaiser authored Jul 27, 2017
2 parents 62a0ee7 + 36766d8 commit a55c4cf
Show file tree
Hide file tree
Showing 93 changed files with 1,890 additions and 719 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ mkdir -p $DATA_DIR $TMP_DIR $TRAIN_DIR
t2t-datagen \
--data_dir=$DATA_DIR \
--tmp_dir=$TMP_DIR \
--num_shards=100 \
--problem=$PROBLEM
# Train
Expand Down
5 changes: 4 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@

setup(
name='tensor2tensor',
version='1.1.1',
version='1.1.2',
description='Tensor2Tensor',
author='Google Inc.',
author_email='[email protected]',
url='http://github.com/tensorflow/tensor2tensor',
license='Apache 2.0',
packages=find_packages(),
package_data={'tensor2tensor.data_generators': ['test_data/*']},
scripts=[
'tensor2tensor/bin/t2t-trainer',
'tensor2tensor/bin/t2t-datagen',
Expand All @@ -26,6 +27,8 @@
'tensorflow': ['tensorflow>=1.2.0rc1'],
'tensorflow_gpu': ['tensorflow-gpu>=1.2.0rc1'],
},
tests_require=['nose'],
test_suite='nose.collector',
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
Expand Down
1 change: 1 addition & 0 deletions tensor2tensor/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
20 changes: 16 additions & 4 deletions tensor2tensor/bin/t2t-datagen
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -62,10 +63,12 @@ flags.DEFINE_string("problem", "",
"The name of the problem to generate data for.")
flags.DEFINE_string("exclude_problems", "",
                    "Comma-separated list of problems to exclude.")
flags.DEFINE_integer("num_shards", 10, "How many shards to use.")
flags.DEFINE_integer("num_shards", 0, "How many shards to use. Ignored for "
"registered Problems.")
flags.DEFINE_integer("max_cases", 0,
"Maximum number of cases to generate (unbounded if 0).")
flags.DEFINE_integer("random_seed", 429459, "Random seed to use.")
flags.DEFINE_integer("task_id", -1, "For distributed data generation.")
flags.DEFINE_string("t2t_usr_dir", "",
"Path to a Python module that will be imported. The "
"__init__.py file should include the necessary imports. "
Expand Down Expand Up @@ -108,6 +111,10 @@ _SUPPORTED_PROBLEM_GENERATORS = {
lambda: lm1b.generator(FLAGS.tmp_dir, True),
lambda: lm1b.generator(FLAGS.tmp_dir, False)
),
"lm1b_characters": (
lambda: lm1b.generator(FLAGS.tmp_dir, True, characters=True),
lambda: lm1b.generator(FLAGS.tmp_dir, False, characters=True)
),
"wiki_32k": (
lambda: wiki.generator(FLAGS.tmp_dir, True),
1000
Expand Down Expand Up @@ -246,7 +253,7 @@ def generate_data_for_problem(problem):
if isinstance(dev_gen, int):
# The dev set and test sets are generated as extra shards using the
# training generator. The integer specifies the number of training
# shards. FLAGS.num_shards is ignored.
# shards. FLAGS.num_shards is ignored.
num_training_shards = dev_gen
tf.logging.info("Generating data for %s.", problem)
all_output_files = generator_utils.combined_data_filenames(
Expand All @@ -257,10 +264,11 @@ def generate_data_for_problem(problem):
else:
# usual case - train data and dev data are generated using separate
# generators.
num_shards = FLAGS.num_shards or 10
tf.logging.info("Generating training data for %s.", problem)
train_output_files = generator_utils.train_data_filenames(
problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
FLAGS.num_shards)
num_shards)
generator_utils.generate_files(training_gen(), train_output_files,
FLAGS.max_cases)
tf.logging.info("Generating development data for %s.", problem)
Expand All @@ -275,10 +283,14 @@ def generate_data_for_problem(problem):


def generate_data_for_registered_problem(problem_name):
tf.logging.info("Generating training data for %s.", problem_name)
if FLAGS.num_shards:
raise ValueError("--num_shards should not be set for registered Problem.")
problem = registry.problem(problem_name)
task_id = None if FLAGS.task_id < 0 else FLAGS.task_id
problem.generate_data(os.path.expanduser(FLAGS.data_dir),
os.path.expanduser(FLAGS.tmp_dir),
FLAGS.num_shards)
task_id=task_id)


if __name__ == "__main__":
Expand Down
1 change: 1 addition & 0 deletions tensor2tensor/bin/t2t-make-tf-configs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
1 change: 1 addition & 0 deletions tensor2tensor/bin/t2t-trainer
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
1 change: 1 addition & 0 deletions tensor2tensor/data_generators/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
12 changes: 5 additions & 7 deletions tensor2tensor/data_generators/algorithmic.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -65,10 +66,7 @@ def dev_size(self):
def num_shards(self):
return 10

def generate_data(self, data_dir, _, num_shards=None):
if num_shards is None:
num_shards = self.num_shards

def generate_data(self, data_dir, _, task_id=-1):
def generator_eos(generator):
"""Shift by NUM_RESERVED_IDS and append EOS token."""
for case in generator:
Expand All @@ -86,7 +84,7 @@ def generator_eos(generator):

utils.generate_dataset_and_shuffle(
train_generator_eos(),
self.training_filepaths(data_dir, num_shards, shuffled=True),
self.training_filepaths(data_dir, self.num_shards, shuffled=True),
dev_generator_eos(),
self.dev_filepaths(data_dir, 1, shuffled=True),
shuffle=False)
Expand Down Expand Up @@ -253,7 +251,7 @@ def zipf_distribution(nbr_symbols, alpha):


def zipf_random_sample(distr_map, sample_len):
"""Helper function: Generate a random Zipf sample of given lenght.
"""Helper function: Generate a random Zipf sample of given length.
Args:
distr_map: list of float, Zipf's distribution over nbr_symbols.
Expand Down Expand Up @@ -286,7 +284,7 @@ def reverse_generator_nlplike(nbr_symbols,
max_length: integer, maximum length of sequences to generate.
nbr_cases: the number of cases to generate.
scale_std_dev: float, Normal distribution's standard deviation scale factor
used to draw the lenght of sequence. Default = 1% of the max_length.
used to draw the length of sequence. Default = 1% of the max_length.
alpha: float, Zipf's Law Distribution parameter. Default = 1.5.
Usually for modelling natural text distribution is in
the range [1.1-1.6].
Expand Down
1 change: 1 addition & 0 deletions tensor2tensor/data_generators/algorithmic_math.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
1 change: 1 addition & 0 deletions tensor2tensor/data_generators/algorithmic_math_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
1 change: 1 addition & 0 deletions tensor2tensor/data_generators/algorithmic_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
3 changes: 2 additions & 1 deletion tensor2tensor/data_generators/all_problems.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -33,7 +34,7 @@
# pylint: disable=g-import-not-at-top
try:
# Requires h5py
from tensor2tensor.data_generators import genetics
from tensor2tensor.data_generators import gene_expression
except ImportError:
pass
# pylint: enable=g-import-not-at-top
Expand Down
1 change: 1 addition & 0 deletions tensor2tensor/data_generators/audio.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
1 change: 1 addition & 0 deletions tensor2tensor/data_generators/audio_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
22 changes: 11 additions & 11 deletions tensor2tensor/data_generators/concatenate_examples.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -33,7 +34,7 @@
+ subtokenizer.encode("target French Je t'aime.") + [1])
}
We add a dummy feature "inputs"=[0] for compatability with seq-to-seq models.
We add a dummy feature "inputs"=[0] for compatibility with seq-to-seq models.
If FLAGS.combine_to_length is nonzero, then we combine multiple examples into
examples of a constant length, possibly with some padding at the end.
Expand All @@ -52,34 +53,33 @@
from tensor2tensor.data_generators import text_encoder
import tensorflow as tf

tf.app.flags.DEFINE_string("vocab_file", "",
"SubwordTextEncoder vocabulary file")
tf.flags.DEFINE_string("vocab_file", "", "SubwordTextEncoder vocabulary file")

tf.app.flags.DEFINE_boolean(
tf.flags.DEFINE_boolean(
"random_reverse", False,
"If true, write half of the example with source/target reversed")

tf.app.flags.DEFINE_boolean(
tf.flags.DEFINE_boolean(
"count_everything", False,
"If true, assign positive weights to designators, source and target. "
"If false, assign positive weights only to target.")

tf.app.flags.DEFINE_string("source_domain_string", "English", "")
tf.app.flags.DEFINE_string("target_domain_string", "French", "")
tf.flags.DEFINE_string("source_domain_string", "English", "")
tf.flags.DEFINE_string("target_domain_string", "French", "")

tf.app.flags.DEFINE_integer(
tf.flags.DEFINE_integer(
"combine_to_length", 0,
"If positive, concatenate examples to form examples with target length "
" equal to this value. Targets are padded with subtoken id=0.")

tf.app.flags.DEFINE_string("in_file", "", "input filename")
tf.flags.DEFINE_string("in_file", "", "input filename")

tf.app.flags.DEFINE_string(
tf.flags.DEFINE_string(
"out_prefix", "/usr/local/google/tmp/concat",
"The output filename is equal to out_prefix plus "
"the last 15 characters of in_file. (e.g. -00001-of-00100)")

FLAGS = tf.app.flags.FLAGS
FLAGS = tf.flags.FLAGS


def _make_example(ids, weights, raw_num_bytes):
Expand Down
Loading

0 comments on commit a55c4cf

Please sign in to comment.