[feature]: add embedding parallel (#438)

* add parquet input in packed format and non-packed format * add support for embedding parallel, shard embedding across workers, based on horovod all2all and allreduce * refactor predictor into separate files and add parquet predictors * add script for custom ops build * add adam sparse optimizer * add fast and memory efficient auc implementation
alibaba · Jan 3, 2024 · fddb73a · fddb73a
1 parent cb806b2
commit fddb73a
Show file tree

Hide file tree

Showing 64 changed files with 6,603 additions and 705 deletions.
diff --git a/.git_bin_path b/.git_bin_path
@@ -1,6 +1,7 @@
 {"leaf_name": "data/test", "leaf_file": ["data/test/batch_criteo_sample.tfrecord", "data/test/criteo_sample.tfrecord", "data/test/dwd_avazu_ctr_deepmodel_10w.csv", "data/test/embed_data.csv", "data/test/lookup_data.csv", "data/test/tag_kv_data.csv", "data/test/test.csv", "data/test/test_sample_weight.txt", "data/test/test_with_quote.csv"]}
 {"leaf_name": "data/test/client", "leaf_file": ["data/test/client/item_lst", "data/test/client/user_table_data", "data/test/client/user_table_schema"]}
 {"leaf_name": "data/test/criteo_data", "leaf_file": ["data/test/criteo_data/category.bin", "data/test/criteo_data/dense.bin", "data/test/criteo_data/label.bin", "data/test/criteo_data/readme"]}
+{"leaf_name": "data/test/criteo_parquet", "leaf_file": ["data/test/criteo_parquet/0_0.parquet", "data/test/criteo_parquet/0_1.parquet", "data/test/criteo_parquet/0_2.parquet", "data/test/criteo_parquet/0_3.parquet", "data/test/criteo_parquet/0_4.parquet", "data/test/criteo_parquet/0_5.parquet"]}
 {"leaf_name": "data/test/distribute_eval_test/deepfm_distribute_eval_dwd_avazu_out_multi_cls", "leaf_file": ["data/test/distribute_eval_test/deepfm_distribute_eval_dwd_avazu_out_multi_cls/ESTIMATOR_TRAIN_DONE", "data/test/distribute_eval_test/deepfm_distribute_eval_dwd_avazu_out_multi_cls/atexit_sync_1661483067", "data/test/distribute_eval_test/deepfm_distribute_eval_dwd_avazu_out_multi_cls/checkpoint", "data/test/distribute_eval_test/deepfm_distribute_eval_dwd_avazu_out_multi_cls/eval_result.txt", "data/test/distribute_eval_test/deepfm_distribute_eval_dwd_avazu_out_multi_cls/model.ckpt-1000.data-00000-of-00001", "data/test/distribute_eval_test/deepfm_distribute_eval_dwd_avazu_out_multi_cls/model.ckpt-1000.index", "data/test/distribute_eval_test/deepfm_distribute_eval_dwd_avazu_out_multi_cls/model.ckpt-1000.meta", "data/test/distribute_eval_test/deepfm_distribute_eval_dwd_avazu_out_multi_cls/pipeline.config", "data/test/distribute_eval_test/deepfm_distribute_eval_dwd_avazu_out_multi_cls/version"]}
 {"leaf_name": "data/test/distribute_eval_test/dropoutnet_distribute_eval_taobao_ckpt", "leaf_file": ["data/test/distribute_eval_test/dropoutnet_distribute_eval_taobao_ckpt/checkpoint", "data/test/distribute_eval_test/dropoutnet_distribute_eval_taobao_ckpt/eval_result.txt", "data/test/distribute_eval_test/dropoutnet_distribute_eval_taobao_ckpt/model.ckpt-1000.data-00000-of-00001", "data/test/distribute_eval_test/dropoutnet_distribute_eval_taobao_ckpt/model.ckpt-1000.index", "data/test/distribute_eval_test/dropoutnet_distribute_eval_taobao_ckpt/model.ckpt-1000.meta", "data/test/distribute_eval_test/dropoutnet_distribute_eval_taobao_ckpt/pipeline.config"]}
 {"leaf_name": "data/test/distribute_eval_test/dssm_distribute_eval_pointwise_classification_taobao_ckpt", "leaf_file": ["data/test/distribute_eval_test/dssm_distribute_eval_pointwise_classification_taobao_ckpt/checkpoint", "data/test/distribute_eval_test/dssm_distribute_eval_pointwise_classification_taobao_ckpt/eval_result.txt", "data/test/distribute_eval_test/dssm_distribute_eval_pointwise_classification_taobao_ckpt/model.ckpt-1000.data-00000-of-00001", "data/test/distribute_eval_test/dssm_distribute_eval_pointwise_classification_taobao_ckpt/model.ckpt-1000.index", "data/test/distribute_eval_test/dssm_distribute_eval_pointwise_classification_taobao_ckpt/model.ckpt-1000.meta", "data/test/distribute_eval_test/dssm_distribute_eval_pointwise_classification_taobao_ckpt/pipeline.config"]}

diff --git a/.git_bin_url b/.git_bin_url
@@ -1,6 +1,7 @@
 {"leaf_path": "data/test", "sig": "656d73b4e78d0d71e98120050bc51387", "remote_path": "data/git_oss_sample_data/data_test_656d73b4e78d0d71e98120050bc51387"}
 {"leaf_path": "data/test/client", "sig": "d2e000187cebd884ee10e3cf804717fc", "remote_path": "data/git_oss_sample_data/data_test_client_d2e000187cebd884ee10e3cf804717fc"}
 {"leaf_path": "data/test/criteo_data", "sig": "f224ba0b1a4f66eeda096c88703d3afc", "remote_path": "data/git_oss_sample_data/data_test_criteo_data_f224ba0b1a4f66eeda096c88703d3afc"}
+{"leaf_path": "data/test/criteo_parquet", "sig": "275dd04a6ce63341e6f87a9ebd612f05", "remote_path": "data/git_oss_sample_data/data_test_criteo_parquet_275dd04a6ce63341e6f87a9ebd612f05"}
 {"leaf_path": "data/test/distribute_eval_test/deepfm_distribute_eval_dwd_avazu_out_multi_cls", "sig": "e74bea3847855feb44b4f621a3e78344", "remote_path": "data/git_oss_sample_data/data_test_distribute_eval_test_deepfm_distribute_eval_dwd_avazu_out_multi_cls_e74bea3847855feb44b4f621a3e78344"}
 {"leaf_path": "data/test/distribute_eval_test/dropoutnet_distribute_eval_taobao_ckpt", "sig": "9fde5d2987654f268a231a1c69db5799", "remote_path": "data/git_oss_sample_data/data_test_distribute_eval_test_dropoutnet_distribute_eval_taobao_ckpt_9fde5d2987654f268a231a1c69db5799"}
 {"leaf_path": "data/test/distribute_eval_test/dssm_distribute_eval_pointwise_classification_taobao_ckpt", "sig": "aaee9c8774ef0451a86090b344b66a04", "remote_path": "data/git_oss_sample_data/data_test_distribute_eval_test_dssm_distribute_eval_pointwise_classification_taobao_ckpt_aaee9c8774ef0451a86090b344b66a04"}

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,2 +1,3 @@
 include easy_rec/python/ops/1.12/*.so*
 include easy_rec/python/ops/1.15/*.so*
+include easy_rec/python/ops/2.12/*.so*
diff --git a/docs/source/automl/finetune_config.md b/docs/source/automl/finetune_config.md
@@ -70,7 +70,7 @@ cmd1_{{bizdate}}=PAI -name=easy_rec_ext
     -Dbuckets='oss://automl-nni/'
     -Darn='xxx'
     -DossHost='oss-cn-beijing-internal.aliyuncs.com'
-    -Dcluster={"ps":{"count":1,"cpu":1600,"memory":40000 },"worker":{"count":12,"cpu":1600,"memory":40000}} 
+    -Dcluster={"ps":{"count":1,"cpu":1600,"memory":40000 },"worker":{"count":12,"cpu":1600,"memory":40000}}
 
 {% else %}
 cmd1_{{bizdate}}=PAI -name=easy_rec_ext
@@ -87,7 +87,7 @@ cmd1_{{bizdate}}=PAI -name=easy_rec_ext
     -Dbuckets='oss://automl-nni/'
     -Darn='xxx'
     -DossHost='oss-cn-beijing-internal.aliyuncs.com'
-    -Dcluster={"ps":{"count":1,"cpu":1600,"memory":40000 },"worker":{"count":12,"cpu":1600,"memory":40000}} 
+    -Dcluster={"ps":{"count":1,"cpu":1600,"memory":40000 },"worker":{"count":12,"cpu":1600,"memory":40000}}
 {% endif %}
 
 {% endfor %}

diff --git a/docs/source/automl/pai_nni_hpo.md b/docs/source/automl/pai_nni_hpo.md
@@ -167,7 +167,7 @@ cmd1=PAI -name=easy_rec_ext
             -Dbuckets='oss://lcl-bj/'
             -Dmodel_dir='oss://lcl-bj/eval_dist_test/model_${exp_id}_${trial_id}'
             -DossHost='oss-cn-beijing-internal.aliyuncs.com'
-            -Deval_method='separate' 
+            -Deval_method='separate'
 
 
 

diff --git a/easy_rec/__init__.py b/easy_rec/__init__.py
@@ -17,7 +17,12 @@
   import tensorflow as tf
   from tensorflow.python.platform import tf_logging
   tf_logging.set_verbosity(tf_logging.INFO)
+else:
+  logging.basicConfig(
+      level=logging.INFO, format='[%(asctime)s][%(levelname)s] %(message)s')
+
 
+def get_ops_dir():
   if platform.system() == 'Linux':
     ops_dir = os.path.join(curr_dir, 'python/ops')
     if 'PAI' in tf.__version__:
@@ -30,8 +35,20 @@
       else:
         ops_dir = os.path.join(ops_dir, '1.15')
     else:
-      ops_dir = None
+      tmp_version = tf.__version__.split('.')
+      tmp_version = '.'.join(tmp_version[:2])
+      return os.path.join(ops_dir, tmp_version)
   else:
+    return None
+
+
+# Avoid importing tensorflow, which conflicts with the version used in EasyRecProcessor
+if 'PROCESSOR_TEST' not in os.environ:
+  from tensorflow.python.platform import tf_logging
+  tf_logging.set_verbosity(tf_logging.INFO)
+  ops_dir = get_ops_dir()
+  if ops_dir is not None and not os.path.exists(ops_dir):
+    logging.warning('ops_dir[%s] does not exist' % ops_dir)
     ops_dir = None
 
   logging.basicConfig(

diff --git a/easy_rec/python/builders/optimizer_builder.py b/easy_rec/python/builders/optimizer_builder.py
@@ -88,6 +88,14 @@ def build(optimizer_config):
         beta1=config.beta1,
         beta2=config.beta2)
 
+  if optimizer_type == 'lazy_adam_optimizer':
+    config = optimizer_config.lazy_adam_optimizer
+    learning_rate = _create_learning_rate(config.learning_rate)
+    summary_vars.append(learning_rate)
+    from easy_rec.python.compat.adam_s import AdamOptimizerS
+    optimizer = AdamOptimizerS(
+        learning_rate=learning_rate, beta1=config.beta1, beta2=config.beta2)
+
   if optimizer_type == 'momentumw_optimizer':
     config = optimizer_config.momentumw_optimizer
     learning_rate = _create_learning_rate(config.learning_rate)

diff --git a/easy_rec/python/compat/adam_s.py b/easy_rec/python/compat/adam_s.py
@@ -0,0 +1,245 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Adam for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+
+
+class AdamOptimizerS(optimizer.Optimizer):
+  """Optimizer that implements the Adam algorithm.
+
+  Dense gradients are applied through `training_ops.apply_adam` /
+  `training_ops.resource_apply_adam`. Sparse gradients only update the
+  variable and moment rows selected by the gradient indices (see
+  `_apply_sparse_shared`).
+
+  References:
+    Adam - A Method for Stochastic Optimization:
+      [Kingma et al., 2015](https://arxiv.org/abs/1412.6980)
+      ([pdf](https://arxiv.org/pdf/1412.6980.pdf))
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               beta1=0.9,
+               beta2=0.999,
+               epsilon=1e-8,
+               use_locking=False,
+               name='Adam'):
+    r"""Construct a new Adam optimizer.
+
+    Initialization:
+
+    $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+    $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+    $$t := 0 \text{(Initialize timestep)}$$
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section 2 of the paper:
+
+    $$t := t + 1$$
+    $$\text{lr}_t := \mathrm{learning_rate} *
+      \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
+
+    $$m_t := \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
+    $$v_t := \beta_2 * v_{t-1} + (1 - \beta_2) * g * g$$
+    $$\text{variable} := \text{variable} -
+      \text{lr}_t * m_t / (\sqrt{v_t} + \epsilon)$$
+
+    The default value of 1e-8 for epsilon might not be a good default in
+    general. For example, when training an Inception network on ImageNet a
+    current good choice is 1.0 or 0.1. Note that since AdamOptimizerS uses the
+    formulation just before Section 2.1 of the Kingma and Ba paper rather than
+    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
+    hat" in the paper.
+
+    The sparse implementation of this algorithm (used when the gradient is an
+    IndexedSlices object, typically because of `tf.gather` or an embedding
+    lookup in the forward pass) only updates the slices of the variable and of
+    the `m` / `v` accumulators that are selected by the gradient indices.
+    Slices that receive no gradient in a step are left untouched: their
+    moments are not decayed by beta1 / beta2 and the variable is not moved.
+    This means that the sparse behavior is NOT equivalent to the dense
+    behavior.
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      beta1: A float value or a constant float tensor. The exponential decay
+        rate for the 1st moment estimates.
+      beta2: A float value or a constant float tensor. The exponential decay
+        rate for the 2nd moment estimates.
+      epsilon: A small constant for numerical stability. This epsilon is
+        "epsilon hat" in the Kingma and Ba paper (in the formula just before
+        Section 2.1), not the epsilon in Algorithm 1 of the paper.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "Adam".
+
+    @compatibility(eager)
+    When eager execution is enabled, `learning_rate`, `beta1`, `beta2`, and
+    `epsilon` can each be a callable that takes no arguments and returns the
+    actual value to use. This can be useful for changing these values across
+    different invocations of optimizer functions.
+    @end_compatibility
+    """
+    super(AdamOptimizerS, self).__init__(use_locking, name)
+    # Raw constructor arguments; they may be python numbers, tensors or
+    # callables and are resolved in _prepare().
+    self._lr = learning_rate
+    self._beta1 = beta1
+    self._beta2 = beta2
+    self._epsilon = epsilon
+
+    # Tensor versions of the constructor arguments, created in _prepare().
+    self._lr_t = None
+    self._beta1_t = None
+    self._beta2_t = None
+    self._epsilon_t = None
+
+  def _get_beta_accumulators(self):
+    """Return the (`beta1_power`, `beta2_power`) non-slot variables.
+
+    When executing eagerly the variables are looked up with `graph=None`,
+    otherwise they are looked up for the current default graph.
+    """
+    with ops.init_scope():
+      if context.executing_eagerly():
+        graph = None
+      else:
+        graph = ops.get_default_graph()
+      return (self._get_non_slot_variable('beta1_power', graph=graph),
+              self._get_non_slot_variable('beta2_power', graph=graph))
+
+  def _create_slots(self, var_list):
+    """Create the beta power accumulators and the `m` / `v` slots.
+
+    Args:
+      var_list: the variables this optimizer is going to update.
+    """
+    # Create the beta1 and beta2 accumulators on the same device as the first
+    # variable. Pick the variable with the smallest name to make sure this
+    # device is consistent across workers (these need to go on the same PS,
+    # otherwise some updates are silently ignored).
+    first_var = min(var_list, key=lambda x: x.name)
+    # The accumulators start at beta^1 and are multiplied by beta in _finish().
+    self._create_non_slot_variable(
+        initial_value=self._beta1, name='beta1_power', colocate_with=first_var)
+    self._create_non_slot_variable(
+        initial_value=self._beta2, name='beta2_power', colocate_with=first_var)
+
+    # Create slots for the first and second moments.
+    for v in var_list:
+      self._zeros_slot(v, 'm', self._name)
+      self._zeros_slot(v, 'v', self._name)
+
+  def _prepare(self):
+    """Resolve callable hyper-parameters and convert them to tensors."""
+    lr = self._call_if_callable(self._lr)
+    beta1 = self._call_if_callable(self._beta1)
+    beta2 = self._call_if_callable(self._beta2)
+    epsilon = self._call_if_callable(self._epsilon)
+
+    self._lr_t = ops.convert_to_tensor(lr, name='learning_rate')
+    self._beta1_t = ops.convert_to_tensor(beta1, name='beta1')
+    self._beta2_t = ops.convert_to_tensor(beta2, name='beta2')
+    self._epsilon_t = ops.convert_to_tensor(epsilon, name='epsilon')
+
+  def _apply_dense(self, grad, var):
+    """Apply a dense gradient to `var` with `training_ops.apply_adam`.
+
+    Args:
+      grad: the dense gradient.
+      var: the variable to update.
+
+    Returns:
+      The op of the Adam update.
+    """
+    m = self.get_slot(var, 'm')
+    v = self.get_slot(var, 'v')
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    # All scalar hyper-parameters are cast to the dtype of the variable.
+    return training_ops.apply_adam(
+        var,
+        m,
+        v,
+        math_ops.cast(beta1_power, var.dtype.base_dtype),
+        math_ops.cast(beta2_power, var.dtype.base_dtype),
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, var.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var):
+    """Apply a dense gradient to a resource variable.
+
+    Same as `_apply_dense`, but operates on the resource handles of `var` and
+    its slots through `training_ops.resource_apply_adam`.
+
+    Args:
+      grad: the dense gradient.
+      var: the resource variable to update.
+
+    Returns:
+      The result of `training_ops.resource_apply_adam`.
+    """
+    m = self.get_slot(var, 'm')
+    v = self.get_slot(var, 'v')
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    # Here the scalar hyper-parameters are cast to the dtype of the gradient.
+    return training_ops.resource_apply_adam(
+        var.handle,
+        m.handle,
+        v.handle,
+        math_ops.cast(beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(beta2_power, grad.dtype.base_dtype),
+        math_ops.cast(self._lr_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
+    """Apply a sparse gradient, touching only the rows listed in `indices`.
+
+    Only the gathered rows of `m` and `v` are decayed and written back, and
+    only those rows of `var` are updated; all other rows are left as they are.
+
+    Args:
+      grad: the gradient values for the rows selected by `indices`.
+      var: the variable to update.
+      indices: the indices of the rows of `var` that `grad` refers to.
+      scatter_add: a callable `(var, indices, updates)` that adds `updates`
+        to the selected rows of `var`.
+
+    Returns:
+      An op grouping the updates of `var`, `m` and `v`.
+    """
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    # Bias-corrected step size: lr * sqrt(1 - beta2^t) / (1 - beta1^t).
+    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, 'm')
+    m_scaled_g_values = grad * (1 - beta1_t)
+    # The whole accumulator is intentionally not decayed (no
+    # `assign(m, m * beta1)`); only the gathered rows are decayed and
+    # written back.
+    m_decay = array_ops.gather(m, indices) * beta1_t
+    m_part_n = m_scaled_g_values + m_decay
+    m_t = state_ops.scatter_update(m, indices, m_part_n)
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = self.get_slot(var, 'v')
+    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
+    v_decay = array_ops.gather(v, indices) * beta2_t
+    v_part_n = v_scaled_g_values + v_decay
+    v_t = state_ops.scatter_update(v, indices, v_part_n)
+    # The variable update is computed from the freshly computed rows
+    # (m_part_n / v_part_n) rather than from the full m_t / v_t tensors, and
+    # is applied to the selected rows only instead of an assign_sub on the
+    # whole variable.
+    v_part_sqrt = math_ops.sqrt(v_part_n)
+    var_update = scatter_add(var, indices,
+                             -lr * m_part_n / (v_part_sqrt + epsilon_t))
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _apply_sparse(self, grad, var):
+    """Apply an IndexedSlices gradient to `var`.
+
+    Args:
+      grad: the sparse gradient; its `values` and `indices` are used.
+      var: the variable to update.
+
+    Returns:
+      The grouped update op built by `_apply_sparse_shared`.
+    """
+    return self._apply_sparse_shared(
+        grad.values,
+        var,
+        grad.indices,
+        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
+            x,
+            i,
+            v,
+            use_locking=self._use_locking))
+
+  def _resource_scatter_add(self, x, i, v):
+    """Scatter-add `v` into rows `i` of resource variable `x`.
+
+    Returns:
+      The value of `x`, read under a control dependency on the scatter-add.
+    """
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
+      return x.value()
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    """Apply a sparse gradient to a resource variable.
+
+    Args:
+      grad: the gradient values for the rows selected by `indices`.
+      var: the resource variable to update.
+      indices: the indices of the rows of `var` that `grad` refers to.
+
+    Returns:
+      The grouped update op built by `_apply_sparse_shared`.
+    """
+    return self._apply_sparse_shared(grad, var, indices,
+                                     self._resource_scatter_add)
+
+  def _finish(self, update_ops, name_scope):
+    """Advance the beta power accumulators after all variable updates.
+
+    Args:
+      update_ops: the update ops created for the individual variables.
+      name_scope: the name used for the returned grouped op.
+
+    Returns:
+      An op grouping `update_ops` with the two accumulator updates.
+    """
+    # Update the power accumulators.
+    with ops.control_dependencies(update_ops):
+      beta1_power, beta2_power = self._get_beta_accumulators()
+      with ops.colocate_with(beta1_power):
+        # beta^t -> beta^(t+1); runs only after every variable update is done.
+        update_beta1 = beta1_power.assign(
+            beta1_power * self._beta1_t, use_locking=self._use_locking)
+        update_beta2 = beta2_power.assign(
+            beta2_power * self._beta2_t, use_locking=self._use_locking)
+    return control_flow_ops.group(
+        *update_ops + [update_beta1, update_beta2], name=name_scope)