From 520515ed09c65b0ae5f5675fb607c75986f4be26 Mon Sep 17 00:00:00 2001
From: Yazhi Gao
Date: Mon, 7 Dec 2020 00:15:19 -0800
Subject: [PATCH] revert onnx, pr, md bag changes (#143)

---
 dlrm_s_pytorch.py          | 91 ++++++++++++++++++++++++++++----------
 tricks/md_embedding_bag.py |  5 ++-
 2 files changed, 71 insertions(+), 25 deletions(-)

diff --git a/dlrm_s_pytorch.py b/dlrm_s_pytorch.py
index 00f1244e..a1bef049 100644
--- a/dlrm_s_pytorch.py
+++ b/dlrm_s_pytorch.py
@@ -109,11 +109,13 @@
 
 exc = getattr(builtins, "IOError", "FileNotFoundError")
 
+
 def time_wrap(use_gpu):
     if use_gpu:
         torch.cuda.synchronize()
     return time.time()
 
+
 def dlrm_wrap(X, lS_o, lS_i, use_gpu, device, ndevices=1):
     with record_function("DLRM forward"):
         if use_gpu:  # .cuda()
@@ -132,6 +134,7 @@ def dlrm_wrap(X, lS_o, lS_i, use_gpu, device, ndevices=1):
                 )
         return dlrm(X.to(device), lS_o, lS_i)
 
+
 def loss_fn_wrap(Z, T, use_gpu, device):
     with record_function("DLRM loss compute"):
         if args.loss_function == "mse" or args.loss_function == "bce":
@@ -142,6 +145,7 @@ def loss_fn_wrap(Z, T, use_gpu, device):
             loss_sc_ = loss_ws_ * loss_fn_
             return loss_sc_.mean()
 
+
 # The following function is a wrapper to avoid checking this multiple times in th
 # loop below.
 def unpack_batch(b):
@@ -248,8 +252,8 @@ def create_emb(self, m, ln, weighted_pooling=None):
                     sparse=True,
                 )
             elif self.md_flag and n > self.md_threshold:
-                _m = m[i]
                 base = max(m)
+                _m = m[i] if n > self.md_threshold else base
                 EE = PrEmbeddingBag(n, _m, base)
                 # use np initialization as below for consistency...
                 W = np.random.uniform(
@@ -496,19 +500,13 @@ def interact_features(self, x, ly):
     def forward(self, dense_x, lS_o, lS_i):
         if ext_dist.my_size > 1:
             # multi-node multi-device run
-            return self.distributed_forward(
-                dense_x, lS_o, lS_i
-            )
+            return self.distributed_forward(dense_x, lS_o, lS_i)
         elif self.ndevices <= 1:
             # single device run
-            return self.sequential_forward(
-                dense_x, lS_o, lS_i
-            )
+            return self.sequential_forward(dense_x, lS_o, lS_i)
         else:
             # single-node multi-device run
-            return self.parallel_forward(
-                dense_x, lS_o, lS_i
-            )
+            return self.parallel_forward(dense_x, lS_o, lS_i)
 
     def distributed_forward(self, dense_x, lS_o, lS_i):
         batch_size = dense_x.size()[0]
@@ -535,9 +533,7 @@ def distributed_forward(self, dense_x, lS_o, lS_i):
 
         # embeddings
         with record_function("DLRM embedding forward"):
-            ly = self.apply_emb(
-                lS_o, lS_i, self.emb_l, self.v_W_l
-            )
+            ly = self.apply_emb(lS_o, lS_i, self.emb_l, self.v_W_l)
 
         # WARNING: Note that at this point we have the result of the embedding lookup
         # for the entire batch on each rank. We would like to obtain partial results
@@ -579,9 +575,7 @@ def sequential_forward(self, dense_x, lS_o, lS_i):
         # print(x.detach().cpu().numpy())
 
         # process sparse features(using embeddings), resulting in a list of row vectors
-        ly = self.apply_emb(
-            lS_o, lS_i, self.emb_l, self.v_W_l
-        )
+        ly = self.apply_emb(lS_o, lS_i, self.emb_l, self.v_W_l)
         # for y in ly:
         #     print(y.detach().cpu().numpy())
 
@@ -666,9 +660,7 @@ def parallel_forward(self, dense_x, lS_o, lS_i):
         # print(x)
 
         # embeddings
-        ly = self.apply_emb(
-            lS_o, lS_i, self.emb_l, self.v_W_l
-        )
+        ly = self.apply_emb(lS_o, lS_i, self.emb_l, self.v_W_l)
         # debug prints
         # print(ly)
 
@@ -778,7 +770,6 @@ def inference(
             print("Warning: Skiping the batch %d with size %d" % (i, X_test.size(0)))
             continue
-
         # forward pass
         Z_test = dlrm_wrap(
             X_test,
@@ -1085,9 +1076,7 @@ def run():
         mlperf_logger.barrier()
 
     if args.data_generation == "dataset":
-        train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(
-            args
-        )
+        train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args)
         table_feature_map = {idx: idx for idx in range(len(train_data.counts))}
         nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
         nbatches_test = len(test_ld)
@@ -1800,13 +1789,67 @@ def run():
         # print("inputs", X_onnx, lS_o_onnx, lS_i_onnx)
         # print("output", dlrm_wrap(X_onnx, lS_o_onnx, lS_i_onnx, use_gpu, device))
         dlrm_pytorch_onnx_file = "dlrm_s_pytorch.onnx"
+        batch_size = X_onnx.shape[0]
+        print("X_onnx.shape", X_onnx.shape)
+        if torch.is_tensor(lS_o_onnx):
+            print("lS_o_onnx.shape", lS_o_onnx.shape)
+        else:
+            for oo in lS_o_onnx:
+                print("oo.shape", oo.shape)
+        if torch.is_tensor(lS_i_onnx):
+            print("lS_i_onnx.shape", lS_i_onnx.shape)
+        else:
+            for ii in lS_i_onnx:
+                print("ii.shape", ii.shape)
+
+        # name inputs and outputs
+        o_inputs = (
+            ["offsets"]
+            if torch.is_tensor(lS_o_onnx)
+            else ["offsets_" + str(i) for i in range(len(lS_o_onnx))]
+        )
+        i_inputs = (
+            ["indices"]
+            if torch.is_tensor(lS_i_onnx)
+            else ["indices_" + str(i) for i in range(len(lS_i_onnx))]
+        )
+        all_inputs = ["dense_x"] + o_inputs + i_inputs
+        # debug prints
+        print("inputs", all_inputs)
+
+        # create dynamic_axis dictionaries
+        do_inputs = (
+            [{"offsets": {1: "batch_size"}}]
+            if torch.is_tensor(lS_o_onnx)
+            else [
+                {"offsets_" + str(i): {0: "batch_size"}} for i in range(len(lS_o_onnx))
+            ]
+        )
+        di_inputs = (
+            [{"indices": {1: "batch_size"}}]
+            if torch.is_tensor(lS_i_onnx)
+            else [
+                {"indices_" + str(i): {0: "batch_size"}} for i in range(len(lS_i_onnx))
+            ]
+        )
+        dynamic_axes = {"dense_x": {0: "batch_size"}, "pred": {0: "batch_size"}}
+        for do in do_inputs:
+            dynamic_axes.update(do)
+        for di in di_inputs:
+            dynamic_axes.update(di)
+        # debug prints
+        print(dynamic_axes)
+        # export model
         torch.onnx.export(
             dlrm,
             (X_onnx, lS_o_onnx, lS_i_onnx),
             dlrm_pytorch_onnx_file,
             verbose=True,
             use_external_data_format=True,
-            opset_version=10,
+            opset_version=11,
+            input_names=all_inputs,
+            output_names=["pred"],
+            dynamic_axes=dynamic_axes,
         )
         # recover the model back
         dlrm_pytorch_onnx = onnx.load("dlrm_s_pytorch.onnx")
diff --git a/tricks/md_embedding_bag.py b/tricks/md_embedding_bag.py
index 53c9f7af..7c4071a2 100644
--- a/tricks/md_embedding_bag.py
+++ b/tricks/md_embedding_bag.py
@@ -34,7 +34,10 @@ def md_solver(n, alpha, d0=None, B=None, round_dim=True, k=None):
     d = alpha_power_rule(n.type(torch.float) / k, alpha, d0=d0, B=B)
     if round_dim:
         d = pow_2_round(d)
-    return d
+    undo_sort = [0] * len(indices)
+    for i, v in enumerate(indices):
+        undo_sort[v] = i
+    return d[undo_sort]
 
 
 def alpha_power_rule(n, alpha, d0=None, B=None):
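
Editor's note (illustrative, not part of the patch): the dlrm_s_pytorch.py hunk above gives the exported ONNX graph named inputs and outputs, marks their batch dimensions as dynamic via dynamic_axes, and moves the export from opset 10 to opset 11. The md_embedding_bag.py hunk makes md_solver return d[undo_sort] instead of d, i.e. the per-table embedding dimensions are mapped back to the caller's original table order after the sort that md_solver performs earlier (the line producing indices is above the hunk and not shown). A minimal sketch of that inverse-permutation step, using hypothetical names and toy values:

    import torch

    def inverse_permutation(indices):
        # indices[i] is the original position of the element now at sorted
        # position i, so undo_sort[indices[i]] = i maps sorted order back
        # to the original order.
        undo_sort = [0] * len(indices)
        for i, v in enumerate(indices):
            undo_sort[v] = i
        return undo_sort

    n = torch.tensor([30, 10, 20])        # e.g. rows per embedding table
    n_sorted, indices = torch.sort(n)     # indices = tensor([1, 2, 0])
    d_sorted = torch.tensor([4, 8, 16])   # toy dims assigned in sorted order
    undo = inverse_permutation(indices)   # [2, 0, 1]
    print(d_sorted[undo])                 # tensor([16,  4,  8]) in original table order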