DLRM benchmark with distributed training (facebookresearch#142)
YazhiGao authored Dec 4, 2020
1 parent 52b77f8 commit b9c61a6
Showing 10 changed files with 2,488 additions and 614 deletions.
7 changes: 4 additions & 3 deletions README.md
@@ -337,6 +337,8 @@ Benchmarking
- Corresponding pre-trained model is available under [CC-BY-NC license](https://creativecommons.org/licenses/by-nc/2.0/) and can be downloaded here
[dlrm_emb128_subsample0.0_maxindrange40M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb00_40M.pt)
5) The code now supports synchronous distributed training. The gloo/nccl/mpi backends are supported, and runs can be launched either with the [pytorch distributed launcher](https://pytorch.org/docs/stable/distributed.html#launch-utility) or with MPIRUN (optional).
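As a rough sketch of what such a distributed setup involves (illustrative only, not taken from this commit; the `init_distributed` helper name and the reliance on `RANK`/`WORLD_SIZE`/`MASTER_ADDR`/`MASTER_PORT` environment variables being set by the launcher are assumptions):

```python
# Illustrative sketch only: assumes the launcher (torch.distributed.launch or
# mpirun) exports RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT.
import os

import torch
import torch.distributed as dist


def init_distributed(backend="nccl"):
    # backend may be "gloo", "nccl", or "mpi"; "nccl" requires GPUs
    rank = int(os.environ.get("RANK", 0))
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
    if backend == "nccl" and torch.cuda.is_available():
        torch.cuda.set_device(rank % torch.cuda.device_count())
    return rank, world_size
```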
Model checkpoint saving/loading
-------------------------------
During training, the model can be saved using --save-model=<path/model.pt>
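Checkpoint handling in PyTorch training scripts is typically a thin wrapper around `torch.save`/`torch.load`; the sketch below only illustrates that pattern, and the exact fields written by `--save-model` (and the helper names used here) are assumptions, not taken from this diff:

```python
# Illustrative checkpoint save/load pattern; the keys and helper names are
# assumptions, not DLRM's actual checkpoint format.
import torch


def save_checkpoint(model, optimizer, epoch, path):
    torch.save(
        {
            "epoch": epoch,
            "state_dict": model.state_dict(),
            "opt_state_dict": optimizer.state_dict(),
        },
        path,
    )


def load_checkpoint(model, optimizer, path):
    checkpoint = torch.load(path, map_location="cpu")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["opt_state_dict"])
    return checkpoint["epoch"]
```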
@@ -352,10 +354,11 @@ Alternatively, the saved model can be used to evaluate only on the test data-set
Version
-------
0.1 : Initial release of the DLRM code
1.0 : DLRM with distributed training
Requirements
------------
pytorch-nightly (*6/10/19*)
pytorch-nightly (*11/10/20*)
scikit-learn
@@ -367,8 +370,6 @@ pydot (*optional*)
torchviz (*optional*)
tqdm
License
-------
3 changes: 3 additions & 0 deletions data_loader_terabyte.py
@@ -234,6 +234,9 @@ def __getitem__(self, idx):
max_ind_range=self.max_ind_range,
flag_input_torch_tensor=True)

def __del__(self):
self.file.close()


def numpy_to_binary(input_files, output_file_path, split='train'):
"""Convert the data to a binary format to be read with CriteoBinDataset."""
7 changes: 4 additions & 3 deletions data_utils.py
@@ -883,7 +883,7 @@ def getCriteoAdData(
randomize='total',
criteo_kaggle=True,
memory_map=False,
dataset_multiprocessing=False
dataset_multiprocessing=False,
):
# Passes through entire dataset and defines dictionaries for categorical
# features and determines the number of total categories.
@@ -1175,11 +1175,12 @@ def process_one_file(
convertDicts,
counts,
)
) for i in range (0, days)]
) for i in range(0, days)]
for process in processes:
process.start()
for process in processes:
process.join()

else:
for i in range(days):
processCriteoAdData(d_path, d_file, npzfile, i, convertDicts, counts)
@@ -1225,7 +1226,7 @@ def loadDataset(
lstr = raw_path.split("/")
d_path = "/".join(lstr[0:-1]) + "/"
d_file = lstr[-1].split(".")[0] if dataset == "kaggle" else lstr[-1]
npzfile = d_path + ((d_file + "_day") if dataset == "kaggle" else d_file)
npzfile = (d_file + "_day") if dataset == "kaggle" else d_file
# trafile = d_path + ((d_file + "_fea") if dataset == "kaggle" else "fea")

# check if pre-processed data is available