From d94e52f6951843040f7e11e5c405a740eefd3cf1 Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Mon, 23 Oct 2023 01:00:50 +0000
Subject: [PATCH 01/30] [gb_distdgl] add demo py

---
 dt.py | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 dt.py

diff --git a/dt.py b/dt.py
new file mode 100644
index 000000000000..4dd45b93445c
--- /dev/null
+++ b/dt.py
@@ -0,0 +1,76 @@
+import dgl
+import dgl.graphbolt as gb
+import torch as th
+import numpy as np
+
+# [TODO][P0] Set up distributed environment.
+
+'''
+num_trainers = 8
+num_servers = 4
+num_samplers = 0
+part_config = ./ogbn-products.json
+ip_config = ./ip_config.txt
+'''
+
+args = {}
+
+# Initialize distributed environment
+dgl.distributed.initialize(args.ip_config)
+th.distributed.init_process_group(backend=args.backend)
+# [TODO][P0] Load `CSCSamplingGraph` into `DistGraph`.
+g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
+
+# Generate train/val/test splits
+##############
+# train/val/test splits could be generated offline, then `train/val/test_masks`
+# could be offloaded.
+# No change is required as `node_split` requires the graph partition book and
+# masks only.
+# This should be part of `OnDiskDataset::TVT`.
+# [TODO][P1]: Add a standalone API to generate train/val/test splits.
+##############
+gpb = g.get_partition_book()
+train_nids = dgl.distributed.node_split(g.ndata['train_masks'], gpb)
+val_nids = dgl.distributed.node_split(g.ndata['val_masks'], gpb)
+test_nids = dgl.distributed.node_split(g.ndata['test_masks'], gpb)
+all_nids = dgl.distributed.node_split(th.arange(g.num_nodes()), gpb)
+
+# [TODO][P2] How to handle feature data such as 'feat', 'mask'?
+# Just use `g.ndata['feat']` for now, as no more memory can be offloaded.
+# GB: feat_data = gb.OnDiskDataset().feature
+# DistDGL: feat_data = g.ndata['feat'] # DistTensor
+
+
+# Train.
+##############
+# GraphBolt version
+# [TODO][P0] Add `gb.distributed_sample_neighbor` API.
+# [TODO][P0] `remote_sample_neighbor()` returns original global node pairs + eids.
+# [TODO][P0] Update `dgl.distributed.merge_graphs` API.
+# https://github.com/dmlc/dgl/blob/7439b7e73bdb85b4285ab01f704ac5a4f77c927e/python/dgl/distributed/graph_services.py#L440.
+##############
+'''
+datapipe = gb.ItemSampler(item_set, batch_size=batch_size, shuffle=shuffle)
+datapipe = datapipe.sample_neighbor(g._graph, fanouts=fanouts)
+datapipe = datapipe.to_dgl()
+device = th.device("cpu")
+datapipe = datapipe.copy_to(device)
+data_loader = gb.MultiProcessDataLoader(datapipe, num_workers=num_workers)
+'''
+sampler = dgl.dataloading.NeighborSampler([25, 10])
+train_dataloader = dgl.distributed.DistDataLoader(
+    g, train_nids, sampler=sampler, batch_size=args.batch_size, shuffle=True)
+model = None
+for mini_batch in train_dataloader:
+    in_feats = g.ndata['feat'][mini_batch.input_nodes]
+    labels = g.ndata['label'][mini_batch.output_nodes]
+    _ = model(mini_batch, in_feats)
+
+# Evaluate.
+model.eval()
+sampler = dgl.dataloading.NeighborSampler([-1])
+val_dataloader = dgl.distributed.DistDataLoader(
+    g, val_nids, sampler=sampler, batch_size=args.batch_size, shuffle=False)
+test_dataloader = dgl.distributed.DistDataLoader(
+    g, test_nids, sampler=sampler, batch_size=args.batch_size, shuffle=False)

From 9de41c6277236e7989f2bcde1c04ec486d2729ac Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Mon, 23 Oct 2023 06:43:20 +0000
Subject: [PATCH 02/30] [gb_distdgl] enable storing node_attributes in
 CSCSamplingGraph

---
 dt.py                                         |  30 +++--
 .../include/graphbolt/csc_sampling_graph.h    |  21 +++
 graphbolt/src/csc_sampling_graph.cc           |  48 ++++++-
 graphbolt/src/python_binding.cc               |   2 +
 python/dgl/graphbolt/base.py                  |   2 +
 .../dgl/graphbolt/impl/csc_sampling_graph.py  |  53 +++++++-
 python/dgl/graphbolt/impl/ondisk_dataset.py   |  15 ++-
 .../graphbolt/impl/test_csc_sampling_graph.py | 120 ++++++++++++++++--
 .../graphbolt/impl/test_ondisk_dataset.py     |  32 ++++-
 9 files changed, 287 insertions(+), 36 deletions(-)

diff --git a/dt.py b/dt.py
index 4dd45b93445c..42bdcad1c6a3 100644
--- a/dt.py
+++ b/dt.py
@@ -1,17 +1,17 @@
 import dgl
 import dgl.graphbolt as gb
-import torch as th
 import numpy as np
+import torch as th
 
 # [TODO][P0] Set up distributed environment.
 
-'''
+"""
 num_trainers = 8
 num_servers = 4
 num_samplers = 0
 part_config = ./ogbn-products.json
 ip_config = ./ip_config.txt
-'''
+"""
 
 args = {}
 
@@ -19,6 +19,7 @@
 dgl.distributed.initialize(args.ip_config)
 th.distributed.init_process_group(backend=args.backend)
 # [TODO][P0] Load `CSCSamplingGraph` into `DistGraph`.
+## NID/EIDs are required.
 g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
 
 # Generate train/val/test splits
@@ -31,9 +32,9 @@
 # [TODO][P1]: Add a standalone API to generate train/val/test splits.
 ##############
 gpb = g.get_partition_book()
-train_nids = dgl.distributed.node_split(g.ndata['train_masks'], gpb)
-val_nids = dgl.distributed.node_split(g.ndata['val_masks'], gpb)
-test_nids = dgl.distributed.node_split(g.ndata['test_masks'], gpb)
+train_nids = dgl.distributed.node_split(g.ndata["train_masks"], gpb)
+val_nids = dgl.distributed.node_split(g.ndata["val_masks"], gpb)
+test_nids = dgl.distributed.node_split(g.ndata["test_masks"], gpb)
 all_nids = dgl.distributed.node_split(th.arange(g.num_nodes()), gpb)
 
 # [TODO][P2] How to handle feature data such as 'feat', 'mask'?
@@ -50,27 +51,30 @@
 # [TODO][P0] Update `dgl.distributed.merge_graphs` API.
 # https://github.com/dmlc/dgl/blob/7439b7e73bdb85b4285ab01f704ac5a4f77c927e/python/dgl/distributed/graph_services.py#L440.
 ##############
-'''
+"""
 datapipe = gb.ItemSampler(item_set, batch_size=batch_size, shuffle=shuffle)
 datapipe = datapipe.sample_neighbor(g._graph, fanouts=fanouts)
 datapipe = datapipe.to_dgl()
 device = th.device("cpu")
 datapipe = datapipe.copy_to(device)
 data_loader = gb.MultiProcessDataLoader(datapipe, num_workers=num_workers)
-'''
+"""
 sampler = dgl.dataloading.NeighborSampler([25, 10])
 train_dataloader = dgl.distributed.DistDataLoader(
-    g, train_nids, sampler=sampler, batch_size=args.batch_size, shuffle=True)
+    g, train_nids, sampler=sampler, batch_size=args.batch_size, shuffle=True
+)
 model = None
 for mini_batch in train_dataloader:
-    in_feats = g.ndata['feat'][mini_batch.input_nodes]
-    labels = g.ndata['label'][mini_batch.output_nodes]
+    in_feats = g.ndata["feat"][mini_batch.input_nodes]
+    labels = g.ndata["label"][mini_batch.output_nodes]
     _ = model(mini_batch, in_feats)
 
 # Evaluate.
model.eval() sampler = dgl.dataloading.NeighborSampler([-1]) val_dataloader = dgl.distributed.DistDataLoader( - g, val_nids, sampler=sampler, batch_size=args.batch_size, shuffle=False) + g, val_nids, sampler=sampler, batch_size=args.batch_size, shuffle=False +) test_dataloader = dgl.distributed.DistDataLoader( - g, test_nids, sampler=sampler, batch_size=args.batch_size, shuffle=False) + g, test_nids, sampler=sampler, batch_size=args.batch_size, shuffle=False +) diff --git a/graphbolt/include/graphbolt/csc_sampling_graph.h b/graphbolt/include/graphbolt/csc_sampling_graph.h index c228998e3d31..ee8210e9172c 100644 --- a/graphbolt/include/graphbolt/csc_sampling_graph.h +++ b/graphbolt/include/graphbolt/csc_sampling_graph.h @@ -48,6 +48,7 @@ struct SamplerArgs { */ class CSCSamplingGraph : public torch::CustomClassHolder { public: + using NodeAttrMap = torch::Dict; using EdgeAttrMap = torch::Dict; /** @brief Default constructor. */ CSCSamplingGraph() = default; @@ -65,6 +66,7 @@ class CSCSamplingGraph : public torch::CustomClassHolder { const torch::Tensor& indptr, const torch::Tensor& indices, const torch::optional& node_type_offset, const torch::optional& type_per_edge, + const torch::optional& node_attributes, const torch::optional& edge_attributes); /** @@ -82,6 +84,7 @@ class CSCSamplingGraph : public torch::CustomClassHolder { const torch::Tensor& indptr, const torch::Tensor& indices, const torch::optional& node_type_offset, const torch::optional& type_per_edge, + const torch::optional& node_attributes, const torch::optional& edge_attributes); /** @brief Get the number of nodes. */ @@ -106,6 +109,11 @@ class CSCSamplingGraph : public torch::CustomClassHolder { return type_per_edge_; } + /** @brief Get the node attributes dictionary. */ + inline const torch::optional NodeAttributes() const { + return node_attributes_; + } + /** @brief Get the edge attributes dictionary. */ inline const torch::optional EdgeAttributes() const { return edge_attributes_; @@ -129,6 +137,12 @@ class CSCSamplingGraph : public torch::CustomClassHolder { type_per_edge_ = type_per_edge; } + /** @brief Set the node attributes dictionary. */ + inline void SetNodeAttributes( + const torch::optional& node_attributes) { + node_attributes_ = node_attributes; + } + /** @brief Set the edge attributes dictionary. */ inline void SetEdgeAttributes( const torch::optional& edge_attributes) { @@ -302,6 +316,13 @@ class CSCSamplingGraph : public torch::CustomClassHolder { */ torch::optional type_per_edge_; + /** + * @brief A dictionary of node attributes. Each key represents the attribute's + * name, while the corresponding value holds the attribute's specific value. + * The length of each value should match the total number of nodes." + */ + torch::optional node_attributes_; + /** * @brief A dictionary of edge attributes. Each key represents the attribute's * name, while the corresponding value holds the attribute's specific value. 
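A quick illustration of the contract documented in the header above (an editorial sketch with toy tensors, not code from the patch): once the `from_csc` changes later in this patch land, every value in `node_attributes` must hold one entry per node, matching the `indptr.size(0) - 1` check added to `FromCSC` below.

import torch
import dgl.graphbolt as gb

# Three nodes and four edges in CSC form.
indptr = torch.tensor([0, 1, 3, 4])
indices = torch.tensor([0, 1, 2, 2])
# One attribute value per node; a 2-element tensor here would trip the
# length TORCH_CHECK in CSCSamplingGraph::FromCSC.
graph = gb.from_csc(indptr, indices, node_attributes={"_ID": torch.arange(3)})
print(graph.node_attributes["_ID"])  # tensor([0, 1, 2])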
diff --git a/graphbolt/src/csc_sampling_graph.cc b/graphbolt/src/csc_sampling_graph.cc index 30798d720287..26cf6303123e 100644 --- a/graphbolt/src/csc_sampling_graph.cc +++ b/graphbolt/src/csc_sampling_graph.cc @@ -28,11 +28,13 @@ CSCSamplingGraph::CSCSamplingGraph( const torch::Tensor& indptr, const torch::Tensor& indices, const torch::optional& node_type_offset, const torch::optional& type_per_edge, + const torch::optional& node_attributes, const torch::optional& edge_attributes) : indptr_(indptr), indices_(indices), node_type_offset_(node_type_offset), type_per_edge_(type_per_edge), + node_attributes_(node_attributes), edge_attributes_(edge_attributes) { TORCH_CHECK(indptr.dim() == 1); TORCH_CHECK(indices.dim() == 1); @@ -43,6 +45,7 @@ c10::intrusive_ptr CSCSamplingGraph::FromCSC( const torch::Tensor& indptr, const torch::Tensor& indices, const torch::optional& node_type_offset, const torch::optional& type_per_edge, + const torch::optional& node_attributes, const torch::optional& edge_attributes) { if (node_type_offset.has_value()) { auto& offset = node_type_offset.value(); @@ -52,13 +55,18 @@ c10::intrusive_ptr CSCSamplingGraph::FromCSC( TORCH_CHECK(type_per_edge.value().dim() == 1); TORCH_CHECK(type_per_edge.value().size(0) == indices.size(0)); } + if (node_attributes.has_value()) { + for (const auto& pair : node_attributes.value()) { + TORCH_CHECK(pair.value().size(0) == indptr.size(0) - 1); + } + } if (edge_attributes.has_value()) { for (const auto& pair : edge_attributes.value()) { TORCH_CHECK(pair.value().size(0) == indices.size(0)); } } return c10::make_intrusive( - indptr, indices, node_type_offset, type_per_edge, edge_attributes); + indptr, indices, node_type_offset, type_per_edge, node_attributes, edge_attributes); } void CSCSamplingGraph::Load(torch::serialize::InputArchive& archive) { @@ -81,6 +89,25 @@ void CSCSamplingGraph::Load(torch::serialize::InputArchive& archive) { read_from_archive(archive, "CSCSamplingGraph/type_per_edge").toTensor(); } + // Optional node attributes. + torch::IValue has_node_attributes; + if (archive.try_read( + "CSCSamplingGraph/has_node_attributes", has_node_attributes) && + has_node_attributes.toBool()) { + torch::Dict generic_dict = + read_from_archive(archive, "CSCSamplingGraph/node_attributes") + .toGenericDict(); + NodeAttrMap target_dict; + for (const auto& pair : generic_dict) { + std::string key = pair.key().toStringRef(); + torch::Tensor value = pair.value().toTensor(); + // Use move to avoid copy. + target_dict.insert(std::move(key), std::move(value)); + } + // Same as above. + node_attributes_ = std::move(target_dict); + } + // Optional edge attributes. torch::IValue has_edge_attributes; if (archive.try_read( @@ -116,6 +143,12 @@ void CSCSamplingGraph::Save(torch::serialize::OutputArchive& archive) const { if (type_per_edge_) { archive.write("CSCSamplingGraph/type_per_edge", type_per_edge_.value()); } + archive.write( + "CSCSamplingGraph/has_node_attributes", node_attributes_.has_value()); + if (node_attributes_) { + archive.write( + "CSCSamplingGraph/node_attributes", node_attributes_.value()); + } archive.write( "CSCSamplingGraph/has_edge_attributes", edge_attributes_.has_value()); if (edge_attributes_) { @@ -127,7 +160,7 @@ void CSCSamplingGraph::SetState( const torch::Dict>& state) { // State is a dict of dicts. The tensor-type attributes are stored in the dict - // with key "independent_tensors". The dict-type attributes (edge_attributes) + // with key "independent_tensors". 
The dict-type attributes (node/edge_attributes) // are stored directly with the their name as the key. const auto& independent_tensors = state.at("independent_tensors"); TORCH_CHECK( @@ -143,6 +176,9 @@ void CSCSamplingGraph::SetState( if (independent_tensors.find("type_per_edge") != independent_tensors.end()) { type_per_edge_ = independent_tensors.at("type_per_edge"); } + if (state.find("node_attributes") != state.end()) { + node_attributes_ = state.at("node_attributes"); + } if (state.find("edge_attributes") != state.end()) { edge_attributes_ = state.at("edge_attributes"); } @@ -151,7 +187,7 @@ void CSCSamplingGraph::SetState( torch::Dict> CSCSamplingGraph::GetState() const { // State is a dict of dicts. The tensor-type attributes are stored in the dict - // with key "independent_tensors". The dict-type attributes (edge_attributes) + // with key "independent_tensors". The dict-type attributes (node/edge_attributes) // are stored directly with the their name as the key. torch::Dict> state; torch::Dict independent_tensors; @@ -167,6 +203,9 @@ CSCSamplingGraph::GetState() const { independent_tensors.insert("type_per_edge", type_per_edge_.value()); } state.insert("independent_tensors", independent_tensors); + if (node_attributes_.has_value()) { + state.insert("node_attributes", node_attributes_.value()); + } if (edge_attributes_.has_value()) { state.insert("edge_attributes", edge_attributes_.value()); } @@ -469,9 +508,11 @@ static c10::intrusive_ptr BuildGraphFromSharedMemoryHelper( auto indices = helper.ReadTorchTensor(); auto node_type_offset = helper.ReadTorchTensor(); auto type_per_edge = helper.ReadTorchTensor(); + auto node_attributes = helper.ReadTorchTensorDict(); auto edge_attributes = helper.ReadTorchTensorDict(); auto graph = c10::make_intrusive( indptr.value(), indices.value(), node_type_offset, type_per_edge, + node_attributes, edge_attributes); auto shared_memory = helper.ReleaseSharedMemory(); graph->HoldSharedMemoryObject( @@ -486,6 +527,7 @@ c10::intrusive_ptr CSCSamplingGraph::CopyToSharedMemory( helper.WriteTorchTensor(indices_); helper.WriteTorchTensor(node_type_offset_); helper.WriteTorchTensor(type_per_edge_); + helper.WriteTorchTensorDict(node_attributes_); helper.WriteTorchTensorDict(edge_attributes_); helper.Flush(); return BuildGraphFromSharedMemoryHelper(std::move(helper)); diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc index 06bcc9c98acc..4fc691fcf4c5 100644 --- a/graphbolt/src/python_binding.cc +++ b/graphbolt/src/python_binding.cc @@ -31,11 +31,13 @@ TORCH_LIBRARY(graphbolt, m) { .def("indices", &CSCSamplingGraph::Indices) .def("node_type_offset", &CSCSamplingGraph::NodeTypeOffset) .def("type_per_edge", &CSCSamplingGraph::TypePerEdge) + .def("node_attributes", &CSCSamplingGraph::NodeAttributes) .def("edge_attributes", &CSCSamplingGraph::EdgeAttributes) .def("set_csc_indptr", &CSCSamplingGraph::SetCSCIndptr) .def("set_indices", &CSCSamplingGraph::SetIndices) .def("set_node_type_offset", &CSCSamplingGraph::SetNodeTypeOffset) .def("set_type_per_edge", &CSCSamplingGraph::SetTypePerEdge) + .def("set_node_attributes", &CSCSamplingGraph::SetNodeAttributes) .def("set_edge_attributes", &CSCSamplingGraph::SetEdgeAttributes) .def("in_subgraph", &CSCSamplingGraph::InSubgraph) .def("sample_neighbors", &CSCSamplingGraph::SampleNeighbors) diff --git a/python/dgl/graphbolt/base.py b/python/dgl/graphbolt/base.py index d460bb2332fc..42a2caa5e97a 100644 --- a/python/dgl/graphbolt/base.py +++ b/python/dgl/graphbolt/base.py @@ -8,6 +8,7 @@ __all__ = [ 
"CANONICAL_ETYPE_DELIMITER", + "ORIGINAL_NODE_ID", "ORIGINAL_EDGE_ID", "etype_str_to_tuple", "etype_tuple_to_str", @@ -16,6 +17,7 @@ ] CANONICAL_ETYPE_DELIMITER = ":" +ORIGINAL_NODE_ID = "_ORIGINAL_NODE_ID" ORIGINAL_EDGE_ID = "_ORIGINAL_EDGE_ID" diff --git a/python/dgl/graphbolt/impl/csc_sampling_graph.py b/python/dgl/graphbolt/impl/csc_sampling_graph.py index af9550f77b95..05c580965d94 100644 --- a/python/dgl/graphbolt/impl/csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/csc_sampling_graph.py @@ -10,10 +10,15 @@ from dgl.utils import recursive_apply -from ...base import EID, ETYPE +from ...base import EID, ETYPE, NID from ...convert import to_homogeneous from ...heterograph import DGLGraph -from ..base import etype_str_to_tuple, etype_tuple_to_str, ORIGINAL_EDGE_ID +from ..base import ( + etype_str_to_tuple, + etype_tuple_to_str, + ORIGINAL_EDGE_ID, + ORIGINAL_NODE_ID, +) from ..sampling_graph import SamplingGraph from .sampled_subgraph_impl import SampledSubgraphImpl @@ -251,6 +256,27 @@ def type_per_edge(self, type_per_edge: Optional[torch.Tensor]) -> None: """Sets the edge type tensor if present.""" self._c_csc_graph.set_type_per_edge(type_per_edge) + @property + def node_attributes(self) -> Optional[Dict[str, torch.Tensor]]: + """Returns the node attributes dictionary. + + Returns + ------- + torch.Tensor or None + If present, returns a dictionary of node attributes. Each key + represents the attribute's name, while the corresponding value + holds the attribute's specific value. The length of each value + should match the total number of nodes." + """ + return self._c_csc_graph.node_attributes() + + @node_attributes.setter + def node_attributes( + self, node_attributes: Optional[Dict[str, torch.Tensor]] + ) -> None: + """Sets the node attributes dictionary.""" + self._c_csc_graph.set_node_attributes(node_attributes) + @property def edge_attributes(self) -> Optional[Dict[str, torch.Tensor]]: """Returns the edge attributes dictionary. @@ -321,6 +347,7 @@ def _convert_to_sampled_subgraph( column_num ) row = C_sampled_subgraph.indices + type_per_edge = C_sampled_subgraph.type_per_edge original_edge_ids = C_sampled_subgraph.original_edge_ids has_original_eids = ( @@ -529,6 +556,10 @@ def _sample_neighbors( """ # Ensure nodes is 1-D tensor. 
self._check_sampler_arguments(nodes, fanouts, probs_name) + has_original_nids = ( + self.node_attributes is not None + and ORIGINAL_NODE_ID in self.node_attributes + ) has_original_eids = ( self.edge_attributes is not None and ORIGINAL_EDGE_ID in self.edge_attributes @@ -618,6 +649,10 @@ def sample_layer_neighbors( nodes = self._convert_to_homogeneous_nodes(nodes) self._check_sampler_arguments(nodes, fanouts, probs_name) + has_original_nids = ( + self.node_attributes is not None + and ORIGINAL_NODE_ID in self.node_attributes + ) has_original_eids = ( self.edge_attributes is not None and ORIGINAL_EDGE_ID in self.edge_attributes @@ -721,6 +756,9 @@ def _to(x, device): self.type_per_edge = recursive_apply( self.type_per_edge, lambda x: _to(x, device) ) + self.node_attributes = recursive_apply( + self.node_attributes, lambda x: _to(x, device) + ) self.edge_attributes = recursive_apply( self.edge_attributes, lambda x: _to(x, device) ) @@ -733,6 +771,7 @@ def from_csc( indices: torch.Tensor, node_type_offset: Optional[torch.tensor] = None, type_per_edge: Optional[torch.tensor] = None, + node_attributes: Optional[Dict[str, torch.tensor]] = None, edge_attributes: Optional[Dict[str, torch.tensor]] = None, metadata: Optional[GraphMetadata] = None, ) -> CSCSamplingGraph: @@ -750,6 +789,8 @@ def from_csc( Offset of node types in the graph, by default None. type_per_edge : Optional[torch.tensor], optional Type ids of each edge in the graph, by default None. + node_attributes: Optional[Dict[str, torch.tensor]], optional + Node attributes of the graph, by default None. edge_attributes: Optional[Dict[str, torch.tensor]], optional Edge attributes of the graph, by default None. metadata: Optional[GraphMetadata], optional @@ -785,6 +826,7 @@ def from_csc( indices, node_type_offset, type_per_edge, + node_attributes, edge_attributes, ), metadata, @@ -880,6 +922,7 @@ def save_csc_sampling_graph(graph, filename): def from_dglgraph( g: DGLGraph, is_homogeneous: bool = False, + include_original_node_id: bool = False, include_original_edge_id: bool = False, ) -> CSCSamplingGraph: """Convert a DGLGraph to CSCSamplingGraph.""" @@ -905,6 +948,11 @@ def from_dglgraph( # Assign edge type according to the order of CSC matrix. type_per_edge = None if is_homogeneous else homo_g.edata[ETYPE][edge_ids] + node_attributes = {} + if include_original_node_id: + # Assign node attributes according to the original nids mapping. + node_attributes[ORIGINAL_NODE_ID] = homo_g.ndata[NID] + edge_attributes = {} if include_original_edge_id: # Assign edge attributes according to the original eids mapping. @@ -916,6 +964,7 @@ def from_dglgraph( indices, node_type_offset, type_per_edge, + node_attributes, edge_attributes, ), metadata, diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 03f0a6806493..904344f08ad0 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -35,7 +35,9 @@ def preprocess_ondisk_dataset( - dataset_dir: str, include_original_edge_id: bool = False + dataset_dir: str, + include_original_node_id: bool = False, + include_original_edge_id: bool = False, ) -> str: """Preprocess the on-disk dataset. Parse the input config file, load the data, and save the data in the format that GraphBolt supports. @@ -138,7 +140,7 @@ def preprocess_ondisk_dataset( # 4. Convert the DGLGraph to a CSCSamplingGraph. 
csc_sampling_graph = from_dglgraph( - g, is_homogeneous, include_original_edge_id + g, is_homogeneous, include_original_node_id, include_original_edge_id ) # 5. Save the CSCSamplingGraph and modify the output_config. @@ -340,12 +342,17 @@ class OnDiskDataset(Dataset): """ def __init__( - self, path: str, include_original_edge_id: bool = False + self, + path: str, + include_original_node_id: bool = False, + include_original_edge_id: bool = False, ) -> None: # Always call the preprocess function first. If already preprocessed, # the function will return the original path directly. self._dataset_dir = path - yaml_path = preprocess_ondisk_dataset(path, include_original_edge_id) + yaml_path = preprocess_ondisk_dataset( + path, include_original_node_id, include_original_edge_id + ) with open(yaml_path) as f: self._yaml_data = yaml.load(f, Loader=yaml.loader.SafeLoader) diff --git a/tests/python/pytorch/graphbolt/impl/test_csc_sampling_graph.py b/tests/python/pytorch/graphbolt/impl/test_csc_sampling_graph.py index 02850c4da356..186d69af62ff 100644 --- a/tests/python/pytorch/graphbolt/impl/test_csc_sampling_graph.py +++ b/tests/python/pytorch/graphbolt/impl/test_csc_sampling_graph.py @@ -58,6 +58,7 @@ def test_hetero_empty_graph(total_num_nodes): node_type_offset, type_per_edge, None, + None, metadata, ) assert graph.total_num_edges == 0 @@ -115,11 +116,17 @@ def test_homo_graph(total_num_nodes, total_num_edges): csc_indptr, indices = gbt.random_homo_graph( total_num_nodes, total_num_edges ) + node_attributes = {"_ID": torch.arange(total_num_nodes)} edge_attributes = { "A1": torch.randn(total_num_edges), "A2": torch.randn(total_num_edges), } - graph = gb.from_csc(csc_indptr, indices, edge_attributes=edge_attributes) + graph = gb.from_csc( + csc_indptr, + indices, + node_attributes=node_attributes, + edge_attributes=edge_attributes, + ) assert graph.total_num_nodes == total_num_nodes assert graph.total_num_edges == total_num_edges @@ -127,6 +134,7 @@ def test_homo_graph(total_num_nodes, total_num_edges): assert torch.equal(csc_indptr, graph.csc_indptr) assert torch.equal(indices, graph.indices) + assert graph.node_attributes == node_attributes assert graph.edge_attributes == edge_attributes assert graph.metadata is None assert graph.node_type_offset is None @@ -152,6 +160,7 @@ def test_hetero_graph(total_num_nodes, total_num_edges, num_ntypes, num_etypes): ) = gbt.random_hetero_graph( total_num_nodes, total_num_edges, num_ntypes, num_etypes ) + node_attributes = {"_ID": torch.arange(total_num_nodes)} edge_attributes = { "A1": torch.randn(total_num_edges), "A2": torch.randn(total_num_edges), @@ -161,6 +170,7 @@ def test_hetero_graph(total_num_nodes, total_num_edges, num_ntypes, num_etypes): indices, node_type_offset, type_per_edge, + node_attributes, edge_attributes, metadata, ) @@ -172,6 +182,7 @@ def test_hetero_graph(total_num_nodes, total_num_edges, num_ntypes, num_etypes): assert torch.equal(indices, graph.indices) assert torch.equal(node_type_offset, graph.node_type_offset) assert torch.equal(type_per_edge, graph.type_per_edge) + assert graph.node_attributes == node_attributes assert graph.edge_attributes == edge_attributes assert metadata.node_type_to_id == graph.metadata.node_type_to_id assert metadata.edge_type_to_id == graph.metadata.edge_type_to_id @@ -242,7 +253,7 @@ def test_num_nodes_hetero(): # Construct CSCSamplingGraph. 
metadata = gb.GraphMetadata(ntypes, etypes) graph = gb.from_csc( - indptr, indices, node_type_offset, type_per_edge, None, metadata + indptr, indices, node_type_offset, type_per_edge, None, None, metadata ) # Verify nodes number per node types. @@ -274,7 +285,13 @@ def test_node_type_offset_wrong_legnth(node_type_offset): ) with pytest.raises(Exception): gb.from_csc( - csc_indptr, indices, node_type_offset, type_per_edge, None, metadata + csc_indptr, + indices, + node_type_offset, + type_per_edge, + None, + None, + metadata, ) @@ -330,7 +347,13 @@ def test_load_save_hetero_graph( total_num_nodes, total_num_edges, num_ntypes, num_etypes ) graph = gb.from_csc( - csc_indptr, indices, node_type_offset, type_per_edge, None, metadata + csc_indptr, + indices, + node_type_offset, + type_per_edge, + None, + None, + metadata, ) with tempfile.TemporaryDirectory() as test_dir: @@ -398,6 +421,7 @@ def test_pickle_hetero_graph( ) = gbt.random_hetero_graph( total_num_nodes, total_num_edges, num_ntypes, num_etypes ) + node_attributes = {"_ID": torch.arange(total_num_nodes)} edge_attributes = { "a": torch.randn((total_num_edges,)), "b": torch.randint(1, 10, (total_num_edges,)), @@ -407,6 +431,7 @@ def test_pickle_hetero_graph( indices, node_type_offset, type_per_edge, + node_attributes, edge_attributes, metadata, ) @@ -423,6 +448,7 @@ def test_pickle_hetero_graph( assert torch.equal(graph.type_per_edge, graph2.type_per_edge) assert graph.metadata.node_type_to_id == graph2.metadata.node_type_to_id assert graph.metadata.edge_type_to_id == graph2.metadata.edge_type_to_id + assert graph.node_attributes == node_attributes assert graph.edge_attributes.keys() == graph2.edge_attributes.keys() for i in graph.edge_attributes.keys(): assert torch.equal(graph.edge_attributes[i], graph2.edge_attributes[i]) @@ -458,6 +484,7 @@ def test_multiprocessing(): indices, node_type_offset, type_per_edge, + None, edge_attributes, metadata, ) @@ -555,7 +582,7 @@ def test_in_subgraph_heterogeneous(): # Construct CSCSamplingGraph. metadata = gb.GraphMetadata(ntypes, etypes) graph = gb.from_csc( - indptr, indices, node_type_offset, type_per_edge, None, metadata + indptr, indices, node_type_offset, type_per_edge, None, None, metadata ) # Extract in subgraph. 
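The shared-memory tests that follow exercise the round trip sketched here (an editorial sketch; the segment name is a placeholder): node and edge attributes travel with the structure, and both copies end up backed by the same shared-memory segment.

import torch
import dgl.graphbolt as gb

indptr = torch.tensor([0, 2, 3, 5])
indices = torch.tensor([1, 2, 0, 0, 1])
g = gb.from_csc(indptr, indices, node_attributes={"_ID": torch.arange(3)})
# Copy into a named shared-memory segment, then map it back in, as the
# tests below do across processes.
g1 = g.copy_to_shared_memory("demo_shm_graph")
g2 = gb.load_from_shared_memory("demo_shm_graph", None)
assert torch.equal(g1.node_attributes["_ID"], g2.node_attributes["_ID"])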
@@ -1045,12 +1072,17 @@ def check_tensors_on_the_same_shared_memory(t1: torch.Tensor, t2: torch.Tensor): [(1, 1), (100, 1), (10, 50), (1000, 50000)], ) @pytest.mark.parametrize("test_edge_attrs", [True, False]) +@pytest.mark.parametrize("test_node_attrs", [True, False]) def test_homo_graph_on_shared_memory( - total_num_nodes, total_num_edges, test_edge_attrs + total_num_nodes, total_num_edges, test_edge_attrs, test_node_attrs ): csc_indptr, indices = gbt.random_homo_graph( total_num_nodes, total_num_edges ) + if test_node_attrs: + node_attributes = {"_ID": torch.arange(total_num_nodes)} + else: + node_attributes = None if test_edge_attrs: edge_attributes = { "A1": torch.randn(total_num_edges), @@ -1058,7 +1090,12 @@ def test_homo_graph_on_shared_memory( } else: edge_attributes = None - graph = gb.from_csc(csc_indptr, indices, edge_attributes=edge_attributes) + graph = gb.from_csc( + csc_indptr, + indices, + node_attributes=node_attributes, + edge_attributes=edge_attributes, + ) shm_name = "test_homo_g" graph1 = graph.copy_to_shared_memory(shm_name) @@ -1083,6 +1120,15 @@ def test_homo_graph_on_shared_memory( ) check_tensors_on_the_same_shared_memory(graph1.indices, graph2.indices) + if test_node_attrs: + for name, node_attr in node_attributes.items(): + assert name in graph1.node_attributes + assert name in graph2.node_attributes + assert torch.equal(graph1.node_attributes[name], node_attr) + check_tensors_on_the_same_shared_memory( + graph1.node_attributes[name], graph2.node_attributes[name] + ) + if test_edge_attrs: for name, edge_attr in edge_attributes.items(): assert name in graph1.edge_attributes @@ -1107,8 +1153,14 @@ def test_homo_graph_on_shared_memory( ) @pytest.mark.parametrize("num_ntypes, num_etypes", [(1, 1), (3, 5), (100, 1)]) @pytest.mark.parametrize("test_edge_attrs", [True, False]) +@pytest.mark.parametrize("test_node_attrs", [True, False]) def test_hetero_graph_on_shared_memory( - total_num_nodes, total_num_edges, num_ntypes, num_etypes, test_edge_attrs + total_num_nodes, + total_num_edges, + num_ntypes, + num_etypes, + test_edge_attrs, + test_node_attrs, ): ( csc_indptr, @@ -1120,6 +1172,11 @@ def test_hetero_graph_on_shared_memory( total_num_nodes, total_num_edges, num_ntypes, num_etypes ) + if test_node_attrs: + node_attributes = {"_ID": torch.arange(total_num_nodes)} + else: + node_attributes = None + if test_edge_attrs: edge_attributes = { "A1": torch.randn(total_num_edges), @@ -1132,6 +1189,7 @@ def test_hetero_graph_on_shared_memory( indices, node_type_offset, type_per_edge, + node_attributes, edge_attributes, metadata, ) @@ -1169,6 +1227,15 @@ def test_hetero_graph_on_shared_memory( graph1.type_per_edge, graph2.type_per_edge ) + if test_node_attrs: + for name, node_attr in node_attributes.items(): + assert name in graph1.node_attributes + assert name in graph2.node_attributes + assert torch.equal(graph1.node_attributes[name], node_attr) + check_tensors_on_the_same_shared_memory( + graph1.node_attributes[name], graph2.node_attributes[name] + ) + if test_edge_attrs: for name, edge_attr in edge_attributes.items(): assert name in graph1.edge_attributes @@ -1256,6 +1323,7 @@ def test_multiprocessing_with_shared_memory(): node_type_offset, type_per_edge, None, + None, metadata, ) @@ -1300,19 +1368,29 @@ def test_from_dglgraph_homogeneous(): gb_g = gb.from_dglgraph( dgl_g, is_homogeneous=False, include_original_edge_id=False ) + assert ( + gb_g.node_attributes is None + or gb.ORIGINAL_NODE_ID not in gb_g.node_attributes + ) assert ( gb_g.edge_attributes is None or 
gb.ORIGINAL_EDGE_ID not in gb_g.edge_attributes ) gb_g = gb.from_dglgraph( - dgl_g, is_homogeneous=True, include_original_edge_id=True + dgl_g, + is_homogeneous=True, + include_original_node_id=True, + include_original_edge_id=True, ) # Get the COO representation of the CSCSamplingGraph. num_columns = gb_g.csc_indptr[1:] - gb_g.csc_indptr[:-1] rows = gb_g.indices columns = torch.arange(gb_g.total_num_nodes).repeat_interleave(num_columns) + original_node_ids = gb_g.node_attributes[gb.ORIGINAL_NODE_ID] + assert torch.all(original_node_ids == torch.arange(gb_g.num_nodes)) + original_edge_ids = gb_g.edge_attributes[gb.ORIGINAL_EDGE_ID] assert torch.all(dgl_g.edges()[0][original_edge_ids] == rows) assert torch.all(dgl_g.edges()[1][original_edge_ids] == columns) @@ -1351,13 +1429,20 @@ def test_from_dglgraph_heterogeneous(): gb_g = gb.from_dglgraph( dgl_g, is_homogeneous=False, include_original_edge_id=False ) + assert ( + gb_g.node_attributes is None + or gb.ORIGINAL_NODE_ID not in gb_g.node_attributes + ) assert ( gb_g.edge_attributes is None or gb.ORIGINAL_EDGE_ID not in gb_g.edge_attributes ) gb_g = gb.from_dglgraph( - dgl_g, is_homogeneous=False, include_original_edge_id=True + dgl_g, + is_homogeneous=False, + include_original_node_id=True, + include_original_edge_id=True, ) # `reverse_node_id` is used to map the node id in CSCSamplingGraph to the @@ -1385,6 +1470,14 @@ def test_from_dglgraph_heterogeneous(): == dgl_g.etypes ) + for ntype, num_nodes in gb_g.num_nodes.items(): + original_node_ids = gb_g.node_attributes[gb.ORIGINAL_NODE_ID][ + gb_g.node_type_offset[ + gb_g.metadata.node_type_to_id[ntype] + ] : gb_g.node_type_offset[gb_g.metadata.node_type_to_id[ntype] + 1] + ] + assert torch.all(original_node_ids == torch.arange(num_nodes)) + # Use ORIGINAL_EDGE_ID to check if the edge mapping is correct. 
for edge_idx in range(gb_g.total_num_edges): hetero_graph_idx = gb_g.type_per_edge[edge_idx] @@ -1630,6 +1723,10 @@ def test_csc_sampling_graph_to_device(): assert node_type_offset[-1] == total_num_nodes assert all(type_per_edge < len(etypes)) + node_attributes = { + "_ID": torch.arange(total_num_nodes), + } + edge_attributes = { "mask": torch.BoolTensor([1, 1, 0, 1, 1, 1, 0, 0, 0]), "all": torch.BoolTensor([1, 1, 1, 1, 1, 1, 1, 1, 1]), @@ -1640,6 +1737,7 @@ def test_csc_sampling_graph_to_device(): graph = gb.from_csc( indptr, indices, + node_attributes=node_attributes, edge_attributes=edge_attributes, node_type_offset=node_type_offset, type_per_edge=type_per_edge, @@ -1655,5 +1753,7 @@ def test_csc_sampling_graph_to_device(): assert graph.node_type_offset.device.type == "cuda" assert graph.type_per_edge.device.type == "cuda" assert graph.csc_indptr.device.type == "cuda" + for key in graph.node_attributes: + assert graph.node_attributes[key].device.type == "cuda" for key in graph.edge_attributes: assert graph.edge_attributes[key].device.type == "cuda" diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py index bb95bd92f69e..a08ab53ad8b7 100644 --- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py +++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py @@ -993,7 +993,13 @@ def test_OnDiskDataset_Graph_heterogeneous(): metadata, ) = gbt.random_hetero_graph(1000, 10 * 1000, 3, 4) graph = gb.from_csc( - csc_indptr, indices, node_type_offset, type_per_edge, None, metadata + csc_indptr, + indices, + node_type_offset, + type_per_edge, + None, + None, + metadata, ) with tempfile.TemporaryDirectory() as test_dir: @@ -1618,8 +1624,14 @@ def test_OnDiskDataset_load_graph(): # Check the different original_edge_id option to load edge_attributes. dataset = gb.OnDiskDataset( - test_dir, include_original_edge_id=True + test_dir, + include_original_node_id=True, + include_original_edge_id=True, ).load() + assert ( + dataset.graph.node_attributes is not None + and gb.ORIGINAL_NODE_ID in dataset.graph.node_attributes + ) assert ( dataset.graph.edge_attributes is not None and gb.ORIGINAL_EDGE_ID in dataset.graph.edge_attributes @@ -1685,8 +1697,14 @@ def test_OnDiskDataset_load_graph(): # Test do not generate original_edge_id. 
dataset = gb.OnDiskDataset( - test_dir, include_original_edge_id=False + test_dir, + include_original_node_id=False, + include_original_edge_id=False, ).load() + assert ( + dataset.graph.node_attributes is None + or gb.ORIGINAL_NODE_ID not in dataset.graph.node_attributes + ) assert ( dataset.graph.edge_attributes is None or gb.ORIGINAL_EDGE_ID not in dataset.graph.edge_attributes @@ -1823,7 +1841,13 @@ def test_OnDiskDataset_all_nodes_set_hetero(): metadata, ) = gbt.random_hetero_graph(1000, 10 * 1000, 3, 4) graph = gb.from_csc( - csc_indptr, indices, node_type_offset, type_per_edge, None, metadata + csc_indptr, + indices, + node_type_offset, + type_per_edge, + None, + None, + metadata, ) with tempfile.TemporaryDirectory() as test_dir: From ddce1d42de016be040cd0f8a5e71f2a10148de82 Mon Sep 17 00:00:00 2001 From: RhettYing Date: Mon, 23 Oct 2023 08:46:41 +0000 Subject: [PATCH 03/30] [gb_distdgl] refine convert_dgl_partition_to_csc_sampling_graph to control orig_nids/eids storage --- Jenkinsfile | 1 - python/dgl/distributed/partition.py | 71 +++++++++++++--- tests/distributed/test_partition.py | 107 +++++++++++++++++++++---- tests/scripts/task_distributed_test.sh | 3 +- 4 files changed, 157 insertions(+), 25 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 2fe54cfe38f3..46a4943e2cb4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -580,7 +580,6 @@ pipeline { steps { unit_distributed_linux('pytorch', 'cpu') } - when { expression { false } } } } post { diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py index da0ab445690e..e93fb2ba41ae 100644 --- a/python/dgl/distributed/partition.py +++ b/python/dgl/distributed/partition.py @@ -1221,7 +1221,13 @@ def get_homogeneous(g, balance_ntypes): return orig_nids, orig_eids -def convert_dgl_partition_to_csc_sampling_graph(part_config): +def convert_dgl_partition_to_csc_sampling_graph( + part_config, + store_orig_nids=False, + store_orig_eids=False, + store_etypes=False, + store_metadata=False, +): """Convert partitions of dgl to CSCSamplingGraph of GraphBolt. This API converts `DGLGraph` partitions to `CSCSamplingGraph` which is @@ -1235,6 +1241,14 @@ def convert_dgl_partition_to_csc_sampling_graph(part_config): ---------- part_config : str The partition configuration JSON file. + store_orig_nids : bool, optional + Whether to store original node IDs in the new graph. + store_orig_eids : bool, optional + Whether to store original edge IDs in the new graph. + store_etypes : bool, optional + Whether to store edge types in the new graph. + store_metadata : bool, optional + Whether to store metadata in the new graph. """ # As only this function requires GraphBolt for now, let's import here. from .. import graphbolt @@ -1252,18 +1266,57 @@ def init_type_per_edge(graph, gpb): graph, _, _, gpb, _, _, _ = load_partition( part_config, part_id, load_feats=False ) - # Construct GraphMetadata. - _, _, ntypes, etypes = load_partition_book(part_config, part_id) - metadata = graphbolt.GraphMetadata(ntypes, etypes) + # [Rui] We can always treat partitioned graph as homogeneous graph. Then + # we don't need metadata at all. What's more, heterogeneous graph + # requires `node_type_offset` is set correctly and nodes are sorted + # according to their types. This is not guaranteed in current partitioned + # graph. + metadata = None + if store_metadata: + # Construct GraphMetadata. 
+ _, _, ntypes, etypes = load_partition_book(part_config, part_id) + etypes = { + graphbolt.etype_tuple_to_str(etype): v + for etype, v in etypes.items() + } + metadata = graphbolt.GraphMetadata(ntypes, etypes) # Obtain CSC indtpr and indices. indptr, indices, _ = graph.adj().csc() # Initalize type per edge. - type_per_edge = init_type_per_edge(graph, gpb) - type_per_edge = type_per_edge.to(RESERVED_FIELD_DTYPE[ETYPE]) - # Sanity check. - assert len(type_per_edge) == graph.num_edges() + type_per_edge = None + if store_etypes: + type_per_edge = init_type_per_edge(graph, gpb) + type_per_edge = type_per_edge.to(RESERVED_FIELD_DTYPE[ETYPE]) + # Sanity check. + assert len(type_per_edge) == graph.num_edges() + + # Original node IDs. + node_attributes = None + if store_orig_nids: + # Sanity check. + assert len(graph.ndata[NID]) == graph.num_nodes() + node_attributes = { + NID: graph.ndata[NID].to(RESERVED_FIELD_DTYPE[NID]) + } + + # Original edge IDs. + edge_attributes = None + if store_orig_eids: + # Sanity check. + assert len(graph.edata[EID]) == graph.num_edges() + edge_attributes = { + EID: graph.edata[EID].to(RESERVED_FIELD_DTYPE[EID]) + } + + # Construct CSCSamplingGraph csc_graph = graphbolt.from_csc( - indptr, indices, None, type_per_edge, metadata=metadata + indptr, + indices, + node_type_offset=None, + type_per_edge=type_per_edge, + node_attributes=node_attributes, + edge_attributes=edge_attributes, + metadata=metadata, ) orig_graph_path = os.path.join( os.path.dirname(part_config), diff --git a/tests/distributed/test_partition.py b/tests/distributed/test_partition.py index 4a3cc279ff8b..e9cb71420ada 100644 --- a/tests/distributed/test_partition.py +++ b/tests/distributed/test_partition.py @@ -4,6 +4,7 @@ import backend as F import dgl +import dgl.graphbolt as gb import numpy as np import pytest import torch as th @@ -679,8 +680,17 @@ def test_UnknownPartitionBook(): @pytest.mark.parametrize("part_method", ["metis", "random"]) @pytest.mark.parametrize("num_parts", [1, 4]) +@pytest.mark.parametrize("store_orig_nids", [True, False]) +@pytest.mark.parametrize("store_orig_eids", [True, False]) +@pytest.mark.parametrize("store_etypes", [True, False]) +@pytest.mark.parametrize("store_metadata", [True, False]) def test_convert_dgl_partition_to_csc_sampling_graph_homo( - part_method, num_parts + part_method, + num_parts, + store_orig_nids, + store_orig_eids, + store_etypes, + store_metadata, ): with tempfile.TemporaryDirectory() as test_dir: g = create_random_graph(1000) @@ -689,7 +699,13 @@ def test_convert_dgl_partition_to_csc_sampling_graph_homo( g, graph_name, num_parts, test_dir, part_method=part_method ) part_config = os.path.join(test_dir, f"{graph_name}.json") - convert_dgl_partition_to_csc_sampling_graph(part_config) + convert_dgl_partition_to_csc_sampling_graph( + part_config, + store_orig_nids, + store_orig_eids, + store_etypes, + store_metadata, + ) for part_id in range(num_parts): orig_g = dgl.load_graphs( os.path.join(test_dir, f"part{part_id}/graph.dgl") @@ -701,17 +717,51 @@ def test_convert_dgl_partition_to_csc_sampling_graph_homo( assert th.equal(orig_indptr, new_g.csc_indptr) assert th.equal(orig_indices, new_g.indices) assert new_g.node_type_offset is None - assert all(new_g.type_per_edge == 0) - for node_type, type_id in new_g.metadata.node_type_to_id.items(): - assert g.get_ntype_id(node_type) == type_id - for edge_type, type_id in new_g.metadata.edge_type_to_id.items(): - assert g.get_etype_id(edge_type) == type_id + if store_orig_nids: + assert th.equal( + 
orig_g.ndata[dgl.NID], new_g.node_attributes[dgl.NID] + ) + else: + assert new_g.node_attributes is None + if store_orig_eids: + assert th.equal( + orig_g.edata[dgl.EID], new_g.edge_attributes[dgl.EID] + ) + else: + assert new_g.edge_attributes is None + if store_etypes: + assert th.all(0 == new_g.type_per_edge) + else: + assert new_g.type_per_edge is None + if store_metadata: + for ( + node_type, + type_id, + ) in new_g.metadata.node_type_to_id.items(): + assert g.get_ntype_id(node_type) == type_id + for ( + edge_type, + type_id, + ) in new_g.metadata.edge_type_to_id.items(): + edge_type = gb.etype_str_to_tuple(edge_type) + assert g.get_etype_id(edge_type) == type_id + else: + assert new_g.metadata is None @pytest.mark.parametrize("part_method", ["metis", "random"]) @pytest.mark.parametrize("num_parts", [1, 4]) +@pytest.mark.parametrize("store_orig_nids", [True, False]) +@pytest.mark.parametrize("store_orig_eids", [True, False]) +@pytest.mark.parametrize("store_etypes", [True, False]) +@pytest.mark.parametrize("store_metadata", [True, False]) def test_convert_dgl_partition_to_csc_sampling_graph_hetero( - part_method, num_parts + part_method, + num_parts, + store_orig_nids, + store_orig_eids, + store_etypes, + store_metadata, ): with tempfile.TemporaryDirectory() as test_dir: g = create_random_hetero() @@ -720,7 +770,13 @@ def test_convert_dgl_partition_to_csc_sampling_graph_hetero( g, graph_name, num_parts, test_dir, part_method=part_method ) part_config = os.path.join(test_dir, f"{graph_name}.json") - convert_dgl_partition_to_csc_sampling_graph(part_config) + convert_dgl_partition_to_csc_sampling_graph( + part_config, + store_orig_nids, + store_orig_eids, + store_etypes, + store_metadata, + ) for part_id in range(num_parts): orig_g = dgl.load_graphs( os.path.join(test_dir, f"part{part_id}/graph.dgl") @@ -731,12 +787,35 @@ def test_convert_dgl_partition_to_csc_sampling_graph_hetero( orig_indptr, orig_indices, _ = orig_g.adj().csc() assert th.equal(orig_indptr, new_g.csc_indptr) assert th.equal(orig_indices, new_g.indices) - for node_type, type_id in new_g.metadata.node_type_to_id.items(): - assert g.get_ntype_id(node_type) == type_id - for edge_type, type_id in new_g.metadata.edge_type_to_id.items(): - assert g.get_etype_id(edge_type) == type_id + if store_orig_nids: + assert th.equal( + orig_g.ndata[dgl.NID], new_g.node_attributes[dgl.NID] + ) + else: + assert new_g.node_attributes is None + if store_orig_eids: + assert th.equal( + orig_g.edata[dgl.EID], new_g.edge_attributes[dgl.EID] + ) + else: + assert new_g.edge_attributes is None + if store_etypes: + assert th.equal(orig_g.edata[dgl.ETYPE], new_g.type_per_edge) + else: + assert new_g.type_per_edge is None + if store_metadata: + for ( + node_type, + type_id, + ) in new_g.metadata.node_type_to_id.items(): + assert g.get_ntype_id(node_type) == type_id + for ( + edge_type, + type_id, + ) in new_g.metadata.edge_type_to_id.items(): + edge_type = gb.etype_str_to_tuple(edge_type) + assert g.get_etype_id(edge_type) == type_id assert new_g.node_type_offset is None - assert th.equal(orig_g.edata[dgl.ETYPE], new_g.type_per_edge) def test_not_sorted_node_edge_map(): diff --git a/tests/scripts/task_distributed_test.sh b/tests/scripts/task_distributed_test.sh index 62ea349d474a..dc70111e8c0c 100644 --- a/tests/scripts/task_distributed_test.sh +++ b/tests/scripts/task_distributed_test.sh @@ -34,6 +34,7 @@ export PYTHONUNBUFFERED=1 export OMP_NUM_THREADS=1 export DMLC_LOG_DEBUG=1 -python3 -m pytest -v --capture=tee-sys 
--junitxml=pytest_distributed.xml --durations=100 tests/distributed/*.py || fail "distributed" +python3 -m pytest -v --capture=tee-sys --junitxml=pytest_distributed.xml --durations=100 tests/distributed/test_partition.py || fail "distributed" +#python3 -m pytest -v --capture=tee-sys --junitxml=pytest_distributed.xml --durations=100 tests/distributed/*.py || fail "distributed" PYTHONPATH=tools:tools/distpartitioning:$PYTHONPATH python3 -m pytest -v --capture=tee-sys --junitxml=pytest_tools.xml --durations=100 tests/tools/*.py || fail "tools" From 59825dcf760b3ed9f83215a781545be3718dfd3d Mon Sep 17 00:00:00 2001 From: RhettYing Date: Mon, 23 Oct 2023 09:01:10 +0000 Subject: [PATCH 04/30] [gb_distdgl] TODO: control dtype more rigidly when construct CSCSamplingGraph --- dt.py | 4 ++++ python/dgl/distributed/partition.py | 9 ++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/dt.py b/dt.py index 42bdcad1c6a3..c908da2cf152 100644 --- a/dt.py +++ b/dt.py @@ -18,6 +18,10 @@ # Initialize distributed environment dgl.distributed.initialize(args.ip_config) th.distributed.init_process_group(backend=args.backend) +# [TODO][P0] Convert dgl partitioned graphs to graphbolt.CSCSamplingGraph. +# done@2023-10-23 16:49:00 +# see details in: https://github.com/Rhett-Ying/dgl/commits/gb_distdgl +# ddce1d42de016be040cd0f8a5e71f2a10148de82 # [TODO][P0] Load `CSCSamplingGraph` into `DistGraph`. ## NID/EIDs are required. g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config) diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py index e93fb2ba41ae..066e86bf587e 100644 --- a/python/dgl/distributed/partition.py +++ b/python/dgl/distributed/partition.py @@ -6,6 +6,7 @@ import time import numpy as np +import torch from .. import backend as F from ..base import DGLError, EID, ETYPE, NID, NTYPE @@ -1271,15 +1272,15 @@ def init_type_per_edge(graph, gpb): # requires `node_type_offset` is set correctly and nodes are sorted # according to their types. This is not guaranteed in current partitioned # graph. + _, _, ntypes, etypes = load_partition_book(part_config, part_id) metadata = None if store_metadata: # Construct GraphMetadata. - _, _, ntypes, etypes = load_partition_book(part_config, part_id) - etypes = { + c_etypes = { graphbolt.etype_tuple_to_str(etype): v for etype, v in etypes.items() } - metadata = graphbolt.GraphMetadata(ntypes, etypes) + metadata = graphbolt.GraphMetadata(ntypes, c_etypes) # Obtain CSC indtpr and indices. indptr, indices, _ = graph.adj().csc() # Initalize type per edge. @@ -1287,6 +1288,8 @@ def init_type_per_edge(graph, gpb): if store_etypes: type_per_edge = init_type_per_edge(graph, gpb) type_per_edge = type_per_edge.to(RESERVED_FIELD_DTYPE[ETYPE]) + if len(etypes) < 128: + type_per_edge = type_per_edge.to(torch.int8) # Sanity check. 
assert len(type_per_edge) == graph.num_edges() From 174dc37acfef6605760a1e92d6fd9107322bf14f Mon Sep 17 00:00:00 2001 From: RhettYing Date: Mon, 23 Oct 2023 09:04:05 +0000 Subject: [PATCH 05/30] [gb_distdgl] add graph file size of ogbn-mag for comparision btw DGL and GB --- dt.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/dt.py b/dt.py index c908da2cf152..2ce284fe0333 100644 --- a/dt.py +++ b/dt.py @@ -22,6 +22,20 @@ # done@2023-10-23 16:49:00 # see details in: https://github.com/Rhett-Ying/dgl/commits/gb_distdgl # ddce1d42de016be040cd0f8a5e71f2a10148de82 +''' +In [1]: part_config='/home/ubuntu/workspace/dgl_2/data/ogbn-mag.json' +In [3]: dgl.distributed.convert_dgl_partition_to_csc_sampling_graph(part_config, store_orig_nids=True) +In [7]: !ls data/part0 -lh +total 1.1G +-rw-rw-r-- 1 ubuntu ubuntu 207M Oct 23 08:44 csc_sampling_graph.tar +-rw-rw-r-- 1 ubuntu ubuntu 694M Oct 23 02:47 graph.dgl + +In [8]: !ls data/part1 -lh +total 1.1G +-rw-rw-r-- 1 ubuntu ubuntu 202M Oct 23 08:44 csc_sampling_graph.tar +-rw-rw-r-- 1 ubuntu ubuntu 678M Oct 23 02:47 graph.dgl +''' + # [TODO][P0] Load `CSCSamplingGraph` into `DistGraph`. ## NID/EIDs are required. g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config) From 4b767b377187ba3099938b99a23618e3ac2f3bd3 Mon Sep 17 00:00:00 2001 From: RhettYing Date: Tue, 24 Oct 2023 01:51:03 +0000 Subject: [PATCH 06/30] [gb_distdgl] Add use_graphbolt to control save CSCSamplingGraph when partition, load_partition, tests are added --- .../rgcn/experimental/partition_graph.py | 6 + python/dgl/distributed/partition.py | 90 ++++++++++--- tests/distributed/test_partition.py | 122 ++++++++++++++++++ 3 files changed, 202 insertions(+), 16 deletions(-) diff --git a/examples/pytorch/rgcn/experimental/partition_graph.py b/examples/pytorch/rgcn/experimental/partition_graph.py index cc364ee94e12..af430261609f 100644 --- a/examples/pytorch/rgcn/experimental/partition_graph.py +++ b/examples/pytorch/rgcn/experimental/partition_graph.py @@ -68,6 +68,11 @@ def load_ogb(dataset): argparser.add_argument( "--part_method", type=str, default="metis", help="the partition method" ) + argparser.add_argument( + "--graphbolt", + action="store_true", + help="convert DGL to GraphBolt partitions.", + ) argparser.add_argument( "--balance_train", action="store_true", @@ -127,4 +132,5 @@ def load_ogb(dataset): balance_ntypes=balance_ntypes, balance_edges=args.balance_edges, num_trainers_per_machine=args.num_trainers_per_machine, + use_graphbolt=args.graphbolt, ) diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py index 066e86bf587e..0f3f04b1f4f7 100644 --- a/python/dgl/distributed/partition.py +++ b/python/dgl/distributed/partition.py @@ -5,13 +5,16 @@ import os import time +from copy import deepcopy + import numpy as np import torch -from .. import backend as F +from .. import backend as F, graphbolt as gb from ..base import DGLError, EID, ETYPE, NID, NTYPE from ..convert import to_homogeneous from ..data.utils import load_graphs, load_tensors, save_graphs, save_tensors +from ..heterograph import DGLGraph from ..partition import ( get_peak_mem, metis_partition_assignment, @@ -141,7 +144,7 @@ def _get_part_ranges(id_ranges): return res -def load_partition(part_config, part_id, load_feats=True): +def load_partition(part_config, part_id, load_feats=True, use_graphbolt=False): """Load data of a partition from the data path. 
A partition data includes a graph structure of the partition, a dict of node tensors, @@ -163,6 +166,8 @@ def load_partition(part_config, part_id, load_feats=True): load_feats : bool, optional Whether to load node/edge feats. If False, the returned node/edge feature dictionaries will be empty. Default: True. + use_graphbolt : bool, optional + Whether to load the partition graph structure in the GraphBolt format. Returns ------- @@ -190,10 +195,13 @@ def load_partition(part_config, part_id, load_feats=True): "part-{}".format(part_id) in part_metadata ), "part-{} does not exist".format(part_id) part_files = part_metadata["part-{}".format(part_id)] + part_graph_field = "part_graph" + if use_graphbolt: + part_graph_field = "gb_part_graph" assert ( - "part_graph" in part_files - ), "the partition does not contain graph structure." - partition_path = relative_to_config(part_files["part_graph"]) + part_graph_field in part_files + ), f"the partition does not contain graph structure: {part_graph_field}." + partition_path = relative_to_config(part_files[part_graph_field]) logging.info( "Start to load partition from %s which is " "%d bytes. It may take non-trivial " @@ -201,20 +209,35 @@ def load_partition(part_config, part_id, load_feats=True): partition_path, os.path.getsize(partition_path), ) - graph = load_graphs(partition_path)[0][0] - logging.info("Finished loading partition.") - - assert ( - NID in graph.ndata - ), "the partition graph should contain node mapping to global node ID" - assert ( - EID in graph.edata - ), "the partition graph should contain edge mapping to global edge ID" + graph = None + if partition_path.endswith(".tar"): + assert use_graphbolt, ( + "The partition is stored in the GraphBolt format. " + "Please set use_graphbolt=True to load it." + ) + graph = gb.load_csc_sampling_graph(partition_path) + assert isinstance(graph, gb.CSCSamplingGraph) + else: + assert not use_graphbolt, ( + "The partition is stored in the DGL format. " + "Please set use_graphbolt=False to load it." + ) + graph = load_graphs(partition_path)[0][0] + assert isinstance(graph, DGLGraph) + logging.info(f"Finished loading partition from {partition_path}") + + if isinstance(graph, DGLGraph): + assert ( + NID in graph.ndata + ), "the partition graph should contain node mapping to global node ID" + assert ( + EID in graph.edata + ), "the partition graph should contain edge mapping to global edge ID" gpb, graph_name, ntypes, etypes = load_partition_book(part_config, part_id) ntypes_list = list(ntypes.keys()) etypes_list = list(etypes.keys()) - if "DGL_DIST_DEBUG" in os.environ: + if "DGL_DIST_DEBUG" in os.environ and isinstance(graph, DGLGraph): for ntype in ntypes: ntype_id = ntypes[ntype] # graph.ndata[NID] are global homogeneous node IDs. @@ -547,6 +570,11 @@ def partition_graph( num_trainers_per_machine=1, objtype="cut", graph_formats=None, + use_graphbolt=False, + gb_store_orig_nids=False, + gb_store_orig_eids=False, + gb_store_etypes=False, + gb_store_metadata=False, ): """Partition a graph for distributed training and store the partitions on files. @@ -720,6 +748,16 @@ def partition_graph( ``csc`` and ``csr``. If not specified, save one format only according to what format is available. If multiple formats are available, selection priority from high to low is ``coo``, ``csc``, ``csr``. + use_graphbolt : bool + Whether to convert the partitioned graph to GraphBolt format. + gb_store_orig_nids : bool + Whether to store the original node IDs in the partitioned graph. 
+ gb_store_orig_eids : bool + Whether to store the original edge IDs in the partitioned graph. + gb_store_etypes : bool + Whether to store the edge types in the partitioned graph. + gb_store_metadata : bool + Whether to store the metadata of the partitioned graph. Returns ------- @@ -1207,7 +1245,8 @@ def get_homogeneous(g, balance_ntypes): ) ) - _dump_part_config(f"{out_path}/{graph_name}.json", part_metadata) + part_config = os.path.join(out_path, graph_name + ".json") + _dump_part_config(part_config, part_metadata) num_cuts = sim_g.num_edges() - tot_num_inner_edges if num_parts == 1: @@ -1218,6 +1257,16 @@ def get_homogeneous(g, balance_ntypes): ) ) + if use_graphbolt: + convert_dgl_partition_to_csc_sampling_graph( + part_config, + store_orig_nids=gb_store_orig_nids, + store_orig_eids=gb_store_orig_eids, + store_etypes=gb_store_etypes, + store_metadata=gb_store_metadata, + ) + print("Converted to GraphBolt format.") + if return_mapping: return orig_nids, orig_eids @@ -1255,6 +1304,7 @@ def convert_dgl_partition_to_csc_sampling_graph( from .. import graphbolt part_meta = _load_part_config(part_config) + new_part_meta = deepcopy(part_meta) num_parts = part_meta["num_parts"] # Utility functions. @@ -1329,3 +1379,11 @@ def init_type_per_edge(graph, gpb): os.path.dirname(orig_graph_path), "csc_sampling_graph.tar" ) graphbolt.save_csc_sampling_graph(csc_graph, csc_graph_path) + + # Update graph path. + new_part_meta[f"part-{part_id}"]["gb_part_graph"] = os.path.relpath( + csc_graph_path, os.path.dirname(part_config) + ) + + # Update partition config. + _dump_part_config(part_config, new_part_meta) diff --git a/tests/distributed/test_partition.py b/tests/distributed/test_partition.py index e9cb71420ada..be540362866c 100644 --- a/tests/distributed/test_partition.py +++ b/tests/distributed/test_partition.py @@ -926,3 +926,125 @@ def test_not_sorted_node_edge_map(): gpb, _, _, _ = load_partition_book(part_config, 1) assert gpb.local_ntype_offset == [0, 300, 700] assert gpb.local_etype_offset == [0, 500, 1100, 1800, 2600] + + +@pytest.mark.parametrize("part_method", ["metis", "random"]) +@pytest.mark.parametrize("num_parts", [1]) +@pytest.mark.parametrize("num_trainers_per_machine", [1, 4]) +@pytest.mark.parametrize("load_feats", [True, False]) +def test_partition_homo_graphbolt( + part_method, + num_parts, + num_trainers_per_machine, + load_feats, +): + os.environ["DGL_DIST_DEBUG"] = "1" + if part_method == "random" and num_parts > 1: + num_trainers_per_machine = 1 + + g = create_random_graph(1000) + g.ndata["labels"] = F.arange(0, g.num_nodes()) + g.ndata["feats"] = F.tensor(np.random.randn(g.num_nodes(), 10), F.float32) + g.edata["feats"] = F.tensor(np.random.randn(g.num_edges(), 10), F.float32) + g.update_all(fn.copy_u("feats", "msg"), fn.sum("msg", "h")) + g.update_all(fn.copy_e("feats", "msg"), fn.sum("msg", "eh")) + num_hops = 2 + + with tempfile.TemporaryDirectory() as test_dir: + orig_nids, orig_eids = partition_graph( + g, + "test", + num_parts, + test_dir, + num_hops=num_hops, + part_method=part_method, + return_mapping=True, + num_trainers_per_machine=num_trainers_per_machine, + use_graphbolt=True, + ) + part_config = os.path.join(test_dir, "test.json") + for i in range(num_parts): + part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition( + part_config, i, load_feats=load_feats, use_graphbolt=True + ) + assert isinstance(part_g, gb.CSCSamplingGraph) + assert gpb.num_partitions() == num_parts + gpb_meta = gpb.metadata() + assert len(gpb_meta) == num_parts + assert 
len(gpb.partid2nids(i)) == gpb_meta[i]["num_nodes"] + assert len(gpb.partid2eids(i)) == gpb_meta[i]["num_edges"] + assert len(gpb.partid2nids(i)) == part_g.total_num_nodes + assert len(gpb.partid2eids(i)) == part_g.total_num_edges + if load_feats: + assert "_N/labels" in node_feats + assert "_N/feats" in node_feats + assert "_N:_E:_N/feats" in edge_feats + else: + assert node_feats == {} + assert edge_feats == {} + + reset_envs() + + +@pytest.mark.parametrize("part_method", ["metis", "random"]) +@pytest.mark.parametrize("num_parts", [1]) +@pytest.mark.parametrize("num_trainers_per_machine", [1, 4]) +@pytest.mark.parametrize("load_feats", [True, False]) +def test_partition_hetero_graphbolt( + part_method, + num_parts, + num_trainers_per_machine, + load_feats, +): + os.environ["DGL_DIST_DEBUG"] = "1" + if part_method == "random" and num_parts > 1: + num_trainers_per_machine = 1 + + hg = create_random_hetero() + test_ntype = "n1" + test_etype = ("n1", "r1", "n2") + hg.nodes[test_ntype].data["labels"] = F.arange(0, hg.num_nodes(test_ntype)) + hg.nodes[test_ntype].data["feats"] = F.tensor( + np.random.randn(hg.num_nodes(test_ntype), 10), F.float32 + ) + hg.edges[test_etype].data["feats"] = F.tensor( + np.random.randn(hg.num_edges(test_etype), 10), F.float32 + ) + hg.edges[test_etype].data["labels"] = F.arange(0, hg.num_edges(test_etype)) + + num_hops = 2 + + with tempfile.TemporaryDirectory() as test_dir: + orig_nids, orig_eids = partition_graph( + hg, + "test", + num_parts, + test_dir, + num_hops=num_hops, + part_method=part_method, + return_mapping=True, + num_trainers_per_machine=num_trainers_per_machine, + use_graphbolt=True, + ) + part_config = os.path.join(test_dir, "test.json") + for i in range(num_parts): + part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition( + part_config, i, load_feats=load_feats, use_graphbolt=True + ) + assert isinstance(part_g, gb.CSCSamplingGraph) + assert gpb.num_partitions() == num_parts + gpb_meta = gpb.metadata() + assert len(gpb_meta) == num_parts + assert len(gpb.partid2nids(i)) == gpb_meta[i]["num_nodes"] + assert len(gpb.partid2eids(i)) == gpb_meta[i]["num_edges"] + assert len(gpb.partid2nids(i)) == part_g.total_num_nodes + assert len(gpb.partid2eids(i)) == part_g.total_num_edges + if load_feats: + assert "n1/labels" in node_feats + assert "n1/feats" in node_feats + assert "n1:r1:n2/feats" in edge_feats + else: + assert node_feats == {} + assert edge_feats == {} + + reset_envs() From d76004f3d9a0fad62b3cd7e94e78ed514842d409 Mon Sep 17 00:00:00 2001 From: RhettYing Date: Tue, 24 Oct 2023 03:32:08 +0000 Subject: [PATCH 07/30] [gb_distdgl] enable copy to sharedMem and back --- python/dgl/distributed/dist_graph.py | 103 +++++++++++++++---------- python/dgl/distributed/server_state.py | 2 +- 2 files changed, 65 insertions(+), 40 deletions(-) diff --git a/python/dgl/distributed/dist_graph.py b/python/dgl/distributed/dist_graph.py index 192293a80676..967d9505a792 100644 --- a/python/dgl/distributed/dist_graph.py +++ b/python/dgl/distributed/dist_graph.py @@ -8,7 +8,7 @@ import numpy as np -from .. import backend as F, heterograph_index +from .. import backend as F, graphbolt as gb, heterograph_index from .._ffi.ndarray import empty_shared_mem from ..base import ALL, DGLError, EID, ETYPE, is_all, NID from ..convert import graph as dgl_graph, heterograph as dgl_heterograph @@ -60,18 +60,21 @@ class InitGraphRequest(rpc.Request): with shared memory. 
""" - def __init__(self, graph_name): + def __init__(self, graph_name, use_graphbolt): self._graph_name = graph_name + self._use_graphbolt = use_graphbolt def __getstate__(self): - return self._graph_name + return (self._graph_name, self._use_graphbolt) def __setstate__(self, state): - self._graph_name = state + (self._graph_name, self._use_graphbolt) = state def process_request(self, server_state): if server_state.graph is None: - server_state.graph = _get_graph_from_shared_mem(self._graph_name) + server_state.graph = _get_graph_from_shared_mem( + self._graph_name, self._use_graphbolt + ) return InitGraphResponse(self._graph_name) @@ -88,7 +91,10 @@ def __setstate__(self, state): self._graph_name = state -def _copy_graph_to_shared_mem(g, graph_name, graph_format): +def _copy_graph_to_shared_mem(g, graph_name, graph_format, use_graphbolt): + if use_graphbolt: + new_g = g.copy_to_shared_memory(graph_name) + return new_g new_g = g.shared_memory(graph_name, formats=graph_format) # We should share the node/edge data to the client explicitly instead of putting them # in the KVStore because some of the node/edge data may be duplicated. @@ -151,13 +157,17 @@ def _exist_shared_mem_array(graph_name, name): return exist_shared_mem_array(_get_edata_path(graph_name, name)) -def _get_graph_from_shared_mem(graph_name): +def _get_graph_from_shared_mem(graph_name, use_graphbolt): """Get the graph from the DistGraph server. The DistGraph server puts the graph structure of the local partition in the shared memory. The client can access the graph structure and some metadata on nodes and edges directly through shared memory to reduce the overhead of data access. """ + if use_graphbolt: + g = gb.load_from_shared_memory(graph_name, None) + return g + g, ntypes, etypes = heterograph_index.create_heterograph_from_shared_memory( graph_name ) @@ -330,6 +340,8 @@ class DistGraphServer(KVServer): Disable shared memory. graph_format : str or list of str The graph formats. + use_graphbolt : bool + Whether to use GraphBolt format. """ def __init__( @@ -341,6 +353,7 @@ def __init__( part_config, disable_shared_mem=False, graph_format=("csc", "coo"), + use_graphbolt=False, ): super(DistGraphServer, self).__init__( server_id=server_id, @@ -367,32 +380,39 @@ def __init__( graph_name, ntypes, etypes, - ) = load_partition(part_config, self.part_id, load_feats=False) - print("load " + graph_name) - # formatting dtype - # TODO(Rui) Formatting forcely is not a perfect solution. - # We'd better store all dtypes when mapping to shared memory - # and map back with original dtypes. - for k, dtype in RESERVED_FIELD_DTYPE.items(): - if k in self.client_g.ndata: - self.client_g.ndata[k] = F.astype( - self.client_g.ndata[k], dtype - ) - if k in self.client_g.edata: - self.client_g.edata[k] = F.astype( - self.client_g.edata[k], dtype - ) - # Create the graph formats specified the users. - print( - "Start to create specified graph formats which may take " - "non-trivial time." + ) = load_partition( + part_config, + self.part_id, + load_feats=False, + use_graphbolt=use_graphbolt, ) - self.client_g = self.client_g.formats(graph_format) - self.client_g.create_formats_() - print("Finished creating specified graph formats.") + print(f"Loaded {graph_name} with use_graphbolt[{use_graphbolt}]") + + if not use_graphbolt: + # formatting dtype + # TODO(Rui) Formatting forcely is not a perfect solution. + # We'd better store all dtypes when mapping to shared memory + # and map back with original dtypes. 
+ for k, dtype in RESERVED_FIELD_DTYPE.items(): + if k in self.client_g.ndata: + self.client_g.ndata[k] = F.astype( + self.client_g.ndata[k], dtype + ) + if k in self.client_g.edata: + self.client_g.edata[k] = F.astype( + self.client_g.edata[k], dtype + ) + # Create the graph formats specified the users. + print( + "Start to create specified graph formats which may take " + "non-trivial time." + ) + self.client_g = self.client_g.formats(graph_format) + self.client_g.create_formats_() + print("Finished creating specified graph formats.") if not disable_shared_mem: self.client_g = _copy_graph_to_shared_mem( - self.client_g, graph_name, graph_format + self.client_g, graph_name, graph_format, use_graphbolt ) if not disable_shared_mem: @@ -542,8 +562,11 @@ class DistGraph: manually setting up servers and trainers. The setup is not fully tested yet. """ - def __init__(self, graph_name, gpb=None, part_config=None): + def __init__( + self, graph_name, gpb=None, part_config=None, use_graphbolt=False + ): self.graph_name = graph_name + self._use_graphbolt = use_graphbolt if os.environ.get("DGL_DIST_MODE", "standalone") == "standalone": assert ( part_config is not None @@ -554,7 +577,7 @@ def __init__(self, graph_name, gpb=None, part_config=None): ), "Distributed module is not initialized. Please call dgl.distributed.initialize." # Load graph partition data. g, node_feats, edge_feats, self._gpb, _, _, _ = load_partition( - part_config, 0 + part_config, 0, use_graphbolt=use_graphbolt ) assert ( self._gpb.num_partitions() == 1 @@ -582,10 +605,12 @@ def __init__(self, graph_name, gpb=None, part_config=None): self._client.map_shared_data(self._gpb) rpc.set_num_client(1) else: - self._init(gpb) + self._init(gpb, use_graphbolt) # Tell the backup servers to load the graph structure from shared memory. for server_id in range(self._client.num_servers): - rpc.send_request(server_id, InitGraphRequest(graph_name)) + rpc.send_request( + server_id, InitGraphRequest(graph_name, use_graphbolt) + ) for server_id in range(self._client.num_servers): rpc.recv_response() self._client.barrier() @@ -605,12 +630,12 @@ def __init__(self, graph_name, gpb=None, part_config=None): etype: i for i, etype in enumerate(self.canonical_etypes) } - def _init(self, gpb): + def _init(self, gpb, use_graphbolt): self._client = get_kvstore() assert ( self._client is not None ), "Distributed module is not initialized. Please call dgl.distributed.initialize." 
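        # The graph structure itself never travels over RPC: the server has
        # already pinned the partition into shared memory, so the client only
        # maps it back in whichever format (DGL or GraphBolt) the server stored.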
- self._g = _get_graph_from_shared_mem(self.graph_name) + self._g = _get_graph_from_shared_mem(self.graph_name, use_graphbolt) self._gpb = get_shared_mem_partition_book(self.graph_name) if self._gpb is None: self._gpb = gpb @@ -667,11 +692,11 @@ def _init_edata_store(self): self._edata_store[etype] = data def __getstate__(self): - return self.graph_name, self._gpb + return self.graph_name, self._gpb, self._use_graphbolt def __setstate__(self, state): - self.graph_name, gpb = state - self._init(gpb) + self.graph_name, gpb, self._use_graphbolt = state + self._init(gpb, self._use_graphbolt) self._init_ndata_store() self._init_edata_store() diff --git a/python/dgl/distributed/server_state.py b/python/dgl/distributed/server_state.py index 0eac8d40c670..3b2dde3e2032 100644 --- a/python/dgl/distributed/server_state.py +++ b/python/dgl/distributed/server_state.py @@ -30,7 +30,7 @@ class ServerState: ---------- kv_store : KVServer reference for KVServer - graph : DGLGraph + graph : DGLGraph or CSCSamplingGraph Graph structure of one partition total_num_nodes : int Total number of nodes From 222dd2bd51084cc4f242148b0a7e6e5d91e0ae80 Mon Sep 17 00:00:00 2001 From: RhettYing Date: Tue, 24 Oct 2023 06:11:12 +0000 Subject: [PATCH 08/30] [gb_distdgl] successfully load graph in server and client from shared mem --- examples/pytorch/rgcn/experimental/cmd.sh | 10 + examples/pytorch/rgcn/experimental/gb_demo.py | 932 ++++++++++++++++++ .../pytorch/rgcn/experimental/ip_config.txt | 2 + python/dgl/distributed/dist_context.py | 4 + 4 files changed, 948 insertions(+) create mode 100644 examples/pytorch/rgcn/experimental/cmd.sh create mode 100644 examples/pytorch/rgcn/experimental/gb_demo.py create mode 100644 examples/pytorch/rgcn/experimental/ip_config.txt diff --git a/examples/pytorch/rgcn/experimental/cmd.sh b/examples/pytorch/rgcn/experimental/cmd.sh new file mode 100644 index 000000000000..88212bc794d7 --- /dev/null +++ b/examples/pytorch/rgcn/experimental/cmd.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \ + --workspace /home/ubuntu/workspace/dgl_2/examples/pytorch/rgcn/experimental/ \ + --num_trainers 4 \ + --num_servers 2 \ + --num_samplers 0 \ + --part_config /home/ubuntu/workspace/dgl_2/data/ogbn-mag.json \ + --ip_config /home/ubuntu/workspace/ip_config.txt \ + "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 gb_demo.py --graph-name ogbn-mag --dataset ogbn-mag --ip-config /home/ubuntu/workspace/ip_config.txt --fanout='25,10' --batch-size 1024 --n-hidden 64 --lr 0.01 --eval-batch-size 1024 --graphbolt" diff --git a/examples/pytorch/rgcn/experimental/gb_demo.py b/examples/pytorch/rgcn/experimental/gb_demo.py new file mode 100644 index 000000000000..06d6d9c460d2 --- /dev/null +++ b/examples/pytorch/rgcn/experimental/gb_demo.py @@ -0,0 +1,932 @@ +""" +Modeling Relational Data with Graph Convolutional Networks +Paper: https://arxiv.org/abs/1703.06103 +Code: https://github.com/tkipf/relational-gcn +Difference compared to tkipf/relation-gcn +* l2norm applied to all weights +* remove nodes that won't be touched +""" +import argparse +import gc, os +import itertools +import time + +import numpy as np + +os.environ["DGLBACKEND"] = "pytorch" + +from functools import partial + +import dgl +import dgl.graphbolt as gb +import torch as th +import torch.multiprocessing as mp +import torch.nn as nn +import torch.nn.functional as F + +import tqdm +from dgl import DGLGraph, 
nn as dglnn +from dgl.distributed import DistDataLoader + +from ogb.nodeproppred import DglNodePropPredDataset +from torch.multiprocessing import Queue +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data import DataLoader + + +class RelGraphConvLayer(nn.Module): + r"""Relational graph convolution layer. + Parameters + ---------- + in_feat : int + Input feature size. + out_feat : int + Output feature size. + rel_names : list[str] + Relation names. + num_bases : int, optional + Number of bases. If is none, use number of relations. Default: None. + weight : bool, optional + True if a linear layer is applied after message passing. Default: True + bias : bool, optional + True if bias is added. Default: True + activation : callable, optional + Activation function. Default: None + self_loop : bool, optional + True to include self loop message. Default: False + dropout : float, optional + Dropout rate. Default: 0.0 + """ + + def __init__( + self, + in_feat, + out_feat, + rel_names, + num_bases, + *, + weight=True, + bias=True, + activation=None, + self_loop=False, + dropout=0.0 + ): + super(RelGraphConvLayer, self).__init__() + self.in_feat = in_feat + self.out_feat = out_feat + self.rel_names = rel_names + self.num_bases = num_bases + self.bias = bias + self.activation = activation + self.self_loop = self_loop + + self.conv = dglnn.HeteroGraphConv( + { + rel: dglnn.GraphConv( + in_feat, out_feat, norm="right", weight=False, bias=False + ) + for rel in rel_names + } + ) + + self.use_weight = weight + self.use_basis = num_bases < len(self.rel_names) and weight + if self.use_weight: + if self.use_basis: + self.basis = dglnn.WeightBasis( + (in_feat, out_feat), num_bases, len(self.rel_names) + ) + else: + self.weight = nn.Parameter( + th.Tensor(len(self.rel_names), in_feat, out_feat) + ) + nn.init.xavier_uniform_( + self.weight, gain=nn.init.calculate_gain("relu") + ) + + # bias + if bias: + self.h_bias = nn.Parameter(th.Tensor(out_feat)) + nn.init.zeros_(self.h_bias) + + # weight for self loop + if self.self_loop: + self.loop_weight = nn.Parameter(th.Tensor(in_feat, out_feat)) + nn.init.xavier_uniform_( + self.loop_weight, gain=nn.init.calculate_gain("relu") + ) + + self.dropout = nn.Dropout(dropout) + + def forward(self, g, inputs): + """Forward computation + Parameters + ---------- + g : DGLGraph + Input graph. + inputs : dict[str, torch.Tensor] + Node feature for each node type. + Returns + ------- + dict[str, torch.Tensor] + New node features for each node type. + """ + g = g.local_var() + if self.use_weight: + weight = self.basis() if self.use_basis else self.weight + wdict = { + self.rel_names[i]: {"weight": w.squeeze(0)} + for i, w in enumerate(th.split(weight, 1, dim=0)) + } + else: + wdict = {} + + if g.is_block: + inputs_src = inputs + inputs_dst = { + k: v[: g.number_of_dst_nodes(k)] for k, v in inputs.items() + } + else: + inputs_src = inputs_dst = inputs + + hs = self.conv(g, inputs, mod_kwargs=wdict) + + def _apply(ntype, h): + if self.self_loop: + h = h + th.matmul(inputs_dst[ntype], self.loop_weight) + if self.bias: + h = h + self.h_bias + if self.activation: + h = self.activation(h) + return self.dropout(h) + + return {ntype: _apply(ntype, h) for ntype, h in hs.items()} + + +class EntityClassify(nn.Module): + """Entity classification class for RGCN + Parameters + ---------- + device : int + Device to run the layer. + num_nodes : int + Number of nodes. + h_dim : int + Hidden dim size. + out_dim : int + Output dim size. 
+ rel_names : list of str + A list of relation names. + num_bases : int + Number of bases. If is none, use number of relations. + num_hidden_layers : int + Number of hidden RelGraphConv Layer + dropout : float + Dropout + use_self_loop : bool + Use self loop if True, default False. + """ + + def __init__( + self, + device, + h_dim, + out_dim, + rel_names, + num_bases=None, + num_hidden_layers=1, + dropout=0, + use_self_loop=False, + layer_norm=False, + ): + super(EntityClassify, self).__init__() + self.device = device + self.h_dim = h_dim + self.out_dim = out_dim + self.num_bases = None if num_bases < 0 else num_bases + self.num_hidden_layers = num_hidden_layers + self.dropout = dropout + self.use_self_loop = use_self_loop + self.layer_norm = layer_norm + + self.layers = nn.ModuleList() + # i2h + self.layers.append( + RelGraphConvLayer( + self.h_dim, + self.h_dim, + rel_names, + self.num_bases, + activation=F.relu, + self_loop=self.use_self_loop, + dropout=self.dropout, + ) + ) + # h2h + for idx in range(self.num_hidden_layers): + self.layers.append( + RelGraphConvLayer( + self.h_dim, + self.h_dim, + rel_names, + self.num_bases, + activation=F.relu, + self_loop=self.use_self_loop, + dropout=self.dropout, + ) + ) + # h2o + self.layers.append( + RelGraphConvLayer( + self.h_dim, + self.out_dim, + rel_names, + self.num_bases, + activation=None, + self_loop=self.use_self_loop, + ) + ) + + def forward(self, blocks, feats, norm=None): + if blocks is None: + # full graph training + blocks = [self.g] * len(self.layers) + h = feats + for layer, block in zip(self.layers, blocks): + block = block.to(self.device) + h = layer(block, h) + return h + + +def init_emb(shape, dtype): + arr = th.zeros(shape, dtype=dtype) + nn.init.uniform_(arr, -1.0, 1.0) + return arr + + +class DistEmbedLayer(nn.Module): + r"""Embedding layer for featureless heterograph. + Parameters + ---------- + dev_id : int + Device to run the layer. + g : DistGraph + training graph + embed_size : int + Output embed size + sparse_emb: bool + Whether to use sparse embedding + Default: False + dgl_sparse_emb: bool + Whether to use DGL sparse embedding + Default: False + embed_name : str, optional + Embed name + """ + + def __init__( + self, + dev_id, + g, + embed_size, + sparse_emb=False, + dgl_sparse_emb=False, + feat_name="feat", + embed_name="node_emb", + ): + super(DistEmbedLayer, self).__init__() + self.dev_id = dev_id + self.embed_size = embed_size + self.embed_name = embed_name + self.feat_name = feat_name + self.sparse_emb = sparse_emb + self.g = g + self.ntype_id_map = {g.get_ntype_id(ntype): ntype for ntype in g.ntypes} + + self.node_projs = nn.ModuleDict() + for ntype in g.ntypes: + if feat_name in g.nodes[ntype].data: + self.node_projs[ntype] = nn.Linear( + g.nodes[ntype].data[feat_name].shape[1], embed_size + ) + nn.init.xavier_uniform_(self.node_projs[ntype].weight) + print("node {} has data {}".format(ntype, feat_name)) + if sparse_emb: + if dgl_sparse_emb: + self.node_embeds = {} + for ntype in g.ntypes: + # We only create embeddings for nodes without node features. + if feat_name not in g.nodes[ntype].data: + part_policy = g.get_node_partition_policy(ntype) + self.node_embeds[ntype] = dgl.distributed.DistEmbedding( + g.num_nodes(ntype), + self.embed_size, + embed_name + "_" + ntype, + init_emb, + part_policy, + ) + else: + self.node_embeds = nn.ModuleDict() + for ntype in g.ntypes: + # We only create embeddings for nodes without node features. 
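+                    # (`DistEmbedding` shards this table across machines with
+                    # the node partition policy fetched below, so each worker
+                    # holds only the rows of its own partition.)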
+ if feat_name not in g.nodes[ntype].data: + self.node_embeds[ntype] = th.nn.Embedding( + g.num_nodes(ntype), + self.embed_size, + sparse=self.sparse_emb, + ) + nn.init.uniform_( + self.node_embeds[ntype].weight, -1.0, 1.0 + ) + else: + self.node_embeds = nn.ModuleDict() + for ntype in g.ntypes: + # We only create embeddings for nodes without node features. + if feat_name not in g.nodes[ntype].data: + self.node_embeds[ntype] = th.nn.Embedding( + g.num_nodes(ntype), self.embed_size + ) + nn.init.uniform_(self.node_embeds[ntype].weight, -1.0, 1.0) + + def forward(self, node_ids): + """Forward computation + Parameters + ---------- + node_ids : dict of Tensor + node ids to generate embedding for. + Returns + ------- + tensor + embeddings as the input of the next layer + """ + embeds = {} + for ntype in node_ids: + if self.feat_name in self.g.nodes[ntype].data: + embeds[ntype] = self.node_projs[ntype]( + self.g.nodes[ntype] + .data[self.feat_name][node_ids[ntype]] + .to(self.dev_id) + ) + else: + embeds[ntype] = self.node_embeds[ntype](node_ids[ntype]).to( + self.dev_id + ) + return embeds + + +def compute_acc(results, labels): + """ + Compute the accuracy of prediction given the labels. + """ + labels = labels.long() + return (results == labels).float().sum() / len(results) + + +def evaluate( + g, + model, + embed_layer, + labels, + eval_loader, + test_loader, + all_val_nid, + all_test_nid, +): + model.eval() + embed_layer.eval() + eval_logits = [] + eval_seeds = [] + + global_results = dgl.distributed.DistTensor( + labels.shape, th.long, "results", persistent=True + ) + + with th.no_grad(): + th.cuda.empty_cache() + for sample_data in tqdm.tqdm(eval_loader): + input_nodes, seeds, blocks = sample_data + seeds = seeds["paper"] + feats = embed_layer(input_nodes) + logits = model(blocks, feats) + assert len(logits) == 1 + logits = logits["paper"] + eval_logits.append(logits.cpu().detach()) + assert np.all(seeds.numpy() < g.num_nodes("paper")) + eval_seeds.append(seeds.cpu().detach()) + eval_logits = th.cat(eval_logits) + eval_seeds = th.cat(eval_seeds) + global_results[eval_seeds] = eval_logits.argmax(dim=1) + + test_logits = [] + test_seeds = [] + with th.no_grad(): + th.cuda.empty_cache() + for sample_data in tqdm.tqdm(test_loader): + input_nodes, seeds, blocks = sample_data + seeds = seeds["paper"] + feats = embed_layer(input_nodes) + logits = model(blocks, feats) + assert len(logits) == 1 + logits = logits["paper"] + test_logits.append(logits.cpu().detach()) + assert np.all(seeds.numpy() < g.num_nodes("paper")) + test_seeds.append(seeds.cpu().detach()) + test_logits = th.cat(test_logits) + test_seeds = th.cat(test_seeds) + global_results[test_seeds] = test_logits.argmax(dim=1) + + g.barrier() + if g.rank() == 0: + return compute_acc( + global_results[all_val_nid], labels[all_val_nid] + ), compute_acc(global_results[all_test_nid], labels[all_test_nid]) + else: + return -1, -1 + + +def run(args, device, data): + ( + g, + num_classes, + train_nid, + val_nid, + test_nid, + labels, + all_val_nid, + all_test_nid, + ) = data + + fanouts = [int(fanout) for fanout in args.fanout.split(",")] + val_fanouts = [int(fanout) for fanout in args.validation_fanout.split(",")] + + sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts) + dataloader = dgl.dataloading.DistNodeDataLoader( + g, + {"paper": train_nid}, + sampler, + batch_size=args.batch_size, + shuffle=True, + drop_last=False, + ) + + valid_sampler = dgl.dataloading.MultiLayerNeighborSampler(val_fanouts) + valid_dataloader = 
dgl.dataloading.DistNodeDataLoader( + g, + {"paper": val_nid}, + valid_sampler, + batch_size=args.batch_size, + shuffle=False, + drop_last=False, + ) + + test_sampler = dgl.dataloading.MultiLayerNeighborSampler(val_fanouts) + test_dataloader = dgl.dataloading.DistNodeDataLoader( + g, + {"paper": test_nid}, + test_sampler, + batch_size=args.eval_batch_size, + shuffle=False, + drop_last=False, + ) + + embed_layer = DistEmbedLayer( + device, + g, + args.n_hidden, + sparse_emb=args.sparse_embedding, + dgl_sparse_emb=args.dgl_sparse, + feat_name="feat", + ) + + model = EntityClassify( + device, + args.n_hidden, + num_classes, + g.etypes, + num_bases=args.n_bases, + num_hidden_layers=args.n_layers - 2, + dropout=args.dropout, + use_self_loop=args.use_self_loop, + layer_norm=args.layer_norm, + ) + model = model.to(device) + + if not args.standalone: + if args.num_gpus == -1: + model = DistributedDataParallel(model) + # If there are dense parameters in the embedding layer + # or we use Pytorch saprse embeddings. + if len(embed_layer.node_projs) > 0 or not args.dgl_sparse: + embed_layer = DistributedDataParallel(embed_layer) + else: + dev_id = g.rank() % args.num_gpus + model = DistributedDataParallel( + model, device_ids=[dev_id], output_device=dev_id + ) + # If there are dense parameters in the embedding layer + # or we use Pytorch saprse embeddings. + if len(embed_layer.node_projs) > 0 or not args.dgl_sparse: + embed_layer = embed_layer.to(device) + embed_layer = DistributedDataParallel( + embed_layer, device_ids=[dev_id], output_device=dev_id + ) + + if args.sparse_embedding: + if args.dgl_sparse and args.standalone: + emb_optimizer = dgl.distributed.optim.SparseAdam( + list(embed_layer.node_embeds.values()), lr=args.sparse_lr + ) + print( + "optimize DGL sparse embedding:", embed_layer.node_embeds.keys() + ) + elif args.dgl_sparse: + emb_optimizer = dgl.distributed.optim.SparseAdam( + list(embed_layer.module.node_embeds.values()), lr=args.sparse_lr + ) + print( + "optimize DGL sparse embedding:", + embed_layer.module.node_embeds.keys(), + ) + elif args.standalone: + emb_optimizer = th.optim.SparseAdam( + list(embed_layer.node_embeds.parameters()), lr=args.sparse_lr + ) + print("optimize Pytorch sparse embedding:", embed_layer.node_embeds) + else: + emb_optimizer = th.optim.SparseAdam( + list(embed_layer.module.node_embeds.parameters()), + lr=args.sparse_lr, + ) + print( + "optimize Pytorch sparse embedding:", + embed_layer.module.node_embeds, + ) + + dense_params = list(model.parameters()) + if args.standalone: + dense_params += list(embed_layer.node_projs.parameters()) + print("optimize dense projection:", embed_layer.node_projs) + else: + dense_params += list(embed_layer.module.node_projs.parameters()) + print("optimize dense projection:", embed_layer.module.node_projs) + optimizer = th.optim.Adam( + dense_params, lr=args.lr, weight_decay=args.l2norm + ) + else: + all_params = list(model.parameters()) + list(embed_layer.parameters()) + optimizer = th.optim.Adam( + all_params, lr=args.lr, weight_decay=args.l2norm + ) + + # training loop + print("start training...") + for epoch in range(args.n_epochs): + tic = time.time() + + sample_time = 0 + copy_time = 0 + forward_time = 0 + backward_time = 0 + update_time = 0 + number_train = 0 + number_input = 0 + + step_time = [] + iter_t = [] + sample_t = [] + feat_copy_t = [] + forward_t = [] + backward_t = [] + update_t = [] + iter_tput = [] + + start = time.time() + # Loop over the dataloader to sample the computation dependency graph as a list of + 
# blocks. + step_time = [] + for step, sample_data in enumerate(dataloader): + input_nodes, seeds, blocks = sample_data + seeds = seeds["paper"] + number_train += seeds.shape[0] + number_input += np.sum( + [blocks[0].num_src_nodes(ntype) for ntype in blocks[0].ntypes] + ) + tic_step = time.time() + sample_time += tic_step - start + sample_t.append(tic_step - start) + + feats = embed_layer(input_nodes) + label = labels[seeds].to(device) + copy_time = time.time() + feat_copy_t.append(copy_time - tic_step) + + # forward + logits = model(blocks, feats) + assert len(logits) == 1 + logits = logits["paper"] + loss = F.cross_entropy(logits, label) + forward_end = time.time() + + # backward + optimizer.zero_grad() + if args.sparse_embedding: + emb_optimizer.zero_grad() + loss.backward() + compute_end = time.time() + forward_t.append(forward_end - copy_time) + backward_t.append(compute_end - forward_end) + + # Update model parameters + optimizer.step() + if args.sparse_embedding: + emb_optimizer.step() + update_t.append(time.time() - compute_end) + step_t = time.time() - start + step_time.append(step_t) + + train_acc = th.sum(logits.argmax(dim=1) == label).item() / len( + seeds + ) + + if step % args.log_every == 0: + print( + "[{}] Epoch {:05d} | Step {:05d} | Train acc {:.4f} | Loss {:.4f} | time {:.3f} s" + "| sample {:.3f} | copy {:.3f} | forward {:.3f} | backward {:.3f} | update {:.3f}".format( + g.rank(), + epoch, + step, + train_acc, + loss.item(), + np.sum(step_time[-args.log_every :]), + np.sum(sample_t[-args.log_every :]), + np.sum(feat_copy_t[-args.log_every :]), + np.sum(forward_t[-args.log_every :]), + np.sum(backward_t[-args.log_every :]), + np.sum(update_t[-args.log_every :]), + ) + ) + start = time.time() + + gc.collect() + print( + "[{}]Epoch Time(s): {:.4f}, sample: {:.4f}, data copy: {:.4f}, forward: {:.4f}, backward: {:.4f}, update: {:.4f}, #train: {}, #input: {}".format( + g.rank(), + np.sum(step_time), + np.sum(sample_t), + np.sum(feat_copy_t), + np.sum(forward_t), + np.sum(backward_t), + np.sum(update_t), + number_train, + number_input, + ) + ) + epoch += 1 + + start = time.time() + g.barrier() + val_acc, test_acc = evaluate( + g, + model, + embed_layer, + labels, + valid_dataloader, + test_dataloader, + all_val_nid, + all_test_nid, + ) + if val_acc >= 0: + print( + "Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}".format( + val_acc, test_acc, time.time() - start + ) + ) + + +def main(args): + if args.graphbolt: + print("Using GraphBolt") + dgl.distributed.initialize(args.ip_config, use_graphbolt=args.graphbolt) + if not args.standalone: + th.distributed.init_process_group(backend="gloo") + + g = dgl.distributed.DistGraph( + args.graph_name, + part_config=args.conf_path, + use_graphbolt=args.graphbolt, + ) + print("rank:", g.rank()) + + g.barrier() + return + + pb = g.get_partition_book() + if "trainer_id" in g.nodes["paper"].data: + train_nid = dgl.distributed.node_split( + g.nodes["paper"].data["train_mask"], + pb, + ntype="paper", + force_even=True, + node_trainer_ids=g.nodes["paper"].data["trainer_id"], + ) + val_nid = dgl.distributed.node_split( + g.nodes["paper"].data["val_mask"], + pb, + ntype="paper", + force_even=True, + node_trainer_ids=g.nodes["paper"].data["trainer_id"], + ) + test_nid = dgl.distributed.node_split( + g.nodes["paper"].data["test_mask"], + pb, + ntype="paper", + force_even=True, + node_trainer_ids=g.nodes["paper"].data["trainer_id"], + ) + else: + train_nid = dgl.distributed.node_split( + g.nodes["paper"].data["train_mask"], + pb, + ntype="paper", + 
force_even=True, + ) + val_nid = dgl.distributed.node_split( + g.nodes["paper"].data["val_mask"], + pb, + ntype="paper", + force_even=True, + ) + test_nid = dgl.distributed.node_split( + g.nodes["paper"].data["test_mask"], + pb, + ntype="paper", + force_even=True, + ) + local_nid = pb.partid2nids(pb.partid, "paper").detach().numpy() + print( + "part {}, train: {} (local: {}), val: {} (local: {}), test: {} (local: {})".format( + g.rank(), + len(train_nid), + len(np.intersect1d(train_nid.numpy(), local_nid)), + len(val_nid), + len(np.intersect1d(val_nid.numpy(), local_nid)), + len(test_nid), + len(np.intersect1d(test_nid.numpy(), local_nid)), + ) + ) + if args.num_gpus == -1: + device = th.device("cpu") + else: + dev_id = g.rank() % args.num_gpus + device = th.device("cuda:" + str(dev_id)) + labels = g.nodes["paper"].data["labels"][np.arange(g.num_nodes("paper"))] + all_val_nid = th.LongTensor( + np.nonzero( + g.nodes["paper"].data["val_mask"][np.arange(g.num_nodes("paper"))] + ) + ).squeeze() + all_test_nid = th.LongTensor( + np.nonzero( + g.nodes["paper"].data["test_mask"][np.arange(g.num_nodes("paper"))] + ) + ).squeeze() + n_classes = len(th.unique(labels[labels >= 0])) + print("#classes:", n_classes) + + run( + args, + device, + ( + g, + n_classes, + train_nid, + val_nid, + test_nid, + labels, + all_val_nid, + all_test_nid, + ), + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="RGCN") + # distributed training related + parser.add_argument( + "--graphbolt", + default=False, + action="store_true", + help="train with GraphBolt", + ) + parser.add_argument("--graph-name", type=str, help="graph name") + parser.add_argument("--id", type=int, help="the partition id") + parser.add_argument( + "--ip-config", type=str, help="The file for IP configuration" + ) + parser.add_argument( + "--conf-path", type=str, help="The path to the partition config file" + ) + + # rgcn related + parser.add_argument( + "--num_gpus", + type=int, + default=-1, + help="the number of GPU device. Use -1 for CPU training", + ) + parser.add_argument( + "--dropout", type=float, default=0, help="dropout probability" + ) + parser.add_argument( + "--n-hidden", type=int, default=16, help="number of hidden units" + ) + parser.add_argument("--lr", type=float, default=1e-2, help="learning rate") + parser.add_argument( + "--sparse-lr", type=float, default=1e-2, help="sparse lr rate" + ) + parser.add_argument( + "--n-bases", + type=int, + default=-1, + help="number of filter weight matrices, default: -1 [use all]", + ) + parser.add_argument( + "--n-layers", type=int, default=2, help="number of propagation rounds" + ) + parser.add_argument( + "-e", + "--n-epochs", + type=int, + default=50, + help="number of training epochs", + ) + parser.add_argument( + "-d", "--dataset", type=str, required=True, help="dataset to use" + ) + parser.add_argument("--l2norm", type=float, default=0, help="l2 norm coef") + parser.add_argument( + "--relabel", + default=False, + action="store_true", + help="remove untouched nodes and relabel", + ) + parser.add_argument( + "--fanout", + type=str, + default="4, 4", + help="Fan-out of neighbor sampling.", + ) + parser.add_argument( + "--validation-fanout", + type=str, + default=None, + help="Fan-out of neighbor sampling during validation.", + ) + parser.add_argument( + "--use-self-loop", + default=False, + action="store_true", + help="include self feature as a special relation", + ) + parser.add_argument( + "--batch-size", type=int, default=100, help="Mini-batch size. 
" + ) + parser.add_argument( + "--eval-batch-size", type=int, default=128, help="Mini-batch size. " + ) + parser.add_argument("--log-every", type=int, default=20) + parser.add_argument( + "--low-mem", + default=False, + action="store_true", + help="Whether use low mem RelGraphCov", + ) + parser.add_argument( + "--sparse-embedding", + action="store_true", + help="Use sparse embedding for node embeddings.", + ) + parser.add_argument( + "--dgl-sparse", + action="store_true", + help="Whether to use DGL sparse embedding", + ) + parser.add_argument( + "--layer-norm", + default=False, + action="store_true", + help="Use layer norm", + ) + parser.add_argument( + "--local_rank", type=int, help="get rank of the process" + ) + parser.add_argument( + "--standalone", action="store_true", help="run in the standalone mode" + ) + args = parser.parse_args() + + # if validation_fanout is None, set it with args.fanout + if args.validation_fanout is None: + args.validation_fanout = args.fanout + print(args) + main(args) diff --git a/examples/pytorch/rgcn/experimental/ip_config.txt b/examples/pytorch/rgcn/experimental/ip_config.txt new file mode 100644 index 000000000000..f7bec5c8124c --- /dev/null +++ b/examples/pytorch/rgcn/experimental/ip_config.txt @@ -0,0 +1,2 @@ +172.31.14.101 +172.31.8.229 diff --git a/python/dgl/distributed/dist_context.py b/python/dgl/distributed/dist_context.py index 51af0afeafb0..0e81b617ef6c 100644 --- a/python/dgl/distributed/dist_context.py +++ b/python/dgl/distributed/dist_context.py @@ -210,6 +210,7 @@ def initialize( max_queue_size=MAX_QUEUE_SIZE, net_type=None, num_worker_threads=1, + use_graphbolt=False, ): """Initialize DGL's distributed module @@ -231,6 +232,8 @@ def initialize( [Deprecated] Networking type, can be 'socket' only. num_worker_threads: int The number of OMP threads in each sampler process. + use_graphbolt: bool + Whether to use graphbolt for sampling. Note ---- @@ -270,6 +273,7 @@ def initialize( int(os.environ.get("DGL_NUM_CLIENT")), os.environ.get("DGL_CONF_PATH"), graph_format=formats, + use_graphbolt=use_graphbolt, ) serv.start() sys.exit() From 1cabe94fc5eec14351bdef36f18781aa20dc1ba6 Mon Sep 17 00:00:00 2001 From: RhettYing Date: Tue, 24 Oct 2023 07:11:59 +0000 Subject: [PATCH 09/30] [gb_distdgl] update todo list --- dt.py | 3 +++ examples/pytorch/rgcn/experimental/gb_demo.py | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dt.py b/dt.py index 2ce284fe0333..331445075b8f 100644 --- a/dt.py +++ b/dt.py @@ -37,6 +37,9 @@ ''' # [TODO][P0] Load `CSCSamplingGraph` into `DistGraph`. +# done@2023-10-24 15:10:00 +# see details in: https://github.com/Rhett-Ying/dgl/commits/gb_distdgl +# 222dd2bd51084cc4f242148b0a7e6e5d91e0ae80 ## NID/EIDs are required. 
g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config) diff --git a/examples/pytorch/rgcn/experimental/gb_demo.py b/examples/pytorch/rgcn/experimental/gb_demo.py index 06d6d9c460d2..118ce98c1f7a 100644 --- a/examples/pytorch/rgcn/experimental/gb_demo.py +++ b/examples/pytorch/rgcn/experimental/gb_demo.py @@ -459,6 +459,8 @@ def run(args, device, data): fanouts = [int(fanout) for fanout in args.fanout.split(",")] val_fanouts = [int(fanout) for fanout in args.validation_fanout.split(",")] + g.barrier() + return sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts) dataloader = dgl.dataloading.DistNodeDataLoader( g, @@ -717,9 +719,6 @@ def main(args): ) print("rank:", g.rank()) - g.barrier() - return - pb = g.get_partition_book() if "trainer_id" in g.nodes["paper"].data: train_nid = dgl.distributed.node_split( @@ -774,6 +773,7 @@ def main(args): len(np.intersect1d(test_nid.numpy(), local_nid)), ) ) + if args.num_gpus == -1: device = th.device("cpu") else: From 70c3a9d31102f9cc10f1d228a173f0f4022afc04 Mon Sep 17 00:00:00 2001 From: RhettYing Date: Wed, 25 Oct 2023 00:39:51 +0000 Subject: [PATCH 10/30] [gb_distdgl] dataloader is created --- examples/pytorch/rgcn/experimental/gb_demo.py | 79 ++++++++++++------- python/dgl/distributed/__init__.py | 8 +- python/dgl/distributed/dist_graph.py | 18 +++++ 3 files changed, 76 insertions(+), 29 deletions(-) diff --git a/examples/pytorch/rgcn/experimental/gb_demo.py b/examples/pytorch/rgcn/experimental/gb_demo.py index 118ce98c1f7a..84f1f6c891f1 100644 --- a/examples/pytorch/rgcn/experimental/gb_demo.py +++ b/examples/pytorch/rgcn/experimental/gb_demo.py @@ -69,7 +69,7 @@ def __init__( bias=True, activation=None, self_loop=False, - dropout=0.0 + dropout=0.0, ): super(RelGraphConvLayer, self).__init__() self.in_feat = in_feat @@ -444,6 +444,42 @@ def evaluate( return -1, -1 +def create_itemset(g, nodes, labels, category): + gpb = g.get_partition_book() + if isinstance(nodes, dict): + assert ( + category in nodes + ), f"Category {category} not in nodes: {nodes.keys()}." 
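+        # Index labels by the category's seed nodes first so each
+        # (seed_node, label) pair lines up positionally in the ItemSet
+        # constructed below.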
+ labels = labels[nodes[category]] + homo_nids = [] + for ntype in nodes.keys(): + assert ( + ntype in gpb.ntypes + ), "The sampled node type {} does not exist in the input graph".format( + ntype + ) + homo_nids.append(gpb.map_to_homo_nid(nodes[ntype], ntype)) + nodes = th.cat(homo_nids, 0) + print(nodes) + + return gb.ItemSet((nodes, labels), names=("seed_nodes", "labels")) + + +def create_dataloader(g, nodes, labels, batch_size, shuffle, fanouts): + item_set = create_itemset(g, nodes, labels, "paper") + + datapipe = gb.ItemSampler(item_set, batch_size=batch_size, shuffle=shuffle) + + datapipe = datapipe.distributed_sample_neighbor(g, fanouts=fanouts) + + # datapipe = datapipe.to_dgl() + + # device=th.device("cpu") + # datapipe = datapipe.copy_to(device) + + return gb.MultiProcessDataLoader(datapipe, num_workers=0) + + def run(args, device, data): ( g, @@ -459,37 +495,24 @@ def run(args, device, data): fanouts = [int(fanout) for fanout in args.fanout.split(",")] val_fanouts = [int(fanout) for fanout in args.validation_fanout.split(",")] - g.barrier() - return - sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts) - dataloader = dgl.dataloading.DistNodeDataLoader( - g, - {"paper": train_nid}, - sampler, - batch_size=args.batch_size, - shuffle=True, - drop_last=False, + print( + f"Rank[{g.rank()}] train_nid: {train_nid.shape}, labels: {labels.shape}" ) - valid_sampler = dgl.dataloading.MultiLayerNeighborSampler(val_fanouts) - valid_dataloader = dgl.dataloading.DistNodeDataLoader( - g, - {"paper": val_nid}, - valid_sampler, - batch_size=args.batch_size, - shuffle=False, - drop_last=False, + # Create dataloaders + train_dl = create_dataloader( + g, {"paper": train_nid}, labels, args.batch_size, True, fanouts ) + # val_dl = create_dataloader(g, {"paper": val_nid}, labels, args.batch_size, True, val_fanouts) + # test_dl = create_dataloader(g, {"paper": test_nid}, labels, args.eval_batch_size, True, val_fanouts) - test_sampler = dgl.dataloading.MultiLayerNeighborSampler(val_fanouts) - test_dataloader = dgl.dataloading.DistNodeDataLoader( - g, - {"paper": test_nid}, - test_sampler, - batch_size=args.eval_batch_size, - shuffle=False, - drop_last=False, - ) + for step, data in enumerate(tqdm.tqdm(train_dl, desc="DistDGL Training")): + pass + + g.barrier() + if g.rank() == 0: + time.sleep(5) + return embed_layer = DistEmbedLayer( device, diff --git a/python/dgl/distributed/__init__.py b/python/dgl/distributed/__init__.py index 6b7d322841ea..2a3fe2b8a647 100644 --- a/python/dgl/distributed/__init__.py +++ b/python/dgl/distributed/__init__.py @@ -2,7 +2,13 @@ from . import optim from .dist_context import exit_client, initialize from .dist_dataloader import DistDataLoader -from .dist_graph import DistGraph, DistGraphServer, edge_split, node_split +from .dist_graph import ( + DistGraph, + DistGraphServer, + DistributedNeighborSampler, + edge_split, + node_split, +) from .dist_tensor import DistTensor from .graph_partition_book import GraphPartitionBook, PartitionPolicy from .graph_services import * diff --git a/python/dgl/distributed/dist_graph.py b/python/dgl/distributed/dist_graph.py index 967d9505a792..7ebe7c1787ab 100644 --- a/python/dgl/distributed/dist_graph.py +++ b/python/dgl/distributed/dist_graph.py @@ -8,6 +8,9 @@ import numpy as np +import torch +from torch.utils.data import functional_datapipe + from .. 
import backend as F, graphbolt as gb, heterograph_index from .._ffi.ndarray import empty_shared_mem from ..base import ALL, DGLError, EID, ETYPE, is_all, NID from ..convert import graph as dgl_graph, heterograph as dgl_heterograph @@ -52,6 +55,21 @@ INIT_GRAPH = 800001 +@functional_datapipe("distributed_sample_neighbor") +class DistributedNeighborSampler(gb.NeighborSampler): + """Distributed Neighbor Sampler. + + This is a wrapper of :py:class:`dgl.graphbolt.NeighborSampler` to support distributed + training. It samples neighbors from a distributed graph. + """ + + def __init__(self, datapipe, graph, fanouts): + super().__init__(datapipe, graph, fanouts) + + def _sample_subgraphs(self, seeds): + return seeds, [] + + class InitGraphRequest(rpc.Request): """Init graph on the backup servers. From 9362afe94258742daf6f4bfcb9a2419fd77c51bb Mon Sep 17 00:00:00 2001 From: RhettYing Date: Wed, 25 Oct 2023 00:54:37 +0000 Subject: [PATCH 11/30] [gb_distdgl] create ItemSet with dict input --- examples/pytorch/rgcn/experimental/gb_demo.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/examples/pytorch/rgcn/experimental/gb_demo.py b/examples/pytorch/rgcn/experimental/gb_demo.py index 84f1f6c891f1..b9d027ebdcf1 100644 --- a/examples/pytorch/rgcn/experimental/gb_demo.py +++ b/examples/pytorch/rgcn/experimental/gb_demo.py @@ -444,14 +444,11 @@ def evaluate( return -1, -1 -def create_itemset(g, nodes, labels, category): +def create_itemset(g, nodes, labels): gpb = g.get_partition_book() if isinstance(nodes, dict): - assert ( - category in nodes - ), f"Category {category} not in nodes: {nodes.keys()}." - labels = labels[nodes[category]] homo_nids = [] + homo_labels = [] for ntype in nodes.keys(): assert ( ntype in gpb.ntypes ), "The sampled node type {} does not exist in the input graph".format( ntype ) homo_nids.append(gpb.map_to_homo_nid(nodes[ntype], ntype)) + assert ntype in labels, f"{ntype} not found in labels."
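+            # Append labels in the same order as the homogenized node IDs so
+            # the two concatenated tensors below stay aligned.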
+ homo_labels.append(labels[ntype]) nodes = th.cat(homo_nids, 0) - print(nodes) - + labels = th.cat(homo_labels, 0) return gb.ItemSet((nodes, labels), names=("seed_nodes", "labels")) def create_dataloader(g, nodes, labels, batch_size, shuffle, fanouts): - item_set = create_itemset(g, nodes, labels, "paper") + item_set = create_itemset(g, nodes, labels) datapipe = gb.ItemSampler(item_set, batch_size=batch_size, shuffle=shuffle) @@ -501,7 +499,12 @@ def run(args, device, data): # Create dataloaders train_dl = create_dataloader( - g, {"paper": train_nid}, labels, args.batch_size, True, fanouts + g, + {"paper": train_nid}, + {"paper": labels}, + args.batch_size, + True, + fanouts, ) # val_dl = create_dataloader(g, {"paper": val_nid}, labels, args.batch_size, True, val_fanouts) # test_dl = create_dataloader(g, {"paper": test_nid}, labels, args.eval_batch_size, True, val_fanouts) From f440df16e1c7379843f7bb9445922038deb17907 Mon Sep 17 00:00:00 2001 From: RhettYing Date: Wed, 25 Oct 2023 05:27:43 +0000 Subject: [PATCH 12/30] [gb_distdgl] _distributed_access works with graph.sample hacked --- examples/pytorch/rgcn/experimental/gb_demo.py | 13 ++- .../rgcn/experimental/partition_graph.py | 2 + python/dgl/distributed/dist_graph.py | 22 ++++- python/dgl/distributed/graph_services.py | 86 ++++++++++++++----- python/dgl/graphbolt/impl/neighbor_sampler.py | 6 ++ 5 files changed, 98 insertions(+), 31 deletions(-) diff --git a/examples/pytorch/rgcn/experimental/gb_demo.py b/examples/pytorch/rgcn/experimental/gb_demo.py index b9d027ebdcf1..026ca136e501 100644 --- a/examples/pytorch/rgcn/experimental/gb_demo.py +++ b/examples/pytorch/rgcn/experimental/gb_demo.py @@ -447,19 +447,18 @@ def evaluate( def create_itemset(g, nodes, labels): gpb = g.get_partition_book() if isinstance(nodes, dict): - homo_nids = [] - homo_labels = [] - for ntype in nodes.keys(): + data = {} + for ntype in nodes: assert ( ntype in gpb.ntypes ), "The sampled node type {} does not exist in the input graph".format( ntype ) - homo_nids.append(gpb.map_to_homo_nid(nodes[ntype], ntype)) assert ntype in labels, f"{ntype} not found in labels." - homo_labels.append(labels[ntype]) - nodes = th.cat(homo_nids, 0) - labels = th.cat(homo_labels, 0) + data[ntype] = gb.ItemSet( + (nodes[ntype], labels[ntype]), names=("seed_nodes", "labels") + ) + return gb.ItemSetDict(data) return gb.ItemSet((nodes, labels), names=("seed_nodes", "labels")) diff --git a/examples/pytorch/rgcn/experimental/partition_graph.py b/examples/pytorch/rgcn/experimental/partition_graph.py index af430261609f..da7611d0b0ee 100644 --- a/examples/pytorch/rgcn/experimental/partition_graph.py +++ b/examples/pytorch/rgcn/experimental/partition_graph.py @@ -133,4 +133,6 @@ def load_ogb(dataset): balance_edges=args.balance_edges, num_trainers_per_machine=args.num_trainers_per_machine, use_graphbolt=args.graphbolt, + gb_store_orig_nids=True, + gb_store_orig_eids=True, ) diff --git a/python/dgl/distributed/dist_graph.py b/python/dgl/distributed/dist_graph.py index 7ebe7c1787ab..c0250d0d8287 100644 --- a/python/dgl/distributed/dist_graph.py +++ b/python/dgl/distributed/dist_graph.py @@ -64,10 +64,19 @@ class DistributedNeighborSampler(gb.NeighborSampler): """ def __init__(self, datapipe, graph, fanouts): - super().__init__(datapipe, graph, fanouts) + super().__init__(datapipe, graph._g, fanouts) + self.dist_graph = graph def _sample_subgraphs(self, seeds): - return seeds, [] + sampled_graphs = [] # In DGLGraph or DGLHeteroGraph format. + for fanout in self.fanouts: + # fanout is a tensor. 
We need to convert it to integer. + sampled_graphs.append( + self.dist_graph.sample_neighbors( + seeds, fanout.item(), use_graphbolt=True + ) + ) + return seeds, sampled_graphs class InitGraphRequest(rpc.Request): @@ -1400,6 +1409,7 @@ def sample_neighbors( replace=False, etype_sorted=True, output_device=None, + use_graphbolt=False, ): # pylint: disable=unused-argument """Sample neighbors from a distributed graph.""" @@ -1411,10 +1421,16 @@ def sample_neighbors( replace=replace, etype_sorted=etype_sorted, prob=prob, + use_graphbolt=use_graphbolt, ) else: frontier = graph_services.sample_neighbors( - self, seed_nodes, fanout, replace=replace, prob=prob + self, + seed_nodes, + fanout, + replace=replace, + prob=prob, + use_graphbolt=use_graphbolt, ) return frontier diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py index 0a732ca0e7b0..1c7259489324 100644 --- a/python/dgl/distributed/graph_services.py +++ b/python/dgl/distributed/graph_services.py @@ -3,8 +3,10 @@ import numpy as np -from .. import backend as F -from ..base import EID, NID +import torch + +from .. import backend as F, graphbolt as gb +from ..base import dgl_warning, DGLError, EID, NID from ..convert import graph, heterograph from ..sampling import ( sample_etype_neighbors as local_sample_etype_neighbors, @@ -106,6 +108,7 @@ def _sample_etype_neighbors( prob, replace, etype_sorted=False, + use_graphbolt=False, ): """Sample from local partition. @@ -115,25 +118,40 @@ def _sample_etype_neighbors( and edge IDs. """ local_ids = partition_book.nid2localnid(seed_nodes, partition_book.partid) - local_ids = F.astype(local_ids, local_g.idtype) - - sampled_graph = local_sample_etype_neighbors( - local_g, - local_ids, - etype_offset, - fan_out, - edge_dir, - prob, - replace, - etype_sorted=etype_sorted, - _dist_training=True, - ) - global_nid_mapping = local_g.ndata[NID] - src, dst = sampled_graph.edges() + if not use_graphbolt: + local_ids = F.astype(local_ids, local_g.idtype) + local_src, local_dst, local_eids = None, None, None + if use_graphbolt: + local_src, local_dst = gb.NeighborSampler.distributed_sample_neighbor( + local_g, local_ids, fan_out + ) + else: + sampled_graph = local_sample_etype_neighbors( + local_g, + local_ids, + etype_offset, + fan_out, + edge_dir, + prob, + replace, + etype_sorted=etype_sorted, + _dist_training=True, + ) + local_src, local_dst = sampled_graph.edges() + local_eids = sampled_graph.edata[EID] + if use_graphbolt: + global_nid_mapping = local_g.node_attributes[NID] + global_eids = local_eids + else: + global_nid_mapping = local_g.ndata[NID] + global_eids = F.gather_row(local_g.edata[EID], local_eids) global_src, global_dst = F.gather_row( - global_nid_mapping, src - ), F.gather_row(global_nid_mapping, dst) - global_eids = F.gather_row(local_g.edata[EID], sampled_graph.edata[EID]) + global_nid_mapping, local_src + ), F.gather_row(global_nid_mapping, local_dst) + + # [Rui] Hack for graphbolt case as EID is not returned for now. 
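+    # The GraphBolt branch above returns only (src, dst) for now; the zeros
+    # below keep the (src, dst, eid) triple well-formed for merge_graphs()
+    # until real global EIDs are propagated.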
+ if global_eids is None: + global_eids = torch.zeros((global_src.shape[0],), dtype=torch.int64) return global_src, global_dst, global_eids @@ -268,6 +286,7 @@ def __init__( prob=None, replace=False, etype_sorted=True, + use_graphbolt=False, ): self.seed_nodes = nodes self.edge_dir = edge_dir @@ -275,6 +294,7 @@ def __init__( self.replace = replace self.fan_out = fan_out self.etype_sorted = etype_sorted + self.use_graphbolt = use_graphbolt def __setstate__(self, state): ( @@ -284,6 +304,7 @@ def __setstate__(self, state): self.replace, self.fan_out, self.etype_sorted, + self.use_graphbolt, ) = state def __getstate__(self): @@ -294,6 +315,7 @@ def __getstate__(self): self.replace, self.fan_out, self.etype_sorted, + self.use_graphbolt, ) def process_request(self, server_state): @@ -319,6 +341,7 @@ def process_request(self, server_state): probs, self.replace, self.etype_sorted, + use_graphbolt=self.use_graphbolt, ) return SubgraphResponse(global_src, global_dst, global_eids) @@ -526,6 +549,7 @@ def _distributed_access(g, nodes, issue_remote_req, local_access): res_list.extend(results) sampled_graph = merge_graphs(res_list, g.num_nodes()) + print("sampled_graph: ", sampled_graph) return sampled_graph @@ -578,6 +602,7 @@ def sample_etype_neighbors( prob=None, replace=False, etype_sorted=True, + use_graphbolt=False, ): """Sample from the neighbors of the given nodes from a distributed graph. @@ -631,6 +656,8 @@ def sample_etype_neighbors( neighbors are sampled. If fanout == -1, all neighbors are collected. etype_sorted : bool, optional Indicates whether etypes are sorted. + use_graphbolt : bool, optional + Whether to use GraphBolt to sample neighbors. Returns ------- @@ -640,6 +667,10 @@ def sample_etype_neighbors( if isinstance(fanout, int): fanout = F.full_1d(len(g.canonical_etypes), fanout, F.int64, F.cpu()) else: + if use_graphbolt: + dgl_warning( + "----------- [Rui] Not covered in demo test yet. -----------" + ) etype_ids = {etype: i for i, etype in enumerate(g.canonical_etypes)} fanout_array = [None] * len(g.canonical_etypes) for etype, v in fanout.items(): @@ -688,6 +719,7 @@ def issue_remote_req(node_ids): prob=_prob, replace=replace, etype_sorted=etype_sorted, + use_graphbolt=use_graphbolt, ) def local_access(local_g, partition_book, local_nids): @@ -712,16 +744,27 @@ def local_access(local_g, partition_book, local_nids): _prob, replace, etype_sorted=etype_sorted, + use_graphbolt=use_graphbolt, ) frontier = _distributed_access(g, nodes, issue_remote_req, local_access) + return frontier if not gpb.is_homogeneous: + # [Rui] Crashed due to incorrect eids. return _frontier_to_heterogeneous_graph(g, frontier, gpb) else: return frontier -def sample_neighbors(g, nodes, fanout, edge_dir="in", prob=None, replace=False): +def sample_neighbors( + g, + nodes, + fanout, + edge_dir="in", + prob=None, + replace=False, + use_graphbolt=False, +): """Sample from the neighbors of the given nodes from a distributed graph. For each node, a number of inbound (or outbound when ``edge_dir == 'out'``) edges @@ -770,6 +813,7 @@ def sample_neighbors(g, nodes, fanout, edge_dir="in", prob=None, replace=False): DGLGraph A sampled subgraph containing only the sampled neighboring edges. It is on CPU. """ + assert not use_graphbolt, "GraphBolt is not supported in distributed mode." 
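+    # NOTE: only `sample_etype_neighbors` above carries a GraphBolt branch so
+    # far; this homogeneous entry point still expects a DGL-format partition,
+    # hence the assert.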
gpb = g.get_partition_book() if not gpb.is_homogeneous: assert isinstance(nodes, dict) diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index 8b18ecb4199f..6556ed4656b6 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -100,6 +100,12 @@ def __init__( self.deduplicate = deduplicate self.sampler = graph.sample_neighbors + @staticmethod + def distributed_sample_neighbor(graph, seeds, fanouts): + src_nodes = seeds.flip(0) + dst_nodes = seeds + return src_nodes, dst_nodes + def _sample_subgraphs(self, seeds): subgraphs = [] num_layers = len(self.fanouts) From 93feb0bc3d2cf68882ef1a7f5a81c5afef633d0c Mon Sep 17 00:00:00 2001 From: RhettYing Date: Wed, 25 Oct 2023 06:55:36 +0000 Subject: [PATCH 13/30] [gb_distdgl] Not worked as crashed in csc_sampling_graph::_check_sampler_arguments(). seeds should be converted from dict to list --- .../rgcn/experimental/partition_graph.py | 3 +- python/dgl/distributed/dist_graph.py | 3 +- python/dgl/distributed/graph_services.py | 54 ++++++++++++++++--- python/dgl/distributed/partition.py | 9 ++++ .../dgl/graphbolt/impl/csc_sampling_graph.py | 15 +++++- python/dgl/graphbolt/impl/neighbor_sampler.py | 9 ++-- .../graphbolt/impl/sampled_subgraph_impl.py | 1 + 7 files changed, 81 insertions(+), 13 deletions(-) diff --git a/examples/pytorch/rgcn/experimental/partition_graph.py b/examples/pytorch/rgcn/experimental/partition_graph.py index da7611d0b0ee..9a110d5cf778 100644 --- a/examples/pytorch/rgcn/experimental/partition_graph.py +++ b/examples/pytorch/rgcn/experimental/partition_graph.py @@ -134,5 +134,6 @@ def load_ogb(dataset): num_trainers_per_machine=args.num_trainers_per_machine, use_graphbolt=args.graphbolt, gb_store_orig_nids=True, - gb_store_orig_eids=True, + gb_store_orig_eids=False, + gb_store_etypes=True, ) diff --git a/python/dgl/distributed/dist_graph.py b/python/dgl/distributed/dist_graph.py index c0250d0d8287..a323633e4801 100644 --- a/python/dgl/distributed/dist_graph.py +++ b/python/dgl/distributed/dist_graph.py @@ -54,7 +54,7 @@ INIT_GRAPH = 800001 - +from torchdata.datapipes.iter import IterDataPipe @functional_datapipe("distributed_sample_neighbor") class DistributedNeighborSampler(gb.NeighborSampler): """Distributed Neighbor Sampler. @@ -76,6 +76,7 @@ def _sample_subgraphs(self, seeds): seeds, fanout.item(), use_graphbolt=True ) ) + print("sampled_graphs: ", sampled_graphs) return seeds, sampled_graphs diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py index 1c7259489324..db9597423c20 100644 --- a/python/dgl/distributed/graph_services.py +++ b/python/dgl/distributed/graph_services.py @@ -122,9 +122,12 @@ def _sample_etype_neighbors( local_ids = F.astype(local_ids, local_g.idtype) local_src, local_dst, local_eids = None, None, None if use_graphbolt: - local_src, local_dst = gb.NeighborSampler.distributed_sample_neighbor( + local_src, local_dst, local_eids = gb.NeighborSampler.distributed_sample_neighbor( local_g, local_ids, fan_out ) + assert local_src is not None and local_dst is not None and local_eids is not None, ( + "GraphBolt NeighborSampler.distributed_sample_neighbor() failed." + ) else: sampled_graph = local_sample_etype_neighbors( local_g, @@ -149,9 +152,6 @@ def _sample_etype_neighbors( global_nid_mapping, local_src ), F.gather_row(global_nid_mapping, local_dst) - # [Rui] Hack for graphbolt case as EID is not returned for now. 
- if global_eids is None: - global_eids = torch.zeros((global_src.shape[0],), dtype=torch.int64) return global_src, global_dst, global_eids @@ -594,6 +594,45 @@ def _frontier_to_heterogeneous_graph(g, frontier, gpb): return hg +def _frontier_to_heterogeneous_graph_gb(g, frontier, gpb): + '''[Rui] Do not use EID.''' + # We need to handle empty frontiers correctly. + if frontier.num_edges() == 0: + data_dict = { + etype: (np.zeros(0), np.zeros(0)) for etype in g.canonical_etypes + } + return heterograph( + data_dict, + {ntype: g.num_nodes(ntype) for ntype in g.ntypes}, + idtype=g.idtype, + ) + + # For GraphBolt, we store ETYPE into EID field. + etype_ids = frontier.edata[EID] + src, dst = frontier.edges() + etype_ids, idx = F.sort_1d(etype_ids) + src, dst = F.gather_row(src, idx), F.gather_row(dst, idx) + _, src = gpb.map_to_per_ntype(src) + _, dst = gpb.map_to_per_ntype(dst) + + data_dict = dict() + for etid, etype in enumerate(g.canonical_etypes): + type_idx = etype_ids == etid + if F.sum(type_idx, 0) > 0: + data_dict[etype] = ( + F.boolean_mask(src, type_idx), + F.boolean_mask(dst, type_idx), + ) + hg = heterograph( + data_dict, + {ntype: g.num_nodes(ntype) for ntype in g.ntypes}, + idtype=g.idtype, + ) + + return hg + + + def sample_etype_neighbors( g, nodes, @@ -748,10 +787,11 @@ def local_access(local_g, partition_book, local_nids): ) frontier = _distributed_access(g, nodes, issue_remote_req, local_access) - return frontier if not gpb.is_homogeneous: - # [Rui] Crashed due to incorrect eids. - return _frontier_to_heterogeneous_graph(g, frontier, gpb) + if use_graphbolt: + return _frontier_to_heterogeneous_graph_gb(g, frontier, gpb) + else: + return _frontier_to_heterogeneous_graph(g, frontier, gpb) else: return frontier diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py index 0f3f04b1f4f7..d4da8828fc82 100644 --- a/python/dgl/distributed/partition.py +++ b/python/dgl/distributed/partition.py @@ -1340,6 +1340,8 @@ def init_type_per_edge(graph, gpb): type_per_edge = type_per_edge.to(RESERVED_FIELD_DTYPE[ETYPE]) if len(etypes) < 128: type_per_edge = type_per_edge.to(torch.int8) + elif len(etypes) < 32768: + type_per_edge = type_per_edge.to(torch.int16) # Sanity check. assert len(type_per_edge) == graph.num_edges() @@ -1361,6 +1363,13 @@ def init_type_per_edge(graph, gpb): EID: graph.edata[EID].to(RESERVED_FIELD_DTYPE[EID]) } + if store_etypes: + # [Rui] Let's store as edge attributes for now. + if edge_attributes is None: + edge_attributes = {} + edge_attributes[ETYPE] = type_per_edge + type_per_edge = None + # Construct CSCSamplingGraph csc_graph = graphbolt.from_csc( indptr, diff --git a/python/dgl/graphbolt/impl/csc_sampling_graph.py b/python/dgl/graphbolt/impl/csc_sampling_graph.py index 05c580965d94..58110c04577a 100644 --- a/python/dgl/graphbolt/impl/csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/csc_sampling_graph.py @@ -358,6 +358,18 @@ def _convert_to_sampled_subgraph( original_edge_ids = self.edge_attributes[ORIGINAL_EDGE_ID][ original_edge_ids ] + + # [Rui] Extract ETYPEs from edge attributes. + original_etype_ids = None + has_original_etype_ids = ( + self.edge_attributes is not None + and ETYPE in self.edge_attributes + ) + if has_original_etype_ids: + original_etype_ids = self.edge_attributes[ETYPE][ + original_edge_ids + ] + if type_per_edge is None: # The sampled graph is already a homogeneous graph. 
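[Editor's note] The `_frontier_to_heterogeneous_graph_gb()` helper added above relies on one trick: the homogenized frontier carries per-edge type IDs (stored in the `EID` field for now), so sorting edges by type ID and boolean-masking per canonical etype recovers the per-type `(src, dst)` pairs. A self-contained sketch of that core step, using plain `torch` and made-up tensors (not part of the patch):

```
# Editorial sketch: group flat (src, dst) pairs by edge-type id, the way
# _frontier_to_heterogeneous_graph_gb() does before calling heterograph().
import torch

def split_by_etype(src, dst, etype_ids, num_etypes):
    etype_ids, idx = torch.sort(etype_ids)
    src, dst = src[idx], dst[idx]
    # One (src, dst) pair of tensors per edge type that actually occurs.
    return {
        etid: (src[etype_ids == etid], dst[etype_ids == etid])
        for etid in range(num_etypes)
        if (etype_ids == etid).any()
    }

src = torch.tensor([0, 1, 2, 3])
dst = torch.tensor([1, 2, 3, 0])
etype_ids = torch.tensor([1, 0, 1, 0])
print(split_by_etype(src, dst, etype_ids, num_etypes=2))
# {0: (tensor([1, 3]), tensor([2, 0])), 1: (tensor([0, 2]), tensor([1, 3]))}
```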
             node_pairs = (row, column)
@@ -381,7 +393,8 @@ def _convert_to_sampled_subgraph(
             if has_original_eids:
                 original_edge_ids = original_hetero_edge_ids
         return SampledSubgraphImpl(
-            node_pairs=node_pairs, original_edge_ids=original_edge_ids
+            node_pairs=node_pairs, original_edge_ids=original_edge_ids,
+            original_etype_ids=original_etype_ids,
         )

     def _convert_to_homogeneous_nodes(self, nodes):
diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py
index 6556ed4656b6..0e451f7f8fad 100644
--- a/python/dgl/graphbolt/impl/neighbor_sampler.py
+++ b/python/dgl/graphbolt/impl/neighbor_sampler.py
@@ -102,9 +102,12 @@ def __init__(

     @staticmethod
     def distributed_sample_neighbor(graph, seeds, fanouts):
-        src_nodes = seeds.flip(0)
-        dst_nodes = seeds
-        return src_nodes, dst_nodes
+        if isinstance(fanouts, int):
+            fanouts = torch.LongTensor([fanouts])
+        subgraph = graph.sample_neighbors(seeds, fanouts)
+        src_nodes, dst_nodes = subgraph.node_pairs
+        etype_ids = subgraph.original_etype_ids
+        return src_nodes, dst_nodes, etype_ids

     def _sample_subgraphs(self, seeds):
         subgraphs = []
diff --git a/python/dgl/graphbolt/impl/sampled_subgraph_impl.py b/python/dgl/graphbolt/impl/sampled_subgraph_impl.py
index 601377dd0637..bf0c6cfd4a44 100644
--- a/python/dgl/graphbolt/impl/sampled_subgraph_impl.py
+++ b/python/dgl/graphbolt/impl/sampled_subgraph_impl.py
@@ -46,6 +46,7 @@ class SampledSubgraphImpl(SampledSubgraph):
     ] = None
     original_row_node_ids: Union[Dict[str, torch.Tensor], torch.Tensor] = None
     original_edge_ids: Union[Dict[str, torch.Tensor], torch.Tensor] = None
+    original_etype_ids: Union[Dict[str, torch.Tensor], torch.Tensor] = None

     def __post_init__(self):
         if isinstance(self.node_pairs, dict):

From 8da2a4a10a1323b6fa43fc8b41050efc908340be Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Thu, 26 Oct 2023 02:10:51 +0000
Subject: [PATCH 14/30] [gb_distdgl] replace only sample_neighbors with GB and
 it does not crash

---
 examples/pytorch/rgcn/experimental/cmd.sh     |   2 +-
 .../rgcn/experimental/entity_classify_dist.py |  19 ++-
 .../pytorch/rgcn/experimental/gb_demo_cmd.sh  |  10 ++
 python/dgl/dataloading/dist_dataloader.py     |   9 +-
 python/dgl/dataloading/neighbor_sampler.py    |   3 +-
 python/dgl/distributed/graph_services.py      |  52 ++++---
 python/dgl/distributed/partition.py           |  19 ++-
 .../dgl/graphbolt/impl/csc_sampling_graph.py  |   8 +-
 python/dgl/graphbolt/impl/neighbor_sampler.py |   5 +-
 tests/distributed/test_partition.py           | 138 ++++++++++++++++--
 10 files changed, 214 insertions(+), 51 deletions(-)
 create mode 100644 examples/pytorch/rgcn/experimental/gb_demo_cmd.sh

diff --git a/examples/pytorch/rgcn/experimental/cmd.sh b/examples/pytorch/rgcn/experimental/cmd.sh
index 88212bc794d7..23a452e9cd4a 100644
--- a/examples/pytorch/rgcn/experimental/cmd.sh
+++ b/examples/pytorch/rgcn/experimental/cmd.sh
@@ -7,4 +7,4 @@ python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
     --num_samplers 0 \
     --part_config /home/ubuntu/workspace/dgl_2/data/ogbn-mag.json \
     --ip_config /home/ubuntu/workspace/ip_config.txt \
-    "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 gb_demo.py --graph-name ogbn-mag --dataset ogbn-mag --ip-config /home/ubuntu/workspace/ip_config.txt --fanout='25,10' --batch-size 1024 --n-hidden 64 --lr 0.01 --eval-batch-size 1024 --graphbolt"
+    "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build
PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 entity_classify_dist.py --graph-name ogbn-mag --dataset ogbn-mag --ip-config /home/ubuntu/workspace/ip_config.txt --fanout='25,10' --batch-size 1024 --n-hidden 64 --lr 0.01 --eval-batch-size 1024 --low-mem --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 3 --layer-norm --sparse-embedding --sparse-lr 0.06 --graphbolt" diff --git a/examples/pytorch/rgcn/experimental/entity_classify_dist.py b/examples/pytorch/rgcn/experimental/entity_classify_dist.py index 89093ede8a8b..804eedc8e091 100644 --- a/examples/pytorch/rgcn/experimental/entity_classify_dist.py +++ b/examples/pytorch/rgcn/experimental/entity_classify_dist.py @@ -466,6 +466,7 @@ def run(args, device, data): batch_size=args.batch_size, shuffle=True, drop_last=False, + use_graphbolt=args.graphbolt, ) valid_sampler = dgl.dataloading.MultiLayerNeighborSampler(val_fanouts) @@ -476,6 +477,7 @@ def run(args, device, data): batch_size=args.batch_size, shuffle=False, drop_last=False, + use_graphbolt=args.graphbolt, ) test_sampler = dgl.dataloading.MultiLayerNeighborSampler(val_fanouts) @@ -486,6 +488,7 @@ def run(args, device, data): batch_size=args.eval_batch_size, shuffle=False, drop_last=False, + use_graphbolt=args.graphbolt, ) embed_layer = DistEmbedLayer( @@ -604,6 +607,9 @@ def run(args, device, data): # blocks. step_time = [] for step, sample_data in enumerate(dataloader): + g.barrier() + time.sleep(5) + return input_nodes, seeds, blocks = sample_data seeds = seeds["paper"] number_train += seeds.shape[0] @@ -703,11 +709,14 @@ def run(args, device, data): def main(args): - dgl.distributed.initialize(args.ip_config) + if args.graphbolt: + print("Using GraphBolt") + dgl.distributed.initialize(args.ip_config, use_graphbolt=args.graphbolt) if not args.standalone: th.distributed.init_process_group(backend="gloo") - g = dgl.distributed.DistGraph(args.graph_name, part_config=args.conf_path) + g = dgl.distributed.DistGraph(args.graph_name, part_config=args.conf_path, + use_graphbolt=args.graphbolt) print("rank:", g.rank()) pb = g.get_partition_book() @@ -802,6 +811,12 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser(description="RGCN") # distributed training related + parser.add_argument( + "--graphbolt", + default=False, + action="store_true", + help="train with GraphBolt", + ) parser.add_argument("--graph-name", type=str, help="graph name") parser.add_argument("--id", type=int, help="the partition id") parser.add_argument( diff --git a/examples/pytorch/rgcn/experimental/gb_demo_cmd.sh b/examples/pytorch/rgcn/experimental/gb_demo_cmd.sh new file mode 100644 index 000000000000..88212bc794d7 --- /dev/null +++ b/examples/pytorch/rgcn/experimental/gb_demo_cmd.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \ + --workspace /home/ubuntu/workspace/dgl_2/examples/pytorch/rgcn/experimental/ \ + --num_trainers 4 \ + --num_servers 2 \ + --num_samplers 0 \ + --part_config /home/ubuntu/workspace/dgl_2/data/ogbn-mag.json \ + --ip_config /home/ubuntu/workspace/ip_config.txt \ + "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 gb_demo.py --graph-name ogbn-mag --dataset ogbn-mag --ip-config /home/ubuntu/workspace/ip_config.txt --fanout='25,10' --batch-size 1024 --n-hidden 64 --lr 0.01 --eval-batch-size 1024 --graphbolt" diff --git a/python/dgl/dataloading/dist_dataloader.py 
b/python/dgl/dataloading/dist_dataloader.py index dde8a2098cc2..8eb1e765ff78 100644 --- a/python/dgl/dataloading/dist_dataloader.py +++ b/python/dgl/dataloading/dist_dataloader.py @@ -167,7 +167,7 @@ class NodeCollator(Collator): :doc:`Minibatch Training Tutorials `. """ - def __init__(self, g, nids, graph_sampler): + def __init__(self, g, nids, graph_sampler, use_graphbolt=False): self.g = g if not isinstance(nids, Mapping): assert ( @@ -177,6 +177,7 @@ def __init__(self, g, nids, graph_sampler): self.nids = utils.prepare_tensor_or_dict(g, nids, "nids") self._dataset = utils.maybe_flatten_dict(self.nids) + self._use_graphbolt = use_graphbolt @property def dataset(self): @@ -213,7 +214,7 @@ def collate(self, items): items = utils.prepare_tensor_or_dict(self.g, items, "items") input_nodes, output_nodes, blocks = self.graph_sampler.sample_blocks( - self.g, items + self.g, items, use_graphbolt=self._use_graphbolt ) return input_nodes, output_nodes, blocks @@ -591,7 +592,7 @@ class DistNodeDataLoader(DistDataLoader): dgl.dataloading.DataLoader """ - def __init__(self, g, nids, graph_sampler, device=None, **kwargs): + def __init__(self, g, nids, graph_sampler, device=None, use_graphbolt=False, **kwargs): collator_kwargs = {} dataloader_kwargs = {} _collator_arglist = inspect.getfullargspec(NodeCollator).args @@ -608,7 +609,7 @@ def __init__(self, g, nids, graph_sampler, device=None, **kwargs): ), "Only cpu is supported in the case of a DistGraph." # Distributed DataLoader currently does not support heterogeneous graphs # and does not copy features. Fallback to normal solution - self.collator = NodeCollator(g, nids, graph_sampler, **collator_kwargs) + self.collator = NodeCollator(g, nids, graph_sampler, use_graphbolt=use_graphbolt, **collator_kwargs) _remove_kwargs_dist(dataloader_kwargs) super().__init__( self.collator.dataset, diff --git a/python/dgl/dataloading/neighbor_sampler.py b/python/dgl/dataloading/neighbor_sampler.py index 603d39107cf1..e81d5a393594 100644 --- a/python/dgl/dataloading/neighbor_sampler.py +++ b/python/dgl/dataloading/neighbor_sampler.py @@ -148,7 +148,7 @@ def __init__( self.mapping = {} self.g = None - def sample_blocks(self, g, seed_nodes, exclude_eids=None): + def sample_blocks(self, g, seed_nodes, exclude_eids=None, use_graphbolt=False): output_nodes = seed_nodes blocks = [] # sample_neighbors_fused function requires multithreading to be more efficient @@ -191,6 +191,7 @@ def sample_blocks(self, g, seed_nodes, exclude_eids=None): replace=self.replace, output_device=self.output_device, exclude_edges=exclude_eids, + use_graphbolt=use_graphbolt, ) eid = frontier.edata[EID] block = to_block(frontier, seed_nodes) diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py index db9597423c20..4f8dce7578df 100644 --- a/python/dgl/distributed/graph_services.py +++ b/python/dgl/distributed/graph_services.py @@ -487,7 +487,7 @@ def merge_graphs(res_list, num_nodes): ) -def _distributed_access(g, nodes, issue_remote_req, local_access): +def _distributed_access(g, nodes, issue_remote_req, local_access, use_graphbolt=False): """A routine that fetches local neighborhood of nodes from the distributed graph. The local neighborhood of some nodes are stored in the local machine and the other @@ -506,6 +506,8 @@ def _distributed_access(g, nodes, issue_remote_req, local_access): The function that issues requests to access remote data. local_access : callable The function that reads data on the local machine. 
+    use_graphbolt : bool
+        Whether to use GraphBolt.

     Returns
     -------
@@ -595,7 +597,6 @@ def _frontier_to_heterogeneous_graph(g, frontier, gpb):


 def _frontier_to_heterogeneous_graph_gb(g, frontier, gpb):
-    '''[Rui] Do not use EID.'''
     # We need to handle empty frontiers correctly.
     if frontier.num_edges() == 0:
         data_dict = {
@@ -612,17 +613,27 @@ def _frontier_to_heterogeneous_graph_gb(g, frontier, gpb):
     src, dst = frontier.edges()
     etype_ids, idx = F.sort_1d(etype_ids)
     src, dst = F.gather_row(src, idx), F.gather_row(dst, idx)
-    _, src = gpb.map_to_per_ntype(src)
-    _, dst = gpb.map_to_per_ntype(dst)
+    src_ntype_ids, src = gpb.map_to_per_ntype(src)
+    dst_ntype_ids, dst = gpb.map_to_per_ntype(dst)

     data_dict = dict()
+    print("g.canonical_etypes: ", g.canonical_etypes)
     for etid, etype in enumerate(g.canonical_etypes):
+        src_ntype, _, dst_ntype = etype
+        src_ntype_id = g.get_ntype_id(src_ntype)
+        dst_ntype_id = g.get_ntype_id(dst_ntype)
         type_idx = etype_ids == etid
         if F.sum(type_idx, 0) > 0:
             data_dict[etype] = (
                 F.boolean_mask(src, type_idx),
                 F.boolean_mask(dst, type_idx),
             )
+            assert torch.all(src_ntype_id == src_ntype_ids[type_idx]), (
+                "source ntype is not expected."
+            )
+            assert torch.all(dst_ntype_id == dst_ntype_ids[type_idx]), (
+                "destination ntype is not expected."
+            )
     hg = heterograph(
         data_dict,
         {ntype: g.num_nodes(ntype) for ntype in g.ntypes},
@@ -703,24 +714,21 @@ def sample_etype_neighbors(
     DGLGraph
         A sampled subgraph containing only the sampled neighboring edges.  It is on CPU.
     """
-    if isinstance(fanout, int):
-        fanout = F.full_1d(len(g.canonical_etypes), fanout, F.int64, F.cpu())
-    else:
-        if use_graphbolt:
-            dgl_warning(
-                "----------- [Rui] Not covered in demo test yet. -----------"
+    if not use_graphbolt:
+        if isinstance(fanout, int):
+            fanout = F.full_1d(len(g.canonical_etypes), fanout, F.int64, F.cpu())
+        else:
+            etype_ids = {etype: i for i, etype in enumerate(g.canonical_etypes)}
+            fanout_array = [None] * len(g.canonical_etypes)
+            for etype, v in fanout.items():
+                c_etype = g.to_canonical_etype(etype)
+                fanout_array[etype_ids[c_etype]] = v
+            assert all(v is not None for v in fanout_array), (
+                "Not all etypes have valid fanout. Please make sure passed-in "
+                "fanout in dict includes all the etypes in graph. Passed-in "
+                f"fanout: {fanout}, graph etypes: {g.canonical_etypes}."
-            )
-        fanout = F.tensor(fanout_array, dtype=F.int64)
+            )
+            fanout = F.tensor(fanout_array, dtype=F.int64)

     gpb = g.get_partition_book()
     if isinstance(nodes, dict):
@@ -786,7 +794,7 @@ def local_access(local_g, partition_book, local_nids):
             use_graphbolt=use_graphbolt,
         )

-    frontier = _distributed_access(g, nodes, issue_remote_req, local_access)
+    frontier = _distributed_access(g, nodes, issue_remote_req, local_access, use_graphbolt=use_graphbolt)
     if not gpb.is_homogeneous:
         if use_graphbolt:
             return _frontier_to_heterogeneous_graph_gb(g, frontier, gpb)
diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py
index d4da8828fc82..5fb29e232639 100644
--- a/python/dgl/distributed/partition.py
+++ b/python/dgl/distributed/partition.py
@@ -573,6 +573,7 @@ def partition_graph(
     use_graphbolt=False,
     gb_store_orig_nids=False,
     gb_store_orig_eids=False,
+    gb_store_ntypes=False,
     gb_store_etypes=False,
     gb_store_metadata=False,
 ):
@@ -754,6 +755,8 @@ def partition_graph(
         Whether to store the original node IDs in the partitioned graph.
     gb_store_orig_eids : bool
         Whether to store the original edge IDs in the partitioned graph.
+    gb_store_ntypes : bool
+        Whether to store the node types in the partitioned graph.
     gb_store_etypes : bool
         Whether to store the edge types in the partitioned graph.
     gb_store_metadata : bool
@@ -1262,6 +1265,7 @@ def get_homogeneous(g, balance_ntypes):
             part_config,
             store_orig_nids=gb_store_orig_nids,
             store_orig_eids=gb_store_orig_eids,
+            store_ntypes=gb_store_ntypes,
             store_etypes=gb_store_etypes,
             store_metadata=gb_store_metadata,
         )
@@ -1275,6 +1279,7 @@ def convert_dgl_partition_to_csc_sampling_graph(
     part_config,
     store_orig_nids=False,
     store_orig_eids=False,
+    store_ntypes=False,
     store_etypes=False,
     store_metadata=False,
 ):
@@ -1295,6 +1300,8 @@ def convert_dgl_partition_to_csc_sampling_graph(
         Whether to store original node IDs in the new graph.
     store_orig_eids : bool, optional
         Whether to store original edge IDs in the new graph.
+    store_ntypes : bool, optional
+        Whether to store node types in the new graph.
     store_etypes : bool, optional
         Whether to store edge types in the new graph.
     store_metadata : bool, optional
@@ -1332,11 +1339,12 @@ def init_type_per_edge(graph, gpb):
     }
     metadata = graphbolt.GraphMetadata(ntypes, c_etypes)
     # Obtain CSC indptr and indices.
-    indptr, indices, _ = graph.adj().csc()
+    indptr, indices, edge_ids = graph.adj_tensors("csc")  # graph.adj().csc()
     # Initialize type per edge.
     type_per_edge = None
     if store_etypes:
         type_per_edge = init_type_per_edge(graph, gpb)
+        type_per_edge = type_per_edge[edge_ids]
         type_per_edge = type_per_edge.to(RESERVED_FIELD_DTYPE[ETYPE])
         if len(etypes) < 128:
             type_per_edge = type_per_edge.to(torch.int8)
@@ -1360,9 +1368,16 @@ def init_type_per_edge(graph, gpb):
         # Sanity check.
         assert len(graph.edata[EID]) == graph.num_edges()
         edge_attributes = {
-            EID: graph.edata[EID].to(RESERVED_FIELD_DTYPE[EID])
+            EID: graph.edata[EID][edge_ids].to(RESERVED_FIELD_DTYPE[EID])
         }

+    if store_ntypes:
+        if node_attributes is None:
+            node_attributes = {}
+        node_attributes[NTYPE] = graph.ndata[NTYPE].to(
+            RESERVED_FIELD_DTYPE[NTYPE]
+        )
+
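[Editor's note] The switch to `graph.adj_tensors("csc")` above is the subtle part of this hunk: converting to CSC reorders edges, and the returned `edge_ids` tensor maps each CSC slot back to the original edge index, which is why `type_per_edge` and the `EID` attribute are now gathered through `edge_ids`. A minimal demonstration of the invariant (not part of the patch; the toy graph and the `"w"` feature are illustrative):

```
# Editorial sketch: per-edge data must be permuted by the CSC edge_ids,
# otherwise edge attributes no longer line up with `indices`.
import dgl
import torch

g = dgl.graph((torch.tensor([0, 1, 2]), torch.tensor([1, 2, 0])))
g.edata["w"] = torch.tensor([10, 20, 30])
indptr, indices, edge_ids = g.adj_tensors("csc")
w_csc = g.edata["w"][edge_ids]  # aligned with `indices`, not with edata order
```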
     if store_etypes:
         # [Rui] Let's store as edge attributes for now.
         if edge_attributes is None:
diff --git a/python/dgl/graphbolt/impl/csc_sampling_graph.py b/python/dgl/graphbolt/impl/csc_sampling_graph.py
index 58110c04577a..09c4a06963b2 100644
--- a/python/dgl/graphbolt/impl/csc_sampling_graph.py
+++ b/python/dgl/graphbolt/impl/csc_sampling_graph.py
@@ -366,9 +366,11 @@ def _convert_to_sampled_subgraph(
             and ETYPE in self.edge_attributes
         )
         if has_original_etype_ids:
+            assert original_edge_ids is not None, "original_edge_ids is None."
             original_etype_ids = self.edge_attributes[ETYPE][
                 original_edge_ids
             ]
+            assert original_etype_ids is not None, "original_etype_ids is None."

         if type_per_edge is None:
             # The sampled graph is already a homogeneous graph.
@@ -577,12 +579,16 @@ def _sample_neighbors(
             self.edge_attributes is not None
             and ORIGINAL_EDGE_ID in self.edge_attributes
         )
+        has_original_etype_ids = (
+            self.edge_attributes is not None
+            and ETYPE in self.edge_attributes
+        )
         return self._c_csc_graph.sample_neighbors(
             nodes,
             fanouts.tolist(),
             replace,
             False,
-            has_original_eids,
+            has_original_eids or has_original_etype_ids,
             probs_name,
         )

diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py
index 0e451f7f8fad..00ff91796f0b 100644
--- a/python/dgl/graphbolt/impl/neighbor_sampler.py
+++ b/python/dgl/graphbolt/impl/neighbor_sampler.py
@@ -102,11 +102,12 @@ def __init__(

     @staticmethod
     def distributed_sample_neighbor(graph, seeds, fanouts):
-        if isinstance(fanouts, int):
-            fanouts = torch.LongTensor([fanouts])
+        assert isinstance(fanouts, int), f"Fanouts should be an integer but got {fanouts}."
+        fanouts = torch.LongTensor([fanouts])
         subgraph = graph.sample_neighbors(seeds, fanouts)
         src_nodes, dst_nodes = subgraph.node_pairs
         etype_ids = subgraph.original_etype_ids
+        assert src_nodes.shape == dst_nodes.shape == etype_ids.shape, f"Shape mismatch: {src_nodes.shape}, {dst_nodes.shape}, {etype_ids.shape}"
         return src_nodes, dst_nodes, etype_ids

     def _sample_subgraphs(self, seeds):
diff --git a/tests/distributed/test_partition.py b/tests/distributed/test_partition.py
index be540362866c..d519e7b266d6 100644
--- a/tests/distributed/test_partition.py
+++ b/tests/distributed/test_partition.py
@@ -701,10 +701,10 @@ def test_convert_dgl_partition_to_csc_sampling_graph_homo(
         part_config = os.path.join(test_dir, f"{graph_name}.json")
         convert_dgl_partition_to_csc_sampling_graph(
             part_config,
-            store_orig_nids,
-            store_orig_eids,
-            store_etypes,
-            store_metadata,
+            store_orig_nids=store_orig_nids,
+            store_orig_eids=store_orig_eids,
+            store_etypes=store_etypes,
+            store_metadata=store_metadata,
         )
         for part_id in range(num_parts):
             orig_g = dgl.load_graphs(
@@ -713,7 +713,7 @@ def test_convert_dgl_partition_to_csc_sampling_graph_homo(
             new_g = dgl.graphbolt.load_csc_sampling_graph(
                 os.path.join(test_dir, f"part{part_id}/csc_sampling_graph.tar")
             )
-            orig_indptr, orig_indices, _ = orig_g.adj().csc()
+            orig_indptr, orig_indices, orig_eids = orig_g.adj().csc()
             assert th.equal(orig_indptr, new_g.csc_indptr)
             assert th.equal(orig_indices, new_g.indices)
             assert new_g.node_type_offset is None
@@ -725,12 +725,14 @@ def test_convert_dgl_partition_to_csc_sampling_graph_homo(
                 assert new_g.node_attributes is None
             if store_orig_eids:
                 assert th.equal(
-                    orig_g.edata[dgl.EID], new_g.edge_attributes[dgl.EID]
+                    orig_g.edata[dgl.EID][orig_eids], new_g.edge_attributes[dgl.EID]
                 )
             else:
-                assert new_g.edge_attributes is None
+                if not store_etypes:
+                    assert new_g.edge_attributes is None
             if store_etypes:
-                assert th.all(0 == new_g.type_per_edge)
+                assert
th.all(0 == new_g.edge_attributes[dgl.ETYPE]) + assert new_g.type_per_edge is None else: assert new_g.type_per_edge is None if store_metadata: @@ -772,10 +774,10 @@ def test_convert_dgl_partition_to_csc_sampling_graph_hetero( part_config = os.path.join(test_dir, f"{graph_name}.json") convert_dgl_partition_to_csc_sampling_graph( part_config, - store_orig_nids, - store_orig_eids, - store_etypes, - store_metadata, + store_orig_nids=store_orig_nids, + store_orig_eids=store_orig_eids, + store_etypes=store_etypes, + store_metadata=store_metadata, ) for part_id in range(num_parts): orig_g = dgl.load_graphs( @@ -784,7 +786,7 @@ def test_convert_dgl_partition_to_csc_sampling_graph_hetero( new_g = dgl.graphbolt.load_csc_sampling_graph( os.path.join(test_dir, f"part{part_id}/csc_sampling_graph.tar") ) - orig_indptr, orig_indices, _ = orig_g.adj().csc() + orig_indptr, orig_indices, orig_eids = orig_g.adj().csc() assert th.equal(orig_indptr, new_g.csc_indptr) assert th.equal(orig_indices, new_g.indices) if store_orig_nids: @@ -795,12 +797,14 @@ def test_convert_dgl_partition_to_csc_sampling_graph_hetero( assert new_g.node_attributes is None if store_orig_eids: assert th.equal( - orig_g.edata[dgl.EID], new_g.edge_attributes[dgl.EID] + orig_g.edata[dgl.EID][orig_eids], new_g.edge_attributes[dgl.EID] ) else: - assert new_g.edge_attributes is None + if not store_etypes: + assert new_g.edge_attributes is None if store_etypes: - assert th.equal(orig_g.edata[dgl.ETYPE], new_g.type_per_edge) + assert th.equal(orig_g.edata[dgl.ETYPE][orig_eids], new_g.edge_attributes[dgl.ETYPE]) + assert new_g.type_per_edge is None else: assert new_g.type_per_edge is None if store_metadata: @@ -1048,3 +1052,105 @@ def test_partition_hetero_graphbolt( assert edge_feats == {} reset_envs() + + +@pytest.mark.parametrize("part_method", ["metis"]) +@pytest.mark.parametrize("num_parts", [4]) +@pytest.mark.parametrize("num_trainers_per_machine", [1]) +@pytest.mark.parametrize("load_feats", [True]) +def test_partition_hetero_graphbolt_sample_neighbors( + part_method, + num_parts, + num_trainers_per_machine, + load_feats, +): + os.environ["DGL_DIST_DEBUG"] = "1" + if part_method == "random" and num_parts > 1: + num_trainers_per_machine = 1 + + hg = create_random_hetero() + test_ntype = "n1" + test_etype = ("n1", "r1", "n2") + hg.nodes[test_ntype].data["labels"] = F.arange(0, hg.num_nodes(test_ntype)) + hg.nodes[test_ntype].data["feats"] = F.tensor( + np.random.randn(hg.num_nodes(test_ntype), 10), F.float32 + ) + hg.edges[test_etype].data["feats"] = F.tensor( + np.random.randn(hg.num_edges(test_etype), 10), F.float32 + ) + hg.edges[test_etype].data["labels"] = F.arange(0, hg.num_edges(test_etype)) + + num_hops = 2 + + with tempfile.TemporaryDirectory() as test_dir: + orig_nids, orig_eids = dgl.distributed.partition_graph( + hg, + "test", + num_parts, + test_dir, + num_hops=num_hops, + part_method=part_method, + return_mapping=True, + num_trainers_per_machine=num_trainers_per_machine, + use_graphbolt=True, + gb_store_orig_nids=True, + gb_store_orig_eids=True, + gb_store_ntypes=True, + gb_store_etypes=True, + ) + part_config = os.path.join(test_dir, "test.json") + for i in range(num_parts): + part_g, node_feats, edge_feats, gpb, _, _, _ = dgl.distributed.load_partition( + part_config, i, load_feats=load_feats, use_graphbolt=True + ) + assert isinstance(part_g, gb.CSCSamplingGraph) + assert gpb.num_partitions() == num_parts + gpb_meta = gpb.metadata() + assert len(gpb_meta) == num_parts + assert len(gpb.partid2nids(i)) == 
gpb_meta[i]["num_nodes"]
+            assert len(gpb.partid2eids(i)) == gpb_meta[i]["num_edges"]
+            #assert len(gpb.partid2nids(i)) == part_g.total_num_nodes
+            #assert len(gpb.partid2eids(i)) == part_g.total_num_edges
+            if load_feats:
+                assert "n1/labels" in node_feats
+                assert "n1/feats" in node_feats
+                assert "n1:r1:n2/feats" in edge_feats
+            else:
+                assert node_feats == {}
+                assert edge_feats == {}
+
+            # sample_neighbors()
+            subg = part_g.sample_neighbors(th.arange(10), th.LongTensor([-1]))
+            src, dst = subg.node_pairs
+            orig_src = part_g.node_attributes[dgl.NID][src]
+            orig_dst = part_g.node_attributes[dgl.NID][dst]
+            orig_ntype_src = part_g.node_attributes[dgl.NTYPE][src]
+            orig_ntype_dst = part_g.node_attributes[dgl.NTYPE][dst]
+            etype_ids = subg.original_etype_ids
+            orig_eids = part_g.edge_attributes[dgl.EID][subg.original_edge_ids]
+            etype_idsA, _ = gpb.map_to_per_etype(orig_eids)
+            assert th.equal(etype_ids, etype_idsA), "etype_ids is not expected."
+
+            etype_ids, idx = F.sort_1d(etype_ids)
+            sorted_orig_src, sorted_orig_dst = F.gather_row(orig_src, idx), F.gather_row(orig_dst, idx)
+            src_ntype_ids, ntype_wised_src = gpb.map_to_per_ntype(sorted_orig_src)
+            dst_ntype_ids, ntype_wised_dst = gpb.map_to_per_ntype(sorted_orig_dst)
+
+            data_dict = dict()
+            print("gpb.canonical_etypes: ", gpb.canonical_etypes)
+            ntype_map = {ntype: i for i, ntype in enumerate(gpb.ntypes)}
+            etype_map = {
+                etype: i for i, etype in enumerate(gpb.canonical_etypes)
+            }
+            for etid, etype in enumerate(gpb.canonical_etypes):
+                src_ntype, _, dst_ntype = etype
+                src_ntype_id = ntype_map[src_ntype]
+                dst_ntype_id = ntype_map[dst_ntype]
+                type_idx = etype_ids == etid
+                if F.sum(type_idx, 0) > 0:
+                    assert th.all(src_ntype_id == src_ntype_ids[type_idx]), (
+                        "source ntype is not expected."
+                    )
+                    assert th.all(dst_ntype_id == dst_ntype_ids[type_idx]), (
+                        "destination ntype is not expected."
+                    )

From 1b06f147f96c0edc4403e9e4d2fb36ee818761c9 Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Thu, 26 Oct 2023 05:47:29 +0000
Subject: [PATCH 15/30] [gb_distdgl] training is ready, though accuracy and
 epoch time drop

---
 examples/pytorch/rgcn/experimental/README.md  |  75 ++++++
 examples/pytorch/rgcn/experimental/cmd.sh     |   2 +-
 examples/pytorch/rgcn/experimental/dgl_cmd.sh |  10 +
 .../rgcn/experimental/entity_classify_dist.py |   4 +-
 .../pytorch/rgcn/experimental/log_dgl.txt     | 223 +++++++++++++++++
 examples/pytorch/rgcn/experimental/log_gb.txt | 231 ++++++++++++++++++
 python/dgl/distributed/dist_graph.py          |  21 +-
 python/dgl/distributed/graph_services.py      |   2 -
 python/dgl/distributed/partition.py           |   4 +-
 9 files changed, 561 insertions(+), 11 deletions(-)
 create mode 100644 examples/pytorch/rgcn/experimental/dgl_cmd.sh
 create mode 100644 examples/pytorch/rgcn/experimental/log_dgl.txt
 create mode 100644 examples/pytorch/rgcn/experimental/log_gb.txt

diff --git a/examples/pytorch/rgcn/experimental/README.md b/examples/pytorch/rgcn/experimental/README.md
index 7a7bcc9794a9..a8adc5e4d54c 100644
--- a/examples/pytorch/rgcn/experimental/README.md
+++ b/examples/pytorch/rgcn/experimental/README.md
@@ -1,3 +1,78 @@
+## DistDGL with GraphBolt partitions and sampling
+
+### How to partition a graph
+
+#### Partition from the original dataset with `dgl.distributed.partition_graph()`
+
+```
+DGL_HOME=/home/ubuntu/workspace/dgl_2 DGL_LIBRARY_PATH=$DGL_HOME/build PYTHONPATH=tests:$DGL_HOME/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 examples/pytorch/rgcn/experimental/partition_graph.py --dataset ogbn-mag --num_parts 2 --balance_train --balance_edges --graphbolt
+```
+
+#### Convert existing partitions into GraphBolt formats
+
+```
+import dgl
+part_config = "./data/ogbn-mag.json"
+dgl.distributed.convert_dgl_partition_to_csc_sampling_graph(
+    part_config,
+    store_orig_nids=True,
+    store_etypes=True,
+)
+```
+
+#### Partition sizes compared between GraphBolt and DistDGL
+
+`csc_sampling_graph.tar` is the GraphBolt partition; `graph.dgl` is the original DistDGL partition, namely a `DGLGraph`.
+
+```
+-rw-rw-r-- 1 ubuntu ubuntu 231M Oct 26 01:51 data/part0/csc_sampling_graph.tar
+-rw-rw-r-- 1 ubuntu ubuntu   24 Oct 26 01:51 data/part0/edge_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 701M Oct 26 01:51 data/part0/graph.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 182M Oct 26 01:51 data/part0/node_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 235M Oct 26 01:51 data/part1/csc_sampling_graph.tar
+-rw-rw-r-- 1 ubuntu ubuntu   24 Oct 26 01:51 data/part1/edge_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 711M Oct 26 01:51 data/part1/graph.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 187M Oct 26 01:51 data/part1/node_feat.dgl
+```
+
+### Train with GraphBolt partitions
+Just append `--graphbolt`:
+
+```
+python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
+  --workspace /home/ubuntu/workspace/dgl_2/examples/pytorch/rgcn/experimental/ \
+  --num_trainers 4 \
+  --num_servers 2 \
+  --num_samplers 0 \
+  --part_config /home/ubuntu/workspace/dgl_2/data/ogbn-mag.json \
+  --ip_config /home/ubuntu/workspace/ip_config.txt \
+  "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 entity_classify_dist.py --graph-name ogbn-mag --dataset ogbn-mag --ip-config /home/ubuntu/workspace/ip_config.txt --fanout='25,10' --batch-size 1024 --n-hidden 64 --lr 0.01 --eval-batch-size 1024 --low-mem --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 1 --layer-norm --sparse-embedding --sparse-lr 0.06 --graphbolt"
+```
+
+#### Results
+`g4dn.metal` x 2, `ogbn-mag`.
+
+[**NOTE**] With GraphBolt, epoch time drops sharply in DistDGL, but so does accuracy, probably caused in part by the difference between `sample_neighbors()` and `sample_etype_neighbors()`.
+
+##### DistDGL
+
+```
+Epoch Time(s): 177.6757, sample: 73.0354, data copy: 27.7802, forward: 2.4314, backward: 63.2740, update: 11.1546, #train: 78696, #input: 34579790
+
+Val Acc 0.4618, Test Acc 0.4485, time: 16.9179
+```
+
+##### DistDGL with GraphBolt
+
+```
+Epoch Time(s): 32.7923, sample: 5.4970, data copy: 4.9976, forward: 2.4069, backward: 15.4529, update: 4.4377, #train: 78696, #input: 18370936
+
+Val Acc 0.3901, Test Acc 0.3844, time: 2.2284
+```
+
 ## Distributed training

 This is an example of training RGCN node classification in a distributed fashion. Currently, the example train RGCN graphs with input node features. The current implementation follows ../rgcn/entity_claasify_mp.py.
diff --git a/examples/pytorch/rgcn/experimental/cmd.sh b/examples/pytorch/rgcn/experimental/cmd.sh
index 23a452e9cd4a..63ed0a14ca90 100644
--- a/examples/pytorch/rgcn/experimental/cmd.sh
+++ b/examples/pytorch/rgcn/experimental/cmd.sh
@@ -7,4 +7,4 @@ python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
     --num_samplers 0 \
     --part_config /home/ubuntu/workspace/dgl_2/data/ogbn-mag.json \
     --ip_config /home/ubuntu/workspace/ip_config.txt \
-    "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 entity_classify_dist.py --graph-name ogbn-mag --dataset ogbn-mag --ip-config /home/ubuntu/workspace/ip_config.txt --fanout='25,10' --batch-size 1024 --n-hidden 64 --lr 0.01 --eval-batch-size 1024 --low-mem --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 3 --layer-norm --sparse-embedding --sparse-lr 0.06 --graphbolt"
+    "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 entity_classify_dist.py --graph-name ogbn-mag --dataset ogbn-mag --ip-config /home/ubuntu/workspace/ip_config.txt --fanout='25,10' --batch-size 1024 --n-hidden 64 --lr 0.01 --eval-batch-size 1024 --low-mem --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 1 --layer-norm --sparse-embedding --sparse-lr 0.06 --graphbolt"
diff --git a/examples/pytorch/rgcn/experimental/dgl_cmd.sh b/examples/pytorch/rgcn/experimental/dgl_cmd.sh
new file mode 100644
index 000000000000..3990982b2991
--- /dev/null
+++ b/examples/pytorch/rgcn/experimental/dgl_cmd.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
+    --workspace /home/ubuntu/workspace/dgl_2/examples/pytorch/rgcn/experimental/ \
+    
--num_trainers 4 \ + --num_servers 2 \ + --num_samplers 0 \ + --part_config /home/ubuntu/workspace/dgl_2/data/ogbn-mag.json \ + --ip_config /home/ubuntu/workspace/ip_config.txt \ + "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 entity_classify_dist.py --graph-name ogbn-mag --dataset ogbn-mag --ip-config /home/ubuntu/workspace/ip_config.txt --fanout='25,10' --batch-size 1024 --n-hidden 64 --lr 0.01 --eval-batch-size 1024 --low-mem --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 1 --layer-norm --sparse-embedding --sparse-lr 0.06 " diff --git a/examples/pytorch/rgcn/experimental/entity_classify_dist.py b/examples/pytorch/rgcn/experimental/entity_classify_dist.py index 804eedc8e091..162a328299b8 100644 --- a/examples/pytorch/rgcn/experimental/entity_classify_dist.py +++ b/examples/pytorch/rgcn/experimental/entity_classify_dist.py @@ -10,6 +10,7 @@ import gc, os import itertools import time +import psutil import numpy as np @@ -607,9 +608,6 @@ def run(args, device, data): # blocks. step_time = [] for step, sample_data in enumerate(dataloader): - g.barrier() - time.sleep(5) - return input_nodes, seeds, blocks = sample_data seeds = seeds["paper"] number_train += seeds.shape[0] diff --git a/examples/pytorch/rgcn/experimental/log_dgl.txt b/examples/pytorch/rgcn/experimental/log_dgl.txt new file mode 100644 index 000000000000..89147ec2fe50 --- /dev/null +++ b/examples/pytorch/rgcn/experimental/log_dgl.txt @@ -0,0 +1,223 @@ +/home/ubuntu/workspace/dgl_2/tools/launch.py:148: DeprecationWarning: setDaemon() is deprecated, set the daemon attribute instead + thread.setDaemon(True) +The number of OMP threads per trainer is set to 12 +Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') +Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') +Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') +Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, 
n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') +Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') +Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') +Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') +Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') +Client [170011] waits on 172.31.8.229:54407 +Client [170012] waits on 172.31.8.229:53959 +Client [170014] waits on 172.31.8.229:49611 +Client [170013] waits on 172.31.8.229:35283 +Client [1878744] waits on 172.31.14.101:45181 +Client [1878745] waits on 172.31.14.101:59671Client [1878743] waits on 172.31.14.101:58669 + +Client [1878746] waits on 172.31.14.101:34789 +Machine (0) group (0) client (0) connect to server successfuly! +Machine (0) group (0) client (1) connect to server successfuly! +Machine (0) group (0) client (2) connect to server successfuly! +Machine (0) group (0) client (3) connect to server successfuly! +Machine (1) group (0) client (4) connect to server successfuly! +Machine (1) group (0) client (5) connect to server successfuly! +Machine (1) group (0) client (6) connect to server successfuly! +Machine (1) group (0) client (7) connect to server successfuly! 
+rank: 0 +rank: 2 +rank: 3 +rank: 1 +rank: 5 +rank: 4 +rank: 7 +rank: 6 +part 4, train: 78697 (local: 78697), val: 8110 (local: 7708), test: 5243 (local: 5072) +part 2, train: 78696 (local: 77008), val: 8110 (local: 8110), test: 5242 (local: 5242) +part 5, train: 78696 (local: 78696), val: 8110 (local: 7714), test: 5242 (local: 5108) +part 1, train: 78697 (local: 76875), val: 8110 (local: 8110), test: 5243 (local: 5243) +part 3, train: 78696 (local: 76932), val: 8110 (local: 8110), test: 5242 (local: 5242) +part 0, train: 78697 (local: 76971), val: 8110 (local: 8110), test: 5243 (local: 5243) +part 6, train: 78696 (local: 78696), val: 8110 (local: 7735), test: 5242 (local: 5072) +part 7, train: 78696 (local: 78696), val: 8109 (local: 7738), test: 5242 (local: 5092) +#classes: 349 +#classes: 349 +node paper has data feat +node paper has data feat +#classes: 349 +#classes: 349 +node paper has data feat +node paper has data feat +#classes: 349 +node paper has data feat +#classes: 349 +node paper has data feat +#classes: 349 +node paper has data feat +#classes: 349 +node paper has data feat +optimize Pytorch sparse embedding: ModuleDict( + (author): Embedding(1134649, 64, sparse=True) + (field_of_study): Embedding(59965, 64, sparse=True) + (institution): Embedding(8740, 64, sparse=True) +) +optimize dense projection: ModuleDict( + (paper): Linear(in_features=128, out_features=64, bias=True) +) +start training... +optimize Pytorch sparse embedding: ModuleDict( + (author): Embedding(1134649, 64, sparse=True) + (field_of_study): Embedding(59965, 64, sparse=True) + (institution): Embedding(8740, 64, sparse=True) +) +optimize dense projection: ModuleDict( + (paper): Linear(in_features=128, out_features=64, bias=True) +) +start training... +optimize Pytorch sparse embedding: ModuleDict( + (author): Embedding(1134649, 64, sparse=True) + (field_of_study): Embedding(59965, 64, sparse=True) + (institution): Embedding(8740, 64, sparse=True) +) +optimize dense projection: ModuleDict( + (paper): Linear(in_features=128, out_features=64, bias=True) +) +start training... +optimize Pytorch sparse embedding: optimize Pytorch sparse embedding: ModuleDict( + (author): Embedding(1134649, 64, sparse=True) + (field_of_study): Embedding(59965, 64, sparse=True) + (institution): Embedding(8740, 64, sparse=True) +) +ModuleDict( + (author): Embedding(1134649, 64, sparse=True) + (field_of_study): Embedding(59965, 64, sparse=True) + (institution): Embedding(8740, 64, sparse=True) +) +optimize dense projection: ModuleDict( + (paper): Linear(in_features=128, out_features=64, bias=True) +) +optimize dense projection: ModuleDict( + (paper): Linear(in_features=128, out_features=64, bias=True) +) +start training... +start training... +optimize Pytorch sparse embedding: ModuleDict( + (author): Embedding(1134649, 64, sparse=True) + (field_of_study): Embedding(59965, 64, sparse=True) + (institution): Embedding(8740, 64, sparse=True) +) +optimize dense projection: ModuleDict( + (paper): Linear(in_features=128, out_features=64, bias=True) +) +start training... +optimize Pytorch sparse embedding: ModuleDict( + (author): Embedding(1134649, 64, sparse=True) + (field_of_study): Embedding(59965, 64, sparse=True) + (institution): Embedding(8740, 64, sparse=True) +) +optimize dense projection: ModuleDict( + (paper): Linear(in_features=128, out_features=64, bias=True) +) +start training... 
+optimize Pytorch sparse embedding: ModuleDict( + (author): Embedding(1134649, 64, sparse=True) + (field_of_study): Embedding(59965, 64, sparse=True) + (institution): Embedding(8740, 64, sparse=True) +) +optimize dense projection: ModuleDict( + (paper): Linear(in_features=128, out_features=64, bias=True) +) +start training... +[6] Epoch 00000 | Step 00000 | Train acc 0.0029 | Loss 5.9122 | time 2.916 s| sample 1.148 | copy 0.096 | forward 0.031 | backward 1.472 | update 0.170 +[5] Epoch 00000 | Step 00000 | Train acc 0.0059 | Loss 5.8932 | time 2.925 s| sample 0.582 | copy 0.100 | forward 0.059 | backward 1.997 | update 0.187 +[4] Epoch 00000 | Step 00000 | Train acc 0.0059 | Loss 5.8930 | time 2.933 s| sample 0.572 | copy 0.553 | forward 0.053 | backward 1.571 | update 0.183 +[7] Epoch 00000 | Step 00000 | Train acc 0.0020 | Loss 5.8976 | time 2.943 s| sample 0.823 | copy 0.089 | forward 0.053 | backward 1.802 | update 0.175 +[3] Epoch 00000 | Step 00000 | Train acc 0.0029 | Loss 5.9240 | time 3.022 s| sample 0.660 | copy 0.488 | forward 0.053 | backward 1.569 | update 0.252 +[1] Epoch 00000 | Step 00000 | Train acc 0.0020 | Loss 5.9289 | time 3.016 s| sample 1.648 | copy 0.119 | forward 0.052 | backward 0.921 | update 0.277 +[2] Epoch 00000 | Step 00000 | Train acc 0.0010 | Loss 5.9225 | time 3.039 s| sample 2.145 | copy 0.108 | forward 0.033 | backward 0.491 | update 0.261 +[0] Epoch 00000 | Step 00000 | Train acc 0.0010 | Loss 5.9157 | time 3.040 s| sample 1.169 | copy 0.917 | forward 0.055 | backward 0.604 | update 0.295 +[6] Epoch 00000 | Step 00020 | Train acc 0.2412 | Loss 3.2387 | time 46.698 s| sample 14.856 | copy 4.428 | forward 0.654 | backward 23.941 | update 2.819 +[5] Epoch 00000 | Step 00020 | Train acc 0.2539 | Loss 3.2034 | time 46.734 s| sample 16.540 | copy 4.590 | forward 0.685 | backward 22.167 | update 2.752 +[7] Epoch 00000 | Step 00020 | Train acc 0.2393 | Loss 3.2745 | time 46.723 s| sample 15.996 | copy 3.837 | forward 0.699 | backward 23.381 | update 2.811 +[1] Epoch 00000 | Step 00020 | Train acc 0.2852 | Loss 3.0824 | time 46.619 s| sample 19.973 | copy 8.788 | forward 0.681 | backward 13.997 | update 3.179 +[0] Epoch 00000 | Step 00020 | Train acc 0.3057 | Loss 3.0992 | time 46.612 s| sample 23.789 | copy 5.350 | forward 0.657 | backward 13.582 | update 3.233 +[2] Epoch 00000 | Step 00020 | Train acc 0.2676 | Loss 3.1275 | time 46.616 s| sample 23.537 | copy 5.105 | forward 0.708 | backward 13.871 | update 3.396 +[4] Epoch 00000 | Step 00020 | Train acc 0.2490 | Loss 3.2494 | time 46.769 s| sample 15.239 | copy 5.336 | forward 0.675 | backward 22.613 | update 2.907 +[3] Epoch 00000 | Step 00020 | Train acc 0.2920 | Loss 3.1066 | time 46.668 s| sample 16.345 | copy 7.208 | forward 0.652 | backward 19.454 | update 3.009 +[6] Epoch 00000 | Step 00040 | Train acc 0.3252 | Loss 2.6834 | time 47.843 s| sample 11.093 | copy 7.034 | forward 0.539 | backward 27.259 | update 1.917 +[5] Epoch 00000 | Step 00040 | Train acc 0.3340 | Loss 2.6190 | time 47.821 s| sample 15.453 | copy 3.689 | forward 0.600 | backward 25.696 | update 2.383 +[4] Epoch 00000 | Step 00040 | Train acc 0.3262 | Loss 2.6368 | time 47.785 s| sample 16.000 | copy 4.917 | forward 0.595 | backward 23.932 | update 2.341 +[2] Epoch 00000 | Step 00040 | Train acc 0.4229 | Loss 2.2658 | time 47.809 s| sample 25.169 | copy 6.637 | forward 0.647 | backward 12.682 | update 2.675 +[3] Epoch 00000 | Step 00040 | Train acc 0.4180 | Loss 2.4361 | time 47.813 s| sample 21.526 | copy 8.554 | forward 0.607 | 
backward 14.433 | update 2.692 +[1] Epoch 00000 | Step 00040 | Train acc 0.4043 | Loss 2.4200 | time 47.841 s| sample 18.712 | copy 7.680 | forward 0.653 | backward 18.093 | update 2.703 +[0] Epoch 00000 | Step 00040 | Train acc 0.3740 | Loss 2.5024 | time 47.852 s| sample 23.139 | copy 5.863 | forward 0.618 | backward 15.616 | update 2.615 +[7] Epoch 00000 | Step 00040 | Train acc 0.3447 | Loss 2.7855 | time 47.878 s| sample 19.202 | copy 3.430 | forward 0.637 | backward 22.131 | update 2.477 +[6] Epoch 00000 | Step 00060 | Train acc 0.3633 | Loss 2.4481 | time 44.457 s| sample 10.072 | copy 7.121 | forward 0.540 | backward 24.892 | update 1.832 +[4] Epoch 00000 | Step 00060 | Train acc 0.3740 | Loss 2.4281 | time 44.479 s| sample 14.819 | copy 2.959 | forward 0.607 | backward 23.605 | update 2.490 +[7] Epoch 00000 | Step 00060 | Train acc 0.3887 | Loss 2.3412 | time 44.424 s| sample 16.428 | copy 2.622 | forward 0.587 | backward 22.469 | update 2.318 +[3] Epoch 00000 | Step 00060 | Train acc 0.4883 | Loss 2.0318 | time 44.474 s| sample 20.794 | copy 5.910 | forward 0.631 | backward 14.221 | update 2.917 +[5] Epoch 00000 | Step 00060 | Train acc 0.3545 | Loss 2.4415 | time 44.515 s| sample 13.574 | copy 3.596 | forward 0.644 | backward 24.285 | update 2.417 +[1] Epoch 00000 | Step 00060 | Train acc 0.4873 | Loss 2.0377 | time 44.462 s| sample 18.935 | copy 7.895 | forward 0.622 | backward 14.207 | update 2.803 +[2] Epoch 00000 | Step 00060 | Train acc 0.4619 | Loss 2.1139 | time 44.499 s| sample 24.354 | copy 3.396 | forward 0.611 | backward 13.427 | update 2.712 +[0] Epoch 00000 | Step 00060 | Train acc 0.4805 | Loss 2.0668 | time 44.483 s| sample 18.805 | copy 5.289 | forward 0.607 | backward 17.069 | update 2.713 +[5]Epoch Time(s): 177.6338, sample: 60.4674, data copy: 14.8044, forward: 2.4920, backward: 90.0968, update: 9.7730, #train: 78696, #input: 30054068 +[7]Epoch Time(s): 177.6308, sample: 68.8437, data copy: 12.3456, forward: 2.4491, backward: 84.3391, update: 9.6532, #train: 78696, #input: 30088778 +[4]Epoch Time(s): 177.6483, sample: 57.3434, data copy: 18.0550, forward: 2.4233, backward: 89.8727, update: 9.9538, #train: 78697, #input: 29936306 +[0]Epoch Time(s): 177.6141, sample: 84.0955, data copy: 22.4317, forward: 2.4212, backward: 57.5743, update: 11.0912, #train: 78697, #input: 34556524 +[1]Epoch Time(s): 177.6036, sample: 73.0517, data copy: 29.3354, forward: 2.5082, backward: 61.4990, update: 11.2092, #train: 78697, #input: 34605854 +[2]Epoch Time(s): 177.6336, sample: 93.5940, data copy: 19.8252, forward: 2.4786, backward: 50.4552, update: 11.2804, #train: 78696, #input: 34584592 +[3]Epoch Time(s): 177.6757, sample: 73.0354, data copy: 27.7802, forward: 2.4314, backward: 63.2740, update: 11.1546, #train: 78696, #input: 34579790 +[6]Epoch Time(s): 177.6165, sample: 44.9285, data copy: 25.8790, forward: 2.1825, backward: 96.4268, update: 8.1996, #train: 78696, #input: 30027068 + 0it [00:00, ?it/s] 0it [00:00, ?it/s] 0it [00:00, ?it/s] 0it [00:00, ?it/s] 0it [00:00, ?it/s] 0it [00:00, ?it/s] 0it [00:00, ?it/s] 0it [00:00, ?it/s] 1it [00:01, 1.04s/it] 1it [00:01, 1.14s/it] 1it [00:01, 1.15s/it] 1it [00:01, 1.17s/it] 1it [00:01, 1.24s/it] 2it [00:01, 1.08it/s] 1it [00:02, 2.24s/it] 2it [00:02, 1.10s/it] 1it [00:02, 2.39s/it] 2it [00:02, 1.25s/it] 1it [00:02, 2.58s/it] 2it [00:02, 1.46s/it] 2it [00:02, 1.56s/it] 3it [00:02, 1.01it/s] 2it [00:03, 1.38s/it] 3it [00:03, 1.02s/it] 2it [00:03, 1.49s/it] 2it [00:03, 1.60s/it] 3it [00:03, 1.16s/it] 3it [00:03, 1.17s/it] 3it 
[00:03, 1.16s/it] 4it [00:04, 1.02s/it] 3it [00:04, 1.36s/it] 3it [00:04, 1.17s/it] 4it [00:04, 1.12s/it] 3it [00:04, 1.39s/it] 4it [00:04, 1.18s/it] 4it [00:05, 1.25s/it] 4it [00:05, 1.20s/it] 5it [00:05, 1.05s/it] 4it [00:05, 1.20s/it] 4it [00:05, 1.33s/it] 4it [00:05, 1.44s/it] 5it [00:05, 1.03it/s] 5it [00:05, 1.35s/it] 5it [00:06, 1.24s/it] 6it [00:06, 1.04it/s] 5it [00:06, 1.10s/it] 5it [00:06, 1.27s/it] 6it [00:06, 1.08it/s] 7it [00:06, 1.19it/s] 5it [00:06, 1.59s/it] 6it [00:06, 1.05s/it] 6it [00:06, 1.22s/it] 5it [00:07, 1.46s/it] 7it [00:07, 1.07it/s] 6it [00:07, 1.22s/it] 6it [00:07, 1.30s/it] 6it [00:07, 1.40s/it] 6it [00:08, 1.27s/it] 7it [00:08, 1.06s/it] 8it [00:08, 1.03it/s] 8it [00:08, 1.01s/it] + 0it [00:00, ?it/s] 8it [00:08, 1.09it/s] 8it [00:08, 1.07s/it] + 0it [00:00, ?it/s] 7it [00:08, 1.34s/it] 7it [00:09, 1.28s/it] 7it [00:09, 1.17s/it] 8it [00:09, 1.03s/it] 8it [00:09, 1.13s/it] + 0it [00:00, ?it/s] 7it [00:09, 1.20s/it] 7it [00:09, 1.26s/it] 1it [00:01, 1.16s/it] 8it [00:09, 1.09s/it] 8it [00:09, 1.24s/it] + 8it [00:09, 1.36s/it] 8it [00:09, 1.24s/it] + 0it [00:00, ?it/s] 0it [00:00, ?it/s] 2it [00:02, 1.02it/s] 1it [00:01, 1.85s/it] 8it [00:10, 1.26s/it] 8it [00:10, 1.31s/it] + 0it [00:00, ?it/s] 8it [00:10, 1.31s/it] 8it [00:10, 1.32s/it] + 0it [00:00, ?it/s] 8it [00:10, 1.36s/it] 8it [00:10, 1.32s/it] + 0it [00:00, ?it/s] 2it [00:02, 1.18s/it] 1it [00:01, 1.22s/it] 1it [00:01, 1.22s/it] 3it [00:03, 1.04s/it] 1it [00:01, 1.19s/it] 1it [00:01, 1.28s/it] 2it [00:01, 1.09it/s] 3it [00:03, 1.00s/it] 1it [00:02, 2.83s/it] 4it [00:03, 1.09it/s] 1it [00:01, 1.58s/it] 2it [00:02, 1.19s/it] 4it [00:04, 1.10it/s] 2it [00:03, 1.64s/it] 5it [00:04, 1.13it/s] 2it [00:02, 1.16s/it] 2it [00:02, 1.17s/it] 3it [00:03, 1.01s/it] 2it [00:02, 1.30s/it] 6it [00:05, 1.34it/s] 6it [00:05, 1.16it/s] + 5it [00:04, 1.20it/s] 3it [00:03, 1.20s/it] 6it [00:05, 1.48it/s] 6it [00:05, 1.16it/s] + 3it [00:04, 1.36s/it] 3it [00:03, 1.01s/it] 4it [00:04, 1.07s/it] 4it [00:05, 1.05s/it] 3it [00:03, 1.27s/it] 4it [00:04, 1.02s/it] 3it [00:04, 1.48s/it] 4it [00:04, 1.01it/s] 5it [00:04, 1.06it/s] 4it [00:04, 1.12s/it] 5it [00:06, 1.01s/it] 5it [00:04, 1.17it/s] 6it [00:05, 1.22it/s] 6it [00:05, 1.09it/s] + 5it [00:05, 1.08s/it] 4it [00:04, 1.18s/it] 6it [00:06, 1.26it/s] 6it [00:06, 1.09s/it] + 6it [00:05, 1.45it/s] 6it [00:05, 1.13it/s] + 5it [00:05, 1.02it/s] 6it [00:06, 1.02it/s] 6it [00:06, 1.05s/it] + 6it [00:05, 1.30it/s] 6it [00:05, 1.05it/s] + 5it [00:05, 1.10s/it] 6it [00:06, 1.17it/s] 6it [00:06, 1.04s/it] +Val Acc 0.4618, Test Acc 0.4485, time: 16.9179 +Client[6] in group[0] is exiting... +Client[3] in group[0] is exiting... +Client[7] in group[0] is exiting... +Client[5] in group[0] is exiting... +Client[4] in group[0] is exiting... +Client[1] in group[0] is exiting... +Client[2] in group[0] is exiting... +Client[0] in group[0] is exiting... +Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') +[Server_2] Loaded ogbn-mag with use_graphbolt[False] in size[711.65234375 MB] +Start to create specified graph formats which may take non-trivial time. 
+Finished creating specified graph formats. +start graph service on server 2 for part 1 +Server is waiting for connections on [172.31.8.229:30050]... +Server (2) shutdown. +Server is exiting... +Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') +[Server_0] Loaded ogbn-mag with use_graphbolt[False] in size[701.08203125 MB] +Start to create specified graph formats which may take non-trivial time. +Finished creating specified graph formats. +start graph service on server 0 for part 0 +Server is waiting for connections on [172.31.14.101:30050]... +Server (0) shutdown. +Server is exiting... +Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') +start graph service on server 3 for part 1 +Server is waiting for connections on [172.31.8.229:30051]... +Server (3) shutdown. +Server is exiting... +Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') +start graph service on server 1 for part 0 +Server is waiting for connections on [172.31.14.101:30051]... +Server (1) shutdown. +Server is exiting... 
+cleanup process runs
diff --git a/examples/pytorch/rgcn/experimental/log_gb.txt b/examples/pytorch/rgcn/experimental/log_gb.txt
new file mode 100644
index 000000000000..eb6265e2aacb
--- /dev/null
+++ b/examples/pytorch/rgcn/experimental/log_gb.txt
@@ -0,0 +1,231 @@
+/home/ubuntu/workspace/dgl_2/tools/launch.py:148: DeprecationWarning: setDaemon() is deprecated, set the daemon attribute instead
+  thread.setDaemon(True)
+The number of OMP threads per trainer is set to 12
+Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=True, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10')
+Using GraphBolt
+[... the same Namespace dump and "Using GraphBolt" line from the other seven trainers elided ...]
+Client [174058] waits on 172.31.8.229:58369
+Client [174059] waits on 172.31.8.229:57779
+Client [174060] waits on 172.31.8.229:33865
+Client [174061] waits on 172.31.8.229:33205
+Client [1882912] waits on 172.31.14.101:35629
+Client [1882909] waits on 172.31.14.101:43017
+Client [1882910] waits on 172.31.14.101:34859
+Client [1882911] waits on 172.31.14.101:33033
+Machine (0) group (0) client (0) connect to server successfuly!
+Machine (0) group (0) client (1) connect to server successfuly!
+Machine (0) group (0) client (2) connect to server successfuly!
+Machine (0) group (0) client (3) connect to server successfuly!
+Machine (1) group (0) client (4) connect to server successfuly!
+Machine (1) group (0) client (5) connect to server successfuly!
+Machine (1) group (0) client (6) connect to server successfuly!
+Machine (1) group (0) client (7) connect to server successfuly!
+rank: 0
+rank: 7
+rank: 2
+rank: 6
+rank: 5
+rank: 3
+rank: 4
+rank: 1
+part 6, train: 78696 (local: 78696), val: 8110 (local: 7735), test: 5242 (local: 5072)
+part 7, train: 78696 (local: 78696), val: 8109 (local: 7738), test: 5242 (local: 5092)
+part 0, train: 78697 (local: 76971), val: 8110 (local: 8110), test: 5243 (local: 5243)
+part 3, train: 78696 (local: 76932), val: 8110 (local: 8110), test: 5242 (local: 5242)
+part 5, train: 78696 (local: 78696), val: 8110 (local: 7714), test: 5242 (local: 5108)
+part 4, train: 78697 (local: 78697), val: 8110 (local: 7708), test: 5243 (local: 5072)
+part 2, train: 78696 (local: 77008), val: 8110 (local: 8110), test: 5242 (local: 5242)
+part 1, train: 78697 (local: 76875), val: 8110 (local: 8110), test: 5243 (local: 5243)
+#classes: 349
+node paper has data feat
+[... "#classes: 349" / "node paper has data feat" from the other seven trainers elided ...]
+optimize Pytorch sparse embedding: ModuleDict(
+  (author): Embedding(1134649, 64, sparse=True)
+  (field_of_study): Embedding(59965, 64, sparse=True)
+  (institution): Embedding(8740, 64, sparse=True)
+)
+optimize dense projection: ModuleDict(
+  (paper): Linear(in_features=128, out_features=64, bias=True)
+)
+start training...
+[... the same embedding/projection summary and "start training..." from the other seven trainers elided ...]
+[4] Epoch 00000 | Step 00000 | Train acc 0.0020 | Loss 5.8798 | time 0.442 s| sample 0.070 | copy 0.053 | forward 0.051 | backward 0.187 | update 0.081
+[6] Epoch 00000 | Step 00000 | Train acc 0.0020 | Loss 5.8951 | time 0.658 s| sample 0.076 | copy 0.053 | forward 0.053 | backward 0.381 | update 0.096
+[1] Epoch 00000 | Step 00000 | Train acc 0.0049 | Loss 5.8799 | time 0.656 s| sample 0.075 | copy 0.073 | forward 0.054 | backward 0.343 | update 0.111
+[5] Epoch 00000 | Step 00000 | Train acc 0.0020 | Loss 5.8777 | time 0.875 s| sample 0.082 | copy 0.049 | forward 0.056 | backward 0.591 | update 0.096
+[2] Epoch 00000 | Step 00000 | Train acc 0.0059 | Loss 5.8630 | time 0.452 s| sample 0.077 | copy 0.056 | forward 0.032 | backward 0.184 | update 0.103
+[0] Epoch 00000 | Step 00000 | Train acc 0.0059 | Loss 5.8723 | time 0.914 s| sample 0.077 | copy 0.057 | forward 0.071 | backward 0.598 | update 0.111
+[7] Epoch 00000 | Step 00000 | Train acc 0.0020 | Loss 5.8806 | time 0.823 s| sample 0.079 | copy 0.064 | forward 0.051 | backward 0.536 | update 0.092
+[3] Epoch 00000 | Step 00000 | Train acc 0.0059 | Loss 5.8844 | time 0.843 s| sample 0.075 | copy 0.061 | forward 0.054 | backward 0.538 | update 0.115
+[4] Epoch 00000 | Step 00020 | Train acc 0.2246 | Loss 3.5885 | time 8.594 s| sample 1.276 | copy 1.007 | forward 0.527 | backward 4.970 | update 0.815
+[5] Epoch 00000 | Step 00020 | Train acc 0.1982 | Loss 3.5712 | time 8.583 s| sample 1.432 | copy 1.035 | forward 0.653 | backward 4.079 | update 1.384
+[1] Epoch 00000 | Step 00020 | Train acc 0.2383 | Loss 3.4027 | time 8.584 s| sample 1.425 | copy 1.238 | forward 0.630 | backward 4.100 | update 1.191
+[3] Epoch 00000 | Step 00020 | Train acc 0.2197 | Loss 3.4913 | time 8.584 s| sample 1.455 | copy 1.353 | forward 0.645 | backward 3.976 | update 1.155
+[2] Epoch 00000 | Step 00020 | Train acc 0.2539 | Loss 3.3632 | time 8.591 s| sample 1.383 | copy 1.344 | forward 0.592 | backward 4.106 | update 1.167
+[6] Epoch 00000 | Step 00020 | Train acc 0.2188 | Loss 3.4466 | time 8.606 s| sample 1.294 | copy 1.040 | forward 0.559 | backward 4.691 | update 1.022
+[7] Epoch 00000 | Step 00020 | Train acc 0.2373 | Loss 3.4916 | time 8.627 s| sample 1.316 | copy 1.006 | forward 0.556 | backward 4.750 | update 1.000
+[0] Epoch 00000 | Step 00020 | Train acc 0.2363 | Loss 3.5446 | time 8.631 s| sample 1.424 | copy 1.255 | forward 0.583 | backward 4.229 | update 1.141
+[4] Epoch 00000 | Step 00040 | Train acc 0.3047 | Loss 2.8568 | time 8.660 s| sample 1.259 | copy 0.921 | forward 0.501 | backward 5.221 | update 0.757
+[0] Epoch 00000 | Step 00040 | Train acc 0.3408 | Loss 2.6664 | time 8.603 s| sample 1.514 | copy 1.253 | forward 0.586 | backward 4.111 | update 1.139
+[1] Epoch 00000 | Step 00040 | Train acc 0.3330 | Loss 2.7342 | time 8.649 s| sample 1.480 | copy 1.296 | forward 0.609 | backward 3.941 | update 1.324
+[2] Epoch 00000 | Step 00040 | Train acc 0.3135 | Loss 2.7676 | time 8.656 s| sample 1.400 | copy 1.158 | forward 0.558 | backward 4.337 | update 1.202
+[6] Epoch 00000 | Step 00040 | Train acc 0.3018 | Loss 2.8101 | time 8.683 s| sample 1.329 | copy 1.064 | forward 0.565 | backward 4.614 | update 1.112
+[5] Epoch 00000 | Step 00040 | Train acc 0.3086 | Loss 2.9863 | time 8.696 s| sample 1.366 | copy 1.041 | forward 0.563 | backward 4.509 | update 1.216
+[3] Epoch 00000 | Step 00040 | Train acc 0.3535 | Loss 2.6234 | time 8.689 s| sample 1.409 | copy 1.307 | forward 0.628 | backward 4.060 | update 1.285
+[7] Epoch 00000 | Step 00040 | Train acc 0.2861 | Loss 2.8887 | time 8.652 s| sample 1.307 | copy 1.031 | forward 0.545 | backward 4.737 | update 1.033
+[4] Epoch 00000 | Step 00060 | Train acc 0.3398 | Loss 2.6324 | time 8.234 s| sample 1.266 | copy 0.933 | forward 0.486 | backward 4.762 | update 0.787
+[5] Epoch 00000 | Step 00060 | Train acc 0.3359 | Loss 2.5936 | time 8.186 s| sample 1.360 | copy 1.045 | forward 0.564 | backward 4.062 | update 1.155
+[6] Epoch 00000 | Step 00060 | Train acc 0.3096 | Loss 2.7593 | time 8.192 s| sample 1.326 | copy 1.029 | forward 0.544 | backward 4.208 | update 1.085
+[1] Epoch 00000 | Step 00060 | Train acc 0.3809 | Loss 2.4687 | time 8.238 s| sample 1.438 | copy 1.241 | forward 0.619 | backward 3.767 | update 1.172
+[3] Epoch 00000 | Step 00060 | Train acc 0.3613 | Loss 2.5116 | time 8.201 s| sample 1.436 | copy 1.265 | forward 0.607 | backward 3.814 | update 1.079
+[0] Epoch 00000 | Step 00060 | Train acc 0.3896 | Loss 2.4396 | time 8.246 s| sample 1.533 | copy 1.252 | forward 0.594 | backward 3.909 | update 0.957
+[2] Epoch 00000 | Step 00060 | Train acc 0.3887 | Loss 2.4081 | time 8.236 s| sample 1.411 | copy 1.195 | forward 0.604 | backward 4.009 | update 1.016
+[7] Epoch 00000 | Step 00060 | Train acc 0.3408 | Loss 2.7524 | time 8.208 s| sample 1.355 | copy 1.045 | forward 0.575 | backward 4.182 | update 1.051
+[4]Epoch Time(s): 32.4171, sample: 4.9020, data copy: 3.6422, forward: 1.9722, backward: 18.8446, update: 3.0559, #train: 78697, #input: 16666112
+[0]Epoch Time(s): 32.8553, sample: 5.6930, data copy: 4.7914, forward: 2.3189, backward: 15.8580, update: 4.1937, #train: 78697, #input: 18370334
+[1]Epoch Time(s): 32.5985, sample: 5.6200, data copy: 4.8302, forward: 2.3978, backward: 15.1472, update: 4.6032, #train: 78697, #input: 18381034
+[7]Epoch Time(s): 32.7594, sample: 5.0083, data copy: 3.9177, forward: 2.1759, backward: 17.6220, update: 4.0355, #train: 78696, #input: 16686254
+[3]Epoch Time(s): 32.7923, sample: 5.4970, data copy: 4.9976, forward: 2.4069, backward: 15.4529, update: 4.4377, #train: 78696, #input: 18370936
+[2]Epoch Time(s): 32.4160, sample: 5.4193, data copy: 4.7859, forward: 2.2797, backward: 15.5506, update: 4.3803, #train: 78696, #input: 18378296
+[5]Epoch Time(s): 32.8164, sample: 5.3067, data copy: 3.9895, forward: 2.2805, backward: 16.5132, update: 4.7265, #train: 78696, #input: 16678362
+[6]Epoch Time(s): 32.6269, sample: 5.0835, data copy: 3.9453, forward: 2.1211, backward: 17.3780, update: 4.0990, #train: 78696, #input: 16674232
+[... tqdm progress bars from validation/test batches elided ...]
+Val Acc 0.3901, Test Acc 0.3844, time: 2.2284
+Client[4] in group[0] is exiting...
+Client[2] in group[0] is exiting...
+Client[3] in group[0] is exiting...
+Client[1] in group[0] is exiting...
+Client[6] in group[0] is exiting...
+Client[7] in group[0] is exiting...
+Client[5] in group[0] is exiting...
+Client[0] in group[0] is exiting...
+Namespace([... identical server arguments elided ...])
+Using GraphBolt
+[Server_2] Loaded ogbn-mag with use_graphbolt[True] in size[240.7109375 MB]
+start graph service on server 2 for part 1
+Server is waiting for connections on [172.31.8.229:30050]...
+Server (2) shutdown.
+Server is exiting...
+Namespace([... identical server arguments elided ...])
+Using GraphBolt
+[Server_0] Loaded ogbn-mag with use_graphbolt[True] in size[235.8046875 MB]
+start graph service on server 0 for part 0
+Server is waiting for connections on [172.31.14.101:30050]...
+Server (0) shutdown.
+Server is exiting...
+Namespace([... identical server arguments elided ...])
+Using GraphBolt
+start graph service on server 3 for part 1
+Server is waiting for connections on [172.31.8.229:30051]...
+Server (3) shutdown.
+Server is exiting...
+Namespace([... identical server arguments elided ...])
+Using GraphBolt
+start graph service on server 1 for part 0
+Server is waiting for connections on [172.31.14.101:30051]...
+Server (1) shutdown.
+Server is exiting...
+cleanup process runs
diff --git a/python/dgl/distributed/dist_graph.py b/python/dgl/distributed/dist_graph.py
index a323633e4801..514927d8dd0d 100644
--- a/python/dgl/distributed/dist_graph.py
+++ b/python/dgl/distributed/dist_graph.py
@@ -1,7 +1,9 @@
 """Define distributed graph."""
 import gc
-
+import psutil
+#import tracemalloc
+#from pympler import asizeof
 import os
 from collections import namedtuple
 from collections.abc import MutableMapping
@@ -383,6 +385,7 @@ def __init__(
         graph_format=("csc", "coo"),
         use_graphbolt=False,
     ):
+        #tracemalloc.start()
         super(DistGraphServer, self).__init__(
             server_id=server_id,
             ip_config=ip_config,
@@ -400,6 +403,8 @@
             self.client_g = None
         else:
             # Loading of node/edge_feats are deferred to lower the peak memory consumption.
+            #snapshot1 = tracemalloc.take_snapshot()
+            prev_rss = psutil.Process(os.getpid()).memory_info().rss
             (
                 self.client_g,
                 _,
@@ -414,8 +419,18 @@
                 load_feats=False,
                 use_graphbolt=use_graphbolt,
             )
-            print(f"Loaded {graph_name} with use_graphbolt[{use_graphbolt}]")
-
+            #print(f"[Server_{self.server_id}] Loaded {graph_name} with use_graphbolt[{use_graphbolt}].")
+            new_rss = psutil.Process(os.getpid()).memory_info().rss
+            print(f"[Server_{self.server_id}] Loaded {graph_name} with use_graphbolt[{use_graphbolt}] in size[{(new_rss - prev_rss)/1024/1024} MB]")
+            #print(f"[Server_{self.server_id}] Loaded {graph_name} with use_graphbolt[{use_graphbolt}] in size[{asizeof.asizeof(self.client_g)}]")
+            '''
+            snapshot2 = tracemalloc.take_snapshot()
+            top_stats = snapshot2.compare_to(snapshot1, "lineno")
+            print(f"[Server_{self.server_id}][ Top 10 differences ]")
+            for stat in top_stats[:10]:
+                print(f"[Server_{self.server_id}]: {stat}")
+            tracemalloc.stop()
+            '''
             if not use_graphbolt:
                 # formatting dtype
                 # TODO(Rui) Formatting forcely is not a perfect solution.
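For readers reproducing this measurement outside the patch, here is a minimal, self-contained sketch of the RSS-delta technique the hunk above introduces. Only `psutil` is assumed; the list allocation is a hypothetical stand-in for loading a graph partition, not DGL code:

```python
import os

import psutil


def rss_mb() -> float:
    # Resident set size of the current process, in MB.
    return psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024


before = rss_mb()
payload = list(range(10**7))  # stand-in for loading a partition into memory
after = rss_mb()
print(f"Loaded payload in size[{after - before} MB]")
```

Note that an RSS delta includes allocator overhead and anything else the process touched in between, so it is an upper bound on the loaded graph's size rather than an exact figure.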
diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py
index 4f8dce7578df..a1dad16a4209 100644
--- a/python/dgl/distributed/graph_services.py
+++ b/python/dgl/distributed/graph_services.py
@@ -551,7 +551,6 @@ def _distributed_access(g, nodes, issue_remote_req, local_access, use_graphbolt=
         res_list.extend(results)
 
     sampled_graph = merge_graphs(res_list, g.num_nodes())
-    print("sampled_graph: ", sampled_graph)
 
     return sampled_graph
 
@@ -617,7 +616,6 @@ def _frontier_to_heterogeneous_graph_gb(g, frontier, gpb):
     dst_ntype_ids, dst = gpb.map_to_per_ntype(dst)
 
     data_dict = dict()
-    print("g.canonical_etypes: ", g.canonical_etypes)
     for etid, etype in enumerate(g.canonical_etypes):
         src_ntype, _, dst_ntype = etype
         src_ntype_id = g.get_ntype_id(src_ntype)
diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py
index 5fb29e232639..7fffb0b8d655 100644
--- a/python/dgl/distributed/partition.py
+++ b/python/dgl/distributed/partition.py
@@ -1277,10 +1277,10 @@ def get_homogeneous(g, balance_ntypes):
 
 def convert_dgl_partition_to_csc_sampling_graph(
     part_config,
-    store_orig_nids=False,
+    store_orig_nids=True,
     store_orig_eids=False,
     store_ntypes=False,
-    store_etypes=False,
+    store_etypes=True,
     store_metadata=False,
 ):
     """Convert partitions of dgl to CSCSamplingGraph of GraphBolt.

From b3eecae5f3a29eb9c630c5704e40c4a8f5c31cde Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Thu, 26 Oct 2023 06:43:51 +0000
Subject: [PATCH 16/30] multiply fanout with num_etypes

---
 examples/pytorch/rgcn/experimental/README.md | 17 ++++++++++++++++-
 python/dgl/distributed/graph_services.py     |  9 ++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/rgcn/experimental/README.md b/examples/pytorch/rgcn/experimental/README.md
index a8adc5e4d54c..7e9334642231 100644
--- a/examples/pytorch/rgcn/experimental/README.md
+++ b/examples/pytorch/rgcn/experimental/README.md
@@ -53,7 +53,9 @@ python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
 
 #### Results
 `g4dn.metal` x 2, `ogbn-mag`.
-[**NOTE**]Epoch time drops in DistDGL with GraphBolt, so does accuracy, probably partially caused by the difference between `sample_neighbors()` and `sample_etype_neighbors()`.
+[**NOTE**] `sample_etype_neighbors()` is not correctly enabled in DistDGL with GraphBolt.
+
+DistDGL with GraphBolt takes less time for sampling (-60%) and for the whole epoch (-30%) while keeping comparable validation and test accuracy.
 
 ##### DistDGL
 
@@ -66,6 +68,17 @@
 Val Acc 0.4618, Test Acc 0.4485, time: 16.9179
 ```
 
 ##### DistDGL with GraphBolt
 
+fanout = [25, 10], multiplied by `num_etypes`. [**Default**]
+
+```
+[3]Epoch Time(s): 137.0454, sample: 27.0914, data copy: 32.2842, forward: 3.5588, backward: 60.5921, update: 13.5188, #train: 78696, #input: 76402212
+
+Val Acc 0.4648, Test Acc 0.4498, time: 10.4527
+```
+
+fanout = [25, 10], not multiplied by `num_etypes`. [**Deprecated**]
+
 ```
 Epoch Time(s): 32.7923, sample: 5.4970, data copy: 4.9976, forward: 2.4069, backward: 15.4529, update: 4.4377, #train: 78696, #input: 18370936
 
 Val Acc 0.3901, Test Acc 0.3844, time: 2.2284
 ```
 
+---------------------------------------
+
 ## Distributed training
 
 This is an example of training RGCN node classification in a distributed fashion. Currently, the example train RGCN graphs with input node features. The current implementation follows ../rgcn/entity_claasify_mp.py.
diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py
index a1dad16a4209..2f057bac9e33 100644
--- a/python/dgl/distributed/graph_services.py
+++ b/python/dgl/distributed/graph_services.py
@@ -712,7 +712,14 @@
     DGLGraph
         A sampled subgraph containing only the sampled neighboring edges.
         It is on CPU.
     """
-    if not use_graphbolt:
+    if use_graphbolt:
+        assert isinstance(fanout, int), "GraphBolt only supports int fanout."
+        # [Rui] As CSCSamplingGraph is always homogeneous in GB,
+        # sample_etype_neighbors() not supported yet though it's available in
+        # underlying GB. So I increase the fanout to mimic the behavior of
+        # sample_etype_neighbors(). But I know it's quite not right.
+        fanout *= len(g.canonical_etypes)
+    else:
         if isinstance(fanout, int):
             fanout = F.full_1d(len(g.canonical_etypes), fanout, F.int64, F.cpu())
         else:
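The hunk above approximates `sample_etype_neighbors()` by inflating a single homogeneous fanout. A small sketch of the two semantics, assuming only `torch` (the edge-type list is illustrative): an int fanout in the per-etype branch means that many neighbors per edge type (the `F.full_1d` expansion), so multiplying by `num_etypes` matches only the total sampling budget, not the per-type allocation — which is exactly why the comment calls the hack "not right":

```python
import torch

canonical_etypes = [
    ("author", "writes", "paper"),
    ("paper", "cites", "paper"),
]
fanout = 25

# Per-etype semantics: 25 neighbors for each edge type, per seed node.
per_etype = torch.full((len(canonical_etypes),), fanout, dtype=torch.int64)
print(per_etype)  # tensor([25, 25])

# Homogeneous approximation used by the hack above: one budget of 50,
# with no guarantee of how it is split across edge types.
print(fanout * len(canonical_etypes))  # 50
```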
From 896ad9b93b905cda2adc2a13736a63d48a4475c2 Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Thu, 26 Oct 2023 08:12:41 +0000
Subject: [PATCH 17/30] [WAHAHA] sample_etype_neighbors is applied truly except
 metadata shm hack

---
 examples/pytorch/rgcn/experimental/README.md  | 15 +++++--
 .../rgcn/experimental/partition_graph.py      |  3 --
 python/dgl/distributed/dist_graph.py          | 10 ++++-
 python/dgl/distributed/graph_services.py      | 34 ++++++----------
 python/dgl/distributed/partition.py           | 40 +++++++------------
 .../dgl/graphbolt/impl/csc_sampling_graph.py  | 10 +++--
 python/dgl/graphbolt/impl/neighbor_sampler.py |  7 ++--
 tests/distributed/test_partition.py           |  9 +----
 8 files changed, 60 insertions(+), 68 deletions(-)

diff --git a/examples/pytorch/rgcn/experimental/README.md b/examples/pytorch/rgcn/experimental/README.md
index 7e9334642231..f3fb75660084 100644
--- a/examples/pytorch/rgcn/experimental/README.md
+++ b/examples/pytorch/rgcn/experimental/README.md
@@ -53,9 +53,7 @@ python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
 
 #### Results
 `g4dn.metal` x 2, `ogbn-mag`.
-[**NOTE**] `sample_etype_neighbors()` is not correctly enabled in DistDGL with GraphBolt.
-
-DistDGL with GraphBolt takes less time for sampling (-60%) and for the whole epoch (-30%) while keeping comparable validation and test accuracy.
+DistDGL with GraphBolt takes less time for sampling (from **73s** to **11s**) and for the whole epoch (from **178s** to **70s**) while keeping comparable validation and test accuracy.
 
 ##### DistDGL
 
 ##### DistDGL with GraphBolt
 
+fanout = [25, 10], call `gb.sample_etype_neighbors()` correctly. [**Default**]
+
+```
+Epoch Time(s): 70.3498, sample: 10.6339, data copy: 8.9492, forward: 2.6577, backward: 36.1793, update: 11.9295, #train: 78696, #input: 34559464
+
+Val Acc 0.4572, Test Acc 0.4498, time: 3.5830
+```
+
-fanout = [25, 10], multiplied by `num_etypes`. [**Default**]
+fanout = [25, 10], multiplied by `num_etypes`. [**Deprecated**]
 
 ```
 [3]Epoch Time(s): 137.0454, sample: 27.0914, data copy: 32.2842, forward: 3.5588, backward: 60.5921, update: 13.5188, #train: 78696, #input: 76402212
diff --git a/examples/pytorch/rgcn/experimental/partition_graph.py b/examples/pytorch/rgcn/experimental/partition_graph.py
index 9a110d5cf778..af430261609f 100644
--- a/examples/pytorch/rgcn/experimental/partition_graph.py
+++ b/examples/pytorch/rgcn/experimental/partition_graph.py
@@ -133,7 +133,4 @@ def load_ogb(dataset):
         balance_edges=args.balance_edges,
         num_trainers_per_machine=args.num_trainers_per_machine,
         use_graphbolt=args.graphbolt,
-        gb_store_orig_nids=True,
-        gb_store_orig_eids=False,
-        gb_store_etypes=True,
     )
diff --git a/python/dgl/distributed/dist_graph.py b/python/dgl/distributed/dist_graph.py
index 514927d8dd0d..f06d1bb41576 100644
--- a/python/dgl/distributed/dist_graph.py
+++ b/python/dgl/distributed/dist_graph.py
@@ -195,7 +195,15 @@ def _get_graph_from_shared_mem(graph_name, use_graphbolt):
     through shared memory to reduce the overhead of data access.
     """
     if use_graphbolt:
-        g = gb.load_from_shared_memory(graph_name, None)
+        metadata = gb.GraphMetadata(
+            {'author': 0, 'field_of_study': 1, 'institution': 2, 'paper': 3},
+            {'author:affiliated_with:institution': 0, 'author:writes:paper': 1,
+             'field_of_study:rev-has_topic:paper': 2,
+             'institution:rev-affiliated_with:author': 3, 'paper:cites:paper': 4,
+             'paper:has_topic:field_of_study': 5, 'paper:rev-cites:paper': 6,
+             'paper:rev-writes:author': 7}
+        )
+        g = gb.load_from_shared_memory(graph_name, metadata)
         return g
 
     g, ntypes, etypes = heterograph_index.create_heterograph_from_shared_memory(
diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py
index 2f057bac9e33..26aa6aef64b0 100644
--- a/python/dgl/distributed/graph_services.py
+++ b/python/dgl/distributed/graph_services.py
@@ -712,28 +712,20 @@
     DGLGraph
         A sampled subgraph containing only the sampled neighboring edges.
         It is on CPU.
     """
-    if use_graphbolt:
-        assert isinstance(fanout, int), "GraphBolt only supports int fanout."
-        # [Rui] As CSCSamplingGraph is always homogeneous in GB,
-        # sample_etype_neighbors() not supported yet though it's available in
-        # underlying GB. So I increase the fanout to mimic the behavior of
-        # sample_etype_neighbors(). But I know it's quite not right.
-        fanout *= len(g.canonical_etypes)
+    if isinstance(fanout, int):
+        fanout = F.full_1d(len(g.canonical_etypes), fanout, F.int64, F.cpu())
     else:
-        if isinstance(fanout, int):
-            fanout = F.full_1d(len(g.canonical_etypes), fanout, F.int64, F.cpu())
-        else:
-            etype_ids = {etype: i for i, etype in enumerate(g.canonical_etypes)}
-            fanout_array = [None] * len(g.canonical_etypes)
-            for etype, v in fanout.items():
-                c_etype = g.to_canonical_etype(etype)
-                fanout_array[etype_ids[c_etype]] = v
-            assert all(v is not None for v in fanout_array), (
-                "Not all etypes have valid fanout. Please make sure passed-in "
-                "fanout in dict includes all the etypes in graph. Passed-in "
-                f"fanout: {fanout}, graph etypes: {g.canonical_etypes}."
-            )
-            fanout = F.tensor(fanout_array, dtype=F.int64)
+        etype_ids = {etype: i for i, etype in enumerate(g.canonical_etypes)}
+        fanout_array = [None] * len(g.canonical_etypes)
+        for etype, v in fanout.items():
+            c_etype = g.to_canonical_etype(etype)
+            fanout_array[etype_ids[c_etype]] = v
+        assert all(v is not None for v in fanout_array), (
+            "Not all etypes have valid fanout. Please make sure passed-in "
+            "fanout in dict includes all the etypes in graph. Passed-in "
+            f"fanout: {fanout}, graph etypes: {g.canonical_etypes}."
+        )
+        fanout = F.tensor(fanout_array, dtype=F.int64)
 
     gpb = g.get_partition_book()
     if isinstance(nodes, dict):
diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py
index 7fffb0b8d655..a3b6b3f86006 100644
--- a/python/dgl/distributed/partition.py
+++ b/python/dgl/distributed/partition.py
@@ -571,11 +571,7 @@ def partition_graph(
     objtype="cut",
     graph_formats=None,
     use_graphbolt=False,
-    gb_store_orig_nids=False,
-    gb_store_orig_eids=False,
-    gb_store_ntypes=False,
-    gb_store_etypes=False,
-    gb_store_metadata=False,
+    gb_save_all=False,
 ):
     """Partition a graph for distributed training and store the partitions on files.
@@ -751,16 +747,6 @@
         from high to low is ``coo``, ``csc``, ``csr``.
     use_graphbolt : bool
         Whether to convert the partitioned graph to GraphBolt format.
-    gb_store_orig_nids : bool
-        Whether to store the original node IDs in the partitioned graph.
-    gb_store_orig_eids : bool
-        Whether to store the original edge IDs in the partitioned graph.
-    gb_store_ntypes : bool
-        Whether to store the node types in the partitioned graph.
-    gb_store_etypes : bool
-        Whether to store the edge types in the partitioned graph.
-    gb_store_metadata : bool
-        Whether to store the metadata of the partitioned graph.
 
     Returns
     -------
@@ -1261,14 +1247,19 @@ def get_homogeneous(g, balance_ntypes):
     )
 
     if use_graphbolt:
-        convert_dgl_partition_to_csc_sampling_graph(
-            part_config,
-            store_orig_nids=gb_store_orig_nids,
-            store_orig_eids=gb_store_orig_eids,
-            store_ntypes=gb_store_ntypes,
-            store_etypes=gb_store_etypes,
-            store_metadata=gb_store_metadata,
-        )
+        if gb_save_all:
+            convert_dgl_partition_to_csc_sampling_graph(
+                part_config,
+                store_orig_nids=True,
+                store_orig_eids=True,
+                store_ntypes=True,
+                store_etypes=True,
+                store_metadata=True,
+            )
+        else:
+            convert_dgl_partition_to_csc_sampling_graph(
+                part_config,
+            )
         print("Converted to GraphBolt format.")
 
     if return_mapping:
@@ -1281,7 +1272,7 @@ def convert_dgl_partition_to_csc_sampling_graph(
     store_orig_eids=False,
     store_ntypes=False,
     store_etypes=True,
-    store_metadata=False,
+    store_metadata=True,
 ):
     """Convert partitions of dgl to CSCSamplingGraph of GraphBolt.
@@ -1383,7 +1374,6 @@ def init_type_per_edge(graph, gpb):
     if edge_attributes is None:
         edge_attributes = {}
     edge_attributes[ETYPE] = type_per_edge
-    type_per_edge = None
 
     # Construct CSCSamplingGraph
     csc_graph = graphbolt.from_csc(
diff --git a/python/dgl/graphbolt/impl/csc_sampling_graph.py b/python/dgl/graphbolt/impl/csc_sampling_graph.py
index 09c4a06963b2..b7167cb0db92 100644
--- a/python/dgl/graphbolt/impl/csc_sampling_graph.py
+++ b/python/dgl/graphbolt/impl/csc_sampling_graph.py
@@ -337,6 +337,7 @@ def in_subgraph(self, nodes: torch.Tensor) -> torch.ScriptObject:
     def _convert_to_sampled_subgraph(
         self,
         C_sampled_subgraph: torch.ScriptObject,
+        keep_homo: bool = False,
     ):
         """An internal function used to convert a fused homogeneous sampled
         subgraph to general struct 'SampledSubgraphImpl'."""
@@ -372,7 +373,7 @@
         ]
         assert original_etype_ids is not None, "original_etype_ids is None"
 
-        if type_per_edge is None:
+        if type_per_edge is None or keep_homo:
             # The sampled graph is already a homogeneous graph.
             node_pairs = (row, column)
         else:
@@ -412,6 +413,7 @@ def sample_neighbors(
         fanouts: torch.Tensor,
         replace: bool = False,
         probs_name: Optional[str] = None,
+        keep_homo: bool = False,
     ) -> SampledSubgraphImpl:
         """Sample neighboring edges of the given nodes and return the induced
         subgraph.
@@ -483,7 +485,7 @@
             nodes, fanouts, replace, probs_name
         )
 
-        return self._convert_to_sampled_subgraph(C_sampled_subgraph)
+        return self._convert_to_sampled_subgraph(C_sampled_subgraph, keep_homo=keep_homo)
 
     def _check_sampler_arguments(self, nodes, fanouts, probs_name):
         assert nodes.dim() == 1, "Nodes should be 1-D tensor."
@@ -494,8 +496,8 @@
         assert len(fanouts) in [
             expected_fanout_len,
             1,
-        ], "Fanouts should have the same number of elements as etypes or \
-            should have a length of 1."
+        ], f"Fanouts should have the same number of elements as etypes or \
+            should have a length of 1, but got {fanouts} while {self.metadata.edge_type_to_id}"
         if fanouts.size(0) > 1:
             assert (
                 self.type_per_edge is not None
diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py
index 00ff91796f0b..ca8cb0473757 100644
--- a/python/dgl/graphbolt/impl/neighbor_sampler.py
+++ b/python/dgl/graphbolt/impl/neighbor_sampler.py
@@ -102,9 +102,10 @@ def __init__(
 
     @staticmethod
     def distributed_sample_neighbor(graph, seeds, fanouts):
-        assert isinstance(fanouts, int), f"Fanouts should be an integer but got {fanouts}."
-        fanouts = torch.LongTensor([fanouts])
-        subgraph = graph.sample_neighbors(seeds, fanouts)
+        if isinstance(fanouts, int):
+            fanouts = torch.LongTensor([fanouts])
+        assert isinstance(fanouts, torch.Tensor), f"Invalid fanouts: {fanouts}"
+        subgraph = graph.sample_neighbors(seeds, fanouts, keep_homo=True)
         src_nodes, dst_nodes = subgraph.node_pairs
         etype_ids = subgraph.original_etype_ids
         assert src_nodes.shape == dst_nodes.shape == etype_ids.shape, f"Shape mismatch: {src_nodes.shape}, {dst_nodes.shape}, {etype_ids.shape}"
diff --git a/tests/distributed/test_partition.py b/tests/distributed/test_partition.py
index d519e7b266d6..583b3d8e7120 100644
--- a/tests/distributed/test_partition.py
+++ b/tests/distributed/test_partition.py
@@ -732,7 +732,6 @@ def test_convert_dgl_partition_to_csc_sampling_graph_homo(
         assert new_g.edge_attributes is None
     if store_etypes:
         assert th.all(0 == new_g.edge_attributes[dgl.ETYPE])
-        assert new_g.type_per_edge is None
     else:
         assert new_g.type_per_edge is None
     if store_metadata:
@@ -804,7 +803,6 @@ def test_convert_dgl_partition_to_csc_sampling_graph_hetero(
         assert new_g.edge_attributes is None
     if store_etypes:
         assert th.equal(orig_g.edata[dgl.ETYPE][orig_eids], new_g.edge_attributes[dgl.ETYPE])
-        assert new_g.type_per_edge is None
     else:
         assert new_g.type_per_edge is None
     if store_metadata:
@@ -1093,10 +1091,7 @@ def test_partition_hetero_graphbolt_sample_neighbors(
         return_mapping=True,
         num_trainers_per_machine=num_trainers_per_machine,
         use_graphbolt=True,
-        gb_store_orig_nids=True,
-        gb_store_orig_eids=True,
-        gb_store_ntypes=True,
-        gb_store_etypes=True,
+        gb_save_all=True,
     )
     part_config = os.path.join(test_dir, "test.json")
     for i in range(num_parts):
@@ -1120,7 +1115,7 @@
     assert edge_feats == {}
 
     # sample_neighbors()
-    subg = part_g.sample_neighbors(th.arange(10), th.LongTensor([-1]))
+    subg = part_g.sample_neighbors(th.arange(10), th.LongTensor([-1]), keep_homo=True)
     src, dst = subg.node_pairs
     orig_src = part_g.node_attributes[dgl.NID][src]
     orig_dst = part_g.node_attributes[dgl.NID][dst]
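The hardcoded `gb.GraphMetadata` above is specific to `ogbn-mag`; the next patch derives it on the fly instead. A rough sketch of such a derivation, assuming only that the node/edge type lists are available (e.g., from the partition book) and mirroring the alphabetical ID order of the hardcoded dicts — `build_metadata` is an illustrative helper, not DGL API:

```python
import dgl.graphbolt as gb


def build_metadata(ntypes, canonical_etypes):
    # Assign consecutive IDs in sorted order, matching the hardcoded
    # ogbn-mag mapping above (author=0, ..., paper=3).
    ntype_to_id = {ntype: i for i, ntype in enumerate(sorted(ntypes))}
    # Canonical etypes (src, rel, dst) become "src:rel:dst" keys.
    etype_to_id = {
        ":".join(etype): i for i, etype in enumerate(sorted(canonical_etypes))
    }
    return gb.GraphMetadata(ntype_to_id, etype_to_id)


metadata = build_metadata(
    ["paper", "author", "institution", "field_of_study"],
    [("author", "writes", "paper"), ("paper", "cites", "paper")],
)
```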
sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') -Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') -Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') -Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') -Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') -Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') -Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') -Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10') 
-Client [170011] waits on 172.31.8.229:54407 -Client [170012] waits on 172.31.8.229:53959 -Client [170014] waits on 172.31.8.229:49611 -Client [170013] waits on 172.31.8.229:35283 -Client [1878744] waits on 172.31.14.101:45181 -Client [1878745] waits on 172.31.14.101:59671Client [1878743] waits on 172.31.14.101:58669 - -Client [1878746] waits on 172.31.14.101:34789 -Machine (0) group (0) client (0) connect to server successfuly! -Machine (0) group (0) client (1) connect to server successfuly! -Machine (0) group (0) client (2) connect to server successfuly! -Machine (0) group (0) client (3) connect to server successfuly! -Machine (1) group (0) client (4) connect to server successfuly! -Machine (1) group (0) client (5) connect to server successfuly! -Machine (1) group (0) client (6) connect to server successfuly! -Machine (1) group (0) client (7) connect to server successfuly! -rank: 0 -rank: 2 -rank: 3 -rank: 1 -rank: 5 -rank: 4 -rank: 7 -rank: 6 -part 4, train: 78697 (local: 78697), val: 8110 (local: 7708), test: 5243 (local: 5072) -part 2, train: 78696 (local: 77008), val: 8110 (local: 8110), test: 5242 (local: 5242) -part 5, train: 78696 (local: 78696), val: 8110 (local: 7714), test: 5242 (local: 5108) -part 1, train: 78697 (local: 76875), val: 8110 (local: 8110), test: 5243 (local: 5243) -part 3, train: 78696 (local: 76932), val: 8110 (local: 8110), test: 5242 (local: 5242) -part 0, train: 78697 (local: 76971), val: 8110 (local: 8110), test: 5243 (local: 5243) -part 6, train: 78696 (local: 78696), val: 8110 (local: 7735), test: 5242 (local: 5072) -part 7, train: 78696 (local: 78696), val: 8109 (local: 7738), test: 5242 (local: 5092) -#classes: 349 -#classes: 349 -node paper has data feat -node paper has data feat -#classes: 349 -#classes: 349 -node paper has data feat -node paper has data feat -#classes: 349 -node paper has data feat -#classes: 349 -node paper has data feat -#classes: 349 -node paper has data feat -#classes: 349 -node paper has data feat -optimize Pytorch sparse embedding: ModuleDict( - (author): Embedding(1134649, 64, sparse=True) - (field_of_study): Embedding(59965, 64, sparse=True) - (institution): Embedding(8740, 64, sparse=True) -) -optimize dense projection: ModuleDict( - (paper): Linear(in_features=128, out_features=64, bias=True) -) -start training... -optimize Pytorch sparse embedding: ModuleDict( - (author): Embedding(1134649, 64, sparse=True) - (field_of_study): Embedding(59965, 64, sparse=True) - (institution): Embedding(8740, 64, sparse=True) -) -optimize dense projection: ModuleDict( - (paper): Linear(in_features=128, out_features=64, bias=True) -) -start training... -optimize Pytorch sparse embedding: ModuleDict( - (author): Embedding(1134649, 64, sparse=True) - (field_of_study): Embedding(59965, 64, sparse=True) - (institution): Embedding(8740, 64, sparse=True) -) -optimize dense projection: ModuleDict( - (paper): Linear(in_features=128, out_features=64, bias=True) -) -start training... 
-optimize Pytorch sparse embedding: optimize Pytorch sparse embedding: ModuleDict( - (author): Embedding(1134649, 64, sparse=True) - (field_of_study): Embedding(59965, 64, sparse=True) - (institution): Embedding(8740, 64, sparse=True) -) -ModuleDict( - (author): Embedding(1134649, 64, sparse=True) - (field_of_study): Embedding(59965, 64, sparse=True) - (institution): Embedding(8740, 64, sparse=True) -) -optimize dense projection: ModuleDict( - (paper): Linear(in_features=128, out_features=64, bias=True) -) -optimize dense projection: ModuleDict( - (paper): Linear(in_features=128, out_features=64, bias=True) -) -start training... -start training... -optimize Pytorch sparse embedding: ModuleDict( - (author): Embedding(1134649, 64, sparse=True) - (field_of_study): Embedding(59965, 64, sparse=True) - (institution): Embedding(8740, 64, sparse=True) -) -optimize dense projection: ModuleDict( - (paper): Linear(in_features=128, out_features=64, bias=True) -) -start training... -optimize Pytorch sparse embedding: ModuleDict( - (author): Embedding(1134649, 64, sparse=True) - (field_of_study): Embedding(59965, 64, sparse=True) - (institution): Embedding(8740, 64, sparse=True) -) -optimize dense projection: ModuleDict( - (paper): Linear(in_features=128, out_features=64, bias=True) -) -start training... -optimize Pytorch sparse embedding: ModuleDict( - (author): Embedding(1134649, 64, sparse=True) - (field_of_study): Embedding(59965, 64, sparse=True) - (institution): Embedding(8740, 64, sparse=True) -) -optimize dense projection: ModuleDict( - (paper): Linear(in_features=128, out_features=64, bias=True) -) -start training... -[6] Epoch 00000 | Step 00000 | Train acc 0.0029 | Loss 5.9122 | time 2.916 s| sample 1.148 | copy 0.096 | forward 0.031 | backward 1.472 | update 0.170 -[5] Epoch 00000 | Step 00000 | Train acc 0.0059 | Loss 5.8932 | time 2.925 s| sample 0.582 | copy 0.100 | forward 0.059 | backward 1.997 | update 0.187 -[4] Epoch 00000 | Step 00000 | Train acc 0.0059 | Loss 5.8930 | time 2.933 s| sample 0.572 | copy 0.553 | forward 0.053 | backward 1.571 | update 0.183 -[7] Epoch 00000 | Step 00000 | Train acc 0.0020 | Loss 5.8976 | time 2.943 s| sample 0.823 | copy 0.089 | forward 0.053 | backward 1.802 | update 0.175 -[3] Epoch 00000 | Step 00000 | Train acc 0.0029 | Loss 5.9240 | time 3.022 s| sample 0.660 | copy 0.488 | forward 0.053 | backward 1.569 | update 0.252 -[1] Epoch 00000 | Step 00000 | Train acc 0.0020 | Loss 5.9289 | time 3.016 s| sample 1.648 | copy 0.119 | forward 0.052 | backward 0.921 | update 0.277 -[2] Epoch 00000 | Step 00000 | Train acc 0.0010 | Loss 5.9225 | time 3.039 s| sample 2.145 | copy 0.108 | forward 0.033 | backward 0.491 | update 0.261 -[0] Epoch 00000 | Step 00000 | Train acc 0.0010 | Loss 5.9157 | time 3.040 s| sample 1.169 | copy 0.917 | forward 0.055 | backward 0.604 | update 0.295 -[6] Epoch 00000 | Step 00020 | Train acc 0.2412 | Loss 3.2387 | time 46.698 s| sample 14.856 | copy 4.428 | forward 0.654 | backward 23.941 | update 2.819 -[5] Epoch 00000 | Step 00020 | Train acc 0.2539 | Loss 3.2034 | time 46.734 s| sample 16.540 | copy 4.590 | forward 0.685 | backward 22.167 | update 2.752 -[7] Epoch 00000 | Step 00020 | Train acc 0.2393 | Loss 3.2745 | time 46.723 s| sample 15.996 | copy 3.837 | forward 0.699 | backward 23.381 | update 2.811 -[1] Epoch 00000 | Step 00020 | Train acc 0.2852 | Loss 3.0824 | time 46.619 s| sample 19.973 | copy 8.788 | forward 0.681 | backward 13.997 | update 3.179 -[0] Epoch 00000 | Step 00020 | Train acc 0.3057 | Loss 3.0992 | 
time 46.612 s| sample 23.789 | copy 5.350 | forward 0.657 | backward 13.582 | update 3.233
[... remaining per-rank step logs for steps 00020-00060 elided; all eight trainers report similar timings ...]
-[5]Epoch Time(s): 177.6338, sample: 60.4674, data copy: 14.8044, forward: 2.4920, backward: 90.0968, update: 9.7730, #train: 78696, #input: 30054068
-[7]Epoch Time(s): 177.6308, sample: 68.8437, data copy: 12.3456, forward: 2.4491, backward: 84.3391, update: 9.6532, #train: 78696, #input: 30088778
-[4]Epoch Time(s): 177.6483, sample: 57.3434, data copy: 18.0550, forward: 2.4233, backward: 89.8727, update: 9.9538, #train: 78697, #input: 29936306
-[0]Epoch Time(s): 177.6141, sample: 84.0955, data copy: 22.4317, forward: 2.4212, backward: 57.5743, update: 11.0912, #train: 78697, #input: 34556524
-[1]Epoch Time(s): 177.6036, sample: 73.0517, data copy: 29.3354, forward: 2.5082, backward: 61.4990, update: 11.2092, #train: 78697, #input: 34605854
-[2]Epoch Time(s): 177.6336, sample: 93.5940, data copy: 19.8252, forward: 2.4786, backward: 50.4552, update: 11.2804, #train: 78696, #input: 34584592
-[3]Epoch Time(s): 177.6757, sample: 73.0354, data copy: 27.7802, forward: 2.4314, backward: 63.2740, update: 11.1546, #train: 78696, #input: 34579790
-[6]Epoch Time(s): 177.6165, sample: 44.9285, data copy: 25.8790, forward: 2.1825, backward: 96.4268, update: 8.1996, #train: 78696, #input: 30027068
[... evaluation tqdm progress bars elided ...]
-Val Acc 0.4618, Test Acc 0.4485, time: 16.9179
[... "Client[i] in group[0] is exiting..." is printed for all eight clients; duplicates elided ...]
-Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=False, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10')
[... the same Namespace dump precedes each server's log below; duplicates elided ...]
-[Server_2] Loaded ogbn-mag with use_graphbolt[False] in size[711.65234375 MB]
-Start to create specified graph formats which may take non-trivial time.
-Finished creating specified graph formats.
-start graph service on server 2 for part 1
-Server is waiting for connections on [172.31.8.229:30050]...
-Server (2) shutdown.
-Server is exiting...
-[Server_0] Loaded ogbn-mag with use_graphbolt[False] in size[701.08203125 MB]
-Start to create specified graph formats which may take non-trivial time.
-Finished creating specified graph formats.
-start graph service on server 0 for part 0
-Server is waiting for connections on [172.31.14.101:30050]...
-Server (0) shutdown.
-Server is exiting...
-start graph service on server 3 for part 1
-Server is waiting for connections on [172.31.8.229:30051]...
-Server (3) shutdown.
-Server is exiting...
[... identical Namespace dump elided ...]
-start graph service on server 1 for part 0
-Server is waiting for connections on [172.31.14.101:30051]...
-Server (1) shutdown.
-Server is exiting...
-cleanup process runs

diff --git a/examples/pytorch/rgcn/experimental/log_gb.txt b/examples/pytorch/rgcn/experimental/log_gb.txt
deleted file mode 100644
index eb6265e2aacb..000000000000
--- a/examples/pytorch/rgcn/experimental/log_gb.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-/home/ubuntu/workspace/dgl_2/tools/launch.py:148: DeprecationWarning: setDaemon() is deprecated, set the daemon attribute instead
-  thread.setDaemon(True)
-The number of OMP threads per trainer is set to 12
-Namespace(batch_size=1024, conf_path=None, dataset='ogbn-mag', dgl_sparse=False, dropout=0.5, eval_batch_size=1024, fanout='25,10', graph_name='ogbn-mag', graphbolt=True, id=None, ip_config='/home/ubuntu/workspace/ip_config.txt', l2norm=0, layer_norm=True, local_rank=None, log_every=20, low_mem=True, lr=0.01, n_bases=2, n_epochs=1, n_hidden=64, n_layers=2, num_gpus=-1, relabel=False, sparse_embedding=True, sparse_lr=0.06, standalone=False, use_self_loop=True, validation_fanout='25,10')
-Using GraphBolt
[... the same Namespace dump and "Using GraphBolt" line are printed once per trainer (8 in total); duplicates elided ...]
-Client [174058] waits on 172.31.8.229:58369
-Client [174059] waits on 172.31.8.229:57779
-Client [174060] waits on 172.31.8.229:33865
-Client [174061] waits on 172.31.8.229:33205
-Client [1882912] waits on 172.31.14.101:35629
-Client [1882909] waits on 172.31.14.101:43017
-Client [1882910] waits on 172.31.14.101:34859
-Client [1882911] waits on 172.31.14.101:33033
-Machine (0) group (0) client (0) connect to server successfuly!
[... the same line is printed for clients (1)-(7) on machines (0) and (1); duplicates elided ...]
-rank: 0
-rank: 7
-rank: 2
-rank: 6
-rank: 5
-rank: 3
-rank: 4
-rank: 1
-part 6, train: 78696 (local: 78696), val: 8110 (local: 7735), test: 5242 (local: 5072)
-part 7, train: 78696 (local: 78696), val: 8109 (local: 7738), test: 5242 (local: 5092)
-part 0, train: 78697 (local: 76971), val: 8110 (local: 8110), test: 5243 (local: 5243)
-part 3, train: 78696 (local: 76932), val: 8110 (local: 8110), test: 5242 (local: 5242)
-part 5, train: 78696 (local: 78696), val: 8110 (local: 7714), test: 5242 (local: 5108)
-part 4, train: 78697 (local: 78697), val: 8110 (local: 7708), test: 5243 (local: 5072)
-part 2, train: 78696 (local: 77008), val: 8110 (local: 8110), test: 5242 (local: 5242)
-part 1, train: 78697 (local: 76875), val: 8110 (local: 8110), test: 5243 (local: 5243)
-#classes: 349
-node paper has data feat
[... the two lines above are printed once per trainer (8 in total); duplicates elided ...]
-optimize Pytorch sparse embedding: ModuleDict(
-  (author): Embedding(1134649, 64, sparse=True)
-  (field_of_study): Embedding(59965, 64, sparse=True)
-  (institution): Embedding(8740, 64, sparse=True)
-)
-optimize dense projection: ModuleDict(
-  (paper): Linear(in_features=128, out_features=64, bias=True)
-)
-start training...
[... the optimizer setup above is printed once per trainer (8 in total); duplicates elided ...]
-[4] Epoch 00000 | Step 00000 | Train acc 0.0020 | Loss 5.8798 | time 0.442 s| sample 0.070 | copy 0.053 | forward 0.051 | backward 0.187 | update 0.081
-[6] Epoch 00000 | Step 00000 | Train acc 0.0020 | Loss 5.8951 | time 0.658 s| sample 0.076 | copy 0.053 | forward 0.053 | backward 0.381 | update 0.096
-[1] Epoch 00000 | Step 00000 | Train acc 0.0049 | Loss 5.8799 | time 0.656 s| sample 0.075 | copy 0.073 | forward 0.054 | backward 0.343 | update 0.111
-[5] Epoch 00000 | Step 00000 | Train acc 0.0020 | Loss 5.8777 | time 0.875 s| sample 0.082 | copy 0.049 | forward 0.056 | backward 0.591 | update 0.096
-[2] Epoch 00000 | Step 00000 | Train acc 0.0059 | Loss 5.8630 | time 0.452 s| sample 0.077 | copy 0.056 | forward 0.032 | backward 0.184 | update 0.103
-[0] Epoch 00000 | Step 00000 | Train acc 0.0059 | Loss 5.8723 | time 0.914 s| sample 0.077 | copy 0.057 | forward 0.071 | backward 0.598 | update 0.111
-[7] Epoch 00000 | Step 00000 | Train acc 0.0020 | Loss 5.8806 | time 0.823 s| sample 0.079 | copy 0.064 | forward 0.051 | backward 0.536 | update 0.092
-[3] Epoch 00000 | Step 00000 | Train acc 0.0059 | Loss 5.8844 | time 0.843 s| sample 0.075 | copy 0.061 | forward 0.054 | backward 0.538 | update 0.115
[... per-rank step logs for steps 00020-00060 elided; all eight trainers report similar timings ...]
-[4]Epoch Time(s): 32.4171, sample: 4.9020, data copy: 3.6422, forward: 1.9722, backward: 18.8446, update: 3.0559, #train: 78697, #input: 16666112
-[0]Epoch Time(s): 32.8553, sample: 5.6930, data copy: 4.7914, forward: 2.3189, backward: 15.8580, update: 4.1937, #train: 78697, #input: 18370334
-[1]Epoch Time(s): 32.5985, sample: 5.6200, data copy: 4.8302, forward: 2.3978, backward: 15.1472, update: 4.6032, #train: 78697, #input: 18381034
-[7]Epoch Time(s): 32.7594, sample: 5.0083, data copy: 3.9177, forward: 2.1759, backward: 17.6220, update: 4.0355, #train: 78696, #input: 16686254
-[3]Epoch Time(s): 32.7923, sample: 5.4970, data copy: 4.9976, forward: 2.4069, backward: 15.4529, update: 4.4377, #train: 78696, #input: 18370936
-[2]Epoch Time(s): 32.4160, sample: 5.4193, data copy: 4.7859, forward: 2.2797, backward: 15.5506, update: 4.3803, #train: 78696, #input: 18378296
-[5]Epoch Time(s): 32.8164, sample: 5.3067, data copy: 3.9895, forward: 2.2805, backward: 16.5132, update: 4.7265, #train: 78696, #input: 16678362
-[6]Epoch Time(s): 32.6269, sample: 5.0835, data copy: 3.9453, forward: 2.1211, backward: 17.3780, update: 4.0990, #train: 78696, #input: 16674232
[... evaluation tqdm progress bars elided ...]
-Val Acc 0.3901, Test Acc 0.3844, time: 2.2284
[... "Client[i] in group[0] is exiting..." is printed for all eight clients; duplicates elided ...]
[... identical Namespace dump elided ...]
-Using GraphBolt
-[Server_2] Loaded ogbn-mag with use_graphbolt[True] in size[240.7109375 MB]
-start graph service on server 2 for part 1
-Server is waiting for connections on [172.31.8.229:30050]...
-Server (2) shutdown.
-Server is exiting...
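The headline numbers from the two deleted logs above: per epoch, sampling drops from roughly 69s to 5.3s and the epoch time from ~177.6s to ~32.6s across the eight trainers, while accuracy stays in the same range. A throwaway way to aggregate the pasted lines (an editor's sketch for reading the logs, not part of the patch):

```
import re

# Parse the "[rank]Epoch Time(s): ..." lines from a pasted log and average
# the per-rank epoch and sampling times.
pattern = re.compile(r"\[(\d+)\]Epoch Time\(s\): ([\d.]+), sample: ([\d.]+)")

def summarize(log_text):
    epoch, sample = [], []
    for match in pattern.finditer(log_text):
        epoch.append(float(match.group(2)))
        sample.append(float(match.group(3)))
    return sum(epoch) / len(epoch), sum(sample) / len(sample)

# With the numbers above: the DistDGL run averages ~177.6s per epoch
# (~69s of it sampling); the GraphBolt run ~32.6s per epoch (~5.3s sampling).
```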
[... identical Namespace dumps and "Using GraphBolt" lines for the remaining servers elided ...]
-[Server_0] Loaded ogbn-mag with use_graphbolt[True] in size[235.8046875 MB]
-start graph service on server 0 for part 0
-Server is waiting for connections on [172.31.14.101:30050]...
-Server (0) shutdown.
-Server is exiting...
-start graph service on server 3 for part 1
-Server is waiting for connections on [172.31.8.229:30051]...
-Server (3) shutdown.
-Server is exiting...
-start graph service on server 1 for part 0
-Server is waiting for connections on [172.31.14.101:30051]...
-Server (1) shutdown.
-Server is exiting...
-cleanup process runs

diff --git a/python/dgl/distributed/dist_graph.py b/python/dgl/distributed/dist_graph.py
index f06d1bb41576..649ee3dbcb31 100644
--- a/python/dgl/distributed/dist_graph.py
+++ b/python/dgl/distributed/dist_graph.py
@@ -102,8 +102,15 @@ def __setstate__(self, state):

     def process_request(self, server_state):
         if server_state.graph is None:
+            gb_metadata = None
+            if self._use_graphbolt:
+                gpb = server_state.partition_book
+                gb_metadata = gb.GraphMetadata(
+                    {ntype: i for i, ntype in enumerate(gpb.ntypes)},
+                    {gb.etype_tuple_to_str(etype): i for i, etype in enumerate(gpb.canonical_etypes)},
+                )
             server_state.graph = _get_graph_from_shared_mem(
-                self._graph_name, self._use_graphbolt
+                self._graph_name, self._use_graphbolt, gb_metadata
             )
         return InitGraphResponse(self._graph_name)

@@ -187,7 +194,7 @@ def _exist_shared_mem_array(graph_name, name):
     return exist_shared_mem_array(_get_edata_path(graph_name, name))

-def _get_graph_from_shared_mem(graph_name, use_graphbolt):
+def _get_graph_from_shared_mem(graph_name, use_graphbolt, gb_metadata):
    """Get the graph from the DistGraph server.

    The DistGraph server puts the graph structure of the local partition in the shared memory.
@@ -195,15 +202,7 @@ through shared memory to reduce the overhead of data access.
    """
    if use_graphbolt:
-        metadata = gb.GraphMetadata(
-            {'author': 0, 'field_of_study': 1, 'institution': 2, 'paper': 3},
-            {'author:affiliated_with:institution': 0, 'author:writes:paper': 1,
-             'field_of_study:rev-has_topic:paper': 2,
-             'institution:rev-affiliated_with:author': 3, 'paper:cites:paper': 4,
-             'paper:has_topic:field_of_study': 5, 'paper:rev-cites:paper': 6,
-             'paper:rev-writes:author': 7}
-        )
-        g = gb.load_from_shared_memory(graph_name, metadata)
+        g = gb.load_from_shared_memory(graph_name, gb_metadata)
        return g

    g, ntypes, etypes = heterograph_index.create_heterograph_from_shared_memory(
@@ -686,10 +685,16 @@ def _init(self, gpb, use_graphbolt):
        assert (
            self._client is not None
        ), "Distributed module is not initialized. Please call dgl.distributed.initialize."
-        self._g = _get_graph_from_shared_mem(self.graph_name, use_graphbolt)
        self._gpb = get_shared_mem_partition_book(self.graph_name)
        if self._gpb is None:
            self._gpb = gpb
+        gb_metadata = None
+        if use_graphbolt:
+            gb_metadata = gb.GraphMetadata(
+                {ntype: i for i, ntype in enumerate(self._gpb.ntypes)},
+                {gb.etype_tuple_to_str(etype): i for i, etype in enumerate(self._gpb.canonical_etypes)},
+            )
+        self._g = _get_graph_from_shared_mem(self.graph_name, use_graphbolt, gb_metadata)
        self._client.map_shared_data(self._gpb)

    def _init_ndata_store(self):

From 438d8ca4b304a20da3e3a7dca28bbf552856025c Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Thu, 26 Oct 2023 08:56:55 +0000
Subject: [PATCH 19/30] ------ LAUNCH DistDGL with GraphBolt on heterograph
 -------

---
 examples/pytorch/rgcn/experimental/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/rgcn/experimental/README.md b/examples/pytorch/rgcn/experimental/README.md
index f3fb75660084..5edb825fe08e 100644
--- a/examples/pytorch/rgcn/experimental/README.md
+++ b/examples/pytorch/rgcn/experimental/README.md
@@ -1,4 +1,4 @@
-## DistDGL with GraphBolt partitions and sampling
+## DistDGL with GraphBolt (Heterograph only)

 ### How to partition graph

From 6d5d24026a63760df9a755e6754db50e9b89d730 Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Fri, 27 Oct 2023 08:51:23 +0000
Subject: [PATCH 20/30] standalone with use_graphbolt is not supported

---
 python/dgl/distributed/dist_graph.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/python/dgl/distributed/dist_graph.py b/python/dgl/distributed/dist_graph.py
index 649ee3dbcb31..2489d86daca3 100644
--- a/python/dgl/distributed/dist_graph.py
+++ b/python/dgl/distributed/dist_graph.py
@@ -2,8 +2,6 @@
 import gc

 import psutil
-#import tracemalloc
-#from pympler import asizeof
 import os
 from collections import namedtuple
 from collections.abc import MutableMapping
@@ -392,7 +390,6 @@ def __init__(
         graph_format=("csc", "coo"),
         use_graphbolt=False,
     ):
-        #tracemalloc.start()
         super(DistGraphServer, self).__init__(
             server_id=server_id,
             ip_config=ip_config,
@@ -410,7 +407,6 @@ def __init__(
                 self.client_g = None
             else:
                 # Loading of node/edge_feats are deferred to lower the peak memory consumption.
-                #snapshot1 = tracemalloc.take_snapshot()
                 prev_rss = psutil.Process(os.getpid()).memory_info().rss
                 (
                     self.client_g,
@@ -426,18 +422,8 @@ def __init__(
                     load_feats=False,
                     use_graphbolt=use_graphbolt,
                 )
-                #print(f"[Server_{self.server_id}] Loaded {graph_name} with use_graphbolt[{use_graphbolt}].")
                 new_rss = psutil.Process(os.getpid()).memory_info().rss
                 print(f"[Server_{self.server_id}] Loaded {graph_name} with use_graphbolt[{use_graphbolt}] in size[{(new_rss - prev_rss)/1024/1024} MB]")
-                #print(f"[Server_{self.server_id}] Loaded {graph_name} with use_graphbolt[{use_graphbolt}] in size[{asizeof.asizeof(self.client_g)}]")
-                '''
-                snapshot2 = tracemalloc.take_snapshot()
-                top_stats = snapshot2.compare_to(snapshot1, "lineno")
-                print(f"[Server_{self.server_id}][ Top 10 differences ]")
-                for stat in top_stats[:10]:
-                    print(f"[Server_{self.server_id}]: {stat}")
-                tracemalloc.stop()
-                '''
                 if not use_graphbolt:
                     # formatting dtype
                     # TODO(Rui) Formatting forcely is not a perfect solution.
@@ -618,6 +604,7 @@ def __init__(
         self.graph_name = graph_name
         self._use_graphbolt = use_graphbolt
         if os.environ.get("DGL_DIST_MODE", "standalone") == "standalone":
+            assert not use_graphbolt, "GraphBolt is not supported in standalone mode."
             assert (
                 part_config is not None
             ), "When running in the standalone model, the partition config file is required"

From a99b19294ad8022d0eb852710ba7164495e71a3b Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Mon, 30 Oct 2023 07:41:31 +0000
Subject: [PATCH 21/30] format dtype when converting to CSCSamplingGraph

---
 graphbolt/src/csc_sampling_graph.cc           |  5 +-
 python/dgl/distributed/graph_services.py      |  3 +-
 python/dgl/distributed/partition.py           | 94 ++++++++++++++++---
 .../dgl/graphbolt/impl/csc_sampling_graph.py  |  7 +-
 python/dgl/graphbolt/impl/neighbor_sampler.py |  2 +-
 tests/distributed/test_partition.py           |  8 +-
 6 files changed, 96 insertions(+), 23 deletions(-)

diff --git a/graphbolt/src/csc_sampling_graph.cc b/graphbolt/src/csc_sampling_graph.cc
index 26cf6303123e..af28e4bcbda7 100644
--- a/graphbolt/src/csc_sampling_graph.cc
+++ b/graphbolt/src/csc_sampling_graph.cc
@@ -357,7 +357,8 @@ c10::intrusive_ptr CSCSamplingGraph::SampleNeighborsImpl(
   auto num_picked_neighbors_data_ptr = num_picked_neighbors_per_node.data_ptr();
   num_picked_neighbors_data_ptr[0] = 0;
-  const auto nodes_data_ptr = nodes.data_ptr();
+  //const auto nodes_data_ptr = nodes.data_ptr();
+  const auto nodes_data_ptr = nodes.data_ptr();

   // Step 1. Calculate pick number of each node.
   torch::parallel_for(
@@ -378,7 +379,7 @@

   // Step 2. Calculate prefix sum to get total length and offsets of each
   // node. It's also the indptr of the generated subgraph.
-  subgraph_indptr = torch::cumsum(num_picked_neighbors_per_node, 0);
+  subgraph_indptr = torch::cumsum(num_picked_neighbors_per_node, 0).to(indptr_.dtype());

   // Step 3. Allocate the tensor for picked neighbors.
   const auto total_length =

diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py
index 26aa6aef64b0..ff6d429357a2 100644
--- a/python/dgl/distributed/graph_services.py
+++ b/python/dgl/distributed/graph_services.py
@@ -610,6 +610,7 @@ def _frontier_to_heterogeneous_graph_gb(g, frontier, gpb):
     # For GraphBolt, we store ETYPE into EID field.
     etype_ids = frontier.edata[EID]
     src, dst = frontier.edges()
+    src, dst = F.astype(src, g.idtype), F.astype(dst, g.idtype)
     etype_ids, idx = F.sort_1d(etype_ids)
     src, dst = F.gather_row(src, idx), F.gather_row(dst, idx)
     src_ntype_ids, src = gpb.map_to_per_ntype(src)
@@ -713,7 +714,7 @@ def sample_etype_neighbors(
         A sampled subgraph containing only the sampled neighboring edges.  It is on CPU.
     """
     if isinstance(fanout, int):
-        fanout = F.full_1d(len(g.canonical_etypes), fanout, F.int64, F.cpu())
+        fanout = F.full_1d(len(g.canonical_etypes), fanout, F.int32, F.cpu())
     else:
         etype_ids = {etype: i for i, etype in enumerate(g.canonical_etypes)}
         fanout_array = [None] * len(g.canonical_etypes)

diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py
index a3b6b3f86006..a34854f1ce50 100644
--- a/python/dgl/distributed/partition.py
+++ b/python/dgl/distributed/partition.py
@@ -1273,6 +1273,7 @@ def convert_dgl_partition_to_csc_sampling_graph(
     store_ntypes=False,
     store_etypes=True,
     store_metadata=True,
+    graph_file_name=None,
 ):
     """Convert partitions of dgl to CSCSamplingGraph of GraphBolt.

@@ -1297,6 +1298,9 @@
     store_etypes : bool, optional
         Whether to store edge types in the new graph.
     store_metadata : bool, optional
        Whether to store metadata in the new graph.
+    graph_file_name : str, optional
+        The name of the new graph file. If not provided, the name will be
+        `csc_sampling_graph.tar`.
     """
     # As only this function requires GraphBolt for now, let's import here.
     from .. import graphbolt
@@ -1336,11 +1340,6 @@ def init_type_per_edge(graph, gpb):
     if store_etypes:
         type_per_edge = init_type_per_edge(graph, gpb)
         type_per_edge = type_per_edge[edge_ids]
-        type_per_edge = type_per_edge.to(RESERVED_FIELD_DTYPE[ETYPE])
-        if len(etypes) < 128:
-            type_per_edge = type_per_edge.to(torch.int8)
-        elif len(etypes) < 32768:
-            type_per_edge = type_per_edge.to(torch.int16)
         # Sanity check.
         assert len(type_per_edge) == graph.num_edges()
@@ -1350,7 +1349,7 @@
         # Sanity check.
         assert len(graph.ndata[NID]) == graph.num_nodes()
         node_attributes = {
-            NID: graph.ndata[NID].to(RESERVED_FIELD_DTYPE[NID])
+            NID: graph.ndata[NID]
         }

     # Original edge IDs.
@@ -1359,15 +1358,13 @@
         # Sanity check.
         assert len(graph.edata[EID]) == graph.num_edges()
         edge_attributes = {
-            EID: graph.edata[EID][edge_ids].to(RESERVED_FIELD_DTYPE[EID])
+            EID: graph.edata[EID][edge_ids]
         }

     if store_ntypes:
         if node_attributes is None:
             node_attributes = {}
-        node_attributes[NTYPE] = graph.ndata[NTYPE].to(
-            RESERVED_FIELD_DTYPE[NTYPE]
-        )
+        node_attributes[NTYPE] = graph.ndata[NTYPE]

     if store_etypes:
         # [Rui] Let's store as edge attributes for now.
         if edge_attributes is None:
             edge_attributes = {}
         edge_attributes[ETYPE] = type_per_edge

+    # Data type formatting before saving.
+    num_nodes = part_meta["num_nodes"]
+    num_edges = part_meta["num_edges"]
+    local_num_nodes = graph.num_nodes()
+    local_num_edges = graph.num_edges()
+    # 1. csc matrix. [Required]
+    if local_num_nodes < torch.iinfo(torch.int32).max:
+        indices = indices.to(torch.int32)
+    else:
+        indices = indices.to(torch.int64)
+    if local_num_edges < torch.iinfo(torch.int32).max:
+        indptr = indptr.to(torch.int32)
+    else:
+        indptr = indptr.to(torch.int64)
+    # 2. NID. [Required]
+    if node_attributes is not None and NID in node_attributes:
+        if num_nodes < torch.iinfo(torch.int32).max:
+            node_attributes[NID] = node_attributes[NID].to(torch.int32)
+        else:
+            node_attributes[NID] = node_attributes[NID].to(torch.int64)
+    # 3. ETYPE. [Required].
+    # [TODO] `type_per_edge` and edge_attributes[ETYPE] are duplicated.
+    if type_per_edge is not None:
+        if len(etypes) < torch.iinfo(torch.int8).max:
+            type_per_edge = type_per_edge.to(torch.int8)
+        elif len(etypes) < torch.iinfo(torch.int16).max:
+            type_per_edge = type_per_edge.to(torch.int16)
+        elif len(etypes) < torch.iinfo(torch.int32).max:
+            type_per_edge = type_per_edge.to(torch.int32)
+        else:
+            type_per_edge = type_per_edge.to(torch.int64)
+    if edge_attributes is not None and ETYPE in edge_attributes:
+        if len(etypes) < torch.iinfo(torch.int8).max:
+            edge_attributes[ETYPE] = edge_attributes[ETYPE].to(torch.int8)
+        elif len(etypes) < torch.iinfo(torch.int16).max:
+            edge_attributes[ETYPE] = edge_attributes[ETYPE].to(torch.int16)
+        elif len(etypes) < torch.iinfo(torch.int32).max:
+            edge_attributes[ETYPE] = edge_attributes[ETYPE].to(torch.int32)
+        else:
+            edge_attributes[ETYPE] = edge_attributes[ETYPE].to(torch.int64)
+    # 4. NTYPE. [Optional]
+    if node_attributes is not None and NTYPE in node_attributes:
+        if len(ntypes) < torch.iinfo(torch.int8).max:
+            node_attributes[NTYPE] = node_attributes[NTYPE].to(torch.int8)
+        elif len(ntypes) < torch.iinfo(torch.int16).max:
+            node_attributes[NTYPE] = node_attributes[NTYPE].to(torch.int16)
+        elif len(ntypes) < torch.iinfo(torch.int32).max:
+            node_attributes[NTYPE] = node_attributes[NTYPE].to(torch.int32)
+        else:
+            node_attributes[NTYPE] = node_attributes[NTYPE].to(torch.int64)
+    # 5. EID. [Optional]
+    if edge_attributes is not None and EID in edge_attributes:
+        if num_edges < torch.iinfo(torch.int32).max:
+            edge_attributes[EID] = edge_attributes[EID].to(torch.int32)
+        else:
+            edge_attributes[EID] = edge_attributes[EID].to(torch.int64)

     # Construct CSCSamplingGraph
     csc_graph = graphbolt.from_csc(
         indptr,
@@ -1389,8 +1459,10 @@
         os.path.dirname(part_config),
         part_meta[f"part-{part_id}"]["part_graph"],
     )
+    if graph_file_name is None:
+        graph_file_name = "csc_sampling_graph.tar"
     csc_graph_path = os.path.join(
-        os.path.dirname(orig_graph_path), "csc_sampling_graph.tar"
+        os.path.dirname(orig_graph_path), graph_file_name
     )
     graphbolt.save_csc_sampling_graph(csc_graph, csc_graph_path)

diff --git a/python/dgl/graphbolt/impl/csc_sampling_graph.py b/python/dgl/graphbolt/impl/csc_sampling_graph.py
index b7167cb0db92..c9e4878c0dd2 100644
--- a/python/dgl/graphbolt/impl/csc_sampling_graph.py
+++ b/python/dgl/graphbolt/impl/csc_sampling_graph.py
@@ -573,10 +573,6 @@ def _sample_neighbors(
         """
         # Ensure nodes is 1-D tensor.
         self._check_sampler_arguments(nodes, fanouts, probs_name)
-        has_original_nids = (
-            self.node_attributes is not None
-            and ORIGINAL_NODE_ID in self.node_attributes
-        )
         has_original_eids = (
             self.edge_attributes is not None
             and ORIGINAL_EDGE_ID in self.edge_attributes
         )
         has_original_etype_ids=(
             self.edge_attributes is not None
             and ETYPE in self.edge_attributes
         )
+        # [Rui] Formatting to avoid `RuntimeError: expected scalar type Int but found Long`.
+        nodes = nodes.to(self.indices.dtype)
+        fanouts = fanouts.to(self.indices.dtype)
         return self._c_csc_graph.sample_neighbors(
             nodes,
             fanouts.tolist(),

diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py
index ca8cb0473757..2b780e1a8bf8 100644
--- a/python/dgl/graphbolt/impl/neighbor_sampler.py
+++ b/python/dgl/graphbolt/impl/neighbor_sampler.py
@@ -103,7 +103,7 @@ def __init__(
     @staticmethod
     def distributed_sample_neighbor(graph, seeds, fanouts):
         if isinstance(fanouts, int):
-            fanouts = torch.LongTensor([fanouts])
+            fanouts = torch.IntTensor([fanouts])
         assert isinstance(fanouts, torch.Tensor), f"Invalid fanouts: {fanouts}"
         subgraph = graph.sample_neighbors(seeds, fanouts, keep_homo=True)
         src_nodes, dst_nodes = subgraph.node_pairs

diff --git a/tests/distributed/test_partition.py b/tests/distributed/test_partition.py
index 583b3d8e7120..f2a7a99bc27b 100644
--- a/tests/distributed/test_partition.py
+++ b/tests/distributed/test_partition.py
@@ -1115,21 +1115,21 @@ def test_partition_hetero_graphbolt_sample_neighbors(
     assert edge_feats == {}

     # sample_neighbors()
-    subg = part_g.sample_neighbors(th.arange(10), th.LongTensor([-1]), keep_homo=True)
+    subg = part_g.sample_neighbors(th.arange(10), th.IntTensor([-1]), keep_homo=True)
     src, dst = subg.node_pairs
     orig_src = part_g.node_attributes[dgl.NID][src]
     orig_dst = part_g.node_attributes[dgl.NID][dst]
     orig_ntype_src = part_g.node_attributes[dgl.NTYPE][src]
     orig_ntype_dst = part_g.node_attributes[dgl.NTYPE][dst]
     etype_ids = subg.original_etype_ids
-    orig_eids = part_g.edge_attributes[dgl.EID][subg.original_edge_ids]
+    orig_eids = part_g.edge_attributes[dgl.EID].to(hg.idtype)[subg.original_edge_ids]
     etype_idsA, _ = gpb.map_to_per_etype(orig_eids)
     assert th.equal(etype_ids, etype_idsA), "etype_ids is not expected."

     etype_ids, idx = F.sort_1d(etype_ids)
     sorted_orig_src, sorted_orig_dst = F.gather_row(orig_src, idx), F.gather_row(orig_dst, idx)
-    src_ntype_ids, ntype_wised_src = gpb.map_to_per_ntype(sorted_orig_src)
-    dst_ntype_ids, ntype_wised_dst = gpb.map_to_per_ntype(sorted_orig_dst)
+    src_ntype_ids, ntype_wised_src = gpb.map_to_per_ntype(sorted_orig_src.to(hg.idtype))
+    dst_ntype_ids, ntype_wised_dst = gpb.map_to_per_ntype(sorted_orig_dst.to(hg.idtype))

     data_dict = dict()
     print("gpb.canonical_etypes: ", gpb.canonical_etypes)

From c0099a17fbcb2e20d110226da8c9d22580eeda0a Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Mon, 30 Oct 2023 08:01:17 +0000
Subject: [PATCH 22/30] update README about partition size

---
 examples/pytorch/rgcn/experimental/README.md | 27 +++++++++++++-------
 python/dgl/distributed/partition.py          | 11 +++++++-
 2 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/examples/pytorch/rgcn/experimental/README.md b/examples/pytorch/rgcn/experimental/README.md
index 5edb825fe08e..a9e47e59aa6d 100644
--- a/examples/pytorch/rgcn/experimental/README.md
+++ b/examples/pytorch/rgcn/experimental/README.md
@@ -25,15 +25,24 @@ dgl.distributed.convert_dgl_partition_to_csc_sampling_graph(
 `csc_sampling_graph.tar` is the GraphBolt partitions.
 `graph.dgl` is the original DistDGL partitions, namely, DGLGraph.
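Most of the size gap between `graph.dgl` and `csc_sampling_graph.tar` reported below comes from keeping only the CSC structure and, after the dtype-formatting change in PATCH 21 above, narrowing each index array to the smallest integer type that fits. A back-of-envelope estimate for one ogbn-mag partition (an editor's illustration with rounded counts, not code from the patch):

```
# Rough on-disk size of one ogbn-mag CSC partition after dtype narrowing.
local_num_nodes = 950_000     # ~1.9M nodes split over 2 parts
local_num_edges = 21_000_000  # ~42M edges split over 2 parts

INT32, INT8 = 4, 1
indices = local_num_edges * INT32        # neighbor IDs fit in int32
indptr = (local_num_nodes + 1) * INT32   # offsets fit in int32
type_per_edge = local_num_edges * INT8   # 8 edge types fit in int8
nid = local_num_nodes * INT32            # original node IDs

total_mb = (indices + indptr + type_per_edge + nid) / 1024 ** 2
print(f"{total_mb:.0f} MB")  # ~107 MB, the same ballpark as the tarballs below
```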
-```
--rw-rw-r-- 1 ubuntu ubuntu 231M Oct 26 01:51 data/part0/csc_sampling_graph.tar
--rw-rw-r-- 1 ubuntu ubuntu   24 Oct 26 01:51 data/part0/edge_feat.dgl
--rw-rw-r-- 1 ubuntu ubuntu 701M Oct 26 01:51 data/part0/graph.dgl
--rw-rw-r-- 1 ubuntu ubuntu 182M Oct 26 01:51 data/part0/node_feat.dgl
--rw-rw-r-- 1 ubuntu ubuntu 235M Oct 26 01:51 data/part1/csc_sampling_graph.tar
--rw-rw-r-- 1 ubuntu ubuntu   24 Oct 26 01:51 data/part1/edge_feat.dgl
--rw-rw-r-- 1 ubuntu ubuntu 711M Oct 26 01:51 data/part1/graph.dgl
--rw-rw-r-- 1 ubuntu ubuntu 187M Oct 26 01:51 data/part1/node_feat.dgl
+###### ogbn-mag
+heterogeneous, ~1.9M nodes, ~42M edges (reverse edges are added), 4 ntypes, 8 etypes, 2 parts.
+
+| DGL(MB) | GraphBolt w/o EIDs(MB) | GraphBolt w/ EIDs(MB) |
+| ------- | ---------------------- | --------------------- |
+| 701/711 | 151/153                | 243/247               |
+
+```
+-rw-rw-r-- 1 ubuntu ubuntu 151M Oct 30 07:35 data/part0/csc_sampling_graph.tar
+-rw-rw-r-- 1 ubuntu ubuntu 243M Oct 30 07:48 data/part0/csc_sampling_graph_eids.tar
+-rw-rw-r-- 1 ubuntu ubuntu   24 Oct 30 07:35 data/part0/edge_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 701M Oct 30 07:35 data/part0/graph.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 182M Oct 30 07:35 data/part0/node_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 153M Oct 30 07:35 data/part1/csc_sampling_graph.tar
+-rw-rw-r-- 1 ubuntu ubuntu 247M Oct 30 07:48 data/part1/csc_sampling_graph_eids.tar
+-rw-rw-r-- 1 ubuntu ubuntu   24 Oct 30 07:35 data/part1/edge_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 711M Oct 30 07:35 data/part1/graph.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 187M Oct 30 07:35 data/part1/node_feat.dgl
 ```

 ### Train with GraphBolt partitions

diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py
index a34854f1ce50..5435fe8e60b3 100644
--- a/python/dgl/distributed/partition.py
+++ b/python/dgl/distributed/partition.py
@@ -1274,6 +1274,7 @@ def convert_dgl_partition_to_csc_sampling_graph(
     store_etypes=True,
     store_metadata=True,
     graph_file_name=None,
+    part_config_file_name=None,
 ):
     """Convert partitions of dgl to CSCSamplingGraph of GraphBolt.

@@ -1301,6 +1302,9 @@
     graph_file_name : str, optional
         The name of the new graph file. If not provided, the name will be
         `csc_sampling_graph.tar`.
+    part_config_file_name : str, optional
+        The name of the new partition configuration file. If not provided, the
+        name will be the passed-in one.
     """
     # As only this function requires GraphBolt for now, let's import here.
     from .. import graphbolt
@@ -1472,4 +1476,9 @@ def init_type_per_edge(graph, gpb):
     )

     # Update partition config.
-    _dump_part_config(part_config, new_part_meta)
+    if part_config_file_name is None:
+        part_config_file_name = os.path.basename(part_config)
+    new_part_config = os.path.join(
+        os.path.dirname(part_config), part_config_file_name
+    )
+    _dump_part_config(new_part_config, new_part_meta)

From ebb6cead8d62ad3b60c1e366042303cc18a2d0fd Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Mon, 30 Oct 2023 08:44:07 +0000
Subject: [PATCH 23/30] clean up unnecessary return edge types

---
 examples/pytorch/rgcn/experimental/README.md  |  6 ++--
 python/dgl/distributed/partition.py           |  6 ----
 .../dgl/graphbolt/impl/csc_sampling_graph.py  | 29 +++++--------------
 tests/distributed/test_partition.py           | 13 ++++-----
 4 files changed, 17 insertions(+), 37 deletions(-)

diff --git a/examples/pytorch/rgcn/experimental/README.md b/examples/pytorch/rgcn/experimental/README.md
index a9e47e59aa6d..764e818b00fa 100644
--- a/examples/pytorch/rgcn/experimental/README.md
+++ b/examples/pytorch/rgcn/experimental/README.md
@@ -30,15 +30,15 @@ heterogeneous, ~1.9M nodes, ~42M edges (reverse edges are added), 4 ntypes, 8 etypes, 2 parts.

 | DGL(MB) | GraphBolt w/o EIDs(MB) | GraphBolt w/ EIDs(MB) |
 | ------- | ---------------------- | --------------------- |
-| 701/711 | 151/153                | 243/247               |
+| 701/711 | 128/129                | 243/247               |

 ```
--rw-rw-r-- 1 ubuntu ubuntu 151M Oct 30 07:35 data/part0/csc_sampling_graph.tar
+-rw-rw-r-- 1 ubuntu ubuntu 128M Oct 30 08:30 data/part0/csc_sampling_graph.tar
 -rw-rw-r-- 1 ubuntu ubuntu 243M Oct 30 07:48 data/part0/csc_sampling_graph_eids.tar
 -rw-rw-r-- 1 ubuntu ubuntu   24 Oct 30 07:35 data/part0/edge_feat.dgl
 -rw-rw-r-- 1 ubuntu ubuntu 701M Oct 30 07:35 data/part0/graph.dgl
 -rw-rw-r-- 1 ubuntu ubuntu 182M Oct 30 07:35 data/part0/node_feat.dgl
--rw-rw-r-- 1 ubuntu ubuntu 153M Oct 30 07:35 data/part1/csc_sampling_graph.tar
+-rw-rw-r-- 1 ubuntu ubuntu 129M Oct 30 08:30 data/part1/csc_sampling_graph.tar
 -rw-rw-r-- 1 ubuntu ubuntu 247M Oct 30 07:48 data/part1/csc_sampling_graph_eids.tar
 -rw-rw-r-- 1 ubuntu ubuntu   24 Oct 30 07:35 data/part1/edge_feat.dgl
 -rw-rw-r-- 1 ubuntu ubuntu 711M Oct 30 07:35 data/part1/graph.dgl
 -rw-rw-r-- 1 ubuntu ubuntu 187M Oct 30 07:35 data/part1/node_feat.dgl

diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py
index 5435fe8e60b3..3909c5292826 100644
--- a/python/dgl/distributed/partition.py
+++ b/python/dgl/distributed/partition.py
@@ -1370,12 +1370,6 @@ def init_type_per_edge(graph, gpb):
             node_attributes = {}
         node_attributes[NTYPE] = graph.ndata[NTYPE]

-    if store_etypes:
-        # [Rui] Let's store as edge attributes for now.
-        if edge_attributes is None:
-            edge_attributes = {}
-        edge_attributes[ETYPE] = type_per_edge
-

diff --git a/python/dgl/graphbolt/impl/csc_sampling_graph.py b/python/dgl/graphbolt/impl/csc_sampling_graph.py
index c9e4878c0dd2..57acd0e3a80b 100644
--- a/python/dgl/graphbolt/impl/csc_sampling_graph.py
+++ b/python/dgl/graphbolt/impl/csc_sampling_graph.py
@@ -360,19 +360,6 @@ def _convert_to_sampled_subgraph(
                 original_edge_ids
             ]

-        # [Rui] Extract ETYPEs from edge attributes.
-        original_etype_ids = None
-        has_original_etype_ids = (
-            self.edge_attributes is not None
-            and ETYPE in self.edge_attributes
-        )
-        if has_original_etype_ids:
-            assert original_edge_ids is not None, "original_edge_ids is None."
-            original_etype_ids = self.edge_attributes[ETYPE][
-                original_edge_ids
-            ]
-        assert original_etype_ids is not None, "original_etype_ids is None"

         if type_per_edge is None or keep_homo:
             # The sampled graph is already a homogeneous graph.
             node_pairs = (row, column)
@@ -397,7 +384,7 @@
             original_edge_ids = original_hetero_edge_ids
         return SampledSubgraphImpl(
             node_pairs=node_pairs,
             original_edge_ids=original_edge_ids,
-            original_etype_ids=original_etype_ids,
+            original_etype_ids=type_per_edge,
         )

     def _convert_to_homogeneous_nodes(self, nodes):
@@ -414,6 +401,7 @@ def sample_neighbors(
         replace: bool = False,
         probs_name: Optional[str] = None,
         keep_homo: bool = False,
+        return_orig_edge_ids: bool = False,
     ) -> SampledSubgraphImpl:
         """Sample neighboring edges of the given nodes and return the induced subgraph.

@@ -482,10 +470,12 @@
             nodes = self._convert_to_homogeneous_nodes(nodes)

         C_sampled_subgraph = self._sample_neighbors(
-            nodes, fanouts, replace, probs_name
+            nodes, fanouts, replace, probs_name, return_orig_edge_ids
         )

-        return self._convert_to_sampled_subgraph(C_sampled_subgraph, keep_homo=keep_homo)
+        return self._convert_to_sampled_subgraph(
+            C_sampled_subgraph, keep_homo=keep_homo,
+        )

     def _check_sampler_arguments(self, nodes, fanouts, probs_name):
         assert nodes.dim() == 1, "Nodes should be 1-D tensor."
@@ -531,6 +521,7 @@ def _sample_neighbors(
         fanouts: torch.Tensor,
         replace: bool = False,
         probs_name: Optional[str] = None,
+        return_orig_edge_ids: bool = False,
     ) -> torch.ScriptObject:
         """Sample neighboring edges of the given nodes and return the induced subgraph.

@@ -577,10 +568,6 @@
             self.edge_attributes is not None
             and ORIGINAL_EDGE_ID in self.edge_attributes
         )
-        has_original_etype_ids=(
-            self.edge_attributes is not None
-            and ETYPE in self.edge_attributes
-        )
         # [Rui] Formatting to avoid `RuntimeError: expected scalar type Int but found Long`.
         nodes = nodes.to(self.indices.dtype)
         fanouts = fanouts.to(self.indices.dtype)
@@ -589,7 +576,7 @@
             fanouts.tolist(),
             replace,
             False,
-            has_original_eids or has_original_etype_ids,
+            has_original_eids or return_orig_edge_ids,
             probs_name,
         )

diff --git a/tests/distributed/test_partition.py b/tests/distributed/test_partition.py
index f2a7a99bc27b..af429643dda9 100644
--- a/tests/distributed/test_partition.py
+++ b/tests/distributed/test_partition.py
@@ -728,10 +728,9 @@ def test_convert_dgl_partition_to_csc_sampling_graph_homo(
             orig_g.edata[dgl.EID][orig_eids], new_g.edge_attributes[dgl.EID]
         )
     else:
-        if not store_etypes:
-            assert new_g.edge_attributes is None
+        assert new_g.edge_attributes is None
     if store_etypes:
-        assert th.all(0 == new_g.edge_attributes[dgl.ETYPE])
+        assert th.all(0 == new_g.type_per_edge)
     else:
         assert new_g.type_per_edge is None
     if store_metadata:
@@ -799,10 +798,9 @@ def test_convert_dgl_partition_to_csc_sampling_graph_hetero(
             orig_g.edata[dgl.EID][orig_eids], new_g.edge_attributes[dgl.EID]
         )
     else:
-        if not store_etypes:
-            assert new_g.edge_attributes is None
+        assert new_g.edge_attributes is None
     if store_etypes:
-        assert th.equal(orig_g.edata[dgl.ETYPE][orig_eids], new_g.edge_attributes[dgl.ETYPE])
+        assert th.equal(orig_g.edata[dgl.ETYPE][orig_eids], new_g.type_per_edge)
     else:
         assert new_g.type_per_edge is None
     if store_metadata:
@@ -1115,7 +1113,8 @@ def test_partition_hetero_graphbolt_sample_neighbors(
     assert edge_feats == {}

     # sample_neighbors()
-    subg = part_g.sample_neighbors(th.arange(10), th.IntTensor([-1]), keep_homo=True)
+    subg = part_g.sample_neighbors(th.arange(10), th.IntTensor([-1]), keep_homo=True,
+                                   return_orig_edge_ids=True)
     src, dst = subg.node_pairs
     orig_src = part_g.node_attributes[dgl.NID][src]
     orig_dst = part_g.node_attributes[dgl.NID][dst]

From be3f7f2a5112e0ef9f766eeb927237e6af67fa96 Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Mon, 30 Oct 2023 08:46:37 +0000
Subject: [PATCH 24/30] update graph partition size with eids

---
 examples/pytorch/rgcn/experimental/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/pytorch/rgcn/experimental/README.md b/examples/pytorch/rgcn/experimental/README.md
index 764e818b00fa..eb3b8443f3e1 100644
--- a/examples/pytorch/rgcn/experimental/README.md
+++ b/examples/pytorch/rgcn/experimental/README.md
@@ -30,16 +30,16 @@ heterogeneous, ~1.9M nodes, ~42M edges (reverse edges are added), 4 ntypes, 8 etypes, 2 parts.

 | DGL(MB) | GraphBolt w/o EIDs(MB) | GraphBolt w/ EIDs(MB) |
 | ------- | ---------------------- | --------------------- |
-| 701/711 | 128/129                | 243/247               |
+| 701/711 | 128/129                | 220/223               |

 ```
 -rw-rw-r-- 1 ubuntu ubuntu 128M Oct 30 08:30 data/part0/csc_sampling_graph.tar
--rw-rw-r-- 1 ubuntu ubuntu 243M Oct 30 07:48 data/part0/csc_sampling_graph_eids.tar
+-rw-rw-r-- 1 ubuntu ubuntu 220M Oct 30 08:45 data/part0/csc_sampling_graph_eids.tar
 -rw-rw-r-- 1 ubuntu ubuntu   24 Oct 30 07:35 data/part0/edge_feat.dgl
 -rw-rw-r-- 1 ubuntu ubuntu 701M Oct 30 07:35 data/part0/graph.dgl
 -rw-rw-r-- 1 ubuntu ubuntu 182M Oct 30 07:35 data/part0/node_feat.dgl
 -rw-rw-r-- 1 ubuntu ubuntu 129M Oct 30 08:30 data/part1/csc_sampling_graph.tar
--rw-rw-r-- 1 ubuntu ubuntu 247M Oct 30 07:48 data/part1/csc_sampling_graph_eids.tar
+-rw-rw-r-- 1 ubuntu ubuntu 223M Oct 30 08:45 data/part1/csc_sampling_graph_eids.tar
 -rw-rw-r-- 1 ubuntu ubuntu   24 Oct 30 07:35 data/part1/edge_feat.dgl
 -rw-rw-r-- 1 ubuntu ubuntu 711M Oct 30 07:35 data/part1/graph.dgl
 -rw-rw-r-- 1 ubuntu ubuntu 187M Oct 30 07:35 data/part1/node_feat.dgl
From 591f60eb9a1a419af835b31b94c35404121abdd8 Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Tue, 31 Oct 2023 04:55:58 +0000
Subject: [PATCH 25/30] [WAHAHA] node classification on homogeneous graph with
 GraphBolt is ready

---
 examples/distributed/graphsage/README.md      |  80 ++++++
 examples/distributed/graphsage/dgl_cmd.sh     |  10 +
 examples/distributed/graphsage/gb_cmd.sh      |  10 +
 .../graphsage/node_classification.py          |  22 +-
 .../distributed/graphsage/partition_graph.py  |   6 +
 python/dgl/dataloading/neighbor_sampler.py    |   7 +-
 python/dgl/distributed/dist_graph.py          |   6 +
 python/dgl/distributed/graph_services.py      |  94 +++++--
 python/dgl/distributed/partition.py           | 121 ++++-----
 .../dgl/graphbolt/impl/csc_sampling_graph.py  |   8 +-
 python/dgl/graphbolt/impl/neighbor_sampler.py |   4 +-
 tests/distributed/test_partition.py           | 248 +++++++++++-------
 12 files changed, 416 insertions(+), 200 deletions(-)
 create mode 100644 examples/distributed/graphsage/dgl_cmd.sh
 create mode 100644 examples/distributed/graphsage/gb_cmd.sh

diff --git a/examples/distributed/graphsage/README.md b/examples/distributed/graphsage/README.md
index 69035175d14f..cfa01ae0e267 100644
--- a/examples/distributed/graphsage/README.md
+++ b/examples/distributed/graphsage/README.md
@@ -1,3 +1,83 @@
+## DistDGL with GraphBolt (Homograph + Node Classification)
+
+### How to partition graph
+
+#### Partition from original dataset with `dgl.distributed.partition_graph()`
+
+```
+DGL_HOME=/home/ubuntu/workspace/dgl_2 DGL_LIBRARY_PATH=$DGL_HOME/build PYTHONPATH=tests:$DGL_HOME/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 examples/distributed/graphsage/partition_graph.py --dataset ogbn-products --num_parts 2 --balance_train --balance_edges --graphbolt
+```
+
+#### Convert existing partitions into GraphBolt formats
+
+```
+DGL_LIBRARY_PATH=$DGL_HOME/build PYTHONPATH=tests:$DGL_HOME/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 -c "from dgl.distributed import convert_dgl_partition_to_csc_sampling_graph as f;f('data/ogbn-products.json')"
+```
+
+#### Partition sizes compared between GraphBolt and DistDGL
+
+`csc_sampling_graph.tar` is the GraphBolt partitions.
+`graph.dgl` is the original DistDGL partitions, namely, DGLGraph.
+
+###### ogbn-products
+homogeneous, ~2.4M nodes, ~123.7M edges (reverse edges are added), 2 parts.
+
+| DGL(GB) | GraphBolt w/o EIDs(MB) | GraphBolt w/ EIDs(MB) |
+| ------- | ---------------------- | --------------------- |
+| 1.6/1.7 | 258/272                | 502/530               |
+
+```
+-rw-rw-r-- 1 ubuntu ubuntu 258M Oct 31 01:56 homo_data/part0/csc_sampling_graph.tar
+-rw-rw-r-- 1 ubuntu ubuntu 502M Oct 31 04:45 homo_data/part0/csc_sampling_graph_eids.tar
+-rw-rw-r-- 1 ubuntu ubuntu   24 Oct 31 00:51 homo_data/part0/edge_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 1.6G Oct 31 00:51 homo_data/part0/graph.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 501M Oct 31 00:51 homo_data/part0/node_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 272M Oct 31 01:56 homo_data/part1/csc_sampling_graph.tar
+-rw-rw-r-- 1 ubuntu ubuntu 530M Oct 31 04:45 homo_data/part1/csc_sampling_graph_eids.tar
+-rw-rw-r-- 1 ubuntu ubuntu   24 Oct 31 00:51 homo_data/part1/edge_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 1.7G Oct 31 00:51 homo_data/part1/graph.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 460M Oct 31 00:51 homo_data/part1/node_feat.dgl
+```
+
+### Train with GraphBolt partitions
+Just append `--graphbolt`.
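Concretely, the flag only threads `use_graphbolt` through initialization on the trainer side; a condensed sketch of the `node_classification.py` changes that appear at the end of this patch (the full launch command follows):

```
import dgl
import torch as th

def setup(args):
    # Both the RPC layer and the local graph access need to know that the
    # partitions are CSCSamplingGraph instead of DGLGraph.
    dgl.distributed.initialize(args.ip_config, use_graphbolt=args.graphbolt)
    th.distributed.init_process_group(backend=args.backend)
    g = dgl.distributed.DistGraph(
        args.graph_name,
        part_config=args.part_config,
        use_graphbolt=args.graphbolt,
    )
    return g
```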
+
+```
+python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
+    --workspace /home/ubuntu/workspace/dgl_2/examples/distributed/graphsage/ \
+    --num_trainers 4 \
+    --num_servers 2 \
+    --num_samplers 0 \
+    --part_config /home/ubuntu/workspace/dgl_2/homo_data/ogbn-products.json \
+    --ip_config /home/ubuntu/workspace/ip_config.txt \
+    "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 node_classification.py --graph_name ogbn-products --ip_config /home/ubuntu/workspace/ip_config.txt --num_epochs 3 --eval_every 2 --graphbolt"
+```
+
+#### Results
+`g4dn.metal` x 2, `ogbn-products`.
+
+DistDGL with GraphBolt takes less time for sampling (from **1.8283s** to **1.4470s**) and for the whole epoch (from **4.9259s** to **4.4898s**), while keeping comparable validation and test accuracies.
+
+##### DistDGL
+
+```
+Part 0, Epoch Time(s): 4.9648, sample+data_copy: 1.8283, forward: 0.2912, backward: 1.1307, update: 0.0232, #seeds: 24577, #inputs: 4136843
+
+Summary of node classification(GraphSAGE): GraphName ogbn-products | TrainEpochTime(mean) 4.9259 | TestAccuracy 0.6213
+```
+
+##### DistDGL with GraphBolt
+
+```
+Part 0, Epoch Time(s): 4.4826, sample+data_copy: 1.4470, forward: 0.2517, backward: 0.9081, update: 0.0175, #seeds: 24577, #inputs: 4136980
+
+Summary of node classification(GraphSAGE): GraphName ogbn-products | TrainEpochTime(mean) 4.4898 | TestAccuracy 0.6174
+```
+
+---------------------------------------
+
+
 ## Distributed training
 
 This is an example of training GraphSage in a distributed fashion. Before training, please install some python libs by pip:

diff --git a/examples/distributed/graphsage/dgl_cmd.sh b/examples/distributed/graphsage/dgl_cmd.sh
new file mode 100644
index 000000000000..1d3eaa64d019
--- /dev/null
+++ b/examples/distributed/graphsage/dgl_cmd.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
+    --workspace /home/ubuntu/workspace/dgl_2/examples/distributed/graphsage/ \
+    --num_trainers 4 \
+    --num_servers 2 \
+    --num_samplers 0 \
+    --part_config /home/ubuntu/workspace/dgl_2/homo_data/ogbn-products.json \
+    --ip_config /home/ubuntu/workspace/ip_config.txt \
+    "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 node_classification.py --graph_name ogbn-products --ip_config /home/ubuntu/workspace/ip_config.txt --num_epochs 3 --eval_every 2"
diff --git a/examples/distributed/graphsage/gb_cmd.sh b/examples/distributed/graphsage/gb_cmd.sh
new file mode 100644
index 000000000000..ede8c051d59e
--- /dev/null
+++ b/examples/distributed/graphsage/gb_cmd.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
+    --workspace /home/ubuntu/workspace/dgl_2/examples/distributed/graphsage/ \
+    --num_trainers 4 \
+    --num_servers 2 \
+    --num_samplers 0 \
+    --part_config /home/ubuntu/workspace/dgl_2/homo_data/ogbn-products.json \
+    --ip_config /home/ubuntu/workspace/ip_config.txt \
+    "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 node_classification.py --graph_name ogbn-products --ip_config /home/ubuntu/workspace/ip_config.txt --num_epochs 3 --eval_every 2 --graphbolt"
diff --git a/examples/distributed/graphsage/node_classification.py b/examples/distributed/graphsage/node_classification.py
index 
0b11e356635b..120e15bfaf88 100644 --- a/examples/distributed/graphsage/node_classification.py +++ b/examples/distributed/graphsage/node_classification.py @@ -66,7 +66,7 @@ def forward(self, blocks, x): h = self.dropout(h) return h - def inference(self, g, x, batch_size, device): + def inference(self, g, x, batch_size, device, use_graphbolt): """ Distributed layer-wise inference with the GraphSAGE model on full neighbors. @@ -116,6 +116,7 @@ def inference(self, g, x, batch_size, device): batch_size=batch_size, shuffle=False, drop_last=False, + use_graphbolt=use_graphbolt, ) for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader): @@ -155,7 +156,7 @@ def compute_acc(pred, labels): return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred) -def evaluate(model, g, inputs, labels, val_nid, test_nid, batch_size, device): +def evaluate(model, g, inputs, labels, val_nid, test_nid, batch_size, device, use_graphbolt): """ Evaluate the model on the validation and test set. @@ -187,7 +188,7 @@ def evaluate(model, g, inputs, labels, val_nid, test_nid, batch_size, device): """ model.eval() with th.no_grad(): - pred = model.inference(g, inputs, batch_size, device) + pred = model.inference(g, inputs, batch_size, device, use_graphbolt) model.train() return compute_acc(pred[val_nid], labels[val_nid]), compute_acc( pred[test_nid], labels[test_nid] @@ -219,6 +220,7 @@ def run(args, device, data): batch_size=args.batch_size, shuffle=True, drop_last=False, + use_graphbolt=args.graphbolt, ) model = DistSAGE( in_feats, @@ -325,6 +327,7 @@ def run(args, device, data): test_nid, args.batch_size_eval, device, + args.graphbolt, ) print( f"Part {g.rank()}, Val Acc {val_acc:.4f}, " @@ -338,13 +341,16 @@ def main(args): """ Main function. """ + if args.graphbolt: + print("DistDGL with GraphBolt...") host_name = socket.gethostname() print(f"{host_name}: Initializing DistDGL.") - dgl.distributed.initialize(args.ip_config) + dgl.distributed.initialize(args.ip_config, use_graphbolt=args.graphbolt) print(f"{host_name}: Initializing PyTorch process group.") th.distributed.init_process_group(backend=args.backend) print(f"{host_name}: Initializing DistGraph.") - g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config) + g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config, + use_graphbolt=args.graphbolt) print(f"Rank of {host_name}: {g.rank()}") # Split train/val/test IDs for each trainer. 
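
The `use_graphbolt` plumbing above reaches the dataloader in `run()`. A minimal sketch of that call, assuming the example's `dgl.dataloading.DistNodeDataLoader` and its CLI argument names; only the `use_graphbolt` keyword is new:

```
sampler = dgl.dataloading.NeighborSampler(
    [int(fanout) for fanout in args.fan_out.split(",")]
)
dataloader = dgl.dataloading.DistNodeDataLoader(
    g,
    train_nid,
    sampler,
    batch_size=args.batch_size,
    shuffle=True,
    drop_last=False,
    use_graphbolt=args.graphbolt,  # route sampling through GraphBolt
)
```
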
@@ -415,6 +421,12 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Distributed GraphSAGE.") + parser.add_argument( + "--graphbolt", + default=False, + action="store_true", + help="train with GraphBolt", + ) parser.add_argument("--graph_name", type=str, help="graph name") parser.add_argument( "--ip_config", type=str, help="The file for IP configuration" diff --git a/examples/distributed/graphsage/partition_graph.py b/examples/distributed/graphsage/partition_graph.py index 3c6e4b7e8fd5..95919377be94 100644 --- a/examples/distributed/graphsage/partition_graph.py +++ b/examples/distributed/graphsage/partition_graph.py @@ -59,6 +59,11 @@ def load_ogb(name, root="dataset"): argparser.add_argument( "--part_method", type=str, default="metis", help="the partition method" ) + argparser.add_argument( + "--graphbolt", + action="store_true", + help="convert DGL to GraphBolt partitions.", + ) argparser.add_argument( "--balance_train", action="store_true", @@ -127,4 +132,5 @@ def load_ogb(name, root="dataset"): balance_ntypes=balance_ntypes, balance_edges=args.balance_edges, num_trainers_per_machine=args.num_trainers_per_machine, + use_graphbolt=args.graphbolt, ) diff --git a/python/dgl/dataloading/neighbor_sampler.py b/python/dgl/dataloading/neighbor_sampler.py index e81d5a393594..e5a9e48c82b4 100644 --- a/python/dgl/dataloading/neighbor_sampler.py +++ b/python/dgl/dataloading/neighbor_sampler.py @@ -193,9 +193,12 @@ def sample_blocks(self, g, seed_nodes, exclude_eids=None, use_graphbolt=False): exclude_edges=exclude_eids, use_graphbolt=use_graphbolt, ) - eid = frontier.edata[EID] + eid = None + if EID in frontier.edata: + eid = frontier.edata[EID] block = to_block(frontier, seed_nodes) - block.edata[EID] = eid + if eid is not None: + block.edata[EID] = eid seed_nodes = block.srcdata[NID] blocks.insert(0, block) diff --git a/python/dgl/distributed/dist_graph.py b/python/dgl/distributed/dist_graph.py index 2489d86daca3..7abf0c9ed8d5 100644 --- a/python/dgl/distributed/dist_graph.py +++ b/python/dgl/distributed/dist_graph.py @@ -818,6 +818,12 @@ def idtype(self): int """ # TODO(da?): describe when self._g is None and idtype shouldn't be called. + ''' + if isinstance(self.local_partition, DGLGraph): + return self.local_partition.idtype + else: + return self.local_partition.indices.dtype + ''' return F.int64 @property diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py index ff6d429357a2..ed72ec5b5b29 100644 --- a/python/dgl/distributed/graph_services.py +++ b/python/dgl/distributed/graph_services.py @@ -68,7 +68,8 @@ def __getstate__(self): def _sample_neighbors( - local_g, partition_book, seed_nodes, fan_out, edge_dir, prob, replace + local_g, partition_book, seed_nodes, fan_out, edge_dir, prob, replace, + use_graphbolt=False ): """Sample from local partition. @@ -78,23 +79,37 @@ def _sample_neighbors( and edge IDs. 
""" local_ids = partition_book.nid2localnid(seed_nodes, partition_book.partid) - local_ids = F.astype(local_ids, local_g.idtype) - # local_ids = self.seed_nodes - sampled_graph = local_sample_neighbors( - local_g, - local_ids, - fan_out, - edge_dir, - prob, - replace, - _dist_training=True, - ) - global_nid_mapping = local_g.ndata[NID] - src, dst = sampled_graph.edges() + if not use_graphbolt: + local_ids = F.astype(local_ids, local_g.idtype) + local_src, local_dst, local_eids = None, None, None + if use_graphbolt: + local_src, local_dst, local_eids = gb.NeighborSampler.distributed_sample_neighbor( + local_g, local_ids, fan_out + ) + assert local_src is not None and local_dst is not None, ( + "GraphBolt NeighborSampler.distributed_sample_neighbor() failed." + ) + else: + sampled_graph = local_sample_neighbors( + local_g, + local_ids, + fan_out, + edge_dir, + prob, + replace, + _dist_training=True, + ) + local_src, local_dst = sampled_graph.edges() + local_eids = sampled_graph.edata[EID] + if use_graphbolt: + global_nid_mapping = local_g.node_attributes[NID] + global_eids = local_eids + else: + global_nid_mapping = local_g.ndata[NID] + global_eids = F.gather_row(local_g.edata[EID], local_eids) global_src, global_dst = F.gather_row( - global_nid_mapping, src - ), F.gather_row(global_nid_mapping, dst) - global_eids = F.gather_row(local_g.edata[EID], sampled_graph.edata[EID]) + global_nid_mapping, local_src + ), F.gather_row(global_nid_mapping, local_dst) return global_src, global_dst, global_eids @@ -230,12 +245,13 @@ def _in_subgraph(local_g, partition_book, seed_nodes): class SamplingRequest(Request): """Sampling Request""" - def __init__(self, nodes, fan_out, edge_dir="in", prob=None, replace=False): + def __init__(self, nodes, fan_out, edge_dir="in", prob=None, replace=False, use_graphbolt=False): self.seed_nodes = nodes self.edge_dir = edge_dir self.prob = prob self.replace = replace self.fan_out = fan_out + self.use_graphbolt = use_graphbolt def __setstate__(self, state): ( @@ -244,6 +260,7 @@ def __setstate__(self, state): self.prob, self.replace, self.fan_out, + self.use_graphbolt, ) = state def __getstate__(self): @@ -253,6 +270,7 @@ def __getstate__(self): self.prob, self.replace, self.fan_out, + self.use_graphbolt, ) def process_request(self, server_state): @@ -271,6 +289,7 @@ def process_request(self, server_state): self.edge_dir, prob, self.replace, + use_graphbolt=self.use_graphbolt, ) return SubgraphResponse(global_src, global_dst, global_eids) @@ -472,13 +491,17 @@ def merge_graphs(res_list, num_nodes): eids.append(res.global_eids) src_tensor = F.cat(srcs, 0) dst_tensor = F.cat(dsts, 0) - eid_tensor = F.cat(eids, 0) + if eids[0] is None: + eid_tensor = None + else: + eid_tensor = F.cat(eids, 0) else: src_tensor = res_list[0].global_src dst_tensor = res_list[0].global_dst eid_tensor = res_list[0].global_eids g = graph((src_tensor, dst_tensor), num_nodes=num_nodes) - g.edata[EID] = eid_tensor + if eid_tensor is not None: + g.edata[EID] = eid_tensor return g @@ -516,7 +539,8 @@ def _distributed_access(g, nodes, issue_remote_req, local_access, use_graphbolt= """ req_list = [] partition_book = g.get_partition_book() - nodes = toindex(nodes).tousertensor() + if not isinstance(nodes, torch.Tensor): + nodes = toindex(nodes).tousertensor() partition_id = partition_book.nid2partid(nodes) local_nids = None for pid in range(partition_book.num_partitions()): @@ -551,6 +575,10 @@ def _distributed_access(g, nodes, issue_remote_req, local_access, use_graphbolt= res_list.extend(results) 
sampled_graph = merge_graphs(res_list, g.num_nodes())
+
+    # [TODO][Rui] For now, g.idtype is always int64 while underlying CSCSamplingGraph could be int32.
+    if use_graphbolt:
+        sampled_graph = sampled_graph.long()
 
     return sampled_graph
 
@@ -607,10 +635,17 @@ def _frontier_to_heterogeneous_graph_gb(g, frontier, gpb):
         idtype=g.idtype,
     )
 
-    # For GraphBolt, we store ETYPE into EID field.
-    etype_ids = frontier.edata[EID]
     src, dst = frontier.edges()
     src, dst = F.astype(src, g.idtype), F.astype(dst, g.idtype)
+    if gpb.is_homogeneous:
+        assert frontier.edata[EID] is None, (
+            "For homogeneous graph in GraphBolt, EID field should be None."
+        )
+        etype_ids = torch.zeros(src.shape[0], dtype=torch.int32)
+        raise RuntimeError("Should not arrive here.")
+    else:
+        # For GraphBolt, we store ETYPE into EID field.
+        etype_ids = frontier.edata[EID]
     etype_ids, idx = F.sort_1d(etype_ids)
     src, dst = F.gather_row(src, idx), F.gather_row(dst, idx)
     src_ntype_ids, src = gpb.map_to_per_ntype(src)
@@ -642,7 +677,6 @@ def _frontier_to_heterogeneous_graph_gb(g, frontier, gpb):
     return hg
 
-
 def sample_etype_neighbors(
     g,
     nodes,
@@ -859,7 +893,6 @@ def sample_neighbors(
     DGLGraph
         A sampled subgraph containing only the sampled neighboring edges.  It is on CPU.
     """
-    assert not use_graphbolt, "GraphBolt is not supported in distributed mode."
     gpb = g.get_partition_book()
     if not gpb.is_homogeneous:
         assert isinstance(nodes, dict)
@@ -885,7 +918,8 @@ def issue_remote_req(node_ids):
         else:
             _prob = None
         return SamplingRequest(
-            node_ids, fanout, edge_dir=edge_dir, prob=_prob, replace=replace
+            node_ids, fanout, edge_dir=edge_dir, prob=_prob, replace=replace,
+            use_graphbolt=use_graphbolt,
         )
 
     def local_access(local_g, partition_book, local_nids):
@@ -899,11 +933,15 @@ def local_access(local_g, partition_book, local_nids):
             edge_dir,
             _prob,
             replace,
+            use_graphbolt=use_graphbolt,
        )
 
-    frontier = _distributed_access(g, nodes, issue_remote_req, local_access)
+    frontier = _distributed_access(g, nodes, issue_remote_req, local_access, use_graphbolt=use_graphbolt)
     if not gpb.is_homogeneous:
-        return _frontier_to_heterogeneous_graph(g, frontier, gpb)
+        if use_graphbolt:
+            return _frontier_to_heterogeneous_graph_gb(g, frontier, gpb)
+        else:
+            return _frontier_to_heterogeneous_graph(g, frontier, gpb)
     else:
         return frontier
 
diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py
index 3909c5292826..83e2adde1760 100644
--- a/python/dgl/distributed/partition.py
+++ b/python/dgl/distributed/partition.py
@@ -11,7 +11,7 @@
 import torch
 
 from .. import backend as F, graphbolt as gb
-from ..base import DGLError, EID, ETYPE, NID, NTYPE
+from ..base import dgl_warning, DGLError, EID, ETYPE, NID, NTYPE
 from ..convert import to_homogeneous
 from ..data.utils import load_graphs, load_tensors, save_graphs, save_tensors
 from ..heterograph import DGLGraph
@@ -745,8 +745,10 @@ def partition_graph(
         ``csc`` and ``csr``. If not specified, save one format only according to
         what format is available. If multiple formats are available, selection
         priority from high to low is ``coo``, ``csc``, ``csr``.
-    use_graphbolt : bool
+    use_graphbolt : bool, optional
         Whether to convert the partitioned graph to GraphBolt format.
+    gb_save_all : bool, optional
+        Whether to save all data into `CSCSamplingGraph`.
Returns
    -------
@@ -1247,19 +1249,10 @@ def get_homogeneous(g, balance_ntypes):
         )
 
     if use_graphbolt:
-        if gb_save_all:
-            convert_dgl_partition_to_csc_sampling_graph(
-                part_config,
-                store_orig_nids=True,
-                store_orig_eids=True,
-                store_ntypes=True,
-                store_etypes=True,
-                store_metadata=True,
-            )
-        else:
-            convert_dgl_partition_to_csc_sampling_graph(
-                part_config,
-            )
+        convert_dgl_partition_to_csc_sampling_graph(
+            part_config,
+            store_all=gb_save_all,
+        )
         print("Converted to GraphBolt format.")
 
     if return_mapping:
@@ -1268,13 +1261,10 @@ def get_homogeneous(g, balance_ntypes):
 
 def convert_dgl_partition_to_csc_sampling_graph(
     part_config,
-    store_orig_nids=True,
-    store_orig_eids=False,
-    store_ntypes=False,
-    store_etypes=True,
-    store_metadata=True,
+    store_eids=False,
     graph_file_name=None,
     part_config_file_name=None,
+    store_all=False,
 ):
     """Convert partitions of dgl to CSCSamplingGraph of GraphBolt.
 
@@ -1285,33 +1275,48 @@ def convert_dgl_partition_to_csc_sampling_graph(
     In the near future, partitions are supposed to be saved as
     `CSCSamplingGraph` directly. At that time, this API should be deprecated.
 
+    For a homogeneous graph, the following attributes are required:
+        dgl.NID: original node IDs, saved into `node_attributes`.
+
+    For a heterogeneous graph, the following attributes are required:
+        dgl.NID: original node IDs, saved into `node_attributes`.
+        dgl.ETYPE: original edge types, saved into `type_per_edge`.
+
+    If `store_eids` is True, the following attribute is additionally saved:
+        dgl.EID: original edge IDs, saved into `edge_attributes`.
+
     Parameters
     ----------
     part_config : str
         The partition configuration JSON file.
-    store_orig_nids : bool, optional
-        Whether to store original node IDs in the new graph.
-    store_orig_eids : bool, optional
+    store_eids : bool, optional
         Whether to store original edge IDs in the new graph.
-    store_ntypes : bool, optional
-        Whether to store node types in the new graph.
-    store_etypes : bool, optional
-        Whether to store edge types in the new graph.
-    store_metadata : bool, optional
-        Whether to store metadata in the new graph.
     graph_file_name : str, optional
         The name of the new graph file. If not provided, the name will be
         `csc_sampling_graph.tar`.
     part_config_file_name : str, optional
        The name of the new partition configuration file. If not provided, the
        name will be the passed-in one.
+    store_all : bool, optional
+        Whether to store all attributes in the new graph. If False, only
+        required attributes will be stored.
     """
     # As only this function requires GraphBolt for now, let's import here.
     from .. import graphbolt
 
+    if store_all:
+        dgl_warning(
+            "Storing all attributes in the new graph is not recommended."
+        )
+    if store_eids:
+        dgl_warning("Storing edge IDs is not supported yet.")
+
     part_meta = _load_part_config(part_config)
     new_part_meta = deepcopy(part_meta)
     num_parts = part_meta["num_parts"]
+    p_ntypes = part_meta["ntypes"]
+    p_etypes = part_meta["etypes"]
+    is_homo = (
+        len(p_ntypes) == 1
+        and DEFAULT_NTYPE in p_ntypes
+        and len(p_etypes) == 1
+        and DEFAULT_ETYPE in p_etypes
+    )
 
     # Utility functions.
     def init_type_per_edge(graph, gpb):
@@ -1330,7 +1335,7 @@ def init_type_per_edge(graph, gpb):
         # graph.
         _, _, ntypes, etypes = load_partition_book(part_config, part_id)
         metadata = None
-        if store_metadata:
+        if not is_homo:
             # Construct GraphMetadata.
             c_etypes = {
                 graphbolt.etype_tuple_to_str(etype): v
@@ -1341,33 +1346,31 @@
         indptr, indices, edge_ids = graph.adj_tensors("csc")  # graph.adj().csc()
 
         # Initialize type per edge.
type_per_edge = None - if store_etypes: + if not is_homo: type_per_edge = init_type_per_edge(graph, gpb) type_per_edge = type_per_edge[edge_ids] # Sanity check. assert len(type_per_edge) == graph.num_edges() - # Original node IDs. + # Original node IDs. [Required] node_attributes = None - if store_orig_nids: - # Sanity check. - assert len(graph.ndata[NID]) == graph.num_nodes() - node_attributes = { - NID: graph.ndata[NID] - } + # Sanity check. + assert len(graph.ndata[NID]) == graph.num_nodes() + node_attributes = { + NID: graph.ndata[NID] + } - # Original edge IDs. + # Original edge IDs. [Optional] edge_attributes = None - if store_orig_eids: + if store_eids or store_all: # Sanity check. assert len(graph.edata[EID]) == graph.num_edges() edge_attributes = { EID: graph.edata[EID][edge_ids] } - if store_ntypes: - if node_attributes is None: - node_attributes = {} + # Storing NTYPE is mainly for debug. + if store_all and (not is_homo): node_attributes[NTYPE] = graph.ndata[NTYPE] # Data type formatting before saving. @@ -1385,13 +1388,14 @@ def init_type_per_edge(graph, gpb): else: indptr = indptr.to(torch.int64) # 2. NID. [Required] - if node_attributes is not None and NID in node_attributes: - if num_nodes < torch.iinfo(torch.int32).max: - node_attributes[NID] = node_attributes[NID].to(torch.int32) - else: - node_attributes[NID] = node_attributes[NID].to(torch.int64) - # 3. ETYPE. [Required]. - # [TODO] `type_per_edge` and edge_attributes[ETYPE] are duplicated. + assert node_attributes is not None and NID in node_attributes, ( + "NID is required for GraphBolt." + ) + if num_nodes < torch.iinfo(torch.int32).max: + node_attributes[NID] = node_attributes[NID].to(torch.int32) + else: + node_attributes[NID] = node_attributes[NID].to(torch.int64) + # 3. ETYPE. [Required for heterograph]. if type_per_edge is not None: if len(etypes) < torch.iinfo(torch.int8).max: type_per_edge = type_per_edge.to(torch.int8) @@ -1401,23 +1405,6 @@ def init_type_per_edge(graph, gpb): type_per_edge = type_per_edge.to(torch.int32) else: type_per_edge = type_per_edge.to(torch.int64) - if edge_attributes is not None and ETYPE in edge_attributes: - if len(etypes) < torch.iinfo(torch.int8).max: - edge_attributes[ETYPE] = edge_attributes[ETYPE].to( - torch.int8 - ) - elif len(etypes) < torch.iinfo(torch.int16).max: - edge_attributes[ETYPE] = edge_attributes[ETYPE].to( - torch.int16 - ) - elif len(etypes) < torch.iinfo(torch.int32).max: - edge_attributes[ETYPE] = edge_attributes[ETYPE].to( - torch.int32 - ) - else: - edge_attributes[ETYPE] = edge_attributes[ETYPE].to( - torch.int64 - ) # 4. NTYPE. [Optional] if node_attributes is not None and NTYPE in node_attributes: if len(ntypes) < torch.iinfo(torch.int8).max: diff --git a/python/dgl/graphbolt/impl/csc_sampling_graph.py b/python/dgl/graphbolt/impl/csc_sampling_graph.py index 57acd0e3a80b..95748607953f 100644 --- a/python/dgl/graphbolt/impl/csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/csc_sampling_graph.py @@ -401,7 +401,7 @@ def sample_neighbors( replace: bool = False, probs_name: Optional[str] = None, keep_homo: bool = False, - return_orig_edge_ids: bool = False, + return_eids: bool = False, ) -> SampledSubgraphImpl: """Sample neighboring edges of the given nodes and return the induced subgraph. 
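
The rename below (`return_orig_edge_ids` → `return_eids`) only touches the keyword at call sites. A minimal sketch of the updated call, mirroring the tests later in this patch and assuming `part_g` is a loaded `CSCSamplingGraph`:

```
import torch as th

# Sample all neighbors (fanout -1) of the first ten nodes, keep the result
# homogeneous, and request the original edge IDs of the sampled edges.
subg = part_g.sample_neighbors(
    th.arange(10),
    th.IntTensor([-1]),
    keep_homo=True,
    return_eids=True,
)
src, dst = subg.node_pairs
orig_eids = subg.original_edge_ids
```
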
@@ -470,7 +470,7 @@ def sample_neighbors( nodes = self._convert_to_homogeneous_nodes(nodes) C_sampled_subgraph = self._sample_neighbors( - nodes, fanouts, replace, probs_name, return_orig_edge_ids + nodes, fanouts, replace, probs_name, return_eids ) return self._convert_to_sampled_subgraph( @@ -521,7 +521,7 @@ def _sample_neighbors( fanouts: torch.Tensor, replace: bool = False, probs_name: Optional[str] = None, - return_orig_edge_ids: bool = False, + return_eids: bool = False, ) -> torch.ScriptObject: """Sample neighboring edges of the given nodes and return the induced subgraph. @@ -576,7 +576,7 @@ def _sample_neighbors( fanouts.tolist(), replace, False, - has_original_eids or return_orig_edge_ids, + has_original_eids or return_eids, probs_name, ) diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index 2b780e1a8bf8..179cd4aea0db 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -108,7 +108,9 @@ def distributed_sample_neighbor(graph, seeds, fanouts): subgraph = graph.sample_neighbors(seeds, fanouts, keep_homo=True) src_nodes, dst_nodes = subgraph.node_pairs etype_ids = subgraph.original_etype_ids - assert src_nodes.shape == dst_nodes.shape == etype_ids.shape, f"Shape mismatch: {src_nodes.shape}, {dst_nodes.shape}, {etype_ids.shape}" + assert src_nodes.shape == dst_nodes.shape, f"Shape mismatch: {src_nodes.shape}, {dst_nodes.shape}" + if etype_ids is not None: + assert src_nodes.shape == etype_ids.shape, f"Shape mismatch: {src_nodes.shape}, {etype_ids.shape}" return src_nodes, dst_nodes, etype_ids def _sample_subgraphs(self, seeds): diff --git a/tests/distributed/test_partition.py b/tests/distributed/test_partition.py index af429643dda9..f5bc4395c29e 100644 --- a/tests/distributed/test_partition.py +++ b/tests/distributed/test_partition.py @@ -680,17 +680,13 @@ def test_UnknownPartitionBook(): @pytest.mark.parametrize("part_method", ["metis", "random"]) @pytest.mark.parametrize("num_parts", [1, 4]) -@pytest.mark.parametrize("store_orig_nids", [True, False]) -@pytest.mark.parametrize("store_orig_eids", [True, False]) -@pytest.mark.parametrize("store_etypes", [True, False]) -@pytest.mark.parametrize("store_metadata", [True, False]) +@pytest.mark.parametrize("store_eids", [True, False]) +@pytest.mark.parametrize("store_all", [True, False]) def test_convert_dgl_partition_to_csc_sampling_graph_homo( part_method, num_parts, - store_orig_nids, - store_orig_eids, - store_etypes, - store_metadata, + store_eids, + store_all, ): with tempfile.TemporaryDirectory() as test_dir: g = create_random_graph(1000) @@ -701,10 +697,8 @@ def test_convert_dgl_partition_to_csc_sampling_graph_homo( part_config = os.path.join(test_dir, f"{graph_name}.json") convert_dgl_partition_to_csc_sampling_graph( part_config, - store_orig_nids=store_orig_nids, - store_orig_eids=store_orig_eids, - store_etypes=store_etypes, - store_metadata=store_metadata, + store_eids=store_eids, + store_all=store_all, ) for part_id in range(num_parts): orig_g = dgl.load_graphs( @@ -717,51 +711,31 @@ def test_convert_dgl_partition_to_csc_sampling_graph_homo( assert th.equal(orig_indptr, new_g.csc_indptr) assert th.equal(orig_indices, new_g.indices) assert new_g.node_type_offset is None - if store_orig_nids: - assert th.equal( - orig_g.ndata[dgl.NID], new_g.node_attributes[dgl.NID] - ) - else: - assert new_g.node_attributes is None - if store_orig_eids: + assert th.equal( + orig_g.ndata[dgl.NID], new_g.node_attributes[dgl.NID] 
+ ) + assert dgl.NTYPE not in new_g.node_attributes + if store_eids or store_all: assert th.equal( orig_g.edata[dgl.EID][orig_eids], new_g.edge_attributes[dgl.EID] ) else: assert new_g.edge_attributes is None - if store_etypes: - assert th.all(0 == new_g.type_per_edge) - else: - assert new_g.type_per_edge is None - if store_metadata: - for ( - node_type, - type_id, - ) in new_g.metadata.node_type_to_id.items(): - assert g.get_ntype_id(node_type) == type_id - for ( - edge_type, - type_id, - ) in new_g.metadata.edge_type_to_id.items(): - edge_type = gb.etype_str_to_tuple(edge_type) - assert g.get_etype_id(edge_type) == type_id - else: - assert new_g.metadata is None + # For homogeneous graph, ETYPE is not stored. + assert new_g.type_per_edge is None + # For homogeneous graph, metadata is not stored. + assert new_g.metadata is None @pytest.mark.parametrize("part_method", ["metis", "random"]) @pytest.mark.parametrize("num_parts", [1, 4]) -@pytest.mark.parametrize("store_orig_nids", [True, False]) -@pytest.mark.parametrize("store_orig_eids", [True, False]) -@pytest.mark.parametrize("store_etypes", [True, False]) -@pytest.mark.parametrize("store_metadata", [True, False]) +@pytest.mark.parametrize("store_eids", [True, False]) +@pytest.mark.parametrize("store_all", [True, False]) def test_convert_dgl_partition_to_csc_sampling_graph_hetero( part_method, num_parts, - store_orig_nids, - store_orig_eids, - store_etypes, - store_metadata, + store_eids, + store_all, ): with tempfile.TemporaryDirectory() as test_dir: g = create_random_hetero() @@ -772,10 +746,8 @@ def test_convert_dgl_partition_to_csc_sampling_graph_hetero( part_config = os.path.join(test_dir, f"{graph_name}.json") convert_dgl_partition_to_csc_sampling_graph( part_config, - store_orig_nids=store_orig_nids, - store_orig_eids=store_orig_eids, - store_etypes=store_etypes, - store_metadata=store_metadata, + store_eids=store_eids, + store_all=store_all, ) for part_id in range(num_parts): orig_g = dgl.load_graphs( @@ -787,34 +759,35 @@ def test_convert_dgl_partition_to_csc_sampling_graph_hetero( orig_indptr, orig_indices, orig_eids = orig_g.adj().csc() assert th.equal(orig_indptr, new_g.csc_indptr) assert th.equal(orig_indices, new_g.indices) - if store_orig_nids: - assert th.equal( - orig_g.ndata[dgl.NID], new_g.node_attributes[dgl.NID] - ) - else: - assert new_g.node_attributes is None - if store_orig_eids: + # dgl.NID is required. + assert th.equal( + orig_g.ndata[dgl.NID], new_g.node_attributes[dgl.NID] + ) + if store_eids or store_all: assert th.equal( orig_g.edata[dgl.EID][orig_eids], new_g.edge_attributes[dgl.EID] ) else: assert new_g.edge_attributes is None - if store_etypes: - assert th.equal(orig_g.edata[dgl.ETYPE][orig_eids], new_g.type_per_edge) - else: - assert new_g.type_per_edge is None - if store_metadata: - for ( - node_type, - type_id, - ) in new_g.metadata.node_type_to_id.items(): - assert g.get_ntype_id(node_type) == type_id - for ( - edge_type, - type_id, - ) in new_g.metadata.edge_type_to_id.items(): - edge_type = gb.etype_str_to_tuple(edge_type) - assert g.get_etype_id(edge_type) == type_id + # dgl.ETYPE is required for heterograph. + assert th.equal(orig_g.edata[dgl.ETYPE][orig_eids], new_g.type_per_edge) + # dgl.NTYPE is optional for heterograph. + if store_all: + assert th.equal( + orig_g.ndata[dgl.NTYPE], new_g.node_attributes[dgl.NTYPE] + ) + # metadata is required for heterograph. 
+ for ( + node_type, + type_id, + ) in new_g.metadata.node_type_to_id.items(): + assert g.get_ntype_id(node_type) == type_id + for ( + edge_type, + type_id, + ) in new_g.metadata.edge_type_to_id.items(): + edge_type = gb.etype_str_to_tuple(edge_type) + assert g.get_etype_id(edge_type) == type_id assert new_g.node_type_offset is None @@ -929,7 +902,7 @@ def test_not_sorted_node_edge_map(): @pytest.mark.parametrize("part_method", ["metis", "random"]) -@pytest.mark.parametrize("num_parts", [1]) +@pytest.mark.parametrize("num_parts", [1, 4]) @pytest.mark.parametrize("num_trainers_per_machine", [1, 4]) @pytest.mark.parametrize("load_feats", [True, False]) def test_partition_homo_graphbolt( @@ -973,8 +946,9 @@ def test_partition_homo_graphbolt( assert len(gpb_meta) == num_parts assert len(gpb.partid2nids(i)) == gpb_meta[i]["num_nodes"] assert len(gpb.partid2eids(i)) == gpb_meta[i]["num_edges"] - assert len(gpb.partid2nids(i)) == part_g.total_num_nodes - assert len(gpb.partid2eids(i)) == part_g.total_num_edges + if num_parts == 1: + assert len(gpb.partid2nids(i)) == part_g.total_num_nodes + assert len(gpb.partid2eids(i)) == part_g.total_num_edges if load_feats: assert "_N/labels" in node_feats assert "_N/feats" in node_feats @@ -987,7 +961,7 @@ def test_partition_homo_graphbolt( @pytest.mark.parametrize("part_method", ["metis", "random"]) -@pytest.mark.parametrize("num_parts", [1]) +@pytest.mark.parametrize("num_parts", [1, 4]) @pytest.mark.parametrize("num_trainers_per_machine", [1, 4]) @pytest.mark.parametrize("load_feats", [True, False]) def test_partition_hetero_graphbolt( @@ -1037,8 +1011,9 @@ def test_partition_hetero_graphbolt( assert len(gpb_meta) == num_parts assert len(gpb.partid2nids(i)) == gpb_meta[i]["num_nodes"] assert len(gpb.partid2eids(i)) == gpb_meta[i]["num_edges"] - assert len(gpb.partid2nids(i)) == part_g.total_num_nodes - assert len(gpb.partid2eids(i)) == part_g.total_num_edges + if num_parts == 1: + assert len(gpb.partid2nids(i)) == part_g.total_num_nodes + assert len(gpb.partid2eids(i)) == part_g.total_num_edges if load_feats: assert "n1/labels" in node_feats assert "n1/feats" in node_feats @@ -1050,10 +1025,10 @@ def test_partition_hetero_graphbolt( reset_envs() -@pytest.mark.parametrize("part_method", ["metis"]) -@pytest.mark.parametrize("num_parts", [4]) -@pytest.mark.parametrize("num_trainers_per_machine", [1]) -@pytest.mark.parametrize("load_feats", [True]) +@pytest.mark.parametrize("part_method", ["metis", "random"]) +@pytest.mark.parametrize("num_parts", [1, 4]) +@pytest.mark.parametrize("num_trainers_per_machine", [1, 4]) +@pytest.mark.parametrize("load_feats", [True, False]) def test_partition_hetero_graphbolt_sample_neighbors( part_method, num_parts, @@ -1102,8 +1077,9 @@ def test_partition_hetero_graphbolt_sample_neighbors( assert len(gpb_meta) == num_parts assert len(gpb.partid2nids(i)) == gpb_meta[i]["num_nodes"] assert len(gpb.partid2eids(i)) == gpb_meta[i]["num_edges"] - #assert len(gpb.partid2nids(i)) == part_g.total_num_nodes - #assert len(gpb.partid2eids(i)) == part_g.total_num_edges + if num_parts == 1: + assert len(gpb.partid2nids(i)) == part_g.total_num_nodes + assert len(gpb.partid2eids(i)) == part_g.total_num_edges if load_feats: assert "n1/labels" in node_feats assert "n1/feats" in node_feats @@ -1114,12 +1090,10 @@ def test_partition_hetero_graphbolt_sample_neighbors( # sample_neighbors() subg = part_g.sample_neighbors(th.arange(10), th.IntTensor([-1]), keep_homo=True, - return_orig_edge_ids=True) + return_eids=True) src, dst = subg.node_pairs 
orig_src = part_g.node_attributes[dgl.NID][src]
         orig_dst = part_g.node_attributes[dgl.NID][dst]
-        orig_ntype_src = part_g.node_attributes[dgl.NTYPE][src]
-        orig_ntype_dst = part_g.node_attributes[dgl.NTYPE][dst]
         etype_ids = subg.original_etype_ids
         orig_eids = part_g.edge_attributes[dgl.EID].to(hg.idtype)[subg.original_edge_ids]
         etype_idsA, _ = gpb.map_to_per_etype(orig_eids)
@@ -1127,19 +1101,107 @@
         etype_ids, idx = F.sort_1d(etype_ids)
         sorted_orig_src, sorted_orig_dst = F.gather_row(orig_src, idx), F.gather_row(orig_dst, idx)
-        src_ntype_ids, ntype_wised_src = gpb.map_to_per_ntype(sorted_orig_src.to(hg.idtype))
-        dst_ntype_ids, ntype_wised_dst = gpb.map_to_per_ntype(sorted_orig_dst.to(hg.idtype))
+        src_ntype_ids, _ = gpb.map_to_per_ntype(sorted_orig_src.to(hg.idtype))
+        dst_ntype_ids, _ = gpb.map_to_per_ntype(sorted_orig_dst.to(hg.idtype))
+
+        print("gpb.canonical_etypes: ", gpb.canonical_etypes)
+        ntype_map = {ntype: i for i, ntype in enumerate(gpb.ntypes)}
+        for etid, etype in enumerate(gpb.canonical_etypes):
+            src_ntype, _, dst_ntype = etype
+            src_ntype_id = ntype_map[src_ntype]
+            dst_ntype_id = ntype_map[dst_ntype]
+            type_idx = etype_ids == etid
+            if F.sum(type_idx, 0) > 0:
+                assert th.all(src_ntype_id == src_ntype_ids[type_idx]), (
+                    "source ntype is not expected."
+                )
+                assert th.all(dst_ntype_id == dst_ntype_ids[type_idx]), (
+                    "destination ntype is not expected."
+                )
+
+
+@pytest.mark.parametrize("part_method", ["metis"])
+@pytest.mark.parametrize("num_parts", [4])
+@pytest.mark.parametrize("num_trainers_per_machine", [1])
+@pytest.mark.parametrize("load_feats", [True])
+def test_partition_homo_graphbolt_sample_neighbors(
+    part_method,
+    num_parts,
+    num_trainers_per_machine,
+    load_feats,
+):
+    os.environ["DGL_DIST_DEBUG"] = "1"
+    if part_method == "random" and num_parts > 1:
+        num_trainers_per_machine = 1
+
+    g = create_random_graph(1000)
+    g.ndata["labels"] = F.arange(0, g.num_nodes())
+    g.ndata["feats"] = F.tensor(np.random.randn(g.num_nodes(), 10), F.float32)
+    g.edata["feats"] = F.tensor(np.random.randn(g.num_edges(), 10), F.float32)
+    g.update_all(fn.copy_u("feats", "msg"), fn.sum("msg", "h"))
+    g.update_all(fn.copy_e("feats", "msg"), fn.sum("msg", "eh"))
+    num_hops = 2
+
+    with tempfile.TemporaryDirectory() as test_dir:
+        orig_nids, orig_eids = dgl.distributed.partition_graph(
+            g,
+            "test",
+            num_parts,
+            test_dir,
+            num_hops=num_hops,
+            part_method=part_method,
+            return_mapping=True,
+            num_trainers_per_machine=num_trainers_per_machine,
+            use_graphbolt=True,
+            gb_save_all=True,
+        )
+        part_config = os.path.join(test_dir, "test.json")
+        for i in range(num_parts):
+            part_g, node_feats, edge_feats, gpb, _, _, _ = dgl.distributed.load_partition(
+                part_config, i, load_feats=load_feats, use_graphbolt=True
+            )
+            assert isinstance(part_g, gb.CSCSamplingGraph)
+            assert gpb.num_partitions() == num_parts
+            gpb_meta = gpb.metadata()
+            assert len(gpb_meta) == num_parts
+            assert len(gpb.partid2nids(i)) == gpb_meta[i]["num_nodes"]
+            assert len(gpb.partid2eids(i)) == gpb_meta[i]["num_edges"]
+            if num_parts == 1:
+                assert len(gpb.partid2nids(i)) == part_g.total_num_nodes
+                assert len(gpb.partid2eids(i)) == part_g.total_num_edges
+            if load_feats:
+                assert "_N/labels" in node_feats
+                assert "_N/feats" in node_feats
+                assert "_N:_E:_N/feats" in edge_feats
+            else:
+                assert node_feats == {}
+                assert edge_feats == {}
+
+            # sample_neighbors()
+            subg = part_g.sample_neighbors(th.arange(10), th.IntTensor([-1]), keep_homo=True,
+                
return_eids=True)
+            src, dst = subg.node_pairs
+            orig_src = part_g.node_attributes[dgl.NID][src]
+            orig_dst = part_g.node_attributes[dgl.NID][dst]
+            etype_ids = subg.original_etype_ids
+            assert etype_ids is None, "subgraph from homograph should not have etypes."
+            orig_eids = part_g.edge_attributes[dgl.EID].to(g.idtype)[subg.original_edge_ids]
+            etype_idsA, _ = gpb.map_to_per_etype(orig_eids)
+            #assert th.equal(etype_ids, etype_idsA), "etype_ids is not expected."
+
+            continue
+            etype_ids, idx = F.sort_1d(etype_ids)
+            sorted_orig_src, sorted_orig_dst = F.gather_row(orig_src, idx), F.gather_row(orig_dst, idx)
+            src_ntype_ids, _ = gpb.map_to_per_ntype(sorted_orig_src.to(g.idtype))
+            dst_ntype_ids, _ = gpb.map_to_per_ntype(sorted_orig_dst.to(g.idtype))
 
-        data_dict = dict()
         print("gpb.canonical_etypes: ", gpb.canonical_etypes)
         ntype_map = {ntype: i for i, ntype in enumerate(gpb.ntypes)}
-        etype_map = {
-            etype: i for i, etype in enumerate(gpb.canonical_etypes)
-        }
         for etid, etype in enumerate(gpb.canonical_etypes):
             src_ntype, _, dst_ntype = etype
             src_ntype_id = ntype_map[src_ntype]
             dst_ntype_id = ntype_map[dst_ntype]
+            continue
             type_idx = etype_ids == etid
             if F.sum(type_idx, 0) > 0:
                 assert th.all(src_ntype_id == src_ntype_ids[type_idx]), (

From 50c7be2567a68e8d870ed1af846a3d09a474a778 Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Tue, 31 Oct 2023 05:19:23 +0000
Subject: [PATCH 26/30] Homo/Hetero + NC is verified

---
 examples/pytorch/rgcn/experimental/README.md | 24 +------------------
 .../rgcn/experimental/{ => dev}/gb_demo.py    |  0
 .../experimental/{ => dev}/gb_demo_cmd.sh     |  0
 .../rgcn/experimental/{cmd.sh => gb_cmd.sh}   |  0
 .../pytorch/rgcn/experimental/ip_config.txt   |  2 --
 python/dgl/distributed/graph_services.py      |  1 +
 6 files changed, 2 insertions(+), 25 deletions(-)
 rename examples/pytorch/rgcn/experimental/{ => dev}/gb_demo.py (100%)
 rename examples/pytorch/rgcn/experimental/{ => dev}/gb_demo_cmd.sh (100%)
 rename examples/pytorch/rgcn/experimental/{cmd.sh => gb_cmd.sh} (100%)
 delete mode 100644 examples/pytorch/rgcn/experimental/ip_config.txt

diff --git a/examples/pytorch/rgcn/experimental/README.md b/examples/pytorch/rgcn/experimental/README.md
index eb3b8443f3e1..110fa0482b52 100644
--- a/examples/pytorch/rgcn/experimental/README.md
+++ b/examples/pytorch/rgcn/experimental/README.md
@@ -1,4 +1,4 @@
-## DistDGL with GraphBolt(Heterograph only)
+## DistDGL with GraphBolt(Heterograph + Node Classification)
 
 ### How to partition graph
 
@@ -15,8 +15,6 @@ import dgl
 part_config = "./data/ogbn-mag.json"
 dgl.distributed.convert_dgl_partition_to_csc_sampling_graph(
     part_config,
-    store_orig_nids=True,
-    store_etypes=True,
 )
 ```
 
@@ -75,8 +73,6 @@ Val Acc 0.4618, Test Acc 0.4485, time: 16.9179
 
 ##### DistDGL with GraphBolt
 
-fanout = [25, 10], call `gb.sample_etype_neighbors()` correctly. [**Default**]
-
 ```
 Epoch Time(s): 70.3498, sample: 10.6339, data copy: 8.9492, forward: 2.6577, backward: 36.1793, update: 11.9295, #train: 78696, #input: 34559464
 
 Val Acc 0.4572, Test Acc 0.4498, time: 3.5830
 ```
 
-fanout = [25, 10], multiplied by `num_etypes`. [**Deprecated**]
-
-```
-[3]Epoch Time(s): 137.0454, sample: 27.0914, data copy: 32.2842, forward: 3.5588, backward: 60.5921, update: 13.5188, #train: 78696, #input: 76402212
-
-Val Acc 0.4648, Test Acc 0.4498, time: 10.4527
-```
-
-fanout = [25, 10], not multiplied by `num_etypes`. [**Deprecated**]
-
-```
-Epoch Time(s): 32.7923, sample: 5.4970, data copy: 4.9976, forward: 2.4069, backward: 15.4529, update: 4.4377, #train: 78696, #input: 18370936
-
-Val Acc 0.3901, Test Acc 0.3844, time: 2.2284
-```
-
 ---------------------------------------
 
 ## Distributed training
diff --git a/examples/pytorch/rgcn/experimental/gb_demo.py b/examples/pytorch/rgcn/experimental/dev/gb_demo.py
similarity index 100%
rename from examples/pytorch/rgcn/experimental/gb_demo.py
rename to examples/pytorch/rgcn/experimental/dev/gb_demo.py
diff --git a/examples/pytorch/rgcn/experimental/gb_demo_cmd.sh b/examples/pytorch/rgcn/experimental/dev/gb_demo_cmd.sh
similarity index 100%
rename from examples/pytorch/rgcn/experimental/gb_demo_cmd.sh
rename to examples/pytorch/rgcn/experimental/dev/gb_demo_cmd.sh
diff --git a/examples/pytorch/rgcn/experimental/cmd.sh b/examples/pytorch/rgcn/experimental/gb_cmd.sh
similarity index 100%
rename from examples/pytorch/rgcn/experimental/cmd.sh
rename to examples/pytorch/rgcn/experimental/gb_cmd.sh
diff --git a/examples/pytorch/rgcn/experimental/ip_config.txt b/examples/pytorch/rgcn/experimental/ip_config.txt
deleted file mode 100644
index f7bec5c8124c..000000000000
--- a/examples/pytorch/rgcn/experimental/ip_config.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-172.31.14.101
-172.31.8.229
diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py
index ed72ec5b5b29..ee39c56185d2 100644
--- a/python/dgl/distributed/graph_services.py
+++ b/python/dgl/distributed/graph_services.py
@@ -144,6 +144,7 @@ def _sample_etype_neighbors(
             "GraphBolt NeighborSampler.distributed_sample_neighbor() failed."
         )
     else:
+        fan_out = F.astype(fan_out, local_g.idtype)
         sampled_graph = local_sample_etype_neighbors(
             local_g,
             local_ids,

From aac0d6f964044f00759bb804179a41956c157a61 Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Wed, 1 Nov 2023 01:48:16 +0000
Subject: [PATCH 27/30] add comments for EID in DGL block creation

---
 python/dgl/dataloading/neighbor_sampler.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/dgl/dataloading/neighbor_sampler.py b/python/dgl/dataloading/neighbor_sampler.py
index e5a9e48c82b4..837c5c33747c 100644
--- a/python/dgl/dataloading/neighbor_sampler.py
+++ b/python/dgl/dataloading/neighbor_sampler.py
@@ -193,6 +193,10 @@ def sample_blocks(self, g, seed_nodes, exclude_eids=None, use_graphbolt=False):
                 exclude_edges=exclude_eids,
                 use_graphbolt=use_graphbolt,
             )
+            # [Rui] For heterograph + DGL, it returns EIDs.
+            # For heterograph + GraphBolt, it returns {} as I didn't set it.
+            # For homogeneous graph + DGL, it returns EIDs.
+            # For homogeneous graph + GraphBolt, it crashes as no EID key exists.
             eid = None
             if EID in frontier.edata:
                 eid = frontier.edata[EID]

From f806ecea64f9faa3df653eabe616b5602b391b66 Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Wed, 1 Nov 2023 02:43:02 +0000
Subject: [PATCH 28/30] add assertion for num_samplers > 0 as not supported
 yet

---
 python/dgl/distributed/dist_context.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/dgl/distributed/dist_context.py b/python/dgl/distributed/dist_context.py
index 0e81b617ef6c..565c4c26bcb2 100644
--- a/python/dgl/distributed/dist_context.py
+++ b/python/dgl/distributed/dist_context.py
@@ -287,6 +287,8 @@ def initialize(
     is_standalone = (
         os.environ.get("DGL_DIST_MODE", "standalone") == "standalone"
     )
+    if use_graphbolt:
+        assert num_workers == 0, "GraphBolt does not support multiprocessing sampling."
if num_workers > 0 and not is_standalone:
         SAMPLER_POOL = CustomPool(
             num_workers,

From b15c93158d34574c08fc4cc631b24ac66d7b93f2 Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Wed, 1 Nov 2023 08:35:38 +0000
Subject: [PATCH 29/30] add crash log for num_samplers>0

---
 python/dgl/distributed/graph_services.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py
index ee39c56185d2..a125af4b7a4f 100644
--- a/python/dgl/distributed/graph_services.py
+++ b/python/dgl/distributed/graph_services.py
@@ -652,6 +652,10 @@ def _frontier_to_heterogeneous_graph_gb(g, frontier, gpb):
     src_ntype_ids, src = gpb.map_to_per_ntype(src)
     dst_ntype_ids, dst = gpb.map_to_per_ntype(dst)
 
+    # [Rui] `g.get_ntype_id()` crashes with "'DistGraph' object has no
+    # attribute '_ntype_map'" when `num_samplers > 0`. Is `DistGraph` not
+    # shareable between processes?
+
     data_dict = dict()
     for etid, etype in enumerate(g.canonical_etypes):
         src_ntype, _, dst_ntype = etype

From 19f12bb71bc501451e5cb427ccabc1f3f35d3ba2 Mon Sep 17 00:00:00 2001
From: RhettYing
Date: Mon, 6 Nov 2023 07:53:47 +0000
Subject: [PATCH 30/30] add script to check mem footprint

---
 check_mem_footprint.py               | 52 ++++++++++++++++++++++++++++
 python/dgl/distributed/dist_graph.py |  4 +--
 2 files changed, 54 insertions(+), 2 deletions(-)
 create mode 100644 check_mem_footprint.py

diff --git a/check_mem_footprint.py b/check_mem_footprint.py
new file mode 100644
index 000000000000..791a8157df85
--- /dev/null
+++ b/check_mem_footprint.py
@@ -0,0 +1,52 @@
+import dgl
+from dgl.distributed import load_partition
+import psutil
+import os
+import argparse
+import gc
+
+parser = argparse.ArgumentParser(description="check memory footprint")
+parser.add_argument(
+    "--part_config",
+    type=str,
+    help="partition config file",
+)
+parser.add_argument(
+    "--graphbolt",
+    action="store_true",
+    help="use graphbolt",
+)
+parser.add_argument(
+    "--part_id",
+    type=int,
+    help="partition id",
+)
+
+args = parser.parse_args()
+
+use_graphbolt = args.graphbolt
+part_id = args.part_id
+
+prev_rss = psutil.Process(os.getpid()).memory_info().rss
+(
+    client_g,
+    _,
+    _,
+    gpb,
+    graph_name,
+    ntypes,
+    etypes,
+) = load_partition(
+    args.part_config,
+    part_id,
+    load_feats=False,
+    use_graphbolt=use_graphbolt,
+)
+if not use_graphbolt:
+    graph_format = "csc"
+    client_g = client_g.formats(graph_format)
+    client_g.create_formats_()
+new_rss = psutil.Process(os.getpid()).memory_info().rss
+print(f"[PartID_{part_id}] Loaded {graph_name} with use_graphbolt[{use_graphbolt}] in size[{(new_rss - prev_rss)/1024/1024 : .0f} MB]")
+client_g = None
+gc.collect()
diff --git a/python/dgl/distributed/dist_graph.py b/python/dgl/distributed/dist_graph.py
index 7abf0c9ed8d5..8a9c1e3b0d1c 100644
--- a/python/dgl/distributed/dist_graph.py
+++ b/python/dgl/distributed/dist_graph.py
@@ -422,8 +422,6 @@ def __init__(
             load_feats=False,
             use_graphbolt=use_graphbolt,
         )
-        new_rss = psutil.Process(os.getpid()).memory_info().rss
-        print(f"[Server_{self.server_id}] Loaded {graph_name} with use_graphbolt[{use_graphbolt}] in size[{(new_rss - prev_rss)/1024/1024} MB]")
         if not use_graphbolt:
             # formatting dtype
             # TODO(Rui) Formatting forcibly is not a perfect solution.
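
For reference, a hypothetical way to invoke the new script and compare the two formats for one partition (paths follow the homograph README above; run once without and once with `--graphbolt`):

```
python3 check_mem_footprint.py --part_config homo_data/ogbn-products.json --part_id 0
python3 check_mem_footprint.py --part_config homo_data/ogbn-products.json --part_id 0 --graphbolt
```
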
@@ -446,6 +444,8 @@ def __init__( self.client_g = self.client_g.formats(graph_format) self.client_g.create_formats_() print("Finished creating specified graph formats.") + new_rss = psutil.Process(os.getpid()).memory_info().rss + print(f"[Server_{self.server_id}] Loaded {graph_name} with use_graphbolt[{use_graphbolt}] in size[{(new_rss - prev_rss)/1024/1024} MB]") if not disable_shared_mem: self.client_g = _copy_graph_to_shared_mem( self.client_g, graph_name, graph_format, use_graphbolt