diff --git a/Jenkinsfile b/Jenkinsfile
index 2fe54cfe38f3..46a4943e2cb4 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -580,7 +580,6 @@ pipeline {
           steps {
             unit_distributed_linux('pytorch', 'cpu')
           }
-          when { expression { false } }
         }
       }
       post {
diff --git a/check_mem_footprint.py b/check_mem_footprint.py
new file mode 100644
index 000000000000..791a8157df85
--- /dev/null
+++ b/check_mem_footprint.py
@@ -0,0 +1,52 @@
+import argparse
+import gc
+import os
+
+import psutil
+
+import dgl
+from dgl.distributed import load_partition
+
+parser = argparse.ArgumentParser(description="check memory footprint")
+parser.add_argument(
+    "--part_config",
+    type=str,
+    help="partition config file",
+)
+parser.add_argument(
+    "--graphbolt",
+    action="store_true",
+    help="use graphbolt",
+)
+parser.add_argument(
+    "--part_id",
+    type=int,
+    help="partition id",
+)
+
+args = parser.parse_args()
+
+use_graphbolt = args.graphbolt
+part_id = args.part_id
+
+prev_rss = psutil.Process(os.getpid()).memory_info().rss
+(
+    client_g,
+    _,
+    _,
+    gpb,
+    graph_name,
+    ntypes,
+    etypes,
+) = load_partition(
+    args.part_config,
+    part_id,
+    load_feats=False,
+    use_graphbolt=use_graphbolt,
+)
+if not use_graphbolt:
+    graph_format = "csc"
+    client_g = client_g.formats(graph_format)
+    client_g.create_formats_()
+new_rss = psutil.Process(os.getpid()).memory_info().rss
+print(
+    f"[PartID_{part_id}] Loaded {graph_name} with use_graphbolt[{use_graphbolt}] "
+    f"in size[{(new_rss - prev_rss)/1024/1024 : .0f} MB]"
+)
+client_g = None
+gc.collect()
diff --git a/dt.py b/dt.py
new file mode 100644
index 000000000000..331445075b8f
--- /dev/null
+++ b/dt.py
@@ -0,0 +1,101 @@
+import argparse
+
+import dgl
+import dgl.graphbolt as gb
+import numpy as np
+import torch as th
+
+# [TODO][P0] Set up distributed environment.
+
+"""
+num_trainers = 8
+num_servers = 4
+num_samplers = 0
+part_config = ./ogbn-products.json
+ip_config = ./ip_config.txt
+"""
+
+# Placeholder configuration (a real script would wire up argparse). The
+# config paths follow the sketch above; `graph_name`, `backend` and
+# `batch_size` are assumed defaults.
+args = argparse.Namespace(
+    ip_config="./ip_config.txt",
+    part_config="./ogbn-products.json",
+    graph_name="ogbn-products",
+    backend="gloo",
+    batch_size=1024,
+)
+
+# Initialize distributed environment.
+dgl.distributed.initialize(args.ip_config)
+th.distributed.init_process_group(backend=args.backend)
+# [TODO][P0] Convert DGL partitioned graphs to graphbolt.CSCSamplingGraph.
+# done@2023-10-23 16:49:00
+# see details in: https://github.com/Rhett-Ying/dgl/commits/gb_distdgl
+# ddce1d42de016be040cd0f8a5e71f2a10148de82
+'''
+In [1]: part_config='/home/ubuntu/workspace/dgl_2/data/ogbn-mag.json'
+In [3]: dgl.distributed.convert_dgl_partition_to_csc_sampling_graph(part_config, store_orig_nids=True)
+In [7]: !ls data/part0 -lh
+total 1.1G
+-rw-rw-r-- 1 ubuntu ubuntu 207M Oct 23 08:44 csc_sampling_graph.tar
+-rw-rw-r-- 1 ubuntu ubuntu 694M Oct 23 02:47 graph.dgl
+
+In [8]: !ls data/part1 -lh
+total 1.1G
+-rw-rw-r-- 1 ubuntu ubuntu 202M Oct 23 08:44 csc_sampling_graph.tar
+-rw-rw-r-- 1 ubuntu ubuntu 678M Oct 23 02:47 graph.dgl
+'''
+
+# [TODO][P0] Load `CSCSamplingGraph` into `DistGraph`.
+# done@2023-10-24 15:10:00
+# see details in: https://github.com/Rhett-Ying/dgl/commits/gb_distdgl
+# 222dd2bd51084cc4f242148b0a7e6e5d91e0ae80
+## NID/EIDs are required.
+g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
+
+# Generate train/val/test splits
+##############
+# train/val/test splits could be generated offline, then `train/val/test_masks`
+# could be offloaded.
+# No change is required as `node_split` requires only the graph partition book
+# and the masks.
+# This should be part of `OnDiskDataset::TVT`.
+# [TODO][P1]: Add a standalone API to generate train/val/test splits.
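+#
+# A rough sketch of what that standalone helper could look like (hypothetical
+# API, nothing in this PR implements it); it needs only the masks and the
+# partition book, mirroring the `node_split` calls below:
+#
+#     def generate_tvt_splits(dist_g):
+#         gpb = dist_g.get_partition_book()
+#         return tuple(
+#             dgl.distributed.node_split(dist_g.ndata[mask], gpb)
+#             for mask in ("train_masks", "val_masks", "test_masks")
+#         )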
+##############
+gpb = g.get_partition_book()
+train_nids = dgl.distributed.node_split(g.ndata["train_masks"], gpb)
+val_nids = dgl.distributed.node_split(g.ndata["val_masks"], gpb)
+test_nids = dgl.distributed.node_split(g.ndata["test_masks"], gpb)
+all_nids = dgl.distributed.node_split(th.arange(g.num_nodes()), gpb)
+
+# [TODO][P2] How to handle feature data such as 'feat', 'mask'?
+# Just use `g.ndata['feat']` for now, as no more memory could be offloaded.
+# GB: feat_data = gb.OnDiskDataset().feature
+# DistDGL: feat_data = g.ndata['feat']  # DistTensor
+
+
+# Train.
+##############
+# GraphBolt version
+# [TODO][P0] Add `gb.distributed_sample_neighbor` API.
+# [TODO][P0] `remote_sample_neighbor()` returns original global node pairs + eids.
+# [TODO][P0] Update `dgl.distributed.merge_graphs` API.
+# https://github.com/dmlc/dgl/blob/7439b7e73bdb85b4285ab01f704ac5a4f77c927e/python/dgl/distributed/graph_services.py#L440.
+##############
+"""
+datapipe = gb.ItemSampler(item_set, batch_size=batch_size, shuffle=shuffle)
+datapipe = datapipe.sample_neighbor(g._graph, fanouts=fanouts)
+datapipe = datapipe.to_dgl()
+device = th.device("cpu")
+datapipe = datapipe.copy_to(device)
+data_loader = gb.MultiProcessDataLoader(datapipe, num_workers=num_workers)
+"""
+sampler = dgl.dataloading.NeighborSampler([25, 10])
+train_dataloader = dgl.distributed.DistDataLoader(
+    g, train_nids, sampler=sampler, batch_size=args.batch_size, shuffle=True
+)
+model = None  # [TODO] Plug in a real model.
+for mini_batch in train_dataloader:
+    in_feats = g.ndata["feat"][mini_batch.input_nodes]
+    labels = g.ndata["label"][mini_batch.output_nodes]
+    _ = model(mini_batch, in_feats)
+
+# Evaluate.
+model.eval()
+sampler = dgl.dataloading.NeighborSampler([-1])
+val_dataloader = dgl.distributed.DistDataLoader(
+    g, val_nids, sampler=sampler, batch_size=args.batch_size, shuffle=False
+)
+test_dataloader = dgl.distributed.DistDataLoader(
+    g, test_nids, sampler=sampler, batch_size=args.batch_size, shuffle=False
+)
diff --git a/examples/distributed/graphsage/README.md b/examples/distributed/graphsage/README.md
index 69035175d14f..cfa01ae0e267 100644
--- a/examples/distributed/graphsage/README.md
+++ b/examples/distributed/graphsage/README.md
@@ -1,3 +1,83 @@
+## DistDGL with GraphBolt (Homograph + Node Classification)
+
+### How to partition a graph
+
+#### Partition from the original dataset with `dgl.distributed.partition_graph()`
+
+```
+DGL_HOME=/home/ubuntu/workspace/dgl_2 DGL_LIBRARY_PATH=$DGL_HOME/build PYTHONPATH=tests:$DGL_HOME/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 examples/distributed/graphsage/partition_graph.py --dataset ogbn-products --num_parts 2 --balance_train --balance_edges --graphbolt
+```
+
+#### Convert existing partitions into GraphBolt format
+
+```
+DGL_LIBRARY_PATH=$DGL_HOME/build PYTHONPATH=tests:$DGL_HOME/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 -c "from dgl.distributed import convert_dgl_partition_to_csc_sampling_graph as f;f('data/ogbn-products.json')"
+```
+
+#### Partition sizes: GraphBolt vs. DistDGL
+
+`csc_sampling_graph.tar` is the GraphBolt partition file.
+`graph.dgl` is the original DistDGL partition file, namely a DGLGraph.
+
+###### ogbn-products
+homogeneous, ~2.4M nodes, ~123.7M edges (reverse edges are added), 2 parts.
+
+| DGL (GB) | GraphBolt w/o EIDs (MB) | GraphBolt w/ EIDs (MB) |
+| -------- | ----------------------- | ---------------------- |
+| 1.6/1.7  | 258/272                 | 502/530                |
+
+```
+-rw-rw-r-- 1 ubuntu ubuntu 258M Oct 31 01:56 homo_data/part0/csc_sampling_graph.tar
+-rw-rw-r-- 1 ubuntu ubuntu 502M Oct 31 04:45 homo_data/part0/csc_sampling_graph_eids.tar
+-rw-rw-r-- 1 ubuntu ubuntu   24 Oct 31 00:51 homo_data/part0/edge_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 1.6G Oct 31 00:51 homo_data/part0/graph.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 501M Oct 31 00:51 homo_data/part0/node_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 272M Oct 31 01:56 homo_data/part1/csc_sampling_graph.tar
+-rw-rw-r-- 1 ubuntu ubuntu 530M Oct 31 04:45 homo_data/part1/csc_sampling_graph_eids.tar
+-rw-rw-r-- 1 ubuntu ubuntu   24 Oct 31 00:51 homo_data/part1/edge_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 1.7G Oct 31 00:51 homo_data/part1/graph.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 460M Oct 31 00:51 homo_data/part1/node_feat.dgl
+```
+
+### Train with GraphBolt partitions
+Just append `--graphbolt`.
+
+```
+python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
+  --workspace /home/ubuntu/workspace/dgl_2/examples/distributed/graphsage/ \
+  --num_trainers 4 \
+  --num_servers 2 \
+  --num_samplers 0 \
+  --part_config /home/ubuntu/workspace/dgl_2/homo_data/ogbn-products.json \
+  --ip_config /home/ubuntu/workspace/ip_config.txt \
+  "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 node_classification.py --graph_name ogbn-products --ip_config /home/ubuntu/workspace/ip_config.txt --num_epochs 3 --eval_every 2 --graphbolt"
+```
+
+#### Results
+`g4dn.metal` x 2, `ogbn-products`.
+
+DistDGL with GraphBolt takes less time for sampling (from **1.8283s** to **1.4470s**) and for the whole epoch (from **4.9259s** to **4.4898s**) while keeping comparable validation and test accuracies.
+
+##### DistDGL
+
+```
+Part 0, Epoch Time(s): 4.9648, sample+data_copy: 1.8283, forward: 0.2912, backward: 1.1307, update: 0.0232, #seeds: 24577, #inputs: 4136843
+
+Summary of node classification(GraphSAGE): GraphName ogbn-products | TrainEpochTime(mean) 4.9259 | TestAccuracy 0.6213
+```
+
+##### DistDGL with GraphBolt
+
+```
+Part 0, Epoch Time(s): 4.4826, sample+data_copy: 1.4470, forward: 0.2517, backward: 0.9081, update: 0.0175, #seeds: 24577, #inputs: 4136980
+
+Summary of node classification(GraphSAGE): GraphName ogbn-products | TrainEpochTime(mean) 4.4898 | TestAccuracy 0.6174
+```
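+
+The `--graphbolt` switch maps onto a single flag threaded through setup (see
+the `node_classification.py` diff below); a condensed sketch of the
+GraphBolt-enabled initialization, with names and paths taken from the
+commands above:
+
+```
+import dgl
+import torch as th
+
+use_graphbolt = True
+dgl.distributed.initialize("ip_config.txt", use_graphbolt=use_graphbolt)
+th.distributed.init_process_group(backend="gloo")
+g = dgl.distributed.DistGraph(
+    "ogbn-products",
+    part_config="homo_data/ogbn-products.json",
+    use_graphbolt=use_graphbolt,
+)
+```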
+
+---------------------------------------
+
 ## Distributed training
 
 This is an example of training GraphSAGE in a distributed fashion.
 Before training, please install the required Python libraries via pip:
diff --git a/examples/distributed/graphsage/dgl_cmd.sh b/examples/distributed/graphsage/dgl_cmd.sh
new file mode 100644
index 000000000000..1d3eaa64d019
--- /dev/null
+++ b/examples/distributed/graphsage/dgl_cmd.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
+  --workspace /home/ubuntu/workspace/dgl_2/examples/distributed/graphsage/ \
+  --num_trainers 4 \
+  --num_servers 2 \
+  --num_samplers 0 \
+  --part_config /home/ubuntu/workspace/dgl_2/homo_data/ogbn-products.json \
+  --ip_config /home/ubuntu/workspace/ip_config.txt \
+  "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 node_classification.py --graph_name ogbn-products --ip_config /home/ubuntu/workspace/ip_config.txt --num_epochs 3 --eval_every 2"
diff --git a/examples/distributed/graphsage/gb_cmd.sh b/examples/distributed/graphsage/gb_cmd.sh
new file mode 100644
index 000000000000..ede8c051d59e
--- /dev/null
+++ b/examples/distributed/graphsage/gb_cmd.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
+  --workspace /home/ubuntu/workspace/dgl_2/examples/distributed/graphsage/ \
+  --num_trainers 4 \
+  --num_servers 2 \
+  --num_samplers 0 \
+  --part_config /home/ubuntu/workspace/dgl_2/homo_data/ogbn-products.json \
+  --ip_config /home/ubuntu/workspace/ip_config.txt \
+  "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 node_classification.py --graph_name ogbn-products --ip_config /home/ubuntu/workspace/ip_config.txt --num_epochs 3 --eval_every 2 --graphbolt"
diff --git a/examples/distributed/graphsage/node_classification.py b/examples/distributed/graphsage/node_classification.py
index 0b11e356635b..120e15bfaf88 100644
--- a/examples/distributed/graphsage/node_classification.py
+++ b/examples/distributed/graphsage/node_classification.py
@@ -66,7 +66,7 @@ def forward(self, blocks, x):
             h = self.dropout(h)
         return h
 
-    def inference(self, g, x, batch_size, device):
+    def inference(self, g, x, batch_size, device, use_graphbolt):
         """
         Distributed layer-wise inference with the GraphSAGE model on full
         neighbors.
@@ -116,6 +116,7 @@ def inference(self, g, x, batch_size, device):
                 batch_size=batch_size,
                 shuffle=False,
                 drop_last=False,
+                use_graphbolt=use_graphbolt,
             )
 
             for input_nodes, output_nodes, blocks in tqdm.tqdm(dataloader):
@@ -155,7 +156,7 @@ def compute_acc(pred, labels):
     return (th.argmax(pred, dim=1) == labels).float().sum() / len(pred)
 
 
-def evaluate(model, g, inputs, labels, val_nid, test_nid, batch_size, device):
+def evaluate(model, g, inputs, labels, val_nid, test_nid, batch_size, device, use_graphbolt):
     """
     Evaluate the model on the validation and test set.
@@ -187,7 +188,7 @@ def evaluate(model, g, inputs, labels, val_nid, test_nid, batch_size, device):
     """
     model.eval()
     with th.no_grad():
-        pred = model.inference(g, inputs, batch_size, device)
+        pred = model.inference(g, inputs, batch_size, device, use_graphbolt)
     model.train()
     return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(
         pred[test_nid], labels[test_nid]
@@ -219,6 +220,7 @@ def run(args, device, data):
         batch_size=args.batch_size,
         shuffle=True,
         drop_last=False,
+        use_graphbolt=args.graphbolt,
     )
     model = DistSAGE(
         in_feats,
@@ -325,6 +327,7 @@ def run(args, device, data):
                 test_nid,
                 args.batch_size_eval,
                 device,
+                args.graphbolt,
             )
             print(
                 f"Part {g.rank()}, Val Acc {val_acc:.4f}, "
@@ -338,13 +341,16 @@ def main(args):
     """
     Main function.
     """
+    if args.graphbolt:
+        print("DistDGL with GraphBolt...")
     host_name = socket.gethostname()
     print(f"{host_name}: Initializing DistDGL.")
-    dgl.distributed.initialize(args.ip_config)
+    dgl.distributed.initialize(args.ip_config, use_graphbolt=args.graphbolt)
    print(f"{host_name}: Initializing PyTorch process group.")
     th.distributed.init_process_group(backend=args.backend)
     print(f"{host_name}: Initializing DistGraph.")
-    g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
+    g = dgl.distributed.DistGraph(
+        args.graph_name, part_config=args.part_config, use_graphbolt=args.graphbolt
+    )
     print(f"Rank of {host_name}: {g.rank()}")
 
     # Split train/val/test IDs for each trainer.
@@ -415,6 +421,12 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Distributed GraphSAGE.")
+    parser.add_argument(
+        "--graphbolt",
+        default=False,
+        action="store_true",
+        help="train with GraphBolt",
+    )
     parser.add_argument("--graph_name", type=str, help="graph name")
     parser.add_argument(
         "--ip_config", type=str, help="The file for IP configuration"
diff --git a/examples/distributed/graphsage/partition_graph.py b/examples/distributed/graphsage/partition_graph.py
index 3c6e4b7e8fd5..95919377be94 100644
--- a/examples/distributed/graphsage/partition_graph.py
+++ b/examples/distributed/graphsage/partition_graph.py
@@ -59,6 +59,11 @@ def load_ogb(name, root="dataset"):
     argparser.add_argument(
         "--part_method", type=str, default="metis", help="the partition method"
     )
+    argparser.add_argument(
+        "--graphbolt",
+        action="store_true",
+        help="convert DGL to GraphBolt partitions.",
+    )
     argparser.add_argument(
         "--balance_train",
         action="store_true",
@@ -127,4 +132,5 @@ def load_ogb(name, root="dataset"):
         balance_ntypes=balance_ntypes,
         balance_edges=args.balance_edges,
         num_trainers_per_machine=args.num_trainers_per_machine,
+        use_graphbolt=args.graphbolt,
     )
diff --git a/examples/pytorch/rgcn/experimental/README.md b/examples/pytorch/rgcn/experimental/README.md
index 7a7bcc9794a9..110fa0482b52 100644
--- a/examples/pytorch/rgcn/experimental/README.md
+++ b/examples/pytorch/rgcn/experimental/README.md
@@ -1,3 +1,87 @@
+## DistDGL with GraphBolt (Heterograph + Node Classification)
+
+### How to partition a graph
+
+#### Partition from the original dataset with `dgl.distributed.partition_graph()`
+
+```
+DGL_HOME=/home/ubuntu/workspace/dgl_2 DGL_LIBRARY_PATH=$DGL_HOME/build PYTHONPATH=tests:$DGL_HOME/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 examples/pytorch/rgcn/experimental/partition_graph.py --dataset ogbn-mag --num_parts 2 --balance_train --balance_edges --graphbolt
+```
+
+#### Convert existing partitions into GraphBolt format
+
+```
+import dgl
+part_config = "./data/ogbn-mag.json"
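+# The call below writes `csc_sampling_graph.tar` next to each partition's
+# `graph.dgl` (see the directory listings that follow).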
+dgl.distributed.convert_dgl_partition_to_csc_sampling_graph(
+    part_config,
+)
+```
+
+#### Partition sizes: GraphBolt vs. DistDGL
+
+`csc_sampling_graph.tar` is the GraphBolt partition file.
+`graph.dgl` is the original DistDGL partition file, namely a DGLGraph.
+
+###### ogbn-mag
+heterogeneous, ~1.9M nodes, ~42M edges (reverse edges are added), 4 ntypes, 8 etypes, 2 parts.
+
+| DGL (MB) | GraphBolt w/o EIDs (MB) | GraphBolt w/ EIDs (MB) |
+| -------- | ----------------------- | ---------------------- |
+| 701/711  | 128/129                 | 220/223                |
+
+```
+-rw-rw-r-- 1 ubuntu ubuntu 128M Oct 30 08:30 data/part0/csc_sampling_graph.tar
+-rw-rw-r-- 1 ubuntu ubuntu 220M Oct 30 08:45 data/part0/csc_sampling_graph_eids.tar
+-rw-rw-r-- 1 ubuntu ubuntu   24 Oct 30 07:35 data/part0/edge_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 701M Oct 30 07:35 data/part0/graph.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 182M Oct 30 07:35 data/part0/node_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 129M Oct 30 08:30 data/part1/csc_sampling_graph.tar
+-rw-rw-r-- 1 ubuntu ubuntu 223M Oct 30 08:45 data/part1/csc_sampling_graph_eids.tar
+-rw-rw-r-- 1 ubuntu ubuntu   24 Oct 30 07:35 data/part1/edge_feat.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 711M Oct 30 07:35 data/part1/graph.dgl
+-rw-rw-r-- 1 ubuntu ubuntu 187M Oct 30 07:35 data/part1/node_feat.dgl
+```
+
+### Train with GraphBolt partitions
+Just append `--graphbolt`.
+
+```
+python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
+  --workspace /home/ubuntu/workspace/dgl_2/examples/pytorch/rgcn/experimental/ \
+  --num_trainers 4 \
+  --num_servers 2 \
+  --num_samplers 0 \
+  --part_config /home/ubuntu/workspace/dgl_2/data/ogbn-mag.json \
+  --ip_config /home/ubuntu/workspace/ip_config.txt \
+  "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 entity_classify_dist.py --graph-name ogbn-mag --dataset ogbn-mag --ip-config /home/ubuntu/workspace/ip_config.txt --fanout='25,10' --batch-size 1024 --n-hidden 64 --lr 0.01 --eval-batch-size 1024 --low-mem --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 1 --layer-norm --sparse-embedding --sparse-lr 0.06 --graphbolt"
+```
+
+#### Results
+`g4dn.metal` x 2, `ogbn-mag`.
+
+DistDGL with GraphBolt takes less time for sampling (from **73s** to **11s**) and for the whole epoch (from **178s** to **70s**) while keeping comparable validation and test accuracies.
+
+##### DistDGL
+
+```
+Epoch Time(s): 177.6757, sample: 73.0354, data copy: 27.7802, forward: 2.4314, backward: 63.2740, update: 11.1546, #train: 78696, #input: 34579790
+
+Val Acc 0.4618, Test Acc 0.4485, time: 16.9179
+```
+
+##### DistDGL with GraphBolt
+
+```
+Epoch Time(s): 70.3498, sample: 10.6339, data copy: 8.9492, forward: 2.6577, backward: 36.1793, update: 11.9295, #train: 78696, #input: 34559464
+
+Val Acc 0.4572, Test Acc 0.4498, time: 3.5830
+```
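+
+As an aside on the size tables above: the `w/ EIDs` variants come from keeping
+the original IDs at conversion time; a sketch based on the `store_orig_nids`
+flag exercised in the dev notes (`dt.py`) earlier in this PR (a dev-branch
+API, not a stable interface):
+
+```
+from dgl.distributed import convert_dgl_partition_to_csc_sampling_graph
+
+# Without original IDs -> the smaller csc_sampling_graph.tar.
+convert_dgl_partition_to_csc_sampling_graph("data/ogbn-mag.json")
+
+# Keeping original IDs -> the larger csc_sampling_graph_eids.tar variant.
+convert_dgl_partition_to_csc_sampling_graph(
+    "data/ogbn-mag.json", store_orig_nids=True
+)
+```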
+
+---------------------------------------
+
 ## Distributed training
 
 This is an example of training RGCN node classification in a distributed
 fashion. Currently, the example trains RGCN on graphs with input node
 features. The current implementation follows ../rgcn/entity_classify_mp.py.
diff --git a/examples/pytorch/rgcn/experimental/dev/gb_demo.py b/examples/pytorch/rgcn/experimental/dev/gb_demo.py
new file mode 100644
index 000000000000..026ca136e501
--- /dev/null
+++ b/examples/pytorch/rgcn/experimental/dev/gb_demo.py
@@ -0,0 +1,957 @@
+"""
+Modeling Relational Data with Graph Convolutional Networks
+Paper: https://arxiv.org/abs/1703.06103
+Code: https://github.com/tkipf/relational-gcn
+Differences compared to tkipf/relational-gcn:
+* l2norm applied to all weights
+* remove nodes that won't be touched
+"""
+import argparse
+import gc, os
+import itertools
+import time
+
+import numpy as np
+
+os.environ["DGLBACKEND"] = "pytorch"
+
+from functools import partial
+
+import dgl
+import dgl.graphbolt as gb
+import torch as th
+import torch.multiprocessing as mp
+import torch.nn as nn
+import torch.nn.functional as F
+
+import tqdm
+from dgl import DGLGraph, nn as dglnn
+from dgl.distributed import DistDataLoader
+
+from ogb.nodeproppred import DglNodePropPredDataset
+from torch.multiprocessing import Queue
+from torch.nn.parallel import DistributedDataParallel
+from torch.utils.data import DataLoader
+
+
+class RelGraphConvLayer(nn.Module):
+    r"""Relational graph convolution layer.
+
+    Parameters
+    ----------
+    in_feat : int
+        Input feature size.
+    out_feat : int
+        Output feature size.
+    rel_names : list[str]
+        Relation names.
+    num_bases : int, optional
+        Number of bases. If None, use the number of relations. Default: None.
+    weight : bool, optional
+        True if a linear layer is applied after message passing. Default: True
+    bias : bool, optional
+        True if bias is added. Default: True
+    activation : callable, optional
+        Activation function. Default: None
+    self_loop : bool, optional
+        True to include self-loop messages. Default: False
+    dropout : float, optional
+        Dropout rate. Default: 0.0
+    """
+
+    def __init__(
+        self,
+        in_feat,
+        out_feat,
+        rel_names,
+        num_bases,
+        *,
+        weight=True,
+        bias=True,
+        activation=None,
+        self_loop=False,
+        dropout=0.0,
+    ):
+        super(RelGraphConvLayer, self).__init__()
+        self.in_feat = in_feat
+        self.out_feat = out_feat
+        self.rel_names = rel_names
+        self.num_bases = num_bases
+        self.bias = bias
+        self.activation = activation
+        self.self_loop = self_loop
+
+        self.conv = dglnn.HeteroGraphConv(
+            {
+                rel: dglnn.GraphConv(
+                    in_feat, out_feat, norm="right", weight=False, bias=False
+                )
+                for rel in rel_names
+            }
+        )
+
+        self.use_weight = weight
+        self.use_basis = num_bases < len(self.rel_names) and weight
+        if self.use_weight:
+            if self.use_basis:
+                self.basis = dglnn.WeightBasis(
+                    (in_feat, out_feat), num_bases, len(self.rel_names)
+                )
+            else:
+                self.weight = nn.Parameter(
+                    th.Tensor(len(self.rel_names), in_feat, out_feat)
+                )
+                nn.init.xavier_uniform_(
+                    self.weight, gain=nn.init.calculate_gain("relu")
+                )
+
+        # bias
+        if bias:
+            self.h_bias = nn.Parameter(th.Tensor(out_feat))
+            nn.init.zeros_(self.h_bias)
+
+        # weight for self loop
+        if self.self_loop:
+            self.loop_weight = nn.Parameter(th.Tensor(in_feat, out_feat))
+            nn.init.xavier_uniform_(
+                self.loop_weight, gain=nn.init.calculate_gain("relu")
+            )
+
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, g, inputs):
+        """Forward computation
+
+        Parameters
+        ----------
+        g : DGLGraph
+            Input graph.
+        inputs : dict[str, torch.Tensor]
+            Node feature for each node type.
+
+        Returns
+        -------
+        dict[str, torch.Tensor]
+            New node features for each node type.
+        """
+        g = g.local_var()
+        if self.use_weight:
+            weight = self.basis() if self.use_basis else self.weight
+            wdict = {
+                self.rel_names[i]: {"weight": w.squeeze(0)}
+                for i, w in enumerate(th.split(weight, 1, dim=0))
+            }
+        else:
+            wdict = {}
+
+        if g.is_block:
+            inputs_src = inputs
+            inputs_dst = {
+                k: v[: g.number_of_dst_nodes(k)] for k, v in inputs.items()
+            }
+        else:
+            inputs_src = inputs_dst = inputs
+
+        hs = self.conv(g, inputs_src, mod_kwargs=wdict)
+
+        def _apply(ntype, h):
+            if self.self_loop:
+                h = h + th.matmul(inputs_dst[ntype], self.loop_weight)
+            if self.bias:
+                h = h + self.h_bias
+            if self.activation:
+                h = self.activation(h)
+            return self.dropout(h)
+
+        return {ntype: _apply(ntype, h) for ntype, h in hs.items()}
+
+
+class EntityClassify(nn.Module):
+    """Entity classification class for RGCN
+
+    Parameters
+    ----------
+    device : int
+        Device to run the layer.
+    num_nodes : int
+        Number of nodes.
+    h_dim : int
+        Hidden dim size.
+    out_dim : int
+        Output dim size.
+    rel_names : list of str
+        A list of relation names.
+    num_bases : int
+        Number of bases. If None, use the number of relations.
+    num_hidden_layers : int
+        Number of hidden RelGraphConv layers.
+    dropout : float
+        Dropout
+    use_self_loop : bool
+        Use self loop if True, default False.
+    """
+
+    def __init__(
+        self,
+        device,
+        h_dim,
+        out_dim,
+        rel_names,
+        num_bases=None,
+        num_hidden_layers=1,
+        dropout=0,
+        use_self_loop=False,
+        layer_norm=False,
+    ):
+        super(EntityClassify, self).__init__()
+        self.device = device
+        self.h_dim = h_dim
+        self.out_dim = out_dim
+        self.num_bases = None if num_bases < 0 else num_bases
+        self.num_hidden_layers = num_hidden_layers
+        self.dropout = dropout
+        self.use_self_loop = use_self_loop
+        self.layer_norm = layer_norm
+
+        self.layers = nn.ModuleList()
+        # i2h
+        self.layers.append(
+            RelGraphConvLayer(
+                self.h_dim,
+                self.h_dim,
+                rel_names,
+                self.num_bases,
+                activation=F.relu,
+                self_loop=self.use_self_loop,
+                dropout=self.dropout,
+            )
+        )
+        # h2h
+        for idx in range(self.num_hidden_layers):
+            self.layers.append(
+                RelGraphConvLayer(
+                    self.h_dim,
+                    self.h_dim,
+                    rel_names,
+                    self.num_bases,
+                    activation=F.relu,
+                    self_loop=self.use_self_loop,
+                    dropout=self.dropout,
+                )
+            )
+        # h2o
+        self.layers.append(
+            RelGraphConvLayer(
+                self.h_dim,
+                self.out_dim,
+                rel_names,
+                self.num_bases,
+                activation=None,
+                self_loop=self.use_self_loop,
+            )
+        )
+
+    def forward(self, blocks, feats, norm=None):
+        if blocks is None:
+            # full graph training
+            blocks = [self.g] * len(self.layers)
+        h = feats
+        for layer, block in zip(self.layers, blocks):
+            block = block.to(self.device)
+            h = layer(block, h)
+        return h
+
+
+def init_emb(shape, dtype):
+    arr = th.zeros(shape, dtype=dtype)
+    nn.init.uniform_(arr, -1.0, 1.0)
+    return arr
+
+
+class DistEmbedLayer(nn.Module):
+    r"""Embedding layer for featureless heterograph.
+
+    Parameters
+    ----------
+    dev_id : int
+        Device to run the layer.
+    g : DistGraph
+        Training graph.
+    embed_size : int
+        Output embed size.
+    sparse_emb : bool
+        Whether to use sparse embedding. Default: False
+    dgl_sparse_emb : bool
+        Whether to use DGL sparse embedding. Default: False
+    embed_name : str, optional
+        Embed name.
+    """
+
+    def __init__(
+        self,
+        dev_id,
+        g,
+        embed_size,
+        sparse_emb=False,
+        dgl_sparse_emb=False,
+        feat_name="feat",
+        embed_name="node_emb",
+    ):
+        super(DistEmbedLayer, self).__init__()
+        self.dev_id = dev_id
+        self.embed_size = embed_size
+        self.embed_name = embed_name
+        self.feat_name = feat_name
+        self.sparse_emb = sparse_emb
+        self.g = g
+        self.ntype_id_map = {g.get_ntype_id(ntype): ntype for ntype in g.ntypes}
+
+        self.node_projs = nn.ModuleDict()
+        for ntype in g.ntypes:
+            if feat_name in g.nodes[ntype].data:
+                self.node_projs[ntype] = nn.Linear(
+                    g.nodes[ntype].data[feat_name].shape[1], embed_size
+                )
+                nn.init.xavier_uniform_(self.node_projs[ntype].weight)
+                print("node {} has data {}".format(ntype, feat_name))
+        if sparse_emb:
+            if dgl_sparse_emb:
+                self.node_embeds = {}
+                for ntype in g.ntypes:
+                    # We only create embeddings for nodes without node features.
+                    if feat_name not in g.nodes[ntype].data:
+                        part_policy = g.get_node_partition_policy(ntype)
+                        self.node_embeds[ntype] = dgl.distributed.DistEmbedding(
+                            g.num_nodes(ntype),
+                            self.embed_size,
+                            embed_name + "_" + ntype,
+                            init_emb,
+                            part_policy,
+                        )
+            else:
+                self.node_embeds = nn.ModuleDict()
+                for ntype in g.ntypes:
+                    # We only create embeddings for nodes without node features.
+                    if feat_name not in g.nodes[ntype].data:
+                        self.node_embeds[ntype] = th.nn.Embedding(
+                            g.num_nodes(ntype),
+                            self.embed_size,
+                            sparse=self.sparse_emb,
+                        )
+                        nn.init.uniform_(
+                            self.node_embeds[ntype].weight, -1.0, 1.0
+                        )
+        else:
+            self.node_embeds = nn.ModuleDict()
+            for ntype in g.ntypes:
+                # We only create embeddings for nodes without node features.
+                if feat_name not in g.nodes[ntype].data:
+                    self.node_embeds[ntype] = th.nn.Embedding(
+                        g.num_nodes(ntype), self.embed_size
+                    )
+                    nn.init.uniform_(self.node_embeds[ntype].weight, -1.0, 1.0)
+
+    def forward(self, node_ids):
+        """Forward computation
+
+        Parameters
+        ----------
+        node_ids : dict of Tensor
+            Node IDs to generate embeddings for.
+
+        Returns
+        -------
+        tensor
+            Embeddings as the input of the next layer.
+        """
+        embeds = {}
+        for ntype in node_ids:
+            if self.feat_name in self.g.nodes[ntype].data:
+                embeds[ntype] = self.node_projs[ntype](
+                    self.g.nodes[ntype]
+                    .data[self.feat_name][node_ids[ntype]]
+                    .to(self.dev_id)
+                )
+            else:
+                embeds[ntype] = self.node_embeds[ntype](node_ids[ntype]).to(
+                    self.dev_id
+                )
+        return embeds
+
+
+def compute_acc(results, labels):
+    """
+    Compute the accuracy of prediction given the labels.
+    """
+    labels = labels.long()
+    return (results == labels).float().sum() / len(results)
+
+
+def evaluate(
+    g,
+    model,
+    embed_layer,
+    labels,
+    eval_loader,
+    test_loader,
+    all_val_nid,
+    all_test_nid,
+):
+    model.eval()
+    embed_layer.eval()
+    eval_logits = []
+    eval_seeds = []
+
+    global_results = dgl.distributed.DistTensor(
+        labels.shape, th.long, "results", persistent=True
+    )
+
+    with th.no_grad():
+        th.cuda.empty_cache()
+        for sample_data in tqdm.tqdm(eval_loader):
+            input_nodes, seeds, blocks = sample_data
+            seeds = seeds["paper"]
+            feats = embed_layer(input_nodes)
+            logits = model(blocks, feats)
+            assert len(logits) == 1
+            logits = logits["paper"]
+            eval_logits.append(logits.cpu().detach())
+            assert np.all(seeds.numpy() < g.num_nodes("paper"))
+            eval_seeds.append(seeds.cpu().detach())
+    eval_logits = th.cat(eval_logits)
+    eval_seeds = th.cat(eval_seeds)
+    global_results[eval_seeds] = eval_logits.argmax(dim=1)
+
+    test_logits = []
+    test_seeds = []
+    with th.no_grad():
+        th.cuda.empty_cache()
+        for sample_data in tqdm.tqdm(test_loader):
+            input_nodes, seeds, blocks = sample_data
+            seeds = seeds["paper"]
+            feats = embed_layer(input_nodes)
+            logits = model(blocks, feats)
+            assert len(logits) == 1
+            logits = logits["paper"]
+            test_logits.append(logits.cpu().detach())
+            assert np.all(seeds.numpy() < g.num_nodes("paper"))
+            test_seeds.append(seeds.cpu().detach())
+    test_logits = th.cat(test_logits)
+    test_seeds = th.cat(test_seeds)
+    global_results[test_seeds] = test_logits.argmax(dim=1)
+
+    g.barrier()
+    if g.rank() == 0:
+        return compute_acc(
+            global_results[all_val_nid], labels[all_val_nid]
+        ), compute_acc(global_results[all_test_nid], labels[all_test_nid])
+    else:
+        return -1, -1
+
+
+def create_itemset(g, nodes, labels):
+    gpb = g.get_partition_book()
+    if isinstance(nodes, dict):
+        data = {}
+        for ntype in nodes:
+            assert (
+                ntype in gpb.ntypes
+            ), "The sampled node type {} does not exist in the input graph".format(
+                ntype
+            )
+            assert ntype in labels, f"{ntype} not found in labels."
+            data[ntype] = gb.ItemSet(
+                (nodes[ntype], labels[ntype]), names=("seed_nodes", "labels")
+            )
+        return gb.ItemSetDict(data)
+    return gb.ItemSet((nodes, labels), names=("seed_nodes", "labels"))
+
+
+def create_dataloader(g, nodes, labels, batch_size, shuffle, fanouts):
+    item_set = create_itemset(g, nodes, labels)
+
+    datapipe = gb.ItemSampler(item_set, batch_size=batch_size, shuffle=shuffle)
+
+    datapipe = datapipe.distributed_sample_neighbor(g, fanouts=fanouts)
+
+    # datapipe = datapipe.to_dgl()
+
+    # device = th.device("cpu")
+    # datapipe = datapipe.copy_to(device)
+
+    return gb.MultiProcessDataLoader(datapipe, num_workers=0)
+
+
+def run(args, device, data):
+    (
+        g,
+        num_classes,
+        train_nid,
+        val_nid,
+        test_nid,
+        labels,
+        all_val_nid,
+        all_test_nid,
+    ) = data
+
+    fanouts = [int(fanout) for fanout in args.fanout.split(",")]
+    val_fanouts = [int(fanout) for fanout in args.validation_fanout.split(",")]
+
+    print(
+        f"Rank[{g.rank()}] train_nid: {train_nid.shape}, labels: {labels.shape}"
+    )
+
+    # Create dataloaders.
+    train_dl = create_dataloader(
+        g,
+        {"paper": train_nid},
+        {"paper": labels},
+        args.batch_size,
+        True,
+        fanouts,
+    )
+    # valid_dataloader = create_dataloader(
+    #     g, {"paper": val_nid}, labels, args.batch_size, True, val_fanouts
+    # )
+    # test_dataloader = create_dataloader(
+    #     g, {"paper": test_nid}, labels, args.eval_batch_size, True, val_fanouts
+    # )
+
+    for step, data in enumerate(tqdm.tqdm(train_dl, desc="DistDGL Training")):
+        pass
+
+    g.barrier()
+    if g.rank() == 0:
+        time.sleep(5)
+    return
+
+    # NOTE: everything below the early return is currently unreachable; it is
+    # kept as a sketch of the full training path.
+    embed_layer = DistEmbedLayer(
+        device,
+        g,
+        args.n_hidden,
+        sparse_emb=args.sparse_embedding,
+        dgl_sparse_emb=args.dgl_sparse,
+        feat_name="feat",
+    )
+
+    model = EntityClassify(
+        device,
+        args.n_hidden,
+        num_classes,
+        g.etypes,
+        num_bases=args.n_bases,
+        num_hidden_layers=args.n_layers - 2,
+        dropout=args.dropout,
+        use_self_loop=args.use_self_loop,
+        layer_norm=args.layer_norm,
+    )
+    model = model.to(device)
+
+    if not args.standalone:
+        if args.num_gpus == -1:
+            model = DistributedDataParallel(model)
+            # If there are dense parameters in the embedding layer
+            # or we use PyTorch sparse embeddings.
+            if len(embed_layer.node_projs) > 0 or not args.dgl_sparse:
+                embed_layer = DistributedDataParallel(embed_layer)
+        else:
+            dev_id = g.rank() % args.num_gpus
+            model = DistributedDataParallel(
+                model, device_ids=[dev_id], output_device=dev_id
+            )
+            # If there are dense parameters in the embedding layer
+            # or we use PyTorch sparse embeddings.
+            if len(embed_layer.node_projs) > 0 or not args.dgl_sparse:
+                embed_layer = embed_layer.to(device)
+                embed_layer = DistributedDataParallel(
+                    embed_layer, device_ids=[dev_id], output_device=dev_id
+                )
+
+    if args.sparse_embedding:
+        if args.dgl_sparse and args.standalone:
+            emb_optimizer = dgl.distributed.optim.SparseAdam(
+                list(embed_layer.node_embeds.values()), lr=args.sparse_lr
+            )
+            print(
+                "optimize DGL sparse embedding:", embed_layer.node_embeds.keys()
+            )
+        elif args.dgl_sparse:
+            emb_optimizer = dgl.distributed.optim.SparseAdam(
+                list(embed_layer.module.node_embeds.values()), lr=args.sparse_lr
+            )
+            print(
+                "optimize DGL sparse embedding:",
+                embed_layer.module.node_embeds.keys(),
+            )
+        elif args.standalone:
+            emb_optimizer = th.optim.SparseAdam(
+                list(embed_layer.node_embeds.parameters()), lr=args.sparse_lr
+            )
+            print("optimize PyTorch sparse embedding:", embed_layer.node_embeds)
+        else:
+            emb_optimizer = th.optim.SparseAdam(
+                list(embed_layer.module.node_embeds.parameters()),
+                lr=args.sparse_lr,
+            )
+            print(
+                "optimize PyTorch sparse embedding:",
+                embed_layer.module.node_embeds,
+            )
+
+        dense_params = list(model.parameters())
+        if args.standalone:
+            dense_params += list(embed_layer.node_projs.parameters())
+            print("optimize dense projection:", embed_layer.node_projs)
+        else:
+            dense_params += list(embed_layer.module.node_projs.parameters())
+            print("optimize dense projection:", embed_layer.module.node_projs)
+        optimizer = th.optim.Adam(
+            dense_params, lr=args.lr, weight_decay=args.l2norm
+        )
+    else:
+        all_params = list(model.parameters()) + list(embed_layer.parameters())
+        optimizer = th.optim.Adam(
+            all_params, lr=args.lr, weight_decay=args.l2norm
+        )
+
+    # training loop
+    print("start training...")
+    for epoch in range(args.n_epochs):
+        tic = time.time()
+
+        sample_time = 0
+        copy_time = 0
+        forward_time = 0
+        backward_time = 0
+        update_time = 0
+        number_train = 0
+        number_input = 0
+
+        step_time = []
+        iter_t = []
+        sample_t = []
+        feat_copy_t = []
+        forward_t = []
+        backward_t = []
+        update_t = []
+        iter_tput = []
+
+        start = time.time()
+        # Loop over the dataloader to sample the computation dependency graph
+        # as a list of blocks.
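+        # The timing lists below mirror the epoch summary print:
+        # sample -> feature copy -> forward -> backward -> update.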
+        step_time = []
+        for step, sample_data in enumerate(train_dl):
+            input_nodes, seeds, blocks = sample_data
+            seeds = seeds["paper"]
+            number_train += seeds.shape[0]
+            number_input += np.sum(
+                [blocks[0].num_src_nodes(ntype) for ntype in blocks[0].ntypes]
+            )
+            tic_step = time.time()
+            sample_time += tic_step - start
+            sample_t.append(tic_step - start)
+
+            feats = embed_layer(input_nodes)
+            label = labels[seeds].to(device)
+            copy_time = time.time()
+            feat_copy_t.append(copy_time - tic_step)
+
+            # forward
+            logits = model(blocks, feats)
+            assert len(logits) == 1
+            logits = logits["paper"]
+            loss = F.cross_entropy(logits, label)
+            forward_end = time.time()
+
+            # backward
+            optimizer.zero_grad()
+            if args.sparse_embedding:
+                emb_optimizer.zero_grad()
+            loss.backward()
+            compute_end = time.time()
+            forward_t.append(forward_end - copy_time)
+            backward_t.append(compute_end - forward_end)
+
+            # Update model parameters.
+            optimizer.step()
+            if args.sparse_embedding:
+                emb_optimizer.step()
+            update_t.append(time.time() - compute_end)
+            step_t = time.time() - start
+            step_time.append(step_t)
+
+            train_acc = th.sum(logits.argmax(dim=1) == label).item() / len(
+                seeds
+            )
+
+            if step % args.log_every == 0:
+                print(
+                    "[{}] Epoch {:05d} | Step {:05d} | Train acc {:.4f} | Loss {:.4f} | time {:.3f} s"
+                    "| sample {:.3f} | copy {:.3f} | forward {:.3f} | backward {:.3f} | update {:.3f}".format(
+                        g.rank(),
+                        epoch,
+                        step,
+                        train_acc,
+                        loss.item(),
+                        np.sum(step_time[-args.log_every :]),
+                        np.sum(sample_t[-args.log_every :]),
+                        np.sum(feat_copy_t[-args.log_every :]),
+                        np.sum(forward_t[-args.log_every :]),
+                        np.sum(backward_t[-args.log_every :]),
+                        np.sum(update_t[-args.log_every :]),
+                    )
+                )
+            start = time.time()
+
+        gc.collect()
+        print(
+            "[{}]Epoch Time(s): {:.4f}, sample: {:.4f}, data copy: {:.4f}, forward: {:.4f}, backward: {:.4f}, update: {:.4f}, #train: {}, #input: {}".format(
+                g.rank(),
+                np.sum(step_time),
+                np.sum(sample_t),
+                np.sum(feat_copy_t),
+                np.sum(forward_t),
+                np.sum(backward_t),
+                np.sum(update_t),
+                number_train,
+                number_input,
+            )
+        )
+
+    start = time.time()
+    g.barrier()
+    val_acc, test_acc = evaluate(
+        g,
+        model,
+        embed_layer,
+        labels,
+        valid_dataloader,
+        test_dataloader,
+        all_val_nid,
+        all_test_nid,
+    )
+    if val_acc >= 0:
+        print(
+            "Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}".format(
+                val_acc, test_acc, time.time() - start
+            )
+        )
+
+
+def main(args):
+    if args.graphbolt:
+        print("Using GraphBolt")
+    dgl.distributed.initialize(args.ip_config, use_graphbolt=args.graphbolt)
+    if not args.standalone:
+        th.distributed.init_process_group(backend="gloo")
+
+    g = dgl.distributed.DistGraph(
+        args.graph_name,
+        part_config=args.conf_path,
+        use_graphbolt=args.graphbolt,
+    )
+    print("rank:", g.rank())
+
+    pb = g.get_partition_book()
+    if "trainer_id" in g.nodes["paper"].data:
+        train_nid = dgl.distributed.node_split(
+            g.nodes["paper"].data["train_mask"],
+            pb,
+            ntype="paper",
+            force_even=True,
+            node_trainer_ids=g.nodes["paper"].data["trainer_id"],
+        )
+        val_nid = dgl.distributed.node_split(
+            g.nodes["paper"].data["val_mask"],
+            pb,
+            ntype="paper",
+            force_even=True,
+            node_trainer_ids=g.nodes["paper"].data["trainer_id"],
+        )
+        test_nid = dgl.distributed.node_split(
+            g.nodes["paper"].data["test_mask"],
+            pb,
+            ntype="paper",
+            force_even=True,
+            node_trainer_ids=g.nodes["paper"].data["trainer_id"],
+        )
+    else:
+        train_nid = dgl.distributed.node_split(
+            g.nodes["paper"].data["train_mask"],
+            pb,
+            ntype="paper",
+            force_even=True,
+        )
+        val_nid = dgl.distributed.node_split(
+            g.nodes["paper"].data["val_mask"],
+            pb,
+            ntype="paper",
+            force_even=True,
+        )
+        test_nid = dgl.distributed.node_split(
+            g.nodes["paper"].data["test_mask"],
+            pb,
+            ntype="paper",
+            force_even=True,
+        )
+    local_nid = pb.partid2nids(pb.partid, "paper").detach().numpy()
+    print(
+        "part {}, train: {} (local: {}), val: {} (local: {}), test: {} (local: {})".format(
+            g.rank(),
+            len(train_nid),
+            len(np.intersect1d(train_nid.numpy(), local_nid)),
+            len(val_nid),
+            len(np.intersect1d(val_nid.numpy(), local_nid)),
+            len(test_nid),
+            len(np.intersect1d(test_nid.numpy(), local_nid)),
+        )
+    )
+
+    if args.num_gpus == -1:
+        device = th.device("cpu")
+    else:
+        dev_id = g.rank() % args.num_gpus
+        device = th.device("cuda:" + str(dev_id))
+    labels = g.nodes["paper"].data["labels"][np.arange(g.num_nodes("paper"))]
+    all_val_nid = th.LongTensor(
+        np.nonzero(
+            g.nodes["paper"].data["val_mask"][np.arange(g.num_nodes("paper"))]
+        )
+    ).squeeze()
+    all_test_nid = th.LongTensor(
+        np.nonzero(
+            g.nodes["paper"].data["test_mask"][np.arange(g.num_nodes("paper"))]
+        )
+    ).squeeze()
+    n_classes = len(th.unique(labels[labels >= 0]))
+    print("#classes:", n_classes)
+
+    run(
+        args,
+        device,
+        (
+            g,
+            n_classes,
+            train_nid,
+            val_nid,
+            test_nid,
+            labels,
+            all_val_nid,
+            all_test_nid,
+        ),
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="RGCN")
+    # distributed training related
+    parser.add_argument(
+        "--graphbolt",
+        default=False,
+        action="store_true",
+        help="train with GraphBolt",
+    )
+    parser.add_argument("--graph-name", type=str, help="graph name")
+    parser.add_argument("--id", type=int, help="the partition id")
+    parser.add_argument(
+        "--ip-config", type=str, help="The file for IP configuration"
+    )
+    parser.add_argument(
+        "--conf-path", type=str, help="The path to the partition config file"
+    )
+
+    # rgcn related
+    parser.add_argument(
+        "--num_gpus",
+        type=int,
+        default=-1,
+        help="the number of GPU devices. Use -1 for CPU training",
+    )
+    parser.add_argument(
+        "--dropout", type=float, default=0, help="dropout probability"
+    )
+    parser.add_argument(
+        "--n-hidden", type=int, default=16, help="number of hidden units"
+    )
+    parser.add_argument("--lr", type=float, default=1e-2, help="learning rate")
+    parser.add_argument(
+        "--sparse-lr", type=float, default=1e-2, help="sparse learning rate"
+    )
+    parser.add_argument(
+        "--n-bases",
+        type=int,
+        default=-1,
+        help="number of filter weight matrices, default: -1 [use all]",
+    )
+    parser.add_argument(
+        "--n-layers", type=int, default=2, help="number of propagation rounds"
+    )
+    parser.add_argument(
+        "-e",
+        "--n-epochs",
+        type=int,
+        default=50,
+        help="number of training epochs",
+    )
+    parser.add_argument(
+        "-d", "--dataset", type=str, required=True, help="dataset to use"
+    )
+    parser.add_argument("--l2norm", type=float, default=0, help="l2 norm coef")
+    parser.add_argument(
+        "--relabel",
+        default=False,
+        action="store_true",
+        help="remove untouched nodes and relabel",
+    )
+    parser.add_argument(
+        "--fanout",
+        type=str,
+        default="4, 4",
+        help="Fan-out of neighbor sampling.",
+    )
+    parser.add_argument(
+        "--validation-fanout",
+        type=str,
+        default=None,
+        help="Fan-out of neighbor sampling during validation.",
+    )
+    parser.add_argument(
+        "--use-self-loop",
+        default=False,
+        action="store_true",
+        help="include self feature as a special relation",
+    )
+    parser.add_argument(
+        "--batch-size", type=int, default=100, help="Mini-batch size."
+    )
+    parser.add_argument(
+        "--eval-batch-size", type=int, default=128, help="Mini-batch size."
+    )
+    parser.add_argument("--log-every", type=int, default=20)
+    parser.add_argument(
+        "--low-mem",
+        default=False,
+        action="store_true",
+        help="Whether to use the low-memory RelGraphConv",
+    )
+    parser.add_argument(
+        "--sparse-embedding",
+        action="store_true",
+        help="Use sparse embedding for node embeddings.",
+    )
+    parser.add_argument(
+        "--dgl-sparse",
+        action="store_true",
+        help="Whether to use DGL sparse embedding",
+    )
+    parser.add_argument(
+        "--layer-norm",
+        default=False,
+        action="store_true",
+        help="Use layer norm",
+    )
+    parser.add_argument(
+        "--local_rank", type=int, help="get rank of the process"
+    )
+    parser.add_argument(
+        "--standalone", action="store_true", help="run in the standalone mode"
+    )
+    args = parser.parse_args()
+
+    # If validation_fanout is None, set it to args.fanout.
+    if args.validation_fanout is None:
+        args.validation_fanout = args.fanout
+
+    print(args)
+    main(args)
diff --git a/examples/pytorch/rgcn/experimental/dev/gb_demo_cmd.sh b/examples/pytorch/rgcn/experimental/dev/gb_demo_cmd.sh
new file mode 100644
index 000000000000..88212bc794d7
--- /dev/null
+++ b/examples/pytorch/rgcn/experimental/dev/gb_demo_cmd.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
+  --workspace /home/ubuntu/workspace/dgl_2/examples/pytorch/rgcn/experimental/ \
+  --num_trainers 4 \
+  --num_servers 2 \
+  --num_samplers 0 \
+  --part_config /home/ubuntu/workspace/dgl_2/data/ogbn-mag.json \
+  --ip_config /home/ubuntu/workspace/ip_config.txt \
+  "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 gb_demo.py --graph-name ogbn-mag --dataset ogbn-mag --ip-config /home/ubuntu/workspace/ip_config.txt --fanout='25,10' --batch-size 1024 --n-hidden 64 --lr 0.01 --eval-batch-size 1024 --graphbolt"
diff --git a/examples/pytorch/rgcn/experimental/dgl_cmd.sh b/examples/pytorch/rgcn/experimental/dgl_cmd.sh
new file mode 100644
index 000000000000..3990982b2991
--- /dev/null
+++ b/examples/pytorch/rgcn/experimental/dgl_cmd.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
+  --workspace /home/ubuntu/workspace/dgl_2/examples/pytorch/rgcn/experimental/ \
+  --num_trainers 4 \
+  --num_servers 2 \
+  --num_samplers 0 \
+  --part_config /home/ubuntu/workspace/dgl_2/data/ogbn-mag.json \
+  --ip_config /home/ubuntu/workspace/ip_config.txt \
+  "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 entity_classify_dist.py --graph-name ogbn-mag --dataset ogbn-mag --ip-config /home/ubuntu/workspace/ip_config.txt --fanout='25,10' --batch-size 1024 --n-hidden 64 --lr 0.01 --eval-batch-size 1024 --low-mem --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 1 --layer-norm --sparse-embedding --sparse-lr 0.06 "
diff --git a/examples/pytorch/rgcn/experimental/entity_classify_dist.py b/examples/pytorch/rgcn/experimental/entity_classify_dist.py
index 89093ede8a8b..162a328299b8 100644
--- a/examples/pytorch/rgcn/experimental/entity_classify_dist.py
+++ b/examples/pytorch/rgcn/experimental/entity_classify_dist.py
@@ -10,6 +10,7 @@
 import gc, os
 import itertools
 import time
+import psutil
 
 import numpy as np
 
@@ -466,6 +467,7 @@ def run(args, device, data):
         batch_size=args.batch_size,
         shuffle=True,
         drop_last=False,
+        use_graphbolt=args.graphbolt,
     )
 
     valid_sampler = dgl.dataloading.MultiLayerNeighborSampler(val_fanouts)
@@ -476,6 +478,7 @@ def run(args, device, data):
         batch_size=args.batch_size,
         shuffle=False,
         drop_last=False,
+        use_graphbolt=args.graphbolt,
     )
 
     test_sampler = dgl.dataloading.MultiLayerNeighborSampler(val_fanouts)
@@ -486,6 +489,7 @@ def run(args, device, data):
         batch_size=args.eval_batch_size,
         shuffle=False,
         drop_last=False,
+        use_graphbolt=args.graphbolt,
     )
 
     embed_layer = DistEmbedLayer(
@@ -703,11 +707,14 @@ def run(args, device, data):
 
 
 def main(args):
-    dgl.distributed.initialize(args.ip_config)
+    if args.graphbolt:
+        print("Using GraphBolt")
+    dgl.distributed.initialize(args.ip_config, use_graphbolt=args.graphbolt)
     if not args.standalone:
         th.distributed.init_process_group(backend="gloo")
 
-    g = dgl.distributed.DistGraph(args.graph_name, part_config=args.conf_path)
+    g = dgl.distributed.DistGraph(
+        args.graph_name, part_config=args.conf_path, use_graphbolt=args.graphbolt
+    )
     print("rank:", g.rank())
 
     pb = g.get_partition_book()
@@ -802,6 +809,12 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="RGCN")
     # distributed training related
+    parser.add_argument(
+        "--graphbolt",
+        default=False,
+        action="store_true",
+        help="train with GraphBolt",
+    )
     parser.add_argument("--graph-name", type=str, help="graph name")
     parser.add_argument("--id", type=int, help="the partition id")
     parser.add_argument(
diff --git a/examples/pytorch/rgcn/experimental/gb_cmd.sh b/examples/pytorch/rgcn/experimental/gb_cmd.sh
new file mode 100644
index 000000000000..63ed0a14ca90
--- /dev/null
+++ b/examples/pytorch/rgcn/experimental/gb_cmd.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+python3 /home/ubuntu/workspace/dgl_2/tools/launch.py \
+  --workspace /home/ubuntu/workspace/dgl_2/examples/pytorch/rgcn/experimental/ \
+  --num_trainers 4 \
+  --num_servers 2 \
+  --num_samplers 0 \
+  --part_config /home/ubuntu/workspace/dgl_2/data/ogbn-mag.json \
+  --ip_config /home/ubuntu/workspace/ip_config.txt \
+  "DGL_LIBRARY_PATH=/home/ubuntu/workspace/dgl_2/build PYTHONPATH=tests:/home/ubuntu/workspace/dgl_2/python:tests/python/pytorch/graphbolt:$PYTHONPATH python3 entity_classify_dist.py --graph-name ogbn-mag --dataset ogbn-mag --ip-config /home/ubuntu/workspace/ip_config.txt --fanout='25,10' --batch-size 1024 --n-hidden 64 --lr 0.01 --eval-batch-size 1024 --low-mem --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 1 --layer-norm --sparse-embedding --sparse-lr 0.06 --graphbolt"
diff --git a/examples/pytorch/rgcn/experimental/partition_graph.py b/examples/pytorch/rgcn/experimental/partition_graph.py
index cc364ee94e12..af430261609f 100644
--- a/examples/pytorch/rgcn/experimental/partition_graph.py
+++ b/examples/pytorch/rgcn/experimental/partition_graph.py
@@ -68,6 +68,11 @@ def load_ogb(dataset):
     argparser.add_argument(
         "--part_method", type=str, default="metis", help="the partition method"
     )
+    argparser.add_argument(
+        "--graphbolt",
+        action="store_true",
+        help="convert DGL to GraphBolt partitions.",
+    )
     argparser.add_argument(
         "--balance_train",
         action="store_true",
@@ -127,4 +132,5 @@ def load_ogb(dataset):
         balance_ntypes=balance_ntypes,
         balance_edges=args.balance_edges,
         num_trainers_per_machine=args.num_trainers_per_machine,
+        use_graphbolt=args.graphbolt,
     )
diff --git a/graphbolt/include/graphbolt/csc_sampling_graph.h b/graphbolt/include/graphbolt/csc_sampling_graph.h
index c228998e3d31..ee8210e9172c 100644
--- a/graphbolt/include/graphbolt/csc_sampling_graph.h
+++ b/graphbolt/include/graphbolt/csc_sampling_graph.h
@@ -48,6 +48,7 @@ struct SamplerArgs {
  */
 class CSCSamplingGraph : public torch::CustomClassHolder {
  public:
+  using NodeAttrMap = torch::Dict<std::string, torch::Tensor>;
   using EdgeAttrMap = torch::Dict<std::string, torch::Tensor>;
   /** @brief Default constructor. */
   CSCSamplingGraph() = default;
@@ -65,6 +66,7 @@
       const torch::Tensor& indptr, const torch::Tensor& indices,
       const torch::optional<torch::Tensor>& node_type_offset,
       const torch::optional<torch::Tensor>& type_per_edge,
+      const torch::optional<NodeAttrMap>& node_attributes,
       const torch::optional<EdgeAttrMap>& edge_attributes);
 
   /**
@@ -82,6 +84,7 @@
       const torch::Tensor& indptr, const torch::Tensor& indices,
       const torch::optional<torch::Tensor>& node_type_offset,
       const torch::optional<torch::Tensor>& type_per_edge,
+      const torch::optional<NodeAttrMap>& node_attributes,
       const torch::optional<EdgeAttrMap>& edge_attributes);
 
   /** @brief Get the number of nodes. */
@@ -106,6 +109,11 @@
     return type_per_edge_;
   }
 
+  /** @brief Get the node attributes dictionary. */
+  inline const torch::optional<NodeAttrMap> NodeAttributes() const {
+    return node_attributes_;
+  }
+
   /** @brief Get the edge attributes dictionary. */
   inline const torch::optional<EdgeAttrMap> EdgeAttributes() const {
     return edge_attributes_;
@@ -129,6 +137,12 @@
     type_per_edge_ = type_per_edge;
   }
 
+  /** @brief Set the node attributes dictionary. */
+  inline void SetNodeAttributes(
+      const torch::optional<NodeAttrMap>& node_attributes) {
+    node_attributes_ = node_attributes;
+  }
+
   /** @brief Set the edge attributes dictionary. */
   inline void SetEdgeAttributes(
      const torch::optional<EdgeAttrMap>& edge_attributes) {
@@ -302,6 +316,13 @@
    */
   torch::optional<torch::Tensor> type_per_edge_;
 
+  /**
+   * @brief A dictionary of node attributes. Each key represents the
+   * attribute's name, while the corresponding value holds the attribute's
+   * specific value. The length of each value should match the total number
+   * of nodes.
+   */
+  torch::optional<NodeAttrMap> node_attributes_;
+
   /**
    * @brief A dictionary of edge attributes. Each key represents the
    * attribute's name, while the corresponding value holds the attribute's
   * specific value.
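For orientation, the new node-attribute surface mirrors the existing
edge-attribute one. Once the bindings registered in `python_binding.cc`
further down are in place, Python-side usage could look roughly like the
sketch below; the `graph` handle and `num_nodes` value are assumed, and only
`node_attributes`/`set_node_attributes` come from this diff:

```
import torch

# `graph` is an already-built CSCSamplingGraph handle (construction not
# shown); `num_nodes` is assumed. Values must have length equal to the number
# of nodes, per the TORCH_CHECK added in FromCSC below.
graph.set_node_attributes({"mask": torch.zeros(num_nodes, dtype=torch.bool)})
attrs = graph.node_attributes()  # optional dict-like of name -> tensor
```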
diff --git a/graphbolt/src/csc_sampling_graph.cc b/graphbolt/src/csc_sampling_graph.cc
index 30798d720287..af28e4bcbda7 100644
--- a/graphbolt/src/csc_sampling_graph.cc
+++ b/graphbolt/src/csc_sampling_graph.cc
@@ -28,11 +28,13 @@ CSCSamplingGraph::CSCSamplingGraph(
     const torch::Tensor& indptr, const torch::Tensor& indices,
     const torch::optional<torch::Tensor>& node_type_offset,
     const torch::optional<torch::Tensor>& type_per_edge,
+    const torch::optional<NodeAttrMap>& node_attributes,
     const torch::optional<EdgeAttrMap>& edge_attributes)
     : indptr_(indptr),
       indices_(indices),
       node_type_offset_(node_type_offset),
       type_per_edge_(type_per_edge),
+      node_attributes_(node_attributes),
       edge_attributes_(edge_attributes) {
   TORCH_CHECK(indptr.dim() == 1);
   TORCH_CHECK(indices.dim() == 1);
@@ -43,6 +45,7 @@ c10::intrusive_ptr<CSCSamplingGraph> CSCSamplingGraph::FromCSC(
     const torch::Tensor& indptr, const torch::Tensor& indices,
     const torch::optional<torch::Tensor>& node_type_offset,
     const torch::optional<torch::Tensor>& type_per_edge,
+    const torch::optional<NodeAttrMap>& node_attributes,
     const torch::optional<EdgeAttrMap>& edge_attributes) {
   if (node_type_offset.has_value()) {
     auto& offset = node_type_offset.value();
@@ -52,13 +55,18 @@ c10::intrusive_ptr<CSCSamplingGraph> CSCSamplingGraph::FromCSC(
     TORCH_CHECK(type_per_edge.value().dim() == 1);
     TORCH_CHECK(type_per_edge.value().size(0) == indices.size(0));
   }
+  if (node_attributes.has_value()) {
+    for (const auto& pair : node_attributes.value()) {
+      TORCH_CHECK(pair.value().size(0) == indptr.size(0) - 1);
+    }
+  }
   if (edge_attributes.has_value()) {
     for (const auto& pair : edge_attributes.value()) {
       TORCH_CHECK(pair.value().size(0) == indices.size(0));
     }
   }
   return c10::make_intrusive<CSCSamplingGraph>(
-      indptr, indices, node_type_offset, type_per_edge, edge_attributes);
+      indptr, indices, node_type_offset, type_per_edge, node_attributes,
+      edge_attributes);
 }
 
 void CSCSamplingGraph::Load(torch::serialize::InputArchive& archive) {
@@ -81,6 +89,25 @@ void CSCSamplingGraph::Load(torch::serialize::InputArchive& archive) {
         read_from_archive(archive, "CSCSamplingGraph/type_per_edge").toTensor();
   }
 
+  // Optional node attributes.
+  torch::IValue has_node_attributes;
+  if (archive.try_read(
+          "CSCSamplingGraph/has_node_attributes", has_node_attributes) &&
+      has_node_attributes.toBool()) {
+    torch::Dict<torch::IValue, torch::IValue> generic_dict =
+        read_from_archive(archive, "CSCSamplingGraph/node_attributes")
+            .toGenericDict();
+    NodeAttrMap target_dict;
+    for (const auto& pair : generic_dict) {
+      std::string key = pair.key().toStringRef();
+      torch::Tensor value = pair.value().toTensor();
+      // Use move to avoid a copy.
+      target_dict.insert(std::move(key), std::move(value));
+    }
+    // Same as above.
+    node_attributes_ = std::move(target_dict);
+  }
+
   // Optional edge attributes.
   torch::IValue has_edge_attributes;
   if (archive.try_read(
@@ -116,6 +143,12 @@ void CSCSamplingGraph::Save(torch::serialize::OutputArchive& archive) const {
   if (type_per_edge_) {
     archive.write("CSCSamplingGraph/type_per_edge", type_per_edge_.value());
   }
+  archive.write(
+      "CSCSamplingGraph/has_node_attributes", node_attributes_.has_value());
+  if (node_attributes_) {
+    archive.write(
+        "CSCSamplingGraph/node_attributes", node_attributes_.value());
+  }
   archive.write(
       "CSCSamplingGraph/has_edge_attributes", edge_attributes_.has_value());
   if (edge_attributes_) {
@@ -127,7 +160,7 @@ void CSCSamplingGraph::SetState(
     const torch::Dict<std::string, torch::Dict<std::string, torch::Tensor>>&
         state) {
   // State is a dict of dicts. The tensor-type attributes are stored in the dict
-  // with key "independent_tensors". The dict-type attributes (edge_attributes)
+  // with key "independent_tensors". The dict-type attributes (node/edge_attributes)
   // are stored directly with their name as the key.
   const auto& independent_tensors = state.at("independent_tensors");
   TORCH_CHECK(
@@ -143,6 +176,9 @@ void CSCSamplingGraph::SetState(
   if (independent_tensors.find("type_per_edge") != independent_tensors.end()) {
     type_per_edge_ = independent_tensors.at("type_per_edge");
   }
+  if (state.find("node_attributes") != state.end()) {
+    node_attributes_ = state.at("node_attributes");
+  }
   if (state.find("edge_attributes") != state.end()) {
     edge_attributes_ = state.at("edge_attributes");
   }
@@ -151,7 +187,7 @@ torch::Dict<std::string, torch::Dict<std::string, torch::Tensor>>
 CSCSamplingGraph::GetState() const {
   // State is a dict of dicts. The tensor-type attributes are stored in the dict
-  // with key "independent_tensors". The dict-type attributes (edge_attributes)
+  // with key "independent_tensors". The dict-type attributes (node/edge_attributes)
   // are stored directly with their name as the key.
   torch::Dict<std::string, torch::Dict<std::string, torch::Tensor>> state;
   torch::Dict<std::string, torch::Tensor> independent_tensors;
@@ -167,6 +203,9 @@ CSCSamplingGraph::GetState() const {
     independent_tensors.insert("type_per_edge", type_per_edge_.value());
   }
   state.insert("independent_tensors", independent_tensors);
+  if (node_attributes_.has_value()) {
+    state.insert("node_attributes", node_attributes_.value());
+  }
   if (edge_attributes_.has_value()) {
     state.insert("edge_attributes", edge_attributes_.value());
   }
@@ -318,7 +357,8 @@ CSCSamplingGraph::SampleNeighborsImpl(
   auto num_picked_neighbors_data_ptr =
       num_picked_neighbors_per_node.data_ptr();
   num_picked_neighbors_data_ptr[0] = 0;
-  const auto nodes_data_ptr = nodes.data_ptr();
+  // const auto nodes_data_ptr = nodes.data_ptr();
+  const auto nodes_data_ptr = nodes.data_ptr();
 
   // Step 1. Calculate pick number of each node.
   torch::parallel_for(
@@ -339,7 +379,7 @@
   // Step 2. Calculate prefix sum to get total length and offsets of each
   // node. It's also the indptr of the generated subgraph.
-  subgraph_indptr = torch::cumsum(num_picked_neighbors_per_node, 0);
+  subgraph_indptr =
+      torch::cumsum(num_picked_neighbors_per_node, 0).to(indptr_.dtype());
 
   // Step 3. Allocate the tensor for picked neighbors.
     const auto total_length =
@@ -469,9 +509,11 @@ static c10::intrusive_ptr<CSCSamplingGraph> BuildGraphFromSharedMemoryHelper(
   auto indices = helper.ReadTorchTensor();
   auto node_type_offset = helper.ReadTorchTensor();
   auto type_per_edge = helper.ReadTorchTensor();
+  auto node_attributes = helper.ReadTorchTensorDict();
   auto edge_attributes = helper.ReadTorchTensorDict();
   auto graph = c10::make_intrusive<CSCSamplingGraph>(
       indptr.value(), indices.value(), node_type_offset, type_per_edge,
+      node_attributes,
       edge_attributes);
   auto shared_memory = helper.ReleaseSharedMemory();
   graph->HoldSharedMemoryObject(
@@ -486,6 +528,7 @@ c10::intrusive_ptr<CSCSamplingGraph> CSCSamplingGraph::CopyToSharedMemory(
   helper.WriteTorchTensor(indices_);
   helper.WriteTorchTensor(node_type_offset_);
   helper.WriteTorchTensor(type_per_edge_);
+  helper.WriteTorchTensorDict(node_attributes_);
   helper.WriteTorchTensorDict(edge_attributes_);
   helper.Flush();
   return BuildGraphFromSharedMemoryHelper(std::move(helper));
diff --git a/graphbolt/src/python_binding.cc b/graphbolt/src/python_binding.cc
index 06bcc9c98acc..4fc691fcf4c5 100644
--- a/graphbolt/src/python_binding.cc
+++ b/graphbolt/src/python_binding.cc
@@ -31,11 +31,13 @@ TORCH_LIBRARY(graphbolt, m) {
       .def("indices", &CSCSamplingGraph::Indices)
       .def("node_type_offset", &CSCSamplingGraph::NodeTypeOffset)
       .def("type_per_edge", &CSCSamplingGraph::TypePerEdge)
+      .def("node_attributes", &CSCSamplingGraph::NodeAttributes)
       .def("edge_attributes", &CSCSamplingGraph::EdgeAttributes)
       .def("set_csc_indptr", &CSCSamplingGraph::SetCSCIndptr)
       .def("set_indices", &CSCSamplingGraph::SetIndices)
       .def("set_node_type_offset", &CSCSamplingGraph::SetNodeTypeOffset)
       .def("set_type_per_edge", &CSCSamplingGraph::SetTypePerEdge)
+      .def("set_node_attributes", &CSCSamplingGraph::SetNodeAttributes)
       .def("set_edge_attributes", &CSCSamplingGraph::SetEdgeAttributes)
       .def("in_subgraph", &CSCSamplingGraph::InSubgraph)
       .def("sample_neighbors", &CSCSamplingGraph::SampleNeighbors)
diff --git a/python/dgl/dataloading/dist_dataloader.py b/python/dgl/dataloading/dist_dataloader.py
index dde8a2098cc2..8eb1e765ff78 100644
--- a/python/dgl/dataloading/dist_dataloader.py
+++ b/python/dgl/dataloading/dist_dataloader.py
@@ -167,7 +167,7 @@ class NodeCollator(Collator):
     :doc:`Minibatch Training Tutorials `.
     """

-    def __init__(self, g, nids, graph_sampler):
+    def __init__(self, g, nids, graph_sampler, use_graphbolt=False):
         self.g = g
         if not isinstance(nids, Mapping):
             assert (
@@ -177,6 +177,7 @@ def __init__(self, g, nids, graph_sampler):
         self.nids = utils.prepare_tensor_or_dict(g, nids, "nids")
         self._dataset = utils.maybe_flatten_dict(self.nids)
+        self._use_graphbolt = use_graphbolt

     @property
     def dataset(self):
@@ -213,7 +214,7 @@ def collate(self, items):
         items = utils.prepare_tensor_or_dict(self.g, items, "items")
         input_nodes, output_nodes, blocks = self.graph_sampler.sample_blocks(
-            self.g, items
+            self.g, items, use_graphbolt=self._use_graphbolt
         )
         return input_nodes, output_nodes, blocks

@@ -591,7 +592,7 @@ class DistNodeDataLoader(DistDataLoader):
     dgl.dataloading.DataLoader
     """

-    def __init__(self, g, nids, graph_sampler, device=None, **kwargs):
+    def __init__(
+        self, g, nids, graph_sampler, device=None, use_graphbolt=False, **kwargs
+    ):
         collator_kwargs = {}
         dataloader_kwargs = {}
         _collator_arglist = inspect.getfullargspec(NodeCollator).args
@@ -608,7 +609,7 @@ def __init__(self, g, nids, graph_sampler, device=None, **kwargs):
             ), "Only cpu is supported in the case of a DistGraph."
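The shared-memory path above can be exercised directly from Python. A minimal sketch, assuming `copy_to_shared_memory` and `load_from_shared_memory` behave as wired up in this diff; the shared-memory name and the default homogeneous type names (`"_N"`, `"_N:_E:_N"`) are assumptions:

```
import torch
import dgl.graphbolt as gb

indptr = torch.tensor([0, 1, 2])
indices = torch.tensor([1, 0])
g = gb.from_csc(
    indptr, indices, node_attributes={"_ID": torch.tensor([5, 7])}
)

# Server side: pin the structure, including node_attributes, in shared memory.
shm_g = g.copy_to_shared_memory("demo_graph")

# Client side (normally a different process): attach without copying.
meta = gb.GraphMetadata({"_N": 0}, {"_N:_E:_N": 0})
client_g = gb.load_from_shared_memory("demo_graph", meta)
assert torch.equal(client_g.node_attributes["_ID"], torch.tensor([5, 7]))
```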
# Distributed DataLoader currently does not support heterogeneous graphs # and does not copy features. Fallback to normal solution - self.collator = NodeCollator(g, nids, graph_sampler, **collator_kwargs) + self.collator = NodeCollator(g, nids, graph_sampler, use_graphbolt=use_graphbolt, **collator_kwargs) _remove_kwargs_dist(dataloader_kwargs) super().__init__( self.collator.dataset, diff --git a/python/dgl/dataloading/neighbor_sampler.py b/python/dgl/dataloading/neighbor_sampler.py index 603d39107cf1..837c5c33747c 100644 --- a/python/dgl/dataloading/neighbor_sampler.py +++ b/python/dgl/dataloading/neighbor_sampler.py @@ -148,7 +148,7 @@ def __init__( self.mapping = {} self.g = None - def sample_blocks(self, g, seed_nodes, exclude_eids=None): + def sample_blocks(self, g, seed_nodes, exclude_eids=None, use_graphbolt=False): output_nodes = seed_nodes blocks = [] # sample_neighbors_fused function requires multithreading to be more efficient @@ -191,10 +191,18 @@ def sample_blocks(self, g, seed_nodes, exclude_eids=None): replace=self.replace, output_device=self.output_device, exclude_edges=exclude_eids, + use_graphbolt=use_graphbolt, ) - eid = frontier.edata[EID] + # [Rui] For heterograph + DGL, it returns EIDs. + # For heterograph + GraphBolt, it returns {} as I didn't set it. + # For homogeneous graph + DGL, it returns EIDs. + # For homogeneous graph + GraphBolt, it crashed as no key[EID] exist. + eid = None + if EID in frontier.edata: + eid = frontier.edata[EID] block = to_block(frontier, seed_nodes) - block.edata[EID] = eid + if eid is not None: + block.edata[EID] = eid seed_nodes = block.srcdata[NID] blocks.insert(0, block) diff --git a/python/dgl/distributed/__init__.py b/python/dgl/distributed/__init__.py index 6b7d322841ea..2a3fe2b8a647 100644 --- a/python/dgl/distributed/__init__.py +++ b/python/dgl/distributed/__init__.py @@ -2,7 +2,13 @@ from . import optim from .dist_context import exit_client, initialize from .dist_dataloader import DistDataLoader -from .dist_graph import DistGraph, DistGraphServer, edge_split, node_split +from .dist_graph import ( + DistGraph, + DistGraphServer, + DistributedNeighborSampler, + edge_split, + node_split, +) from .dist_tensor import DistTensor from .graph_partition_book import GraphPartitionBook, PartitionPolicy from .graph_services import * diff --git a/python/dgl/distributed/dist_context.py b/python/dgl/distributed/dist_context.py index 51af0afeafb0..565c4c26bcb2 100644 --- a/python/dgl/distributed/dist_context.py +++ b/python/dgl/distributed/dist_context.py @@ -210,6 +210,7 @@ def initialize( max_queue_size=MAX_QUEUE_SIZE, net_type=None, num_worker_threads=1, + use_graphbolt=False, ): """Initialize DGL's distributed module @@ -231,6 +232,8 @@ def initialize( [Deprecated] Networking type, can be 'socket' only. num_worker_threads: int The number of OMP threads in each sampler process. + use_graphbolt: bool + Whether to use graphbolt for sampling. Note ---- @@ -270,6 +273,7 @@ def initialize( int(os.environ.get("DGL_NUM_CLIENT")), os.environ.get("DGL_CONF_PATH"), graph_format=formats, + use_graphbolt=use_graphbolt, ) serv.start() sys.exit() @@ -283,6 +287,8 @@ def initialize( is_standalone = ( os.environ.get("DGL_DIST_MODE", "standalone") == "standalone" ) + if use_graphbolt: + assert num_workers == 0, "GraphBolt does not support multiprocessing sampling." 
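Taken together, the changes above thread a single `use_graphbolt` flag through the whole trainer-side stack. A minimal sketch of that plumbing, assuming a running cluster; the file names, graph name, mask field, and backend are illustrative:

```
import dgl
import torch as th

# The flag is set once at initialize() (num_workers must stay 0), once on
# the DistGraph, and once on the data loader, which forwards it down to
# NodeCollator and ultimately to sample_blocks().
dgl.distributed.initialize("ip_config.txt", use_graphbolt=True)
th.distributed.init_process_group(backend="gloo")
g = dgl.distributed.DistGraph("ogbn-products", use_graphbolt=True)

gpb = g.get_partition_book()
train_nids = dgl.distributed.node_split(g.ndata["train_mask"], gpb)
sampler = dgl.dataloading.NeighborSampler([25, 10])
dataloader = dgl.dataloading.DistNodeDataLoader(
    g, train_nids, sampler, batch_size=1024, shuffle=True, use_graphbolt=True
)
```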
     if num_workers > 0 and not is_standalone:
         SAMPLER_POOL = CustomPool(
             num_workers,
diff --git a/python/dgl/distributed/dist_graph.py b/python/dgl/distributed/dist_graph.py
index 192293a80676..8a9c1e3b0d1c 100644
--- a/python/dgl/distributed/dist_graph.py
+++ b/python/dgl/distributed/dist_graph.py
@@ -1,14 +1,17 @@
 """Define distributed graph."""
 import gc
-
+import psutil
 import os
 from collections import namedtuple
 from collections.abc import MutableMapping

 import numpy as np

-from .. import backend as F, heterograph_index
+import torch
+from torch.utils.data import functional_datapipe
+
+from .. import backend as F, graphbolt as gb, heterograph_index
 from .._ffi.ndarray import empty_shared_mem
 from ..base import ALL, DGLError, EID, ETYPE, is_all, NID
 from ..convert import graph as dgl_graph, heterograph as dgl_heterograph
@@ -51,6 +54,31 @@
 INIT_GRAPH = 800001

+from torchdata.datapipes.iter import IterDataPipe
+
+
+@functional_datapipe("distributed_sample_neighbor")
+class DistributedNeighborSampler(gb.NeighborSampler):
+    """Distributed Neighbor Sampler.
+
+    This is a wrapper of :py:class:`dgl.graphbolt.NeighborSampler` to support
+    distributed training. It samples neighbors from a distributed graph.
+    """
+
+    def __init__(self, datapipe, graph, fanouts):
+        super().__init__(datapipe, graph._g, fanouts)
+        self.dist_graph = graph
+
+    def _sample_subgraphs(self, seeds):
+        sampled_graphs = []  # In DGLGraph or DGLHeteroGraph format.
+        for fanout in self.fanouts:
+            # fanout is a tensor. We need to convert it to an integer.
+            sampled_graphs.append(
+                self.dist_graph.sample_neighbors(
+                    seeds, fanout.item(), use_graphbolt=True
+                )
+            )
+        print("sampled_graphs: ", sampled_graphs)
+        return seeds, sampled_graphs
+
+
 class InitGraphRequest(rpc.Request):
     """Init graph on the backup servers.

@@ -60,18 +88,28 @@ class InitGraphRequest(rpc.Request):
     with shared memory.
     """

-    def __init__(self, graph_name):
+    def __init__(self, graph_name, use_graphbolt):
         self._graph_name = graph_name
+        self._use_graphbolt = use_graphbolt

     def __getstate__(self):
-        return self._graph_name
+        return (self._graph_name, self._use_graphbolt)

     def __setstate__(self, state):
-        self._graph_name = state
+        (self._graph_name, self._use_graphbolt) = state

     def process_request(self, server_state):
         if server_state.graph is None:
-            server_state.graph = _get_graph_from_shared_mem(self._graph_name)
+            gb_metadata = None
+            if self._use_graphbolt:
+                gpb = server_state.partition_book
+                gb_metadata = gb.GraphMetadata(
+                    {ntype: i for i, ntype in enumerate(gpb.ntypes)},
+                    {
+                        gb.etype_tuple_to_str(etype): i
+                        for i, etype in enumerate(gpb.canonical_etypes)
+                    },
+                )
+            server_state.graph = _get_graph_from_shared_mem(
+                self._graph_name, self._use_graphbolt, gb_metadata
+            )
         return InitGraphResponse(self._graph_name)

@@ -88,7 +126,10 @@ def __setstate__(self, state):
         self._graph_name = state


-def _copy_graph_to_shared_mem(g, graph_name, graph_format):
+def _copy_graph_to_shared_mem(g, graph_name, graph_format, use_graphbolt):
+    if use_graphbolt:
+        return g.copy_to_shared_memory(graph_name)
     new_g = g.shared_memory(graph_name, formats=graph_format)
     # We should share the node/edge data to the client explicitly instead of putting them
     # in the KVStore because some of the node/edge data may be duplicated.
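How the experimental datapipe registered above would be wired in, sketched under the assumption that `dist_g` is a `DistGraph` created with `use_graphbolt=True` and `item_set` is a `gb.ItemSet` over seed node IDs (both hypothetical here); iteration semantics follow `gb.NeighborSampler`:

```
import torch as th
import dgl.graphbolt as gb

datapipe = gb.ItemSampler(item_set, batch_size=1024, shuffle=True)
# Functional form registered by @functional_datapipe above; each element of
# `fanouts` is consumed as a scalar tensor via fanout.item().
datapipe = datapipe.distributed_sample_neighbor(
    dist_g, fanouts=th.tensor([25, 10])
)
```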
@@ -151,13 +192,17 @@ def _exist_shared_mem_array(graph_name, name): return exist_shared_mem_array(_get_edata_path(graph_name, name)) -def _get_graph_from_shared_mem(graph_name): +def _get_graph_from_shared_mem(graph_name, use_graphbolt, gb_metadata): """Get the graph from the DistGraph server. The DistGraph server puts the graph structure of the local partition in the shared memory. The client can access the graph structure and some metadata on nodes and edges directly through shared memory to reduce the overhead of data access. """ + if use_graphbolt: + g = gb.load_from_shared_memory(graph_name, gb_metadata) + return g + g, ntypes, etypes = heterograph_index.create_heterograph_from_shared_memory( graph_name ) @@ -330,6 +375,8 @@ class DistGraphServer(KVServer): Disable shared memory. graph_format : str or list of str The graph formats. + use_graphbolt : bool + Whether to use GraphBolt format. """ def __init__( @@ -341,6 +388,7 @@ def __init__( part_config, disable_shared_mem=False, graph_format=("csc", "coo"), + use_graphbolt=False, ): super(DistGraphServer, self).__init__( server_id=server_id, @@ -359,6 +407,7 @@ def __init__( self.client_g = None else: # Loading of node/edge_feats are deferred to lower the peak memory consumption. + prev_rss = psutil.Process(os.getpid()).memory_info().rss ( self.client_g, _, @@ -367,32 +416,39 @@ def __init__( graph_name, ntypes, etypes, - ) = load_partition(part_config, self.part_id, load_feats=False) - print("load " + graph_name) - # formatting dtype - # TODO(Rui) Formatting forcely is not a perfect solution. - # We'd better store all dtypes when mapping to shared memory - # and map back with original dtypes. - for k, dtype in RESERVED_FIELD_DTYPE.items(): - if k in self.client_g.ndata: - self.client_g.ndata[k] = F.astype( - self.client_g.ndata[k], dtype - ) - if k in self.client_g.edata: - self.client_g.edata[k] = F.astype( - self.client_g.edata[k], dtype - ) - # Create the graph formats specified the users. - print( - "Start to create specified graph formats which may take " - "non-trivial time." + ) = load_partition( + part_config, + self.part_id, + load_feats=False, + use_graphbolt=use_graphbolt, ) - self.client_g = self.client_g.formats(graph_format) - self.client_g.create_formats_() - print("Finished creating specified graph formats.") + if not use_graphbolt: + # formatting dtype + # TODO(Rui) Formatting forcely is not a perfect solution. + # We'd better store all dtypes when mapping to shared memory + # and map back with original dtypes. + for k, dtype in RESERVED_FIELD_DTYPE.items(): + if k in self.client_g.ndata: + self.client_g.ndata[k] = F.astype( + self.client_g.ndata[k], dtype + ) + if k in self.client_g.edata: + self.client_g.edata[k] = F.astype( + self.client_g.edata[k], dtype + ) + # Create the graph formats specified the users. + print( + "Start to create specified graph formats which may take " + "non-trivial time." + ) + self.client_g = self.client_g.formats(graph_format) + self.client_g.create_formats_() + print("Finished creating specified graph formats.") + new_rss = psutil.Process(os.getpid()).memory_info().rss + print(f"[Server_{self.server_id}] Loaded {graph_name} with use_graphbolt[{use_graphbolt}] in size[{(new_rss - prev_rss)/1024/1024} MB]") if not disable_shared_mem: self.client_g = _copy_graph_to_shared_mem( - self.client_g, graph_name, graph_format + self.client_g, graph_name, graph_format, use_graphbolt ) if not disable_shared_mem: @@ -542,9 +598,13 @@ class DistGraph: manually setting up servers and trainers. 
The setup is not fully tested yet. """ - def __init__(self, graph_name, gpb=None, part_config=None): + def __init__( + self, graph_name, gpb=None, part_config=None, use_graphbolt=False + ): self.graph_name = graph_name + self._use_graphbolt = use_graphbolt if os.environ.get("DGL_DIST_MODE", "standalone") == "standalone": + assert not use_graphbolt, "GraphBolt is not supported in standalone mode." assert ( part_config is not None ), "When running in the standalone model, the partition config file is required" @@ -554,7 +614,7 @@ def __init__(self, graph_name, gpb=None, part_config=None): ), "Distributed module is not initialized. Please call dgl.distributed.initialize." # Load graph partition data. g, node_feats, edge_feats, self._gpb, _, _, _ = load_partition( - part_config, 0 + part_config, 0, use_graphbolt=use_graphbolt ) assert ( self._gpb.num_partitions() == 1 @@ -582,10 +642,12 @@ def __init__(self, graph_name, gpb=None, part_config=None): self._client.map_shared_data(self._gpb) rpc.set_num_client(1) else: - self._init(gpb) + self._init(gpb, use_graphbolt) # Tell the backup servers to load the graph structure from shared memory. for server_id in range(self._client.num_servers): - rpc.send_request(server_id, InitGraphRequest(graph_name)) + rpc.send_request( + server_id, InitGraphRequest(graph_name, use_graphbolt) + ) for server_id in range(self._client.num_servers): rpc.recv_response() self._client.barrier() @@ -605,15 +667,21 @@ def __init__(self, graph_name, gpb=None, part_config=None): etype: i for i, etype in enumerate(self.canonical_etypes) } - def _init(self, gpb): + def _init(self, gpb, use_graphbolt): self._client = get_kvstore() assert ( self._client is not None ), "Distributed module is not initialized. Please call dgl.distributed.initialize." - self._g = _get_graph_from_shared_mem(self.graph_name) self._gpb = get_shared_mem_partition_book(self.graph_name) if self._gpb is None: self._gpb = gpb + gb_metadata = None + if use_graphbolt: + gb_metadata = gb.GraphMetadata( + {ntype: i for i, ntype in enumerate(self._gpb.ntypes)}, + {gb.etype_tuple_to_str(etype): i for i, etype in enumerate(self._gpb.canonical_etypes)}, + ) + self._g = _get_graph_from_shared_mem(self.graph_name, use_graphbolt, gb_metadata) self._client.map_shared_data(self._gpb) def _init_ndata_store(self): @@ -667,11 +735,11 @@ def _init_edata_store(self): self._edata_store[etype] = data def __getstate__(self): - return self.graph_name, self._gpb + return self.graph_name, self._gpb, self._use_graphbolt def __setstate__(self, state): - self.graph_name, gpb = state - self._init(gpb) + self.graph_name, gpb, self._use_graphbolt = state + self._init(gpb, self._use_graphbolt) self._init_ndata_store() self._init_edata_store() @@ -750,6 +818,12 @@ def idtype(self): int """ # TODO(da?): describe when self._g is None and idtype shouldn't be called. 
+ ''' + if isinstance(self.local_partition, DGLGraph): + return self.local_partition.idtype + else: + return self.local_partition.indices.dtype + ''' return F.int64 @property @@ -1357,6 +1431,7 @@ def sample_neighbors( replace=False, etype_sorted=True, output_device=None, + use_graphbolt=False, ): # pylint: disable=unused-argument """Sample neighbors from a distributed graph.""" @@ -1368,10 +1443,16 @@ def sample_neighbors( replace=replace, etype_sorted=etype_sorted, prob=prob, + use_graphbolt=use_graphbolt, ) else: frontier = graph_services.sample_neighbors( - self, seed_nodes, fanout, replace=replace, prob=prob + self, + seed_nodes, + fanout, + replace=replace, + prob=prob, + use_graphbolt=use_graphbolt, ) return frontier diff --git a/python/dgl/distributed/graph_services.py b/python/dgl/distributed/graph_services.py index 0a732ca0e7b0..a125af4b7a4f 100644 --- a/python/dgl/distributed/graph_services.py +++ b/python/dgl/distributed/graph_services.py @@ -3,8 +3,10 @@ import numpy as np -from .. import backend as F -from ..base import EID, NID +import torch + +from .. import backend as F, graphbolt as gb +from ..base import dgl_warning, DGLError, EID, NID from ..convert import graph, heterograph from ..sampling import ( sample_etype_neighbors as local_sample_etype_neighbors, @@ -66,7 +68,8 @@ def __getstate__(self): def _sample_neighbors( - local_g, partition_book, seed_nodes, fan_out, edge_dir, prob, replace + local_g, partition_book, seed_nodes, fan_out, edge_dir, prob, replace, + use_graphbolt=False ): """Sample from local partition. @@ -76,23 +79,37 @@ def _sample_neighbors( and edge IDs. """ local_ids = partition_book.nid2localnid(seed_nodes, partition_book.partid) - local_ids = F.astype(local_ids, local_g.idtype) - # local_ids = self.seed_nodes - sampled_graph = local_sample_neighbors( - local_g, - local_ids, - fan_out, - edge_dir, - prob, - replace, - _dist_training=True, - ) - global_nid_mapping = local_g.ndata[NID] - src, dst = sampled_graph.edges() + if not use_graphbolt: + local_ids = F.astype(local_ids, local_g.idtype) + local_src, local_dst, local_eids = None, None, None + if use_graphbolt: + local_src, local_dst, local_eids = gb.NeighborSampler.distributed_sample_neighbor( + local_g, local_ids, fan_out + ) + assert local_src is not None and local_dst is not None, ( + "GraphBolt NeighborSampler.distributed_sample_neighbor() failed." + ) + else: + sampled_graph = local_sample_neighbors( + local_g, + local_ids, + fan_out, + edge_dir, + prob, + replace, + _dist_training=True, + ) + local_src, local_dst = sampled_graph.edges() + local_eids = sampled_graph.edata[EID] + if use_graphbolt: + global_nid_mapping = local_g.node_attributes[NID] + global_eids = local_eids + else: + global_nid_mapping = local_g.ndata[NID] + global_eids = F.gather_row(local_g.edata[EID], local_eids) global_src, global_dst = F.gather_row( - global_nid_mapping, src - ), F.gather_row(global_nid_mapping, dst) - global_eids = F.gather_row(local_g.edata[EID], sampled_graph.edata[EID]) + global_nid_mapping, local_src + ), F.gather_row(global_nid_mapping, local_dst) return global_src, global_dst, global_eids @@ -106,6 +123,7 @@ def _sample_etype_neighbors( prob, replace, etype_sorted=False, + use_graphbolt=False, ): """Sample from local partition. @@ -115,25 +133,41 @@ def _sample_etype_neighbors( and edge IDs. 
""" local_ids = partition_book.nid2localnid(seed_nodes, partition_book.partid) - local_ids = F.astype(local_ids, local_g.idtype) - - sampled_graph = local_sample_etype_neighbors( - local_g, - local_ids, - etype_offset, - fan_out, - edge_dir, - prob, - replace, - etype_sorted=etype_sorted, - _dist_training=True, - ) - global_nid_mapping = local_g.ndata[NID] - src, dst = sampled_graph.edges() + if not use_graphbolt: + local_ids = F.astype(local_ids, local_g.idtype) + local_src, local_dst, local_eids = None, None, None + if use_graphbolt: + local_src, local_dst, local_eids = gb.NeighborSampler.distributed_sample_neighbor( + local_g, local_ids, fan_out + ) + assert local_src is not None and local_dst is not None and local_eids is not None, ( + "GraphBolt NeighborSampler.distributed_sample_neighbor() failed." + ) + else: + fan_out = F.astype(fan_out, local_g.idtype) + sampled_graph = local_sample_etype_neighbors( + local_g, + local_ids, + etype_offset, + fan_out, + edge_dir, + prob, + replace, + etype_sorted=etype_sorted, + _dist_training=True, + ) + local_src, local_dst = sampled_graph.edges() + local_eids = sampled_graph.edata[EID] + if use_graphbolt: + global_nid_mapping = local_g.node_attributes[NID] + global_eids = local_eids + else: + global_nid_mapping = local_g.ndata[NID] + global_eids = F.gather_row(local_g.edata[EID], local_eids) global_src, global_dst = F.gather_row( - global_nid_mapping, src - ), F.gather_row(global_nid_mapping, dst) - global_eids = F.gather_row(local_g.edata[EID], sampled_graph.edata[EID]) + global_nid_mapping, local_src + ), F.gather_row(global_nid_mapping, local_dst) + return global_src, global_dst, global_eids @@ -212,12 +246,13 @@ def _in_subgraph(local_g, partition_book, seed_nodes): class SamplingRequest(Request): """Sampling Request""" - def __init__(self, nodes, fan_out, edge_dir="in", prob=None, replace=False): + def __init__(self, nodes, fan_out, edge_dir="in", prob=None, replace=False, use_graphbolt=False): self.seed_nodes = nodes self.edge_dir = edge_dir self.prob = prob self.replace = replace self.fan_out = fan_out + self.use_graphbolt = use_graphbolt def __setstate__(self, state): ( @@ -226,6 +261,7 @@ def __setstate__(self, state): self.prob, self.replace, self.fan_out, + self.use_graphbolt, ) = state def __getstate__(self): @@ -235,6 +271,7 @@ def __getstate__(self): self.prob, self.replace, self.fan_out, + self.use_graphbolt, ) def process_request(self, server_state): @@ -253,6 +290,7 @@ def process_request(self, server_state): self.edge_dir, prob, self.replace, + use_graphbolt=self.use_graphbolt, ) return SubgraphResponse(global_src, global_dst, global_eids) @@ -268,6 +306,7 @@ def __init__( prob=None, replace=False, etype_sorted=True, + use_graphbolt=False, ): self.seed_nodes = nodes self.edge_dir = edge_dir @@ -275,6 +314,7 @@ def __init__( self.replace = replace self.fan_out = fan_out self.etype_sorted = etype_sorted + self.use_graphbolt = use_graphbolt def __setstate__(self, state): ( @@ -284,6 +324,7 @@ def __setstate__(self, state): self.replace, self.fan_out, self.etype_sorted, + self.use_graphbolt, ) = state def __getstate__(self): @@ -294,6 +335,7 @@ def __getstate__(self): self.replace, self.fan_out, self.etype_sorted, + self.use_graphbolt, ) def process_request(self, server_state): @@ -319,6 +361,7 @@ def process_request(self, server_state): probs, self.replace, self.etype_sorted, + use_graphbolt=self.use_graphbolt, ) return SubgraphResponse(global_src, global_dst, global_eids) @@ -449,13 +492,17 @@ def merge_graphs(res_list, 
num_nodes):
             eids.append(res.global_eids)
         src_tensor = F.cat(srcs, 0)
         dst_tensor = F.cat(dsts, 0)
-        eid_tensor = F.cat(eids, 0)
+        if eids[0] is None:
+            eid_tensor = None
+        else:
+            eid_tensor = F.cat(eids, 0)
     else:
         src_tensor = res_list[0].global_src
         dst_tensor = res_list[0].global_dst
         eid_tensor = res_list[0].global_eids
     g = graph((src_tensor, dst_tensor), num_nodes=num_nodes)
-    g.edata[EID] = eid_tensor
+    if eid_tensor is not None:
+        g.edata[EID] = eid_tensor
     return g
@@ -464,7 +511,7 @@ def merge_graphs(res_list, num_nodes):
     )

-def _distributed_access(g, nodes, issue_remote_req, local_access):
+def _distributed_access(
+    g, nodes, issue_remote_req, local_access, use_graphbolt=False
+):
     """A routine that fetches the local neighborhood of nodes from the distributed graph.

     The local neighborhood of some nodes is stored on the local machine and the other
@@ -483,6 +530,8 @@ def _distributed_access(g, nodes, issue_remote_req, local_access):
         The function that issues requests to access remote data.
     local_access : callable
         The function that reads data on the local machine.
+    use_graphbolt : bool
+        Whether to use GraphBolt.

     Returns
     -------
@@ -491,7 +540,8 @@
     """
     req_list = []
     partition_book = g.get_partition_book()
-    nodes = toindex(nodes).tousertensor()
+    if not isinstance(nodes, torch.Tensor):
+        nodes = toindex(nodes).tousertensor()
     partition_id = partition_book.nid2partid(nodes)
     local_nids = None
     for pid in range(partition_book.num_partitions()):
@@ -526,6 +576,10 @@ def _distributed_access(g, nodes, issue_remote_req, local_access):
         res_list.extend(results)

     sampled_graph = merge_graphs(res_list, g.num_nodes())
+
+    # [TODO][Rui] For now, g.idtype is always int64 while the underlying
+    # CSCSamplingGraph could be int32.
+    if use_graphbolt:
+        sampled_graph = sampled_graph.long()
     return sampled_graph

@@ -570,6 +624,64 @@ def _frontier_to_heterogeneous_graph(g, frontier, gpb):
     return hg


+def _frontier_to_heterogeneous_graph_gb(g, frontier, gpb):
+    # We need to handle empty frontiers correctly.
+    if frontier.num_edges() == 0:
+        data_dict = {
+            etype: (np.zeros(0), np.zeros(0)) for etype in g.canonical_etypes
+        }
+        return heterograph(
+            data_dict,
+            {ntype: g.num_nodes(ntype) for ntype in g.ntypes},
+            idtype=g.idtype,
+        )
+
+    src, dst = frontier.edges()
+    src, dst = F.astype(src, g.idtype), F.astype(dst, g.idtype)
+    if gpb.is_homogeneous:
+        assert frontier.edata[EID] is None, (
+            "For homogeneous graph in GraphBolt, EID field should be None."
+        )
+        etype_ids = torch.zeros(src.shape[0], dtype=torch.int32)
+        raise RuntimeError("Should not arrive here.")
+    else:
+        # For GraphBolt, we store ETYPE into EID field.
+        etype_ids = frontier.edata[EID]
+    etype_ids, idx = F.sort_1d(etype_ids)
+    src, dst = F.gather_row(src, idx), F.gather_row(dst, idx)
+    src_ntype_ids, src = gpb.map_to_per_ntype(src)
+    dst_ntype_ids, dst = gpb.map_to_per_ntype(dst)
+
+    # [Rui] `g.get_ntype_id()` crashed due to
+    # 'DistGraph' object has no attribute '_ntype_map' if `num_samplers>0`.
+    # Is `DistGraph` not sharable between processes?
+
+    data_dict = dict()
+    for etid, etype in enumerate(g.canonical_etypes):
+        src_ntype, _, dst_ntype = etype
+        src_ntype_id = g.get_ntype_id(src_ntype)
+        dst_ntype_id = g.get_ntype_id(dst_ntype)
+        type_idx = etype_ids == etid
+        if F.sum(type_idx, 0) > 0:
+            data_dict[etype] = (
+                F.boolean_mask(src, type_idx),
+                F.boolean_mask(dst, type_idx),
+            )
+            assert torch.all(src_ntype_id == src_ntype_ids[type_idx]), (
+                "source ntype is not expected."
+            )
+            assert torch.all(dst_ntype_id == dst_ntype_ids[type_idx]), (
+                "destination ntype is not expected."
+            )
+    hg = heterograph(
+        data_dict,
+        {ntype: g.num_nodes(ntype) for ntype in g.ntypes},
+        idtype=g.idtype,
+    )
+
+    return hg
+
+
 def sample_etype_neighbors(
     g,
     nodes,
@@ -578,6 +690,7 @@
     prob=None,
     replace=False,
     etype_sorted=True,
+    use_graphbolt=False,
 ):
     """Sample from the neighbors of the given nodes from a distributed graph.

@@ -631,6 +744,8 @@
         neighbors are sampled. If fanout == -1, all neighbors are collected.
     etype_sorted : bool, optional
         Indicates whether etypes are sorted.
+    use_graphbolt : bool, optional
+        Whether to use GraphBolt to sample neighbors.

     Returns
     -------
@@ -638,7 +753,7 @@
         A sampled subgraph containing only the sampled neighboring edges. It is on CPU.
     """
     if isinstance(fanout, int):
-        fanout = F.full_1d(len(g.canonical_etypes), fanout, F.int64, F.cpu())
+        fanout = F.full_1d(len(g.canonical_etypes), fanout, F.int32, F.cpu())
     else:
         etype_ids = {etype: i for i, etype in enumerate(g.canonical_etypes)}
         fanout_array = [None] * len(g.canonical_etypes)
@@ -688,6 +803,7 @@ def issue_remote_req(node_ids):
             prob=_prob,
             replace=replace,
             etype_sorted=etype_sorted,
+            use_graphbolt=use_graphbolt,
         )

     def local_access(local_g, partition_book, local_nids):
@@ -712,16 +828,28 @@ def local_access(local_g, partition_book, local_nids):
             _prob,
             replace,
             etype_sorted=etype_sorted,
+            use_graphbolt=use_graphbolt,
         )

-    frontier = _distributed_access(g, nodes, issue_remote_req, local_access)
+    frontier = _distributed_access(
+        g, nodes, issue_remote_req, local_access, use_graphbolt=use_graphbolt
+    )
     if not gpb.is_homogeneous:
-        return _frontier_to_heterogeneous_graph(g, frontier, gpb)
+        if use_graphbolt:
+            return _frontier_to_heterogeneous_graph_gb(g, frontier, gpb)
+        else:
+            return _frontier_to_heterogeneous_graph(g, frontier, gpb)
     else:
         return frontier


-def sample_neighbors(g, nodes, fanout, edge_dir="in", prob=None, replace=False):
+def sample_neighbors(
+    g,
+    nodes,
+    fanout,
+    edge_dir="in",
+    prob=None,
+    replace=False,
+    use_graphbolt=False,
+):
     """Sample from the neighbors of the given nodes from a distributed graph.

     For each node, a number of inbound (or outbound when ``edge_dir == 'out'``) edges
@@ -795,7 +923,8 @@ def issue_remote_req(node_ids):
         else:
             _prob = None
         return SamplingRequest(
-            node_ids, fanout, edge_dir=edge_dir, prob=_prob, replace=replace
+            node_ids, fanout, edge_dir=edge_dir, prob=_prob, replace=replace,
+            use_graphbolt=use_graphbolt,
         )

     def local_access(local_g, partition_book, local_nids):
@@ -809,11 +938,15 @@ def local_access(local_g, partition_book, local_nids):
             edge_dir,
             _prob,
             replace,
+            use_graphbolt=use_graphbolt,
         )

-    frontier = _distributed_access(g, nodes, issue_remote_req, local_access)
+    frontier = _distributed_access(
+        g, nodes, issue_remote_req, local_access, use_graphbolt=use_graphbolt
+    )
     if not gpb.is_homogeneous:
-        return _frontier_to_heterogeneous_graph(g, frontier, gpb)
+        if use_graphbolt:
+            return _frontier_to_heterogeneous_graph_gb(g, frontier, gpb)
+        else:
+            return _frontier_to_heterogeneous_graph(g, frontier, gpb)
     else:
         return frontier
diff --git a/python/dgl/distributed/partition.py b/python/dgl/distributed/partition.py
index da0ab445690e..83e2adde1760 100644
--- a/python/dgl/distributed/partition.py
+++ b/python/dgl/distributed/partition.py
@@ -5,12 +5,16 @@
 import os
 import time

+from copy import deepcopy
+
 import numpy as np
+import torch

-from ..
import backend as F -from ..base import DGLError, EID, ETYPE, NID, NTYPE +from .. import backend as F, graphbolt as gb +from ..base import dgl_warning, DGLError, EID, ETYPE, NID, NTYPE from ..convert import to_homogeneous from ..data.utils import load_graphs, load_tensors, save_graphs, save_tensors +from ..heterograph import DGLGraph from ..partition import ( get_peak_mem, metis_partition_assignment, @@ -140,7 +144,7 @@ def _get_part_ranges(id_ranges): return res -def load_partition(part_config, part_id, load_feats=True): +def load_partition(part_config, part_id, load_feats=True, use_graphbolt=False): """Load data of a partition from the data path. A partition data includes a graph structure of the partition, a dict of node tensors, @@ -162,6 +166,8 @@ def load_partition(part_config, part_id, load_feats=True): load_feats : bool, optional Whether to load node/edge feats. If False, the returned node/edge feature dictionaries will be empty. Default: True. + use_graphbolt : bool, optional + Whether to load the partition graph structure in the GraphBolt format. Returns ------- @@ -189,10 +195,13 @@ def load_partition(part_config, part_id, load_feats=True): "part-{}".format(part_id) in part_metadata ), "part-{} does not exist".format(part_id) part_files = part_metadata["part-{}".format(part_id)] + part_graph_field = "part_graph" + if use_graphbolt: + part_graph_field = "gb_part_graph" assert ( - "part_graph" in part_files - ), "the partition does not contain graph structure." - partition_path = relative_to_config(part_files["part_graph"]) + part_graph_field in part_files + ), f"the partition does not contain graph structure: {part_graph_field}." + partition_path = relative_to_config(part_files[part_graph_field]) logging.info( "Start to load partition from %s which is " "%d bytes. It may take non-trivial " @@ -200,20 +209,35 @@ def load_partition(part_config, part_id, load_feats=True): partition_path, os.path.getsize(partition_path), ) - graph = load_graphs(partition_path)[0][0] - logging.info("Finished loading partition.") - - assert ( - NID in graph.ndata - ), "the partition graph should contain node mapping to global node ID" - assert ( - EID in graph.edata - ), "the partition graph should contain edge mapping to global edge ID" + graph = None + if partition_path.endswith(".tar"): + assert use_graphbolt, ( + "The partition is stored in the GraphBolt format. " + "Please set use_graphbolt=True to load it." + ) + graph = gb.load_csc_sampling_graph(partition_path) + assert isinstance(graph, gb.CSCSamplingGraph) + else: + assert not use_graphbolt, ( + "The partition is stored in the DGL format. " + "Please set use_graphbolt=False to load it." + ) + graph = load_graphs(partition_path)[0][0] + assert isinstance(graph, DGLGraph) + logging.info(f"Finished loading partition from {partition_path}") + + if isinstance(graph, DGLGraph): + assert ( + NID in graph.ndata + ), "the partition graph should contain node mapping to global node ID" + assert ( + EID in graph.edata + ), "the partition graph should contain edge mapping to global edge ID" gpb, graph_name, ntypes, etypes = load_partition_book(part_config, part_id) ntypes_list = list(ntypes.keys()) etypes_list = list(etypes.keys()) - if "DGL_DIST_DEBUG" in os.environ: + if "DGL_DIST_DEBUG" in os.environ and isinstance(graph, DGLGraph): for ntype in ntypes: ntype_id = ntypes[ntype] # graph.ndata[NID] are global homogeneous node IDs. 
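After conversion, a partition config carries both structure entries side by side, and `load_partition()` picks one based on `use_graphbolt`. A quick way to inspect that, with the dataset path following the README example above:

```
import json

with open("data/ogbn-products.json") as f:
    meta = json.load(f)
print(meta["part-0"]["part_graph"])     # e.g. "part0/graph.dgl"
print(meta["part-0"]["gb_part_graph"])  # e.g. "part0/csc_sampling_graph.tar"
```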
@@ -546,6 +570,8 @@ def partition_graph(
     num_trainers_per_machine=1,
     objtype="cut",
     graph_formats=None,
+    use_graphbolt=False,
+    gb_save_all=False,
 ):
     """Partition a graph for distributed training and store the partitions on files.

@@ -719,6 +745,10 @@
         ``csc`` and ``csr``. If not specified, save one format only according to
         what format is available. If multiple formats are available, selection
         priority from high to low is ``coo``, ``csc``, ``csr``.
+    use_graphbolt : bool, optional
+        Whether to convert the partitioned graph to the GraphBolt format.
+    gb_save_all : bool, optional
+        Whether to save all data into `CSCSamplingGraph`.

     Returns
     -------
@@ -1206,7 +1236,8 @@ def get_homogeneous(g, balance_ntypes):
             )
         )

-    _dump_part_config(f"{out_path}/{graph_name}.json", part_metadata)
+    part_config = os.path.join(out_path, graph_name + ".json")
+    _dump_part_config(part_config, part_metadata)

     num_cuts = sim_g.num_edges() - tot_num_inner_edges
     if num_parts == 1:
@@ -1217,11 +1248,24 @@
             )
         )

+    if use_graphbolt:
+        convert_dgl_partition_to_csc_sampling_graph(
+            part_config,
+            store_all=gb_save_all,
+        )
+        print("Converted to GraphBolt format.")
+
     if return_mapping:
         return orig_nids, orig_eids


-def convert_dgl_partition_to_csc_sampling_graph(part_config):
+def convert_dgl_partition_to_csc_sampling_graph(
+    part_config,
+    store_eids=False,
+    graph_file_name=None,
+    part_config_file_name=None,
+    store_all=False,
+):
     """Convert partitions of dgl to CSCSamplingGraph of GraphBolt.

     This API converts `DGLGraph` partitions to `CSCSamplingGraph` which is
@@ -1231,16 +1275,48 @@
     In the near future, partitions are supposed to be saved as
     `CSCSamplingGraph` directly. At that time, this API should be deprecated.

+    For a homogeneous graph, the attributes below are required:
+        dgl.NID: original node IDs. Saved into `node_attributes`.
+
+    For a heterogeneous graph, the attributes below are required:
+        dgl.NID: original node IDs. Saved into `node_attributes`.
+        dgl.ETYPE: original edge types. Saved into `type_per_edge`.
+
+    If `store_eids` is True, the attributes below are additionally saved:
+        dgl.EID: original edge IDs. Saved into `edge_attributes`.
+
     Parameters
     ----------
     part_config : str
         The partition configuration JSON file.
+    store_eids : bool, optional
+        Whether to store original edge IDs in the new graph.
+    graph_file_name : str, optional
+        The name of the new graph file. If not provided, the name will be
+        `csc_sampling_graph.tar`.
+    part_config_file_name : str, optional
+        The name of the new partition configuration file. If not provided, the
+        name will be the passed-in one.
+    store_all : bool, optional
+        Whether to store all attributes in the new graph. If False, only
+        required attributes will be stored.
     """
     # As only this function requires GraphBolt for now, let's import here.
     from .. import graphbolt

+    if store_all:
+        dgl_warning(
+            "Storing all attributes in the new graph is not recommended."
+        )
+    if store_eids:
+        dgl_warning("Storing edge IDs is not supported yet.")
+
     part_meta = _load_part_config(part_config)
+    new_part_meta = deepcopy(part_meta)
     num_parts = part_meta["num_parts"]
+    p_ntypes = part_meta["ntypes"]
+    p_etypes = part_meta["etypes"]
+    is_homo = (
+        len(p_ntypes) == 1
+        and DEFAULT_NTYPE in p_ntypes
+        and len(p_etypes) == 1
+        and DEFAULT_ETYPE in p_etypes
+    )

     # Utility functions.
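With the new flag, partitioning and conversion collapse into one call, as sketched below; `g` is any DGLGraph, and names and paths are illustrative:

```
from dgl.distributed import load_partition, partition_graph

partition_graph(g, "demo", num_parts=2, out_path="data", use_graphbolt=True)
part_g, _, _, gpb, _, ntypes, etypes = load_partition(
    "data/demo.json", 0, load_feats=False, use_graphbolt=True
)
print(type(part_g))  # dgl.graphbolt.CSCSamplingGraph
```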
    def init_type_per_edge(graph, gpb):
@@ -1252,24 +1328,138 @@ def init_type_per_edge(graph, gpb):
         graph, _, _, gpb, _, _, _ = load_partition(
             part_config, part_id, load_feats=False
         )
-        # Construct GraphMetadata.
+        # [Rui] We can always treat a partitioned graph as a homogeneous
+        # graph. Then we don't need metadata at all. What's more, a
+        # heterogeneous graph requires `node_type_offset` to be set correctly
+        # and nodes to be sorted according to their types. This is not
+        # guaranteed in the current partitioned graph.
         _, _, ntypes, etypes = load_partition_book(part_config, part_id)
-        metadata = graphbolt.GraphMetadata(ntypes, etypes)
+        metadata = None
+        if not is_homo:
+            # Construct GraphMetadata.
+            c_etypes = {
+                graphbolt.etype_tuple_to_str(etype): v
+                for etype, v in etypes.items()
+            }
+            metadata = graphbolt.GraphMetadata(ntypes, c_etypes)
         # Obtain CSC indptr and indices.
-        indptr, indices, _ = graph.adj().csc()
+        indptr, indices, edge_ids = graph.adj_tensors("csc")  # graph.adj().csc()
         # Initialize type per edge.
-        type_per_edge = init_type_per_edge(graph, gpb)
-        type_per_edge = type_per_edge.to(RESERVED_FIELD_DTYPE[ETYPE])
+        type_per_edge = None
+        if not is_homo:
+            type_per_edge = init_type_per_edge(graph, gpb)
+            type_per_edge = type_per_edge[edge_ids]
+            # Sanity check.
+            assert len(type_per_edge) == graph.num_edges()
+
+        # Original node IDs. [Required]
+        node_attributes = None
         # Sanity check.
-        assert len(type_per_edge) == graph.num_edges()
+        assert len(graph.ndata[NID]) == graph.num_nodes()
+        node_attributes = {
+            NID: graph.ndata[NID]
+        }
+
+        # Original edge IDs. [Optional]
+        edge_attributes = None
+        if store_eids or store_all:
+            # Sanity check.
+            assert len(graph.edata[EID]) == graph.num_edges()
+            edge_attributes = {
+                EID: graph.edata[EID][edge_ids]
+            }
+
+        # Storing NTYPE is mainly for debug.
+        if store_all and (not is_homo):
+            node_attributes[NTYPE] = graph.ndata[NTYPE]
+
+        # Data type formatting before saving.
+        num_nodes = part_meta["num_nodes"]
+        num_edges = part_meta["num_edges"]
+        local_num_nodes = graph.num_nodes()
+        local_num_edges = graph.num_edges()
+        # 1. csc matrix. [Required]
+        if local_num_nodes < torch.iinfo(torch.int32).max:
+            indices = indices.to(torch.int32)
+        else:
+            indices = indices.to(torch.int64)
+        if local_num_edges < torch.iinfo(torch.int32).max:
+            indptr = indptr.to(torch.int32)
+        else:
+            indptr = indptr.to(torch.int64)
+        # 2. NID. [Required]
+        assert node_attributes is not None and NID in node_attributes, (
+            "NID is required for GraphBolt."
+        )
+        if num_nodes < torch.iinfo(torch.int32).max:
+            node_attributes[NID] = node_attributes[NID].to(torch.int32)
+        else:
+            node_attributes[NID] = node_attributes[NID].to(torch.int64)
+        # 3. ETYPE. [Required for heterograph]
+        if type_per_edge is not None:
+            if len(etypes) < torch.iinfo(torch.int8).max:
+                type_per_edge = type_per_edge.to(torch.int8)
+            elif len(etypes) < torch.iinfo(torch.int16).max:
+                type_per_edge = type_per_edge.to(torch.int16)
+            elif len(etypes) < torch.iinfo(torch.int32).max:
+                type_per_edge = type_per_edge.to(torch.int32)
+            else:
+                type_per_edge = type_per_edge.to(torch.int64)
+        # 4. NTYPE.
[Optional] + if node_attributes is not None and NTYPE in node_attributes: + if len(ntypes) < torch.iinfo(torch.int8).max: + node_attributes[NTYPE] = node_attributes[NTYPE].to( + torch.int8 + ) + elif len(ntypes) < torch.iinfo(torch.int16).max: + node_attributes[NTYPE] = node_attributes[NTYPE].to( + torch.int16 + ) + elif len(ntypes) < torch.iinfo(torch.int32).max: + node_attributes[NTYPE] = node_attributes[NTYPE].to( + torch.int32 + ) + else: + node_attributes[NTYPE] = node_attributes[NTYPE].to( + torch.int64 + ) + # 5. EID. [Optional] + if edge_attributes is not None and EID in edge_attributes: + if num_edges < torch.iinfo(torch.int32).max: + edge_attributes[EID] = edge_attributes[EID].to(torch.int32) + else: + edge_attributes[EID] = edge_attributes[EID].to(torch.int64) + + # Construct CSCSamplingGraph csc_graph = graphbolt.from_csc( - indptr, indices, None, type_per_edge, metadata=metadata + indptr, + indices, + node_type_offset=None, + type_per_edge=type_per_edge, + node_attributes=node_attributes, + edge_attributes=edge_attributes, + metadata=metadata, ) orig_graph_path = os.path.join( os.path.dirname(part_config), part_meta[f"part-{part_id}"]["part_graph"], ) + if graph_file_name is None: + graph_file_name = "csc_sampling_graph.tar" csc_graph_path = os.path.join( - os.path.dirname(orig_graph_path), "csc_sampling_graph.tar" + os.path.dirname(orig_graph_path), graph_file_name ) graphbolt.save_csc_sampling_graph(csc_graph, csc_graph_path) + + # Update graph path. + new_part_meta[f"part-{part_id}"]["gb_part_graph"] = os.path.relpath( + csc_graph_path, os.path.dirname(part_config) + ) + + # Update partition config. + if part_config_file_name is None: + part_config_file_name = os.path.basename(part_config) + new_part_config = os.path.join( + os.path.dirname(part_config), part_config_file_name + ) + _dump_part_config(new_part_config, new_part_meta) diff --git a/python/dgl/distributed/server_state.py b/python/dgl/distributed/server_state.py index 0eac8d40c670..3b2dde3e2032 100644 --- a/python/dgl/distributed/server_state.py +++ b/python/dgl/distributed/server_state.py @@ -30,7 +30,7 @@ class ServerState: ---------- kv_store : KVServer reference for KVServer - graph : DGLGraph + graph : DGLGraph or CSCSamplingGraph Graph structure of one partition total_num_nodes : int Total number of nodes diff --git a/python/dgl/graphbolt/base.py b/python/dgl/graphbolt/base.py index d460bb2332fc..42a2caa5e97a 100644 --- a/python/dgl/graphbolt/base.py +++ b/python/dgl/graphbolt/base.py @@ -8,6 +8,7 @@ __all__ = [ "CANONICAL_ETYPE_DELIMITER", + "ORIGINAL_NODE_ID", "ORIGINAL_EDGE_ID", "etype_str_to_tuple", "etype_tuple_to_str", @@ -16,6 +17,7 @@ ] CANONICAL_ETYPE_DELIMITER = ":" +ORIGINAL_NODE_ID = "_ORIGINAL_NODE_ID" ORIGINAL_EDGE_ID = "_ORIGINAL_EDGE_ID" diff --git a/python/dgl/graphbolt/impl/csc_sampling_graph.py b/python/dgl/graphbolt/impl/csc_sampling_graph.py index af9550f77b95..95748607953f 100644 --- a/python/dgl/graphbolt/impl/csc_sampling_graph.py +++ b/python/dgl/graphbolt/impl/csc_sampling_graph.py @@ -10,10 +10,15 @@ from dgl.utils import recursive_apply -from ...base import EID, ETYPE +from ...base import EID, ETYPE, NID from ...convert import to_homogeneous from ...heterograph import DGLGraph -from ..base import etype_str_to_tuple, etype_tuple_to_str, ORIGINAL_EDGE_ID +from ..base import ( + etype_str_to_tuple, + etype_tuple_to_str, + ORIGINAL_EDGE_ID, + ORIGINAL_NODE_ID, +) from ..sampling_graph import SamplingGraph from .sampled_subgraph_impl import SampledSubgraphImpl @@ -251,6 +256,27 @@ def 
type_per_edge(self, type_per_edge: Optional[torch.Tensor]) -> None: """Sets the edge type tensor if present.""" self._c_csc_graph.set_type_per_edge(type_per_edge) + @property + def node_attributes(self) -> Optional[Dict[str, torch.Tensor]]: + """Returns the node attributes dictionary. + + Returns + ------- + torch.Tensor or None + If present, returns a dictionary of node attributes. Each key + represents the attribute's name, while the corresponding value + holds the attribute's specific value. The length of each value + should match the total number of nodes." + """ + return self._c_csc_graph.node_attributes() + + @node_attributes.setter + def node_attributes( + self, node_attributes: Optional[Dict[str, torch.Tensor]] + ) -> None: + """Sets the node attributes dictionary.""" + self._c_csc_graph.set_node_attributes(node_attributes) + @property def edge_attributes(self) -> Optional[Dict[str, torch.Tensor]]: """Returns the edge attributes dictionary. @@ -311,6 +337,7 @@ def in_subgraph(self, nodes: torch.Tensor) -> torch.ScriptObject: def _convert_to_sampled_subgraph( self, C_sampled_subgraph: torch.ScriptObject, + keep_homo: bool = False, ): """An internal function used to convert a fused homogeneous sampled subgraph to general struct 'SampledSubgraphImpl'.""" @@ -321,6 +348,7 @@ def _convert_to_sampled_subgraph( column_num ) row = C_sampled_subgraph.indices + type_per_edge = C_sampled_subgraph.type_per_edge original_edge_ids = C_sampled_subgraph.original_edge_ids has_original_eids = ( @@ -331,7 +359,8 @@ def _convert_to_sampled_subgraph( original_edge_ids = self.edge_attributes[ORIGINAL_EDGE_ID][ original_edge_ids ] - if type_per_edge is None: + + if type_per_edge is None or keep_homo: # The sampled graph is already a homogeneous graph. node_pairs = (row, column) else: @@ -354,7 +383,8 @@ def _convert_to_sampled_subgraph( if has_original_eids: original_edge_ids = original_hetero_edge_ids return SampledSubgraphImpl( - node_pairs=node_pairs, original_edge_ids=original_edge_ids + node_pairs=node_pairs, original_edge_ids=original_edge_ids, + original_etype_ids=type_per_edge, ) def _convert_to_homogeneous_nodes(self, nodes): @@ -370,6 +400,8 @@ def sample_neighbors( fanouts: torch.Tensor, replace: bool = False, probs_name: Optional[str] = None, + keep_homo: bool = False, + return_eids: bool = False, ) -> SampledSubgraphImpl: """Sample neighboring edges of the given nodes and return the induced subgraph. @@ -438,10 +470,12 @@ def sample_neighbors( nodes = self._convert_to_homogeneous_nodes(nodes) C_sampled_subgraph = self._sample_neighbors( - nodes, fanouts, replace, probs_name + nodes, fanouts, replace, probs_name, return_eids ) - return self._convert_to_sampled_subgraph(C_sampled_subgraph) + return self._convert_to_sampled_subgraph( + C_sampled_subgraph, keep_homo=keep_homo, + ) def _check_sampler_arguments(self, nodes, fanouts, probs_name): assert nodes.dim() == 1, "Nodes should be 1-D tensor." @@ -452,8 +486,8 @@ def _check_sampler_arguments(self, nodes, fanouts, probs_name): assert len(fanouts) in [ expected_fanout_len, 1, - ], "Fanouts should have the same number of elements as etypes or \ - should have a length of 1." 
+ ], f"Fanouts should have the same number of elements as etypes or \ + should have a length of 1, but got {fanouts} while {self.metadata.edge_type_to_id}" if fanouts.size(0) > 1: assert ( self.type_per_edge is not None @@ -487,6 +521,7 @@ def _sample_neighbors( fanouts: torch.Tensor, replace: bool = False, probs_name: Optional[str] = None, + return_eids: bool = False, ) -> torch.ScriptObject: """Sample neighboring edges of the given nodes and return the induced subgraph. @@ -533,12 +568,15 @@ def _sample_neighbors( self.edge_attributes is not None and ORIGINAL_EDGE_ID in self.edge_attributes ) + # [Rui] Formatting to avoid `RuntimeError: expected scalar type Int but found Long`. + nodes = nodes.to(self.indices.dtype) + fanouts = fanouts.to(self.indices.dtype) return self._c_csc_graph.sample_neighbors( nodes, fanouts.tolist(), replace, False, - has_original_eids, + has_original_eids or return_eids, probs_name, ) @@ -618,6 +656,10 @@ def sample_layer_neighbors( nodes = self._convert_to_homogeneous_nodes(nodes) self._check_sampler_arguments(nodes, fanouts, probs_name) + has_original_nids = ( + self.node_attributes is not None + and ORIGINAL_NODE_ID in self.node_attributes + ) has_original_eids = ( self.edge_attributes is not None and ORIGINAL_EDGE_ID in self.edge_attributes @@ -721,6 +763,9 @@ def _to(x, device): self.type_per_edge = recursive_apply( self.type_per_edge, lambda x: _to(x, device) ) + self.node_attributes = recursive_apply( + self.node_attributes, lambda x: _to(x, device) + ) self.edge_attributes = recursive_apply( self.edge_attributes, lambda x: _to(x, device) ) @@ -733,6 +778,7 @@ def from_csc( indices: torch.Tensor, node_type_offset: Optional[torch.tensor] = None, type_per_edge: Optional[torch.tensor] = None, + node_attributes: Optional[Dict[str, torch.tensor]] = None, edge_attributes: Optional[Dict[str, torch.tensor]] = None, metadata: Optional[GraphMetadata] = None, ) -> CSCSamplingGraph: @@ -750,6 +796,8 @@ def from_csc( Offset of node types in the graph, by default None. type_per_edge : Optional[torch.tensor], optional Type ids of each edge in the graph, by default None. + node_attributes: Optional[Dict[str, torch.tensor]], optional + Node attributes of the graph, by default None. edge_attributes: Optional[Dict[str, torch.tensor]], optional Edge attributes of the graph, by default None. metadata: Optional[GraphMetadata], optional @@ -785,6 +833,7 @@ def from_csc( indices, node_type_offset, type_per_edge, + node_attributes, edge_attributes, ), metadata, @@ -880,6 +929,7 @@ def save_csc_sampling_graph(graph, filename): def from_dglgraph( g: DGLGraph, is_homogeneous: bool = False, + include_original_node_id: bool = False, include_original_edge_id: bool = False, ) -> CSCSamplingGraph: """Convert a DGLGraph to CSCSamplingGraph.""" @@ -905,6 +955,11 @@ def from_dglgraph( # Assign edge type according to the order of CSC matrix. type_per_edge = None if is_homogeneous else homo_g.edata[ETYPE][edge_ids] + node_attributes = {} + if include_original_node_id: + # Assign node attributes according to the original nids mapping. + node_attributes[ORIGINAL_NODE_ID] = homo_g.ndata[NID] + edge_attributes = {} if include_original_edge_id: # Assign edge attributes according to the original eids mapping. 
@@ -916,6 +971,7 @@ def from_dglgraph( indices, node_type_offset, type_per_edge, + node_attributes, edge_attributes, ), metadata, diff --git a/python/dgl/graphbolt/impl/neighbor_sampler.py b/python/dgl/graphbolt/impl/neighbor_sampler.py index 8b18ecb4199f..179cd4aea0db 100644 --- a/python/dgl/graphbolt/impl/neighbor_sampler.py +++ b/python/dgl/graphbolt/impl/neighbor_sampler.py @@ -100,6 +100,19 @@ def __init__( self.deduplicate = deduplicate self.sampler = graph.sample_neighbors + @staticmethod + def distributed_sample_neighbor(graph, seeds, fanouts): + if isinstance(fanouts, int): + fanouts = torch.IntTensor([fanouts]) + assert isinstance(fanouts, torch.Tensor), f"Invalid fanouts: {fanouts}" + subgraph = graph.sample_neighbors(seeds, fanouts, keep_homo=True) + src_nodes, dst_nodes = subgraph.node_pairs + etype_ids = subgraph.original_etype_ids + assert src_nodes.shape == dst_nodes.shape, f"Shape mismatch: {src_nodes.shape}, {dst_nodes.shape}" + if etype_ids is not None: + assert src_nodes.shape == etype_ids.shape, f"Shape mismatch: {src_nodes.shape}, {etype_ids.shape}" + return src_nodes, dst_nodes, etype_ids + def _sample_subgraphs(self, seeds): subgraphs = [] num_layers = len(self.fanouts) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 03f0a6806493..904344f08ad0 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -35,7 +35,9 @@ def preprocess_ondisk_dataset( - dataset_dir: str, include_original_edge_id: bool = False + dataset_dir: str, + include_original_node_id: bool = False, + include_original_edge_id: bool = False, ) -> str: """Preprocess the on-disk dataset. Parse the input config file, load the data, and save the data in the format that GraphBolt supports. @@ -138,7 +140,7 @@ def preprocess_ondisk_dataset( # 4. Convert the DGLGraph to a CSCSamplingGraph. csc_sampling_graph = from_dglgraph( - g, is_homogeneous, include_original_edge_id + g, is_homogeneous, include_original_node_id, include_original_edge_id ) # 5. Save the CSCSamplingGraph and modify the output_config. @@ -340,12 +342,17 @@ class OnDiskDataset(Dataset): """ def __init__( - self, path: str, include_original_edge_id: bool = False + self, + path: str, + include_original_node_id: bool = False, + include_original_edge_id: bool = False, ) -> None: # Always call the preprocess function first. If already preprocessed, # the function will return the original path directly. 
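The new `include_original_node_id` knob mirrors the existing edge-ID one all the way up to the dataset entry point. A minimal usage sketch, assuming a dataset directory in the on-disk format; the path is illustrative:

```
import dgl.graphbolt as gb

# Preprocessing forwards both flags into from_dglgraph(), so the resulting
# CSCSamplingGraph keeps "_ORIGINAL_NODE_ID" in node_attributes.
dataset = gb.OnDiskDataset(
    "datasets/ogbn-products",
    include_original_node_id=True,
    include_original_edge_id=True,
)
```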
self._dataset_dir = path - yaml_path = preprocess_ondisk_dataset(path, include_original_edge_id) + yaml_path = preprocess_ondisk_dataset( + path, include_original_node_id, include_original_edge_id + ) with open(yaml_path) as f: self._yaml_data = yaml.load(f, Loader=yaml.loader.SafeLoader) diff --git a/python/dgl/graphbolt/impl/sampled_subgraph_impl.py b/python/dgl/graphbolt/impl/sampled_subgraph_impl.py index 601377dd0637..bf0c6cfd4a44 100644 --- a/python/dgl/graphbolt/impl/sampled_subgraph_impl.py +++ b/python/dgl/graphbolt/impl/sampled_subgraph_impl.py @@ -46,6 +46,7 @@ class SampledSubgraphImpl(SampledSubgraph): ] = None original_row_node_ids: Union[Dict[str, torch.Tensor], torch.Tensor] = None original_edge_ids: Union[Dict[str, torch.Tensor], torch.Tensor] = None + original_etype_ids: Union[Dict[str, torch.Tensor], torch.Tensor] = None def __post_init__(self): if isinstance(self.node_pairs, dict): diff --git a/tests/distributed/test_partition.py b/tests/distributed/test_partition.py index 4a3cc279ff8b..f5bc4395c29e 100644 --- a/tests/distributed/test_partition.py +++ b/tests/distributed/test_partition.py @@ -4,6 +4,7 @@ import backend as F import dgl +import dgl.graphbolt as gb import numpy as np import pytest import torch as th @@ -679,8 +680,13 @@ def test_UnknownPartitionBook(): @pytest.mark.parametrize("part_method", ["metis", "random"]) @pytest.mark.parametrize("num_parts", [1, 4]) +@pytest.mark.parametrize("store_eids", [True, False]) +@pytest.mark.parametrize("store_all", [True, False]) def test_convert_dgl_partition_to_csc_sampling_graph_homo( - part_method, num_parts + part_method, + num_parts, + store_eids, + store_all, ): with tempfile.TemporaryDirectory() as test_dir: g = create_random_graph(1000) @@ -689,7 +695,11 @@ def test_convert_dgl_partition_to_csc_sampling_graph_homo( g, graph_name, num_parts, test_dir, part_method=part_method ) part_config = os.path.join(test_dir, f"{graph_name}.json") - convert_dgl_partition_to_csc_sampling_graph(part_config) + convert_dgl_partition_to_csc_sampling_graph( + part_config, + store_eids=store_eids, + store_all=store_all, + ) for part_id in range(num_parts): orig_g = dgl.load_graphs( os.path.join(test_dir, f"part{part_id}/graph.dgl") @@ -697,21 +707,35 @@ def test_convert_dgl_partition_to_csc_sampling_graph_homo( new_g = dgl.graphbolt.load_csc_sampling_graph( os.path.join(test_dir, f"part{part_id}/csc_sampling_graph.tar") ) - orig_indptr, orig_indices, _ = orig_g.adj().csc() + orig_indptr, orig_indices, orig_eids = orig_g.adj().csc() assert th.equal(orig_indptr, new_g.csc_indptr) assert th.equal(orig_indices, new_g.indices) assert new_g.node_type_offset is None - assert all(new_g.type_per_edge == 0) - for node_type, type_id in new_g.metadata.node_type_to_id.items(): - assert g.get_ntype_id(node_type) == type_id - for edge_type, type_id in new_g.metadata.edge_type_to_id.items(): - assert g.get_etype_id(edge_type) == type_id + assert th.equal( + orig_g.ndata[dgl.NID], new_g.node_attributes[dgl.NID] + ) + assert dgl.NTYPE not in new_g.node_attributes + if store_eids or store_all: + assert th.equal( + orig_g.edata[dgl.EID][orig_eids], new_g.edge_attributes[dgl.EID] + ) + else: + assert new_g.edge_attributes is None + # For homogeneous graph, ETYPE is not stored. + assert new_g.type_per_edge is None + # For homogeneous graph, metadata is not stored. 
+ assert new_g.metadata is None @pytest.mark.parametrize("part_method", ["metis", "random"]) @pytest.mark.parametrize("num_parts", [1, 4]) +@pytest.mark.parametrize("store_eids", [True, False]) +@pytest.mark.parametrize("store_all", [True, False]) def test_convert_dgl_partition_to_csc_sampling_graph_hetero( - part_method, num_parts + part_method, + num_parts, + store_eids, + store_all, ): with tempfile.TemporaryDirectory() as test_dir: g = create_random_hetero() @@ -720,7 +744,11 @@ def test_convert_dgl_partition_to_csc_sampling_graph_hetero( g, graph_name, num_parts, test_dir, part_method=part_method ) part_config = os.path.join(test_dir, f"{graph_name}.json") - convert_dgl_partition_to_csc_sampling_graph(part_config) + convert_dgl_partition_to_csc_sampling_graph( + part_config, + store_eids=store_eids, + store_all=store_all, + ) for part_id in range(num_parts): orig_g = dgl.load_graphs( os.path.join(test_dir, f"part{part_id}/graph.dgl") @@ -728,15 +756,39 @@ def test_convert_dgl_partition_to_csc_sampling_graph_hetero( new_g = dgl.graphbolt.load_csc_sampling_graph( os.path.join(test_dir, f"part{part_id}/csc_sampling_graph.tar") ) - orig_indptr, orig_indices, _ = orig_g.adj().csc() + orig_indptr, orig_indices, orig_eids = orig_g.adj().csc() assert th.equal(orig_indptr, new_g.csc_indptr) assert th.equal(orig_indices, new_g.indices) - for node_type, type_id in new_g.metadata.node_type_to_id.items(): + # dgl.NID is required. + assert th.equal( + orig_g.ndata[dgl.NID], new_g.node_attributes[dgl.NID] + ) + if store_eids or store_all: + assert th.equal( + orig_g.edata[dgl.EID][orig_eids], new_g.edge_attributes[dgl.EID] + ) + else: + assert new_g.edge_attributes is None + # dgl.ETYPE is required for heterograph. + assert th.equal(orig_g.edata[dgl.ETYPE][orig_eids], new_g.type_per_edge) + # dgl.NTYPE is optional for heterograph. + if store_all: + assert th.equal( + orig_g.ndata[dgl.NTYPE], new_g.node_attributes[dgl.NTYPE] + ) + # metadata is required for heterograph. 
+ for ( + node_type, + type_id, + ) in new_g.metadata.node_type_to_id.items(): assert g.get_ntype_id(node_type) == type_id - for edge_type, type_id in new_g.metadata.edge_type_to_id.items(): + for ( + edge_type, + type_id, + ) in new_g.metadata.edge_type_to_id.items(): + edge_type = gb.etype_str_to_tuple(edge_type) assert g.get_etype_id(edge_type) == type_id assert new_g.node_type_offset is None - assert th.equal(orig_g.edata[dgl.ETYPE], new_g.type_per_edge) def test_not_sorted_node_edge_map(): @@ -847,3 +899,314 @@ def test_not_sorted_node_edge_map(): gpb, _, _, _ = load_partition_book(part_config, 1) assert gpb.local_ntype_offset == [0, 300, 700] assert gpb.local_etype_offset == [0, 500, 1100, 1800, 2600] + + +@pytest.mark.parametrize("part_method", ["metis", "random"]) +@pytest.mark.parametrize("num_parts", [1, 4]) +@pytest.mark.parametrize("num_trainers_per_machine", [1, 4]) +@pytest.mark.parametrize("load_feats", [True, False]) +def test_partition_homo_graphbolt( + part_method, + num_parts, + num_trainers_per_machine, + load_feats, +): + os.environ["DGL_DIST_DEBUG"] = "1" + if part_method == "random" and num_parts > 1: + num_trainers_per_machine = 1 + + g = create_random_graph(1000) + g.ndata["labels"] = F.arange(0, g.num_nodes()) + g.ndata["feats"] = F.tensor(np.random.randn(g.num_nodes(), 10), F.float32) + g.edata["feats"] = F.tensor(np.random.randn(g.num_edges(), 10), F.float32) + g.update_all(fn.copy_u("feats", "msg"), fn.sum("msg", "h")) + g.update_all(fn.copy_e("feats", "msg"), fn.sum("msg", "eh")) + num_hops = 2 + + with tempfile.TemporaryDirectory() as test_dir: + orig_nids, orig_eids = partition_graph( + g, + "test", + num_parts, + test_dir, + num_hops=num_hops, + part_method=part_method, + return_mapping=True, + num_trainers_per_machine=num_trainers_per_machine, + use_graphbolt=True, + ) + part_config = os.path.join(test_dir, "test.json") + for i in range(num_parts): + part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition( + part_config, i, load_feats=load_feats, use_graphbolt=True + ) + assert isinstance(part_g, gb.CSCSamplingGraph) + assert gpb.num_partitions() == num_parts + gpb_meta = gpb.metadata() + assert len(gpb_meta) == num_parts + assert len(gpb.partid2nids(i)) == gpb_meta[i]["num_nodes"] + assert len(gpb.partid2eids(i)) == gpb_meta[i]["num_edges"] + if num_parts == 1: + assert len(gpb.partid2nids(i)) == part_g.total_num_nodes + assert len(gpb.partid2eids(i)) == part_g.total_num_edges + if load_feats: + assert "_N/labels" in node_feats + assert "_N/feats" in node_feats + assert "_N:_E:_N/feats" in edge_feats + else: + assert node_feats == {} + assert edge_feats == {} + + reset_envs() + + +@pytest.mark.parametrize("part_method", ["metis", "random"]) +@pytest.mark.parametrize("num_parts", [1, 4]) +@pytest.mark.parametrize("num_trainers_per_machine", [1, 4]) +@pytest.mark.parametrize("load_feats", [True, False]) +def test_partition_hetero_graphbolt( + part_method, + num_parts, + num_trainers_per_machine, + load_feats, +): + os.environ["DGL_DIST_DEBUG"] = "1" + if part_method == "random" and num_parts > 1: + num_trainers_per_machine = 1 + + hg = create_random_hetero() + test_ntype = "n1" + test_etype = ("n1", "r1", "n2") + hg.nodes[test_ntype].data["labels"] = F.arange(0, hg.num_nodes(test_ntype)) + hg.nodes[test_ntype].data["feats"] = F.tensor( + np.random.randn(hg.num_nodes(test_ntype), 10), F.float32 + ) + hg.edges[test_etype].data["feats"] = F.tensor( + np.random.randn(hg.num_edges(test_etype), 10), F.float32 + ) + hg.edges[test_etype].data["labels"] = 
F.arange(0, hg.num_edges(test_etype)) + + num_hops = 2 + + with tempfile.TemporaryDirectory() as test_dir: + orig_nids, orig_eids = partition_graph( + hg, + "test", + num_parts, + test_dir, + num_hops=num_hops, + part_method=part_method, + return_mapping=True, + num_trainers_per_machine=num_trainers_per_machine, + use_graphbolt=True, + ) + part_config = os.path.join(test_dir, "test.json") + for i in range(num_parts): + part_g, node_feats, edge_feats, gpb, _, _, _ = load_partition( + part_config, i, load_feats=load_feats, use_graphbolt=True + ) + assert isinstance(part_g, gb.CSCSamplingGraph) + assert gpb.num_partitions() == num_parts + gpb_meta = gpb.metadata() + assert len(gpb_meta) == num_parts + assert len(gpb.partid2nids(i)) == gpb_meta[i]["num_nodes"] + assert len(gpb.partid2eids(i)) == gpb_meta[i]["num_edges"] + if num_parts == 1: + assert len(gpb.partid2nids(i)) == part_g.total_num_nodes + assert len(gpb.partid2eids(i)) == part_g.total_num_edges + if load_feats: + assert "n1/labels" in node_feats + assert "n1/feats" in node_feats + assert "n1:r1:n2/feats" in edge_feats + else: + assert node_feats == {} + assert edge_feats == {} + + reset_envs() + + +@pytest.mark.parametrize("part_method", ["metis", "random"]) +@pytest.mark.parametrize("num_parts", [1, 4]) +@pytest.mark.parametrize("num_trainers_per_machine", [1, 4]) +@pytest.mark.parametrize("load_feats", [True, False]) +def test_partition_hetero_graphbolt_sample_neighbors( + part_method, + num_parts, + num_trainers_per_machine, + load_feats, +): + os.environ["DGL_DIST_DEBUG"] = "1" + if part_method == "random" and num_parts > 1: + num_trainers_per_machine = 1 + + hg = create_random_hetero() + test_ntype = "n1" + test_etype = ("n1", "r1", "n2") + hg.nodes[test_ntype].data["labels"] = F.arange(0, hg.num_nodes(test_ntype)) + hg.nodes[test_ntype].data["feats"] = F.tensor( + np.random.randn(hg.num_nodes(test_ntype), 10), F.float32 + ) + hg.edges[test_etype].data["feats"] = F.tensor( + np.random.randn(hg.num_edges(test_etype), 10), F.float32 + ) + hg.edges[test_etype].data["labels"] = F.arange(0, hg.num_edges(test_etype)) + + num_hops = 2 + + with tempfile.TemporaryDirectory() as test_dir: + orig_nids, orig_eids = dgl.distributed.partition_graph( + hg, + "test", + num_parts, + test_dir, + num_hops=num_hops, + part_method=part_method, + return_mapping=True, + num_trainers_per_machine=num_trainers_per_machine, + use_graphbolt=True, + gb_save_all=True, + ) + part_config = os.path.join(test_dir, "test.json") + for i in range(num_parts): + part_g, node_feats, edge_feats, gpb, _, _, _ = dgl.distributed.load_partition( + part_config, i, load_feats=load_feats, use_graphbolt=True + ) + assert isinstance(part_g, gb.CSCSamplingGraph) + assert gpb.num_partitions() == num_parts + gpb_meta = gpb.metadata() + assert len(gpb_meta) == num_parts + assert len(gpb.partid2nids(i)) == gpb_meta[i]["num_nodes"] + assert len(gpb.partid2eids(i)) == gpb_meta[i]["num_edges"] + if num_parts == 1: + assert len(gpb.partid2nids(i)) == part_g.total_num_nodes + assert len(gpb.partid2eids(i)) == part_g.total_num_edges + if load_feats: + assert "n1/labels" in node_feats + assert "n1/feats" in node_feats + assert "n1:r1:n2/feats" in edge_feats + else: + assert node_feats == {} + assert edge_feats == {} + + # sample_neighbors() + subg = part_g.sample_neighbors(th.arange(10), th.IntTensor([-1]), keep_homo=True, + return_eids=True) + src, dst = subg.node_pairs + orig_src = part_g.node_attributes[dgl.NID][src] + orig_dst = part_g.node_attributes[dgl.NID][dst] + etype_ids = 
subg.original_etype_ids
+            orig_eids = part_g.edge_attributes[dgl.EID].to(hg.idtype)[
+                subg.original_edge_ids
+            ]
+            etype_idsA, _ = gpb.map_to_per_etype(orig_eids)
+            assert th.equal(etype_ids, etype_idsA), "etype_ids is not expected."
+
+            etype_ids, idx = F.sort_1d(etype_ids)
+            sorted_orig_src = F.gather_row(orig_src, idx)
+            sorted_orig_dst = F.gather_row(orig_dst, idx)
+            src_ntype_ids, _ = gpb.map_to_per_ntype(sorted_orig_src.to(hg.idtype))
+            dst_ntype_ids, _ = gpb.map_to_per_ntype(sorted_orig_dst.to(hg.idtype))
+
+            print("gpb.canonical_etypes: ", gpb.canonical_etypes)
+            ntype_map = {ntype: i for i, ntype in enumerate(gpb.ntypes)}
+            for etid, etype in enumerate(gpb.canonical_etypes):
+                src_ntype, _, dst_ntype = etype
+                src_ntype_id = ntype_map[src_ntype]
+                dst_ntype_id = ntype_map[dst_ntype]
+                type_idx = etype_ids == etid
+                if F.sum(type_idx, 0) > 0:
+                    assert th.all(
+                        src_ntype_id == src_ntype_ids[type_idx]
+                    ), "source ntype is not expected."
+                    assert th.all(
+                        dst_ntype_id == dst_ntype_ids[type_idx]
+                    ), "destination ntype is not expected."
+
+
+@pytest.mark.parametrize("part_method", ["metis"])
+@pytest.mark.parametrize("num_parts", [4])
+@pytest.mark.parametrize("num_trainers_per_machine", [1])
+@pytest.mark.parametrize("load_feats", [True])
+def test_partition_homo_graphbolt_sample_neighbors(
+    part_method,
+    num_parts,
+    num_trainers_per_machine,
+    load_feats,
+):
+    os.environ["DGL_DIST_DEBUG"] = "1"
+    if part_method == "random" and num_parts > 1:
+        num_trainers_per_machine = 1
+
+    g = create_random_graph(1000)
+    g.ndata["labels"] = F.arange(0, g.num_nodes())
+    g.ndata["feats"] = F.tensor(np.random.randn(g.num_nodes(), 10), F.float32)
+    g.edata["feats"] = F.tensor(np.random.randn(g.num_edges(), 10), F.float32)
+    g.update_all(fn.copy_u("feats", "msg"), fn.sum("msg", "h"))
+    g.update_all(fn.copy_e("feats", "msg"), fn.sum("msg", "eh"))
+    num_hops = 2
+
+    with tempfile.TemporaryDirectory() as test_dir:
+        orig_nids, orig_eids = dgl.distributed.partition_graph(
+            g,
+            "test",
+            num_parts,
+            test_dir,
+            num_hops=num_hops,
+            part_method=part_method,
+            return_mapping=True,
+            num_trainers_per_machine=num_trainers_per_machine,
+            use_graphbolt=True,
+            gb_save_all=True,
+        )
+        part_config = os.path.join(test_dir, "test.json")
+        for i in range(num_parts):
+            part_g, node_feats, edge_feats, gpb, _, _, _ = dgl.distributed.load_partition(
+                part_config, i, load_feats=load_feats, use_graphbolt=True
+            )
+            assert isinstance(part_g, gb.CSCSamplingGraph)
+            assert gpb.num_partitions() == num_parts
+            gpb_meta = gpb.metadata()
+            assert len(gpb_meta) == num_parts
+            assert len(gpb.partid2nids(i)) == gpb_meta[i]["num_nodes"]
+            assert len(gpb.partid2eids(i)) == gpb_meta[i]["num_edges"]
+            if num_parts == 1:
+                assert len(gpb.partid2nids(i)) == part_g.total_num_nodes
+                assert len(gpb.partid2eids(i)) == part_g.total_num_edges
+            if load_feats:
+                assert "_N/labels" in node_feats
+                assert "_N/feats" in node_feats
+                assert "_N:_E:_N/feats" in edge_feats
+            else:
+                assert node_feats == {}
+                assert edge_feats == {}
+
+            # sample_neighbors()
+            subg = part_g.sample_neighbors(
+                th.arange(10), th.IntTensor([-1]), keep_homo=True, return_eids=True
+            )
+            src, dst = subg.node_pairs
+            orig_src = part_g.node_attributes[dgl.NID][src]
+            orig_dst = part_g.node_attributes[dgl.NID][dst]
+            etype_ids = subg.original_etype_ids
+            assert etype_ids is None, "subgraph from homograph should not have etypes."
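+            # NOTE: a homograph subgraph carries no per-edge type IDs, so the
+            # per-etype source/destination checks from the heterogeneous test
+            # above are skipped; only the original edge IDs are validated.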
+            orig_eids = part_g.edge_attributes[dgl.EID].to(g.idtype)[
+                subg.original_edge_ids
+            ]
+            # A homogeneous partition book has a single edge type, so every
+            # mapped etype ID is expected to be 0.
+            mapped_etype_ids, _ = gpb.map_to_per_etype(orig_eids)
+            assert th.all(mapped_etype_ids == 0), "etype_ids is not expected."
diff --git a/tests/python/pytorch/graphbolt/impl/test_csc_sampling_graph.py b/tests/python/pytorch/graphbolt/impl/test_csc_sampling_graph.py
index 02850c4da356..186d69af62ff 100644
--- a/tests/python/pytorch/graphbolt/impl/test_csc_sampling_graph.py
+++ b/tests/python/pytorch/graphbolt/impl/test_csc_sampling_graph.py
@@ -58,6 +58,7 @@ def test_hetero_empty_graph(total_num_nodes):
         node_type_offset,
         type_per_edge,
         None,
+        None,
         metadata,
     )
     assert graph.total_num_edges == 0
@@ -115,11 +116,17 @@ def test_homo_graph(total_num_nodes, total_num_edges):
     csc_indptr, indices = gbt.random_homo_graph(
         total_num_nodes, total_num_edges
     )
+    node_attributes = {"_ID": torch.arange(total_num_nodes)}
     edge_attributes = {
         "A1": torch.randn(total_num_edges),
         "A2": torch.randn(total_num_edges),
     }
-    graph = gb.from_csc(csc_indptr, indices, edge_attributes=edge_attributes)
+    graph = gb.from_csc(
+        csc_indptr,
+        indices,
+        node_attributes=node_attributes,
+        edge_attributes=edge_attributes,
+    )
 
     assert graph.total_num_nodes == total_num_nodes
     assert graph.total_num_edges == total_num_edges
@@ -127,6 +134,7 @@
     assert torch.equal(csc_indptr, graph.csc_indptr)
     assert torch.equal(indices, graph.indices)
+    assert graph.node_attributes == node_attributes
     assert graph.edge_attributes == edge_attributes
     assert graph.metadata is None
     assert graph.node_type_offset is None
@@ -152,6 +160,7 @@ def test_hetero_graph(total_num_nodes, total_num_edges, num_ntypes, num_etypes):
     ) = gbt.random_hetero_graph(
         total_num_nodes, total_num_edges, num_ntypes, num_etypes
     )
+    node_attributes = {"_ID": torch.arange(total_num_nodes)}
     edge_attributes = {
         "A1": torch.randn(total_num_edges),
         "A2": torch.randn(total_num_edges),
@@ -161,6 +170,7 @@
         indices,
         node_type_offset,
         type_per_edge,
+        node_attributes,
         edge_attributes,
         metadata,
     )
@@ -172,6 +182,7 @@
     assert torch.equal(indices, graph.indices)
     assert torch.equal(node_type_offset, graph.node_type_offset)
     assert torch.equal(type_per_edge, graph.type_per_edge)
+    assert graph.node_attributes == node_attributes
     assert graph.edge_attributes == edge_attributes
     assert metadata.node_type_to_id == graph.metadata.node_type_to_id
     assert metadata.edge_type_to_id == graph.metadata.edge_type_to_id
@@ -242,7 +253,7 @@ def test_num_nodes_hetero():
     # Construct
CSCSamplingGraph. metadata = gb.GraphMetadata(ntypes, etypes) graph = gb.from_csc( - indptr, indices, node_type_offset, type_per_edge, None, metadata + indptr, indices, node_type_offset, type_per_edge, None, None, metadata ) # Verify nodes number per node types. @@ -274,7 +285,13 @@ def test_node_type_offset_wrong_legnth(node_type_offset): ) with pytest.raises(Exception): gb.from_csc( - csc_indptr, indices, node_type_offset, type_per_edge, None, metadata + csc_indptr, + indices, + node_type_offset, + type_per_edge, + None, + None, + metadata, ) @@ -330,7 +347,13 @@ def test_load_save_hetero_graph( total_num_nodes, total_num_edges, num_ntypes, num_etypes ) graph = gb.from_csc( - csc_indptr, indices, node_type_offset, type_per_edge, None, metadata + csc_indptr, + indices, + node_type_offset, + type_per_edge, + None, + None, + metadata, ) with tempfile.TemporaryDirectory() as test_dir: @@ -398,6 +421,7 @@ def test_pickle_hetero_graph( ) = gbt.random_hetero_graph( total_num_nodes, total_num_edges, num_ntypes, num_etypes ) + node_attributes = {"_ID": torch.arange(total_num_nodes)} edge_attributes = { "a": torch.randn((total_num_edges,)), "b": torch.randint(1, 10, (total_num_edges,)), @@ -407,6 +431,7 @@ def test_pickle_hetero_graph( indices, node_type_offset, type_per_edge, + node_attributes, edge_attributes, metadata, ) @@ -423,6 +448,7 @@ def test_pickle_hetero_graph( assert torch.equal(graph.type_per_edge, graph2.type_per_edge) assert graph.metadata.node_type_to_id == graph2.metadata.node_type_to_id assert graph.metadata.edge_type_to_id == graph2.metadata.edge_type_to_id + assert graph.node_attributes == node_attributes assert graph.edge_attributes.keys() == graph2.edge_attributes.keys() for i in graph.edge_attributes.keys(): assert torch.equal(graph.edge_attributes[i], graph2.edge_attributes[i]) @@ -458,6 +484,7 @@ def test_multiprocessing(): indices, node_type_offset, type_per_edge, + None, edge_attributes, metadata, ) @@ -555,7 +582,7 @@ def test_in_subgraph_heterogeneous(): # Construct CSCSamplingGraph. metadata = gb.GraphMetadata(ntypes, etypes) graph = gb.from_csc( - indptr, indices, node_type_offset, type_per_edge, None, metadata + indptr, indices, node_type_offset, type_per_edge, None, None, metadata ) # Extract in subgraph. 
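Aside: every `gb.from_csc` call above now threads `node_attributes` in just before `edge_attributes`. Below is a minimal sketch of the updated call on a toy homogeneous graph; the 3-node topology and tensor values are made up for illustration, while the keyword usage mirrors the tests in this patch.

```
import torch
import dgl.graphbolt as gb

# Toy 3-node CSC graph: csc_indptr has num_nodes + 1 entries; indices lists
# the endpoint of every incoming edge, column by column.
csc_indptr = torch.tensor([0, 2, 4, 5])
indices = torch.tensor([1, 2, 0, 2, 0])

# Attribute dicts are keyed by name; "_ID" matches what the partition
# converter stores for original node IDs.
node_attributes = {"_ID": torch.arange(3)}
edge_attributes = {"A1": torch.randn(5)}

graph = gb.from_csc(
    csc_indptr,
    indices,
    node_attributes=node_attributes,
    edge_attributes=edge_attributes,
)
assert graph.total_num_nodes == 3
assert graph.total_num_edges == 5
assert torch.equal(graph.node_attributes["_ID"], torch.arange(3))
```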
@@ -1045,12 +1072,17 @@ def check_tensors_on_the_same_shared_memory(t1: torch.Tensor, t2: torch.Tensor): [(1, 1), (100, 1), (10, 50), (1000, 50000)], ) @pytest.mark.parametrize("test_edge_attrs", [True, False]) +@pytest.mark.parametrize("test_node_attrs", [True, False]) def test_homo_graph_on_shared_memory( - total_num_nodes, total_num_edges, test_edge_attrs + total_num_nodes, total_num_edges, test_edge_attrs, test_node_attrs ): csc_indptr, indices = gbt.random_homo_graph( total_num_nodes, total_num_edges ) + if test_node_attrs: + node_attributes = {"_ID": torch.arange(total_num_nodes)} + else: + node_attributes = None if test_edge_attrs: edge_attributes = { "A1": torch.randn(total_num_edges), @@ -1058,7 +1090,12 @@ def test_homo_graph_on_shared_memory( } else: edge_attributes = None - graph = gb.from_csc(csc_indptr, indices, edge_attributes=edge_attributes) + graph = gb.from_csc( + csc_indptr, + indices, + node_attributes=node_attributes, + edge_attributes=edge_attributes, + ) shm_name = "test_homo_g" graph1 = graph.copy_to_shared_memory(shm_name) @@ -1083,6 +1120,15 @@ def test_homo_graph_on_shared_memory( ) check_tensors_on_the_same_shared_memory(graph1.indices, graph2.indices) + if test_node_attrs: + for name, node_attr in node_attributes.items(): + assert name in graph1.node_attributes + assert name in graph2.node_attributes + assert torch.equal(graph1.node_attributes[name], node_attr) + check_tensors_on_the_same_shared_memory( + graph1.node_attributes[name], graph2.node_attributes[name] + ) + if test_edge_attrs: for name, edge_attr in edge_attributes.items(): assert name in graph1.edge_attributes @@ -1107,8 +1153,14 @@ def test_homo_graph_on_shared_memory( ) @pytest.mark.parametrize("num_ntypes, num_etypes", [(1, 1), (3, 5), (100, 1)]) @pytest.mark.parametrize("test_edge_attrs", [True, False]) +@pytest.mark.parametrize("test_node_attrs", [True, False]) def test_hetero_graph_on_shared_memory( - total_num_nodes, total_num_edges, num_ntypes, num_etypes, test_edge_attrs + total_num_nodes, + total_num_edges, + num_ntypes, + num_etypes, + test_edge_attrs, + test_node_attrs, ): ( csc_indptr, @@ -1120,6 +1172,11 @@ def test_hetero_graph_on_shared_memory( total_num_nodes, total_num_edges, num_ntypes, num_etypes ) + if test_node_attrs: + node_attributes = {"_ID": torch.arange(total_num_nodes)} + else: + node_attributes = None + if test_edge_attrs: edge_attributes = { "A1": torch.randn(total_num_edges), @@ -1132,6 +1189,7 @@ def test_hetero_graph_on_shared_memory( indices, node_type_offset, type_per_edge, + node_attributes, edge_attributes, metadata, ) @@ -1169,6 +1227,15 @@ def test_hetero_graph_on_shared_memory( graph1.type_per_edge, graph2.type_per_edge ) + if test_node_attrs: + for name, node_attr in node_attributes.items(): + assert name in graph1.node_attributes + assert name in graph2.node_attributes + assert torch.equal(graph1.node_attributes[name], node_attr) + check_tensors_on_the_same_shared_memory( + graph1.node_attributes[name], graph2.node_attributes[name] + ) + if test_edge_attrs: for name, edge_attr in edge_attributes.items(): assert name in graph1.edge_attributes @@ -1256,6 +1323,7 @@ def test_multiprocessing_with_shared_memory(): node_type_offset, type_per_edge, None, + None, metadata, ) @@ -1300,19 +1368,29 @@ def test_from_dglgraph_homogeneous(): gb_g = gb.from_dglgraph( dgl_g, is_homogeneous=False, include_original_edge_id=False ) + assert ( + gb_g.node_attributes is None + or gb.ORIGINAL_NODE_ID not in gb_g.node_attributes + ) assert ( gb_g.edge_attributes is None or 
gb.ORIGINAL_EDGE_ID not in gb_g.edge_attributes ) gb_g = gb.from_dglgraph( - dgl_g, is_homogeneous=True, include_original_edge_id=True + dgl_g, + is_homogeneous=True, + include_original_node_id=True, + include_original_edge_id=True, ) # Get the COO representation of the CSCSamplingGraph. num_columns = gb_g.csc_indptr[1:] - gb_g.csc_indptr[:-1] rows = gb_g.indices columns = torch.arange(gb_g.total_num_nodes).repeat_interleave(num_columns) + original_node_ids = gb_g.node_attributes[gb.ORIGINAL_NODE_ID] + assert torch.all(original_node_ids == torch.arange(gb_g.num_nodes)) + original_edge_ids = gb_g.edge_attributes[gb.ORIGINAL_EDGE_ID] assert torch.all(dgl_g.edges()[0][original_edge_ids] == rows) assert torch.all(dgl_g.edges()[1][original_edge_ids] == columns) @@ -1351,13 +1429,20 @@ def test_from_dglgraph_heterogeneous(): gb_g = gb.from_dglgraph( dgl_g, is_homogeneous=False, include_original_edge_id=False ) + assert ( + gb_g.node_attributes is None + or gb.ORIGINAL_NODE_ID not in gb_g.node_attributes + ) assert ( gb_g.edge_attributes is None or gb.ORIGINAL_EDGE_ID not in gb_g.edge_attributes ) gb_g = gb.from_dglgraph( - dgl_g, is_homogeneous=False, include_original_edge_id=True + dgl_g, + is_homogeneous=False, + include_original_node_id=True, + include_original_edge_id=True, ) # `reverse_node_id` is used to map the node id in CSCSamplingGraph to the @@ -1385,6 +1470,14 @@ def test_from_dglgraph_heterogeneous(): == dgl_g.etypes ) + for ntype, num_nodes in gb_g.num_nodes.items(): + original_node_ids = gb_g.node_attributes[gb.ORIGINAL_NODE_ID][ + gb_g.node_type_offset[ + gb_g.metadata.node_type_to_id[ntype] + ] : gb_g.node_type_offset[gb_g.metadata.node_type_to_id[ntype] + 1] + ] + assert torch.all(original_node_ids == torch.arange(num_nodes)) + # Use ORIGINAL_EDGE_ID to check if the edge mapping is correct. 
for edge_idx in range(gb_g.total_num_edges): hetero_graph_idx = gb_g.type_per_edge[edge_idx] @@ -1630,6 +1723,10 @@ def test_csc_sampling_graph_to_device(): assert node_type_offset[-1] == total_num_nodes assert all(type_per_edge < len(etypes)) + node_attributes = { + "_ID": torch.arange(total_num_nodes), + } + edge_attributes = { "mask": torch.BoolTensor([1, 1, 0, 1, 1, 1, 0, 0, 0]), "all": torch.BoolTensor([1, 1, 1, 1, 1, 1, 1, 1, 1]), @@ -1640,6 +1737,7 @@ def test_csc_sampling_graph_to_device(): graph = gb.from_csc( indptr, indices, + node_attributes=node_attributes, edge_attributes=edge_attributes, node_type_offset=node_type_offset, type_per_edge=type_per_edge, @@ -1655,5 +1753,7 @@ def test_csc_sampling_graph_to_device(): assert graph.node_type_offset.device.type == "cuda" assert graph.type_per_edge.device.type == "cuda" assert graph.csc_indptr.device.type == "cuda" + for key in graph.node_attributes: + assert graph.node_attributes[key].device.type == "cuda" for key in graph.edge_attributes: assert graph.edge_attributes[key].device.type == "cuda" diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py index bb95bd92f69e..a08ab53ad8b7 100644 --- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py +++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py @@ -993,7 +993,13 @@ def test_OnDiskDataset_Graph_heterogeneous(): metadata, ) = gbt.random_hetero_graph(1000, 10 * 1000, 3, 4) graph = gb.from_csc( - csc_indptr, indices, node_type_offset, type_per_edge, None, metadata + csc_indptr, + indices, + node_type_offset, + type_per_edge, + None, + None, + metadata, ) with tempfile.TemporaryDirectory() as test_dir: @@ -1618,8 +1624,14 @@ def test_OnDiskDataset_load_graph(): # Check the different original_edge_id option to load edge_attributes. dataset = gb.OnDiskDataset( - test_dir, include_original_edge_id=True + test_dir, + include_original_node_id=True, + include_original_edge_id=True, ).load() + assert ( + dataset.graph.node_attributes is not None + and gb.ORIGINAL_NODE_ID in dataset.graph.node_attributes + ) assert ( dataset.graph.edge_attributes is not None and gb.ORIGINAL_EDGE_ID in dataset.graph.edge_attributes @@ -1685,8 +1697,14 @@ def test_OnDiskDataset_load_graph(): # Test do not generate original_edge_id. 
dataset = gb.OnDiskDataset( - test_dir, include_original_edge_id=False + test_dir, + include_original_node_id=False, + include_original_edge_id=False, ).load() + assert ( + dataset.graph.node_attributes is None + or gb.ORIGINAL_NODE_ID not in dataset.graph.node_attributes + ) assert ( dataset.graph.edge_attributes is None or gb.ORIGINAL_EDGE_ID not in dataset.graph.edge_attributes @@ -1823,7 +1841,13 @@ def test_OnDiskDataset_all_nodes_set_hetero(): metadata, ) = gbt.random_hetero_graph(1000, 10 * 1000, 3, 4) graph = gb.from_csc( - csc_indptr, indices, node_type_offset, type_per_edge, None, metadata + csc_indptr, + indices, + node_type_offset, + type_per_edge, + None, + None, + metadata, ) with tempfile.TemporaryDirectory() as test_dir: diff --git a/tests/scripts/task_distributed_test.sh b/tests/scripts/task_distributed_test.sh index 62ea349d474a..dc70111e8c0c 100644 --- a/tests/scripts/task_distributed_test.sh +++ b/tests/scripts/task_distributed_test.sh @@ -34,6 +34,7 @@ export PYTHONUNBUFFERED=1 export OMP_NUM_THREADS=1 export DMLC_LOG_DEBUG=1 -python3 -m pytest -v --capture=tee-sys --junitxml=pytest_distributed.xml --durations=100 tests/distributed/*.py || fail "distributed" +python3 -m pytest -v --capture=tee-sys --junitxml=pytest_distributed.xml --durations=100 tests/distributed/test_partition.py || fail "distributed" +#python3 -m pytest -v --capture=tee-sys --junitxml=pytest_distributed.xml --durations=100 tests/distributed/*.py || fail "distributed" PYTHONPATH=tools:tools/distpartitioning:$PYTHONPATH python3 -m pytest -v --capture=tee-sys --junitxml=pytest_tools.xml --durations=100 tests/tools/*.py || fail "tools"
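End-to-end, the new flags make original-ID materialization opt-in when loading an on-disk dataset. A hedged usage sketch follows; the dataset directory path is hypothetical and must already contain a preprocessed GraphBolt on-disk dataset.

```
import dgl.graphbolt as gb

# Hypothetical directory produced by GraphBolt preprocessing.
dataset_dir = "data/ogbn-products-ondisk"

# Both flags are opt-in; dropping one skips materializing the matching
# ORIGINAL_*_ID attribute on the loaded graph.
dataset = gb.OnDiskDataset(
    dataset_dir,
    include_original_node_id=True,
    include_original_edge_id=True,
).load()

graph = dataset.graph
assert gb.ORIGINAL_NODE_ID in graph.node_attributes
assert gb.ORIGINAL_EDGE_ID in graph.edge_attributes
```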