From 644113386bdaaef11ef8d742932a8891ffd318cd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 10:14:22 +0000 Subject: [PATCH 01/35] contribute three IGB dataset (small version) --- examples/graphbolt/rgcn/download.py | 525 +++++++++++++++++++++++++ examples/graphbolt/rgcn/evaluator.py | 97 +++++ examples/graphbolt/rgcn/hetero_rgcn.py | 36 +- 3 files changed, 641 insertions(+), 17 deletions(-) create mode 100755 examples/graphbolt/rgcn/download.py create mode 100644 examples/graphbolt/rgcn/evaluator.py diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py new file mode 100755 index 000000000000..b506d38a7a48 --- /dev/null +++ b/examples/graphbolt/rgcn/download.py @@ -0,0 +1,525 @@ +import argparse, tarfile, hashlib, os, yaml, shutil +from tqdm import tqdm +import urllib.request as ur +import numpy as np +import subprocess +import dgl.graphbolt as gb + +GBFACTOR = float(1 << 30) + + +def _get_size(file_path, node_name): + if "full" in file_path: + return num_nodes["full"][node_name] + if "large" in file_path: + return num_nodes["large"][node_name] + path = f"{file_path}/processed/{node_name}/{node_name}_id_index_mapping.npy" + array = np.load(path, allow_pickle=True) + return len(array.item()) + + +def build_yaml_helper(path, in_memory=True): + data = { + "graph": { + "nodes": [ + { + "num": _get_size(path, "paper"), + "type": "paper" + }, + { + "num": _get_size(path, "author"), + "type": "author" + }, + { + "num": _get_size(path, "institute"), + "type": "institution" + }, + { + "num": _get_size(path, "fos"), + "type": "field_of_study" + } + ], + "edges": [ + { + "path": "edges/author__affiliated_to__institute.npy", + "type": "author:affiliated_to:institution", + "format": "numpy" + }, + { + "path": "edges/paper__written_by__author.npy", + "type": "paper:written_by:author", + "format": "numpy" + }, + { + "path": "edges/paper__cites__paper.npy", + "type": "paper:cites:paper", + "format": "numpy" + }, + { + "path": "edges/paper__topic__fos.npy", + "type": "paper:has_topic:field_of_study", + "format": "numpy" + }, + ], + }, + "tasks": [ + { + "num_classes": 19, + "validation_set": [ + { + "data": [ + { + "in_memory": in_memory, + "path": "set/validation_indices.npy", + "name": "seeds", + "format": "numpy" + }, + { + "in_memory": in_memory, + "path": "set/validation_labels.npy", + "name": "labels", + "format": "numpy" + } + ], + "type": "paper" + } + ], + "name": "node_classification", + "train_set": [ + { + "data": [ + { + "in_memory": in_memory, + "path": "set/train_indices.npy", + "name": "seeds", + "format": "numpy" + }, + { + "in_memory": in_memory, + "path": "set/train_labels.npy", + "name": "labels", + "format": "numpy" + } + ], + "type": "paper" + } + ], + "test_set": [ + { + "data": [ + { + "in_memory": in_memory, + "path": "set/test_indices.npy", + "name": "seeds", + "format": "numpy" + }, + { + "in_memory": in_memory, + "path": "set/test_labels.npy", + "name": "labels", + "format": "numpy" + } + ], + "type": "paper" + } + ] + } + ], + "feature_data": [ + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/paper_feat.npy", + "type": "paper" + }, + { + "domain": "node", + "name": "label", + "format": "numpy", + "in_memory": in_memory, + "path": "data/paper_label_19.npy", + "type": "paper" + }, + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/author_feat.npy", + "type": "author" + }, + { + "domain": "node", + "name": "feat", + "format": 
"numpy", + "in_memory": in_memory, + "path": "data/institute_feat.npy", + "type": "institute" + }, + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/fos_feat.npy", + "type": "fos" + }, + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/author_feat.npy", + "type": "author" + } + ], + "dataset_name": os.path.basename(path) + } + + return data + + +def build_yaml(original_path, current_path): + if "large" in current_path or "full" in current_path: + data = build_yaml_helper(original_path, in_memory=False) + else: + data = build_yaml_helper(original_path) + with open(f"{current_path}/metadata.yaml", 'w') as file: + yaml.dump(data, file, default_flow_style=False) + + +def decide_download(url): + d = ur.urlopen(url) + size = int(d.info()["Content-Length"])/GBFACTOR + ### confirm if larger than 1GB + if size > 1: + return input("This will download %.2fGB. Will you proceed? (y/N) " % (size)).lower() == "y" + else: + return True + + +dataset_urls = { + 'homogeneous' : { + 'tiny' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_tiny.tar.gz', + 'small' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_small.tar.gz', + 'medium' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_medium.tar.gz' + }, + 'heterogeneous' : { + 'tiny' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_tiny.tar.gz', + 'small' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_small.tar.gz', + 'medium' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_medium.tar.gz' + } +} + + +md5checksums = { + 'homogeneous' : { + 'tiny' : '34856534da55419b316d620e2d5b21be', + 'small' : '6781c699723529902ace0a95cafe6fe4', + 'medium' : '4640df4ceee46851fd18c0a44ddcc622' + }, + 'heterogeneous' : { + 'tiny' : '83fbc1091497ff92cf20afe82fae0ade', + 'small' : '2f42077be60a074aec24f7c60089e1bd', + 'medium' : '7f0df4296eca36553ff3a6a63abbd347' + } +} + + +def check_md5sum(dataset_type, dataset_size, filename): + original_md5 = md5checksums[dataset_type][dataset_size] + + with open(filename, 'rb') as file_to_check: + data = file_to_check.read() + md5_returned = hashlib.md5(data).hexdigest() + + if original_md5 == md5_returned: + print(" md5sum verified.") + return + else: + os.remove(filename) + raise Exception(" md5sum verification failed!.") + + +def download_dataset(path, dataset_type, dataset_size): + if dataset_size in ["large", "full"]: + command = f"./download_{dataset_size}_igbh.sh" + subprocess.run(['bash', command], check=True, text=True) + shutil.move(src=f"igb-{dataset_type}-{dataset_size}", dst=f"{path}") + return path + "/" + "igb-" + dataset_type + "-" + dataset_size + else: + output_directory = path + if not os.path.exists(output_directory + "igb_" + dataset_type + "_" + dataset_size + ".tar.gz"): + url = dataset_urls[dataset_type][dataset_size] + if decide_download(url): + data = ur.urlopen(url) + size = int(data.info()["Content-Length"]) + chunk_size = 1024*1024 + num_iter = int(size/chunk_size) + 2 + downloaded_size = 0 + filename = path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" + with open(filename, 'wb') as f: + pbar = tqdm(range(num_iter)) + for i in pbar: + chunk = data.read(chunk_size) + downloaded_size += len(chunk) + pbar.set_description("Downloaded {:.2f} GB".format(float(downloaded_size)/GBFACTOR)) + f.write(chunk) + 
print("Downloaded" + " igb_" + dataset_type + "_" + dataset_size, end=" ->") + check_md5sum(dataset_type, dataset_size, filename) + else: + print("The file igb_" + dataset_type + "_" + dataset_size + ".tar.gz already exists, directly extracting...") + filename = path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" + file = tarfile.open(filename) + file.extractall(output_directory) + file.close() + size = 0 + for path, dirs, files in os.walk(output_directory+ "/" + dataset_size): + for f in files: + fp = os.path.join(path, f) + size += os.path.getsize(fp) + print("Final dataset size {:.2f} GB.".format(size/GBFACTOR)) + # os.remove(filename) + os.rename(output_directory+ "/" + dataset_size, output_directory+ "/" + "igb-" + dataset_type + "-" + dataset_size) + return output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size + + +num_nodes = { + "full": { + "paper": 269346174, + "author": 277220883, + "institute": 26918, + "fos": 712960 + }, + "large": { + "paper": 100000000, + "author": 116959896, + "institute": 26524, + "fos": 649707 + }, + "medium": { + "paper": 10000000, + "author": 15544654, + "institute": 23256, + "fos": 415054 + }, + "small": { + "paper": 1000000, + "author": 1926066, + "institute": 14751, + "fos": 190449 + }, + "tiny": { + "paper": 100000, + "author": 357041, + "institute": 8738, + "fos": 84220 + } +} + +num_edges = { + "full": { + "paper__cites__paper": 3996442004, + "paper__written_by__author": 716761549, + "paper__topic__fos": 1050280600, + "author__affiliated_to__institute": 48521486 + }, + "large": { + "paper__cites__paper": 1223571364, + "paper__written_by__author": 289502107, + "paper__topic__fos": 457577294, + "author__affiliated_to__institute": 34099660 + }, + "medium": { + "paper__cites__paper": 120077694, + "paper__written_by__author": 39854592, + "paper__topic__fos": 68510495, + "author__affiliated_to__institute": 11049412 + }, + "small": { + "paper__cites__paper": 12070502, + "paper__written_by__author": 4553516, + "paper__topic__fos": 7234122, + "author__affiliated_to__institute": 1630476 + }, + "tiny": { + "paper__cites__paper": 447416, + "paper__written_by__author": 471443, + "paper__topic__fos": 718445, + "author__affiliated_to__institute": 325410 + } +} + + +def split_data(label_path, set_dir, dataset_size): + # labels = np.memmap(label_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) + labels = np.load(label_path) + + total_samples = len(labels) + train_end = int(0.8 * total_samples) + validation_end = int(0.9 * total_samples) + + indices = np.arange(total_samples) + train_indices = indices[:train_end] + validation_indices = indices[train_end:validation_end] + test_indices = indices[validation_end:] + print(indices) + print(train_indices) + print(validation_indices) + print(test_indices) + + train_labels = labels[:train_end] + validation_labels = labels[train_end:validation_end] + test_labels = labels[validation_end:] + print(train_labels, len(train_labels)) + print(validation_labels,len(validation_labels)) + print(test_labels, len(test_labels)) + + gb.numpy_save_aligned(f"{set_dir}/train_indices.npy", train_indices) + gb.numpy_save_aligned(f"{set_dir}/validation_indices.npy", validation_indices) + gb.numpy_save_aligned(f"{set_dir}/test_indices.npy", test_indices) + gb.numpy_save_aligned(f"{set_dir}/train_labels.npy", train_labels) + gb.numpy_save_aligned(f"{set_dir}/validation_labels.npy", validation_labels) + gb.numpy_save_aligned(f"{set_dir}/test_labels.npy", test_labels) + + +def add_edges(edges, 
source, dest, dataset_size): + for edge in edges: + print(f"\t Processing {edge} edge...") + + old_edge_path = source + "/" + edge + "/" + "edge_index.npy" + new_edge_path = dest + "/" + edge + ".npy" + os.rename(src=old_edge_path, dst=new_edge_path) + + # edge_array = np.memmap(new_edge_path, dtype='int32', mode='r', shape=(num_edges[dataset_size][edge], 2)) + edge_array = np.load(new_edge_path) + new_edge_array = edge_array.transpose() + + assert(new_edge_array.shape == (2, num_edges[dataset_size][edge])) + + np.save(new_edge_path, new_edge_array) + + +def process_feat(file_path, node_name, dataset_size): + # array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) + array = np.load(file_path) + assert(array.shape == (num_nodes[dataset_size][node_name], 1024)) + gb.numpy_save_aligned(file_path, array) + + # Assert the shape and elements of the array are correct + # new_array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) + new_array = np.load(file_path) + assert(array.shape == (num_nodes[dataset_size][node_name], 1024)) + assert(np.array_equal(array, new_array)) + + +def process_label(file_path, num_class, dataset_size): + if num_class == 2983 and dataset_size == "full": # only this case label number changes + # array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) + array = np.load(file_path) + assert(array.shape == (227130858, 1) or array.shape == (227130858,)) + else: + # array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) + array = np.load(file_path) + assert(array.shape == (num_nodes[dataset_size]["paper"], 1) or array.shape == (num_nodes[dataset_size]["paper"],)) + + gb.numpy_save_aligned(file_path, array) + + # Assert the shape and elements of the array are correct + if num_class == 2983 and dataset_size == "full": + # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) + new_array = np.load(file_path) + assert(new_array.shape == (227130858, 1) or new_array.shape == (227130858,)) + assert(np.array_equal(array, new_array)) + else: + # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) + new_array = np.load(file_path) + assert(new_array.shape == (num_nodes[dataset_size]["paper"], 1) or new_array.shape == (num_nodes[dataset_size]["paper"],)) + assert(np.array_equal(array, new_array)) + + +def add_nodes(nodes, source, dest, dataset_size): + for node in nodes: + print(f"\t Processing {node} node feature...") + old_node_path = source + "/" + node + "/" + "node_feat.npy" + new_node_path = dest + "/" + node + "_feat.npy" + os.rename(src=old_node_path, dst=new_node_path) + process_feat(file_path=new_node_path, node_name=node, dataset_size=dataset_size) + if node == "paper": + print(f"\t Processing {node} labels...") + old_label_path_19 = source + "/" + node + "/" + "node_label_19.npy" + new_label_path_19 = dest + "/" + "paper_label_19.npy" + os.rename(src=old_label_path_19, dst=new_label_path_19) + process_label(file_path=new_label_path_19, num_class=19, dataset_size=dataset_size) + + old_label_path_2K = source + "/" + node + "/" + "node_label_2K.npy" + new_label_path_2K = dest + "/" + "paper_label_2K.npy" + os.rename(src=old_label_path_2K, dst=new_label_path_2K) + process_label(file_path=new_label_path_19, num_class=2983, dataset_size=dataset_size) + + return new_label_path_19, new_label_path_2K + + +def process_dataset(path, dataset_size): + 
print(f"Starting to process the {dataset_size} dataset...") + + # Make the directory for processed dataset + processed_dir = path + "-seeds" + os.makedirs(name=processed_dir, exist_ok=True) + original_path = path + "/" + "processed" + + # Step 1: Move Nodes files + print("Processing Node files...") + node_dir = processed_dir + "/" + "data" + os.makedirs(name=node_dir, exist_ok=True) + # These are the four nodes in this citation network + nodes = [ + "paper", + "author", + "fos", + "institute" + ] + label_file_19, label_file_2K = add_nodes(nodes=nodes, source=original_path, dest=node_dir, dataset_size=dataset_size) + + # Step 2: Create labels + print("Processing train/valid/test files...") + set_dir = processed_dir + "/" + "set" + os.makedirs(name=set_dir, exist_ok=True) + split_data(label_path=label_file_19, set_dir=set_dir, dataset_size=dataset_size) + + # Step 3: Move edge files + print("Processing Edge files...") + edge_dir = processed_dir + "/" + "edges" + os.makedirs(name=edge_dir, exist_ok=True) + # These are the four edges in this citation network + edges = [ + "paper__cites__paper", + "paper__written_by__author", + "paper__topic__fos", + "author__affiliated_to__institute" + ] + add_edges(edges=edges, source=original_path, dest=edge_dir, dataset_size=dataset_size) + + # Step 4: Build the yaml file + print("Building yaml file...") + build_yaml(original_path=path, current_path=processed_dir) + + # shutil.rmtree(path) + print(f"Finished processing the {dataset_size} dataset") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--path', type=str, default='datasets/', + help='path to store the datasets') + parser.add_argument('--type', type=str, default='heterogeneous', + choices=['homogeneous', 'heterogeneous'], + help='dataset type') + parser.add_argument('--size', type=str, default='tiny', + choices=['tiny', 'small', 'medium', 'large', 'full'], + help='size of the datasets') + args = parser.parse_args() + path = download_dataset(path=args.path, dataset_type=args.type, dataset_size=args.size) + process_dataset(path=path, dataset_size=args.size) diff --git a/examples/graphbolt/rgcn/evaluator.py b/examples/graphbolt/rgcn/evaluator.py new file mode 100644 index 000000000000..6598355b0727 --- /dev/null +++ b/examples/graphbolt/rgcn/evaluator.py @@ -0,0 +1,97 @@ +import numpy as np +try: + import torch +except ImportError: + torch = None + +### Evaluator for node property prediction +class IGB_Evaluator: + def __init__(self, name, num_tasks, eval_metric): + self.name = name + self.num_tasks = num_tasks + self.eval_metric = eval_metric + + + def _parse_and_check_input(self, input_dict): + if self.eval_metric == 'acc': + if not 'y_true' in input_dict: + raise RuntimeError('Missing key of y_true') + if not 'y_pred' in input_dict: + raise RuntimeError('Missing key of y_pred') + + y_true, y_pred = input_dict['y_true'], input_dict['y_pred'] + + ''' + y_true: numpy ndarray or torch tensor of shape (num_nodes num_tasks) + y_pred: numpy ndarray or torch tensor of shape (num_nodes num_tasks) + ''' + + # converting to torch.Tensor to numpy on cpu + if torch is not None and isinstance(y_true, torch.Tensor): + y_true = y_true.detach().cpu().numpy() + + if torch is not None and isinstance(y_pred, torch.Tensor): + y_pred = y_pred.detach().cpu().numpy() + + ## check type + if not (isinstance(y_true, np.ndarray) and isinstance(y_true, np.ndarray)): + raise RuntimeError('Arguments to Evaluator need to be either numpy ndarray or torch tensor') + + if not y_true.shape == 
y_pred.shape: + raise RuntimeError('Shape of y_true and y_pred must be the same') + + if not y_true.ndim == 2: + raise RuntimeError('y_true and y_pred must to 2-dim arrray, {}-dim array given'.format(y_true.ndim)) + + if not y_true.shape[1] == self.num_tasks: + raise RuntimeError('Number of tasks for {} should be {} but {} given'.format(self.name, self.num_tasks, y_true.shape[1])) + + return y_true, y_pred + + else: + raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) + + + def eval(self, input_dict): + if self.eval_metric == 'acc': + y_true, y_pred = self._parse_and_check_input(input_dict) + return self._eval_acc(y_true, y_pred) + else: + raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) + + @property + def expected_input_format(self): + desc = '==== Expected input format of Evaluator for {}\n'.format(self.name) + if self.eval_metric == 'acc': + desc += '{\'y_true\': y_true, \'y_pred\': y_pred}\n' + desc += '- y_true: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n' + desc += '- y_pred: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n' + desc += 'where y_pred stores predicted class label (integer),\n' + desc += 'num_task is {}, and '.format(self.num_tasks) + desc += 'each row corresponds to one node.\n' + else: + raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) + + return desc + + @property + def expected_output_format(self): + desc = '==== Expected output format of Evaluator for {}\n'.format(self.name) + if self.eval_metric == 'acc': + desc += '{\'acc\': acc}\n' + desc += '- acc (float): Accuracy score averaged across {} task(s)\n'.format(self.num_tasks) + else: + raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) + + return desc + + def _eval_acc(self, y_true, y_pred): + acc_list = [] + + for i in range(y_true.shape[1]): + is_labeled = y_true[:,i] == y_true[:,i] + correct = y_true[is_labeled,i] == y_pred[is_labeled,i] + acc_list.append(float(np.sum(correct))/len(correct)) + + return {'acc': sum(acc_list)/len(acc_list)} + diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index 60ab51602ca1..503295733092 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -59,6 +59,7 @@ import torch.nn.functional as F from dgl.nn import HeteroEmbedding from ogb.lsc import MAG240MEvaluator +from evaluator import IGB_Evaluator from ogb.nodeproppred import Evaluator from tqdm import tqdm @@ -124,12 +125,7 @@ def create_dataloader( # The graph(FusedCSCSamplingGraph) from which to sample neighbors. # `fanouts`: # The number of neighbors to sample for each node in each layer. - datapipe = datapipe.sample_neighbor( - graph, - fanouts=fanouts, - overlap_fetch=args.overlap_graph_fetch, - asynchronous=args.asynchronous, - ) + datapipe = datapipe.sample_neighbor(graph, fanouts=fanouts) # Fetch the features for each node in the mini-batch. # `features`: @@ -141,6 +137,11 @@ def create_dataloader( if name == "ogb-lsc-mag240m": node_feature_keys["author"] = ["feat"] node_feature_keys["institution"] = ["feat"] + if "igb-heterogeneous" in name: + node_feature_keys["author"] = ["feat"] + node_feature_keys["institution"] = ["feat"] + node_feature_keys["fos"] = ["feat"] + datapipe = datapipe.fetch_feature(features, node_feature_keys) # Create a DataLoader from the datapipe. 
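The hunk above extends create_dataloader() so that, for the IGB heterogeneous datasets, raw features are also fetched for the "author", "institution", and "fos" node types in addition to "paper". Below is a minimal sketch (not part of the patch) of how these datapipe stages typically compose; the helper name, fanouts, batch size, and worker count are illustrative assumptions rather than values taken from this example.

    import dgl.graphbolt as gb

    def build_igb_dataloader(graph, features, itemset, device, job="train"):
        # Sample seed "paper" nodes in mini-batches.
        datapipe = gb.ItemSampler(itemset, batch_size=1024, shuffle=(job == "train"))
        # Move the seeds to the target device before sampling.
        datapipe = datapipe.copy_to(device)
        # Sample a fixed number of neighbors per layer (two layers here).
        datapipe = datapipe.sample_neighbor(graph, fanouts=[25, 10])
        # Fetch raw features for every IGB node type that stores them on disk.
        node_feature_keys = {
            "paper": ["feat"],
            "author": ["feat"],
            "institution": ["feat"],
            "fos": ["feat"],
        }
        datapipe = datapipe.fetch_feature(features, node_feature_keys)
        # Wrap the composed datapipe into a DataLoader.
        return gb.DataLoader(datapipe, num_workers=0)
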
@@ -158,7 +159,7 @@ def extract_embed(node_embed, input_nodes): def extract_node_features(name, block, data, node_embed, device): """Extract the node features from embedding layer or raw features.""" - if name == "ogbn-mag": + if name == "ogbn-mag" or "igb-heterogeneous" in name: input_nodes = { k: v.to(device) for k, v in block.srcdata[dgl.NID].items() } @@ -424,7 +425,9 @@ def evaluate( model.eval() category = "paper" # An evaluator for the dataset. - if name == "ogbn-mag": + if "igb-heterogeneous" in name: + evaluator = IGB_Evaluator(name=name, num_tasks=1, eval_metric="acc") + elif name == "ogbn-mag": evaluator = Evaluator(name=name) else: evaluator = MAG240MEvaluator() @@ -573,14 +576,9 @@ def main(args): ) = load_dataset(args.dataset) # Move the dataset to the pinned memory to enable GPU access. - args.overlap_graph_fetch = False - args.asynchronous = False if device == torch.device("cuda"): - g = g.pin_memory_() - features = features.pin_memory_() - # Enable optimizations for sampling on the GPU. - args.overlap_graph_fetch = True - args.asynchronous = True + g.pin_memory_() + features.pin_memory_() feat_size = features.size("node", "paper", "feat")[0] @@ -588,7 +586,8 @@ def main(args): # `institution` are generated in advance and stored in the feature store. # For `ogbn-mag`, we generate the features on the fly. embed_layer = None - if args.dataset == "ogbn-mag": + # if args.dataset == "ogbn-mag": + if args.dataset == "ogbn-mag" or "igb-heterogeneous" in args.dataset: # Create the embedding layer and move it to the appropriate device. embed_layer = rel_graph_embed(g, feat_size).to(device) print( @@ -663,7 +662,10 @@ def main(args): "--dataset", type=str, default="ogbn-mag", - choices=["ogbn-mag", "ogb-lsc-mag240m"], + # choices=["ogbn-mag", "ogb-lsc-mag240m"], + choices=["ogbn-mag", "ogb-lsc-mag240m", "igb-heterogeneous-tiny", + "igb-heterogeneous-small", "igb-heterogeneous-medium", + "igb-heterogeneous-large", "igb-heterogeneous-full"], help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m", ) parser.add_argument("--num_epochs", type=int, default=3) From 85d92c834f7667c24c5ea54f16a75ca16ea98510 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 10:20:20 +0000 Subject: [PATCH 02/35] contribute three IGB dataset (small version) --- examples/graphbolt/rgcn/hetero_rgcn.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index 503295733092..9c93b53a21c1 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -142,7 +142,12 @@ def create_dataloader( node_feature_keys["institution"] = ["feat"] node_feature_keys["fos"] = ["feat"] - datapipe = datapipe.fetch_feature(features, node_feature_keys) + datapipe = datapipe.sample_neighbor( + graph, + fanouts=fanouts, + overlap_fetch=args.overlap_graph_fetch, + asynchronous=args.asynchronous, + ) # Create a DataLoader from the datapipe. # `num_workers`: @@ -576,9 +581,14 @@ def main(args): ) = load_dataset(args.dataset) # Move the dataset to the pinned memory to enable GPU access. + args.overlap_graph_fetch = False + args.asynchronous = False if device == torch.device("cuda"): - g.pin_memory_() - features.pin_memory_() + g = g.pin_memory_() + features = features.pin_memory_() + # Enable optimizations for sampling on the GPU. 
+ args.overlap_graph_fetch = True + args.asynchronous = True feat_size = features.size("node", "paper", "feat")[0] From 5d2fe56cff826be4d7424505dc681f7bd95f6947 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 10:27:06 +0000 Subject: [PATCH 03/35] contribute three IGB dataset (small version) --- examples/graphbolt/rgcn/hetero_rgcn.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index 9c93b53a21c1..bb12b31e2069 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -125,7 +125,12 @@ def create_dataloader( # The graph(FusedCSCSamplingGraph) from which to sample neighbors. # `fanouts`: # The number of neighbors to sample for each node in each layer. - datapipe = datapipe.sample_neighbor(graph, fanouts=fanouts) + datapipe = datapipe.sample_neighbor( + graph, + fanouts=fanouts, + overlap_fetch=args.overlap_graph_fetch, + asynchronous=args.asynchronous, + ) # Fetch the features for each node in the mini-batch. # `features`: @@ -141,13 +146,7 @@ def create_dataloader( node_feature_keys["author"] = ["feat"] node_feature_keys["institution"] = ["feat"] node_feature_keys["fos"] = ["feat"] - - datapipe = datapipe.sample_neighbor( - graph, - fanouts=fanouts, - overlap_fetch=args.overlap_graph_fetch, - asynchronous=args.asynchronous, - ) + datapipe = datapipe.fetch_feature(features, node_feature_keys) # Create a DataLoader from the datapipe. # `num_workers`: @@ -672,7 +671,6 @@ def main(args): "--dataset", type=str, default="ogbn-mag", - # choices=["ogbn-mag", "ogb-lsc-mag240m"], choices=["ogbn-mag", "ogb-lsc-mag240m", "igb-heterogeneous-tiny", "igb-heterogeneous-small", "igb-heterogeneous-medium", "igb-heterogeneous-large", "igb-heterogeneous-full"], From 543f672276797ba3a39e7134871e2f8c7bf341a8 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 10:32:08 +0000 Subject: [PATCH 04/35] format the code with ufmt --- examples/graphbolt/rgcn/download.py | 555 ++++++++++++++----------- examples/graphbolt/rgcn/evaluator.py | 94 +++-- examples/graphbolt/rgcn/hetero_rgcn.py | 15 +- 3 files changed, 380 insertions(+), 284 deletions(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index b506d38a7a48..4724061476b5 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -1,9 +1,10 @@ -import argparse, tarfile, hashlib, os, yaml, shutil -from tqdm import tqdm -import urllib.request as ur -import numpy as np +import argparse, hashlib, os, shutil, tarfile, yaml import subprocess +import urllib.request as ur + import dgl.graphbolt as gb +import numpy as np +from tqdm import tqdm GBFACTOR = float(1 << 30) @@ -20,164 +21,152 @@ def _get_size(file_path, node_name): def build_yaml_helper(path, in_memory=True): data = { - "graph": { - "nodes": [ + "graph": { + "nodes": [ + {"num": _get_size(path, "paper"), "type": "paper"}, + {"num": _get_size(path, "author"), "type": "author"}, + {"num": _get_size(path, "institute"), "type": "institution"}, + {"num": _get_size(path, "fos"), "type": "field_of_study"}, + ], + "edges": [ { - "num": _get_size(path, "paper"), - "type": "paper" - }, - { - "num": _get_size(path, "author"), - "type": "author" - }, - { - "num": _get_size(path, "institute"), - "type": "institution" - }, - { - "num": _get_size(path, "fos"), - "type": "field_of_study" - } - ], - "edges": [ - { - "path": "edges/author__affiliated_to__institute.npy", - "type": 
"author:affiliated_to:institution", - "format": "numpy" + "path": "edges/author__affiliated_to__institute.npy", + "type": "author:affiliated_to:institution", + "format": "numpy", }, { - "path": "edges/paper__written_by__author.npy", - "type": "paper:written_by:author", - "format": "numpy" - }, - { - "path": "edges/paper__cites__paper.npy", - "type": "paper:cites:paper", - "format": "numpy" - }, + "path": "edges/paper__written_by__author.npy", + "type": "paper:written_by:author", + "format": "numpy", + }, { - "path": "edges/paper__topic__fos.npy", - "type": "paper:has_topic:field_of_study", - "format": "numpy" + "path": "edges/paper__cites__paper.npy", + "type": "paper:cites:paper", + "format": "numpy", }, - ], - }, - "tasks": [ { + "path": "edges/paper__topic__fos.npy", + "type": "paper:has_topic:field_of_study", + "format": "numpy", + }, + ], + }, + "tasks": [ + { "num_classes": 19, "validation_set": [ { - "data": [ - { - "in_memory": in_memory, - "path": "set/validation_indices.npy", - "name": "seeds", - "format": "numpy" - }, - { - "in_memory": in_memory, - "path": "set/validation_labels.npy", - "name": "labels", - "format": "numpy" - } - ], - "type": "paper" + "data": [ + { + "in_memory": in_memory, + "path": "set/validation_indices.npy", + "name": "seeds", + "format": "numpy", + }, + { + "in_memory": in_memory, + "path": "set/validation_labels.npy", + "name": "labels", + "format": "numpy", + }, + ], + "type": "paper", } - ], - "name": "node_classification", + ], + "name": "node_classification", "train_set": [ { - "data": [ - { - "in_memory": in_memory, - "path": "set/train_indices.npy", - "name": "seeds", - "format": "numpy" - }, - { - "in_memory": in_memory, - "path": "set/train_labels.npy", - "name": "labels", - "format": "numpy" - } - ], - "type": "paper" + "data": [ + { + "in_memory": in_memory, + "path": "set/train_indices.npy", + "name": "seeds", + "format": "numpy", + }, + { + "in_memory": in_memory, + "path": "set/train_labels.npy", + "name": "labels", + "format": "numpy", + }, + ], + "type": "paper", } - ], + ], "test_set": [ { - "data": [ - { - "in_memory": in_memory, - "path": "set/test_indices.npy", - "name": "seeds", - "format": "numpy" - }, - { - "in_memory": in_memory, - "path": "set/test_labels.npy", - "name": "labels", - "format": "numpy" - } - ], - "type": "paper" + "data": [ + { + "in_memory": in_memory, + "path": "set/test_indices.npy", + "name": "seeds", + "format": "numpy", + }, + { + "in_memory": in_memory, + "path": "set/test_labels.npy", + "name": "labels", + "format": "numpy", + }, + ], + "type": "paper", } - ] - } - ], - "feature_data": [ - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/paper_feat.npy", - "type": "paper" - }, - { - "domain": "node", - "name": "label", - "format": "numpy", - "in_memory": in_memory, - "path": "data/paper_label_19.npy", - "type": "paper" - }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/author_feat.npy", - "type": "author" - }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/institute_feat.npy", - "type": "institute" - }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/fos_feat.npy", - "type": "fos" - }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/author_feat.npy", - "type": "author" - } - ], - "dataset_name": os.path.basename(path) + ], } - 
+ ], + "feature_data": [ + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/paper_feat.npy", + "type": "paper", + }, + { + "domain": "node", + "name": "label", + "format": "numpy", + "in_memory": in_memory, + "path": "data/paper_label_19.npy", + "type": "paper", + }, + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/author_feat.npy", + "type": "author", + }, + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/institute_feat.npy", + "type": "institute", + }, + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/fos_feat.npy", + "type": "fos", + }, + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/author_feat.npy", + "type": "author", + }, + ], + "dataset_name": os.path.basename(path), + } + return data @@ -186,53 +175,58 @@ def build_yaml(original_path, current_path): data = build_yaml_helper(original_path, in_memory=False) else: data = build_yaml_helper(original_path) - with open(f"{current_path}/metadata.yaml", 'w') as file: + with open(f"{current_path}/metadata.yaml", "w") as file: yaml.dump(data, file, default_flow_style=False) def decide_download(url): d = ur.urlopen(url) - size = int(d.info()["Content-Length"])/GBFACTOR + size = int(d.info()["Content-Length"]) / GBFACTOR ### confirm if larger than 1GB if size > 1: - return input("This will download %.2fGB. Will you proceed? (y/N) " % (size)).lower() == "y" + return ( + input( + "This will download %.2fGB. Will you proceed? (y/N) " % (size) + ).lower() + == "y" + ) else: return True dataset_urls = { - 'homogeneous' : { - 'tiny' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_tiny.tar.gz', - 'small' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_small.tar.gz', - 'medium' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_medium.tar.gz' + "homogeneous": { + "tiny": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_tiny.tar.gz", + "small": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_small.tar.gz", + "medium": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_medium.tar.gz", + }, + "heterogeneous": { + "tiny": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_tiny.tar.gz", + "small": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_small.tar.gz", + "medium": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_medium.tar.gz", }, - 'heterogeneous' : { - 'tiny' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_tiny.tar.gz', - 'small' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_small.tar.gz', - 'medium' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_medium.tar.gz' - } } md5checksums = { - 'homogeneous' : { - 'tiny' : '34856534da55419b316d620e2d5b21be', - 'small' : '6781c699723529902ace0a95cafe6fe4', - 'medium' : '4640df4ceee46851fd18c0a44ddcc622' + "homogeneous": { + "tiny": "34856534da55419b316d620e2d5b21be", + "small": "6781c699723529902ace0a95cafe6fe4", + "medium": "4640df4ceee46851fd18c0a44ddcc622", + }, + "heterogeneous": { + "tiny": "83fbc1091497ff92cf20afe82fae0ade", + "small": 
"2f42077be60a074aec24f7c60089e1bd", + "medium": "7f0df4296eca36553ff3a6a63abbd347", }, - 'heterogeneous' : { - 'tiny' : '83fbc1091497ff92cf20afe82fae0ade', - 'small' : '2f42077be60a074aec24f7c60089e1bd', - 'medium' : '7f0df4296eca36553ff3a6a63abbd347' - } } def check_md5sum(dataset_type, dataset_size, filename): original_md5 = md5checksums[dataset_type][dataset_size] - with open(filename, 'rb') as file_to_check: - data = file_to_check.read() + with open(filename, "rb") as file_to_check: + data = file_to_check.read() md5_returned = hashlib.md5(data).hexdigest() if original_md5 == md5_returned: @@ -241,49 +235,83 @@ def check_md5sum(dataset_type, dataset_size, filename): else: os.remove(filename) raise Exception(" md5sum verification failed!.") - + def download_dataset(path, dataset_type, dataset_size): if dataset_size in ["large", "full"]: command = f"./download_{dataset_size}_igbh.sh" - subprocess.run(['bash', command], check=True, text=True) + subprocess.run(["bash", command], check=True, text=True) shutil.move(src=f"igb-{dataset_type}-{dataset_size}", dst=f"{path}") return path + "/" + "igb-" + dataset_type + "-" + dataset_size - else: + else: output_directory = path - if not os.path.exists(output_directory + "igb_" + dataset_type + "_" + dataset_size + ".tar.gz"): + if not os.path.exists( + output_directory + + "igb_" + + dataset_type + + "_" + + dataset_size + + ".tar.gz" + ): url = dataset_urls[dataset_type][dataset_size] if decide_download(url): data = ur.urlopen(url) size = int(data.info()["Content-Length"]) - chunk_size = 1024*1024 - num_iter = int(size/chunk_size) + 2 + chunk_size = 1024 * 1024 + num_iter = int(size / chunk_size) + 2 downloaded_size = 0 - filename = path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" - with open(filename, 'wb') as f: + filename = ( + path + + "/igb_" + + dataset_type + + "_" + + dataset_size + + ".tar.gz" + ) + with open(filename, "wb") as f: pbar = tqdm(range(num_iter)) for i in pbar: chunk = data.read(chunk_size) downloaded_size += len(chunk) - pbar.set_description("Downloaded {:.2f} GB".format(float(downloaded_size)/GBFACTOR)) + pbar.set_description( + "Downloaded {:.2f} GB".format( + float(downloaded_size) / GBFACTOR + ) + ) f.write(chunk) - print("Downloaded" + " igb_" + dataset_type + "_" + dataset_size, end=" ->") + print( + "Downloaded" + " igb_" + dataset_type + "_" + dataset_size, + end=" ->", + ) check_md5sum(dataset_type, dataset_size, filename) - else: - print("The file igb_" + dataset_type + "_" + dataset_size + ".tar.gz already exists, directly extracting...") - filename = path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" + else: + print( + "The file igb_" + + dataset_type + + "_" + + dataset_size + + ".tar.gz already exists, directly extracting..." 
+ ) + filename = ( + path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" + ) file = tarfile.open(filename) file.extractall(output_directory) file.close() size = 0 - for path, dirs, files in os.walk(output_directory+ "/" + dataset_size): + for path, dirs, files in os.walk(output_directory + "/" + dataset_size): for f in files: fp = os.path.join(path, f) size += os.path.getsize(fp) - print("Final dataset size {:.2f} GB.".format(size/GBFACTOR)) + print("Final dataset size {:.2f} GB.".format(size / GBFACTOR)) # os.remove(filename) - os.rename(output_directory+ "/" + dataset_size, output_directory+ "/" + "igb-" + dataset_type + "-" + dataset_size) - return output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size + os.rename( + output_directory + "/" + dataset_size, + output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size, + ) + return ( + output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size + ) num_nodes = { @@ -291,32 +319,32 @@ def download_dataset(path, dataset_type, dataset_size): "paper": 269346174, "author": 277220883, "institute": 26918, - "fos": 712960 - }, + "fos": 712960, + }, "large": { "paper": 100000000, "author": 116959896, "institute": 26524, - "fos": 649707 + "fos": 649707, }, "medium": { - "paper": 10000000, + "paper": 10000000, "author": 15544654, "institute": 23256, - "fos": 415054 - }, - "small": { + "fos": 415054, + }, + "small": { "paper": 1000000, "author": 1926066, "institute": 14751, - "fos": 190449 + "fos": 190449, }, "tiny": { "paper": 100000, "author": 357041, "institute": 8738, - "fos": 84220 - } + "fos": 84220, + }, } num_edges = { @@ -324,32 +352,32 @@ def download_dataset(path, dataset_type, dataset_size): "paper__cites__paper": 3996442004, "paper__written_by__author": 716761549, "paper__topic__fos": 1050280600, - "author__affiliated_to__institute": 48521486 + "author__affiliated_to__institute": 48521486, }, "large": { "paper__cites__paper": 1223571364, "paper__written_by__author": 289502107, "paper__topic__fos": 457577294, - "author__affiliated_to__institute": 34099660 + "author__affiliated_to__institute": 34099660, }, "medium": { "paper__cites__paper": 120077694, "paper__written_by__author": 39854592, "paper__topic__fos": 68510495, - "author__affiliated_to__institute": 11049412 + "author__affiliated_to__institute": 11049412, }, "small": { "paper__cites__paper": 12070502, "paper__written_by__author": 4553516, "paper__topic__fos": 7234122, - "author__affiliated_to__institute": 1630476 - }, + "author__affiliated_to__institute": 1630476, + }, "tiny": { "paper__cites__paper": 447416, "paper__written_by__author": 471443, "paper__topic__fos": 718445, - "author__affiliated_to__institute": 325410 - } + "author__affiliated_to__institute": 325410, + }, } @@ -374,11 +402,13 @@ def split_data(label_path, set_dir, dataset_size): validation_labels = labels[train_end:validation_end] test_labels = labels[validation_end:] print(train_labels, len(train_labels)) - print(validation_labels,len(validation_labels)) + print(validation_labels, len(validation_labels)) print(test_labels, len(test_labels)) gb.numpy_save_aligned(f"{set_dir}/train_indices.npy", train_indices) - gb.numpy_save_aligned(f"{set_dir}/validation_indices.npy", validation_indices) + gb.numpy_save_aligned( + f"{set_dir}/validation_indices.npy", validation_indices + ) gb.numpy_save_aligned(f"{set_dir}/test_indices.npy", test_indices) gb.numpy_save_aligned(f"{set_dir}/train_labels.npy", train_labels) gb.numpy_save_aligned(f"{set_dir}/validation_labels.npy", 
validation_labels) @@ -388,7 +418,7 @@ def split_data(label_path, set_dir, dataset_size): def add_edges(edges, source, dest, dataset_size): for edge in edges: print(f"\t Processing {edge} edge...") - + old_edge_path = source + "/" + edge + "/" + "edge_index.npy" new_edge_path = dest + "/" + edge + ".npy" os.rename(src=old_edge_path, dst=new_edge_path) @@ -397,7 +427,7 @@ def add_edges(edges, source, dest, dataset_size): edge_array = np.load(new_edge_path) new_edge_array = edge_array.transpose() - assert(new_edge_array.shape == (2, num_edges[dataset_size][edge])) + assert new_edge_array.shape == (2, num_edges[dataset_size][edge]) np.save(new_edge_path, new_edge_array) @@ -405,25 +435,30 @@ def add_edges(edges, source, dest, dataset_size): def process_feat(file_path, node_name, dataset_size): # array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) array = np.load(file_path) - assert(array.shape == (num_nodes[dataset_size][node_name], 1024)) + assert array.shape == (num_nodes[dataset_size][node_name], 1024) gb.numpy_save_aligned(file_path, array) # Assert the shape and elements of the array are correct # new_array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) new_array = np.load(file_path) - assert(array.shape == (num_nodes[dataset_size][node_name], 1024)) - assert(np.array_equal(array, new_array)) - + assert array.shape == (num_nodes[dataset_size][node_name], 1024) + assert np.array_equal(array, new_array) + def process_label(file_path, num_class, dataset_size): - if num_class == 2983 and dataset_size == "full": # only this case label number changes + if ( + num_class == 2983 and dataset_size == "full" + ): # only this case label number changes # array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) array = np.load(file_path) - assert(array.shape == (227130858, 1) or array.shape == (227130858,)) + assert array.shape == (227130858, 1) or array.shape == (227130858,) else: # array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) array = np.load(file_path) - assert(array.shape == (num_nodes[dataset_size]["paper"], 1) or array.shape == (num_nodes[dataset_size]["paper"],)) + assert array.shape == ( + num_nodes[dataset_size]["paper"], + 1, + ) or array.shape == (num_nodes[dataset_size]["paper"],) gb.numpy_save_aligned(file_path, array) @@ -431,13 +466,18 @@ def process_label(file_path, num_class, dataset_size): if num_class == 2983 and dataset_size == "full": # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) new_array = np.load(file_path) - assert(new_array.shape == (227130858, 1) or new_array.shape == (227130858,)) - assert(np.array_equal(array, new_array)) - else: + assert new_array.shape == (227130858, 1) or new_array.shape == ( + 227130858, + ) + assert np.array_equal(array, new_array) + else: # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) new_array = np.load(file_path) - assert(new_array.shape == (num_nodes[dataset_size]["paper"], 1) or new_array.shape == (num_nodes[dataset_size]["paper"],)) - assert(np.array_equal(array, new_array)) + assert new_array.shape == ( + num_nodes[dataset_size]["paper"], + 1, + ) or new_array.shape == (num_nodes[dataset_size]["paper"],) + assert np.array_equal(array, new_array) def add_nodes(nodes, source, dest, dataset_size): @@ -446,18 +486,28 @@ def add_nodes(nodes, source, dest, dataset_size): 
old_node_path = source + "/" + node + "/" + "node_feat.npy" new_node_path = dest + "/" + node + "_feat.npy" os.rename(src=old_node_path, dst=new_node_path) - process_feat(file_path=new_node_path, node_name=node, dataset_size=dataset_size) + process_feat( + file_path=new_node_path, node_name=node, dataset_size=dataset_size + ) if node == "paper": print(f"\t Processing {node} labels...") old_label_path_19 = source + "/" + node + "/" + "node_label_19.npy" new_label_path_19 = dest + "/" + "paper_label_19.npy" os.rename(src=old_label_path_19, dst=new_label_path_19) - process_label(file_path=new_label_path_19, num_class=19, dataset_size=dataset_size) + process_label( + file_path=new_label_path_19, + num_class=19, + dataset_size=dataset_size, + ) old_label_path_2K = source + "/" + node + "/" + "node_label_2K.npy" new_label_path_2K = dest + "/" + "paper_label_2K.npy" os.rename(src=old_label_path_2K, dst=new_label_path_2K) - process_label(file_path=new_label_path_19, num_class=2983, dataset_size=dataset_size) + process_label( + file_path=new_label_path_19, + num_class=2983, + dataset_size=dataset_size, + ) return new_label_path_19, new_label_path_2K @@ -475,19 +525,21 @@ def process_dataset(path, dataset_size): node_dir = processed_dir + "/" + "data" os.makedirs(name=node_dir, exist_ok=True) # These are the four nodes in this citation network - nodes = [ - "paper", - "author", - "fos", - "institute" - ] - label_file_19, label_file_2K = add_nodes(nodes=nodes, source=original_path, dest=node_dir, dataset_size=dataset_size) + nodes = ["paper", "author", "fos", "institute"] + label_file_19, label_file_2K = add_nodes( + nodes=nodes, + source=original_path, + dest=node_dir, + dataset_size=dataset_size, + ) # Step 2: Create labels print("Processing train/valid/test files...") set_dir = processed_dir + "/" + "set" os.makedirs(name=set_dir, exist_ok=True) - split_data(label_path=label_file_19, set_dir=set_dir, dataset_size=dataset_size) + split_data( + label_path=label_file_19, set_dir=set_dir, dataset_size=dataset_size + ) # Step 3: Move edge files print("Processing Edge files...") @@ -498,9 +550,14 @@ def process_dataset(path, dataset_size): "paper__cites__paper", "paper__written_by__author", "paper__topic__fos", - "author__affiliated_to__institute" + "author__affiliated_to__institute", ] - add_edges(edges=edges, source=original_path, dest=edge_dir, dataset_size=dataset_size) + add_edges( + edges=edges, + source=original_path, + dest=edge_dir, + dataset_size=dataset_size, + ) # Step 4: Build the yaml file print("Building yaml file...") @@ -510,16 +567,30 @@ def process_dataset(path, dataset_size): print(f"Finished processing the {dataset_size} dataset") -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--path', type=str, default='datasets/', - help='path to store the datasets') - parser.add_argument('--type', type=str, default='heterogeneous', - choices=['homogeneous', 'heterogeneous'], - help='dataset type') - parser.add_argument('--size', type=str, default='tiny', - choices=['tiny', 'small', 'medium', 'large', 'full'], - help='size of the datasets') - args = parser.parse_args() - path = download_dataset(path=args.path, dataset_type=args.type, dataset_size=args.size) + parser.add_argument( + "--path", + type=str, + default="datasets/", + help="path to store the datasets", + ) + parser.add_argument( + "--type", + type=str, + default="heterogeneous", + choices=["homogeneous", "heterogeneous"], + help="dataset type", + ) + parser.add_argument( 
+ "--size", + type=str, + default="tiny", + choices=["tiny", "small", "medium", "large", "full"], + help="size of the datasets", + ) + args = parser.parse_args() + path = download_dataset( + path=args.path, dataset_type=args.type, dataset_size=args.size + ) process_dataset(path=path, dataset_size=args.size) diff --git a/examples/graphbolt/rgcn/evaluator.py b/examples/graphbolt/rgcn/evaluator.py index 6598355b0727..cbded66c264d 100644 --- a/examples/graphbolt/rgcn/evaluator.py +++ b/examples/graphbolt/rgcn/evaluator.py @@ -1,9 +1,11 @@ import numpy as np + try: import torch except ImportError: torch = None + ### Evaluator for node property prediction class IGB_Evaluator: def __init__(self, name, num_tasks, eval_metric): @@ -11,20 +13,19 @@ def __init__(self, name, num_tasks, eval_metric): self.num_tasks = num_tasks self.eval_metric = eval_metric - def _parse_and_check_input(self, input_dict): - if self.eval_metric == 'acc': - if not 'y_true' in input_dict: - raise RuntimeError('Missing key of y_true') - if not 'y_pred' in input_dict: - raise RuntimeError('Missing key of y_pred') + if self.eval_metric == "acc": + if not "y_true" in input_dict: + raise RuntimeError("Missing key of y_true") + if not "y_pred" in input_dict: + raise RuntimeError("Missing key of y_pred") - y_true, y_pred = input_dict['y_true'], input_dict['y_pred'] + y_true, y_pred = input_dict["y_true"], input_dict["y_pred"] - ''' + """ y_true: numpy ndarray or torch tensor of shape (num_nodes num_tasks) y_pred: numpy ndarray or torch tensor of shape (num_nodes num_tasks) - ''' + """ # converting to torch.Tensor to numpy on cpu if torch is not None and isinstance(y_true, torch.Tensor): @@ -34,54 +35,74 @@ def _parse_and_check_input(self, input_dict): y_pred = y_pred.detach().cpu().numpy() ## check type - if not (isinstance(y_true, np.ndarray) and isinstance(y_true, np.ndarray)): - raise RuntimeError('Arguments to Evaluator need to be either numpy ndarray or torch tensor') + if not ( + isinstance(y_true, np.ndarray) + and isinstance(y_true, np.ndarray) + ): + raise RuntimeError( + "Arguments to Evaluator need to be either numpy ndarray or torch tensor" + ) if not y_true.shape == y_pred.shape: - raise RuntimeError('Shape of y_true and y_pred must be the same') + raise RuntimeError( + "Shape of y_true and y_pred must be the same" + ) if not y_true.ndim == 2: - raise RuntimeError('y_true and y_pred must to 2-dim arrray, {}-dim array given'.format(y_true.ndim)) + raise RuntimeError( + "y_true and y_pred must to 2-dim arrray, {}-dim array given".format( + y_true.ndim + ) + ) if not y_true.shape[1] == self.num_tasks: - raise RuntimeError('Number of tasks for {} should be {} but {} given'.format(self.name, self.num_tasks, y_true.shape[1])) + raise RuntimeError( + "Number of tasks for {} should be {} but {} given".format( + self.name, self.num_tasks, y_true.shape[1] + ) + ) return y_true, y_pred else: - raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) - + raise ValueError("Undefined eval metric %s " % (self.eval_metric)) def eval(self, input_dict): - if self.eval_metric == 'acc': + if self.eval_metric == "acc": y_true, y_pred = self._parse_and_check_input(input_dict) return self._eval_acc(y_true, y_pred) else: - raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) + raise ValueError("Undefined eval metric %s " % (self.eval_metric)) @property def expected_input_format(self): - desc = '==== Expected input format of Evaluator for {}\n'.format(self.name) - if self.eval_metric == 'acc': - desc += '{\'y_true\': 
y_true, \'y_pred\': y_pred}\n' - desc += '- y_true: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n' - desc += '- y_pred: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n' - desc += 'where y_pred stores predicted class label (integer),\n' - desc += 'num_task is {}, and '.format(self.num_tasks) - desc += 'each row corresponds to one node.\n' + desc = "==== Expected input format of Evaluator for {}\n".format( + self.name + ) + if self.eval_metric == "acc": + desc += "{'y_true': y_true, 'y_pred': y_pred}\n" + desc += "- y_true: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n" + desc += "- y_pred: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n" + desc += "where y_pred stores predicted class label (integer),\n" + desc += "num_task is {}, and ".format(self.num_tasks) + desc += "each row corresponds to one node.\n" else: - raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) + raise ValueError("Undefined eval metric %s " % (self.eval_metric)) return desc @property def expected_output_format(self): - desc = '==== Expected output format of Evaluator for {}\n'.format(self.name) - if self.eval_metric == 'acc': - desc += '{\'acc\': acc}\n' - desc += '- acc (float): Accuracy score averaged across {} task(s)\n'.format(self.num_tasks) + desc = "==== Expected output format of Evaluator for {}\n".format( + self.name + ) + if self.eval_metric == "acc": + desc += "{'acc': acc}\n" + desc += "- acc (float): Accuracy score averaged across {} task(s)\n".format( + self.num_tasks + ) else: - raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) + raise ValueError("Undefined eval metric %s " % (self.eval_metric)) return desc @@ -89,9 +110,8 @@ def _eval_acc(self, y_true, y_pred): acc_list = [] for i in range(y_true.shape[1]): - is_labeled = y_true[:,i] == y_true[:,i] - correct = y_true[is_labeled,i] == y_pred[is_labeled,i] - acc_list.append(float(np.sum(correct))/len(correct)) - - return {'acc': sum(acc_list)/len(acc_list)} + is_labeled = y_true[:, i] == y_true[:, i] + correct = y_true[is_labeled, i] == y_pred[is_labeled, i] + acc_list.append(float(np.sum(correct)) / len(correct)) + return {"acc": sum(acc_list) / len(acc_list)} diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index bb12b31e2069..7516e16a857a 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -58,8 +58,8 @@ import torch.nn as nn import torch.nn.functional as F from dgl.nn import HeteroEmbedding -from ogb.lsc import MAG240MEvaluator from evaluator import IGB_Evaluator +from ogb.lsc import MAG240MEvaluator from ogb.nodeproppred import Evaluator from tqdm import tqdm @@ -595,7 +595,6 @@ def main(args): # `institution` are generated in advance and stored in the feature store. # For `ogbn-mag`, we generate the features on the fly. embed_layer = None - # if args.dataset == "ogbn-mag": if args.dataset == "ogbn-mag" or "igb-heterogeneous" in args.dataset: # Create the embedding layer and move it to the appropriate device. 
embed_layer = rel_graph_embed(g, feat_size).to(device) @@ -671,9 +670,15 @@ def main(args): "--dataset", type=str, default="ogbn-mag", - choices=["ogbn-mag", "ogb-lsc-mag240m", "igb-heterogeneous-tiny", - "igb-heterogeneous-small", "igb-heterogeneous-medium", - "igb-heterogeneous-large", "igb-heterogeneous-full"], + choices=[ + "ogbn-mag", + "ogb-lsc-mag240m", + "igb-heterogeneous-tiny", + "igb-heterogeneous-small", + "igb-heterogeneous-medium", + "igb-heterogeneous-large", + "igb-heterogeneous-full", + ], help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m", ) parser.add_argument("--num_epochs", type=int, default=3) From 6bf10cb47d0765488e58646798254c4f3278b62c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 20:40:11 +0000 Subject: [PATCH 05/35] added documentation --- examples/graphbolt/rgcn/download.py | 116 +++++++++++++++------------- 1 file changed, 64 insertions(+), 52 deletions(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index 4724061476b5..9f3b51ceb6b7 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -9,24 +9,21 @@ GBFACTOR = float(1 << 30) -def _get_size(file_path, node_name): - if "full" in file_path: - return num_nodes["full"][node_name] - if "large" in file_path: - return num_nodes["large"][node_name] - path = f"{file_path}/processed/{node_name}/{node_name}_id_index_mapping.npy" - array = np.load(path, allow_pickle=True) - return len(array.item()) - - -def build_yaml_helper(path, in_memory=True): +def build_yaml_helper(path, dataset_size, in_memory=True): + """The stirng to build the yaml file. (Still need modification)""" data = { "graph": { "nodes": [ - {"num": _get_size(path, "paper"), "type": "paper"}, - {"num": _get_size(path, "author"), "type": "author"}, - {"num": _get_size(path, "institute"), "type": "institution"}, - {"num": _get_size(path, "fos"), "type": "field_of_study"}, + {"num": num_nodes[dataset_size]["paper"], "type": "paper"}, + {"num": num_nodes[dataset_size]["author"], "type": "author"}, + { + "num": num_nodes[dataset_size]["institute"], + "type": "institution", + }, + { + "num": num_nodes[dataset_size]["fos"], + "type": "field_of_study", + }, ], "edges": [ { @@ -170,28 +167,18 @@ def build_yaml_helper(path, in_memory=True): return data -def build_yaml(original_path, current_path): - if "large" in current_path or "full" in current_path: - data = build_yaml_helper(original_path, in_memory=False) - else: - data = build_yaml_helper(original_path) - with open(f"{current_path}/metadata.yaml", "w") as file: - yaml.dump(data, file, default_flow_style=False) - - -def decide_download(url): - d = ur.urlopen(url) - size = int(d.info()["Content-Length"]) / GBFACTOR - ### confirm if larger than 1GB - if size > 1: - return ( - input( - "This will download %.2fGB. Will you proceed? (y/N) " % (size) - ).lower() - == "y" +def build_yaml(original_path, current_path, dataset_size): + """This build the yaml file differently based on the dataset size. + The two large datasets are put in disk while the other three smaller versions are in memory. 
+ """ + if "large" == dataset_size or "full" == dataset_size: + data = build_yaml_helper( + path=original_path, dataset_size=dataset_size, in_memory=False ) else: - return True + data = build_yaml_helper(path=original_path, dataset_size=dataset_size) + with open(f"{current_path}/metadata.yaml", "w") as file: + yaml.dump(data=data, stream=file, default_flow_style=False) dataset_urls = { @@ -222,7 +209,24 @@ def decide_download(url): } +def decide_download(url): + """An interactive command line to confirm download.""" + d = ur.urlopen(url) + size = int(d.info()["Content-Length"]) / GBFACTOR + ### confirm if larger than 1GB + if size > 1: + return ( + input( + "This will download %.2fGB. Will you proceed? (y/N) " % (size) + ).lower() + == "y" + ) + else: + return True + + def check_md5sum(dataset_type, dataset_size, filename): + """This is for checking the data correctness of the downloaded datasets.""" original_md5 = md5checksums[dataset_type][dataset_size] with open(filename, "rb") as file_to_check: @@ -238,11 +242,15 @@ def check_md5sum(dataset_type, dataset_size, filename): def download_dataset(path, dataset_type, dataset_size): + """This is the script to download all the related datasets.""" + + # For large datasets, use the two shell scripts to download. if dataset_size in ["large", "full"]: command = f"./download_{dataset_size}_igbh.sh" subprocess.run(["bash", command], check=True, text=True) shutil.move(src=f"igb-{dataset_type}-{dataset_size}", dst=f"{path}") return path + "/" + "igb-" + dataset_type + "-" + dataset_size + # For the three smaller version, use the url to download. else: output_directory = path if not os.path.exists( @@ -284,7 +292,7 @@ def download_dataset(path, dataset_type, dataset_size): end=" ->", ) check_md5sum(dataset_type, dataset_size, filename) - else: + else: # No need to download the tar file again if it is already downloaded. 
print( "The file igb_" + dataset_type @@ -295,6 +303,7 @@ def download_dataset(path, dataset_type, dataset_size): filename = ( path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" ) + # Extract the tar file file = tarfile.open(filename) file.extractall(output_directory) file.close() @@ -382,6 +391,7 @@ def download_dataset(path, dataset_type, dataset_size): def split_data(label_path, set_dir, dataset_size): + """This is for splitting the labels into three sets: train, validation, and test sets.""" # labels = np.memmap(label_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) labels = np.load(label_path) @@ -416,6 +426,7 @@ def split_data(label_path, set_dir, dataset_size): def add_edges(edges, source, dest, dataset_size): + """This is for processing the edges in the graph and convert them to correct shape.""" for edge in edges: print(f"\t Processing {edge} edge...") @@ -428,11 +439,13 @@ def add_edges(edges, source, dest, dataset_size): new_edge_array = edge_array.transpose() assert new_edge_array.shape == (2, num_edges[dataset_size][edge]) + assert np.array_equal(edge_array, new_edge_array.transpose()) - np.save(new_edge_path, new_edge_array) + gb.numpy_save_aligned(new_edge_path, new_edge_array) def process_feat(file_path, node_name, dataset_size): + """This is for processing the node features.""" # array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) array = np.load(file_path) assert array.shape == (num_nodes[dataset_size][node_name], 1024) @@ -441,24 +454,22 @@ def process_feat(file_path, node_name, dataset_size): # Assert the shape and elements of the array are correct # new_array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) new_array = np.load(file_path) - assert array.shape == (num_nodes[dataset_size][node_name], 1024) + assert new_array.shape == (num_nodes[dataset_size][node_name], 1024) assert np.array_equal(array, new_array) def process_label(file_path, num_class, dataset_size): + """This is for processing the node labels.""" if ( num_class == 2983 and dataset_size == "full" ): # only this case label number changes # array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) array = np.load(file_path) - assert array.shape == (227130858, 1) or array.shape == (227130858,) + assert array.shape[0] == 227130858 else: # array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) array = np.load(file_path) - assert array.shape == ( - num_nodes[dataset_size]["paper"], - 1, - ) or array.shape == (num_nodes[dataset_size]["paper"],) + assert array.shape[0] == num_nodes[dataset_size]["paper"] gb.numpy_save_aligned(file_path, array) @@ -466,21 +477,17 @@ def process_label(file_path, num_class, dataset_size): if num_class == 2983 and dataset_size == "full": # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) new_array = np.load(file_path) - assert new_array.shape == (227130858, 1) or new_array.shape == ( - 227130858, - ) + assert new_array.shape[0] == 227130858 assert np.array_equal(array, new_array) else: # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) new_array = np.load(file_path) - assert new_array.shape == ( - num_nodes[dataset_size]["paper"], - 1, - ) or new_array.shape == (num_nodes[dataset_size]["paper"],) + assert new_array.shape[0] == num_nodes[dataset_size]["paper"] assert np.array_equal(array, 
new_array) def add_nodes(nodes, source, dest, dataset_size): + """This is for processing the nodes in the graph and store them in correct format.""" for node in nodes: print(f"\t Processing {node} node feature...") old_node_path = source + "/" + node + "/" + "node_feat.npy" @@ -489,6 +496,7 @@ def add_nodes(nodes, source, dest, dataset_size): process_feat( file_path=new_node_path, node_name=node, dataset_size=dataset_size ) + # If the node is a paper type, process the labels if node == "paper": print(f"\t Processing {node} labels...") old_label_path_19 = source + "/" + node + "/" + "node_label_19.npy" @@ -515,7 +523,7 @@ def add_nodes(nodes, source, dest, dataset_size): def process_dataset(path, dataset_size): print(f"Starting to process the {dataset_size} dataset...") - # Make the directory for processed dataset + # Step 0: Make the directory for processed dataset processed_dir = path + "-seeds" os.makedirs(name=processed_dir, exist_ok=True) original_path = path + "/" + "processed" @@ -561,7 +569,11 @@ def process_dataset(path, dataset_size): # Step 4: Build the yaml file print("Building yaml file...") - build_yaml(original_path=path, current_path=processed_dir) + build_yaml( + original_path=path, + current_path=processed_dir, + dataset_size=dataset_size, + ) # shutil.rmtree(path) print(f"Finished processing the {dataset_size} dataset") From 42717af24f64513f9932bddab489cba564813ada Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:01:15 -0700 Subject: [PATCH 06/35] Update examples/graphbolt/rgcn/download.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/rgcn/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index 9f3b51ceb6b7..a4067b9921ce 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -396,7 +396,7 @@ def split_data(label_path, set_dir, dataset_size): labels = np.load(label_path) total_samples = len(labels) - train_end = int(0.8 * total_samples) + train_end = int(0.6 * total_samples) validation_end = int(0.9 * total_samples) indices = np.arange(total_samples) From 06f3f293027a3932c61ddca0368fe82c874083fe Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:01:22 -0700 Subject: [PATCH 07/35] Update examples/graphbolt/rgcn/download.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/rgcn/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index a4067b9921ce..ce50e82bf12e 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -6,7 +6,7 @@ import numpy as np from tqdm import tqdm -GBFACTOR = float(1 << 30) +GBFACTOR = 1 << 30 def build_yaml_helper(path, dataset_size, in_memory=True): From 19f18a581a9584c248348d779270e026cb4302b7 Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:01:41 -0700 Subject: [PATCH 08/35] Update examples/graphbolt/rgcn/download.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/rgcn/download.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index ce50e82bf12e..7ec369bbfe99 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -480,6 +480,7 @@ def 
process_label(file_path, num_class, dataset_size): assert new_array.shape[0] == 227130858 assert np.array_equal(array, new_array) else: + assert num_class == 19 # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) new_array = np.load(file_path) assert new_array.shape[0] == num_nodes[dataset_size]["paper"] From 97c17352f978ef6b2adb1792528dcd85f1dab9d0 Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:02:06 -0700 Subject: [PATCH 09/35] Update examples/graphbolt/rgcn/download.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/rgcn/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index 7ec369bbfe99..e3513019b414 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -397,7 +397,7 @@ def split_data(label_path, set_dir, dataset_size): total_samples = len(labels) train_end = int(0.6 * total_samples) - validation_end = int(0.9 * total_samples) + validation_end = int(0.8 * total_samples) indices = np.arange(total_samples) train_indices = indices[:train_end] From 93cb70f18139ff70060f692776294a2836a430ec Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 22:35:24 +0000 Subject: [PATCH 10/35] added 2983 class task --- examples/graphbolt/rgcn/download.py | 113 +++++++++++++++++++++++----- 1 file changed, 96 insertions(+), 17 deletions(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index e3513019b414..787060c33819 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -56,13 +56,13 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "data": [ { "in_memory": in_memory, - "path": "set/validation_indices.npy", + "path": "set/validation_indices_19.npy", "name": "seeds", "format": "numpy", }, { "in_memory": in_memory, - "path": "set/validation_labels.npy", + "path": "set/validation_labels_19.npy", "name": "labels", "format": "numpy", }, @@ -70,19 +70,19 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "type": "paper", } ], - "name": "node_classification", + "name": "node_classification_19", "train_set": [ { "data": [ { "in_memory": in_memory, - "path": "set/train_indices.npy", + "path": "set/train_indices_19.npy", "name": "seeds", "format": "numpy", }, { "in_memory": in_memory, - "path": "set/train_labels.npy", + "path": "set/train_labels_19.npy", "name": "labels", "format": "numpy", }, @@ -95,13 +95,13 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "data": [ { "in_memory": in_memory, - "path": "set/test_indices.npy", + "path": "set/test_indices_19.npy", "name": "seeds", "format": "numpy", }, { "in_memory": in_memory, - "path": "set/test_labels.npy", + "path": "set/test_labels_19.npy", "name": "labels", "format": "numpy", }, @@ -109,7 +109,68 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "type": "paper", } ], - } + }, + { + "num_classes": 2983, + "validation_set": [ + { + "data": [ + { + "in_memory": in_memory, + "path": "set/validation_indices_2983.npy", + "name": "seeds", + "format": "numpy", + }, + { + "in_memory": in_memory, + "path": "set/validation_labels_2983.npy", + "name": "labels", + "format": "numpy", + }, + ], + "type": "paper", + } + ], + "name": "node_classification_2K", + "train_set": [ + { + "data": [ + { + "in_memory": in_memory, + "path": "set/train_indices_2983.npy", + "name": 
"seeds", + "format": "numpy", + }, + { + "in_memory": in_memory, + "path": "set/train_labels_2983.npy", + "name": "labels", + "format": "numpy", + }, + ], + "type": "paper", + } + ], + "test_set": [ + { + "data": [ + { + "in_memory": in_memory, + "path": "set/test_indices_2983.npy", + "name": "seeds", + "format": "numpy", + }, + { + "in_memory": in_memory, + "path": "set/test_labels_2983.npy", + "name": "labels", + "format": "numpy", + }, + ], + "type": "paper", + } + ], + }, ], "feature_data": [ { @@ -390,7 +451,7 @@ def download_dataset(path, dataset_type, dataset_size): } -def split_data(label_path, set_dir, dataset_size): +def split_data(label_path, set_dir, dataset_size, class_num): """This is for splitting the labels into three sets: train, validation, and test sets.""" # labels = np.memmap(label_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) labels = np.load(label_path) @@ -415,14 +476,24 @@ def split_data(label_path, set_dir, dataset_size): print(validation_labels, len(validation_labels)) print(test_labels, len(test_labels)) - gb.numpy_save_aligned(f"{set_dir}/train_indices.npy", train_indices) gb.numpy_save_aligned( - f"{set_dir}/validation_indices.npy", validation_indices + f"{set_dir}/train_indices_{class_num}.npy", train_indices + ) + gb.numpy_save_aligned( + f"{set_dir}/validation_indices_{class_num}.npy", validation_indices + ) + gb.numpy_save_aligned( + f"{set_dir}/test_indices_{class_num}.npy", test_indices + ) + gb.numpy_save_aligned( + f"{set_dir}/train_labels_{class_num}.npy", train_labels + ) + gb.numpy_save_aligned( + f"{set_dir}/validation_labels_{class_num}.npy", validation_labels + ) + gb.numpy_save_aligned( + f"{set_dir}/test_labels_{class_num}.npy", test_labels ) - gb.numpy_save_aligned(f"{set_dir}/test_indices.npy", test_indices) - gb.numpy_save_aligned(f"{set_dir}/train_labels.npy", train_labels) - gb.numpy_save_aligned(f"{set_dir}/validation_labels.npy", validation_labels) - gb.numpy_save_aligned(f"{set_dir}/test_labels.npy", test_labels) def add_edges(edges, source, dest, dataset_size): @@ -480,7 +551,6 @@ def process_label(file_path, num_class, dataset_size): assert new_array.shape[0] == 227130858 assert np.array_equal(array, new_array) else: - assert num_class == 19 # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) new_array = np.load(file_path) assert new_array.shape[0] == num_nodes[dataset_size]["paper"] @@ -547,7 +617,16 @@ def process_dataset(path, dataset_size): set_dir = processed_dir + "/" + "set" os.makedirs(name=set_dir, exist_ok=True) split_data( - label_path=label_file_19, set_dir=set_dir, dataset_size=dataset_size + label_path=label_file_19, + set_dir=set_dir, + dataset_size=dataset_size, + class_num=19, + ) + split_data( + label_path=label_file_2K, + set_dir=set_dir, + dataset_size=dataset_size, + class_num=2983, ) # Step 3: Move edge files From b170a903ddd5889aa2addfe215fe953777cef023 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 22:39:01 +0000 Subject: [PATCH 11/35] fix lint --- examples/graphbolt/rgcn/download.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index 787060c33819..1304fcfac572 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -491,9 +491,7 @@ def split_data(label_path, set_dir, dataset_size, class_num): gb.numpy_save_aligned( f"{set_dir}/validation_labels_{class_num}.npy", validation_labels ) 
- gb.numpy_save_aligned( - f"{set_dir}/test_labels_{class_num}.npy", test_labels - ) + gb.numpy_save_aligned(f"{set_dir}/test_labels_{class_num}.npy", test_labels) def add_edges(edges, source, dest, dataset_size): From 55079b8507c9d1bc9019510970b36369027f7d6c Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:41:38 -0700 Subject: [PATCH 12/35] Update examples/graphbolt/rgcn/download.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/rgcn/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index 1304fcfac572..dc37aef60c73 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -70,7 +70,7 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "type": "paper", } ], - "name": "node_classification_19", + "name": "node_classification", "train_set": [ { "data": [ From ce65746cad5d36ec03a07b328f06e2c0987a72fd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Aug 2024 03:25:04 +0000 Subject: [PATCH 13/35] remove labels from yaml --- examples/graphbolt/rgcn/download.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index dc37aef60c73..279e60126233 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -181,14 +181,6 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "path": "data/paper_feat.npy", "type": "paper", }, - { - "domain": "node", - "name": "label", - "format": "numpy", - "in_memory": in_memory, - "path": "data/paper_label_19.npy", - "type": "paper", - }, { "domain": "node", "name": "feat", From 33229878c8ce918988ca29016c17a1ca88530cb6 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 17 Aug 2024 07:33:44 +0000 Subject: [PATCH 14/35] add doenload script --- examples/graphbolt/rgcn/download_full_igbh.sh | 98 +++++++++++++++++++ .../graphbolt/rgcn/download_large_igbh.sh | 95 ++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100755 examples/graphbolt/rgcn/download_full_igbh.sh create mode 100755 examples/graphbolt/rgcn/download_large_igbh.sh diff --git a/examples/graphbolt/rgcn/download_full_igbh.sh b/examples/graphbolt/rgcn/download_full_igbh.sh new file mode 100755 index 000000000000..1efcdb3cb0b9 --- /dev/null +++ b/examples/graphbolt/rgcn/download_full_igbh.sh @@ -0,0 +1,98 @@ +#! /bin/bash + +mkdir -p igb-dataset-full +cd igb-dataset-full +mkdir -p processed +cd processed + +echo "IGBH600M (Heteregeneous) download starting" + +# paper +mkdir -p paper +cd paper +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/node_feat.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/node_label_19.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/node_label_2K.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/paper_id_index_mapping.npy +cd .. + +# paper__cites__paper +mkdir -p paper__cites__paper +cd paper__cites__paper +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__cites__paper/edge_index.npy +cd .. + +# author +mkdir -p author +cd author +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/author/author_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/author/node_feat.npy +cd .. 
+ +# conference +mkdir -p conference +cd conference +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/conference/conference_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/conference/node_feat.npy +cd .. + +# institute +mkdir -p institute +cd institute +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/institute/institute_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/institute/node_feat.npy +cd .. + +# journal +mkdir -p journal +cd journal +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/journal/journal_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/journal/node_feat.npy +cd .. + +# fos +mkdir -p fos +cd fos +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/fos/fos_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/fos/node_feat.npy +cd .. + +# author__affiliated_to__institute +mkdir -p author__affiliated_to__institute +cd author__affiliated_to__institute +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/author__affiliated_to__institute/edge_index.npy +cd .. + +# paper__published__journal +mkdir -p paper__published__journal +cd paper__published__journal +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__published__journal/edge_index.npy +cd .. + +# paper__topic__fos +mkdir -p paper__topic__fos +cd paper__topic__fos +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__topic__fos/edge_index.npy +cd .. + +# paper__venue__conference +mkdir -p paper__venue__conference +cd paper__venue__conference +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__venue__conference/edge_index.npy +cd .. + +# paper__written_by__author +mkdir -p paper__written_by__author +cd paper__written_by__author +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__written_by__author/edge_index.npy +cd .. + +cd ../.. + +echo "IGBH-IGBH (Heteregeneous) download complete" + + +num_paper_nodes = 269346174 +paper_node_features = np.memmap('/home/ubuntu/dgl/examples/graphbolt/rgcn/igb_dataset/igb_dataset_full/processed/paper/node_label_19.npy', dtype='float32', mode='r', shape=(num_paper_nodes,1)) +num_paper_nodes = 48521486 +paper_node_features = np.memmap('/home/ubuntu/dgl/examples/graphbolt/rgcn/datasets/igb-dataset-full-seeds/edges/author__affiliated_to__institute.npy', dtype='int32', mode='r', shape=(num_paper_nodes,2)) diff --git a/examples/graphbolt/rgcn/download_large_igbh.sh b/examples/graphbolt/rgcn/download_large_igbh.sh new file mode 100755 index 000000000000..4c36a7f8d0db --- /dev/null +++ b/examples/graphbolt/rgcn/download_large_igbh.sh @@ -0,0 +1,95 @@ +#! /bin/bash + +mkdir -p igb-heterogeneous-large/ +cd igb-heterogeneous-large/ +mkdir -p processed +cd processed + +echo "IGBH-large (Heterogeneous) download starting" + +# paper +mkdir -p paper +cd paper +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/node_feat.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/node_label_19.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/node_label_2K.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/paper_id_index_mapping.npy +cd .. 
+ +# # paper__cites__paper +# wget --recursive --no-parent https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__cites__paper/edge_index.npy + +# paper__cites__paper +mkdir -p paper__cites__paper +cd paper__cites__paper +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__cites__paper/edge_index.npy +cd .. + +# author +mkdir -p author +cd author +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/author/author_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/author/node_feat.npy +cd .. + +# conference +mkdir -p conference +cd conference +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/conference/conference_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/conference/node_feat.npy +cd .. + +# institute +mkdir -p institute +cd institute +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/institute/institute_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/institute/node_feat.npy +cd .. + +# journal +mkdir -p journal +cd journal +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/journal/journal_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/journal/node_feat.npy +cd .. + +# fos +mkdir -p fos +cd fos +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/fos/fos_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/fos/node_feat.npy +cd .. + +# author__affiliated_to__institute +mkdir -p author__affiliated_to__institute +cd author__affiliated_to__institute +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/author__affiliated_to__institute/edge_index.npy +cd .. + +# paper__published__journal +mkdir -p paper__published__journal +cd paper__published__journal +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__published__journal/edge_index.npy +cd .. + +# paper__topic__fos +mkdir -p paper__topic__fos +cd paper__topic__fos +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__topic__fos/edge_index.npy +cd .. + +# paper__venue__conference +mkdir -p paper__venue__conference +cd paper__venue__conference +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__venue__conference/edge_index.npy +cd .. + +# paper__written_by__author +mkdir -p paper__written_by__author +cd paper__written_by__author +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__written_by__author/edge_index.npy +cd .. + +cd ../.. 
+ +echo "IGBH-large (Heterogeneous) download complete" \ No newline at end of file From 0135b4b5af3c0219c2b6e3985d6549c900767ad3 Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Sat, 17 Aug 2024 08:24:57 +0000 Subject: [PATCH 15/35] corrected path in processing file --- examples/graphbolt/rgcn/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index 279e60126233..475f84592db3 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -573,7 +573,7 @@ def add_nodes(nodes, source, dest, dataset_size): new_label_path_2K = dest + "/" + "paper_label_2K.npy" os.rename(src=old_label_path_2K, dst=new_label_path_2K) process_label( - file_path=new_label_path_19, + file_path=new_label_path_2K, num_class=2983, dataset_size=dataset_size, ) From 7a35313cdd7f0f2f656e6e67f5ed00b0472ba5f2 Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Sat, 17 Aug 2024 09:28:18 +0000 Subject: [PATCH 16/35] modify yaml file builder --- examples/graphbolt/rgcn/download.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index 475f84592db3..249f1dd176f6 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -51,6 +51,7 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "tasks": [ { "num_classes": 19, + "name": "node_classification", "validation_set": [ { "data": [ @@ -70,7 +71,6 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "type": "paper", } ], - "name": "node_classification", "train_set": [ { "data": [ @@ -112,6 +112,7 @@ def build_yaml_helper(path, dataset_size, in_memory=True): }, { "num_classes": 2983, + "name": "node_classification_2K", "validation_set": [ { "data": [ @@ -131,7 +132,6 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "type": "paper", } ], - "name": "node_classification_2K", "train_set": [ { "data": [ @@ -205,14 +205,6 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "path": "data/fos_feat.npy", "type": "fos", }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/author_feat.npy", - "type": "author", - }, ], "dataset_name": os.path.basename(path), } From 422ccbee4f4dcdb87af417c24e3beeaeff018d04 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 5 Sep 2024 06:32:11 +0000 Subject: [PATCH 17/35] add igb-het-[tiny|small] --- examples/graphbolt/rgcn/download.py | 670 ------------------ examples/graphbolt/rgcn/download_full_igbh.sh | 98 --- .../graphbolt/rgcn/download_large_igbh.sh | 95 --- examples/graphbolt/rgcn/hetero_rgcn.py | 19 +- python/dgl/graphbolt/impl/ondisk_dataset.py | 16 + 5 files changed, 25 insertions(+), 873 deletions(-) delete mode 100755 examples/graphbolt/rgcn/download.py delete mode 100755 examples/graphbolt/rgcn/download_full_igbh.sh delete mode 100755 examples/graphbolt/rgcn/download_large_igbh.sh diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py deleted file mode 100755 index 249f1dd176f6..000000000000 --- a/examples/graphbolt/rgcn/download.py +++ /dev/null @@ -1,670 +0,0 @@ -import argparse, hashlib, os, shutil, tarfile, yaml -import subprocess -import urllib.request as ur - -import dgl.graphbolt as gb -import numpy as np -from tqdm import tqdm - -GBFACTOR = 1 << 30 - - -def build_yaml_helper(path, dataset_size, in_memory=True): - """The stirng to build the yaml file. 
(Still need modification)""" - data = { - "graph": { - "nodes": [ - {"num": num_nodes[dataset_size]["paper"], "type": "paper"}, - {"num": num_nodes[dataset_size]["author"], "type": "author"}, - { - "num": num_nodes[dataset_size]["institute"], - "type": "institution", - }, - { - "num": num_nodes[dataset_size]["fos"], - "type": "field_of_study", - }, - ], - "edges": [ - { - "path": "edges/author__affiliated_to__institute.npy", - "type": "author:affiliated_to:institution", - "format": "numpy", - }, - { - "path": "edges/paper__written_by__author.npy", - "type": "paper:written_by:author", - "format": "numpy", - }, - { - "path": "edges/paper__cites__paper.npy", - "type": "paper:cites:paper", - "format": "numpy", - }, - { - "path": "edges/paper__topic__fos.npy", - "type": "paper:has_topic:field_of_study", - "format": "numpy", - }, - ], - }, - "tasks": [ - { - "num_classes": 19, - "name": "node_classification", - "validation_set": [ - { - "data": [ - { - "in_memory": in_memory, - "path": "set/validation_indices_19.npy", - "name": "seeds", - "format": "numpy", - }, - { - "in_memory": in_memory, - "path": "set/validation_labels_19.npy", - "name": "labels", - "format": "numpy", - }, - ], - "type": "paper", - } - ], - "train_set": [ - { - "data": [ - { - "in_memory": in_memory, - "path": "set/train_indices_19.npy", - "name": "seeds", - "format": "numpy", - }, - { - "in_memory": in_memory, - "path": "set/train_labels_19.npy", - "name": "labels", - "format": "numpy", - }, - ], - "type": "paper", - } - ], - "test_set": [ - { - "data": [ - { - "in_memory": in_memory, - "path": "set/test_indices_19.npy", - "name": "seeds", - "format": "numpy", - }, - { - "in_memory": in_memory, - "path": "set/test_labels_19.npy", - "name": "labels", - "format": "numpy", - }, - ], - "type": "paper", - } - ], - }, - { - "num_classes": 2983, - "name": "node_classification_2K", - "validation_set": [ - { - "data": [ - { - "in_memory": in_memory, - "path": "set/validation_indices_2983.npy", - "name": "seeds", - "format": "numpy", - }, - { - "in_memory": in_memory, - "path": "set/validation_labels_2983.npy", - "name": "labels", - "format": "numpy", - }, - ], - "type": "paper", - } - ], - "train_set": [ - { - "data": [ - { - "in_memory": in_memory, - "path": "set/train_indices_2983.npy", - "name": "seeds", - "format": "numpy", - }, - { - "in_memory": in_memory, - "path": "set/train_labels_2983.npy", - "name": "labels", - "format": "numpy", - }, - ], - "type": "paper", - } - ], - "test_set": [ - { - "data": [ - { - "in_memory": in_memory, - "path": "set/test_indices_2983.npy", - "name": "seeds", - "format": "numpy", - }, - { - "in_memory": in_memory, - "path": "set/test_labels_2983.npy", - "name": "labels", - "format": "numpy", - }, - ], - "type": "paper", - } - ], - }, - ], - "feature_data": [ - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/paper_feat.npy", - "type": "paper", - }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/author_feat.npy", - "type": "author", - }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/institute_feat.npy", - "type": "institute", - }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/fos_feat.npy", - "type": "fos", - }, - ], - "dataset_name": os.path.basename(path), - } - - return data - - -def build_yaml(original_path, current_path, dataset_size): - """This build the yaml file 
differently based on the dataset size. - The two large datasets are put in disk while the other three smaller versions are in memory. - """ - if "large" == dataset_size or "full" == dataset_size: - data = build_yaml_helper( - path=original_path, dataset_size=dataset_size, in_memory=False - ) - else: - data = build_yaml_helper(path=original_path, dataset_size=dataset_size) - with open(f"{current_path}/metadata.yaml", "w") as file: - yaml.dump(data=data, stream=file, default_flow_style=False) - - -dataset_urls = { - "homogeneous": { - "tiny": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_tiny.tar.gz", - "small": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_small.tar.gz", - "medium": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_medium.tar.gz", - }, - "heterogeneous": { - "tiny": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_tiny.tar.gz", - "small": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_small.tar.gz", - "medium": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_medium.tar.gz", - }, -} - - -md5checksums = { - "homogeneous": { - "tiny": "34856534da55419b316d620e2d5b21be", - "small": "6781c699723529902ace0a95cafe6fe4", - "medium": "4640df4ceee46851fd18c0a44ddcc622", - }, - "heterogeneous": { - "tiny": "83fbc1091497ff92cf20afe82fae0ade", - "small": "2f42077be60a074aec24f7c60089e1bd", - "medium": "7f0df4296eca36553ff3a6a63abbd347", - }, -} - - -def decide_download(url): - """An interactive command line to confirm download.""" - d = ur.urlopen(url) - size = int(d.info()["Content-Length"]) / GBFACTOR - ### confirm if larger than 1GB - if size > 1: - return ( - input( - "This will download %.2fGB. Will you proceed? (y/N) " % (size) - ).lower() - == "y" - ) - else: - return True - - -def check_md5sum(dataset_type, dataset_size, filename): - """This is for checking the data correctness of the downloaded datasets.""" - original_md5 = md5checksums[dataset_type][dataset_size] - - with open(filename, "rb") as file_to_check: - data = file_to_check.read() - md5_returned = hashlib.md5(data).hexdigest() - - if original_md5 == md5_returned: - print(" md5sum verified.") - return - else: - os.remove(filename) - raise Exception(" md5sum verification failed!.") - - -def download_dataset(path, dataset_type, dataset_size): - """This is the script to download all the related datasets.""" - - # For large datasets, use the two shell scripts to download. - if dataset_size in ["large", "full"]: - command = f"./download_{dataset_size}_igbh.sh" - subprocess.run(["bash", command], check=True, text=True) - shutil.move(src=f"igb-{dataset_type}-{dataset_size}", dst=f"{path}") - return path + "/" + "igb-" + dataset_type + "-" + dataset_size - # For the three smaller version, use the url to download. 
- else: - output_directory = path - if not os.path.exists( - output_directory - + "igb_" - + dataset_type - + "_" - + dataset_size - + ".tar.gz" - ): - url = dataset_urls[dataset_type][dataset_size] - if decide_download(url): - data = ur.urlopen(url) - size = int(data.info()["Content-Length"]) - chunk_size = 1024 * 1024 - num_iter = int(size / chunk_size) + 2 - downloaded_size = 0 - filename = ( - path - + "/igb_" - + dataset_type - + "_" - + dataset_size - + ".tar.gz" - ) - with open(filename, "wb") as f: - pbar = tqdm(range(num_iter)) - for i in pbar: - chunk = data.read(chunk_size) - downloaded_size += len(chunk) - pbar.set_description( - "Downloaded {:.2f} GB".format( - float(downloaded_size) / GBFACTOR - ) - ) - f.write(chunk) - print( - "Downloaded" + " igb_" + dataset_type + "_" + dataset_size, - end=" ->", - ) - check_md5sum(dataset_type, dataset_size, filename) - else: # No need to download the tar file again if it is already downloaded. - print( - "The file igb_" - + dataset_type - + "_" - + dataset_size - + ".tar.gz already exists, directly extracting..." - ) - filename = ( - path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" - ) - # Extract the tar file - file = tarfile.open(filename) - file.extractall(output_directory) - file.close() - size = 0 - for path, dirs, files in os.walk(output_directory + "/" + dataset_size): - for f in files: - fp = os.path.join(path, f) - size += os.path.getsize(fp) - print("Final dataset size {:.2f} GB.".format(size / GBFACTOR)) - # os.remove(filename) - os.rename( - output_directory + "/" + dataset_size, - output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size, - ) - return ( - output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size - ) - - -num_nodes = { - "full": { - "paper": 269346174, - "author": 277220883, - "institute": 26918, - "fos": 712960, - }, - "large": { - "paper": 100000000, - "author": 116959896, - "institute": 26524, - "fos": 649707, - }, - "medium": { - "paper": 10000000, - "author": 15544654, - "institute": 23256, - "fos": 415054, - }, - "small": { - "paper": 1000000, - "author": 1926066, - "institute": 14751, - "fos": 190449, - }, - "tiny": { - "paper": 100000, - "author": 357041, - "institute": 8738, - "fos": 84220, - }, -} - -num_edges = { - "full": { - "paper__cites__paper": 3996442004, - "paper__written_by__author": 716761549, - "paper__topic__fos": 1050280600, - "author__affiliated_to__institute": 48521486, - }, - "large": { - "paper__cites__paper": 1223571364, - "paper__written_by__author": 289502107, - "paper__topic__fos": 457577294, - "author__affiliated_to__institute": 34099660, - }, - "medium": { - "paper__cites__paper": 120077694, - "paper__written_by__author": 39854592, - "paper__topic__fos": 68510495, - "author__affiliated_to__institute": 11049412, - }, - "small": { - "paper__cites__paper": 12070502, - "paper__written_by__author": 4553516, - "paper__topic__fos": 7234122, - "author__affiliated_to__institute": 1630476, - }, - "tiny": { - "paper__cites__paper": 447416, - "paper__written_by__author": 471443, - "paper__topic__fos": 718445, - "author__affiliated_to__institute": 325410, - }, -} - - -def split_data(label_path, set_dir, dataset_size, class_num): - """This is for splitting the labels into three sets: train, validation, and test sets.""" - # labels = np.memmap(label_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) - labels = np.load(label_path) - - total_samples = len(labels) - train_end = int(0.6 * total_samples) - validation_end = int(0.8 * 
total_samples) - - indices = np.arange(total_samples) - train_indices = indices[:train_end] - validation_indices = indices[train_end:validation_end] - test_indices = indices[validation_end:] - print(indices) - print(train_indices) - print(validation_indices) - print(test_indices) - - train_labels = labels[:train_end] - validation_labels = labels[train_end:validation_end] - test_labels = labels[validation_end:] - print(train_labels, len(train_labels)) - print(validation_labels, len(validation_labels)) - print(test_labels, len(test_labels)) - - gb.numpy_save_aligned( - f"{set_dir}/train_indices_{class_num}.npy", train_indices - ) - gb.numpy_save_aligned( - f"{set_dir}/validation_indices_{class_num}.npy", validation_indices - ) - gb.numpy_save_aligned( - f"{set_dir}/test_indices_{class_num}.npy", test_indices - ) - gb.numpy_save_aligned( - f"{set_dir}/train_labels_{class_num}.npy", train_labels - ) - gb.numpy_save_aligned( - f"{set_dir}/validation_labels_{class_num}.npy", validation_labels - ) - gb.numpy_save_aligned(f"{set_dir}/test_labels_{class_num}.npy", test_labels) - - -def add_edges(edges, source, dest, dataset_size): - """This is for processing the edges in the graph and convert them to correct shape.""" - for edge in edges: - print(f"\t Processing {edge} edge...") - - old_edge_path = source + "/" + edge + "/" + "edge_index.npy" - new_edge_path = dest + "/" + edge + ".npy" - os.rename(src=old_edge_path, dst=new_edge_path) - - # edge_array = np.memmap(new_edge_path, dtype='int32', mode='r', shape=(num_edges[dataset_size][edge], 2)) - edge_array = np.load(new_edge_path) - new_edge_array = edge_array.transpose() - - assert new_edge_array.shape == (2, num_edges[dataset_size][edge]) - assert np.array_equal(edge_array, new_edge_array.transpose()) - - gb.numpy_save_aligned(new_edge_path, new_edge_array) - - -def process_feat(file_path, node_name, dataset_size): - """This is for processing the node features.""" - # array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) - array = np.load(file_path) - assert array.shape == (num_nodes[dataset_size][node_name], 1024) - gb.numpy_save_aligned(file_path, array) - - # Assert the shape and elements of the array are correct - # new_array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) - new_array = np.load(file_path) - assert new_array.shape == (num_nodes[dataset_size][node_name], 1024) - assert np.array_equal(array, new_array) - - -def process_label(file_path, num_class, dataset_size): - """This is for processing the node labels.""" - if ( - num_class == 2983 and dataset_size == "full" - ): # only this case label number changes - # array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) - array = np.load(file_path) - assert array.shape[0] == 227130858 - else: - # array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) - array = np.load(file_path) - assert array.shape[0] == num_nodes[dataset_size]["paper"] - - gb.numpy_save_aligned(file_path, array) - - # Assert the shape and elements of the array are correct - if num_class == 2983 and dataset_size == "full": - # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) - new_array = np.load(file_path) - assert new_array.shape[0] == 227130858 - assert np.array_equal(array, new_array) - else: - # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) - new_array = 
np.load(file_path) - assert new_array.shape[0] == num_nodes[dataset_size]["paper"] - assert np.array_equal(array, new_array) - - -def add_nodes(nodes, source, dest, dataset_size): - """This is for processing the nodes in the graph and store them in correct format.""" - for node in nodes: - print(f"\t Processing {node} node feature...") - old_node_path = source + "/" + node + "/" + "node_feat.npy" - new_node_path = dest + "/" + node + "_feat.npy" - os.rename(src=old_node_path, dst=new_node_path) - process_feat( - file_path=new_node_path, node_name=node, dataset_size=dataset_size - ) - # If the node is a paper type, process the labels - if node == "paper": - print(f"\t Processing {node} labels...") - old_label_path_19 = source + "/" + node + "/" + "node_label_19.npy" - new_label_path_19 = dest + "/" + "paper_label_19.npy" - os.rename(src=old_label_path_19, dst=new_label_path_19) - process_label( - file_path=new_label_path_19, - num_class=19, - dataset_size=dataset_size, - ) - - old_label_path_2K = source + "/" + node + "/" + "node_label_2K.npy" - new_label_path_2K = dest + "/" + "paper_label_2K.npy" - os.rename(src=old_label_path_2K, dst=new_label_path_2K) - process_label( - file_path=new_label_path_2K, - num_class=2983, - dataset_size=dataset_size, - ) - - return new_label_path_19, new_label_path_2K - - -def process_dataset(path, dataset_size): - print(f"Starting to process the {dataset_size} dataset...") - - # Step 0: Make the directory for processed dataset - processed_dir = path + "-seeds" - os.makedirs(name=processed_dir, exist_ok=True) - original_path = path + "/" + "processed" - - # Step 1: Move Nodes files - print("Processing Node files...") - node_dir = processed_dir + "/" + "data" - os.makedirs(name=node_dir, exist_ok=True) - # These are the four nodes in this citation network - nodes = ["paper", "author", "fos", "institute"] - label_file_19, label_file_2K = add_nodes( - nodes=nodes, - source=original_path, - dest=node_dir, - dataset_size=dataset_size, - ) - - # Step 2: Create labels - print("Processing train/valid/test files...") - set_dir = processed_dir + "/" + "set" - os.makedirs(name=set_dir, exist_ok=True) - split_data( - label_path=label_file_19, - set_dir=set_dir, - dataset_size=dataset_size, - class_num=19, - ) - split_data( - label_path=label_file_2K, - set_dir=set_dir, - dataset_size=dataset_size, - class_num=2983, - ) - - # Step 3: Move edge files - print("Processing Edge files...") - edge_dir = processed_dir + "/" + "edges" - os.makedirs(name=edge_dir, exist_ok=True) - # These are the four edges in this citation network - edges = [ - "paper__cites__paper", - "paper__written_by__author", - "paper__topic__fos", - "author__affiliated_to__institute", - ] - add_edges( - edges=edges, - source=original_path, - dest=edge_dir, - dataset_size=dataset_size, - ) - - # Step 4: Build the yaml file - print("Building yaml file...") - build_yaml( - original_path=path, - current_path=processed_dir, - dataset_size=dataset_size, - ) - - # shutil.rmtree(path) - print(f"Finished processing the {dataset_size} dataset") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--path", - type=str, - default="datasets/", - help="path to store the datasets", - ) - parser.add_argument( - "--type", - type=str, - default="heterogeneous", - choices=["homogeneous", "heterogeneous"], - help="dataset type", - ) - parser.add_argument( - "--size", - type=str, - default="tiny", - choices=["tiny", "small", "medium", "large", "full"], - help="size of the datasets", - ) 
- args = parser.parse_args() - path = download_dataset( - path=args.path, dataset_type=args.type, dataset_size=args.size - ) - process_dataset(path=path, dataset_size=args.size) diff --git a/examples/graphbolt/rgcn/download_full_igbh.sh b/examples/graphbolt/rgcn/download_full_igbh.sh deleted file mode 100755 index 1efcdb3cb0b9..000000000000 --- a/examples/graphbolt/rgcn/download_full_igbh.sh +++ /dev/null @@ -1,98 +0,0 @@ -#! /bin/bash - -mkdir -p igb-dataset-full -cd igb-dataset-full -mkdir -p processed -cd processed - -echo "IGBH600M (Heteregeneous) download starting" - -# paper -mkdir -p paper -cd paper -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/node_feat.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/node_label_19.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/node_label_2K.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/paper_id_index_mapping.npy -cd .. - -# paper__cites__paper -mkdir -p paper__cites__paper -cd paper__cites__paper -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__cites__paper/edge_index.npy -cd .. - -# author -mkdir -p author -cd author -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/author/author_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/author/node_feat.npy -cd .. - -# conference -mkdir -p conference -cd conference -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/conference/conference_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/conference/node_feat.npy -cd .. - -# institute -mkdir -p institute -cd institute -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/institute/institute_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/institute/node_feat.npy -cd .. - -# journal -mkdir -p journal -cd journal -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/journal/journal_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/journal/node_feat.npy -cd .. - -# fos -mkdir -p fos -cd fos -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/fos/fos_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/fos/node_feat.npy -cd .. - -# author__affiliated_to__institute -mkdir -p author__affiliated_to__institute -cd author__affiliated_to__institute -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/author__affiliated_to__institute/edge_index.npy -cd .. - -# paper__published__journal -mkdir -p paper__published__journal -cd paper__published__journal -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__published__journal/edge_index.npy -cd .. - -# paper__topic__fos -mkdir -p paper__topic__fos -cd paper__topic__fos -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__topic__fos/edge_index.npy -cd .. - -# paper__venue__conference -mkdir -p paper__venue__conference -cd paper__venue__conference -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__venue__conference/edge_index.npy -cd .. - -# paper__written_by__author -mkdir -p paper__written_by__author -cd paper__written_by__author -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__written_by__author/edge_index.npy -cd .. - -cd ../.. 
- -echo "IGBH-IGBH (Heteregeneous) download complete" - - -num_paper_nodes = 269346174 -paper_node_features = np.memmap('/home/ubuntu/dgl/examples/graphbolt/rgcn/igb_dataset/igb_dataset_full/processed/paper/node_label_19.npy', dtype='float32', mode='r', shape=(num_paper_nodes,1)) -num_paper_nodes = 48521486 -paper_node_features = np.memmap('/home/ubuntu/dgl/examples/graphbolt/rgcn/datasets/igb-dataset-full-seeds/edges/author__affiliated_to__institute.npy', dtype='int32', mode='r', shape=(num_paper_nodes,2)) diff --git a/examples/graphbolt/rgcn/download_large_igbh.sh b/examples/graphbolt/rgcn/download_large_igbh.sh deleted file mode 100755 index 4c36a7f8d0db..000000000000 --- a/examples/graphbolt/rgcn/download_large_igbh.sh +++ /dev/null @@ -1,95 +0,0 @@ -#! /bin/bash - -mkdir -p igb-heterogeneous-large/ -cd igb-heterogeneous-large/ -mkdir -p processed -cd processed - -echo "IGBH-large (Heterogeneous) download starting" - -# paper -mkdir -p paper -cd paper -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/node_feat.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/node_label_19.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/node_label_2K.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/paper_id_index_mapping.npy -cd .. - -# # paper__cites__paper -# wget --recursive --no-parent https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__cites__paper/edge_index.npy - -# paper__cites__paper -mkdir -p paper__cites__paper -cd paper__cites__paper -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__cites__paper/edge_index.npy -cd .. - -# author -mkdir -p author -cd author -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/author/author_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/author/node_feat.npy -cd .. - -# conference -mkdir -p conference -cd conference -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/conference/conference_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/conference/node_feat.npy -cd .. - -# institute -mkdir -p institute -cd institute -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/institute/institute_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/institute/node_feat.npy -cd .. - -# journal -mkdir -p journal -cd journal -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/journal/journal_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/journal/node_feat.npy -cd .. - -# fos -mkdir -p fos -cd fos -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/fos/fos_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/fos/node_feat.npy -cd .. - -# author__affiliated_to__institute -mkdir -p author__affiliated_to__institute -cd author__affiliated_to__institute -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/author__affiliated_to__institute/edge_index.npy -cd .. - -# paper__published__journal -mkdir -p paper__published__journal -cd paper__published__journal -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__published__journal/edge_index.npy -cd .. 
- -# paper__topic__fos -mkdir -p paper__topic__fos -cd paper__topic__fos -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__topic__fos/edge_index.npy -cd .. - -# paper__venue__conference -mkdir -p paper__venue__conference -cd paper__venue__conference -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__venue__conference/edge_index.npy -cd .. - -# paper__written_by__author -mkdir -p paper__written_by__author -cd paper__written_by__author -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__written_by__author/edge_index.npy -cd .. - -cd ../.. - -echo "IGBH-large (Heterogeneous) download complete" \ No newline at end of file diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index 7516e16a857a..4051425a129d 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -142,7 +142,7 @@ def create_dataloader( if name == "ogb-lsc-mag240m": node_feature_keys["author"] = ["feat"] node_feature_keys["institution"] = ["feat"] - if "igb-heterogeneous" in name: + if "igb-het" in name: node_feature_keys["author"] = ["feat"] node_feature_keys["institution"] = ["feat"] node_feature_keys["fos"] = ["feat"] @@ -163,7 +163,7 @@ def extract_embed(node_embed, input_nodes): def extract_node_features(name, block, data, node_embed, device): """Extract the node features from embedding layer or raw features.""" - if name == "ogbn-mag" or "igb-heterogeneous" in name: + if name == "ogbn-mag" or "igb-het" in name: input_nodes = { k: v.to(device) for k, v in block.srcdata[dgl.NID].items() } @@ -429,7 +429,7 @@ def evaluate( model.eval() category = "paper" # An evaluator for the dataset. - if "igb-heterogeneous" in name: + if "igb-het" in name: evaluator = IGB_Evaluator(name=name, num_tasks=1, eval_metric="acc") elif name == "ogbn-mag": evaluator = Evaluator(name=name) @@ -595,7 +595,7 @@ def main(args): # `institution` are generated in advance and stored in the feature store. # For `ogbn-mag`, we generate the features on the fly. embed_layer = None - if args.dataset == "ogbn-mag" or "igb-heterogeneous" in args.dataset: + if args.dataset == "ogbn-mag" or "igb-het" in args.dataset: # Create the embedding layer and move it to the appropriate device. embed_layer = rel_graph_embed(g, feat_size).to(device) print( @@ -673,13 +673,12 @@ def main(args): choices=[ "ogbn-mag", "ogb-lsc-mag240m", - "igb-heterogeneous-tiny", - "igb-heterogeneous-small", - "igb-heterogeneous-medium", - "igb-heterogeneous-large", - "igb-heterogeneous-full", + "igb-het-tiny", + "igb-het-small", + "igb-het-medium", ], - help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m", + help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m, " + " igb-het-[tiny|small|medium].", ) parser.add_argument("--num_epochs", type=int, default=3) parser.add_argument("--num_workers", type=int, default=0) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 3318491d2888..52b61a7d3a21 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -989,6 +989,16 @@ class BuiltinDataset(OnDiskDataset): Self edges are added to the original graph. Node features are stored as float32. + **igb-het-[tiny|small|medium]** + The igb-hom-[tiny|small|medium] dataset is a heterogeneous citation network, + which is designed for developers to train and evaluate GNN models with + high fidelity. 
See more details in `igb-het-[tiny|small|medium] + `_. + + .. note:: + Reverse paper__cites__paper edges are added to the original graph. + Node features are stored as float32. + Parameters ---------- name : str @@ -1018,6 +1028,10 @@ class BuiltinDataset(OnDiskDataset): "igb-hom-tiny-seeds", "igb-hom-small", "igb-hom-small-seeds", + "igb-het-tiny", + "igb-het-tiny-seeds", + "igb-het-small", + "igb-het-small-seeds", ] _large_datasets = [ "ogb-lsc-mag240m", @@ -1028,6 +1042,8 @@ class BuiltinDataset(OnDiskDataset): "igb-hom-medium-seeds", "igb-hom-large", "igb-hom-large-seeds", + "igb-het-medium", + "igb-het-medium-seeds", ] _all_datasets = _datasets + _large_datasets From 8e51701466c91cb42cc7f3f57a4435005de77acd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 6 Sep 2024 05:38:07 +0000 Subject: [PATCH 18/35] resolve merge conflict --- .../pyg/hetero/node_classification.py | 31 +++++++++---- examples/graphbolt/rgcn/evaluator.py | 46 +++---------------- examples/graphbolt/rgcn/hetero_rgcn.py | 2 +- python/dgl/graphbolt/impl/ondisk_dataset.py | 11 +++-- 4 files changed, 37 insertions(+), 53 deletions(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 032b84d82c4b..272e778610f0 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -58,12 +58,20 @@ def create_dataloader( datapipe = datapipe.copy_to(device=device) need_copy = False + # if args.dataset == "ogb-lsc-mag240m": + # node_feature_keys = { + # "paper": ["feat"], + # "author": ["feat"], + # "institution": ["feat"], + # } + node_feature_keys = {"paper": ["feat"]} if args.dataset == "ogb-lsc-mag240m": - node_feature_keys = { - "paper": ["feat"], - "author": ["feat"], - "institution": ["feat"], - } + node_feature_keys["author"] = ["feat"] + node_feature_keys["institution"] = ["feat"] + if "igb-het" in args.dataset: + node_feature_keys["author"] = ["feat"] + node_feature_keys["institute"] = ["feat"] + node_feature_keys["fos"] = ["feat"] # Fetch node features for the sampled subgraph. datapipe = datapipe.fetch_feature(features, node_feature_keys) @@ -335,8 +343,13 @@ def parse_args(): "--dataset", type=str, default="ogb-lsc-mag240m", - choices=["ogb-lsc-mag240m"], - help="Dataset name. Possible values: ogb-lsc-mag240m", + choices=[ + "ogb-lsc-mag240m", + "igb-het-tiny", + "igb-het-small", + "igb-het-medium", + ], + help="Dataset name. 
Possible values: ogb-lsc-mag240m, igb-het-[tiny|small|medium].", ) parser.add_argument( "--fanout", @@ -400,7 +413,7 @@ def parse_args(): return parser.parse_args() -def main(): +def main(args): torch.set_float32_matmul_precision(args.precision) if not torch.cuda.is_available(): args.mode = "cpu-cpu-cpu" @@ -517,4 +530,4 @@ def main(): if __name__ == "__main__": args = parse_args() - main() + main(args) diff --git a/examples/graphbolt/rgcn/evaluator.py b/examples/graphbolt/rgcn/evaluator.py index cbded66c264d..e52b2dbb78a7 100644 --- a/examples/graphbolt/rgcn/evaluator.py +++ b/examples/graphbolt/rgcn/evaluator.py @@ -67,45 +67,6 @@ def _parse_and_check_input(self, input_dict): else: raise ValueError("Undefined eval metric %s " % (self.eval_metric)) - def eval(self, input_dict): - if self.eval_metric == "acc": - y_true, y_pred = self._parse_and_check_input(input_dict) - return self._eval_acc(y_true, y_pred) - else: - raise ValueError("Undefined eval metric %s " % (self.eval_metric)) - - @property - def expected_input_format(self): - desc = "==== Expected input format of Evaluator for {}\n".format( - self.name - ) - if self.eval_metric == "acc": - desc += "{'y_true': y_true, 'y_pred': y_pred}\n" - desc += "- y_true: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n" - desc += "- y_pred: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n" - desc += "where y_pred stores predicted class label (integer),\n" - desc += "num_task is {}, and ".format(self.num_tasks) - desc += "each row corresponds to one node.\n" - else: - raise ValueError("Undefined eval metric %s " % (self.eval_metric)) - - return desc - - @property - def expected_output_format(self): - desc = "==== Expected output format of Evaluator for {}\n".format( - self.name - ) - if self.eval_metric == "acc": - desc += "{'acc': acc}\n" - desc += "- acc (float): Accuracy score averaged across {} task(s)\n".format( - self.num_tasks - ) - else: - raise ValueError("Undefined eval metric %s " % (self.eval_metric)) - - return desc - def _eval_acc(self, y_true, y_pred): acc_list = [] @@ -115,3 +76,10 @@ def _eval_acc(self, y_true, y_pred): acc_list.append(float(np.sum(correct)) / len(correct)) return {"acc": sum(acc_list) / len(acc_list)} + + def eval(self, input_dict): + if self.eval_metric == "acc": + y_true, y_pred = self._parse_and_check_input(input_dict) + return self._eval_acc(y_true, y_pred) + else: + raise ValueError("Undefined eval metric %s " % (self.eval_metric)) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index 4051425a129d..2ca98b9ab872 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -144,7 +144,7 @@ def create_dataloader( node_feature_keys["institution"] = ["feat"] if "igb-het" in name: node_feature_keys["author"] = ["feat"] - node_feature_keys["institution"] = ["feat"] + node_feature_keys["institute"] = ["feat"] node_feature_keys["fos"] = ["feat"] datapipe = datapipe.fetch_feature(features, node_feature_keys) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 52b61a7d3a21..ee09f43274b7 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -979,10 +979,11 @@ class BuiltinDataset(OnDiskDataset): .. note:: Reverse edges are added to the original graph. 
- **igb-hom-[tiny|small|medium|large]** - The igb-hom-[tiny|small|medium] dataset is a homogeneous citation network, - which is designed for developers to train and evaluate GNN models with - high fidelity. See more details in `igb-hom-[tiny|small|medium|large] + **igb-hom and igb-hom-[tiny|small|medium|large]** + The igb-hom-[tiny|small|medium|large] and igb-hom dataset is a homogeneous + citation network, which is designed for developers to train and evaluate + GNN models with high fidelity. See more details in + `igb-hom-[tiny|small|medium|large] `_. .. note:: @@ -1042,6 +1043,8 @@ class BuiltinDataset(OnDiskDataset): "igb-hom-medium-seeds", "igb-hom-large", "igb-hom-large-seeds", + "igb-hom", + "igb-hom-seeds", "igb-het-medium", "igb-het-medium-seeds", ] From aaf1da118879238f21c13f54c839bbdc5165653c Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 5 Sep 2024 22:59:58 -0700 Subject: [PATCH 19/35] Update examples/graphbolt/pyg/hetero/node_classification.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/pyg/hetero/node_classification.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 272e778610f0..bfad1ab0cf4d 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -69,7 +69,6 @@ def create_dataloader( node_feature_keys["author"] = ["feat"] node_feature_keys["institution"] = ["feat"] if "igb-het" in args.dataset: - node_feature_keys["author"] = ["feat"] node_feature_keys["institute"] = ["feat"] node_feature_keys["fos"] = ["feat"] # Fetch node features for the sampled subgraph. From ebdb01f132bbad20ee54f9cb70117d770b02603e Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 5 Sep 2024 23:00:03 -0700 Subject: [PATCH 20/35] Update examples/graphbolt/pyg/hetero/node_classification.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/pyg/hetero/node_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index bfad1ab0cf4d..d231c94b7cbc 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -64,7 +64,7 @@ def create_dataloader( # "author": ["feat"], # "institution": ["feat"], # } - node_feature_keys = {"paper": ["feat"]} + node_feature_keys = {"paper": ["feat"], "author": ["feat"]} if args.dataset == "ogb-lsc-mag240m": node_feature_keys["author"] = ["feat"] node_feature_keys["institution"] = ["feat"] From d50977d840872b1760eaa4253999f12b62389ced Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 5 Sep 2024 23:00:09 -0700 Subject: [PATCH 21/35] Update examples/graphbolt/pyg/hetero/node_classification.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/pyg/hetero/node_classification.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index d231c94b7cbc..21fdf1203899 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -66,7 +66,6 @@ def create_dataloader( # } node_feature_keys = {"paper": ["feat"], "author": ["feat"]} if args.dataset == 
"ogb-lsc-mag240m": - node_feature_keys["author"] = ["feat"] node_feature_keys["institution"] = ["feat"] if "igb-het" in args.dataset: node_feature_keys["institute"] = ["feat"] From 071d05521a967190957a28e5534c11920f0b0a24 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 6 Sep 2024 06:03:40 +0000 Subject: [PATCH 22/35] remove main args --- examples/graphbolt/pyg/hetero/node_classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 21fdf1203899..871d74df84ed 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -411,7 +411,7 @@ def parse_args(): return parser.parse_args() -def main(args): +def main(): torch.set_float32_matmul_precision(args.precision) if not torch.cuda.is_available(): args.mode = "cpu-cpu-cpu" @@ -528,4 +528,4 @@ def main(args): if __name__ == "__main__": args = parse_args() - main(args) + main() From f31d3540b351b682320a1f0a589a439a4e184f0b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 6 Sep 2024 20:51:02 +0000 Subject: [PATCH 23/35] remove script --- examples/graphbolt/pyg/hetero/node_classification.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 871d74df84ed..4166805c3f92 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -58,12 +58,6 @@ def create_dataloader( datapipe = datapipe.copy_to(device=device) need_copy = False - # if args.dataset == "ogb-lsc-mag240m": - # node_feature_keys = { - # "paper": ["feat"], - # "author": ["feat"], - # "institution": ["feat"], - # } node_feature_keys = {"paper": ["feat"], "author": ["feat"]} if args.dataset == "ogb-lsc-mag240m": node_feature_keys["institution"] = ["feat"] From 9e445c6e663d02122d6591962a49e1ef608580fa Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 6 Sep 2024 20:53:13 +0000 Subject: [PATCH 24/35] add all reverse edge type --- python/dgl/graphbolt/impl/ondisk_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index ee09f43274b7..303a423a853b 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -997,7 +997,7 @@ class BuiltinDataset(OnDiskDataset): `_. .. note:: - Reverse paper__cites__paper edges are added to the original graph. + Four Reverse edge types are added to the original graph. Node features are stored as float32. Parameters From b04399900e96301aafa0e589835ad1220314a5b3 Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Mon, 9 Sep 2024 08:38:07 +0000 Subject: [PATCH 25/35] add igb-het-large --- examples/graphbolt/pyg/hetero/node_classification.py | 1 + examples/graphbolt/rgcn/hetero_rgcn.py | 3 ++- python/dgl/graphbolt/impl/ondisk_dataset.py | 6 ++++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 4166805c3f92..2e789fa39bd9 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -340,6 +340,7 @@ def parse_args(): "igb-het-tiny", "igb-het-small", "igb-het-medium", + "igb-het-large" ], help="Dataset name. 
Possible values: ogb-lsc-mag240m, igb-het-[tiny|small|medium].", ) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index 2ca98b9ab872..f3b0edbf5ba8 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -676,9 +676,10 @@ def main(args): "igb-het-tiny", "igb-het-small", "igb-het-medium", + "igb-het-large", ], help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m, " - " igb-het-[tiny|small|medium].", + " igb-het-[tiny|small|medium|large].", ) parser.add_argument("--num_epochs", type=int, default=3) parser.add_argument("--num_workers", type=int, default=0) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 303a423a853b..8763fa332854 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -990,10 +990,10 @@ class BuiltinDataset(OnDiskDataset): Self edges are added to the original graph. Node features are stored as float32. - **igb-het-[tiny|small|medium]** + **igb-het-[tiny|small|medium|large]** The igb-hom-[tiny|small|medium] dataset is a heterogeneous citation network, which is designed for developers to train and evaluate GNN models with - high fidelity. See more details in `igb-het-[tiny|small|medium] + high fidelity. See more details in `igb-het-[tiny|small|medium|large] `_. .. note:: @@ -1047,6 +1047,8 @@ class BuiltinDataset(OnDiskDataset): "igb-hom-seeds", "igb-het-medium", "igb-het-medium-seeds", + "igb-het-large", + "igb-het-large-seeds", ] _all_datasets = _datasets + _large_datasets From 2657ee1fdd47d3ec3385ee1fc73628928b495228 Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Mon, 9 Sep 2024 08:42:18 +0000 Subject: [PATCH 26/35] fix format --- examples/graphbolt/pyg/hetero/node_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 2e789fa39bd9..f6ee70f76a5f 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -340,7 +340,7 @@ def parse_args(): "igb-het-tiny", "igb-het-small", "igb-het-medium", - "igb-het-large" + "igb-het-large", ], help="Dataset name. Possible values: ogb-lsc-mag240m, igb-het-[tiny|small|medium].", ) From d11f8150c98ff6391e0b8960f3fd0d4720dfc8aa Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Mon, 9 Sep 2024 08:45:46 +0000 Subject: [PATCH 27/35] fix lint --- examples/graphbolt/pyg/hetero/node_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index f6ee70f76a5f..8df18fb49db3 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -342,7 +342,7 @@ def parse_args(): "igb-het-medium", "igb-het-large", ], - help="Dataset name. Possible values: ogb-lsc-mag240m, igb-het-[tiny|small|medium].", + help="Dataset name. 
Possible values: ogb-lsc-mag240m, igb-het-[tiny|small|medium|large].", ) parser.add_argument( "--fanout", From 692912ec820f70890e5eff0e3e76bec4479d698e Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Wed, 9 Oct 2024 02:27:19 +0000 Subject: [PATCH 28/35] fix acc drop bug --- examples/graphbolt/rgcn/hetero_rgcn.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index f3b0edbf5ba8..b9f2a8e5911c 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -163,7 +163,7 @@ def extract_embed(node_embed, input_nodes): def extract_node_features(name, block, data, node_embed, device): """Extract the node features from embedding layer or raw features.""" - if name == "ogbn-mag" or "igb-het" in name: + if name == "ogbn-mag": input_nodes = { k: v.to(device) for k, v in block.srcdata[dgl.NID].items() } @@ -288,13 +288,6 @@ def __init__( } ) - self.loop_weights = nn.ModuleDict( - { - ntype: nn.Linear(in_size, out_size, bias=True) - for ntype in self.ntypes - } - ) - self.dropout = nn.Dropout(dropout) # Initialize parameters of the model. self.reset_parameters() @@ -677,6 +670,8 @@ def main(args): "igb-het-small", "igb-het-medium", "igb-het-large", + "igb-het", + "igb-het-MLPerf" ], help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m, " " igb-het-[tiny|small|medium|large].", From 5dfd3fd2c791e4b6f4de3acc023e003801281c05 Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Wed, 9 Oct 2024 02:29:21 +0000 Subject: [PATCH 29/35] fix acc drop bug --- examples/graphbolt/rgcn/hetero_rgcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index b9f2a8e5911c..f44aef956e90 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -671,7 +671,7 @@ def main(args): "igb-het-medium", "igb-het-large", "igb-het", - "igb-het-MLPerf" + "igb-het-MLPerf", ], help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m, " " igb-het-[tiny|small|medium|large].", From 65c9b807997474f94e535b20cc8a72dd4048d281 Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 17 Oct 2024 21:19:34 -0500 Subject: [PATCH 30/35] Update node_classification.py --- examples/graphbolt/pyg/hetero/node_classification.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index dcefb7358be9..f43a6f0609ba 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -345,8 +345,10 @@ def parse_args(): "igb-het-small", "igb-het-medium", "igb-het-large", + "igb-het-MLPerf", + "igb-het" ], - help="Dataset name. Possible values: ogb-lsc-mag240m, igb-het-[tiny|small|medium|large].", + help="Dataset name. 
Possible values: ogb-lsc-mag240m, igb-het, and igb-het-[tiny|small|medium|large|MLPerf].", ) parser.add_argument( "--fanout", From 4b2a7fb988b09ead711e69fee3fba9235e0a9c76 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 18 Oct 2024 02:28:25 +0000 Subject: [PATCH 31/35] add dataset name --- examples/graphbolt/rgcn/hetero_rgcn.py | 2 +- python/dgl/graphbolt/impl/ondisk_dataset.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index f44aef956e90..36fc1185c324 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -674,7 +674,7 @@ def main(args): "igb-het-MLPerf", ], help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m, " - " igb-het-[tiny|small|medium|large].", + "igb-het, and igb-het-[tiny|small|medium|large|MLPerf].", ) parser.add_argument("--num_epochs", type=int, default=3) parser.add_argument("--num_workers", type=int, default=0) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 8763fa332854..855e964f83fd 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -983,17 +983,18 @@ class BuiltinDataset(OnDiskDataset): The igb-hom-[tiny|small|medium|large] and igb-hom dataset is a homogeneous citation network, which is designed for developers to train and evaluate GNN models with high fidelity. See more details in - `igb-hom-[tiny|small|medium|large] + `igb-hom-[tiny|small|medium|large] and igb-hom `_. .. note:: Self edges are added to the original graph. Node features are stored as float32. - **igb-het-[tiny|small|medium|large]** - The igb-hom-[tiny|small|medium] dataset is a heterogeneous citation network, - which is designed for developers to train and evaluate GNN models with - high fidelity. See more details in `igb-het-[tiny|small|medium|large] + **igb-het and igb-het-[tiny|small|medium|large|mlperf]** + The igb-hom-[tiny|small|medium|large|mlperf] and igb-het dataset is a + heterogeneous citation network, which is designed for developers to train + and evaluate GNN models with high fidelity. See more details in + `igb-het-[tiny|small|medium|large|mlperf] and igb-het `_. .. note:: @@ -1049,6 +1050,10 @@ class BuiltinDataset(OnDiskDataset): "igb-het-medium-seeds", "igb-het-large", "igb-het-large-seeds", + "igb-het", + "igb-het-seeds", + "igb-het-MLPerf", + "igb-het-MLPerf-seeds", ] _all_datasets = _datasets + _large_datasets From 4f3f35545c9163e7e8101ba5cd9dd5b72c9a8c75 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 18 Oct 2024 02:35:30 +0000 Subject: [PATCH 32/35] fix lint --- examples/graphbolt/pyg/hetero/node_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index f43a6f0609ba..45a8b1eed730 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -346,7 +346,7 @@ def parse_args(): "igb-het-medium", "igb-het-large", "igb-het-MLPerf", - "igb-het" + "igb-het", ], help="Dataset name. 
Possible values: ogb-lsc-mag240m, igb-het, and igb-het-[tiny|small|medium|large|MLPerf].", ) From 8c21ef44eb09c1150d6460febb1f59920f984a5d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 18 Oct 2024 07:40:39 +0000 Subject: [PATCH 33/35] change mlperf to lower --- examples/graphbolt/pyg/hetero/node_classification.py | 4 ++-- examples/graphbolt/rgcn/hetero_rgcn.py | 4 ++-- python/dgl/graphbolt/impl/ondisk_dataset.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 45a8b1eed730..922ff15562ea 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -345,10 +345,10 @@ def parse_args(): "igb-het-small", "igb-het-medium", "igb-het-large", - "igb-het-MLPerf", + "igb-het-mlperf", "igb-het", ], - help="Dataset name. Possible values: ogb-lsc-mag240m, igb-het, and igb-het-[tiny|small|medium|large|MLPerf].", + help="Dataset name. Possible values: ogb-lsc-mag240m, igb-het, and igb-het-[tiny|small|medium|large|mlperf].", ) parser.add_argument( "--fanout", diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index 36fc1185c324..c3a6a36c3dbe 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -671,10 +671,10 @@ def main(args): "igb-het-medium", "igb-het-large", "igb-het", - "igb-het-MLPerf", + "igb-het-mlperf", ], help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m, " - "igb-het, and igb-het-[tiny|small|medium|large|MLPerf].", + "igb-het, and igb-het-[tiny|small|medium|large|mlperf].", ) parser.add_argument("--num_epochs", type=int, default=3) parser.add_argument("--num_workers", type=int, default=0) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 855e964f83fd..a8a3972fdb6d 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -1052,8 +1052,8 @@ class BuiltinDataset(OnDiskDataset): "igb-het-large-seeds", "igb-het", "igb-het-seeds", - "igb-het-MLPerf", - "igb-het-MLPerf-seeds", + "igb-het-mlperf", + "igb-het-mlperf-seeds", ] _all_datasets = _datasets + _large_datasets From 098efbeebba5d5978954633a36a1759dd19829e3 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 23 Oct 2024 03:38:14 +0000 Subject: [PATCH 34/35] reduce evaluator --- examples/graphbolt/rgcn/evaluator.py | 82 ++++++---------------------- 1 file changed, 17 insertions(+), 65 deletions(-) diff --git a/examples/graphbolt/rgcn/evaluator.py b/examples/graphbolt/rgcn/evaluator.py index e52b2dbb78a7..d003c5b62ede 100644 --- a/examples/graphbolt/rgcn/evaluator.py +++ b/examples/graphbolt/rgcn/evaluator.py @@ -6,80 +6,32 @@ torch = None -### Evaluator for node property prediction class IGB_Evaluator: - def __init__(self, name, num_tasks, eval_metric): + def __init__(self, name, num_tasks): self.name = name self.num_tasks = num_tasks - self.eval_metric = eval_metric - def _parse_and_check_input(self, input_dict): - if self.eval_metric == "acc": - if not "y_true" in input_dict: - raise RuntimeError("Missing key of y_true") - if not "y_pred" in input_dict: - raise RuntimeError("Missing key of y_pred") + def _parse_input(self, input_dict): + y_true, y_pred = input_dict["y_true"], input_dict["y_pred"] - y_true, y_pred = input_dict["y_true"], input_dict["y_pred"] + if torch and isinstance(y_true, torch.Tensor): + y_true = y_true.cpu().numpy() 
+ if torch and isinstance(y_pred, torch.Tensor): + y_pred = y_pred.cpu().numpy() - """ - y_true: numpy ndarray or torch tensor of shape (num_nodes num_tasks) - y_pred: numpy ndarray or torch tensor of shape (num_nodes num_tasks) - """ + if not isinstance(y_true, np.ndarray) or not isinstance( + y_pred, np.ndarray + ): + raise RuntimeError("Arguments must be numpy arrays") - # converting to torch.Tensor to numpy on cpu - if torch is not None and isinstance(y_true, torch.Tensor): - y_true = y_true.detach().cpu().numpy() + if y_true.shape != y_pred.shape or y_true.ndim != 2: + raise RuntimeError("Shape mismatch between y_true and y_pred") - if torch is not None and isinstance(y_pred, torch.Tensor): - y_pred = y_pred.detach().cpu().numpy() - - ## check type - if not ( - isinstance(y_true, np.ndarray) - and isinstance(y_true, np.ndarray) - ): - raise RuntimeError( - "Arguments to Evaluator need to be either numpy ndarray or torch tensor" - ) - - if not y_true.shape == y_pred.shape: - raise RuntimeError( - "Shape of y_true and y_pred must be the same" - ) - - if not y_true.ndim == 2: - raise RuntimeError( - "y_true and y_pred must to 2-dim arrray, {}-dim array given".format( - y_true.ndim - ) - ) - - if not y_true.shape[1] == self.num_tasks: - raise RuntimeError( - "Number of tasks for {} should be {} but {} given".format( - self.name, self.num_tasks, y_true.shape[1] - ) - ) - - return y_true, y_pred - - else: - raise ValueError("Undefined eval metric %s " % (self.eval_metric)) + return y_true, y_pred def _eval_acc(self, y_true, y_pred): - acc_list = [] - - for i in range(y_true.shape[1]): - is_labeled = y_true[:, i] == y_true[:, i] - correct = y_true[is_labeled, i] == y_pred[is_labeled, i] - acc_list.append(float(np.sum(correct)) / len(correct)) - - return {"acc": sum(acc_list) / len(acc_list)} + return {"acc": np.mean(np.all(y_true == y_pred, axis=1))} def eval(self, input_dict): - if self.eval_metric == "acc": - y_true, y_pred = self._parse_and_check_input(input_dict) - return self._eval_acc(y_true, y_pred) - else: - raise ValueError("Undefined eval metric %s " % (self.eval_metric)) + y_true, y_pred = self._parse_input(input_dict) + return self._eval_acc(y_true, y_pred) From 8e3017031afc2ee09a8c78fbb2a9e0e8f4c4b553 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 23 Oct 2024 03:39:43 +0000 Subject: [PATCH 35/35] reduce evaluator --- examples/graphbolt/rgcn/hetero_rgcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index c3a6a36c3dbe..71e05b4e4bfd 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -423,7 +423,7 @@ def evaluate( category = "paper" # An evaluator for the dataset. if "igb-het" in name: - evaluator = IGB_Evaluator(name=name, num_tasks=1, eval_metric="acc") + evaluator = IGB_Evaluator(name=name, num_tasks=1) elif name == "ogbn-mag": evaluator = Evaluator(name=name) else:
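
Editorial note, not part of the patch series: the last two commits strip the IGB evaluator down to a constructor, an input check, and an exact-match accuracy. A minimal usage sketch of that reduced `IGB_Evaluator` is shown below. It assumes it is run from `examples/graphbolt/rgcn` so that `evaluator.py` is importable, and the label tensors and the dataset name string are synthetic values chosen only for illustration.

```python
# Illustrative sketch only; the tensors below are made up and the dataset
# name is just a label passed through to the evaluator.
import torch

from evaluator import IGB_Evaluator

# Single-task node classification labels, shaped (num_nodes, num_tasks)
# as the evaluator's input check expects (2-D, matching shapes).
y_true = torch.tensor([[0], [1], [2], [2], [1]])
y_pred = torch.tensor([[0], [1], [1], [2], [1]])

evaluator = IGB_Evaluator(name="igb-het-small", num_tasks=1)
result = evaluator.eval({"y_true": y_true, "y_pred": y_pred})
print(result)  # {'acc': 0.8}: 4 of the 5 rows match exactly
```

Note the design choice in the reduced `_eval_acc`: it scores a node as correct only if every task in its row matches (`np.all(..., axis=1)`), whereas the earlier version averaged per-task accuracies and skipped unlabeled entries. With `num_tasks=1` and fully labeled nodes, as in the RGCN example, the two formulations agree.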