From 644113386bdaaef11ef8d742932a8891ffd318cd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 10:14:22 +0000 Subject: [PATCH 01/35] contribute three IGB dataset (small version) --- examples/graphbolt/rgcn/download.py | 525 +++++++++++++++++++++++++ examples/graphbolt/rgcn/evaluator.py | 97 +++++ examples/graphbolt/rgcn/hetero_rgcn.py | 36 +- 3 files changed, 641 insertions(+), 17 deletions(-) create mode 100755 examples/graphbolt/rgcn/download.py create mode 100644 examples/graphbolt/rgcn/evaluator.py diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py new file mode 100755 index 000000000000..b506d38a7a48 --- /dev/null +++ b/examples/graphbolt/rgcn/download.py @@ -0,0 +1,525 @@ +import argparse, tarfile, hashlib, os, yaml, shutil +from tqdm import tqdm +import urllib.request as ur +import numpy as np +import subprocess +import dgl.graphbolt as gb + +GBFACTOR = float(1 << 30) + + +def _get_size(file_path, node_name): + if "full" in file_path: + return num_nodes["full"][node_name] + if "large" in file_path: + return num_nodes["large"][node_name] + path = f"{file_path}/processed/{node_name}/{node_name}_id_index_mapping.npy" + array = np.load(path, allow_pickle=True) + return len(array.item()) + + +def build_yaml_helper(path, in_memory=True): + data = { + "graph": { + "nodes": [ + { + "num": _get_size(path, "paper"), + "type": "paper" + }, + { + "num": _get_size(path, "author"), + "type": "author" + }, + { + "num": _get_size(path, "institute"), + "type": "institution" + }, + { + "num": _get_size(path, "fos"), + "type": "field_of_study" + } + ], + "edges": [ + { + "path": "edges/author__affiliated_to__institute.npy", + "type": "author:affiliated_to:institution", + "format": "numpy" + }, + { + "path": "edges/paper__written_by__author.npy", + "type": "paper:written_by:author", + "format": "numpy" + }, + { + "path": "edges/paper__cites__paper.npy", + "type": "paper:cites:paper", + "format": "numpy" + }, + { + "path": "edges/paper__topic__fos.npy", + "type": "paper:has_topic:field_of_study", + "format": "numpy" + }, + ], + }, + "tasks": [ + { + "num_classes": 19, + "validation_set": [ + { + "data": [ + { + "in_memory": in_memory, + "path": "set/validation_indices.npy", + "name": "seeds", + "format": "numpy" + }, + { + "in_memory": in_memory, + "path": "set/validation_labels.npy", + "name": "labels", + "format": "numpy" + } + ], + "type": "paper" + } + ], + "name": "node_classification", + "train_set": [ + { + "data": [ + { + "in_memory": in_memory, + "path": "set/train_indices.npy", + "name": "seeds", + "format": "numpy" + }, + { + "in_memory": in_memory, + "path": "set/train_labels.npy", + "name": "labels", + "format": "numpy" + } + ], + "type": "paper" + } + ], + "test_set": [ + { + "data": [ + { + "in_memory": in_memory, + "path": "set/test_indices.npy", + "name": "seeds", + "format": "numpy" + }, + { + "in_memory": in_memory, + "path": "set/test_labels.npy", + "name": "labels", + "format": "numpy" + } + ], + "type": "paper" + } + ] + } + ], + "feature_data": [ + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/paper_feat.npy", + "type": "paper" + }, + { + "domain": "node", + "name": "label", + "format": "numpy", + "in_memory": in_memory, + "path": "data/paper_label_19.npy", + "type": "paper" + }, + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/author_feat.npy", + "type": "author" + }, + { + "domain": "node", + "name": "feat", + "format": 
"numpy", + "in_memory": in_memory, + "path": "data/institute_feat.npy", + "type": "institute" + }, + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/fos_feat.npy", + "type": "fos" + }, + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/author_feat.npy", + "type": "author" + } + ], + "dataset_name": os.path.basename(path) + } + + return data + + +def build_yaml(original_path, current_path): + if "large" in current_path or "full" in current_path: + data = build_yaml_helper(original_path, in_memory=False) + else: + data = build_yaml_helper(original_path) + with open(f"{current_path}/metadata.yaml", 'w') as file: + yaml.dump(data, file, default_flow_style=False) + + +def decide_download(url): + d = ur.urlopen(url) + size = int(d.info()["Content-Length"])/GBFACTOR + ### confirm if larger than 1GB + if size > 1: + return input("This will download %.2fGB. Will you proceed? (y/N) " % (size)).lower() == "y" + else: + return True + + +dataset_urls = { + 'homogeneous' : { + 'tiny' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_tiny.tar.gz', + 'small' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_small.tar.gz', + 'medium' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_medium.tar.gz' + }, + 'heterogeneous' : { + 'tiny' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_tiny.tar.gz', + 'small' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_small.tar.gz', + 'medium' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_medium.tar.gz' + } +} + + +md5checksums = { + 'homogeneous' : { + 'tiny' : '34856534da55419b316d620e2d5b21be', + 'small' : '6781c699723529902ace0a95cafe6fe4', + 'medium' : '4640df4ceee46851fd18c0a44ddcc622' + }, + 'heterogeneous' : { + 'tiny' : '83fbc1091497ff92cf20afe82fae0ade', + 'small' : '2f42077be60a074aec24f7c60089e1bd', + 'medium' : '7f0df4296eca36553ff3a6a63abbd347' + } +} + + +def check_md5sum(dataset_type, dataset_size, filename): + original_md5 = md5checksums[dataset_type][dataset_size] + + with open(filename, 'rb') as file_to_check: + data = file_to_check.read() + md5_returned = hashlib.md5(data).hexdigest() + + if original_md5 == md5_returned: + print(" md5sum verified.") + return + else: + os.remove(filename) + raise Exception(" md5sum verification failed!.") + + +def download_dataset(path, dataset_type, dataset_size): + if dataset_size in ["large", "full"]: + command = f"./download_{dataset_size}_igbh.sh" + subprocess.run(['bash', command], check=True, text=True) + shutil.move(src=f"igb-{dataset_type}-{dataset_size}", dst=f"{path}") + return path + "/" + "igb-" + dataset_type + "-" + dataset_size + else: + output_directory = path + if not os.path.exists(output_directory + "igb_" + dataset_type + "_" + dataset_size + ".tar.gz"): + url = dataset_urls[dataset_type][dataset_size] + if decide_download(url): + data = ur.urlopen(url) + size = int(data.info()["Content-Length"]) + chunk_size = 1024*1024 + num_iter = int(size/chunk_size) + 2 + downloaded_size = 0 + filename = path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" + with open(filename, 'wb') as f: + pbar = tqdm(range(num_iter)) + for i in pbar: + chunk = data.read(chunk_size) + downloaded_size += len(chunk) + pbar.set_description("Downloaded {:.2f} GB".format(float(downloaded_size)/GBFACTOR)) + f.write(chunk) + 
print("Downloaded" + " igb_" + dataset_type + "_" + dataset_size, end=" ->") + check_md5sum(dataset_type, dataset_size, filename) + else: + print("The file igb_" + dataset_type + "_" + dataset_size + ".tar.gz already exists, directly extracting...") + filename = path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" + file = tarfile.open(filename) + file.extractall(output_directory) + file.close() + size = 0 + for path, dirs, files in os.walk(output_directory+ "/" + dataset_size): + for f in files: + fp = os.path.join(path, f) + size += os.path.getsize(fp) + print("Final dataset size {:.2f} GB.".format(size/GBFACTOR)) + # os.remove(filename) + os.rename(output_directory+ "/" + dataset_size, output_directory+ "/" + "igb-" + dataset_type + "-" + dataset_size) + return output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size + + +num_nodes = { + "full": { + "paper": 269346174, + "author": 277220883, + "institute": 26918, + "fos": 712960 + }, + "large": { + "paper": 100000000, + "author": 116959896, + "institute": 26524, + "fos": 649707 + }, + "medium": { + "paper": 10000000, + "author": 15544654, + "institute": 23256, + "fos": 415054 + }, + "small": { + "paper": 1000000, + "author": 1926066, + "institute": 14751, + "fos": 190449 + }, + "tiny": { + "paper": 100000, + "author": 357041, + "institute": 8738, + "fos": 84220 + } +} + +num_edges = { + "full": { + "paper__cites__paper": 3996442004, + "paper__written_by__author": 716761549, + "paper__topic__fos": 1050280600, + "author__affiliated_to__institute": 48521486 + }, + "large": { + "paper__cites__paper": 1223571364, + "paper__written_by__author": 289502107, + "paper__topic__fos": 457577294, + "author__affiliated_to__institute": 34099660 + }, + "medium": { + "paper__cites__paper": 120077694, + "paper__written_by__author": 39854592, + "paper__topic__fos": 68510495, + "author__affiliated_to__institute": 11049412 + }, + "small": { + "paper__cites__paper": 12070502, + "paper__written_by__author": 4553516, + "paper__topic__fos": 7234122, + "author__affiliated_to__institute": 1630476 + }, + "tiny": { + "paper__cites__paper": 447416, + "paper__written_by__author": 471443, + "paper__topic__fos": 718445, + "author__affiliated_to__institute": 325410 + } +} + + +def split_data(label_path, set_dir, dataset_size): + # labels = np.memmap(label_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) + labels = np.load(label_path) + + total_samples = len(labels) + train_end = int(0.8 * total_samples) + validation_end = int(0.9 * total_samples) + + indices = np.arange(total_samples) + train_indices = indices[:train_end] + validation_indices = indices[train_end:validation_end] + test_indices = indices[validation_end:] + print(indices) + print(train_indices) + print(validation_indices) + print(test_indices) + + train_labels = labels[:train_end] + validation_labels = labels[train_end:validation_end] + test_labels = labels[validation_end:] + print(train_labels, len(train_labels)) + print(validation_labels,len(validation_labels)) + print(test_labels, len(test_labels)) + + gb.numpy_save_aligned(f"{set_dir}/train_indices.npy", train_indices) + gb.numpy_save_aligned(f"{set_dir}/validation_indices.npy", validation_indices) + gb.numpy_save_aligned(f"{set_dir}/test_indices.npy", test_indices) + gb.numpy_save_aligned(f"{set_dir}/train_labels.npy", train_labels) + gb.numpy_save_aligned(f"{set_dir}/validation_labels.npy", validation_labels) + gb.numpy_save_aligned(f"{set_dir}/test_labels.npy", test_labels) + + +def add_edges(edges, 
source, dest, dataset_size): + for edge in edges: + print(f"\t Processing {edge} edge...") + + old_edge_path = source + "/" + edge + "/" + "edge_index.npy" + new_edge_path = dest + "/" + edge + ".npy" + os.rename(src=old_edge_path, dst=new_edge_path) + + # edge_array = np.memmap(new_edge_path, dtype='int32', mode='r', shape=(num_edges[dataset_size][edge], 2)) + edge_array = np.load(new_edge_path) + new_edge_array = edge_array.transpose() + + assert(new_edge_array.shape == (2, num_edges[dataset_size][edge])) + + np.save(new_edge_path, new_edge_array) + + +def process_feat(file_path, node_name, dataset_size): + # array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) + array = np.load(file_path) + assert(array.shape == (num_nodes[dataset_size][node_name], 1024)) + gb.numpy_save_aligned(file_path, array) + + # Assert the shape and elements of the array are correct + # new_array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) + new_array = np.load(file_path) + assert(array.shape == (num_nodes[dataset_size][node_name], 1024)) + assert(np.array_equal(array, new_array)) + + +def process_label(file_path, num_class, dataset_size): + if num_class == 2983 and dataset_size == "full": # only this case label number changes + # array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) + array = np.load(file_path) + assert(array.shape == (227130858, 1) or array.shape == (227130858,)) + else: + # array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) + array = np.load(file_path) + assert(array.shape == (num_nodes[dataset_size]["paper"], 1) or array.shape == (num_nodes[dataset_size]["paper"],)) + + gb.numpy_save_aligned(file_path, array) + + # Assert the shape and elements of the array are correct + if num_class == 2983 and dataset_size == "full": + # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) + new_array = np.load(file_path) + assert(new_array.shape == (227130858, 1) or new_array.shape == (227130858,)) + assert(np.array_equal(array, new_array)) + else: + # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) + new_array = np.load(file_path) + assert(new_array.shape == (num_nodes[dataset_size]["paper"], 1) or new_array.shape == (num_nodes[dataset_size]["paper"],)) + assert(np.array_equal(array, new_array)) + + +def add_nodes(nodes, source, dest, dataset_size): + for node in nodes: + print(f"\t Processing {node} node feature...") + old_node_path = source + "/" + node + "/" + "node_feat.npy" + new_node_path = dest + "/" + node + "_feat.npy" + os.rename(src=old_node_path, dst=new_node_path) + process_feat(file_path=new_node_path, node_name=node, dataset_size=dataset_size) + if node == "paper": + print(f"\t Processing {node} labels...") + old_label_path_19 = source + "/" + node + "/" + "node_label_19.npy" + new_label_path_19 = dest + "/" + "paper_label_19.npy" + os.rename(src=old_label_path_19, dst=new_label_path_19) + process_label(file_path=new_label_path_19, num_class=19, dataset_size=dataset_size) + + old_label_path_2K = source + "/" + node + "/" + "node_label_2K.npy" + new_label_path_2K = dest + "/" + "paper_label_2K.npy" + os.rename(src=old_label_path_2K, dst=new_label_path_2K) + process_label(file_path=new_label_path_19, num_class=2983, dataset_size=dataset_size) + + return new_label_path_19, new_label_path_2K + + +def process_dataset(path, dataset_size): + 
print(f"Starting to process the {dataset_size} dataset...") + + # Make the directory for processed dataset + processed_dir = path + "-seeds" + os.makedirs(name=processed_dir, exist_ok=True) + original_path = path + "/" + "processed" + + # Step 1: Move Nodes files + print("Processing Node files...") + node_dir = processed_dir + "/" + "data" + os.makedirs(name=node_dir, exist_ok=True) + # These are the four nodes in this citation network + nodes = [ + "paper", + "author", + "fos", + "institute" + ] + label_file_19, label_file_2K = add_nodes(nodes=nodes, source=original_path, dest=node_dir, dataset_size=dataset_size) + + # Step 2: Create labels + print("Processing train/valid/test files...") + set_dir = processed_dir + "/" + "set" + os.makedirs(name=set_dir, exist_ok=True) + split_data(label_path=label_file_19, set_dir=set_dir, dataset_size=dataset_size) + + # Step 3: Move edge files + print("Processing Edge files...") + edge_dir = processed_dir + "/" + "edges" + os.makedirs(name=edge_dir, exist_ok=True) + # These are the four edges in this citation network + edges = [ + "paper__cites__paper", + "paper__written_by__author", + "paper__topic__fos", + "author__affiliated_to__institute" + ] + add_edges(edges=edges, source=original_path, dest=edge_dir, dataset_size=dataset_size) + + # Step 4: Build the yaml file + print("Building yaml file...") + build_yaml(original_path=path, current_path=processed_dir) + + # shutil.rmtree(path) + print(f"Finished processing the {dataset_size} dataset") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--path', type=str, default='datasets/', + help='path to store the datasets') + parser.add_argument('--type', type=str, default='heterogeneous', + choices=['homogeneous', 'heterogeneous'], + help='dataset type') + parser.add_argument('--size', type=str, default='tiny', + choices=['tiny', 'small', 'medium', 'large', 'full'], + help='size of the datasets') + args = parser.parse_args() + path = download_dataset(path=args.path, dataset_type=args.type, dataset_size=args.size) + process_dataset(path=path, dataset_size=args.size) diff --git a/examples/graphbolt/rgcn/evaluator.py b/examples/graphbolt/rgcn/evaluator.py new file mode 100644 index 000000000000..6598355b0727 --- /dev/null +++ b/examples/graphbolt/rgcn/evaluator.py @@ -0,0 +1,97 @@ +import numpy as np +try: + import torch +except ImportError: + torch = None + +### Evaluator for node property prediction +class IGB_Evaluator: + def __init__(self, name, num_tasks, eval_metric): + self.name = name + self.num_tasks = num_tasks + self.eval_metric = eval_metric + + + def _parse_and_check_input(self, input_dict): + if self.eval_metric == 'acc': + if not 'y_true' in input_dict: + raise RuntimeError('Missing key of y_true') + if not 'y_pred' in input_dict: + raise RuntimeError('Missing key of y_pred') + + y_true, y_pred = input_dict['y_true'], input_dict['y_pred'] + + ''' + y_true: numpy ndarray or torch tensor of shape (num_nodes num_tasks) + y_pred: numpy ndarray or torch tensor of shape (num_nodes num_tasks) + ''' + + # converting to torch.Tensor to numpy on cpu + if torch is not None and isinstance(y_true, torch.Tensor): + y_true = y_true.detach().cpu().numpy() + + if torch is not None and isinstance(y_pred, torch.Tensor): + y_pred = y_pred.detach().cpu().numpy() + + ## check type + if not (isinstance(y_true, np.ndarray) and isinstance(y_true, np.ndarray)): + raise RuntimeError('Arguments to Evaluator need to be either numpy ndarray or torch tensor') + + if not y_true.shape == 
y_pred.shape: + raise RuntimeError('Shape of y_true and y_pred must be the same') + + if not y_true.ndim == 2: + raise RuntimeError('y_true and y_pred must to 2-dim arrray, {}-dim array given'.format(y_true.ndim)) + + if not y_true.shape[1] == self.num_tasks: + raise RuntimeError('Number of tasks for {} should be {} but {} given'.format(self.name, self.num_tasks, y_true.shape[1])) + + return y_true, y_pred + + else: + raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) + + + def eval(self, input_dict): + if self.eval_metric == 'acc': + y_true, y_pred = self._parse_and_check_input(input_dict) + return self._eval_acc(y_true, y_pred) + else: + raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) + + @property + def expected_input_format(self): + desc = '==== Expected input format of Evaluator for {}\n'.format(self.name) + if self.eval_metric == 'acc': + desc += '{\'y_true\': y_true, \'y_pred\': y_pred}\n' + desc += '- y_true: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n' + desc += '- y_pred: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n' + desc += 'where y_pred stores predicted class label (integer),\n' + desc += 'num_task is {}, and '.format(self.num_tasks) + desc += 'each row corresponds to one node.\n' + else: + raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) + + return desc + + @property + def expected_output_format(self): + desc = '==== Expected output format of Evaluator for {}\n'.format(self.name) + if self.eval_metric == 'acc': + desc += '{\'acc\': acc}\n' + desc += '- acc (float): Accuracy score averaged across {} task(s)\n'.format(self.num_tasks) + else: + raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) + + return desc + + def _eval_acc(self, y_true, y_pred): + acc_list = [] + + for i in range(y_true.shape[1]): + is_labeled = y_true[:,i] == y_true[:,i] + correct = y_true[is_labeled,i] == y_pred[is_labeled,i] + acc_list.append(float(np.sum(correct))/len(correct)) + + return {'acc': sum(acc_list)/len(acc_list)} + diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index 60ab51602ca1..503295733092 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -59,6 +59,7 @@ import torch.nn.functional as F from dgl.nn import HeteroEmbedding from ogb.lsc import MAG240MEvaluator +from evaluator import IGB_Evaluator from ogb.nodeproppred import Evaluator from tqdm import tqdm @@ -124,12 +125,7 @@ def create_dataloader( # The graph(FusedCSCSamplingGraph) from which to sample neighbors. # `fanouts`: # The number of neighbors to sample for each node in each layer. - datapipe = datapipe.sample_neighbor( - graph, - fanouts=fanouts, - overlap_fetch=args.overlap_graph_fetch, - asynchronous=args.asynchronous, - ) + datapipe = datapipe.sample_neighbor(graph, fanouts=fanouts) # Fetch the features for each node in the mini-batch. # `features`: @@ -141,6 +137,11 @@ def create_dataloader( if name == "ogb-lsc-mag240m": node_feature_keys["author"] = ["feat"] node_feature_keys["institution"] = ["feat"] + if "igb-heterogeneous" in name: + node_feature_keys["author"] = ["feat"] + node_feature_keys["institution"] = ["feat"] + node_feature_keys["fos"] = ["feat"] + datapipe = datapipe.fetch_feature(features, node_feature_keys) # Create a DataLoader from the datapipe. 
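The hunk above extends create_dataloader() so that, for the IGB heterogeneous datasets, raw features are also fetched for the "author", "institution", and "fos" node types in addition to "paper". Below is a minimal sketch (not part of the patch) of how these datapipe stages typically compose; the helper name, fanouts, batch size, and worker count are illustrative assumptions rather than values taken from this example.

    import dgl.graphbolt as gb

    def build_igb_dataloader(graph, features, itemset, device, job="train"):
        # Sample seed "paper" nodes in mini-batches.
        datapipe = gb.ItemSampler(itemset, batch_size=1024, shuffle=(job == "train"))
        # Move the seeds to the target device before sampling.
        datapipe = datapipe.copy_to(device)
        # Sample a fixed number of neighbors per layer (two layers here).
        datapipe = datapipe.sample_neighbor(graph, fanouts=[25, 10])
        # Fetch raw features for every IGB node type that stores them on disk.
        node_feature_keys = {
            "paper": ["feat"],
            "author": ["feat"],
            "institution": ["feat"],
            "fos": ["feat"],
        }
        datapipe = datapipe.fetch_feature(features, node_feature_keys)
        # Wrap the composed datapipe into a DataLoader.
        return gb.DataLoader(datapipe, num_workers=0)
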
@@ -158,7 +159,7 @@ def extract_embed(node_embed, input_nodes): def extract_node_features(name, block, data, node_embed, device): """Extract the node features from embedding layer or raw features.""" - if name == "ogbn-mag": + if name == "ogbn-mag" or "igb-heterogeneous" in name: input_nodes = { k: v.to(device) for k, v in block.srcdata[dgl.NID].items() } @@ -424,7 +425,9 @@ def evaluate( model.eval() category = "paper" # An evaluator for the dataset. - if name == "ogbn-mag": + if "igb-heterogeneous" in name: + evaluator = IGB_Evaluator(name=name, num_tasks=1, eval_metric="acc") + elif name == "ogbn-mag": evaluator = Evaluator(name=name) else: evaluator = MAG240MEvaluator() @@ -573,14 +576,9 @@ def main(args): ) = load_dataset(args.dataset) # Move the dataset to the pinned memory to enable GPU access. - args.overlap_graph_fetch = False - args.asynchronous = False if device == torch.device("cuda"): - g = g.pin_memory_() - features = features.pin_memory_() - # Enable optimizations for sampling on the GPU. - args.overlap_graph_fetch = True - args.asynchronous = True + g.pin_memory_() + features.pin_memory_() feat_size = features.size("node", "paper", "feat")[0] @@ -588,7 +586,8 @@ def main(args): # `institution` are generated in advance and stored in the feature store. # For `ogbn-mag`, we generate the features on the fly. embed_layer = None - if args.dataset == "ogbn-mag": + # if args.dataset == "ogbn-mag": + if args.dataset == "ogbn-mag" or "igb-heterogeneous" in args.dataset: # Create the embedding layer and move it to the appropriate device. embed_layer = rel_graph_embed(g, feat_size).to(device) print( @@ -663,7 +662,10 @@ def main(args): "--dataset", type=str, default="ogbn-mag", - choices=["ogbn-mag", "ogb-lsc-mag240m"], + # choices=["ogbn-mag", "ogb-lsc-mag240m"], + choices=["ogbn-mag", "ogb-lsc-mag240m", "igb-heterogeneous-tiny", + "igb-heterogeneous-small", "igb-heterogeneous-medium", + "igb-heterogeneous-large", "igb-heterogeneous-full"], help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m", ) parser.add_argument("--num_epochs", type=int, default=3) From 85d92c834f7667c24c5ea54f16a75ca16ea98510 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 10:20:20 +0000 Subject: [PATCH 02/35] contribute three IGB dataset (small version) --- examples/graphbolt/rgcn/hetero_rgcn.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index 503295733092..9c93b53a21c1 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -142,7 +142,12 @@ def create_dataloader( node_feature_keys["institution"] = ["feat"] node_feature_keys["fos"] = ["feat"] - datapipe = datapipe.fetch_feature(features, node_feature_keys) + datapipe = datapipe.sample_neighbor( + graph, + fanouts=fanouts, + overlap_fetch=args.overlap_graph_fetch, + asynchronous=args.asynchronous, + ) # Create a DataLoader from the datapipe. # `num_workers`: @@ -576,9 +581,14 @@ def main(args): ) = load_dataset(args.dataset) # Move the dataset to the pinned memory to enable GPU access. + args.overlap_graph_fetch = False + args.asynchronous = False if device == torch.device("cuda"): - g.pin_memory_() - features.pin_memory_() + g = g.pin_memory_() + features = features.pin_memory_() + # Enable optimizations for sampling on the GPU. 
+ args.overlap_graph_fetch = True + args.asynchronous = True feat_size = features.size("node", "paper", "feat")[0] From 5d2fe56cff826be4d7424505dc681f7bd95f6947 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 10:27:06 +0000 Subject: [PATCH 03/35] contribute three IGB dataset (small version) --- examples/graphbolt/rgcn/hetero_rgcn.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index 9c93b53a21c1..bb12b31e2069 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -125,7 +125,12 @@ def create_dataloader( # The graph(FusedCSCSamplingGraph) from which to sample neighbors. # `fanouts`: # The number of neighbors to sample for each node in each layer. - datapipe = datapipe.sample_neighbor(graph, fanouts=fanouts) + datapipe = datapipe.sample_neighbor( + graph, + fanouts=fanouts, + overlap_fetch=args.overlap_graph_fetch, + asynchronous=args.asynchronous, + ) # Fetch the features for each node in the mini-batch. # `features`: @@ -141,13 +146,7 @@ def create_dataloader( node_feature_keys["author"] = ["feat"] node_feature_keys["institution"] = ["feat"] node_feature_keys["fos"] = ["feat"] - - datapipe = datapipe.sample_neighbor( - graph, - fanouts=fanouts, - overlap_fetch=args.overlap_graph_fetch, - asynchronous=args.asynchronous, - ) + datapipe = datapipe.fetch_feature(features, node_feature_keys) # Create a DataLoader from the datapipe. # `num_workers`: @@ -672,7 +671,6 @@ def main(args): "--dataset", type=str, default="ogbn-mag", - # choices=["ogbn-mag", "ogb-lsc-mag240m"], choices=["ogbn-mag", "ogb-lsc-mag240m", "igb-heterogeneous-tiny", "igb-heterogeneous-small", "igb-heterogeneous-medium", "igb-heterogeneous-large", "igb-heterogeneous-full"], From 543f672276797ba3a39e7134871e2f8c7bf341a8 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 10:32:08 +0000 Subject: [PATCH 04/35] format the code with ufmt --- examples/graphbolt/rgcn/download.py | 555 ++++++++++++++----------- examples/graphbolt/rgcn/evaluator.py | 94 +++-- examples/graphbolt/rgcn/hetero_rgcn.py | 15 +- 3 files changed, 380 insertions(+), 284 deletions(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index b506d38a7a48..4724061476b5 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -1,9 +1,10 @@ -import argparse, tarfile, hashlib, os, yaml, shutil -from tqdm import tqdm -import urllib.request as ur -import numpy as np +import argparse, hashlib, os, shutil, tarfile, yaml import subprocess +import urllib.request as ur + import dgl.graphbolt as gb +import numpy as np +from tqdm import tqdm GBFACTOR = float(1 << 30) @@ -20,164 +21,152 @@ def _get_size(file_path, node_name): def build_yaml_helper(path, in_memory=True): data = { - "graph": { - "nodes": [ + "graph": { + "nodes": [ + {"num": _get_size(path, "paper"), "type": "paper"}, + {"num": _get_size(path, "author"), "type": "author"}, + {"num": _get_size(path, "institute"), "type": "institution"}, + {"num": _get_size(path, "fos"), "type": "field_of_study"}, + ], + "edges": [ { - "num": _get_size(path, "paper"), - "type": "paper" - }, - { - "num": _get_size(path, "author"), - "type": "author" - }, - { - "num": _get_size(path, "institute"), - "type": "institution" - }, - { - "num": _get_size(path, "fos"), - "type": "field_of_study" - } - ], - "edges": [ - { - "path": "edges/author__affiliated_to__institute.npy", - "type": 
"author:affiliated_to:institution", - "format": "numpy" + "path": "edges/author__affiliated_to__institute.npy", + "type": "author:affiliated_to:institution", + "format": "numpy", }, { - "path": "edges/paper__written_by__author.npy", - "type": "paper:written_by:author", - "format": "numpy" - }, - { - "path": "edges/paper__cites__paper.npy", - "type": "paper:cites:paper", - "format": "numpy" - }, + "path": "edges/paper__written_by__author.npy", + "type": "paper:written_by:author", + "format": "numpy", + }, { - "path": "edges/paper__topic__fos.npy", - "type": "paper:has_topic:field_of_study", - "format": "numpy" + "path": "edges/paper__cites__paper.npy", + "type": "paper:cites:paper", + "format": "numpy", }, - ], - }, - "tasks": [ { + "path": "edges/paper__topic__fos.npy", + "type": "paper:has_topic:field_of_study", + "format": "numpy", + }, + ], + }, + "tasks": [ + { "num_classes": 19, "validation_set": [ { - "data": [ - { - "in_memory": in_memory, - "path": "set/validation_indices.npy", - "name": "seeds", - "format": "numpy" - }, - { - "in_memory": in_memory, - "path": "set/validation_labels.npy", - "name": "labels", - "format": "numpy" - } - ], - "type": "paper" + "data": [ + { + "in_memory": in_memory, + "path": "set/validation_indices.npy", + "name": "seeds", + "format": "numpy", + }, + { + "in_memory": in_memory, + "path": "set/validation_labels.npy", + "name": "labels", + "format": "numpy", + }, + ], + "type": "paper", } - ], - "name": "node_classification", + ], + "name": "node_classification", "train_set": [ { - "data": [ - { - "in_memory": in_memory, - "path": "set/train_indices.npy", - "name": "seeds", - "format": "numpy" - }, - { - "in_memory": in_memory, - "path": "set/train_labels.npy", - "name": "labels", - "format": "numpy" - } - ], - "type": "paper" + "data": [ + { + "in_memory": in_memory, + "path": "set/train_indices.npy", + "name": "seeds", + "format": "numpy", + }, + { + "in_memory": in_memory, + "path": "set/train_labels.npy", + "name": "labels", + "format": "numpy", + }, + ], + "type": "paper", } - ], + ], "test_set": [ { - "data": [ - { - "in_memory": in_memory, - "path": "set/test_indices.npy", - "name": "seeds", - "format": "numpy" - }, - { - "in_memory": in_memory, - "path": "set/test_labels.npy", - "name": "labels", - "format": "numpy" - } - ], - "type": "paper" + "data": [ + { + "in_memory": in_memory, + "path": "set/test_indices.npy", + "name": "seeds", + "format": "numpy", + }, + { + "in_memory": in_memory, + "path": "set/test_labels.npy", + "name": "labels", + "format": "numpy", + }, + ], + "type": "paper", } - ] - } - ], - "feature_data": [ - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/paper_feat.npy", - "type": "paper" - }, - { - "domain": "node", - "name": "label", - "format": "numpy", - "in_memory": in_memory, - "path": "data/paper_label_19.npy", - "type": "paper" - }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/author_feat.npy", - "type": "author" - }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/institute_feat.npy", - "type": "institute" - }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/fos_feat.npy", - "type": "fos" - }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/author_feat.npy", - "type": "author" - } - ], - "dataset_name": os.path.basename(path) + ], } - 
+ ], + "feature_data": [ + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/paper_feat.npy", + "type": "paper", + }, + { + "domain": "node", + "name": "label", + "format": "numpy", + "in_memory": in_memory, + "path": "data/paper_label_19.npy", + "type": "paper", + }, + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/author_feat.npy", + "type": "author", + }, + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/institute_feat.npy", + "type": "institute", + }, + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/fos_feat.npy", + "type": "fos", + }, + { + "domain": "node", + "name": "feat", + "format": "numpy", + "in_memory": in_memory, + "path": "data/author_feat.npy", + "type": "author", + }, + ], + "dataset_name": os.path.basename(path), + } + return data @@ -186,53 +175,58 @@ def build_yaml(original_path, current_path): data = build_yaml_helper(original_path, in_memory=False) else: data = build_yaml_helper(original_path) - with open(f"{current_path}/metadata.yaml", 'w') as file: + with open(f"{current_path}/metadata.yaml", "w") as file: yaml.dump(data, file, default_flow_style=False) def decide_download(url): d = ur.urlopen(url) - size = int(d.info()["Content-Length"])/GBFACTOR + size = int(d.info()["Content-Length"]) / GBFACTOR ### confirm if larger than 1GB if size > 1: - return input("This will download %.2fGB. Will you proceed? (y/N) " % (size)).lower() == "y" + return ( + input( + "This will download %.2fGB. Will you proceed? (y/N) " % (size) + ).lower() + == "y" + ) else: return True dataset_urls = { - 'homogeneous' : { - 'tiny' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_tiny.tar.gz', - 'small' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_small.tar.gz', - 'medium' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_medium.tar.gz' + "homogeneous": { + "tiny": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_tiny.tar.gz", + "small": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_small.tar.gz", + "medium": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_medium.tar.gz", + }, + "heterogeneous": { + "tiny": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_tiny.tar.gz", + "small": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_small.tar.gz", + "medium": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_medium.tar.gz", }, - 'heterogeneous' : { - 'tiny' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_tiny.tar.gz', - 'small' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_small.tar.gz', - 'medium' : 'https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_medium.tar.gz' - } } md5checksums = { - 'homogeneous' : { - 'tiny' : '34856534da55419b316d620e2d5b21be', - 'small' : '6781c699723529902ace0a95cafe6fe4', - 'medium' : '4640df4ceee46851fd18c0a44ddcc622' + "homogeneous": { + "tiny": "34856534da55419b316d620e2d5b21be", + "small": "6781c699723529902ace0a95cafe6fe4", + "medium": "4640df4ceee46851fd18c0a44ddcc622", + }, + "heterogeneous": { + "tiny": "83fbc1091497ff92cf20afe82fae0ade", + "small": 
"2f42077be60a074aec24f7c60089e1bd", + "medium": "7f0df4296eca36553ff3a6a63abbd347", }, - 'heterogeneous' : { - 'tiny' : '83fbc1091497ff92cf20afe82fae0ade', - 'small' : '2f42077be60a074aec24f7c60089e1bd', - 'medium' : '7f0df4296eca36553ff3a6a63abbd347' - } } def check_md5sum(dataset_type, dataset_size, filename): original_md5 = md5checksums[dataset_type][dataset_size] - with open(filename, 'rb') as file_to_check: - data = file_to_check.read() + with open(filename, "rb") as file_to_check: + data = file_to_check.read() md5_returned = hashlib.md5(data).hexdigest() if original_md5 == md5_returned: @@ -241,49 +235,83 @@ def check_md5sum(dataset_type, dataset_size, filename): else: os.remove(filename) raise Exception(" md5sum verification failed!.") - + def download_dataset(path, dataset_type, dataset_size): if dataset_size in ["large", "full"]: command = f"./download_{dataset_size}_igbh.sh" - subprocess.run(['bash', command], check=True, text=True) + subprocess.run(["bash", command], check=True, text=True) shutil.move(src=f"igb-{dataset_type}-{dataset_size}", dst=f"{path}") return path + "/" + "igb-" + dataset_type + "-" + dataset_size - else: + else: output_directory = path - if not os.path.exists(output_directory + "igb_" + dataset_type + "_" + dataset_size + ".tar.gz"): + if not os.path.exists( + output_directory + + "igb_" + + dataset_type + + "_" + + dataset_size + + ".tar.gz" + ): url = dataset_urls[dataset_type][dataset_size] if decide_download(url): data = ur.urlopen(url) size = int(data.info()["Content-Length"]) - chunk_size = 1024*1024 - num_iter = int(size/chunk_size) + 2 + chunk_size = 1024 * 1024 + num_iter = int(size / chunk_size) + 2 downloaded_size = 0 - filename = path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" - with open(filename, 'wb') as f: + filename = ( + path + + "/igb_" + + dataset_type + + "_" + + dataset_size + + ".tar.gz" + ) + with open(filename, "wb") as f: pbar = tqdm(range(num_iter)) for i in pbar: chunk = data.read(chunk_size) downloaded_size += len(chunk) - pbar.set_description("Downloaded {:.2f} GB".format(float(downloaded_size)/GBFACTOR)) + pbar.set_description( + "Downloaded {:.2f} GB".format( + float(downloaded_size) / GBFACTOR + ) + ) f.write(chunk) - print("Downloaded" + " igb_" + dataset_type + "_" + dataset_size, end=" ->") + print( + "Downloaded" + " igb_" + dataset_type + "_" + dataset_size, + end=" ->", + ) check_md5sum(dataset_type, dataset_size, filename) - else: - print("The file igb_" + dataset_type + "_" + dataset_size + ".tar.gz already exists, directly extracting...") - filename = path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" + else: + print( + "The file igb_" + + dataset_type + + "_" + + dataset_size + + ".tar.gz already exists, directly extracting..." 
+ ) + filename = ( + path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" + ) file = tarfile.open(filename) file.extractall(output_directory) file.close() size = 0 - for path, dirs, files in os.walk(output_directory+ "/" + dataset_size): + for path, dirs, files in os.walk(output_directory + "/" + dataset_size): for f in files: fp = os.path.join(path, f) size += os.path.getsize(fp) - print("Final dataset size {:.2f} GB.".format(size/GBFACTOR)) + print("Final dataset size {:.2f} GB.".format(size / GBFACTOR)) # os.remove(filename) - os.rename(output_directory+ "/" + dataset_size, output_directory+ "/" + "igb-" + dataset_type + "-" + dataset_size) - return output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size + os.rename( + output_directory + "/" + dataset_size, + output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size, + ) + return ( + output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size + ) num_nodes = { @@ -291,32 +319,32 @@ def download_dataset(path, dataset_type, dataset_size): "paper": 269346174, "author": 277220883, "institute": 26918, - "fos": 712960 - }, + "fos": 712960, + }, "large": { "paper": 100000000, "author": 116959896, "institute": 26524, - "fos": 649707 + "fos": 649707, }, "medium": { - "paper": 10000000, + "paper": 10000000, "author": 15544654, "institute": 23256, - "fos": 415054 - }, - "small": { + "fos": 415054, + }, + "small": { "paper": 1000000, "author": 1926066, "institute": 14751, - "fos": 190449 + "fos": 190449, }, "tiny": { "paper": 100000, "author": 357041, "institute": 8738, - "fos": 84220 - } + "fos": 84220, + }, } num_edges = { @@ -324,32 +352,32 @@ def download_dataset(path, dataset_type, dataset_size): "paper__cites__paper": 3996442004, "paper__written_by__author": 716761549, "paper__topic__fos": 1050280600, - "author__affiliated_to__institute": 48521486 + "author__affiliated_to__institute": 48521486, }, "large": { "paper__cites__paper": 1223571364, "paper__written_by__author": 289502107, "paper__topic__fos": 457577294, - "author__affiliated_to__institute": 34099660 + "author__affiliated_to__institute": 34099660, }, "medium": { "paper__cites__paper": 120077694, "paper__written_by__author": 39854592, "paper__topic__fos": 68510495, - "author__affiliated_to__institute": 11049412 + "author__affiliated_to__institute": 11049412, }, "small": { "paper__cites__paper": 12070502, "paper__written_by__author": 4553516, "paper__topic__fos": 7234122, - "author__affiliated_to__institute": 1630476 - }, + "author__affiliated_to__institute": 1630476, + }, "tiny": { "paper__cites__paper": 447416, "paper__written_by__author": 471443, "paper__topic__fos": 718445, - "author__affiliated_to__institute": 325410 - } + "author__affiliated_to__institute": 325410, + }, } @@ -374,11 +402,13 @@ def split_data(label_path, set_dir, dataset_size): validation_labels = labels[train_end:validation_end] test_labels = labels[validation_end:] print(train_labels, len(train_labels)) - print(validation_labels,len(validation_labels)) + print(validation_labels, len(validation_labels)) print(test_labels, len(test_labels)) gb.numpy_save_aligned(f"{set_dir}/train_indices.npy", train_indices) - gb.numpy_save_aligned(f"{set_dir}/validation_indices.npy", validation_indices) + gb.numpy_save_aligned( + f"{set_dir}/validation_indices.npy", validation_indices + ) gb.numpy_save_aligned(f"{set_dir}/test_indices.npy", test_indices) gb.numpy_save_aligned(f"{set_dir}/train_labels.npy", train_labels) gb.numpy_save_aligned(f"{set_dir}/validation_labels.npy", 
validation_labels) @@ -388,7 +418,7 @@ def split_data(label_path, set_dir, dataset_size): def add_edges(edges, source, dest, dataset_size): for edge in edges: print(f"\t Processing {edge} edge...") - + old_edge_path = source + "/" + edge + "/" + "edge_index.npy" new_edge_path = dest + "/" + edge + ".npy" os.rename(src=old_edge_path, dst=new_edge_path) @@ -397,7 +427,7 @@ def add_edges(edges, source, dest, dataset_size): edge_array = np.load(new_edge_path) new_edge_array = edge_array.transpose() - assert(new_edge_array.shape == (2, num_edges[dataset_size][edge])) + assert new_edge_array.shape == (2, num_edges[dataset_size][edge]) np.save(new_edge_path, new_edge_array) @@ -405,25 +435,30 @@ def add_edges(edges, source, dest, dataset_size): def process_feat(file_path, node_name, dataset_size): # array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) array = np.load(file_path) - assert(array.shape == (num_nodes[dataset_size][node_name], 1024)) + assert array.shape == (num_nodes[dataset_size][node_name], 1024) gb.numpy_save_aligned(file_path, array) # Assert the shape and elements of the array are correct # new_array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) new_array = np.load(file_path) - assert(array.shape == (num_nodes[dataset_size][node_name], 1024)) - assert(np.array_equal(array, new_array)) - + assert array.shape == (num_nodes[dataset_size][node_name], 1024) + assert np.array_equal(array, new_array) + def process_label(file_path, num_class, dataset_size): - if num_class == 2983 and dataset_size == "full": # only this case label number changes + if ( + num_class == 2983 and dataset_size == "full" + ): # only this case label number changes # array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) array = np.load(file_path) - assert(array.shape == (227130858, 1) or array.shape == (227130858,)) + assert array.shape == (227130858, 1) or array.shape == (227130858,) else: # array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) array = np.load(file_path) - assert(array.shape == (num_nodes[dataset_size]["paper"], 1) or array.shape == (num_nodes[dataset_size]["paper"],)) + assert array.shape == ( + num_nodes[dataset_size]["paper"], + 1, + ) or array.shape == (num_nodes[dataset_size]["paper"],) gb.numpy_save_aligned(file_path, array) @@ -431,13 +466,18 @@ def process_label(file_path, num_class, dataset_size): if num_class == 2983 and dataset_size == "full": # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) new_array = np.load(file_path) - assert(new_array.shape == (227130858, 1) or new_array.shape == (227130858,)) - assert(np.array_equal(array, new_array)) - else: + assert new_array.shape == (227130858, 1) or new_array.shape == ( + 227130858, + ) + assert np.array_equal(array, new_array) + else: # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) new_array = np.load(file_path) - assert(new_array.shape == (num_nodes[dataset_size]["paper"], 1) or new_array.shape == (num_nodes[dataset_size]["paper"],)) - assert(np.array_equal(array, new_array)) + assert new_array.shape == ( + num_nodes[dataset_size]["paper"], + 1, + ) or new_array.shape == (num_nodes[dataset_size]["paper"],) + assert np.array_equal(array, new_array) def add_nodes(nodes, source, dest, dataset_size): @@ -446,18 +486,28 @@ def add_nodes(nodes, source, dest, dataset_size): 
old_node_path = source + "/" + node + "/" + "node_feat.npy" new_node_path = dest + "/" + node + "_feat.npy" os.rename(src=old_node_path, dst=new_node_path) - process_feat(file_path=new_node_path, node_name=node, dataset_size=dataset_size) + process_feat( + file_path=new_node_path, node_name=node, dataset_size=dataset_size + ) if node == "paper": print(f"\t Processing {node} labels...") old_label_path_19 = source + "/" + node + "/" + "node_label_19.npy" new_label_path_19 = dest + "/" + "paper_label_19.npy" os.rename(src=old_label_path_19, dst=new_label_path_19) - process_label(file_path=new_label_path_19, num_class=19, dataset_size=dataset_size) + process_label( + file_path=new_label_path_19, + num_class=19, + dataset_size=dataset_size, + ) old_label_path_2K = source + "/" + node + "/" + "node_label_2K.npy" new_label_path_2K = dest + "/" + "paper_label_2K.npy" os.rename(src=old_label_path_2K, dst=new_label_path_2K) - process_label(file_path=new_label_path_19, num_class=2983, dataset_size=dataset_size) + process_label( + file_path=new_label_path_19, + num_class=2983, + dataset_size=dataset_size, + ) return new_label_path_19, new_label_path_2K @@ -475,19 +525,21 @@ def process_dataset(path, dataset_size): node_dir = processed_dir + "/" + "data" os.makedirs(name=node_dir, exist_ok=True) # These are the four nodes in this citation network - nodes = [ - "paper", - "author", - "fos", - "institute" - ] - label_file_19, label_file_2K = add_nodes(nodes=nodes, source=original_path, dest=node_dir, dataset_size=dataset_size) + nodes = ["paper", "author", "fos", "institute"] + label_file_19, label_file_2K = add_nodes( + nodes=nodes, + source=original_path, + dest=node_dir, + dataset_size=dataset_size, + ) # Step 2: Create labels print("Processing train/valid/test files...") set_dir = processed_dir + "/" + "set" os.makedirs(name=set_dir, exist_ok=True) - split_data(label_path=label_file_19, set_dir=set_dir, dataset_size=dataset_size) + split_data( + label_path=label_file_19, set_dir=set_dir, dataset_size=dataset_size + ) # Step 3: Move edge files print("Processing Edge files...") @@ -498,9 +550,14 @@ def process_dataset(path, dataset_size): "paper__cites__paper", "paper__written_by__author", "paper__topic__fos", - "author__affiliated_to__institute" + "author__affiliated_to__institute", ] - add_edges(edges=edges, source=original_path, dest=edge_dir, dataset_size=dataset_size) + add_edges( + edges=edges, + source=original_path, + dest=edge_dir, + dataset_size=dataset_size, + ) # Step 4: Build the yaml file print("Building yaml file...") @@ -510,16 +567,30 @@ def process_dataset(path, dataset_size): print(f"Finished processing the {dataset_size} dataset") -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--path', type=str, default='datasets/', - help='path to store the datasets') - parser.add_argument('--type', type=str, default='heterogeneous', - choices=['homogeneous', 'heterogeneous'], - help='dataset type') - parser.add_argument('--size', type=str, default='tiny', - choices=['tiny', 'small', 'medium', 'large', 'full'], - help='size of the datasets') - args = parser.parse_args() - path = download_dataset(path=args.path, dataset_type=args.type, dataset_size=args.size) + parser.add_argument( + "--path", + type=str, + default="datasets/", + help="path to store the datasets", + ) + parser.add_argument( + "--type", + type=str, + default="heterogeneous", + choices=["homogeneous", "heterogeneous"], + help="dataset type", + ) + parser.add_argument( 
+ "--size", + type=str, + default="tiny", + choices=["tiny", "small", "medium", "large", "full"], + help="size of the datasets", + ) + args = parser.parse_args() + path = download_dataset( + path=args.path, dataset_type=args.type, dataset_size=args.size + ) process_dataset(path=path, dataset_size=args.size) diff --git a/examples/graphbolt/rgcn/evaluator.py b/examples/graphbolt/rgcn/evaluator.py index 6598355b0727..cbded66c264d 100644 --- a/examples/graphbolt/rgcn/evaluator.py +++ b/examples/graphbolt/rgcn/evaluator.py @@ -1,9 +1,11 @@ import numpy as np + try: import torch except ImportError: torch = None + ### Evaluator for node property prediction class IGB_Evaluator: def __init__(self, name, num_tasks, eval_metric): @@ -11,20 +13,19 @@ def __init__(self, name, num_tasks, eval_metric): self.num_tasks = num_tasks self.eval_metric = eval_metric - def _parse_and_check_input(self, input_dict): - if self.eval_metric == 'acc': - if not 'y_true' in input_dict: - raise RuntimeError('Missing key of y_true') - if not 'y_pred' in input_dict: - raise RuntimeError('Missing key of y_pred') + if self.eval_metric == "acc": + if not "y_true" in input_dict: + raise RuntimeError("Missing key of y_true") + if not "y_pred" in input_dict: + raise RuntimeError("Missing key of y_pred") - y_true, y_pred = input_dict['y_true'], input_dict['y_pred'] + y_true, y_pred = input_dict["y_true"], input_dict["y_pred"] - ''' + """ y_true: numpy ndarray or torch tensor of shape (num_nodes num_tasks) y_pred: numpy ndarray or torch tensor of shape (num_nodes num_tasks) - ''' + """ # converting to torch.Tensor to numpy on cpu if torch is not None and isinstance(y_true, torch.Tensor): @@ -34,54 +35,74 @@ def _parse_and_check_input(self, input_dict): y_pred = y_pred.detach().cpu().numpy() ## check type - if not (isinstance(y_true, np.ndarray) and isinstance(y_true, np.ndarray)): - raise RuntimeError('Arguments to Evaluator need to be either numpy ndarray or torch tensor') + if not ( + isinstance(y_true, np.ndarray) + and isinstance(y_true, np.ndarray) + ): + raise RuntimeError( + "Arguments to Evaluator need to be either numpy ndarray or torch tensor" + ) if not y_true.shape == y_pred.shape: - raise RuntimeError('Shape of y_true and y_pred must be the same') + raise RuntimeError( + "Shape of y_true and y_pred must be the same" + ) if not y_true.ndim == 2: - raise RuntimeError('y_true and y_pred must to 2-dim arrray, {}-dim array given'.format(y_true.ndim)) + raise RuntimeError( + "y_true and y_pred must to 2-dim arrray, {}-dim array given".format( + y_true.ndim + ) + ) if not y_true.shape[1] == self.num_tasks: - raise RuntimeError('Number of tasks for {} should be {} but {} given'.format(self.name, self.num_tasks, y_true.shape[1])) + raise RuntimeError( + "Number of tasks for {} should be {} but {} given".format( + self.name, self.num_tasks, y_true.shape[1] + ) + ) return y_true, y_pred else: - raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) - + raise ValueError("Undefined eval metric %s " % (self.eval_metric)) def eval(self, input_dict): - if self.eval_metric == 'acc': + if self.eval_metric == "acc": y_true, y_pred = self._parse_and_check_input(input_dict) return self._eval_acc(y_true, y_pred) else: - raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) + raise ValueError("Undefined eval metric %s " % (self.eval_metric)) @property def expected_input_format(self): - desc = '==== Expected input format of Evaluator for {}\n'.format(self.name) - if self.eval_metric == 'acc': - desc += '{\'y_true\': 
y_true, \'y_pred\': y_pred}\n' - desc += '- y_true: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n' - desc += '- y_pred: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n' - desc += 'where y_pred stores predicted class label (integer),\n' - desc += 'num_task is {}, and '.format(self.num_tasks) - desc += 'each row corresponds to one node.\n' + desc = "==== Expected input format of Evaluator for {}\n".format( + self.name + ) + if self.eval_metric == "acc": + desc += "{'y_true': y_true, 'y_pred': y_pred}\n" + desc += "- y_true: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n" + desc += "- y_pred: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n" + desc += "where y_pred stores predicted class label (integer),\n" + desc += "num_task is {}, and ".format(self.num_tasks) + desc += "each row corresponds to one node.\n" else: - raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) + raise ValueError("Undefined eval metric %s " % (self.eval_metric)) return desc @property def expected_output_format(self): - desc = '==== Expected output format of Evaluator for {}\n'.format(self.name) - if self.eval_metric == 'acc': - desc += '{\'acc\': acc}\n' - desc += '- acc (float): Accuracy score averaged across {} task(s)\n'.format(self.num_tasks) + desc = "==== Expected output format of Evaluator for {}\n".format( + self.name + ) + if self.eval_metric == "acc": + desc += "{'acc': acc}\n" + desc += "- acc (float): Accuracy score averaged across {} task(s)\n".format( + self.num_tasks + ) else: - raise ValueError('Undefined eval metric %s ' % (self.eval_metric)) + raise ValueError("Undefined eval metric %s " % (self.eval_metric)) return desc @@ -89,9 +110,8 @@ def _eval_acc(self, y_true, y_pred): acc_list = [] for i in range(y_true.shape[1]): - is_labeled = y_true[:,i] == y_true[:,i] - correct = y_true[is_labeled,i] == y_pred[is_labeled,i] - acc_list.append(float(np.sum(correct))/len(correct)) - - return {'acc': sum(acc_list)/len(acc_list)} + is_labeled = y_true[:, i] == y_true[:, i] + correct = y_true[is_labeled, i] == y_pred[is_labeled, i] + acc_list.append(float(np.sum(correct)) / len(correct)) + return {"acc": sum(acc_list) / len(acc_list)} diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index bb12b31e2069..7516e16a857a 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -58,8 +58,8 @@ import torch.nn as nn import torch.nn.functional as F from dgl.nn import HeteroEmbedding -from ogb.lsc import MAG240MEvaluator from evaluator import IGB_Evaluator +from ogb.lsc import MAG240MEvaluator from ogb.nodeproppred import Evaluator from tqdm import tqdm @@ -595,7 +595,6 @@ def main(args): # `institution` are generated in advance and stored in the feature store. # For `ogbn-mag`, we generate the features on the fly. embed_layer = None - # if args.dataset == "ogbn-mag": if args.dataset == "ogbn-mag" or "igb-heterogeneous" in args.dataset: # Create the embedding layer and move it to the appropriate device. 
embed_layer = rel_graph_embed(g, feat_size).to(device) @@ -671,9 +670,15 @@ def main(args): "--dataset", type=str, default="ogbn-mag", - choices=["ogbn-mag", "ogb-lsc-mag240m", "igb-heterogeneous-tiny", - "igb-heterogeneous-small", "igb-heterogeneous-medium", - "igb-heterogeneous-large", "igb-heterogeneous-full"], + choices=[ + "ogbn-mag", + "ogb-lsc-mag240m", + "igb-heterogeneous-tiny", + "igb-heterogeneous-small", + "igb-heterogeneous-medium", + "igb-heterogeneous-large", + "igb-heterogeneous-full", + ], help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m", ) parser.add_argument("--num_epochs", type=int, default=3) From 6bf10cb47d0765488e58646798254c4f3278b62c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 20:40:11 +0000 Subject: [PATCH 05/35] added documentation --- examples/graphbolt/rgcn/download.py | 116 +++++++++++++++------------- 1 file changed, 64 insertions(+), 52 deletions(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index 4724061476b5..9f3b51ceb6b7 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -9,24 +9,21 @@ GBFACTOR = float(1 << 30) -def _get_size(file_path, node_name): - if "full" in file_path: - return num_nodes["full"][node_name] - if "large" in file_path: - return num_nodes["large"][node_name] - path = f"{file_path}/processed/{node_name}/{node_name}_id_index_mapping.npy" - array = np.load(path, allow_pickle=True) - return len(array.item()) - - -def build_yaml_helper(path, in_memory=True): +def build_yaml_helper(path, dataset_size, in_memory=True): + """The stirng to build the yaml file. (Still need modification)""" data = { "graph": { "nodes": [ - {"num": _get_size(path, "paper"), "type": "paper"}, - {"num": _get_size(path, "author"), "type": "author"}, - {"num": _get_size(path, "institute"), "type": "institution"}, - {"num": _get_size(path, "fos"), "type": "field_of_study"}, + {"num": num_nodes[dataset_size]["paper"], "type": "paper"}, + {"num": num_nodes[dataset_size]["author"], "type": "author"}, + { + "num": num_nodes[dataset_size]["institute"], + "type": "institution", + }, + { + "num": num_nodes[dataset_size]["fos"], + "type": "field_of_study", + }, ], "edges": [ { @@ -170,28 +167,18 @@ def build_yaml_helper(path, in_memory=True): return data -def build_yaml(original_path, current_path): - if "large" in current_path or "full" in current_path: - data = build_yaml_helper(original_path, in_memory=False) - else: - data = build_yaml_helper(original_path) - with open(f"{current_path}/metadata.yaml", "w") as file: - yaml.dump(data, file, default_flow_style=False) - - -def decide_download(url): - d = ur.urlopen(url) - size = int(d.info()["Content-Length"]) / GBFACTOR - ### confirm if larger than 1GB - if size > 1: - return ( - input( - "This will download %.2fGB. Will you proceed? (y/N) " % (size) - ).lower() - == "y" +def build_yaml(original_path, current_path, dataset_size): + """This build the yaml file differently based on the dataset size. + The two large datasets are put in disk while the other three smaller versions are in memory. 
+ """ + if "large" == dataset_size or "full" == dataset_size: + data = build_yaml_helper( + path=original_path, dataset_size=dataset_size, in_memory=False ) else: - return True + data = build_yaml_helper(path=original_path, dataset_size=dataset_size) + with open(f"{current_path}/metadata.yaml", "w") as file: + yaml.dump(data=data, stream=file, default_flow_style=False) dataset_urls = { @@ -222,7 +209,24 @@ def decide_download(url): } +def decide_download(url): + """An interactive command line to confirm download.""" + d = ur.urlopen(url) + size = int(d.info()["Content-Length"]) / GBFACTOR + ### confirm if larger than 1GB + if size > 1: + return ( + input( + "This will download %.2fGB. Will you proceed? (y/N) " % (size) + ).lower() + == "y" + ) + else: + return True + + def check_md5sum(dataset_type, dataset_size, filename): + """This is for checking the data correctness of the downloaded datasets.""" original_md5 = md5checksums[dataset_type][dataset_size] with open(filename, "rb") as file_to_check: @@ -238,11 +242,15 @@ def check_md5sum(dataset_type, dataset_size, filename): def download_dataset(path, dataset_type, dataset_size): + """This is the script to download all the related datasets.""" + + # For large datasets, use the two shell scripts to download. if dataset_size in ["large", "full"]: command = f"./download_{dataset_size}_igbh.sh" subprocess.run(["bash", command], check=True, text=True) shutil.move(src=f"igb-{dataset_type}-{dataset_size}", dst=f"{path}") return path + "/" + "igb-" + dataset_type + "-" + dataset_size + # For the three smaller version, use the url to download. else: output_directory = path if not os.path.exists( @@ -284,7 +292,7 @@ def download_dataset(path, dataset_type, dataset_size): end=" ->", ) check_md5sum(dataset_type, dataset_size, filename) - else: + else: # No need to download the tar file again if it is already downloaded. 
print( "The file igb_" + dataset_type @@ -295,6 +303,7 @@ def download_dataset(path, dataset_type, dataset_size): filename = ( path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" ) + # Extract the tar file file = tarfile.open(filename) file.extractall(output_directory) file.close() @@ -382,6 +391,7 @@ def download_dataset(path, dataset_type, dataset_size): def split_data(label_path, set_dir, dataset_size): + """This is for splitting the labels into three sets: train, validation, and test sets.""" # labels = np.memmap(label_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) labels = np.load(label_path) @@ -416,6 +426,7 @@ def split_data(label_path, set_dir, dataset_size): def add_edges(edges, source, dest, dataset_size): + """This is for processing the edges in the graph and convert them to correct shape.""" for edge in edges: print(f"\t Processing {edge} edge...") @@ -428,11 +439,13 @@ def add_edges(edges, source, dest, dataset_size): new_edge_array = edge_array.transpose() assert new_edge_array.shape == (2, num_edges[dataset_size][edge]) + assert np.array_equal(edge_array, new_edge_array.transpose()) - np.save(new_edge_path, new_edge_array) + gb.numpy_save_aligned(new_edge_path, new_edge_array) def process_feat(file_path, node_name, dataset_size): + """This is for processing the node features.""" # array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) array = np.load(file_path) assert array.shape == (num_nodes[dataset_size][node_name], 1024) @@ -441,24 +454,22 @@ def process_feat(file_path, node_name, dataset_size): # Assert the shape and elements of the array are correct # new_array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) new_array = np.load(file_path) - assert array.shape == (num_nodes[dataset_size][node_name], 1024) + assert new_array.shape == (num_nodes[dataset_size][node_name], 1024) assert np.array_equal(array, new_array) def process_label(file_path, num_class, dataset_size): + """This is for processing the node labels.""" if ( num_class == 2983 and dataset_size == "full" ): # only this case label number changes # array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) array = np.load(file_path) - assert array.shape == (227130858, 1) or array.shape == (227130858,) + assert array.shape[0] == 227130858 else: # array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) array = np.load(file_path) - assert array.shape == ( - num_nodes[dataset_size]["paper"], - 1, - ) or array.shape == (num_nodes[dataset_size]["paper"],) + assert array.shape[0] == num_nodes[dataset_size]["paper"] gb.numpy_save_aligned(file_path, array) @@ -466,21 +477,17 @@ def process_label(file_path, num_class, dataset_size): if num_class == 2983 and dataset_size == "full": # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) new_array = np.load(file_path) - assert new_array.shape == (227130858, 1) or new_array.shape == ( - 227130858, - ) + assert new_array.shape[0] == 227130858 assert np.array_equal(array, new_array) else: # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) new_array = np.load(file_path) - assert new_array.shape == ( - num_nodes[dataset_size]["paper"], - 1, - ) or new_array.shape == (num_nodes[dataset_size]["paper"],) + assert new_array.shape[0] == num_nodes[dataset_size]["paper"] assert np.array_equal(array, 
new_array) def add_nodes(nodes, source, dest, dataset_size): + """This is for processing the nodes in the graph and store them in correct format.""" for node in nodes: print(f"\t Processing {node} node feature...") old_node_path = source + "/" + node + "/" + "node_feat.npy" @@ -489,6 +496,7 @@ def add_nodes(nodes, source, dest, dataset_size): process_feat( file_path=new_node_path, node_name=node, dataset_size=dataset_size ) + # If the node is a paper type, process the labels if node == "paper": print(f"\t Processing {node} labels...") old_label_path_19 = source + "/" + node + "/" + "node_label_19.npy" @@ -515,7 +523,7 @@ def add_nodes(nodes, source, dest, dataset_size): def process_dataset(path, dataset_size): print(f"Starting to process the {dataset_size} dataset...") - # Make the directory for processed dataset + # Step 0: Make the directory for processed dataset processed_dir = path + "-seeds" os.makedirs(name=processed_dir, exist_ok=True) original_path = path + "/" + "processed" @@ -561,7 +569,11 @@ def process_dataset(path, dataset_size): # Step 4: Build the yaml file print("Building yaml file...") - build_yaml(original_path=path, current_path=processed_dir) + build_yaml( + original_path=path, + current_path=processed_dir, + dataset_size=dataset_size, + ) # shutil.rmtree(path) print(f"Finished processing the {dataset_size} dataset") From 42717af24f64513f9932bddab489cba564813ada Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:01:15 -0700 Subject: [PATCH 06/35] Update examples/graphbolt/rgcn/download.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/rgcn/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index 9f3b51ceb6b7..a4067b9921ce 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -396,7 +396,7 @@ def split_data(label_path, set_dir, dataset_size): labels = np.load(label_path) total_samples = len(labels) - train_end = int(0.8 * total_samples) + train_end = int(0.6 * total_samples) validation_end = int(0.9 * total_samples) indices = np.arange(total_samples) From 06f3f293027a3932c61ddca0368fe82c874083fe Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:01:22 -0700 Subject: [PATCH 07/35] Update examples/graphbolt/rgcn/download.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/rgcn/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index a4067b9921ce..ce50e82bf12e 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -6,7 +6,7 @@ import numpy as np from tqdm import tqdm -GBFACTOR = float(1 << 30) +GBFACTOR = 1 << 30 def build_yaml_helper(path, dataset_size, in_memory=True): From 19f18a581a9584c248348d779270e026cb4302b7 Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:01:41 -0700 Subject: [PATCH 08/35] Update examples/graphbolt/rgcn/download.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/rgcn/download.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index ce50e82bf12e..7ec369bbfe99 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -480,6 +480,7 @@ def 
process_label(file_path, num_class, dataset_size): assert new_array.shape[0] == 227130858 assert np.array_equal(array, new_array) else: + assert num_class == 19 # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) new_array = np.load(file_path) assert new_array.shape[0] == num_nodes[dataset_size]["paper"] From 97c17352f978ef6b2adb1792528dcd85f1dab9d0 Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:02:06 -0700 Subject: [PATCH 09/35] Update examples/graphbolt/rgcn/download.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/rgcn/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index 7ec369bbfe99..e3513019b414 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -397,7 +397,7 @@ def split_data(label_path, set_dir, dataset_size): total_samples = len(labels) train_end = int(0.6 * total_samples) - validation_end = int(0.9 * total_samples) + validation_end = int(0.8 * total_samples) indices = np.arange(total_samples) train_indices = indices[:train_end] From 93cb70f18139ff70060f692776294a2836a430ec Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 22:35:24 +0000 Subject: [PATCH 10/35] added 2983 class task --- examples/graphbolt/rgcn/download.py | 113 +++++++++++++++++++++++----- 1 file changed, 96 insertions(+), 17 deletions(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index e3513019b414..787060c33819 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -56,13 +56,13 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "data": [ { "in_memory": in_memory, - "path": "set/validation_indices.npy", + "path": "set/validation_indices_19.npy", "name": "seeds", "format": "numpy", }, { "in_memory": in_memory, - "path": "set/validation_labels.npy", + "path": "set/validation_labels_19.npy", "name": "labels", "format": "numpy", }, @@ -70,19 +70,19 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "type": "paper", } ], - "name": "node_classification", + "name": "node_classification_19", "train_set": [ { "data": [ { "in_memory": in_memory, - "path": "set/train_indices.npy", + "path": "set/train_indices_19.npy", "name": "seeds", "format": "numpy", }, { "in_memory": in_memory, - "path": "set/train_labels.npy", + "path": "set/train_labels_19.npy", "name": "labels", "format": "numpy", }, @@ -95,13 +95,13 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "data": [ { "in_memory": in_memory, - "path": "set/test_indices.npy", + "path": "set/test_indices_19.npy", "name": "seeds", "format": "numpy", }, { "in_memory": in_memory, - "path": "set/test_labels.npy", + "path": "set/test_labels_19.npy", "name": "labels", "format": "numpy", }, @@ -109,7 +109,68 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "type": "paper", } ], - } + }, + { + "num_classes": 2983, + "validation_set": [ + { + "data": [ + { + "in_memory": in_memory, + "path": "set/validation_indices_2983.npy", + "name": "seeds", + "format": "numpy", + }, + { + "in_memory": in_memory, + "path": "set/validation_labels_2983.npy", + "name": "labels", + "format": "numpy", + }, + ], + "type": "paper", + } + ], + "name": "node_classification_2K", + "train_set": [ + { + "data": [ + { + "in_memory": in_memory, + "path": "set/train_indices_2983.npy", + "name": 
"seeds", + "format": "numpy", + }, + { + "in_memory": in_memory, + "path": "set/train_labels_2983.npy", + "name": "labels", + "format": "numpy", + }, + ], + "type": "paper", + } + ], + "test_set": [ + { + "data": [ + { + "in_memory": in_memory, + "path": "set/test_indices_2983.npy", + "name": "seeds", + "format": "numpy", + }, + { + "in_memory": in_memory, + "path": "set/test_labels_2983.npy", + "name": "labels", + "format": "numpy", + }, + ], + "type": "paper", + } + ], + }, ], "feature_data": [ { @@ -390,7 +451,7 @@ def download_dataset(path, dataset_type, dataset_size): } -def split_data(label_path, set_dir, dataset_size): +def split_data(label_path, set_dir, dataset_size, class_num): """This is for splitting the labels into three sets: train, validation, and test sets.""" # labels = np.memmap(label_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) labels = np.load(label_path) @@ -415,14 +476,24 @@ def split_data(label_path, set_dir, dataset_size): print(validation_labels, len(validation_labels)) print(test_labels, len(test_labels)) - gb.numpy_save_aligned(f"{set_dir}/train_indices.npy", train_indices) gb.numpy_save_aligned( - f"{set_dir}/validation_indices.npy", validation_indices + f"{set_dir}/train_indices_{class_num}.npy", train_indices + ) + gb.numpy_save_aligned( + f"{set_dir}/validation_indices_{class_num}.npy", validation_indices + ) + gb.numpy_save_aligned( + f"{set_dir}/test_indices_{class_num}.npy", test_indices + ) + gb.numpy_save_aligned( + f"{set_dir}/train_labels_{class_num}.npy", train_labels + ) + gb.numpy_save_aligned( + f"{set_dir}/validation_labels_{class_num}.npy", validation_labels + ) + gb.numpy_save_aligned( + f"{set_dir}/test_labels_{class_num}.npy", test_labels ) - gb.numpy_save_aligned(f"{set_dir}/test_indices.npy", test_indices) - gb.numpy_save_aligned(f"{set_dir}/train_labels.npy", train_labels) - gb.numpy_save_aligned(f"{set_dir}/validation_labels.npy", validation_labels) - gb.numpy_save_aligned(f"{set_dir}/test_labels.npy", test_labels) def add_edges(edges, source, dest, dataset_size): @@ -480,7 +551,6 @@ def process_label(file_path, num_class, dataset_size): assert new_array.shape[0] == 227130858 assert np.array_equal(array, new_array) else: - assert num_class == 19 # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) new_array = np.load(file_path) assert new_array.shape[0] == num_nodes[dataset_size]["paper"] @@ -547,7 +617,16 @@ def process_dataset(path, dataset_size): set_dir = processed_dir + "/" + "set" os.makedirs(name=set_dir, exist_ok=True) split_data( - label_path=label_file_19, set_dir=set_dir, dataset_size=dataset_size + label_path=label_file_19, + set_dir=set_dir, + dataset_size=dataset_size, + class_num=19, + ) + split_data( + label_path=label_file_2K, + set_dir=set_dir, + dataset_size=dataset_size, + class_num=2983, ) # Step 3: Move edge files From b170a903ddd5889aa2addfe215fe953777cef023 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 15 Aug 2024 22:39:01 +0000 Subject: [PATCH 11/35] fix lint --- examples/graphbolt/rgcn/download.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index 787060c33819..1304fcfac572 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -491,9 +491,7 @@ def split_data(label_path, set_dir, dataset_size, class_num): gb.numpy_save_aligned( f"{set_dir}/validation_labels_{class_num}.npy", validation_labels ) 
- gb.numpy_save_aligned( - f"{set_dir}/test_labels_{class_num}.npy", test_labels - ) + gb.numpy_save_aligned(f"{set_dir}/test_labels_{class_num}.npy", test_labels) def add_edges(edges, source, dest, dataset_size): From 55079b8507c9d1bc9019510970b36369027f7d6c Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:41:38 -0700 Subject: [PATCH 12/35] Update examples/graphbolt/rgcn/download.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/rgcn/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index 1304fcfac572..dc37aef60c73 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -70,7 +70,7 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "type": "paper", } ], - "name": "node_classification_19", + "name": "node_classification", "train_set": [ { "data": [ From ce65746cad5d36ec03a07b328f06e2c0987a72fd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 16 Aug 2024 03:25:04 +0000 Subject: [PATCH 13/35] remove labels from yaml --- examples/graphbolt/rgcn/download.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index dc37aef60c73..279e60126233 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -181,14 +181,6 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "path": "data/paper_feat.npy", "type": "paper", }, - { - "domain": "node", - "name": "label", - "format": "numpy", - "in_memory": in_memory, - "path": "data/paper_label_19.npy", - "type": "paper", - }, { "domain": "node", "name": "feat", From 33229878c8ce918988ca29016c17a1ca88530cb6 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 17 Aug 2024 07:33:44 +0000 Subject: [PATCH 14/35] add doenload script --- examples/graphbolt/rgcn/download_full_igbh.sh | 98 +++++++++++++++++++ .../graphbolt/rgcn/download_large_igbh.sh | 95 ++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100755 examples/graphbolt/rgcn/download_full_igbh.sh create mode 100755 examples/graphbolt/rgcn/download_large_igbh.sh diff --git a/examples/graphbolt/rgcn/download_full_igbh.sh b/examples/graphbolt/rgcn/download_full_igbh.sh new file mode 100755 index 000000000000..1efcdb3cb0b9 --- /dev/null +++ b/examples/graphbolt/rgcn/download_full_igbh.sh @@ -0,0 +1,98 @@ +#! /bin/bash + +mkdir -p igb-dataset-full +cd igb-dataset-full +mkdir -p processed +cd processed + +echo "IGBH600M (Heteregeneous) download starting" + +# paper +mkdir -p paper +cd paper +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/node_feat.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/node_label_19.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/node_label_2K.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/paper_id_index_mapping.npy +cd .. + +# paper__cites__paper +mkdir -p paper__cites__paper +cd paper__cites__paper +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__cites__paper/edge_index.npy +cd .. + +# author +mkdir -p author +cd author +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/author/author_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/author/node_feat.npy +cd .. 
+ +# conference +mkdir -p conference +cd conference +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/conference/conference_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/conference/node_feat.npy +cd .. + +# institute +mkdir -p institute +cd institute +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/institute/institute_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/institute/node_feat.npy +cd .. + +# journal +mkdir -p journal +cd journal +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/journal/journal_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/journal/node_feat.npy +cd .. + +# fos +mkdir -p fos +cd fos +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/fos/fos_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/fos/node_feat.npy +cd .. + +# author__affiliated_to__institute +mkdir -p author__affiliated_to__institute +cd author__affiliated_to__institute +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/author__affiliated_to__institute/edge_index.npy +cd .. + +# paper__published__journal +mkdir -p paper__published__journal +cd paper__published__journal +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__published__journal/edge_index.npy +cd .. + +# paper__topic__fos +mkdir -p paper__topic__fos +cd paper__topic__fos +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__topic__fos/edge_index.npy +cd .. + +# paper__venue__conference +mkdir -p paper__venue__conference +cd paper__venue__conference +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__venue__conference/edge_index.npy +cd .. + +# paper__written_by__author +mkdir -p paper__written_by__author +cd paper__written_by__author +wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__written_by__author/edge_index.npy +cd .. + +cd ../.. + +echo "IGBH-IGBH (Heteregeneous) download complete" + + +num_paper_nodes = 269346174 +paper_node_features = np.memmap('/home/ubuntu/dgl/examples/graphbolt/rgcn/igb_dataset/igb_dataset_full/processed/paper/node_label_19.npy', dtype='float32', mode='r', shape=(num_paper_nodes,1)) +num_paper_nodes = 48521486 +paper_node_features = np.memmap('/home/ubuntu/dgl/examples/graphbolt/rgcn/datasets/igb-dataset-full-seeds/edges/author__affiliated_to__institute.npy', dtype='int32', mode='r', shape=(num_paper_nodes,2)) diff --git a/examples/graphbolt/rgcn/download_large_igbh.sh b/examples/graphbolt/rgcn/download_large_igbh.sh new file mode 100755 index 000000000000..4c36a7f8d0db --- /dev/null +++ b/examples/graphbolt/rgcn/download_large_igbh.sh @@ -0,0 +1,95 @@ +#! /bin/bash + +mkdir -p igb-heterogeneous-large/ +cd igb-heterogeneous-large/ +mkdir -p processed +cd processed + +echo "IGBH-large (Heterogeneous) download starting" + +# paper +mkdir -p paper +cd paper +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/node_feat.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/node_label_19.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/node_label_2K.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/paper_id_index_mapping.npy +cd .. 
+ +# # paper__cites__paper +# wget --recursive --no-parent https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__cites__paper/edge_index.npy + +# paper__cites__paper +mkdir -p paper__cites__paper +cd paper__cites__paper +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__cites__paper/edge_index.npy +cd .. + +# author +mkdir -p author +cd author +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/author/author_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/author/node_feat.npy +cd .. + +# conference +mkdir -p conference +cd conference +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/conference/conference_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/conference/node_feat.npy +cd .. + +# institute +mkdir -p institute +cd institute +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/institute/institute_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/institute/node_feat.npy +cd .. + +# journal +mkdir -p journal +cd journal +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/journal/journal_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/journal/node_feat.npy +cd .. + +# fos +mkdir -p fos +cd fos +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/fos/fos_id_index_mapping.npy +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/fos/node_feat.npy +cd .. + +# author__affiliated_to__institute +mkdir -p author__affiliated_to__institute +cd author__affiliated_to__institute +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/author__affiliated_to__institute/edge_index.npy +cd .. + +# paper__published__journal +mkdir -p paper__published__journal +cd paper__published__journal +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__published__journal/edge_index.npy +cd .. + +# paper__topic__fos +mkdir -p paper__topic__fos +cd paper__topic__fos +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__topic__fos/edge_index.npy +cd .. + +# paper__venue__conference +mkdir -p paper__venue__conference +cd paper__venue__conference +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__venue__conference/edge_index.npy +cd .. + +# paper__written_by__author +mkdir -p paper__written_by__author +cd paper__written_by__author +wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__written_by__author/edge_index.npy +cd .. + +cd ../.. 
+ +echo "IGBH-large (Heterogeneous) download complete" \ No newline at end of file From 0135b4b5af3c0219c2b6e3985d6549c900767ad3 Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Sat, 17 Aug 2024 08:24:57 +0000 Subject: [PATCH 15/35] corrected path in processing file --- examples/graphbolt/rgcn/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index 279e60126233..475f84592db3 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -573,7 +573,7 @@ def add_nodes(nodes, source, dest, dataset_size): new_label_path_2K = dest + "/" + "paper_label_2K.npy" os.rename(src=old_label_path_2K, dst=new_label_path_2K) process_label( - file_path=new_label_path_19, + file_path=new_label_path_2K, num_class=2983, dataset_size=dataset_size, ) From 7a35313cdd7f0f2f656e6e67f5ed00b0472ba5f2 Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Sat, 17 Aug 2024 09:28:18 +0000 Subject: [PATCH 16/35] modify yaml file builder --- examples/graphbolt/rgcn/download.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py index 475f84592db3..249f1dd176f6 100755 --- a/examples/graphbolt/rgcn/download.py +++ b/examples/graphbolt/rgcn/download.py @@ -51,6 +51,7 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "tasks": [ { "num_classes": 19, + "name": "node_classification", "validation_set": [ { "data": [ @@ -70,7 +71,6 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "type": "paper", } ], - "name": "node_classification", "train_set": [ { "data": [ @@ -112,6 +112,7 @@ def build_yaml_helper(path, dataset_size, in_memory=True): }, { "num_classes": 2983, + "name": "node_classification_2K", "validation_set": [ { "data": [ @@ -131,7 +132,6 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "type": "paper", } ], - "name": "node_classification_2K", "train_set": [ { "data": [ @@ -205,14 +205,6 @@ def build_yaml_helper(path, dataset_size, in_memory=True): "path": "data/fos_feat.npy", "type": "fos", }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/author_feat.npy", - "type": "author", - }, ], "dataset_name": os.path.basename(path), } From 422ccbee4f4dcdb87af417c24e3beeaeff018d04 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 5 Sep 2024 06:32:11 +0000 Subject: [PATCH 17/35] add igb-het-[tiny|small] --- examples/graphbolt/rgcn/download.py | 670 ------------------ examples/graphbolt/rgcn/download_full_igbh.sh | 98 --- .../graphbolt/rgcn/download_large_igbh.sh | 95 --- examples/graphbolt/rgcn/hetero_rgcn.py | 19 +- python/dgl/graphbolt/impl/ondisk_dataset.py | 16 + 5 files changed, 25 insertions(+), 873 deletions(-) delete mode 100755 examples/graphbolt/rgcn/download.py delete mode 100755 examples/graphbolt/rgcn/download_full_igbh.sh delete mode 100755 examples/graphbolt/rgcn/download_large_igbh.sh diff --git a/examples/graphbolt/rgcn/download.py b/examples/graphbolt/rgcn/download.py deleted file mode 100755 index 249f1dd176f6..000000000000 --- a/examples/graphbolt/rgcn/download.py +++ /dev/null @@ -1,670 +0,0 @@ -import argparse, hashlib, os, shutil, tarfile, yaml -import subprocess -import urllib.request as ur - -import dgl.graphbolt as gb -import numpy as np -from tqdm import tqdm - -GBFACTOR = 1 << 30 - - -def build_yaml_helper(path, dataset_size, in_memory=True): - """The stirng to build the yaml file. 
(Still need modification)""" - data = { - "graph": { - "nodes": [ - {"num": num_nodes[dataset_size]["paper"], "type": "paper"}, - {"num": num_nodes[dataset_size]["author"], "type": "author"}, - { - "num": num_nodes[dataset_size]["institute"], - "type": "institution", - }, - { - "num": num_nodes[dataset_size]["fos"], - "type": "field_of_study", - }, - ], - "edges": [ - { - "path": "edges/author__affiliated_to__institute.npy", - "type": "author:affiliated_to:institution", - "format": "numpy", - }, - { - "path": "edges/paper__written_by__author.npy", - "type": "paper:written_by:author", - "format": "numpy", - }, - { - "path": "edges/paper__cites__paper.npy", - "type": "paper:cites:paper", - "format": "numpy", - }, - { - "path": "edges/paper__topic__fos.npy", - "type": "paper:has_topic:field_of_study", - "format": "numpy", - }, - ], - }, - "tasks": [ - { - "num_classes": 19, - "name": "node_classification", - "validation_set": [ - { - "data": [ - { - "in_memory": in_memory, - "path": "set/validation_indices_19.npy", - "name": "seeds", - "format": "numpy", - }, - { - "in_memory": in_memory, - "path": "set/validation_labels_19.npy", - "name": "labels", - "format": "numpy", - }, - ], - "type": "paper", - } - ], - "train_set": [ - { - "data": [ - { - "in_memory": in_memory, - "path": "set/train_indices_19.npy", - "name": "seeds", - "format": "numpy", - }, - { - "in_memory": in_memory, - "path": "set/train_labels_19.npy", - "name": "labels", - "format": "numpy", - }, - ], - "type": "paper", - } - ], - "test_set": [ - { - "data": [ - { - "in_memory": in_memory, - "path": "set/test_indices_19.npy", - "name": "seeds", - "format": "numpy", - }, - { - "in_memory": in_memory, - "path": "set/test_labels_19.npy", - "name": "labels", - "format": "numpy", - }, - ], - "type": "paper", - } - ], - }, - { - "num_classes": 2983, - "name": "node_classification_2K", - "validation_set": [ - { - "data": [ - { - "in_memory": in_memory, - "path": "set/validation_indices_2983.npy", - "name": "seeds", - "format": "numpy", - }, - { - "in_memory": in_memory, - "path": "set/validation_labels_2983.npy", - "name": "labels", - "format": "numpy", - }, - ], - "type": "paper", - } - ], - "train_set": [ - { - "data": [ - { - "in_memory": in_memory, - "path": "set/train_indices_2983.npy", - "name": "seeds", - "format": "numpy", - }, - { - "in_memory": in_memory, - "path": "set/train_labels_2983.npy", - "name": "labels", - "format": "numpy", - }, - ], - "type": "paper", - } - ], - "test_set": [ - { - "data": [ - { - "in_memory": in_memory, - "path": "set/test_indices_2983.npy", - "name": "seeds", - "format": "numpy", - }, - { - "in_memory": in_memory, - "path": "set/test_labels_2983.npy", - "name": "labels", - "format": "numpy", - }, - ], - "type": "paper", - } - ], - }, - ], - "feature_data": [ - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/paper_feat.npy", - "type": "paper", - }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/author_feat.npy", - "type": "author", - }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/institute_feat.npy", - "type": "institute", - }, - { - "domain": "node", - "name": "feat", - "format": "numpy", - "in_memory": in_memory, - "path": "data/fos_feat.npy", - "type": "fos", - }, - ], - "dataset_name": os.path.basename(path), - } - - return data - - -def build_yaml(original_path, current_path, dataset_size): - """This build the yaml file 
differently based on the dataset size. - The two large datasets are put in disk while the other three smaller versions are in memory. - """ - if "large" == dataset_size or "full" == dataset_size: - data = build_yaml_helper( - path=original_path, dataset_size=dataset_size, in_memory=False - ) - else: - data = build_yaml_helper(path=original_path, dataset_size=dataset_size) - with open(f"{current_path}/metadata.yaml", "w") as file: - yaml.dump(data=data, stream=file, default_flow_style=False) - - -dataset_urls = { - "homogeneous": { - "tiny": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_tiny.tar.gz", - "small": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_small.tar.gz", - "medium": "https://igb-public-awsopen.s3.amazonaws.com/igb-homogeneous/igb_homogeneous_medium.tar.gz", - }, - "heterogeneous": { - "tiny": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_tiny.tar.gz", - "small": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_small.tar.gz", - "medium": "https://igb-public-awsopen.s3.amazonaws.com/igb-heterogeneous/igb_heterogeneous_medium.tar.gz", - }, -} - - -md5checksums = { - "homogeneous": { - "tiny": "34856534da55419b316d620e2d5b21be", - "small": "6781c699723529902ace0a95cafe6fe4", - "medium": "4640df4ceee46851fd18c0a44ddcc622", - }, - "heterogeneous": { - "tiny": "83fbc1091497ff92cf20afe82fae0ade", - "small": "2f42077be60a074aec24f7c60089e1bd", - "medium": "7f0df4296eca36553ff3a6a63abbd347", - }, -} - - -def decide_download(url): - """An interactive command line to confirm download.""" - d = ur.urlopen(url) - size = int(d.info()["Content-Length"]) / GBFACTOR - ### confirm if larger than 1GB - if size > 1: - return ( - input( - "This will download %.2fGB. Will you proceed? (y/N) " % (size) - ).lower() - == "y" - ) - else: - return True - - -def check_md5sum(dataset_type, dataset_size, filename): - """This is for checking the data correctness of the downloaded datasets.""" - original_md5 = md5checksums[dataset_type][dataset_size] - - with open(filename, "rb") as file_to_check: - data = file_to_check.read() - md5_returned = hashlib.md5(data).hexdigest() - - if original_md5 == md5_returned: - print(" md5sum verified.") - return - else: - os.remove(filename) - raise Exception(" md5sum verification failed!.") - - -def download_dataset(path, dataset_type, dataset_size): - """This is the script to download all the related datasets.""" - - # For large datasets, use the two shell scripts to download. - if dataset_size in ["large", "full"]: - command = f"./download_{dataset_size}_igbh.sh" - subprocess.run(["bash", command], check=True, text=True) - shutil.move(src=f"igb-{dataset_type}-{dataset_size}", dst=f"{path}") - return path + "/" + "igb-" + dataset_type + "-" + dataset_size - # For the three smaller version, use the url to download. 
- else: - output_directory = path - if not os.path.exists( - output_directory - + "igb_" - + dataset_type - + "_" - + dataset_size - + ".tar.gz" - ): - url = dataset_urls[dataset_type][dataset_size] - if decide_download(url): - data = ur.urlopen(url) - size = int(data.info()["Content-Length"]) - chunk_size = 1024 * 1024 - num_iter = int(size / chunk_size) + 2 - downloaded_size = 0 - filename = ( - path - + "/igb_" - + dataset_type - + "_" - + dataset_size - + ".tar.gz" - ) - with open(filename, "wb") as f: - pbar = tqdm(range(num_iter)) - for i in pbar: - chunk = data.read(chunk_size) - downloaded_size += len(chunk) - pbar.set_description( - "Downloaded {:.2f} GB".format( - float(downloaded_size) / GBFACTOR - ) - ) - f.write(chunk) - print( - "Downloaded" + " igb_" + dataset_type + "_" + dataset_size, - end=" ->", - ) - check_md5sum(dataset_type, dataset_size, filename) - else: # No need to download the tar file again if it is already downloaded. - print( - "The file igb_" - + dataset_type - + "_" - + dataset_size - + ".tar.gz already exists, directly extracting..." - ) - filename = ( - path + "/igb_" + dataset_type + "_" + dataset_size + ".tar.gz" - ) - # Extract the tar file - file = tarfile.open(filename) - file.extractall(output_directory) - file.close() - size = 0 - for path, dirs, files in os.walk(output_directory + "/" + dataset_size): - for f in files: - fp = os.path.join(path, f) - size += os.path.getsize(fp) - print("Final dataset size {:.2f} GB.".format(size / GBFACTOR)) - # os.remove(filename) - os.rename( - output_directory + "/" + dataset_size, - output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size, - ) - return ( - output_directory + "/" + "igb-" + dataset_type + "-" + dataset_size - ) - - -num_nodes = { - "full": { - "paper": 269346174, - "author": 277220883, - "institute": 26918, - "fos": 712960, - }, - "large": { - "paper": 100000000, - "author": 116959896, - "institute": 26524, - "fos": 649707, - }, - "medium": { - "paper": 10000000, - "author": 15544654, - "institute": 23256, - "fos": 415054, - }, - "small": { - "paper": 1000000, - "author": 1926066, - "institute": 14751, - "fos": 190449, - }, - "tiny": { - "paper": 100000, - "author": 357041, - "institute": 8738, - "fos": 84220, - }, -} - -num_edges = { - "full": { - "paper__cites__paper": 3996442004, - "paper__written_by__author": 716761549, - "paper__topic__fos": 1050280600, - "author__affiliated_to__institute": 48521486, - }, - "large": { - "paper__cites__paper": 1223571364, - "paper__written_by__author": 289502107, - "paper__topic__fos": 457577294, - "author__affiliated_to__institute": 34099660, - }, - "medium": { - "paper__cites__paper": 120077694, - "paper__written_by__author": 39854592, - "paper__topic__fos": 68510495, - "author__affiliated_to__institute": 11049412, - }, - "small": { - "paper__cites__paper": 12070502, - "paper__written_by__author": 4553516, - "paper__topic__fos": 7234122, - "author__affiliated_to__institute": 1630476, - }, - "tiny": { - "paper__cites__paper": 447416, - "paper__written_by__author": 471443, - "paper__topic__fos": 718445, - "author__affiliated_to__institute": 325410, - }, -} - - -def split_data(label_path, set_dir, dataset_size, class_num): - """This is for splitting the labels into three sets: train, validation, and test sets.""" - # labels = np.memmap(label_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) - labels = np.load(label_path) - - total_samples = len(labels) - train_end = int(0.6 * total_samples) - validation_end = int(0.8 * 
total_samples) - - indices = np.arange(total_samples) - train_indices = indices[:train_end] - validation_indices = indices[train_end:validation_end] - test_indices = indices[validation_end:] - print(indices) - print(train_indices) - print(validation_indices) - print(test_indices) - - train_labels = labels[:train_end] - validation_labels = labels[train_end:validation_end] - test_labels = labels[validation_end:] - print(train_labels, len(train_labels)) - print(validation_labels, len(validation_labels)) - print(test_labels, len(test_labels)) - - gb.numpy_save_aligned( - f"{set_dir}/train_indices_{class_num}.npy", train_indices - ) - gb.numpy_save_aligned( - f"{set_dir}/validation_indices_{class_num}.npy", validation_indices - ) - gb.numpy_save_aligned( - f"{set_dir}/test_indices_{class_num}.npy", test_indices - ) - gb.numpy_save_aligned( - f"{set_dir}/train_labels_{class_num}.npy", train_labels - ) - gb.numpy_save_aligned( - f"{set_dir}/validation_labels_{class_num}.npy", validation_labels - ) - gb.numpy_save_aligned(f"{set_dir}/test_labels_{class_num}.npy", test_labels) - - -def add_edges(edges, source, dest, dataset_size): - """This is for processing the edges in the graph and convert them to correct shape.""" - for edge in edges: - print(f"\t Processing {edge} edge...") - - old_edge_path = source + "/" + edge + "/" + "edge_index.npy" - new_edge_path = dest + "/" + edge + ".npy" - os.rename(src=old_edge_path, dst=new_edge_path) - - # edge_array = np.memmap(new_edge_path, dtype='int32', mode='r', shape=(num_edges[dataset_size][edge], 2)) - edge_array = np.load(new_edge_path) - new_edge_array = edge_array.transpose() - - assert new_edge_array.shape == (2, num_edges[dataset_size][edge]) - assert np.array_equal(edge_array, new_edge_array.transpose()) - - gb.numpy_save_aligned(new_edge_path, new_edge_array) - - -def process_feat(file_path, node_name, dataset_size): - """This is for processing the node features.""" - # array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) - array = np.load(file_path) - assert array.shape == (num_nodes[dataset_size][node_name], 1024) - gb.numpy_save_aligned(file_path, array) - - # Assert the shape and elements of the array are correct - # new_array = np.memmap(file_path, dtype='float32', mode='r', shape=(num_nodes[dataset_size][node_name], 1024)) - new_array = np.load(file_path) - assert new_array.shape == (num_nodes[dataset_size][node_name], 1024) - assert np.array_equal(array, new_array) - - -def process_label(file_path, num_class, dataset_size): - """This is for processing the node labels.""" - if ( - num_class == 2983 and dataset_size == "full" - ): # only this case label number changes - # array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) - array = np.load(file_path) - assert array.shape[0] == 227130858 - else: - # array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) - array = np.load(file_path) - assert array.shape[0] == num_nodes[dataset_size]["paper"] - - gb.numpy_save_aligned(file_path, array) - - # Assert the shape and elements of the array are correct - if num_class == 2983 and dataset_size == "full": - # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(227130858, 1)) - new_array = np.load(file_path) - assert new_array.shape[0] == 227130858 - assert np.array_equal(array, new_array) - else: - # new_array = np.memmap(file_path, dtype='int32', mode='r', shape=(num_nodes[dataset_size]["paper"], 1)) - new_array = 
np.load(file_path) - assert new_array.shape[0] == num_nodes[dataset_size]["paper"] - assert np.array_equal(array, new_array) - - -def add_nodes(nodes, source, dest, dataset_size): - """This is for processing the nodes in the graph and store them in correct format.""" - for node in nodes: - print(f"\t Processing {node} node feature...") - old_node_path = source + "/" + node + "/" + "node_feat.npy" - new_node_path = dest + "/" + node + "_feat.npy" - os.rename(src=old_node_path, dst=new_node_path) - process_feat( - file_path=new_node_path, node_name=node, dataset_size=dataset_size - ) - # If the node is a paper type, process the labels - if node == "paper": - print(f"\t Processing {node} labels...") - old_label_path_19 = source + "/" + node + "/" + "node_label_19.npy" - new_label_path_19 = dest + "/" + "paper_label_19.npy" - os.rename(src=old_label_path_19, dst=new_label_path_19) - process_label( - file_path=new_label_path_19, - num_class=19, - dataset_size=dataset_size, - ) - - old_label_path_2K = source + "/" + node + "/" + "node_label_2K.npy" - new_label_path_2K = dest + "/" + "paper_label_2K.npy" - os.rename(src=old_label_path_2K, dst=new_label_path_2K) - process_label( - file_path=new_label_path_2K, - num_class=2983, - dataset_size=dataset_size, - ) - - return new_label_path_19, new_label_path_2K - - -def process_dataset(path, dataset_size): - print(f"Starting to process the {dataset_size} dataset...") - - # Step 0: Make the directory for processed dataset - processed_dir = path + "-seeds" - os.makedirs(name=processed_dir, exist_ok=True) - original_path = path + "/" + "processed" - - # Step 1: Move Nodes files - print("Processing Node files...") - node_dir = processed_dir + "/" + "data" - os.makedirs(name=node_dir, exist_ok=True) - # These are the four nodes in this citation network - nodes = ["paper", "author", "fos", "institute"] - label_file_19, label_file_2K = add_nodes( - nodes=nodes, - source=original_path, - dest=node_dir, - dataset_size=dataset_size, - ) - - # Step 2: Create labels - print("Processing train/valid/test files...") - set_dir = processed_dir + "/" + "set" - os.makedirs(name=set_dir, exist_ok=True) - split_data( - label_path=label_file_19, - set_dir=set_dir, - dataset_size=dataset_size, - class_num=19, - ) - split_data( - label_path=label_file_2K, - set_dir=set_dir, - dataset_size=dataset_size, - class_num=2983, - ) - - # Step 3: Move edge files - print("Processing Edge files...") - edge_dir = processed_dir + "/" + "edges" - os.makedirs(name=edge_dir, exist_ok=True) - # These are the four edges in this citation network - edges = [ - "paper__cites__paper", - "paper__written_by__author", - "paper__topic__fos", - "author__affiliated_to__institute", - ] - add_edges( - edges=edges, - source=original_path, - dest=edge_dir, - dataset_size=dataset_size, - ) - - # Step 4: Build the yaml file - print("Building yaml file...") - build_yaml( - original_path=path, - current_path=processed_dir, - dataset_size=dataset_size, - ) - - # shutil.rmtree(path) - print(f"Finished processing the {dataset_size} dataset") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--path", - type=str, - default="datasets/", - help="path to store the datasets", - ) - parser.add_argument( - "--type", - type=str, - default="heterogeneous", - choices=["homogeneous", "heterogeneous"], - help="dataset type", - ) - parser.add_argument( - "--size", - type=str, - default="tiny", - choices=["tiny", "small", "medium", "large", "full"], - help="size of the datasets", - ) 
- args = parser.parse_args() - path = download_dataset( - path=args.path, dataset_type=args.type, dataset_size=args.size - ) - process_dataset(path=path, dataset_size=args.size) diff --git a/examples/graphbolt/rgcn/download_full_igbh.sh b/examples/graphbolt/rgcn/download_full_igbh.sh deleted file mode 100755 index 1efcdb3cb0b9..000000000000 --- a/examples/graphbolt/rgcn/download_full_igbh.sh +++ /dev/null @@ -1,98 +0,0 @@ -#! /bin/bash - -mkdir -p igb-dataset-full -cd igb-dataset-full -mkdir -p processed -cd processed - -echo "IGBH600M (Heteregeneous) download starting" - -# paper -mkdir -p paper -cd paper -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/node_feat.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/node_label_19.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/node_label_2K.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper/paper_id_index_mapping.npy -cd .. - -# paper__cites__paper -mkdir -p paper__cites__paper -cd paper__cites__paper -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__cites__paper/edge_index.npy -cd .. - -# author -mkdir -p author -cd author -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/author/author_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/author/node_feat.npy -cd .. - -# conference -mkdir -p conference -cd conference -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/conference/conference_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/conference/node_feat.npy -cd .. - -# institute -mkdir -p institute -cd institute -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/institute/institute_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/institute/node_feat.npy -cd .. - -# journal -mkdir -p journal -cd journal -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/journal/journal_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/journal/node_feat.npy -cd .. - -# fos -mkdir -p fos -cd fos -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/fos/fos_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/fos/node_feat.npy -cd .. - -# author__affiliated_to__institute -mkdir -p author__affiliated_to__institute -cd author__affiliated_to__institute -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/author__affiliated_to__institute/edge_index.npy -cd .. - -# paper__published__journal -mkdir -p paper__published__journal -cd paper__published__journal -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__published__journal/edge_index.npy -cd .. - -# paper__topic__fos -mkdir -p paper__topic__fos -cd paper__topic__fos -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__topic__fos/edge_index.npy -cd .. - -# paper__venue__conference -mkdir -p paper__venue__conference -cd paper__venue__conference -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__venue__conference/edge_index.npy -cd .. - -# paper__written_by__author -mkdir -p paper__written_by__author -cd paper__written_by__author -wget https://igb-public-awsopen.s3.amazonaws.com/IGBH/processed/paper__written_by__author/edge_index.npy -cd .. - -cd ../.. 
- -echo "IGBH-IGBH (Heteregeneous) download complete" - - -num_paper_nodes = 269346174 -paper_node_features = np.memmap('/home/ubuntu/dgl/examples/graphbolt/rgcn/igb_dataset/igb_dataset_full/processed/paper/node_label_19.npy', dtype='float32', mode='r', shape=(num_paper_nodes,1)) -num_paper_nodes = 48521486 -paper_node_features = np.memmap('/home/ubuntu/dgl/examples/graphbolt/rgcn/datasets/igb-dataset-full-seeds/edges/author__affiliated_to__institute.npy', dtype='int32', mode='r', shape=(num_paper_nodes,2)) diff --git a/examples/graphbolt/rgcn/download_large_igbh.sh b/examples/graphbolt/rgcn/download_large_igbh.sh deleted file mode 100755 index 4c36a7f8d0db..000000000000 --- a/examples/graphbolt/rgcn/download_large_igbh.sh +++ /dev/null @@ -1,95 +0,0 @@ -#! /bin/bash - -mkdir -p igb-heterogeneous-large/ -cd igb-heterogeneous-large/ -mkdir -p processed -cd processed - -echo "IGBH-large (Heterogeneous) download starting" - -# paper -mkdir -p paper -cd paper -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/node_feat.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/node_label_19.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/node_label_2K.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper/paper_id_index_mapping.npy -cd .. - -# # paper__cites__paper -# wget --recursive --no-parent https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__cites__paper/edge_index.npy - -# paper__cites__paper -mkdir -p paper__cites__paper -cd paper__cites__paper -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__cites__paper/edge_index.npy -cd .. - -# author -mkdir -p author -cd author -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/author/author_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/author/node_feat.npy -cd .. - -# conference -mkdir -p conference -cd conference -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/conference/conference_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/conference/node_feat.npy -cd .. - -# institute -mkdir -p institute -cd institute -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/institute/institute_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/institute/node_feat.npy -cd .. - -# journal -mkdir -p journal -cd journal -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/journal/journal_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/journal/node_feat.npy -cd .. - -# fos -mkdir -p fos -cd fos -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/fos/fos_id_index_mapping.npy -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/fos/node_feat.npy -cd .. - -# author__affiliated_to__institute -mkdir -p author__affiliated_to__institute -cd author__affiliated_to__institute -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/author__affiliated_to__institute/edge_index.npy -cd .. - -# paper__published__journal -mkdir -p paper__published__journal -cd paper__published__journal -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__published__journal/edge_index.npy -cd .. 
- -# paper__topic__fos -mkdir -p paper__topic__fos -cd paper__topic__fos -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__topic__fos/edge_index.npy -cd .. - -# paper__venue__conference -mkdir -p paper__venue__conference -cd paper__venue__conference -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__venue__conference/edge_index.npy -cd .. - -# paper__written_by__author -mkdir -p paper__written_by__author -cd paper__written_by__author -wget https://igb-public-awsopen.s3.amazonaws.com/igb_large/processed/paper__written_by__author/edge_index.npy -cd .. - -cd ../.. - -echo "IGBH-large (Heterogeneous) download complete" \ No newline at end of file diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index 7516e16a857a..4051425a129d 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -142,7 +142,7 @@ def create_dataloader( if name == "ogb-lsc-mag240m": node_feature_keys["author"] = ["feat"] node_feature_keys["institution"] = ["feat"] - if "igb-heterogeneous" in name: + if "igb-het" in name: node_feature_keys["author"] = ["feat"] node_feature_keys["institution"] = ["feat"] node_feature_keys["fos"] = ["feat"] @@ -163,7 +163,7 @@ def extract_embed(node_embed, input_nodes): def extract_node_features(name, block, data, node_embed, device): """Extract the node features from embedding layer or raw features.""" - if name == "ogbn-mag" or "igb-heterogeneous" in name: + if name == "ogbn-mag" or "igb-het" in name: input_nodes = { k: v.to(device) for k, v in block.srcdata[dgl.NID].items() } @@ -429,7 +429,7 @@ def evaluate( model.eval() category = "paper" # An evaluator for the dataset. - if "igb-heterogeneous" in name: + if "igb-het" in name: evaluator = IGB_Evaluator(name=name, num_tasks=1, eval_metric="acc") elif name == "ogbn-mag": evaluator = Evaluator(name=name) @@ -595,7 +595,7 @@ def main(args): # `institution` are generated in advance and stored in the feature store. # For `ogbn-mag`, we generate the features on the fly. embed_layer = None - if args.dataset == "ogbn-mag" or "igb-heterogeneous" in args.dataset: + if args.dataset == "ogbn-mag" or "igb-het" in args.dataset: # Create the embedding layer and move it to the appropriate device. embed_layer = rel_graph_embed(g, feat_size).to(device) print( @@ -673,13 +673,12 @@ def main(args): choices=[ "ogbn-mag", "ogb-lsc-mag240m", - "igb-heterogeneous-tiny", - "igb-heterogeneous-small", - "igb-heterogeneous-medium", - "igb-heterogeneous-large", - "igb-heterogeneous-full", + "igb-het-tiny", + "igb-het-small", + "igb-het-medium", ], - help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m", + help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m, " + " igb-het-[tiny|small|medium].", ) parser.add_argument("--num_epochs", type=int, default=3) parser.add_argument("--num_workers", type=int, default=0) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 3318491d2888..52b61a7d3a21 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -989,6 +989,16 @@ class BuiltinDataset(OnDiskDataset): Self edges are added to the original graph. Node features are stored as float32. + **igb-het-[tiny|small|medium]** + The igb-hom-[tiny|small|medium] dataset is a heterogeneous citation network, + which is designed for developers to train and evaluate GNN models with + high fidelity. 
See more details in `igb-het-[tiny|small|medium] + `_. + + .. note:: + Reverse paper__cites__paper edges are added to the original graph. + Node features are stored as float32. + Parameters ---------- name : str @@ -1018,6 +1028,10 @@ class BuiltinDataset(OnDiskDataset): "igb-hom-tiny-seeds", "igb-hom-small", "igb-hom-small-seeds", + "igb-het-tiny", + "igb-het-tiny-seeds", + "igb-het-small", + "igb-het-small-seeds", ] _large_datasets = [ "ogb-lsc-mag240m", @@ -1028,6 +1042,8 @@ class BuiltinDataset(OnDiskDataset): "igb-hom-medium-seeds", "igb-hom-large", "igb-hom-large-seeds", + "igb-het-medium", + "igb-het-medium-seeds", ] _all_datasets = _datasets + _large_datasets From 8e51701466c91cb42cc7f3f57a4435005de77acd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 6 Sep 2024 05:38:07 +0000 Subject: [PATCH 18/35] resolve merge conflict --- .../pyg/hetero/node_classification.py | 31 +++++++++---- examples/graphbolt/rgcn/evaluator.py | 46 +++---------------- examples/graphbolt/rgcn/hetero_rgcn.py | 2 +- python/dgl/graphbolt/impl/ondisk_dataset.py | 11 +++-- 4 files changed, 37 insertions(+), 53 deletions(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 032b84d82c4b..272e778610f0 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -58,12 +58,20 @@ def create_dataloader( datapipe = datapipe.copy_to(device=device) need_copy = False + # if args.dataset == "ogb-lsc-mag240m": + # node_feature_keys = { + # "paper": ["feat"], + # "author": ["feat"], + # "institution": ["feat"], + # } + node_feature_keys = {"paper": ["feat"]} if args.dataset == "ogb-lsc-mag240m": - node_feature_keys = { - "paper": ["feat"], - "author": ["feat"], - "institution": ["feat"], - } + node_feature_keys["author"] = ["feat"] + node_feature_keys["institution"] = ["feat"] + if "igb-het" in args.dataset: + node_feature_keys["author"] = ["feat"] + node_feature_keys["institute"] = ["feat"] + node_feature_keys["fos"] = ["feat"] # Fetch node features for the sampled subgraph. datapipe = datapipe.fetch_feature(features, node_feature_keys) @@ -335,8 +343,13 @@ def parse_args(): "--dataset", type=str, default="ogb-lsc-mag240m", - choices=["ogb-lsc-mag240m"], - help="Dataset name. Possible values: ogb-lsc-mag240m", + choices=[ + "ogb-lsc-mag240m", + "igb-het-tiny", + "igb-het-small", + "igb-het-medium", + ], + help="Dataset name. 
Possible values: ogb-lsc-mag240m, igb-het-[tiny|small|medium].", ) parser.add_argument( "--fanout", @@ -400,7 +413,7 @@ def parse_args(): return parser.parse_args() -def main(): +def main(args): torch.set_float32_matmul_precision(args.precision) if not torch.cuda.is_available(): args.mode = "cpu-cpu-cpu" @@ -517,4 +530,4 @@ def main(): if __name__ == "__main__": args = parse_args() - main() + main(args) diff --git a/examples/graphbolt/rgcn/evaluator.py b/examples/graphbolt/rgcn/evaluator.py index cbded66c264d..e52b2dbb78a7 100644 --- a/examples/graphbolt/rgcn/evaluator.py +++ b/examples/graphbolt/rgcn/evaluator.py @@ -67,45 +67,6 @@ def _parse_and_check_input(self, input_dict): else: raise ValueError("Undefined eval metric %s " % (self.eval_metric)) - def eval(self, input_dict): - if self.eval_metric == "acc": - y_true, y_pred = self._parse_and_check_input(input_dict) - return self._eval_acc(y_true, y_pred) - else: - raise ValueError("Undefined eval metric %s " % (self.eval_metric)) - - @property - def expected_input_format(self): - desc = "==== Expected input format of Evaluator for {}\n".format( - self.name - ) - if self.eval_metric == "acc": - desc += "{'y_true': y_true, 'y_pred': y_pred}\n" - desc += "- y_true: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n" - desc += "- y_pred: numpy ndarray or torch tensor of shape (num_nodes num_tasks)\n" - desc += "where y_pred stores predicted class label (integer),\n" - desc += "num_task is {}, and ".format(self.num_tasks) - desc += "each row corresponds to one node.\n" - else: - raise ValueError("Undefined eval metric %s " % (self.eval_metric)) - - return desc - - @property - def expected_output_format(self): - desc = "==== Expected output format of Evaluator for {}\n".format( - self.name - ) - if self.eval_metric == "acc": - desc += "{'acc': acc}\n" - desc += "- acc (float): Accuracy score averaged across {} task(s)\n".format( - self.num_tasks - ) - else: - raise ValueError("Undefined eval metric %s " % (self.eval_metric)) - - return desc - def _eval_acc(self, y_true, y_pred): acc_list = [] @@ -115,3 +76,10 @@ def _eval_acc(self, y_true, y_pred): acc_list.append(float(np.sum(correct)) / len(correct)) return {"acc": sum(acc_list) / len(acc_list)} + + def eval(self, input_dict): + if self.eval_metric == "acc": + y_true, y_pred = self._parse_and_check_input(input_dict) + return self._eval_acc(y_true, y_pred) + else: + raise ValueError("Undefined eval metric %s " % (self.eval_metric)) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index 4051425a129d..2ca98b9ab872 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -144,7 +144,7 @@ def create_dataloader( node_feature_keys["institution"] = ["feat"] if "igb-het" in name: node_feature_keys["author"] = ["feat"] - node_feature_keys["institution"] = ["feat"] + node_feature_keys["institute"] = ["feat"] node_feature_keys["fos"] = ["feat"] datapipe = datapipe.fetch_feature(features, node_feature_keys) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 52b61a7d3a21..ee09f43274b7 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -979,10 +979,11 @@ class BuiltinDataset(OnDiskDataset): .. note:: Reverse edges are added to the original graph. 
- **igb-hom-[tiny|small|medium|large]** - The igb-hom-[tiny|small|medium] dataset is a homogeneous citation network, - which is designed for developers to train and evaluate GNN models with - high fidelity. See more details in `igb-hom-[tiny|small|medium|large] + **igb-hom and igb-hom-[tiny|small|medium|large]** + The igb-hom-[tiny|small|medium|large] and igb-hom dataset is a homogeneous + citation network, which is designed for developers to train and evaluate + GNN models with high fidelity. See more details in + `igb-hom-[tiny|small|medium|large] `_. .. note:: @@ -1042,6 +1043,8 @@ class BuiltinDataset(OnDiskDataset): "igb-hom-medium-seeds", "igb-hom-large", "igb-hom-large-seeds", + "igb-hom", + "igb-hom-seeds", "igb-het-medium", "igb-het-medium-seeds", ] From aaf1da118879238f21c13f54c839bbdc5165653c Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 5 Sep 2024 22:59:58 -0700 Subject: [PATCH 19/35] Update examples/graphbolt/pyg/hetero/node_classification.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/pyg/hetero/node_classification.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 272e778610f0..bfad1ab0cf4d 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -69,7 +69,6 @@ def create_dataloader( node_feature_keys["author"] = ["feat"] node_feature_keys["institution"] = ["feat"] if "igb-het" in args.dataset: - node_feature_keys["author"] = ["feat"] node_feature_keys["institute"] = ["feat"] node_feature_keys["fos"] = ["feat"] # Fetch node features for the sampled subgraph. From ebdb01f132bbad20ee54f9cb70117d770b02603e Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 5 Sep 2024 23:00:03 -0700 Subject: [PATCH 20/35] Update examples/graphbolt/pyg/hetero/node_classification.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/pyg/hetero/node_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index bfad1ab0cf4d..d231c94b7cbc 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -64,7 +64,7 @@ def create_dataloader( # "author": ["feat"], # "institution": ["feat"], # } - node_feature_keys = {"paper": ["feat"]} + node_feature_keys = {"paper": ["feat"], "author": ["feat"]} if args.dataset == "ogb-lsc-mag240m": node_feature_keys["author"] = ["feat"] node_feature_keys["institution"] = ["feat"] From d50977d840872b1760eaa4253999f12b62389ced Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 5 Sep 2024 23:00:09 -0700 Subject: [PATCH 21/35] Update examples/graphbolt/pyg/hetero/node_classification.py Co-authored-by: Muhammed Fatih BALIN --- examples/graphbolt/pyg/hetero/node_classification.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index d231c94b7cbc..21fdf1203899 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -66,7 +66,6 @@ def create_dataloader( # } node_feature_keys = {"paper": ["feat"], "author": ["feat"]} if args.dataset == 
"ogb-lsc-mag240m": - node_feature_keys["author"] = ["feat"] node_feature_keys["institution"] = ["feat"] if "igb-het" in args.dataset: node_feature_keys["institute"] = ["feat"] From 071d05521a967190957a28e5534c11920f0b0a24 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 6 Sep 2024 06:03:40 +0000 Subject: [PATCH 22/35] remove main args --- examples/graphbolt/pyg/hetero/node_classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 21fdf1203899..871d74df84ed 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -411,7 +411,7 @@ def parse_args(): return parser.parse_args() -def main(args): +def main(): torch.set_float32_matmul_precision(args.precision) if not torch.cuda.is_available(): args.mode = "cpu-cpu-cpu" @@ -528,4 +528,4 @@ def main(args): if __name__ == "__main__": args = parse_args() - main(args) + main() From f31d3540b351b682320a1f0a589a439a4e184f0b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 6 Sep 2024 20:51:02 +0000 Subject: [PATCH 23/35] remove script --- examples/graphbolt/pyg/hetero/node_classification.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 871d74df84ed..4166805c3f92 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -58,12 +58,6 @@ def create_dataloader( datapipe = datapipe.copy_to(device=device) need_copy = False - # if args.dataset == "ogb-lsc-mag240m": - # node_feature_keys = { - # "paper": ["feat"], - # "author": ["feat"], - # "institution": ["feat"], - # } node_feature_keys = {"paper": ["feat"], "author": ["feat"]} if args.dataset == "ogb-lsc-mag240m": node_feature_keys["institution"] = ["feat"] From 9e445c6e663d02122d6591962a49e1ef608580fa Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 6 Sep 2024 20:53:13 +0000 Subject: [PATCH 24/35] add all reverse edge type --- python/dgl/graphbolt/impl/ondisk_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index ee09f43274b7..303a423a853b 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -997,7 +997,7 @@ class BuiltinDataset(OnDiskDataset): `_. .. note:: - Reverse paper__cites__paper edges are added to the original graph. + Four Reverse edge types are added to the original graph. Node features are stored as float32. Parameters From b04399900e96301aafa0e589835ad1220314a5b3 Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Mon, 9 Sep 2024 08:38:07 +0000 Subject: [PATCH 25/35] add igb-het-large --- examples/graphbolt/pyg/hetero/node_classification.py | 1 + examples/graphbolt/rgcn/hetero_rgcn.py | 3 ++- python/dgl/graphbolt/impl/ondisk_dataset.py | 6 ++++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 4166805c3f92..2e789fa39bd9 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -340,6 +340,7 @@ def parse_args(): "igb-het-tiny", "igb-het-small", "igb-het-medium", + "igb-het-large" ], help="Dataset name. 
Possible values: ogb-lsc-mag240m, igb-het-[tiny|small|medium].", ) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index 2ca98b9ab872..f3b0edbf5ba8 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -676,9 +676,10 @@ def main(args): "igb-het-tiny", "igb-het-small", "igb-het-medium", + "igb-het-large", ], help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m, " - " igb-het-[tiny|small|medium].", + " igb-het-[tiny|small|medium|large].", ) parser.add_argument("--num_epochs", type=int, default=3) parser.add_argument("--num_workers", type=int, default=0) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 303a423a853b..8763fa332854 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -990,10 +990,10 @@ class BuiltinDataset(OnDiskDataset): Self edges are added to the original graph. Node features are stored as float32. - **igb-het-[tiny|small|medium]** + **igb-het-[tiny|small|medium|large]** The igb-hom-[tiny|small|medium] dataset is a heterogeneous citation network, which is designed for developers to train and evaluate GNN models with - high fidelity. See more details in `igb-het-[tiny|small|medium] + high fidelity. See more details in `igb-het-[tiny|small|medium|large] `_. .. note:: @@ -1047,6 +1047,8 @@ class BuiltinDataset(OnDiskDataset): "igb-hom-seeds", "igb-het-medium", "igb-het-medium-seeds", + "igb-het-large", + "igb-het-large-seeds", ] _all_datasets = _datasets + _large_datasets From 2657ee1fdd47d3ec3385ee1fc73628928b495228 Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Mon, 9 Sep 2024 08:42:18 +0000 Subject: [PATCH 26/35] fix format --- examples/graphbolt/pyg/hetero/node_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 2e789fa39bd9..f6ee70f76a5f 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -340,7 +340,7 @@ def parse_args(): "igb-het-tiny", "igb-het-small", "igb-het-medium", - "igb-het-large" + "igb-het-large", ], help="Dataset name. Possible values: ogb-lsc-mag240m, igb-het-[tiny|small|medium].", ) From d11f8150c98ff6391e0b8960f3fd0d4720dfc8aa Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Mon, 9 Sep 2024 08:45:46 +0000 Subject: [PATCH 27/35] fix lint --- examples/graphbolt/pyg/hetero/node_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index f6ee70f76a5f..8df18fb49db3 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -342,7 +342,7 @@ def parse_args(): "igb-het-medium", "igb-het-large", ], - help="Dataset name. Possible values: ogb-lsc-mag240m, igb-het-[tiny|small|medium].", + help="Dataset name. 
Possible values: ogb-lsc-mag240m, igb-het-[tiny|small|medium|large].", ) parser.add_argument( "--fanout", From 692912ec820f70890e5eff0e3e76bec4479d698e Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Wed, 9 Oct 2024 02:27:19 +0000 Subject: [PATCH 28/35] fix acc drop bug --- examples/graphbolt/rgcn/hetero_rgcn.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index f3b0edbf5ba8..b9f2a8e5911c 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -163,7 +163,7 @@ def extract_embed(node_embed, input_nodes): def extract_node_features(name, block, data, node_embed, device): """Extract the node features from embedding layer or raw features.""" - if name == "ogbn-mag" or "igb-het" in name: + if name == "ogbn-mag": input_nodes = { k: v.to(device) for k, v in block.srcdata[dgl.NID].items() } @@ -288,13 +288,6 @@ def __init__( } ) - self.loop_weights = nn.ModuleDict( - { - ntype: nn.Linear(in_size, out_size, bias=True) - for ntype in self.ntypes - } - ) - self.dropout = nn.Dropout(dropout) # Initialize parameters of the model. self.reset_parameters() @@ -677,6 +670,8 @@ def main(args): "igb-het-small", "igb-het-medium", "igb-het-large", + "igb-het", + "igb-het-MLPerf" ], help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m, " " igb-het-[tiny|small|medium|large].", From 5dfd3fd2c791e4b6f4de3acc023e003801281c05 Mon Sep 17 00:00:00 2001 From: BowenYao18 Date: Wed, 9 Oct 2024 02:29:21 +0000 Subject: [PATCH 29/35] fix acc drop bug --- examples/graphbolt/rgcn/hetero_rgcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index b9f2a8e5911c..f44aef956e90 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -671,7 +671,7 @@ def main(args): "igb-het-medium", "igb-het-large", "igb-het", - "igb-het-MLPerf" + "igb-het-MLPerf", ], help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m, " " igb-het-[tiny|small|medium|large].", From 65c9b807997474f94e535b20cc8a72dd4048d281 Mon Sep 17 00:00:00 2001 From: Bowen Yao <112051015+BowenYao18@users.noreply.github.com> Date: Thu, 17 Oct 2024 21:19:34 -0500 Subject: [PATCH 30/35] Update node_classification.py --- examples/graphbolt/pyg/hetero/node_classification.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index dcefb7358be9..f43a6f0609ba 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -345,8 +345,10 @@ def parse_args(): "igb-het-small", "igb-het-medium", "igb-het-large", + "igb-het-MLPerf", + "igb-het" ], - help="Dataset name. Possible values: ogb-lsc-mag240m, igb-het-[tiny|small|medium|large].", + help="Dataset name. 
Possible values: ogb-lsc-mag240m, igb-het, and igb-het-[tiny|small|medium|large|MLPerf].", ) parser.add_argument( "--fanout", From 4b2a7fb988b09ead711e69fee3fba9235e0a9c76 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 18 Oct 2024 02:28:25 +0000 Subject: [PATCH 31/35] add dataset name --- examples/graphbolt/rgcn/hetero_rgcn.py | 2 +- python/dgl/graphbolt/impl/ondisk_dataset.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index f44aef956e90..36fc1185c324 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -674,7 +674,7 @@ def main(args): "igb-het-MLPerf", ], help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m, " - " igb-het-[tiny|small|medium|large].", + "igb-het, and igb-het-[tiny|small|medium|large|MLPerf].", ) parser.add_argument("--num_epochs", type=int, default=3) parser.add_argument("--num_workers", type=int, default=0) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 8763fa332854..855e964f83fd 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -983,17 +983,18 @@ class BuiltinDataset(OnDiskDataset): The igb-hom-[tiny|small|medium|large] and igb-hom dataset is a homogeneous citation network, which is designed for developers to train and evaluate GNN models with high fidelity. See more details in - `igb-hom-[tiny|small|medium|large] + `igb-hom-[tiny|small|medium|large] and igb-hom `_. .. note:: Self edges are added to the original graph. Node features are stored as float32. - **igb-het-[tiny|small|medium|large]** - The igb-hom-[tiny|small|medium] dataset is a heterogeneous citation network, - which is designed for developers to train and evaluate GNN models with - high fidelity. See more details in `igb-het-[tiny|small|medium|large] + **igb-het and igb-het-[tiny|small|medium|large|mlperf]** + The igb-hom-[tiny|small|medium|large|mlperf] and igb-het dataset is a + heterogeneous citation network, which is designed for developers to train + and evaluate GNN models with high fidelity. See more details in + `igb-het-[tiny|small|medium|large|mlperf] and igb-het `_. .. note:: @@ -1049,6 +1050,10 @@ class BuiltinDataset(OnDiskDataset): "igb-het-medium-seeds", "igb-het-large", "igb-het-large-seeds", + "igb-het", + "igb-het-seeds", + "igb-het-MLPerf", + "igb-het-MLPerf-seeds", ] _all_datasets = _datasets + _large_datasets From 4f3f35545c9163e7e8101ba5cd9dd5b72c9a8c75 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 18 Oct 2024 02:35:30 +0000 Subject: [PATCH 32/35] fix lint --- examples/graphbolt/pyg/hetero/node_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index f43a6f0609ba..45a8b1eed730 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -346,7 +346,7 @@ def parse_args(): "igb-het-medium", "igb-het-large", "igb-het-MLPerf", - "igb-het" + "igb-het", ], help="Dataset name. 
Possible values: ogb-lsc-mag240m, igb-het, and igb-het-[tiny|small|medium|large|MLPerf].", ) From 8c21ef44eb09c1150d6460febb1f59920f984a5d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 18 Oct 2024 07:40:39 +0000 Subject: [PATCH 33/35] change mlperf to lower --- examples/graphbolt/pyg/hetero/node_classification.py | 4 ++-- examples/graphbolt/rgcn/hetero_rgcn.py | 4 ++-- python/dgl/graphbolt/impl/ondisk_dataset.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/graphbolt/pyg/hetero/node_classification.py b/examples/graphbolt/pyg/hetero/node_classification.py index 45a8b1eed730..922ff15562ea 100644 --- a/examples/graphbolt/pyg/hetero/node_classification.py +++ b/examples/graphbolt/pyg/hetero/node_classification.py @@ -345,10 +345,10 @@ def parse_args(): "igb-het-small", "igb-het-medium", "igb-het-large", - "igb-het-MLPerf", + "igb-het-mlperf", "igb-het", ], - help="Dataset name. Possible values: ogb-lsc-mag240m, igb-het, and igb-het-[tiny|small|medium|large|MLPerf].", + help="Dataset name. Possible values: ogb-lsc-mag240m, igb-het, and igb-het-[tiny|small|medium|large|mlperf].", ) parser.add_argument( "--fanout", diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index 36fc1185c324..c3a6a36c3dbe 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -671,10 +671,10 @@ def main(args): "igb-het-medium", "igb-het-large", "igb-het", - "igb-het-MLPerf", + "igb-het-mlperf", ], help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m, " - "igb-het, and igb-het-[tiny|small|medium|large|MLPerf].", + "igb-het, and igb-het-[tiny|small|medium|large|mlperf].", ) parser.add_argument("--num_epochs", type=int, default=3) parser.add_argument("--num_workers", type=int, default=0) diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py index 855e964f83fd..a8a3972fdb6d 100644 --- a/python/dgl/graphbolt/impl/ondisk_dataset.py +++ b/python/dgl/graphbolt/impl/ondisk_dataset.py @@ -1052,8 +1052,8 @@ class BuiltinDataset(OnDiskDataset): "igb-het-large-seeds", "igb-het", "igb-het-seeds", - "igb-het-MLPerf", - "igb-het-MLPerf-seeds", + "igb-het-mlperf", + "igb-het-mlperf-seeds", ] _all_datasets = _datasets + _large_datasets From 098efbeebba5d5978954633a36a1759dd19829e3 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 23 Oct 2024 03:38:14 +0000 Subject: [PATCH 34/35] reduce evaluator --- examples/graphbolt/rgcn/evaluator.py | 82 ++++++---------------------- 1 file changed, 17 insertions(+), 65 deletions(-) diff --git a/examples/graphbolt/rgcn/evaluator.py b/examples/graphbolt/rgcn/evaluator.py index e52b2dbb78a7..d003c5b62ede 100644 --- a/examples/graphbolt/rgcn/evaluator.py +++ b/examples/graphbolt/rgcn/evaluator.py @@ -6,80 +6,32 @@ torch = None -### Evaluator for node property prediction class IGB_Evaluator: - def __init__(self, name, num_tasks, eval_metric): + def __init__(self, name, num_tasks): self.name = name self.num_tasks = num_tasks - self.eval_metric = eval_metric - def _parse_and_check_input(self, input_dict): - if self.eval_metric == "acc": - if not "y_true" in input_dict: - raise RuntimeError("Missing key of y_true") - if not "y_pred" in input_dict: - raise RuntimeError("Missing key of y_pred") + def _parse_input(self, input_dict): + y_true, y_pred = input_dict["y_true"], input_dict["y_pred"] - y_true, y_pred = input_dict["y_true"], input_dict["y_pred"] + if torch and isinstance(y_true, torch.Tensor): + y_true = y_true.cpu().numpy() 
+ if torch and isinstance(y_pred, torch.Tensor): + y_pred = y_pred.cpu().numpy() - """ - y_true: numpy ndarray or torch tensor of shape (num_nodes num_tasks) - y_pred: numpy ndarray or torch tensor of shape (num_nodes num_tasks) - """ + if not isinstance(y_true, np.ndarray) or not isinstance( + y_pred, np.ndarray + ): + raise RuntimeError("Arguments must be numpy arrays") - # converting to torch.Tensor to numpy on cpu - if torch is not None and isinstance(y_true, torch.Tensor): - y_true = y_true.detach().cpu().numpy() + if y_true.shape != y_pred.shape or y_true.ndim != 2: + raise RuntimeError("Shape mismatch between y_true and y_pred") - if torch is not None and isinstance(y_pred, torch.Tensor): - y_pred = y_pred.detach().cpu().numpy() - - ## check type - if not ( - isinstance(y_true, np.ndarray) - and isinstance(y_true, np.ndarray) - ): - raise RuntimeError( - "Arguments to Evaluator need to be either numpy ndarray or torch tensor" - ) - - if not y_true.shape == y_pred.shape: - raise RuntimeError( - "Shape of y_true and y_pred must be the same" - ) - - if not y_true.ndim == 2: - raise RuntimeError( - "y_true and y_pred must to 2-dim arrray, {}-dim array given".format( - y_true.ndim - ) - ) - - if not y_true.shape[1] == self.num_tasks: - raise RuntimeError( - "Number of tasks for {} should be {} but {} given".format( - self.name, self.num_tasks, y_true.shape[1] - ) - ) - - return y_true, y_pred - - else: - raise ValueError("Undefined eval metric %s " % (self.eval_metric)) + return y_true, y_pred def _eval_acc(self, y_true, y_pred): - acc_list = [] - - for i in range(y_true.shape[1]): - is_labeled = y_true[:, i] == y_true[:, i] - correct = y_true[is_labeled, i] == y_pred[is_labeled, i] - acc_list.append(float(np.sum(correct)) / len(correct)) - - return {"acc": sum(acc_list) / len(acc_list)} + return {"acc": np.mean(np.all(y_true == y_pred, axis=1))} def eval(self, input_dict): - if self.eval_metric == "acc": - y_true, y_pred = self._parse_and_check_input(input_dict) - return self._eval_acc(y_true, y_pred) - else: - raise ValueError("Undefined eval metric %s " % (self.eval_metric)) + y_true, y_pred = self._parse_input(input_dict) + return self._eval_acc(y_true, y_pred) From 8e3017031afc2ee09a8c78fbb2a9e0e8f4c4b553 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 23 Oct 2024 03:39:43 +0000 Subject: [PATCH 35/35] reduce evaluator --- examples/graphbolt/rgcn/hetero_rgcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/graphbolt/rgcn/hetero_rgcn.py b/examples/graphbolt/rgcn/hetero_rgcn.py index c3a6a36c3dbe..71e05b4e4bfd 100644 --- a/examples/graphbolt/rgcn/hetero_rgcn.py +++ b/examples/graphbolt/rgcn/hetero_rgcn.py @@ -423,7 +423,7 @@ def evaluate( category = "paper" # An evaluator for the dataset. if "igb-het" in name: - evaluator = IGB_Evaluator(name=name, num_tasks=1, eval_metric="acc") + evaluator = IGB_Evaluator(name=name, num_tasks=1) elif name == "ogbn-mag": evaluator = Evaluator(name=name) else:
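
Editorial note, not part of the patch series: the last two commits strip the IGB evaluator down to a constructor, an input check, and an exact-match accuracy. A minimal usage sketch of that reduced `IGB_Evaluator` is shown below. It assumes it is run from `examples/graphbolt/rgcn` so that `evaluator.py` is importable, and the label tensors and the dataset name string are synthetic values chosen only for illustration.

```python
# Illustrative sketch only; the tensors below are made up and the dataset
# name is just a label passed through to the evaluator.
import torch

from evaluator import IGB_Evaluator

# Single-task node classification labels, shaped (num_nodes, num_tasks)
# as the evaluator's input check expects (2-D, matching shapes).
y_true = torch.tensor([[0], [1], [2], [2], [1]])
y_pred = torch.tensor([[0], [1], [1], [2], [1]])

evaluator = IGB_Evaluator(name="igb-het-small", num_tasks=1)
result = evaluator.eval({"y_true": y_true, "y_pred": y_pred})
print(result)  # {'acc': 0.8}: 4 of the 5 rows match exactly
```

Note the design choice in the reduced `_eval_acc`: it scores a node as correct only if every task in its row matches (`np.all(..., axis=1)`), whereas the earlier version averaged per-task accuracies and skipped unlabeled entries. With `num_tasks=1` and fully labeled nodes, as in the RGCN example, the two formulations agree.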