Skip to content

Commit

Permalink
[CI] fix bugs for multigpu benchmarks (dmlc#5140)
Browse files: browse the repository at this point in the history
  • Loading branch information
Rhett-Ying authored Jan 11, 2023
1 parent d837029 commit 4645532
Show file tree
Hide file tree
Showing 5 changed files with 7 additions and 57 deletions.
53 changes: 2 additions & 51 deletions benchmarks/benchmarks/multigpu/bench_multigpu_rgcn.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ def __init__(
num_hidden_layers=1,
dropout=0,
use_self_loop=False,
low_mem=True,
layer_norm=False,
):
super(EntityClassify, self).__init__()
Expand All @@ -61,7 +60,6 @@ def __init__(
self.num_hidden_layers = num_hidden_layers
self.dropout = dropout
self.use_self_loop = use_self_loop
self.low_mem = low_mem
self.layer_norm = layer_norm

self.layers = nn.ModuleList()
Expand All @@ -75,7 +73,6 @@ def __init__(
self.num_bases,
activation=F.relu,
self_loop=self.use_self_loop,
low_mem=self.low_mem,
dropout=self.dropout,
layer_norm=layer_norm,
)
Expand All @@ -91,7 +88,6 @@ def __init__(
self.num_bases,
activation=F.relu,
self_loop=self.use_self_loop,
low_mem=self.low_mem,
dropout=self.dropout,
layer_norm=layer_norm,
)
Expand All @@ -106,7 +102,6 @@ def __init__(
self.num_bases,
activation=None,
self_loop=self.use_self_loop,
low_mem=self.low_mem,
layer_norm=layer_norm,
)
)
Expand Down Expand Up @@ -236,7 +231,6 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
num_hidden_layers=args.n_layers - 2,
dropout=args.dropout,
use_self_loop=args.use_self_loop,
low_mem=args.low_mem,
layer_norm=args.layer_norm,
)

Expand Down Expand Up @@ -373,14 +367,12 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
@utils.skip_if_not_4gpu()
@utils.benchmark("time", timeout=600)
@utils.parametrize("data", ["am", "ogbn-mag"])
@utils.parametrize("low_mem", [True, False])
@utils.parametrize("dgl_sparse", [True, False])
def track_time(data, low_mem, dgl_sparse):
def track_time(data, dgl_sparse):
# load graph data
dataset = utils.process_data(data)
args = config()
devices = [0, 1, 2, 3]
args.low_mem = low_mem
args.dgl_sparse = dgl_sparse
args.dataset = dataset
ogb_dataset = False
Expand Down Expand Up @@ -572,49 +564,8 @@ def config():
node_feats=False,
num_workers=0,
dgl_sparse=False,
low_mem=False,
)
# parser.add_argument("--dropout", type=float, default=0,
# help="dropout probability")
# parser.add_argument("--n-hidden", type=int, default=16,
# help="number of hidden units")
# parser.add_argument("--gpu", type=str, default='0',
# help="gpu")
# parser.add_argument("--lr", type=float, default=1e-2,
# help="learning rate")
# parser.add_argument("--sparse-lr", type=float, default=2e-2,
# help="sparse embedding learning rate")
# parser.add_argument("--n-bases", type=int, default=-1,
# help="number of filter weight matrices, default: -1 [use all]")
# parser.add_argument("--n-layers", type=int, default=2,
# help="number of propagation rounds")
# parser.add_argument("-e", "--n-epochs", type=int, default=50,
# help="number of training epochs")
# parser.add_argument("-d", "--dataset", type=str, required=True,
# help="dataset to use")
# parser.add_argument("--l2norm", type=float, default=0,
# help="l2 norm coef")
# parser.add_argument("--fanout", type=str, default="4, 4",
# help="Fan-out of neighbor sampling.")
# parser.add_argument("--use-self-loop", default=False, action='store_true',
# help="include self feature as a special relation")
# fp = parser.add_mutually_exclusive_group(required=False)
# parser.add_argument("--batch-size", type=int, default=100,
# help="Mini-batch size. ")
# parser.add_argument("--eval-batch-size", type=int, default=32,
# help="Mini-batch size. ")
# parser.add_argument("--num-workers", type=int, default=0,
# help="Number of workers for dataloader.")
# parser.add_argument("--low-mem", default=False, action='store_true',
# help="Whether use low mem RelGraphCov")
# parser.add_argument("--dgl-sparse", default=False, action='store_true',
# help='Use sparse embedding for node embeddings.')
# parser.add_argument('--node-feats', default=False, action='store_true',
# help='Whether use node features')
# parser.add_argument('--layer-norm', default=False, action='store_true',
# help='Use layer norm')
# parser.set_defaults(validation=True)
# args = parser.parse_args()

return args


Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmarks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ def skip_if_not_4gpu():
"""skip if DGL_BENCH_DEVICE is gpu"""

def _wrapper(func):
if GPU_COUNT != 4:
if GPU_COUNT < 4:
# skip if not enabled
print("Skip {}".format(func.__name__))
func.benchmark_name = "skip_" + func.__name__
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/scripts/build_dgl_asv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ pip install -r /asv/torch_gpu_pip.txt
# build
CMAKE_VARS="-DUSE_OPENMP=ON -DBUILD_TORCH=ON -DBUILD_SPARSE=ON -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda"
if [[ $DEVICE == "gpu" ]]; then
CMAKE_VARS="-DUSE_CUDA=ON $CMAKE_VARS"
CMAKE_VARS="-DUSE_CUDA=ON -DUSE_NCCL=ON $CMAKE_VARS"
fi
arch=`uname -m`
if [[ $arch == *"x86"* ]]; then
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/scripts/publish.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ else
fi

WS_ROOT=/asv/dgl
docker pull public.ecr.aws/s1o7b3d9/benchmark_test:cu116
docker pull public.ecr.aws/s1o7b3d9/benchmark_test:cu116_v230110
if [ -z "$DGL_REG_CONF" ]; then
DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
else
Expand Down Expand Up @@ -56,14 +56,14 @@ if [[ $DEVICE == "cpu" ]]; then
$DOCKER_MOUNT_OPT \
$DOCKER_ENV_OPT \
--shm-size="16g" \
--hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116 /bin/bash
--hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116_v230110 /bin/bash
else
docker run --name dgl-reg \
--rm --gpus all \
$DOCKER_MOUNT_OPT \
$DOCKER_ENV_OPT \
--shm-size="16g" \
--hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116 /bin/bash
--hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116_v230110 /bin/bash
fi

pwd
Expand Down
1 change: 0 additions & 1 deletion docker/Dockerfile.ci_benchmark
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,4 @@ ENV CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH}
ENV C_INCLUDE_PATH=/usr/local/cuda/include:${C_INCLUDE_PATH}
ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LIBRARY_PATH}
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
ENV CUDA_VISIBLE_DEVICES=0
ENV TF_FORCE_GPU_ALLOW_GROWTH=true

0 comments on commit 4645532

Please sign in to comment.