
Commit

adding routines for cleaning up distributed process groups (#50)
azrael417 authored Aug 29, 2024
1 parent 24fcb06 commit b2ce590
Showing 5 changed files with 22 additions and 4 deletions.
5 changes: 3 additions & 2 deletions Dockerfile
@@ -30,14 +30,15 @@
# build after cloning in directory torch_harmonics via
# docker build . -t torch_harmonics

FROM nvcr.io/nvidia/pytorch:24.07-py3
FROM nvcr.io/nvidia/pytorch:24.08-py3

COPY . /workspace/torch_harmonics

# we need this for tests
RUN pip install parameterized

# The custom CUDA extension does not support architectures < 7.0
ENV FORCE_CUDA_EXTENSION=1
ENV TORCH_CUDA_ARCH_LIST "7.0 7.2 7.5 8.0 8.6 8.7 9.0+PTX"
RUN pip install --global-option --cuda_ext /workspace/torch_harmonics
RUN cd /workspace/torch_harmonics && pip install --no-build-isolation .

7 changes: 6 additions & 1 deletion tests/test_distributed_convolution.py
@@ -112,6 +112,11 @@ def setUpClass(cls):
        # initializing sht
        thd.init(cls.h_group, cls.w_group)

    @classmethod
    def tearDownClass(cls):
        thd.finalize()
        dist.destroy_process_group(None)

    def _split_helper(self, tensor):
        with torch.no_grad():
            # split in W
@@ -185,7 +190,7 @@ def _gather_helper_bwd(self, tensor, B, C, convolution_dist):
[128, 256, 128, 256, 32, 8, [3], 2, "equiangular", "equiangular", False, 1e-5],
[128, 256, 128, 256, 32, 6, [3], 1, "equiangular", "equiangular", False, 1e-5],
[128, 256, 128, 256, 32, 8, [3], 1, "equiangular", "equiangular", True, 1e-5],
[129, 256, 128, 256, 32, 8, [3], 1, "equiangular", "equiangular", True, 1e-5],
[129, 256, 129, 256, 32, 8, [3], 1, "equiangular", "equiangular", True, 1e-5],
[128, 256, 128, 256, 32, 8, [3, 2], 1, "equiangular", "equiangular", True, 1e-5],
[64, 128, 128, 256, 32, 8, [3], 1, "equiangular", "equiangular", True, 1e-5],
[128, 256, 128, 256, 32, 8, [3], 2, "equiangular", "equiangular", True, 1e-5],
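For context, the tearDownClass hook added above (and, identically, in the SHT test suite below) mirrors the existing setUpClass hook. A minimal sketch of the resulting lifecycle, assuming a torchrun-style launch and placeholder dist.new_group calls in place of the real grid decomposition used by the tests:

import unittest
import torch.distributed as dist
import torch_harmonics.distributed as thd

class DistributedLifecycleSketch(unittest.TestCase):
    # hypothetical test class illustrating the init/finalize pairing; the real
    # tests derive cls.h_group and cls.w_group from their own grid decomposition
    @classmethod
    def setUpClass(cls):
        dist.init_process_group(backend="gloo")  # backend chosen only for illustration
        ranks = list(range(dist.get_world_size()))
        cls.h_group = dist.new_group(ranks=ranks)  # placeholder polar group
        cls.w_group = dist.new_group(ranks=ranks)  # placeholder azimuth group
        thd.init(cls.h_group, cls.w_group)

    @classmethod
    def tearDownClass(cls):
        thd.finalize()                    # release the polar/azimuth subgroups first
        dist.destroy_process_group(None)  # then tear down the remaining (default) group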
5 changes: 5 additions & 0 deletions tests/test_distributed_sht.py
@@ -118,6 +118,11 @@ def setUpClass(cls):
        # initializing sht
        thd.init(cls.h_group, cls.w_group)

    @classmethod
    def tearDownClass(cls):
        thd.finalize()
        dist.destroy_process_group(None)


    def _split_helper(self, tensor):
        with torch.no_grad():
2 changes: 1 addition & 1 deletion torch_harmonics/distributed/__init__.py
@@ -30,7 +30,7 @@
#

# we need this in order to enable distributed
from .utils import init, is_initialized, polar_group, azimuth_group
from .utils import init, finalize, is_initialized, polar_group, azimuth_group
from .utils import polar_group_size, azimuth_group_size, polar_group_rank, azimuth_group_rank
from .primitives import compute_split_shapes, split_tensor_along_dim
from .primitives import (
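With this change, finalize is exported alongside init and the query helpers, so user code can import the cleanup routine directly from the package. A short illustrative sketch (the setup and distributed work are assumed to have happened elsewhere):

from torch_harmonics.distributed import finalize, is_initialized

# ... init(polar_group, azimuth_group) and the distributed computation run elsewhere ...
if is_initialized():
    finalize()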
7 changes: 7 additions & 0 deletions torch_harmonics/distributed/utils.py
@@ -51,6 +51,13 @@ def init(polar_process_group, azimuth_process_group):
    _AZIMUTH_PARALLEL_GROUP = azimuth_process_group
    _IS_INITIALIZED = True

def finalize():
    if is_initialized():
        if is_distributed_polar():
            dist.destroy_process_group(_POLAR_PARALLEL_GROUP)
        if is_distributed_azimuth():
            dist.destroy_process_group(_AZIMUTH_PARALLEL_GROUP)

def is_initialized() -> bool:
    return _IS_INITIALIZED

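Note that finalize only destroys the polar and azimuth subgroups registered by init (and only when those directions are actually distributed); the default group created by dist.init_process_group is left untouched, which is why the updated tests follow it with dist.destroy_process_group(None). A minimal shutdown sketch under that assumption:

import torch.distributed as dist
import torch_harmonics.distributed as thd

# assumes dist.init_process_group(...) and thd.init(...) were called earlier
thd.finalize()                    # drop the polar/azimuth subgroups
dist.destroy_process_group(None)  # destroy the default group and anything left over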
