Fix isolation (#186)
IlyasMoutawwakil authored Apr 29, 2024
1 parent d9a8423 commit 10e4ece
Showing 28 changed files with 570 additions and 505 deletions.
6 changes: 2 additions & 4 deletions — .github/workflows/test_cli_rocm_pytorch_single_gpu.yaml

```diff
@@ -26,10 +26,8 @@ jobs:

       - name: Target devices
        run: |
-          echo "DEVICE0: $DEVICE0"
-          echo "DEVICE1: $DEVICE1"
-          echo "DEVICE0=$DEVICE0" >> $GITHUB_ENV
-          echo "DEVICE1=$DEVICE1" >> $GITHUB_ENV
+          echo "DEVICE: $DEVICE"
+          echo "DEVICE=$DEVICE" >> $GITHUB_ENV

       - name: Build image
        run: docker build
```
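The workflow step above relies on the `$GITHUB_ENV` mechanism: a `KEY=value` line appended to the file at `$GITHUB_ENV` becomes an environment variable in subsequent steps of the job. A minimal local simulation of that pattern (the device path here is hypothetical, chosen only for illustration):

```shell
# Simulate GitHub Actions' GITHUB_ENV file with a temp file.
GITHUB_ENV="$(mktemp)"

# Hypothetical device value; in the real workflow this is set by an
# earlier step that picks a free GPU.
DEVICE="/dev/dri/renderD128"

# Same two lines as the workflow: log the value, then persist it for
# later steps by appending KEY=value to the GITHUB_ENV file.
echo "DEVICE: $DEVICE"
echo "DEVICE=$DEVICE" >> "$GITHUB_ENV"

cat "$GITHUB_ENV"
```

GitHub Actions sources this file between steps, which is why the commit switches from two per-index variables (`DEVICE0`, `DEVICE1`) to a single `DEVICE` for the single-GPU job.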
1 change: 0 additions & 1 deletion — Makefile

```diff
@@ -70,7 +70,6 @@ run_rocm_container:
 	docker run \
 	-it \
 	--rm \
-	--pid host \
 	--shm-size 64G \
 	--device /dev/kfd \
 	--device /dev/dri \
```
1 change: 1 addition & 0 deletions — examples/pytorch_bert.yaml

```diff
@@ -11,6 +11,7 @@ experiment_name: pytorch_bert

 launcher:
   device_isolation: true
+  device_isolation_action: warn

 benchmark:
   latency: true
```
7 changes: 4 additions & 3 deletions — examples/pytorch_llama.yaml

```diff
@@ -9,15 +9,16 @@ defaults:

 experiment_name: pytorch_llama

+launcher:
+  device_isolation: true
+  device_isolation_action: warn
+
 backend:
   device: cuda
   device_ids: 0
   no_weights: true
   model: TheBloke/Llama-2-70B-AWQ

-launcher:
-  device_isolation: true
-
 benchmark:
   input_shapes:
     batch_size: 1
```
7 changes: 4 additions & 3 deletions — examples/pytorch_timm.yaml

```diff
@@ -9,14 +9,15 @@ defaults:

 experiment_name: pytorch_timm

+launcher:
+  device_isolation: true
+  device_isolation_action: warn
+
 backend:
   device: cuda
   device_ids: 0
   model: timm/mobilenetv3_large_100.ra_in1k

-launcher:
-  device_isolation: true
-
 benchmark:
   memory: true
   input_shapes:
```
4 changes: 4 additions & 0 deletions — examples/trt_llama.yaml

```diff
@@ -9,6 +9,10 @@ defaults:

 experiment_name: trt_llama

+launcher:
+  device_isolation: true
+  device_isolation_action: warn
+
 backend:
   device: cuda
   device_ids: 0
```
5 changes: 1 addition & 4 deletions — optimum_benchmark/backends/base.py

```diff
@@ -120,11 +120,8 @@ def train(self, **kwargs) -> TrainerState:
         """
         raise NotImplementedError("Backend must implement train method")

-    def delete_pretrained_model(self) -> None:
+    def clean(self) -> None:
         if hasattr(self, "pretrained_model"):
             del self.pretrained_model
-
-    def clean(self) -> None:
         LOGGER.info(f"Cleaning {self.NAME} backend")
-        self.delete_pretrained_model()
         gc.collect()
```
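The base.py change folds the one-line `delete_pretrained_model()` helper into `clean()`, so a single call drops the model attribute (if present) and triggers garbage collection. A minimal, self-contained sketch of the resulting behavior — the class scaffolding and `NAME` value here are stand-ins, not the library's real `Backend` API:

```python
import gc


class Backend:
    # Hypothetical backend name for illustration only.
    NAME = "pytorch"

    def __init__(self):
        # Placeholder standing in for a loaded pretrained model.
        self.pretrained_model = object()

    def clean(self) -> None:
        # Consolidated clean(): the former delete_pretrained_model()
        # body is inlined, then garbage collection is forced.
        if hasattr(self, "pretrained_model"):
            del self.pretrained_model
        gc.collect()


backend = Backend()
backend.clean()
print(hasattr(backend, "pretrained_model"))  # → False
```

Because of the `hasattr` guard, `clean()` stays safe to call repeatedly, even after the model attribute is already gone.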