LazyAGI · wzh1994 · Oct 16, 2024 · Oct 14, 2024 · Oct 14, 2024 · Oct 15, 2024
diff --git a/lazyllm/launcher.py b/lazyllm/launcher.py
@@ -62,7 +62,8 @@ def clone(self):
 lazyllm.config.add('sco.workspace', str, 'your_workspace', 'SCO_WORKSPACE')
 lazyllm.config.add('sco_env_name', str, '', 'SCO_ENV_NAME')
 lazyllm.config.add('sco_keep_record', bool, False, 'SCO_KEEP_RECORD')
-lazyllm.config.add("sco_resource_type", str, "N3lS.Ii.I60", "SCO_RESOURCE_TYPE")
+lazyllm.config.add('sco_resource_type', str, 'N3lS.Ii.I60', 'SCO_RESOURCE_TYPE')
+lazyllm.config.add('cuda_visible', bool, False, 'CUDA_VISIBLE')  # gates GPU pinning in Job._wrap_cmd
 
 
 # store cmd, return message and command output.
@@ -173,6 +174,25 @@ class Job(Job):
         def __init__(self, cmd, launcher, *, sync=True):
             super(__class__, self).__init__(cmd, launcher, sync=sync)
 
+        def _wrap_cmd(self, cmd):
+            """Prefix ``cmd`` with ``CUDA_VISIBLE_DEVICES=<ids> `` (idlest GPUs first), or return it as is.
+
+            Unchanged when ngpus == 0, 'cuda_visible' is off or no GPU is listed; RuntimeError if too few.
+            """
+            ngpus = self.launcher.ngpus
+            if ngpus == 0 or not lazyllm.config['cuda_visible']:
+                return cmd  # skip the nvidia-smi query entirely when pinning is not wanted
+            gpus = self.launcher._get_idle_gpus()
+            if not gpus:
+                return cmd
+            ngpus = 1 if ngpus is None else ngpus  # unspecified ngpus -> the single idlest GPU
+            if ngpus > len(gpus):
+                error_info = (f'Not enough GPUs available. Requested {ngpus} GPUs, '
+                              f'but only {len(gpus)} are available.')
+                LOG.error(error_info)
+                raise RuntimeError(error_info)  # a bare str cannot be raised (TypeError)
+            return 'CUDA_VISIBLE_DEVICES=' + ','.join(str(n) for n in gpus[:ngpus]) + ' ' + cmd
+
         def stop(self):
             if self.ps:
                 try:
@@ -230,6 +250,29 @@ def launch(self, f, *args, **kw):
         else:
             raise RuntimeError('Invalid cmd given, please check the return value of cmd.')
 
+    def _get_idle_gpus(self):
+        """Return nvidia-smi GPU indices sorted by free memory (most first); [] if the query fails."""
+        try:
+            order_list = subprocess.check_output(
+                ['nvidia-smi', '--query-gpu=index,memory.free', '--format=csv,noheader,nounits'],
+                encoding='utf-8'
+            )
+        except Exception as e:
+            LOG.error(f"An error occurred: {e}")
+            return []
+        # Restrict to an inherited CUDA_VISIBLE_DEVICES; non-numeric entries (e.g. GPU UUIDs) are ignored.
+        str_num = os.getenv('CUDA_VISIBLE_DEVICES', None)
+        sub_gpus = {int(x) for x in str_num.split(',') if x.strip().isdigit()} if str_num else None
+        gpu_info = []
+        for line in order_list.splitlines():
+            index, _, memory_free = line.partition(',')
+            if not index.strip().isdigit() or not memory_free.strip().isdigit():
+                continue  # blank or '[N/A]' rows carry no usable index/free-memory pair
+            if sub_gpus is None or int(index) in sub_gpus:
+                gpu_info.append((int(index), int(memory_free)))
+        gpu_info.sort(key=lambda x: x[1], reverse=True)
+        LOG.info('Memory left:\n' + '\n'.join([f'{item[0]} GPU, left: {item[1]} MiB' for item in gpu_info]))
+        return [info[0] for info in gpu_info]
 
 @final
 class SlurmLauncher(LazyLLMLaunchersBase):