Skip to content

Commit

Permalink
EmptyLauncher now supports setting CUDA_VISIBLE_DEVICES, choosing GPUs in order of free memory (#305)
Browse files Browse the repository at this point in the history
  • Loading branch information
JingofXin authored Oct 16, 2024
1 parent bf7a7a7 commit 749fe82
Showing 1 changed file with 44 additions and 1 deletion.
45 changes: 44 additions & 1 deletion lazyllm/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ def clone(self):
# Register launcher-related configuration options.
# Each call passes a key, a value type, a default value and an upper-case name --
# presumably the environment variable that overrides the default; confirm against
# the definition of lazyllm.config.add.
lazyllm.config.add('sco.workspace', str, 'your_workspace', 'SCO_WORKSPACE')
lazyllm.config.add('sco_env_name', str, '', 'SCO_ENV_NAME')
lazyllm.config.add('sco_keep_record', bool, False, 'SCO_KEEP_RECORD')
# NOTE(review): 'sco_resource_type' appears twice below (double- and single-quoted
# spelling). This looks like the removed/added pair of a diff hunk -- confirm that
# only one registration exists in the real file.
lazyllm.config.add("sco_resource_type", str, "N3lS.Ii.I60", "SCO_RESOURCE_TYPE")
lazyllm.config.add('sco_resource_type', str, 'N3lS.Ii.I60', 'SCO_RESOURCE_TYPE')
# Read by _wrap_cmd: a CUDA_VISIBLE_DEVICES prefix is only added when this is truthy.
lazyllm.config.add('cuda_visible', bool, False, 'CUDA_VISIBLE')


# store cmd, return message and command output.
Expand Down Expand Up @@ -173,6 +174,25 @@ class Job(Job):
def __init__(self, cmd, launcher, *, sync=True):
    """Create the job by forwarding ``cmd``, ``launcher`` and ``sync`` to the base class."""
    # Zero-argument super() resolves to the same (class, instance) pair as
    # super(__class__, self).
    super().__init__(cmd, launcher, sync=sync)

def _wrap_cmd(self, cmd):
    """Prefix ``cmd`` with a ``CUDA_VISIBLE_DEVICES=...`` assignment when enabled.

    The command is returned unchanged when the launcher requests zero GPUs,
    when the ``cuda_visible`` config flag is falsy, or when no idle GPU could
    be determined. Otherwise the GPUs reported first by
    ``launcher._get_idle_gpus()`` are selected: one GPU when
    ``launcher.ngpus`` is ``None``, else the first ``launcher.ngpus`` GPUs.

    Args:
        cmd: The shell command string to wrap.

    Returns:
        ``cmd``, optionally prefixed with ``'CUDA_VISIBLE_DEVICES=<ids> '``.

    Raises:
        RuntimeError: If more GPUs are requested than are available.
    """
    ngpus = self.launcher.ngpus
    if ngpus == 0:
        return cmd
    # Check the flag first so nvidia-smi is not spawned when the feature is off.
    if not lazyllm.config['cuda_visible']:
        return cmd
    gpus = self.launcher._get_idle_gpus()
    if not gpus:
        return cmd

    if ngpus is None:
        selected = gpus[:1]
    elif ngpus <= len(gpus):
        selected = gpus[:ngpus]
    else:
        error_info = (f'Not enough GPUs available. Requested {ngpus} GPUs, '
                      f'but only {len(gpus)} are available.')
        LOG.error(error_info)
        # Raising the bare string would itself fail with
        # "TypeError: exceptions must derive from BaseException".
        raise RuntimeError(error_info)

    return 'CUDA_VISIBLE_DEVICES=' + ','.join(str(n) for n in selected) + ' ' + cmd

def stop(self):
if self.ps:
try:
Expand Down Expand Up @@ -230,6 +250,29 @@ def launch(self, f, *args, **kw):
else:
raise RuntimeError('Invalid cmd given, please check the return value of cmd.')

def _get_idle_gpus(self):
    """Return GPU indices ordered by free memory, most free first.

    Queries ``nvidia-smi`` for ``index,memory.free``. When the
    ``CUDA_VISIBLE_DEVICES`` environment variable is set to a list of integer
    indices, only those GPUs are considered.

    Returns:
        A list of integer GPU indices sorted by free memory in descending
        order. The list is empty when ``nvidia-smi`` cannot be run, or when
        ``CUDA_VISIBLE_DEVICES`` contains non-integer entries (in which case
        the inherited value is left for the caller's environment to apply).
    """
    try:
        order_list = subprocess.check_output(
            ['nvidia-smi', '--query-gpu=index,memory.free', '--format=csv,noheader,nounits'],
            encoding='utf-8'
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: binary missing; SubprocessError: non-zero exit;
        # ValueError: output could not be decoded as UTF-8.
        LOG.error(f"An error occurred: {e}")
        return []

    allowed = None  # None means "no restriction"
    str_num = os.getenv('CUDA_VISIBLE_DEVICES', None)
    if str_num:
        try:
            # Skip empty tokens so a trailing comma does not crash int('').
            allowed = {int(x) for x in str_num.split(',') if x.strip()}
        except ValueError:
            # Entries such as GPU UUIDs cannot be matched against the indices
            # nvidia-smi reports; do not guess.
            LOG.error(f"An error occurred: unsupported CUDA_VISIBLE_DEVICES value {str_num!r}")
            return []

    gpu_info = []
    for line in order_list.splitlines():
        fields = [field.strip() for field in line.split(',')]
        if len(fields) != 2:
            continue  # blank or unexpected line
        try:
            index, memory_free = int(fields[0]), int(fields[1])
        except ValueError:
            continue  # non-numeric value such as "[N/A]"
        if allowed is None or index in allowed:
            gpu_info.append((index, memory_free))

    gpu_info.sort(key=lambda x: x[1], reverse=True)
    LOG.info('Memory left:\n' + '\n'.join([f'{item[0]} GPU, left: {item[1]} MiB' for item in gpu_info]))
    return [info[0] for info in gpu_info]

@final
class SlurmLauncher(LazyLLMLaunchersBase):
Expand Down

0 comments on commit 749fe82

Please sign in to comment.