When trying to use OPT-66B without bnb-int8 on the new 10-GPU BRTX configuration, specifically when running the example query in the demo, I get the following error on the backend:
sandle-backend-hf-1 | [2022-12-15 19:27:44,621] [ERROR] [waitress] Exception while serving /v1/completions
sandle-backend-hf-1 | Traceback (most recent call last):
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/waitress/channel.py", line 428, in service
sandle-backend-hf-1 | task.service()
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/waitress/task.py", line 168, in service
sandle-backend-hf-1 | self.execute()
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/waitress/task.py", line 456, in execute
sandle-backend-hf-1 | for chunk in app_iter:
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/sentry_sdk/integrations/wsgi.py", line 270, in __iter__
sandle-backend-hf-1 | reraise(*_capture_exception(self._hub))
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/sentry_sdk/_compat.py", line 57, in reraise
sandle-backend-hf-1 | raise value
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/sentry_sdk/integrations/wsgi.py", line 266, in __iter__
sandle-backend-hf-1 | chunk = next(iterator)
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/werkzeug/wsgi.py", line 500, in __next__
sandle-backend-hf-1 | return self._next()
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/werkzeug/wrappers/response.py", line 50, in _iter_encoded
sandle-backend-hf-1 | for item in iterable:
sandle-backend-hf-1 | File "/opt/sandle/serve-backend-hf.py", line 428, in <genexpr>
sandle-backend-hf-1 | (f'data: {event_data}\n\n' for event_data in chain(
sandle-backend-hf-1 | File "/opt/sandle/serve-backend-hf.py", line 429, in <genexpr>
sandle-backend-hf-1 | (
sandle-backend-hf-1 | File "/opt/sandle/serve-backend-hf.py", line 182, in <genexpr>
sandle-backend-hf-1 | return (
sandle-backend-hf-1 | File "/opt/sandle/serve-backend-hf.py", line 184, in <genexpr>
sandle-backend-hf-1 | for c in (
sandle-backend-hf-1 | File "/opt/sandle/serve-backend-hf.py", line 207, in _stream_complete_single
sandle-backend-hf-1 | [raw_completion] = self._complete(
sandle-backend-hf-1 | File "/opt/sandle/serve-backend-hf.py", line 247, in _complete
sandle-backend-hf-1 | output_token_ids = cast(torch.Tensor, model.generate(
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
sandle-backend-hf-1 | return func(*args, **kwargs)
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/transformers/generation_utils.py", line 1543, in generate
sandle-backend-hf-1 | return self.sample(
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/transformers/generation_utils.py", line 2482, in sample
sandle-backend-hf-1 | outputs = self(
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
sandle-backend-hf-1 | return forward_call(*input, **kwargs)
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/accelerate/hooks.py", line 156, in new_forward
sandle-backend-hf-1 | output = old_forward(*args, **kwargs)
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/transformers/models/opt/modeling_opt.py", line 929, in forward
sandle-backend-hf-1 | outputs = self.model.decoder(
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
sandle-backend-hf-1 | return forward_call(*input, **kwargs)
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/transformers/models/opt/modeling_opt.py", line 693, in forward
sandle-backend-hf-1 | layer_outputs = decoder_layer(
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
sandle-backend-hf-1 | return forward_call(*input, **kwargs)
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/accelerate/hooks.py", line 151, in new_forward
sandle-backend-hf-1 | args, kwargs = module._hf_hook.pre_forward(module, *args, **kwargs)
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/accelerate/hooks.py", line 266, in pre_forward
sandle-backend-hf-1 | return send_to_device(args, self.execution_device), send_to_device(kwargs, self.execution_device)
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/accelerate/utils/operations.py", line 131, in send_to_device
sandle-backend-hf-1 | return recursively_apply(_send_to_device, tensor, device, non_blocking, test_type=_has_to_method)
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/accelerate/utils/operations.py", line 91, in recursively_apply
sandle-backend-hf-1 | {
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/accelerate/utils/operations.py", line 92, in <dictcomp>
sandle-backend-hf-1 | k: recursively_apply(
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/accelerate/utils/operations.py", line 99, in recursively_apply
sandle-backend-hf-1 | return func(data, *args, **kwargs)
sandle-backend-hf-1 | File "/opt/anaconda3/lib/python3.9/site-packages/accelerate/utils/operations.py", line 124, in _send_to_device
sandle-backend-hf-1 | return t.to(device, non_blocking=non_blocking)
sandle-backend-hf-1 | RuntimeError: CUDA error: peer mapping resources exhausted
sandle-backend-hf-1 | CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
sandle-backend-hf-1 | For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
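The error surfaces inside accelerate's pre_forward hook, which moves activations between GPUs as generation crosses device boundaries. Before assuming the 8-GPU peer-to-peer limit discussed below, it may be worth probing which device pairs report P2P capability. Here is a minimal diagnostic sketch (not part of sandle; it assumes only that PyTorch with CUDA is available):

# check_p2p.py: probe CUDA peer-to-peer (P2P) capability between all
# visible GPU pairs. Note: can_device_access_peer reports whether P2P is
# *possible* for a pair, not whether enabling it will succeed; the
# "peer mapping resources exhausted" error typically means a device tried
# to enable more peer connections than the hardware limit allows.
import torch

n = torch.cuda.device_count()
print(f"visible GPUs: {n}")
for i in range(n):
    peers = [j for j in range(n)
             if j != i and torch.cuda.can_device_access_peer(i, j)]
    print(f"GPU {i}: P2P-capable peers = {peers} ({len(peers)} total)")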
This discussion from 2016, if still applicable, suggests the cause may be a CUDA limitation: no more than 8 GPUs can participate in a single "peer-to-peer ensemble": https://forums.developer.nvidia.com/t/cuda-peer-resources-error-when-running-on-more-than-8-k80s-aws-p2-16xlarge/45351/3
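If that limit still applies, a possible workaround is to shard the model across at most 8 GPUs, so no device needs more than 7 peer connections. A hedged sketch, assuming the backend loads the model via transformers with device_map="auto" (which the accelerate hooks in the traceback suggest); the model id is the public facebook/opt-66b checkpoint and the device list is illustrative:

# Hide GPUs 8-9 before any CUDA initialization, so accelerate only
# shards the model across 8 devices. This must run before torch or
# transformers touch CUDA.
import os
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7")

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-66b",
    device_map="auto",    # accelerate places layers on the 8 visible GPUs
    torch_dtype="auto",   # use the checkpoint's native dtype (fp16 for OPT)
)

Alternatively, passing a max_memory dict that only lists 8 devices (e.g. max_memory={i: "40GiB" for i in range(8)}, with the per-GPU budget tuned to the actual cards) should restrict placement to those devices without touching the environment.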