-
Notifications
You must be signed in to change notification settings - Fork 165
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement GPU blocks
property
#2253
Changes from 1 commit
6ae3ec0
3342a34
fa57f44
5d5434b
8b555f7
a6dcde2
16de87d
73f4ba7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
from enum import Enum | ||
from typing import List, Optional | ||
from typing import List, Literal, Optional, Union | ||
|
||
import gpuhunt | ||
from pydantic import root_validator | ||
|
@@ -110,6 +110,11 @@ def get_public_keys(self) -> List[str]: | |
return [ssh_key.public.strip() for ssh_key in self.ssh_keys] | ||
|
||
|
||
class InstanceSharedInfo(CoreModel): | ||
# Describes how an instance is split into blocks; serialized with .json() into instance.shared_info. | ||
# total_blocks is either an explicit block count or the literal "auto". | ||
# The instance-processing code shown in this diff resolves "auto" to the instance's GPU count (len(resources.gpus)). | ||
total_blocks: Union[Literal["auto"], int] | ||
# Presumably the number of blocks currently occupied by jobs; defaults to 0 -- TODO confirm against callers. | ||
busy_blocks: int = 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not sure about storing […]. Maybe using JSON is more flexible if we take a completely different approach instead of relying on […]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Replaced with two integer columns in 8b555f7. |
||
|
||
|
||
class InstanceRuntime(Enum): | ||
SHIM = "shim" | ||
RUNNER = "runner" | ||
|
@@ -143,6 +148,11 @@ class InstanceOfferWithAvailability(InstanceOffer): | |
instance_runtime: InstanceRuntime = InstanceRuntime.SHIM | ||
|
||
|
||
class InstanceSharedOffer(InstanceOfferWithAvailability): | ||
# An instance offer extended with block counts. | ||
# Presumably `blocks` is the number of blocks this offer covers out of `total_blocks` -- TODO confirm. | ||
blocks: int | ||
total_blocks: int | ||
|
||
|
||
class InstanceStatus(str, Enum): | ||
PENDING = "pending" | ||
PROVISIONING = "provisioning" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
from datetime import datetime, timedelta | ||
from enum import Enum | ||
from typing import Any, Dict, List, Optional, Type | ||
from typing import Any, Dict, List, Optional, Type, Union | ||
|
||
from pydantic import UUID4, Field, root_validator | ||
from typing_extensions import Annotated | ||
|
@@ -13,6 +13,7 @@ | |
) | ||
from dstack._internal.core.models.instances import ( | ||
InstanceOfferWithAvailability, | ||
InstanceSharedOffer, | ||
InstanceType, | ||
SSHConnectionParams, | ||
) | ||
|
@@ -239,6 +240,10 @@ class JobRuntimeData(CoreModel): | |
# None if data is not yet available (on vm-based backends and ssh instances) | ||
# or not applicable (container-based backends) | ||
ports: Optional[dict[int, int]] = None | ||
# List of volumes used by the job | ||
volume_names: Optional[list[str]] = None # None for backward compatibility | ||
# Virtual shared offer. None if the instance is not shared. | ||
offer: Optional[InstanceSharedOffer] = None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe it's more useful to always store the offer used by the job, not just when it was shared? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Changed in 8b555f7. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think it's crucial now to add a description of JobRuntimeData and what should go there so that it does not become a catch-all struct for everything. It probably makes sense to add a similar description to JobProvisioningData to differentiate it from JobRuntimeData. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
|
||
|
||
class ClusterInfo(CoreModel): | ||
|
@@ -416,7 +421,7 @@ def _error(cls, values) -> Dict: | |
|
||
class JobPlan(CoreModel): | ||
job_spec: JobSpec | ||
offers: List[InstanceOfferWithAvailability] | ||
offers: List[Union[InstanceSharedOffer, InstanceOfferWithAvailability]] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we really need to introduce different types of offers to the API? Can't every InstanceOfferWithAvailability be InstanceSharedOffer with total_blocks/blocks = 1? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
total_offers: int | ||
max_price: Optional[float] | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -86,6 +86,7 @@ | |
get_instance_profile, | ||
get_instance_provisioning_data, | ||
get_instance_requirements, | ||
get_instance_shared_info, | ||
) | ||
from dstack._internal.server.services.runner import client as runner_client | ||
from dstack._internal.server.services.runner.client import HealthStatus | ||
|
@@ -133,7 +134,7 @@ async def _process_next_instance(): | |
), | ||
InstanceModel.id.not_in(lockset), | ||
) | ||
.options(lazyload(InstanceModel.job)) | ||
.options(lazyload(InstanceModel.jobs)) | ||
.order_by(InstanceModel.last_processed_at.asc()) | ||
.limit(1) | ||
.with_for_update(skip_locked=True) | ||
|
@@ -156,15 +157,15 @@ async def _process_instance(session: AsyncSession, instance: InstanceModel): | |
select(InstanceModel) | ||
.where(InstanceModel.id == instance.id) | ||
.options(joinedload(InstanceModel.project).joinedload(ProjectModel.backends)) | ||
.options(joinedload(InstanceModel.job)) | ||
.options(joinedload(InstanceModel.jobs)) | ||
.options(joinedload(InstanceModel.fleet).joinedload(FleetModel.instances)) | ||
.execution_options(populate_existing=True) | ||
) | ||
instance = res.unique().scalar_one() | ||
if ( | ||
instance.status == InstanceStatus.IDLE | ||
and instance.termination_policy == TerminationPolicy.DESTROY_AFTER_IDLE | ||
and instance.job_id is None | ||
and not instance.jobs | ||
): | ||
await _mark_terminating_if_idle_duration_expired(instance) | ||
if instance.status == InstanceStatus.PENDING: | ||
|
@@ -322,6 +323,30 @@ async def _add_remote(instance: InstanceModel) -> None: | |
) | ||
return | ||
|
||
shared_info = get_instance_shared_info(instance) | ||
if shared_info is not None: | ||
resources = instance_type.resources | ||
blocks = shared_info.total_blocks | ||
if blocks == "auto": | ||
blocks = len(resources.gpus) | ||
if blocks > 1: | ||
if len(resources.gpus) % blocks or resources.cpus % blocks: | ||
instance.status = InstanceStatus.TERMINATED | ||
instance.termination_reason = "Cannot split into blocks" | ||
logger.warning( | ||
"Failed to add instance %s: cannot split into blocks", | ||
instance.name, | ||
extra={ | ||
"instance_name": instance.name, | ||
"instance_status": InstanceStatus.TERMINATED.value, | ||
}, | ||
) | ||
return | ||
shared_info.total_blocks = blocks | ||
instance.shared_info = shared_info.json() | ||
else: | ||
instance.shared_info = None | ||
|
||
region = instance.region | ||
jpd = JobProvisioningData( | ||
backend=BackendType.REMOTE, | ||
|
@@ -439,10 +464,11 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No | |
instance_configuration = get_instance_configuration(instance) | ||
profile = get_instance_profile(instance) | ||
requirements = get_instance_requirements(instance) | ||
shared_info = get_instance_shared_info(instance) | ||
except ValidationError as e: | ||
instance.status = InstanceStatus.TERMINATED | ||
instance.termination_reason = ( | ||
f"Error to parse profile, requirements or instance_configuration: {e}" | ||
f"Error to parse profile, requirements, shared_info or instance_configuration: {e}" | ||
) | ||
instance.last_retry_at = get_current_datetime() | ||
logger.warning( | ||
|
@@ -473,12 +499,18 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No | |
) | ||
return | ||
|
||
if shared_info is None: | ||
blocks = None | ||
else: | ||
blocks = shared_info.total_blocks | ||
|
||
offers = await get_create_instance_offers( | ||
project=instance.project, | ||
profile=profile, | ||
requirements=requirements, | ||
exclude_not_available=True, | ||
fleet_model=instance.fleet, | ||
blocks=blocks, | ||
) | ||
|
||
if not offers and should_retry: | ||
|
@@ -557,6 +589,18 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No | |
instance.started_at = get_current_datetime() | ||
instance.last_retry_at = get_current_datetime() | ||
|
||
if shared_info is not None: | ||
if blocks == "auto": | ||
blocks = len(instance_offer.instance.resources.gpus) | ||
if blocks > 1: | ||
shared_info.total_blocks = blocks | ||
else: | ||
shared_info = None | ||
if shared_info is not None: | ||
instance.shared_info = shared_info.json() | ||
else: | ||
instance.shared_info = None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Replaced with NULL -> calculated auto blocks in 8b555f7. NULL as "info is not yet available" seems to fit there. |
||
|
||
logger.info( | ||
"Created instance %s", | ||
instance.name, | ||
|
@@ -585,8 +629,8 @@ async def _create_instance(session: AsyncSession, instance: InstanceModel) -> No | |
async def _check_instance(instance: InstanceModel) -> None: | ||
if ( | ||
instance.status == InstanceStatus.BUSY | ||
and instance.job is not None | ||
and instance.job.status.is_finished() | ||
and instance.jobs | ||
and all(job.status.is_finished() for job in instance.jobs) | ||
): | ||
# A busy instance could have no active jobs due to this bug: https://github.com/dstackai/dstack/issues/2068 | ||
instance.status = InstanceStatus.TERMINATING | ||
|
@@ -648,9 +692,7 @@ async def _check_instance(instance: InstanceModel) -> None: | |
instance.unreachable = False | ||
|
||
if instance.status == InstanceStatus.PROVISIONING: | ||
instance.status = ( | ||
InstanceStatus.IDLE if instance.job_id is None else InstanceStatus.BUSY | ||
) | ||
instance.status = InstanceStatus.IDLE if not instance.jobs else InstanceStatus.BUSY | ||
logger.info( | ||
"Instance %s has switched to %s status", | ||
instance.name, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe elaborate what
auto
means? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
5d5434b