Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/dstack/_internal/core/models/instances.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ def finished_statuses(cls) -> List["InstanceStatus"]:


class InstanceTerminationReason(str, Enum):
TERMINATED_BY_USER = "terminated_by_user"
IDLE_TIMEOUT = "idle_timeout"
PROVISIONING_TIMEOUT = "provisioning_timeout"
ERROR = "error"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
)
from dstack._internal.server.services import backends as backends_services
from dstack._internal.server.services.compute_groups import compute_group_model_to_compute_group
from dstack._internal.server.services.instances import switch_instance_status
from dstack._internal.server.services.locking import get_locker
from dstack._internal.server.utils import sentry_utils
from dstack._internal.utils.common import get_current_datetime, run_async
Expand Down Expand Up @@ -83,12 +84,14 @@ async def _process_compute_group(session: AsyncSession, compute_group_model: Com
)
compute_group_model = res.unique().scalar_one()
if all(i.status == InstanceStatus.TERMINATING for i in compute_group_model.instances):
await _terminate_compute_group(compute_group_model)
await _terminate_compute_group(session, compute_group_model)
compute_group_model.last_processed_at = get_current_datetime()
await session.commit()


async def _terminate_compute_group(compute_group_model: ComputeGroupModel) -> None:
async def _terminate_compute_group(
session: AsyncSession, compute_group_model: ComputeGroupModel
) -> None:
if (
compute_group_model.last_termination_retry_at is not None
and _next_termination_retry_at(compute_group_model) > get_current_datetime()
Expand Down Expand Up @@ -147,7 +150,7 @@ async def _terminate_compute_group(compute_group_model: ComputeGroupModel) -> No
instance_model.deleted = True
instance_model.deleted_at = get_current_datetime()
instance_model.finished_at = get_current_datetime()
instance_model.status = InstanceStatus.TERMINATED
switch_instance_status(session, instance_model, InstanceStatus.TERMINATED)
logger.info(
"Terminated compute group %s",
compute_group.name,
Expand Down
11 changes: 3 additions & 8 deletions src/dstack/_internal/server/background/tasks/process_fleets.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
is_fleet_in_use,
switch_fleet_status,
)
from dstack._internal.server.services.instances import format_instance_status_for_event
from dstack._internal.server.services.instances import switch_instance_status
from dstack._internal.server.services.locking import get_locker
from dstack._internal.server.utils import sentry_utils
from dstack._internal.utils.common import get_current_datetime
Expand Down Expand Up @@ -219,15 +219,10 @@ def _maintain_fleet_nodes_in_min_max_range(
if nodes_redundant == 0:
break
if instance.status in [InstanceStatus.IDLE]:
instance.status = InstanceStatus.TERMINATING
instance.termination_reason = InstanceTerminationReason.MAX_INSTANCES_LIMIT
instance.termination_reason_message = "Fleet has too many instances"
switch_instance_status(session, instance, InstanceStatus.TERMINATING)
nodes_redundant -= 1
logger.info(
"Terminating instance %s: %s",
instance.name,
instance.termination_reason,
)
return True
nodes_missing = fleet_spec.configuration.nodes.min - active_instances_num
for i in range(nodes_missing):
Expand All @@ -243,7 +238,7 @@ def _maintain_fleet_nodes_in_min_max_range(
session,
(
"Instance created to meet target fleet node count."
f" Status: {format_instance_status_for_event(instance_model)}"
f" Status: {instance_model.status.upper()}"
),
actor=events.SystemActor(),
targets=[events.Target.from_model(instance_model)],
Expand Down
Loading