Skip to content

Commit

Permalink
Failed workshop items rerun (#1980)
Browse files Browse the repository at this point in the history
* Adding .vscode to ignore list

* Adding failed provisions rerun

* Adding fail threshold parameter

* Adding autoDetach to resourceClaims
  • Loading branch information
makirill authored Jun 27, 2024
1 parent 4b977af commit 2df4609
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ notifier/values.yaml
*.key
*.swp
__pycache__
.vscode
2 changes: 2 additions & 0 deletions workshop-manager/operator/babylon.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ class Babylon():
salesforce_id_annotation = f"{demo_domain}/salesforce-id"
user_name_label = f"{babylon_domain}/user-name"

workshop_fail_percentage_threshold = int(os.environ.get('WORKSHOP_FAIL_PERCENTAGE_THRESHOLD', 60))

@classmethod
async def on_cleanup(cls):
await cls.core_v1_api.api_client.close()
Expand Down
29 changes: 29 additions & 0 deletions workshop-manager/operator/resourceclaim.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,39 @@ def provision_complete(self):
if not state:
return False
if state['kind'] == 'AnarchySubject':
# Anarchy Governor does not yet set completeTimestamp for failed states,
# so also check current_state: a failed, errored, or cancelled provision
# is treated as a completed provision.
# TODO: Remove this check once Anarchy Governor sets completeTimestamp for failed states (GPTEINFRA-10007)
current_state = state.get('spec', {}).get('vars', {}).get('current_state')
if current_state is not None and (
current_state.endswith('-failed') or
current_state in ["provision-error", "provision-cancelled"]
):
return True

if not state.get('status', {}).get('towerJobs', {}).get('provision', {}).get('completeTimestamp'):
return False

return True

@property
def is_failed(self):
    """Report whether any AnarchySubject resource of this claim has failed.

    Reads ``self.definition['status']['resources']`` and inspects
    ``spec.vars.current_state`` of every resource state whose kind is
    ``AnarchySubject``.

    Returns:
        bool: True if at least one such resource has a ``current_state``
        that ends with ``-failed`` or equals ``provision-error`` or
        ``provision-cancelled``. False otherwise, including when the claim
        has no status or no resources.
    """
    status = self.definition.get('status') or {}
    for resource in status.get('resources') or []:
        state = resource.get('state')
        if not state:
            # No state reported for this resource yet. Keep scanning instead
            # of returning, so a failure in a later resource is still seen.
            continue
        if state.get('kind') != 'AnarchySubject':
            continue
        # `or {}` guards against spec/vars being present but null.
        state_vars = (state.get('spec') or {}).get('vars') or {}
        current_state = state_vars.get('current_state')
        if current_state is None:
            continue
        if (
            current_state.endswith('-failed')
            or current_state in ("provision-error", "provision-cancelled")
        ):
            return True
    return False

@property
def resource_handle_name(self):
    """Return the name recorded under ``status.resourceHandle``, if any.

    Returns:
        The value of ``self.status['resourceHandle']['name']``, or None when
        ``resourceHandle`` is absent, null, or has no ``name`` key.
    """
    # `or {}` also covers a `resourceHandle` key that is present but null,
    # which `.get('resourceHandle', {})` would hand back as None.
    handle = self.status.get('resourceHandle') or {}
    return handle.get('name')
Expand Down
18 changes: 17 additions & 1 deletion workshop-manager/operator/workshopprovision.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ def action_schedule_stop(self):
return datetime.strptime(
stop_timestamp, '%Y-%m-%dT%H:%M:%SZ'
).replace(tzinfo=timezone.utc)

@property
def auto_detach_condition(self):
    """Return the ``when`` condition configured under ``spec.autoDetach``.

    Returns:
        The value of ``self.spec['autoDetach']['when']``, or None when
        ``autoDetach`` is absent, null, or has no ``when`` key.
    """
    # `or {}` also covers an explicit null `autoDetach`, which
    # `.get('autoDetach', {})` would pass through as None and then crash on.
    auto_detach = self.spec.get('autoDetach') or {}
    return auto_detach.get('when')

@property
def catalog_item_name(self):
Expand Down Expand Up @@ -140,6 +144,9 @@ async def create_resource_claim(self, logger, workshop):
}
}

if self.auto_detach_condition:
resource_claim_definition['spec']['autoDetach'] = {"when": self.auto_detach_condition}

if workshop.requester:
resource_claim_definition['metadata']['annotations'][Babylon.requester_annotation] = workshop.requester

Expand Down Expand Up @@ -282,6 +289,7 @@ async def manage_resource_claims(self, logger, workshop):

resource_claim_count = 0
provisioning_count = 0
failed_count = 0

async for resource_claim in self.list_resource_claims():
resource_claim_count += 1
Expand All @@ -294,12 +302,20 @@ async def manage_resource_claims(self, logger, workshop):
if not resource_claim.provision_complete:
provisioning_count += 1

if resource_claim.is_failed:
failed_count += 1

# Do not start any provisions if lifespan start is in the future
if self.lifespan_start and self.lifespan_start > datetime.now(timezone.utc):
return

# Do not start any provisions once the failed percentage reaches or exceeds the threshold
if self.count != 0:
if Babylon.workshop_fail_percentage_threshold <= failed_count / self.count * 100:
return

# Start provisions up to count, plus one replacement per failed claim, and within concurrency limit
if resource_claim_count < self.count and provisioning_count < self.concurrency:
if resource_claim_count < (self.count + failed_count) and provisioning_count < self.concurrency:
await self.create_resource_claim(logger=logger, workshop=workshop)

async def set_owner_references(self, logger):
Expand Down

0 comments on commit 2df4609

Please sign in to comment.