Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 47 additions & 2 deletions mod_ci/controllers.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,14 +131,51 @@ def safe_db_commit(db, operation_description: str = "database operation") -> boo
"The test will be retried automatically when resources become available."
),
'QUOTA_EXCEEDED': (
"GCP quota limit reached. Please wait for other tests to complete "
"or contact the administrator."
"GCP quota limit reached. "
"The test will be retried automatically when resources become available."
),
'RESOURCE_NOT_FOUND': "Required GCP resource not found. Please contact the administrator.",
'RESOURCE_ALREADY_EXISTS': "A VM with this name already exists. Please contact the administrator.",
'TIMEOUT': "GCP operation timed out. The test will be retried automatically.",
}

# GCP error codes that are transient and should be retried automatically.
# Tests encountering these errors will remain pending and be picked up on the next cron run.
GCP_RETRYABLE_ERRORS = {
'ZONE_RESOURCE_POOL_EXHAUSTED',
'QUOTA_EXCEEDED',
}


def get_gcp_error_code(result: Dict) -> Optional[str]:
"""
Extract the error code from a GCP API error response.

:param result: The GCP API response dictionary
:return: The error code string, or None if not found
"""
if not isinstance(result, dict):
return None
error = result.get('error')
if not isinstance(error, dict):
return None
errors = error.get('errors', [])
if not errors or not isinstance(errors, list):
return None
first_error = errors[0] if len(errors) > 0 else {}
return first_error.get('code')


def is_retryable_gcp_error(result: Dict) -> bool:
"""
Check if a GCP error is transient and should be retried.

:param result: The GCP API response dictionary
:return: True if the error is retryable, False otherwise
"""
error_code = get_gcp_error_code(result)
return error_code in GCP_RETRYABLE_ERRORS


def parse_gcp_error(result: Dict, log=None) -> str:
"""
Expand Down Expand Up @@ -929,6 +966,10 @@ def start_test(compute, app, db, repository: Repository.Repository, test, bot_to
# Check if the create_instance call itself returned an error (synchronous failure)
if 'error' in operation:
error_msg = parse_gcp_error(operation)
if is_retryable_gcp_error(operation):
# Transient error - leave test pending for retry on next cron run
log.warning(f"Test {test.id}: VM creation hit retryable error, will retry: {error_msg}")
return
log.error(f"Error creating test instance for test {test.id}, result: {operation}")
mark_test_failed(db, test, repository, error_msg)
return
Expand All @@ -947,6 +988,10 @@ def start_test(compute, app, db, repository: Repository.Repository, test, bot_to
error_msg = parse_gcp_error(result)
log.error(f"Test {test.id}: VM creation failed: {error_msg}")
log.error(f"Test {test.id}: Full GCP response: {result}")
if is_retryable_gcp_error(result):
# Transient error - leave test pending for retry on next cron run
log.info(f"Test {test.id}: Error is retryable, will retry on next cron run")
return
mark_test_failed(db, test, repository, error_msg)
return

Expand Down
50 changes: 46 additions & 4 deletions tests/test_ci/test_controllers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3901,14 +3901,14 @@ def _setup_start_test_mocks(self, mock_g, mock_gcp_instance, mock_test_progress,
@mock.patch('mod_ci.controllers.g')
@mock.patch('mod_ci.controllers.TestProgress')
@mock.patch('mod_ci.controllers.GcpInstance')
def test_start_test_quota_exceeded_no_db_record(
def test_start_test_quota_exceeded_retries_instead_of_failing(
self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file,
mock_zipfile, mock_requests_get, mock_find_artifact,
mock_create_instance, mock_wait_for_operation, mock_mark_failed, mock_log):
"""Test that QUOTA_EXCEEDED error prevents gcp_instance record creation.
"""Test that QUOTA_EXCEEDED error leaves test pending for retry.

This is the exact scenario from test #7768: GCP operation returns
successfully but completes with QUOTA_EXCEEDED error.
QUOTA_EXCEEDED is a transient error - the test should remain pending
and be retried on the next cron run, not marked as failed.
"""
from mod_ci.controllers import start_test

Expand All @@ -3925,7 +3925,49 @@ def test_start_test_quota_exceeded_no_db_record(

start_test(MagicMock(), self.app, mock_g.db, MagicMock(), Test.query.first(), "token")

# QUOTA_EXCEEDED is retryable, so mark_test_failed should NOT be called
mock_mark_failed.assert_not_called()
# No GcpInstance record should be created
mock_g.db.add.assert_not_called()

@mock.patch('run.log')
@mock.patch('mod_ci.controllers.mark_test_failed')
@mock.patch('mod_ci.controllers.wait_for_operation')
@mock.patch('mod_ci.controllers.create_instance')
@mock.patch('mod_ci.controllers.find_artifact_for_commit')
@mock.patch('mod_ci.controllers.requests.get')
@mock.patch('mod_ci.controllers.zipfile.ZipFile')
@mock.patch('builtins.open', new_callable=mock.mock_open())
@mock.patch('mod_ci.controllers.g')
@mock.patch('mod_ci.controllers.TestProgress')
@mock.patch('mod_ci.controllers.GcpInstance')
def test_start_test_non_retryable_error_marks_failed(
self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file,
mock_zipfile, mock_requests_get, mock_find_artifact,
mock_create_instance, mock_wait_for_operation, mock_mark_failed, mock_log):
"""Test that non-retryable errors (like RESOURCE_NOT_FOUND) mark test as failed.

Only transient errors like QUOTA_EXCEEDED should be retried. Permanent
errors like RESOURCE_NOT_FOUND should immediately mark the test as failed.
"""
from mod_ci.controllers import start_test

self._setup_start_test_mocks(mock_g, mock_gcp_instance, mock_test_progress,
mock_find_artifact, mock_requests_get, mock_zipfile)

# VM creation returns operation ID, but wait_for_operation returns non-retryable error
mock_create_instance.return_value = {'name': 'op-123', 'status': 'RUNNING'}
mock_wait_for_operation.return_value = {
'status': 'DONE',
'error': {'errors': [{'code': 'RESOURCE_NOT_FOUND', 'message': 'Resource not found'}]},
'httpErrorStatusCode': 404
}

start_test(MagicMock(), self.app, mock_g.db, MagicMock(), Test.query.first(), "token")

# RESOURCE_NOT_FOUND is NOT retryable, so mark_test_failed SHOULD be called
mock_mark_failed.assert_called_once()
# No GcpInstance record should be created
mock_g.db.add.assert_not_called()

@mock.patch('run.log')
Expand Down