Skip to content

Commit e67b4fd

Browse files
authored
Merge pull request #18 from kernelci/fix-vm-boot
Fix container boot, send container log as failure if VM failed to start
2 parents db04535 + 079e96d commit e67b4fd

3 files changed

Lines changed: 82 additions & 7 deletions

File tree

dockerfiles/aws/test.dockerfile

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,12 @@ RUN pip install --no-cache-dir boto3 && \
1010

1111
WORKDIR /app
1212

13-
# Copy scripts
14-
COPY src/kernel_ci_cloud_labs/launch_vm.py /app/launch_vm.py
13+
# Copy scripts (with minimal package skeleton so launch_vm.py's
14+
# `from kernel_ci_cloud_labs.core.log_scrub import scrub_text` resolves)
15+
COPY src/kernel_ci_cloud_labs/__init__.py /app/kernel_ci_cloud_labs/__init__.py
16+
COPY src/kernel_ci_cloud_labs/core/__init__.py /app/kernel_ci_cloud_labs/core/__init__.py
17+
COPY src/kernel_ci_cloud_labs/core/log_scrub.py /app/kernel_ci_cloud_labs/core/log_scrub.py
18+
COPY src/kernel_ci_cloud_labs/launch_vm.py /app/launch_vm.py
1519
COPY src/kernel_ci_cloud_labs/debug_aws_setup.py /app/debug_aws_setup.py
1620

1721
# Run debug check (ignore exit code), then launch VMs

src/kernel_ci_cloud_labs/core/pipeline.py

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,14 @@ def _log_vm_output_excerpts(s3_client, bucket, run_prefix, failed_tests):
177177
pass
178178

179179

180-
def create_summary(run_dir, start_time, task_arn, expected_vm_count=None, s3_context=None):
180+
def create_summary(
181+
run_dir,
182+
start_time,
183+
task_arn,
184+
expected_vm_count=None,
185+
s3_context=None,
186+
container_failure_log_url=None,
187+
):
181188
"""Create summary.json with VM statistics.
182189
183190
Args:
@@ -186,6 +193,10 @@ def create_summary(run_dir, start_time, task_arn, expected_vm_count=None, s3_con
186193
task_arn: ARN of the ECS task that ran the tests.
187194
expected_vm_count: Number of VMs expected to spawn (None if unknown).
188195
s3_context: Optional dict with 'bucket', 'run_prefix', 'region', 's3_client' for debug output.
196+
container_failure_log_url: Public URL of the ECS container's own log when
197+
it exited non-zero before launching any VM. The KCIDB submitter
198+
uses this as ``tests[*].log_url`` for the synthetic Infrastructure
199+
row so users have a link to the actual failure reason.
189200
"""
190201
end_time = time.time()
191202
total_runtime = end_time - start_time
@@ -230,6 +241,7 @@ def create_summary(run_dir, start_time, task_arn, expected_vm_count=None, s3_con
230241
# each tests[*] row.
231242
"instances": vm_stats["instances"],
232243
},
244+
"container_failure_log_url": container_failure_log_url,
233245
}
234246

235247
summary_file = Path(run_dir) / "summary.json"
@@ -459,6 +471,15 @@ def run_pipeline(
459471
logger.error("Task did not complete successfully: %s", e)
460472
raise
461473

474+
# A non-zero container exit means launch_vm.py died before SSM ever
475+
# ran on a VM — the /ec2/.../<run_prefix> log group will never appear,
476+
# so we shorten the VM-log wait below and surface container.log as
477+
# the failure URL instead of an absent kernel log.
478+
container_failed = bool(final_status) and any(
479+
(c.get("exit_code") or 0) != 0
480+
for c in (final_status.get("containers") or [])
481+
)
482+
462483
logger.info("-" * 60)
463484

464485
# Refresh CloudWatch client — credentials may have expired during the wait
@@ -496,7 +517,15 @@ def run_pipeline(
496517

497518
# Wait for VM logs to appear (CloudWatch agent ships in batches
498519
# after the VM shuts down — give it up to 5 min to surface).
499-
max_retries = 10
520+
# When the container itself failed, no VM was ever launched, so
521+
# the log group can't appear — skip straight to the single probe.
522+
if container_failed:
523+
logger.info(
524+
"Container exited non-zero; skipping extended VM-log wait"
525+
)
526+
max_retries = 1
527+
else:
528+
max_retries = 10
500529
retry_delay = 30
501530

502531
for attempt in range(max_retries):
@@ -581,6 +610,7 @@ def run_pipeline(
581610
# and emit artifacts.json — the manifest the KCIDB submitter
582611
# consumes to populate tests[*].log_url. Failures here are
583612
# non-fatal: the test results in S3 remain the source of truth.
613+
container_failure_log_url = None
584614
try:
585615
logger.info("\n=== Collecting boot logs & artifacts manifest ===")
586616
s3_client = provider.auth.get_client("s3")
@@ -596,6 +626,32 @@ def run_pipeline(
596626
run_prefix=run_prefix,
597627
origin=origin,
598628
)
629+
630+
# Container died before any VM booted -> there is no kernel log.
631+
# Publish the container's own log as the failure URL so KCIDB
632+
# users land on something actionable instead of a dead link.
633+
if container_failed and container_log_file.exists():
634+
from kernel_ci_cloud_labs.core.artifacts import s3_public_url
635+
636+
failure_key = f"{run_prefix}/container-failure.log"
637+
try:
638+
s3_client.upload_file(
639+
str(container_log_file),
640+
storage.bucket,
641+
failure_key,
642+
ExtraArgs={"ContentType": "text/plain; charset=utf-8"},
643+
)
644+
container_failure_log_url = s3_public_url(
645+
storage.bucket, region, failure_key
646+
)
647+
logger.info(
648+
"✓ Uploaded container failure log to %s",
649+
container_failure_log_url,
650+
)
651+
except Exception as upload_err: # pylint: disable=broad-exception-caught
652+
logger.warning(
653+
"Could not upload container failure log: %s", upload_err
654+
)
599655
except Exception as e: # pylint: disable=broad-exception-caught
600656
logger.warning("Could not collect artifacts manifest: %s", e)
601657

@@ -686,4 +742,9 @@ def run_pipeline(
686742
task_arn if "task_arn" in locals() else None,
687743
expected_vm_count if "expected_vm_count" in locals() else None,
688744
s3_context=s3_context,
745+
container_failure_log_url=(
746+
container_failure_log_url
747+
if "container_failure_log_url" in locals()
748+
else None
749+
),
689750
)

src/kernel_ci_cloud_labs/pull_labs_poller.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -433,12 +433,22 @@ def _extract_test_results(summary: Dict[str, Any]) -> Tuple[List[Dict[str, Any]]
433433
when ``summary["vms"]["instances"]`` is absent — keeps older
434434
in-flight summary files and unit tests using the old shape working.
435435
436-
The second tuple element (legacy ``log_url`` slot) is always ``None``;
437-
log URLs are now per-row and live in ``row["log_url"]``.
436+
The second tuple element (legacy job-level ``log_url`` slot) is normally
437+
``None`` — per-row URLs live in ``row["log_url"]`` — but is set to
438+
``summary["container_failure_log_url"]`` when the ECS container died
439+
before any VM booted, so the fallback Infrastructure row downstream still
440+
carries a clickable failure log.
438441
"""
439442
vms = summary.get("vms", {}) or {}
440443
instances = vms.get("instances")
441444

445+
# When the ECS container itself failed before launching any VM, there is
446+
# no kernel log to publish. The pipeline uploads the container's own log
447+
# to S3 and records its URL here so the synthetic Infrastructure row that
448+
# the caller falls back to (build_test_row(..., log_url=log_url, ...)) at
449+
# least links the user to the actual failure reason.
450+
container_failure_log_url = summary.get("container_failure_log_url")
451+
442452
# Legacy path: no per-instance breakdown -> one row per test name, no URLs.
443453
if not instances:
444454
rows: List[Dict[str, Any]] = []
@@ -447,7 +457,7 @@ def _extract_test_results(summary: Dict[str, Any]) -> Tuple[List[Dict[str, Any]]
447457
for name in test_names:
448458
status = "FAIL" if failed_by_test.get(name) else "PASS"
449459
rows.append({"name": _test_name_to_path(name), "status": status})
450-
return rows, None
460+
return rows, container_failure_log_url
451461

452462
url_by_pair = _load_artifact_log_urls(summary.get("run_directory"))
453463
rows = []

0 commit comments

Comments
 (0)