@@ -177,7 +177,14 @@ def _log_vm_output_excerpts(s3_client, bucket, run_prefix, failed_tests):
177177 pass
178178
179179
180- def create_summary (run_dir , start_time , task_arn , expected_vm_count = None , s3_context = None ):
180+ def create_summary (
181+ run_dir ,
182+ start_time ,
183+ task_arn ,
184+ expected_vm_count = None ,
185+ s3_context = None ,
186+ container_failure_log_url = None ,
187+ ):
181188 """Create summary.json with VM statistics.
182189
183190 Args:
@@ -186,6 +193,10 @@ def create_summary(run_dir, start_time, task_arn, expected_vm_count=None, s3_con
186193 task_arn: ARN of the ECS task that ran the tests.
187194 expected_vm_count: Number of VMs expected to spawn (None if unknown).
188195 s3_context: Optional dict with 'bucket', 'run_prefix', 'region', 's3_client' for debug output.
196+ container_failure_log_url: Public URL of the ECS container's own log when
197+ it exited non-zero before launching any VM; None when no such log
198+ was uploaded. The KCIDB submitter uses this as ``tests[*].log_url`` for
199+ the synthetic Infrastructure row so users can see the failure reason.
189200 """
190201 end_time = time .time ()
191202 total_runtime = end_time - start_time
@@ -230,6 +241,7 @@ def create_summary(run_dir, start_time, task_arn, expected_vm_count=None, s3_con
230241 # each tests[*] row.
231242 "instances" : vm_stats ["instances" ],
232243 },
244+ "container_failure_log_url" : container_failure_log_url ,
233245 }
234246
235247 summary_file = Path (run_dir ) / "summary.json"
@@ -459,6 +471,15 @@ def run_pipeline(
459471 logger .error ("Task did not complete successfully: %s" , e )
460472 raise
461473
474+ # A non-zero container exit means launch_vm.py died before SSM ever
475+ # ran on a VM — the /ec2/.../<run_prefix> log group will never appear,
476+ # so we shorten the VM-log wait below and surface container.log as
477+ # the failure URL instead of an absent kernel log.
478+ container_failed = bool (final_status ) and any (
479+ (c .get ("exit_code" ) or 0 ) != 0
480+ for c in (final_status .get ("containers" ) or [])
481+ )
482+
462483 logger .info ("-" * 60 )
463484
464485 # Refresh CloudWatch client — credentials may have expired during the wait
@@ -496,7 +517,15 @@ def run_pipeline(
496517
497518 # Wait for VM logs to appear (CloudWatch agent ships in batches
498519 # after the VM shuts down — give it up to 5 min to surface).
499- max_retries = 10
520+ # When the container itself failed, no VM was ever launched, so
521+ # the log group can't appear — skip straight to the single probe.
522+ if container_failed :
523+ logger .info (
524+ "Container exited non-zero; skipping extended VM-log wait"
525+ )
526+ max_retries = 1
527+ else :
528+ max_retries = 10
500529 retry_delay = 30
501530
502531 for attempt in range (max_retries ):
@@ -581,6 +610,7 @@ def run_pipeline(
581610 # and emit artifacts.json — the manifest the KCIDB submitter
582611 # consumes to populate tests[*].log_url. Failures here are
583612 # non-fatal: the test results in S3 remain the source of truth.
613+ container_failure_log_url = None
584614 try :
585615 logger .info ("\n === Collecting boot logs & artifacts manifest ===" )
586616 s3_client = provider .auth .get_client ("s3" )
@@ -596,6 +626,32 @@ def run_pipeline(
596626 run_prefix = run_prefix ,
597627 origin = origin ,
598628 )
629+
630+ # Container died before any VM booted -> there is no kernel log.
631+ # Publish the container's own log as the failure URL so KCIDB
632+ # users land on something actionable instead of a dead link.
633+ if container_failed and container_log_file .exists ():
634+ from kernel_ci_cloud_labs .core .artifacts import s3_public_url
635+
636+ failure_key = f"{ run_prefix } /container-failure.log"
637+ try :
638+ s3_client .upload_file (
639+ str (container_log_file ),
640+ storage .bucket ,
641+ failure_key ,
642+ ExtraArgs = {"ContentType" : "text/plain; charset=utf-8" },
643+ )
644+ container_failure_log_url = s3_public_url (
645+ storage .bucket , region , failure_key
646+ )
647+ logger .info (
648+ "✓ Uploaded container failure log to %s" ,
649+ container_failure_log_url ,
650+ )
651+ except Exception as upload_err : # pylint: disable=broad-exception-caught
652+ logger .warning (
653+ "Could not upload container failure log: %s" , upload_err
654+ )
599655 except Exception as e : # pylint: disable=broad-exception-caught
600656 logger .warning ("Could not collect artifacts manifest: %s" , e )
601657
@@ -686,4 +742,9 @@ def run_pipeline(
686742 task_arn if "task_arn" in locals () else None ,
687743 expected_vm_count if "expected_vm_count" in locals () else None ,
688744 s3_context = s3_context ,
745+ container_failure_log_url = (
746+ container_failure_log_url
747+ if "container_failure_log_url" in locals ()
748+ else None
749+ ),
689750 )
0 commit comments