ModelsLab
diff --git a/internal/agent/innerloop_tools.go
Lines changed: 532 additions & 0 deletions b/internal/agent/innerloop_tools.go
Lines changed: 532 additions & 0 deletions
diff --git a/internal/agent/tooling.go
Lines changed: 7 additions & 0 deletions b/internal/agent/tooling.go
Lines changed: 7 additions & 0 deletions
diff --git a/internal/cli/agent_prompt.go
Lines changed: 10 additions & 6 deletions b/internal/cli/agent_prompt.go
Lines changed: 10 additions & 6 deletions
diff --git a/internal/cli/optimize_session.go
Lines changed: 74 additions & 0 deletions b/internal/cli/optimize_session.go
Lines changed: 74 additions & 0 deletions
diff --git a/internal/cli/profile.go
Lines changed: 79 additions & 1 deletion b/internal/cli/profile.go
Lines changed: 79 additions & 1 deletion
@@ -89,6 +89,13 @@ func DefaultTools(toolCtx ToolContext) []Tool {
 		showOptimizationSessionTool(),
 		registerOptimizationCandidateTool(toolCtx),
 		recordOptimizationStageTool(),
+		showOuterLoopStatusTool(),
+		recordLoopDecisionTool(),
+		analyzeProfileTool(toolCtx),
+		assessBenchmarkRunsTool(),
+		rankSearchCandidatesTool(),
+		saveRoundArtifactTool(toolCtx),
+		recordReflexionTool(toolCtx),
 		planOptimizationTool(toolCtx),
 		detectEnvironmentTool(),
 		listTargetsTool(toolCtx),
 
@@ -39,12 +39,16 @@ func buildOptimizeRunPrompt(session *optimize.Session) string {
 		"6. Exhaust the low-hanging search ladder before custom kernels: baseline -> packaged model-family, checkpoint, or runtime flavor variants -> runtime flags or attention implementation -> dtype or quant or checkpoint variants, including synthesized FP8 conversion when no packaged FP8 artifact exists -> torch.compile or CUDA graphs if supported -> Triton or CuTe or CUDA kernels -> deeper runtime patching.",
 		"7. Choose the backend and workflow yourself. Do not assume Triton, CuTe, or CUDA helpers exist. Use generic file tools plus run_command to create, edit, build, verify, and benchmark code.",
 		"8. When a command belongs to a candidate stage, call run_command with session, candidate, and stage so Fusion persists the artifact. Use run_benchmark and run_profile with session and candidate for benchmark/profile stages.",
-		"9. If compile, correctness, inference, or performance problems appear, inspect the outputs, patch the code or scripts, and retry. Do not stop at the first fixable error or the first small performance win.",
-		"10. Verify correctness before claiming success. Prefer explicit tolerances, reproducible seeds, and benchmark evidence.",
-		"11. Keep the optimization session state accurate by recording stages and using the candidate workspace instead of ad hoc temp paths.",
-		"12. For FP8 or other converted quantization paths, save the calibration recipe, runtime flags, and any fallback higher-precision modules. Compare normalized steady-state metrics, not just raw wall time. When model families produce different output lengths, prefer metrics like rtf, x_real_time, or tokens/sec. Keep download, compile, and warmup overhead separate from steady-state generation speed.",
-		"13. Maintain a current best candidate. If a new candidate regresses or breaks correctness, fall back to the current best and continue the search.",
-		"14. End only after each applicable candidate family has been tested, rejected with evidence, or blocked by the environment. Then report the best candidate, what changed, what passed, what failed, and the next most valuable experiment if more time remains.",
+		"9. After profile collection, use analyze_profile so Fusion converts raw Nsight output into a BottleneckReport and Prescription before you decide on deeper kernel changes.",
+		"10. Use show_outer_loop_status and record_loop_decision to make the outer-loop state explicit. Do not launch deeper custom kernel search until packaged model, runtime, quantization, compile, and attention-backend branches are exhausted or explicitly blocked.",
+		"11. During kernel search, persist round artifacts with save_round_artifact or record_reflexion under candidates/<id>/rounds/<n> so prompt, diagnosis, prescription, verify, bench, and reflexion data survive across turns.",
+		"12. Use assess_benchmark_runs before ranking performance-sensitive candidates, and use rank_search_candidates to keep a top-K survivor set and promote the current best candidate explicitly.",
+		"13. If compile, correctness, inference, or performance problems appear, inspect the outputs, patch the code or scripts, and retry. Do not stop at the first fixable error or the first small performance win.",
+		"14. Verify correctness before claiming success. Prefer explicit tolerances, reproducible seeds, and benchmark evidence.",
+		"15. Keep the optimization session state accurate by recording stages and using the candidate workspace instead of ad hoc temp paths.",
+		"16. For FP8 or other converted quantization paths, save the calibration recipe, runtime flags, and any fallback higher-precision modules. Compare normalized steady-state metrics, not just raw wall time. When model families produce different output lengths, prefer metrics like rtf, x_real_time, or tokens/sec. Keep download, compile, and warmup overhead separate from steady-state generation speed.",
+		"17. Maintain a current best candidate. If a new candidate regresses or breaks correctness, fall back to the current best and continue the search.",
+		"18. End only after each applicable candidate family has been tested, rejected with evidence, or blocked by the environment. Then report the best candidate, what changed, what passed, what failed, and the next most valuable experiment if more time remains.",
 		"",
 		"Stage guidance",
 		"- Common stage names: inspect, baseline, build, verify, benchmark, profile, patch, model-benchmark, final-report.",
 
@@ -23,6 +23,8 @@ func newOptimizeSessionCommand() *cobra.Command {
 		newOptimizeSessionCreateCommand(),
 		newOptimizeSessionListCommand(),
 		newOptimizeSessionShowCommand(),
+		newOptimizeSessionGateCommand(),
+		newOptimizeSessionDecisionCommand(),
 	)
 
 	return cmd
@@ -239,6 +241,78 @@ func newOptimizeSessionShowCommand() *cobra.Command {
 	return cmd
 }
 
+// newOptimizeSessionGateCommand builds the "gate" subcommand. It loads the
+// session selected by --id, evaluates its outer-loop status, and prints the
+// exhaustion and readiness flags, the current best candidate id, and one
+// entry per family.
+func newOptimizeSessionGateCommand() *cobra.Command {
+	var sessionID string
+
+	gate := &cobra.Command{
+		Use:   "gate",
+		Short: "Show whether the outer loop is exhausted and the inner kernel loop is ready to start",
+	}
+	gate.RunE = func(cmd *cobra.Command, _ []string) error {
+		session, _, loadErr := loadOptimizationSession(sessionID)
+		if loadErr != nil {
+			return loadErr
+		}
+
+		outer := optimize.EvaluateOuterLoopStatus(session)
+		cmd.Printf("session: %s\n", session.ID)
+		cmd.Printf("outer_loop_exhausted: %t\n", outer.Exhausted)
+		cmd.Printf("ready_for_inner_loop: %t\n", outer.ReadyForInnerLoop)
+		cmd.Printf("current_best: %s\n", valueOrFallback(outer.CurrentBestID, "unset"))
+
+		cmd.Println("families")
+		for _, entry := range outer.Families {
+			cmd.Printf("- %s: %s\n", entry.Family, entry.Status)
+			// The reason and candidate lines are only printed when they carry data.
+			if reason := entry.Reason; reason != "" {
+				cmd.Printf("  reason: %s\n", reason)
+			}
+			if ids := entry.CandidateIDs; len(ids) != 0 {
+				cmd.Printf("  candidates: %s\n", strings.Join(ids, ", "))
+			}
+		}
+		return nil
+	}
+
+	gate.Flags().StringVar(&sessionID, "id", "", "optimization session id")
+	gate.MarkFlagRequired("id")
+	return gate
+}
+
+// newOptimizeSessionDecisionCommand builds the "decide" subcommand. It loads
+// the session selected by --id, records a loop decision from the phase,
+// family, status, candidate, and reason flags, saves the session through its
+// store, and prints a confirmation line.
+func newOptimizeSessionDecisionCommand() *cobra.Command {
+	var (
+		sessionID string
+		phase     string
+		family    string
+		status    string
+		reason    string
+		candidate string
+	)
+
+	decide := &cobra.Command{
+		Use:   "decide",
+		Short: "Record an explicit outer-loop or inner-loop decision for orchestration and gating",
+	}
+	decide.RunE = func(cmd *cobra.Command, _ []string) error {
+		session, store, loadErr := loadOptimizationSession(sessionID)
+		if loadErr != nil {
+			return loadErr
+		}
+
+		session.RecordLoopDecision(phase, family, status, candidate, reason)
+		_, saveErr := store.Save(session)
+		if saveErr != nil {
+			return saveErr
+		}
+
+		cmd.Printf("Recorded %s decision for %s: %s\n", phase, family, status)
+		return nil
+	}
+
+	flags := decide.Flags()
+	flags.StringVar(&sessionID, "id", "", "optimization session id")
+	flags.StringVar(&phase, "phase", "outer", "loop phase, for example outer or inner")
+	flags.StringVar(&family, "family", "", "decision family like baseline, model-family, runtime, quantization, compile, or attention-backend")
+	flags.StringVar(&status, "status", "", "decision status like tested, blocked, skipped, regressed, or winner")
+	flags.StringVar(&reason, "reason", "", "human-readable reason for the decision")
+	flags.StringVar(&candidate, "candidate", "", "optional candidate id associated with the decision")
+
+	// --phase has a default and --reason/--candidate are optional, so only
+	// these three flags are marked required.
+	for _, required := range []string{"id", "family", "status"} {
+		decide.MarkFlagRequired(required)
+	}
+	return decide
+}
+
 func loadOptimizationSession(id string) (*optimize.Session, *optimize.SessionStore, error) {
 	store, err := optimize.NewSessionStore()
 	if err != nil {
 
@@ -1,9 +1,12 @@
 package cli
 
 import (
+	"encoding/json"
+	"os"
 	"time"
 
 	"github.com/ModelsLab/fusion/internal/artifacts"
+	"github.com/ModelsLab/fusion/internal/optimize"
 	"github.com/ModelsLab/fusion/internal/runner"
 	"github.com/spf13/cobra"
 )
@@ -14,7 +17,10 @@ func newProfileCommand() *cobra.Command {
 		Short: "Run profiling commands and persist the raw artifacts",
 	}
 
-	cmd.AddCommand(newProfileRunCommand())
+	cmd.AddCommand(
+		newProfileRunCommand(),
+		newProfileAnalyzeCommand(),
+	)
 	return cmd
 }
 
@@ -107,3 +113,75 @@ func newProfileRunCommand() *cobra.Command {
 	cmd.MarkFlagRequired("command")
 	return cmd
 }
+
+// newProfileAnalyzeCommand builds the "analyze" subcommand. It loads the
+// profile artifact named by --artifact, parses the artifact's captured stdout
+// and stderr with optimize.ParseNsightProfile, derives a report with
+// optimize.AnalyzeRoofline and a prescription with
+// optimize.PrescribeFromReport, optionally writes all of it as indented JSON
+// to --output, and prints a text summary through the command's output.
+func newProfileAnalyzeCommand() *cobra.Command {
+	var artifactPath string
+	var tool string
+	var outputPath string
+
+	cmd := &cobra.Command{
+		Use:   "analyze",
+		Short: "Parse a saved profile artifact into stable Nsight metrics, a bottleneck report, and a prescription",
+		RunE: func(cmd *cobra.Command, args []string) error {
+			store, err := artifacts.NewStore()
+			if err != nil {
+				return err
+			}
+			artifact, err := store.LoadProfile(artifactPath)
+			if err != nil {
+				return err
+			}
+
+			// NOTE(review): valueOrFallback is defined elsewhere; presumably it
+			// yields --tool when set and the artifact's recorded tool otherwise.
+			resolvedTool := valueOrFallback(tool, artifact.Tool)
+			profile := optimize.ParseNsightProfile(resolvedTool, artifact.Stdout, artifact.Stderr)
+			report := optimize.AnalyzeRoofline(profile)
+			// A zero-value Request is passed; the candidate carries only the
+			// artifact name and the resolved tool label.
+			prescription := optimize.PrescribeFromReport(report, optimize.Request{}, optimize.Candidate{
+				Name:    artifact.Name,
+				Backend: resolvedTool,
+			})
+
+			payload := map[string]any{
+				"artifact":     artifactPath,
+				"profile":      profile,
+				"diagnosis":    report,
+				"prescription": prescription,
+			}
+			// The JSON file is written only when --output is given.
+			if outputPath != "" {
+				data, err := json.MarshalIndent(payload, "", "  ")
+				if err != nil {
+					return err
+				}
+				// Terminate the file with a newline.
+				data = append(data, '\n')
+				// 0o600: readable and writable by the owner only.
+				if err := os.WriteFile(outputPath, data, 0o600); err != nil {
+					return err
+				}
+				cmd.Printf("Saved profile analysis: %s\n", outputPath)
+			}
+
+			cmd.Printf("Tool: %s\n", profile.Tool)
+			cmd.Printf("Category: %s\n", report.Category)
+			// Efficiency is multiplied by 100 to print it as a percentage.
+			cmd.Printf("Efficiency: %.2f%%\n", report.Efficiency*100)
+			cmd.Printf("Confidence: %.2f\n", report.Confidence)
+			cmd.Printf("Summary: %s\n", report.Summary)
+			if len(report.RootCauses) > 0 {
+				cmd.Println("Root causes")
+				for _, cause := range report.RootCauses {
+					cmd.Printf("- %s\n", cause)
+				}
+			}
+			if len(prescription.Fixes) > 0 {
+				cmd.Println("Fixes")
+				for _, fix := range prescription.Fixes {
+					cmd.Printf("- %s\n", fix.Action)
+				}
+			}
+			return nil
+		},
+	}
+
+	cmd.Flags().StringVar(&artifactPath, "artifact", "", "path to a saved profile artifact JSON")
+	cmd.Flags().StringVar(&tool, "tool", "", "override the profiler tool label, for example ncu or nsys")
+	cmd.Flags().StringVar(&outputPath, "output", "", "optional output path for the normalized analysis JSON")
+	cmd.MarkFlagRequired("artifact")
+	return cmd
+}