Skip to content

Commit 4fcb428

Browse files
committed
more KB
1 parent 2b97d0a commit 4fcb428

32 files changed

Lines changed: 8210 additions & 7 deletions

internal/agent/innerloop_tools.go

Lines changed: 532 additions & 0 deletions
Large diffs are not rendered by default.

internal/agent/tooling.go

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -89,6 +89,13 @@ func DefaultTools(toolCtx ToolContext) []Tool {
8989
showOptimizationSessionTool(),
9090
registerOptimizationCandidateTool(toolCtx),
9191
recordOptimizationStageTool(),
92+
showOuterLoopStatusTool(),
93+
recordLoopDecisionTool(),
94+
analyzeProfileTool(toolCtx),
95+
assessBenchmarkRunsTool(),
96+
rankSearchCandidatesTool(),
97+
saveRoundArtifactTool(toolCtx),
98+
recordReflexionTool(toolCtx),
9299
planOptimizationTool(toolCtx),
93100
detectEnvironmentTool(),
94101
listTargetsTool(toolCtx),

internal/cli/agent_prompt.go

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -39,12 +39,16 @@ func buildOptimizeRunPrompt(session *optimize.Session) string {
3939
"6. Exhaust the low-hanging search ladder before custom kernels: baseline -> packaged model-family, checkpoint, or runtime flavor variants -> runtime flags or attention implementation -> dtype or quant or checkpoint variants, including synthesized FP8 conversion when no packaged FP8 artifact exists -> torch.compile or CUDA graphs if supported -> Triton or CuTe or CUDA kernels -> deeper runtime patching.",
4040
"7. Choose the backend and workflow yourself. Do not assume Triton, CuTe, or CUDA helpers exist. Use generic file tools plus run_command to create, edit, build, verify, and benchmark code.",
4141
"8. When a command belongs to a candidate stage, call run_command with session, candidate, and stage so Fusion persists the artifact. Use run_benchmark and run_profile with session and candidate for benchmark/profile stages.",
42-
"9. If compile, correctness, inference, or performance problems appear, inspect the outputs, patch the code or scripts, and retry. Do not stop at the first fixable error or the first small performance win.",
43-
"10. Verify correctness before claiming success. Prefer explicit tolerances, reproducible seeds, and benchmark evidence.",
44-
"11. Keep the optimization session state accurate by recording stages and using the candidate workspace instead of ad hoc temp paths.",
45-
"12. For FP8 or other converted quantization paths, save the calibration recipe, runtime flags, and any fallback higher-precision modules. Compare normalized steady-state metrics, not just raw wall time. When model families produce different output lengths, prefer metrics like rtf, x_real_time, or tokens/sec. Keep download, compile, and warmup overhead separate from steady-state generation speed.",
46-
"13. Maintain a current best candidate. If a new candidate regresses or breaks correctness, fall back to the current best and continue the search.",
47-
"14. End only after each applicable candidate family has been tested, rejected with evidence, or blocked by the environment. Then report the best candidate, what changed, what passed, what failed, and the next most valuable experiment if more time remains.",
42+
"9. After profile collection, use analyze_profile so Fusion converts raw Nsight output into a BottleneckReport and Prescription before you decide on deeper kernel changes.",
43+
"10. Use show_outer_loop_status and record_loop_decision to make the outer-loop state explicit. Do not launch deeper custom kernel search until packaged model, runtime, quantization, compile, and attention-backend branches are exhausted or explicitly blocked.",
44+
"11. During kernel search, persist round artifacts with save_round_artifact or record_reflexion under candidates/<id>/rounds/<n> so prompt, diagnosis, prescription, verify, bench, and reflexion data survive across turns.",
45+
"12. Use assess_benchmark_runs before ranking performance-sensitive candidates, and use rank_search_candidates to keep a top-K survivor set and promote the current best candidate explicitly.",
46+
"13. If compile, correctness, inference, or performance problems appear, inspect the outputs, patch the code or scripts, and retry. Do not stop at the first fixable error or the first small performance win.",
47+
"14. Verify correctness before claiming success. Prefer explicit tolerances, reproducible seeds, and benchmark evidence.",
48+
"15. Keep the optimization session state accurate by recording stages and using the candidate workspace instead of ad hoc temp paths.",
49+
"16. For FP8 or other converted quantization paths, save the calibration recipe, runtime flags, and any fallback higher-precision modules. Compare normalized steady-state metrics, not just raw wall time. When model families produce different output lengths, prefer metrics like rtf, x_real_time, or tokens/sec. Keep download, compile, and warmup overhead separate from steady-state generation speed.",
50+
"17. Maintain a current best candidate. If a new candidate regresses or breaks correctness, fall back to the current best and continue the search.",
51+
"18. End only after each applicable candidate family has been tested, rejected with evidence, or blocked by the environment. Then report the best candidate, what changed, what passed, what failed, and the next most valuable experiment if more time remains.",
4852
"",
4953
"Stage guidance",
5054
"- Common stage names: inspect, baseline, build, verify, benchmark, profile, patch, model-benchmark, final-report.",

internal/cli/optimize_session.go

Lines changed: 74 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,8 @@ func newOptimizeSessionCommand() *cobra.Command {
2323
newOptimizeSessionCreateCommand(),
2424
newOptimizeSessionListCommand(),
2525
newOptimizeSessionShowCommand(),
26+
newOptimizeSessionGateCommand(),
27+
newOptimizeSessionDecisionCommand(),
2628
)
2729

2830
return cmd
@@ -239,6 +241,78 @@ func newOptimizeSessionShowCommand() *cobra.Command {
239241
return cmd
240242
}
241243

244+
func newOptimizeSessionGateCommand() *cobra.Command {
245+
var id string
246+
247+
cmd := &cobra.Command{
248+
Use: "gate",
249+
Short: "Show whether the outer loop is exhausted and the inner kernel loop is ready to start",
250+
RunE: func(cmd *cobra.Command, args []string) error {
251+
session, _, err := loadOptimizationSession(id)
252+
if err != nil {
253+
return err
254+
}
255+
status := optimize.EvaluateOuterLoopStatus(session)
256+
cmd.Printf("session: %s\n", session.ID)
257+
cmd.Printf("outer_loop_exhausted: %t\n", status.Exhausted)
258+
cmd.Printf("ready_for_inner_loop: %t\n", status.ReadyForInnerLoop)
259+
cmd.Printf("current_best: %s\n", valueOrFallback(status.CurrentBestID, "unset"))
260+
cmd.Println("families")
261+
for _, family := range status.Families {
262+
cmd.Printf("- %s: %s\n", family.Family, family.Status)
263+
if family.Reason != "" {
264+
cmd.Printf(" reason: %s\n", family.Reason)
265+
}
266+
if len(family.CandidateIDs) > 0 {
267+
cmd.Printf(" candidates: %s\n", strings.Join(family.CandidateIDs, ", "))
268+
}
269+
}
270+
return nil
271+
},
272+
}
273+
274+
cmd.Flags().StringVar(&id, "id", "", "optimization session id")
275+
cmd.MarkFlagRequired("id")
276+
return cmd
277+
}
278+
279+
func newOptimizeSessionDecisionCommand() *cobra.Command {
280+
var id string
281+
var phase string
282+
var family string
283+
var status string
284+
var reason string
285+
var candidateID string
286+
287+
cmd := &cobra.Command{
288+
Use: "decide",
289+
Short: "Record an explicit outer-loop or inner-loop decision for orchestration and gating",
290+
RunE: func(cmd *cobra.Command, args []string) error {
291+
session, store, err := loadOptimizationSession(id)
292+
if err != nil {
293+
return err
294+
}
295+
session.RecordLoopDecision(phase, family, status, candidateID, reason)
296+
if _, err := store.Save(session); err != nil {
297+
return err
298+
}
299+
cmd.Printf("Recorded %s decision for %s: %s\n", phase, family, status)
300+
return nil
301+
},
302+
}
303+
304+
cmd.Flags().StringVar(&id, "id", "", "optimization session id")
305+
cmd.Flags().StringVar(&phase, "phase", "outer", "loop phase, for example outer or inner")
306+
cmd.Flags().StringVar(&family, "family", "", "decision family like baseline, model-family, runtime, quantization, compile, or attention-backend")
307+
cmd.Flags().StringVar(&status, "status", "", "decision status like tested, blocked, skipped, regressed, or winner")
308+
cmd.Flags().StringVar(&reason, "reason", "", "human-readable reason for the decision")
309+
cmd.Flags().StringVar(&candidateID, "candidate", "", "optional candidate id associated with the decision")
310+
cmd.MarkFlagRequired("id")
311+
cmd.MarkFlagRequired("family")
312+
cmd.MarkFlagRequired("status")
313+
return cmd
314+
}
315+
242316
func loadOptimizationSession(id string) (*optimize.Session, *optimize.SessionStore, error) {
243317
store, err := optimize.NewSessionStore()
244318
if err != nil {

internal/cli/profile.go

Lines changed: 79 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,12 @@
11
package cli
22

33
import (
4+
"encoding/json"
5+
"os"
46
"time"
57

68
"github.com/ModelsLab/fusion/internal/artifacts"
9+
"github.com/ModelsLab/fusion/internal/optimize"
710
"github.com/ModelsLab/fusion/internal/runner"
811
"github.com/spf13/cobra"
912
)
@@ -14,7 +17,10 @@ func newProfileCommand() *cobra.Command {
1417
Short: "Run profiling commands and persist the raw artifacts",
1518
}
1619

17-
cmd.AddCommand(newProfileRunCommand())
20+
cmd.AddCommand(
21+
newProfileRunCommand(),
22+
newProfileAnalyzeCommand(),
23+
)
1824
return cmd
1925
}
2026

@@ -107,3 +113,75 @@ func newProfileRunCommand() *cobra.Command {
107113
cmd.MarkFlagRequired("command")
108114
return cmd
109115
}
116+
117+
func newProfileAnalyzeCommand() *cobra.Command {
118+
var artifactPath string
119+
var tool string
120+
var outputPath string
121+
122+
cmd := &cobra.Command{
123+
Use: "analyze",
124+
Short: "Parse a saved profile artifact into stable Nsight metrics, a bottleneck report, and a prescription",
125+
RunE: func(cmd *cobra.Command, args []string) error {
126+
store, err := artifacts.NewStore()
127+
if err != nil {
128+
return err
129+
}
130+
artifact, err := store.LoadProfile(artifactPath)
131+
if err != nil {
132+
return err
133+
}
134+
135+
resolvedTool := valueOrFallback(tool, artifact.Tool)
136+
profile := optimize.ParseNsightProfile(resolvedTool, artifact.Stdout, artifact.Stderr)
137+
report := optimize.AnalyzeRoofline(profile)
138+
prescription := optimize.PrescribeFromReport(report, optimize.Request{}, optimize.Candidate{
139+
Name: artifact.Name,
140+
Backend: resolvedTool,
141+
})
142+
143+
payload := map[string]any{
144+
"artifact": artifactPath,
145+
"profile": profile,
146+
"diagnosis": report,
147+
"prescription": prescription,
148+
}
149+
if outputPath != "" {
150+
data, err := json.MarshalIndent(payload, "", " ")
151+
if err != nil {
152+
return err
153+
}
154+
data = append(data, '\n')
155+
if err := os.WriteFile(outputPath, data, 0o600); err != nil {
156+
return err
157+
}
158+
cmd.Printf("Saved profile analysis: %s\n", outputPath)
159+
}
160+
161+
cmd.Printf("Tool: %s\n", profile.Tool)
162+
cmd.Printf("Category: %s\n", report.Category)
163+
cmd.Printf("Efficiency: %.2f%%\n", report.Efficiency*100)
164+
cmd.Printf("Confidence: %.2f\n", report.Confidence)
165+
cmd.Printf("Summary: %s\n", report.Summary)
166+
if len(report.RootCauses) > 0 {
167+
cmd.Println("Root causes")
168+
for _, cause := range report.RootCauses {
169+
cmd.Printf("- %s\n", cause)
170+
}
171+
}
172+
if len(prescription.Fixes) > 0 {
173+
cmd.Println("Fixes")
174+
for _, fix := range prescription.Fixes {
175+
cmd.Printf("- %s\n", fix.Action)
176+
}
177+
}
178+
return nil
179+
},
180+
}
181+
182+
cmd.Flags().StringVar(&artifactPath, "artifact", "", "path to a saved profile artifact JSON")
183+
cmd.Flags().StringVar(&tool, "tool", "", "override the profiler tool label, for example ncu or nsys")
184+
cmd.Flags().StringVar(&outputPath, "output", "", "optional output path for the normalized analysis JSON")
185+
cmd.MarkFlagRequired("artifact")
186+
return cmd
187+
}

0 commit comments

Comments
 (0)