Skip to content

Commit d48aa89

Browse files
committed
Merge branch 'fix/checkpoints' into fix/v4-misc
2 parents 29ce91a + 30077c6 commit d48aa89

File tree

3 files changed

+35
-10
lines changed

3 files changed

+35
-10
lines changed

apps/supervisor/src/env.ts

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@ const Env = z.object({
2626
.transform((s) => z.enum(["http", "https"]).parse(s.toLowerCase()))
2727
.default("http"),
2828
TRIGGER_WORKLOAD_API_DOMAIN: z.string().optional(), // If unset, will use orchestrator-specific default
29+
TRIGGER_WORKLOAD_API_HOST_INTERNAL: z.string().default("0.0.0.0"),
2930
TRIGGER_WORKLOAD_API_PORT_INTERNAL: z.coerce.number().default(8020), // This is the port the workload API listens on
3031
TRIGGER_WORKLOAD_API_PORT_EXTERNAL: z.coerce.number().default(8020), // This is the exposed port passed to the run controller
3132

@@ -53,7 +54,10 @@ const Env = z.object({
5354
EPHEMERAL_STORAGE_SIZE_REQUEST: z.string().default("2Gi"),
5455

5556
// Metrics
57+
METRICS_ENABLED: BoolEnv.default(true),
5658
METRICS_COLLECT_DEFAULTS: BoolEnv.default(true),
59+
METRICS_HOST: z.string().default("127.0.0.1"),
60+
METRICS_PORT: z.coerce.number().int().default(9090),
5761

5862
// Pod cleaner
5963
POD_CLEANER_ENABLED: BoolEnv.default(true),

apps/supervisor/src/index.ts

Lines changed: 17 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ if (env.METRICS_COLLECT_DEFAULTS) {
3030

3131
class ManagedSupervisor {
3232
private readonly workerSession: SupervisorSession;
33-
private readonly httpServer: HttpServer;
33+
private readonly metricsServer?: HttpServer;
3434
private readonly workloadServer: WorkloadServer;
3535
private readonly workloadManager: WorkloadManager;
3636
private readonly logger = new SimpleStructuredLogger("managed-worker");
@@ -61,6 +61,7 @@ class ManagedSupervisor {
6161
intervalMs: env.POD_CLEANER_INTERVAL_MS,
6262
});
6363
this.podCleaner = new PodCleaner({
64+
register,
6465
namespace: env.KUBERNETES_NAMESPACE,
6566
batchSize: env.POD_CLEANER_BATCH_SIZE,
6667
intervalMs: env.POD_CLEANER_INTERVAL_MS,
@@ -75,6 +76,7 @@ class ManagedSupervisor {
7576
reconnectIntervalMs: env.FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS,
7677
});
7778
this.failedPodHandler = new FailedPodHandler({
79+
register,
7880
namespace: env.KUBERNETES_NAMESPACE,
7981
reconnectIntervalMs: env.FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS,
8082
});
@@ -243,16 +245,21 @@ class ManagedSupervisor {
243245
}
244246
});
245247

246-
// Used for health checks and metrics
247-
this.httpServer = new HttpServer({ port: 8080, host: "0.0.0.0" }).route("/health", "GET", {
248-
handler: async ({ reply }) => {
249-
reply.text("OK");
250-
},
251-
});
248+
if (env.METRICS_ENABLED) {
249+
this.metricsServer = new HttpServer({
250+
port: env.METRICS_PORT,
251+
host: env.METRICS_HOST,
252+
metrics: {
253+
register,
254+
expose: true,
255+
},
256+
});
257+
}
252258

253259
// Responds to workload requests only
254260
this.workloadServer = new WorkloadServer({
255261
port: env.TRIGGER_WORKLOAD_API_PORT_INTERNAL,
262+
host: env.TRIGGER_WORKLOAD_API_HOST_INTERNAL,
256263
workerClient: this.workerSession.httpClient,
257264
checkpointClient: this.checkpointClient,
258265
});
@@ -321,6 +328,7 @@ class ManagedSupervisor {
321328
// Optional services
322329
await this.podCleaner?.start();
323330
await this.failedPodHandler?.start();
331+
await this.metricsServer?.start();
324332

325333
if (env.TRIGGER_WORKLOAD_API_ENABLED) {
326334
this.logger.log("[ManagedWorker] Workload API enabled", {
@@ -334,16 +342,16 @@ class ManagedSupervisor {
334342
}
335343

336344
await this.workerSession.start();
337-
await this.httpServer.start();
338345
}
339346

340347
async stop() {
341348
this.logger.log("[ManagedWorker] Shutting down");
342-
await this.httpServer.stop();
349+
await this.workerSession.stop();
343350

344351
// Optional services
345352
await this.podCleaner?.stop();
346353
await this.failedPodHandler?.stop();
354+
await this.metricsServer?.stop();
347355
}
348356
}
349357

apps/supervisor/src/workloadServer/index.ts

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ import {
2222
} from "@trigger.dev/core/v3/workers";
2323
import { HttpServer, type CheckpointClient } from "@trigger.dev/core/v3/serverOnly";
2424
import { type IncomingMessage } from "node:http";
25+
import { register } from "../metrics.js";
2526

2627
// Use the official export when upgrading to socket.io@4.8.0
2728
interface DefaultEventsMap {
@@ -121,7 +122,19 @@ export class WorkloadServer extends EventEmitter<WorkloadServerEvents> {
121122
}
122123

123124
private createHttpServer({ host, port }: { host: string; port: number }) {
124-
return new HttpServer({ port, host })
125+
return new HttpServer({
126+
port,
127+
host,
128+
metrics: {
129+
register,
130+
expose: false,
131+
},
132+
})
133+
.route("/health", "GET", {
134+
handler: async ({ reply }) => {
135+
reply.text("OK");
136+
},
137+
})
125138
.route(
126139
"/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/attempts/start",
127140
"POST",

0 commit comments

Comments
 (0)