Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 47 additions & 1 deletion cmd/vmcp/app/commands.go
Original file line number Diff line number Diff line change
Expand Up @@ -427,12 +427,58 @@ func runServe(cmd *cobra.Command, _ []string) error {
}

defaults := health.DefaultConfig()

// Use configured timeout if provided, otherwise use default
healthCheckTimeout := defaults.Timeout
if cfg.Operational.FailureHandling.HealthCheckTimeout > 0 {
healthCheckTimeout = time.Duration(cfg.Operational.FailureHandling.HealthCheckTimeout)
}

Comment thread
yrobla marked this conversation as resolved.
// Validate that timeout is less than interval to prevent checks from queuing up
if healthCheckTimeout >= checkInterval {
return fmt.Errorf("health check timeout (%v) must be less than check interval (%v) to prevent checks from queuing up",
healthCheckTimeout, checkInterval)
}

healthMonitorConfig = &health.MonitorConfig{
CheckInterval: checkInterval,
UnhealthyThreshold: cfg.Operational.FailureHandling.UnhealthyThreshold,
Timeout: defaults.Timeout,
Timeout: healthCheckTimeout,
DegradedThreshold: defaults.DegradedThreshold,
}

// Wire circuit breaker configuration if present
if cfg.Operational.FailureHandling.CircuitBreaker != nil {
cbConfig := cfg.Operational.FailureHandling.CircuitBreaker

// Validate circuit breaker configuration
if cbConfig.Enabled {
if cbConfig.FailureThreshold < 1 {
return fmt.Errorf("circuit breaker failure threshold must be >= 1, got %d",
cbConfig.FailureThreshold)
}
cbTimeout := time.Duration(cbConfig.Timeout)
if cbTimeout <= 0 {
return fmt.Errorf("circuit breaker timeout must be > 0, got %v", cbTimeout)
}
if cbTimeout < time.Second {
return fmt.Errorf("circuit breaker timeout must be >= 1s to prevent thrashing, got %v",
Comment thread
yrobla marked this conversation as resolved.
cbTimeout)
}
}

healthMonitorConfig.CircuitBreaker = &health.CircuitBreakerConfig{
Enabled: cbConfig.Enabled,
FailureThreshold: cbConfig.FailureThreshold,
Timeout: time.Duration(cbConfig.Timeout),
}

if cbConfig.Enabled {
logger.Infof("Circuit breaker enabled (threshold: %d failures, timeout: %v)",
cbConfig.FailureThreshold, time.Duration(cbConfig.Timeout))
}
}

logger.Info("Health monitoring configured from operational settings")
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -616,22 +616,35 @@ spec:
type: boolean
failureThreshold:
default: 5
description: FailureThreshold is the number of failures
before opening the circuit.
description: |-
FailureThreshold is the number of failures before opening the circuit.
Must be >= 1.
minimum: 1
type: integer
timeout:
default: 60s
description: Timeout is the duration to wait before
attempting to close the circuit.
description: |-
Timeout is the duration to wait before attempting to close the circuit.
Must be >= 1s to prevent thrashing.
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
Comment thread
yrobla marked this conversation as resolved.
x-kubernetes-validations:
- message: timeout must be >= 1s
rule: self == '' || duration(self) >= duration('1s')
type: object
healthCheckInterval:
default: 30s
description: HealthCheckInterval is the interval between
health checks.
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
healthCheckTimeout:
default: 10s
description: |-
HealthCheckTimeout is the maximum duration for a single health check operation.
Should be less than HealthCheckInterval to prevent checks from queuing up.
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
partialFailureMode:
default: fail
description: |-
Expand Down Expand Up @@ -1422,6 +1435,26 @@ spec:
authType:
description: AuthType is the type of authentication configured
type: string
circuitBreakerState:
description: |-
CircuitBreakerState is the current circuit breaker state (closed, open, half-open).
Empty when circuit breaker is disabled or not configured.
enum:
- closed
- open
- half-open
Comment thread
yrobla marked this conversation as resolved.
type: string
circuitLastChanged:
description: |-
CircuitLastChanged is the timestamp when the circuit breaker state last changed.
Empty when circuit breaker is disabled or has never changed state.
format: date-time
type: string
consecutiveFailures:
description: |-
ConsecutiveFailures is the current count of consecutive health check failures.
Resets to 0 when the backend becomes healthy again.
type: integer
lastHealthCheck:
description: LastHealthCheck is the timestamp of the last health
check
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -619,22 +619,35 @@ spec:
type: boolean
failureThreshold:
default: 5
description: FailureThreshold is the number of failures
before opening the circuit.
description: |-
FailureThreshold is the number of failures before opening the circuit.
Must be >= 1.
minimum: 1
type: integer
timeout:
default: 60s
description: Timeout is the duration to wait before
attempting to close the circuit.
description: |-
Timeout is the duration to wait before attempting to close the circuit.
Must be >= 1s to prevent thrashing.
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
Comment thread
yrobla marked this conversation as resolved.
type: string
x-kubernetes-validations:
- message: timeout must be >= 1s
rule: self == '' || duration(self) >= duration('1s')
type: object
healthCheckInterval:
default: 30s
description: HealthCheckInterval is the interval between
health checks.
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
healthCheckTimeout:
default: 10s
description: |-
HealthCheckTimeout is the maximum duration for a single health check operation.
Should be less than HealthCheckInterval to prevent checks from queuing up.
pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$
type: string
partialFailureMode:
default: fail
description: |-
Expand Down Expand Up @@ -1425,6 +1438,26 @@ spec:
authType:
description: AuthType is the type of authentication configured
type: string
circuitBreakerState:
description: |-
CircuitBreakerState is the current circuit breaker state (closed, open, half-open).
Empty when circuit breaker is disabled or not configured.
enum:
- closed
- open
- half-open
Comment thread
yrobla marked this conversation as resolved.
type: string
circuitLastChanged:
description: |-
CircuitLastChanged is the timestamp when the circuit breaker state last changed.
Empty when circuit breaker is disabled or has never changed state.
format: date-time
type: string
consecutiveFailures:
description: |-
ConsecutiveFailures is the current count of consecutive health check failures.
Resets to 0 when the backend becomes healthy again.
type: integer
lastHealthCheck:
description: LastHealthCheck is the timestamp of the last health
check
Expand Down
5 changes: 3 additions & 2 deletions docs/operator/crd-api.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions pkg/vmcp/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,12 @@ type FailureHandlingConfig struct {
// +optional
UnhealthyThreshold int `json:"unhealthyThreshold,omitempty" yaml:"unhealthyThreshold,omitempty"`

// HealthCheckTimeout is the maximum duration for a single health check operation.
// Should be less than HealthCheckInterval to prevent checks from queuing up.
// +kubebuilder:default="10s"
// +optional
HealthCheckTimeout Duration `json:"healthCheckTimeout,omitempty" yaml:"healthCheckTimeout,omitempty"`

// StatusReportingInterval is the interval for reporting status updates to Kubernetes.
// This controls how often the vMCP runtime reports backend health and phase changes.
// Lower values provide faster status updates but increase API server load.
Expand Down Expand Up @@ -477,12 +483,16 @@ type CircuitBreakerConfig struct {
Enabled bool `json:"enabled,omitempty" yaml:"enabled,omitempty"`

// FailureThreshold is the number of failures before opening the circuit.
// Must be >= 1.
// +kubebuilder:default=5
// +kubebuilder:validation:Minimum=1
// +optional
FailureThreshold int `json:"failureThreshold,omitempty" yaml:"failureThreshold,omitempty"`

// Timeout is the duration to wait before attempting to close the circuit.
// Must be >= 1s to prevent thrashing.
// +kubebuilder:default="60s"
// +kubebuilder:validation:XValidation:rule="self == '' || duration(self) >= duration('1s')",message="timeout must be >= 1s"
// +optional
Timeout Duration `json:"timeout,omitempty" yaml:"timeout,omitempty"`
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/vmcp/health/circuit_breaker.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const (
// CircuitOpen indicates failing state - requests fail immediately
CircuitOpen CircuitState = "open"
// CircuitHalfOpen indicates recovery testing - limited requests allowed
CircuitHalfOpen CircuitState = "half_open"
CircuitHalfOpen CircuitState = "half-open"
)

// CircuitBreaker defines the interface for circuit breaker implementations.
Expand Down
34 changes: 20 additions & 14 deletions pkg/vmcp/health/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -613,27 +613,33 @@ func (m *Monitor) convertToDiscoveredBackends(allStates map[string]*State) []vmc
// m.backends before starting goroutines and ignore results for removed backends.
// Keep as defensive fallback.
discoveredBackends = append(discoveredBackends, vmcp.DiscoveredBackend{
Name: backendID,
URL: "",
Status: state.Status.ToCRDStatus(),
AuthConfigRef: "",
AuthType: "",
LastHealthCheck: metav1.NewTime(state.LastCheckTime),
Message: formatBackendMessage(state),
Name: backendID,
URL: "",
Status: state.Status.ToCRDStatus(),
AuthConfigRef: "",
AuthType: "",
LastHealthCheck: metav1.NewTime(state.LastCheckTime),
Message: formatBackendMessage(state),
CircuitBreakerState: string(state.CircuitState),
CircuitLastChanged: metav1.NewTime(state.CircuitLastChanged),
ConsecutiveFailures: state.ConsecutiveFailures,
})
continue
}

authConfigRef, authType := extractAuthInfo(backend)

discoveredBackends = append(discoveredBackends, vmcp.DiscoveredBackend{
Name: backend.Name,
URL: backend.BaseURL,
Status: state.Status.ToCRDStatus(),
AuthConfigRef: authConfigRef,
AuthType: authType,
LastHealthCheck: metav1.NewTime(state.LastCheckTime),
Message: formatBackendMessage(state),
Name: backend.Name,
URL: backend.BaseURL,
Status: state.Status.ToCRDStatus(),
AuthConfigRef: authConfigRef,
AuthType: authType,
LastHealthCheck: metav1.NewTime(state.LastCheckTime),
Message: formatBackendMessage(state),
CircuitBreakerState: string(state.CircuitState),
CircuitLastChanged: metav1.NewTime(state.CircuitLastChanged),
ConsecutiveFailures: state.ConsecutiveFailures,
})
}

Expand Down
16 changes: 16 additions & 0 deletions pkg/vmcp/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,22 @@ type DiscoveredBackend struct {
// Message provides additional information about the backend status
// +optional
Message string `json:"message,omitempty"`

// CircuitBreakerState is the current circuit breaker state (closed, open, half-open).
// Empty when circuit breaker is disabled or not configured.
// +optional
// +kubebuilder:validation:Enum=closed;open;half-open
Comment thread
yrobla marked this conversation as resolved.
CircuitBreakerState string `json:"circuitBreakerState,omitempty"`

// CircuitLastChanged is the timestamp when the circuit breaker state last changed.
// Empty when circuit breaker is disabled or has never changed state.
// +optional
CircuitLastChanged metav1.Time `json:"circuitLastChanged,omitempty"`

// ConsecutiveFailures is the current count of consecutive health check failures.
// Resets to 0 when the backend becomes healthy again.
// +optional
ConsecutiveFailures int `json:"consecutiveFailures,omitempty"`
}

// DeepCopyInto copies the receiver into out. Required for Kubernetes CRD types.
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/thv-operator/virtualmcp/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ func WaitForVirtualMCPServerReady(
}

for _, condition := range vmcpServer.Status.Conditions {
if condition.Type == "Ready" {
if condition.Type == mcpv1alpha1.ConditionTypeVirtualMCPServerReady {
if condition.Status == "True" {
// Also check that at least one pod is actually running and ready
labels := map[string]string{
Expand Down
Loading
Loading