Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions main.tf
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
locals {
cluster_domain = "${var.cluster_name}.${var.domain_zone}"
storage_class_name = "cosmotech-retain"
cluster_domain = "${var.cluster_name}.${var.domain_zone}"
storage_class_name = "cosmotech-retain"
enable_workload_scheduler = true
persistences = {
keycloak-postgresql = {
size = 10
Expand Down Expand Up @@ -86,6 +87,12 @@ resource "time_sleep" "timer" {
}


# Instantiates the workload scheduler module from ./modules/workload_scheduler.
# Only the enable flag is passed here; the other inputs the module reads
# (namespace, time zone, cron schedules) presumably fall back to defaults
# declared in the module's variables — confirm, they are not set by this call.
module "workload_scheduler" {
  source = "./modules/workload_scheduler"
  enable_workload_scheduler = local.enable_workload_scheduler
}


module "storageclass" {
source = "./modules/kube_storageclass"
cloud_provider = var.cloud_provider
Expand Down
12 changes: 12 additions & 0 deletions modules/workload_scheduler/kube_objects/cluster-role-binding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Binds the ClusterRole named after MAIN_NAME to the ServiceAccount of the same
# name, giving that account the role's permissions across the whole cluster.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: ${MAIN_NAME}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: ${MAIN_NAME}
subjects:
  - kind: ServiceAccount
    name: ${MAIN_NAME}
    # NOTE(review): the subject namespace is hard-coded to "default", while the
    # scale-up / scale-down CronJobs are created in the NAMESPACE template
    # value and reference this ServiceAccount by name only. A pod can only use
    # a ServiceAccount from its own namespace, so this presumably works only
    # when NAMESPACE is "default" — confirm. If it must change, change it
    # together with the namespace in service-account.yaml.
    namespace: default
29 changes: 29 additions & 0 deletions modules/workload_scheduler/kube_objects/cluster-role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Cluster-wide permissions used by the scale-down and scale-up CronJobs.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: ${MAIN_NAME}
rules:
  # Read AND modify workloads: besides get/list/watch this grants patch and
  # update, which the scale-up job relies on when it runs
  # "kubectl rollout restart" against Deployments and StatefulSets.
  - apiGroups: ["apps"]
    resources:
      - deployments
      - statefulsets
    verbs: ["get", "list", "watch", "patch", "update"]

  # Scale workloads ("kubectl scale" in both jobs goes through the scale
  # subresource).
  - apiGroups: ["apps"]
    resources:
      - deployments/scale
      - statefulsets/scale
    verbs: ["get", "update", "patch"]

  # Read namespaces (the scale-down job lists them to iterate over workloads).
  - apiGroups: [""]
    resources: ["namespaces"]
    verbs: ["get", "list"]

  # Read and delete pods: the scale-up job inspects pod status and runs
  # "kubectl delete pod" on pods it considers unhealthy, so "delete" IS
  # required here.
  - apiGroups: [""]
    resources:
      - pods
    verbs: ["get", "list", "watch", "delete"]
69 changes: 69 additions & 0 deletions modules/workload_scheduler/kube_objects/cronjob-scale-down.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# CronJob that records the replica count of every Deployment / StatefulSet
# outside the system namespaces into a state file on the shared volume, then
# scales those workloads to zero. The scale-up CronJob reads the same file.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: scale-down
  namespace: ${NAMESPACE}
spec:
  # Quoted on purpose: a cron expression starting with "*" or "@" is not a
  # valid unquoted YAML scalar.
  schedule: "${SCALE_DOWN_CRON_SCHEDULE}"
  timeZone: ${SCALER_TIME_ZONE}
  jobTemplate:
    spec:
      template:
        spec:
          serviceAccountName: ${MAIN_NAME}
          restartPolicy: OnFailure
          containers:
            - name: ${MAIN_NAME}-scaler
              image: ${SCALER_IMAGE_TAG}
              command:
                - /bin/sh
                - -c
                - |
                  # This script is rendered by Terraform templatefile(): write
                  # shell variables WITHOUT braces, because a dollar sign
                  # directly followed by an opening brace would be treated as
                  # a template interpolation.
                  set -eu

                  STATE_FILE=/data/replica-state.json
                  NEW_STATE="$STATE_FILE.tmp"
                  SNAPSHOT=/data/workloads-snapshot.json.tmp

                  trap 'rm -f "$SNAPSHOT" "$NEW_STATE"' EXIT

                  echo "Saving state..."

                  # Fetch first, transform second. "set -e" only looks at the
                  # last command of a pipeline, so piping a failing kubectl
                  # into jq would leave an empty state file and still scale
                  # everything down.
                  kubectl get deploy,statefulset -A -o json > "$SNAPSHOT"

                  # Re-use the previous state when it is a valid JSON array, so
                  # that a run started while workloads are already at zero
                  # (retry, second run before the next scale-up) does not
                  # overwrite the saved replica counts with zeros.
                  if [ -s "$STATE_FILE" ] && jq -e 'type == "array"' "$STATE_FILE" > /dev/null 2>&1; then
                    PREVIOUS_STATE="$STATE_FILE"
                  else
                    PREVIOUS_STATE=/dev/null
                  fi

                  jq --slurpfile prev "$PREVIOUS_STATE" '
                    (($prev[0] // [])
                      | map({key: (.namespace + "/" + .kind + "/" + .name), value: .replicas})
                      | from_entries) as $saved
                    | [
                        .items[]
                        | select(.metadata.namespace as $ns
                            | ($ns != "kube-system"
                                and $ns != "kube-public"
                                and $ns != "kube-node-lease"
                                and $ns != "calico-system"
                                and $ns != "tigera-operator"))
                        | (.metadata.namespace + "/" + .kind + "/" + .metadata.name) as $key
                        | (.spec.replicas // 1) as $current
                        | {
                            namespace: .metadata.namespace,
                            kind: .kind,
                            name: .metadata.name,
                            # A workload currently at zero keeps the count saved
                            # by an earlier run, if there is one.
                            replicas: (if $current == 0 then ($saved[$key] // 0) else $current end)
                          }
                      ]' "$SNAPSHOT" > "$NEW_STATE"

                  # Replace the state file only once the new one is complete.
                  mv "$NEW_STATE" "$STATE_FILE"

                  echo "Scaling down..."

                  # Assigned separately so a failing "kubectl get ns" aborts
                  # the job instead of silently skipping the loop.
                  NAMESPACES=$(kubectl get ns -o jsonpath='{.items[*].metadata.name}')

                  for ns in $NAMESPACES; do

                    case "$ns" in
                      kube-system|kube-public|kube-node-lease|calico-system|tigera-operator)
                        echo "Skipping $ns"
                        continue
                        ;;
                    esac

                    # Best effort: "kubectl scale --all" fails in namespaces
                    # that contain no object of the requested kind.
                    kubectl scale deployment --all -n "$ns" --replicas=0 || true
                    kubectl scale statefulset --all -n "$ns" --replicas=0 || true
                  done

                  echo "Done."

              volumeMounts:
                - name: state
                  mountPath: /data

          volumes:
            - name: state
              persistentVolumeClaim:
                claimName: pvc-${MAIN_NAME}-scaler-replica-state
152 changes: 152 additions & 0 deletions modules/workload_scheduler/kube_objects/cronjob-scale-up.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# CronJob that restores the replica counts saved by the scale-down CronJob
# (StatefulSets first, then Deployments) and afterwards runs three rounds of
# pod health checks, deleting / restarting workloads whose pods look unhealthy.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: scale-up
  namespace: ${NAMESPACE}
spec:
  # Quoted on purpose: a cron expression starting with "*" or "@" is not a
  # valid unquoted YAML scalar.
  schedule: "${SCALE_UP_CRON_SCHEDULE}"
  timeZone: ${SCALER_TIME_ZONE}
  jobTemplate:
    spec:
      template:
        spec:
          serviceAccountName: ${MAIN_NAME}
          restartPolicy: OnFailure

          containers:
            - name: ${MAIN_NAME}-scaler
              image: ${SCALER_IMAGE_TAG}
              command:
                - /bin/sh
                - -c
                - |
                  # This script is rendered by Terraform templatefile(): write
                  # shell variables WITHOUT braces, because a dollar sign
                  # directly followed by an opening brace would be treated as
                  # a template interpolation.
                  set -eu

                  STATE_FILE=/data/replica-state.json

                  if [ ! -f "$STATE_FILE" ]; then
                    echo "State file missing"
                    exit 1
                  fi

                  # A truncated or corrupt state file must fail loudly instead
                  # of being treated as "nothing to restore".
                  if ! jq -e 'type == "array"' "$STATE_FILE" > /dev/null 2>&1; then
                    echo "State file is not a valid JSON array"
                    exit 1
                  fi

                  # Owners already restarted in the current health-check round.
                  # A file is used because the pod loop runs in a pipeline
                  # subshell, where variable changes would be lost.
                  RESTARTED=$(mktemp)
                  trap 'rm -f "$RESTARTED"' EXIT

                  # restore_kind KIND RESOURCE
                  # Scales every saved object of KIND (as written in the state
                  # file) back to its saved replica count; RESOURCE is the
                  # kubectl resource name to scale.
                  restore_kind() {
                    jq -r --arg kind "$1" '
                      .[]
                      | select(.kind == $kind)
                      | [.namespace, .name, (.replicas | tostring)]
                      | join(" ")
                    ' "$STATE_FILE" | while read -r ns name replicas; do
                      # Best effort: one object that cannot be scaled must not
                      # stop the restore of the others.
                      kubectl scale "$2" "$name" -n "$ns" --replicas="$replicas" || true
                    done
                  }

                  # pod_field FILTER
                  # Prints the result of the jq FILTER applied to the pod JSON
                  # held in $pod. printf is used because echo may interpret
                  # backslashes contained in the JSON.
                  pod_field() {
                    printf '%s\n' "$pod" | jq -r "$1"
                  }

                  # restart_once RESOURCE NAMESPACE NAME
                  # Runs "kubectl rollout restart" at most once per round for a
                  # given owner, however many of its pods are unhealthy.
                  restart_once() {
                    restart_key="$1/$2/$3"
                    if grep -qxF -- "$restart_key" "$RESTARTED"; then
                      return 0
                    fi
                    printf '%s\n' "$restart_key" >> "$RESTARTED"
                    kubectl rollout restart "$1" -n "$2" "$3" || true
                  }

                  echo "Restoring StatefulSets..."
                  restore_kind StatefulSet statefulset

                  echo "Waiting for StatefulSets..."
                  sleep 60

                  echo "Restoring Deployments..."
                  restore_kind Deployment deployment

                  echo "Initial restore done."

                  echo "Waiting for pods to stabilize (10 min)..."
                  sleep 600

                  # Only namespaces that appear in the state file are checked.
                  NAMESPACES=$(jq -r '.[] | .namespace' "$STATE_FILE" | sort -u)

                  if [ -z "$NAMESPACES" ]; then
                    echo "No namespaces found in state file"
                    exit 0
                  fi

                  for round in 1 2 3; do
                    echo "Health check round $round..."
                    : > "$RESTARTED"

                    for ns in $NAMESPACES; do

                      case "$ns" in
                        kube-system|kube-public|kube-node-lease|calico-system|tigera-operator)
                          continue
                          ;;
                      esac

                      # Pods of this namespace only (no cluster-wide scan).
                      kubectl get pods -n "$ns" -o json | jq -c '.items[]' | while IFS= read -r pod; do

                        name=$(pod_field '.metadata.name')
                        phase=$(pod_field '.status.phase')

                        # Completed pods are never "ready"; they are healthy by
                        # definition and must not be reported or deleted.
                        if [ "$phase" = "Succeeded" ]; then
                          continue
                        fi

                        ready=$(pod_field '
                          [.status.containerStatuses[]?.ready] | all // false
                        ')

                        restarts=$(pod_field '
                          [.status.containerStatuses[]?.restartCount] | add // 0
                        ')

                        waiting_reason=$(pod_field '
                          [.status.containerStatuses[]?.state.waiting.reason]
                          | map(select(. != null))
                          | .[0] // empty
                        ')

                        unhealthy=false

                        if [ "$phase" != "Running" ]; then
                          unhealthy=true
                        fi

                        if [ "$ready" != "true" ]; then
                          unhealthy=true
                        fi

                        if [ "$restarts" -gt 3 ]; then
                          unhealthy=true
                        fi

                        if printf '%s\n' "$waiting_reason" | grep -qE "CrashLoopBackOff|ImagePullBackOff|CreateContainerError"; then
                          unhealthy=true
                        fi

                        if [ "$unhealthy" = true ]; then
                          echo "Unhealthy pod: $ns/$name | phase=$phase ready=$ready restarts=$restarts reason=$waiting_reason"

                          owner_kind=$(pod_field '.metadata.ownerReferences[0].kind // empty')
                          owner_name=$(pod_field '.metadata.ownerReferences[0].name // empty')

                          # SAFE RECOVERY
                          if [ "$restarts" -gt 2 ] || [ -n "$waiting_reason" ]; then
                            echo "Deleting pod $ns/$name"
                            kubectl delete pod "$name" -n "$ns" --grace-period=10 || true
                          fi

                          if [ "$owner_kind" = "ReplicaSet" ]; then
                            # A Deployment names its ReplicaSets
                            # "<deployment>-<pod-template-hash>" and puts the
                            # same hash in a pod label. Stripping exactly that
                            # label is reliable; guessing the suffix length is
                            # not, since hashes vary in length.
                            deploy=$(pod_field '
                              (.metadata.labels["pod-template-hash"] // "") as $hash
                              | (.metadata.ownerReferences[0].name // "") as $rs
                              | if $hash != "" and ($rs | endswith("-" + $hash))
                                then ($rs | rtrimstr("-" + $hash))
                                else empty
                                end
                            ')

                            if [ -n "$deploy" ]; then
                              restart_once deployment "$ns" "$deploy"
                            else
                              echo "No Deployment found for ReplicaSet $ns/$owner_name, skipping restart"
                            fi

                          elif [ "$owner_kind" = "StatefulSet" ]; then
                            restart_once statefulset "$ns" "$owner_name"
                          fi
                        fi

                      done
                    done

                    sleep 60
                  done

                  echo "Done (restore + self-heal)"

              volumeMounts:
                - name: state
                  mountPath: /data

          volumes:
            - name: state
              persistentVolumeClaim:
                claimName: pvc-${MAIN_NAME}-scaler-replica-state
11 changes: 11 additions & 0 deletions modules/workload_scheduler/kube_objects/pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Volume claim mounted at /data by both the scale-down and the scale-up
# CronJobs; it carries the replica-state file from one job to the other.
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  namespace: ${NAMESPACE}
  name: pvc-${MAIN_NAME}-scaler-replica-state
spec:
  accessModes: [ReadWriteOnce]
  resources:
    requests: {storage: 1Gi}
5 changes: 5 additions & 0 deletions modules/workload_scheduler/kube_objects/service-account.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Identity used by the scale-down / scale-up CronJob pods; its permissions come
# from the ClusterRoleBinding that names this account as its subject.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: ${MAIN_NAME}
  # NOTE(review): hard-coded to "default", while the CronJobs that set
  # serviceAccountName to this account are created in the NAMESPACE template
  # value. A pod can only use a ServiceAccount from its own namespace, so this
  # presumably works only when NAMESPACE is "default" — confirm. If it must
  # change, change it together with the subject namespace in
  # cluster-role-binding.yaml.
  namespace: default
61 changes: 61 additions & 0 deletions modules/workload_scheduler/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
terraform {
  required_providers {
    kubectl = {
      version = "~> 2.1.3"
      source  = "alekc/kubectl"
    }
  }
}

locals {
  # Values substituted into every manifest template under kube_objects/.
  objects_values = {
    MAIN_NAME                = "workload-scheduler"
    NAMESPACE                = var.namespace
    SCALER_IMAGE_TAG         = "alpine/k8s:1.36.0"
    SCALER_TIME_ZONE         = var.scaler_time_zone
    SCALE_DOWN_CRON_SCHEDULE = var.scale_down_cron_schedule
    SCALE_UP_CRON_SCHEDULE   = var.scale_up_cron_schedule
  }

  # Shared by every resource below: one instance when the scheduler is
  # enabled, none otherwise.
  scheduler_instances = var.enable_workload_scheduler ? 1 : 0
}


# Identity the CronJob pods run as.
resource "kubectl_manifest" "service_account" {
  count           = local.scheduler_instances
  yaml_body       = templatefile("${path.module}/kube_objects/service-account.yaml", local.objects_values)
  validate_schema = false
}


# Cluster-wide permissions for that identity.
resource "kubectl_manifest" "cluster_role" {
  count           = local.scheduler_instances
  yaml_body       = templatefile("${path.module}/kube_objects/cluster-role.yaml", local.objects_values)
  validate_schema = false
}


# Attaches the ClusterRole to the ServiceAccount.
resource "kubectl_manifest" "cluster_role_binding" {
  count           = local.scheduler_instances
  yaml_body       = templatefile("${path.module}/kube_objects/cluster-role-binding.yaml", local.objects_values)
  validate_schema = false
}


# Scheduled job that saves replica counts and scales workloads to zero.
resource "kubectl_manifest" "cronjob_scale_down" {
  count           = local.scheduler_instances
  yaml_body       = templatefile("${path.module}/kube_objects/cronjob-scale-down.yaml", local.objects_values)
  validate_schema = false
}


# Scheduled job that restores the saved replica counts.
resource "kubectl_manifest" "cronjob_scale_up" {
  count           = local.scheduler_instances
  yaml_body       = templatefile("${path.module}/kube_objects/cronjob-scale-up.yaml", local.objects_values)
  validate_schema = false
}


# Volume claim holding the replica-state file shared by both jobs.
resource "kubectl_manifest" "pvc" {
  count           = local.scheduler_instances
  yaml_body       = templatefile("${path.module}/kube_objects/pvc.yaml", local.objects_values)
  validate_schema = false
}
Loading