Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/workflows-cluster/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: workflows-cluster
description: A virtual cluster for Data Analysis workflows
type: application

version: 0.9.30
version: 0.10.0
dependencies:
- name: common
version: 2.23.0
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{{- if eq .Values.cluster "argus" }}
# Sealed rclone credentials for the etcd backup/restore jobs.
# Encrypted per-cluster: SealedSecrets can only be decrypted by the
# controller of the cluster they were sealed for, hence one blob per cluster.
apiVersion: bitnami.com/v1alpha1
kind: SealedSecret
metadata:
  name: etcd-rclone-config
  namespace: workflows
spec:
  encryptedData:
    rclone.conf: AgAcczoUI/F2QEk+4b31YXP4SvntZUrd/6RLvRReWGpbyvHv1hNpulJJB741NPjyk6Sykqu+JcJ4wdikIB/BZbOYxF04OmP5WvrwAqqf5CFgxNGckhOlG967c8wDVdSgX1fdrI3Uobk7Z2KYy6oxE0NBzhQKbnffn1BjxG5ybPuO9/4K5F+VoF9PqeCS35fHdGxjZNsD5Feaj58SZXQkI7kGZvSXT7iMd7Io/XdLpc39rtFic1e/Wtavrm2fqg+07OZh3nTk+Yew1haXUMEFG7AC8c62PcyYUXgOcFjDfyE5F3BYK3A7n1s5rzrItQQcH2wFHmiSHq/b2UlU94uWNqgOU5sCISjKJRzjiNgiaCzbrppdIVe3RAFZSMg4oPYxAfBrhgpkBQGcCHOHU+IOgRr+wrPtMWHOgXxGrJ8ZkknT2xrbk4Ui0IC6zvm9AJb5Jn0vQ1Spyy/B/dwIdIr0T4cgTrZ9MtLHaLr/Bc4UAU7TNJllL1OUGuwKHDVRHA3avj7t5s5wsm/V8j0aDwotQBZrKm7rdEmD+QoHd9GEWkcBwttVcq8DBWgD+mkVZVjVwyC6l9Ve0ZOnBi7IDJPDIr+gBYrLai3CTJTW5S4OKANpnN/OimAmrR3ZnmDrY+1a2vrXbQEIrwky+8QBHYaqBAhfV2u0aSbVC6FXYZwPD1RR9q8Ip6iRrFaBUwgJwPp0TOytnMkxyBgWq9KLSdOPmmj+wqCZR5vOBaQk9aItvrySX0ZT1Cs5ucOWdyhSPE/wRqFCJng1Vzg6wKohkGhPtA55BFQqxqBpdmqzEWnRyKhn4rt3xatVIrgV0G6v6vdMaZawi5TqsfI69JnKecybtbcVw4TEYLvsbA3uHZawpxN4FPjSpOZZZ31QSEl3LC5jSYAidM6gO5uB+/BCprBg90KqPeaa4vTJDUy4EtLT4BPGM9C7N2DReb2kAeglCAoqbPHyvwR+smPaSvRZR/fc93sYSyLdmDhHUHhIhqsjMU8JWFtPYhWk7zI2KUv7SKv8YC23FSzRQ17/pnUR97UbHmY9W/fhHl2ImCT79nashUmldxTDNq0R9nLewpzp2lbXBsVd5hWK+rj6F+sg7uhGN2aZk7Ly8QpkTf/+F3GPPawPLTQ2UR53hs1g7rn+oLSBaNzl2Rv7wGlv3AkP8eyChn9KUnKc1TRaROsW4hdbYjOuH9H3HP1s9ZHn+SJxwcgORktZAoZJliY15NQNqwd3hZBYal03ecfEVCGW4OXd00lGSQL43JEPRXXmGwKWsI0QVQvC6r6joaDb6az4gecPsfHPLgJt+NqRZdWEpbOxGvvrq9ZYVQOU3SK4AtePAhbHJvXw9yOysmliYkwQEWATw0KMutXb82MYqnxJd9NbicrkZsTo2KsTaQ1r//US7kvYAX9H
  template:
    metadata:
      name: etcd-rclone-config
      namespace: workflows
    type: Opaque
{{- else if eq .Values.cluster "pollux" }}
apiVersion: bitnami.com/v1alpha1
kind: SealedSecret
metadata:
  name: etcd-rclone-config
  namespace: workflows
spec:
  encryptedData:
    rclone.conf: AgB1fwXmIpQq0DJRlqUJIPHandXkjcaaQafWnLeq0Mc/lA/f/xV/s/xryJMjpLmFuC7RFtpbNQatnK+fGBIbBn/A9JzXH5Vlhj4HMIzo35oDoccwTouN6aCG9LTKC7GWOa02S5gTegaoWl42tECy3SNqUv6Hcgxf7srrd0pMEdXx/CxlxAVUtHwzPfsNGBqWPN1L8m1MzImwaYC9oMcC3l9cG1XdMCeEJi4mOr/x7j1bOgwhqHhOiKWv0k91OVCuduG7U4bIt8dJR/REDYGmEdQTeOmKZwRW16wmMFf7sz2DHXEnQVzB57WZqb0qa836y+xJb3iHZN1Ky5osoQ+fJL9J20+TFF2SuCvVfAZX8nY0bwCamHCJxL8M0l9qHeGX9lXPN5d9ydJEc4syNoWUeWGUmUJoV+XCeV++CycT2SxT4sePbnxCRUYGpHtgSD90InRSephLRoGP+0PtagQiBL2m1jiHLAcpUeIQz6VpeimTOcaiQ3cDK4NsuwgTmJyfVz2lRxs+DHt3MIhRRjd3ivGh23Kfl5OO0jD1dP7e8DrcHxFNl1q+lILNQIAmYlFIiyq0xw9gkddQ/Swahu8pkmbRrVdbycv9havZiUxYLFArPiFwEHVm6LlNKOsv+H5x2zkVAUnClAldaolOdpFBoIuxx13sG1bFR50lsvDeqd0Wc08Ewp8vtC3DMr6N6MpFKZgxclRjaUuY+KWlJtC2frMydxfL/VtA6NiSp9hYxnwAxpSinMiybvr7BhsvvC1K/2hshgBnA2DoozqTfymNH8l8Kj2Fl0kunlvKLJAM8wYr1/knLluonUzWTPb38iDNg3udW6p0sJ7Zwe0n9ZwsQt2ZD0qGt7AppUN2ZYOHIE/yBzkT89QRaOKQ0mc6R0lb0d6cdcCoghQIGDjgelfBlyZRzvNSG8+bgQg4QLTWMS2O3XQYuA995Q3UJrKPUb2vFT4wW3aNwjLCi8cwQMHsHoyQERT37xCfj746R3SDtglQ08g+cIJ+imUaP7jiNDd4elzKB/w41fsdNGrVfPJB+9dnOdq5YS4y0k5eTpKiyVO6VPe3cxuzXqnw/4Ih3xEoxEh5idUJC8f50zYMbAHDDjOJxEKw101bbUAiSlM67vKPcS7FVDczCIhmJhulQROBLD3FbAni0TWxK1h8i6y8Qu16zV/p5BHBMPr2Q8kw46GpveDtZUsNgKxI2wPr0Oj/PUptPP1KdkT7Z6YordTKp4U69YqI6zjvF2pa8ibBHfVTpyrzDv32MtzECfG9xT9/8JrdxMPBt37jHEWsy/jrzeAOKvF84nFeKLAPMx5Gw6f3jw1Dvw4mk4HI9KnwT2twVvjptaZCu0fot3ynZbD0uCRzXLOaZsKQ9O8xAllmgivmjWK6QlNZAQlgs2lZZoMjTesa
  template:
    metadata:
      name: etcd-rclone-config
      namespace: workflows
    type: Opaque
{{- end }}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should there be an empty line after all of these?

5 changes: 5 additions & 0 deletions charts/workflows-cluster/dev-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@ secrets:
enabled: false
cluster: dev

# Etcd backup settings for this environment; disabled on dev.
backup:
enabled: false
bucket:
# Subdirectory within the backup bucket used by this environment.
prefix: dev

vcluster:
controlPlane:
distro:
Expand Down
46 changes: 46 additions & 0 deletions charts/workflows-cluster/scripts/restore-etcd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it more robust to do:

#!/usr/bin/env bash


Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it better to add `set -euo pipefail` so that the script stops on errors?

# Require an explicit kubeconfig so we never operate on an unintended cluster.
if [ -z "$KUBECONFIG" ]
then
echo "Kube config not found - have you loaded a cluster?"
exit 1
fi

# Cluster name read from the kubeconfig, shown in the confirmation prompt.
# (grep on the file directly; quote the path in case it contains spaces.)
CLUSTER=$(grep "cluster: " "$KUBECONFIG" | awk '{print $2}')

# Work relative to the chart root (parent of this script's directory).
cd "${0%/*}/.."

read -p "WARNING: You are about to attempt a full restore of the VCluster etcd on $CLUSTER. Proceed? (y/n)" -r
if [[ $REPLY =~ ^[Yy]$ && -n "$CLUSTER" ]]
then
# Record current replica counts so we can scale back to them afterwards.
ETCD_REPLICAS=$(kubectl get sts workflows-cluster-etcd -n workflows -o=jsonpath='{.spec.replicas}')
DEPLOY_REPLICAS=$(kubectl get deploy workflows-cluster -n workflows -o=jsonpath='{.spec.replicas}')
echo "Scaling down..."
kubectl scale sts workflows-cluster-etcd -n workflows --replicas=0
kubectl scale deploy workflows-cluster -n workflows --replicas=0

# Wait for the pods to terminate fully before touching their volumes.
kubectl wait --for=delete pod -l app=vcluster-etcd -n workflows --timeout=300s
kubectl wait --for=delete pod -l app=vcluster -n workflows --timeout=300s

echo "Scale down complete. Creating restoration Jobs."

# One restore Job per etcd member, created from the suspended CronJobs.
for ((i=0;i<ETCD_REPLICAS;i++)); do
kubectl -n workflows create job --from=cronjob/restore-etcd-$i restore-etcd-$i
done

echo "Waiting for Jobs to complete..."

for ((i=0;i<ETCD_REPLICAS;i++)); do
kubectl -n workflows wait --for=condition=complete job/restore-etcd-$i --timeout=300s
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the timeout enough for prod?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure... This seems sufficient for Pollux, so maybe leave it as is for now, then we can boost it when we switch on backups for Argus?

# Remove each completed restore Job so a rerun can recreate it cleanly.
kubectl -n workflows delete job/restore-etcd-$i
done

echo "Jobs complete. Scaling back up..."

# Restore the replica counts recorded before the scale-down.
# Quote the expansions so an empty/odd value fails loudly (shellcheck SC2086).
kubectl -n workflows scale sts workflows-cluster-etcd --replicas="$ETCD_REPLICAS"
kubectl wait --for=condition=Ready pod -l app=vcluster-etcd -n workflows --timeout=300s

kubectl -n workflows scale deploy workflows-cluster --replicas="$DEPLOY_REPLICAS"
kubectl wait --for=condition=Ready pod -l app=vcluster -n workflows --timeout=300s
echo "Restore complete."
fi
5 changes: 5 additions & 0 deletions charts/workflows-cluster/staging-values.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
secrets:
cluster: pollux

# Etcd backup settings for this environment; enabled on staging first to
# validate the backup pipeline before prod.
backup:
enabled: true
bucket:
# Subdirectory within the backup bucket used by this environment.
prefix: staging

vcluster:
controlPlane:
backingStore:
Expand Down
10 changes: 10 additions & 0 deletions charts/workflows-cluster/templates/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{{/*
Render the etcd --initial-cluster value for the given replica count
(passed as the template argument): a comma-separated list of
member-name=peer-URL pairs, one per replica.
*/}}
{{- define "workflows.etcdInitialCluster" -}}
{{- $count := . -}}
{{- range $idx := until $count -}}
{{- if ne $idx 0 -}},{{- end -}}
workflows-cluster-etcd-{{ $idx }}=https://workflows-cluster-etcd-{{ $idx }}.workflows-cluster-etcd-headless.workflows:2380
{{- end -}}
{{- end }}
79 changes: 79 additions & 0 deletions charts/workflows-cluster/templates/etcd-backup-cronjob.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# CronJob that snapshots the vcluster etcd and uploads the snapshot via rclone.
apiVersion: batch/v1
kind: CronJob
metadata:
name: backup-etcd
spec:
{{- if .Values.backup.enabled }}
schedule: "@daily"
{{ else }}
# Backups disabled: keep a far-future schedule and suspend the job (below),
# so the CronJob object still exists but never fires.
schedule: "@yearly"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if .Values.backup.enabled is false the backups still happen but once per year?

Is there a reason you don't just wrap the whole CronJob with {{- if .Values.backup.enabled }} so that CronJob simply doesn't get applied if backup is disabled?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I borrowed this pattern from the LIMS postgres backup, but you're right, I could just wrap the whole thing. I think the suspend: true means it wouldn't run, but I'll change it just in case

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see my comment below. Having read more about CronJobs, what you are doing may be OK.

# Suspended when backups are disabled, so the CronJob never actually runs.
suspend: true
{{- end }}
jobTemplate:
spec:
backoffLimit: 2
template:
spec:
initContainers:
# Init container takes the etcd snapshot; the main container uploads it.
- name: backup
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

extra space after backup

image: registry.k8s.io/etcd:3.5.21-0
# etcdctl client TLS material, mounted from the cluster cert secret below.
env:
- name: ETCDCTL_CACERT
value: /certs/etcd-ca.crt
- name: ETCDCTL_CERT
value: /certs/etcd-server.crt
- name: ETCDCTL_KEY
value: /certs/etcd-server.key
command: ["etcdctl"]
args:
# NOTE(review): snapshots only member 0 — assumes its data is authoritative;
# confirm this is acceptable for the HA topology.
- --endpoints=https://workflows-cluster-etcd-0.workflows-cluster-etcd-headless.workflows.svc:2379
# NOTE(review): --debug is deprecated in etcdctl v3.5 (--log-level=debug is
# the replacement) — confirm it is still accepted by this image.
- --debug
- snapshot
- save
- /backup/etcd-snapshot.db
volumeMounts:
- name: backup
mountPath: /backup
- name: certs
mountPath: /certs
resources:
requests:
ephemeral-storage: "4Gi"
limits:
ephemeral-storage: "8Gi"
containers:
# Uploads the snapshot written by the init container to S3 via rclone.
- name: rclone
# NOTE(review): image tag is unpinned (implicitly "latest") — consider
# pinning for reproducibility, like the etcd image above.
image: docker.io/rclone/rclone
command: ["/bin/sh"]
args: ["/scripts/rclone-upload.sh"]
env:
- name: RCLONE_CONFIG
value: /etc/rclone.conf
# Bucket subdirectory for this environment (dev/staging/prod).
- name: PREFIX
value: {{ $.Values.backup.bucket.prefix | quote }}
volumeMounts:
- name: backup
mountPath: /backup
- name: scripts
mountPath: /scripts
- name: rclone-conf
mountPath: /etc/rclone.conf
subPath: rclone.conf
resources:
requests:
ephemeral-storage: "4Gi"
limits:
ephemeral-storage: "8Gi"
restartPolicy: Never
volumes:
# Shared scratch space: snapshot is written here, then uploaded.
- name: backup
emptyDir: {}
- name: scripts
configMap:
name: etcd-rclone-scripts
# Sealed rclone credentials (see the SealedSecret template).
- name: rclone-conf
secret:
secretName: etcd-rclone-config
- name: certs
secret:
secretName: workflows-cluster-certs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Shell scripts used by the backup and restore jobs, mounted into the rclone
# containers. rclone-upload.sh pushes a fresh snapshot to S3 and prunes old
# ones; rclone-download.sh (below) fetches the latest snapshot for a restore.
apiVersion: v1
kind: ConfigMap
metadata:
name: etcd-rclone-scripts
data:
rclone-upload.sh: |
SNAP=/backup/etcd-snapshot.db
# Check for today's backup
if [ ! -s "$SNAP" ]; then
echo "Backup does not exist"
exit 1
fi

# Want to track if any rclone command fails.
failed="false"

# Timestamp
mv "$SNAP" /backup/etcd-snapshot-$(date +%Y-%m-%d_%H-%M-%S_%Z).db || failed="true"

echo "backing up to echo S3"
rclone copy /backup/ echo:dls-workflows-prod/${PREFIX}
rclone_echo_s3_exit_code=$?

if [ $rclone_echo_s3_exit_code -eq 0 ]; then
echo "rclone copy to echo s3 succeeded"
else
echo "rclone copy to echo s3 failed"
failed="true"
fi

# Delete old backed up objects, with age >= 2 days.
# NOTE(review): this keeps roughly 2 days of backups only — confirm the
# intended retention policy before relying on this in prod.
echo "deleting old backups from echo s3"
rclone delete --min-age=2d echo:dls-workflows-prod/${PREFIX}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is fine for this PR but we should decide our strategy for how many and how long we want to keep backups for.

rclone_delete_echo_s3_exit_code=$?

if [ $rclone_delete_echo_s3_exit_code -eq 0 ]; then
echo "rclone delete old objects in echo s3 succeeded"
else
echo "rclone delete old objects in echo s3 failed"
failed="true"
fi

# If any rclone command failed, then return non-zero.
if [ "$failed" = "true" ]; then
exit 1
fi

rclone-download.sh: |
echo "Starting script"
# Newest snapshot by name (timestamped filenames sort chronologically).
LAST=$(rclone lsf --files-only --include "etcd-snapshot*.db" echo:dls-workflows-prod/${PREFIX} | sort | tail -1)
# Never wipe the etcd data dir unless a snapshot was actually fetched:
# previously rm -rf ran unconditionally, destroying data even when the
# bucket was empty or the download failed.
if [ -z "$LAST" ]; then
echo "No snapshot found in bucket - aborting restore"
exit 1
fi
if ! rclone copyto -P echo:dls-workflows-prod/${PREFIX}/${LAST} /snapshot/snapshot.db; then
echo "Snapshot download failed - aborting restore"
exit 1
fi
rm -rf /var/lib/etcd
mkdir -p /var/lib/etcd
73 changes: 73 additions & 0 deletions charts/workflows-cluster/templates/etcd-restore-cronjob.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
{{/*
One suspended restore CronJob per etcd member. These never run on their own;
Jobs are created from them manually by scripts/restore-etcd.sh.
*/}}
{{- $etcdReplicas := int (default 1 .Values.vcluster.controlPlane.backingStore.etcd.deploy.statefulSet.highAvailability.replicas) }}
{{- range $i := until $etcdReplicas }}
{{- $volume := printf "data-workflows-cluster-etcd-%d" $i }}
---
apiVersion: batch/v1
kind: CronJob
metadata:
name: "restore-etcd-{{ $i }}"
spec:
schedule: "@yearly"
suspend: true # Never runs automatically
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason you went with a CronJob for this? Naively I would have thought that a Job triggered by restore-etcd.sh would be the way to go.

I'm slightly nervous about this... we don't want the production database accidentally restoring to a backup at some random point in the year!

If you stick with this solution, please be absolutely sure that this is how this works.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initially when creating the restore job, I wanted to avoid using any local files and have everything required present on the cluster - hence the CronJob. Once that proved difficult, it was simpler to leave it as an unscheduled cron than to switch. I guess since I'm using a locally stored script now, it's not as big a deal to require the job file too, so I'll probably switch this over as well

Copy link
Collaborator

@davehadley davehadley Mar 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My earlier comment was perhaps based on my ignorance. I haven't done much with CronJob's yet. Apparently it is intended to be able to create Jobs from suspended CronJobs (eg https://kubernetes.io/docs/reference/kubectl/generated/kubectl_create/kubectl_create_job/). Your solution may be the "idiomatic" kubernetes way.

jobTemplate:
spec:
template:
spec:
restartPolicy: Never
volumes:
# Rclone helper scripts, mounted executable so /bin/sh -c can run them.
- name: scripts
configMap:
name: etcd-rclone-scripts
defaultMode: 0755
# Scratch space holding the downloaded snapshot between containers.
- name: snapshot
emptyDir: {}
# This member's etcd data PVC — the restore target.
- name: {{ $volume }}
persistentVolumeClaim:
claimName: {{ $volume }}
- name: rclone-conf
secret:
secretName: etcd-rclone-config
initContainers:
# Step 1: fetch the latest snapshot from S3 and clear the data dir.
- name: rclone
image: docker.io/rclone/rclone
command: [/bin/sh, "-c", "/scripts/rclone-download.sh"]
env:
- name: RCLONE_CONFIG
value: /etc/rclone.conf
- name: PREFIX
value: {{ $.Values.backup.bucket.prefix | quote }}
volumeMounts:
- name: scripts
mountPath: /scripts
- name: snapshot
mountPath: /snapshot
- name: rclone-conf
mountPath: /etc/rclone.conf
subPath: rclone.conf
- name: {{ $volume }}
mountPath: /var/lib/etcd
resources:
requests:
ephemeral-storage: "4Gi"
limits:
ephemeral-storage: "8Gi"
containers:
# Step 2: rebuild this member's data directory from the snapshot,
# re-declaring the full peer list via the etcdInitialCluster helper.
- name: restore-etcd
image: registry.k8s.io/etcd:3.5.21-0
command: ["etcdctl"]
args:
- snapshot
- restore
- /snapshot/snapshot.db
- --data-dir=/var/lib/etcd
- "--name=workflows-cluster-etcd-{{ $i }}"
- "--initial-cluster={{ include "workflows.etcdInitialCluster" $etcdReplicas }}"
- "--initial-advertise-peer-urls=https://workflows-cluster-etcd-{{ $i }}.workflows-cluster-etcd-headless.workflows:2380"
- --initial-cluster-token=workflows-cluster
- --skip-hash-check=false
volumeMounts:
- name: {{ $volume }}
mountPath: /var/lib/etcd
- name: snapshot
mountPath: /snapshot
{{- end }}
5 changes: 5 additions & 0 deletions charts/workflows-cluster/values.yaml
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we have staging being backed up and not prod?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably best to roll it out on staging first and just make sure all is well - I just needed to add something to the Values.yaml for prod

Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@ secrets:
enabled: true
cluster: argus

# Etcd backup settings for this environment. Intentionally disabled on prod
# until the pipeline has been validated on staging (see PR discussion).
backup:
enabled: false
bucket:
# Subdirectory within the backup bucket used by this environment.
prefix: prod

vcluster:
telemetry:
enabled: false
Expand Down
Loading