Skip to content

Commit ceddb28

Browse files
feat(charts): implement etcd backup and restore on pollux
1 parent deedca0 commit ceddb28

10 files changed

Lines changed: 307 additions & 1 deletion

File tree

charts/workflows-cluster/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name: workflows-cluster
33
description: A virtual cluster for Data Analysis workflows
44
type: application
55

6-
version: 0.9.30
6+
version: 0.10.0
77
dependencies:
88
- name: common
99
version: 2.23.0
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{{- if eq .Values.cluster "argus" }}
2+
apiVersion: bitnami.com/v1alpha1
3+
kind: SealedSecret
4+
metadata:
5+
name: etcd-rclone-config
6+
namespace: workflows
7+
spec:
8+
encryptedData:
9+
rclone.conf: AgAcczoUI/F2QEk+4b31YXP4SvntZUrd/6RLvRReWGpbyvHv1hNpulJJB741NPjyk6Sykqu+JcJ4wdikIB/BZbOYxF04OmP5WvrwAqqf5CFgxNGckhOlG967c8wDVdSgX1fdrI3Uobk7Z2KYy6oxE0NBzhQKbnffn1BjxG5ybPuO9/4K5F+VoF9PqeCS35fHdGxjZNsD5Feaj58SZXQkI7kGZvSXT7iMd7Io/XdLpc39rtFic1e/Wtavrm2fqg+07OZh3nTk+Yew1haXUMEFG7AC8c62PcyYUXgOcFjDfyE5F3BYK3A7n1s5rzrItQQcH2wFHmiSHq/b2UlU94uWNqgOU5sCISjKJRzjiNgiaCzbrppdIVe3RAFZSMg4oPYxAfBrhgpkBQGcCHOHU+IOgRr+wrPtMWHOgXxGrJ8ZkknT2xrbk4Ui0IC6zvm9AJb5Jn0vQ1Spyy/B/dwIdIr0T4cgTrZ9MtLHaLr/Bc4UAU7TNJllL1OUGuwKHDVRHA3avj7t5s5wsm/V8j0aDwotQBZrKm7rdEmD+QoHd9GEWkcBwttVcq8DBWgD+mkVZVjVwyC6l9Ve0ZOnBi7IDJPDIr+gBYrLai3CTJTW5S4OKANpnN/OimAmrR3ZnmDrY+1a2vrXbQEIrwky+8QBHYaqBAhfV2u0aSbVC6FXYZwPD1RR9q8Ip6iRrFaBUwgJwPp0TOytnMkxyBgWq9KLSdOPmmj+wqCZR5vOBaQk9aItvrySX0ZT1Cs5ucOWdyhSPE/wRqFCJng1Vzg6wKohkGhPtA55BFQqxqBpdmqzEWnRyKhn4rt3xatVIrgV0G6v6vdMaZawi5TqsfI69JnKecybtbcVw4TEYLvsbA3uHZawpxN4FPjSpOZZZ31QSEl3LC5jSYAidM6gO5uB+/BCprBg90KqPeaa4vTJDUy4EtLT4BPGM9C7N2DReb2kAeglCAoqbPHyvwR+smPaSvRZR/fc93sYSyLdmDhHUHhIhqsjMU8JWFtPYhWk7zI2KUv7SKv8YC23FSzRQ17/pnUR97UbHmY9W/fhHl2ImCT79nashUmldxTDNq0R9nLewpzp2lbXBsVd5hWK+rj6F+sg7uhGN2aZk7Ly8QpkTf/+F3GPPawPLTQ2UR53hs1g7rn+oLSBaNzl2Rv7wGlv3AkP8eyChn9KUnKc1TRaROsW4hdbYjOuH9H3HP1s9ZHn+SJxwcgORktZAoZJliY15NQNqwd3hZBYal03ecfEVCGW4OXd00lGSQL43JEPRXXmGwKWsI0QVQvC6r6joaDb6az4gecPsfHPLgJt+NqRZdWEpbOxGvvrq9ZYVQOU3SK4AtePAhbHJvXw9yOysmliYkwQEWATw0KMutXb82MYqnxJd9NbicrkZsTo2KsTaQ1r//US7kvYAX9H
10+
template:
11+
metadata:
12+
name: etcd-rclone-config
13+
namespace: workflows
14+
type: Opaque
15+
{{- else if eq .Values.cluster "pollux" }}
16+
apiVersion: bitnami.com/v1alpha1
17+
kind: SealedSecret
18+
metadata:
19+
name: etcd-rclone-config
20+
namespace: workflows
21+
spec:
22+
encryptedData:
23+
rclone.conf: AgB1fwXmIpQq0DJRlqUJIPHandXkjcaaQafWnLeq0Mc/lA/f/xV/s/xryJMjpLmFuC7RFtpbNQatnK+fGBIbBn/A9JzXH5Vlhj4HMIzo35oDoccwTouN6aCG9LTKC7GWOa02S5gTegaoWl42tECy3SNqUv6Hcgxf7srrd0pMEdXx/CxlxAVUtHwzPfsNGBqWPN1L8m1MzImwaYC9oMcC3l9cG1XdMCeEJi4mOr/x7j1bOgwhqHhOiKWv0k91OVCuduG7U4bIt8dJR/REDYGmEdQTeOmKZwRW16wmMFf7sz2DHXEnQVzB57WZqb0qa836y+xJb3iHZN1Ky5osoQ+fJL9J20+TFF2SuCvVfAZX8nY0bwCamHCJxL8M0l9qHeGX9lXPN5d9ydJEc4syNoWUeWGUmUJoV+XCeV++CycT2SxT4sePbnxCRUYGpHtgSD90InRSephLRoGP+0PtagQiBL2m1jiHLAcpUeIQz6VpeimTOcaiQ3cDK4NsuwgTmJyfVz2lRxs+DHt3MIhRRjd3ivGh23Kfl5OO0jD1dP7e8DrcHxFNl1q+lILNQIAmYlFIiyq0xw9gkddQ/Swahu8pkmbRrVdbycv9havZiUxYLFArPiFwEHVm6LlNKOsv+H5x2zkVAUnClAldaolOdpFBoIuxx13sG1bFR50lsvDeqd0Wc08Ewp8vtC3DMr6N6MpFKZgxclRjaUuY+KWlJtC2frMydxfL/VtA6NiSp9hYxnwAxpSinMiybvr7BhsvvC1K/2hshgBnA2DoozqTfymNH8l8Kj2Fl0kunlvKLJAM8wYr1/knLluonUzWTPb38iDNg3udW6p0sJ7Zwe0n9ZwsQt2ZD0qGt7AppUN2ZYOHIE/yBzkT89QRaOKQ0mc6R0lb0d6cdcCoghQIGDjgelfBlyZRzvNSG8+bgQg4QLTWMS2O3XQYuA995Q3UJrKPUb2vFT4wW3aNwjLCi8cwQMHsHoyQERT37xCfj746R3SDtglQ08g+cIJ+imUaP7jiNDd4elzKB/w41fsdNGrVfPJB+9dnOdq5YS4y0k5eTpKiyVO6VPe3cxuzXqnw/4Ih3xEoxEh5idUJC8f50zYMbAHDDjOJxEKw101bbUAiSlM67vKPcS7FVDczCIhmJhulQROBLD3FbAni0TWxK1h8i6y8Qu16zV/p5BHBMPr2Q8kw46GpveDtZUsNgKxI2wPr0Oj/PUptPP1KdkT7Z6YordTKp4U69YqI6zjvF2pa8ibBHfVTpyrzDv32MtzECfG9xT9/8JrdxMPBt37jHEWsy/jrzeAOKvF84nFeKLAPMx5Gw6f3jw1Dvw4mk4HI9KnwT2twVvjptaZCu0fot3ynZbD0uCRzXLOaZsKQ9O8xAllmgivmjWK6QlNZAQlgs2lZZoMjTesa
24+
template:
25+
metadata:
26+
name: etcd-rclone-config
27+
namespace: workflows
28+
type: Opaque
{{- /* Clusters other than argus and pollux have no sealed rclone config, so nothing is rendered for them. */}}
{{- end }}

charts/workflows-cluster/dev-values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@ secrets:
22
enabled: false
33
cluster: dev
44

5+
backup:
6+
enabled: false
7+
bucket:
8+
prefix: dev
9+
510
vcluster:
611
controlPlane:
712
distro:
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
#!/bin/bash
#
# Full restore of the workflows vcluster etcd from the most recent backup.
#
# Flow:
#   1. Ask the operator to confirm the target cluster.
#   2. Scale the etcd StatefulSet and the vcluster Deployment to zero and make
#      sure their pods are really gone.
#   3. Create one Job per etcd member from the suspended restore-etcd-<n>
#      CronJobs and wait for each to complete.
#   4. Scale etcd, then the vcluster, back to their original replica counts.
#
# Requires KUBECONFIG to be set and kubectl to be on PATH.

# Stop at the first failure: carrying on after a failed scale-down would run
# the restore Jobs against volumes that etcd is still using, and carrying on
# after a failed Job would start etcd on a wiped or partial data directory.
# -E makes the ERR trap below fire inside functions as well.
set -Eeuo pipefail

NAMESPACE=workflows
ETCD_STS=workflows-cluster-etcd
VCLUSTER_DEPLOY=workflows-cluster
TIMEOUT=300s

# Block until no pod matches the given label selector; fail if any remain.
# The exit status of `kubectl wait` is ignored on purpose (it can report
# "no matching resources" when the pods are already gone); the follow-up
# listing is what decides success.
wait_for_pods_gone() {
  local selector=$1
  kubectl wait --for=delete pod -l "$selector" -n "$NAMESPACE" --timeout="$TIMEOUT" || true
  if [ -n "$(kubectl get pods -l "$selector" -n "$NAMESPACE" -o name)" ]
  then
    echo "Pods matching '$selector' are still present after $TIMEOUT." >&2
    return 1
  fi
}

if [ -z "${KUBECONFIG:-}" ]
then
  echo "Kube config not found - have you loaded a cluster?"
  exit 1
fi

# Ask kubectl for the cluster of the *current context* rather than grepping the
# file, which would list every cluster entry and breaks on multi-file KUBECONFIG.
CLUSTER=$(kubectl config view --minify -o jsonpath='{.clusters[0].name}' 2>/dev/null || true)
if [ -z "$CLUSTER" ]
then
  echo "Could not determine the current cluster from the kube config." >&2
  exit 1
fi

# dirname copes with the script being invoked without a path component.
cd "$(dirname "$0")/.."

read -p "WARNING: You are about to attempt a full restore of the VCluster etcd on $CLUSTER. Proceed? (y/n)" -r
if [[ ! $REPLY =~ ^[Yy]$ ]]
then
  echo "Restore cancelled."
  exit 0
fi

# Remember the current sizes so the workloads can be scaled back afterwards.
ETCD_REPLICAS=$(kubectl get sts "$ETCD_STS" -n "$NAMESPACE" -o=jsonpath='{.spec.replicas}')
DEPLOY_REPLICAS=$(kubectl get deploy "$VCLUSTER_DEPLOY" -n "$NAMESPACE" -o=jsonpath='{.spec.replicas}')

# A zero or empty etcd count (e.g. left over from an aborted run) would make
# the loops below do nothing and then "restore" the cluster to zero replicas.
if [[ ! $ETCD_REPLICAS =~ ^[1-9][0-9]*$ || ! $DEPLOY_REPLICAS =~ ^[0-9]+$ ]]
then
  echo "Unexpected replica counts (etcd='$ETCD_REPLICAS', vcluster='$DEPLOY_REPLICAS') - is the cluster already scaled down?" >&2
  exit 1
fi

# From here on a failure leaves the cluster scaled down; tell the operator how
# to get back to the original sizes.
trap 'echo "Restore did not complete. etcd/vcluster may still be scaled down (original replicas: etcd=$ETCD_REPLICAS, vcluster=$DEPLOY_REPLICAS). Check the cluster before retrying." >&2' ERR

echo "Scaling down..."
kubectl scale sts "$ETCD_STS" -n "$NAMESPACE" --replicas=0
kubectl scale deploy "$VCLUSTER_DEPLOY" -n "$NAMESPACE" --replicas=0

wait_for_pods_gone app=vcluster-etcd
wait_for_pods_gone app=vcluster

echo "Scale down complete. Creating restoration Jobs."

for ((i=0;i<ETCD_REPLICAS;i++)); do
  # Clear any Job left behind by an earlier attempt; create would fail on a name clash.
  kubectl -n "$NAMESPACE" delete job "restore-etcd-$i" --ignore-not-found
  kubectl -n "$NAMESPACE" create job --from="cronjob/restore-etcd-$i" "restore-etcd-$i"
done

echo "Waiting for Jobs to complete..."

for ((i=0;i<ETCD_REPLICAS;i++)); do
  kubectl -n "$NAMESPACE" wait --for=condition=complete "job/restore-etcd-$i" --timeout="$TIMEOUT"
  kubectl -n "$NAMESPACE" delete "job/restore-etcd-$i"
done

echo "Jobs complete. Scaling back up..."

# rollout status waits for the controller to create the pods and for them to
# become ready; `kubectl wait pod -l ...` can fail immediately because no pod
# matches the selector yet right after scaling.
kubectl -n "$NAMESPACE" scale sts "$ETCD_STS" --replicas="$ETCD_REPLICAS"
kubectl -n "$NAMESPACE" rollout status "statefulset/$ETCD_STS" --timeout="$TIMEOUT"

kubectl -n "$NAMESPACE" scale deploy "$VCLUSTER_DEPLOY" --replicas="$DEPLOY_REPLICAS"
kubectl -n "$NAMESPACE" rollout status "deployment/$VCLUSTER_DEPLOY" --timeout="$TIMEOUT"

echo "Restore complete."

charts/workflows-cluster/staging-values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
secrets:
22
cluster: pollux
33

4+
backup:
5+
enabled: true
6+
bucket:
7+
prefix: staging
8+
49
vcluster:
510
controlPlane:
611
backingStore:
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
{{/*
Render the comma-separated etcd --initial-cluster value for the replica count
passed in as the template context: one "name=peer-url" entry per member index,
counting up from 0.
*/}}
{{- define "workflows.etcdInitialCluster" -}}
{{- $members := list -}}
{{- range $idx := until . -}}
{{- $members = append $members (printf "workflows-cluster-etcd-%d=https://workflows-cluster-etcd-%d.workflows-cluster-etcd-headless.workflows:2380" $idx $idx) -}}
{{- end -}}
{{- join "," $members -}}
{{- end }}
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
# CronJob that snapshots etcd member 0 and uploads the snapshot with rclone.
# The init container writes the snapshot into a shared emptyDir; the main
# container then runs the upload script from the etcd-rclone-scripts ConfigMap.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: backup-etcd
spec:
  # With backups disabled the CronJob is still rendered, but suspended.
  {{- if not .Values.backup.enabled }}
  schedule: "@yearly"
  suspend: true
  {{- else }}
  schedule: "@daily"
  {{- end }}
  jobTemplate:
    spec:
      backoffLimit: 2
      template:
        spec:
          restartPolicy: Never
          volumes:
            # Scratch space shared between the snapshot and upload containers.
            - name: backup
              emptyDir: {}
            - name: scripts
              configMap:
                name: etcd-rclone-scripts
            - name: rclone-conf
              secret:
                secretName: etcd-rclone-config
            - name: certs
              secret:
                secretName: workflows-cluster-certs
          initContainers:
            # Step 1: take the snapshot over TLS using the certs secret.
            - name: backup
              image: registry.k8s.io/etcd:3.5.21-0
              command:
                - etcdctl
              args:
                - --endpoints=https://workflows-cluster-etcd-0.workflows-cluster-etcd-headless.workflows.svc:2379
                - --debug
                - snapshot
                - save
                - /backup/etcd-snapshot.db
              env:
                - name: ETCDCTL_CACERT
                  value: /certs/etcd-ca.crt
                - name: ETCDCTL_CERT
                  value: /certs/etcd-server.crt
                - name: ETCDCTL_KEY
                  value: /certs/etcd-server.key
              resources:
                requests:
                  ephemeral-storage: "4Gi"
                limits:
                  ephemeral-storage: "8Gi"
              volumeMounts:
                - name: backup
                  mountPath: /backup
                - name: certs
                  mountPath: /certs
          containers:
            # Step 2: upload the snapshot; PREFIX selects the per-environment bucket path.
            - name: rclone
              image: docker.io/rclone/rclone
              command:
                - /bin/sh
              args:
                - /scripts/rclone-upload.sh
              env:
                - name: RCLONE_CONFIG
                  value: /etc/rclone.conf
                - name: PREFIX
                  value: {{ $.Values.backup.bucket.prefix | quote }}
              resources:
                requests:
                  ephemeral-storage: "4Gi"
                limits:
                  ephemeral-storage: "8Gi"
              volumeMounts:
                - name: backup
                  mountPath: /backup
                - name: scripts
                  mountPath: /scripts
                # Mount only the rclone.conf key of the secret as a single file.
                - name: rclone-conf
                  mountPath: /etc/rclone.conf
                  subPath: rclone.conf
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
# Shell scripts used by the etcd backup and restore CronJobs. Both expect
# RCLONE_CONFIG to point at a config defining the "echo" remote and PREFIX to
# name the per-environment folder inside the dls-workflows-prod bucket.
apiVersion: v1
kind: ConfigMap
metadata:
  name: etcd-rclone-scripts
data:
  # Backup side: upload the freshly taken snapshot, then prune old ones.
  rclone-upload.sh: |
    #!/bin/sh
    # An empty PREFIX would make the copy and, worse, the delete below operate
    # on the bucket root, so refuse to run without it.
    : "${PREFIX:?PREFIX must be set}"

    SNAP=/backup/etcd-snapshot.db
    REMOTE="echo:dls-workflows-prod/${PREFIX}"

    # Check for today's backup
    if [ ! -s "$SNAP" ]; then
      echo "Backup does not exist"
      exit 1
    fi

    # Timestamp the snapshot so every upload gets a unique, sortable name.
    # Uploading the un-timestamped file instead would create an object that
    # always sorts last and would be picked by every subsequent restore.
    STAMPED="/backup/etcd-snapshot-$(date +%Y-%m-%d_%H-%M-%S_%Z).db"
    if ! mv "$SNAP" "$STAMPED"; then
      echo "failed to timestamp snapshot"
      exit 1
    fi

    echo "backing up to echo S3"
    if rclone copy /backup/ "$REMOTE"; then
      echo "rclone copy to echo s3 succeeded"
    else
      echo "rclone copy to echo s3 failed"
      # Never prune after a failed upload: with uploads failing for two days
      # the delete below would otherwise remove the last good backups.
      exit 1
    fi

    # Delete old backed up objects, with age >= 2 days.
    echo "deleting old backups from echo s3"
    if rclone delete --min-age=2d "$REMOTE"; then
      echo "rclone delete old objects in echo s3 succeeded"
    else
      echo "rclone delete old objects in echo s3 failed"
      exit 1
    fi

  # Restore side: fetch the newest snapshot and empty the etcd data directory
  # so that the following etcdctl snapshot restore can write into it.
  rclone-download.sh: |
    #!/bin/sh
    # Abort on any failure so the data directory is only wiped once a usable
    # snapshot has actually been downloaded.
    set -eu
    : "${PREFIX:?PREFIX must be set}"

    echo "Starting script"
    REMOTE="echo:dls-workflows-prod/${PREFIX}"

    # Snapshot names embed a sortable timestamp, so the last one is the newest.
    LAST=$(rclone lsf --files-only --include "etcd-snapshot*.db" "$REMOTE" | sort | tail -1)
    if [ -z "$LAST" ]; then
      echo "No etcd snapshot found under $REMOTE"
      exit 1
    fi

    echo "Restoring from $LAST"
    rclone copyto -P "$REMOTE/$LAST" /snapshot/snapshot.db
    if [ ! -s /snapshot/snapshot.db ]; then
      echo "Downloaded snapshot is missing or empty"
      exit 1
    fi

    # /var/lib/etcd is a volume mount point and cannot itself be removed, so
    # clear its contents instead of deleting the directory.
    mkdir -p /var/lib/etcd
    find /var/lib/etcd -mindepth 1 -delete
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
{{- /*
One suspended CronJob per etcd member. They never fire on schedule; they exist
so that a restore Job can be created from them on demand. Each Job downloads
the latest snapshot (init container) and then runs etcdctl snapshot restore
into that member's data volume.
*/}}
{{- $replicaCount := int (default 1 .Values.vcluster.controlPlane.backingStore.etcd.deploy.statefulSet.highAvailability.replicas) }}
{{- range $member := until $replicaCount }}
{{- $dataVolume := printf "data-workflows-cluster-etcd-%d" $member }}
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: "restore-etcd-{{ $member }}"
spec:
  # Suspended: never runs automatically, only via Jobs created from it.
  suspend: true
  schedule: "@yearly"
  jobTemplate:
    spec:
      template:
        spec:
          initContainers:
            # Step 1: fetch the newest snapshot and clear the data volume.
            - name: rclone
              image: docker.io/rclone/rclone
              command:
                - /bin/sh
                - "-c"
                - "/scripts/rclone-download.sh"
              env:
                - name: RCLONE_CONFIG
                  value: /etc/rclone.conf
                - name: PREFIX
                  value: {{ $.Values.backup.bucket.prefix | quote }}
              resources:
                requests:
                  ephemeral-storage: "4Gi"
                limits:
                  ephemeral-storage: "8Gi"
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
                - name: snapshot
                  mountPath: /snapshot
                # Mount only the rclone.conf key of the secret as a single file.
                - name: rclone-conf
                  mountPath: /etc/rclone.conf
                  subPath: rclone.conf
                - name: {{ $dataVolume }}
                  mountPath: /var/lib/etcd
          containers:
            # Step 2: rebuild this member's data directory from the snapshot.
            - name: restore-etcd
              image: registry.k8s.io/etcd:3.5.21-0
              command:
                - etcdctl
              args:
                - snapshot
                - restore
                - /snapshot/snapshot.db
                - --data-dir=/var/lib/etcd
                - "--name=workflows-cluster-etcd-{{ $member }}"
                - "--initial-cluster={{ include "workflows.etcdInitialCluster" $replicaCount }}"
                - "--initial-advertise-peer-urls=https://workflows-cluster-etcd-{{ $member }}.workflows-cluster-etcd-headless.workflows:2380"
                - --initial-cluster-token=workflows-cluster
                - --skip-hash-check=false
              volumeMounts:
                - name: {{ $dataVolume }}
                  mountPath: /var/lib/etcd
                - name: snapshot
                  mountPath: /snapshot
          restartPolicy: Never
          volumes:
            # The download script is executed directly, so it is mounted executable.
            - name: scripts
              configMap:
                name: etcd-rclone-scripts
                defaultMode: 0755
            # Hands the downloaded snapshot from the init container to etcdctl.
            - name: snapshot
              emptyDir: {}
            # Claim named after this member's index.
            - name: {{ $dataVolume }}
              persistentVolumeClaim:
                claimName: {{ $dataVolume }}
            - name: rclone-conf
              secret:
                secretName: etcd-rclone-config
{{- end }}

charts/workflows-cluster/values.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@ secrets:
22
enabled: true
33
cluster: argus
44

5+
backup:
6+
enabled: false
7+
bucket:
8+
prefix: prod
9+
510
vcluster:
611
telemetry:
712
enabled: false

0 commit comments

Comments
 (0)