Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 48 additions & 1 deletion ansible/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,17 @@ ansible/
playbooks/
rke2-full-install.yaml # Entry point: full cluster bootstrap + ArgoCD
rke2-join-install.yaml # Entry point: join nodes + ArgoCD enroll
rke2-scale.yaml # Entry point: scale cluster nodes up
rke2-decommission.yaml # Entry point: decommission cluster nodes
roles/
rke2-common/ # Node prep — runs on all hosts
rke2-master/ # Control plane init and join
rke2-worker/ # Worker join
bootstrap-artifacts/ # Fetch kubeconfig + token from first master
argocd/ # ArgoCD install, enroll, cutover
node-drain/ # Cordon and drain a node
node-uncordon/ # Uncordon a node
node-remove/ # Full node removal (drain + etcd + uninstall + delete)
group_vars/
all.yaml # Shared variables (vault, network, versions)
collections/
Expand Down Expand Up @@ -82,6 +87,24 @@ Runs on localhost. Manages the full ArgoCD lifecycle for a cluster.

---

### `node-drain`

Cordons and drains a node via the Kubernetes API (runs on localhost). Reusable for any maintenance that requires evicting workloads before touching a node.

---

### `node-uncordon`

Uncordons a node via the Kubernetes API (runs on localhost). Pair with `node-drain` for rolling operations such as host maintenance.

---

### `node-remove`

Full node removal. Composes `node-drain`, then removes the etcd member (masters only, gated on actual etcd membership), stops and uninstalls RKE2, and deletes the node object from the cluster.

---

## Entry-point playbooks

### `rke2-full-install.yaml`
Expand Down Expand Up @@ -114,6 +137,29 @@ The `master_init` play can be skipped with `--skip-tags master_init` when adding

---

### `rke2-scale.yaml`

Adds new nodes to a running cluster. Detects which hosts already have RKE2 running and only runs on the new ones. Fails if a `kube-masters` host is found running `rke2-agent` or vice versa.

```
all (new only) → rke2-common
kube-masters:&rke2_existing → bootstrap-artifacts (run_once)
kube-masters:&rke2_new → rke2-master (join), serial: 1
kube-workers:&rke2_new → rke2-worker
```

### `rke2-decommission.yaml`

Removes nodes declared in `decommission-kube-masters` and `decommission-kube-workers` inventory groups. Asserts etcd quorum is maintained before proceeding — at most `floor((n-1)/2)` masters can be removed in a single run.

```
localhost → quorum preflight assert
decommission-kube-masters → node-remove, serial: 1
decommission-kube-workers → node-remove
```

---

## Tags

| Tag | Plays |
Expand All @@ -124,12 +170,13 @@ The `master_init` play can be skipped with `--skip-tags master_init` when adding
| `master_init` | `rke2-master (init)` |
| `bootstrap_artifacts` | `bootstrap-artifacts` |
| `master_join` | `rke2-master (join)` |
| `workers` | `rke2-worker` |
| `workers` | `rke2-worker`, `rke2-decommission` / `rke2-scale` worker plays |
| `argocd` | `argocd (install)` |
| `argocd_cutover` | `argocd (cutover)` |
| `argocd_enroll` | `argocd (enroll)` |
| `bootstrap` | All RKE2 + ArgoCD install plays |
| `control_plane` | All master plays |
| `masters` | `rke2-decommission` master plays |

---

Expand Down
45 changes: 45 additions & 0 deletions ansible/playbooks/rke2-decommission.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
---
# Preflight: validate the decommission request on the control host before any
# node is touched.
- name: Preflight decommission
  hosts: localhost
  connection: local
  gather_facts: false
  any_errors_fatal: true
  # `always` keeps these safety checks active when the run is narrowed with
  # --tags masters / --tags workers; an untagged play would be skipped and the
  # quorum assert bypassed.
  tags:
    - always
  tasks:
    # At least one of the two decommission inventory groups must list a host.
    - name: Assert decommission groups are defined and non-empty
      ansible.builtin.assert:
        that: >-
          (groups.get('decommission-kube-masters', []) | length) > 0 or
          (groups.get('decommission-kube-workers', []) | length) > 0
        fail_msg: >-
          Neither decommission-kube-masters nor decommission-kube-workers contain any hosts.

    # With n hosts in kube-masters, at most floor((n - 1) / 2) of them may be
    # removed in one run. Only evaluated when masters are being decommissioned.
    - name: Assert master quorum is maintained after decommission
      ansible.builtin.assert:
        that: >-
          (groups.get('decommission-kube-masters', []) | length) <=
          ((groups['kube-masters'] | length - 1) // 2)
        fail_msg: >-
          Removing {{ groups.get('decommission-kube-masters', []) | length }} master(s) from
          {{ groups['kube-masters'] | length }} would break etcd quorum. At most
          {{ (groups['kube-masters'] | length - 1) // 2 }} master(s) can be removed at once.
      when: groups.get('decommission-kube-masters', []) | length > 0

# Masters are processed one host per batch (serial: 1); a failure on any host
# stops the play for all hosts.
- name: Decommission master nodes
  hosts: decommission-kube-masters
  serial: 1
  any_errors_fatal: true
  gather_facts: false
  become: true
  tags: [masters]
  roles:
    - role: node-remove

# Workers have no serial limit, so they are removed with the default batching.
- name: Decommission worker nodes
  hosts: decommission-kube-workers
  any_errors_fatal: true
  gather_facts: false
  become: true
  tags: [workers]
  roles:
    - role: node-remove
100 changes: 100 additions & 0 deletions ansible/playbooks/rke2-scale.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
---
# Classify every inventory host as rke2_existing or rke2_new based on the
# systemd state of the RKE2 unit that matches its inventory role.
- name: Detect RKE2 node state
  hosts: all
  become: true
  gather_facts: false
  any_errors_fatal: true
  # `always` is required: the later plays target the dynamic groups created
  # below. Without it, a run limited by --tags (e.g. --tags workers) skips this
  # play, the groups never exist, and the join plays match no hosts.
  tags:
    - always
  tasks:
    # Query-only calls (no `state`): the module returns the unit status.
    # failed_when: false keeps a query error from aborting the play; the
    # conditions below then fall back to default('').
    - name: Check rke2-server service
      ansible.builtin.systemd:
        name: rke2-server
      register: _rke2_server_svc
      failed_when: false

    - name: Check rke2-agent service
      ansible.builtin.systemd:
        name: rke2-agent
      register: _rke2_agent_svc
      failed_when: false

    # Refuse to continue when the running unit contradicts the inventory role.
    - name: Fail on role mismatch — master running rke2-agent
      ansible.builtin.fail:
        msg: >-
          Host {{ inventory_hostname }} is in kube-masters but rke2-agent is active.
          This host may belong to a different cluster or was incorrectly provisioned.
      when:
        - inventory_hostname in groups['kube-masters']
        - _rke2_agent_svc.status.ActiveState | default('') == 'active'

    - name: Fail on role mismatch — worker running rke2-server
      ansible.builtin.fail:
        msg: >-
          Host {{ inventory_hostname }} is in kube-workers but rke2-server is active.
          This host may belong to a different cluster or was incorrectly provisioned.
      when:
        - inventory_hostname in groups['kube-workers']
        - _rke2_server_svc.status.ActiveState | default('') == 'active'

    # A host is "existing" only when the unit matching its role is active;
    # every other host lands in rke2_new.
    - name: Group hosts by RKE2 state
      ansible.builtin.group_by:
        key: >-
          rke2_{{ 'existing' if (
          (inventory_hostname in groups['kube-masters'] and _rke2_server_svc.status.ActiveState | default('') == 'active') or
          (inventory_hostname in groups['kube-workers'] and _rke2_agent_svc.status.ActiveState | default('') == 'active')
          ) else 'new' }}

# Scaling needs a running master to pull artifacts from: stop early when no
# kube-masters host was classified into rke2_existing.
- name: Assert at least one existing master is available for artifact retrieval
  hosts: localhost
  connection: local
  any_errors_fatal: true
  gather_facts: false
  tasks:
    - name: Assert existing master presence
      ansible.builtin.assert:
        fail_msg: >-
          No existing master nodes found. All kube-masters are new — use
          rke2-full-install.yaml or rke2-join-install.yaml for initial cluster bootstrap.
        that:
          - (groups['kube-masters'] | intersect(groups['rke2_existing'] | default([]))) | length > 0

# Apply the rke2-common role to every host placed in the rke2_new group.
- name: Prepare new nodes
  hosts: rke2_new
  any_errors_fatal: true
  gather_facts: false
  become: true
  tags: [preflight, host_prep]
  roles:
    - role: rke2-common

# Targets masters that are already running RKE2; run_once limits the role to a
# single host of that set.
- name: Gather bootstrap artifacts from an existing master
  hosts: kube-masters:&rke2_existing
  run_once: true
  any_errors_fatal: true
  gather_facts: false
  become: true
  tags: [bootstrap_artifacts]
  roles:
    - role: bootstrap-artifacts

# New masters run the rke2-master role one host per batch (serial: 1).
- name: Join new master nodes
  hosts: kube-masters:&rke2_new
  serial: 1
  any_errors_fatal: true
  gather_facts: false
  become: true
  tags: [master_join, control_plane]
  roles:
    - role: rke2-master

# New workers run the rke2-worker role; no serial limit is set.
- name: Join new worker nodes
  hosts: kube-workers:&rke2_new
  any_errors_fatal: true
  gather_facts: false
  become: true
  tags: [workers]
  roles:
    - role: rke2-worker
11 changes: 11 additions & 0 deletions ansible/playbooks/roles/node-drain/tasks/main.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
# Drain the node named after the current inventory host. The module call is
# delegated to localhost and runs without privilege escalation.
- name: "Drain node {{ inventory_hostname }}"
  delegate_to: localhost
  become: false
  kubernetes.core.k8s_drain:
    kubeconfig: "{{ kubeconfig }}"
    name: "{{ inventory_hostname }}"
    state: drain
    delete_options:
      delete_emptydir_data: true
      ignore_daemonsets: true
      wait_timeout: 300
79 changes: 79 additions & 0 deletions ansible/playbooks/roles/node-remove/tasks/main.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
---
# Step 1: evict workloads by reusing the node-drain role.
- name: Drain node
  ansible.builtin.include_role:
    name: node-drain

# Step 2: collect only ansible_hostname; it is the single fact used below.
- name: Gather host facts
  ansible.builtin.setup:
    filter: ansible_hostname

# Step 3: ask the local etcd endpoint for its member list as JSON. This is a
# read-only query (changed_when: false) and a failure is tolerated
# (failed_when: false) so the result can be inspected by later tasks.
- name: Get etcd member list
  vars:
    _etcd_tls_dir: /var/lib/rancher/rke2/server/tls/etcd
  ansible.builtin.command:
    argv:
      - /var/lib/rancher/rke2/bin/etcdctl
      - --endpoints
      - https://127.0.0.1:2379
      - --cacert
      - "{{ _etcd_tls_dir }}/server-ca.crt"
      - --cert
      - "{{ _etcd_tls_dir }}/server-client.crt"
      - --key
      - "{{ _etcd_tls_dir }}/server-client.key"
      - member
      - list
      - -w
      - json
  failed_when: false
  changed_when: false
  register: _etcd_member_list

# Select this host's own etcd member record; the result is an empty list when
# the host is not a member. RKE2/K3s register embedded etcd members as
# "<node-name>-<8 hex chars>", so an exact comparison against the hostname
# never matches and the removal would be silently skipped. Accept the bare
# hostname or the suffixed form. Members without a `name` (added but not yet
# started) are filtered out first so the match never sees an undefined value.
# Skipped entirely when the member list query failed or returned nothing.
- name: Set etcd membership facts
  ansible.builtin.set_fact:
    _etcd_self: >-
      {{ (_etcd_member_list.stdout | from_json).members
      | selectattr('name', 'defined')
      | selectattr('name', 'match', '^' ~ (ansible_hostname | regex_escape) ~ '(-[0-9a-f]{8})?$')
      | list }}
  when: _etcd_member_list.rc == 0 and _etcd_member_list.stdout | length > 0

Comment thread
ModerNews marked this conversation as resolved.
# Runs only when this host was found in the etcd member list.
- name: Remove etcd member
  when: _etcd_self | default([]) | length > 0
  block:
    # `etcdctl member remove` is given the ID in hexadecimal, while the JSON
    # member list stores it as an integer, hence the '%x' conversion.
    - name: Set etcd member ID fact
      ansible.builtin.set_fact:
        _etcd_member_id: "{{ '%x' % (_etcd_self[0].ID | int) }}"

    - name: Remove etcd member
      vars:
        _etcd_tls_dir: /var/lib/rancher/rke2/server/tls/etcd
      ansible.builtin.command:
        argv:
          - /var/lib/rancher/rke2/bin/etcdctl
          - --endpoints
          - https://127.0.0.1:2379
          - --cacert
          - "{{ _etcd_tls_dir }}/server-ca.crt"
          - --cert
          - "{{ _etcd_tls_dir }}/server-client.crt"
          - --key
          - "{{ _etcd_tls_dir }}/server-client.key"
          - member
          - remove
          - "{{ _etcd_member_id }}"
      changed_when: true

# Stop and disable the unit that matches the host's inventory role:
# rke2-server for members of kube-masters, rke2-agent otherwise.
- name: Stop RKE2 service
  vars:
    _rke2_unit: "{{ 'rke2-server' if inventory_hostname in groups['kube-masters'] else 'rke2-agent' }}"
  ansible.builtin.service:
    name: "{{ _rke2_unit }}"
    enabled: false
    state: stopped

# RKE2 installs one uninstall script, rke2-uninstall.sh, for both server and
# agent nodes. There is no rke2-agent-uninstall.sh (that naming belongs to
# K3s), so the previous per-role path failed on every worker.
# NOTE(review): /usr/local/bin is the tarball-install location; RPM installs
# place the script in /usr/bin — confirm the install method used by rke2-common.
- name: Run RKE2 uninstall script
  ansible.builtin.command:
    cmd: /usr/local/bin/rke2-uninstall.sh
  changed_when: true

# Final step: remove the Node object named after this inventory host. The
# module call is delegated to localhost and runs without privilege escalation.
- name: Delete node from cluster
  delegate_to: localhost
  become: false
  kubernetes.core.k8s:
    kubeconfig: "{{ kubeconfig }}"
    state: absent
    kind: Node
    name: "{{ inventory_hostname }}"
7 changes: 7 additions & 0 deletions ansible/playbooks/roles/node-uncordon/tasks/main.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# Uncordon the node named after the current inventory host. The module call is
# delegated to localhost and runs without privilege escalation.
- name: "Uncordon node {{ inventory_hostname }}"
  delegate_to: localhost
  become: false
  kubernetes.core.k8s_drain:
    kubeconfig: "{{ kubeconfig }}"
    name: "{{ inventory_hostname }}"
    state: uncordon