Skip to content

Commit 32aa4af

Browse files
authored
Merge pull request #64 from wiseflat/dev/mgarcia/nomad-autoscaler
Multiple enhancements
2 parents d65a76a + 85d419a commit 32aa4af

File tree

121 files changed

+7113
-953
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

121 files changed

+7113
-953
lines changed

ansible/playbooks/paas/coredns.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,4 @@
55
gather_facts: true
66
become: true
77
roles:
8-
- golang
98
- coredns

ansible/playbooks/paas/main.yml

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@
55
gather_facts: true
66
become: true
77
pre_tasks:
8+
9+
- name: Set fqdn hostname
10+
ansible.builtin.hostname:
11+
name: "{{ inventory_hostname }}"
12+
use: systemd
13+
814
- name: Create ansible facts.d directory
915
become: true
1016
ansible.builtin.file:
@@ -14,20 +20,6 @@
1420
group: "root"
1521
mode: '0755'
1622

17-
- name: Get ipinfo.io
18-
ansible.builtin.uri:
19-
url: https://ipinfo.io
20-
http_agent: curl/7.81.0
21-
register: register_uri
22-
check_mode: false
23-
24-
- name: Set ipinfo local_fact
25-
ansible.builtin.copy:
26-
content: |
27-
{{ register_uri.json | to_nice_json }}
28-
dest: /etc/ansible/facts.d/ipinfo.fact
29-
mode: '0644'
30-
3123
- name: Install mandatories packages
3224
ansible.builtin.apt:
3325
pkg:
@@ -42,5 +34,17 @@
4234
until: apt_status is success
4335
delay: 6
4436
retries: 10
37+
4538
roles:
4639
- unattended-upgrades
40+
41+
- name: Configure systemd resolved
42+
ansible.builtin.import_playbook: systemd-resolved.yml
43+
- name: Configure docker
44+
ansible.builtin.import_playbook: docker.yml
45+
- name: Configure nomad
46+
ansible.builtin.import_playbook: nomad.yml
47+
- name: Configure coredns
48+
ansible.builtin.import_playbook: coredns.yml
49+
- name: Configure metrology
50+
ansible.builtin.import_playbook: metrology.yml

ansible/playbooks/paas/metrology.yml

Lines changed: 48 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,51 @@
44
hosts: "{{ hosts_limit | default('infrastructure') }}"
55
gather_facts: true
66
become: true
7-
roles:
8-
- prometheus
9-
- promtail
10-
- phpfpm_exporter
11-
- node_exporter
12-
- mysqld_exporter
13-
- systemd_exporter
14-
- mongodb_exporter
15-
- blackbox_exporter
16-
- nginx_exporter
17-
- scan_exporter
18-
- dns_exporter
19-
- script_exporter
7+
vars_prompt:
8+
- name: project
9+
prompt: project name
10+
private: false
11+
tasks:
12+
- name: End the play for hosts that are not in admins group
13+
ansible.builtin.meta: end_host
14+
when: fact_instance.location != 'admins'
15+
16+
- name: Install prometheus
17+
ansible.builtin.import_role:
18+
name: prometheus
19+
# Play: install the exporter roles listed in the loop below on the targeted
# hosts (`hosts_limit` if provided, otherwise the `infrastructure` group).
- name: Install exporters
  any_errors_fatal: true
  hosts: "{{ hosts_limit | default('infrastructure') }}"
  gather_facts: true
  become: true
  tasks:
    # The group is created first so the user below can use it as its
    # primary group.
    - name: Create prometheus group
      ansible.builtin.group:
        name: prometheus
        system: true

    - name: Create prometheus user
      ansible.builtin.user:
        name: prometheus
        # Explicit primary group: when a group named like the user already
        # exists and no `group` is given, the user module creates the account
        # without a user-private group, so it would end up in the system
        # default group instead of `prometheus`.
        group: prometheus
        create_home: false
        system: true

    # One role per exporter; `exporter` is used as the loop variable so the
    # included roles can keep using `item` for their own loops.
    - name: Install exporters
      ansible.builtin.include_role:
        name: "{{ exporter }}"
      loop:
        - promtail
        - phpfpm_exporter
        - node_exporter
        - mysqld_exporter
        - systemd_exporter
        - mongodb_exporter
        - blackbox_exporter
        - nginx_exporter
        - scan_exporter
        - dns_exporter
        - script_exporter
        - nvidia_gpu_exporter
      loop_control:
        loop_var: exporter
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
---
# Play: run only the `10_juicefs` task file of the `nomad` role on the
# targeted hosts (`hosts_limit` if provided, otherwise `infrastructure`).
- name: Install nomad juicefs CSI driver
  hosts: "{{ hosts_limit | default('infrastructure') }}"
  become: true
  gather_facts: true
  any_errors_fatal: true
  tasks:
    - name: Install nomad juicefs CSI driver
      ansible.builtin.import_role:
        tasks_from: 10_juicefs
        name: nomad

ansible/playbooks/paas/nvidia.yml

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
---
# Play: install the NVIDIA container toolkit and drivers, build the Nomad
# NVIDIA device plugin from source, register it with Nomad and run a GPU
# smoke test through Docker.
#
# Sections are written in the order Ansible executes them
# (pre_tasks -> roles -> tasks).
- name: Install nomad nvidia plugin
  any_errors_fatal: true
  hosts: "{{ hosts_limit | default('infrastructure') }}"
  gather_facts: true
  become: true
  vars:
    build_work_dir: /tmp
    upstream_file_url: https://github.com/hashicorp/nomad-device-nvidia.git
    nvidia_container_toolkit_version: "1.17.8-1"
    nvidia_gpg_key_url: "https://nvidia.github.io/libnvidia-container/gpgkey"
    nvidia_repo_list_url: "https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list"
    nvidia_keyring_path: "/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
    nvidia_list_path: "/etc/apt/sources.list.d/nvidia-container-toolkit.list"

  pre_tasks:

    - name: Créer le répertoire du keyring s'il n'existe pas
      ansible.builtin.file:
        path: "{{ nvidia_keyring_path | dirname }}"
        state: directory
        mode: "0755"

    - name: Télécharger et enregistrer la clé GPG NVIDIA
      ansible.builtin.get_url:
        url: "{{ nvidia_gpg_key_url }}"
        dest: /tmp/nvidia-container-toolkit.gpg
        mode: "0644"

    # `creates` keeps the conversion from running again once the keyring
    # file exists.
    - name: Convertir la clé GPG en format keyring
      ansible.builtin.command:
        cmd: "gpg --dearmor -o {{ nvidia_keyring_path }} /tmp/nvidia-container-toolkit.gpg"
        creates: "{{ nvidia_keyring_path }}"

    # Download and rewrite are done with modules instead of a
    # `curl | sed > file` shell pipeline: a failed download now fails the
    # task instead of leaving an empty list file that a `creates` guard
    # would never repair. get_url does not download again when the
    # destination file already exists.
    - name: Télécharger le fichier de dépôt NVIDIA
      ansible.builtin.get_url:
        url: "{{ nvidia_repo_list_url }}"
        dest: "{{ nvidia_list_path }}"
        mode: "0644"

    # Idempotent: once rewritten, the lines no longer match `deb https://`.
    - name: Ajouter le signed-by au dépôt NVIDIA
      ansible.builtin.replace:
        path: "{{ nvidia_list_path }}"
        regexp: 'deb https://'
        replace: 'deb [signed-by={{ nvidia_keyring_path }}] https://'

    # Uncomments every commented line that mentions "experimental".
    - name: Activer la section experimental (décommenter)
      ansible.builtin.replace:
        path: "{{ nvidia_list_path }}"
        regexp: '^#(.*experimental.*)$'
        replace: '\1'

    - name: Mettre à jour la liste des paquets
      ansible.builtin.apt:
        update_cache: true

    # All four packages are pinned to the same toolkit version.
    - name: Installer les paquets NVIDIA Container Toolkit
      ansible.builtin.apt:
        name:
          - "nvidia-container-toolkit={{ nvidia_container_toolkit_version }}"
          - "nvidia-container-toolkit-base={{ nvidia_container_toolkit_version }}"
          - "libnvidia-container-tools={{ nvidia_container_toolkit_version }}"
          - "libnvidia-container1={{ nvidia_container_toolkit_version }}"
        state: present

  roles:
    - golang

  tasks:
    # Retried up to 10 times, 6 seconds apart, until apt succeeds.
    - name: Install dependencies
      ansible.builtin.apt:
        pkg:
          - nvidia-utils-580
          - nvidia-driver-580
          - nvidia-container-runtime
          - nomad-device-nvidia
        state: present
        install_recommends: true
        update_cache: true
      register: apt_status
      until: apt_status is success
      delay: 6
      retries: 10

    # Uses the `upstream_file_url` variable declared above rather than a
    # second hard-coded copy of the same URL.
    - name: Nomad-nvidia-plugin | Git checkout
      ansible.builtin.git:
        repo: "{{ upstream_file_url }}"
        dest: "{{ build_work_dir }}/nomad-device-nvidia"
        version: main
        force: true

    # NOTE(review): a non-zero return code already fails a command task, so
    # this `changed_when` expression is false on every successful run and the
    # build is never reported as changed.
    - name: Nomad-nvidia-plugin | Build binary
      ansible.builtin.command:
        cmd: make compile
        chdir: "{{ build_work_dir }}/nomad-device-nvidia"
      environment:
        PATH: "/usr/local/go/bin:{{ ansible_env.PATH }}"
      register: my_output
      changed_when: my_output.rc != 0

    - name: Create nomad plugin directory
      ansible.builtin.file:
        path: /opt/nomad/plugins
        state: directory
        owner: root
        group: root
        mode: "0755"

    # NOTE(review): the source path is outside the checkout directory;
    # presumably this is where `make compile` writes the binary - confirm
    # against the upstream Makefile.
    - name: Nomad-nvidia-plugin | Copy binary
      ansible.builtin.copy:
        src: /tmp/nomad-plugins/nomad-device-nvidia
        dest: /opt/nomad/plugins/nomad-device-nvidia
        owner: root
        group: root
        mode: '0755'
        remote_src: true

    # Writes the Nomad plugin stanza that enables the device plugin.
    - name: Copy using inline content
      ansible.builtin.copy:
        content: |
          plugin "nomad-device-nvidia" {
            config {
              enabled = true
              fingerprint_period = "5s"
            }
          }
        dest: /etc/nomad.d/nvidia.hcl
        owner: root
        group: root
        mode: '0644'

    # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
    # Renamed from "Test nvidia support": this step configures the runtime,
    # and the old name duplicated the smoke test further down.
    - name: Nomad-nvidia-plugin | Configure docker runtime
      ansible.builtin.command: nvidia-ctk runtime configure --runtime=docker
      changed_when: true

    # Restart through the service module instead of shelling out to
    # systemctl.
    - name: Nomad-nvidia-plugin | Restart docker
      ansible.builtin.systemd:
        name: docker
        state: restarted

    # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/sample-workload.html
    # Smoke test only (`--rm` discards the container), so it is never
    # reported as a change.
    - name: Nomad-nvidia-plugin | Test nvidia support
      ansible.builtin.command: docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi
      register: docker_run
      changed_when: false

    - name: Nomad-nvidia-plugin | Debug
      ansible.builtin.debug:
        msg: "{{ docker_run }}"
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
---
# Play: partition /dev/sdb, format the first partition as ext4 and mount it
# on /data.
- name: Configure sdb partition
  any_errors_fatal: true
  hosts: "{{ hosts_limit | default('infrastructure') }}"
  gather_facts: true
  become: true
  tasks:
    # Mount point for the new filesystem.
    - name: Create default directory
      ansible.builtin.file:
        path: /data
        state: directory
        owner: root
        group: root
        mode: '0755'

    - name: Create a new primary partition /dev/sdb1
      community.general.parted:
        device: /dev/sdb
        number: 1
        state: present

    - name: Create a ext4 filesystem on /dev/sdb1
      community.general.filesystem:
        fstype: ext4
        dev: /dev/sdb1

    # `mounted` both records the entry in fstab and mounts the device now.
    # The previous `present` only wrote the fstab entry, leaving /data
    # unmounted until the next reboot or a manual mount.
    - name: Mount up device
      ansible.posix.mount:
        path: /data
        src: /dev/sdb1
        fstype: ext4
        state: mounted

ansible/playbooks/paas/roles/ansible-docker/defaults/main.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@ docker_private_registry_state: false
1313
docker_private_registry_url: ""
1414
docker_private_registry_username: ""
1515
docker_private_registry_password: ""
16-
docker_private_registry_config: /etc/docker/config.json
16+
docker_private_registry_config:
17+
- /etc/docker/config.json
18+
- /root/.docker/config.json
1719

1820
# DNS
1921
docker_dns_configuration: true

ansible/playbooks/paas/roles/ansible-docker/tasks/install.yml

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,7 @@
2121
url: "https://download.docker.com/linux/{{ ansible_distribution | lower }}/gpg"
2222
dest: /etc/apt/keyrings/docker.asc
2323

24-
- name: Add Docker repository on ubuntu < 24.04
25-
ansible.builtin.apt_repository:
26-
repo: "deb [arch={{ upstream_default_arch }} signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
27-
state: present
28-
filename: docker
29-
when: ansible_distribution_version is version('24.04', '<')
30-
31-
- name: Add Docker repository on ubuntu >= 24.04
24+
- name: Add Docker repository on ubuntu
3225
ansible.builtin.copy:
3326
content: |
3427
Components: stable
@@ -42,7 +35,6 @@
4235
owner: root
4336
group: root
4437
mode: '0644'
45-
when: ansible_distribution_version is version('24.04', '>=')
4638

4739
- name: Install Docker
4840
ansible.builtin.apt:
@@ -68,14 +60,24 @@
6860
append: true
6961
notify: Docker_restart
7062

63+
- name: Create home docker directory
64+
ansible.builtin.file:
65+
path: "{{ item }}"
66+
recurse: true
67+
state: directory
68+
mode: '0755'
69+
loop:
70+
- /root/.docker
71+
7172
- name: Copy config.json
7273
ansible.builtin.template:
7374
src: config.json.j2
74-
dest: "{{ docker_private_registry_config }}"
75+
dest: "{{ item }}"
7576
owner: root
7677
group: root
7778
mode: '0600'
7879
when: docker_private_registry_state
80+
loop: "{{ docker_private_registry_config }}"
7981
notify: Docker_restart
8082

8183
- name: Copy daemon.json for DNS resolution
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
11
# Role: `coredns`
22

33
## How to use this Ansible role?
4+
5+
### nomad cluster mode
6+
7+
`nomad_primary_master_node`: set this to the primary Nomad master node, which is used to get `nomad_management_token`.

0 commit comments

Comments
 (0)