Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions defconfigs/lambdalabs-8x-b200-or-less
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Lambda Labs 8-GPU with tier-based fallback (B200 maximum tier)
# Uses 8X_B200_OR_LESS for best available 8-GPU up to B200
# Fallback order: 8x B200 → 8x H100 → 8x A100-80 → 8x A100 → 8x V100
CONFIG_TERRAFORM=y
CONFIG_TERRAFORM_LAMBDALABS=y
CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_B200_OR_LESS=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
CONFIG_WORKFLOWS=y
CONFIG_WORKFLOWS_TESTS=y
CONFIG_WORKFLOWS_LINUX_TESTS=y
CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
14 changes: 14 additions & 0 deletions defconfigs/lambdalabs-8x-h100-or-less
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Lambda Labs 8-GPU with tier-based fallback (H100 maximum tier)
# Uses 8X_H100_OR_LESS for best available 8-GPU up to H100
# Fallback order: 8x H100 → 8x A100-80 → 8x A100 → 8x V100
CONFIG_TERRAFORM=y
CONFIG_TERRAFORM_LAMBDALABS=y
CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_H100_OR_LESS=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
CONFIG_WORKFLOWS=y
CONFIG_WORKFLOWS_TESTS=y
CONFIG_WORKFLOWS_LINUX_TESTS=y
CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
14 changes: 14 additions & 0 deletions defconfigs/lambdalabs-a100-or-less
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Lambda Labs GPU with tier-based fallback (A100 maximum tier)
# Uses A100_OR_LESS for best available single GPU up to A100
# Fallback order: A100-SXM → A100 → A6000 → RTX6000 → A10
CONFIG_TERRAFORM=y
CONFIG_TERRAFORM_LAMBDALABS=y
CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_A100_OR_LESS=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
CONFIG_WORKFLOWS=y
CONFIG_WORKFLOWS_TESTS=y
CONFIG_WORKFLOWS_LINUX_TESTS=y
CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
14 changes: 14 additions & 0 deletions defconfigs/lambdalabs-gh200-or-less
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Lambda Labs GPU with tier-based fallback (GH200 maximum tier)
# Uses GH200_OR_LESS for best available single GPU up to GH200
# Fallback order: GH200 → H100-SXM → H100-PCIe → A100-SXM → A100 → A6000 → RTX6000 → A10
CONFIG_TERRAFORM=y
CONFIG_TERRAFORM_LAMBDALABS=y
CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_GH200_OR_LESS=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
CONFIG_WORKFLOWS=y
CONFIG_WORKFLOWS_TESTS=y
CONFIG_WORKFLOWS_LINUX_TESTS=y
CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
14 changes: 14 additions & 0 deletions defconfigs/lambdalabs-h100-or-less
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Lambda Labs GPU with tier-based fallback (H100 maximum tier)
# Uses H100_OR_LESS for best available single GPU up to H100
# Fallback order: H100-SXM → H100-PCIe → A100-SXM → A100 → A6000 → RTX6000 → A10
CONFIG_TERRAFORM=y
CONFIG_TERRAFORM_LAMBDALABS=y
CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_H100_OR_LESS=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
CONFIG_WORKFLOWS=y
CONFIG_WORKFLOWS_TESTS=y
CONFIG_WORKFLOWS_LINUX_TESTS=y
CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
120 changes: 119 additions & 1 deletion playbooks/roles/terraform/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,126 @@
- destroy
- status

- name: Auto-select Lambda Labs instance type for tier-based wildcards
  # Runs the tier-based selection script, which prints "instance_type region"
  # on stdout for the best currently-available instance within the tier.
  # failed_when: false so the follow-up task can report a friendly error.
  ansible.builtin.shell:
    cmd: |
      case "{{ terraform_lambdalabs_instance_type }}" in
      GH200_OR_LESS|H100_OR_LESS|A100_OR_LESS|A6000_OR_LESS|8X_B200_OR_LESS|8X_H100_OR_LESS|8X_A100_OR_LESS)
        # Single-GPU and 8x-GPU wildcards use the same selection script;
        # map the Kconfig-style name (e.g. 8X_H100_OR_LESS) to the tier
        # group slug the script expects (e.g. 8x-h100-or-less).
        tier_group=$(echo "{{ terraform_lambdalabs_instance_type }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
        {{ topdir_path }}/scripts/lambdalabs_select_tier.py "$tier_group" --verbose
        ;;
      *)
        echo "Unknown wildcard type: {{ terraform_lambdalabs_instance_type }}"
        exit 1
        ;;
      esac
  register: lambdalabs_auto_instance_type
  failed_when: false
  changed_when: false
  when:
    - kdevops_terraform_provider == "lambdalabs"
    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
  tags:
    - bringup

# The selection task above registers its result with failed_when: false,
# so a non-zero rc is surfaced here with actionable guidance instead of a
# raw shell failure.
- name: Fail if no Lambda Labs instances available for wildcard selection
  ansible.builtin.fail:
    msg: |
      No GPU instances available for {{ terraform_lambdalabs_instance_type }}

      {{ lambdalabs_auto_instance_type.stderr }}

      Try:
      - Wait and retry (capacity changes frequently)
      - Check Lambda Labs dashboard: https://cloud.lambdalabs.com
      - Use a different tier group via menuconfig
      - Check capacity manually: scripts/lambdalabs_check_capacity.py
  # Only fires for Lambda Labs wildcard types, and only when the tier
  # selection script actually failed (rc != 0).
  when:
    - kdevops_terraform_provider == "lambdalabs"
    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
    - lambdalabs_auto_instance_type.rc != 0
  tags:
    - bringup

# Defensive check: the selection script must print exactly two
# whitespace-separated tokens ("instance_type region") on stdout;
# anything else means the script and this playbook have drifted apart.
- name: Validate Lambda Labs tier selection output format
  ansible.builtin.assert:
    that:
      - lambdalabs_auto_instance_type.stdout.split() | length == 2
    fail_msg: |
      Invalid output from tier selection script.
      Expected format: "instance_type region"
      Got: "{{ lambdalabs_auto_instance_type.stdout }}"
  when:
    - kdevops_terraform_provider == "lambdalabs"
    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
    - lambdalabs_auto_instance_type.rc == 0
  tags:
    - bringup

- name: Parse Lambda Labs auto-selected instance type and region
  # Split the validated "instance_type region" stdout into two facts for
  # the tfvars-update tasks below. Use the FQCN ansible.builtin.set_fact
  # for consistency with the other ansible.builtin.* tasks in this file.
  ansible.builtin.set_fact:
    lambdalabs_auto_selected_instance: "{{ lambdalabs_auto_instance_type.stdout.split()[0] }}"
    lambdalabs_auto_selected_region: "{{ lambdalabs_auto_instance_type.stdout.split()[1] }}"
  when:
    - kdevops_terraform_provider == "lambdalabs"
    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
    - lambdalabs_auto_instance_type.rc == 0
  tags:
    - bringup

# Informational only: show the user which concrete instance/region the
# wildcard resolved to before terraform runs.
- name: Report Lambda Labs auto-selected instance type for wildcards
  ansible.builtin.debug:
    msg: "Auto-selected instance type: {{ lambdalabs_auto_selected_instance }} in region: {{ lambdalabs_auto_selected_region }}"
  when:
    - kdevops_terraform_provider == "lambdalabs"
    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
    - lambdalabs_auto_instance_type.rc == 0
  tags:
    - bringup

# Rewrite the concrete instance type into terraform.tfvars so terraform
# never sees the wildcard placeholder. lineinfile replaces the existing
# lambdalabs_instance_type assignment in place.
- name: Update Lambda Labs terraform vars with auto-selected instance type
  ansible.builtin.lineinfile:
    path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars"
    regexp: '^lambdalabs_instance_type\s*='
    line: 'lambdalabs_instance_type = "{{ lambdalabs_auto_selected_instance }}"'
  when:
    - kdevops_terraform_provider == "lambdalabs"
    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
    - lambdalabs_auto_instance_type.rc == 0
  tags:
    - bringup

# The tier selection script also picks the region with capacity, so the
# configured region is overridden alongside the instance type.
- name: Update Lambda Labs terraform vars with auto-selected region
  ansible.builtin.lineinfile:
    path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars"
    regexp: '^lambdalabs_region\s*='
    line: 'lambdalabs_region = "{{ lambdalabs_auto_selected_region }}"'
  when:
    - kdevops_terraform_provider == "lambdalabs"
    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
    - lambdalabs_auto_instance_type.rc == 0
  tags:
    - bringup
</full_update>

- name: Set Lambda Labs resolved instance type for subsequent tasks
  # Resolve to the auto-selected instance when a wildcard was used and
  # selection succeeded; otherwise fall back to the configured type.
  # Jinja's `and` short-circuits, so .rc is never evaluated for
  # non-wildcard types (where the selection task was skipped and rc is
  # undefined). Use the FQCN for consistency with the other tasks here.
  ansible.builtin.set_fact:
    lambdalabs_resolved_instance_type: "{{ lambdalabs_auto_selected_instance if (terraform_lambdalabs_instance_type in ['GH200_OR_LESS', 'H100_OR_LESS', 'A100_OR_LESS', 'A6000_OR_LESS', '8X_B200_OR_LESS', '8X_H100_OR_LESS', '8X_A100_OR_LESS'] and lambdalabs_auto_instance_type.rc == 0) else terraform_lambdalabs_instance_type }}"
  when:
    - kdevops_terraform_provider == "lambdalabs"
  tags:
    - bringup

- name: Check Lambda Labs capacity before provisioning (if using Lambda Labs)
ansible.builtin.shell:
cmd: |
{{ topdir_path }}/scripts/lambda-cli --output json check-availability \
{{ terraform_lambdalabs_instance_type }} {{ terraform_lambdalabs_region }} | \
{{ lambdalabs_resolved_instance_type | default(terraform_lambdalabs_instance_type) }} {{ lambdalabs_auto_selected_region | default(terraform_lambdalabs_region) }} | \
python3 -c "
import sys, json
data = json.load(sys.stdin)
Expand All @@ -113,6 +228,7 @@
changed_when: false
when:
- kdevops_terraform_provider == "lambdalabs"
- terraform_lambdalabs_instance_type not in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
tags:
- bringup

Expand All @@ -121,6 +237,8 @@
msg: "{{ capacity_check.stdout }}"
when:
- kdevops_terraform_provider == "lambdalabs"
- capacity_check is defined
- capacity_check.rc is defined
- capacity_check.rc != 0
tags:
- bringup
Expand Down
129 changes: 129 additions & 0 deletions scripts/lambdalabs_check_capacity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: copyleft-next-0.3.1
"""
Check Lambda Labs instance availability across all regions.

This script queries the Lambda Labs API to find where specific instance types
are available, helping users avoid provisioning failures.
"""

import argparse
import json
import os
import sys

# Import our Lambda Labs API module
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, SCRIPT_DIR)

from lambdalabs_api import get_api_key, get_instance_types_with_capacity


def _build_region_map(gpu_instances):
"""Build a mapping from regions to available instance types."""
region_map = {}
for inst_type, regions in gpu_instances.items():
for region in regions:
if region not in region_map:
region_map[region] = []
region_map[region].append(inst_type)
return region_map


def check_availability(instance_type=None, json_output=False, pick_first=False):
    """Check Lambda Labs instance availability across all regions.

    Args:
        instance_type: If set, report only this instance type; otherwise
            report every GPU instance type that currently has capacity.
        json_output: Emit machine-readable JSON instead of human text.
        pick_first: With instance_type, print only the first available
            region (script-friendly mode).

    Returns:
        Process exit status: 0 on success / capacity found, 1 on API
        error or when the requested type has no capacity.
    """
    api_key = get_api_key()
    if not api_key:
        sys.stderr.write("Error: Lambda Labs API key not found\n")
        sys.stderr.write("Set LAMBDALABS_API_KEY or create ~/.lambdalabs/credentials\n")
        return 1

    try:
        _, capacity_map = get_instance_types_with_capacity(api_key)
    except Exception as e:
        sys.stderr.write(f"Error: Failed to fetch instance availability: {e}\n")
        return 1

    if not capacity_map:
        sys.stderr.write("Error: Could not fetch instance availability\n")
        return 1

    if instance_type:
        # Check one specific instance type.
        regions = capacity_map.get(instance_type, [])
        if pick_first:
            # Script-friendly: print only the first region, if any.
            if regions:
                print(regions[0])
                return 0
            return 1

        if json_output:
            result = [{"instance_type": instance_type, "regions": regions}]
            print(json.dumps(result, indent=2))
        else:
            if regions:
                print(f"{instance_type}:")
                for region in regions:
                    print(f" • {region}")
            else:
                print(f"{instance_type}: No capacity available")
        return 0 if regions else 1

    # No specific type requested: show all GPU instances with capacity.
    gpu_instances = {
        k: v for k, v in capacity_map.items() if k.startswith("gpu_") and v
    }
    # Build the region -> instances map once for both output styles
    # (previously duplicated in each branch).
    region_map = _build_region_map(gpu_instances)

    if json_output:
        # Group by region for consistency with DataCrunch format so the
        # tier selection script can consume either provider's output.
        results = [
            {"location": region, "instances": instances}
            for region, instances in sorted(region_map.items())
        ]
        print(json.dumps(results, indent=2))
    else:
        print("GPU Instance Availability:\n")
        for region in sorted(region_map.keys()):
            print(f"📍 {region}:")
            for inst in sorted(region_map[region]):
                print(f" • {inst}")
            print()
        if not region_map:
            print("No GPU instances currently available")

    return 0


def main():
    """Entry point: parse CLI arguments and exit with the check's status."""
    arg_parser = argparse.ArgumentParser(
        description="Check Lambda Labs instance availability"
    )
    arg_parser.add_argument(
        "--instance-type",
        "-i",
        help="Check specific instance type (e.g., gpu_1x_h100_sxm5)",
    )
    arg_parser.add_argument(
        "--json", "-j", action="store_true", help="Output in JSON format"
    )
    arg_parser.add_argument(
        "--pick-first",
        action="store_true",
        help="Return first available region (for scripts)",
    )

    opts = arg_parser.parse_args()
    status = check_availability(opts.instance_type, opts.json, opts.pick_first)
    sys.exit(status)


if __name__ == "__main__":
    main()
Loading
Loading