Skip to content

Commit e231a1a

Browse files
committed
Parametrize DRA test config
1 parent d57e351 commit e231a1a

File tree

2 files changed

+47
-15
lines changed

2 files changed

+47
-15
lines changed

clusterloader2/testing/dra/config.yaml

Lines changed: 44 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,29 @@
22
{{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .CL2_NODES_PER_NAMESPACE 100)}}
33
{{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}}
44
{{$STEADY_STATE_QPS := DefaultParam .CL2_STEADY_STATE_QPS 5}}
5+
{{$RESOURCE_SLICES_PER_NODE := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
6+
{{$UPSIZE_THRESHOLD := DefaultParam .CL2_UPSIZE_THRESHOLD "10m"}}
7+
{{$UPSIZE_PERC50_THRESHOLD := DefaultParam .CL2_UPSIZE_PERC50_THRESHOLD "40s"}}
8+
{{$UPSIZE_PERC90_THRESHOLD := DefaultParam .CL2_UPSIZE_PERC90_THRESHOLD "4m"}}
9+
{{$CHURN_POD_STARTUP_PERC50_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC50_THRESHOLD "40s"}}
10+
{{$CHURN_POD_STARTUP_PERC90_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC90_THRESHOLD "60s"}}
11+
{{$CHURN_POD_STARTUP_PERC99_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC99_THRESHOLD "80s"}}
12+
{{$FINISHED_JOBS_THRESHOLD := DefaultParam .CL2_FINISHED_JOBS_THRESHOLD "10m"}}
13+
{{$RUNNING_JOBS_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_THRESHOLD "10m"}}
14+
{{$RUNNING_JOBS_OPERATION_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_OPERATION_THRESHOLD "120s"}}
515
{{$token := .CL2_TOKEN }}
616

717
{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}}
818

19+
# dra
20+
{{$draNamespace := DefaultParam .CL2_DRA_NAMESPACE "dra-example-driver"}}
21+
{{$draManifests := DefaultParam .CL2_DRA_MANIFESTS "dra-example-driver"}}
22+
{{$draDaemonsetName := DefaultParam .CL2_DRA_DAEMONSET_NAME "dra-example-driver-kubeletplugin"}}
23+
924
# Node resource configuration
1025
{{$gpusPerNode := DefaultParam .CL2_GPUS_PER_NODE 8}}
26+
{{$resourceSlicesPerNode := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
27+
{{$workerNodeCount := MultiplyInt $resourceSlicesPerNode .Nodes}}
1128
{{$totalGPUs := MultiplyInt $gpusPerNode .Nodes}}
1229

1330
# fast fill job configuration - for initial fill up
@@ -19,9 +36,11 @@
1936

2037
# churn job configuration for steady state
2138
{{$smallJobPodsCount := SubtractInt $totalGPUs (MultiplyInt $fillPodsPerNamespace $namespaces)}}
22-
{{$smallJobsPerNamespace := DivideInt $smallJobPodsCount $namespaces}}
39+
{{$calculatedSJPN := DivideInt $smallJobPodsCount $namespaces}}
40+
{{$maxSJPN := DefaultParam .CL2_MAX_SMALL_JOBS_PER_NAMESPACE 999999}}
41+
{{$smallJobsPerNamespace := MinInt $calculatedSJPN $maxSJPN}}
2342
{{$smallJobSize := 1}}
24-
{{$smallJobCompletions := 10}}
43+
{{$smallJobCompletions := DefaultParam .CL2_SMALL_JOB_COMPLETIONS 10}}
2544
{{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}}
2645
{{$ENABLE_EXTENDED_RESOURCES := DefaultParam .CL2_ENABLE_EXTENDED_RESOURCES false}}
2746

@@ -46,10 +65,13 @@ tuningSets:
4665

4766

4867
dependencies:
49-
- name: Install dra-example-driver for test
68+
- name: Install dra-driver for test
5069
Method: DRATestDriver
5170
Params:
52-
WorkerNodeCount: {{.Nodes}}
71+
WorkerNodeCount: {{$workerNodeCount}}
72+
Namespace: {{$draNamespace}}
73+
DaemonsetName: {{$draDaemonsetName}}
74+
Manifests: {{$draManifests}}
5375
{{if $ENABLE_EXTENDED_RESOURCES}}
5476
ExtendedResourceName: {{$extendedResourceName}}
5577
{{end}}
@@ -70,12 +92,15 @@ steps:
7092
apiVersion: batch/v1
7193
kind: Job
7294
labelSelector: job-type = long-running
73-
operationTimeout: 120s
95+
operationTimeout: {{$RUNNING_JOBS_OPERATION_THRESHOLD}}
7496
- Identifier: FastFillPodStartupLatency
7597
Method: PodStartupLatency
7698
Params:
7799
action: start
78100
labelSelector: job-type = long-running
101+
perc50Threshold: {{$UPSIZE_PERC50_THRESHOLD}}
102+
perc90Threshold: {{$UPSIZE_PERC90_THRESHOLD}}
103+
threshold: {{$UPSIZE_THRESHOLD}}
79104
- Identifier: FastFillClaimAllocationLatency
80105
Method: ResourceClaimAllocationLatency
81106
Params:
@@ -110,8 +135,10 @@ steps:
110135
tuningSet: FastFill
111136
objectBundle:
112137
- basename: single-gpu
138+
# Add other resourceclaimtemplates for different drivers
139+
{{if eq $draManifests "example"}}
113140
objectTemplatePath: "resourceclaimtemplate.yaml"
114-
{{end}}
141+
{{end}}
115142
- name: Fill cluster to {{$fillPercentage}}% utilization
116143
phases:
117144
- namespaceRange:
@@ -134,7 +161,7 @@ steps:
134161
Params:
135162
action: gather
136163
labelSelector: job-type = long-running
137-
timeout: 15m
164+
timeout: {{$RUNNING_JOBS_THRESHOLD}}
138165
- name: Gather measurements for long running pods
139166
measurements:
140167
- Identifier: FastFillSchedulingMetrics
@@ -145,6 +172,9 @@ steps:
145172
Method: PodStartupLatency
146173
Params:
147174
action: gather
175+
perc50Threshold: {{$UPSIZE_PERC50_THRESHOLD}}
176+
perc90Threshold: {{$UPSIZE_PERC90_THRESHOLD}}
177+
threshold: {{$UPSIZE_THRESHOLD}}
148178
- Identifier: FastFillClaimAllocationLatency
149179
Method: ResourceClaimAllocationLatency
150180
Params:
@@ -164,9 +194,9 @@ steps:
164194
Params:
165195
action: start
166196
labelSelector: job-type = short-lived
167-
perc50Threshold: 40s
168-
perc90Threshold: 60s
169-
perc99Threshold: 80s
197+
perc50Threshold: {{$CHURN_POD_STARTUP_PERC50_THRESHOLD}}
198+
perc90Threshold: {{$CHURN_POD_STARTUP_PERC90_THRESHOLD}}
199+
perc99Threshold: {{$CHURN_POD_STARTUP_PERC99_THRESHOLD}}
170200
- Identifier: ChurnClaimAllocationLatency
171201
Method: ResourceClaimAllocationLatency
172202
Params:
@@ -210,7 +240,7 @@ steps:
210240
Params:
211241
action: gather
212242
labelSelector: job-type = short-lived
213-
timeout: 15m
243+
timeout: {{$FINISHED_JOBS_THRESHOLD}}
214244
- name: Measure scheduler metrics
215245
measurements:
216246
- Identifier: ChurnSchedulingMetrics
@@ -221,9 +251,9 @@ steps:
221251
Method: PodStartupLatency
222252
Params:
223253
action: gather
224-
perc50Threshold: 40s
225-
perc90Threshold: 60s
226-
perc99Threshold: 80s
254+
perc50Threshold: {{$CHURN_POD_STARTUP_PERC50_THRESHOLD}}
255+
perc90Threshold: {{$CHURN_POD_STARTUP_PERC90_THRESHOLD}}
256+
perc99Threshold: {{$CHURN_POD_STARTUP_PERC99_THRESHOLD}}
227257
- Identifier: ChurnClaimAllocationLatency
228258
Method: ResourceClaimAllocationLatency
229259
Params:

clusterloader2/testing/dra/job.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ spec:
99
parallelism: {{.Replicas}}
1010
completions: {{.CompletionReplicas}}
1111
completionMode: {{.Mode}}
12-
ttlSecondsAfterFinished: 300
12+
# In tests involving a large number of sequentially created, short-lived jobs, the spin-up time may be significant.
13+
# A TTL of 1 hour should be sufficient to retain the jobs long enough for measurement checks.
14+
ttlSecondsAfterFinished: 3600 # 1 hour
1315
template:
1416
metadata:
1517
labels:

0 commit comments

Comments
 (0)