# Tunable test parameters. Each value is read from the matching CL2_* override
# via DefaultParam, with the literal at the end of the line as the fallback.
# String defaults must be written with plain ASCII double quotes; typographic
# quotes are not valid string delimiters in Go templates.
22{{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .CL2_NODES_PER_NAMESPACE 100)}}
33{{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}}
44{{$STEADY_STATE_QPS := DefaultParam .CL2_STEADY_STATE_QPS 5}}
5+ {{$RESOURCE_SLICES_PER_NODE := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
# Pod startup latency thresholds applied to the long-running (fast fill) pods.
6+ {{$UPSIZE_THRESHOLD := DefaultParam .CL2_UPSIZE_THRESHOLD "10m"}}
7+ {{$UPSIZE_PERC50_THRESHOLD := DefaultParam .CL2_UPSIZE_PERC50_THRESHOLD "40s"}}
8+ {{$UPSIZE_PERC90_THRESHOLD := DefaultParam .CL2_UPSIZE_PERC90_THRESHOLD "4m"}}
# Pod startup latency thresholds applied to the short-lived (churn) pods.
9+ {{$CHURN_POD_STARTUP_PERC50_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC50_THRESHOLD "40s"}}
10+ {{$CHURN_POD_STARTUP_PERC90_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC90_THRESHOLD "60s"}}
11+ {{$CHURN_POD_STARTUP_PERC99_THRESHOLD := DefaultParam .CL2_CHURN_POD_STARTUP_PERC99_THRESHOLD "80s"}}
# Timeouts used when waiting on short-lived (finished) and long-running (running) jobs.
12+ {{$FINISHED_JOBS_THRESHOLD := DefaultParam .CL2_FINISHED_JOBS_THRESHOLD "10m"}}
13+ {{$RUNNING_JOBS_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_THRESHOLD "10m"}}
14+ {{$RUNNING_JOBS_OPERATION_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_OPERATION_THRESHOLD "120s"}}
515{{$token := .CL2_TOKEN }}
616
717{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}}
818
19+ # DRA driver configuration (namespace, manifests and daemonset name)
20+ {{$draNamespace := DefaultParam .CL2_DRA_NAMESPACE "dra-example-driver"}}
21+ {{$draManifests := DefaultParam .CL2_DRA_MANIFESTS "dra-example-driver"}}
22+ {{$draDaemonsetName := DefaultParam .CL2_DRA_DAEMONSET_NAME "dra-example-driver-kubeletplugin"}}
23+
924# Node resource configuration
1025{{$gpusPerNode := DefaultParam .CL2_GPUS_PER_NODE 8}}
# Reuse the top-level $RESOURCE_SLICES_PER_NODE (same CL2_RESOURCE_SLICES_PER_NODE
# override, same default of 1) instead of reading the parameter a second time.
26+ {{$resourceSlicesPerNode := $RESOURCE_SLICES_PER_NODE}}
# Worker node count passed to the DRA test driver: resource slices per node times nodes.
27+ {{$workerNodeCount := MultiplyInt $resourceSlicesPerNode .Nodes}}
1128{{$totalGPUs := MultiplyInt $gpusPerNode .Nodes}}
1229
1330# fast fill job configuration - for initial fill up
1936
2037# churn job configuration for steady state
2138{{$smallJobPodsCount := SubtractInt $totalGPUs (MultiplyInt $fillPodsPerNamespace $namespaces)}}
22- {{$smallJobsPerNamespace := DivideInt $smallJobPodsCount $namespaces}}
39+ {{$calculatedSJPN := DivideInt $smallJobPodsCount $namespaces}}
40+ {{$maxSJPN := DefaultParam .CL2_MAX_SMALL_JOBS_PER_NAMESPACE 999999}}
41+ {{$smallJobsPerNamespace := MinInt $calculatedSJPN $maxSJPN}}
2342{{$smallJobSize := 1}}
24- {{$smallJobCompletions := 10}}
43+ {{$smallJobCompletions := DefaultParam .CL2_SMALL_JOB_COMPLETIONS 10}}
2544{{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}}
2645{{$ENABLE_EXTENDED_RESOURCES := DefaultParam .CL2_ENABLE_EXTENDED_RESOURCES false}}
2746
@@ -46,10 +65,13 @@ tuningSets:
4665
4766
4867dependencies :
49- - name : Install dra-example- driver for test
68+ - name : Install dra-driver for test
5069 Method : DRATestDriver
5170 Params :
52- WorkerNodeCount : {{.Nodes}}
71+ WorkerNodeCount : {{$workerNodeCount}}
72+ Namespace : {{$draNamespace}}
73+ DaemonsetName : {{$draDaemonsetName}}
74+ Manifests : {{$draManifests}}
5375 {{if $ENABLE_EXTENDED_RESOURCES}}
5476 ExtendedResourceName : {{$extendedResourceName}}
5577 {{end}}
@@ -70,12 +92,15 @@ steps:
7092 apiVersion : batch/v1
7193 kind : Job
7294 labelSelector : job-type = long-running
73- operationTimeout : 120s
95+ operationTimeout : {{$RUNNING_JOBS_OPERATION_THRESHOLD}}
7496 - Identifier : FastFillPodStartupLatency
7597 Method : PodStartupLatency
7698 Params :
7799 action : start
78100 labelSelector : job-type = long-running
101+ perc50Threshold : {{$UPSIZE_PERC50_THRESHOLD}}
102+ perc90Threshold : {{$UPSIZE_PERC90_THRESHOLD}}
103+ threshold : {{$UPSIZE_THRESHOLD}}
79104 - Identifier : FastFillClaimAllocationLatency
80105 Method : ResourceClaimAllocationLatency
81106 Params :
@@ -110,8 +135,10 @@ steps:
110135 tuningSet : FastFill
111136 objectBundle :
112137 - basename : single-gpu
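138+ # Add further resourceclaimtemplate entries here for other DRA drivers. NOTE(review): the guard below matches "example", but the visible CL2_DRA_MANIFESTS default is "dra-example-driver" - confirm the intended value.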
139+ {{if eq $draManifests "example"}}
113140 objectTemplatePath : " resourceclaimtemplate.yaml"
114- {{end}}
141+ {{end}}
115142- name : Fill cluster to {{$fillPercentage}}% utilization
116143 phases :
117144 - namespaceRange :
@@ -134,7 +161,7 @@ steps:
134161 Params :
135162 action : gather
136163 labelSelector : job-type = long-running
137- timeout : 15m
164+ timeout : {{$RUNNING_JOBS_THRESHOLD}}
138165- name : Gather measurements for long running pods
139166 measurements :
140167 - Identifier : FastFillSchedulingMetrics
@@ -145,6 +172,9 @@ steps:
145172 Method : PodStartupLatency
146173 Params :
147174 action : gather
175+ perc50Threshold : {{$UPSIZE_PERC50_THRESHOLD}}
176+ perc90Threshold : {{$UPSIZE_PERC90_THRESHOLD}}
177+ threshold : {{$UPSIZE_THRESHOLD}}
148178 - Identifier : FastFillClaimAllocationLatency
149179 Method : ResourceClaimAllocationLatency
150180 Params :
@@ -164,9 +194,9 @@ steps:
164194 Params :
165195 action : start
166196 labelSelector : job-type = short-lived
167- perc50Threshold : 40s
168- perc90Threshold : 60s
169- perc99Threshold : 80s
197+ perc50Threshold : {{$CHURN_POD_STARTUP_PERC50_THRESHOLD}}
198+ perc90Threshold : {{$CHURN_POD_STARTUP_PERC90_THRESHOLD}}
199+ perc99Threshold : {{$CHURN_POD_STARTUP_PERC99_THRESHOLD}}
170200 - Identifier : ChurnClaimAllocationLatency
171201 Method : ResourceClaimAllocationLatency
172202 Params :
@@ -210,7 +240,7 @@ steps:
210240 Params :
211241 action : gather
212242 labelSelector : job-type = short-lived
213- timeout : 15m
243+ timeout : {{$FINISHED_JOBS_THRESHOLD}}
214244- name : Measure scheduler metrics
215245 measurements :
216246 - Identifier : ChurnSchedulingMetrics
@@ -221,9 +251,9 @@ steps:
221251 Method : PodStartupLatency
222252 Params :
223253 action : gather
224- perc50Threshold : 40s
225- perc90Threshold : 60s
226- perc99Threshold : 80s
254+ perc50Threshold : {{$CHURN_POD_STARTUP_PERC50_THRESHOLD}}
255+ perc90Threshold : {{$CHURN_POD_STARTUP_PERC90_THRESHOLD}}
256+ perc99Threshold : {{$CHURN_POD_STARTUP_PERC99_THRESHOLD}}
227257 - Identifier : ChurnClaimAllocationLatency
228258 Method : ResourceClaimAllocationLatency
229259 Params :
0 commit comments