Skip to content

Commit 8b41d01

Browse files
authored
Merge branch 'main' into tudorman/gpu-tensor-integ-tests
2 parents 314f33c + 76c09ae commit 8b41d01

File tree

49 files changed

+29765
-672
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

49 files changed

+29765
-672
lines changed

.gitignore

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,7 @@ terraform.*
1111
/test/**/final_*.yml
1212
coverage.txt
1313
generator/resources/*complete*.json
14-
terraform/eks/e2e/helm-charts
14+
terraform/eks/e2e/helm-charts
15+
terraform/eks/daemon/credentials/pod_identity/helm-charts
16+
terraform/eks/daemon/ebs/helm-charts
17+
terraform/eks/daemon/entity/helm-charts

docs/resources/dummy-neuron-monitor/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ RUN apt-get update \
2727
&& rm -rf /tmp/tmp* \
2828
&& apt-get clean
2929

30+
COPY neuron-monitor-output.json /opt/aws/neuron/bin/neuron-monitor-output.json
3031
COPY dummy_neuron_monitor.py /opt/aws/neuron/bin/dummy_neuron_monitor.py
3132
RUN chmod 755 /opt/aws/neuron/bin/dummy_neuron_monitor.py
3233
RUN pip3 install prometheus_client boto3 requests

docs/resources/dummy-neuron-monitor/dummy_neuron_monitor.py

Lines changed: 4 additions & 598 deletions
Large diffs are not rendered by default.

docs/resources/dummy-neuron-monitor/neuron-monitor-output.json

Lines changed: 28166 additions & 0 deletions
Large diffs are not rendered by default.

environment/metadata.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ type MetaData struct {
6666
SampleApp string
6767
AccountId string
6868
EKSInstallationType eksinstallationtype.EKSInstallationType
69+
PerformanceMetricMapName string
70+
PerformanceTestName string
6971
}
7072

7173
type MetaDataStrings struct {
@@ -110,6 +112,8 @@ type MetaDataStrings struct {
110112
SampleApp string
111113
AccountId string
112114
EKSInstallationType string
115+
PerformanceMetricMapName string
116+
PerformanceTestName string
113117
}
114118

115119
func registerComputeType(dataString *MetaDataStrings) {
@@ -140,6 +144,8 @@ func registerEKSData(d *MetaDataStrings) {
140144
flag.StringVar(&(d.EKSClusterName), "eksClusterName", "", "EKS cluster name")
141145
flag.StringVar(&(d.EksDeploymentStrategy), "eksDeploymentStrategy", "", "Daemon/Replica/Sidecar")
142146
flag.StringVar(&(d.EksGpuType), "eksGpuType", "", "nvidia/inferentia")
147+
flag.StringVar(&(d.PerformanceMetricMapName), "performanceMetricMapName", "", "name of eks performance metric map within directory")
148+
flag.StringVar(&(d.PerformanceTestName), "performanceTestName", "", "name of eks performance test")
143149
}
144150

145151
func registerEKSE2ETestData(dataString *MetaDataStrings) {
@@ -282,6 +288,8 @@ func fillEKSData(e *MetaData, data *MetaDataStrings) {
282288

283289
e.EKSClusterName = data.EKSClusterName
284290
e.EksGpuType = data.EksGpuType
291+
e.PerformanceMetricMapName = data.PerformanceMetricMapName
292+
e.PerformanceTestName = data.PerformanceTestName
285293
}
286294

287295
func fillEKSInstallationType(e *MetaData, data *MetaDataStrings) {

generator/test_case_generator.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,8 @@ var testTypeToTestConfig = map[string][]testConfig{
220220
{testDir: "../../../test/restart"},
221221
{testDir: "../../../test/acceptance"},
222222
{testDir: "../../../test/feature/windows/event_logs"},
223+
{testDir: "../../../test/feature/windows/eventid_logs"},
224+
{testDir: "../../../test/feature/windows/event_regex_logs"},
223225
{testDir: "../../../test/log_state/logfile"},
224226
{testDir: "../../../test/log_state/windows_event_log"},
225227
{
@@ -339,6 +341,11 @@ var testTypeToTestConfig = map[string][]testConfig{
339341
testDir: "./test/metric_value_benchmark", terraformDir: "terraform/eks/daemon/credentials/pod_identity",
340342
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
341343
},
344+
{
345+
testDir: "./test/ebscsi",
346+
terraformDir: "terraform/eks/daemon/ebs",
347+
targets: map[string]map[string]struct{}{"arc": {"amd64": {}}},
348+
},
342349
},
343350
"eks_deployment": {
344351
{testDir: "./test/metric_value_benchmark"},

terraform/ecs_fargate/linux/main.tf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,8 @@ resource "null_resource" "validator" {
152152
-ecsLaunchType=FARGATE \
153153
-ecsDeploymentStrategy=DAEMON \
154154
-clusterArn=${aws_ecs_cluster.cluster.arn} \
155+
-cwagentECSServiceName=${aws_ecs_service.cwagent_service.name} \
156+
-cwagentConfigSsmParamName=${aws_ssm_parameter.cwagent_config.name} \
155157
-v
156158
EOT
157159
}

terraform/eks/daemon/awsneuron/main.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -395,7 +395,7 @@ resource "kubernetes_daemonset" "neuron_monitor" {
395395
}
396396
container {
397397
name = "neuron-monitor-prometheus"
398-
image = "506463145083.dkr.ecr.us-west-2.amazonaws.com/mocked-neuron-monitor:v2"
398+
image = "506463145083.dkr.ecr.us-west-2.amazonaws.com/mocked-neuron-monitor:v4"
399399
port {
400400
container_port = 8000
401401
}

terraform/eks/daemon/credentials/pod_identity/main.tf

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -194,17 +194,13 @@ resource "aws_security_group_rule" "nodes_cluster_inbound" {
194194
type = "ingress"
195195
}
196196

197-
resource "null_resource" "clone_helm_chart" {
198-
triggers = {
199-
timestamp = "${timestamp()}" # Forces re-run on every apply
200-
}
201-
provisioner "local-exec" {
202-
command = <<-EOT
203-
if [ ! -d "./helm-charts" ]; then
204-
git clone -b ${var.helm_chart_branch} https://github.com/aws-observability/helm-charts.git ./helm-charts
205-
fi
206-
EOT
207-
}
197+
data "external" "clone_helm_chart" {
198+
program = ["bash", "-c", <<-EOT
199+
rm -rf ./helm-charts
200+
git clone -b ${var.helm_chart_branch} https://github.com/aws-observability/helm-charts.git ./helm-charts
201+
echo '{"status":"ready"}'
202+
EOT
203+
]
208204
}
209205

210206
resource "helm_release" "aws_observability" {
@@ -213,21 +209,22 @@ resource "helm_release" "aws_observability" {
213209
namespace = "amazon-cloudwatch"
214210
create_namespace = true
215211

216-
set {
217-
name = "clusterName"
218-
value = aws_eks_cluster.this.name
219-
}
220-
221-
set {
222-
name = "region"
223-
value = "us-west-2"
224-
}
212+
set = [
213+
{
214+
name = "clusterName"
215+
value = aws_eks_cluster.this.name
216+
},
217+
{
218+
name = "region"
219+
value = var.region
220+
}
221+
]
225222
depends_on = [
226223
aws_eks_cluster.this,
227224
aws_eks_node_group.this,
228-
null_resource.clone_helm_chart,
229225
aws_eks_addon.pod_identity_addon,
230226
aws_eks_pod_identity_association.association,
227+
data.external.clone_helm_chart,
231228
]
232229
}
233230

terraform/eks/daemon/credentials/pod_identity/providers.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@ provider "kubernetes" {
1717
}
1818

1919
provider "helm" {
20-
kubernetes {
20+
kubernetes = {
2121
host = aws_eks_cluster.this.endpoint
2222
cluster_ca_certificate = base64decode(aws_eks_cluster.this.certificate_authority.0.data)
23-
exec {
23+
exec = {
2424
api_version = "client.authentication.k8s.io/v1beta1"
2525
args = ["eks", "get-token", "--cluster-name", aws_eks_cluster.this.name]
2626
command = "aws"

0 commit comments

Comments
 (0)