Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions tf-sysdig/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Local Terraform directories
.terraform/

# NOTE: .terraform.lock.hcl is intentionally NOT ignored. Terraform recommends
# committing the dependency lock file so that every run selects the same
# provider versions.

# Terraform plan files
*.tfplan

# Crash logs
crash.log

# Terraform state files
*.tfstate
*.tfstate.*

# Override files
override.tf
override.tf.json
*_override.tf
*_override.tf.json

# Sensitive variable files
*.tfvars
*.tfvars.json

# Terraform variable environment files
# (already matched by *.tfvars above; listed explicitly for clarity)
.terraform.tfvars
terraform.tfvars

# Sensitive provider configuration files
# (already matched by *.tfvars above; listed explicitly for clarity)
provider.tfvars

# Generated files by IDEs or OS
*.DS_Store
*.log
*.bak
*.swp
43 changes: 43 additions & 0 deletions tf-sysdig/alerts.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Notification channel that receives every alert defined below.
# The default keeps the previously hard-coded channel id, so existing
# behaviour is unchanged; override it (e.g. in terraform.tfvars) to route
# alerts to a different channel.
# TODO: create the channel with Terraform and reference its id instead.
variable "notification_channel_id" {
  description = "ID of the Sysdig Monitor notification channel that alerts are sent to."
  type        = number
  default     = 238924
}

# Define Sysdig alerts with detailed attributes for all environments and metrics.
# One alert is created per entry of local.all_metrics; the map key becomes
# the resource instance key.
resource "sysdig_monitor_alert_v2_metric" "pod_metrics" {
  for_each = local.all_metrics

  name        = each.value.name
  description = each.value.description
  enabled     = each.value.enabled
  severity    = each.value.severity

  # Alert condition: <group_aggregation>(<time_aggregation>(metric)) <operator> threshold,
  # evaluated over the last range_seconds seconds.
  metric            = each.value.metric
  group_aggregation = each.value.group_aggregation
  time_aggregation  = each.value.time_aggregation
  operator          = each.value.operator
  threshold         = each.value.threshold
  range_seconds     = each.value.range_seconds

  # Restrict the alert to the namespace of the entry being iterated.
  # A plain block is sufficient because there is exactly one scope filter.
  scope {
    label    = "kube_namespace_name"
    operator = "equals"
    values   = [each.value.namespace_name]
  }

  notification_channels {
    id                     = var.notification_channel_id
    renotify_every_minutes = 60
  }

  custom_notification {
    subject = each.value.notification_subject
    prepend = "Alert Details:"
    append  = "Please check the system immediately."
  }
}
141 changes: 141 additions & 0 deletions tf-sysdig/locals.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
locals {
  # Scope filters for a single cluster / namespace / deployment.
  # NOTE(review): no reference to local.common_scope is visible in this
  # change, and it is pinned to the dev namespace - confirm it is still needed.
  common_scope = [
    {
      label    = "kube_cluster_name"
      operator = "equals"
      values   = ["gold"]
    },
    {
      label    = "kube_namespace_name"
      operator = "equals"
      values   = ["abc123-dev"]
    },
    {
      label    = "kube_deployment_name"
      operator = "equals"
      values   = ["project-wordpress"]
    }
  ]

  # Environment name => Kubernetes namespace.
  environments = {
    dev  = "abc123-dev"
    test = "abc123-test"
    prod = "abc123-prod"
  }

  # Alert templates, keyed by a human-readable title. Each template is
  # expanded once per environment in all_metrics below.
  pod_metrics = {
    "Pod CPU Usage" = {
      metric               = "sysdig_program_cpu_cores_used_percent"
      description          = "Alert when 'sysdig_program_cpu_cores_used_percent' exceeds threshold"
      enabled              = true
      severity             = "high"
      group_aggregation    = "avg"
      time_aggregation     = "avg"
      operator             = ">"
      threshold            = 80
      notification_subject = "Pod CPU Usage Alert Status"
      range_seconds        = 60
    }

    "Pod Memory Usage" = {
      metric               = "sysdig_program_memory_used_percent"
      description          = "Alert when 'sysdig_program_memory_used_percent' exceeds threshold"
      enabled              = true
      severity             = "high"
      group_aggregation    = "avg"
      time_aggregation     = "avg"
      operator             = ">"
      threshold            = 80
      notification_subject = "Pod Memory Usage Alert Status"
      range_seconds        = 60
    }

    "Pod Restarts" = {
      metric               = "kube_pod_sysdig_restart_count"
      description          = "Alert when 'kube_pod_sysdig_restart_count' exceeds threshold"
      enabled              = true
      severity             = "high"
      group_aggregation    = "max"
      time_aggregation     = "avg"
      operator             = ">"
      threshold            = 5
      notification_subject = "Pod Restart Alert Status"
      range_seconds        = 300
    }

    "HTTP Error Count" = {
      metric               = "sysdig_container_net_http_error_count"
      description          = "Alert when 'sysdig_container_net_http_error_count' exceeds the threshold"
      enabled              = true
      severity             = "high"
      group_aggregation    = "avg"
      time_aggregation     = "avg"
      operator             = ">"
      threshold            = 25
      notification_subject = "Pod HTTP Error Count Alert"
      range_seconds        = 300
    }

    "Replica Count Below Minimum" = {
      metric               = "kube_deployment_status_replicas"
      description          = "Alert when 'kube_deployment_status_replicas' falls below the threshold"
      enabled              = true
      severity             = "high"
      group_aggregation    = "avg"
      time_aggregation     = "avg"
      operator             = "<"
      threshold            = 3
      notification_subject = "Replica Count Alert"
      range_seconds        = 60
    }

    "Pod Ready Status" = {
      metric               = "kube_pod_sysdig_status_ready"
      description          = "Alert when 'kube_pod_sysdig_status_ready' falls below the threshold"
      enabled              = true
      severity             = "high"
      group_aggregation    = "avg"
      time_aggregation     = "avg"
      operator             = "<"
      threshold            = 1
      notification_subject = "Pod Ready Status Alert"
      range_seconds        = 60
    }

    # Same metric and condition as "Pod Ready Status", evaluated over a
    # 300-second window instead of 60 seconds.
    "Pod Unready Status" = {
      metric               = "kube_pod_sysdig_status_ready"
      description          = "Alert when 'kube_pod_sysdig_status_ready' is unready for more than 5 minutes"
      enabled              = true
      severity             = "high"
      group_aggregation    = "avg"
      time_aggregation     = "avg"
      operator             = "<"
      threshold            = 1
      notification_subject = "Pod Unready Status Alert"
      range_seconds        = 300
    }
  }

  # Cross product of environments x pod_metrics, flattened into one map
  # keyed "<env>-<title>". Every entry carries all fields of its template
  # plus the per-environment alert name and target namespace.
  all_metrics = merge([
    for env_name, namespace in local.environments : {
      for title, spec in local.pod_metrics : "${env_name}-${title}" => merge(spec, {
        name           = "${env_name} - ${title}"
        namespace_name = namespace
      })
    }
  ]...)
}
21 changes: 21 additions & 0 deletions tf-sysdig/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Terraform settings: state backend and required providers.
terraform {
  # State is stored in a local file named terraform.tfstate.
  backend "local" {
    path = "terraform.tfstate"
  }

  required_providers {
    sysdig = {
      source  = "sysdiglabs/sysdig"
      version = ">=1.33.0"
    }
  }
}

# Sysdig provider configuration for Sysdig Monitor.
# The API token is read from the sysdig_api_token input variable.
provider "sysdig" {
  sysdig_monitor_api_token = var.sysdig_api_token
  sysdig_monitor_url       = "https://app.sysdigcloud.com"
}

# Sysdig Monitor API token used by the provider.
# Marked sensitive so Terraform redacts the value in plan/apply output.
# It has no default, so a value must be supplied (for example through
# terraform.tfvars).
variable "sysdig_api_token" {
  description = "Sysdig Monitor API token used to authenticate the sysdig provider."
  type        = string
  sensitive   = true
}
9 changes: 9 additions & 0 deletions tf-sysdig/notification_channel.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# # Define a notification channel
# resource "sysdig_monitor_notification_channel_email" "tf_sre" {
# name = "TF SRE"
# recipients = ["chris@bashbang.com"]
# enabled = true
# notify_when_ok = true
# notify_when_resolved = true
# send_test_notification = true
# }
9 changes: 9 additions & 0 deletions tf-sysdig/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# How to use

## This is currently a WIP and should be used only as a sample and a launching pad to expand on.

Terraform v1.5.5
on darwin_amd64

terraform init
terraform apply
2 changes: 2 additions & 0 deletions tf-sysdig/terraform.tfvars.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# This token is a PAT from my user profile. In my case it was found at https://app.sysdigcloud.com/#/settings/user under "Sysdig Monitor API".
sysdig_api_token = "{THIS_IS_THE_GENERATED_TOKEN_FROM_SYSDIG}"
2 changes: 2 additions & 0 deletions utility-pod/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ RUN apt-get -y install \
htop \
iperf \
iputils-ping \
jq \
lynx \
mysql-client \
nmap \
Expand All @@ -27,6 +28,7 @@ RUN apt-get -y install \
unzip \
vim \
wget \
yq \
&& \
rm -rf /var/lib/apt/lists/*

Expand Down
Loading