Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
444 changes: 350 additions & 94 deletions charts/countly/values.schema.json

Large diffs are not rendered by default.

151 changes: 151 additions & 0 deletions devops/mongodb-debug/check-mongodb-health.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#!/bin/bash

# MongoDB Health Checker
# Detects whether MongoDB is choking under migration load.
#
# Requires: kubectl access to the target namespace, and mongosh inside the
# mongod container. Authenticates as the internal __system user with the
# automation keyfile, so no extra credentials are needed.

# ANSI color escape codes (consumed by `echo -e` throughout the script)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # reset to default

# Configuration — where to find the mongod process
NAMESPACE_MONGODB="mongodb"
MONGO_POD="app-mongodb-0"
MONGO_CONTAINER="mongod"
# Keyfile used to authenticate as __system — path matches the
# mongodb-mms-automation (Ops Manager / automation agent) layout.
KEYFILE="/var/lib/mongodb-mms-automation/authentication/keyfile"

# Thresholds
QUEUE_WARN=5 # queued ops before warning
QUEUE_CRIT=20 # queued ops before critical
DIRTY_WARN=20 # WiredTiger dirty cache % before warning
DIRTY_CRIT=40 # WiredTiger dirty cache % before critical
SLOW_OPS_WARN=3 # long-running ops (>5s) before warning

# Run a JavaScript snippet inside the MongoDB container via mongosh and print
# its full stdout (known noise lines filtered out).
# Arguments: $1 - JavaScript source to execute
# Outputs:   the script's print() output on stdout; empty on failure
mongosh_exec() {
  local script="$1"
  local encoded
  # Base64-encode to avoid all shell quoting issues when passing JS to the
  # container. `base64 | tr -d '\n'` instead of GNU-only `base64 -w0` so the
  # helper also works where coreutils base64 lacks -w (e.g. macOS/BSD).
  encoded=$(printf '%s' "$script" | base64 | tr -d '\n')
  kubectl exec -n "$NAMESPACE_MONGODB" "$MONGO_POD" -c "$MONGO_CONTAINER" -- \
    bash -c "echo '$encoded' | base64 -d > /tmp/_mhc.js && \
      mongosh --authenticationDatabase local -u __system -p \"\$(cat $KEYFILE)\" --quiet --norc /tmp/_mhc.js 2>&1; \
      rm -f /tmp/_mhc.js" \
    2>/dev/null | grep -v "Could not access" | grep -v "^$"
  # BUGFIX: this pipeline previously ended in `tail -1`, which silently
  # dropped all but the last line of multi-line script output and broke
  # callers that print more than one record (e.g. the currentOp listing).
}

# Report header banner.
printf '\n'
printf '%b\n' "${CYAN}========================================${NC}"
printf '%b\n' "${CYAN} MONGODB HEALTH CHECK${NC}"
printf '%b\n' "${CYAN}========================================${NC}"
printf '\n'

# ── 1. Server status: lock queue, connections, cache, opcounters ───────────
# The JS emits a single CSV line; field order must match the `read` below.
HEALTH_RAW=$(mongosh_exec "
var s = db.serverStatus();
var q = s.globalLock.currentQueue;
var c = s.connections;
var wt = s.wiredTiger.cache;
var maxCache = wt['maximum bytes configured'];
var dirtyPct = maxCache > 0
  ? (wt['tracked dirty bytes in the cache'] * 100 / maxCache).toFixed(1) : '0';
var cachePct = maxCache > 0
  ? (wt['bytes currently in the cache'] * 100 / maxCache).toFixed(1) : '0';
var opcnt = s.opcounters;
print([q.readers, q.writers, c.current, c.available, dirtyPct, cachePct,
  opcnt.insert, opcnt.query, opcnt.update, opcnt.delete].join(','));
")

if [ -z "$HEALTH_RAW" ] || ! echo "$HEALTH_RAW" | grep -qE '^[0-9]'; then
  echo -e "${RED}[X] Failed to get serverStatus${NC}"
else
  # Split the CSV once instead of spawning ten `cut` pipelines.
  IFS=',' read -r QUEUE_READERS QUEUE_WRITERS CONNS_CURRENT CONNS_AVAILABLE \
    DIRTY_PCT CACHE_PCT OPS_INSERT OPS_QUERY OPS_UPDATE OPS_DELETE <<< "$HEALTH_RAW"
  QUEUE_TOTAL=$((QUEUE_READERS + QUEUE_WRITERS))

  echo -e "${YELLOW}Global Lock Queue:${NC}"
  if [ "$QUEUE_TOTAL" -ge "$QUEUE_CRIT" ]; then
    echo -e "  Readers waiting: ${RED}$QUEUE_READERS${NC}"
    echo -e "  Writers waiting: ${RED}$QUEUE_WRITERS${NC}"
    echo -e "  ${RED}CRITICAL: $QUEUE_TOTAL operations queued — MongoDB is choking${NC}"
  elif [ "$QUEUE_TOTAL" -ge "$QUEUE_WARN" ]; then
    echo -e "  Readers waiting: ${YELLOW}$QUEUE_READERS${NC}"
    echo -e "  Writers waiting: ${YELLOW}$QUEUE_WRITERS${NC}"
    echo -e "  ${YELLOW}WARNING: $QUEUE_TOTAL operations queued${NC}"
  else
    echo -e "  Readers waiting: ${GREEN}$QUEUE_READERS${NC}"
    echo -e "  Writers waiting: ${GREEN}$QUEUE_WRITERS${NC}"
  fi

  echo ""
  echo -e "${YELLOW}Connections:${NC}"
  echo -e "  Current:   $CONNS_CURRENT"
  echo -e "  Available: $CONNS_AVAILABLE"

  echo ""
  echo -e "${YELLOW}WiredTiger Cache:${NC}"
  # BUGFIX: truncate the dirty % to an integer with parameter expansion.
  # The old `echo "$DIRTY_PCT / 1" | bc` produced an EMPTY string when bc
  # choked on the input (the `|| echo 0` fallback only fires when bc itself
  # exits non-zero), which then broke the `[ -ge ]` numeric comparisons.
  DIRTY_INT=${DIRTY_PCT%%.*}
  [[ "$DIRTY_INT" =~ ^[0-9]+$ ]] || DIRTY_INT=0
  if [ "$DIRTY_INT" -ge "$DIRTY_CRIT" ]; then
    echo -e "  Used:  ${CACHE_PCT}%"
    echo -e "  Dirty: ${RED}${DIRTY_PCT}%${NC} ← ${RED}CRITICAL: write pressure, eviction may stall operations${NC}"
  elif [ "$DIRTY_INT" -ge "$DIRTY_WARN" ]; then
    echo -e "  Used:  ${CACHE_PCT}%"
    echo -e "  Dirty: ${YELLOW}${DIRTY_PCT}%${NC} ← ${YELLOW}WARNING: elevated write pressure${NC}"
  else
    echo -e "  Used:  ${CACHE_PCT}%"
    echo -e "  Dirty: ${GREEN}${DIRTY_PCT}%${NC}"
  fi

  echo ""
  echo -e "${YELLOW}Operation Counters (cumulative since start):${NC}"
  echo -e "  Inserts: $OPS_INSERT"
  echo -e "  Queries: $OPS_QUERY"
  echo -e "  Updates: $OPS_UPDATE"
  echo -e "  Deletes: $OPS_DELETE"
fi

# ── 2. Long-running operations on countly_drill ────────────────────────────
# (renumbered: this is the second section — there is no section between.)
echo ""
echo -e "${YELLOW}Slow Operations on countly_drill (>5s):${NC}"
# BUGFIX: the JS previously joined ops with '\n' and printed MULTIPLE lines,
# but mongosh_exec's output filtering kept only one line, so the shell parsed
# an op record as the count. Emit everything on ONE line instead:
#   <count>|<secs>,<op>,<ns>;<secs>,<op>,<ns>;...
SLOW_RAW=$(mongosh_exec "
var ops = db.currentOp({ secs_running: { \$gt: 5 }, ns: /^countly_drill/ });
var items = ops.inprog.map(function(op){
  return op.secs_running + 's,' + op.op + ',' + op.ns;
}).join(';');
print(ops.inprog.length + '|' + items);
")

# print_slow_items: render the ';'-separated op list, one op per line.
print_slow_items() {
  local list="$1" item secs op ns
  IFS=';' read -r -a _slow_items <<< "$list"
  for item in "${_slow_items[@]}"; do
    IFS=',' read -r secs op ns <<< "$item"
    [ -n "$secs" ] && echo -e "    ${secs} ${op} ${ns}"
  done
}

SLOW_COUNT=${SLOW_RAW%%|*}
SLOW_LIST=${SLOW_RAW#*|}
if [ -z "$SLOW_RAW" ] || ! [[ "$SLOW_COUNT" =~ ^[0-9]+$ ]]; then
  echo -e "  ${RED}Unable to get currentOp${NC}"
elif [ "$SLOW_COUNT" -eq 0 ]; then
  echo -e "  ${GREEN}None${NC}"
elif [ "$SLOW_COUNT" -ge "$SLOW_OPS_WARN" ]; then
  echo -e "  ${YELLOW}WARNING: $SLOW_COUNT slow ops running${NC}"
  print_slow_items "$SLOW_LIST"
else
  echo -e "  $SLOW_COUNT slow op(s) running:"
  print_slow_items "$SLOW_LIST"
fi

# Report footer with generation timestamp.
printf '\n'
printf '%b\n' "${CYAN}========================================${NC}"
printf '%b\n' "${BLUE}Report generated at: $(date '+%Y-%m-%d %H:%M:%S')${NC}"
printf '%b\n' "${CYAN}========================================${NC}"
printf '\n'
18 changes: 18 additions & 0 deletions devops/mongodb-debug/mongodb-debug-pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Throwaway debug pod: mounts the MongoDB data PVC so an operator can inspect
# the volume contents with standard Ubuntu tooling via `kubectl exec`.
apiVersion: v1
kind: Pod
metadata:
  name: mongodb-debug
  namespace: mongodb
spec:
  restartPolicy: Never # one-shot utility pod; do not restart on exit
  containers:
    - name: shell
      image: ubuntu:22.04
      # Keep the container alive for a day so exec sessions work;
      # delete the pod when done.
      command: ["/bin/bash", "-c", "sleep 1d"]
      volumeMounts:
        - mountPath: /data
          name: data
  volumes:
    - name: data
      persistentVolumeClaim:
        # NOTE(review): the health-check script targets pod app-mongodb-0,
        # while this claim follows a countly-mongodb-0 naming scheme —
        # confirm this is the intended replica's data PVC.
        claimName: data-volume-countly-mongodb-0
15 changes: 15 additions & 0 deletions devops/mongodb-debug/new-pv.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Static PersistentVolume template for re-attaching an existing GCE PD to the
# MongoDB PVC. Replace every <...> placeholder before applying.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-<customer name>-data
spec:
  capacity:
    storage: 900Gi # MUST match PVC requested storage size
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  # Must match the PVC's storageClassName exactly so the PVC statically binds
  # to this PV instead of triggering dynamic provisioning.
  # (Previous comment claimed this should be empty — it contradicted the value.)
  storageClassName: "premium-rwo"
  csi:
    driver: pd.csi.storage.gke.io
    # volumeHandle format: projects/PROJECT/zones/ZONE/disks/DISKNAME
    # Required field — without it the PV is rejected by the API server.
    volumeHandle: projects/<project>/zones/<zone>/disks/<disk name>
    fsType: ext4 # MUST match the filesystem type expected by the application
13 changes: 13 additions & 0 deletions devops/mongodb-debug/new-pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# PVC that statically binds to the pre-created PV (see new-pv.yaml).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: data-volume-countly-mongodb-0 # MUST match operator expected PVC name
  namespace: mongodb
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 900Gi # MUST match the capacity of the statically created PV
  volumeName: pv-<customer name>-data # static bind the PV we created
  storageClassName: "premium-rwo" # must match the PV's storageClassName for binding
7 changes: 7 additions & 0 deletions environments/reference/countly-tls.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Countly TLS Certificate Configuration - Template
# Reference template: copy this file into your environment's directory and update with real values

# Base64 encoded TLS certificate (full chain)
TLS_CRT=
# Base64 encoded TLS private key
TLS_KEY=
23 changes: 23 additions & 0 deletions profiles/sizing/tier1/clickhouse.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Profile: tier1 — ClickHouse chart sizing
# Machine: e2-standard-16 (16 CPU / 64 GB RAM)
#
# Single-node layout: 1 shard x 1 replica, with anti-affinity disabled so the
# pods can co-schedule on the single machine.

shards: 1
replicas: 1

server:
  resources:
    # Memory request equals limit — no memory overcommit for the server pod.
    requests: { cpu: "1500m", memory: "11Gi" }
    limits: { cpu: "2", memory: "11Gi" }
  persistence:
    size: 200Gi
  scheduling:
    antiAffinity:
      enabled: false

# Single Keeper (coordination service) instance — no quorum at this tier.
keeper:
  replicas: 1
  resources:
    requests: { cpu: "1", memory: "2.5Gi" }
    limits: { cpu: "1", memory: "2.5Gi" }
  persistence:
    size: 10Gi
51 changes: 51 additions & 0 deletions profiles/sizing/tier1/countly.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Profile: tier1 — Countly chart sizing
# Machine: e2-standard-16 (16 CPU / 64 GB RAM)
# Validated: yes — Perf Test #2 (2026-Jan)
# Capacity: ~90 DP/s / 233M DP/month
#
# Single-replica everywhere: HPAs are pinned to 1, PDB and anti-affinity are
# disabled so all components fit on the one machine.

# Per-component Node.js heap flags (passed as NODE_OPTIONS).
nodeOptions:
  aggregator: "--max-old-space-size=2048 --max-semi-space-size=256"
  api: "--max-old-space-size=2048 --max-semi-space-size=256"
  frontend: "--max-old-space-size=1024"

aggregator:
  replicaCount: 1
  resources:
    requests: { cpu: "1", memory: "4Gi" }
    limits: { cpu: "1500m", memory: "4.5Gi" }
  hpa:
    minReplicas: 1
    maxReplicas: 1
  pdb:
    enabled: false
  scheduling:
    antiAffinity:
      enabled: false

api:
  replicaCount: 1
  resources:
    requests: { cpu: "500m", memory: "2.5Gi" }
    limits: { cpu: "1", memory: "2.5Gi" }
  hpa:
    # NOTE(review): only maxReplicas is pinned here (aggregator pins both) —
    # confirm the chart's default minReplicas is 1.
    maxReplicas: 1

frontend:
  replicaCount: 1
  resources:
    requests: { cpu: "500m", memory: "1.5Gi" }
    limits: { cpu: "1", memory: "1.5Gi" }

ingestor:
  replicaCount: 1
  resources:
    requests: { cpu: "750m", memory: "3.5Gi" }
    limits: { cpu: "1", memory: "4Gi" }
  hpa:
    maxReplicas: 1

jobserver:
  replicaCount: 1
  resources:
    requests: { cpu: "500m", memory: "2.5Gi" }
    limits: { cpu: "1", memory: "2.5Gi" }
53 changes: 53 additions & 0 deletions profiles/sizing/tier1/kafka.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Profile: tier1 — Kafka chart sizing
# Machine: e2-standard-16 (16 CPU / 64 GB RAM)
#
# 2 brokers + 1 controller. Replication factor 2, min.insync.replicas 1
# (allows writes when one broker is unavailable on a 2-broker cluster).
#
# NOTE: kafkaConnect resources here will be overridden by the kafka-connect
# dimension profile (throughput/balanced/low-latency). The values below
# reflect the tier1-validated connect worker sizing; pair with a matching
# kafka-connect profile if you need to honour them exactly.

brokers:
  replicas: 2
  resources:
    requests: { cpu: "500m", memory: "4.5Gi" }
    limits: { cpu: "750m", memory: "4.5Gi" }
  jvmOptions:
    # Fixed heap (Xms == Xmx) below the container memory limit, leaving
    # headroom for off-heap usage and the page cache.
    xms: "2560m"
    xmx: "2560m"
  persistence:
    volumes:
      - id: 0
        size: 100Gi
  config:
    # Durability defaults sized for the 2-broker cluster (see header note).
    default.replication.factor: 2
    min.insync.replicas: 1
    offsets.topic.replication.factor: 2
    transaction.state.log.replication.factor: 2
    transaction.state.log.min.isr: 1

controllers:
  replicas: 1
  resources:
    requests: { cpu: "500m", memory: "2.5Gi" }
    limits: { cpu: "1", memory: "2.5Gi" }
  persistence:
    size: 10Gi

# Cruise Control disabled for the tier1 footprint.
cruiseControl:
  enabled: false

kafkaConnect:
  replicas: 1
  resources:
    requests: { cpu: "1", memory: "2Gi" }
    limits: { cpu: "1", memory: "2Gi" }
  jvmOptions:
    xms: "1g"
    xmx: "1g"
  workerConfig:
    # Connect's internal topics replicate to both brokers.
    config.storage.replication.factor: 2
    offset.storage.replication.factor: 2
    status.storage.replication.factor: 2
13 changes: 13 additions & 0 deletions profiles/sizing/tier1/mongodb.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Profile: tier1 — MongoDB chart sizing
# Machine: e2-standard-16 (16 CPU / 64 GB RAM)
#
# Single replica-set member; anti-affinity disabled so the pod can schedule
# on the single machine.

mongodb:
  members: 1
  resources:
    requests: { cpu: "1500m", memory: "11Gi" }
    limits: { cpu: "2", memory: "11Gi" }
  persistence:
    size: 200Gi
  scheduling:
    antiAffinity:
      enabled: false
Loading
Loading