Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
444 changes: 350 additions & 94 deletions charts/countly/values.schema.json

Large diffs are not rendered by default.

151 changes: 151 additions & 0 deletions devops/mongodb-debug/check-mongodb-health.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#!/bin/bash

# MongoDB Health Checker
# Detects whether MongoDB is choking under migration load.
#
# Requires: kubectl access to the target namespace, and mongosh inside the
# mongod container. Authenticates as the internal __system user with the
# automation keyfile, so no extra credentials are needed.

# ANSI color escape codes (consumed by `echo -e` throughout the script)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # reset to default

# Configuration — where to find the mongod process
NAMESPACE_MONGODB="mongodb"
MONGO_POD="app-mongodb-0"
MONGO_CONTAINER="mongod"
# Keyfile used to authenticate as __system — path matches the
# mongodb-mms-automation (Ops Manager / automation agent) layout.
KEYFILE="/var/lib/mongodb-mms-automation/authentication/keyfile"

# Thresholds
QUEUE_WARN=5 # queued ops before warning
QUEUE_CRIT=20 # queued ops before critical
DIRTY_WARN=20 # WiredTiger dirty cache % before warning
DIRTY_CRIT=40 # WiredTiger dirty cache % before critical
SLOW_OPS_WARN=3 # long-running ops (>5s) before warning

# Run a JavaScript snippet inside the MongoDB container via mongosh and print
# its full stdout (known noise lines filtered out).
# Arguments: $1 - JavaScript source to execute
# Outputs:   the script's print() output on stdout; empty on failure
mongosh_exec() {
  local script="$1"
  local encoded
  # Base64-encode to avoid all shell quoting issues when passing JS to the
  # container. `base64 | tr -d '\n'` instead of GNU-only `base64 -w0` so the
  # helper also works where coreutils base64 lacks -w (e.g. macOS/BSD).
  encoded=$(printf '%s' "$script" | base64 | tr -d '\n')
  kubectl exec -n "$NAMESPACE_MONGODB" "$MONGO_POD" -c "$MONGO_CONTAINER" -- \
    bash -c "echo '$encoded' | base64 -d > /tmp/_mhc.js && \
      mongosh --authenticationDatabase local -u __system -p \"\$(cat $KEYFILE)\" --quiet --norc /tmp/_mhc.js 2>&1; \
      rm -f /tmp/_mhc.js" \
    2>/dev/null | grep -v "Could not access" | grep -v "^$"
  # BUGFIX: this pipeline previously ended in `tail -1`, which silently
  # dropped all but the last line of multi-line script output and broke
  # callers that print more than one record (e.g. the currentOp listing).
}

# Report header banner.
printf '\n'
printf '%b\n' "${CYAN}========================================${NC}"
printf '%b\n' "${CYAN} MONGODB HEALTH CHECK${NC}"
printf '%b\n' "${CYAN}========================================${NC}"
printf '\n'

# ── 1. Server status: lock queue, connections, cache, opcounters ───────────
# The JS emits a single CSV line; field order must match the `read` below.
HEALTH_RAW=$(mongosh_exec "
var s = db.serverStatus();
var q = s.globalLock.currentQueue;
var c = s.connections;
var wt = s.wiredTiger.cache;
var maxCache = wt['maximum bytes configured'];
var dirtyPct = maxCache > 0
  ? (wt['tracked dirty bytes in the cache'] * 100 / maxCache).toFixed(1) : '0';
var cachePct = maxCache > 0
  ? (wt['bytes currently in the cache'] * 100 / maxCache).toFixed(1) : '0';
var opcnt = s.opcounters;
print([q.readers, q.writers, c.current, c.available, dirtyPct, cachePct,
  opcnt.insert, opcnt.query, opcnt.update, opcnt.delete].join(','));
")

if [ -z "$HEALTH_RAW" ] || ! echo "$HEALTH_RAW" | grep -qE '^[0-9]'; then
  echo -e "${RED}[X] Failed to get serverStatus${NC}"
else
  # Split the CSV once instead of spawning ten `cut` pipelines.
  IFS=',' read -r QUEUE_READERS QUEUE_WRITERS CONNS_CURRENT CONNS_AVAILABLE \
    DIRTY_PCT CACHE_PCT OPS_INSERT OPS_QUERY OPS_UPDATE OPS_DELETE <<< "$HEALTH_RAW"
  QUEUE_TOTAL=$((QUEUE_READERS + QUEUE_WRITERS))

  echo -e "${YELLOW}Global Lock Queue:${NC}"
  if [ "$QUEUE_TOTAL" -ge "$QUEUE_CRIT" ]; then
    echo -e "  Readers waiting: ${RED}$QUEUE_READERS${NC}"
    echo -e "  Writers waiting: ${RED}$QUEUE_WRITERS${NC}"
    echo -e "  ${RED}CRITICAL: $QUEUE_TOTAL operations queued — MongoDB is choking${NC}"
  elif [ "$QUEUE_TOTAL" -ge "$QUEUE_WARN" ]; then
    echo -e "  Readers waiting: ${YELLOW}$QUEUE_READERS${NC}"
    echo -e "  Writers waiting: ${YELLOW}$QUEUE_WRITERS${NC}"
    echo -e "  ${YELLOW}WARNING: $QUEUE_TOTAL operations queued${NC}"
  else
    echo -e "  Readers waiting: ${GREEN}$QUEUE_READERS${NC}"
    echo -e "  Writers waiting: ${GREEN}$QUEUE_WRITERS${NC}"
  fi

  echo ""
  echo -e "${YELLOW}Connections:${NC}"
  echo -e "  Current:   $CONNS_CURRENT"
  echo -e "  Available: $CONNS_AVAILABLE"

  echo ""
  echo -e "${YELLOW}WiredTiger Cache:${NC}"
  # BUGFIX: truncate the dirty % to an integer with parameter expansion.
  # The old `echo "$DIRTY_PCT / 1" | bc` produced an EMPTY string when bc
  # choked on the input (the `|| echo 0` fallback only fires when bc itself
  # exits non-zero), which then broke the `[ -ge ]` numeric comparisons.
  DIRTY_INT=${DIRTY_PCT%%.*}
  [[ "$DIRTY_INT" =~ ^[0-9]+$ ]] || DIRTY_INT=0
  if [ "$DIRTY_INT" -ge "$DIRTY_CRIT" ]; then
    echo -e "  Used:  ${CACHE_PCT}%"
    echo -e "  Dirty: ${RED}${DIRTY_PCT}%${NC} ← ${RED}CRITICAL: write pressure, eviction may stall operations${NC}"
  elif [ "$DIRTY_INT" -ge "$DIRTY_WARN" ]; then
    echo -e "  Used:  ${CACHE_PCT}%"
    echo -e "  Dirty: ${YELLOW}${DIRTY_PCT}%${NC} ← ${YELLOW}WARNING: elevated write pressure${NC}"
  else
    echo -e "  Used:  ${CACHE_PCT}%"
    echo -e "  Dirty: ${GREEN}${DIRTY_PCT}%${NC}"
  fi

  echo ""
  echo -e "${YELLOW}Operation Counters (cumulative since start):${NC}"
  echo -e "  Inserts: $OPS_INSERT"
  echo -e "  Queries: $OPS_QUERY"
  echo -e "  Updates: $OPS_UPDATE"
  echo -e "  Deletes: $OPS_DELETE"
fi

# ── 2. Long-running operations on countly_drill ────────────────────────────
# (renumbered: this is the second section — there is no section between.)
echo ""
echo -e "${YELLOW}Slow Operations on countly_drill (>5s):${NC}"
# BUGFIX: the JS previously joined ops with '\n' and printed MULTIPLE lines,
# but mongosh_exec's output filtering kept only one line, so the shell parsed
# an op record as the count. Emit everything on ONE line instead:
#   <count>|<secs>,<op>,<ns>;<secs>,<op>,<ns>;...
SLOW_RAW=$(mongosh_exec "
var ops = db.currentOp({ secs_running: { \$gt: 5 }, ns: /^countly_drill/ });
var items = ops.inprog.map(function(op){
  return op.secs_running + 's,' + op.op + ',' + op.ns;
}).join(';');
print(ops.inprog.length + '|' + items);
")

# print_slow_items: render the ';'-separated op list, one op per line.
print_slow_items() {
  local list="$1" item secs op ns
  IFS=';' read -r -a _slow_items <<< "$list"
  for item in "${_slow_items[@]}"; do
    IFS=',' read -r secs op ns <<< "$item"
    [ -n "$secs" ] && echo -e "    ${secs} ${op} ${ns}"
  done
}

SLOW_COUNT=${SLOW_RAW%%|*}
SLOW_LIST=${SLOW_RAW#*|}
if [ -z "$SLOW_RAW" ] || ! [[ "$SLOW_COUNT" =~ ^[0-9]+$ ]]; then
  echo -e "  ${RED}Unable to get currentOp${NC}"
elif [ "$SLOW_COUNT" -eq 0 ]; then
  echo -e "  ${GREEN}None${NC}"
elif [ "$SLOW_COUNT" -ge "$SLOW_OPS_WARN" ]; then
  echo -e "  ${YELLOW}WARNING: $SLOW_COUNT slow ops running${NC}"
  print_slow_items "$SLOW_LIST"
else
  echo -e "  $SLOW_COUNT slow op(s) running:"
  print_slow_items "$SLOW_LIST"
fi

# Report footer with generation timestamp.
printf '\n'
printf '%b\n' "${CYAN}========================================${NC}"
printf '%b\n' "${BLUE}Report generated at: $(date '+%Y-%m-%d %H:%M:%S')${NC}"
printf '%b\n' "${CYAN}========================================${NC}"
printf '\n'
18 changes: 18 additions & 0 deletions devops/mongodb-debug/mongodb-debug-pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Throwaway debug pod: mounts the MongoDB data PVC so an operator can inspect
# the volume contents with standard Ubuntu tooling via `kubectl exec`.
apiVersion: v1
kind: Pod
metadata:
  name: mongodb-debug
  namespace: mongodb
spec:
  restartPolicy: Never # one-shot utility pod; do not restart on exit
  containers:
    - name: shell
      image: ubuntu:22.04
      # Keep the container alive for a day so exec sessions work;
      # delete the pod when done.
      command: ["/bin/bash", "-c", "sleep 1d"]
      volumeMounts:
        - mountPath: /data
          name: data
  volumes:
    - name: data
      persistentVolumeClaim:
        # NOTE(review): the health-check script targets pod app-mongodb-0,
        # while this claim follows a countly-mongodb-0 naming scheme —
        # confirm this is the intended replica's data PVC.
        claimName: data-volume-countly-mongodb-0
15 changes: 15 additions & 0 deletions devops/mongodb-debug/new-pv.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Static PersistentVolume template for re-attaching an existing GCE PD to the
# MongoDB PVC. Replace every <...> placeholder before applying.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-<customer name>-data
spec:
  capacity:
    storage: 900Gi # MUST match PVC requested storage size
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  # Must match the PVC's storageClassName exactly so the PVC statically binds
  # to this PV instead of triggering dynamic provisioning.
  # (Previous comment claimed this should be empty — it contradicted the value.)
  storageClassName: "premium-rwo"
  csi:
    driver: pd.csi.storage.gke.io
    # volumeHandle format: projects/PROJECT/zones/ZONE/disks/DISKNAME
    # Required field — without it the PV is rejected by the API server.
    volumeHandle: projects/<project>/zones/<zone>/disks/<disk name>
    fsType: ext4 # MUST match the filesystem type expected by the application
13 changes: 13 additions & 0 deletions devops/mongodb-debug/new-pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# PVC that statically binds to the pre-created PV (see new-pv.yaml).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: data-volume-countly-mongodb-0 # MUST match operator expected PVC name
  namespace: mongodb
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 900Gi # MUST match the capacity of the statically created PV
  volumeName: pv-<customer name>-data # static bind the PV we created
  storageClassName: "premium-rwo" # must match the PV's storageClassName for binding
7 changes: 7 additions & 0 deletions environments/reference/countly-tls.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Countly TLS Certificate Configuration - Template
# Reference template: copy this file into your environment's directory and update with real values

# Base64 encoded TLS certificate (full chain)
TLS_CRT=
# Base64 encoded TLS private key
TLS_KEY=
23 changes: 23 additions & 0 deletions profiles/sizing/tier1/clickhouse.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Profile: tier1 — ClickHouse chart sizing
# Machine: e2-standard-16 (16 CPU / 64 GB RAM)
#
# Single-node layout: 1 shard x 1 replica, with anti-affinity disabled so the
# pods can co-schedule on the single machine.

shards: 1
replicas: 1

server:
  resources:
    # Memory request equals limit — no memory overcommit for the server pod.
    requests: { cpu: "1500m", memory: "11Gi" }
    limits: { cpu: "2", memory: "11Gi" }
  persistence:
    size: 200Gi
  scheduling:
    antiAffinity:
      enabled: false

# Single Keeper (coordination service) instance — no quorum at this tier.
keeper:
  replicas: 1
  resources:
    requests: { cpu: "1", memory: "2.5Gi" }
    limits: { cpu: "1", memory: "2.5Gi" }
  persistence:
    size: 10Gi
51 changes: 51 additions & 0 deletions profiles/sizing/tier1/countly.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Profile: tier1 — Countly chart sizing
# Machine: e2-standard-16 (16 CPU / 64 GB RAM)
# Validated: yes — Perf Test #2 (2026-Jan)
# Capacity: ~90 DP/s / 233M DP/month
#
# Single-replica everywhere: HPAs are pinned to 1, PDB and anti-affinity are
# disabled so all components fit on the one machine.

# Per-component Node.js heap flags (passed as NODE_OPTIONS).
nodeOptions:
  aggregator: "--max-old-space-size=2048 --max-semi-space-size=256"
  api: "--max-old-space-size=2048 --max-semi-space-size=256"
  frontend: "--max-old-space-size=1024"

aggregator:
  replicaCount: 1
  resources:
    requests: { cpu: "1", memory: "4Gi" }
    limits: { cpu: "1500m", memory: "4.5Gi" }
  hpa:
    minReplicas: 1
    maxReplicas: 1
  pdb:
    enabled: false
  scheduling:
    antiAffinity:
      enabled: false

api:
  replicaCount: 1
  resources:
    requests: { cpu: "500m", memory: "2.5Gi" }
    limits: { cpu: "1", memory: "2.5Gi" }
  hpa:
    # NOTE(review): only maxReplicas is pinned here (aggregator pins both) —
    # confirm the chart's default minReplicas is 1.
    maxReplicas: 1

frontend:
  replicaCount: 1
  resources:
    requests: { cpu: "500m", memory: "1.5Gi" }
    limits: { cpu: "1", memory: "1.5Gi" }

ingestor:
  replicaCount: 1
  resources:
    requests: { cpu: "750m", memory: "3.5Gi" }
    limits: { cpu: "1", memory: "4Gi" }
  hpa:
    maxReplicas: 1

jobserver:
  replicaCount: 1
  resources:
    requests: { cpu: "500m", memory: "2.5Gi" }
    limits: { cpu: "1", memory: "2.5Gi" }
53 changes: 53 additions & 0 deletions profiles/sizing/tier1/kafka.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Profile: tier1 — Kafka chart sizing
# Machine: e2-standard-16 (16 CPU / 64 GB RAM)
#
# 2 brokers + 1 controller. Replication factor 2, min.insync.replicas 1
# (allows writes when one broker is unavailable on a 2-broker cluster).
#
# NOTE: kafkaConnect resources here will be overridden by the kafka-connect
# dimension profile (throughput/balanced/low-latency). The values below
# reflect the tier1-validated connect worker sizing; pair with a matching
# kafka-connect profile if you need to honour them exactly.

brokers:
  replicas: 2
  resources:
    requests: { cpu: "500m", memory: "4.5Gi" }
    limits: { cpu: "750m", memory: "4.5Gi" }
  jvmOptions:
    # Fixed heap (Xms == Xmx) below the container memory limit, leaving
    # headroom for off-heap usage and the page cache.
    xms: "2560m"
    xmx: "2560m"
  persistence:
    volumes:
      - id: 0
        size: 100Gi
  config:
    # Durability defaults sized for the 2-broker cluster (see header note).
    default.replication.factor: 2
    min.insync.replicas: 1
    offsets.topic.replication.factor: 2
    transaction.state.log.replication.factor: 2
    transaction.state.log.min.isr: 1

controllers:
  replicas: 1
  resources:
    requests: { cpu: "500m", memory: "2.5Gi" }
    limits: { cpu: "1", memory: "2.5Gi" }
  persistence:
    size: 10Gi

# Cruise Control disabled for the tier1 footprint.
cruiseControl:
  enabled: false

kafkaConnect:
  replicas: 1
  resources:
    requests: { cpu: "1", memory: "2Gi" }
    limits: { cpu: "1", memory: "2Gi" }
  jvmOptions:
    xms: "1g"
    xmx: "1g"
  workerConfig:
    # Connect's internal topics replicate to both brokers.
    config.storage.replication.factor: 2
    offset.storage.replication.factor: 2
    status.storage.replication.factor: 2
13 changes: 13 additions & 0 deletions profiles/sizing/tier1/mongodb.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Profile: tier1 — MongoDB chart sizing
# Machine: e2-standard-16 (16 CPU / 64 GB RAM)
#
# Single replica-set member; anti-affinity disabled so the pod can schedule
# on the single machine.

mongodb:
  members: 1
  resources:
    requests: { cpu: "1500m", memory: "11Gi" }
    limits: { cpu: "2", memory: "11Gi" }
  persistence:
    size: 200Gi
  scheduling:
    antiAffinity:
      enabled: false
Loading
Loading