Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions crates/auths-api/src/domains/agents/handlers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,20 @@ pub async fn authorize_operation(
#[allow(clippy::disallowed_methods)] // INVARIANT: HTTP handler boundary
let now = chrono::Utc::now();

// Validate clock skew (±5 minutes)
let time_diff = {
let duration = now.signed_duration_since(req.timestamp);
duration.num_seconds().unsigned_abs()
};
if time_diff > 300 {
return Err((StatusCode::BAD_REQUEST, "Clock skew too large".to_string()));
}

let service = AgentService::new(state.registry, state.persistence);
let resp = service
.authorize(&req.agent_did, &req.capability, now)
.map_err(|e| (StatusCode::UNAUTHORIZED, e))?;
.authorize(&req.agent_did, &req.capability, now, req.timestamp)
.map_err(|e| {
let error_msg = e.to_string();
// Clock skew is a request validation error (400)
// Authorization failures are authorization errors (401)
let status = if error_msg.contains("Clock skew") {
StatusCode::BAD_REQUEST
} else {
StatusCode::UNAUTHORIZED
};
(status, error_msg)
})?;

Ok((StatusCode::OK, Json(resp)))
}
Expand Down
57 changes: 57 additions & 0 deletions crates/auths-deployment/config/sentinel.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Redis Sentinel Configuration Template
# Production-grade 3-instance Sentinel cluster for auths-api
# See: docs/PRODUCTION_REDIS_HA.md for deployment guides

# Bind to all interfaces (override in deployment)
bind 0.0.0.0
protected-mode no

# Sentinel port (default 26379)
port 26379

# Sentinel working directory
dir ./

# Master name (referenced by clients)
# All 3 Sentinels must use the same name
sentinel monitor mymaster 127.0.0.1 6379 2

# Time in milliseconds before Sentinel considers master unreachable
# After this time, if a majority of Sentinels agree, auto-failover begins
# Recommended: 30s for auths-api (balance between detection time and false positives)
sentinel down-after-milliseconds mymaster 30000

# Number of replicas to reconfigure in parallel during failover
# Set to 1 to avoid traffic spikes during switchover
sentinel parallel-syncs mymaster 1

# Failover timeout: how long to wait before giving up
# Should be at least 3x down_after_milliseconds
sentinel failover-timeout mymaster 120000

# Sentinel logging
loglevel notice
logfile ""

# Deny dangerous commands (scripting, config modification)
sentinel deny-scripts-reconfig yes

# Authentication (if Redis requires password)
# Uncomment and set for production:
# sentinel auth-pass mymaster your-redis-password

# Sentinel quorum for starting auto-failover
# With 3 Sentinels, quorum=2 means any 2 can trigger failover
# (This is implicitly 2 from the "sentinel monitor" command above)

# Notification script on failure detection (optional)
# Called when failover starts: script will be called
# sentinel notification-script mymaster /path/to/notification-script.sh

# Configuration propagation script (optional)
# Called after failover to reconfigure replicas
# sentinel client-reconfig-script mymaster /path/to/client-reconfig-script.sh

# For testing only: allow Sentinel runtime script reconfiguration
# (this directive governs SENTINEL SET / script reconfig, not SHUTDOWN).
# Redis uses the LAST occurrence of a directive, so leaving this active
# would silently override the production `deny-scripts-reconfig yes`
# set earlier in this file. Uncomment for local testing only:
# sentinel deny-scripts-reconfig no
148 changes: 148 additions & 0 deletions crates/auths-deployment/scripts/backup-redis-aof.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#!/bin/bash
# Automated Redis AOF backup to S3
# Usage: AWS_REGION=us-east-1 ./backup-redis-aof.sh [redis-host] [redis-port]
#
# Cron job (2am UTC daily):
# 0 2 * * * cd /app && AWS_REGION=us-east-1 ./backup-redis-aof.sh localhost 6379 >> /var/log/redis-backup.log 2>&1

# -e: abort on error; -u: abort on unset variables; -o pipefail: a pipeline
# fails if ANY stage fails (plain `set -e` would let `redis-cli ... | tail -1`
# succeed even when redis-cli itself failed).
set -euo pipefail

# Configuration
REDIS_HOST=${1:-localhost}
REDIS_PORT=${2:-6379}
AWS_REGION=${AWS_REGION:-us-east-1}
S3_BUCKET="${S3_BUCKET:-auths-redis-backups}"
BACKUP_RETENTION_DAYS=30
MAX_BACKUP_SIZE_MB=1000 # Alert if > 1GB

# Derived variables
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_NAME="redis-aof-${TIMESTAMP}.aof.gz"
LOCAL_AOF_PATH="/tmp/redis-aof-${TIMESTAMP}.aof"
COMPRESSED_AOF_PATH="${LOCAL_AOF_PATH}.gz"
S3_KEY="backups/${BACKUP_NAME}"
S3_URI="s3://${S3_BUCKET}/${S3_KEY}"

# Color output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Log helpers. The timestamp is evaluated per call; the original captured it
# once at startup, so every log line carried the script's start time.
log_prefix() { date '+[%Y-%m-%d %H:%M:%S]'; }
log_info() { echo -e "${GREEN}$(log_prefix)${NC} $*"; }
log_warn() { echo -e "${YELLOW}$(log_prefix)${NC} $*"; }
log_error() { echo -e "${RED}$(log_prefix)${NC} $*"; exit 1; }

# Remove temporary copies even when a step aborts mid-way (previously the
# /tmp files leaked whenever log_error fired before the cleanup step ran).
cleanup() { rm -f "$LOCAL_AOF_PATH" "$COMPRESSED_AOF_PATH"; }
trap cleanup EXIT

# Shorthand for talking to the target Redis instance
rcli() { redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" "$@"; }

# === Step 1: Verify Redis connectivity ===
log_info "Verifying Redis connectivity ($REDIS_HOST:$REDIS_PORT)..."
if ! rcli ping >/dev/null 2>&1; then
    log_error "Redis not reachable at $REDIS_HOST:$REDIS_PORT"
fi
log_info "Redis reachable ✓"

# === Step 2: Trigger AOF rewrite so the backup is a compacted file ===
log_info "Triggering AOF rewrite (compaction)..."
if ! rcli BGREWRITEAOF >/dev/null 2>&1; then
    log_warn "AOF rewrite failed (may already be in progress)"
fi

# Wait for the rewrite to complete (max ~30s), then proceed either way
sleep 2
log_info "Waiting for AOF rewrite..."
REWRITE_DONE=0
for _ in {1..15}; do
    # Capture INFO output instead of piping into `grep -q`: grep -q exits as
    # soon as it matches, which can SIGPIPE redis-cli and (under pipefail)
    # make a successful check look like a failure.
    if [[ "$(rcli info persistence)" == *"aof_rewrite_in_progress:0"* ]]; then
        REWRITE_DONE=1
        log_info "AOF rewrite completed"
        break
    fi
    sleep 2
done
if [[ $REWRITE_DONE -ne 1 ]]; then
    log_warn "AOF rewrite still in progress after 30s; backing up current AOF anyway"
fi

# === Step 3: Get AOF file location ===
# CONFIG GET returns "key" then "value"; tail -1 extracts the value.
# NOTE(review): assumes a single-file AOF. Redis 7's multi-part AOF lives
# under the `appenddirname` directory — confirm the target Redis version.
log_info "Locating AOF file..."
REDIS_AOF_PATH=$(rcli config get appendfilename | tail -1)
REDIS_DIR=$(rcli config get dir | tail -1)
FULL_AOF_PATH="${REDIS_DIR}/${REDIS_AOF_PATH}"

log_info "AOF file: $FULL_AOF_PATH"
if [[ ! -f "$FULL_AOF_PATH" ]]; then
    log_error "AOF file not found at $FULL_AOF_PATH"
fi

# === Step 4: Copy and compress AOF ===
log_info "Copying AOF to temporary location..."
cp "$FULL_AOF_PATH" "$LOCAL_AOF_PATH"

log_info "Compressing AOF..."
gzip -f "$LOCAL_AOF_PATH"

# Check backup size (stat -f%z is BSD/macOS; stat -c%s is GNU/Linux)
BACKUP_SIZE_MB=$(($(stat -f%z "$COMPRESSED_AOF_PATH" 2>/dev/null || stat -c%s "$COMPRESSED_AOF_PATH") / 1024 / 1024))
log_info "Compressed AOF size: ${BACKUP_SIZE_MB}MB"

if [[ $BACKUP_SIZE_MB -gt $MAX_BACKUP_SIZE_MB ]]; then
    log_warn "ALERT: Backup size (${BACKUP_SIZE_MB}MB) exceeds threshold (${MAX_BACKUP_SIZE_MB}MB)"
fi

# === Step 5: Upload to S3 (STANDARD_IA = infrequent-access storage class) ===
log_info "Uploading to S3: $S3_URI"
if ! aws s3 cp "$COMPRESSED_AOF_PATH" "$S3_URI" \
    --region "$AWS_REGION" \
    --storage-class STANDARD_IA \
    --metadata "timestamp=${TIMESTAMP},redis-host=${REDIS_HOST},backup-size=${BACKUP_SIZE_MB}MB" \
    2>&1; then
    log_error "S3 upload failed for $S3_URI"
fi
log_info "✓ Backup uploaded to S3"

# === Step 6: Cleanup local temporary files (also covered by the EXIT trap) ===
log_info "Cleaning up temporary files..."
rm -f "$COMPRESSED_AOF_PATH"

# === Step 7: Cleanup old S3 backups (retention policy) ===
log_info "Applying retention policy (keeping ${BACKUP_RETENTION_DAYS} days)..."
# GNU date syntax first, BSD/macOS `-v` syntax as the fallback
CUTOFF_DATE=$(date -u -d "${BACKUP_RETENTION_DAYS} days ago" +%Y-%m-%d 2>/dev/null || date -u -v-${BACKUP_RETENTION_DAYS}d +%Y-%m-%d)

# List backups older than the cutoff
OLD_BACKUPS=$(aws s3api list-objects-v2 \
    --bucket "$S3_BUCKET" \
    --prefix "backups/" \
    --region "$AWS_REGION" \
    --query "Contents[?LastModified<'${CUTOFF_DATE}T00:00:00Z'].Key" \
    --output text 2>/dev/null || echo "")

# With --output text an empty query result is the literal string "None";
# the original looped over it and tried to delete a key named "None".
if [[ -n "$OLD_BACKUPS" && "$OLD_BACKUPS" != "None" ]]; then
    log_info "Deleting old backups..."
    for key in $OLD_BACKUPS; do
        log_info "  Deleting: $key"
        aws s3 rm "s3://${S3_BUCKET}/${key}" --region "$AWS_REGION" 2>/dev/null || true
    done
fi

# === Step 8: Log success ===
log_info "✓ Backup completed successfully"
log_info "Summary:"
log_info "  Timestamp: $TIMESTAMP"
log_info "  Size: ${BACKUP_SIZE_MB}MB"
log_info "  Location: $S3_URI"
log_info "  Redis: $REDIS_HOST:$REDIS_PORT"

# === Step 9: CloudWatch metric (optional, best-effort) ===
if command -v aws >/dev/null 2>&1; then
    log_info "Publishing CloudWatch metrics..."
    aws cloudwatch put-metric-data \
        --namespace "auths/redis" \
        --metric-name "backup-size-mb" \
        --value "$BACKUP_SIZE_MB" \
        --region "$AWS_REGION" \
        2>/dev/null || log_warn "Failed to publish metrics"

    aws cloudwatch put-metric-data \
        --namespace "auths/redis" \
        --metric-name "backup-success" \
        --value 1 \
        --region "$AWS_REGION" \
        2>/dev/null || true
fi

exit 0
Loading
Loading