Skip to content

Commit eba49bc

Browse files
committed
Add diagnostic logging to Discord bot advisory lock for production debugging
1 parent 51bfe8c commit eba49bc

File tree

2 files changed

+120
-30
lines changed

2 files changed

+120
-30
lines changed

packages/internal/src/db/advisory-lock.ts

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,21 @@ export type AdvisoryLockId = (typeof ADVISORY_LOCK_IDS)[keyof typeof ADVISORY_LO
1414

1515
const HEALTH_CHECK_INTERVAL_MS = 10_000 // 10 seconds
1616

17+
// Diagnostic logging helper with timestamp and process info
18+
function logLock(level: 'info' | 'error' | 'warn', message: string, data?: Record<string, unknown>): void {
19+
const timestamp = new Date().toISOString()
20+
const pid = process.pid
21+
const prefix = `[${timestamp}] [PID:${pid}] [advisory-lock]`
22+
const dataStr = data ? ` ${JSON.stringify(data)}` : ''
23+
if (level === 'error') {
24+
console.error(`${prefix} ${message}${dataStr}`)
25+
} else if (level === 'warn') {
26+
console.warn(`${prefix} ${message}${dataStr}`)
27+
} else {
28+
console.log(`${prefix} ${message}${dataStr}`)
29+
}
30+
}
31+
1732
export interface LockHandle {
1833
/** Register a callback to be called if the lock is lost (connection dies) */
1934
onLost(callback: () => void): void
@@ -33,28 +48,38 @@ export async function tryAcquireAdvisoryLock(lockId: AdvisoryLockId): Promise<{
3348
acquired: boolean
3449
handle: LockHandle | null
3550
}> {
51+
logLock('info', 'Attempting to acquire advisory lock', { lockId })
52+
3653
const connection = postgres(env.DATABASE_URL, {
3754
max: 1,
3855
idle_timeout: 0,
3956
connect_timeout: 10,
4057
})
4158

4259
try {
60+
logLock('info', 'Database connection established, attempting pg_try_advisory_lock')
4361
const result = await connection`SELECT pg_try_advisory_lock(${lockId}) as acquired`
4462
const acquired = result[0]?.acquired === true
4563

64+
logLock('info', 'Lock acquisition result', { acquired, lockId })
65+
4666
if (!acquired) {
67+
logLock('info', 'Lock not acquired (held by another process), closing connection')
4768
await connection.end()
4869
return { acquired: false, handle: null }
4970
}
5071

72+
logLock('info', 'Lock acquired successfully, setting up lock handle', { lockId })
73+
5174
// Create the lock handle
5275
let lostCallback: (() => void) | null = null
5376
let isReleased = false
5477
let healthCheckTimer: ReturnType<typeof setInterval> | null = null
78+
let healthCheckCount = 0
5579

5680
const triggerLost = () => {
5781
if (isReleased) return
82+
logLock('warn', 'Lock lost detected, triggering lost callback', { lockId, healthCheckCount })
5883
if (healthCheckTimer) {
5984
clearInterval(healthCheckTimer)
6085
healthCheckTimer = null
@@ -70,6 +95,7 @@ export async function tryAcquireAdvisoryLock(lockId: AdvisoryLockId): Promise<{
7095
// Start health check interval - verify we still hold the lock, not just connection liveness
7196
healthCheckTimer = setInterval(async () => {
7297
if (isReleased) return
98+
healthCheckCount++
7399
try {
74100
// Query pg_locks to verify we still hold this specific advisory lock
75101
// This catches cases where the lock was lost but connection stayed alive
@@ -85,11 +111,14 @@ export async function tryAcquireAdvisoryLock(lockId: AdvisoryLockId): Promise<{
85111
`
86112
const stillHeld = result[0]?.held === true
87113
if (!stillHeld) {
88-
console.error('Advisory lock health check failed - lock no longer held')
114+
logLock('error', 'Advisory lock health check failed - lock no longer held', { lockId, healthCheckCount })
89115
triggerLost()
116+
} else if (healthCheckCount % 6 === 0) {
117+
// Log every minute (6 * 10s) to confirm we're still running
118+
logLock('info', 'Advisory lock health check passed', { lockId, healthCheckCount, uptimeMinutes: healthCheckCount / 6 })
90119
}
91-
} catch {
92-
console.error('Advisory lock health check failed - connection lost')
120+
} catch (error) {
121+
logLock('error', 'Advisory lock health check failed - connection lost', { lockId, healthCheckCount, error: String(error) })
93122
triggerLost()
94123
}
95124
}, HEALTH_CHECK_INTERVAL_MS)
@@ -99,22 +128,36 @@ export async function tryAcquireAdvisoryLock(lockId: AdvisoryLockId): Promise<{
99128
lostCallback = callback
100129
},
101130
async release() {
102-
if (isReleased) return
131+
if (isReleased) {
132+
logLock('info', 'Lock release called but already released', { lockId })
133+
return
134+
}
135+
logLock('info', 'Releasing advisory lock', { lockId, healthCheckCount })
103136
isReleased = true
104137
if (healthCheckTimer) {
105138
clearInterval(healthCheckTimer)
106139
healthCheckTimer = null
107140
}
141+
try {
142+
// Explicitly release the advisory lock before closing connection
143+
logLock('info', 'Calling pg_advisory_unlock', { lockId })
144+
await connection`SELECT pg_advisory_unlock(${lockId})`
145+
logLock('info', 'Advisory lock released via pg_advisory_unlock', { lockId })
146+
} catch (error) {
147+
logLock('error', 'Error during pg_advisory_unlock (continuing to close connection)', { lockId, error: String(error) })
148+
}
108149
try {
109150
await connection.end()
151+
logLock('info', 'Database connection closed', { lockId })
110152
} catch (error) {
111-
console.error('Error releasing advisory lock:', error)
153+
logLock('error', 'Error closing database connection', { lockId, error: String(error) })
112154
}
113155
},
114156
}
115157

116158
return { acquired: true, handle }
117159
} catch (error) {
160+
logLock('error', 'Error during lock acquisition', { lockId, error: String(error) })
118161
await connection.end().catch(() => {})
119162
throw error
120163
}

web/scripts/discord/index.ts

Lines changed: 72 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import {
22
ADVISORY_LOCK_IDS,
33
tryAcquireAdvisoryLock,
44
} from '@codebuff/internal/db'
5+
import os from 'os'
56

67
import { startDiscordBot } from '../../src/discord/client'
78

@@ -15,45 +16,83 @@ let lockHandle: LockHandle | null = null
1516
let discordClient: Client | null = null
1617
let isShuttingDown = false
1718

19+
// Diagnostic logging helper with timestamp and process info
20+
function log(level: 'info' | 'error' | 'warn', message: string, data?: Record<string, unknown>): void {
21+
const timestamp = new Date().toISOString()
22+
const pid = process.pid
23+
const hostname = os.hostname()
24+
const prefix = `[${timestamp}] [PID:${pid}] [host:${hostname}] [discord-bot]`
25+
const dataStr = data ? ` ${JSON.stringify(data)}` : ''
26+
if (level === 'error') {
27+
console.error(`${prefix} ${message}${dataStr}`)
28+
} else if (level === 'warn') {
29+
console.warn(`${prefix} ${message}${dataStr}`)
30+
} else {
31+
console.log(`${prefix} ${message}${dataStr}`)
32+
}
33+
}
34+
1835
function sleep(ms: number): Promise<void> {
1936
return new Promise((resolve) => setTimeout(resolve, ms))
2037
}
2138

22-
async function shutdown(exitCode: number = 0) {
23-
if (isShuttingDown) return
39+
async function shutdown(exitCode: number = 0): Promise<void> {
40+
if (isShuttingDown) {
41+
log('warn', 'Shutdown already in progress, ignoring duplicate call')
42+
return
43+
}
2444
isShuttingDown = true
2545

26-
console.log('Shutting down Discord bot...')
46+
log('info', 'Shutting down Discord bot...', { exitCode })
2747

2848
if (discordClient) {
2949
try {
50+
log('info', 'Destroying Discord client...')
3051
discordClient.destroy()
52+
log('info', 'Discord client destroyed')
3153
} catch (error) {
32-
console.error('Error destroying Discord client:', error)
54+
log('error', 'Error destroying Discord client', { error: String(error) })
3355
}
3456
discordClient = null
3557
}
3658

3759
if (lockHandle) {
60+
log('info', 'Releasing advisory lock...')
3861
await lockHandle.release()
62+
log('info', 'Advisory lock released')
3963
lockHandle = null
4064
}
4165

66+
log('info', 'Shutdown complete, exiting', { exitCode })
4267
process.exit(exitCode)
4368
}
4469

4570
async function main() {
46-
process.on('SIGTERM', () => shutdown(0))
47-
process.on('SIGINT', () => shutdown(0))
71+
const startTime = Date.now()
72+
log('info', 'Discord bot script starting', {
73+
pid: process.pid,
74+
hostname: os.hostname(),
75+
nodeVersion: process.version,
76+
platform: process.platform,
77+
})
78+
79+
// Handle SIGTERM/SIGINT - shutdown() handles deduplication internally
80+
process.on('SIGTERM', () => {
81+
log('info', 'Received SIGTERM signal')
82+
void shutdown(0)
83+
})
84+
process.on('SIGINT', () => {
85+
log('info', 'Received SIGINT signal')
86+
void shutdown(0)
87+
})
4888

4989
let consecutiveErrors = 0
5090
let attemptCount = 0
5191

5292
while (!isShuttingDown) {
5393
attemptCount++
54-
console.log(
55-
`Attempting to acquire Discord bot lock (attempt ${attemptCount})...`,
56-
)
94+
const elapsedSec = Math.round((Date.now() - startTime) / 1000)
95+
log('info', `Attempting to acquire Discord bot lock`, { attemptCount, elapsedSeconds: elapsedSec })
5796

5897
let acquired = false
5998
let handle: LockHandle | null = null
@@ -63,75 +102,83 @@ async function main() {
63102
acquired = result.acquired
64103
handle = result.handle
65104
consecutiveErrors = 0 // Reset on successful DB connection
105+
log('info', 'Lock acquisition attempt completed', { acquired, consecutiveErrors })
66106
} catch (error) {
67107
consecutiveErrors++
68-
console.error(
69-
`Error acquiring lock (${consecutiveErrors}/${MAX_CONSECUTIVE_ERRORS}):`,
70-
error,
71-
)
108+
log('error', `Error acquiring lock`, {
109+
consecutiveErrors,
110+
maxErrors: MAX_CONSECUTIVE_ERRORS,
111+
error: String(error),
112+
})
72113

73114
if (consecutiveErrors >= MAX_CONSECUTIVE_ERRORS) {
74-
console.error('Too many consecutive errors, exiting...')
115+
log('error', 'Too many consecutive errors, exiting...')
75116
await shutdown(1)
76117
return
77118
}
78119

120+
log('info', `Will retry in ${LOCK_RETRY_INTERVAL_MS / 1000} seconds...`)
79121
await sleep(LOCK_RETRY_INTERVAL_MS)
80122
continue
81123
}
82124

83125
if (!acquired || !handle) {
84-
console.log(
85-
`Another instance is already running the Discord bot. Retrying in ${LOCK_RETRY_INTERVAL_MS / 1000} seconds...`,
86-
)
126+
log('info', `Another instance is already running the Discord bot`, {
127+
retryInSeconds: LOCK_RETRY_INTERVAL_MS / 1000,
128+
})
87129
await sleep(LOCK_RETRY_INTERVAL_MS)
88130
continue
89131
}
90132

91133
lockHandle = handle
92-
console.log('Lock acquired. Starting Discord bot...')
134+
log('info', 'Lock acquired! Starting Discord bot...')
93135

94136
// Set up lock loss handler BEFORE starting the bot
95137
handle.onLost(() => {
96-
console.error('Advisory lock lost! Another instance may have taken over.')
138+
log('error', 'Advisory lock lost! Another instance may have taken over.')
97139
shutdown(1)
98140
})
99141

100142
try {
101143
// Wait for bot to be ready - this is critical!
102144
// If login fails, we release the lock so another instance can try
145+
log('info', 'Calling startDiscordBot()...')
103146
discordClient = await startDiscordBot()
104-
console.log('Discord bot is ready and running.')
147+
log('info', 'Discord bot is ready and running!', {
148+
uptime: Math.round((Date.now() - startTime) / 1000),
149+
})
105150

106151
// Set up error handler for runtime errors
107152
discordClient.on('error', (error) => {
108-
console.error('Discord client error:', error)
153+
log('error', 'Discord client error', { error: String(error) })
109154
})
110155

111156
// Handle disconnection
112157
discordClient.on('disconnect', () => {
113-
console.error('Discord client disconnected')
158+
log('error', 'Discord client disconnected')
114159
})
115160

116161
// Bot is running, keep the process alive
162+
// Note: heartbeat logging is handled by advisory-lock health checks
117163
return
118164
} catch (error) {
119-
console.error('Failed to start Discord bot:', error)
165+
log('error', 'Failed to start Discord bot', { error: String(error) })
120166

121167
// Release the lock so another instance can try
168+
log('info', 'Releasing lock after failed bot start...')
122169
await handle.release()
123170
lockHandle = null
124171
discordClient = null
125172

126173
// Continue polling - maybe another instance will have better luck,
127174
// or maybe the issue is transient (Discord outage)
128-
console.log(`Will retry in ${LOCK_RETRY_INTERVAL_MS / 1000} seconds...`)
175+
log('info', `Will retry in ${LOCK_RETRY_INTERVAL_MS / 1000} seconds...`)
129176
await sleep(LOCK_RETRY_INTERVAL_MS)
130177
}
131178
}
132179
}
133180

134181
main().catch(async (error) => {
135-
console.error('Fatal error in Discord bot script:', error)
182+
log('error', 'Fatal error in Discord bot script', { error: String(error), stack: (error as Error).stack })
136183
await shutdown(1)
137184
})

0 commit comments

Comments
 (0)