fix(web): make deploys safe for the native better-sqlite3 dependency

Restore the dependency-change guard that got overwritten on main, and harden
the deploy + worker shutdown so a flaky better-sqlite3 rebuild can no longer
take the site down.

Root cause of the recurring outages: tssbot-web is the only stack with a
native module (better-sqlite3) that must be downloaded/compiled on every
`npm ci`. The deploy ran `npm ci` unconditionally (the skip guard had been
reverted), with no timeout, and `npm ci` deletes node_modules first -- so a
single hung/failed native rebuild left the site unstartable, and a PM2
cluster restart on top wedged the daemon.

webhook.cjs:
- Restore the npm-ci skip guard: only reinstall when package.json /
  package-lock.json actually changed (previousHead captured before the pull),
  so code-only pushes never rebuild better-sqlite3. Defaults to installing on
  any uncertainty, and still installs if node_modules is incomplete.
- Add per-step timeouts to run() (DEPLOY_STEP_TIMEOUT_MS, and a tighter
  DEPLOY_INSTALL_TIMEOUT_MS for npm ci) so a stalled step is killed instead of
  hanging for hours with node_modules already deleted.
- Gate the deploy on better-sqlite3 actually loading (child-process load, not
  just require.resolve): force a reinstall when its native binary is missing,
  and abort before pm2 reload if it is still broken after install.

server.cjs:
- On shutdown, closeIdleConnections() + delayed closeAllConnections() so a
  worker stop/reload can't hang the full kill_timeout on idle keep-alives or a
  stuck upstream request.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Liam
2026-07-01 22:23:57 +00:00
parent 2c34e9ad4a
commit 1fee214785
2 changed files with 105 additions and 5 deletions
+9
View File
@@ -3508,6 +3508,15 @@ function shutdown() {
process.exit(0)
})
// server.close() only fires its callback once every socket is gone, and idle
// HTTP keep-alive sockets (held open by nginx/Cloudflare) never close on
// their own — so without this the worker hangs the full kill_timeout on every
// stop/reload, which is what wedges the PM2 cluster daemon. Close idle sockets
// immediately, let in-flight requests finish for a short grace period, then
// force the rest so shutdown completes well inside kill_timeout.
server.closeIdleConnections()
setTimeout(() => server.closeAllConnections(), 3000).unref()
setTimeout(() => {
console.error('Graceful shutdown timed out')
process.exit(1)
+96 -5
View File
@@ -54,6 +54,12 @@ const PREVIOUS_DIST_DIR = path.join(__dirname, 'dist-previous')
const MAX_WEBHOOK_BODY_BYTES = Number(process.env.WEBHOOK_MAX_BODY_BYTES || 1024 * 1024)
const WEBHOOK_REQUEST_TIMEOUT_MS = Number(process.env.WEBHOOK_REQUEST_TIMEOUT_MS || 30000)
const WEBHOOK_HEADERS_TIMEOUT_MS = Number(process.env.WEBHOOK_HEADERS_TIMEOUT_MS || 10000)
// No deploy step may hang forever. A stalled `npm ci` (a native postinstall that
// never returns) would otherwise block for hours with node_modules already
// deleted — which is exactly what took the site down. These cap each step so a
// hang fails fast and aborts the deploy before any pm2 reload.
const DEPLOY_STEP_TIMEOUT_MS = Number(process.env.DEPLOY_STEP_TIMEOUT_MS || 15 * 60 * 1000)
const DEPLOY_INSTALL_TIMEOUT_MS = Number(process.env.DEPLOY_INSTALL_TIMEOUT_MS || 8 * 60 * 1000)
const ALLOWED_REFS = new Set(
(process.env.GITHUB_WEBHOOK_REFS || 'refs/heads/main')
.split(',')
@@ -269,19 +275,35 @@ function run(command, args, options = {}) {
stdio: 'inherit',
})
// Kill the step if it hangs so deploy() aborts before any pm2 reload instead
// of wedging here indefinitely (see DEPLOY_STEP_TIMEOUT_MS above).
const timeoutMs = Number(options.timeoutMs) > 0 ? Number(options.timeoutMs) : DEPLOY_STEP_TIMEOUT_MS
let timedOut = false
const timer = setTimeout(() => {
timedOut = true
console.error(`deploy step timed out after ${timeoutMs}ms, killing: ${label}`)
child.kill('SIGTERM')
setTimeout(() => child.kill('SIGKILL'), 5000).unref()
}, timeoutMs)
timer.unref()
child.on('error', (error) => {
clearTimeout(timer)
if (error.code === 'ENOENT') {
reject(new Error(commandNotFoundMessage(command)))
return
}
reject(error)
})
child.on('exit', (code) => {
if (code === 0) {
child.on('exit', (code, signal) => {
clearTimeout(timer)
if (timedOut) {
reject(new Error(`${label} timed out after ${timeoutMs}ms and was killed`))
} else if (code === 0) {
console.log(`deploy step completed: ${label}`)
resolve()
} else {
reject(new Error(`${label} exited with ${code}`))
reject(new Error(`${label} exited with ${code}${signal ? ` via ${signal}` : ''}`))
}
})
})
@@ -323,6 +345,11 @@ function runCapture(command, args, options = {}) {
})
}
// Files that, when changed, require a fresh `npm ci`. package.json is included
// because npm ci refuses to run against a package.json/lockfile that are out of
// sync, so a change to either should trigger a reinstall.
const DEPENDENCY_FILES = ['package.json', 'package-lock.json']
// Resolve the modules the build relies on. Returns the first missing module's
// name, or null if all are present. Used to validate after install.
function missingBuildDependency() {
@@ -345,7 +372,58 @@ function missingBuildDependency() {
return null
}
async function ensureBuildDependencies() {
// True when any dependency file changed between previousHead and the current
// HEAD. Defaults to true (reinstall) whenever we can't determine the range, so
// an uncertain state never skips a needed install.
async function dependencyFilesChanged(previousHead) {
if (!validGitSha(previousHead)) return true
try {
const changed = await runCapture('git', [
'diff',
'--name-only',
`${previousHead}..HEAD`,
'--',
...DEPENDENCY_FILES,
])
return changed.trim().length > 0
} catch {
return true
}
}
// True when the native better-sqlite3 addon actually LOADS. require.resolve()
// isn't enough — the package can exist while its compiled .node binary is
// missing, which is the "Could not locate the bindings file" crash that
// repeatedly took tssbot-web down. Load it in a child process so this reflects
// exactly what server.cjs does at startup.
async function betterSqliteLoads() {
try {
await run(
process.execPath,
['-e', "new (require('better-sqlite3'))(':memory:').close()"],
{ env: { NODE_ENV: 'production' }, timeoutMs: 60000 },
)
return true
} catch {
return false
}
}
async function ensureBuildDependencies(previousHead) {
const depsChanged = await dependencyFilesChanged(previousHead)
// Skip the clean reinstall (which wipes node_modules and rebuilds native
// modules like better-sqlite3 — the fragile step that took the site down) only
// when nothing dependency-related changed AND the existing node_modules already
// has everything the build and runtime need. A broken/missing better-sqlite3
// binary forces the reinstall so a skip can never leave a dead tree.
const canSkip =
!depsChanged && missingBuildDependency() === null && (await betterSqliteLoads())
if (canSkip) {
console.log('deploy step skipped: npm ci (no package.json/package-lock.json changes, node_modules intact)')
return
}
await run('npm', ['ci'], {
env: {
NODE_ENV: 'development',
@@ -354,12 +432,22 @@ async function ensureBuildDependencies() {
npm_config_fund: 'false',
npm_config_audit: 'false',
},
timeoutMs: DEPLOY_INSTALL_TIMEOUT_MS,
})
const missing = missingBuildDependency()
if (missing) {
throw new Error(`${missing} is missing after npm install; dependencies were not installed`)
}
// Hard gate: better-sqlite3 must actually load after the install, or abort the
// deploy here — before promoteBuiltDist()/pm2 reload — so a broken native build
// can never be promoted to the running workers (which still hold a good binary).
if (!(await betterSqliteLoads())) {
throw new Error(
'better-sqlite3 failed to load after npm ci — native binary missing or broken; aborting deploy before reload',
)
}
}
function copyMissingFiles(fromDir, toDir) {
@@ -608,9 +696,12 @@ async function deploy(push) {
try {
await notifyDeployStarted(push)
// Capture HEAD before the pull so we can tell whether this push actually
// changed dependency files (and thus needs a fresh npm ci).
const previousHead = await runCapture('git', ['rev-parse', 'HEAD']).catch(() => null)
await run('git', ['pull', '--ff-only'])
diff = await deployDiff(push)
await ensureBuildDependencies()
await ensureBuildDependencies(previousHead)
await run('npm', ['run', 'build', '--', '--outDir', '../dist-next'])
validateBuiltDist()
await run('cargo', ['build', '--manifest-path', 'backend/Cargo.toml', '--release'])