From 1fee214785fd43b7b6751f20780880f91fe9d17d Mon Sep 17 00:00:00 2001 From: Liam Date: Wed, 1 Jul 2026 22:23:57 +0000 Subject: [PATCH] fix(web): make deploys safe for the native better-sqlite3 dependency Restore the dependency-change guard that got overwritten on main, and harden the deploy + worker shutdown so a flaky better-sqlite3 rebuild can no longer take the site down. Root cause of the recurring outages: tssbot-web is the only stack with a native module (better-sqlite3) that must be downloaded/compiled on every `npm ci`. The deploy ran `npm ci` unconditionally (the skip guard had been reverted), with no timeout, and `npm ci` deletes node_modules first -- so a single hung/failed native rebuild left the site unstartable, and a PM2 cluster restart on top wedged the daemon. webhook.cjs: - Restore the npm-ci skip guard: only reinstall when package.json / package-lock.json actually changed (previousHead captured before the pull), so code-only pushes never rebuild better-sqlite3. Defaults to installing on any uncertainty, and still installs if node_modules is incomplete. - Add per-step timeouts to run() (DEPLOY_STEP_TIMEOUT_MS, and a tighter DEPLOY_INSTALL_TIMEOUT_MS for npm ci) so a stalled step is killed instead of hanging for hours with node_modules already deleted. - Gate the deploy on better-sqlite3 actually loading (child-process load, not just require.resolve): force a reinstall when its native binary is missing, and abort before pm2 reload if it is still broken after install. server.cjs: - On shutdown, closeIdleConnections() + delayed closeAllConnections() so a worker stop/reload can't hang the full kill_timeout on idle keep-alives or a stuck upstream request. Co-Authored-By: Claude Opus 4.8 --- server.cjs | 9 +++++ webhook.cjs | 101 +++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 105 insertions(+), 5 deletions(-) diff --git a/server.cjs b/server.cjs index f3e79a0..598fb68 100644 --- a/server.cjs +++ b/server.cjs @@ -3508,6 +3508,15 @@ function shutdown() { process.exit(0) }) + // server.close() only fires its callback once every socket is gone, and idle + // HTTP keep-alive sockets (held open by nginx/Cloudflare) never close on + // their own — so without this the worker hangs the full kill_timeout on every + // stop/reload, which is what wedges the PM2 cluster daemon. Close idle sockets + // immediately, let in-flight requests finish for a short grace period, then + // force the rest so shutdown completes well inside kill_timeout. + server.closeIdleConnections() + setTimeout(() => server.closeAllConnections(), 3000).unref() + setTimeout(() => { console.error('Graceful shutdown timed out') process.exit(1) diff --git a/webhook.cjs b/webhook.cjs index bf7484f..6ae6ce3 100644 --- a/webhook.cjs +++ b/webhook.cjs @@ -54,6 +54,12 @@ const PREVIOUS_DIST_DIR = path.join(__dirname, 'dist-previous') const MAX_WEBHOOK_BODY_BYTES = Number(process.env.WEBHOOK_MAX_BODY_BYTES || 1024 * 1024) const WEBHOOK_REQUEST_TIMEOUT_MS = Number(process.env.WEBHOOK_REQUEST_TIMEOUT_MS || 30000) const WEBHOOK_HEADERS_TIMEOUT_MS = Number(process.env.WEBHOOK_HEADERS_TIMEOUT_MS || 10000) +// No deploy step may hang forever. A stalled `npm ci` (a native postinstall that +// never returns) would otherwise block for hours with node_modules already +// deleted — which is exactly what took the site down. These cap each step so a +// hang fails fast and aborts the deploy before any pm2 reload. +const DEPLOY_STEP_TIMEOUT_MS = Number(process.env.DEPLOY_STEP_TIMEOUT_MS || 15 * 60 * 1000) +const DEPLOY_INSTALL_TIMEOUT_MS = Number(process.env.DEPLOY_INSTALL_TIMEOUT_MS || 8 * 60 * 1000) const ALLOWED_REFS = new Set( (process.env.GITHUB_WEBHOOK_REFS || 'refs/heads/main') .split(',') @@ -269,19 +275,35 @@ function run(command, args, options = {}) { stdio: 'inherit', }) + // Kill the step if it hangs so deploy() aborts before any pm2 reload instead + // of wedging here indefinitely (see DEPLOY_STEP_TIMEOUT_MS above). + const timeoutMs = Number(options.timeoutMs) > 0 ? Number(options.timeoutMs) : DEPLOY_STEP_TIMEOUT_MS + let timedOut = false + const timer = setTimeout(() => { + timedOut = true + console.error(`deploy step timed out after ${timeoutMs}ms, killing: ${label}`) + child.kill('SIGTERM') + setTimeout(() => child.kill('SIGKILL'), 5000).unref() + }, timeoutMs) + timer.unref() + child.on('error', (error) => { + clearTimeout(timer) if (error.code === 'ENOENT') { reject(new Error(commandNotFoundMessage(command))) return } reject(error) }) - child.on('exit', (code) => { - if (code === 0) { + child.on('exit', (code, signal) => { + clearTimeout(timer) + if (timedOut) { + reject(new Error(`${label} timed out after ${timeoutMs}ms and was killed`)) + } else if (code === 0) { console.log(`deploy step completed: ${label}`) resolve() } else { - reject(new Error(`${label} exited with ${code}`)) + reject(new Error(`${label} exited with ${code}${signal ? ` via ${signal}` : ''}`)) } }) }) @@ -323,6 +345,11 @@ function runCapture(command, args, options = {}) { }) } +// Files that, when changed, require a fresh `npm ci`. package.json is included +// because npm ci refuses to run against a package.json/lockfile that are out of +// sync, so a change to either should trigger a reinstall. +const DEPENDENCY_FILES = ['package.json', 'package-lock.json'] + // Resolve the modules the build relies on. Returns the first missing module's // name, or null if all are present. Used to validate after install. function missingBuildDependency() { @@ -345,7 +372,58 @@ function missingBuildDependency() { return null } -async function ensureBuildDependencies() { +// True when any dependency file changed between previousHead and the current +// HEAD. Defaults to true (reinstall) whenever we can't determine the range, so +// an uncertain state never skips a needed install. +async function dependencyFilesChanged(previousHead) { + if (!validGitSha(previousHead)) return true + try { + const changed = await runCapture('git', [ + 'diff', + '--name-only', + `${previousHead}..HEAD`, + '--', + ...DEPENDENCY_FILES, + ]) + return changed.trim().length > 0 + } catch { + return true + } +} + +// True when the native better-sqlite3 addon actually LOADS. require.resolve() +// isn't enough — the package can exist while its compiled .node binary is +// missing, which is the "Could not locate the bindings file" crash that +// repeatedly took tssbot-web down. Load it in a child process so this reflects +// exactly what server.cjs does at startup. +async function betterSqliteLoads() { + try { + await run( + process.execPath, + ['-e', "new (require('better-sqlite3'))(':memory:').close()"], + { env: { NODE_ENV: 'production' }, timeoutMs: 60000 }, + ) + return true + } catch { + return false + } +} + +async function ensureBuildDependencies(previousHead) { + const depsChanged = await dependencyFilesChanged(previousHead) + // Skip the clean reinstall (which wipes node_modules and rebuilds native + // modules like better-sqlite3 — the fragile step that took the site down) only + // when nothing dependency-related changed AND the existing node_modules already + // has everything the build and runtime need. A broken/missing better-sqlite3 + // binary forces the reinstall so a skip can never leave a dead tree. + const canSkip = + !depsChanged && missingBuildDependency() === null && (await betterSqliteLoads()) + + if (canSkip) { + console.log('deploy step skipped: npm ci (no package.json/package-lock.json changes, node_modules intact)') + return + } + await run('npm', ['ci'], { env: { NODE_ENV: 'development', @@ -354,12 +432,22 @@ async function ensureBuildDependencies() { npm_config_fund: 'false', npm_config_audit: 'false', }, + timeoutMs: DEPLOY_INSTALL_TIMEOUT_MS, }) const missing = missingBuildDependency() if (missing) { throw new Error(`${missing} is missing after npm install; dependencies were not installed`) } + + // Hard gate: better-sqlite3 must actually load after the install, or abort the + // deploy here — before promoteBuiltDist()/pm2 reload — so a broken native build + // can never be promoted to the running workers (which still hold a good binary). + if (!(await betterSqliteLoads())) { + throw new Error( + 'better-sqlite3 failed to load after npm ci — native binary missing or broken; aborting deploy before reload', + ) + } } function copyMissingFiles(fromDir, toDir) { @@ -608,9 +696,12 @@ async function deploy(push) { try { await notifyDeployStarted(push) + // Capture HEAD before the pull so we can tell whether this push actually + // changed dependency files (and thus needs a fresh npm ci). + const previousHead = await runCapture('git', ['rev-parse', 'HEAD']).catch(() => null) await run('git', ['pull', '--ff-only']) diff = await deployDiff(push) - await ensureBuildDependencies() + await ensureBuildDependencies(previousHead) await run('npm', ['run', 'build', '--', '--outDir', '../dist-next']) validateBuiltDist() await run('cargo', ['build', '--manifest-path', 'backend/Cargo.toml', '--release'])