fix(web): make deploys safe for the native better-sqlite3 dependency
Restore the dependency-change guard that got overwritten on main, and harden the deploy + worker shutdown so a flaky better-sqlite3 rebuild can no longer take the site down. Root cause of the recurring outages: tssbot-web is the only stack with a native module (better-sqlite3) that must be downloaded/compiled on every `npm ci`. The deploy ran `npm ci` unconditionally (the skip guard had been reverted), with no timeout, and `npm ci` deletes node_modules first -- so a single hung/failed native rebuild left the site unstartable, and a PM2 cluster restart on top wedged the daemon. webhook.cjs: - Restore the npm-ci skip guard: only reinstall when package.json / package-lock.json actually changed (previousHead captured before the pull), so code-only pushes never rebuild better-sqlite3. Defaults to installing on any uncertainty, and still installs if node_modules is incomplete. - Add per-step timeouts to run() (DEPLOY_STEP_TIMEOUT_MS, and a tighter DEPLOY_INSTALL_TIMEOUT_MS for npm ci) so a stalled step is killed instead of hanging for hours with node_modules already deleted. - Gate the deploy on better-sqlite3 actually loading (child-process load, not just require.resolve): force a reinstall when its native binary is missing, and abort before pm2 reload if it is still broken after install. server.cjs: - On shutdown, closeIdleConnections() + delayed closeAllConnections() so a worker stop/reload can't hang the full kill_timeout on idle keep-alives or a stuck upstream request. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -3508,6 +3508,15 @@ function shutdown() {
|
||||
process.exit(0)
|
||||
})
|
||||
|
||||
// server.close() only fires its callback once every socket is gone, and idle
|
||||
// HTTP keep-alive sockets (held open by nginx/Cloudflare) never close on
|
||||
// their own — so without this the worker hangs the full kill_timeout on every
|
||||
// stop/reload, which is what wedges the PM2 cluster daemon. Close idle sockets
|
||||
// immediately, let in-flight requests finish for a short grace period, then
|
||||
// force the rest so shutdown completes well inside kill_timeout.
|
||||
server.closeIdleConnections()
|
||||
setTimeout(() => server.closeAllConnections(), 3000).unref()
|
||||
|
||||
setTimeout(() => {
|
||||
console.error('Graceful shutdown timed out')
|
||||
process.exit(1)
|
||||
|
||||
+96
-5
@@ -54,6 +54,12 @@ const PREVIOUS_DIST_DIR = path.join(__dirname, 'dist-previous')
|
||||
const MAX_WEBHOOK_BODY_BYTES = Number(process.env.WEBHOOK_MAX_BODY_BYTES || 1024 * 1024)
|
||||
const WEBHOOK_REQUEST_TIMEOUT_MS = Number(process.env.WEBHOOK_REQUEST_TIMEOUT_MS || 30000)
|
||||
const WEBHOOK_HEADERS_TIMEOUT_MS = Number(process.env.WEBHOOK_HEADERS_TIMEOUT_MS || 10000)
|
||||
// No deploy step may hang forever. A stalled `npm ci` (a native postinstall that
|
||||
// never returns) would otherwise block for hours with node_modules already
|
||||
// deleted — which is exactly what took the site down. These cap each step so a
|
||||
// hang fails fast and aborts the deploy before any pm2 reload.
|
||||
const DEPLOY_STEP_TIMEOUT_MS = Number(process.env.DEPLOY_STEP_TIMEOUT_MS || 15 * 60 * 1000)
|
||||
const DEPLOY_INSTALL_TIMEOUT_MS = Number(process.env.DEPLOY_INSTALL_TIMEOUT_MS || 8 * 60 * 1000)
|
||||
const ALLOWED_REFS = new Set(
|
||||
(process.env.GITHUB_WEBHOOK_REFS || 'refs/heads/main')
|
||||
.split(',')
|
||||
@@ -269,19 +275,35 @@ function run(command, args, options = {}) {
|
||||
stdio: 'inherit',
|
||||
})
|
||||
|
||||
// Kill the step if it hangs so deploy() aborts before any pm2 reload instead
|
||||
// of wedging here indefinitely (see DEPLOY_STEP_TIMEOUT_MS above).
|
||||
const timeoutMs = Number(options.timeoutMs) > 0 ? Number(options.timeoutMs) : DEPLOY_STEP_TIMEOUT_MS
|
||||
let timedOut = false
|
||||
const timer = setTimeout(() => {
|
||||
timedOut = true
|
||||
console.error(`deploy step timed out after ${timeoutMs}ms, killing: ${label}`)
|
||||
child.kill('SIGTERM')
|
||||
setTimeout(() => child.kill('SIGKILL'), 5000).unref()
|
||||
}, timeoutMs)
|
||||
timer.unref()
|
||||
|
||||
child.on('error', (error) => {
|
||||
clearTimeout(timer)
|
||||
if (error.code === 'ENOENT') {
|
||||
reject(new Error(commandNotFoundMessage(command)))
|
||||
return
|
||||
}
|
||||
reject(error)
|
||||
})
|
||||
child.on('exit', (code) => {
|
||||
if (code === 0) {
|
||||
child.on('exit', (code, signal) => {
|
||||
clearTimeout(timer)
|
||||
if (timedOut) {
|
||||
reject(new Error(`${label} timed out after ${timeoutMs}ms and was killed`))
|
||||
} else if (code === 0) {
|
||||
console.log(`deploy step completed: ${label}`)
|
||||
resolve()
|
||||
} else {
|
||||
reject(new Error(`${label} exited with ${code}`))
|
||||
reject(new Error(`${label} exited with ${code}${signal ? ` via ${signal}` : ''}`))
|
||||
}
|
||||
})
|
||||
})
|
||||
@@ -323,6 +345,11 @@ function runCapture(command, args, options = {}) {
|
||||
})
|
||||
}
|
||||
|
||||
// Files that, when changed, require a fresh `npm ci`. package.json is included
|
||||
// because npm ci refuses to run against a package.json/lockfile that are out of
|
||||
// sync, so a change to either should trigger a reinstall.
|
||||
const DEPENDENCY_FILES = ['package.json', 'package-lock.json']
|
||||
|
||||
// Resolve the modules the build relies on. Returns the first missing module's
|
||||
// name, or null if all are present. Used to validate after install.
|
||||
function missingBuildDependency() {
|
||||
@@ -345,7 +372,58 @@ function missingBuildDependency() {
|
||||
return null
|
||||
}
|
||||
|
||||
async function ensureBuildDependencies() {
|
||||
// True when any dependency file changed between previousHead and the current
|
||||
// HEAD. Defaults to true (reinstall) whenever we can't determine the range, so
|
||||
// an uncertain state never skips a needed install.
|
||||
async function dependencyFilesChanged(previousHead) {
|
||||
if (!validGitSha(previousHead)) return true
|
||||
try {
|
||||
const changed = await runCapture('git', [
|
||||
'diff',
|
||||
'--name-only',
|
||||
`${previousHead}..HEAD`,
|
||||
'--',
|
||||
...DEPENDENCY_FILES,
|
||||
])
|
||||
return changed.trim().length > 0
|
||||
} catch {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// True when the native better-sqlite3 addon actually LOADS. require.resolve()
|
||||
// isn't enough — the package can exist while its compiled .node binary is
|
||||
// missing, which is the "Could not locate the bindings file" crash that
|
||||
// repeatedly took tssbot-web down. Load it in a child process so this reflects
|
||||
// exactly what server.cjs does at startup.
|
||||
async function betterSqliteLoads() {
|
||||
try {
|
||||
await run(
|
||||
process.execPath,
|
||||
['-e', "new (require('better-sqlite3'))(':memory:').close()"],
|
||||
{ env: { NODE_ENV: 'production' }, timeoutMs: 60000 },
|
||||
)
|
||||
return true
|
||||
} catch {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
async function ensureBuildDependencies(previousHead) {
|
||||
const depsChanged = await dependencyFilesChanged(previousHead)
|
||||
// Skip the clean reinstall (which wipes node_modules and rebuilds native
|
||||
// modules like better-sqlite3 — the fragile step that took the site down) only
|
||||
// when nothing dependency-related changed AND the existing node_modules already
|
||||
// has everything the build and runtime need. A broken/missing better-sqlite3
|
||||
// binary forces the reinstall so a skip can never leave a dead tree.
|
||||
const canSkip =
|
||||
!depsChanged && missingBuildDependency() === null && (await betterSqliteLoads())
|
||||
|
||||
if (canSkip) {
|
||||
console.log('deploy step skipped: npm ci (no package.json/package-lock.json changes, node_modules intact)')
|
||||
return
|
||||
}
|
||||
|
||||
await run('npm', ['ci'], {
|
||||
env: {
|
||||
NODE_ENV: 'development',
|
||||
@@ -354,12 +432,22 @@ async function ensureBuildDependencies() {
|
||||
npm_config_fund: 'false',
|
||||
npm_config_audit: 'false',
|
||||
},
|
||||
timeoutMs: DEPLOY_INSTALL_TIMEOUT_MS,
|
||||
})
|
||||
|
||||
const missing = missingBuildDependency()
|
||||
if (missing) {
|
||||
throw new Error(`${missing} is missing after npm install; dependencies were not installed`)
|
||||
}
|
||||
|
||||
// Hard gate: better-sqlite3 must actually load after the install, or abort the
|
||||
// deploy here — before promoteBuiltDist()/pm2 reload — so a broken native build
|
||||
// can never be promoted to the running workers (which still hold a good binary).
|
||||
if (!(await betterSqliteLoads())) {
|
||||
throw new Error(
|
||||
'better-sqlite3 failed to load after npm ci — native binary missing or broken; aborting deploy before reload',
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
function copyMissingFiles(fromDir, toDir) {
|
||||
@@ -608,9 +696,12 @@ async function deploy(push) {
|
||||
|
||||
try {
|
||||
await notifyDeployStarted(push)
|
||||
// Capture HEAD before the pull so we can tell whether this push actually
|
||||
// changed dependency files (and thus needs a fresh npm ci).
|
||||
const previousHead = await runCapture('git', ['rev-parse', 'HEAD']).catch(() => null)
|
||||
await run('git', ['pull', '--ff-only'])
|
||||
diff = await deployDiff(push)
|
||||
await ensureBuildDependencies()
|
||||
await ensureBuildDependencies(previousHead)
|
||||
await run('npm', ['run', 'build', '--', '--outDir', '../dist-next'])
|
||||
validateBuiltDist()
|
||||
await run('cargo', ['build', '--manifest-path', 'backend/Cargo.toml', '--release'])
|
||||
|
||||
Reference in New Issue
Block a user