feat: replace PM2 with systemd --user services for production
Runs tssbot-web, tssbot-webhook, and tssbot-backend as systemd --user units instead of PM2 processes. tssbot-web moves from a 2-worker PM2 cluster to a single instance, so deploys now restart it directly instead of doing a zero-downtime cluster reload. webhook.cjs now shells out to `systemctl --user restart` instead of `pm2 reload`, and PM2_RESTART_TARGETS/WEBHOOK_PM2_NAME are renamed to RESTART_TARGETS/WEBHOOK_SERVICE_NAME. scripts/install-systemd-services.sh symlinks the new unit files into ~/.config/systemd/user and enables them. Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
This commit is contained in:
+30
-48
@@ -40,14 +40,14 @@ const PORT = Number(process.env.WEBHOOK_PORT || 3011)
|
||||
const SECRET = process.env.GITHUB_WEBHOOK_SECRET || ''
|
||||
const DISCORD_WEBHOOK_URL = process.env.DISCORD_WEBHOOK_URL || ''
|
||||
const DISCORD_INCLUDE_PATCH = /^(1|true|yes)$/i.test(String(process.env.DISCORD_INCLUDE_PATCH || ''))
|
||||
const RESTART_TARGETS = (process.env.PM2_RESTART_TARGETS || 'tssbot-web,tssbot-backend')
|
||||
const RESTART_TARGETS = (process.env.RESTART_TARGETS || 'tssbot-web,tssbot-backend')
|
||||
.split(',')
|
||||
.map((target) => target.trim())
|
||||
.filter((target) => /^[A-Za-z0-9_.:-]{1,80}$/.test(target))
|
||||
.filter(Boolean)
|
||||
|
||||
// This webhook's own PM2 process name — never reload it during its own deploy.
|
||||
const SELF_PM2_NAME = process.env.WEBHOOK_PM2_NAME || 'tssbot-webhook'
|
||||
// This webhook's own systemd unit name — never restart it inline during its own deploy.
|
||||
const SELF_SERVICE_NAME = process.env.WEBHOOK_SERVICE_NAME || 'tssbot-webhook'
|
||||
const DIST_DIR = path.join(__dirname, 'dist')
|
||||
const NEXT_DIST_DIR = path.join(__dirname, 'dist-next')
|
||||
const PREVIOUS_DIST_DIR = path.join(__dirname, 'dist-previous')
|
||||
@@ -57,7 +57,7 @@ const WEBHOOK_HEADERS_TIMEOUT_MS = Number(process.env.WEBHOOK_HEADERS_TIMEOUT_MS
|
||||
// No deploy step may hang forever. A stalled `npm ci` (a native postinstall that
|
||||
// never returns) would otherwise block for hours with node_modules already
|
||||
// deleted — which is exactly what took the site down. These cap each step so a
|
||||
// hang fails fast and aborts the deploy before any pm2 reload.
|
||||
// hang fails fast and aborts the deploy before any systemctl restart.
|
||||
const DEPLOY_STEP_TIMEOUT_MS = Number(process.env.DEPLOY_STEP_TIMEOUT_MS || 15 * 60 * 1000)
|
||||
const DEPLOY_INSTALL_TIMEOUT_MS = Number(process.env.DEPLOY_INSTALL_TIMEOUT_MS || 8 * 60 * 1000)
|
||||
const ALLOWED_REFS = new Set(
|
||||
@@ -200,7 +200,6 @@ function commandFor(command) {
|
||||
}
|
||||
if (process.platform !== 'win32') return command
|
||||
if (command === 'npm') return 'npm.cmd'
|
||||
if (command === 'pm2') return 'pm2.cmd'
|
||||
return command
|
||||
}
|
||||
|
||||
@@ -218,7 +217,7 @@ function restartTargetsInclude(target) {
|
||||
}
|
||||
|
||||
function pushTouchesWebhookRuntime(push) {
|
||||
const runtimeFiles = new Set(['webhook.cjs', 'ecosystem.config.cjs'])
|
||||
const runtimeFiles = new Set(['webhook.cjs', 'systemd/tssbot-webhook.service'])
|
||||
const commits = Array.isArray(push?.commits) ? push.commits : []
|
||||
return commits.some((commit) => {
|
||||
const changed = [
|
||||
@@ -230,28 +229,18 @@ function pushTouchesWebhookRuntime(push) {
|
||||
})
|
||||
}
|
||||
|
||||
function scheduleSelfReload(reason) {
|
||||
let resolvedCommand
|
||||
try {
|
||||
resolvedCommand = commandFor('pm2')
|
||||
} catch (error) {
|
||||
console.error(`could not schedule ${SELF_PM2_NAME} reload:`, error.message)
|
||||
return
|
||||
}
|
||||
|
||||
console.log(`scheduling ${SELF_PM2_NAME} reload: ${reason}`)
|
||||
function scheduleSelfRestart(reason) {
|
||||
console.log(`scheduling ${SELF_SERVICE_NAME} restart: ${reason}`)
|
||||
// Delayed + detached: `systemctl --user restart` sends SIGTERM to this very
|
||||
// process once it runs, so fire it from an unref'd timer a moment later and
|
||||
// let the deploy's response/notifications land first.
|
||||
setTimeout(() => {
|
||||
const child = spawn(
|
||||
resolvedCommand,
|
||||
['reload', 'ecosystem.config.cjs', '--only', SELF_PM2_NAME, '--update-env'],
|
||||
{
|
||||
cwd: __dirname,
|
||||
env: process.env,
|
||||
detached: true,
|
||||
stdio: 'ignore',
|
||||
shell: process.platform === 'win32',
|
||||
},
|
||||
)
|
||||
const child = spawn('systemctl', ['--user', 'restart', `${SELF_SERVICE_NAME}.service`], {
|
||||
cwd: __dirname,
|
||||
env: process.env,
|
||||
detached: true,
|
||||
stdio: 'ignore',
|
||||
})
|
||||
child.unref()
|
||||
}, 1000).unref()
|
||||
}
|
||||
@@ -275,7 +264,7 @@ function run(command, args, options = {}) {
|
||||
stdio: 'inherit',
|
||||
})
|
||||
|
||||
// Kill the step if it hangs so deploy() aborts before any pm2 reload instead
|
||||
// Kill the step if it hangs so deploy() aborts before any systemctl restart instead
|
||||
// of wedging here indefinitely (see DEPLOY_STEP_TIMEOUT_MS above).
|
||||
const timeoutMs = Number(options.timeoutMs) > 0 ? Number(options.timeoutMs) : DEPLOY_STEP_TIMEOUT_MS
|
||||
let timedOut = false
|
||||
@@ -441,7 +430,7 @@ async function ensureBuildDependencies(previousHead) {
|
||||
}
|
||||
|
||||
// Hard gate: better-sqlite3 must actually load after the install, or abort the
|
||||
// deploy here — before promoteBuiltDist()/pm2 reload — so a broken native build
|
||||
// deploy here — before promoteBuiltDist()/systemctl restart — so a broken native build
|
||||
// can never be promoted to the running services (which still hold a good binary).
|
||||
if (!(await betterSqliteLoads())) {
|
||||
throw new Error(
|
||||
@@ -708,27 +697,20 @@ async function deploy(push) {
|
||||
promoteBuiltDist()
|
||||
syncVehicleIcons()
|
||||
|
||||
// Reload via the ecosystem file (not by bare name) with --only so each deploy
|
||||
// re-reads the committed env blocks (e.g. VEHICLE_* paths). `pm2 reload <name>
|
||||
// --update-env` would only merge the CLI's process.env and ignore the file.
|
||||
// Exclude this webhook process from the awaited reload: killing the process
|
||||
// running this deploy mid-command can interrupt the remaining reloads.
|
||||
const reloadTargets = RESTART_TARGETS.filter((t) => t !== SELF_PM2_NAME)
|
||||
if (reloadTargets.length) {
|
||||
await run('pm2', [
|
||||
'reload',
|
||||
'ecosystem.config.cjs',
|
||||
'--only',
|
||||
reloadTargets.join(','),
|
||||
'--update-env',
|
||||
])
|
||||
// Each restarted service re-reads .env itself on startup, so a plain
|
||||
// `systemctl restart` always picks up the committed env changes.
|
||||
// Exclude this webhook process from the awaited restart: killing the process
|
||||
// running this deploy mid-command can interrupt the remaining restarts.
|
||||
const restartTargets = RESTART_TARGETS.filter((t) => t !== SELF_SERVICE_NAME)
|
||||
if (restartTargets.length) {
|
||||
await run('systemctl', ['--user', 'restart', ...restartTargets.map((t) => `${t}.service`)])
|
||||
}
|
||||
|
||||
await notifyDeployCompleted(push, diff)
|
||||
if (restartTargetsInclude(SELF_PM2_NAME) || pushTouchesWebhookRuntime(push)) {
|
||||
scheduleSelfReload(
|
||||
restartTargetsInclude(SELF_PM2_NAME)
|
||||
? `${SELF_PM2_NAME} is listed in PM2_RESTART_TARGETS`
|
||||
if (restartTargetsInclude(SELF_SERVICE_NAME) || pushTouchesWebhookRuntime(push)) {
|
||||
scheduleSelfRestart(
|
||||
restartTargetsInclude(SELF_SERVICE_NAME)
|
||||
? `${SELF_SERVICE_NAME} is listed in RESTART_TARGETS`
|
||||
: 'webhook runtime files changed',
|
||||
)
|
||||
}
|
||||
@@ -860,6 +842,6 @@ webhookServer.listen(PORT, '0.0.0.0', () => {
|
||||
})
|
||||
|
||||
setTimeout(() => {
|
||||
console.log('24 hour webhook refresh reached; exiting for PM2 restart')
|
||||
console.log('24 hour webhook refresh reached; exiting for systemd restart')
|
||||
process.exit(0)
|
||||
}, RESTART_AFTER_MS).unref()
|
||||
|
||||
Reference in New Issue
Block a user