feat: replace PM2 with systemd --user services for production

Runs tssbot-web, tssbot-webhook, and tssbot-backend as systemd --user
units instead of PM2 processes. tssbot-web moves from a 2-worker PM2
cluster to a single instance, so deploys now restart it directly
instead of doing a zero-downtime cluster reload.

webhook.cjs now shells out to `systemctl --user restart` instead of
`pm2 reload`, and PM2_RESTART_TARGETS/WEBHOOK_PM2_NAME are renamed to
RESTART_TARGETS/WEBHOOK_SERVICE_NAME. webhook.cjs no longer reads the old
names, so any environment that still sets them must be updated.
scripts/install-systemd-services.sh symlinks the new unit files into
~/.config/systemd/user and enables them.

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
This commit is contained in:
2026-07-01 22:58:15 +00:00
parent 1fee214785
commit 341dae1913
10 changed files with 172 additions and 205 deletions
+30 -48
View File
@@ -40,14 +40,14 @@ const PORT = Number(process.env.WEBHOOK_PORT || 3011)
const SECRET = process.env.GITHUB_WEBHOOK_SECRET || ''
const DISCORD_WEBHOOK_URL = process.env.DISCORD_WEBHOOK_URL || ''
const DISCORD_INCLUDE_PATCH = /^(1|true|yes)$/i.test(String(process.env.DISCORD_INCLUDE_PATCH || ''))
const RESTART_TARGETS = (process.env.PM2_RESTART_TARGETS || 'tssbot-web,tssbot-backend')
const RESTART_TARGETS = (process.env.RESTART_TARGETS || 'tssbot-web,tssbot-backend')
.split(',')
.map((target) => target.trim())
.filter((target) => /^[A-Za-z0-9_.:-]{1,80}$/.test(target))
.filter(Boolean)
// This webhook's own PM2 process name — never reload it during its own deploy.
const SELF_PM2_NAME = process.env.WEBHOOK_PM2_NAME || 'tssbot-webhook'
// This webhook's own systemd unit name — never restart it inline during its own deploy.
const SELF_SERVICE_NAME = process.env.WEBHOOK_SERVICE_NAME || 'tssbot-webhook'
const DIST_DIR = path.join(__dirname, 'dist')
const NEXT_DIST_DIR = path.join(__dirname, 'dist-next')
const PREVIOUS_DIST_DIR = path.join(__dirname, 'dist-previous')
@@ -57,7 +57,7 @@ const WEBHOOK_HEADERS_TIMEOUT_MS = Number(process.env.WEBHOOK_HEADERS_TIMEOUT_MS
// No deploy step may hang forever. A stalled `npm ci` (a native postinstall that
// never returns) would otherwise block for hours with node_modules already
// deleted — which is exactly what took the site down. These cap each step so a
// hang fails fast and aborts the deploy before any pm2 reload.
// hang fails fast and aborts the deploy before any systemctl restart.
const DEPLOY_STEP_TIMEOUT_MS = Number(process.env.DEPLOY_STEP_TIMEOUT_MS || 15 * 60 * 1000)
const DEPLOY_INSTALL_TIMEOUT_MS = Number(process.env.DEPLOY_INSTALL_TIMEOUT_MS || 8 * 60 * 1000)
const ALLOWED_REFS = new Set(
@@ -200,7 +200,6 @@ function commandFor(command) {
}
if (process.platform !== 'win32') return command
if (command === 'npm') return 'npm.cmd'
if (command === 'pm2') return 'pm2.cmd'
return command
}
@@ -218,7 +217,7 @@ function restartTargetsInclude(target) {
}
function pushTouchesWebhookRuntime(push) {
const runtimeFiles = new Set(['webhook.cjs', 'ecosystem.config.cjs'])
const runtimeFiles = new Set(['webhook.cjs', 'systemd/tssbot-webhook.service'])
const commits = Array.isArray(push?.commits) ? push.commits : []
return commits.some((commit) => {
const changed = [
@@ -230,28 +229,18 @@ function pushTouchesWebhookRuntime(push) {
})
}
function scheduleSelfReload(reason) {
let resolvedCommand
try {
resolvedCommand = commandFor('pm2')
} catch (error) {
console.error(`could not schedule ${SELF_PM2_NAME} reload:`, error.message)
return
}
console.log(`scheduling ${SELF_PM2_NAME} reload: ${reason}`)
function scheduleSelfRestart(reason) {
console.log(`scheduling ${SELF_SERVICE_NAME} restart: ${reason}`)
// Delayed + detached: `systemctl --user restart` sends SIGTERM to this very
// process as soon as it runs, so fire it from a delayed, unref'd timer and
// let the deploy's response/notifications land first.
setTimeout(() => {
const child = spawn(
resolvedCommand,
['reload', 'ecosystem.config.cjs', '--only', SELF_PM2_NAME, '--update-env'],
{
cwd: __dirname,
env: process.env,
detached: true,
stdio: 'ignore',
shell: process.platform === 'win32',
},
)
const child = spawn('systemctl', ['--user', 'restart', `${SELF_SERVICE_NAME}.service`], {
cwd: __dirname,
env: process.env,
detached: true,
stdio: 'ignore',
})
child.unref()
}, 1000).unref()
}
@@ -275,7 +264,7 @@ function run(command, args, options = {}) {
stdio: 'inherit',
})
// Kill the step if it hangs so deploy() aborts before any pm2 reload instead
// Kill the step if it hangs so deploy() aborts before any systemctl restart instead
// of wedging here indefinitely (see DEPLOY_STEP_TIMEOUT_MS above).
const timeoutMs = Number(options.timeoutMs) > 0 ? Number(options.timeoutMs) : DEPLOY_STEP_TIMEOUT_MS
let timedOut = false
@@ -441,7 +430,7 @@ async function ensureBuildDependencies(previousHead) {
}
// Hard gate: better-sqlite3 must actually load after the install, or abort the
// deploy here — before promoteBuiltDist()/pm2 reload — so a broken native build
// deploy here — before promoteBuiltDist()/systemctl restart — so a broken native build
// can never be promoted to the running workers (which still hold a good binary).
if (!(await betterSqliteLoads())) {
throw new Error(
@@ -708,27 +697,20 @@ async function deploy(push) {
promoteBuiltDist()
syncVehicleIcons()
// Reload via the ecosystem file (not by bare name) with --only so each deploy
// re-reads the committed env blocks (e.g. VEHICLE_* paths). `pm2 reload <name>
// --update-env` would only merge the CLI's process.env and ignore the file.
// Exclude this webhook process from the awaited reload: killing the process
// running this deploy mid-command can interrupt the remaining reloads.
const reloadTargets = RESTART_TARGETS.filter((t) => t !== SELF_PM2_NAME)
if (reloadTargets.length) {
await run('pm2', [
'reload',
'ecosystem.config.cjs',
'--only',
reloadTargets.join(','),
'--update-env',
])
// Each restarted service re-reads .env itself on startup, so a plain
// `systemctl restart` always picks up the committed env changes.
// Exclude this webhook process from the awaited restart: killing the process
// running this deploy mid-command can interrupt the remaining restarts.
const restartTargets = RESTART_TARGETS.filter((t) => t !== SELF_SERVICE_NAME)
if (restartTargets.length) {
await run('systemctl', ['--user', 'restart', ...restartTargets.map((t) => `${t}.service`)])
}
await notifyDeployCompleted(push, diff)
if (restartTargetsInclude(SELF_PM2_NAME) || pushTouchesWebhookRuntime(push)) {
scheduleSelfReload(
restartTargetsInclude(SELF_PM2_NAME)
? `${SELF_PM2_NAME} is listed in PM2_RESTART_TARGETS`
if (restartTargetsInclude(SELF_SERVICE_NAME) || pushTouchesWebhookRuntime(push)) {
scheduleSelfRestart(
restartTargetsInclude(SELF_SERVICE_NAME)
? `${SELF_SERVICE_NAME} is listed in RESTART_TARGETS`
: 'webhook runtime files changed',
)
}
@@ -860,6 +842,6 @@ webhookServer.listen(PORT, '0.0.0.0', () => {
})
setTimeout(() => {
console.log('24 hour webhook refresh reached; exiting for PM2 restart')
console.log('24 hour webhook refresh reached; exiting for systemd restart')
process.exit(0)
}, RESTART_AFTER_MS).unref()