diff --git a/ecosystem.config.js b/ecosystem.config.js index f542ff5..f0d2db7 100644 --- a/ecosystem.config.js +++ b/ecosystem.config.js @@ -9,18 +9,33 @@ const DEPLOY_PATH = __dirname; // Both bots share one venv at BOTS/SHARED/.venv (built from SHARED/requirements.txt). const PY_INTERPRETER = `${DEPLOY_PATH}/../SHARED/.venv/bin/python`; +// Shared crash-loop governor. Without it, a process that dies on startup is +// relaunched by `autorestart` forever (once per restart_delay). Several apps here +// share SHARED/.env + SHARED/.venv + the STORAGE volume, so one bad shared config +// can make them all crash-loop at once and peg all 8 cores until the box is +// unreachable (and `pm2 resurrect` then reproduces it on every boot). With it, PM2 +// gives up after max_restarts attempts that each fail to stay up for min_uptime ms, +// marking the app `errored` instead of hammering the CPU. exp_backoff_restart_delay +// grows the delay between attempts (supersedes restart_delay during a crash loop). +const RESTART_POLICY = { + max_restarts: 10, + min_uptime: 10000, + exp_backoff_restart_delay: 200, +}; + module.exports = { apps: [ // Discord Bot { name: 'srebot', + ...RESTART_POLICY, script: 'start_bot.py', interpreter: PY_INTERPRETER, cwd: DEPLOY_PATH, instances: 1, autorestart: true, watch: false, - max_memory_restart: '16000M', + max_memory_restart: '12000M', log_file: './logs/bot_combined.log', out_file: './logs/bot_out.log', error_file: './logs/bot_error.log', @@ -33,6 +48,7 @@ module.exports = { // API Server (reads SREBOT_API_PORT from .env) { name: 'srebot-api', + ...RESTART_POLICY, script: 'server.js', interpreter: 'node', node_args: '--max-old-space-size=6144', @@ -54,6 +70,7 @@ module.exports = { // Reads TSS_API_HOST/PORT from .env (default 127.0.0.1:6100). { name: 'tssbot-api', + ...RESTART_POLICY, script: PY_INTERPRETER, args: '-m web.main', interpreter: 'none', @@ -79,6 +96,7 @@ module.exports = { // Reads SREBOT_EXTERNAL_HOST/PORT/UPSTREAM_URL + STORAGE_VOL_PATH from .env. 
{ name: 'relay-gateway', + ...RESTART_POLICY, script: PY_INTERPRETER, args: '-m relay_gateway.gateway', interpreter: 'none', @@ -100,6 +118,7 @@ module.exports = { // Reads SREBOT_WEBHOOK_PORT from .env. { name: 'srebot-webhook', + ...RESTART_POLICY, script: 'github_webhook_updater.py', interpreter: PY_INTERPRETER, cwd: DEPLOY_PATH, @@ -119,6 +138,7 @@ module.exports = { // Website (reads SREBOT_WEB_PORT from .env) { name: 'srebot-web', + ...RESTART_POLICY, script: 'server.js', cwd: `${DEPLOY_PATH}/web`, instances: 3,