From d316d8fd615e86aa39e7ed60da3802006336efea Mon Sep 17 00:00:00 2001
From: Clippii
Date: Sun, 17 May 2026 12:11:11 +0100
Subject: [PATCH] added feat

---
 BOT/botscript.py |  38 +++++--------
 BOT/health.py    |  41 +++++++++++++-
 BOT/tasks.py     | 145 ++++++++++++++++++++++++++++++++++++++++++++++-
 README.md        |   1 +
 4 files changed, 198 insertions(+), 27 deletions(-)

diff --git a/BOT/botscript.py b/BOT/botscript.py
index dedb7c5..85d5221 100644
--- a/BOT/botscript.py
+++ b/BOT/botscript.py
@@ -53,7 +53,7 @@ from .game_api import (
     obtain_clan_new_points,
     obtain_clans_leaderboard,
 )
-from .health import init_health, get_health_snapshot
+from .health import init_health, get_health_snapshot, get_recent_ttl_stats
 from .utils import t, guild_lang
 from .lux_apis import fetch_replay_by_id
 from .meta_manager import (
@@ -8823,17 +8823,12 @@ async def bot_status_public(interaction: discord.Interaction):
     avg_delay: int | None = None
     sample_size = 0
     try:
-        async with aiosqlite.connect(SQ_BATTLES_DB_PATH, timeout=10.0) as db:
-            rows = list(await db.execute_fetchall(
-                "SELECT endtime_unix, received_unix FROM match_summary "
-                "WHERE received_unix IS NOT NULL AND endtime_unix IS NOT NULL "
-                "ORDER BY endtime_unix DESC LIMIT 30"
-            ))
-        if rows:
-            delays = [max(int(r[1]) - int(r[0]), 0) for r in rows]
-            avg_delay = int(sum(delays) / len(delays))
-            sample_size = len(delays)
-            last_received_ts = max(int(r[1]) for r in rows)
+        stats = await get_recent_ttl_stats(limit=30)
+        # Only overwrite the defaults when data exists, matching the earlier `if rows:` guard.
+        if stats["sample_size"]:
+            avg_delay = stats["avg_delay"]
+            sample_size = stats["sample_size"]
+            last_received_ts = stats["last_received_ts"]
     except Exception:
         logging.exception("Failed to compute /bot-status TTL stats")
 
@@ -8989,23 +8984,16 @@ async def bot_status(interaction: discord.Interaction):
 
     # Avg TTL (Spectra receive delay) for the last 30 games
     try:
-        async with aiosqlite.connect(SQ_BATTLES_DB_PATH, timeout=10.0) as db:
-            ttl_rows = list(await db.execute_fetchall(
-                "SELECT endtime_unix, received_unix FROM match_summary "
-                "WHERE received_unix IS NOT NULL AND endtime_unix IS NOT NULL "
-                "ORDER BY endtime_unix DESC LIMIT 30"
-            ))
-        if ttl_rows:
-            delays = [max(int(r[1]) - int(r[0]), 0) for r in ttl_rows]
-            avg = sum(delays) / len(delays)
-            a_min, a_sec = divmod(int(avg), 60)
-            mn_min, mn_sec = divmod(min(delays), 60)
-            mx_min, mx_sec = divmod(max(delays), 60)
+        ttl_stats = await get_recent_ttl_stats(limit=30)
+        if ttl_stats["avg_delay"] is not None:
+            a_min, a_sec = divmod(ttl_stats["avg_delay"], 60)
+            mn_min, mn_sec = divmod(ttl_stats["min_delay"], 60)
+            mx_min, mx_sec = divmod(ttl_stats["max_delay"], 60)
             ttl_value = (
                 f"**Avg:** {a_min}m{a_sec:02d}s • "
                 f"**Min:** {mn_min}m{mn_sec:02d}s • "
                 f"**Max:** {mx_min}m{mx_sec:02d}s • "
-                f"**N:** {len(delays)}"
+                f"**N:** {ttl_stats['sample_size']}"
             )
         else:
             ttl_value = t("en", "dev.health_never")
diff --git a/BOT/health.py b/BOT/health.py
index 523d804..deb66d5 100644
--- a/BOT/health.py
+++ b/BOT/health.py
@@ -14,9 +14,10 @@ from pathlib import Path
 
 # Third-Party Library Imports
 import aiofiles
+import aiosqlite
 
 # Local Module Imports
-from .utils import STORAGE_DIR, get_bot
+from .utils import STORAGE_DIR, SQ_BATTLES_DB_PATH, get_bot
 
 HEALTH_PATH = STORAGE_DIR / "bot_health.json"
 
@@ -130,3 +131,41 @@ async def get_health_snapshot() -> dict:
     _health_state["games_processed_24h"] = games_24h
 
     return dict(_health_state)
+
+
+async def get_recent_ttl_stats(limit: int = 30) -> dict:
+    """Return receive-delay stats for the most recent completed games.
+
+    The delay of a game is ``received_unix - endtime_unix`` clamped at zero,
+    taken over the ``limit`` rows of ``match_summary`` with the latest
+    ``endtime_unix`` where both columns are set.
+
+    Returns a dict with ``sample_size``, ``avg_delay``, ``min_delay``,
+    ``max_delay`` (whole seconds) and ``last_received_ts``; every key except
+    ``sample_size`` is None when no rows match.
+    """
+    async with aiosqlite.connect(SQ_BATTLES_DB_PATH, timeout=10.0) as db:
+        rows = list(await db.execute_fetchall(
+            "SELECT endtime_unix, received_unix FROM match_summary "
+            "WHERE received_unix IS NOT NULL AND endtime_unix IS NOT NULL "
+            "ORDER BY endtime_unix DESC LIMIT ?",
+            (limit,),
+        ))
+
+    if not rows:
+        return {
+            "sample_size": 0,
+            "avg_delay": None,
+            "min_delay": None,
+            "max_delay": None,
+            "last_received_ts": None,
+        }
+
+    delays = [max(int(received) - int(ended), 0) for ended, received in rows]
+    return {
+        "sample_size": len(delays),
+        "avg_delay": sum(delays) // len(delays),
+        "min_delay": min(delays),
+        "max_delay": max(delays),
+        "last_received_ts": max(int(received) for _, received in rows),
+    }
diff --git a/BOT/tasks.py b/BOT/tasks.py
index 093f7b9..e57474a 100644
--- a/BOT/tasks.py
+++ b/BOT/tasks.py
@@ -8,20 +8,23 @@ on configured intervals.
 
 # Standard Library Imports
 import asyncio
 import json
 import logging
+import os
 import shutil
+import time
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Dict, Optional
 
 # Third-Party Library Imports
+import aiohttp
 from discord.ext import tasks
 
 # Local Module Imports
 from . import lux_apis
 from .autologging import handle_ws_replays, handle_gob_message
-from .health import record_task_run, write_heartbeat
+from .health import get_recent_ttl_stats, record_task_run, write_heartbeat
 from .meta_manager import process_all_players, sync_all_guild_metas
 from .task_executors import (
     execute_ldb_alarm_task,
@@ -50,6 +53,144 @@ async def _record(task_name: str, success: bool, error: str = ""):
     await record_task_run(task_name, success, error)
 
 
+def _format_duration(seconds: int) -> str:
+    """Format a second count as minutes and seconds, e.g. ``20m 05s``."""
+    minutes, rem = divmod(int(seconds), 60)
+    return f"{minutes}m {rem:02d}s"
+
+
+async def _send_ttl_alert_webhook(content: str) -> bool:
+    """POST *content* to the TTL alert webhook.
+
+    Returns True on a 2xx response. Returns False when
+    SREBOT_TTL_ALERT_WEBHOOK_URL is unset or the webhook answers with a
+    non-2xx status. Exceptions raised by the request are not caught here.
+    """
+    webhook_url = os.environ.get("SREBOT_TTL_ALERT_WEBHOOK_URL", "").strip()
+    if not webhook_url:
+        logging.warning("[TTL-ALERT] SREBOT_TTL_ALERT_WEBHOOK_URL is not configured.")
+        return False
+
+    timeout = aiohttp.ClientTimeout(total=15)
+    async with aiohttp.ClientSession(timeout=timeout) as session:
+        async with session.post(webhook_url, json={"content": content}) as response:
+            if 200 <= response.status < 300:
+                return True
+            body = await response.text()
+            logging.error("[TTL-ALERT] Webhook failed with HTTP %s: %s", response.status, body[:500])
+            return False
+
+
+# Alerting uses hysteresis: the alert fires once the average stays above the
+# high threshold for the hold period, and clears only below the lower recovery
+# threshold, so an average hovering around one limit does not flap.
+_TTL_ALERT_HIGH_THRESHOLD_SECONDS = 20 * 60
+_TTL_ALERT_RECOVERY_THRESHOLD_SECONDS = 10 * 60
+_TTL_ALERT_HOLD_SECONDS = 10 * 60
+# time.monotonic() reading taken when the average was first seen above the
+# high threshold; None when no high streak is being tracked.
+_ttl_alert_high_since: Optional[float] = None
+# True between a delivered high-TTL alert and its delivered recovery message.
+_ttl_alert_active = False
+
+
+async def _execute_ttl_alert_check() -> None:
+    """Run one TTL alert evaluation and send a webhook on state changes.
+
+    While no alert is active, a high-TTL webhook is sent once the average
+    delay has stayed above the high threshold for the hold period. While an
+    alert is active, a recovery webhook is sent once the average falls below
+    the recovery threshold. State only advances when the webhook is
+    delivered, so an undelivered message is attempted again on the next run.
+    """
+    global _ttl_alert_high_since, _ttl_alert_active
+
+    stats = await get_recent_ttl_stats(limit=30)
+    avg_delay = stats["avg_delay"]
+    sample_size = stats["sample_size"]
+    last_received_ts = stats["last_received_ts"]
+
+    if avg_delay is None:
+        _ttl_alert_high_since = None
+        logging.info("[TTL-ALERT] No TTL data available.")
+        return
+
+    # Monotonic clock: the hold period is an elapsed duration and must not be
+    # affected by wall-clock adjustments.
+    now = time.monotonic()
+    if not _ttl_alert_active:
+        if avg_delay > _TTL_ALERT_HIGH_THRESHOLD_SECONDS:
+            if _ttl_alert_high_since is None:
+                _ttl_alert_high_since = now
+                logging.warning(
+                    "[TTL-ALERT] Avg TTL above threshold: %s across %s games.",
+                    _format_duration(avg_delay),
+                    sample_size,
+                )
+                return
+
+            # NOTE(review): the task interval equals the hold period, so timing
+            # jitter can push this to the third high reading rather than the second.
+            if now - _ttl_alert_high_since >= _TTL_ALERT_HOLD_SECONDS:
+                content = (
+                    ":warning: **SRE Bot TTL alert**\n"
+                    f"Average TTL across the last {sample_size} games is "
+                    f"**{_format_duration(avg_delay)}**, and has stayed above "
+                    f"{_format_duration(_TTL_ALERT_HIGH_THRESHOLD_SECONDS)} for at least "
+                    f"{_format_duration(_TTL_ALERT_HOLD_SECONDS)}.\n"
+                    "Historical game messages may be delayed or out of date."
+                )
+                if last_received_ts:
+                    content += f"\nLast game received: <t:{last_received_ts}:R>."
+                sent = await _send_ttl_alert_webhook(content)
+                if sent:
+                    _ttl_alert_active = True
+                    logging.warning("[TTL-ALERT] High TTL webhook sent.")
+            return
+
+        _ttl_alert_high_since = None
+        return
+
+    if avg_delay < _TTL_ALERT_RECOVERY_THRESHOLD_SECONDS:
+        content = (
+            ":white_check_mark: **SRE Bot TTL recovered**\n"
+            f"Average TTL across the last {sample_size} games is now "
+            f"**{_format_duration(avg_delay)}**, below "
+            f"{_format_duration(_TTL_ALERT_RECOVERY_THRESHOLD_SECONDS)}. "
+            "Back at normal operations."
+        )
+        if last_received_ts:
+            content += f"\nLast game received: <t:{last_received_ts}:R>."
+        sent = await _send_ttl_alert_webhook(content)
+        if sent:
+            _ttl_alert_active = False
+            _ttl_alert_high_since = None
+            logging.info("[TTL-ALERT] Recovery webhook sent.")
+
+
+# ============================================================================
+# TTL ALERT TASK
+# ============================================================================
+
+@tasks.loop(minutes=10)
+async def ttl_alert_task():
+    """Alert when recent game receive TTL is elevated for a sustained period."""
+    try:
+        await _execute_ttl_alert_check()
+        await _record("ttl_alert", True)
+    except Exception as e:
+        # Log first, with the traceback, so the failure is captured even if
+        # recording the task run raises as well.
+        logging.exception("[TTL-ALERT] Failed to check TTL alert state: %s", e)
+        await _record("ttl_alert", False, str(e))
+
+
+@ttl_alert_task.before_loop
+async def before_ttl_alert_task():
+    """Delay the first TTL check until the bot reports ready."""
+    await get_bot().wait_until_ready()
+
+
 # ============================================================================
 # LEADERBOARD ALARM TASK
 # ============================================================================
@@ -516,6 +657,7 @@ async def start_all_tasks():
     replay_cleanup_task.start()
     health_heartbeat_task.start()
     weekly_br_report_task.start()
+    ttl_alert_task.start()
     # Phase 2: WebSocket listeners
     ws_autolog_task.start()
     ws_gob_task.start()
@@ -541,3 +683,4 @@ def stop_all_tasks():
     sync_guild_metas_task.cancel()
     health_heartbeat_task.cancel()
     weekly_br_report_task.cancel()
+    ttl_alert_task.cancel()
diff --git a/README.md b/README.md
index 50cadc5..6033168 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,7 @@
     SREBOT_API_PORT=6000
     SREBOT_WEB_PORT=3001
     SREBOT_WEBHOOK_PORT=9000
+    SREBOT_TTL_ALERT_WEBHOOK_URL=https://discord.com/api/webhooks/...
     SREBOT_EXTERNAL_HOST=0.0.0.0
     SREBOT_EXTERNAL_PORT=18081
     SREBOT_EXTERNAL_BEARER_TOKEN=your_external_bridge_token # Optional, protects the bridge API and websocket