This commit is contained in:
Heidi
2026-05-17 17:01:07 +01:00
parent d316d8fd61
commit 5148a0c7bb
+127 -31
View File
@@ -18,6 +18,7 @@ from pathlib import Path
from typing import Any, Dict, Optional
# Third-Party Library Imports
import aiofiles
import aiohttp
from discord.ext import tasks
@@ -41,6 +42,7 @@ from .task_executors import (
)
from .utils import (
get_bot,
STORAGE_DIR,
STACKS_DIR,
refresh_entitled_guilds,
SQB_STATS_TRACKER_WINDOWS,
@@ -53,44 +55,135 @@ async def _record(task_name: str, success: bool, error: str = ""):
await record_task_run(task_name, success, error)
def _format_duration(seconds: int) -> str:
def _format_duration(seconds: int | None) -> str:
if seconds is None:
return "No data"
minutes, rem = divmod(int(seconds), 60)
return f"{minutes}m {rem:02d}s"
async def _send_ttl_alert_webhook(content: str) -> bool:
def _build_ttl_monitor_embed(
    *,
    title: str,
    description: str,
    stats: dict,
    color: int,
) -> dict:
    """Assemble the embed dict used for TTL monitor webhook messages.

    Args:
        title: Embed title.
        description: Embed description text.
        stats: Mapping read with ``.get`` for ``avg_delay``, ``min_delay``,
            ``max_delay``, ``sample_size`` and ``last_received_ts``.
        color: Integer colour value stored under ``"color"``.

    Returns:
        A dict with ``title``, ``description``, ``color``, four ``fields``
        and a ``timestamp`` holding the current UTC time in ISO 8601 form.
    """

    def _field(name: str, value: str, inline: bool) -> dict:
        # One entry of the embed's "fields" list.
        return {"name": name, "value": value, "inline": inline}

    received_at = stats.get("last_received_ts")
    if not received_at:
        last_seen_text = "No recent games"
    else:
        unix_ts = int(received_at)
        # Relative timestamp followed by the clock time in parentheses.
        last_seen_text = f"<t:{unix_ts}:R> (<t:{unix_ts}:T>)"

    average_text = _format_duration(stats.get("avg_delay"))
    shortest_text = _format_duration(stats.get("min_delay"))
    longest_text = _format_duration(stats.get("max_delay"))
    # A missing or None sample size is shown as zero games.
    games_counted = int(stats.get("sample_size") or 0)

    embed_fields = [
        _field("Average TTL", average_text, True),
        _field("Min / Max", f"{shortest_text} / {longest_text}", True),
        _field("Sample", f"{games_counted} games", True),
        _field("Last received", last_seen_text, False),
    ]

    return {
        "title": title,
        "description": description,
        "color": color,
        "fields": embed_fields,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
async def _send_ttl_alert_webhook(embed: dict) -> bool:
    """Post a single embed to the TTL alert webhook.

    The webhook URL is read from the ``SREBOT_TTL_ALERT_WEBHOOK_URL``
    environment variable on every call.

    Args:
        embed: Embed dict placed in the request's ``embeds`` list.

    Returns:
        True when the webhook answered with a 2xx status. False when the URL
        is not configured, the response is non-2xx, or the request raised.
    """
    webhook_url = os.environ.get("SREBOT_TTL_ALERT_WEBHOOK_URL", "").strip()
    if not webhook_url:
        logging.warning("[TTL-ALERT] SREBOT_TTL_ALERT_WEBHOOK_URL is not configured.")
        return False

    # NOTE(review): payload shape looks like a Discord webhook execute body;
    # the empty "parse" list presumably disables mention pings - confirm.
    payload = {
        "username": "SREBOT Monitor",
        "embeds": [embed],
        "allowed_mentions": {"parse": []},
    }
    timeout = aiohttp.ClientTimeout(total=15)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(webhook_url, json=payload) as response:
                if 200 <= response.status < 300:
                    return True
                body = await response.text()
                # Log only the first 500 characters of the response body.
                logging.error("[TTL-ALERT] Webhook failed with HTTP %s: %s", response.status, body[:500])
                return False
    except Exception as e:
        # Best-effort notification: any request failure is logged and
        # reported to the caller as "not sent" instead of propagating.
        logging.error("[TTL-ALERT] Webhook request failed: %s", e)
        return False
# Average TTL (seconds) above which _execute_ttl_alert_check treats TTL as high.
_TTL_ALERT_HIGH_THRESHOLD_SECONDS = 20 * 60
# Average TTL (seconds) below which the recovery embed is built.
_TTL_ALERT_RECOVERY_THRESHOLD_SECONDS = 10 * 60
# How long (seconds) the average must stay high before the degradation embed is sent.
_TTL_ALERT_HOLD_SECONDS = 10 * 60
# JSON file the alert state is written to by _save_ttl_alert_state.
_TTL_ALERT_STATE_PATH = STORAGE_DIR / "ttl_alert_state.json"
# Time at which the average was first seen above the high threshold; None
# while it is not high. Presumably epoch seconds - confirm against the caller.
_ttl_alert_high_since: Optional[float] = None
# Set to True after the high-TTL webhook is sent and back to False after the
# recovery webhook is sent.
_ttl_alert_active = False
# Set by _load_ttl_alert_state on its first call so later calls return early.
_ttl_alert_state_loaded = False
async def _load_ttl_alert_state() -> None:
    """Restore the persisted TTL alert state into the module globals.

    Only the first call does any work. The state file's ``active`` and
    ``high_since`` values are copied into ``_ttl_alert_active`` and
    ``_ttl_alert_high_since``. A missing file leaves the globals untouched;
    any failure is logged as a warning and otherwise ignored.
    """
    global _ttl_alert_high_since, _ttl_alert_active, _ttl_alert_state_loaded

    if _ttl_alert_state_loaded:
        return
    # Flag is set before reading, so a failed load is not retried later.
    _ttl_alert_state_loaded = True

    try:
        if _TTL_ALERT_STATE_PATH.exists():
            async with aiofiles.open(_TTL_ALERT_STATE_PATH, "r", encoding="utf-8") as state_file:
                raw_text = await state_file.read()
            persisted = json.loads(raw_text)
            _ttl_alert_active = bool(persisted.get("active", False))
            stored_since = persisted.get("high_since")
            _ttl_alert_high_since = None if stored_since is None else float(stored_since)
    except Exception as e:
        logging.warning("[TTL-ALERT] Failed to load alert state: %s", e)
async def _save_ttl_alert_state() -> None:
    """Write the current TTL alert state to the JSON state file.

    Stores ``active``, ``high_since`` and an ``updated_at`` timestamp taken
    from ``time.time()``. The parent directory is created when missing. Any
    failure is logged as a warning and otherwise ignored.
    """
    try:
        state_dir = _TTL_ALERT_STATE_PATH.parent
        state_dir.mkdir(parents=True, exist_ok=True)
        async with aiofiles.open(_TTL_ALERT_STATE_PATH, "w", encoding="utf-8") as state_file:
            snapshot = {
                "active": _ttl_alert_active,
                "high_since": _ttl_alert_high_since,
                "updated_at": time.time(),
            }
            serialized = json.dumps(snapshot, indent=2)
            await state_file.write(serialized)
    except Exception as e:
        logging.warning("[TTL-ALERT] Failed to save alert state: %s", e)
async def _execute_ttl_alert_check() -> None:
global _ttl_alert_high_since, _ttl_alert_active
await _load_ttl_alert_state()
stats = await get_recent_ttl_stats(limit=30)
avg_delay = stats["avg_delay"]
sample_size = stats["sample_size"]
last_received_ts = stats["last_received_ts"]
if avg_delay is None:
_ttl_alert_high_since = None
await _save_ttl_alert_state()
logging.info("[TTL-ALERT] No TTL data available.")
return
@@ -99,6 +192,7 @@ async def _execute_ttl_alert_check() -> None:
if avg_delay > _TTL_ALERT_HIGH_THRESHOLD_SECONDS:
if _ttl_alert_high_since is None:
_ttl_alert_high_since = now
await _save_ttl_alert_state()
logging.warning(
"[TTL-ALERT] Avg TTL above threshold: %s across %s games.",
_format_duration(avg_delay),
@@ -107,39 +201,41 @@ async def _execute_ttl_alert_check() -> None:
return
if now - _ttl_alert_high_since >= _TTL_ALERT_HOLD_SECONDS:
content = (
":warning: **SRE Bot TTL alert**\n"
f"Average TTL across the last {sample_size} games is "
f"**{_format_duration(avg_delay)}**, and has stayed above "
f"{_format_duration(_TTL_ALERT_HIGH_THRESHOLD_SECONDS)} for at least "
f"{_format_duration(_TTL_ALERT_HOLD_SECONDS)}.\n"
"Historical game messages may be delayed or out of date."
embed = _build_ttl_monitor_embed(
title="SREBOT status: TTL degradation active",
description=(
"Average TTL has stayed above "
f"{_format_duration(_TTL_ALERT_HIGH_THRESHOLD_SECONDS)} for at least "
f"{_format_duration(_TTL_ALERT_HOLD_SECONDS)}. "
"Historical game messages may be delayed or out of date."
),
stats=stats,
color=0xF59E0B,
)
if last_received_ts:
content += f"\nLast game received: <t:{int(last_received_ts)}:R>."
sent = await _send_ttl_alert_webhook(content)
sent = await _send_ttl_alert_webhook(embed)
if sent:
_ttl_alert_active = True
await _save_ttl_alert_state()
logging.warning("[TTL-ALERT] High TTL webhook sent.")
return
_ttl_alert_high_since = None
if _ttl_alert_high_since is not None:
_ttl_alert_high_since = None
await _save_ttl_alert_state()
return
if avg_delay < _TTL_ALERT_RECOVERY_THRESHOLD_SECONDS:
content = (
":white_check_mark: **SRE Bot TTL recovered**\n"
f"Average TTL across the last {sample_size} games is now "
f"**{_format_duration(avg_delay)}**, below "
f"{_format_duration(_TTL_ALERT_RECOVERY_THRESHOLD_SECONDS)}. "
"Back at normal operations."
embed = _build_ttl_monitor_embed(
title="SREBOT status: all services normal",
description="No sustained TTL degradation is currently active.",
stats=stats,
color=0x22C55E,
)
if last_received_ts:
content += f"\nLast game received: <t:{int(last_received_ts)}:R>."
sent = await _send_ttl_alert_webhook(content)
sent = await _send_ttl_alert_webhook(embed)
if sent:
_ttl_alert_active = False
_ttl_alert_high_since = None
await _save_ttl_alert_state()
logging.info("[TTL-ALERT] Recovery webhook sent.")
@@ -147,7 +243,7 @@ async def _execute_ttl_alert_check() -> None:
# TTL ALERT TASK
# ============================================================================
@tasks.loop(minutes=10)
@tasks.loop(minutes=5)
async def ttl_alert_task():
"""Alert when recent game receive TTL is elevated for a sustained period."""
try: