added feat
This commit is contained in:
+116
-1
@@ -8,20 +8,23 @@ on configured intervals.
|
||||
|
||||
# Standard Library Imports
|
||||
import asyncio
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
import shutil
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
# Third-Party Library Imports
|
||||
import aiohttp
|
||||
from discord.ext import tasks
|
||||
|
||||
# Local Module Imports
|
||||
from . import lux_apis
|
||||
from .autologging import handle_ws_replays, handle_gob_message
|
||||
from .health import record_task_run, write_heartbeat
|
||||
from .health import get_recent_ttl_stats, record_task_run, write_heartbeat
|
||||
from .meta_manager import process_all_players, sync_all_guild_metas
|
||||
from .task_executors import (
|
||||
execute_ldb_alarm_task,
|
||||
@@ -50,6 +53,116 @@ async def _record(task_name: str, success: bool, error: str = ""):
|
||||
await record_task_run(task_name, success, error)
|
||||
|
||||
|
||||
def _format_duration(seconds: int) -> str:
|
||||
minutes, rem = divmod(int(seconds), 60)
|
||||
return f"{minutes}m {rem:02d}s"
|
||||
|
||||
|
||||
async def _send_ttl_alert_webhook(content: str) -> bool:
    """POST ``content`` as JSON to the configured TTL alert webhook.

    The target URL is read from the ``SREBOT_TTL_ALERT_WEBHOOK_URL``
    environment variable on every call; surrounding whitespace is stripped.

    Args:
        content: Message text, sent as ``{"content": content}``.

    Returns:
        True when the endpoint answers with a 2xx status. False when the URL
        is not configured or the endpoint answers with any other status (the
        status and the first 500 characters of the body are logged).

    Exceptions raised by aiohttp (connection errors, the 15-second total
    timeout) are not caught here and propagate to the caller.
    """
    target_url = os.environ.get("SREBOT_TTL_ALERT_WEBHOOK_URL", "").strip()
    if not target_url:
        logging.warning("[TTL-ALERT] SREBOT_TTL_ALERT_WEBHOOK_URL is not configured.")
        return False

    payload = {"content": content}
    session_timeout = aiohttp.ClientTimeout(total=15)
    async with aiohttp.ClientSession(timeout=session_timeout) as session:
        async with session.post(target_url, json=payload) as response:
            status = response.status
            if status < 200 or status >= 300:
                # Non-2xx: keep a bounded excerpt of the body for diagnosis.
                body = await response.text()
                logging.error("[TTL-ALERT] Webhook failed with HTTP %s: %s", status, body[:500])
                return False
            return True
|
||||
|
||||
|
||||
_TTL_ALERT_HIGH_THRESHOLD_SECONDS = 20 * 60
|
||||
_TTL_ALERT_RECOVERY_THRESHOLD_SECONDS = 10 * 60
|
||||
_TTL_ALERT_HOLD_SECONDS = 10 * 60
|
||||
_ttl_alert_high_since: Optional[float] = None
|
||||
_ttl_alert_active = False
|
||||
|
||||
|
||||
async def _execute_ttl_alert_check() -> None:
    """Evaluate recent TTL stats and send an alert or recovery webhook if due.

    Reads ``avg_delay``, ``sample_size`` and ``last_received_ts`` from
    ``get_recent_ttl_stats(limit=30)`` and drives a two-state machine kept in
    the module globals ``_ttl_alert_high_since`` and ``_ttl_alert_active``:

    * No data (``avg_delay is None``): clear the high-since marker and return.
    * Alert active: once the average is below the recovery threshold, send
      the recovery message; state is reset only if the webhook succeeded.
    * Alert inactive, average not above the high threshold: clear the
      high-since marker.
    * Alert inactive, average above the high threshold: the first such check
      records the time and logs a warning; a later check sends the alert once
      the hold period has elapsed, marking the alert active only if the
      webhook succeeded.
    """
    global _ttl_alert_high_since, _ttl_alert_active

    ttl_stats = await get_recent_ttl_stats(limit=30)
    avg_delay = ttl_stats["avg_delay"]
    sample_size = ttl_stats["sample_size"]
    last_received_ts = ttl_stats["last_received_ts"]

    if avg_delay is None:
        _ttl_alert_high_since = None
        logging.info("[TTL-ALERT] No TTL data available.")
        return

    now = time.time()

    if _ttl_alert_active:
        # An alert is outstanding: only a drop below the recovery threshold matters.
        if not avg_delay < _TTL_ALERT_RECOVERY_THRESHOLD_SECONDS:
            return

        message = (
            ":white_check_mark: **SRE Bot TTL recovered**\n"
            f"Average TTL across the last {sample_size} games is now "
            f"**{_format_duration(avg_delay)}**, below "
            f"{_format_duration(_TTL_ALERT_RECOVERY_THRESHOLD_SECONDS)}. "
            "Back at normal operations."
        )
        if last_received_ts:
            message += f"\nLast game received: <t:{int(last_received_ts)}:R>."

        delivered = await _send_ttl_alert_webhook(message)
        if delivered:
            # Reset only after delivery so a failed send is retried next check.
            _ttl_alert_active = False
            _ttl_alert_high_since = None
            logging.info("[TTL-ALERT] Recovery webhook sent.")
        return

    if not avg_delay > _TTL_ALERT_HIGH_THRESHOLD_SECONDS:
        # Back under the threshold before the hold elapsed: restart the clock.
        _ttl_alert_high_since = None
        return

    if _ttl_alert_high_since is None:
        # First high reading: start the hold timer and wait for a later check.
        _ttl_alert_high_since = now
        logging.warning(
            "[TTL-ALERT] Avg TTL above threshold: %s across %s games.",
            _format_duration(avg_delay),
            sample_size,
        )
        return

    # NOTE(review): the caller loops every 10 minutes, which equals the hold
    # window; a check that lands marginally under the window would defer the
    # alert by one more cycle. Confirm this is acceptable.
    if now - _ttl_alert_high_since < _TTL_ALERT_HOLD_SECONDS:
        return

    message = (
        ":warning: **SRE Bot TTL alert**\n"
        f"Average TTL across the last {sample_size} games is "
        f"**{_format_duration(avg_delay)}**, and has stayed above "
        f"{_format_duration(_TTL_ALERT_HIGH_THRESHOLD_SECONDS)} for at least "
        f"{_format_duration(_TTL_ALERT_HOLD_SECONDS)}.\n"
        "Historical game messages may be delayed or out of date."
    )
    if last_received_ts:
        message += f"\nLast game received: <t:{int(last_received_ts)}:R>."

    delivered = await _send_ttl_alert_webhook(message)
    if delivered:
        # Mark active only after delivery so a failed send is retried next check.
        _ttl_alert_active = True
        logging.warning("[TTL-ALERT] High TTL webhook sent.")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# TTL ALERT TASK
|
||||
# ============================================================================
|
||||
|
||||
@tasks.loop(minutes=10)
async def ttl_alert_task():
    """Periodic task: run the TTL alert check and record its outcome.

    Runs ``_execute_ttl_alert_check`` every ten minutes. A clean run is
    recorded under the task name ``"ttl_alert"`` as a success; any exception
    raised inside the ``try`` body is recorded as a failure with its message
    and then logged, so the exception does not escape the task body.
    """
    try:
        await _execute_ttl_alert_check()
        await _record("ttl_alert", True)
    except Exception as exc:
        failure_reason = str(exc)
        await _record("ttl_alert", False, failure_reason)
        logging.error("[TTL-ALERT] Failed to check TTL alert state: %s", exc)
|
||||
|
||||
|
||||
@ttl_alert_task.before_loop
async def before_ttl_alert_task():
    """Delay the first ``ttl_alert_task`` iteration until the bot is ready."""
    bot = get_bot()
    await bot.wait_until_ready()
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# LEADERBOARD ALARM TASK
|
||||
# ============================================================================
|
||||
@@ -516,6 +629,7 @@ async def start_all_tasks():
|
||||
replay_cleanup_task.start()
|
||||
health_heartbeat_task.start()
|
||||
weekly_br_report_task.start()
|
||||
ttl_alert_task.start()
|
||||
# Phase 2: WebSocket listeners
|
||||
ws_autolog_task.start()
|
||||
ws_gob_task.start()
|
||||
@@ -541,3 +655,4 @@ def stop_all_tasks():
|
||||
sync_guild_metas_task.cancel()
|
||||
health_heartbeat_task.cancel()
|
||||
weekly_br_report_task.cancel()
|
||||
ttl_alert_task.cancel()
|
||||
|
||||
Reference in New Issue
Block a user