added feat
This commit is contained in:
+11
-25
@@ -53,7 +53,7 @@ from .game_api import (
|
|||||||
obtain_clan_new_points,
|
obtain_clan_new_points,
|
||||||
obtain_clans_leaderboard,
|
obtain_clans_leaderboard,
|
||||||
)
|
)
|
||||||
from .health import init_health, get_health_snapshot
|
from .health import init_health, get_health_snapshot, get_recent_ttl_stats
|
||||||
from .utils import t, guild_lang
|
from .utils import t, guild_lang
|
||||||
from .lux_apis import fetch_replay_by_id
|
from .lux_apis import fetch_replay_by_id
|
||||||
from .meta_manager import (
|
from .meta_manager import (
|
||||||
@@ -8823,17 +8823,10 @@ async def bot_status_public(interaction: discord.Interaction):
|
|||||||
avg_delay: int | None = None
|
avg_delay: int | None = None
|
||||||
sample_size = 0
|
sample_size = 0
|
||||||
try:
|
try:
|
||||||
async with aiosqlite.connect(SQ_BATTLES_DB_PATH, timeout=10.0) as db:
|
stats = await get_recent_ttl_stats(limit=30)
|
||||||
rows = list(await db.execute_fetchall(
|
avg_delay = stats["avg_delay"]
|
||||||
"SELECT endtime_unix, received_unix FROM match_summary "
|
sample_size = stats["sample_size"]
|
||||||
"WHERE received_unix IS NOT NULL AND endtime_unix IS NOT NULL "
|
last_received_ts = stats["last_received_ts"]
|
||||||
"ORDER BY endtime_unix DESC LIMIT 30"
|
|
||||||
))
|
|
||||||
if rows:
|
|
||||||
delays = [max(int(r[1]) - int(r[0]), 0) for r in rows]
|
|
||||||
avg_delay = int(sum(delays) / len(delays))
|
|
||||||
sample_size = len(delays)
|
|
||||||
last_received_ts = max(int(r[1]) for r in rows)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.exception("Failed to compute /bot-status TTL stats")
|
logging.exception("Failed to compute /bot-status TTL stats")
|
||||||
|
|
||||||
@@ -8989,23 +8982,16 @@ async def bot_status(interaction: discord.Interaction):
|
|||||||
|
|
||||||
# Avg TTL (Spectra receive delay) for the last 30 games
|
# Avg TTL (Spectra receive delay) for the last 30 games
|
||||||
try:
|
try:
|
||||||
async with aiosqlite.connect(SQ_BATTLES_DB_PATH, timeout=10.0) as db:
|
ttl_stats = await get_recent_ttl_stats(limit=30)
|
||||||
ttl_rows = list(await db.execute_fetchall(
|
if ttl_stats["avg_delay"] is not None:
|
||||||
"SELECT endtime_unix, received_unix FROM match_summary "
|
a_min, a_sec = divmod(ttl_stats["avg_delay"], 60)
|
||||||
"WHERE received_unix IS NOT NULL AND endtime_unix IS NOT NULL "
|
mn_min, mn_sec = divmod(ttl_stats["min_delay"], 60)
|
||||||
"ORDER BY endtime_unix DESC LIMIT 30"
|
mx_min, mx_sec = divmod(ttl_stats["max_delay"], 60)
|
||||||
))
|
|
||||||
if ttl_rows:
|
|
||||||
delays = [max(int(r[1]) - int(r[0]), 0) for r in ttl_rows]
|
|
||||||
avg = sum(delays) / len(delays)
|
|
||||||
a_min, a_sec = divmod(int(avg), 60)
|
|
||||||
mn_min, mn_sec = divmod(min(delays), 60)
|
|
||||||
mx_min, mx_sec = divmod(max(delays), 60)
|
|
||||||
ttl_value = (
|
ttl_value = (
|
||||||
f"**Avg:** {a_min}m{a_sec:02d}s • "
|
f"**Avg:** {a_min}m{a_sec:02d}s • "
|
||||||
f"**Min:** {mn_min}m{mn_sec:02d}s • "
|
f"**Min:** {mn_min}m{mn_sec:02d}s • "
|
||||||
f"**Max:** {mx_min}m{mx_sec:02d}s • "
|
f"**Max:** {mx_min}m{mx_sec:02d}s • "
|
||||||
f"**N:** {len(delays)}"
|
f"**N:** {ttl_stats['sample_size']}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
ttl_value = t("en", "dev.health_never")
|
ttl_value = t("en", "dev.health_never")
|
||||||
|
|||||||
+31
-1
@@ -14,9 +14,10 @@ from pathlib import Path
|
|||||||
|
|
||||||
# Third-Party Library Imports
|
# Third-Party Library Imports
|
||||||
import aiofiles
|
import aiofiles
|
||||||
|
import aiosqlite
|
||||||
|
|
||||||
# Local Module Imports
|
# Local Module Imports
|
||||||
from .utils import STORAGE_DIR, get_bot
|
from .utils import STORAGE_DIR, SQ_BATTLES_DB_PATH, get_bot
|
||||||
|
|
||||||
HEALTH_PATH = STORAGE_DIR / "bot_health.json"
|
HEALTH_PATH = STORAGE_DIR / "bot_health.json"
|
||||||
|
|
||||||
@@ -130,3 +131,32 @@ async def get_health_snapshot() -> dict:
|
|||||||
_health_state["games_processed_24h"] = games_24h
|
_health_state["games_processed_24h"] = games_24h
|
||||||
|
|
||||||
return dict(_health_state)
|
return dict(_health_state)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_recent_ttl_stats(limit: int = 30) -> dict:
    """Summarize how late the most recent games arrived after they ended.

    Reads at most ``limit`` rows from ``match_summary``, newest
    ``endtime_unix`` first, ignoring rows where either timestamp is NULL.
    The delay of a row is ``received_unix - endtime_unix``, clamped at zero.

    Returns:
        A dict with the keys ``sample_size``, ``avg_delay``, ``min_delay``,
        ``max_delay`` and ``last_received_ts``. When no row matches,
        ``sample_size`` is 0 and every other value is ``None``.
    """
    query = (
        "SELECT endtime_unix, received_unix FROM match_summary "
        "WHERE received_unix IS NOT NULL AND endtime_unix IS NOT NULL "
        "ORDER BY endtime_unix DESC LIMIT ?"
    )
    async with aiosqlite.connect(SQ_BATTLES_DB_PATH, timeout=10.0) as db:
        fetched = list(await db.execute_fetchall(query, (limit,)))

    summary = {
        "sample_size": 0,
        "avg_delay": None,
        "min_delay": None,
        "max_delay": None,
        "last_received_ts": None,
    }
    if not fetched:
        return summary

    # A game received "before" it ended (clock skew) counts as zero delay.
    gaps = []
    for end_ts, recv_ts in fetched:
        gaps.append(max(int(recv_ts) - int(end_ts), 0))
    arrival_times = [int(recv_ts) for _, recv_ts in fetched]

    summary["sample_size"] = len(gaps)
    summary["avg_delay"] = int(sum(gaps) / len(gaps))
    summary["min_delay"] = min(gaps)
    summary["max_delay"] = max(gaps)
    summary["last_received_ts"] = max(arrival_times)
    return summary
|
||||||
|
|||||||
+116
-1
@@ -8,20 +8,23 @@ on configured intervals.
|
|||||||
|
|
||||||
# Standard Library Imports
|
# Standard Library Imports
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import os
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import shutil
|
import shutil
|
||||||
|
import time
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, Optional
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
# Third-Party Library Imports
|
# Third-Party Library Imports
|
||||||
|
import aiohttp
|
||||||
from discord.ext import tasks
|
from discord.ext import tasks
|
||||||
|
|
||||||
# Local Module Imports
|
# Local Module Imports
|
||||||
from . import lux_apis
|
from . import lux_apis
|
||||||
from .autologging import handle_ws_replays, handle_gob_message
|
from .autologging import handle_ws_replays, handle_gob_message
|
||||||
from .health import record_task_run, write_heartbeat
|
from .health import get_recent_ttl_stats, record_task_run, write_heartbeat
|
||||||
from .meta_manager import process_all_players, sync_all_guild_metas
|
from .meta_manager import process_all_players, sync_all_guild_metas
|
||||||
from .task_executors import (
|
from .task_executors import (
|
||||||
execute_ldb_alarm_task,
|
execute_ldb_alarm_task,
|
||||||
@@ -50,6 +53,116 @@ async def _record(task_name: str, success: bool, error: str = ""):
|
|||||||
await record_task_run(task_name, success, error)
|
await record_task_run(task_name, success, error)
|
||||||
|
|
||||||
|
|
||||||
|
def _format_duration(seconds: int) -> str:
    """Render a duration in seconds as ``"<minutes>m <seconds>s"``.

    Args:
        seconds: Duration in seconds. Non-integer values are truncated
            toward zero by ``int()``.

    Returns:
        The duration with whole minutes and a zero-padded two-digit seconds
        part, e.g. ``"2m 05s"``. A negative duration is formatted by
        magnitude with a leading ``-`` (``-5`` gives ``"-0m 05s"``).
    """
    total = int(seconds)
    # divmod() floors, so applying it to a negative value would borrow a
    # minute (-5 -> (-1, 55)); split the magnitude and re-attach the sign.
    sign = "-" if total < 0 else ""
    minutes, rem = divmod(abs(total), 60)
    return f"{sign}{minutes}m {rem:02d}s"
|
||||||
|
|
||||||
|
|
||||||
|
async def _send_ttl_alert_webhook(content: str) -> bool:
    """POST ``content`` to the webhook named by SREBOT_TTL_ALERT_WEBHOOK_URL.

    Args:
        content: Message text, sent as the ``content`` field of a JSON body.

    Returns:
        ``True`` when the webhook answers with a 2xx status. ``False`` when
        the environment variable is missing or blank (a warning is logged)
        or when the response status is outside 2xx (the status and the
        first 500 characters of the response body are logged).
    """
    target = os.environ.get("SREBOT_TTL_ALERT_WEBHOOK_URL", "").strip()
    if target == "":
        logging.warning("[TTL-ALERT] SREBOT_TTL_ALERT_WEBHOOK_URL is not configured.")
        return False

    payload = {"content": content}
    async with aiohttp.ClientSession(
        timeout=aiohttp.ClientTimeout(total=15)
    ) as http, http.post(target, json=payload) as reply:
        delivered = 200 <= reply.status < 300
        if not delivered:
            # Read the body while the response is still open.
            detail = await reply.text()
            logging.error("[TTL-ALERT] Webhook failed with HTTP %s: %s", reply.status, detail[:500])
        return delivered
|
||||||
|
|
||||||
|
|
||||||
|
# Average receive delay above which the hold timer starts running.
_TTL_ALERT_HIGH_THRESHOLD_SECONDS = 20 * 60
# Average receive delay below which an active alert is cleared. While an
# alert is active, averages at or above this value leave it active.
_TTL_ALERT_RECOVERY_THRESHOLD_SECONDS = 10 * 60
# How long the average must stay above the high threshold before alerting.
_TTL_ALERT_HOLD_SECONDS = 10 * 60
# time.monotonic() reading taken when the average was first seen above the
# high threshold; None while the average is not elevated.
_ttl_alert_high_since: Optional[float] = None
# True from a successfully delivered alert until a delivered recovery notice.
_ttl_alert_active = False


async def _execute_ttl_alert_check() -> None:
    """Run one pass of the TTL alert state machine.

    Reads the stats of the last 30 games via ``get_recent_ttl_stats`` and
    updates the module-level alert state:

    * No data: the hold timer is cleared and nothing else happens.
    * No alert active, average at or below the high threshold: the hold
      timer is cleared.
    * No alert active, average above the high threshold: the first such
      pass starts the hold timer; a later pass sends the alert webhook once
      the hold period has elapsed. The alert only becomes active if the
      webhook was delivered, so a failed delivery is retried next pass.
    * Alert active, average below the recovery threshold: a recovery
      webhook is sent; the state is reset only if it was delivered.

    The hold timer is measured with ``time.monotonic()`` so that wall-clock
    adjustments cannot fire or suppress the alert.
    """
    global _ttl_alert_high_since, _ttl_alert_active

    stats = await get_recent_ttl_stats(limit=30)
    avg_delay = stats["avg_delay"]
    sample_size = stats["sample_size"]
    last_received_ts = stats["last_received_ts"]

    if avg_delay is None:
        _ttl_alert_high_since = None
        logging.info("[TTL-ALERT] No TTL data available.")
        return

    # Shared trailer for both messages; omitted when no timestamp is known.
    received_note = (
        f"\nLast game received: <t:{int(last_received_ts)}:R>."
        if last_received_ts
        else ""
    )
    now = time.monotonic()

    if _ttl_alert_active:
        if avg_delay < _TTL_ALERT_RECOVERY_THRESHOLD_SECONDS:
            content = (
                ":white_check_mark: **SRE Bot TTL recovered**\n"
                f"Average TTL across the last {sample_size} games is now "
                f"**{_format_duration(avg_delay)}**, below "
                f"{_format_duration(_TTL_ALERT_RECOVERY_THRESHOLD_SECONDS)}. "
                "Back at normal operations."
            ) + received_note
            if await _send_ttl_alert_webhook(content):
                _ttl_alert_active = False
                _ttl_alert_high_since = None
                logging.info("[TTL-ALERT] Recovery webhook sent.")
        return

    if avg_delay <= _TTL_ALERT_HIGH_THRESHOLD_SECONDS:
        # Back under the threshold before the hold elapsed: start over.
        _ttl_alert_high_since = None
        return

    if _ttl_alert_high_since is None:
        _ttl_alert_high_since = now
        logging.warning(
            "[TTL-ALERT] Avg TTL above threshold: %s across %s games.",
            _format_duration(avg_delay),
            sample_size,
        )
        return

    # NOTE(review): the hold period equals the 10-minute interval of
    # ttl_alert_task, so a pass that lands marginally early waits one more
    # cycle before alerting -- confirm that is acceptable.
    if now - _ttl_alert_high_since < _TTL_ALERT_HOLD_SECONDS:
        return

    content = (
        ":warning: **SRE Bot TTL alert**\n"
        f"Average TTL across the last {sample_size} games is "
        f"**{_format_duration(avg_delay)}**, and has stayed above "
        f"{_format_duration(_TTL_ALERT_HIGH_THRESHOLD_SECONDS)} for at least "
        f"{_format_duration(_TTL_ALERT_HOLD_SECONDS)}.\n"
        "Historical game messages may be delayed or out of date."
    ) + received_note
    if await _send_ttl_alert_webhook(content):
        _ttl_alert_active = True
        logging.warning("[TTL-ALERT] High TTL webhook sent.")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# TTL ALERT TASK
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@tasks.loop(minutes=10)
async def ttl_alert_task():
    """Alert when recent game receive TTL is elevated for a sustained period.

    Runs ``_execute_ttl_alert_check`` every 10 minutes and records the
    outcome under the task name ``"ttl_alert"``. Exceptions raised by the
    check are logged and recorded as a failed run rather than propagated.
    """
    try:
        await _execute_ttl_alert_check()
        await _record("ttl_alert", True)
    except Exception as e:
        # Log first, with the traceback, so the failure is still visible
        # when recording the failed run raises as well.
        logging.exception("[TTL-ALERT] Failed to check TTL alert state: %s", e)
        await _record("ttl_alert", False, str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@ttl_alert_task.before_loop
async def before_ttl_alert_task():
    """Hold back the first TTL alert check until the bot reports ready."""
    bot = get_bot()
    await bot.wait_until_ready()
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# LEADERBOARD ALARM TASK
|
# LEADERBOARD ALARM TASK
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@@ -516,6 +629,7 @@ async def start_all_tasks():
|
|||||||
replay_cleanup_task.start()
|
replay_cleanup_task.start()
|
||||||
health_heartbeat_task.start()
|
health_heartbeat_task.start()
|
||||||
weekly_br_report_task.start()
|
weekly_br_report_task.start()
|
||||||
|
ttl_alert_task.start()
|
||||||
# Phase 2: WebSocket listeners
|
# Phase 2: WebSocket listeners
|
||||||
ws_autolog_task.start()
|
ws_autolog_task.start()
|
||||||
ws_gob_task.start()
|
ws_gob_task.start()
|
||||||
@@ -541,3 +655,4 @@ def stop_all_tasks():
|
|||||||
sync_guild_metas_task.cancel()
|
sync_guild_metas_task.cancel()
|
||||||
health_heartbeat_task.cancel()
|
health_heartbeat_task.cancel()
|
||||||
weekly_br_report_task.cancel()
|
weekly_br_report_task.cancel()
|
||||||
|
ttl_alert_task.cancel()
|
||||||
|
|||||||
@@ -35,6 +35,7 @@
|
|||||||
SREBOT_API_PORT=6000
|
SREBOT_API_PORT=6000
|
||||||
SREBOT_WEB_PORT=3001
|
SREBOT_WEB_PORT=3001
|
||||||
SREBOT_WEBHOOK_PORT=9000
|
SREBOT_WEBHOOK_PORT=9000
|
||||||
|
SREBOT_TTL_ALERT_WEBHOOK_URL=https://discord.com/api/webhooks/...
|
||||||
SREBOT_EXTERNAL_HOST=0.0.0.0
|
SREBOT_EXTERNAL_HOST=0.0.0.0
|
||||||
SREBOT_EXTERNAL_PORT=18081
|
SREBOT_EXTERNAL_PORT=18081
|
||||||
SREBOT_EXTERNAL_BEARER_TOKEN=your_external_bridge_token # Optional, protects the bridge API and websocket
|
SREBOT_EXTERNAL_BEARER_TOKEN=your_external_bridge_token # Optional, protects the bridge API and websocket
|
||||||
|
|||||||
Reference in New Issue
Block a user