From f042fd4d8a890af3e1b1a6a1f42e9c42eb997d95 Mon Sep 17 00:00:00 2001 From: NotSoToothless <67082114+FURRO404@users.noreply.github.com> Date: Wed, 27 May 2026 04:37:03 -0700 Subject: [PATCH] whoops (#1272) --- scripts/backfill_match_summary.py | 74 +++++++++++++++---------------- 1 file changed, 35 insertions(+), 39 deletions(-) diff --git a/scripts/backfill_match_summary.py b/scripts/backfill_match_summary.py index 6886848..bd4e41f 100644 --- a/scripts/backfill_match_summary.py +++ b/scripts/backfill_match_summary.py @@ -96,58 +96,54 @@ async def main(dry_run: bool) -> None: gz_files = sorted(REPLAYS_DIR.glob("*/replay_data.json.gz")) log.info("Found %d replay files in %s", len(gz_files), REPLAYS_DIR) - pending: list[dict] = [] - skipped = 0 - unreadable = 0 - - for gz_path in gz_files: - session_id = gz_path.parent.name - - if session_id in already: - skipped += 1 - continue - - replay = _load_replay(gz_path) - if replay is None: - unreadable += 1 - continue - - game = _game_dict_from_replay(replay, gz_path) - if game is None: - log.warning("Could not extract game dict from %s", gz_path) - unreadable += 1 - continue - - pending.append(game) + # Session ID is the directory name — no file reads needed to find missing ones + missing_paths = [p for p in gz_files if p.parent.name not in already] + skipped = len(gz_files) - len(missing_paths) log.info( - "Summary: %d to backfill | %d already in DB | %d unreadable", - len(pending), skipped, unreadable, + "Summary: %d to backfill | %d already in DB", + len(missing_paths), skipped, ) - if not pending: + if not missing_paths: log.info("Nothing to do.") return if dry_run: - log.info("[DRY RUN] Would backfill %d matches — not writing.", len(pending)) - for g in pending[:10]: - log.info(" %s endTime=%s map=%r", g["sessionIdHex"], g["endTime"], g["missionName"]) - if len(pending) > 10: - log.info(" ... and %d more", len(pending) - 10) + log.info("[DRY RUN] Would backfill %d matches — not writing.", len(missing_paths)) + for p in missing_paths[:10]: + log.info(" %s", p.parent.name) + if len(missing_paths) > 10: + log.info(" ... and %d more", len(missing_paths) - 10) return - # Process in batches of 50 to keep memory flat and give progress feedback + # Read files and write in batches of 50 batch_size = 50 - total = len(pending) + total = len(missing_paths) done = 0 - for i in range(0, total, batch_size): - batch = pending[i : i + batch_size] - await process_match_summaries(batch) - done += len(batch) - log.info("Backfilled %d / %d", done, total) + unreadable = 0 - log.info("Done. %d match_summary rows written.", total) + for i in range(0, total, batch_size): + batch_paths = missing_paths[i : i + batch_size] + batch: list[dict] = [] + + for gz_path in batch_paths: + replay = _load_replay(gz_path) + if replay is None: + unreadable += 1 + continue + game = _game_dict_from_replay(replay, gz_path) + if game is None: + unreadable += 1 + continue + batch.append(game) + + if batch: + await process_match_summaries(batch) + done += len(batch) + log.info("Backfilled %d / %d", done, total) + + log.info("Done. %d written, %d unreadable.", done, unreadable) if __name__ == "__main__":