whoops (#1272)

2026-05-27 04:37:03 -07:00
parent 0c99a911e0
commit f042fd4d8a
1 changed file with 35 additions and 39 deletions
@@ -96,58 +96,54 @@ async def main(dry_run: bool) -> None:
    gz_files = sorted(REPLAYS_DIR.glob("*/replay_data.json.gz"))
    log.info("Found %d replay files in %s", len(gz_files), REPLAYS_DIR)
-    pending: list[dict] = []
+    # Session ID is the directory name — no file reads needed to find missing ones
-    skipped = 0
+    missing_paths = [p for p in gz_files if p.parent.name not in already]
-    unreadable = 0
+    skipped = len(gz_files) - len(missing_paths)
    for gz_path in gz_files:
        session_id = gz_path.parent.name
        if session_id in already:
            skipped += 1
            continue
        replay = _load_replay(gz_path)
        if replay is None:
            unreadable += 1
            continue
        game = _game_dict_from_replay(replay, gz_path)
        if game is None:
            log.warning("Could not extract game dict from %s", gz_path)
            unreadable += 1
            continue
        pending.append(game)
    log.info(
-        "Summary: %d to backfill | %d already in DB | %d unreadable",
+        "Summary: %d to backfill | %d already in DB",
-        len(pending), skipped, unreadable,
+        len(missing_paths), skipped,
    )
-    if not pending:
+    if not missing_paths:
        log.info("Nothing to do.")
        return
    if dry_run:
-        log.info("[DRY RUN] Would backfill %d matches — not writing.", len(pending))
+        log.info("[DRY RUN] Would backfill %d matches — not writing.", len(missing_paths))
-        for g in pending[:10]:
+        for p in missing_paths[:10]:
-            log.info("  %s  endTime=%s  map=%r", g["sessionIdHex"], g["endTime"], g["missionName"])
+            log.info("  %s", p.parent.name)
-        if len(pending) > 10:
+        if len(missing_paths) > 10:
-            log.info("  ... and %d more", len(pending) - 10)
+            log.info("  ... and %d more", len(missing_paths) - 10)
        return
-    # Process in batches of 50 to keep memory flat and give progress feedback
+    # Read and write 50 replays at a time so only one batch of game dicts is held in memory
    batch_size = 50
-    total = len(pending)
+    total = len(missing_paths)
    done = 0
    unreadable = 0
    for i in range(0, total, batch_size):
-        batch = pending[i : i + batch_size]
+        batch_paths = missing_paths[i : i + batch_size]
        batch: list[dict] = []
        for gz_path in batch_paths:
            replay = _load_replay(gz_path)
            if replay is None:
                unreadable += 1
                continue
            game = _game_dict_from_replay(replay, gz_path)
            if game is None:
                unreadable += 1
                continue
            batch.append(game)
        if batch:
            await process_match_summaries(batch)
            done += len(batch)
            log.info("Backfilled %d / %d", done, total)
-    log.info("Done. %d match_summary rows written.", total)
+    log.info("Done. %d written, %d unreadable.", done, unreadable)
 if __name__ == "__main__":