This commit is contained in:
NotSoToothless
2026-05-27 04:37:03 -07:00
committed by GitHub
parent 0c99a911e0
commit f042fd4d8a
+35 -39
View File
@@ -96,58 +96,54 @@ async def main(dry_run: bool) -> None:
gz_files = sorted(REPLAYS_DIR.glob("*/replay_data.json.gz")) gz_files = sorted(REPLAYS_DIR.glob("*/replay_data.json.gz"))
log.info("Found %d replay files in %s", len(gz_files), REPLAYS_DIR) log.info("Found %d replay files in %s", len(gz_files), REPLAYS_DIR)
pending: list[dict] = [] # Session ID is the directory name — no file reads needed to find missing ones
skipped = 0 missing_paths = [p for p in gz_files if p.parent.name not in already]
unreadable = 0 skipped = len(gz_files) - len(missing_paths)
for gz_path in gz_files:
session_id = gz_path.parent.name
if session_id in already:
skipped += 1
continue
replay = _load_replay(gz_path)
if replay is None:
unreadable += 1
continue
game = _game_dict_from_replay(replay, gz_path)
if game is None:
log.warning("Could not extract game dict from %s", gz_path)
unreadable += 1
continue
pending.append(game)
log.info( log.info(
"Summary: %d to backfill | %d already in DB | %d unreadable", "Summary: %d to backfill | %d already in DB",
len(pending), skipped, unreadable, len(missing_paths), skipped,
) )
if not pending: if not missing_paths:
log.info("Nothing to do.") log.info("Nothing to do.")
return return
if dry_run: if dry_run:
log.info("[DRY RUN] Would backfill %d matches — not writing.", len(pending)) log.info("[DRY RUN] Would backfill %d matches — not writing.", len(missing_paths))
for g in pending[:10]: for p in missing_paths[:10]:
log.info(" %s endTime=%s map=%r", g["sessionIdHex"], g["endTime"], g["missionName"]) log.info(" %s", p.parent.name)
if len(pending) > 10: if len(missing_paths) > 10:
log.info(" ... and %d more", len(pending) - 10) log.info(" ... and %d more", len(missing_paths) - 10)
return return
# Process in batches of 50 to keep memory flat and give progress feedback # Read files and write in batches of 50
batch_size = 50 batch_size = 50
total = len(pending) total = len(missing_paths)
done = 0 done = 0
for i in range(0, total, batch_size): unreadable = 0
batch = pending[i : i + batch_size]
await process_match_summaries(batch)
done += len(batch)
log.info("Backfilled %d / %d", done, total)
log.info("Done. %d match_summary rows written.", total) for i in range(0, total, batch_size):
batch_paths = missing_paths[i : i + batch_size]
batch: list[dict] = []
for gz_path in batch_paths:
replay = _load_replay(gz_path)
if replay is None:
unreadable += 1
continue
game = _game_dict_from_replay(replay, gz_path)
if game is None:
unreadable += 1
continue
batch.append(game)
if batch:
await process_match_summaries(batch)
done += len(batch)
log.info("Backfilled %d / %d", done, total)
log.info("Done. %d written, %d unreadable.", done, unreadable)
if __name__ == "__main__": if __name__ == "__main__":