whoops (#1272)
This commit is contained in:
@@ -96,58 +96,54 @@ async def main(dry_run: bool) -> None:
|
||||
gz_files = sorted(REPLAYS_DIR.glob("*/replay_data.json.gz"))
|
||||
log.info("Found %d replay files in %s", len(gz_files), REPLAYS_DIR)
|
||||
|
||||
pending: list[dict] = []
|
||||
skipped = 0
|
||||
unreadable = 0
|
||||
|
||||
for gz_path in gz_files:
|
||||
session_id = gz_path.parent.name
|
||||
|
||||
if session_id in already:
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
replay = _load_replay(gz_path)
|
||||
if replay is None:
|
||||
unreadable += 1
|
||||
continue
|
||||
|
||||
game = _game_dict_from_replay(replay, gz_path)
|
||||
if game is None:
|
||||
log.warning("Could not extract game dict from %s", gz_path)
|
||||
unreadable += 1
|
||||
continue
|
||||
|
||||
pending.append(game)
|
||||
# Session ID is the directory name — no file reads needed to find missing ones
|
||||
missing_paths = [p for p in gz_files if p.parent.name not in already]
|
||||
skipped = len(gz_files) - len(missing_paths)
|
||||
|
||||
log.info(
|
||||
"Summary: %d to backfill | %d already in DB | %d unreadable",
|
||||
len(pending), skipped, unreadable,
|
||||
"Summary: %d to backfill | %d already in DB",
|
||||
len(missing_paths), skipped,
|
||||
)
|
||||
|
||||
if not pending:
|
||||
if not missing_paths:
|
||||
log.info("Nothing to do.")
|
||||
return
|
||||
|
||||
if dry_run:
|
||||
log.info("[DRY RUN] Would backfill %d matches — not writing.", len(pending))
|
||||
for g in pending[:10]:
|
||||
log.info(" %s endTime=%s map=%r", g["sessionIdHex"], g["endTime"], g["missionName"])
|
||||
if len(pending) > 10:
|
||||
log.info(" ... and %d more", len(pending) - 10)
|
||||
log.info("[DRY RUN] Would backfill %d matches — not writing.", len(missing_paths))
|
||||
for p in missing_paths[:10]:
|
||||
log.info(" %s", p.parent.name)
|
||||
if len(missing_paths) > 10:
|
||||
log.info(" ... and %d more", len(missing_paths) - 10)
|
||||
return
|
||||
|
||||
# Process in batches of 50 to keep memory flat and give progress feedback
|
||||
# Read files and write in batches of 50
|
||||
batch_size = 50
|
||||
total = len(pending)
|
||||
total = len(missing_paths)
|
||||
done = 0
|
||||
unreadable = 0
|
||||
|
||||
for i in range(0, total, batch_size):
|
||||
batch = pending[i : i + batch_size]
|
||||
batch_paths = missing_paths[i : i + batch_size]
|
||||
batch: list[dict] = []
|
||||
|
||||
for gz_path in batch_paths:
|
||||
replay = _load_replay(gz_path)
|
||||
if replay is None:
|
||||
unreadable += 1
|
||||
continue
|
||||
game = _game_dict_from_replay(replay, gz_path)
|
||||
if game is None:
|
||||
unreadable += 1
|
||||
continue
|
||||
batch.append(game)
|
||||
|
||||
if batch:
|
||||
await process_match_summaries(batch)
|
||||
done += len(batch)
|
||||
log.info("Backfilled %d / %d", done, total)
|
||||
|
||||
log.info("Done. %d match_summary rows written.", total)
|
||||
log.info("Done. %d written, %d unreadable.", done, unreadable)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user