whoops (#1272)
This commit is contained in:
@@ -96,58 +96,54 @@ async def main(dry_run: bool) -> None:
|
|||||||
gz_files = sorted(REPLAYS_DIR.glob("*/replay_data.json.gz"))
|
gz_files = sorted(REPLAYS_DIR.glob("*/replay_data.json.gz"))
|
||||||
log.info("Found %d replay files in %s", len(gz_files), REPLAYS_DIR)
|
log.info("Found %d replay files in %s", len(gz_files), REPLAYS_DIR)
|
||||||
|
|
||||||
pending: list[dict] = []
|
# Session ID is the directory name — no file reads needed to find missing ones
|
||||||
skipped = 0
|
missing_paths = [p for p in gz_files if p.parent.name not in already]
|
||||||
unreadable = 0
|
skipped = len(gz_files) - len(missing_paths)
|
||||||
|
|
||||||
for gz_path in gz_files:
|
|
||||||
session_id = gz_path.parent.name
|
|
||||||
|
|
||||||
if session_id in already:
|
|
||||||
skipped += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
replay = _load_replay(gz_path)
|
|
||||||
if replay is None:
|
|
||||||
unreadable += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
game = _game_dict_from_replay(replay, gz_path)
|
|
||||||
if game is None:
|
|
||||||
log.warning("Could not extract game dict from %s", gz_path)
|
|
||||||
unreadable += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
pending.append(game)
|
|
||||||
|
|
||||||
log.info(
|
log.info(
|
||||||
"Summary: %d to backfill | %d already in DB | %d unreadable",
|
"Summary: %d to backfill | %d already in DB",
|
||||||
len(pending), skipped, unreadable,
|
len(missing_paths), skipped,
|
||||||
)
|
)
|
||||||
|
|
||||||
if not pending:
|
if not missing_paths:
|
||||||
log.info("Nothing to do.")
|
log.info("Nothing to do.")
|
||||||
return
|
return
|
||||||
|
|
||||||
if dry_run:
|
if dry_run:
|
||||||
log.info("[DRY RUN] Would backfill %d matches — not writing.", len(pending))
|
log.info("[DRY RUN] Would backfill %d matches — not writing.", len(missing_paths))
|
||||||
for g in pending[:10]:
|
for p in missing_paths[:10]:
|
||||||
log.info(" %s endTime=%s map=%r", g["sessionIdHex"], g["endTime"], g["missionName"])
|
log.info(" %s", p.parent.name)
|
||||||
if len(pending) > 10:
|
if len(missing_paths) > 10:
|
||||||
log.info(" ... and %d more", len(pending) - 10)
|
log.info(" ... and %d more", len(missing_paths) - 10)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Process in batches of 50 to keep memory flat and give progress feedback
|
# Read files and write in batches of 50
|
||||||
batch_size = 50
|
batch_size = 50
|
||||||
total = len(pending)
|
total = len(missing_paths)
|
||||||
done = 0
|
done = 0
|
||||||
|
unreadable = 0
|
||||||
|
|
||||||
for i in range(0, total, batch_size):
|
for i in range(0, total, batch_size):
|
||||||
batch = pending[i : i + batch_size]
|
batch_paths = missing_paths[i : i + batch_size]
|
||||||
|
batch: list[dict] = []
|
||||||
|
|
||||||
|
for gz_path in batch_paths:
|
||||||
|
replay = _load_replay(gz_path)
|
||||||
|
if replay is None:
|
||||||
|
unreadable += 1
|
||||||
|
continue
|
||||||
|
game = _game_dict_from_replay(replay, gz_path)
|
||||||
|
if game is None:
|
||||||
|
unreadable += 1
|
||||||
|
continue
|
||||||
|
batch.append(game)
|
||||||
|
|
||||||
|
if batch:
|
||||||
await process_match_summaries(batch)
|
await process_match_summaries(batch)
|
||||||
done += len(batch)
|
done += len(batch)
|
||||||
log.info("Backfilled %d / %d", done, total)
|
log.info("Backfilled %d / %d", done, total)
|
||||||
|
|
||||||
log.info("Done. %d match_summary rows written.", total)
|
log.info("Done. %d written, %d unreadable.", done, unreadable)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user