Files
SHARED/data_parser.py

448 lines
16 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
data_parser.py
Parses War Thunder VROMFS game data files:
- char.vromfs.bin → UnitTags: vehicle classification (fighter, bomber, tank, etc.)
- lang.vromfs.bin → LangTableReader: vehicle name translation (internal ID → display name)
- lang.vromfs.bin → WeaponTableReader: weapon/ammo name translation (internal ID → display name)
"""
# Standard Library Imports
import csv
import logging
import re
import sys
from io import StringIO
from pathlib import Path
# Ensure SHARED dir is on sys.path so DAGOR_FILES (sibling package) imports work
_shared_root = Path(__file__).resolve().parent
if str(_shared_root) not in sys.path:
sys.path.insert(0, str(_shared_root))
# Third-Party Library Imports
from DAGOR_FILES.WtFileUtils.vromfs.VROMFs import VROMFs
_BOT_DIR = Path(__file__).resolve().parent
# ---------------------------------------------------------------------------
# Unit tags — lazy-loaded from char.vromfs.bin
# ---------------------------------------------------------------------------
# Maps a specific ``type_*`` unit tag to the full display type returned by
# UnitTags.get_unit_type(). Several tags deliberately collapse to one label:
# heavy/medium/tank-destroyer/missile tanks all read "Tank", and strike
# aircraft are reported as "Fighter".
# NOTE(review): unlike _TAG_TO_ABBREV this table has no generic "tank" /
# "helicopter" keys, so a unit carrying only the generic "tank" tag gets an
# abbreviation but no full type — confirm that asymmetry is intentional.
_TAG_TO_TYPE = {
"type_spaa": "SPAA",
"type_light_tank": "Light Tank",
"type_heavy_tank": "Tank",
"type_medium_tank": "Tank",
"type_tank_destroyer": "Tank",
"type_missile_tank": "Tank",
"type_fighter": "Fighter",
"type_strike_aircraft": "Fighter",
"type_bomber": "Bomber",
"type_helicopter": "Helicopter",
}
# Maps a unit tag to the short abbreviation returned by
# UnitTags.get_unit_type_abbrev(). Besides the specific ``type_*`` tags it
# also lists the generic "tank" and "helicopter" tags; UnitTags._best_match()
# only uses those generic entries as a fallback when no ``type_*`` tag matches.
_TAG_TO_ABBREV = {
"type_spaa": "AA",
"type_light_tank": "L",
"type_heavy_tank": "T",
"type_medium_tank": "T",
"type_tank_destroyer": "T",
"type_missile_tank": "T",
"tank": "T",
"type_fighter": "F",
"type_strike_aircraft": "F",
"type_bomber": "B",
"helicopter": "H",
"type_helicopter": "H",
}
class UnitTags:
"""Lazy-loaded lookup for vehicle classification from char.vromfs.bin."""
_instance: "UnitTags | None" = None
def __init__(self):
self._data: dict | None = None
self._lowercase_map: dict[str, str] | None = None
@classmethod
def get(cls) -> "UnitTags":
"""Return the singleton UnitTags instance, creating it on first call.
Returns:
The shared UnitTags instance.
"""
if cls._instance is None:
cls._instance = cls()
return cls._instance
def _ensure_loaded(self):
"""Parse char.vromfs.bin and populate ``_data`` and ``_lowercase_map`` if not already loaded."""
if self._data is not None:
return
path = _BOT_DIR / "char.vromfs.bin"
v = VROMFs(str(path)).get_directory()
data = v["config"]["unittags.blk"].get_data()["root"] # type: ignore[index,union-attr]
self._data = data
self._lowercase_map = {k.lower(): k for k in data}
def _resolve_key(self, internal_name: str) -> str | None:
"""Return the actual key in the data dict, or None if not found."""
self._ensure_loaded()
assert self._data is not None and self._lowercase_map is not None
if internal_name in self._data:
return internal_name
return self._lowercase_map.get(internal_name.lower())
def _get_tags(self, internal_name: str) -> list[str] | None:
"""Return the tag list for a vehicle, or None if not found."""
key = self._resolve_key(internal_name)
if key is None:
return None
entry = self._data[key] # type: ignore[index]
tags = list(entry["tags"].keys())
if entry.get("type"):
tags.append(entry["type"])
if entry.get("type") == "helicopter" and "type_helicopter" not in tags:
tags.append("type_helicopter")
return tags
@property
def all_names(self) -> list[str]:
"""All vehicle internal names known to unittags.blk."""
self._ensure_loaded()
return list(self._data.keys()) # type: ignore[union-attr]
@property
def raw(self) -> dict:
"""Direct access to the parsed unittags dict."""
self._ensure_loaded()
return self._data # type: ignore[return-value]
@staticmethod
def _best_match(tags: list[str], mapping: dict[str, str]) -> str | None:
"""Return the most specific matching value from *mapping* for *tags*.
Specific ``type_*`` tags (e.g. ``type_spaa``, ``type_light_tank``) are
checked first so they take priority over generic tags like ``tank`` or
``helicopter``.
"""
fallback = None
for tag in tags:
if tag not in mapping:
continue
if tag.startswith("type_"):
return mapping[tag]
if fallback is None:
fallback = mapping[tag]
return fallback
def get_unit_type(self, internal_name: str) -> str | None:
"""Return full vehicle type like 'Tank', 'Fighter', 'SPAA', etc."""
tags = self._get_tags(internal_name)
if tags is None:
print(f"ERROR: Vehicle {internal_name} not found in unit tags")
return None
result = self._best_match(tags, _TAG_TO_TYPE)
if result is None:
print(f"ERROR DETERMINING VEHICLE TYPE FOR UNIT: {internal_name} WITH TAGS: {tags}")
return result
def get_unit_type_abbrev(self, internal_name: str | None) -> str:
"""Return abbreviated vehicle type like 'T', 'F', 'AA', etc."""
if not internal_name or internal_name == "DISCONNECTED":
return "?"
tags = self._get_tags(internal_name)
if tags is None:
print(f"ERROR: Vehicle {internal_name} not found in unit tags")
return "?"
result = self._best_match(tags, _TAG_TO_ABBREV)
if result is None:
print(f"ERROR DETERMINING VEHICLE TYPE FOR UNIT: {internal_name} WITH TAGS: {tags}")
return "?"
return result
# Module-level convenience functions (so callers don't need to touch UnitTags directly)
def get_unit_type(internal_name: str) -> str | None:
    """Look up the full vehicle type for an internal name via the shared UnitTags.

    Args:
        internal_name: War Thunder internal vehicle identifier.

    Returns:
        Type string such as ``'Tank'`` or ``'Fighter'``, or None if unrecognised.
    """
    unit_tags = UnitTags.get()
    return unit_tags.get_unit_type(internal_name)
def get_unit_type_abbrev(internal_name: str | None) -> str:
    """Look up the abbreviated vehicle type via the shared UnitTags.

    Args:
        internal_name: War Thunder internal vehicle identifier, or None.

    Returns:
        Short abbreviation such as ``'T'``, ``'F'`` or ``'AA'``, or ``'?'``
        if unknown/None.
    """
    unit_tags = UnitTags.get()
    return unit_tags.get_unit_type_abbrev(internal_name)
def count_unit_types(internal_name_list: list[str]) -> dict[str, int]:
    """Tally abbreviated vehicle types for a list of internal names.

    Entries equal to the literal ``"MEOW"`` are skipped and not counted
    (presumably a placeholder value — confirm against callers).

    Args:
        internal_name_list: Internal vehicle identifiers to classify.

    Returns:
        Mapping of abbreviation to occurrence count, e.g. ``{'T': 2, 'F': 1}``.
    """
    tally: dict[str, int] = {}
    for vehicle_id in internal_name_list:
        if vehicle_id == "MEOW":
            continue
        abbrev = get_unit_type_abbrev(vehicle_id)
        tally[abbrev] = tally.get(abbrev, 0) + 1
    return tally
# ---------------------------------------------------------------------------
# Lang CSV readers — translations from lang.vromfs.bin
# ---------------------------------------------------------------------------
class _LangCSVBase:
"""Shared loader for semicolon-delimited lang CSVs inside lang.vromfs.bin."""
header_info: list[str]
global_data: dict[str, list[str]]
lowercase_key_map: dict[str, str]
_lang_dir = None # lazily shared across subclasses
@classmethod
def _get_lang_dir(cls):
"""Return the parsed lang.vromfs.bin directory, loading it once on first call."""
if _LangCSVBase._lang_dir is None:
p = _BOT_DIR / "lang.vromfs.bin"
_LangCSVBase._lang_dir = VROMFs(str(p)).get_directory()
return _LangCSVBase._lang_dir
@classmethod
def _load_csv(cls, csv_path: str):
"""Load a CSV from the lang vromfs. Returns (header, data_dict, lowercase_map)."""
lang_dir = cls._get_lang_dir()
parts = csv_path.split("/")
node = lang_dir[parts[0]][parts[1]] # type: ignore[index]
reader = csv.reader(StringIO(node.get_data().decode("utf-8")), delimiter=";")
header = next(reader)[1:]
data = {}
lc_map = {}
for line in reader:
key = line[0]
data[key] = line[1:]
lc_map[key.lower()] = key
return header, data, lc_map
def __init__(self, language: str):
"""Initialise the reader with the given language column.
Args:
language: Language column name (e.g. ``"English"``).
"""
self.index = 0
self.update_language(language)
def update_language(self, lang: str) -> bool:
"""Switch translation output to *lang*.
Accepts either the literal column header (e.g. ``"<English>"``)
or the bare language name (e.g. ``"English"``) — the columns in
the WT lang CSVs are stored with literal angle brackets, so we
try both forms before giving up. If neither matches we keep the
previous index and log a warning so silent fallthrough to column 0
(the historical bug) doesn't recur.
Args:
lang: Language column name with or without angle brackets.
Returns:
True if the language was found and set, False otherwise.
"""
if lang in self.header_info:
self.index = self.header_info.index(lang)
return True
bracketed = f"<{lang}>"
if bracketed in self.header_info:
self.index = self.header_info.index(bracketed)
return True
logging.warning(
"%s: unknown language column '%s' (also tried '%s'); "
"keeping current column index %d (%s). Available columns: %s",
type(self).__name__, lang, bracketed, self.index,
self.header_info[self.index] if 0 <= self.index < len(self.header_info) else '?',
self.header_info,
)
return False
# Keep old misspelled name working
update_langauge = update_language
def _lookup(self, key: str) -> str | None:
"""Case-insensitive lookup, returns translated string or None."""
try:
if key in self.global_data:
val = self.global_data[key][self.index]
elif key.lower() in self.lowercase_key_map:
actual = self.lowercase_key_map[key.lower()]
val = self.global_data[actual][self.index]
else:
return None
return val.replace("\\t", "\t")
except (KeyError, IndexError):
return None
class LangTableReader(_LangCSVBase):
    """Resolve internal vehicle/unit identifiers to display names from lang/units.csv."""

    # Loaded once, at class-definition time, and shared by every instance.
    header_info, global_data, lowercase_key_map = _LangCSVBase._load_csv("lang/units.csv")

    def get_translate(self, value: str) -> str | None:
        """Return the display name for a vehicle internal name.

        The CSV row looked up is *value* with ``_shop`` appended.

        Args:
            value: Internal vehicle identifier (e.g. ``"ussr_t_34_1941_l_11"``).

        Returns:
            Translated display name, or None if not found.
        """
        shop_key = value + "_shop"
        return self._lookup(shop_key)
class WeaponTableReader(_LangCSVBase):
    """Resolve internal weapon/ammo identifiers to display names from lang/units_weaponry.csv."""

    # Loaded once, at class-definition time, and shared by every instance.
    header_info, global_data, lowercase_key_map = _LangCSVBase._load_csv("lang/units_weaponry.csv")

    def get_translate(self, value: str) -> str | None:
        """Return the display name for a weapon/ammo internal name.

        The identifier is tried as given first; if that misses and it does
        not already start with ``weapons/``, it is retried with that prefix.

        Args:
            value: Internal weapon identifier (e.g. ``"120mm_dm53"``).
                Also accepts bare names without the ``weapons/`` prefix.

        Returns:
            Translated display name, or None if not found.
        """
        direct = self._lookup(value)
        if direct is not None or value.startswith("weapons/"):
            return direct
        return self._lookup("weapons/" + value)
# ---------------------------------------------------------------------------
# Name cleanup utilities
# ---------------------------------------------------------------------------
VEHICLE_NAME_FILTERS = [
("Weizman\u2019s ", ""),
("Weizman's ", ""),
("Plagis\u2019 ", ""),
("Plagis' ", ""),
(" (TEL)", ""),
]
# Decoration glyphs (country tree-leak ▄ ▀, event/premium markers
# ◊ ◌ ◔, block elements ▂▃▅▆▇█,
# control pictures ␗ etc.) used to live in this list and were stripped
# *unconditionally* by the loop in apply_vehicle_name_filters() —
# running before the strip_decorations flag was checked, making
# strip_decorations=False silently a no-op. That's why
# vehicle_translations.json had glyphs stripped despite
# init_vehicle_translation_cache() passing False. _DECORATION_RE already
# covers the same set, so the strip_decorations branch now handles them
# correctly: kept when False (web i18n), stripped when True (Discord PNG).
# Strip every WT decoration glyph in one sweep instead of chasing individual
# codepoints as Gaijin adds new tree-leak indicators.
#
# Covered ranges:
# U+2400-U+27FF — Control Pictures, Box Drawing, Block Elements (▀ ▄ etc.),
# Geometric Shapes (◊ ◌ ◢ ◣ ◤ ◥), Dingbats, Misc Symbols.
# U+E000-U+F8FF — Private Use Area, where older WT variants stored sprite
# refs that survived a few client patches.
#
# Mirrors normalizeVehicleName() in server.js — keep both sides in sync.
_DECORATION_RE = re.compile(r"[␀-⟿-]")
_PRIVATE_USE_RE = _DECORATION_RE # backward-compat alias for any external imports
def apply_vehicle_name_filters(name: str, strip_decorations: bool = True) -> str:
"""Apply standard vehicle-name cleanup.
Args:
name: Raw vehicle display string from lang/units.csv.
strip_decorations: When True (the historical default), drop every
glyph in ``_DECORATION_RE`` — country tree-leak indicators (▄ ▀),
event/premium markers (◊), tree shape markers (◢ ◣ ◤ ◥), control
pictures, and the Private Use Area. The Discord scoreboard PNG
renderer uses this because its font can't draw those reliably.
When False, keep every visible glyph and only strip the Private
Use Area (true tofu cruft that no font renders). Used by the
website i18n cache so country indicators survive to the UI.
"""
if not name:
return name
for target, repl in VEHICLE_NAME_FILTERS:
name = name.replace(target, repl)
if strip_decorations:
name = _DECORATION_RE.sub("", name)
else:
# Just the PUA — keep all visible decorations.
name = re.sub(r"[-]", "", name)
return name.strip()
def normalize_name(name: str):
"""Normalize a vehicle display name to ASCII-safe form.
Replaces Cyrillic ``T`` with Latin ``T``, converts ``No.`` symbols,
strips remaining non-ASCII characters, and collapses whitespace.
Args:
name: Raw display name string.
Returns:
Cleaned ASCII string, or None if *name* is falsy.
"""
if not name:
return None
name = name.replace("Т", "T") # Cyrillic Т → Latin T
name = name.replace("№", "No.") # Number Symbol to No.
name = name.encode("ascii", "ignore").decode("ascii")
name = re.sub(r"[^A-Za-z0-9 .\-\(\)]", "", name)
return re.sub(r"\s+", " ", name).strip()
# ---------------------------------------------------------------------------
# Self-test
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Manual smoke test: classify a few sample vehicles, then translate
    # vehicle and weapon identifiers to their English display names.
    vehicles_list = [
        "ussr_t_34_1941_l_11",
        "spitfire_lf_mk9e_weisman",
        "spitfire_ix_usa",
        "germ_pzkpfw_V_ausf_d_panther",
        "tankModels/germ_pzkpfw_V_ausf_d_panther",
        "DISCONNECTED",
    ]
    weapon_list = ["120mm_dm13", "120mm_dm33", "120mm_dm53", "105mm_dm33", "weapons/cannonMGC30L"]

    translate = LangTableReader("English")
    print("Vehicles:", vehicles_list)
    print("Abbreviated types:", count_unit_types(vehicles_list))

    print("\nHuman-readable vehicle translations:")
    for vehicle in vehicles_list:
        # Placeholder entries have no translation row; report them as skipped.
        if not vehicle or vehicle == "DISCONNECTED":
            print(f" {vehicle} -> (skipped)")
            continue
        readable = translate.get_translate(vehicle)
        print(f" {vehicle} -> {readable}")

    print("\n--- Weapon translations ---")
    weapons = WeaponTableReader("English")
    for w in weapon_list:
        print(f" {w} -> {weapons.get_translate(w)}")