Initial commit: SHARED library with LFS for binary assets
This commit is contained in:
+447
@@ -0,0 +1,447 @@
|
||||
"""
|
||||
data_parser.py
|
||||
|
||||
Parses War Thunder VROMFS game data files:
|
||||
- char.vromfs.bin → UnitTags: vehicle classification (fighter, bomber, tank, etc.)
|
||||
- lang.vromfs.bin → LangTableReader: vehicle name translation (internal ID → display name)
|
||||
- lang.vromfs.bin → WeaponTableReader: weapon/ammo name translation (internal ID → display name)
|
||||
"""
|
||||
|
||||
# Standard Library Imports
|
||||
import csv
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
|
||||
# Ensure SHARED dir is on sys.path so DAGOR_FILES (sibling package) imports work
|
||||
_shared_root = Path(__file__).resolve().parent
|
||||
if str(_shared_root) not in sys.path:
|
||||
sys.path.insert(0, str(_shared_root))
|
||||
|
||||
# Third-Party Library Imports
|
||||
from DAGOR_FILES.WtFileUtils.vromfs.VROMFs import VROMFs
|
||||
|
||||
# Directory containing this module. The loaders in this file resolve
# char.vromfs.bin and lang.vromfs.bin relative to it.
_BOT_DIR = Path(__file__).resolve().parent
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Unit tags — lazy-loaded from char.vromfs.bin
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Maps a unittags tag to the full vehicle-type label returned by
# UnitTags.get_unit_type(). Several tags share one label: heavy/medium tanks,
# tank destroyers and missile tanks all map to "Tank", and strike aircraft
# map to "Fighter".
_TAG_TO_TYPE = {
    "type_spaa": "SPAA",
    "type_light_tank": "Light Tank",
    "type_heavy_tank": "Tank",
    "type_medium_tank": "Tank",
    "type_tank_destroyer": "Tank",
    "type_missile_tank": "Tank",
    "type_fighter": "Fighter",
    "type_strike_aircraft": "Fighter",
    "type_bomber": "Bomber",
    "type_helicopter": "Helicopter",
}
|
||||
|
||||
# Maps a unittags tag to the short abbreviation returned by
# UnitTags.get_unit_type_abbrev(). Unlike _TAG_TO_TYPE this table also has the
# generic "tank" and "helicopter" keys; UnitTags._best_match() only uses such
# non-"type_" keys as a fallback when no "type_*" tag matches.
_TAG_TO_ABBREV = {
    "type_spaa": "AA",
    "type_light_tank": "L",
    "type_heavy_tank": "T",
    "type_medium_tank": "T",
    "type_tank_destroyer": "T",
    "type_missile_tank": "T",
    "tank": "T",
    "type_fighter": "F",
    "type_strike_aircraft": "F",
    "type_bomber": "B",
    "helicopter": "H",
    "type_helicopter": "H",
}
|
||||
|
||||
|
||||
class UnitTags:
|
||||
"""Lazy-loaded lookup for vehicle classification from char.vromfs.bin."""
|
||||
|
||||
_instance: "UnitTags | None" = None
|
||||
|
||||
def __init__(self):
|
||||
self._data: dict | None = None
|
||||
self._lowercase_map: dict[str, str] | None = None
|
||||
|
||||
@classmethod
|
||||
def get(cls) -> "UnitTags":
|
||||
"""Return the singleton UnitTags instance, creating it on first call.
|
||||
|
||||
Returns:
|
||||
The shared UnitTags instance.
|
||||
"""
|
||||
if cls._instance is None:
|
||||
cls._instance = cls()
|
||||
return cls._instance
|
||||
|
||||
def _ensure_loaded(self):
|
||||
"""Parse char.vromfs.bin and populate ``_data`` and ``_lowercase_map`` if not already loaded."""
|
||||
if self._data is not None:
|
||||
return
|
||||
path = _BOT_DIR / "char.vromfs.bin"
|
||||
v = VROMFs(str(path)).get_directory()
|
||||
data = v["config"]["unittags.blk"].get_data()["root"] # type: ignore[index,union-attr]
|
||||
self._data = data
|
||||
self._lowercase_map = {k.lower(): k for k in data}
|
||||
|
||||
def _resolve_key(self, internal_name: str) -> str | None:
|
||||
"""Return the actual key in the data dict, or None if not found."""
|
||||
self._ensure_loaded()
|
||||
assert self._data is not None and self._lowercase_map is not None
|
||||
if internal_name in self._data:
|
||||
return internal_name
|
||||
return self._lowercase_map.get(internal_name.lower())
|
||||
|
||||
def _get_tags(self, internal_name: str) -> list[str] | None:
|
||||
"""Return the tag list for a vehicle, or None if not found."""
|
||||
key = self._resolve_key(internal_name)
|
||||
if key is None:
|
||||
return None
|
||||
entry = self._data[key] # type: ignore[index]
|
||||
tags = list(entry["tags"].keys())
|
||||
if entry.get("type"):
|
||||
tags.append(entry["type"])
|
||||
if entry.get("type") == "helicopter" and "type_helicopter" not in tags:
|
||||
tags.append("type_helicopter")
|
||||
return tags
|
||||
|
||||
@property
|
||||
def all_names(self) -> list[str]:
|
||||
"""All vehicle internal names known to unittags.blk."""
|
||||
self._ensure_loaded()
|
||||
return list(self._data.keys()) # type: ignore[union-attr]
|
||||
|
||||
@property
|
||||
def raw(self) -> dict:
|
||||
"""Direct access to the parsed unittags dict."""
|
||||
self._ensure_loaded()
|
||||
return self._data # type: ignore[return-value]
|
||||
|
||||
@staticmethod
|
||||
def _best_match(tags: list[str], mapping: dict[str, str]) -> str | None:
|
||||
"""Return the most specific matching value from *mapping* for *tags*.
|
||||
|
||||
Specific ``type_*`` tags (e.g. ``type_spaa``, ``type_light_tank``) are
|
||||
checked first so they take priority over generic tags like ``tank`` or
|
||||
``helicopter``.
|
||||
"""
|
||||
fallback = None
|
||||
for tag in tags:
|
||||
if tag not in mapping:
|
||||
continue
|
||||
if tag.startswith("type_"):
|
||||
return mapping[tag]
|
||||
if fallback is None:
|
||||
fallback = mapping[tag]
|
||||
return fallback
|
||||
|
||||
def get_unit_type(self, internal_name: str) -> str | None:
|
||||
"""Return full vehicle type like 'Tank', 'Fighter', 'SPAA', etc."""
|
||||
tags = self._get_tags(internal_name)
|
||||
if tags is None:
|
||||
print(f"ERROR: Vehicle {internal_name} not found in unit tags")
|
||||
return None
|
||||
result = self._best_match(tags, _TAG_TO_TYPE)
|
||||
if result is None:
|
||||
print(f"ERROR DETERMINING VEHICLE TYPE FOR UNIT: {internal_name} WITH TAGS: {tags}")
|
||||
return result
|
||||
|
||||
def get_unit_type_abbrev(self, internal_name: str | None) -> str:
|
||||
"""Return abbreviated vehicle type like 'T', 'F', 'AA', etc."""
|
||||
if not internal_name or internal_name == "DISCONNECTED":
|
||||
return "?"
|
||||
tags = self._get_tags(internal_name)
|
||||
if tags is None:
|
||||
print(f"ERROR: Vehicle {internal_name} not found in unit tags")
|
||||
return "?"
|
||||
result = self._best_match(tags, _TAG_TO_ABBREV)
|
||||
if result is None:
|
||||
print(f"ERROR DETERMINING VEHICLE TYPE FOR UNIT: {internal_name} WITH TAGS: {tags}")
|
||||
return "?"
|
||||
return result
|
||||
|
||||
|
||||
# Module-level convenience functions (so callers don't need to touch UnitTags directly)
|
||||
|
||||
def get_unit_type(internal_name: str) -> str | None:
    """Look up the full vehicle type (e.g. ``'Tank'``, ``'Fighter'``).

    Thin wrapper that forwards to the shared :class:`UnitTags` instance.

    Args:
        internal_name: War Thunder internal vehicle identifier.

    Returns:
        Human-readable type string, or None if unrecognised.
    """
    unit_tags = UnitTags.get()
    return unit_tags.get_unit_type(internal_name)
|
||||
|
||||
def get_unit_type_abbrev(internal_name: str | None) -> str:
    """Look up the abbreviated vehicle type (e.g. ``'T'``, ``'F'``, ``'AA'``).

    Thin wrapper that forwards to the shared :class:`UnitTags` instance.

    Args:
        internal_name: War Thunder internal vehicle identifier, or None.

    Returns:
        Short abbreviation, or ``'?'`` if unknown/None.
    """
    unit_tags = UnitTags.get()
    return unit_tags.get_unit_type_abbrev(internal_name)
|
||||
|
||||
def count_unit_types(internal_name_list: list[str]) -> dict[str, int]:
    """Tally abbreviated vehicle types, e.g. ``{'T': 2, 'F': 1}``.

    Entries equal to the literal ``"MEOW"`` are skipped and not counted
    (presumably a placeholder name used by callers; confirm with them).

    Args:
        internal_name_list: Internal vehicle identifiers to classify.

    Returns:
        Mapping of type abbreviation to number of occurrences.
    """
    tally: dict[str, int] = {}
    for internal_name in internal_name_list:
        if internal_name == "MEOW":
            continue
        abbrev = get_unit_type_abbrev(internal_name)
        tally[abbrev] = tally.get(abbrev, 0) + 1
    return tally
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Lang CSV readers — translations from lang.vromfs.bin
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _LangCSVBase:
|
||||
"""Shared loader for semicolon-delimited lang CSVs inside lang.vromfs.bin."""
|
||||
|
||||
header_info: list[str]
|
||||
global_data: dict[str, list[str]]
|
||||
lowercase_key_map: dict[str, str]
|
||||
|
||||
_lang_dir = None # lazily shared across subclasses
|
||||
|
||||
@classmethod
|
||||
def _get_lang_dir(cls):
|
||||
"""Return the parsed lang.vromfs.bin directory, loading it once on first call."""
|
||||
if _LangCSVBase._lang_dir is None:
|
||||
p = _BOT_DIR / "lang.vromfs.bin"
|
||||
_LangCSVBase._lang_dir = VROMFs(str(p)).get_directory()
|
||||
return _LangCSVBase._lang_dir
|
||||
|
||||
@classmethod
|
||||
def _load_csv(cls, csv_path: str):
|
||||
"""Load a CSV from the lang vromfs. Returns (header, data_dict, lowercase_map)."""
|
||||
lang_dir = cls._get_lang_dir()
|
||||
parts = csv_path.split("/")
|
||||
node = lang_dir[parts[0]][parts[1]] # type: ignore[index]
|
||||
reader = csv.reader(StringIO(node.get_data().decode("utf-8")), delimiter=";")
|
||||
header = next(reader)[1:]
|
||||
data = {}
|
||||
lc_map = {}
|
||||
for line in reader:
|
||||
key = line[0]
|
||||
data[key] = line[1:]
|
||||
lc_map[key.lower()] = key
|
||||
return header, data, lc_map
|
||||
|
||||
def __init__(self, language: str):
|
||||
"""Initialise the reader with the given language column.
|
||||
|
||||
Args:
|
||||
language: Language column name (e.g. ``"English"``).
|
||||
"""
|
||||
self.index = 0
|
||||
self.update_language(language)
|
||||
|
||||
def update_language(self, lang: str) -> bool:
|
||||
"""Switch translation output to *lang*.
|
||||
|
||||
Accepts either the literal column header (e.g. ``"<English>"``)
|
||||
or the bare language name (e.g. ``"English"``) — the columns in
|
||||
the WT lang CSVs are stored with literal angle brackets, so we
|
||||
try both forms before giving up. If neither matches we keep the
|
||||
previous index and log a warning so silent fallthrough to column 0
|
||||
(the historical bug) doesn't recur.
|
||||
|
||||
Args:
|
||||
lang: Language column name with or without angle brackets.
|
||||
|
||||
Returns:
|
||||
True if the language was found and set, False otherwise.
|
||||
"""
|
||||
if lang in self.header_info:
|
||||
self.index = self.header_info.index(lang)
|
||||
return True
|
||||
bracketed = f"<{lang}>"
|
||||
if bracketed in self.header_info:
|
||||
self.index = self.header_info.index(bracketed)
|
||||
return True
|
||||
logging.warning(
|
||||
"%s: unknown language column '%s' (also tried '%s'); "
|
||||
"keeping current column index %d (%s). Available columns: %s",
|
||||
type(self).__name__, lang, bracketed, self.index,
|
||||
self.header_info[self.index] if 0 <= self.index < len(self.header_info) else '?',
|
||||
self.header_info,
|
||||
)
|
||||
return False
|
||||
|
||||
# Keep old misspelled name working
|
||||
update_langauge = update_language
|
||||
|
||||
def _lookup(self, key: str) -> str | None:
|
||||
"""Case-insensitive lookup, returns translated string or None."""
|
||||
try:
|
||||
if key in self.global_data:
|
||||
val = self.global_data[key][self.index]
|
||||
elif key.lower() in self.lowercase_key_map:
|
||||
actual = self.lowercase_key_map[key.lower()]
|
||||
val = self.global_data[actual][self.index]
|
||||
else:
|
||||
return None
|
||||
return val.replace("\\t", "\t")
|
||||
except (KeyError, IndexError):
|
||||
return None
|
||||
|
||||
|
||||
class LangTableReader(_LangCSVBase):
    """Resolve internal vehicle/unit names to their display names.

    The ``lang/units.csv`` table is loaded once, while the class body runs.
    """

    header_info, global_data, lowercase_key_map = _LangCSVBase._load_csv("lang/units.csv")

    def get_translate(self, value: str) -> str | None:
        """Return the display name for a vehicle internal name.

        The lookup key is *value* with ``"_shop"`` appended.

        Args:
            value: Internal vehicle identifier (e.g. ``"ussr_t_34_1941_l_11"``).

        Returns:
            Translated display name, or None if not found.
        """
        shop_key = value + "_shop"
        return self._lookup(shop_key)
|
||||
|
||||
|
||||
class WeaponTableReader(_LangCSVBase):
    """Resolve internal weapon/ammo names to their display names.

    The ``lang/units_weaponry.csv`` table is loaded once, while the class
    body runs.
    """

    header_info, global_data, lowercase_key_map = _LangCSVBase._load_csv("lang/units_weaponry.csv")

    def get_translate(self, value: str) -> str | None:
        """Return the display name for a weapon/ammo internal name.

        *value* is looked up as given first. If that fails and it does not
        already start with ``weapons/``, the lookup is retried with that
        prefix added.

        Args:
            value: Internal weapon identifier (e.g. ``"120mm_dm53"``).
                Also accepts bare names without the ``weapons/`` prefix.

        Returns:
            Translated display name, or None if not found.
        """
        direct = self._lookup(value)
        if direct is not None or value.startswith("weapons/"):
            return direct
        return self._lookup("weapons/" + value)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Name cleanup utilities
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Literal (target, replacement) substitutions applied to every vehicle name
# before any glyph stripping.
VEHICLE_NAME_FILTERS = [
    ("Weizman\u2019s ", ""),
    ("Weizman's ", ""),
    ("Plagis\u2019 ", ""),
    ("Plagis' ", ""),
    (" (TEL)", ""),
]
# Decoration glyphs (country tree-leak ▄ ▀, event/premium markers ◊ ◌ ◔,
# block elements ▂▃▅▆▇█, control pictures ␗ etc.) used to live in the list
# above and were stripped *unconditionally* by the loop in
# apply_vehicle_name_filters(), which ran before the strip_decorations flag
# was checked and so made strip_decorations=False silently a no-op. That is
# why vehicle_translations.json had glyphs stripped despite
# init_vehicle_translation_cache() passing False. _DECORATION_RE already
# covers the same set, so the strip_decorations branch now handles them:
# kept when False (web i18n), stripped when True (Discord PNG).

# Strip every WT decoration glyph in one sweep instead of chasing individual
# codepoints as Gaijin adds new tree-leak indicators.
#
# Covered ranges:
#   U+2400-U+27FF: Control Pictures, Box Drawing, Block Elements (▀ ▄ etc.),
#                  Geometric Shapes (◊ ◌ ◢ ◣ ◤ ◥), Dingbats, Misc Symbols.
#   U+E000-U+F8FF: Private Use Area, where older WT variants stored sprite
#                  refs that survived a few client patches.
#
# The ranges are spelled with \u escapes on purpose: literal Private Use Area
# characters are invisible in most editors and easily lost when the file is
# copied around, which would leave a bare "-" inside the character class and
# make the pattern strip ASCII hyphens ("T-34" -> "T34") instead.
#
# Mirrors normalizeVehicleName() in server.js, so keep both sides in sync.
_DECORATION_RE = re.compile(r"[\u2400-\u27FF\uE000-\uF8FF]")
_PRIVATE_USE_RE = _DECORATION_RE  # backward-compat alias for any external imports
# Private Use Area only: used when visible decorations must be preserved.
_PUA_ONLY_RE = re.compile(r"[\uE000-\uF8FF]")


def apply_vehicle_name_filters(name: str, strip_decorations: bool = True) -> str:
    """Apply standard vehicle-name cleanup.

    The literal substitutions in ``VEHICLE_NAME_FILTERS`` always run first;
    glyph stripping then depends on *strip_decorations*, and the result is
    whitespace-trimmed.

    Args:
        name: Raw vehicle display string from lang/units.csv.
        strip_decorations: When True (the historical default), drop every
            glyph in ``_DECORATION_RE``: country tree-leak indicators (▄ ▀),
            event/premium markers (◊), tree shape markers (◢ ◣ ◤ ◥), control
            pictures, and the Private Use Area. The Discord scoreboard PNG
            renderer uses this because its font can't draw those reliably.
            When False, keep every visible glyph and only strip the Private
            Use Area (true tofu cruft that no font renders). Used by the
            website i18n cache so country indicators survive to the UI.

    Returns:
        The cleaned name. A falsy *name* (``""`` or None) is returned as-is.
    """
    if not name:
        return name
    for target, repl in VEHICLE_NAME_FILTERS:
        name = name.replace(target, repl)
    pattern = _DECORATION_RE if strip_decorations else _PUA_ONLY_RE
    return pattern.sub("", name).strip()
|
||||
|
||||
|
||||
def normalize_name(name: str):
    """Reduce a vehicle display name to a restricted ASCII form.

    Steps, in order: Cyrillic ``Т`` becomes Latin ``T``; the ``№`` sign
    becomes ``No.``; every remaining non-ASCII character is dropped; anything
    other than letters, digits, space, ``.``, ``-``, ``(`` and ``)`` is
    removed; runs of whitespace collapse to one space and the ends are
    trimmed.

    Args:
        name: Raw display name string.

    Returns:
        Cleaned ASCII string, or None if *name* is falsy.
    """
    if not name:
        return None
    # Cyrillic Т -> Latin T, then the numero sign -> "No."
    latinised = name.replace("Т", "T").replace("№", "No.")
    ascii_only = latinised.encode("ascii", "ignore").decode("ascii")
    permitted = re.sub(r"[^A-Za-z0-9 .\-\(\)]", "", ascii_only)
    collapsed = re.sub(r"\s+", " ", permitted)
    return collapsed.strip()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Self-test
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: needs the real char/lang vromfs archives next to
    # this module and prints its results to stdout.
    translate = LangTableReader("English")

    vehicles_list = [
        "ussr_t_34_1941_l_11",
        "spitfire_lf_mk9e_weisman",
        "spitfire_ix_usa",
        "germ_pzkpfw_V_ausf_d_panther",
        "tankModels/germ_pzkpfw_V_ausf_d_panther",
        "DISCONNECTED",
    ]
    print("Vehicles:", vehicles_list)
    print("Abbreviated types:", count_unit_types(vehicles_list))
    print("\nHuman-readable vehicle translations:")
    for vehicle in vehicles_list:
        # Empty names and the DISCONNECTED marker have nothing to translate.
        if not vehicle or vehicle == "DISCONNECTED":
            print(f" {vehicle} -> (skipped)")
            continue
        readable = translate.get_translate(vehicle)
        print(f" {vehicle} -> {readable}")

    print("\n--- Weapon translations ---")
    weapons = WeaponTableReader("English")
    weapon_list = ["120mm_dm13", "120mm_dm33", "120mm_dm53", "105mm_dm33", "weapons/cannonMGC30L"]
    for w in weapon_list:
        print(f" {w} -> {weapons.get_translate(w)}")
|
||||
Reference in New Issue
Block a user