Initial commit: SHARED library with LFS for binary assets

2026-07-02 02:00:46 +00:00
commit db5de3ac7d
9356 changed files with 47608 additions and 0 deletions
@@ -0,0 +1,447 @@
+"""
+data_parser.py
+
+Parses War Thunder VROMFS game data files:
+  - char.vromfs.bin → UnitTags: vehicle classification (fighter, bomber, tank, etc.)
+  - lang.vromfs.bin → LangTableReader: vehicle name translation (internal ID → display name)
+  - lang.vromfs.bin → WeaponTableReader: weapon/ammo name translation (internal ID → display name)
+"""
+
+# Standard Library Imports
+import csv
+import logging
+import re
+import sys
+from io import StringIO
+from pathlib import Path
+
+# Ensure SHARED dir is on sys.path so DAGOR_FILES (sibling package) imports work.
+# This has to happen before the DAGOR_FILES import below is executed.
+_shared_root = Path(__file__).resolve().parent
+if str(_shared_root) not in sys.path:
+    sys.path.insert(0, str(_shared_root))
+
+# Sibling-package import (resolved through the sys.path entry added above)
+from DAGOR_FILES.WtFileUtils.vromfs.VROMFs import VROMFs
+
+# Directory containing this module (same path as _shared_root). The
+# char.vromfs.bin and lang.vromfs.bin files are opened relative to it.
+_BOT_DIR = Path(__file__).resolve().parent
+
+
+# ---------------------------------------------------------------------------
+# Unit tags — lazy-loaded from char.vromfs.bin
+# ---------------------------------------------------------------------------
+
+_TAG_TO_TYPE = {
+    "type_spaa":             "SPAA",
+    "type_light_tank":       "Light Tank",
+    "type_heavy_tank":       "Tank",
+    "type_medium_tank":      "Tank",
+    "type_tank_destroyer":   "Tank",
+    "type_missile_tank":     "Tank",
+    "type_fighter":          "Fighter",
+    "type_strike_aircraft":  "Fighter",
+    "type_bomber":           "Bomber",
+    "type_helicopter":       "Helicopter",
+}
+
+_TAG_TO_ABBREV = {
+    "type_spaa":             "AA",
+    "type_light_tank":       "L",
+    "type_heavy_tank":       "T",
+    "type_medium_tank":      "T",
+    "type_tank_destroyer":   "T",
+    "type_missile_tank":     "T",
+    "tank":                  "T",
+    "type_fighter":          "F",
+    "type_strike_aircraft":  "F",
+    "type_bomber":           "B",
+    "helicopter":            "H",
+    "type_helicopter":       "H",
+}
+
+
+class UnitTags:
+    """Lazy-loaded lookup for vehicle classification from char.vromfs.bin."""
+
+    _instance: "UnitTags | None" = None
+
+    def __init__(self):
+        self._data: dict | None = None
+        self._lowercase_map: dict[str, str] | None = None
+
+    @classmethod
+    def get(cls) -> "UnitTags":
+        """Return the singleton UnitTags instance, creating it on first call.
+
+        Returns:
+            The shared UnitTags instance.
+        """
+        if cls._instance is None:
+            cls._instance = cls()
+        return cls._instance
+
+    def _ensure_loaded(self):
+        """Parse char.vromfs.bin and populate ``_data`` and ``_lowercase_map`` if not already loaded."""
+        if self._data is not None:
+            return
+        path = _BOT_DIR / "char.vromfs.bin"
+        v = VROMFs(str(path)).get_directory()
+        data = v["config"]["unittags.blk"].get_data()["root"]  # type: ignore[index,union-attr]
+        self._data = data
+        self._lowercase_map = {k.lower(): k for k in data}
+
+    def _resolve_key(self, internal_name: str) -> str | None:
+        """Return the actual key in the data dict, or None if not found."""
+        self._ensure_loaded()
+        assert self._data is not None and self._lowercase_map is not None
+        if internal_name in self._data:
+            return internal_name
+        return self._lowercase_map.get(internal_name.lower())
+
+    def _get_tags(self, internal_name: str) -> list[str] | None:
+        """Return the tag list for a vehicle, or None if not found."""
+        key = self._resolve_key(internal_name)
+        if key is None:
+            return None
+        entry = self._data[key]  # type: ignore[index]
+        tags = list(entry["tags"].keys())
+        if entry.get("type"):
+            tags.append(entry["type"])
+        if entry.get("type") == "helicopter" and "type_helicopter" not in tags:
+            tags.append("type_helicopter")
+        return tags
+
+    @property
+    def all_names(self) -> list[str]:
+        """All vehicle internal names known to unittags.blk."""
+        self._ensure_loaded()
+        return list(self._data.keys())  # type: ignore[union-attr]
+
+    @property
+    def raw(self) -> dict:
+        """Direct access to the parsed unittags dict."""
+        self._ensure_loaded()
+        return self._data  # type: ignore[return-value]
+
+    @staticmethod
+    def _best_match(tags: list[str], mapping: dict[str, str]) -> str | None:
+        """Return the most specific matching value from *mapping* for *tags*.
+
+        Specific ``type_*`` tags (e.g. ``type_spaa``, ``type_light_tank``) are
+        checked first so they take priority over generic tags like ``tank`` or
+        ``helicopter``.
+        """
+        fallback = None
+        for tag in tags:
+            if tag not in mapping:
+                continue
+            if tag.startswith("type_"):
+                return mapping[tag]
+            if fallback is None:
+                fallback = mapping[tag]
+        return fallback
+
+    def get_unit_type(self, internal_name: str) -> str | None:
+        """Return full vehicle type like 'Tank', 'Fighter', 'SPAA', etc."""
+        tags = self._get_tags(internal_name)
+        if tags is None:
+            print(f"ERROR: Vehicle {internal_name} not found in unit tags")
+            return None
+        result = self._best_match(tags, _TAG_TO_TYPE)
+        if result is None:
+            print(f"ERROR DETERMINING VEHICLE TYPE FOR UNIT: {internal_name} WITH TAGS: {tags}")
+        return result
+
+    def get_unit_type_abbrev(self, internal_name: str | None) -> str:
+        """Return abbreviated vehicle type like 'T', 'F', 'AA', etc."""
+        if not internal_name or internal_name == "DISCONNECTED":
+            return "?"
+        tags = self._get_tags(internal_name)
+        if tags is None:
+            print(f"ERROR: Vehicle {internal_name} not found in unit tags")
+            return "?"
+        result = self._best_match(tags, _TAG_TO_ABBREV)
+        if result is None:
+            print(f"ERROR DETERMINING VEHICLE TYPE FOR UNIT: {internal_name} WITH TAGS: {tags}")
+            return "?"
+        return result
+
+
+# Module-level convenience functions (so callers don't need to touch UnitTags directly)
+
+def get_unit_type(internal_name: str) -> str | None:
+    """Return full vehicle type (e.g. ``'Tank'``, ``'Fighter'``) for an internal name.
+
+    Args:
+        internal_name: War Thunder internal vehicle identifier.
+
+    Returns:
+        Human-readable type string, or None if unrecognised.
+    """
+    return UnitTags.get().get_unit_type(internal_name)
+
+def get_unit_type_abbrev(internal_name: str | None) -> str:
+    """Return abbreviated vehicle type (e.g. ``'T'``, ``'F'``, ``'AA'``).
+
+    Args:
+        internal_name: War Thunder internal vehicle identifier, or None.
+
+    Returns:
+        Single-letter abbreviation, or ``'?'`` if unknown/None.
+    """
+    return UnitTags.get().get_unit_type_abbrev(internal_name)
+
+def count_unit_types(internal_name_list: list[str]) -> dict[str, int]:
+    """Count vehicle types in a list, returning e.g. {'T': 2, 'F': 1}."""
+    counts: dict[str, int] = {}
+    for name in internal_name_list:
+        if name != "MEOW":
+            t = get_unit_type_abbrev(name)
+            counts[t] = counts.get(t, 0) + 1
+    return counts
+
+
+# ---------------------------------------------------------------------------
+# Lang CSV readers — translations from lang.vromfs.bin
+# ---------------------------------------------------------------------------
+
+class _LangCSVBase:
+    """Shared loader for semicolon-delimited lang CSVs inside lang.vromfs.bin."""
+
+    header_info: list[str]
+    global_data: dict[str, list[str]]
+    lowercase_key_map: dict[str, str]
+
+    _lang_dir = None  # lazily shared across subclasses
+
+    @classmethod
+    def _get_lang_dir(cls):
+        """Return the parsed lang.vromfs.bin directory, loading it once on first call."""
+        if _LangCSVBase._lang_dir is None:
+            p = _BOT_DIR / "lang.vromfs.bin"
+            _LangCSVBase._lang_dir = VROMFs(str(p)).get_directory()
+        return _LangCSVBase._lang_dir
+
+    @classmethod
+    def _load_csv(cls, csv_path: str):
+        """Load a CSV from the lang vromfs. Returns (header, data_dict, lowercase_map)."""
+        lang_dir = cls._get_lang_dir()
+        parts = csv_path.split("/")
+        node = lang_dir[parts[0]][parts[1]]  # type: ignore[index]
+        reader = csv.reader(StringIO(node.get_data().decode("utf-8")), delimiter=";")
+        header = next(reader)[1:]
+        data = {}
+        lc_map = {}
+        for line in reader:
+            key = line[0]
+            data[key] = line[1:]
+            lc_map[key.lower()] = key
+        return header, data, lc_map
+
+    def __init__(self, language: str):
+        """Initialise the reader with the given language column.
+
+        Args:
+            language: Language column name (e.g. ``"English"``).
+        """
+        self.index = 0
+        self.update_language(language)
+
+    def update_language(self, lang: str) -> bool:
+        """Switch translation output to *lang*.
+
+        Accepts either the literal column header (e.g. ``"<English>"``)
+        or the bare language name (e.g. ``"English"``) — the columns in
+        the WT lang CSVs are stored with literal angle brackets, so we
+        try both forms before giving up. If neither matches we keep the
+        previous index and log a warning so silent fallthrough to column 0
+        (the historical bug) doesn't recur.
+
+        Args:
+            lang: Language column name with or without angle brackets.
+
+        Returns:
+            True if the language was found and set, False otherwise.
+        """
+        if lang in self.header_info:
+            self.index = self.header_info.index(lang)
+            return True
+        bracketed = f"<{lang}>"
+        if bracketed in self.header_info:
+            self.index = self.header_info.index(bracketed)
+            return True
+        logging.warning(
+            "%s: unknown language column '%s' (also tried '%s'); "
+            "keeping current column index %d (%s). Available columns: %s",
+            type(self).__name__, lang, bracketed, self.index,
+            self.header_info[self.index] if 0 <= self.index < len(self.header_info) else '?',
+            self.header_info,
+        )
+        return False
+
+    # Keep old misspelled name working
+    update_langauge = update_language
+
+    def _lookup(self, key: str) -> str | None:
+        """Case-insensitive lookup, returns translated string or None."""
+        try:
+            if key in self.global_data:
+                val = self.global_data[key][self.index]
+            elif key.lower() in self.lowercase_key_map:
+                actual = self.lowercase_key_map[key.lower()]
+                val = self.global_data[actual][self.index]
+            else:
+                return None
+            return val.replace("\\t", "\t")
+        except (KeyError, IndexError):
+            return None
+
+
+class LangTableReader(_LangCSVBase):
+    """Translate internal vehicle/unit names to human-readable names."""
+    header_info, global_data, lowercase_key_map = _LangCSVBase._load_csv("lang/units.csv")
+
+    def get_translate(self, value: str) -> str | None:
+        """Translate a vehicle internal name to its display name.
+
+        Args:
+            value: Internal vehicle identifier (e.g. ``"ussr_t_34_1941_l_11"``).
+
+        Returns:
+            Translated display name, or None if not found.
+        """
+        return self._lookup(value + "_shop")
+
+
+class WeaponTableReader(_LangCSVBase):
+    """Translate internal weapon/ammo names to human-readable names."""
+    header_info, global_data, lowercase_key_map = _LangCSVBase._load_csv("lang/units_weaponry.csv")
+
+    def get_translate(self, value: str) -> str | None:
+        """Translate a weapon/ammo internal name to its display name.
+
+        Args:
+            value: Internal weapon identifier (e.g. ``"120mm_dm53"``).
+                   Also accepts bare names without the ``weapons/`` prefix.
+
+        Returns:
+            Translated display name, or None if not found.
+        """
+        result = self._lookup(value)
+        if result is None and not value.startswith("weapons/"):
+            result = self._lookup("weapons/" + value)
+        return result
+
+
+# ---------------------------------------------------------------------------
+# Name cleanup utilities
+# ---------------------------------------------------------------------------
+
+VEHICLE_NAME_FILTERS = [
+    ("Weizman\u2019s ", ""),
+    ("Weizman's ", ""),
+    ("Plagis\u2019 ", ""),
+    ("Plagis' ", ""),
+    (" (TEL)", ""),
+]
+# Decoration glyphs (country tree-leak ▄ ▀, event/premium markers
+# ◊ ◌ ◔, block elements ▂▃▅▆▇█,
+# control pictures ␗ etc.) used to live in this list and were stripped
+# *unconditionally* by the loop in apply_vehicle_name_filters() —
+# running before the strip_decorations flag was checked, making
+# strip_decorations=False silently a no-op. That's why
+# vehicle_translations.json had glyphs stripped despite
+# init_vehicle_translation_cache() passing False. _DECORATION_RE already
+# covers the same set, so the strip_decorations branch now handles them
+# correctly: kept when False (web i18n), stripped when True (Discord PNG).
+
+# Strip every WT decoration glyph in one sweep instead of chasing individual
+# codepoints as Gaijin adds new tree-leak indicators.
+#
+# Covered ranges:
+#   U+2400-U+27FF — Control Pictures, Box Drawing, Block Elements (▀ ▄ etc.),
+#                   Geometric Shapes (◊ ◌ ◢ ◣ ◤ ◥), Dingbats, Misc Symbols.
+#   U+E000-U+F8FF — Private Use Area, where older WT variants stored sprite
+#                   refs that survived a few client patches.
+#
+# Mirrors normalizeVehicleName() in server.js — keep both sides in sync.
+_DECORATION_RE = re.compile(r"[␀-⟿-]")
+_PRIVATE_USE_RE = _DECORATION_RE  # backward-compat alias for any external imports
+
+
+def apply_vehicle_name_filters(name: str, strip_decorations: bool = True) -> str:
+    """Apply standard vehicle-name cleanup.
+
+    Args:
+        name: Raw vehicle display string from lang/units.csv.
+        strip_decorations: When True (the historical default), drop every
+            glyph in ``_DECORATION_RE`` — country tree-leak indicators (▄ ▀),
+            event/premium markers (◊), tree shape markers (◢ ◣ ◤ ◥), control
+            pictures, and the Private Use Area. The Discord scoreboard PNG
+            renderer uses this because its font can't draw those reliably.
+            When False, keep every visible glyph and only strip the Private
+            Use Area (true tofu cruft that no font renders). Used by the
+            website i18n cache so country indicators survive to the UI.
+    """
+    if not name:
+        return name
+    for target, repl in VEHICLE_NAME_FILTERS:
+        name = name.replace(target, repl)
+    if strip_decorations:
+        name = _DECORATION_RE.sub("", name)
+    else:
+        # Just the PUA — keep all visible decorations.
+        name = re.sub(r"[-]", "", name)
+    return name.strip()
+
+
+def normalize_name(name: str):
+    """Normalize a vehicle display name to ASCII-safe form.
+
+    Replaces Cyrillic ``T`` with Latin ``T``, converts ``No.`` symbols,
+    strips remaining non-ASCII characters, and collapses whitespace.
+
+    Args:
+        name: Raw display name string.
+
+    Returns:
+        Cleaned ASCII string, or None if *name* is falsy.
+    """
+    if not name:
+        return None
+    name = name.replace("Т", "T")       # Cyrillic Т → Latin T
+    name = name.replace("№", "No.")     # Number Symbol to No.
+    name = name.encode("ascii", "ignore").decode("ascii")
+    name = re.sub(r"[^A-Za-z0-9 .\-\(\)]", "", name)
+    return re.sub(r"\s+", " ", name).strip()
+
+
+# ---------------------------------------------------------------------------
+# Self-test
+# ---------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    translate = LangTableReader("English")
+
+    vehicles_list = [
+        "ussr_t_34_1941_l_11",
+        "spitfire_lf_mk9e_weisman",
+        "spitfire_ix_usa",
+        "germ_pzkpfw_V_ausf_d_panther",
+        "tankModels/germ_pzkpfw_V_ausf_d_panther",
+        "DISCONNECTED",
+    ]
+    print("Vehicles:", vehicles_list)
+    print("Abbreviated types:", count_unit_types(vehicles_list))
+    print("\nHuman-readable vehicle translations:")
+    for vehicle in vehicles_list:
+        if vehicle and vehicle != "DISCONNECTED":
+            readable = translate.get_translate(vehicle)
+            print(f"  {vehicle} -> {readable}")
+        else:
+            print(f"  {vehicle} -> (skipped)")
+
+    print("\n--- Weapon translations ---")
+    weapons = WeaponTableReader("English")
+    weapon_list = ["120mm_dm13", "120mm_dm33", "120mm_dm53", "105mm_dm33", "weapons/cannonMGC30L"]
+    for w in weapon_list:
+        print(f"  {w} -> {weapons.get_translate(w)}")