SHARED/DAGOR_FILES/WtFileUtils/blk/BlkParser.py

import zstandard as zstd

from ..blk.FileInfo import FileType
from ..blk.Block import Block
from ..blk.Chunk import ChunkParser, Chunk
from ..blk.ParamParser import BLKTypes
from ..DataHandler import DataHandler


class BlkDecoder:
    """
    a blk parser
    inputs:
    dat: the data to be parsed
    offset: how far along into the data the blk starts
    name_map: an optional parameter for blks that have a name map, see FileInfo.py for more info
    zstd_dict: an optional parameter for blks that have a zstd dict, see FileInfo.py for more info
    """
    def __init__(self, dat, offset=0, name_map:list[bytearray] = None, zstd_dict = None):
        self.data = None
        self.blkType = FileType(dat[0+offset])  # gets blk type, the first byte
        if not self.blkType.is_zstd():
            self.data = DataHandler(dat, offset=offset+1, read_from_start=False)
        else:
            if self.blkType.needs_dict():
                if zstd_dict is None:
                    print("BAD DICT")
                # d = zstd.ZstdCompressionDict(zstd_dict)
                d= zstd_dict
                raw = zstd.ZstdDecompressor(d).decompress(dat[1:])
                self.data = DataHandler(raw, offset=offset, read_from_start=False)
            else:
                try:
                    raw = zstd.decompress(dat[1:])
                except zstd.ZstdError:
                    # only done because some zstd data in VROMFS can be in streams instead of standard format
                    x = zstd.ZstdDecompressor().stream_reader(dat[1:])
                    raw = x.read()
                    x.close()
                self.data = DataHandler(raw, offset=offset, read_from_start=False)
        self.names_in_name_map = self.decode_uleb128()  # gets the number of names in the name map
        self.names = None
        if self.blkType.is_slim():
            if name_map is None:
                print("BAD NAME MAP")
            self.names = []
            for name in name_map:
                try:
                    self.names.append(name.decode("utf-8"))
                except UnicodeDecodeError:
                    self.names.append("BADBADBAD"+name.decode("utf-8", errors="ignore"))
        else:
            self.name_map_size = self.decode_uleb128()  # gets the size of the name map

            self.names = [x.decode("utf-8") for x in self.data.fetch(self.name_map_size - 1).split(b"\x00")]
            # print(self.names)
            self.data.advance(1) # it only fetches size - 1 for speed to reduce slicing
            if len(self.names) != self.names_in_name_map:
                print("RED ALERT")

        self.num_of_blocks = self.decode_uleb128()
        self.num_of_params = self.decode_uleb128()
        self.params_data_size = self.decode_uleb128()
        self.params_data = self.data.fetch(self.params_data_size)  # used later on, data
        '''
        here we are are skipping results creation and starting with chunks
        assume we are doing let chunks
        '''
        chunks = []
        parser = ChunkParser(self.names, BLKTypes(self.names, self.params_data))
        for i in range(self.num_of_params):
            chunks.append(parser.parse(self.data.fetch(8)))

        # chunks = Chunks(self.data, self.num_of_params, self.names, B)
        blocks = []
        for i in range(self.num_of_blocks):  # this creates all the blocks
            name_id = self.decode_uleb128()
            param_count = self.decode_uleb128()
            block_count = self.decode_uleb128()
            if block_count > 0:
                first_block_id = self.decode_uleb128()
            else:
                first_block_id = -1

            # print(name_id, param_count, block_count, first_block_id, self.data._ptr, self.block_id_to_name(name_id))
            # print(self.block_id_to_name(name_id), name_id)
            blocks.append(Block(self.block_id_to_name(name_id), param_count, block_count, first_block_id))

        # if current_t > 0:
        #     print(f"After block creation and final file read: {time.perf_counter() - current_t}")

        result_ptr = 0
        for block in blocks:  # this grabs all the values and puts them in their correct blocks
            field_count = block.param_count
            for i in range(field_count):
                block.add_field(chunks[result_ptr + i])
            result_ptr += field_count

        # if current_t > 0:
        #     print(f"After block param matching: {time.perf_counter() - current_t}")

        self.parent = blocks[0]
        self.from_blocks_with_parent(self.parent, blocks)

        # if current_t > 0:
        #     print(f"After block hierarchy creation: {time.perf_counter() - current_t}")

    def to_dict(self):
        return self.parent.to_dict()

    def decode_uleb128(self):
        """Decodes a ULEB128 encoded value."""
        value = 0
        shift = 0
        while True:
            byte = self.data.fetch(1)[0]
            value |= (byte & 0x7f) << shift
            if not (byte & 0x80):
                break
            shift += 7
        return value

    def block_id_to_name(self, block_id):
        if block_id == 0:
            return "root"
        else:
            return self.names[block_id - 1]

    def from_blocks_with_parent(self, parent, blocks):
        for i in range(parent.blocks_count):
            parent.children.append(blocks[i + parent.first_block_id])
            self.from_blocks_with_parent(blocks[i + parent.first_block_id], blocks)


class BlkBytes:
    """
    A class that acts like BLkDecoder without all the parsing, simply used to get all the bytes from a BLK
    """
    def __init__(self, dat, offset=0, name_map:list[bytearray] = None, zstd_dict = None):
        self.data = None
        self.bytes = bytearray()
        #print(dat)
        # print(len(dat))
        # print(type(dat))
        self.blkType = FileType(dat[0+offset])  # gets blk type, the first byte
        self.bytes += bytearray([dat[0+offset]])
        if not self.blkType.is_zstd():
            self.data = DataHandler(dat, offset=offset+1, read_from_start=False)
        else:
            if self.blkType.needs_dict():
                if zstd_dict is None:
                    print("BAD DICT")
                # d = zstd.ZstdCompressionDict(zstd_dict)
                d= zstd_dict
                raw = zstd.ZstdDecompressor(d).decompress(dat[1:])
                self.data = DataHandler(raw, offset=offset, read_from_start=False)
            else:
                try:
                    raw = zstd.decompress(dat[1:])
                except zstd.ZstdError:
                    # only done because some zstd data in VROMFS can be in streams instead of standard format
                    x = zstd.ZstdDecompressor().stream_reader(dat[1:])
                    raw = x.read()
                    x.close()
                self.data = DataHandler(raw, offset=offset, read_from_start=False)
        self.names_in_name_map, temp = self.decode_uleb128()  # gets the number of names in the name map
        self.bytes += temp
        # self.names = None
        if self.blkType.is_slim():
            if name_map is None:
                print("BAD NAME MAP")
        else:
            self.name_map_size, temp = self.decode_uleb128()  # gets the size of the name map
            self.bytes += temp

            # self.names = [x.decode("utf-8") for x in self.data.fetch(self.name_map_size - 1).split(b"\x00")]
            self.bytes += self.data.fetch(self.name_map_size)
            # print(self.names)
            # self.data.advance(1)
            # if len(self.names) != self.names_in_name_map:
            #     print("RED ALERT")
        self.num_of_blocks, temp = self.decode_uleb128()
        self.bytes += temp
        self.num_of_params, temp = self.decode_uleb128()
        self.bytes += temp
        self.params_data_size, temp = self.decode_uleb128()
        self.bytes += temp
        # self.params_data = self.data.fetch(self.params_data_size)  # used later on, data
        self.bytes += self.data.fetch(self.params_data_size)
        '''
        here we are are skipping results creation and starting with chunks
        assume we are doing let chunks
        '''
        # chunks = []
        # parser = ChunkParser(self.names, BLKTypes(self.names, self.params_data))
        # for i in range(self.num_of_params):
        #     chunks.append(parser.parse(self.data.fetch(8)))
        self.bytes += self.data.fetch(self.num_of_params*8)
        # chunks = Chunks(self.data, self.num_of_params, self.names, B)
        # blocks = []
        for i in range(self.num_of_blocks):  # this creates all the blocks
            name_id, temp = self.decode_uleb128()
            self.bytes += temp
            param_count, temp = self.decode_uleb128()
            self.bytes += temp
            block_count, temp = self.decode_uleb128()
            self.bytes += temp
            if block_count > 0:
                first_block_id, temp = self.decode_uleb128()
                self.bytes += temp
        #     else:
        #         first_block_id = -1
            # print(self.block_id_to_name(name_id), name_id)
        #     blocks.append(Block(self.block_id_to_name(name_id), param_count, block_count, first_block_id))

        # if current_t > 0:
        #     print(f"After block creation and final file read: {time.perf_counter() - current_t}")
        '''
        result_ptr = 0
        for block in blocks:  # this grabs all the values and puts them in their correct blocks
            field_count = block.param_count
            for i in range(field_count):
                block.add_field(chunks[result_ptr + i])
            result_ptr += field_count

        # if current_t > 0:
        #     print(f"After block param matching: {time.perf_counter() - current_t}")

        self.parent = blocks[0]
        self.from_blocks_with_parent(self.parent, blocks)

        '''# if current_t > 0:
        #     print(f"After block hierarchy creation: {time.perf_counter() - current_t}")

    def decode_uleb128(self):
        """Decodes a ULEB128 encoded value."""
        value = 0
        shift = 0
        bytes_ = bytearray()
        while True:
            byte = self.data.fetch(1)
            bytes_ += byte
            byte = byte[0]
            value |= (byte & 0x7f) << shift
            if not (byte & 0x80):
                break
            shift += 7
        return value, bytes_