dockipelago/worlds/tloz_oos/common/patching/text/encoding.py

import re
from collections import defaultdict
from functools import lru_cache
from typing import List, Union, Optional
from . import char_table, kanji_table, text_offset_split_index_seasons, text_offset_1_table_address_seasons, text_offset_2_table_address_seasons, \
    text_table_eng_address_seasons, \
    text_addresses_limit_seasons, text_offset_split_index_ages, text_offset_1_table_address_ages, text_offset_2_table_address_ages, text_table_eng_address_ages, \
    text_addresses_limit_ages
from ..RomData import RomData
from ..Util import simple_hex
from ..z80asm.Assembler import GameboyAddress

# Matches one control sequence in message text: either a function-style command
# with a parenthesised argument (groups 1 and 2), or a bare keyword (group 3).
# NOTE(review): not referenced in the visible part of this file; next_character()
# recognises these sequences by hand instead.
control_sequence_pattern = re.compile(r"""
    \\
    (jump|cmd|col|charsfx|speed|pos|wait|sfx|call)
    \(([^)]+)\) |
    \\(link_name|child_name|w7SecretBuffer1|w7SecretBuffer2|
    num1|opt|stop|heartpiece|num2|slow)
""", re.VERBOSE)
# Matches dictionary entry names such as "DICT0_3f": a decimal table index
# (group 1) followed by a lowercase hexadecimal entry index (group 2).
dict_pattern = re.compile(r"DICT(\d+)_([0-9a-f]+)")


def add_to_tree(tree: dict[str, list[int]], char: str, keys: list[int]):
    """Register ``keys`` as the byte sequence that encodes ``char``.

    Any entry already stored for ``char`` is replaced. ``keys`` is stored as-is
    (no copy is made), so the caller must not mutate it afterwards.
    """
    tree.update({char: keys})


def build_encoding_dict() -> dict[str, list[int]]:
    """Build the map from text tokens to the bytes that encode them.

    Tokens are single characters (from ``char_table`` and ``kanji_table``),
    a few symbol shortcuts, and the names of the control sequences recognised
    by ``next_character``. For function-style commands the stored sequence is
    the *base* encoding: ``recursive_encode`` adds the command argument to the
    last byte.

    Returns:
        A new dictionary on every call; later registrations override earlier
        ones for the same token.
    """
    tree: dict[str, list[int]] = {}

    # Plain characters encode as their index in char_table; the two marker
    # strings below are placeholders in the table and get no encoding.
    for code, char in enumerate(char_table):
        if char not in ("🚫", "∅"):
            add_to_tree(tree, char, [code])

    # Kanji are two bytes: the 0x06 prefix followed by the kanji_table index.
    for code, char in enumerate(kanji_table):
        if char != "∅":
            add_to_tree(tree, char, [0x06, code])

    control_codes = (
        ("jump", [0x07, 0x00]),
        ("cmd", [0x08, 0x00]),

        # Colour shortcuts and the generic \col(xx) command share prefix 0x09.
        ("⬜", [0x09, 0x00]),
        ("🟥", [0x09, 0x01]),
        ("🟧", [0x09, 0x02]),
        ("🟦", [0x09, 0x03]),
        ("🟩", [0x09, 0x04]),
        ("col", [0x09, 0x00]),

        ("link_name", [0x0a, 0x00]),
        ("child_name", [0x0a, 0x01]),
        ("w7SecretBuffer1", [0x0a, 0x02]),
        ("w7SecretBuffer2", [0x0a, 0x03]),

        # \charsfx(xx) is accepted by next_character(), so it needs an entry
        # here; without one, encoding such text fails with a KeyError.
        # NOTE(review): 0x0b is the only prefix in 0x07-0x0f not used by
        # another command here, and is believed to be the charsfx control
        # code of the game's text format; confirm against the disassembly.
        ("charsfx", [0x0b, 0x00]),

        # Prefix 0x0c commands are spaced 8 apart in the second byte.
        ("speed", [0x0c, 0x00]),
        ("num1", [0x0c, 0x08]),
        ("opt", [0x0c, 0x10]),
        ("stop", [0x0c, 0x18]),
        ("pos", [0x0c, 0x20]),
        ("heartpiece", [0x0c, 0x28]),
        ("num2", [0x0c, 0x30]),
        ("slow", [0x0c, 0x38]),

        ("wait", [0x0d, 0x00]),
        ("sfx", [0x0e, 0x00]),
        ("call", [0x0f, 0x00]),

        # Button glyphs are encoded as two consecutive byte values.
        ("Ⓐ", [0xb8, 0xb9]),
        ("Ⓑ", [0xba, 0xbb]),
    )
    for token, code in control_codes:
        add_to_tree(tree, token, code)

    return tree


# --- Trie Data Structure ---
class TrieNode:
    """Node of a token trie whose children are created on first indexed access."""

    def __init__(self):
        # token -> child node. Indexing a missing token inserts a fresh node;
        # a membership test (`in`) does not.
        self.children = defaultdict(TrieNode)
        # Payload attached to this node; None until something assigns it.
        self.code = None


# --- Global Caches ---
# Trie built from the dictionary most recently passed to encode_text().
encode_current_trie: Optional[TrieNode] = None
# Encoding table most recently passed to encode_text().
encode_current_encoding: Optional[dict[str, list[int]]] = None
# (id(dictionary), id(encoding)) of the objects the two globals above were
# built from; encode_text() compares against it to decide whether to rebuild.
encode_last_ids = (None, None)

# Control sequences written as a bare keyword after a backslash, e.g. "\opt".
control_keywords = {
    "link_name", "child_name", "w7SecretBuffer1", "w7SecretBuffer2",
    "num1", "opt", "stop", "heartpiece", "num2", "slow"
}

# Control sequences written as "\name(hex)", i.e. with a hexadecimal argument.
control_functions = {
    "jump", "cmd", "col", "charsfx", "speed", "pos", "wait", "sfx", "call"
}


def next_character(text: str, index: int) -> tuple[Union[str, tuple[str, int]], int]:
    """Read the token that starts at ``text[index]``.

    Args:
        text: Message text, possibly containing backslash control sequences.
        index: Position of the first character of the token.

    Returns:
        ``(token, length)`` where ``length`` is the number of characters of
        ``text`` the token spans, and ``token`` is one of:

        * ``"\\0"`` when ``index`` is at or past the end of ``text``;
        * a single character, for anything that is not a backslash;
        * ``(name, argument)`` for a function-style command ``\\name(hex)``;
        * ``name`` for a bare keyword command such as ``\\opt``.

    Raises:
        ValueError: if a backslash does not start a known control sequence,
            if a function-style command has no closing parenthesis, or if its
            argument is not a hexadecimal number.
    """
    if index >= len(text):
        return "\0", 1

    if text[index] != "\\":
        return text[index], 1

    # Function-style command: \name(hex). The argument runs up to the closing
    # parenthesis, so it is not restricted to exactly two hex digits.
    for name in control_functions:
        if not text.startswith(f"\\{name}(", index):
            continue
        start = index + len(name) + 2  # first character after '\name('
        close = text.find(")", start)
        if close == -1:
            raise ValueError(
                f"Missing ')' after \\{name}( at index {index} in {text!r}"
            )
        raw = text[start:close]
        try:
            argument = int(raw, 16)
        except ValueError:
            raise ValueError(
                f"Invalid hexadecimal argument {raw!r} for \\{name} at index {index} in {text!r}"
            ) from None
        return (name, argument), close + 1 - index

    # Bare keyword command, e.g. \opt
    for name in control_keywords:
        if text.startswith(f"\\{name}", index):
            return name, 1 + len(name)

    raise ValueError(
        f"Unknown control sequence at index {index}: {text[index:index + 20]!r}"
    )


def build_trie(dictionary: dict[str, str]) -> TrieNode:
    """Index the dictionary entries by their token sequence.

    Each entry's text is split into tokens with ``next_character`` and
    inserted as a path from the root; the node at the end of the path
    receives the two-byte code that refers to the entry.

    Args:
        dictionary: Maps entry names to entry text. The code is derived from
            fixed positions in the name: the digit at index 4 and the two hex
            digits at indices 6-7 (i.e. names shaped like ``DICT0_3f``).

    Returns:
        The root node of the trie.
    """
    root = TrieNode()
    for entry_name, entry_text in dictionary.items():
        cursor = root
        position = 0
        text_length = len(entry_text)
        while position < text_length:
            token, consumed = next_character(entry_text, position)
            cursor = cursor.children[token]  # creates the child on first visit
            position += consumed
        # First byte: 2 plus the table digit; second byte: index within it.
        first_byte = 2 + int(entry_name[4])
        second_byte = int(entry_name[6:8], 16)
        cursor.code = [first_byte, second_byte]
    return root


@lru_cache
def recursive_encode(text: str, index: int) -> tuple[int, ...]:
    """Return the shortest found encoding of ``text[index:]``.

    At each position the token is either encoded literally or, when a
    dictionary entry starts there, replaced by that entry's code; whichever
    yields the shorter overall byte sequence wins (ties keep the literal
    encoding, then the shortest matching entry).

    The function reads the module globals ``encode_current_encoding`` and
    ``encode_current_trie`` and is memoised on ``(text, index)`` only, so the
    cache must be cleared whenever those globals change.

    Args:
        text: Message text to encode.
        index: Position in ``text`` to start encoding from.

    Returns:
        The encoded bytes, ending with a 0 terminator unless the text ends
        with a ``\\jump`` command.
    """
    if index >= len(text):
        return (0,)

    token, length = next_character(text, index)
    if isinstance(token, tuple):
        # Function-style command: copy the base bytes (the table entry must not
        # be mutated) and add the argument to the last byte.
        encoded = list(encode_current_encoding[token[0]])
        encoded[-1] += token[1]
        if token[0] == "jump":
            # Nothing after a jump is encoded, not even the terminator.
            return tuple(encoded)
    else:
        if token not in encode_current_encoding:
            token = "口"  # Use a white square to denote unknown characters
        encoded = encode_current_encoding[token]

    # Baseline: this token encoded literally, followed by the best encoding of the rest.
    best = list(encoded) + list(recursive_encode(text, index + length))

    if token not in encode_current_trie.children:
        # No dict entry
        return tuple(best)

    # Walk the trie along the text; every node carrying a code is a dictionary
    # entry that matches text[index:i] and could replace it.
    node = encode_current_trie.children[token]
    i = index + length

    while i < len(text):
        token2, tlen = next_character(text, i)
        if token2 not in node.children:
            break
        node = node.children[token2]
        i += tlen
        if node.code:
            candidate = node.code + list(recursive_encode(text, i))
            if len(candidate) < len(best):
                best = candidate

    return tuple(best)


# --- Main Function ---
def encode_text(text: str, encoding: dict[str, List[int]], dictionary: dict[str, str]) -> List[int]:
    """Encode ``text`` to bytes using ``encoding`` and ``dictionary``.

    The trie built from ``dictionary`` is kept in module globals and reused
    as long as the same ``dictionary`` and ``encoding`` objects are passed.

    Args:
        text: Message text to encode.
        encoding: Token-to-bytes table (see ``build_encoding_dict``).
        dictionary: Dictionary entries usable as compression substitutions.

    Returns:
        The encoded bytes as a new list.
    """
    global encode_current_trie, encode_current_encoding, encode_last_ids
    id_dict = id(dictionary)
    id_enc = id(encoding)

    # Rebuild trie/cache if dictionary/encoding changed.
    # NOTE: change detection is by object identity only; mutating a dictionary
    # in place, or a new object reusing a freed object's id, is not detected.
    if encode_last_ids != (id_dict, id_enc):
        encode_current_trie = build_trie(dictionary)
        encode_current_encoding = encoding
        encode_last_ids = (id_dict, id_enc)
        # Memoised results were computed with the previous trie/encoding and
        # are keyed on (text, index) only, so they must not be reused.
        recursive_encode.cache_clear()

    return list(recursive_encode(text, 0))


def encode_dict(text_data: dict[str, str], dictionary: Optional[dict[str, str]] = None) -> dict[str, list[int]]:
    """Encode every text in ``text_data``.

    Args:
        text_data: Maps text names to message text.
        dictionary: Dictionary entries usable as compression substitutions;
            ``None`` means no substitutions.

    Returns:
        A dictionary with the same keys (in the same order) mapping to the
        encoded bytes of each text.
    """
    if dictionary is None:
        dictionary = {}
    encoding_dict = build_encoding_dict()

    # The memo in recursive_encode is keyed on (text, index) only, so entries
    # from another dictionary/encoding must never leak into this batch...
    recursive_encode.cache_clear()
    try:
        return {
            key: encode_text(text, encoding_dict, dictionary)
            for key, text in text_data.items()
        }
    finally:
        # ...nor out of it, even when encoding one of the texts fails.
        recursive_encode.cache_clear()


def build_compact_table(data: dict[str, list[int]]) -> tuple[list[int], dict[str, int]]:
    """Pack all sequences into one table, sharing common tails.

    Sequences are placed longest first. A sequence that equals the tail of an
    already placed sequence reuses those bytes instead of being appended.

    Args:
        data: Maps names to byte sequences.

    Returns:
        ``(compact, offsets)`` where ``compact`` is the packed table and
        ``offsets[name]`` is the index in ``compact`` where that sequence starts.
    """
    compact: list[int] = []
    offsets: dict[str, int] = {}

    # Stable sort, so sequences of equal length keep their order from `data`.
    longest_first = sorted(data.items(), key=lambda item: len(item[1]), reverse=True)

    for name, seq in longest_first:
        size = len(seq)
        # End position of every sequence placed so far, in placement order.
        placed_ends = (offsets[placed] + len(data[placed]) for placed in offsets)
        start = next(
            (end - size for end in placed_ends if compact[end - size:end] == seq),
            None,
        )
        if start is None:
            # No placed sequence ends with `seq`: append it.
            start = len(compact)
            compact.extend(seq)
        offsets[name] = start
    assert len(compact) <= 0xffff

    return compact, offsets


def write_text_data(rom: RomData, dictionary: dict[str, str], texts: dict[str, str], seasons: bool):
    """Encode the dictionary and all texts and write them, with their offset tables, into the ROM.

    Args:
        rom: ROM to read the text base addresses from and to write into.
        dictionary: Dictionary entries, keyed "DICT<i>_<hh>". All entries for
            i in 0..3 and hh in 00..ff must be present, otherwise the table
            loop below fails with a KeyError.
        texts: Message texts, keyed "TX_<gg><ss>" (group and sub-id as produced
            by simple_hex). Within a group only sub-ids that are consecutive
            starting from 0 are written.
        seasons: Selects the Seasons constants when true, the Ages ones otherwise.

    Raises:
        AssertionError: if the encoded text does not fit below the address limit
            (checks are `assert` statements, so they are skipped under -O).
    """
    if seasons:
        text_offset_split_index = text_offset_split_index_seasons
        # Each base address is built from a byte followed by a word read from the ROM
        # (presumably bank and in-bank address; confirm against GameboyAddress).
        text_offset_1 = GameboyAddress(rom.read_byte(text_offset_1_table_address_seasons), rom.read_word(text_offset_1_table_address_seasons + 1))
        text_offset_2 = GameboyAddress(rom.read_byte(text_offset_2_table_address_seasons), rom.read_word(text_offset_2_table_address_seasons + 1))
        text_table_eng_address = text_table_eng_address_seasons
        text_addresses_limit = text_addresses_limit_seasons
    else:
        text_offset_split_index = text_offset_split_index_ages
        text_offset_1 = GameboyAddress(rom.read_byte(text_offset_1_table_address_ages), rom.read_word(text_offset_1_table_address_ages + 1))
        text_offset_2 = GameboyAddress(rom.read_byte(text_offset_2_table_address_ages), rom.read_word(text_offset_2_table_address_ages + 1))
        text_table_eng_address = text_table_eng_address_ages
        text_addresses_limit = text_addresses_limit_ages

    # Split the texts by group (characters 3-4 of the key, hexadecimal): groups below
    # the split go to the first data area, the others to the second one.
    # NOTE(review): the "- 4" presumably accounts for the 4 dictionary groups that
    # precede the text groups in the group table; confirm.
    dict1 = {}
    dict2 = {}
    for key in texts:
        if int(key[3:5], 16) < text_offset_split_index - 4:
            dict1[key] = texts[key]
        else:
            dict2[key] = texts[key]

    # The dictionary entries themselves are encoded without substitutions and stored
    # in the first data area alongside the first half of the texts.
    encoded_dict1 = encode_dict(dict1, dictionary)
    encoded_dict1.update(encode_dict(dictionary))
    encoded_dict2 = encode_dict(dict2, dictionary)

    # Layout starting at text_table_eng_address:
    #   0x64 group pointers (one word each; the loops below write 4 + 0x60 of them),
    #   then one word per encoded entry,
    #   then the first compact data table.
    offset_table_length = (len(encoded_dict1) + len(encoded_dict2)) * 2
    text_offset_1_address = text_offset_1.address_in_rom()
    text_offset_2_address = text_offset_2.address_in_rom()
    text_table_current_address = text_table_eng_address
    tx_table_current_address = text_table_eng_address + 0x64 * 2
    # Distance from the first base address to the end of the offset tables; entry
    # offsets in the first area are expressed relative to that base address.
    text_offset_1_offset = text_table_eng_address + 0x64 * 2 + offset_table_length - text_offset_1_address
    assert text_offset_1_offset >= 0

    compact_table1, compact_offsets1 = build_compact_table(encoded_dict1)
    rom.write_bytes(text_offset_1_address + text_offset_1_offset, compact_table1)

    # Dictionary groups: a group pointer (relative to the table start) followed by
    # 0x100 entry offsets per group.
    for i in range(4):
        rom.write_word(text_table_current_address, tx_table_current_address - text_table_eng_address)
        text_table_current_address += 2
        for j in range(0, 0x100):
            entry_name = f"DICT{i}_{simple_hex(j)}"
            rom.write_word(tx_table_current_address, compact_offsets1[entry_name] + text_offset_1_offset)
            tx_table_current_address += 2

    # Text groups stored in the first data area.
    for i in range(text_offset_split_index - 4):
        start_address = tx_table_current_address
        subid = 0
        # Write entry offsets for consecutive sub-ids until the first missing one.
        while True:
            tx = f"TX_{simple_hex(i)}{simple_hex(subid)}"
            if tx not in dict1:
                break
            subid += 1

            rom.write_word(tx_table_current_address, compact_offsets1[tx] + text_offset_1_offset)
            tx_table_current_address += 2
        # Empty groups get a zero group pointer.
        if subid > 0:
            rom.write_word(text_table_current_address, start_address - text_table_eng_address)
        else:
            rom.write_word(text_table_current_address, 0)
        text_table_current_address += 2

    # Disabled debug listing (the condition is always false): entries sorted by encoded length.
    if __debug__ and False:
        sorted_dict = sorted(list(encoded_dict1.items()) + list(encoded_dict2.items()), key=lambda kv: -len(kv[1]))
        for entry in sorted_dict:
            if entry[0] in dict1:
                print(entry[0], dict1[entry[0]], len(entry[1]))
            elif entry[0] in dict2:
                print(entry[0], dict2[entry[0]], len(entry[1]))
    # The second data table starts at the second base address, or right after the
    # first data table if that one extends past it.
    text_offset_2_offset = max(0, text_offset_1_address + text_offset_1_offset + len(compact_table1) - text_offset_2_address)
    compact_table2, compact_offsets2 = build_compact_table(encoded_dict2)
    assert text_offset_2_address + text_offset_2_offset + len(compact_table2) < text_addresses_limit, \
        f"Text is too long ({text_offset_2_address + text_offset_2_offset + len(compact_table2) - text_addresses_limit} too many bytes)"
    print(f"Free text bytes: {text_addresses_limit - text_offset_2_address - text_offset_2_offset - len(compact_table2)}")
    rom.write_bytes(text_offset_2_address + text_offset_2_offset, compact_table2)

    # Text groups stored in the second data area; same scheme as above, with
    # offsets relative to the second base address.
    for i in range(text_offset_split_index - 4, 0x60):
        start_address = tx_table_current_address
        subid = 0
        while True:
            tx = f"TX_{simple_hex(i)}{simple_hex(subid)}"
            if tx not in dict2:
                break
            subid += 1

            rom.write_word(tx_table_current_address, compact_offsets2[tx] + text_offset_2_offset)
            tx_table_current_address += 2
        if subid > 0:
            rom.write_word(text_table_current_address, start_address - text_table_eng_address)
        else:
            rom.write_word(text_table_current_address, 0)
        text_table_current_address += 2