libx52/libx52util/x52_char_map_gen.py

#!/usr/bin/env python3
# Character map generator
#
# Copyright (C) 2012-2026 Nirenjan Krishnan (nirenjan@nirenjan.org)
#
# SPDX-License-Identifier: GPL-2.0-only WITH Classpath-exception-2.0
"""
Generator script to parse character mapping
for the X52/X52 Pro MFD
"""

import sys
import re
import json
import unicodedata

class LineFormatError(ValueError):
    """Raised when a mapping descriptor line cannot be parsed."""

class BMPTable:
    """
    Sparse table for Basic Multilingual Plane

    Reads a character map description, extends it to every code point in the
    BMP (U+0000 - U+FFFF) with the help of NFKC normalization, and writes

    * a C source file holding deduplicated 256-entry page tables, a root
      table indexed by the high byte of the code point, and a table of
      multi-value sequences, and
    * a binary map of fixed-length records used by the test suite.

    Table entries below SEQUENCE_BASE are map values; entries at or above it
    are sequence IDs (SEQUENCE_BASE + index into the sequence table).

    All of the work is done by the constructor.
    """

    # Map value used for every code point that has no usable mapping. The
    # comments below refer to it as the "box character".
    REPLACEMENT_CHAR = 0xDB

    # First sequence ID. Map values must stay below this so that a table
    # entry can be told apart from a sequence ID.
    SEQUENCE_BASE = 256

    HEADER = """/*
* Autogenerated tables for X52 MFD character lookup
*
* DO NOT EDIT
*/

#include <stdint.h>

"""

    TABLE_NAME_FORMAT = 'bmp_page_%02x'
    TABLE_NAME_DEFAULT = 'bmp_page_default'
    TABLE_FORMAT = 'const uint16_t %s[256] = {'
    TABLE_FOOTER = '};\n'

    def __init__(self, input_file, output_file, output_map):
        """
        Read the character map and immediately generate both output files.

        input_file is the path of the character map description, output_file
        the path of the C source file to write, and output_map the path of
        the binary map to write.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.output_map = output_map
        self.mapping = {}       # code point -> map value or sequence ID
        self.pages = {}         # 256-entry page tuple -> C table name
        self.sequences = {}     # tuple of map values -> sequence ID
        self.root_table = []    # C table name for each high byte, in order

        self.read_map()
        self.build_extended_map()
        self.build_tables()
        self.generate_test_tables()

    @staticmethod
    def parse_line(data):
        """
        Parse a line containing a mapping descriptor. The mapping descriptor
        must start with a unicode code point, followed by either a single
        character, or an integer that corresponds to the map value. Integers
        are parsed with base auto-detection, so the usual 0x prefix selects
        hexadecimal.

        Everything from a '#' to the end of the line is a comment, so a
        literal '#' target has to be written as a number. A target that is
        exactly one character long is always taken as that character, never
        as a number.

        Returns a (code_point, target) tuple of integers, or (None, None)
        when the line is blank or holds only a comment.

        Raises LineFormatError if the line does not consist of exactly two
        fields, if a field cannot be parsed, if the code point is not a valid
        unicode code point, or if the map value is not in the range 0-255.
        """
        # Strip off comments, then leading and trailing whitespace
        data = re.sub(r'#.*$', '', data).strip()

        # If nothing is left, it is a blank or comment line
        if not data:
            return None, None

        # Find the code point and the target value
        try:
            code_point, target = data.split()
        except ValueError as exc:
            # Raised when there are either too many, or not enough values in
            # the string
            raise LineFormatError(f'Invalid descriptor format "{data}"') from exc

        # Convert the string to its equivalent numeric value
        try:
            code_point = int(code_point, 0)
        except ValueError as exc:
            raise LineFormatError(f'Invalid code point "{code_point}"') from exc

        # Check if the target is a single character
        if len(target) == 1:
            target = ord(target)
        else:
            # Try parsing the target as an integer
            try:
                target = int(target, 0)
            except ValueError as exc:
                raise LineFormatError(f'Invalid map value "{target}"') from exc

        # Reject values that would fail later with an unrelated error (chr()
        # of an invalid code point) or silently corrupt the tables (a map
        # value that is negative or collides with the sequence ID range).
        if not 0 <= code_point <= 0x10FFFF:
            raise LineFormatError(f'Code point out of range "{data}"')

        if not 0 <= target < BMPTable.SEQUENCE_BASE:
            raise LineFormatError(f'Map value out of range "{data}"')

        return code_point, target

    def read_map(self):
        """
        Read the mapping tables from the config file

        Every descriptor is stored in self.mapping. In addition, when the
        NFKC form of the code point is a different single character that has
        no mapping yet, that character is given the same map value.

        Raises LineFormatError for a malformed line.
        """
        def map_normalized(char, dst):
            # Try to normalize the unicode character as NFKC
            source = chr(char)
            normalized = unicodedata.normalize('NFKC', source)
            if normalized == source:
                # This is already in normalized form
                return

            if len(normalized) != 1:
                # Multi-character normalizations are not handled here
                return

            normalized_char = ord(normalized)
            if normalized_char not in self.mapping:
                # This is only needed to ensure that we get the normalized
                # forms, for example, half-width Katakana characters are
                # normalized to their corresponding full width versions.
                # However, we don't want to overwrite existing mappings,
                # since something like Lowercase A with grave could be
                # normalized to lowercase A, which would break the translation
                self.mapping[normalized_char] = dst

        with open(self.input_file, 'r', encoding='utf-8') as infile:
            for line in infile:
                src, dst = self.parse_line(line)
                if src is None:
                    continue

                # An explicit descriptor always wins, even over a value that
                # an earlier line supplied through normalization
                self.mapping[src] = dst
                map_normalized(src, dst)

    def build_extended_map(self):
        """
        Build the extended map for every character in the BMP

        Code points that already have a mapping are left alone. Every other
        code point gets the mapping of its NFKC form: a single map value, a
        sequence ID when the NFKC form has several characters, or the box
        character when nothing useful can be displayed.
        """
        self.mapping[0] = 0 # Handle NUL
        for i in range(0x10000):
            # Iterate over the basic multilingual plane
            if i in self.mapping:
                continue

            if 0xD800 <= i <= 0xDFFF:
                # UTF16 surrogate pairs - we want to mark it as a box character
                self.mapping[i] = self.REPLACEMENT_CHAR
                continue

            normalized = unicodedata.normalize('NFKC', chr(i))
            if len(normalized) == 1:
                # Use the mapping of the normalized character, or the box
                # character if no single character mapping exists
                self.mapping[i] = self.mapping.get(ord(normalized),
                                                   self.REPLACEMENT_CHAR)
                continue

            # Translate every character of the normalized form, substituting
            # the box character for those that are not in the mapping table
            sequence = tuple(self.mapping.get(ord(c), self.REPLACEMENT_CHAR)
                             for c in normalized)

            # Check if it only contains the box character, or box char and space,
            # and reduce runs to a single instance
            if all(c in (self.REPLACEMENT_CHAR, self.mapping[0x20])
                   for c in sequence):
                self.mapping[i] = self.REPLACEMENT_CHAR
                continue

            if sequence not in self.sequences:
                # IDs are handed out densely starting at SEQUENCE_BASE, so
                # the next free ID follows directly from the current count
                self.sequences[sequence] = self.SEQUENCE_BASE + len(self.sequences)

            self.mapping[i] = self.sequences[sequence]

    def output_c_table(self, page_tuple, out_fd):
        """
        Output the C table structure

        page_tuple must already be registered in self.pages, which supplies
        the table name. Values are written eight per line, each line followed
        by a comment giving the index range it covers.
        """
        page_name = self.pages[page_tuple]

        print(self.TABLE_FORMAT % (page_name), file=out_fd)

        for i, val in enumerate(page_tuple):
            print(f"0x{val:02x}, ", end='', file=out_fd)
            if i % 8 == 7:
                print(f"// 0x{i-7:02x}-0x{i:02x}", file=out_fd)

        print(self.TABLE_FOOTER, file=out_fd)

    def build_tables(self):
        """
        Build the C Tables

        Writes the header, the default page, every distinct page, the root
        table and the sequence table to the output C file. Identical pages
        are written once and shared through the root table.

        Raises RuntimeError if a sequence is too long for its length byte.
        """
        with open(self.output_file, 'w', encoding='utf-8') as out_fd:
            print(self.HEADER, file=out_fd)

            # The page holding nothing but box characters is always written,
            # under a fixed name, before any other page
            default_page = (self.REPLACEMENT_CHAR,) * 256
            self.pages[default_page] = self.TABLE_NAME_DEFAULT
            self.output_c_table(default_page, out_fd)

            for root_idx in range(256):
                base_idx = root_idx * 256
                page_tuple = tuple(self.mapping[idx]
                                   for idx in range(base_idx, base_idx + 256))
                if page_tuple not in self.pages:
                    # First time this page content is seen: name it after
                    # the high byte and write it out
                    self.pages[page_tuple] = self.TABLE_NAME_FORMAT % (root_idx)
                    self.output_c_table(page_tuple, out_fd)

                self.root_table.append(self.pages[page_tuple])

            print(self.TABLE_FORMAT % ('* root_table'), file=out_fd)

            for page_id, page_name in enumerate(self.root_table):
                print(f"    {page_name}, // 0x{page_id:02x}", file=out_fd)

            print(self.TABLE_FOOTER, file=out_fd)

            print(f"const uint8_t *sequence_table[{len(self.sequences)}] = {{", file=out_fd)
            for sequence, seq_id in self.sequences.items():
                seq_len = len(sequence)
                if seq_len >= 256:
                    raise RuntimeError("Sequence way too long")

                # Each entry is a length byte followed by the map values
                line = [f"0x{seq_len:02X}"]
                line.extend(f"0x{seq_elem:02X}" for seq_elem in sequence)
                line = ', '.join(line)

                seq_index = seq_id - self.SEQUENCE_BASE
                print(f'    [{seq_index}] = (const uint8_t[]){{ {line} }},', file=out_fd)

            print(self.TABLE_FOOTER, file=out_fd)

    def generate_test_tables(self):
        """
        Build the test tables used by the test suite

        Writes one fixed-length binary record per BMP code point, in code
        point order. A record is a length byte followed by the expected map
        values and zero padding. The record length is the smallest power of
        two that holds the longest record.
        """
        # Sequences ordered by ID, so that (ID - SEQUENCE_BASE) indexes them
        sequences = sorted(self.sequences, key=self.sequences.get)

        # The mapping for the NUL byte (\x00) should be an empty sequence
        output = [[]]

        for i in range(1, 0x10000):
            seq = self.mapping[i]
            if seq >= self.SEQUENCE_BASE:
                # Pull from sequence table
                seq = sequences[seq - self.SEQUENCE_BASE]
            else:
                seq = [seq]
            output.append(seq)

        # Find the longest length sequence (add 1 for the length byte)
        longest = max(len(seq) for seq in output) + 1
        # Round up to the next power of two; an exact power of two is kept
        record_length = 1 << (longest - 1).bit_length()

        with open(self.output_map, 'wb') as output_map:
            pad = [0] * record_length
            for seq in output:
                record = [len(seq)] + list(seq) + pad
                output_map.write(bytes(record[:record_length]))

if __name__ == "__main__":
    # Expect exactly three arguments: the input character map, the C source
    # file to generate, and the binary map file to generate for the test
    # suite.
    if len(sys.argv) != 4:
        sys.stderr.write(f"Usage: {sys.argv[0]} <input-map> <output-c-file> <output-map>\n")
        sys.exit(1)

    # Constructing the table reads the input and writes both output files
    BMPTable(sys.argv[1], sys.argv[2], sys.argv[3])