#!/usr/bin/env python3
# Character map generator
#
# Copyright (C) 2012-2026 Nirenjan Krishnan (nirenjan@nirenjan.org)
#
# SPDX-License-Identifier: GPL-2.0-only WITH Classpath-exception-2.0
"""
Generator script to parse character mapping for the X52/X52 Pro MFD
"""

import sys
import re
import json
import unicodedata


class LineFormatError(ValueError):
    """
    Error class for parser
    """


class BMPTable:
    """
    Sparse table for Basic Multilingual Plane

    Constructing an instance runs the whole pipeline: the mapping file is
    read, the map is extended to every code point in the BMP using NFKC
    normalization, the C lookup tables are written to ``output_file`` and the
    fixed-size binary test records are written to ``output_map``.
    """

    # Map value used for every code point that has no usable mapping
    REPLACEMENT_CHAR = 0xDB

    # Map values below this are single display characters; values at or above
    # it are identifiers of multi-character sequences.
    SEQUENCE_ID_BASE = 256

    # The generated tables use uint16_t/uint8_t, which come from <stdint.h>
    HEADER = """/*
 * Autogenerated tables for X52 MFD character lookup
 *
 * DO NOT EDIT
 */

#include <stdint.h>
"""

    TABLE_NAME_FORMAT = 'bmp_page_%02x'
    TABLE_NAME_DEFAULT = 'bmp_page_default'

    TABLE_FORMAT = 'const uint16_t %s[256] = {'
    TABLE_FOOTER = '};\n'

    # Everything from a '#' to the end of the line is a comment
    _COMMENT_RE = re.compile(r'#.*$')

    def __init__(self, input_file, output_file, output_map):
        """
        Read ``input_file`` and generate ``output_file`` and ``output_map``.

        input_file: path of the mapping descriptor file (read as UTF-8)
        output_file: path of the C source file to write
        output_map: path of the binary test table to write
        """
        self.input_file = input_file
        self.output_file = output_file
        self.output_map = output_map

        # code point -> map value (display character or sequence id)
        self.mapping = {}
        # 256-entry page tuple -> name of the C table holding that page
        self.pages = {}
        # tuple of map values -> sequence id (starting at SEQUENCE_ID_BASE)
        self.sequences = {}
        # C table name for each of the 256 pages of the BMP, in order
        self.root_table = []

        self.read_map()
        self.build_extended_map()
        self.build_tables()
        self.generate_test_tables()

    @staticmethod
    def parse_line(data):
        """
        Parse a line containing a mapping descriptor. The mapping descriptor
        must start with a hexadecimal unicode code point, followed by either a
        single character, or a hexadecimal integer that corresponds to the map
        value.

        Returns a ``(code_point, target)`` tuple of integers, or
        ``(None, None)`` when the line is blank or holds only a comment.

        Raises LineFormatError when the line does not consist of exactly two
        whitespace separated fields, or when a field cannot be converted to
        an integer.
        """
        # Strip off comments, then leading and trailing whitespace
        data = BMPTable._COMMENT_RE.sub('', data).strip()

        # If the line is empty, it is a comment line
        if not data:
            return None, None

        # Find the code point and the target value
        try:
            code_point, target = data.split()
        except ValueError as exc:
            # Raised when there are either too many, or not enough values in
            # the string
            raise LineFormatError(f'Invalid descriptor format "{data}"') from exc

        # Convert the string to its equivalent numeric value. Base 0 lets the
        # prefix (0x, 0o, 0b or none) select the radix.
        try:
            code_point = int(code_point, 0)
        except ValueError as exc:
            raise LineFormatError(f'Invalid code point "{code_point}"') from exc

        # Check if the target is a single character
        if len(target) == 1:
            target = ord(target)
        else:
            # Try parsing the target as an integer
            try:
                target = int(target, 0)
            except ValueError as exc:
                raise LineFormatError(f'Invalid map value "{target}"') from exc

        return code_point, target

    def _map_normalized(self, char, dst):
        """
        Map the NFKC normalized form of code point ``char`` to ``dst``.

        Nothing is done when the character is already in normalized form,
        when it normalizes to more than one character, or when the normalized
        character already has a mapping.
        """
        original = chr(char)
        normalized = unicodedata.normalize('NFKC', original)

        if normalized == original:
            # This is already in normalized form
            return

        if len(normalized) != 1:
            return

        # This is only needed to ensure that we get the normalized forms, for
        # example, half-width Katakana characters are normalized to their
        # corresponding full width versions. However, we don't want to
        # overwrite existing mappings, since something like Lowercase A with
        # grave could be normalized to lowercase A, which would break the
        # translation.
        self.mapping.setdefault(ord(normalized), dst)

    def read_map(self):
        """Read the mapping tables from the config file"""
        with open(self.input_file, 'r', encoding='utf-8') as infile:
            for line in infile:
                src, dst = self.parse_line(line)
                if src is None:
                    continue

                self.mapping[src] = dst
                self._map_normalized(src, dst)

    def build_extended_map(self):
        """Build the extended map for every character in the BMP"""
        self.mapping[0] = 0  # Handle NUL

        # Iterate over the basic multilingual plane
        for i in range(0x10000):
            if i in self.mapping:
                continue

            if 0xD800 <= i <= 0xDFFF:
                # UTF16 surrogate pairs - we want to mark it as a box character
                self.mapping[i] = self.REPLACEMENT_CHAR
                continue

            normalized = unicodedata.normalize('NFKC', chr(i))

            if len(normalized) == 1:
                # Reuse the mapping of the normalized character when there is
                # one; otherwise no single character mapping exists.
                self.mapping[i] = self.mapping.get(ord(normalized),
                                                   self.REPLACEMENT_CHAR)
                continue

            # Translate every character of the normalized form, substituting
            # the box character for anything not in the mapping table.
            sequence = tuple(self.mapping.get(ord(c), self.REPLACEMENT_CHAR)
                             for c in normalized)

            # A sequence holding only the box character, or box characters
            # and spaces, is reduced to a single box character.
            filler = (self.REPLACEMENT_CHAR, self.mapping[0x20])
            if all(c in filler for c in sequence):
                self.mapping[i] = self.REPLACEMENT_CHAR
                continue

            if sequence not in self.sequences:
                # Ids are handed out consecutively from SEQUENCE_ID_BASE, so
                # the next free id follows directly from the current count.
                self.sequences[sequence] = (self.SEQUENCE_ID_BASE
                                            + len(self.sequences))

            self.mapping[i] = self.sequences[sequence]

    def output_c_table(self, page_tuple, out_fd):
        """
        Output the C table structure

        page_tuple: tuple of map values; it must already be a key of
            ``self.pages``, which supplies the table name
        out_fd: open text file to write the table to
        """
        page_name = self.pages[page_tuple]
        print(self.TABLE_FORMAT % (page_name), file=out_fd)

        for i, val in enumerate(page_tuple):
            print(f"0x{val:02x}, ", end='', file=out_fd)
            if i % 8 == 7:
                # End each row of eight values with the index range it covers
                print(f"// 0x{i-7:02x}-0x{i:02x}", file=out_fd)

        print(self.TABLE_FOOTER, file=out_fd)

    def build_tables(self):
        """
        Build the C Tables

        Writes the header, one table per distinct 256-entry page, the root
        table naming the page used for each of the 256 pages of the BMP, and
        the sequence table.

        Raises RuntimeError if a sequence has 256 or more elements, since its
        length is emitted as a single byte.
        """
        with open(self.output_file, 'w', encoding='utf-8') as out_fd:
            print(self.HEADER, file=out_fd)

            # The all-replacement page is always emitted first so that every
            # unmapped page can share it.
            default_page = tuple([self.REPLACEMENT_CHAR] * 256)
            self.pages[default_page] = self.TABLE_NAME_DEFAULT
            self.output_c_table(default_page, out_fd)

            for root_idx in range(256):
                base_idx = root_idx * 256
                page_tuple = tuple(self.mapping[idx]
                                   for idx in range(base_idx, base_idx + 256))

                # Identical pages are emitted only once and shared by name
                if page_tuple not in self.pages:
                    self.pages[page_tuple] = self.TABLE_NAME_FORMAT % (root_idx)
                    self.output_c_table(page_tuple, out_fd)

                self.root_table.append(self.pages[page_tuple])

            print(self.TABLE_FORMAT % ('* root_table'), file=out_fd)
            for page_id, page_name in enumerate(self.root_table):
                print(f" {page_name}, // 0x{page_id:02x}", file=out_fd)
            print(self.TABLE_FOOTER, file=out_fd)

            print(f"const uint8_t *sequence_table[{len(self.sequences)}] = {{",
                  file=out_fd)
            for sequence, seq_id in self.sequences.items():
                seq_len = len(sequence)
                if seq_len >= 256:
                    raise RuntimeError("Sequence way too long")

                # Each entry is the length byte followed by the elements
                line = ', '.join(f"0x{elem:02X}"
                                 for elem in (seq_len, *sequence))
                print(f' [{seq_id-256}] = (const uint8_t[]){{ {line} }},',
                      file=out_fd)
            print(self.TABLE_FOOTER, file=out_fd)

    def generate_test_tables(self):
        """
        Build the test tables used by the test suite

        Writes one fixed-size binary record per BMP code point to
        ``self.output_map``. Each record holds a length byte followed by the
        expected output values, padded with zeros to the record length.
        """
        # self.mapping maps each code point to its map value, and
        # self.sequences maps each sequence to its id (ids start from 256).
        # Order the sequences by id so they can be indexed with (id - 256).
        sequences = [item[0] for item in
                     sorted(self.sequences.items(), key=lambda item: item[1])]

        # The mapping for the NUL byte (\x00) should be an empty sequence
        output = [[]]
        for i in range(1, 0x10000):
            seq = self.mapping[i]
            if seq >= 256:
                # Pull from sequence table
                seq = sequences[seq - 256]
            else:
                seq = [seq]

            output.append(seq)

        # Find the longest length sequence (add 1 for the length byte)
        longest = max(len(seq) for seq in output) + 1

        # Find the next power of two that can hold this sequence
        if (longest & (longest - 1)) == 0:
            record_length = longest
        else:
            record_length = 1 << longest.bit_length()

        with open(self.output_map, 'wb') as output_map:
            pad = [0] * record_length
            for seq in output:
                record = [len(seq)] + list(seq) + pad
                output_map.write(bytes(record[:record_length]))


if __name__ == "__main__":
    if len(sys.argv) != 4:
        sys.stderr.write(
            f"Usage: {sys.argv[0]} <input_file> <output_file> <output_map>\n")
        sys.exit(1)

    BMPTable(sys.argv[1], sys.argv[2], sys.argv[3])