libx52/libx52util/x52_char_map_gen.py

278 lines
9.6 KiB
Python
Executable File

#!/usr/bin/env python3
# Character map generator
#
# Copyright (C) 2012-2026 Nirenjan Krishnan (nirenjan@nirenjan.org)
#
# SPDX-License-Identifier: GPL-2.0-only WITH Classpath-exception-2.0
"""
Generator script to parse character mapping
for the X52/X52 Pro MFD
"""
import sys
import re
import json
import unicodedata
class LineFormatError(ValueError):
    """
    Raised by the parser when a mapping descriptor line is malformed
    """


class BMPTable:
    """
    Sparse table for the Basic Multilingual Plane

    Constructing an instance runs the complete pipeline: the input map is
    read, extended to every BMP code point through NFKC normalization, and
    written out as C lookup tables plus a binary map of expected outputs.
    """

    # Value stored for code points without a usable mapping (box character)
    REPLACEMENT_CHAR = 0xDB

    # Table values at or above this are not display bytes; they identify an
    # entry of the sequence table (index = value - SEQUENCE_BASE).
    SEQUENCE_BASE = 256

    HEADER = """/*
* Autogenerated tables for X52 MFD character lookup
*
* DO NOT EDIT
*/
#include <stdint.h>
"""
    TABLE_NAME_FORMAT = 'bmp_page_%02x'
    TABLE_NAME_DEFAULT = 'bmp_page_default'
    TABLE_FORMAT = 'const uint16_t %s[256] = {'
    TABLE_FOOTER = '};\n'

    def __init__(self, input_file, output_file, output_map):
        """
        Read `input_file` and immediately generate both outputs.

        input_file  -- path of the text file with the mapping descriptors
        output_file -- path of the C source file to write
        output_map  -- path of the binary expected-output map to write
        """
        self.input_file = input_file
        self.output_file = output_file
        self.output_map = output_map

        # code point -> display byte, or sequence id (>= SEQUENCE_BASE)
        self.mapping = {}
        # page contents (tuple of 256 values) -> name of the emitted C table
        self.pages = {}
        # expanded sequence (tuple of display bytes) -> sequence id
        self.sequences = {}
        # C table name for each of the 256 pages, in page order
        self.root_table = []

        self.read_map()
        self.build_extended_map()
        self.build_tables()
        self.generate_test_tables()

    @staticmethod
    def parse_line(data):
        """
        Parse a line containing a mapping descriptor. The mapping descriptor
        must start with a unicode code point written as an integer literal,
        followed by either a single character, or an integer literal that
        corresponds to the map value. Anything after a '#' is a comment.

        Returns a (code_point, map_value) tuple of integers, or (None, None)
        when the line holds nothing but whitespace and/or a comment.

        Raises LineFormatError when the line does not consist of exactly two
        fields, when a field cannot be parsed, when the code point is not a
        valid Unicode code point, or when the map value does not fit in a
        single byte.
        """
        # Strip off comments, then leading and trailing whitespace
        data = re.sub(r'#.*$', '', data).strip()

        # If the line is empty, it is a comment line
        if not data:
            return None, None

        # Find the code point and the target value
        try:
            code_point_text, target = data.split()
        except ValueError as exc:
            # Raised when there are either too many, or not enough values in
            # the string
            raise LineFormatError(f'Invalid descriptor format "{data}"') from exc

        # Convert the string to its equivalent numeric value
        try:
            code_point = int(code_point_text, 0)
        except ValueError as exc:
            raise LineFormatError(f'Invalid code point "{code_point_text}"') from exc

        # Anything outside this range cannot be passed to chr() later on
        if not 0 <= code_point <= 0x10FFFF:
            raise LineFormatError(f'Code point out of range "{code_point_text}"')

        if len(target) == 1:
            # A single character stands for its own code point
            value = ord(target)
        else:
            # Try parsing the target as an integer
            try:
                value = int(target, 0)
            except ValueError as exc:
                raise LineFormatError(f'Invalid map value "{target}"') from exc

        # Values from SEQUENCE_BASE upwards are reserved for sequence ids, and
        # the binary map stores every value in one byte.
        if not 0 <= value <= 0xFF:
            raise LineFormatError(f'Map value out of range "{target}"')

        return code_point, value

    def _map_normalized(self, char, dst):
        """Also map the single-character NFKC form of `char` to `dst`."""
        source = chr(char)
        normalized = unicodedata.normalize('NFKC', source)
        if normalized == source or len(normalized) != 1:
            # Already in normalized form, or it expands to several characters
            return

        # This is only needed to ensure that we get the normalized forms, for
        # example, half-width Katakana characters are normalized to their
        # corresponding full width versions. However, we don't want to
        # overwrite existing mappings, since something like Lowercase A with
        # grave could be normalized to lowercase A, which would break the
        # translation.
        self.mapping.setdefault(ord(normalized), dst)

    def read_map(self):
        """Read the mapping tables from the config file"""
        with open(self.input_file, 'r', encoding='utf-8') as infile:
            for line in infile:
                src, dst = self.parse_line(line)
                if src is None:
                    continue

                # An explicit descriptor always wins over an earlier entry
                self.mapping[src] = dst
                self._map_normalized(src, dst)

    def build_extended_map(self):
        """Build the extended map for every character in the BMP"""
        replacement = self.REPLACEMENT_CHAR
        self.mapping[0] = 0  # Handle NUL

        # Iterate over the basic multilingual plane
        for i in range(0x10000):
            if i in self.mapping:
                continue

            if 0xD800 <= i <= 0xDFFF:
                # UTF16 surrogate pairs - we want to mark it as a box character
                self.mapping[i] = replacement
                continue

            normalized = unicodedata.normalize('NFKC', chr(i))
            if len(normalized) == 1:
                # Reuse the mapping of the normalized form, if there is one
                self.mapping[i] = self.mapping.get(ord(normalized), replacement)
                continue

            # Translate every character of the normalized form; characters
            # without a mapping so far become the box character
            sequence = tuple(self.mapping.get(ord(c), replacement)
                             for c in normalized)

            # A sequence of nothing but box characters and spaces carries no
            # information, so collapse it to a single box character
            blank = (replacement, self.mapping[0x20])
            if all(c in blank for c in sequence):
                self.mapping[i] = replacement
                continue

            if sequence not in self.sequences:
                # Ids are handed out densely in order of first appearance
                self.sequences[sequence] = self.SEQUENCE_BASE + len(self.sequences)

            self.mapping[i] = self.sequences[sequence]

    def output_c_table(self, page_tuple, out_fd):
        """
        Output the C table structure for one page

        `page_tuple` must already be registered in self.pages, which
        supplies the name of the emitted table.
        """
        page_name = self.pages[page_tuple]
        print(self.TABLE_FORMAT % (page_name), file=out_fd)

        for i, val in enumerate(page_tuple):
            print(f"0x{val:02x}, ", end='', file=out_fd)
            if i % 8 == 7:
                # Finish each row of 8 values with the index range it covers
                print(f"// 0x{i-7:02x}-0x{i:02x}", file=out_fd)

        print(self.TABLE_FOOTER, file=out_fd)

    def build_tables(self):
        """
        Build the C Tables

        Writes the default page, every distinct page, the root table that
        names the page used for each high byte, and the sequence table.
        Raises RuntimeError if a sequence has 256 or more elements.
        """
        with open(self.output_file, 'w', encoding='utf-8') as out_fd:
            print(self.HEADER, file=out_fd)

            default_page = tuple([self.REPLACEMENT_CHAR] * 256)
            self.pages[default_page] = self.TABLE_NAME_DEFAULT
            self.output_c_table(default_page, out_fd)

            for root_idx in range(256):
                base_idx = root_idx * 256
                page_tuple = tuple(self.mapping[idx]
                                   for idx in range(base_idx, base_idx + 256))

                # Identical pages are emitted once and shared by name
                if page_tuple not in self.pages:
                    self.pages[page_tuple] = self.TABLE_NAME_FORMAT % (root_idx)
                    self.output_c_table(page_tuple, out_fd)

                self.root_table.append(self.pages[page_tuple])

            print(self.TABLE_FORMAT % ('* root_table'), file=out_fd)
            for page_id, page_name in enumerate(self.root_table):
                print(f" {page_name}, // 0x{page_id:02x}", file=out_fd)
            print(self.TABLE_FOOTER, file=out_fd)

            print(f"const uint8_t *sequence_table[{len(self.sequences)}] = {{", file=out_fd)
            for sequence, seq_id in self.sequences.items():
                seq_len = len(sequence)
                if seq_len >= 256:
                    raise RuntimeError("Sequence way too long")

                # Each entry is the length followed by the sequence bytes
                line = ', '.join(f"0x{elem:02X}" for elem in (seq_len, *sequence))
                print(f' [{seq_id - self.SEQUENCE_BASE}] = (const uint8_t[]){{ {line} }},',
                      file=out_fd)
            print(self.TABLE_FOOTER, file=out_fd)

    def generate_test_tables(self):
        """
        Build the test tables used by the test suite

        Writes one fixed-length binary record per BMP code point, in code
        point order. A record holds a length byte followed by the expected
        output bytes, and is zero padded to the record length.
        """
        # Sequences ordered by id, so that (id - SEQUENCE_BASE) indexes them
        sequences = sorted(self.sequences, key=self.sequences.get)

        # The mapping for the NUL byte (\x00) should be an empty sequence
        output = [[]]
        for i in range(1, 0x10000):
            value = self.mapping[i]
            if value >= self.SEQUENCE_BASE:
                # Pull from sequence table
                output.append(sequences[value - self.SEQUENCE_BASE])
            else:
                output.append([value])

        # Find the longest length sequence (add 1 for the length byte)
        longest = max(len(seq) for seq in output) + 1

        # Smallest power of two that can hold the longest record
        record_length = 1 << (longest - 1).bit_length()

        with open(self.output_map, 'wb') as output_map:
            for seq in output:
                record = bytes([len(seq), *seq])
                output_map.write(record.ljust(record_length, b'\0'))
def main(argv=None):
    """
    Command line entry point

    `argv` defaults to sys.argv and must hold the program name followed by
    the input map, the output C file and the output binary map. With any
    other number of arguments a usage message is written to stderr and the
    process exits with status 1.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) != 4:
        # The third output is written as binary records, not JSON
        sys.stderr.write(f"Usage: {argv[0]} <input-map> <output-c-file> <output-binary-map>\n")
        sys.exit(1)

    # Constructing the table performs the whole generation run
    BMPTable(argv[1], argv[2], argv[3])


if __name__ == "__main__":
    main()