mirror of https://github.com/nirenjan/libx52.git
278 lines
9.6 KiB
Python
Executable File
278 lines
9.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# Character map generator
|
|
#
|
|
# Copyright (C) 2012-2026 Nirenjan Krishnan (nirenjan@nirenjan.org)
|
|
#
|
|
# SPDX-License-Identifier: GPL-2.0-only WITH Classpath-exception-2.0
|
|
"""
|
|
Generator script to parse character mapping
|
|
for the X52/X52 Pro MFD
|
|
"""
|
|
|
|
import sys
|
|
import re
|
|
import json
|
|
import unicodedata
|
|
|
|
class LineFormatError(ValueError):
    """Signals that a mapping descriptor line could not be parsed.

    Subclasses ValueError, so callers that already catch ValueError
    will also catch this error.
    """
|
|
|
|
class BMPTable:
    """
    Sparse table for Basic Multilingual Plane

    Constructing an instance runs the whole pipeline: the input map is
    read, extended to every code point in the range 0x0000-0xFFFF, written
    out as C lookup tables, and finally dumped as fixed-size binary records
    for the test suite.

    Attributes populated during construction:
        mapping:    dict of code point -> output value. Values of
                    FIRST_SEQUENCE_ID and above are sequence identifiers
                    assigned by build_extended_map.
        pages:      dict of 256-entry page tuple -> C table name.
        sequences:  dict of output sequence tuple -> sequence identifier.
        root_table: list of C table names, one per 256 code point page.
    """

    # Output value used for anything that has no usable mapping
    REPLACEMENT_CHAR = 0xDB

    HEADER = """/*
 * Autogenerated tables for X52 MFD character lookup
 *
 * DO NOT EDIT
 */

#include <stdint.h>

"""

    TABLE_NAME_FORMAT = 'bmp_page_%02x'
    TABLE_NAME_DEFAULT = 'bmp_page_default'
    TABLE_FORMAT = 'const uint16_t %s[256] = {'
    TABLE_FOOTER = '};\n'

    # First identifier handed out to a multi-character sequence
    FIRST_SEQUENCE_ID = 256

    # Matches a comment running to the end of the line; compiled once
    # instead of once per parsed line
    _COMMENT_RE = re.compile(r'#.*$')

    def __init__(self, input_file, output_file, output_map):
        """
        Run the full generation pipeline.

        Args:
            input_file: path of the text mapping file to read.
            output_file: path of the C source file to write.
            output_map: path of the binary record file to write.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.output_map = output_map
        self.mapping = {}
        self.pages = {}
        self.sequences = {}
        self.root_table = []

        self.read_map()
        self.build_extended_map()
        self.build_tables()
        self.generate_test_tables()

    @staticmethod
    def parse_line(data):
        """
        Parse a line containing a mapping descriptor. The mapping descriptor
        must start with a hexadecimal unicode code point, followed by either a
        single character, or a hexadecimal integer that corresponds to the map
        value.

        Everything from the first '#' to the end of the line is discarded as
        a comment before parsing.

        Returns:
            A (code_point, target) tuple of integers, or (None, None) when
            the line holds nothing but whitespace and/or a comment.

        Raises:
            LineFormatError: if the line does not consist of exactly two
                whitespace separated fields, or a field cannot be converted
                to an integer.
        """
        # Strip off comments, then leading and trailing whitespace
        data = BMPTable._COMMENT_RE.sub('', data).strip()

        # If the line is empty, it is a comment line
        if not data:
            return None, None

        # Find the code point and the target value
        try:
            code_point, target = data.split()
        except ValueError as exc:
            # Raised when there are either too many, or not enough values in
            # the string
            raise LineFormatError(f'Invalid descriptor format "{data}"') from exc

        # Convert the string to its equivalent numeric value. Base 0 lets
        # int() honour a 0x/0o/0b prefix on the literal.
        try:
            code_point = int(code_point, 0)
        except ValueError as exc:
            raise LineFormatError(f'Invalid code point "{code_point}"') from exc

        # A one character target is taken literally (so "5" means the
        # character '5', not the integer 5)
        if len(target) == 1:
            return code_point, ord(target)

        # Anything longer must parse as an integer
        try:
            return code_point, int(target, 0)
        except ValueError as exc:
            raise LineFormatError(f'Invalid map value "{target}"') from exc

    def read_map(self):
        """
        Read the mapping tables from the config file

        Every parsed (source, target) pair is stored in self.mapping. When
        the source code point has a different single character NFKC form,
        that form is mapped to the same target unless it already has an
        entry.

        Raises:
            LineFormatError: propagated from parse_line for a bad line.
        """
        def map_normalized(char, dst):
            # Try to normalize the unicode character as NFKC
            original = chr(char)
            normalized = unicodedata.normalize('NFKC', original)

            # Compare string to string; comparing against the integer code
            # point could never be true.
            if normalized == original:
                # This is already in normalized form
                return

            if len(normalized) != 1:
                return

            # This is only needed to ensure that we get the normalized
            # forms for example, half-width Katakana characters are
            # normalized to their corresponding full width versions.
            # However, we don't want to overwrite existing mappings,
            # since something like Lowercase A with grave could be
            # normalized to lowercase A, which would break the translation
            self.mapping.setdefault(ord(normalized), dst)

        with open(self.input_file, 'r', encoding='utf-8') as infile:
            for line in infile:
                src, dst = self.parse_line(line)
                if src is None:
                    continue

                self.mapping[src] = dst
                map_normalized(src, dst)

    def build_extended_map(self):
        """
        Build the extended map for every character in the BMP

        Code points that are already mapped are left alone. Every other code
        point in 0x0000-0xFFFF receives one of:
          * REPLACEMENT_CHAR, for UTF-16 surrogates, for characters whose
            NFKC form has no usable mapping, and for multi-character forms
            that would consist solely of replacement and space output;
          * the value mapped to its single character NFKC form;
          * a sequence identifier (FIRST_SEQUENCE_ID and up) keying an entry
            in self.sequences, for multi-character NFKC forms.

        NOTE(review): values read from the map file are assumed to be below
        FIRST_SEQUENCE_ID; nothing here checks that.
        """
        replacement = self.REPLACEMENT_CHAR
        self.mapping[0] = 0  # Handle NUL

        # Iterate over the basic multilingual plane
        for i in range(0x10000):
            if i in self.mapping:
                continue

            if 0xD800 <= i <= 0xDFFF:
                # UTF16 surrogate pairs - we want to mark it as a box character
                self.mapping[i] = replacement
                continue

            normalized = unicodedata.normalize('NFKC', chr(i))
            if len(normalized) == 1:
                # Reuse the normalized character's mapping, falling back to
                # the box character when no single character mapping exists
                self.mapping[i] = self.mapping.get(ord(normalized), replacement)
                continue

            # Translate each character of the normalized form, substituting
            # the box character for anything not (yet) in the mapping table
            sequence = tuple(self.mapping.get(ord(c), replacement)
                             for c in normalized)

            # A sequence made only of box characters and/or spaces carries
            # no information, so collapse it to a single box character
            blank = self.mapping[0x20]
            if all(c in (replacement, blank) for c in sequence):
                self.mapping[i] = replacement
                continue

            if sequence not in self.sequences:
                # Identifiers are handed out consecutively, so the next free
                # one follows directly from the number already assigned
                self.sequences[sequence] = (self.FIRST_SEQUENCE_ID
                                            + len(self.sequences))

            self.mapping[i] = self.sequences[sequence]

    def output_c_table(self, page_tuple, out_fd):
        """
        Output the C table structure

        Args:
            page_tuple: tuple of table values; it must already be a key of
                self.pages, which supplies the table name.
            out_fd: writable text stream receiving the C source.
        """
        page_name = self.pages[page_tuple]

        print(self.TABLE_FORMAT % (page_name), file=out_fd)

        for i, val in enumerate(page_tuple):
            print(f"0x{val:02x}, ", end='', file=out_fd)
            if i % 8 == 7:
                # Close each row of eight values with its index range
                print(f"// 0x{i-7:02x}-0x{i:02x}", file=out_fd)

        print(self.TABLE_FOOTER, file=out_fd)

    def build_tables(self):
        """
        Build the C Tables

        Writes the file header, the default page, one table per distinct
        256 entry page, the root table of page names and the sequence table
        to self.output_file. Identical pages are emitted once and shared.

        Raises:
            RuntimeError: if a sequence has 256 or more elements.
        """
        with open(self.output_file, 'w', encoding='utf-8') as out_fd:
            print(self.HEADER, file=out_fd)

            # The all-replacement page is always emitted, and goes first
            default_page = (self.REPLACEMENT_CHAR,) * 256
            self.pages[default_page] = self.TABLE_NAME_DEFAULT
            self.output_c_table(default_page, out_fd)

            for root_idx in range(256):
                base_idx = root_idx * 256
                page_tuple = tuple(self.mapping[idx]
                                   for idx in range(base_idx, base_idx + 256))

                if page_tuple not in self.pages:
                    # First time this page content is seen: name and emit it
                    self.pages[page_tuple] = self.TABLE_NAME_FORMAT % (root_idx)
                    self.output_c_table(page_tuple, out_fd)

                self.root_table.append(self.pages[page_tuple])

            print(self.TABLE_FORMAT % ('* root_table'), file=out_fd)

            for page_id, page_name in enumerate(self.root_table):
                print(f" {page_name}, // 0x{page_id:02x}", file=out_fd)

            print(self.TABLE_FOOTER, file=out_fd)

            print(f"const uint8_t *sequence_table[{len(self.sequences)}] = {{", file=out_fd)
            for sequence, seq_id in self.sequences.items():
                seq_len = len(sequence)
                if seq_len >= 256:
                    raise RuntimeError("Sequence way too long")

                # Each entry is the length followed by the elements
                line = ', '.join(f"0x{value:02X}"
                                 for value in (seq_len, *sequence))
                print(f' [{seq_id-256}] = (const uint8_t[]){{ {line} }},', file=out_fd)

            print(self.TABLE_FOOTER, file=out_fd)

    def generate_test_tables(self):
        """
        Build the test tables used by the test suite

        Writes one fixed-size binary record per code point, in code point
        order, to self.output_map. A record is a length byte followed by
        the expected output values, zero padded to the record length. The
        record length is the smallest power of two that holds the longest
        record.
        """
        # Sequence is a dict of <seq_tuple>:<seq_id> mappings (seq_id starts
        # from 256); order the tuples by id so they can be indexed directly
        sequences = [item[0] for item in sorted(self.sequences.items(),
                                                key=lambda item: item[1])]

        # The mapping for the NUL byte (\x00) should be an empty sequence
        output = [[]]

        for i in range(1, 0x10000):
            seq = self.mapping[i]
            if seq >= self.FIRST_SEQUENCE_ID:
                # Pull from sequence table
                output.append(sequences[seq - self.FIRST_SEQUENCE_ID])
            else:
                output.append([seq])

        # Find the longest length sequence (add 1 for the length byte)
        longest = max(len(seq) for seq in output) + 1

        # Smallest power of two that is >= longest; an exact power of two
        # is kept as-is
        record_length = 1 << (longest - 1).bit_length()

        with open(self.output_map, 'wb') as output_map:
            pad = [0] * record_length
            for seq in output:
                record = [len(seq)] + list(seq) + pad
                output_map.write(bytes(record[:record_length]))
|
|
|
|
if __name__ == "__main__":
    # Exactly three positional arguments are required: the input map, the
    # C file to generate and the map file to generate.
    # NOTE(review): the usage text calls the third file a "json" map, but
    # BMPTable opens it in binary mode ('wb') - confirm the intended wording.
    if len(sys.argv) == 4:
        BMPTable(*sys.argv[1:])
    else:
        sys.stderr.write(f"Usage: {sys.argv[0]} <input-map> <output-c-file> <output-json-map>\n")
        sys.exit(1)
|