From 0cb137bbe0d3caa73557e0614ecc12c5579e6f66 Mon Sep 17 00:00:00 2001
From: nirenjan <nirenjan@gmail.com>
Date: Mon, 16 Mar 2026 09:57:55 -0700
Subject: [PATCH] feat: Handle the entire BMP in libx52util

Prior to this change, the libx52util_convert_utf8_string function had a
limited set of characters that it would convert to the MFD character
map, these characters were derived from the x52_char_map.cfg file.
However, this is a tiny subset of the actual supported characters in the
Basic Multilingual Plane (BMP), since many characters in the BMP can be
normalized to a different character (or character sequence) that has a
corresponding glyph on the X52 MFD.

One example of this is the half-width Katakana characters which are
mapped in the display, however the corresponding full-width characters
were not explicitly mapped. With this commit, the generator script now
automatically detects that the half-width characters can be normalized
to the corresponding full width forms, and maps the full width forms
back to the correct characters on the MFD.

A second benefit of this change is that the MFD can now show characters
that would otherwise never be seen, for example, the 3/4 symbol or 5/8
symbol have no corresponding glyph in the MFD, but they can be
translated to the sequence `3` `/` `4`, giving us much more flexibility
on the characters that can actually be displayed.

Finally, with this change, the function also maps missing or unsupported
characters to the box character (0xDB in the display), making it clearer
that there was something there that could not be displayed. Earlier, it
would have simply skipped that character.
---
 libx52util/meson.build           |  27 +-
 libx52util/x52_char_map.cfg      |  10 +
 libx52util/x52_char_map.h        |  17 +-
 libx52util/x52_char_map_gen.py   | 418 ++++++++++++++++++-------------
 libx52util/x52_char_map_lookup.c | 123 ++++++---
 libx52util/x52_char_map_test.c   | 195 ++++++++++++++
 libx52util/x52_map_test_gen.py   | 107 --------
 7 files changed, 556 insertions(+), 341 deletions(-)
 create mode 100644 libx52util/x52_char_map_test.c
 delete mode 100755 libx52util/x52_map_test_gen.py

diff --git a/libx52util/meson.build b/libx52util/meson.build
index b52f344..9950605 100644
--- a/libx52util/meson.build
+++ b/libx52util/meson.build
@@ -1,13 +1,13 @@
 # libx52util
-libx52util_version = '1.0.1'
+libx52util_version = '1.0.2'
 gen_script = files('x52_char_map_gen.py')[0]
 
 util_char_map = custom_target('util-char-map',
   build_by_default: false,
   depend_files: ['x52_char_map_gen.py', 'x52_char_map.cfg'],
-  command: [python, gen_script, '@INPUT@', '@OUTPUT@'],
+  command: [python, gen_script, '@INPUT@', '@OUTPUT0@', '@OUTPUT1@'],
   input: 'x52_char_map.cfg',
-  output: 'util_char_map.c')
+  output: ['util_char_map.c', 'x52_char_map.bin'])
 
 lib_libx52util = library('x52util', util_char_map, 'x52_char_map_lookup.c',
   install: true,
@@ -23,21 +23,14 @@ pkgconfig.generate(lib_libx52util,
   version: libx52util_version,
 )
 
-test_gen_script = files('x52_map_test_gen.py')[0]
-
-libx52util_map_test_src = custom_target('libx52util-map-test-src',
-  build_by_default: false,
-  depend_files: ['x52_map_test_gen.py', 'x52_char_map.cfg'],
-  command: [python, test_gen_script, '@INPUT@', '@OUTPUT@'],
-  input: 'x52_char_map.cfg',
-  output: 'x52_map_test.c'
-  )
-
-libx52util_map_test = executable('libx52util-map-test', libx52util_map_test_src,
-  dependencies: [dep_cmocka],
-  link_with: [lib_libx52util],
+libx52util_bmp_test = executable(
+  'libx52util-bmp-test',
+  'x52_char_map_test.c',
   build_by_default: false,
   include_directories: [includes, lib_libx52util.private_dir_include()],
+  link_with: [lib_libx52util]
   )
 
-test('libx52util-map-test', libx52util_map_test, protocol: 'tap')
+test('libx52util-bmp-test', libx52util_bmp_test,
+  protocol: 'tap',
+  args: [util_char_map[1]])
diff --git a/libx52util/x52_char_map.cfg b/libx52util/x52_char_map.cfg
index cf6eda1..26369da 100644
--- a/libx52util/x52_char_map.cfg
+++ b/libx52util/x52_char_map.cfg
@@ -324,3 +324,13 @@
 0xFF9E  0xDE    # HALFWIDTH KATAKANA VOICED SOUND MARK
 0xFF9F  0xDF    # HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
 
+# The following characters are manually added to aid in normalization to the
+# X52 character map
+0x2215  0x2F    # DIVISION SLASH
+0x2044  0x2F    # FRACTION SLASH
+0x00B0  0xDF    # DEGREE SIGN
+# Note: while Greek letters aren't actually supported by the MFD character map,
+# this is manually added to map the letter 'mu' to ASCII 'u'. This is needed
+# in the CJK compatibility page (0x3300-0x33FF) to deal with the square latin
+# abbreviations
+0x03BC  0x75    # GREEK SMALL LETTER MU
diff --git a/libx52util/x52_char_map.h b/libx52util/x52_char_map.h
index 6dd4470..3abe175 100644
--- a/libx52util/x52_char_map.h
+++ b/libx52util/x52_char_map.h
@@ -12,20 +12,7 @@
 #include <stddef.h>
 #include <stdint.h>
 
-enum {
-    TYPE_INVALID = 0,   /* Invalid type (default) */
-
-    TYPE_POINTER,       /* Pointer target */
-
-    TYPE_ENTRY          /* Map entry value */
-};
-
-struct map_entry {
-    struct map_entry *next; /* Pointer to the next table */
-    uint8_t type;           /* Type of entry */
-    uint8_t value;          /* Value is valid if this is of TYPE_ENTRY */
-};
-
-extern struct map_entry map_root[];
+extern const uint16_t *root_table[256];
+extern const uint8_t *sequence_table[];
 
 #endif /* !defined X52_CHAR_MAP_H */
diff --git a/libx52util/x52_char_map_gen.py b/libx52util/x52_char_map_gen.py
index f52a9d6..eaf8072 100755
--- a/libx52util/x52_char_map_gen.py
+++ b/libx52util/x52_char_map_gen.py
@@ -1,7 +1,7 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # Character map generator
 #
-# Copyright (C) 2012-2018 Nirenjan Krishnan (nirenjan@nirenjan.org)
+# Copyright (C) 2012-2026 Nirenjan Krishnan (nirenjan@nirenjan.org)
 #
 # SPDX-License-Identifier: GPL-2.0-only WITH Classpath-exception-2.0
 """
@@ -11,191 +11,267 @@ for the X52/X52 Pro MFD
 
 import sys
 import re
-
-AUTOGEN_HEADER = """
-/*
- * Autogenerated character map file for Saitek X52 Pro
- * Generated from %s
- */
-
-#include "x52_char_map.h"
-
-"""
-
-
-class MapTable(object):
-    """
-    Defines a MapTable entry, with each entry storing the value seen so far,
-    the type of the entry, and the value, if it's a value node.
-    """
-    # Empty list
-    root = [None] * 256
-
-    def __init__(self, value_so_far, map_value=None):
-        self.next_level = [None] * 256
-        self.value_so_far = value_so_far
-        self.map_value = map_value
-
-    def output_nodes(self):
-        """
-        Output the individual nodes
-        """
-        output_lines = []
-        output_count = 0
-        for node in self.next_level:
-            if node is not None:
-                output_lines.extend(node.output_nodes())
-                output_count += 1
-
-        if output_count != 0:
-            struct_header = 'static struct map_entry table_%x[64] = {' % \
-                            self.value_so_far
-            output_lines.append(struct_header)
-
-            for node_index in range(0, 256):
-                node = self.next_level[node_index]
-                if node is not None:
-                    output_lines.append(self.dump_entry_line(0x80, node_index,
-                                                             node.value_so_far,
-                                                             node.map_value))
-
-            output_lines.extend(['};', ''])
-
-        return output_lines
-
-    @staticmethod
-    def dump_entry_line(offset, node_index, value_so_far, map_value):
-        """
-        Dump the array entry for the current node
-        """
-        if map_value is None:
-            node_entry_line = '\t[0x%02x] = { table_%x, TYPE_POINTER, 0 },' % \
-                (node_index - offset, value_so_far)
-        else:
-            node_entry_line = '\t[0x%02x] = { NULL, TYPE_ENTRY, 0x%02x },' % \
-                (node_index - offset, map_value)
-
-        return node_entry_line
-
-    @classmethod
-    def add_to_table(cls, input_val, map_val):
-        """
-        Add a map value to the lookup table
-        """
-        try:
-            uchr = unichr(input_val)
-        except NameError:
-            # Python 3 doesn't have unichr, but chr should work
-            uchr = chr(input_val)
-
-        utf8_str = uchr.encode('utf-8')
-        # Python2 returns the encoded result as a string, wheras
-        # Python3 returns the result as a bytearray. Converting
-        # the string (or bytearray) into a bytearray ensures that
-        # this can be run in both Python2 and Python3
-        utf8_vals = [c for c in bytearray(utf8_str)]
-
-        value_so_far = 0
-        level = cls.root
-        for index, char in enumerate(utf8_vals):
-            value_so_far = (value_so_far << 8) | char
-            if index < (len(utf8_vals) - 1):
-                node = level[char]
-                if node is None:
-                    node = cls(value_so_far)
-                    level[char] = node
-
-                level = level[char].next_level
-            else:
-                node = cls(value_so_far, map_val)
-                level[char] = node
-
-    @classmethod
-    def output_table_as_list(cls):
-        """
-        Output the map table as a list of lines
-        """
-        output_lines = []
-        for node in cls.root:
-            if node is not None:
-                output_lines.extend(node.output_nodes())
-
-        output_lines.append('struct map_entry map_root[256] = {')
-
-        for node_index in range(0, 256):
-            node = cls.root[node_index]
-            if node is not None:
-                output_lines.append(cls.dump_entry_line(0x0, node_index,
-                                                        node.value_so_far,
-                                                        node.map_value))
-
-        output_lines.extend(['};', ''])
-
-        return output_lines
-
+import json
+import unicodedata
 
 class LineFormatError(ValueError):
     """
     Error class for parser
     """
 
-def parse_line(data):
+class BMPTable:
     """
-    Parse a line containing a mapping descriptor. The mapping descriptor
-    must start with a hexadecimal unicode code point, followed by either a
-    single character, or a hexadecimal integer that corresponds to the map
-    value.
+    Sparse table for Basic Multilingual Plane
     """
-    # Strip off comments
-    data = re.sub(re.compile('#.*$'), '', data)
 
-    # Strip off leading and trailing whitespace
-    data = data.strip()
+    REPLACEMENT_CHAR = 0xDB
 
-    # If the line is empty, it is a comment line
-    if len(data) == 0:
-        return None, None
+    HEADER = f"""/*
+* Autogenerated tables for X52 MFD character lookup
+*
+* DO NOT EDIT
+*/
 
-    # Find the code point and the target value
-    try:
-        code_point, target = data.strip().split()
-    except ValueError:
-        # Raised when there are either too many, or not enough values in
-        # the string
-        raise LineFormatError('Invalid descriptor format "%s"' % data)
+#include <stdint.h>
 
-    # Convert the string to its equivalent numeric value
-    try:
-        code_point = int(code_point, 0)
-    except ValueError:
-        raise LineFormatError('Invalid code point "%s"' % code_point)
+"""
 
-    # Check if the target is a single character
-    if len(target) == 1:
-        target = ord(target)
-    else:
-        # Try parsing the target as an integer
+    TABLE_NAME_FORMAT = 'bmp_page_%02x'
+    TABLE_NAME_DEFAULT = 'bmp_page_default'
+    TABLE_FORMAT = 'const uint16_t %s[256] = {'
+    TABLE_FOOTER = '};\n'
+
+    def __init__(self, input_file, output_file, output_map):
+        self.input_file = input_file
+        self.output_file = output_file
+        self.output_map = output_map
+        self.mapping = {}
+        self.pages = {}
+        self.sequences = {}
+        self.root_table = []
+
+        self.read_map()
+        self.build_extended_map()
+        self.build_tables()
+        self.generate_test_tables()
+
+    @staticmethod
+    def parse_line(data):
+        """
+        Parse a line containing a mapping descriptor. The mapping descriptor
+        must start with a hexadecimal unicode code point, followed by either a
+        single character, or a hexadecimal integer that corresponds to the map
+        value.
+        """
+        # Strip off comments
+        data = re.sub(re.compile('#.*$'), '', data)
+
+        # Strip off leading and trailing whitespace
+        data = data.strip()
+
+        # If the line is empty, it is a comment line
+        if len(data) == 0:
+            return None, None
+
+        # Find the code point and the target value
         try:
-            target = int(target, 0)
-        except ValueError:
-            raise LineFormatError('Invalid map value "%s"' % target)
+            code_point, target = data.strip().split()
+        except ValueError as exc:
+            # Raised when there are either too many, or not enough values in
+            # the string
+            raise LineFormatError(f'Invalid descriptor format "{data}"') from exc
 
-    return code_point, target
+        # Convert the string to its equivalent numeric value
+        try:
+            code_point = int(code_point, 0)
+        except ValueError as exc:
+            raise LineFormatError(f'Invalid code point "{code_point}"') from exc
+
+        # Check if the target is a single character
+        if len(target) == 1:
+            target = ord(target)
+        else:
+            # Try parsing the target as an integer
+            try:
+                target = int(target, 0)
+            except ValueError as exc:
+                raise LineFormatError(f'Invalid map value "{target}"') from exc
+
+        return code_point, target
+
+    def read_map(self):
+        """Read the code point -> MFD byte mappings from the config file"""
+        def map_normalized(char, dst):
+            # Try to normalize the unicode character (int code point) as NFKC
+            normalized = unicodedata.normalize('NFKC', chr(char))
+            if normalized == chr(char):
+                # Already in normalized form (compare str to str, not to int)
+                return
+
+            if len(normalized) == 1:
+                normalized_char = ord(normalized)
+
+                if normalized_char not in self.mapping:
+                    # This is only needed to ensure that we get the normalized
+                    # forms. For example, half-width Katakana characters are
+                    # normalized to their corresponding full width versions.
+                    # However, we don't want to overwrite existing mappings,
+                    # since something like Lowercase A with grave could be
+                    # normalized to lowercase A, which would break the translation
+                    self.mapping[normalized_char] = dst
+
+        with open(self.input_file, 'r', encoding='utf-8') as infile:
+            for line in infile:
+                src, dst = self.parse_line(line)
+                if src is None:
+                    continue
+
+                self.mapping[src] = dst
+                map_normalized(src, dst)
+
+    def build_extended_map(self):
+        """Build the extended map for every character in the BMP"""
+        self.mapping[0] = 0 # Handle NUL
+        for i in range(0x10000):
+            # Iterate over the basic multilingual plane
+            if i in self.mapping:
+                continue
+
+            if 0xD800 <= i <= 0xDFFF:
+                # UTF16 surrogate pairs - we want to mark it as a box character
+                self.mapping[i] = self.REPLACEMENT_CHAR
+                continue
+
+            normalized = unicodedata.normalize('NFKC', chr(i))
+            if len(normalized) == 1:
+                normalized_ord = ord(normalized)
+                if normalized_ord in self.mapping:
+                    self.mapping[i] = self.mapping[normalized_ord]
+                else:
+                    # No single character mapping exists
+                    self.mapping[i] = self.REPLACEMENT_CHAR
+
+                continue
+
+            # Check that all characters in the normalized are in the mapping table:
+            sequence = []
+            for c in normalized:
+                if ord(c) in self.mapping:
+                    sequence.append(self.mapping[ord(c)])
+                else:
+                    sequence.append(self.REPLACEMENT_CHAR)
+
+            # Check if it only contains the box character, or box char and space,
+            # and reduce runs to a single instance
+            if all(c in (self.REPLACEMENT_CHAR, self.mapping[0x20])
+                   for c in sequence):
+                self.mapping[i] = self.REPLACEMENT_CHAR
+                continue
+
+            sequence = tuple(sequence)
+            if sequence not in self.sequences:
+                if not self.sequences:
+                    last_sequence = 256
+                else:
+                    last_sequence = max(self.sequences.values()) + 1
+
+                self.sequences[sequence] = last_sequence
+
+            self.mapping[i] = self.sequences[sequence]
+
+    def output_c_table(self, page_tuple, out_fd):
+        """Output the C table structure"""
+        page_name = self.pages[page_tuple]
+
+        print(self.TABLE_FORMAT % (page_name), file=out_fd)
+
+        for i, val in enumerate(page_tuple):
+            print(f"0x{val:02x}, ", end='', file=out_fd)
+            if i % 8 == 7:
+                print(f"// 0x{i-7:02x}-0x{i:02x}", file=out_fd)
+
+        print(self.TABLE_FOOTER, file=out_fd)
+
+    def build_tables(self):
+        """Build the C Tables"""
+        with open(self.output_file, 'w', encoding='utf-8') as out_fd:
+            print(self.HEADER, file=out_fd)
+
+            default_page = tuple([self.REPLACEMENT_CHAR] * 256)
+            self.pages[default_page] = self.TABLE_NAME_DEFAULT
+            self.output_c_table(default_page, out_fd)
+
+            for root_idx in range(256):
+                base_idx = root_idx * 256
+                page = [self.mapping[idx] for idx in range(base_idx, base_idx+256)]
+                page_tuple = tuple(page)
+                if page_tuple not in self.pages:
+                    page_name = self.TABLE_NAME_FORMAT % (root_idx)
+                    self.pages[page_tuple] = page_name
+                    self.output_c_table(page_tuple, out_fd)
+
+                self.root_table.append(self.pages[page_tuple])
+
+            print(self.TABLE_FORMAT % ('* root_table'), file=out_fd)
+
+            for page_id, page_name in enumerate(self.root_table):
+                print(f"    {page_name}, // 0x{page_id:02x}", file=out_fd)
+
+            print(self.TABLE_FOOTER, file=out_fd)
+
+            print(f"const uint8_t *sequence_table[{len(self.sequences)}] = {{", file=out_fd)
+            for sequence, seq_id in self.sequences.items():
+                seq_len = len(sequence)
+                if seq_len >= 256:
+                    raise RuntimeError("Sequence way too long")
+
+                line = [f"0x{seq_len:02X}"]
+                for seq_elem in sequence:
+                    line.append(f"0x{seq_elem:02X}")
+
+                line = ', '.join(line)
+                print(f'    [{seq_id-256}] = (const uint8_t[]){{ {line} }},', file=out_fd)
+
+            print(self.TABLE_FOOTER, file=out_fd)
+
+    def generate_test_tables(self):
+        """Write the fixed-size binary expectation records used by the test suite"""
+        # Build the expected output sequence for every BMP code point.
+        # self.mapping maps an int code point to an MFD byte (< 256) or a seq_id
+        # self.sequences maps <seq_tuple>:<seq_id> (seq_id starts from 256)
+        output = []
+        sequences = [item[0] for item in sorted(self.sequences.items(),
+                                                key=lambda item: item[1])]
+
+        # The mapping for the NUL byte (\x00) should be an empty sequence
+        output.append([])
+
+        for i in range(1, 0x10000):
+            seq = self.mapping[i]
+            if seq >= 256:
+                # Pull from sequence table
+                seq = sequences[seq - 256]
+            else:
+                seq = [seq]
+            output.append(seq)
+
+        # Find the longest length sequence (add 1 for the length byte)
+        longest = max(len(seq) for seq in output) + 1
+        # Round up to a power of two (the C test hard-codes RECORD_SIZE 8)
+        if (longest & (longest - 1)) == 0:
+            record_length = longest
+        else:
+            record_length = 1 << longest.bit_length()
+
+        with open(self.output_map, 'wb') as output_map:
+            pad = [0] * record_length
+            for seq in output:
+                record = [len(seq)] + list(seq) + pad
+                output_map.write(bytes(record[:record_length]))
 
 if __name__ == "__main__":
-    if len(sys.argv) != 3:
-        sys.stderr.write('Usage: %s <input-map> <output-c-file>\n' %
-                         sys.argv[0])
+    if len(sys.argv) != 4:
+        sys.stderr.write(f"Usage: {sys.argv[0]} <input-map> <output-c-file> <output-bin-map>\n")
         sys.exit(1)
 
-    with open(sys.argv[1], 'r') as infile:
-        for line in infile:
-            src, dst = parse_line(line)
-            if src is not None:
-                MapTable.add_to_table(src, dst)
-
-    with open(sys.argv[2], 'w') as outfile:
-        outfile.write(AUTOGEN_HEADER % sys.argv[1])
-
-        for line in MapTable.output_table_as_list():
-            outfile.write(line + '\n')
+    BMPTable(sys.argv[1], sys.argv[2], sys.argv[3])
diff --git a/libx52util/x52_char_map_lookup.c b/libx52util/x52_char_map_lookup.c
index 04a02e7..93fcb51 100644
--- a/libx52util/x52_char_map_lookup.c
+++ b/libx52util/x52_char_map_lookup.c
@@ -11,11 +11,63 @@
 
 #include "config.h"
 #include <stdint.h>
+#include <string.h>
 #include <errno.h>
 
 #include "libx52util.h"
 #include "x52_char_map.h"
 
+/**
+ * @brief Converts a UTF8 stream to a uint32_t
+ *
+ * @param[in]       utf8in  Pointer to UTF8 input stream. Must be NUL-terminated
+ * @param[out]      unichr  Output character pointer
+ *
+ * @returns number of bytes to advance stream by - 0 if NUL or input pointer is NULL
+ */
+static int utf8_to_u32(const uint8_t *utf8in, uint32_t *unichr)
+{
+    if (!utf8in || !*utf8in) return 0;
+
+    uint8_t b = utf8in[0];
+
+    // 1-byte (0xxxxxxx)
+    if (b < 0x80) {
+        *unichr = b;
+        return 1;
+    }
+
+    // Invalid leading bytes
+    if (b < 0xC2 || b > 0xF4) goto error;
+
+    // 2-byte (110xxxxx 10xxxxxx)
+    if ((b & 0xE0) == 0xC0) {
+        if ((utf8in[1] & 0xC0) != 0x80) goto error;
+        *unichr = ((b & 0x1F) << 6) | (utf8in[1] & 0x3F);
+        return 2;
+    }
+
+    // 3-byte (1110xxxx 10xxxxxx 10xxxxxx)
+    if ((b & 0xF0) == 0xE0) {
+        if ((utf8in[1] & 0xC0) != 0x80 || (utf8in[2] & 0xC0) != 0x80) goto error;
+        *unichr = ((b & 0x0F) << 12) | ((utf8in[1] & 0x3F) << 6) | (utf8in[2] & 0x3F);
+        return 3;
+    }
+
+    // 4-byte (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+    if ((b & 0xF8) == 0xF0) {
+        if ((utf8in[1] & 0xC0) != 0x80 || (utf8in[2] & 0xC0) != 0x80 ||
+                (utf8in[3] & 0xC0) != 0x80) goto error;
+        *unichr = ((b & 0x07) << 18) | ((utf8in[1] & 0x3F) << 12) |
+              ((utf8in[2] & 0x3F) << 6) | (utf8in[3] & 0x3F);
+        return 4;
+    }
+
+error:
+    *unichr = 0xFFFD; // Unicode Replacement Character
+    return 1;     // Consume lead byte to attempt resync
+}
+
 /**
  * @brief Convert UTF8 string to X52 character map.
  *
@@ -32,52 +84,61 @@
 int libx52util_convert_utf8_string(const uint8_t *input,
                                    uint8_t *output, size_t *len)
 {
-    struct map_entry *entry;
     size_t index;
     int retval = 0;
-    unsigned char local_index;
+    uint32_t unichr;
+    int bytes_consumed;
+    uint16_t translated;
 
     if (!input || !output || !len || !*len) {
         return -EINVAL;
     }
 
     index = 0;
-    entry = &map_root[*input];
+    // Reset the output array
+    memset(output, 0, *len);
+
     while (*input) {
-        input++;
-        if (entry->type == TYPE_ENTRY) {
-            output[index] = entry->value;
+        // Length check
+        if (index >= *len) {
+            retval = -E2BIG;
+            break;
+        }
+
+        bytes_consumed = utf8_to_u32(input, &unichr);
+        if (bytes_consumed == 0) {
+            // We should never get here, since the while loop should have
+            // caught it
+            retval = 0;
+            break;
+        }
+        input += bytes_consumed;
+
+        // Check for bytes in the Supplementary planes
+        if (unichr >= 0x10000) {
+            unichr = 0xFFFD; // Unicode replacement character
+        }
+
+        translated = root_table[unichr >> 8][unichr & 0xFF];
+        if (translated < 256) {
+            // Table entry, push to output
+            output[index] = (uint8_t)translated;
             index++;
-            if (index >= *len && *input) {
+        } else {
+            // We have a sequence, output that
+            const uint8_t *sequence = sequence_table[translated - 256];
+            uint8_t seq_len = sequence[0];
+
+            // Let's make sure that we can actually output to the buffer
+            if ((index + seq_len) >= *len) {
                 retval = -E2BIG;
                 break;
             }
-            entry = &map_root[*input];
-        } else if (entry->type == TYPE_POINTER) {
-            local_index = *input;
-            if (local_index < 0x80 || local_index >= 0xC0) {
-                /* Invalid input, skip till we find the start of another
-                 * valid UTF-8 character
-                 */
-                while (*input >= 0x80 && *input < 0xC0) {
-                    input++; /* Skip invalid characters */
-                }
 
-                /* New UTF-8 character, reset the entry pointer */
-                entry = &map_root[*input];
-            } else {
-                /* Mask off the upper bits, we only care about the lower 6 bits */
-                local_index &= 0x3F;
-                entry = &(entry->next[local_index]);
+            for (int i = 1; i <= seq_len; i++) {
+                output[index] = sequence[i];
+                index++;
             }
-        } else {
-            /* Invalid value, skip */
-            while (*input >= 0x80 && *input < 0xC0) {
-                input++; /* Skip invalid characters */
-            }
-
-            /* New UTF-8 character, reset the entry pointer */
-            entry = &map_root[*input];
         }
     }
 
diff --git a/libx52util/x52_char_map_test.c b/libx52util/x52_char_map_test.c
new file mode 100644
index 0000000..8648fe8
--- /dev/null
+++ b/libx52util/x52_char_map_test.c
@@ -0,0 +1,195 @@
+/*
+ * X52 character map lookup test
+ *
+ * Copyright (C) 2026 Nirenjan Krishnan <nirenjan@nirenjan.org>
+ *
+ * SPDX-License-Identifier: GPL-2.0-only WITH Classpath-exception-2.0
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "libx52util.h"
+
+// Fix this if we ever hit longer sequences
+#define RECORD_SIZE 8
+
+// Blindly encode a code point into its smallest UTF-8 representation
+static void encode_utf8(uint32_t cp, uint8_t *out)
+{
+    if (cp <= 0x7F) {
+        out[0] = (uint8_t)cp;
+    } else if (cp <= 0x7FF) {
+        out[0] = (uint8_t)(0xC0 | (cp >> 6));
+        out[1] = (uint8_t)(0x80 | (cp & 0x3F));
+    } else if (cp <= 0xFFFF) {
+        out[0] = (uint8_t)(0xE0 | (cp >> 12));
+        out[1] = (uint8_t)(0x80 | ((cp >> 6) & 0x3F));
+        out[2] = (uint8_t)(0x80 | (cp & 0x3F));
+    } else if (cp <= 0x1FFFFF) {
+        out[0] = (uint8_t)(0xF0 | (cp >> 18));
+        out[1] = (uint8_t)(0x80 | ((cp >> 12) & 0x3F));
+        out[2] = (uint8_t)(0x80 | ((cp >> 6) & 0x3F));
+        out[3] = (uint8_t)(0x80 | (cp & 0x3F));
+    } else if (cp <= 0x3FFFFFF) {
+        out[0] = (uint8_t)(0xF8 | (cp >> 24));
+        out[1] = (uint8_t)(0x80 | ((cp >> 18) & 0x3F));
+        out[2] = (uint8_t)(0x80 | ((cp >> 12) & 0x3F));
+        out[3] = (uint8_t)(0x80 | ((cp >> 6) & 0x3F));
+        out[4] = (uint8_t)(0x80 | (cp & 0x3F));
+    } else if (cp <= 0x7FFFFFFF) {
+        out[0] = (uint8_t)(0xFC | (cp >> 30));
+        out[1] = (uint8_t)(0x80 | ((cp >> 24) & 0x3F));
+        out[2] = (uint8_t)(0x80 | ((cp >> 18) & 0x3F));
+        out[3] = (uint8_t)(0x80 | ((cp >> 12) & 0x3F));
+        out[4] = (uint8_t)(0x80 | ((cp >> 6) & 0x3F));
+        out[5] = (uint8_t)(0x80 | (cp & 0x3F));
+    } else { // 0x80000000 to 0xFFFFFFFF (7 bytes)
+        out[0] = (uint8_t)0xFE; // Binary 11111110
+        out[1] = (uint8_t)(0x80 | ((cp >> 30) & 0x3F));
+        out[2] = (uint8_t)(0x80 | ((cp >> 24) & 0x3F));
+        out[3] = (uint8_t)(0x80 | ((cp >> 18) & 0x3F));
+        out[4] = (uint8_t)(0x80 | ((cp >> 12) & 0x3F));
+        out[5] = (uint8_t)(0x80 | ((cp >> 6) & 0x3F));
+        out[6] = (uint8_t)(0x80 | (cp & 0x3F));
+    }
+}
+
+int main(int argc, char *argv[])
+{
+    uint8_t input[8] = {0};
+    uint8_t output[RECORD_SIZE];
+    size_t len;
+    int result;
+
+    int fd;
+    uint8_t *expected_blob;
+    bool smp_pages_ok;
+
+    // Argument check: the only argument is the generated expectation blob
+    if (argc != 2) {
+        puts("Bail out! Invalid number of arguments");
+        puts("# Usage: libx52util-bmp-test <path-to-bin>");
+        return 1;
+    }
+
+    fd = open(argv[1], O_RDONLY);
+    if (fd < 0) {
+        printf("Bail out! Error %d opening bin file %s: %s\n",
+                errno, argv[1], strerror(errno));
+        return 1;
+    }
+
+    expected_blob = mmap(NULL, 0x10000 * RECORD_SIZE,
+                         PROT_READ, MAP_SHARED, fd, 0);
+    if (expected_blob == MAP_FAILED) {
+        printf("Bail out! MMAP failed with error %d: %s\n",
+                errno, strerror(errno));
+        return 1; // Never dereference MAP_FAILED in the loops below
+    }
+
+    puts("TAP version 13");
+    // Check the 256 BMP Pages, plus the supplementary pages
+    puts("1..257");
+
+    for (uint32_t page = 0; page < 256; page++) {
+        bool page_ok = true;
+
+        for (uint32_t offset = 0; offset < 256; offset++) {
+            uint32_t cp = page * 256 + offset;
+            const uint8_t *rec = &expected_blob[cp * RECORD_SIZE];
+
+            memset(input, 0, sizeof(input));
+            memset(output, 0, sizeof(output));
+            encode_utf8(cp, input);
+            len = sizeof(output);
+
+            result = libx52util_convert_utf8_string(input, output, &len);
+            if (result != 0) {
+                page_ok = false;
+                printf("# Bad result @ %04X: %d\n", cp, result);
+                break;
+            }
+
+            // result is OK, check against the expected blob
+            if (len != rec[0]) {
+                page_ok = false;
+                printf("# Length mismatch @ %04X: expected %u, got %zu\n",
+                        cp, rec[0], len);
+                break;
+            }
+
+            // Length is OK, check the bytes
+            if (memcmp(output, &rec[1], rec[0]) != 0) {
+                page_ok = false;
+                printf("# Output mismatch @ %04X:\n", cp);
+                printf("# exp/got:");
+                for (int i = 0; i < len; i++) {
+                    printf("%02X/%02X ", rec[i+1], output[i]);
+                }
+                puts("");
+                break;
+            }
+        }
+
+        printf("%sok - %d Page 0x%02x\n", page_ok ? "": "not ",
+                page + 1, page);
+    }
+
+    // Supplementary planes 1..16: every code point must map like U+FFFD
+    smp_pages_ok = true;
+    for (uint32_t smp = 0x1; smp <= 0x10; smp++) {
+        const uint8_t *rec = &expected_blob[0xFFFD * RECORD_SIZE];
+        for (uint32_t offset = 0; offset < 0x100; offset += 0xFF) {
+            uint32_t cp = smp * 0x10000 + offset; // plane base is smp << 16
+
+            memset(input, 0, sizeof(input));
+            memset(output, 0, sizeof(output));
+            len = sizeof(output);
+            encode_utf8(cp, input);
+
+            result = libx52util_convert_utf8_string(input, output, &len);
+            if (result != 0) {
+                smp_pages_ok = false;
+                printf("# Bad result @ %08X: %d\n", cp, result);
+                break;
+            }
+
+            // result is OK, check against the expected blob
+            if (len != rec[0]) {
+                smp_pages_ok = false;
+                printf("# Length mismatch @ %08X: expected %u, got %zu\n",
+                        cp, rec[0], len);
+                break;
+            }
+
+            // Length is OK, check the bytes
+            if (memcmp(output, &rec[1], rec[0]) != 0) {
+                smp_pages_ok = false;
+                printf("# Output mismatch @ %08X:\n", cp);
+                printf("# exp/got:");
+                for (int i = 0; i < len; i++) {
+                    printf("%02X/%02X ", rec[i+1], output[i]);
+                }
+                puts("");
+                break;
+            }
+        }
+
+        if (!smp_pages_ok) {
+            break;
+        }
+    }
+    printf("%sok - 257 SMP tests\n", smp_pages_ok ? "" : "not ");
+
+    // Cleanup
+    munmap(expected_blob, 0x10000 * RECORD_SIZE);
+    close(fd);
+    return 0;
+}
diff --git a/libx52util/x52_map_test_gen.py b/libx52util/x52_map_test_gen.py
deleted file mode 100755
index 60567d6..0000000
--- a/libx52util/x52_map_test_gen.py
+++ /dev/null
@@ -1,107 +0,0 @@
-#!/usr/bin/env python3
-"""Generate a test script for the convert function"""
-
-import argparse
-import re
-
-def parse_file(map_file):
-    """Read the map file, strip out comments, and return a dictionary that
-       maps the UTF-8 encoded string to the X52 MFD character"""
-
-    # If we are running this, then we know that the input map is likely
-    # in a sane format already.
-    char_dict = {}
-
-    with open(map_file, 'r', encoding='utf-8') as map_fd:
-        for line in map_fd:
-            line = re.sub(r'#.*$', '', line).strip()
-
-            if not line:
-                # Comment line, skip
-                continue
-
-            key, out = line.split()
-            in_char = int(key, 0)
-
-            if len(out) == 1:
-                out_byte = ord(out)
-            else:
-                out_byte = int(out, 0)
-
-            char_dict[in_char] = out_byte
-
-    return char_dict
-
-def generate_positive_test_cases(char_dict):
-    """Generate a set of positive test cases"""
-    # For every string in the dictionary, generate a test case that tests
-    # the input against the output
-    TEST_CASE_FMT = """
-static void test_map_{in_char}(void **state) {{
-    (void)state;
-    const uint8_t input_array[] = {{ {in_bytes}, 0 }};
-    const uint8_t expected_output[2] = {{ {out_byte}, 0 }};
-    size_t out_len = 20;
-    uint8_t output[20] = {{ 0 }};
-    int rc;
-
-    rc = libx52util_convert_utf8_string(input_array, output, &out_len);
-    assert_int_equal(rc, 0);
-    assert_int_equal(out_len, 1);
-    assert_memory_equal(output, expected_output, 2);
-}}
-"""
-
-    output = ""
-    for in_char, out_byte in char_dict.items():
-        in_bytes = ", ".join(hex(c) for c in chr(in_char).encode('utf-8'))
-        in_tc = hex(in_char)
-
-        output += TEST_CASE_FMT.format(in_char=in_tc, in_bytes=in_bytes, out_byte=out_byte)
-
-    output += """
-const struct CMUnitTest tests[] = {
-"""
-
-    for in_char in sorted(char_dict.keys()):
-        output += f"    cmocka_unit_test(test_map_{hex(in_char)}),\n"
-
-    output += '};\n'
-
-    return output
-
-TEST_HEADER = """
-#include <stdint.h>
-#include <stddef.h>
-#include <stdarg.h>
-#include <setjmp.h>
-#include <cmocka.h>
-
-#include "libx52util.h"
-"""
-
-TEST_FOOTER = """
-int main(void) {
-    cmocka_set_message_output(CM_OUTPUT_TAP);
-    cmocka_run_group_tests(tests, NULL, NULL);
-    return 0;
-}
-"""
-
-def main():
-    """Generate X52 map test suite"""
-    parser = argparse.ArgumentParser(description='Generate map test cases')
-    parser.add_argument('INPUT_FILE', help="Input character map file")
-    parser.add_argument('OUTPUT_FILE', help="Generated test script")
-    args = parser.parse_args()
-
-    char_dict = parse_file(args.INPUT_FILE)
-    test_cases = generate_positive_test_cases(char_dict)
-
-    with open(args.OUTPUT_FILE, 'w', encoding='utf-8') as out_fd:
-        print(TEST_HEADER, file=out_fd)
-        print(test_cases, file=out_fd)
-        print(TEST_FOOTER, file=out_fd)
-
-if __name__ == '__main__':
-    main()