From 8801c073151dc765d451a58046ba1f42b1a56cfc Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Mon, 17 Jul 2023 11:15:07 +0200 Subject: [PATCH 01/17] nls: Add tool to automate Compose sequences comments Currently the comments of the Compose sequences are written manually. Unfortunately there are inconsistencies that are time-consuming to fix and easy to re-introduce. This commit adds a new Python script to detect these inconsistencies and optionally fix them automatically. --- nls/compose-check.py | 157 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100755 nls/compose-check.py diff --git a/nls/compose-check.py b/nls/compose-check.py new file mode 100755 index 00000000..bc13bcd1 --- /dev/null +++ b/nls/compose-check.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 + +import sys + +MINIMUM_PYTHON_VERSION = (3, 10) +if sys.version_info < MINIMUM_PYTHON_VERSION: + raise Exception( + "Minimal Python version required: " + ".".join(map(str, MINIMUM_PYTHON_VERSION)) + ) + +import argparse +from io import TextIOWrapper +from pathlib import Path +import re +import shutil +import tempfile +from typing import Any, Generator, Sequence +import unicodedata + + +SEQUENCE_PATTERN = re.compile( + r"""^(?P<sequence>[^:]+) + :\s+ + "(?P<string>(?:\\"|[^"])+)" + (?:\s+(?P<keysym>\w+))? + (?:(?P<space>\s+)\#\s*(?P<comment>.*))? 
+ """, + re.VERBOSE, +) +"""A pattern for Compose entries""" + + +def _unescape(s: str) -> Generator[str, Any, None]: + """Unescape a Compose file string""" + pending_escape = False + for c in s: + # WARNING: probably incomplete, but sufficient for now + if pending_escape: + match c: + case "\\": + yield c + pending_escape = False + break + case '"': + yield c + pending_escape = False + break + case _: + raise ValueError(f"Invalid escape sequence: “{s}”") + elif c == "\\": + pending_escape = True + else: + yield c + if pending_escape: + raise ValueError(f"Incomplete escape sequence: “{s}”") + + +def unescape(s: str) -> str: + return "".join(_unescape(s)) + + +def unicode_name(c: str, is_first: bool) -> str: + # TODO: we should use Unicode *corrected* names! + # But the Python API does not propose those. + name = unicodedata.name(c, None) + if name is None: + raise ValueError(f"Cannot find Unicode name for: “{c}” (U+{ord(c):0>4X})") + # RULE: remove “ACCENT” from the name, when the character is combining and + # is not in first position + if not is_first and "COMBINING" in name and name.endswith("ACCENT"): + return name[:-7] + else: + return name + + +def make_comment(s: str) -> str: + """Make the comment of a Compose sequence, based on its result.""" + return " plus ".join(unicode_name(c, k == 0) for k, c in enumerate(s)) + + +# TODO: we probably also want to check that the keysyms are correct and not deprecated +def process_lines(fd: TextIOWrapper): + multi_line_comment = False + for n, line in enumerate(fd, start=1): + # Handle pending multi-line comment + if multi_line_comment: + if line.strip().endswith("*/"): + multi_line_comment = False + yield line + # Handle single-line comment & include + elif not line.strip() or any( + line.startswith(s) for s in ("XCOMM", "#", "include") + ): + yield line + # Handle start of a multi-line comment + elif line.startswith("/*"): + # Check if one-liner + if not line.strip().endswith("*/"): + multi_line_comment = True + yield 
line + # Handle compose sequence + elif m := SEQUENCE_PATTERN.match(line): + string = unescape(m.group("string")) + expected_comment = make_comment(string) + # Check if we have the expected comment + # NOTE: Some APL sequences provide the combo of composed characters + if not ( + m.group("comment") == expected_comment + or (m.group("comment") and m.group("comment")[4:] == expected_comment) + ): + print( + f"[WARNING] Line {n}: Expected “{expected_comment}” comment, " + f"got: “{m.group('comment')}”", + file=sys.stderr, + ) + keysym = "" if m.group("keysym") is None else f"\t{m.group('keysym')}" + assert (len(string) == 1 and m.group("keysym") is not None) ^ ( + len(string) > 1 and m.group("keysym") is None + ) + comment_space = " " if len(string) == 1 else m.group("space") or "\t" + yield f"""{m.group('sequence')}: "{m.group('string')}"{keysym}{comment_space}# {expected_comment}\n""" + else: + yield line + else: + raise ValueError(f"Cannot parse line: “{line}”") + + +def process_file(path: Path): + with path.open("rt", encoding="utf-8") as fd: + yield from process_lines(fd) + + +def run(paths: Sequence[Path], write: bool): + for path in paths: + print(f" Processing Compose file: {path} ".center(80, "="), file=sys.stderr) + if write: + with tempfile.NamedTemporaryFile("wt") as fd: + # Write to a temporary file + fd.writelines(process_file(path)) + fd.flush() + # No error: now ovewrite the original file + shutil.copyfile(fd.name, path) + else: + for _ in process_file(path): + pass + + +def parse_args(): + parser = argparse.ArgumentParser(description="Add comment to compose sequence") + parser.add_argument("input", type=Path, nargs="+", help="Compose file to process") + parser.add_argument("--write", action="store_true", help="Write the compose file") + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + run(args.input, args.write) From dee3c1b13c41221508e0f845ad467ba4862ad462 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Mon, 17 
Jul 2023 15:54:23 +0200 Subject: [PATCH 02/17] nls: Check keysym character Check compose entries where the result of the sequence is expressed using both a string and a keysym. NOTE: this feature requires libxkbcommon. The check will be simply skipped if this library is not found. --- nls/compose-check.py | 46 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/nls/compose-check.py b/nls/compose-check.py index bc13bcd1..41289558 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -16,6 +16,44 @@ import shutil import tempfile from typing import Any, Generator, Sequence import unicodedata +from ctypes import ( + c_char_p, + c_int, + c_uint32, + cdll, +) +from ctypes.util import find_library + + +# Try to load xkbcommon +if xkbcommon_path := find_library("xkbcommon"): + HAS_XKBCOMMON = True + xkbcommon = cdll.LoadLibrary(xkbcommon_path) + + xkb_keysym_t = c_uint32 + xkbcommon.xkb_keysym_from_name.argtypes = [c_char_p, c_int] + xkbcommon.xkb_keysym_from_name.restype = xkb_keysym_t + + xkbcommon.xkb_keysym_to_utf32.argtypes = [xkb_keysym_t] + xkbcommon.xkb_keysym_to_utf32.restype = c_uint32 + + XKB_KEY_NoSymbol = 0 + XKB_KEYSYM_NO_FLAGS = 0 + + def keysym_to_char(keysym_name: str) -> str: + keysym = xkbcommon.xkb_keysym_from_name( + keysym_name.encode("utf-8"), XKB_KEYSYM_NO_FLAGS + ) + if keysym == XKB_KEY_NoSymbol: + raise ValueError(f"Unsupported keysym: “{keysym_name}”") + codepoint = xkbcommon.xkb_keysym_to_utf32(keysym) + if codepoint == 0: + raise ValueError( + f"Keysym cannot be translated to character: “{keysym_name}”" + ) + return chr(codepoint) +else: + HAS_XKBCOMMON = False SEQUENCE_PATTERN = re.compile( @@ -101,6 +139,14 @@ def process_lines(fd: TextIOWrapper): # Handle compose sequence elif m := SEQUENCE_PATTERN.match(line): string = unescape(m.group("string")) + # Check keysym + if HAS_XKBCOMMON and m.group("keysym"): + keysym_char = keysym_to_char(m.group("keysym")) + if string != keysym_char: + print( + 
f"[ERROR] Line {n}: The keysym does not correspond to the character: expected “{string}”, got “{keysym_char}”.", + file=sys.stderr, + ) expected_comment = make_comment(string) # Check if we have the expected comment # NOTE: Some APL sequences provide the combo of composed characters From 856b9d06305755aa6d8dfea52369e758d6a5c512 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Mon, 17 Jul 2023 17:40:14 +0200 Subject: [PATCH 03/17] nls: check deprecated keysyms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a check on deprecated keysyms in Compose files. From comment in xproto “keysymdef.h”, keysym is deemed deprecated if: - it has a comment that says explicitly so, or - its Unicode translation is between brackets, or - it is an alias for a previous keysym. --- nls/compose-check.py | 236 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 213 insertions(+), 23 deletions(-) diff --git a/nls/compose-check.py b/nls/compose-check.py index 41289558..f53fc6f4 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -14,7 +14,7 @@ from pathlib import Path import re import shutil import tempfile -from typing import Any, Generator, Sequence +from typing import Any, DefaultDict, Generator, Sequence import unicodedata from ctypes import ( c_char_p, @@ -24,6 +24,9 @@ from ctypes import ( ) from ctypes.util import find_library +################################################################################ +# xkbcommon handling +################################################################################ # Try to load xkbcommon if xkbcommon_path := find_library("xkbcommon"): @@ -40,10 +43,13 @@ if xkbcommon_path := find_library("xkbcommon"): XKB_KEY_NoSymbol = 0 XKB_KEYSYM_NO_FLAGS = 0 - def keysym_to_char(keysym_name: str) -> str: - keysym = xkbcommon.xkb_keysym_from_name( + def xkb_keysym_from_name(keysym_name: str) -> int: + return xkbcommon.xkb_keysym_from_name( keysym_name.encode("utf-8"), 
XKB_KEYSYM_NO_FLAGS ) + + def keysym_to_char(keysym_name: str) -> str: + keysym = xkb_keysym_from_name(keysym_name) if keysym == XKB_KEY_NoSymbol: raise ValueError(f"Unsupported keysym: “{keysym_name}”") codepoint = xkbcommon.xkb_keysym_to_utf32(keysym) @@ -52,11 +58,120 @@ if xkbcommon_path := find_library("xkbcommon"): f"Keysym cannot be translated to character: “{keysym_name}”" ) return chr(codepoint) + else: HAS_XKBCOMMON = False -SEQUENCE_PATTERN = re.compile( +################################################################################ +# Keysyms headers +################################################################################ + +DEFAULT_KEYSYMS_HEADERS_PREFIX = Path("/usr") +DEFAULT_KEYSYMS_HEADERS = [ + Path("include/X11/keysymdef.h"), + Path("include/X11/XF86keysym.h"), + Path("include/X11/Sunkeysym.h"), + Path("include/X11/DECkeysym.h"), + Path("include/X11/HPkeysym.h"), +] + +KEYSYM_ENTRY_PATTERN = re.compile( + r""" + ^\#define\s+ + (?:(?P<prefix>\w+)?XK|XKB_KEY)_(?P<name>\w+)\s+ + (?P<evdev>_EVDEVK\()? + (?P<value>0x[0-9a-fA-F]+) + (?(evdev)\)|)\s* + (?:/\*\s* + (?: + (?P<deprecated>deprecated)| + \(U\+(?P<unicode>[0-9a-fA-F]{4,})(?:\s|\w|-)+\)| + .* + ) + )? 
+ """, + re.VERBOSE, +) +EXTRA_DEPRECATED_KEYSYMS = ("Ext16bit_L", "Ext16bit_R") + + +def parse_keysyms_header( + path: Path, keysyms: dict[int, str], keysyms_names: dict[str, str] +): + with path.open("rt", encoding="utf-8") as fd: + pending_multine_comment = False + for n, line in enumerate(map(lambda l: l.strip(), fd)): + if not line: + # Empty line + continue + elif pending_multine_comment: + if line.endswith("*/"): + pending_multine_comment = False + continue + elif line.startswith("/*"): + if not line.endswith("*/"): + pending_multine_comment = True + continue + elif any( + line.startswith(s) + for s in ("#ifdef", "#ifndef", "#endif", "#define _", "#undef") + ): + continue + elif m := KEYSYM_ENTRY_PATTERN.match(line): + if m.group("evdev"): + # _EVDEVK macro + keysym = 0x10081000 + int(m.group("value"), 16) + else: + keysym = int(m.group("value"), 16) + name = (m.group("prefix") or "") + m.group("name") + if ref := keysyms.get(keysym): + # Deprecated, because there is a previous definition with other name. + # Ensure that the replacement keysym is supported by xkbcommon. + if ( + not HAS_XKBCOMMON + or xkb_keysym_from_name(ref) != XKB_KEY_NoSymbol + ): + keysyms_names[name] = ref + continue + else: + print( + f"[WARNING] Line {n}: Keep deprecated keysym “{name}”, because the reference keysyms “{ref}” is not supported by your version of xkbcommon." 
+ ) + else: + # Reference keysym + keysyms[keysym] = name + if ( + m.group("deprecated") + or m.group("unicode") + or m.group("name") in EXTRA_DEPRECATED_KEYSYMS + ): + # Explicitely deprecated + keysyms_names[name] = "" + else: + # Reference keysym + keysyms_names[name] = name + else: + raise ValueError(f"Cannot parse header “{path}” line: {line}") + + +def parse_keysyms_headers(paths: Sequence[Path]) -> dict[str, str]: + keysyms: dict[int, str] = {} + keysyms_names: dict[str, str] = {} + for path in paths: + if not path.is_file(): + print(f"[ERROR] Cannot open keysym header file: {path}") + else: + print(f" Processing header file: {path} ".center(80, "="), file=sys.stderr) + parse_keysyms_header(path, keysyms, keysyms_names) + return keysyms_names + + +################################################################################ +# Compose files +################################################################################ + +COMPOSE_ENTRY_PATTERN = re.compile( r"""^(?P<sequence>[^:]+) :\s+ "(?P<string>(?:\\"|[^"])+)" @@ -67,6 +182,9 @@ SEQUENCE_PATTERN = re.compile( ) """A pattern for Compose entries""" +UNICODE_KEYSYM_PATTERN = re.compile(r"(U[0-9A-Fa-f]+)") +KEYSYM_PATTERN = re.compile(r"<(\w+)>") + def _unescape(s: str) -> Generator[str, Any, None]: """Unescape a Compose file string""" @@ -116,8 +234,44 @@ def make_comment(s: str) -> str: return " plus ".join(unicode_name(c, k == 0) for k, c in enumerate(s)) -# TODO: we probably also want to check that the keysyms are correct and not deprecated -def process_lines(fd: TextIOWrapper): +def check_keysym(deprecated_keysyms: dict[str, str], n: int, keysym_name: str) -> str: + if UNICODE_KEYSYM_PATTERN.match(keysym_name): + return keysym_name + ref = deprecated_keysyms.get(keysym_name) + if keysym_name == ref: + # Reference keysym + return keysym_name + elif ref is None: + print(f"[ERROR] Line {n}: Unsupported keysym “{keysym_name}”") + return keysym_name + elif ref == "": + # Deprecated: keep keysym + print(f"[WARNING] Line 
{n}: Deprecated keysym “{keysym_name}”.") + return keysym_name + else: + # Deprecated alias: return reference keysym + print( + f"[WARNING] Line {n}: Deprecated keysym “{keysym_name}”. Please use “{ref}” instead." + ) + return ref + + +def check_keysym_sequence( + deprecated_keysyms: dict[str, str], n: int, sequence: str +) -> str: + subsitutions: dict[str, str] = {} + for keysym_name in KEYSYM_PATTERN.findall(sequence): + keysym_nameʹ = check_keysym(deprecated_keysyms, n, keysym_name) + if keysym_nameʹ != keysym_name: + subsitutions[keysym_name] = keysym_nameʹ + if subsitutions: + pattern = re.compile("|".join(re.escape(k) for k in subsitutions.keys())) + return pattern.sub(lambda x: subsitutions[x.group()], sequence) + else: + return sequence + + +def process_lines(fd: TextIOWrapper, keysyms_names: dict[str, str]): multi_line_comment = False for n, line in enumerate(fd, start=1): # Handle pending multi-line comment @@ -137,16 +291,29 @@ def process_lines(fd: TextIOWrapper): multi_line_comment = True yield line # Handle compose sequence - elif m := SEQUENCE_PATTERN.match(line): + elif m := COMPOSE_ENTRY_PATTERN.match(line): string = unescape(m.group("string")) - # Check keysym - if HAS_XKBCOMMON and m.group("keysym"): - keysym_char = keysym_to_char(m.group("keysym")) - if string != keysym_char: - print( - f"[ERROR] Line {n}: The keysym does not correspond to the character: expected “{string}”, got “{keysym_char}”.", - file=sys.stderr, - ) + rewrite = False + # Check sequence keysyms + if keysyms_names: + sequence = check_keysym_sequence(keysyms_names, n, m.group("sequence")) + if sequence != m.group("sequence"): + rewrite = True + else: + sequence = m.group("sequence") + # Check result keysym + if keysym := m.group("keysym"): + if HAS_XKBCOMMON: + keysym_char = keysym_to_char(m.group("keysym")) + if string != keysym_char: + print( + f"[ERROR] Line {n}: The keysym does not correspond to the character: expected “{string}”, got “{keysym_char}”.", + file=sys.stderr, + 
) + if keysyms_names: + keysym = check_keysym(keysyms_names, n, m.group("keysym")) + if keysym != m.group("keysym"): + rewrite = True expected_comment = make_comment(string) # Check if we have the expected comment # NOTE: Some APL sequences provide the combo of composed characters @@ -159,45 +326,68 @@ def process_lines(fd: TextIOWrapper): f"got: “{m.group('comment')}”", file=sys.stderr, ) - keysym = "" if m.group("keysym") is None else f"\t{m.group('keysym')}" + rewrite = True + # Rewrite entry if necessary + if rewrite: + keysym = "" if keysym is None else f"\t{keysym}" assert (len(string) == 1 and m.group("keysym") is not None) ^ ( len(string) > 1 and m.group("keysym") is None ) comment_space = " " if len(string) == 1 else m.group("space") or "\t" - yield f"""{m.group('sequence')}: "{m.group('string')}"{keysym}{comment_space}# {expected_comment}\n""" + yield f"""{sequence}: "{m.group('string')}"{keysym}{comment_space}# {expected_comment}\n""" else: yield line else: raise ValueError(f"Cannot parse line: “{line}”") -def process_file(path: Path): +def process_file(path: Path, keysyms_names: dict[str, str]): with path.open("rt", encoding="utf-8") as fd: - yield from process_lines(fd) + yield from process_lines(fd, keysyms_names) -def run(paths: Sequence[Path], write: bool): +def run(paths: Sequence[Path], write: bool, keysyms_headers: Sequence[Path]): + # Keysyms headers + keysyms_names = parse_keysyms_headers(keysyms_headers) + # Compose file for path in paths: print(f" Processing Compose file: {path} ".center(80, "="), file=sys.stderr) if write: with tempfile.NamedTemporaryFile("wt") as fd: # Write to a temporary file - fd.writelines(process_file(path)) + fd.writelines(process_file(path, keysyms_names)) fd.flush() # No error: now ovewrite the original file shutil.copyfile(fd.name, path) else: - for _ in process_file(path): + for _ in process_file(path, keysyms_names): pass def parse_args(): parser = argparse.ArgumentParser(description="Add comment to compose 
sequence") parser.add_argument("input", type=Path, nargs="+", help="Compose file to process") + group = parser.add_mutually_exclusive_group() + group.add_argument("--no-keysyms", action="store_true", help="Do not check keysyms") + group.add_argument( + "--keysyms", type=Path, action="append", help="Add a keysym header to parse" + ) + group.add_argument( + "--keysyms-prefix", + type=Path, + default=DEFAULT_KEYSYMS_HEADERS_PREFIX, + help="Keysym header prefix for default keysyms header files (default: %(default)s)", + ) parser.add_argument("--write", action="store_true", help="Write the compose file") return parser.parse_args() if __name__ == "__main__": args = parse_args() - run(args.input, args.write) + if args.no_keysyms: + keysyms = [] + elif args.keysyms: + keysyms = args.keysyms + else: + keysyms = list(args.keysyms_prefix / path for path in DEFAULT_KEYSYMS_HEADERS) + run(args.input, args.write, keysyms) From 065321796d0fc03c1b19e68be7c15ee01175c951 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Mon, 17 Jul 2023 21:43:21 +0200 Subject: [PATCH 04/17] nls: Ensure standard format of Unicode keysyms --- nls/compose-check.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nls/compose-check.py b/nls/compose-check.py index f53fc6f4..817b1617 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -136,7 +136,7 @@ def parse_keysyms_header( continue else: print( - f"[WARNING] Line {n}: Keep deprecated keysym “{name}”, because the reference keysyms “{ref}” is not supported by your version of xkbcommon." + f"[WARNING] Line {n}: Keep deprecated keysym “{name}”; reference keysym “{ref}” is not supported by available xkbcommon." 
+ ) else: # Reference keysym @@ -182,7 +182,7 @@ COMPOSE_ENTRY_PATTERN = re.compile( ) """A pattern for Compose entries""" -UNICODE_KEYSYM_PATTERN = re.compile(r"(U[0-9A-Fa-f]+)") +UNICODE_KEYSYM_PATTERN = re.compile(r"\bU(?P<codepoint>[0-9A-Fa-f]+)\b") KEYSYM_PATTERN = re.compile(r"<(\w+)>") @@ -235,8 +235,10 @@ def make_comment(s: str) -> str: def check_keysym(deprecated_keysyms: dict[str, str], n: int, keysym_name: str) -> str: - if UNICODE_KEYSYM_PATTERN.match(keysym_name): - return keysym_name + if m := UNICODE_KEYSYM_PATTERN.match(keysym_name): + # Reformat Unicode keysym + codepoint = int(m.group("codepoint"), 16) + return f"U{codepoint:0>4X}" ref = deprecated_keysyms.get(keysym_name) if keysym_name == ref: # Reference keysym From 0c8d4a40c55390f0d8de76fc4c7b8b2d54900bf0 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Mon, 17 Jul 2023 22:31:01 +0200 Subject: [PATCH 05/17] nls: Optionally check corrected Unicode names --- nls/compose-check.py | 92 ++++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 24 deletions(-) diff --git a/nls/compose-check.py b/nls/compose-check.py index 817b1617..61bdbe7b 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -167,6 +167,42 @@ def parse_keysyms_headers(paths: Sequence[Path]) -> dict[str, str]: return keysyms_names +################################################################################ +# Unicode names +################################################################################ + + +def parse_unicode_name_aliases(path: Path) -> dict[str, str]: + aliases: dict[str, str] = {} + with path.open("rt", encoding="utf-8") as fd: + for line in map(lambda s: s.strip(), fd): + # Empty line or comment + if not line or line.startswith("#"): + continue + line = line.split("#")[0] + raw_codepoint, alias, category, *_ = map( + lambda s: s.strip(), line.split(";") + ) + char = chr(int(raw_codepoint, 16)) + if category == "correction": + aliases[char] = alias + return aliases + + +def 
unicode_name(unicode_name_aliases: dict[str, str], c: str, is_first: bool) -> str: + # We want to use Unicode *corrected* names, but the Python API does not + # propose those. So we process the UCD by ourselves. + name = unicode_name_aliases.get(c) or unicodedata.name(c, None) + if name is None: + raise ValueError(f"Cannot find Unicode name for: “{c}” (U+{ord(c):0>4X})") + # RULE: remove “ACCENT” from the name, when the character is combining and + # is not in first position + if not is_first and "COMBINING" in name and name.endswith("ACCENT"): + return name[:-7] + else: + return name + + ################################################################################ # Compose files ################################################################################ @@ -215,23 +251,11 @@ def unescape(s: str) -> str: return "".join(_unescape(s)) -def unicode_name(c: str, is_first: bool) -> str: - # TODO: we should use Unicode *corrected* names! - # But the Python API does not propose those. 
- name = unicodedata.name(c, None) - if name is None: - raise ValueError(f"Cannot find Unicode name for: “{c}” (U+{ord(c):0>4X})") - # RULE: remove “ACCENT” from the name, when the character is combining and - # is not in first position - if not is_first and "COMBINING" in name and name.endswith("ACCENT"): - return name[:-7] - else: - return name - - -def make_comment(s: str) -> str: +def make_comment(unicode_name_aliases: dict[str, str], s: str) -> str: """Make the comment of a Compose sequence, based on its result.""" - return " plus ".join(unicode_name(c, k == 0) for k, c in enumerate(s)) + return " plus ".join( + unicode_name(unicode_name_aliases, c, k == 0) for k, c in enumerate(s) + ) def check_keysym(deprecated_keysyms: dict[str, str], n: int, keysym_name: str) -> str: @@ -273,7 +297,11 @@ def check_keysym_sequence( return sequence -def process_lines(fd: TextIOWrapper, keysyms_names: dict[str, str]): +def process_lines( + fd: TextIOWrapper, + keysyms_names: dict[str, str], + unicode_name_aliases: dict[str, str], +): multi_line_comment = False for n, line in enumerate(fd, start=1): # Handle pending multi-line comment @@ -316,7 +344,7 @@ def process_lines(fd: TextIOWrapper, keysyms_names: dict[str, str]): keysym = check_keysym(keysyms_names, n, m.group("keysym")) if keysym != m.group("keysym"): rewrite = True - expected_comment = make_comment(string) + expected_comment = make_comment(unicode_name_aliases, string) # Check if we have the expected comment # NOTE: Some APL sequences provide the combo of composed characters if not ( @@ -343,26 +371,37 @@ def process_lines(fd: TextIOWrapper, keysyms_names: dict[str, str]): raise ValueError(f"Cannot parse line: “{line}”") -def process_file(path: Path, keysyms_names: dict[str, str]): +def process_file( + path: Path, keysyms_names: dict[str, str], unicode_name_aliases: dict[str, str] +): with path.open("rt", encoding="utf-8") as fd: - yield from process_lines(fd, keysyms_names) + yield from process_lines(fd, 
keysyms_names, unicode_name_aliases) -def run(paths: Sequence[Path], write: bool, keysyms_headers: Sequence[Path]): +def run( + paths: Sequence[Path], + write: bool, + keysyms_headers: Sequence[Path], + name_aliases_path: Path | None, +): # Keysyms headers keysyms_names = parse_keysyms_headers(keysyms_headers) + # Unicode files + unicode_name_aliases = ( + parse_unicode_name_aliases(name_aliases_path) if name_aliases_path else {} + ) # Compose file for path in paths: print(f" Processing Compose file: {path} ".center(80, "="), file=sys.stderr) if write: with tempfile.NamedTemporaryFile("wt") as fd: # Write to a temporary file - fd.writelines(process_file(path, keysyms_names)) + fd.writelines(process_file(path, keysyms_names, unicode_name_aliases)) fd.flush() # No error: now ovewrite the original file shutil.copyfile(fd.name, path) else: - for _ in process_file(path, keysyms_names): + for _ in process_file(path, keysyms_names, unicode_name_aliases): pass @@ -380,6 +419,11 @@ def parse_args(): default=DEFAULT_KEYSYMS_HEADERS_PREFIX, help="Keysym header prefix for default keysyms header files (default: %(default)s)", ) + parser.add_argument( + "--unicode-name-aliases", + type=Path, + help="Name aliases file from the Unicode Character Database. 
Latest version available at: https://www.unicode.org/Public/UCD/latest/ucd/NameAliases.txt", + ) parser.add_argument("--write", action="store_true", help="Write the compose file") return parser.parse_args() @@ -392,4 +436,4 @@ if __name__ == "__main__": keysyms = args.keysyms else: keysyms = list(args.keysyms_prefix / path for path in DEFAULT_KEYSYMS_HEADERS) - run(args.input, args.write, keysyms) + run(args.input, args.write, keysyms, args.unicode_name_aliases) From e88d57c04fd262d51c2947042b80afd67b3456ce Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Mon, 17 Jul 2023 23:11:07 +0200 Subject: [PATCH 06/17] nls: Optionally prefer named keysym over Unicode ones --- nls/compose-check.py | 105 +++++++++++++++++++++++++++++++++---------- 1 file changed, 81 insertions(+), 24 deletions(-) diff --git a/nls/compose-check.py b/nls/compose-check.py index 61bdbe7b..d1df25cf 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +from dataclasses import dataclass import sys MINIMUM_PYTHON_VERSION = (3, 10) @@ -19,11 +20,21 @@ import unicodedata from ctypes import ( c_char_p, c_int, + c_size_t, c_uint32, cdll, + create_string_buffer, ) from ctypes.util import find_library + +@dataclass +class Configuration: + keysyms_names: dict[str, str] + unicode_name_aliases: dict[str, str] + prefer_unicode_keysym: bool + + ################################################################################ # xkbcommon handling ################################################################################ @@ -40,6 +51,12 @@ if xkbcommon_path := find_library("xkbcommon"): xkbcommon.xkb_keysym_to_utf32.argtypes = [xkb_keysym_t] xkbcommon.xkb_keysym_to_utf32.restype = c_uint32 + xkbcommon.xkb_utf32_to_keysym.argtypes = [c_uint32] + xkbcommon.xkb_utf32_to_keysym.restype = xkb_keysym_t + + xkbcommon.xkb_keysym_get_name.argtypes = [xkb_keysym_t, c_char_p, c_size_t] + xkbcommon.xkb_keysym_get_name.restype = int + XKB_KEY_NoSymbol = 0 
XKB_KEYSYM_NO_FLAGS = 0 @@ -59,6 +76,20 @@ if xkbcommon_path := find_library("xkbcommon"): ) return chr(codepoint) + def char_to_keysym(char: str) -> str: + keysym = xkbcommon.xkb_utf32_to_keysym(ord(char)) + if keysym == XKB_KEY_NoSymbol: + return "" + buf_len = 90 + buf = create_string_buffer(buf_len) + n = xkbcommon.xkb_keysym_get_name(keysym, buf, c_size_t(buf_len)) + if n < 0 or n >= buf_len: + raise ValueError( + f"Unsupported keysym: {keysym} (char: “U+{ord(char):4>X}”)" + ) + else: + return buf.value.decode("utf-8") + else: HAS_XKBCOMMON = False @@ -258,12 +289,29 @@ def make_comment(unicode_name_aliases: dict[str, str], s: str) -> str: ) -def check_keysym(deprecated_keysyms: dict[str, str], n: int, keysym_name: str) -> str: +def check_keysym(config: Configuration, n: int, keysym_name: str) -> str: if m := UNICODE_KEYSYM_PATTERN.match(keysym_name): # Reformat Unicode keysym codepoint = int(m.group("codepoint"), 16) - return f"U{codepoint:0>4X}" - ref = deprecated_keysyms.get(keysym_name) + unicode_keysym = f"U{codepoint:0>4X}" + if HAS_XKBCOMMON: + # Find the canonical keysym name using xkbcommon + keysym_name = char_to_keysym(chr(codepoint)) + # We keep our normalized Unicode in case xkbcommon returns a long + # Unicode keysym, or we explicitely prefer Unicode keysyms, or + # the named keysym is deprecated. 
+ if ( + unicode_keysym == keysym_name + or keysym_name.startswith("U0") + or config.prefer_unicode_keysym + or config.keysyms_names.get(keysym_name) != keysym_name + ): + return unicode_keysym + else: + return keysym_name + else: + return unicode_keysym + ref = config.keysyms_names.get(keysym_name) if keysym_name == ref: # Reference keysym return keysym_name @@ -282,12 +330,10 @@ def check_keysym(deprecated_keysyms: dict[str, str], n: int, keysym_name: str) - return ref -def check_keysym_sequence( - deprecated_keysyms: dict[str, str], n: int, sequence: str -) -> str: +def check_keysym_sequence(config: Configuration, n: int, sequence: str) -> str: subsitutions: dict[str, str] = {} for keysym_name in KEYSYM_PATTERN.findall(sequence): - keysym_nameʹ = check_keysym(deprecated_keysyms, n, keysym_name) + keysym_nameʹ = check_keysym(config, n, keysym_name) if keysym_nameʹ != keysym_name: subsitutions[keysym_name] = keysym_nameʹ if subsitutions: @@ -297,11 +343,7 @@ def check_keysym_sequence( return sequence -def process_lines( - fd: TextIOWrapper, - keysyms_names: dict[str, str], - unicode_name_aliases: dict[str, str], -): +def process_lines(fd: TextIOWrapper, config: Configuration): multi_line_comment = False for n, line in enumerate(fd, start=1): # Handle pending multi-line comment @@ -325,8 +367,8 @@ def process_lines( string = unescape(m.group("string")) rewrite = False # Check sequence keysyms - if keysyms_names: - sequence = check_keysym_sequence(keysyms_names, n, m.group("sequence")) + if config.keysyms_names: + sequence = check_keysym_sequence(config, n, m.group("sequence")) if sequence != m.group("sequence"): rewrite = True else: @@ -340,11 +382,11 @@ def process_lines( f"[ERROR] Line {n}: The keysym does not correspond to the character: expected “{string}”, got “{keysym_char}”.", file=sys.stderr, ) - if keysyms_names: - keysym = check_keysym(keysyms_names, n, m.group("keysym")) + if config.keysyms_names: + keysym = check_keysym(config, n, m.group("keysym")) if 
keysym != m.group("keysym"): rewrite = True - expected_comment = make_comment(unicode_name_aliases, string) + expected_comment = make_comment(config.unicode_name_aliases, string) # Check if we have the expected comment # NOTE: Some APL sequences provide the combo of composed characters if not ( @@ -371,11 +413,9 @@ def process_lines( raise ValueError(f"Cannot parse line: “{line}”") -def process_file( - path: Path, keysyms_names: dict[str, str], unicode_name_aliases: dict[str, str] -): +def process_file(path: Path, config: Configuration): with path.open("rt", encoding="utf-8") as fd: - yield from process_lines(fd, keysyms_names, unicode_name_aliases) + yield from process_lines(fd, config) def run( @@ -383,6 +423,7 @@ def run( write: bool, keysyms_headers: Sequence[Path], name_aliases_path: Path | None, + prefer_named_keysyms: bool, ): # Keysyms headers keysyms_names = parse_keysyms_headers(keysyms_headers) @@ -390,18 +431,23 @@ def run( unicode_name_aliases = ( parse_unicode_name_aliases(name_aliases_path) if name_aliases_path else {} ) + config = Configuration( + keysyms_names=keysyms_names, + unicode_name_aliases=unicode_name_aliases, + prefer_unicode_keysym=not prefer_named_keysyms, + ) # Compose file for path in paths: print(f" Processing Compose file: {path} ".center(80, "="), file=sys.stderr) if write: with tempfile.NamedTemporaryFile("wt") as fd: # Write to a temporary file - fd.writelines(process_file(path, keysyms_names, unicode_name_aliases)) + fd.writelines(process_file(path, config)) fd.flush() # No error: now ovewrite the original file shutil.copyfile(fd.name, path) else: - for _ in process_file(path, keysyms_names, unicode_name_aliases): + for _ in process_file(path, config): pass @@ -424,6 +470,11 @@ def parse_args(): type=Path, help="Name aliases file from the Unicode Character Database. 
Latest version available at: https://www.unicode.org/Public/UCD/latest/ucd/NameAliases.txt", ) + parser.add_argument( + "--prefer-named-keysyms", + action="store_true", + help="Prefer named keysyms over Unicode keysyms", + ) parser.add_argument("--write", action="store_true", help="Write the compose file") return parser.parse_args() @@ -436,4 +487,10 @@ if __name__ == "__main__": keysyms = args.keysyms else: keysyms = list(args.keysyms_prefix / path for path in DEFAULT_KEYSYMS_HEADERS) - run(args.input, args.write, keysyms, args.unicode_name_aliases) + run( + args.input, + args.write, + keysyms, + args.unicode_name_aliases, + args.prefer_named_keysyms, + ) From e3c42fab52fcdc12794f420f9270ff537d17a753 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Tue, 18 Jul 2023 07:17:19 +0200 Subject: [PATCH 07/17] review: Improve libxkbcommon handling --- nls/compose-check.py | 131 ++++++++++++++++++++++++------------------- 1 file changed, 73 insertions(+), 58 deletions(-) diff --git a/nls/compose-check.py b/nls/compose-check.py index d1df25cf..f30e8f97 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 -from dataclasses import dataclass +from __future__ import annotations + import sys MINIMUM_PYTHON_VERSION = (3, 10) @@ -10,22 +11,16 @@ if sys.version_info < MINIMUM_PYTHON_VERSION: ) import argparse -from io import TextIOWrapper -from pathlib import Path +import ctypes +import ctypes.util import re import shutil import tempfile -from typing import Any, DefaultDict, Generator, Sequence import unicodedata -from ctypes import ( - c_char_p, - c_int, - c_size_t, - c_uint32, - cdll, - create_string_buffer, -) -from ctypes.util import find_library +from dataclasses import dataclass +from io import TextIOWrapper +from pathlib import Path +from typing import Any, Generator, Sequence @dataclass @@ -39,61 +34,84 @@ class Configuration: # xkbcommon handling 
################################################################################ -# Try to load xkbcommon -if xkbcommon_path := find_library("xkbcommon"): - HAS_XKBCOMMON = True - xkbcommon = cdll.LoadLibrary(xkbcommon_path) - xkb_keysym_t = c_uint32 - xkbcommon.xkb_keysym_from_name.argtypes = [c_char_p, c_int] - xkbcommon.xkb_keysym_from_name.restype = xkb_keysym_t +xkb_keysym_t = ctypes.c_uint32 - xkbcommon.xkb_keysym_to_utf32.argtypes = [xkb_keysym_t] - xkbcommon.xkb_keysym_to_utf32.restype = c_uint32 - - xkbcommon.xkb_utf32_to_keysym.argtypes = [c_uint32] - xkbcommon.xkb_utf32_to_keysym.restype = xkb_keysym_t - - xkbcommon.xkb_keysym_get_name.argtypes = [xkb_keysym_t, c_char_p, c_size_t] - xkbcommon.xkb_keysym_get_name.restype = int +class Xkbcommon: XKB_KEY_NoSymbol = 0 XKB_KEYSYM_NO_FLAGS = 0 - def xkb_keysym_from_name(keysym_name: str) -> int: - return xkbcommon.xkb_keysym_from_name( - keysym_name.encode("utf-8"), XKB_KEYSYM_NO_FLAGS + def __init__(self, xkbcommon_path): + self._lib = ctypes.cdll.LoadLibrary(xkbcommon_path) + + self._lib.xkb_keysym_from_name.argtypes = [ctypes.c_char_p, ctypes.c_int] + self._lib.xkb_keysym_from_name.restype = xkb_keysym_t + + self._lib.xkb_keysym_to_utf32.argtypes = [xkb_keysym_t] + self._lib.xkb_keysym_to_utf32.restype = ctypes.c_uint32 + + self._lib.xkb_utf32_to_keysym.argtypes = [ctypes.c_uint32] + self._lib.xkb_utf32_to_keysym.restype = xkb_keysym_t + + self._lib.xkb_keysym_get_name.argtypes = [ + xkb_keysym_t, + ctypes.c_char_p, + ctypes.c_size_t, + ] + self._lib.xkb_keysym_get_name.restype = int + + @classmethod + def load(cls) -> Xkbcommon | None: + """Try to load xkbcommon""" + if xkbcommon_path := ctypes.util.find_library("xkbcommon"): + return cls(xkbcommon_path) + else: + return None + + def keysym_from_name(self, keysym_name: str) -> int: + return self._lib.xkb_keysym_from_name( + keysym_name.encode("utf-8"), self.XKB_KEYSYM_NO_FLAGS ) - def keysym_to_char(keysym_name: str) -> str: - keysym = 
xkb_keysym_from_name(keysym_name) - if keysym == XKB_KEY_NoSymbol: + def is_invalid_keysym(self, keysym: int) -> bool: + return keysym == self.XKB_KEY_NoSymbol + + def is_invalid_keysym_name(self, name: str) -> bool: + return self.is_invalid_keysym(self.keysym_from_name(name)) + + def keysym_to_char(self, keysym_name: str) -> str: + keysym = self.keysym_from_name(keysym_name) + if self.is_invalid_keysym(keysym): raise ValueError(f"Unsupported keysym: “{keysym_name}”") - codepoint = xkbcommon.xkb_keysym_to_utf32(keysym) + codepoint = self._lib.xkb_keysym_to_utf32(keysym) if codepoint == 0: raise ValueError( f"Keysym cannot be translated to character: “{keysym_name}”" ) return chr(codepoint) - def char_to_keysym(char: str) -> str: - keysym = xkbcommon.xkb_utf32_to_keysym(ord(char)) - if keysym == XKB_KEY_NoSymbol: - return "" + def keysym_get_name(self, keysym: int) -> str: buf_len = 90 - buf = create_string_buffer(buf_len) - n = xkbcommon.xkb_keysym_get_name(keysym, buf, c_size_t(buf_len)) - if n < 0 or n >= buf_len: - raise ValueError( - f"Unsupported keysym: {keysym} (char: “U+{ord(char):4>X}”)" - ) + buf = ctypes.create_string_buffer(buf_len) + n = self._lib.xkb_keysym_get_name(keysym, buf, ctypes.c_size_t(buf_len)) + if n < 0: + raise ValueError(f"Unsupported keysym: 0x{keysym:0>4X})") + elif n >= buf_len: + raise ValueError(f"Buffer is not big enough: expected at least {n}.") else: return buf.value.decode("utf-8") -else: - HAS_XKBCOMMON = False + def char_to_keysym(self, char: str) -> str: + keysym = self._lib.xkb_utf32_to_keysym(ord(char)) + if self.is_invalid_keysym(keysym): + return "" + else: + return self.keysym_get_name(keysym) +libxkbcommon = Xkbcommon.load() + ################################################################################ # Keysyms headers ################################################################################ @@ -159,16 +177,13 @@ def parse_keysyms_header( if ref := keysyms.get(keysym): # Deprecated, because there is a 
previous definition with other name. # Ensure that the replacement keysym is supported by xkbcommon. - if ( - not HAS_XKBCOMMON - or xkb_keysym_from_name(ref) != XKB_KEY_NoSymbol - ): - keysyms_names[name] = ref - continue - else: + if libxkbcommon and libxkbcommon.is_invalid_keysym_name(ref): print( f"[WARNING] Line {n}: Keep deprecated keysym “{name}”; reference keysym “{ref}” is not supported by available xkbcommon." ) + else: + keysyms_names[name] = ref + continue else: # Reference keysym keysyms[keysym] = name @@ -294,9 +309,9 @@ def check_keysym(config: Configuration, n: int, keysym_name: str) -> str: # Reformat Unicode keysym codepoint = int(m.group("codepoint"), 16) unicode_keysym = f"U{codepoint:0>4X}" - if HAS_XKBCOMMON: + if libxkbcommon: # Find the canonical keysym name using xkbcommon - keysym_name = char_to_keysym(chr(codepoint)) + keysym_name = libxkbcommon.char_to_keysym(chr(codepoint)) # We keep our normalized Unicode in case xkbcommon returns a long # Unicode keysym, or we explicitely prefer Unicode keysyms, or # the named keysym is deprecated. 
@@ -375,8 +390,8 @@ def process_lines(fd: TextIOWrapper, config: Configuration): sequence = m.group("sequence") # Check result keysym if keysym := m.group("keysym"): - if HAS_XKBCOMMON: - keysym_char = keysym_to_char(m.group("keysym")) + if libxkbcommon: + keysym_char = libxkbcommon.keysym_to_char(m.group("keysym")) if string != keysym_char: print( f"[ERROR] Line {n}: The keysym does not correspond to the character: expected “{string}”, got “{keysym_char}”.", From 4ced4bf886682a0b1e9b2506e62a8609cd97b243 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Tue, 18 Jul 2023 07:28:44 +0200 Subject: [PATCH 08/17] Remove useless continue statement --- nls/compose-check.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/nls/compose-check.py b/nls/compose-check.py index f30e8f97..9d88ad3a 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -152,21 +152,23 @@ def parse_keysyms_header( pending_multine_comment = False for n, line in enumerate(map(lambda l: l.strip(), fd)): if not line: - # Empty line - continue + # Skip empty line + pass elif pending_multine_comment: + # Continuation of a multiline comment. + # Check if it ends on this line. 
if line.endswith("*/"): pending_multine_comment = False - continue elif line.startswith("/*"): + # Start of a multiline comment if not line.endswith("*/"): pending_multine_comment = True - continue elif any( line.startswith(s) for s in ("#ifdef", "#ifndef", "#endif", "#define _", "#undef") ): - continue + # Skip C macros + pass elif m := KEYSYM_ENTRY_PATTERN.match(line): if m.group("evdev"): # _EVDEVK macro From 9b61c288949d6c563f647dc586ba3dd9d50a9b42 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Tue, 18 Jul 2023 07:37:50 +0200 Subject: [PATCH 09/17] review: Add handle_keysym_match helper --- nls/compose-check.py | 70 +++++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/nls/compose-check.py b/nls/compose-check.py index 9d88ad3a..54e19b63 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -145,12 +145,49 @@ KEYSYM_ENTRY_PATTERN = re.compile( EXTRA_DEPRECATED_KEYSYMS = ("Ext16bit_L", "Ext16bit_R") +def handle_keysym_match( + keysyms: dict[int, str], + keysyms_names: dict[str, str], + line_nbr: int, + m: re.Match[str], +): + if m.group("evdev"): + # _EVDEVK macro + keysym = 0x10081000 + int(m.group("value"), 16) + else: + keysym = int(m.group("value"), 16) + name = (m.group("prefix") or "") + m.group("name") + if ref := keysyms.get(keysym): + # Deprecated, because there is a previous definition with other name. + # Ensure that the replacement keysym is supported by xkbcommon. + if libxkbcommon and libxkbcommon.is_invalid_keysym_name(ref): + print( + f"[WARNING] Line {line_nbr}: Keep deprecated keysym “{name}”; reference keysym “{ref}” is not supported by available xkbcommon." 
+ ) + else: + keysyms_names[name] = ref + return + else: + # Reference keysym + keysyms[keysym] = name + if ( + m.group("deprecated") + or m.group("unicode") + or m.group("name") in EXTRA_DEPRECATED_KEYSYMS + ): + # Explicitely deprecated + keysyms_names[name] = "" + else: + # Reference keysym + keysyms_names[name] = name + + def parse_keysyms_header( path: Path, keysyms: dict[int, str], keysyms_names: dict[str, str] ): with path.open("rt", encoding="utf-8") as fd: pending_multine_comment = False - for n, line in enumerate(map(lambda l: l.strip(), fd)): + for line_nbr, line in enumerate(map(lambda l: l.strip(), fd)): if not line: # Skip empty line pass @@ -170,35 +207,8 @@ def parse_keysyms_header( # Skip C macros pass elif m := KEYSYM_ENTRY_PATTERN.match(line): - if m.group("evdev"): - # _EVDEVK macro - keysym = 0x10081000 + int(m.group("value"), 16) - else: - keysym = int(m.group("value"), 16) - name = (m.group("prefix") or "") + m.group("name") - if ref := keysyms.get(keysym): - # Deprecated, because there is a previous definition with other name. - # Ensure that the replacement keysym is supported by xkbcommon. - if libxkbcommon and libxkbcommon.is_invalid_keysym_name(ref): - print( - f"[WARNING] Line {n}: Keep deprecated keysym “{name}”; reference keysym “{ref}” is not supported by available xkbcommon." 
- ) - else: - keysyms_names[name] = ref - continue - else: - # Reference keysym - keysyms[keysym] = name - if ( - m.group("deprecated") - or m.group("unicode") - or m.group("name") in EXTRA_DEPRECATED_KEYSYMS - ): - # Explicitely deprecated - keysyms_names[name] = "" - else: - # Reference keysym - keysyms_names[name] = name + # Valid keysym entry + handle_keysym_match(keysyms, keysyms_names, line_nbr, m) else: raise ValueError(f"Cannot parse header “{path}” line: {line}") From 14da186aab2b87101d21e35292c4410148614bff Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Tue, 18 Jul 2023 07:54:52 +0200 Subject: [PATCH 10/17] review: files check & miscellaneous --- nls/compose-check.py | 53 +++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/nls/compose-check.py b/nls/compose-check.py index 54e19b63..ecaf56d9 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +from functools import partial import sys @@ -23,6 +24,10 @@ from pathlib import Path from typing import Any, Generator, Sequence +################################################################################ +# Utils +################################################################################ + @dataclass class Configuration: keysyms_names: dict[str, str] @@ -30,6 +35,17 @@ class Configuration: prefer_unicode_keysym: bool +def file_only(category: str, path: Path): + """Check file""" + if not path.is_file(): + print(f"[ERROR] Invalid {category} file: {path}") + return path.is_file() + + +def processing_file_message(category: str, path: Path): + return f"=== Processing {category} file: {path} ===" + + ################################################################################ # xkbcommon handling ################################################################################ @@ -217,11 +233,8 @@ def parse_keysyms_headers(paths: Sequence[Path]) -> dict[str, str]: 
keysyms: dict[int, str] = {} keysyms_names: dict[str, str] = {} for path in paths: - if not path.is_file(): - print(f"[ERROR] Cannot open keysym header file: {path}") - else: - print(f" Processing header file: {path} ".center(80, "="), file=sys.stderr) - parse_keysyms_header(path, keysyms, keysyms_names) + print(processing_file_message("keysym header", path), file=sys.stderr) + parse_keysyms_header(path, keysyms, keysyms_names) return keysyms_names @@ -232,6 +245,7 @@ def parse_keysyms_headers(paths: Sequence[Path]) -> dict[str, str]: def parse_unicode_name_aliases(path: Path) -> dict[str, str]: aliases: dict[str, str] = {} + print(processing_file_message("Unicode name aliases", path), file=sys.stderr) with path.open("rt", encoding="utf-8") as fd: for line in map(lambda s: s.strip(), fd): # Empty line or comment @@ -256,7 +270,7 @@ def unicode_name(unicode_name_aliases: dict[str, str], c: str, is_first: bool) - # RULE: remove “ACCENT” from the name, when the character is combining and # is not in first position if not is_first and "COMBINING" in name and name.endswith("ACCENT"): - return name[:-7] + return name.removesuffix(" ACCENT") else: return name @@ -370,7 +384,7 @@ def check_keysym_sequence(config: Configuration, n: int, sequence: str) -> str: return sequence -def process_lines(fd: TextIOWrapper, config: Configuration): +def process_compose_lines(fd: TextIOWrapper, config: Configuration): multi_line_comment = False for n, line in enumerate(fd, start=1): # Handle pending multi-line comment @@ -440,9 +454,9 @@ def process_lines(fd: TextIOWrapper, config: Configuration): raise ValueError(f"Cannot parse line: “{line}”") -def process_file(path: Path, config: Configuration): +def process_compose_file(path: Path, config: Configuration): with path.open("rt", encoding="utf-8") as fd: - yield from process_lines(fd, config) + yield from process_compose_lines(fd, config) def run( @@ -458,23 +472,24 @@ def run( unicode_name_aliases = ( 
parse_unicode_name_aliases(name_aliases_path) if name_aliases_path else {} ) + # Set config config = Configuration( keysyms_names=keysyms_names, unicode_name_aliases=unicode_name_aliases, prefer_unicode_keysym=not prefer_named_keysyms, ) - # Compose file + # Compose files for path in paths: - print(f" Processing Compose file: {path} ".center(80, "="), file=sys.stderr) + print(processing_file_message("Compose", path), file=sys.stderr) if write: with tempfile.NamedTemporaryFile("wt") as fd: # Write to a temporary file - fd.writelines(process_file(path, config)) + fd.writelines(process_compose_file(path, config)) fd.flush() # No error: now ovewrite the original file shutil.copyfile(fd.name, path) else: - for _ in process_file(path, config): + for _ in process_compose_file(path, config): pass @@ -514,10 +529,16 @@ if __name__ == "__main__": keysyms = args.keysyms else: keysyms = list(args.keysyms_prefix / path for path in DEFAULT_KEYSYMS_HEADERS) + unicode_name_aliases = ( + args.unicode_name_aliases + if args.unicode_name_aliases + and file_only("Unicode name aliases", args.unicode_name_aliases) + else None + ) run( - args.input, + list(filter(partial(file_only, "Compose"), args.input)), args.write, - keysyms, - args.unicode_name_aliases, + list(filter(partial(file_only, "keysyms header"), keysyms)), + unicode_name_aliases, args.prefer_named_keysyms, ) From a7b9868d6fb374a1250ab5c40de0a486fa2d9b02 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Tue, 18 Jul 2023 08:43:29 +0200 Subject: [PATCH 11/17] review: use logging --- nls/compose-check.py | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/nls/compose-check.py b/nls/compose-check.py index ecaf56d9..7735cdd9 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -14,6 +14,7 @@ if sys.version_info < MINIMUM_PYTHON_VERSION: import argparse import ctypes import ctypes.util +import logging import re import shutil import tempfile @@ -28,6 +29,7 
@@ from typing import Any, Generator, Sequence # Utils ################################################################################ + @dataclass class Configuration: keysyms_names: dict[str, str] @@ -35,10 +37,13 @@ class Configuration: prefer_unicode_keysym: bool +logger = logging.getLogger(__name__) + + def file_only(category: str, path: Path): """Check file""" if not path.is_file(): - print(f"[ERROR] Invalid {category} file: {path}") + logger.error(f"Invalid {category} file: {path}") return path.is_file() @@ -177,8 +182,8 @@ def handle_keysym_match( # Deprecated, because there is a previous definition with other name. # Ensure that the replacement keysym is supported by xkbcommon. if libxkbcommon and libxkbcommon.is_invalid_keysym_name(ref): - print( - f"[WARNING] Line {line_nbr}: Keep deprecated keysym “{name}”; reference keysym “{ref}” is not supported by available xkbcommon." + logger.warning( + f"Line {line_nbr}: Keep deprecated keysym “{name}”; reference keysym “{ref}” is not supported by available xkbcommon." 
) else: keysyms_names[name] = ref @@ -233,7 +238,7 @@ def parse_keysyms_headers(paths: Sequence[Path]) -> dict[str, str]: keysyms: dict[int, str] = {} keysyms_names: dict[str, str] = {} for path in paths: - print(processing_file_message("keysym header", path), file=sys.stderr) + logger.info(processing_file_message("keysym header", path)) parse_keysyms_header(path, keysyms, keysyms_names) return keysyms_names @@ -245,7 +250,7 @@ def parse_keysyms_headers(paths: Sequence[Path]) -> dict[str, str]: def parse_unicode_name_aliases(path: Path) -> dict[str, str]: aliases: dict[str, str] = {} - print(processing_file_message("Unicode name aliases", path), file=sys.stderr) + logger.info(processing_file_message("Unicode name aliases", path)) with path.open("rt", encoding="utf-8") as fd: for line in map(lambda s: s.strip(), fd): # Empty line or comment @@ -357,16 +362,16 @@ def check_keysym(config: Configuration, n: int, keysym_name: str) -> str: # Reference keysym return keysym_name elif ref is None: - print(f"[ERROR] Line {n}: Unsupported keysym “{keysym_name}”") + logger.error(f"Line {n}: Unsupported keysym “{keysym_name}”") return keysym_name elif ref == "": # Deprecated: keep keysym - print(f"[WARNING] Line {n}: Deprecated keysym “{keysym_name}”.") + logger.warning(f"Line {n}: Deprecated keysym “{keysym_name}”.") return keysym_name else: # Deprecated alias: return reference keysym - print( - f"[WARNING] Line {n}: Deprecated keysym “{keysym_name}”. Please use “{ref}” instead." + logger.warning( + f"Line {n}: Deprecated keysym “{keysym_name}”. Please use “{ref}” instead." 
) return ref @@ -419,9 +424,8 @@ def process_compose_lines(fd: TextIOWrapper, config: Configuration): if libxkbcommon: keysym_char = libxkbcommon.keysym_to_char(m.group("keysym")) if string != keysym_char: - print( - f"[ERROR] Line {n}: The keysym does not correspond to the character: expected “{string}”, got “{keysym_char}”.", - file=sys.stderr, + logger.error( + f"Line {n}: The keysym does not correspond to the character: expected “{string}”, got “{keysym_char}”.", ) if config.keysyms_names: keysym = check_keysym(config, n, m.group("keysym")) @@ -434,10 +438,9 @@ def process_compose_lines(fd: TextIOWrapper, config: Configuration): m.group("comment") == expected_comment or (m.group("comment") and m.group("comment")[4:] == expected_comment) ): - print( - f"[WARNING] Line {n}: Expected “{expected_comment}” comment, " + logger.warning( + f"Line {n}: Expected “{expected_comment}” comment, " f"got: “{m.group('comment')}”", - file=sys.stderr, ) rewrite = True # Rewrite entry if necessary @@ -480,7 +483,7 @@ def run( ) # Compose files for path in paths: - print(processing_file_message("Compose", path), file=sys.stderr) + logger.info(processing_file_message("Compose", path)) if write: with tempfile.NamedTemporaryFile("wt") as fd: # Write to a temporary file @@ -522,6 +525,14 @@ def parse_args(): if __name__ == "__main__": + # Logging setup + logFormatter = logging.Formatter("[%(levelname)s] %(message)s") + logHandler = logging.StreamHandler(sys.stderr) + logHandler.setFormatter(logFormatter) + logger.addHandler(logHandler) + logger.setLevel(logging.INFO) + + # Parse CLI args args = parse_args() if args.no_keysyms: keysyms = [] @@ -535,6 +546,7 @@ if __name__ == "__main__": and file_only("Unicode name aliases", args.unicode_name_aliases) else None ) + run( list(filter(partial(file_only, "Compose"), args.input)), args.write, From 4e7ca32d139b0480a92f7a7efdde3b31417dda8c Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Tue, 18 Jul 2023 08:52:04 +0200 Subject: [PATCH 
12/17] review: Convert XCOMM to #, with option to disable it --- nls/compose-check.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/nls/compose-check.py b/nls/compose-check.py index 7735cdd9..f07c8ec4 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -35,6 +35,7 @@ class Configuration: keysyms_names: dict[str, str] unicode_name_aliases: dict[str, str] prefer_unicode_keysym: bool + convert_xcomm: bool logger = logging.getLogger(__name__) @@ -397,9 +398,15 @@ def process_compose_lines(fd: TextIOWrapper, config: Configuration): if line.strip().endswith("*/"): multi_line_comment = False yield line + # Handle XCOMM comments + elif line.startswith("XCOMM"): + if config.convert_xcomm: + yield "#" + line.removeprefix("XCOMM") + else: + yield line # Handle single-line comment & include elif not line.strip() or any( - line.startswith(s) for s in ("XCOMM", "#", "include") + line.startswith(s) for s in ("#", "include") ): yield line # Handle start of a multi-line comment @@ -468,6 +475,7 @@ def run( keysyms_headers: Sequence[Path], name_aliases_path: Path | None, prefer_named_keysyms: bool, + keep_xcomm: bool, ): # Keysyms headers keysyms_names = parse_keysyms_headers(keysyms_headers) @@ -480,6 +488,7 @@ def run( keysyms_names=keysyms_names, unicode_name_aliases=unicode_name_aliases, prefer_unicode_keysym=not prefer_named_keysyms, + convert_xcomm=not keep_xcomm, ) # Compose files for path in paths: @@ -520,6 +529,11 @@ def parse_args(): action="store_true", help="Prefer named keysyms over Unicode keysyms", ) + parser.add_argument( + "--keep-xcomm", + action="store_true", + help="Do NOT convert XCOMM comments to # comments", + ) parser.add_argument("--write", action="store_true", help="Write the compose file") return parser.parse_args() @@ -553,4 +567,5 @@ if __name__ == "__main__": list(filter(partial(file_only, "keysyms header"), keysyms)), unicode_name_aliases, args.prefer_named_keysyms, + args.keep_xcomm, ) From 
4b84b97b843d1d1c327bd500d96ff72fca26872a Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Tue, 18 Jul 2023 09:00:58 +0200 Subject: [PATCH 13/17] review: add handle_compose_entry_match helper --- nls/compose-check.py | 97 ++++++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 45 deletions(-) diff --git a/nls/compose-check.py b/nls/compose-check.py index f07c8ec4..fb843c01 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -390,9 +390,57 @@ def check_keysym_sequence(config: Configuration, n: int, sequence: str) -> str: return sequence +def handle_compose_entry_match( + config: Configuration, line_nbr: int, m: re.Match[str] +) -> str | None: + string = unescape(m.group("string")) + rewrite = False + # Check sequence keysyms + if config.keysyms_names: + sequence = check_keysym_sequence(config, line_nbr, m.group("sequence")) + if sequence != m.group("sequence"): + rewrite = True + else: + sequence = m.group("sequence") + # Check result keysym + if keysym := m.group("keysym"): + if libxkbcommon: + keysym_char = libxkbcommon.keysym_to_char(m.group("keysym")) + if string != keysym_char: + logger.error( + f"Line {line_nbr}: The keysym does not correspond to the character: expected “{string}”, got “{keysym_char}”.", + ) + if config.keysyms_names: + keysym = check_keysym(config, line_nbr, m.group("keysym")) + if keysym != m.group("keysym"): + rewrite = True + expected_comment = make_comment(config.unicode_name_aliases, string) + # Check if we have the expected comment + # NOTE: Some APL sequences provide the combo of composed characters + if not ( + m.group("comment") == expected_comment + or (m.group("comment") and m.group("comment")[4:] == expected_comment) + ): + logger.warning( + f"Line {line_nbr}: Expected “{expected_comment}” comment, " + f"got: “{m.group('comment')}”", + ) + rewrite = True + # Rewrite entry if necessary + if rewrite: + keysym = "" if keysym is None else f"\t{keysym}" + assert (len(string) == 1 and 
m.group("keysym") is not None) ^ ( + len(string) > 1 and m.group("keysym") is None + ) + comment_space = " " if len(string) == 1 else m.group("space") or "\t" + return f"""{sequence}: "{m.group('string')}"{keysym}{comment_space}# {expected_comment}\n""" + else: + return None + + def process_compose_lines(fd: TextIOWrapper, config: Configuration): multi_line_comment = False - for n, line in enumerate(fd, start=1): + for line_nbr, line in enumerate(fd, start=1): # Handle pending multi-line comment if multi_line_comment: if line.strip().endswith("*/"): @@ -405,9 +453,7 @@ def process_compose_lines(fd: TextIOWrapper, config: Configuration): else: yield line # Handle single-line comment & include - elif not line.strip() or any( - line.startswith(s) for s in ("#", "include") - ): + elif not line.strip() or any(line.startswith(s) for s in ("#", "include")): yield line # Handle start of a multi-line comment elif line.startswith("/*"): @@ -417,47 +463,8 @@ def process_compose_lines(fd: TextIOWrapper, config: Configuration): yield line # Handle compose sequence elif m := COMPOSE_ENTRY_PATTERN.match(line): - string = unescape(m.group("string")) - rewrite = False - # Check sequence keysyms - if config.keysyms_names: - sequence = check_keysym_sequence(config, n, m.group("sequence")) - if sequence != m.group("sequence"): - rewrite = True - else: - sequence = m.group("sequence") - # Check result keysym - if keysym := m.group("keysym"): - if libxkbcommon: - keysym_char = libxkbcommon.keysym_to_char(m.group("keysym")) - if string != keysym_char: - logger.error( - f"Line {n}: The keysym does not correspond to the character: expected “{string}”, got “{keysym_char}”.", - ) - if config.keysyms_names: - keysym = check_keysym(config, n, m.group("keysym")) - if keysym != m.group("keysym"): - rewrite = True - expected_comment = make_comment(config.unicode_name_aliases, string) - # Check if we have the expected comment - # NOTE: Some APL sequences provide the combo of composed characters - 
if not ( - m.group("comment") == expected_comment - or (m.group("comment") and m.group("comment")[4:] == expected_comment) - ): - logger.warning( - f"Line {n}: Expected “{expected_comment}” comment, " - f"got: “{m.group('comment')}”", - ) - rewrite = True - # Rewrite entry if necessary - if rewrite: - keysym = "" if keysym is None else f"\t{keysym}" - assert (len(string) == 1 and m.group("keysym") is not None) ^ ( - len(string) > 1 and m.group("keysym") is None - ) - comment_space = " " if len(string) == 1 else m.group("space") or "\t" - yield f"""{sequence}: "{m.group('string')}"{keysym}{comment_space}# {expected_comment}\n""" + if lineʹ := handle_compose_entry_match(config, line_nbr, m): + yield lineʹ else: yield line else: From b0cb444098d28bf0d599feeb8d01c31d144f5d4b Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Wed, 19 Jul 2023 15:31:10 +0200 Subject: [PATCH 14/17] nls: Improve handling of keysyms --- nls/compose-check.py | 127 +++++++++++++++++++++++++++++++++---------- 1 file changed, 98 insertions(+), 29 deletions(-) diff --git a/nls/compose-check.py b/nls/compose-check.py index fb843c01..c32859ff 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 from __future__ import annotations -from functools import partial import sys @@ -14,6 +13,8 @@ if sys.version_info < MINIMUM_PYTHON_VERSION: import argparse import ctypes import ctypes.util +from enum import Enum, IntFlag, unique +from functools import partial import logging import re import shutil @@ -39,6 +40,7 @@ class Configuration: logger = logging.getLogger(__name__) +verbosity: int = 0 def file_only(category: str, path: Path): @@ -154,19 +156,33 @@ KEYSYM_ENTRY_PATTERN = re.compile( (?P_EVDEVK\()? (?P0x[0-9a-fA-F]+) (?(evdev)\)|)\s* - (?:/\*\s* - (?: - (?Pdeprecated)| - \(U\+(?P[0-9a-fA-F]{4,})(?:\s|\w|-)+\)| - .* - ) - )? + (?:/\*(?P.+)\*/)? 
+ \s*$ """, re.VERBOSE, ) +KEYSYM_DEPRECATION_COMMENT_PATTERN = re.compile( + r""" + # Explicit alias: do not deprecate + alias\s+for\s+(?P\w+)| + # Explicitly deprecated + (?Pdeprecated)| + # Inexact Unicode match + \(U\+(?P[0-9a-fA-F]{4,})(?:\s|\w|-)+\) + """, + re.VERBOSE | re.IGNORECASE, +) EXTRA_DEPRECATED_KEYSYMS = ("Ext16bit_L", "Ext16bit_R") +@unique +class Deprecation(IntFlag): + NONE = 0 + ALIAS = 1 << 0 + EXPLICIT = 1 << 2 + IMPLICIT = 1 << 3 + + def handle_keysym_match( keysyms: dict[int, str], keysyms_names: dict[str, str], @@ -179,29 +195,72 @@ def handle_keysym_match( else: keysym = int(m.group("value"), 16) name = (m.group("prefix") or "") + m.group("name") - if ref := keysyms.get(keysym): - # Deprecated, because there is a previous definition with other name. - # Ensure that the replacement keysym is supported by xkbcommon. - if libxkbcommon and libxkbcommon.is_invalid_keysym_name(ref): - logger.warning( - f"Line {line_nbr}: Keep deprecated keysym “{name}”; reference keysym “{ref}” is not supported by available xkbcommon." 
- ) - else: - keysyms_names[name] = ref - return - else: - # Reference keysym - keysyms[keysym] = name - if ( - m.group("deprecated") - or m.group("unicode") - or m.group("name") in EXTRA_DEPRECATED_KEYSYMS + alias = None + if (comment := m.group("comment")) and ( + comment_match := KEYSYM_DEPRECATION_COMMENT_PATTERN.match(comment.strip()) ): + if comment_match.group("deprecated") or comment_match.group("inexact_unicode"): + # Explicitely deprecated + deprecated = Deprecation.EXPLICIT + elif alias := comment_match.group("alias"): + # Explicit alias: do not deprecate + deprecated = Deprecation.ALIAS + else: + # Normal comment + deprecated = Deprecation.NONE + elif name in EXTRA_DEPRECATED_KEYSYMS: # Explicitely deprecated - keysyms_names[name] = "" + deprecated = Deprecation.EXPLICIT + else: + deprecated = Deprecation.NONE + + if name in keysyms_names: + # Duplicate keysym: skip + if verbosity: + logger.warning( + f"Line {line_nbr}: Keysym “{name}” 0x{keysym:0>4x} already defined; skipping." + ) + return + elif ref := keysyms.get(keysym): + if deprecated & Deprecation.ALIAS: + # Check alias has same value + if keysyms.get(keysym) != alias: + if verbosity: + if alias in keysyms_names: + logger.warning( + f"Line {line_nbr}: Keysym {name} is declared as alias of {alias}, but they have different values." + ) + else: + logger.warning( + f"Line {line_nbr}: Keysym “{name}” is declared as alias of “{alias}”, but the alias does not exists. Typo?" + ) + keysyms_names[name] = name + else: + # Deprecated, because there is a previous definition with other name. + # Ensure that the replacement keysym is supported by xkbcommon. + deprecated |= Deprecation.IMPLICIT + if libxkbcommon and libxkbcommon.is_invalid_keysym_name(ref): + if verbosity: + logger.warning( + f"Line {line_nbr}: Keep deprecated keysym “{name}”; reference keysym “{ref}” is not supported by available xkbcommon." 
+ ) + keysyms_names[name] = name + else: + keysyms_names[name] = ref else: # Reference keysym - keysyms_names[name] = name + if deprecated & Deprecation.ALIAS: + if verbosity: + logger.error( + f"Line {line_nbr}: Explicit alias “{name}” for “{alias}” is invalid." + ) + keysyms_names[name] = name + elif deprecated & Deprecation.EXPLICIT: + keysyms_names[name] = "" + else: + assert deprecated is Deprecation.NONE + keysyms_names[name] = name + keysyms[keysym] = name def parse_keysyms_header( @@ -239,7 +298,8 @@ def parse_keysyms_headers(paths: Sequence[Path]) -> dict[str, str]: keysyms: dict[int, str] = {} keysyms_names: dict[str, str] = {} for path in paths: - logger.info(processing_file_message("keysym header", path)) + if verbosity: + logger.info(processing_file_message("keysym header", path)) parse_keysyms_header(path, keysyms, keysyms_names) return keysyms_names @@ -251,7 +311,8 @@ def parse_keysyms_headers(paths: Sequence[Path]) -> dict[str, str]: def parse_unicode_name_aliases(path: Path) -> dict[str, str]: aliases: dict[str, str] = {} - logger.info(processing_file_message("Unicode name aliases", path)) + if verbosity: + logger.info(processing_file_message("Unicode name aliases", path)) with path.open("rt", encoding="utf-8") as fd: for line in map(lambda s: s.strip(), fd): # Empty line or comment @@ -541,6 +602,13 @@ def parse_args(): action="store_true", help="Do NOT convert XCOMM comments to # comments", ) + parser.add_argument( + "-v", + "--verbose", + action="count", + default=0, + help="Increase verbosity. 
Useful to see log entry of keysyms headers & Unicode files.", + ) parser.add_argument("--write", action="store_true", help="Write the compose file") return parser.parse_args() @@ -555,6 +623,7 @@ if __name__ == "__main__": # Parse CLI args args = parse_args() + verbosity = args.verbose if args.no_keysyms: keysyms = [] elif args.keysyms: From b3c76d9abf1c3ce199b17f4edf64462edc486835 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Wed, 19 Jul 2023 16:26:22 +0200 Subject: [PATCH 15/17] nls: Add support for tags and improve APL handling --- nls/compose-check.py | 56 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/nls/compose-check.py b/nls/compose-check.py index c32859ff..5ec5bfd3 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -356,11 +356,33 @@ COMPOSE_ENTRY_PATTERN = re.compile( re.VERBOSE, ) """A pattern for Compose entries""" +COMPOSE_TAG_PATTERN = re.compile(r"\s*\{([^}]+)\}") +COMPOSE_APL_PATTERN = re.compile(r"\S \S APL ") + UNICODE_KEYSYM_PATTERN = re.compile(r"\bU(?P<codepoint>[0-9A-Fa-f]+)\b") KEYSYM_PATTERN = re.compile(r"<(\w+)>") +@unique +class ComposeTag(Enum): + PRESERVE_COMMENT = "preserve comment" + + +COMPOSE_TAGS_MAPPING = {t.value: t for t in ComposeTag} + + +def parse_compose_tags(s: str) -> tuple[set[ComposeTag], set[str]]: + valid: set[ComposeTag] = set() + invalid: set[str] = set() + for t in COMPOSE_TAG_PATTERN.findall(s): + if tag := COMPOSE_TAGS_MAPPING.get(t): + valid.add(tag) + else: + invalid.add(t) + return valid, invalid + + def _unescape(s: str) -> Generator[str, Any, None]: """Unescape a Compose file string""" pending_escape = False @@ -476,16 +498,30 @@ def handle_compose_entry_match( if keysym != m.group("keysym"): rewrite = True expected_comment = make_comment(config.unicode_name_aliases, string) - # Check if we have the expected comment - # NOTE: Some APL sequences provide the combo of composed characters - if not ( - m.group("comment") == expected_comment - or
(m.group("comment") and m.group("comment")[4:] == expected_comment) - ): - logger.warning( - f"Line {line_nbr}: Expected “{expected_comment}” comment, " - f"got: “{m.group('comment')}”", - ) + # Check the comment + if comment := m.group("comment"): + # Check tags + tags, invalid = parse_compose_tags(comment) + if invalid: + logger.error(f"Line {line_nbr}: Invalid tags {invalid}") + if tags: + logger.info(f"Line {line_nbr}: preserving comment") + expected_comment = comment + # Check if we have the expected comment + # NOTE: Some APL sequences provide the combo of composed characters + elif comment == expected_comment or ( + COMPOSE_APL_PATTERN.match(comment) + and COMPOSE_APL_PATTERN.sub("APL ", comment) == expected_comment + ): + expected_comment = comment + else: + logger.warning( + f"Line {line_nbr}: Expected “{expected_comment}” comment, " + f"got: “{m.group('comment')}”", + ) + rewrite = True + else: + # No comment: require to write it rewrite = True # Rewrite entry if necessary if rewrite: From 432e7499470d0b828d7a197383510f000d35e363 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Wed, 19 Jul 2023 16:39:04 +0200 Subject: [PATCH 16/17] nls: Typos --- nls/compose-check.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/nls/compose-check.py b/nls/compose-check.py index 5ec5bfd3..3415601e 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -200,7 +200,7 @@ def handle_keysym_match( comment_match := KEYSYM_DEPRECATION_COMMENT_PATTERN.match(comment.strip()) ): if comment_match.group("deprecated") or comment_match.group("inexact_unicode"): - # Explicitely deprecated + # Explicitly deprecated deprecated = Deprecation.EXPLICIT elif alias := comment_match.group("alias"): # Explicit alias: do not deprecate @@ -209,7 +209,7 @@ def handle_keysym_match( # Normal comment deprecated = Deprecation.NONE elif name in EXTRA_DEPRECATED_KEYSYMS: - # Explicitely deprecated + # Explicitly deprecated deprecated = 
Deprecation.EXPLICIT else: deprecated = Deprecation.NONE @@ -267,20 +267,20 @@ def parse_keysyms_header( path: Path, keysyms: dict[int, str], keysyms_names: dict[str, str] ): with path.open("rt", encoding="utf-8") as fd: - pending_multine_comment = False + pending_multiline_comment = False for line_nbr, line in enumerate(map(lambda l: l.strip(), fd)): if not line: # Skip empty line pass - elif pending_multine_comment: + elif pending_multiline_comment: # Continuation of a multiline comment. # Check if it ends on this line. if line.endswith("*/"): - pending_multine_comment = False + pending_multiline_comment = False elif line.startswith("/*"): # Start of a multiline comment if not line.endswith("*/"): - pending_multine_comment = True + pending_multiline_comment = True elif any( line.startswith(s) for s in ("#ifdef", "#ifndef", "#endif", "#define _", "#undef") @@ -428,7 +428,7 @@ def check_keysym(config: Configuration, n: int, keysym_name: str) -> str: # Find the canonical keysym name using xkbcommon keysym_name = libxkbcommon.char_to_keysym(chr(codepoint)) # We keep our normalized Unicode in case xkbcommon returns a long - # Unicode keysym, or we explicitely prefer Unicode keysyms, or + # Unicode keysym, or we explicitly prefer Unicode keysyms, or # the named keysym is deprecated. 
if ( unicode_keysym == keysym_name @@ -461,14 +461,14 @@ def check_keysym(config: Configuration, n: int, keysym_name: str) -> str: def check_keysym_sequence(config: Configuration, n: int, sequence: str) -> str: - subsitutions: dict[str, str] = {} + substitutions: dict[str, str] = {} for keysym_name in KEYSYM_PATTERN.findall(sequence): keysym_nameʹ = check_keysym(config, n, keysym_name) if keysym_nameʹ != keysym_name: - subsitutions[keysym_name] = keysym_nameʹ - if subsitutions: - pattern = re.compile("|".join(re.escape(k) for k in subsitutions.keys())) - return pattern.sub(lambda x: subsitutions[x.group()], sequence) + substitutions[keysym_name] = keysym_nameʹ + if substitutions: + pattern = re.compile("|".join(re.escape(k) for k in substitutions.keys())) + return pattern.sub(lambda x: substitutions[x.group()], sequence) else: return sequence @@ -516,7 +516,7 @@ def handle_compose_entry_match( expected_comment = comment else: logger.warning( - f"Line {line_nbr}: Expected “{expected_comment}” comment, " + f"Line {line_nbr}: Expected “{expected_comment}”, " f"got: “{m.group('comment')}”", ) rewrite = True @@ -602,7 +602,7 @@ def run( # Write to a temporary file fd.writelines(process_compose_file(path, config)) fd.flush() - # No error: now ovewrite the original file + # No error: now overwrite the original file shutil.copyfile(fd.name, path) else: for _ in process_compose_file(path, config): @@ -610,7 +610,7 @@ def run( def parse_args(): - parser = argparse.ArgumentParser(description="Add comment to compose sequence") + parser = argparse.ArgumentParser(description="Fix Compose file formatting") parser.add_argument("input", type=Path, nargs="+", help="Compose file to process") group = parser.add_mutually_exclusive_group() group.add_argument("--no-keysyms", action="store_true", help="Do not check keysyms") From 0bc3c8393cc1ecd7c26f965fea38e091867578e9 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Fri, 21 Jul 2023 07:23:51 +0200 Subject: [PATCH 17/17] nls: Check 
Unicode keysyms --- nls/compose-check.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nls/compose-check.py b/nls/compose-check.py index 3415601e..9ed23966 100755 --- a/nls/compose-check.py +++ b/nls/compose-check.py @@ -424,6 +424,8 @@ def check_keysym(config: Configuration, n: int, keysym_name: str) -> str: # Reformat Unicode keysym codepoint = int(m.group("codepoint"), 16) unicode_keysym = f"U{codepoint:0>4X}" + if not (0x100 <= codepoint <= 0x10ffff): + logger.error(f"Line {n}: Unicode keysym {keysym_name} is illegal: valid code points are 0x100..0x10ffff.") if libxkbcommon: # Find the canonical keysym name using xkbcommon keysym_name = libxkbcommon.char_to_keysym(chr(codepoint))