summaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
authorPierre Le Marre <dev@wismill.eu>2024-09-03 19:33:57 +0200
committerSergey Udaltsov <sergey.udaltsov@gmail.com>2024-09-05 18:45:16 +0000
commit2561d0cfa287ee7d7c175b85aa40744ad87e35b1 (patch)
treeff69f6e1d5b344628176adf62cf693a5e9ce99b5 /scripts
parentcd3e694fe4ad074b5558773d9800430ed3f99719 (diff)
Add script to extract list of (non-)Latin layouts
This produces a CSV file with (non-)Latin layouts by analyzing all the keysyms of each layout and check for required ones. This is more reliable than checking the language tags of layouts, because a language may have multiple scripts and the language tags may be incorrect or incomplete. Method: 1. Get layouts via `xkbregistry`. 2. For each layout, check if it has all the basic Latin letters. 3. Export the filtered layouts as CSV. Note that some layouts are “almost” Latin as they miss only a few letters. Such layouts are considered non-Latin but raise a warning.
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/registry.py245
1 files changed, 245 insertions, 0 deletions
diff --git a/scripts/registry.py b/scripts/registry.py
new file mode 100755
index 00000000..b3c54b74
--- /dev/null
+++ b/scripts/registry.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python3
+
+"""
+Query the XKB registry via xkbcommon and process the results, e.g. filter and
+export (non-)Latin layouts.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import dataclasses
+import logging
+import subprocess
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Generator, Sequence
+
+import yaml
+
+# Add our internal xkbcommon test lib to the PATH
+ROOT = Path(__file__).parent.parent
+sys.path.append(str(ROOT / "tests"))
+
+import xkbcommon
+
+logger = logging.getLogger()
+logging.basicConfig(
+ stream=sys.stderr, level=logging.INFO, format="[%(levelname)s] %(message)s"
+)
+
+
+@dataclass
+class Model:
+ name: str
+
+
+@dataclass(frozen=True, order=True)
+class Layout:
+ layout: str
+ variant: str
+ description: str
+ extra: dict[str, Any]
+
+ @classmethod
+ def parse(cls, raw: dict[str, str]) -> Layout:
+ """
+ Parse YAML entry
+ """
+ return Layout(
+ layout=raw.get("layout", ""),
+ variant=raw.get("variant", ""),
+ description=raw.get("description", ""),
+ extra={},
+ )
+
+
+class Option:
+ name: str
+
+
+@dataclass
+class Registry:
+ """
+ The XKB registry, i.e. a rules/*.xml file parsed.
+ """
+
+ models: tuple[Model, ...]
+ layouts: tuple[Layout, ...]
+ options: tuple[Option, ...]
+
+ @classmethod
+ def parse(cls, raw: dict[str, Sequence[Any]], skip_custom: bool = True) -> Registry:
+ """
+ Parse YAML entry
+ """
+ return cls(
+ models=(), # FIXME: process models
+ layouts=tuple(
+ l
+ for l in map(Layout.parse, raw.get("layouts", ()))
+ if not skip_custom or l.layout != "custom"
+ ),
+ options=(), # FIXME: process options
+ )
+
+ @classmethod
+ def load(cls, xkb_root: Path | None = None, rules: str | None = None) -> Registry:
+ """
+ Run xkbcli list and parse its YAML output.
+ """
+ args: tuple[str, ...] = ("xkbcli", "list", "--load-exotic")
+ if xkb_root:
+ # If no xkb config root is provided, we rely on the defaults that xkbcommon
+ # will pick. It depends on its built-in defaults and on the environment.
+ args += ("--skip-default-paths", str(xkb_root))
+ if rules:
+ # If no rules set is provided, we rely on the default one that xkbcommon
+ # will pick. It depends on its built-in default and on the environment.
+ args += ("--ruleset", rules)
+ logger.info(f"Running: {' '.join(args)}")
+ p = subprocess.run(args, encoding="utf-8", capture_output=True)
+ raw = yaml.safe_load(p.stdout)
+ return cls.parse(raw)
+
+
+class Csv(csv.unix_dialect):
+ """
+ CSV dialect used to export results.
+ """
+
+ quoting = csv.QUOTE_NONE
+
+
+LATIN_LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+"""
+Required upper case characters for a layout to be considered Latin.
+"""
+
+
+def filter_latin_layouts(
+ xkb_root: Path,
+ registry: Registry,
+ latin: bool,
+ rules: str | None = None,
+ debug: bool = False,
+) -> Generator[Layout, None, None]:
+ """
+ Given a registry, filter all its layouts that are (non-)Latin by checking
+ that each required characters are accessible at some key, group and level.
+ """
+ latin_letters = frozenset(LATIN_LETTERS + LATIN_LETTERS.lower())
+ latin_keysyms: dict[int, str] = {
+ xkbcommon.xkb_keysym_from_name(c): c for c in latin_letters
+ }
+ for layout in registry.layouts:
+ try:
+ with xkbcommon.ForeignKeymap(
+ xkb_base=xkb_root,
+ rules=rules,
+ layout=layout.layout,
+ variant=layout.variant,
+ ) as keymap:
+ found: set[str] = set()
+ r: xkbcommon.KeyLevel
+ for r in xkbcommon.ForeignKeymap.get_keys_levels(keymap):
+ for k in r.keysyms:
+ if (c := latin_keysyms.get(k)) is not None:
+ found.add(c)
+ except ValueError as err:
+ # Log error message and skip
+ logger.error(err)
+ continue
+ missing = latin_letters.difference(found)
+ if latin ^ bool(missing):
+ if debug:
+ # Add missing characters for debugging
+ extra = dict(layout.extra)
+ extra["missing"] = missing_str = "".join(sorted(missing))
+ yield dataclasses.replace(layout, extra=extra)
+ else:
+ yield layout
+ almost_latin = len(missing) / len(latin_letters) <= 0.10
+ if debug and missing and almost_latin:
+ logger.debug(
+ f"Almost a Latin layout: {layout}; missing: {missing_str} ({len(missing)})"
+ )
+
+
+def process_layouts(xkb_root: Path, registry: Registry, args: argparse.Namespace):
+ """
+ Process layouts from a given registry, depending on the CLI arguments.
+ """
+ debug: bool = args.debug
+ if args.latin or args.non_latin:
+ # Filter (non-)Latin layouts
+ layouts = tuple(
+ filter_latin_layouts(
+ xkb_root,
+ registry,
+ latin=args.latin,
+ rules=args.rules,
+ debug=args.debug,
+ )
+ )
+ else:
+ # Get all layouts
+ layouts = registry.layouts
+ if args.csv:
+ # Output as CSV
+ path: Path = args.csv
+ with path.open("wt", encoding="utf-8", newline="") as fd:
+ writer = csv.writer(fd, dialect=csv.unix_dialect if debug else Csv)
+ fields: tuple[str, ...] = ("Layout", "Variant")
+ if debug:
+ fields += (
+ "Description",
+ "Missing Latin characters",
+ )
+ writer.writerow(fields)
+
+ def get_fields(layout) -> Generator[str, None, None]:
+ yield layout.layout
+ yield layout.variant
+ if debug:
+ yield layout.description
+ yield layout.extra.get("missing", "")
+
+ for layout in sorted(layouts):
+ writer.writerow(get_fields(layout))
+ else:
+ # Output as Python representation, for debugging
+ for layout in layouts:
+ print(layout)
+
+
+def parse_cli_args() -> argparse.Namespace:
+ """
+ Create CLI parser and parse corresponding arguments.
+ """
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--xkb-root", type=Path, required=True)
+ parser.add_argument("--debug", action="store_true")
+ parser.add_argument("--rules", help="Rules set to use")
+ subparsers = parser.add_subparsers(required=True)
+ subparser = subparsers.add_parser("layouts", help="List layouts")
+ subparser.set_defaults(run=process_layouts)
+ subparser.add_argument("--csv", type=Path)
+ group = subparser.add_mutually_exclusive_group()
+ group.add_argument("--latin", action="store_true", help="List only Latin layouts")
+ group.add_argument(
+ "--non-latin", action="store_true", help="List only non-Latin layouts"
+ )
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ args = parse_cli_args()
+ if args.debug:
+ logger.setLevel(logging.DEBUG)
+ xkb_root: Path = args.xkb_root
+ rules: str | None = args.rules
+ registry = Registry.load(xkb_root, rules)
+ args.run(xkb_root, registry, args)