#!/usr/bin/env python3 # # fontconfig/fc-lang/fc-lang.py # # Copyright © 2001-2002 Keith Packard # Copyright © 2019 Tim-Philipp Müller # # Permission to use, copy, modify, distribute, and sell this software and its # documentation for any purpose is hereby granted without fee, provided that # the above copyright notice appear in all copies and that both that # copyright notice and this permission notice appear in supporting # documentation, and that the name of the author(s) not be used in # advertising or publicity pertaining to distribution of the software without # specific, written prior permission. The authors make no # representations about the suitability of this software for any purpose. It # is provided "as is" without express or implied warranty. # # THE AUTHOR(S) DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, # INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO # EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY SPECIAL, INDIRECT OR # CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, # DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR # PERFORMANCE OF THIS SOFTWARE. # fc-lang # # Read a set of language orthographies and build C declarations for # charsets which can then be used to identify which languages are # supported by a given font. # # TODO: this code is not very pythonic, a lot of it is a 1:1 translation # of the C code and we could probably simplify it a bit import argparse import string import sys import os # we just store the leaves in a dict, we can order the leaves later if needed class CharSet: def __init__(self): self.leaves = {} # leaf_number -> leaf data (= 16 uint32) def add_char(self, ucs4): assert ucs4 < 0x01000000 leaf_num = ucs4 >> 8 if leaf_num in self.leaves: leaf = self.leaves[leaf_num] else: leaf = [0, 0, 0, 0, 0, 0, 0, 0] # 256/32 = 8 self.leaves[leaf_num] = leaf leaf[(ucs4 & 0xff) >> 5] |= (1 << (ucs4 & 0x1f)) #print('{:08x} [{:04x}] --> {}'.format(ucs4, ucs4>>8, leaf)) def del_char(self, ucs4): assert ucs4 < 0x01000000 leaf_num = ucs4 >> 8 if leaf_num in self.leaves: leaf = self.leaves[leaf_num] leaf[(ucs4 & 0xff) >> 5] &= ~(1 << (ucs4 & 0x1f)) # We don't bother removing the leaf if it's empty */ #print('{:08x} [{:04x}] --> {}'.format(ucs4, ucs4>>8, leaf)) def equals(self, other_cs): keys = sorted(self.leaves.keys()) other_keys = sorted(other_cs.leaves.keys()) if len(keys) != len(other_keys): return False for k1, k2 in zip(keys, other_keys): if k1 != k2: return False if not leaves_equal(self.leaves[k1], other_cs.leaves[k2]): return False return True # Convert a file name into a name suitable for C declarations def get_name(file_name): return file_name.split('.')[0] # Convert a C name into a language name def get_lang(c_name): return c_name.replace('_', '-').replace(' ', '').lower() def read_orth_file(file_name): lines = [] with open(file_name, 'r', encoding='utf-8') as orth_file: for num, line in enumerate(orth_file): if line.startswith('include '): include_fn = line[8:].strip() lines += read_orth_file(include_fn) else: # remove comments and strip whitespaces line = line.split('#')[0].strip() line = line.split('\t')[0].strip() # skip empty lines if line: lines += [(file_name, num, line)] return lines def leaves_equal(leaf1, leaf2): for v1, v2 in zip(leaf1, leaf2): if v1 != v2: return False return True # Build a single charset from a source file # # The file format is quite simple, either # a single hex value or a pair separated with a dash def parse_orth_file(file_name, lines): charset = CharSet() for fn, num, line in lines: delete_char = line.startswith('-') if delete_char: line = line[1:] if line.find('-') != -1: parts = line.split('-') elif line.find('..') != -1: parts = line.split('..') else: parts = [line] start = int(parts.pop(0), 16) end = start if parts: end = int(parts.pop(0), 16) if parts: print('ERROR: {} line {}: parse error (too many parts)'.format(fn, num)) for ucs4 in range(start, end+1): if delete_char: charset.del_char(ucs4) else: charset.add_char(ucs4) assert charset.equals(charset) # sanity check for the equals function return charset if __name__=='__main__': parser = argparse.ArgumentParser() parser.add_argument('orth_files', nargs='+', help='List of .orth files') parser.add_argument('--directory', dest='directory', default=None) parser.add_argument('--template', dest='template_file', default=None) parser.add_argument('--output', dest='output_file', default=None) args = parser.parse_args() sets = [] names = [] langs = [] country = [] total_leaves = 0 LangCountrySets = {} # Open output file if args.output_file: sys.stdout = open(args.output_file, 'w', encoding='utf-8') # Read the template file if args.template_file: tmpl_file = open(args.template_file, 'r', encoding='utf-8') else: tmpl_file = sys.stdin # Change into source dir if specified (after opening other files) if args.directory: os.chdir(args.directory) orth_entries = {} for i, fn in enumerate(args.orth_files): orth_entries[fn] = i for fn in sorted(orth_entries.keys()): lines = read_orth_file(fn) charset = parse_orth_file(fn, lines) sets.append(charset) name = get_name(fn) names.append(name) lang = get_lang(name) langs.append(lang) if lang.find('-') != -1: country.append(orth_entries[fn]) # maps to original index language_family = lang.split('-')[0] if not language_family in LangCountrySets: LangCountrySets[language_family] = [] LangCountrySets[language_family] += [orth_entries[fn]] total_leaves += len(charset.leaves) # Find unique leaves leaves = [] for s in sets: for leaf_num in sorted(s.leaves.keys()): leaf = s.leaves[leaf_num] is_unique = True for existing_leaf in leaves: if leaves_equal(leaf, existing_leaf): is_unique = False break #print('unique: ', is_unique) if is_unique: leaves.append(leaf) # Find duplicate charsets duplicate = [] for i, s in enumerate(sets): dup_num = None if i >= 1: for j, s_cmp in enumerate(sets): if j >= i: break if s_cmp.equals(s): dup_num = j break duplicate.append(dup_num) tn = 0 off = {} for i, s in enumerate(sets): if duplicate[i]: continue off[i] = tn tn += len(s.leaves) # Scan the input until the marker is found # FIXME: this is a bit silly really, might just as well hardcode # the license header in the script and drop the template for line in tmpl_file: if line.strip() == '@@@': break print(line, end='') print('/* total size: {} unique leaves: {} */\n'.format(total_leaves, len(leaves))) print('#define LEAF0 ({} * sizeof (FcLangCharSet))'.format(len(sets))) print('#define OFF0 (LEAF0 + {} * sizeof (FcCharLeaf))'.format(len(leaves))) print('#define NUM0 (OFF0 + {} * sizeof (uintptr_t))'.format(tn)) print('#define SET(n) (n * sizeof (FcLangCharSet) + offsetof (FcLangCharSet, charset))') print('#define OFF(s,o) (OFF0 + o * sizeof (uintptr_t) - SET(s))') print('#define NUM(s,n) (NUM0 + n * sizeof (FcChar16) - SET(s))') print('#define LEAF(o,l) (LEAF0 + l * sizeof (FcCharLeaf) - (OFF0 + o * sizeof (intptr_t)))') print('#define fcLangCharSets (fcLangData.langCharSets)') print('#define fcLangCharSetIndices (fcLangData.langIndices)') print('#define fcLangCharSetIndicesInv (fcLangData.langIndicesInv)') assert len(sets) < 256 # FIXME: need to change index type to 16-bit below then print(''' static const struct {{ FcLangCharSet langCharSets[{}]; FcCharLeaf leaves[{}]; uintptr_t leaf_offsets[{}]; FcChar16 numbers[{}]; {} langIndices[{}]; {} langIndicesInv[{}]; }} fcLangData = {{'''.format(len(sets), len(leaves), tn, tn, 'FcChar8 ', len(sets), 'FcChar8 ', len(sets))) # Dump sets print('{') for i, s in enumerate(sets): if duplicate[i]: j = duplicate[i] else: j = i print(' {{ "{}", {{ FC_REF_CONSTANT, {}, OFF({},{}), NUM({},{}) }} }}, /* {} */'.format( langs[i], len(sets[j].leaves), i, off[j], i, off[j], i)) print('},') # Dump leaves print('{') for l, leaf in enumerate(leaves): print(' {{ {{ /* {} */'.format(l), end='') for i in range(0, 8): # 256/32 = 8 if i % 4 == 0: print('\n ', end='') print(' 0x{:08x},'.format(leaf[i]), end='') print('\n } },') print('},') # Dump leaves print('{') for i, s in enumerate(sets): if duplicate[i]: continue print(' /* {} */'.format(names[i])) for n, leaf_num in enumerate(sorted(s.leaves.keys())): leaf = s.leaves[leaf_num] if n % 4 == 0: print(' ', end='') found = [k for k, unique_leaf in enumerate(leaves) if leaves_equal(unique_leaf,leaf)] assert found, "Couldn't find leaf in unique leaves list!" assert len(found) == 1 print(' LEAF({:3},{:3}),'.format(off[i], found[0]), end='') if n % 4 == 3: print('') if len(s.leaves) % 4 != 0: print('') print('},') print('{') for i, s in enumerate(sets): if duplicate[i]: continue print(' /* {} */'.format(names[i])) for n, leaf_num in enumerate(sorted(s.leaves.keys())): leaf = s.leaves[leaf_num] if n % 8 == 0: print(' ', end='') print(' 0x{:04x},'.format(leaf_num), end='') if n % 8 == 7: print('') if len(s.leaves) % 8 != 0: print('') print('},') # langIndices print('{') for i, s in enumerate(sets): fn = '{}.orth'.format(names[i]) print(' {}, /* {} */'.format(orth_entries[fn], names[i])) print('},') # langIndicesInv print('{') for i, k in enumerate(orth_entries.keys()): name = get_name(k) idx = names.index(name) print(' {}, /* {} */'.format(idx, name)) print('}') print('};\n') print('#define NUM_LANG_CHAR_SET {}'.format(len(sets))) num_lang_set_map = (len(sets) + 31) // 32; print('#define NUM_LANG_SET_MAP {}'.format(num_lang_set_map)) # Dump indices with country codes assert len(country) > 0 assert len(LangCountrySets) > 0 print('') print('static const FcChar32 fcLangCountrySets[][NUM_LANG_SET_MAP] = {') for k in sorted(LangCountrySets.keys()): langset_map = [0] * num_lang_set_map # initialise all zeros for entries_id in LangCountrySets[k]: langset_map[entries_id >> 5] |= (1 << (entries_id & 0x1f)) print(' {', end='') for v in langset_map: print(' 0x{:08x},'.format(v), end='') print(' }}, /* {} */'.format(k)) print('};\n') print('#define NUM_COUNTRY_SET {}\n'.format(len(LangCountrySets))) # Find ranges for each letter for faster searching # Dump sets start/finish for the fastpath print('static const FcLangCharSetRange fcLangCharSetRanges[] = {\n') for c in string.ascii_lowercase: # a-z start = 9999 stop = -1 for i, s in enumerate(sets): if names[i].startswith(c): start = min(start,i) stop = max(stop,i) print(' {{ {}, {} }}, /* {} */'.format(start, stop, c)) print('};\n') # And flush out the rest of the input file for line in tmpl_file: print(line, end='') sys.stdout.flush()