diff options
-rwxr-xr-x | script/create-table.py | 137 |
1 files changed, 137 insertions, 0 deletions
diff --git a/script/create-table.py b/script/create-table.py new file mode 100755 index 0000000..ba5620b --- /dev/null +++ b/script/create-table.py @@ -0,0 +1,137 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
#
# ##### END LICENSE BLOCK #####

import optparse
import os
import subprocess
import sys

script_path = os.path.relpath(__file__)

# Classes that count as "printable": a row containing at least one of them
# gets an extra comment line showing the decoded characters.
PRINTABLE_CLASSES = ('LET', 'NUM', 'SYM')


def check_iconv_charset(charset):
    """Exit with an error if `iconv` cannot convert from *charset*.

    `iconv` is run once on empty input, so any non-zero exit status comes
    from the conversion setup (typically an unsupported charset) and not
    from an illegal byte sequence.

    Args:
        charset: charset name passed to `iconv -f`.

    Exits with status 1 when `iconv` is not installed or reports an error.
    """
    try:
        call = subprocess.Popen(['iconv', '-f', charset, '-t', 'UTF-8'],
                                stdin=subprocess.PIPE,
                                stdout=subprocess.DEVNULL,
                                stderr=subprocess.PIPE)
    except FileNotFoundError:
        sys.stderr.write('Error: `iconv` is not installed.\n')
        sys.exit(1)
    (_, error) = call.communicate(input=b'')
    if call.returncode != 0:
        # stderr is captured here so that the message carries the real
        # diagnostic printed by `iconv`.
        message = error.decode('UTF-8', 'replace').strip()
        sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(message))
        sys.exit(1)


def get_utf8_char(bchar, charset, iconv):
    """Decode the byte string *bchar* from *charset* into Unicode text.

    Args:
        bchar: bytes object to decode (the script passes a single byte).
        charset: name of the source charset.
        iconv: when true, convert with the external `iconv` program;
            otherwise use Python's own codec.

    Returns:
        The decoded string, or None when *bchar* is not legal in *charset*
        (Python raised UnicodeDecodeError, or `iconv` produced no output).

    Exits with status 1 when `iconv` is requested but not installed.
    """
    if not iconv:
        try:
            return bchar.decode(charset)
        except UnicodeDecodeError:
            # Typical error:
            # UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in
            # position 0: character maps to <undefined>
            # It would mean an illegal character.
            return None

    try:
        call = subprocess.Popen(['iconv', '-f', charset, '-t', 'UTF-8'],
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.DEVNULL)
    except FileNotFoundError:
        sys.stderr.write('Error: `iconv` is not installed.\n')
        sys.exit(1)
    # A failing `iconv` is expected for illegal bytes; it shows up as empty
    # output, hence stderr is deliberately discarded.
    (uchar, _) = call.communicate(input=bchar)
    if not uchar:
        return None
    return uchar.decode('UTF-8')


def classify_char(char):
    """Return the table class of a decoded character.

    Args:
        char: decoded string, or None for an illegal byte.

    Returns:
        'ILL' (illegal), 'LET' (alphabetic), 'NUM' (digit), 'RET' (line
        feed or carriage return), 'SYM' (other printable) or 'CTR'
        (anything else).
    """
    if char is None:
        return 'ILL'
    if char.isalpha():
        return 'LET'
    if char.isdigit():
        return 'NUM'
    if char == '\n' or char == '\r':
        return 'RET'
    if char.isprintable():
        return 'SYM'
    return 'CTR'


def format_row(row, charset, use_iconv):
    """Return the output text for one row of 16 consecutive byte values.

    Args:
        row: high nibble of the bytes in the row, 0x0 to 0xF.
        charset: name of the source charset.
        use_iconv: forwarded to get_utf8_char().

    Returns:
        A string starting with a newline: the 16 comma-terminated classes
        and a `# <row>X` marker, followed, when the row has at least one
        printable character, by a comment line showing the characters.
    """
    # Decode every byte exactly once; with `iconv` each decode is a process.
    chars = [get_utf8_char(bytes([col + row * 0x10]), charset, use_iconv)
             for col in range(0x10)]
    classes = [classify_char(char) for char in chars]

    parts = ['\n ']
    parts.extend(cls + ',' for cls in classes)
    parts.append(' # {:X}X'.format(row))

    if any(cls in PRINTABLE_CLASSES for cls in classes):
        # The line has at least one printable character. Print in comment
        # for debugging.
        parts.append('\n #')
        for char, cls in zip(chars, classes):
            if cls in PRINTABLE_CLASSES:
                parts.append("'{}' ".format(char))
            else:
                parts.append(cls + ' ')
    return ''.join(parts)


def main():
    """Parse the command line and print the charset table on stdout."""
    usage = 'Usage: {} <CHARSET-NAME>\n' \
            '\nEx: `{} ISO-8859-15`'.format(script_path, script_path)

    description = "Internal tool to generate a charset table."
    cmdline = optparse.OptionParser(usage, description=description)
    (_, arguments) = cmdline.parse_args()
    if len(arguments) != 1:
        sys.stderr.write("Please choose exactly one charset as argument.\n")
        sys.exit(1)

    charset = arguments[0]

    use_iconv = False
    try:
        b' '.decode(charset)
    except LookupError:
        # Python has no codec of that name: fall back on `iconv`.
        use_iconv = True
    except UnicodeDecodeError:
        # The codec exists but cannot decode a lone space (multi-byte
        # encodings such as UTF-16); keep using Python's codec.
        pass

    if use_iconv:
        check_iconv_charset(charset)

    print('## Table generated by {} with {} ##'.format(script_path, 'iconv' if use_iconv else 'Python decode()'))

    # NOTE(review): the spacing inside the literals below is copied from the
    # reviewed text, where runs of spaces may have been collapsed; confirm
    # the column alignment against the upstream file.
    print('# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #')
    print('charmap = \\')
    sys.stdout.write('[')
    for row in range(0x10):
        sys.stdout.write(format_row(row, charset, use_iconv))
    sys.stdout.write('\n]')


if __name__ == '__main__':
    main()