summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJehan <jehan@girinstud.io>2022-12-20 12:03:19 +0100
committerJehan <jehan@girinstud.io>2022-12-20 12:03:19 +0100
commitc843d23a17eebaa69be56565c5963471d5f1295f (patch)
treeef486c940e8a00ebb73852e380a4f41c9c328163
parent419a971e6a9de966ea3a0b255bfd598a7617dc59 (diff)
script: new create-table script.
I wanted to add new tables for which I could find no listing anywhere, even though iconv has support for them (not core Python though), namely GEORGIAN-ACADEMY and GEORGIAN-PS. I could find info on these in the libiconv source (./lib/georgian_academy.h and ./lib/georgian_ps.h), but rather than trying to read these, I thought I should just go the other way around: get a table back from the return value of the iconv API (or Python decode() when relevant). So this script is able to generate tables in the format used under script/charsets/, from either Python decode() or iconv. It will be very useful!
-rwxr-xr-xscript/create-table.py137
1 files changed, 137 insertions, 0 deletions
diff --git a/script/create-table.py b/script/create-table.py
new file mode 100755
index 0000000..ba5620b
--- /dev/null
+++ b/script/create-table.py
@@ -0,0 +1,137 @@
+#!/bin/python3
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+import optparse
+import os
+import subprocess
+import sys
+
+# Path of this script relative to the current directory; it is shown in the
+# usage text.
+script_path = os.path.relpath(__file__)
+
+usage = 'Usage: {} <CHARSET-NAME>\n' \
+        '\nEx: `{} ISO-8859-15`'.format(script_path, script_path)
+
+description = "Internal tool to generate a charset table."
+cmdline = optparse.OptionParser(usage, description = description)
+# No option is declared, so only the positional arguments matter here.
+(options, charset) = cmdline.parse_args()
+if len(charset) != 1:
+    sys.stderr.write("Please choose exactly one charset as argument.\n")
+    # Use sys.exit() rather than the exit() helper: the latter is injected by
+    # the `site` module for interactive sessions and is not always defined.
+    sys.exit(1)
+
+charset = charset[0]
+
+# Prefer Python's own codecs. bytes.decode() raises LookupError for a charset
+# name Python does not know; in that case fall back to the `iconv` tool.
+use_iconv = False
+try:
+    b' '.decode(charset)
+except LookupError:
+    use_iconv = True
+
+# Charsets already confirmed to be supported by the installed `iconv`, so
+# that the support probe runs only once per charset.
+_iconv_checked_charsets = set()
+
+
+def _run_iconv(data, charset):
+    """Convert `data` (bytes) from `charset` to UTF-8 with the `iconv` tool.
+
+    Returns a `(returncode, stdout, stderr)` tuple; both streams are bytes.
+    Exits the script with status 1 when the `iconv` executable is not found.
+    """
+    try:
+        call = subprocess.Popen(['iconv', '-f', charset, '-t', 'UTF-8'],
+                                stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE)
+    except FileNotFoundError:
+        sys.stderr.write('Error: `iconv` is not installed.\n')
+        sys.exit(1)
+    (output, error) = call.communicate(input=data)
+    return (call.returncode, output, error)
+
+
+def _check_iconv_charset(charset):
+    """Exit with an error message unless `iconv` can convert from `charset`."""
+    if charset in _iconv_checked_charsets:
+        return
+    # With an empty input there is no byte that could be illegal, so a
+    # failure here is about the conversion itself (e.g. unknown charset).
+    (returncode, _, error) = _run_iconv(b'', charset)
+    if returncode != 0:
+        error = error.decode('UTF-8', 'replace').strip()
+        sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(error))
+        sys.exit(1)
+    _iconv_checked_charsets.add(charset)
+
+
+def get_utf8_char(bchar, charset, iconv):
+    """Decode the byte string `bchar` from `charset` into a Python string.
+
+    bchar:   bytes to decode (this script passes one byte at a time).
+    charset: name of the source charset.
+    iconv:   if true, convert with the `iconv` command line tool; otherwise
+             use Python's bytes.decode().
+
+    Returns the decoded string, or None when `bchar` cannot be decoded (with
+    `iconv`: when the tool produces no output). Exits the script with status
+    1 when `iconv` is requested but is missing or rejects `charset`.
+    """
+    if iconv:
+        # Without this probe an unsupported charset would be indistinguishable
+        # from 256 illegal bytes and silently yield a table full of ILL.
+        _check_iconv_charset(charset)
+        (_, uchar, _) = _run_iconv(bchar, charset)
+        return uchar.decode('UTF-8') if uchar else None
+
+    try:
+        return bchar.decode(charset)
+    except UnicodeDecodeError:
+        # Typical error:
+        # UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 0: character maps to <undefined>
+        # It would mean an illegal character.
+        return None
+
+def _char_class(char):
+    """Return the charmap class name written in the table for `char`.
+
+    `char` is a decoded string, or None for a byte that could not be decoded.
+    The result is one of ILL, LET, NUM, RET, SYM or CTR.
+    """
+    if char is None:
+        return 'ILL'
+    if char.isalpha():
+        return 'LET'
+    if char.isdigit():
+        return 'NUM'
+    if char == '\n' or char == '\r':
+        return 'RET'
+    if char.isprintable():
+        return 'SYM'
+    return 'CTR'
+
+
+def _char_preview(char):
+    """Return the text shown for `char` in the debugging comment of a row."""
+    if char is None:
+        return 'ILL '
+    if char == '\n' or char == '\r':
+        return 'RET '
+    if char.isalpha() or char.isdigit() or char.isprintable():
+        return "'{}' ".format(char)
+    return 'CTR '
+
+
+print('## Table generated by {} with {} ##'.format(script_path, 'iconv' if use_iconv else 'Python decode()'))
+
+print('# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #')
+print('charmap = \\')
+sys.stdout.write('[')
+for row in range(0x10):
+    # Decode the 16 bytes of this row only once: the table row and its
+    # debugging comment are both built from the same results, which halves
+    # the number of `iconv` processes spawned when iconv is in use.
+    chars = [get_utf8_char(bytes([col + row * 0x10]), charset, use_iconv)
+             for col in range(0x10)]
+    classes = [_char_class(char) for char in chars]
+
+    sys.stdout.write('\n ')
+    sys.stdout.write(''.join(name + ',' for name in classes))
+    # Trailing comment with the high nibble shared by the bytes of the row.
+    sys.stdout.write(' # {:X}X'.format(row))
+
+    if any(name in ('LET', 'NUM', 'SYM') for name in classes):
+        # The line has at least one printable character. Print in comment for
+        # debugging.
+        sys.stdout.write('\n #')
+        sys.stdout.write(''.join(_char_preview(char) for char in chars))
+sys.stdout.write('\n]')