author     Jehan <jehan@girinstud.io>    2022-12-18 17:13:17 +0100
committer  Jehan <jehan@girinstud.io>    2022-12-18 17:23:34 +0100
commit     db836fad63e8a89e02d27999a72d61dde37617e8 (patch)
tree       01936717a656d99652349428fe6c60d8774d7bd2
parent     d6cab28fb47ecd160b1d2d183e848588bb20ff12 (diff)
script, src: generate more code for language and sequence model listing.
Right now, each time we add a new language or new charset support,
there are too many pieces of code to remember to edit. The script
script/BuildLangModel.py will now take care of the main parts: listing
the sequence models, listing the generic language models, and computing
the counts for each listing.

Furthermore, the script now ends with a TODO list of the parts which
still have to be edited manually (2 functions and a CMakeLists).

Finally, the script now accepts a list of languages rather than having
to be run for one language at a time. It also accepts 2 special codes:
"none", which retrains none of the language models but regenerates the
generated listings; and "all", which retrains all models (useful in
particular when the model format or usage changes and everything must
be regenerated).
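For instance, `script/BuildLangModel.py fr it` would retrain two models,
while `script/BuildLangModel.py none` would only refresh the generated
listings. As a rough illustration, here is a condensed Python sketch of
the new language-list resolution (paraphrased from the patch below, not
the verbatim code; the real script parses its options with optparse
first):

    import sys

    # script/support.txt lists one supported language code per line.
    langs = sys.argv[1:]
    with open("script/support.txt") as f:
        all_langs = [l.strip() for l in f if l.strip()]

    if len(langs) == 1 and langs[0].lower() == 'none':
        langs = []           # retrain nothing; only regenerate listings
    elif len(langs) == 1 and langs[0].lower() == 'all':
        langs = all_langs    # retrain every supported model

    for lang in langs:
        if lang not in all_langs:
            sys.exit("Error: unsupported lang: {}".format(lang))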
-rwxr-xr-x  script/BuildLangModel.py | 1490
-rw-r--r--  script/header-template.cpp | 3
-rw-r--r--  script/support.txt | 36
-rw-r--r--  src/LangModels/LangArabicModel.cpp | 2
-rw-r--r--  src/LangModels/LangBelarusianModel.cpp | 2
-rw-r--r--  src/LangModels/LangBulgarianModel.cpp | 2
-rw-r--r--  src/LangModels/LangCroatianModel.cpp | 2
-rw-r--r--  src/LangModels/LangCzechModel.cpp | 2
-rw-r--r--  src/LangModels/LangDanishModel.cpp | 2
-rw-r--r--  src/LangModels/LangEnglishModel.cpp | 2
-rw-r--r--  src/LangModels/LangEsperantoModel.cpp | 2
-rw-r--r--  src/LangModels/LangEstonianModel.cpp | 2
-rw-r--r--  src/LangModels/LangFinnishModel.cpp | 2
-rw-r--r--  src/LangModels/LangFrenchModel.cpp | 2
-rw-r--r--  src/LangModels/LangGermanModel.cpp | 2
-rw-r--r--  src/LangModels/LangGreekModel.cpp | 2
-rw-r--r--  src/LangModels/LangHebrewModel.cpp | 2
-rw-r--r--  src/LangModels/LangHindiModel.cpp | 2
-rw-r--r--  src/LangModels/LangHungarianModel.cpp | 2
-rw-r--r--  src/LangModels/LangIrishModel.cpp | 2
-rw-r--r--  src/LangModels/LangItalianModel.cpp | 2
-rw-r--r--  src/LangModels/LangLatvianModel.cpp | 2
-rw-r--r--  src/LangModels/LangLithuanianModel.cpp | 2
-rw-r--r--  src/LangModels/LangMacedonianModel.cpp | 2
-rw-r--r--  src/LangModels/LangMalteseModel.cpp | 2
-rw-r--r--  src/LangModels/LangNorwegianModel.cpp | 2
-rw-r--r--  src/LangModels/LangPolishModel.cpp | 2
-rw-r--r--  src/LangModels/LangPortugueseModel.cpp | 2
-rw-r--r--  src/LangModels/LangRomanianModel.cpp | 2
-rw-r--r--  src/LangModels/LangRussianModel.cpp | 2
-rw-r--r--  src/LangModels/LangSerbianModel.cpp | 2
-rw-r--r--  src/LangModels/LangSlovakModel.cpp | 2
-rw-r--r--  src/LangModels/LangSloveneModel.cpp | 2
-rw-r--r--  src/LangModels/LangSpanishModel.cpp | 2
-rw-r--r--  src/LangModels/LangSwedishModel.cpp | 2
-rw-r--r--  src/LangModels/LangThaiModel.cpp | 2
-rw-r--r--  src/LangModels/LangTurkishModel.cpp | 2
-rw-r--r--  src/LangModels/LangUkrainianModel.cpp | 2
-rw-r--r--  src/LangModels/LangVietnameseModel.cpp | 2
-rw-r--r--  src/nsLanguageDetector-generated.h | 80
-rw-r--r--  src/nsLanguageDetector.h | 37
-rw-r--r--  src/nsMBCSGroupProber.h | 5
-rw-r--r--  src/nsSBCSGroupProber.cpp | 24
-rw-r--r--  src/nsSBCSGroupProber.h | 15
-rw-r--r--  src/nsSBCharSetProber-generated.h | 194
-rw-r--r--  src/nsSBCharSetProber.h | 150
46 files changed, 1211 insertions(+), 895 deletions(-)
diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py
index 1c94a97..1c95add 100755
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@@ -72,706 +72,816 @@ cmdline.add_option('--max-depth',
dest = 'max_depth', default = 2)
(options, langs) = cmdline.parse_args()
if len(langs) < 1:
- print("Please select at least one language code.\n")
- exit(1)
-if len(langs) > 1:
- print("This script is meant to generate data for one language at a time.\n")
- exit(1)
-lang = langs[0]
-
-# Load the language data.
-sys_path_backup = sys.path
+ sys.stderr.write("Please select at least one language code. ")
+ sys.stderr.write("You may also choose 'all' or 'none'.\n")
+ exit(1)
+
current_dir = os.path.dirname(os.path.realpath(__file__))
-sys.path = [current_dir + '/langs']
-
-try:
- lang = importlib.import_module(lang.lower())
-except ImportError:
- print('Unknown language code "{}": '
- 'file "langs/{}.py" does not exist.'.format(lang, lang.lower()))
- exit(1)
-sys.path = sys_path_backup
-
-charsets = charsets.db.load(lang.charsets)
-
-if not hasattr(lang, 'start_pages') or lang.start_pages is None or \
- lang.start_pages == []:
- # Let's start with the main page, assuming it should have links
- # to relevant pages. In locale wikipedia, this page is usually redirected
- # to a relevant page.
- print("Warning: no `start_pages` set for '{}'. Using ['Main_Page'].\n"
- " If you don't get good data, it is advised to set a "
- "start_pages` variable yourself.".format(lang.code))
- lang.start_pages = ['Main_Page']
-if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
- lang.wikipedia_code = lang.code
-if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None:
- lang.clean_wikipedia_content = None
-if hasattr(lang, 'case_mapping'):
- lang.case_mapping = bool(lang.case_mapping)
-else:
- lang.case_mapping = False
-if not hasattr(lang, 'custom_case_mapping'):
- lang.custom_case_mapping = None
-if not hasattr(lang, 'alphabet') or lang.alphabet is None:
- lang.alphabet = None
-if not hasattr(lang, 'alphabet_mapping') or lang.alphabet_mapping is None:
- lang.alphabet_mapping = None
-if not hasattr(lang, 'unicode_ranges') or lang.unicode_ranges is None:
- lang.unicode_ranges = None
-if not hasattr(lang, 'frequent_ranges') or lang.frequent_ranges is None:
- if lang.unicode_ranges is not None:
- lang.frequent_ranges = lang.unicode_ranges
- else:
- lang.frequent_ranges = None
-
-def local_lowercase(text, lang):
- lowercased = ''
- for l in text:
- if lang.custom_case_mapping is not None and \
- l in lang.custom_case_mapping:
- lowercased += lang.custom_case_mapping[l]
- elif l.isupper() and \
- lang.case_mapping and \
- len(unicodedata.normalize('NFC', l.lower())) == 1:
- lowercased += l.lower()
- else:
- lowercased += l
- return lowercased
-if lang.use_ascii:
- if lang.alphabet is None:
- lang.alphabet = [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
- else:
- # Allowing to provide an alphabet in string format rather than list.
- lang.alphabet = list(lang.alphabet)
- lang.alphabet += [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
-if lang.alphabet is not None:
- # Allowing to provide an alphabet in string format rather than list.
- lang.alphabet = list(lang.alphabet)
- if lang.case_mapping or lang.custom_case_mapping is not None:
- lang.alphabet = [local_lowercase(l, lang) for l in lang.alphabet]
- #alphabet = []
- #for l in lang.alphabet:
- #if l.isupper() and \
- #lang.custom_case_mapping is not None and \
- #l in lang.custom_case_mapping:
- #alphabet.append(lang.custom_case_mapping[l])
- #elif l.isupper() and \
- #lang.case_mapping and \
- #len(unicodedata.normalize('NFC', l.lower())) == 1:
- #alphabet.append(l.lower())
- #else:
- #alphabet.append(l)
- lang.alphabet = list(set(lang.alphabet))
-
-if lang.alphabet_mapping is not None:
- alphabet_mapping = {}
- for char in lang.alphabet_mapping:
+with open(os.path.join(current_dir, "support.txt")) as f:
+ all_langs = f.readlines()
+all_langs = [ l.strip() for l in all_langs if l.strip() != '' ]
+
+if len(langs) == 1:
+ if langs[0].lower() == 'none':
+ langs = []
+ elif langs[0].lower() == 'all':
+ langs = all_langs
+
+abort = False
+for lang in langs:
+ if lang not in all_langs:
+ abort = True
+ sys.stderr.write("Error: unsupported lang: {}\n".format(lang))
+if abort:
+ sys.stderr.write("Info: new langs must be added in 'script/support.txt'.\n")
+ exit(1)
+
+generated_files = []
+
+for lang_arg in langs:
+ lang_arg = lang_arg.lower()
+
+ # Load the language data.
+ sys_path_backup = sys.path
+ sys.path = [current_dir + '/langs']
+ try:
+ lang = importlib.import_module(lang_arg)
+ except ImportError:
+ sys.stderr.write('Unknown language code "{}": '
+                         'file "langs/{}.py" does not exist.\n'.format(lang_arg, lang_arg))
+ exit(1)
+ sys.path = sys_path_backup
+
+    print("Processing language data for {} (langs/{}.py):\n".format(lang_arg, lang_arg))
+
+ lang_charsets = charsets.db.load(lang.charsets)
+
+ if not hasattr(lang, 'start_pages') or lang.start_pages is None or \
+ lang.start_pages == []:
+ # Let's start with the main page, assuming it should have links
+        # to relevant pages. In the localized Wikipedia, this page is usually redirected
+ # to a relevant page.
+ sys.stderr.write("Warning: no `start_pages` set for '{}'. Using ['Main_Page'].\n"
+ " If you don't get good data, it is advised to set a "
+                         "`start_pages` variable yourself.\n".format(lang.code))
+ lang.start_pages = ['Main_Page']
+ if not hasattr(lang, 'wikipedia_code') or lang.wikipedia_code is None:
+ lang.wikipedia_code = lang.code
+ if not hasattr(lang, 'clean_wikipedia_content') or lang.clean_wikipedia_content is None:
+ lang.clean_wikipedia_content = None
+ if hasattr(lang, 'case_mapping'):
+ lang.case_mapping = bool(lang.case_mapping)
+ else:
+ lang.case_mapping = False
+ if not hasattr(lang, 'custom_case_mapping'):
+ lang.custom_case_mapping = None
+ if not hasattr(lang, 'alphabet') or lang.alphabet is None:
+ lang.alphabet = None
+ if not hasattr(lang, 'alphabet_mapping') or lang.alphabet_mapping is None:
+ lang.alphabet_mapping = None
+ if not hasattr(lang, 'unicode_ranges') or lang.unicode_ranges is None:
+ lang.unicode_ranges = None
+ if not hasattr(lang, 'frequent_ranges') or lang.frequent_ranges is None:
+ if lang.unicode_ranges is not None:
+ lang.frequent_ranges = lang.unicode_ranges
+ else:
+ lang.frequent_ranges = None
+
+ def local_lowercase(text, lang):
+ lowercased = ''
+ for l in text:
+ if lang.custom_case_mapping is not None and \
+ l in lang.custom_case_mapping:
+ lowercased += lang.custom_case_mapping[l]
+ elif l.isupper() and \
+ lang.case_mapping and \
+ len(unicodedata.normalize('NFC', l.lower())) == 1:
+ lowercased += l.lower()
+ else:
+ lowercased += l
+ return lowercased
+
+ if lang.use_ascii:
+ if lang.alphabet is None:
+ lang.alphabet = [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
+ else:
+ # Allowing to provide an alphabet in string format rather than list.
+ lang.alphabet = list(lang.alphabet)
+ lang.alphabet += [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
+ if lang.alphabet is not None:
# Allowing to provide an alphabet in string format rather than list.
- for alt_char in list(lang.alphabet_mapping[char]):
- # While it's easier to write from main character to
- # equivalencies in the language file, we reverse the mapping
- # for simpler usage.
- if lang.case_mapping or lang.custom_case_mapping is not None:
- alphabet_mapping[alt_char] = local_lowercase(char, lang)
- else:
- alphabet_mapping[alt_char] = char
- lang.alphabet_mapping = alphabet_mapping
-
-def normalize_codepoint_ranges(input_range):
- output_range = []
- if input_range is not None:
- for start, end in input_range:
- # Allow to write down characters rather than unicode values.
- if isinstance(start, str):
- start = ord(start)
- if isinstance(end, str):
- end = ord(end)
- if not isinstance(start, int) or not isinstance(end, int):
- sys.stderr.write("Expected unicode range in char or int: {}-{}.\n".format(start, end))
- if start > end:
- sys.stderr.write("Wrong unicode range: {}-{}.\n".format(start, end))
- else:
- output_range += [(start, end)]
- if len(output_range) == 0:
- output_range = None
- return output_range
-
-lang.unicode_ranges = normalize_codepoint_ranges(lang.unicode_ranges)
-lang.frequent_ranges = normalize_codepoint_ranges(lang.frequent_ranges)
-
-# Starting processing.
-wikipedia.set_lang(lang.wikipedia_code)
-
-visited_pages = []
-
-# The full list of letter characters.
-# The key is the unicode codepoint,
-# and the value is the occurrence count.
-characters = {}
-# Sequence of letters.
-# The key is the couple (char1, char2) in unicode codepoint,
-# the value is the occurrence count.
-sequences = {}
-prev_char = None
-
-def process_text(content, lang):
- global charsets
- global characters
- global sequences
- global prev_char
-
- if lang.clean_wikipedia_content is not None:
- content = lang.clean_wikipedia_content(content)
- # Clean out the Wikipedia syntax for titles.
- content = re.sub(r'(=+) *([^=]+) *\1',
- r'\2', content)
- # Clean multiple spaces. Newlines and such are normalized to spaces,
- # since they have basically a similar role in the purpose of uchardet.
- content = re.sub(r'\s+', ' ', content)
-
- if lang.case_mapping or lang.custom_case_mapping is not None:
- content = local_lowercase(content, lang)
-
- # In python 3, strings are UTF-8.
- # Looping through them return expected characters.
- for char in content:
- # Map to main equivalent character.
- if lang.alphabet_mapping is not None and \
- char in lang.alphabet_mapping:
- char = lang.alphabet_mapping[char]
-
- unicode_value = ord(char)
- is_letter = False
- if unicode_value in characters:
- characters[unicode_value] += 1
- is_letter = True
- elif lang.unicode_ranges is not None:
- for start, end in lang.unicode_ranges:
- if unicode_value >= start and unicode_value <= end:
- characters[unicode_value] = 1
- is_letter = True
- break
- else:
- # We save the character if it is at least in one of the
- # language encodings and its not a special character.
- for charset in charsets:
- # Does the character exist in the charset?
- try:
- codepoint = char.encode(charset, 'ignore')
- except LookupError:
- # unknown encoding. Use iconv from command line instead.
- try:
- call = subprocess.Popen(['iconv', '-f', 'UTF-8', '-t', charset],
- stdin=subprocess.PIPE, stdout=subprocess.PIPE,
- stderr=subprocess.DEVNULL)
- if call.poll() is not None:
- (_, error) = call.communicate(input='')
- print('Error: `iconv` ended with error "{}".\n'.format(error))
- exit(1)
- (codepoint, _) = call.communicate(input=char.encode('UTF-8'))
- except FileNotFoundError:
- print('Error: "{}" is not a supported charset by python and `iconv` is not installed.\n')
- exit(1)
-
- if codepoint == b'':
- continue
- # ord() is said to return the unicode codepoint.
- # But it turns out it also gives the codepoint for other
- # charsets if I turn the string to encoded bytes first.
- # Not sure if that is a bug or expected.
- codepoint = ord(codepoint)
- if charsets[charset].charmap[codepoint] == LET:
- characters[unicode_value] = 1
- is_letter = True
- break
- if is_letter:
- if prev_char is not None:
- if (prev_char, unicode_value) in sequences:
- sequences[(prev_char, unicode_value)] += 1
- else:
- sequences[(prev_char, unicode_value)] = 1
- prev_char = unicode_value
+ lang.alphabet = list(lang.alphabet)
+ if lang.case_mapping or lang.custom_case_mapping is not None:
+ lang.alphabet = [local_lowercase(l, lang) for l in lang.alphabet]
+ #alphabet = []
+ #for l in lang.alphabet:
+ #if l.isupper() and \
+ #lang.custom_case_mapping is not None and \
+ #l in lang.custom_case_mapping:
+ #alphabet.append(lang.custom_case_mapping[l])
+ #elif l.isupper() and \
+ #lang.case_mapping and \
+ #len(unicodedata.normalize('NFC', l.lower())) == 1:
+ #alphabet.append(l.lower())
+ #else:
+ #alphabet.append(l)
+ lang.alphabet = list(set(lang.alphabet))
+
+ if lang.alphabet_mapping is not None:
+ alphabet_mapping = {}
+ for char in lang.alphabet_mapping:
+ # Allowing to provide an alphabet in string format rather than list.
+ for alt_char in list(lang.alphabet_mapping[char]):
+ # While it's easier to write from main character to
+ # equivalencies in the language file, we reverse the mapping
+ # for simpler usage.
+ if lang.case_mapping or lang.custom_case_mapping is not None:
+ alphabet_mapping[alt_char] = local_lowercase(char, lang)
+ else:
+ alphabet_mapping[alt_char] = char
+ lang.alphabet_mapping = alphabet_mapping
+
+ def normalize_codepoint_ranges(input_range):
+ output_range = []
+ if input_range is not None:
+ for start, end in input_range:
+ # Allow to write down characters rather than unicode values.
+ if isinstance(start, str):
+ start = ord(start)
+ if isinstance(end, str):
+ end = ord(end)
+ if not isinstance(start, int) or not isinstance(end, int):
+ sys.stderr.write("Expected unicode range in char or int: {}-{}.\n".format(start, end))
+ if start > end:
+ sys.stderr.write("Wrong unicode range: {}-{}.\n".format(start, end))
+ else:
+ output_range += [(start, end)]
+ if len(output_range) == 0:
+ output_range = None
+ return output_range
+
+ lang.unicode_ranges = normalize_codepoint_ranges(lang.unicode_ranges)
+ lang.frequent_ranges = normalize_codepoint_ranges(lang.frequent_ranges)
+
+ # Starting processing.
+ wikipedia.set_lang(lang.wikipedia_code)
+
+ visited_pages = []
+
+ # The full list of letter characters.
+ # The key is the unicode codepoint,
+ # and the value is the occurrence count.
+ characters = {}
+ # Sequence of letters.
+ # The key is the couple (char1, char2) in unicode codepoint,
+ # the value is the occurrence count.
+ sequences = {}
+ prev_char = None
+
+ def process_text(content, lang):
+ global lang_charsets
+ global characters
+ global sequences
+ global prev_char
+
+ if lang.clean_wikipedia_content is not None:
+ content = lang.clean_wikipedia_content(content)
+ # Clean out the Wikipedia syntax for titles.
+ content = re.sub(r'(=+) *([^=]+) *\1',
+ r'\2', content)
+ # Clean multiple spaces. Newlines and such are normalized to spaces,
+ # since they have basically a similar role in the purpose of uchardet.
+ content = re.sub(r'\s+', ' ', content)
+
+ if lang.case_mapping or lang.custom_case_mapping is not None:
+ content = local_lowercase(content, lang)
+
+        # In Python 3, strings are Unicode.
+        # Looping through them returns the expected characters.
+ for char in content:
+ # Map to main equivalent character.
+ if lang.alphabet_mapping is not None and \
+ char in lang.alphabet_mapping:
+ char = lang.alphabet_mapping[char]
+
+ unicode_value = ord(char)
+ is_letter = False
+ if unicode_value in characters:
+ characters[unicode_value] += 1
+ is_letter = True
+ elif lang.unicode_ranges is not None:
+ for start, end in lang.unicode_ranges:
+ if unicode_value >= start and unicode_value <= end:
+ characters[unicode_value] = 1
+ is_letter = True
+ break
+ else:
+ # We save the character if it is at least in one of the
+                # language encodings and it's not a special character.
+ for charset in lang_charsets:
+ # Does the character exist in the charset?
+ try:
+ codepoint = char.encode(charset, 'ignore')
+ except LookupError:
+                    # Unknown encoding. Use iconv from the command line instead.
+ try:
+ call = subprocess.Popen(['iconv', '-f', 'UTF-8', '-t', charset],
+ stdin=subprocess.PIPE, stdout=subprocess.PIPE,
+ stderr=subprocess.DEVNULL)
+ if call.poll() is not None:
+ (_, error) = call.communicate(input='')
+ sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(error))
+ exit(1)
+ (codepoint, _) = call.communicate(input=char.encode('UTF-8'))
+ except FileNotFoundError:
+                        sys.stderr.write('Error: "{}" is not a charset supported by Python and `iconv` is not installed.\n'.format(charset))
+ exit(1)
+
+ if codepoint == b'':
+ continue
+                    # ord() on a length-1 bytes object returns the byte
+                    # value which, for a single-byte charset, is the
+                    # codepoint in that charset.
+ codepoint = ord(codepoint)
+ if lang_charsets[charset].charmap[codepoint] == LET:
+ characters[unicode_value] = 1
+ is_letter = True
+ break
+ if is_letter:
+ if prev_char is not None:
+ if (prev_char, unicode_value) in sequences:
+ sequences[(prev_char, unicode_value)] += 1
+ else:
+ sequences[(prev_char, unicode_value)] = 1
+ prev_char = unicode_value
+ else:
+ prev_char = None
+
+ def visit_pages(titles, depth, lang, logfd):
+ global visited_pages
+ global options
+
+ if len(titles) == 0:
+ return
+
+ next_titles = []
+ if options.max_page is not None:
+ max_titles = int(options.max_page/(options.max_depth * options.max_depth))
+ else:
+ max_titles = sys.maxsize
+ for title in titles:
+ if options.max_page is not None and \
+ len(visited_pages) > options.max_page:
+ return
+ if title in visited_pages:
+ continue
+
+ # Ugly hack skipping internal pages
+ if 'wiki' in title or 'Wiki' in title:
+            sys.stderr.write('Skipping {}\n'.format(title))
+ continue
+
+ visited_pages += [title]
+ try:
+ page = wikipedia.page(title, auto_suggest=False)
+ except (wikipedia.exceptions.PageError,
+ wikipedia.exceptions.DisambiguationError) as error:
+ # Let's just discard a page when I get an exception.
+ sys.stderr.write("Discarding page {}: {}\n".format(title, error))
+ continue
+ logfd.write("\n{} (revision {})".format(title, page.revision_id))
+ logfd.flush()
+
+ process_text(page.content, lang)
+ try:
+ links = page.links
+ random.shuffle(links)
+ if len(links) > max_titles:
+ links = links[:max_titles]
+ next_titles += links
+ except KeyError:
+ pass
+
+ if depth >= options.max_depth:
+ return
+
+ random.shuffle(next_titles)
+ visit_pages (next_titles, depth + 1, lang, logfd)
+
+ language_c = lang.name.replace('-', '_').title()
+ build_log = current_dir + '/BuildLangModelLogs/Lang{}Model.log'.format(language_c)
+ logfd = open(build_log, 'w')
+ logfd.write('= Logs of language model for {} ({}) =\n'.format(lang.name, lang.code))
+ logfd.write('\n- Generated by {}'.format(os.path.basename(__file__)))
+ logfd.write('\n- Started: {}'.format(str(datetime.datetime.now())))
+ logfd.write('\n- Maximum depth: {}'.format(options.max_depth))
+ if options.max_page is not None:
+ logfd.write('\n- Max number of pages: {}'.format(options.max_page))
+ logfd.write('\n\n== Parsed pages ==\n')
+ logfd.flush()
+ try:
+ visit_pages(lang.start_pages, 0, lang, logfd)
+ except requests.exceptions.ConnectionError:
+ sys.stderr.write('Error: connection to Wikipedia failed. Aborting\n')
+ exit(1)
+ logfd.write('\n\n== End of Parsed pages ==')
+ logfd.write('\n\n- Wikipedia parsing ended at: {}\n'.format(str(datetime.datetime.now())))
+ logfd.flush()
+
+ ########### CHARACTERS ###########
+
+ # Character ratios.
+ ratios = {}
+ n_char = len(characters)
+ occurrences = sum(characters.values())
+
+ logfd.write("\n{} characters appeared {} times.\n".format(n_char, occurrences))
+ for char in characters:
+ ratios[char] = characters[char] / occurrences
+ #logfd.write("Character '{}' usage: {} ({} %)\n".format(chr(char),
+ # characters[char],
+ # ratios[char] * 100))
+
+ sorted_ratios = sorted(ratios.items(), key=operator.itemgetter(1),
+ reverse=True)
+ # Accumulated ratios of the frequent chars.
+ accumulated_ratios = 0
+
+ # If there is no alphabet defined, we just use the first 64 letters, which was
+ # the original default.
+ # If there is an alphabet, we make sure all the alphabet characters are in the
+ # frequent list, and we stop then. There may therefore be more or less than
+ # 64 frequent characters depending on the language.
+ logfd.write('\nMost Frequent characters:')
+ very_freq_count = 0
+ very_freq_ratio = 0
+ if lang.alphabet is None and lang.frequent_ranges is None:
+ freq_count = min(64, len(sorted_ratios))
+ for order, (char, ratio) in enumerate(sorted_ratios):
+ if order >= freq_count:
+ break
+ logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
+ accumulated_ratios += ratio
+ if very_freq_ratio < 0.4:
+ very_freq_count += 1
+ very_freq_ratio += ratio
+ elif lang.alphabet is not None:
+ freq_count = 0
+ for order, (char, ratio) in enumerate(sorted_ratios):
+ if len(lang.alphabet) == 0:
+ break
+ if chr(char) in lang.alphabet:
+ lang.alphabet.remove(chr(char))
+ logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
+ accumulated_ratios += ratio
+ freq_count += 1
+ if very_freq_ratio < 0.4:
+ very_freq_count += 1
+ very_freq_ratio += ratio
+ else:
+ if len(lang.alphabet) > 0:
+ sys.stderr.write("Error: alphabet characters are absent from data collection"
+ "\n Please check the configuration or the data."
+ "\n Missing characters: {}".format(", ".join(lang.alphabet)))
+ exit(1)
+ elif lang.frequent_ranges is not None:
+ # How many characters in the frequent range?
+ frequent_ranges_size = 0
+ for start, end in lang.frequent_ranges:
+ frequent_ranges_size += end - start + 1
+
+ # Keep ratio for at least all the characters inside the frequent
+ # ranges.
+ freq_count = 0
+ for order, (char, ratio) in enumerate(sorted_ratios):
+ for start, end in lang.frequent_ranges:
+ if char >= start and char <= end:
+ freq_count += 1
+ accumulated_ratios += ratio
+ logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
+ frequent_ranges_size -= 1
+ break
else:
- prev_char = None
-
-def visit_pages(titles, depth, lang, logfd):
- global visited_pages
- global options
-
- if len(titles) == 0:
- return
+ # A frequent character in the non-frequent range.
+ logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
+ freq_count += 1
+ accumulated_ratios += ratio
- next_titles = []
- if options.max_page is not None:
- max_titles = int(options.max_page/(options.max_depth * options.max_depth))
- else:
- max_titles = sys.maxsize
- for title in titles:
- if options.max_page is not None and \
- len(visited_pages) > options.max_page:
- return
- if title in visited_pages:
- continue
-
- # Ugly hack skipping internal pages
- if 'wiki' in title or 'Wiki' in title:
- print('Skipping', title)
- continue
-
- visited_pages += [title]
- try:
- page = wikipedia.page(title, auto_suggest=False)
- except (wikipedia.exceptions.PageError,
- wikipedia.exceptions.DisambiguationError) as error:
- # Let's just discard a page when I get an exception.
- print("Discarding page {}: {}\n".format(title, error))
- continue
- logfd.write("\n{} (revision {})".format(title, page.revision_id))
- logfd.flush()
-
- process_text(page.content, lang)
- try:
- links = page.links
- random.shuffle(links)
- if len(links) > max_titles:
- links = links[:max_titles]
- next_titles += links
- except KeyError:
- pass
-
- if depth >= options.max_depth:
- return
-
- random.shuffle(next_titles)
- visit_pages (next_titles, depth + 1, lang, logfd)
-
-language_c = lang.name.replace('-', '_').title()
-build_log = current_dir + '/BuildLangModelLogs/Lang{}Model.log'.format(language_c)
-logfd = open(build_log, 'w')
-logfd.write('= Logs of language model for {} ({}) =\n'.format(lang.name, lang.code))
-logfd.write('\n- Generated by {}'.format(os.path.basename(__file__)))
-logfd.write('\n- Started: {}'.format(str(datetime.datetime.now())))
-logfd.write('\n- Maximum depth: {}'.format(options.max_depth))
-if options.max_page is not None:
- logfd.write('\n- Max number of pages: {}'.format(options.max_page))
-logfd.write('\n\n== Parsed pages ==\n')
-logfd.flush()
-try:
- visit_pages(lang.start_pages, 0, lang, logfd)
-except requests.exceptions.ConnectionError:
- print('Error: connection to Wikipedia failed. Aborting\n')
- exit(1)
-logfd.write('\n\n== End of Parsed pages ==')
-logfd.write('\n\n- Wikipedia parsing ended at: {}\n'.format(str(datetime.datetime.now())))
-logfd.flush()
-
-########### CHARACTERS ###########
-
-# Character ratios.
-ratios = {}
-n_char = len(characters)
-occurrences = sum(characters.values())
-
-logfd.write("\n{} characters appeared {} times.\n".format(n_char, occurrences))
-for char in characters:
- ratios[char] = characters[char] / occurrences
- #logfd.write("Character '{}' usage: {} ({} %)\n".format(chr(char),
- # characters[char],
- # ratios[char] * 100))
-
-sorted_ratios = sorted(ratios.items(), key=operator.itemgetter(1),
- reverse=True)
-# Accumulated ratios of the frequent chars.
-accumulated_ratios = 0
-
-# If there is no alphabet defined, we just use the first 64 letters, which was
-# the original default.
-# If there is an alphabet, we make sure all the alphabet characters are in the
-# frequent list, and we stop then. There may therefore be more or less than
-# 64 frequent characters depending on the language.
-logfd.write('\nMost Frequent characters:')
-very_freq_count = 0
-very_freq_ratio = 0
-if lang.alphabet is None and lang.frequent_ranges is None:
- freq_count = min(64, len(sorted_ratios))
- for order, (char, ratio) in enumerate(sorted_ratios):
- if order >= freq_count:
- break
- logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
- accumulated_ratios += ratio
- if very_freq_ratio < 0.4:
- very_freq_count += 1
- very_freq_ratio += ratio
-elif lang.alphabet is not None:
- freq_count = 0
- for order, (char, ratio) in enumerate(sorted_ratios):
- if len(lang.alphabet) == 0:
- break
- if chr(char) in lang.alphabet:
- lang.alphabet.remove(chr(char))
- logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
- accumulated_ratios += ratio
- freq_count += 1
if very_freq_ratio < 0.4:
very_freq_count += 1
very_freq_ratio += ratio
+
+ if frequent_ranges_size <= 0:
+ break
+
+ low_freq_order = freq_count - 1
+ low_freq_ratio = 0
+ for back_order, (char, ratio) in enumerate(reversed(sorted_ratios[:freq_count])):
+ if low_freq_ratio < 0.03:
+ low_freq_ratio += ratio
+ low_freq_order -= 1
else:
- if len(lang.alphabet) > 0:
- print("Error: alphabet characters are absent from data collection"
- "\n Please check the configuration or the data."
- "\n Missing characters: {}".format(", ".join(lang.alphabet)))
- exit(1)
-elif lang.frequent_ranges is not None:
- # How many characters in the frequent range?
- frequent_ranges_size = 0
- for start, end in lang.frequent_ranges:
- frequent_ranges_size += end - start + 1
-
- # Keep ratio for at least all the characters inside the frequent
- # ranges.
- freq_count = 0
- for order, (char, ratio) in enumerate(sorted_ratios):
- for start, end in lang.frequent_ranges:
- if char >= start and char <= end:
- freq_count += 1
- accumulated_ratios += ratio
- logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
- frequent_ranges_size -= 1
+ break
+
+ logfd.write("\n\nThe first {} characters have an accumulated ratio of {}.\n".format(freq_count, accumulated_ratios))
+ logfd.write("The first {} characters have an accumulated ratio of {}.\n".format(very_freq_count, very_freq_ratio))
+ logfd.write("All characters whose order is over {} have an accumulated ratio of {}.\n".format(low_freq_order, low_freq_ratio))
+
+ with open(current_dir + '/header-template.cpp', 'r') as header_fd:
+ c_code = header_fd.read()
+
+ c_code += '\n#include "../nsSBCharSetProber.h"'
+ c_code += '\n#include "../nsSBCharSetProber-generated.h"'
+ c_code += '\n#include "../nsLanguageDetector.h"\n'
+ c_code += '\n#include "../nsLanguageDetector-generated.h"\n'
+ c_code += '\n/********* Language model for: {} *********/\n\n'.format(lang.name)
+ c_code += '/**\n * Generated by {}\n'.format(os.path.basename(__file__))
+ c_code += ' * On: {}\n'.format(str(datetime.datetime.now()))
+ c_code += ' **/\n'
+
+ c_code += \
+ """
+ /* Character Mapping Table:
+ * ILL: illegal character.
+ * CTR: control character specific to the charset.
+ * RET: carriage/return.
+ * SYM: symbol (punctuation) that does not belong to word.
+ * NUM: 0 - 9.
+ *
+ * Other characters are ordered by probabilities
+ * (0 is the most common character in the language).
+ *
+ * Orders are generic to a language. So the codepoint with order X in
+ * CHARSET1 maps to the same character as the codepoint with the same
+ * order X in CHARSET2 for the same language.
+ * As such, it is possible to get missing order. For instance the
+ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
+ * even though they are both used for French. Same for the euro sign.
+ */
+ """
+
+ for charset in lang_charsets:
+ charset_c = charset.replace('-', '_').title()
+ CTOM_str = 'static const unsigned char {}_CharToOrderMap[]'.format(charset_c)
+ CTOM_str += ' =\n{'
+ for line in range(0, 16):
+ CTOM_str += '\n '
+ for column in range(0, 16):
+ cp = line * 16 + column
+ cp_type = lang_charsets[charset].charmap[cp]
+ if cp_type == ILL:
+ CTOM_str += 'ILL,'
+ elif cp_type == RET:
+ CTOM_str += 'RET,'
+ elif cp_type == CTR:
+ CTOM_str += 'CTR,'
+ elif cp_type == SYM:
+ CTOM_str += 'SYM,'
+ elif cp_type == NUM:
+ CTOM_str += 'NUM,'
+ else: # LET
+ try:
+ uchar = bytes([cp]).decode(charset)
+ except UnicodeDecodeError:
+ sys.stderr.write('Unknown character 0X{:X} in {}.'.format(cp, charset))
+ sys.stderr.write('Please verify your charset specification.\n')
+ exit(1)
+ except LookupError:
+ # Unknown encoding. Use iconv instead.
+ try:
+ call = subprocess.Popen(['iconv', '-t', 'UTF-8', '-f', charset],
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ if call.poll() is not None:
+ (_, error) = call.communicate(input='')
+ sys.stderr.write('Error: `iconv` ended with error "{}".\n'.format(error))
+ exit(1)
+ (uchar, _) = call.communicate(input=bytes([cp]))
+ uchar = uchar.decode('UTF-8')
+ except FileNotFoundError:
+                            sys.stderr.write('Error: "{}" is not a charset supported by Python and `iconv` is not installed.\n'.format(charset))
+ exit(1)
+ if len(uchar) == 0:
+ sys.stderr.write('TypeError: iconv failed to return a unicode character for codepoint "{}" in charset {}.\n'.format(hex(cp), charset))
+ exit(1)
+ #if lang.case_mapping and uchar.isupper() and \
+ #len(unicodedata.normalize('NFC', uchar.lower())) == 1:
+ # Unless we encounter special cases of characters with no
+ # composed lowercase, we lowercase it.
+ if lang.case_mapping or lang.custom_case_mapping is not None:
+ uchar = local_lowercase(uchar, lang)
+ if lang.alphabet_mapping is not None and uchar in lang.alphabet_mapping:
+ uchar = lang.alphabet_mapping[uchar]
+ for order, (char, ratio) in enumerate(sorted_ratios):
+ if char == ord(uchar):
+ CTOM_str += '{:3},'.format(min(249, order))
+ break
+ else:
+ # XXX: we must make sure the character order does not go
+ # over the special characters (250 currently). This may
+ # actually happen when building a model for a language
+                    # writable with many different encodings. So let's just
+ # ceil the order value at 249 max.
+ # It may be an interesting alternative to add another
+ # constant for any character with an order > freqCharCount.
+ # Maybe IRR (irrelevant character) or simply CHR.
+ CTOM_str += '{:3},'.format(min(249, n_char))
+ n_char += 1
+ CTOM_str += ' /* {:X}X */'.format(line)
+ CTOM_str += '\n};\n/*'
+ CTOM_str += 'X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF'
+ CTOM_str += ' */\n\n'
+ c_code += CTOM_str
+
+ ## UNICODE frequency.
+
+ # Since we can't map the full character table from encoding to order,
+ # just create a list from the most common characters from the language.
+ # The list is ordered by unicode code points (hence can be used
+    # generically for various encoding schemes as it is not encoding
+    # specific), allowing code points to be searched efficiently with a
+    # divide-and-conquer search algorithm.
+ # Each code point is immediately followed by its order.
+
+ # Keep the freq_count more frequent characters.
+ sorted_chars = [(char, freq, order) for order, (char, freq) in
+ enumerate(sorted_ratios)][:freq_count]
+ max_order = len(sorted_chars)
+
+ # Add equivalency characters.
+ equivalent = []
+ if lang.case_mapping:
+ for char, ratio, order in sorted_chars:
+ uppercased = chr(char).upper()
+ try:
+ if char != ord(uppercased):
+ equivalent += [(ord(uppercased), ratio, order)]
+ except TypeError:
+                # This happens for some cases such as 'SS' as uppercase of 'ß'.
+                # Just ignore such cases.
+                sys.stderr.write("Ignoring '{}' as uppercase equivalent of '{}'.\n".format(uppercased, chr(char)))
+
+ if lang.alphabet_mapping is not None:
+ for alt_c in lang.alphabet_mapping:
+ for char, ratio, order in sorted_chars:
+ if alt_c == chr(char):
+                    sys.stderr.write("Error: '{}' is already among the frequent characters.\n".format(alt_c))
+ exit(1)
+ elif char == ord(lang.alphabet_mapping[alt_c]):
+ equivalent += [(ord(alt_c), ratio, order)]
break
else:
- # A frequent character in the non-frequent range.
- logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
- freq_count += 1
- accumulated_ratios += ratio
-
- if very_freq_ratio < 0.4:
- very_freq_count += 1
- very_freq_ratio += ratio
-
- if frequent_ranges_size <= 0:
- break
-
-low_freq_order = freq_count - 1
-low_freq_ratio = 0
-for back_order, (char, ratio) in enumerate(reversed(sorted_ratios[:freq_count])):
- if low_freq_ratio < 0.03:
- low_freq_ratio += ratio
- low_freq_order -= 1
- else:
- break
-
-logfd.write("\n\nThe first {} characters have an accumulated ratio of {}.\n".format(freq_count, accumulated_ratios))
-logfd.write("The first {} characters have an accumulated ratio of {}.\n".format(very_freq_count, very_freq_ratio))
-logfd.write("All characters whose order is over {} have an accumulated ratio of {}.\n".format(low_freq_order, low_freq_ratio))
-
-with open(current_dir + '/header-template.cpp', 'r') as header_fd:
- c_code = header_fd.read()
-
-c_code += '\n/********* Language model for: {} *********/\n\n'.format(lang.name)
-c_code += '/**\n * Generated by {}\n'.format(os.path.basename(__file__))
-c_code += ' * On: {}\n'.format(str(datetime.datetime.now()))
-c_code += ' **/\n'
-
-c_code += \
-"""
-/* Character Mapping Table:
- * ILL: illegal character.
- * CTR: control character specific to the charset.
- * RET: carriage/return.
- * SYM: symbol (punctuation) that does not belong to word.
- * NUM: 0 - 9.
- *
- * Other characters are ordered by probabilities
- * (0 is the most common character in the language).
- *
- * Orders are generic to a language. So the codepoint with order X in
- * CHARSET1 maps to the same character as the codepoint with the same
- * order X in CHARSET2 for the same language.
- * As such, it is possible to get missing order. For instance the
- * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
- * even though they are both used for French. Same for the euro sign.
- */
-"""
-
-for charset in charsets:
- charset_c = charset.replace('-', '_').title()
- CTOM_str = 'static const unsigned char {}_CharToOrderMap[]'.format(charset_c)
- CTOM_str += ' =\n{'
- for line in range(0, 16):
- CTOM_str += '\n '
- for column in range(0, 16):
- cp = line * 16 + column
- cp_type = charsets[charset].charmap[cp]
- if cp_type == ILL:
- CTOM_str += 'ILL,'
- elif cp_type == RET:
- CTOM_str += 'RET,'
- elif cp_type == CTR:
- CTOM_str += 'CTR,'
- elif cp_type == SYM:
- CTOM_str += 'SYM,'
- elif cp_type == NUM:
- CTOM_str += 'NUM,'
- else: # LET
- try:
- uchar = bytes([cp]).decode(charset)
- except UnicodeDecodeError:
- print('Unknown character 0X{:X} in {}.'.format(cp, charset))
- print('Please verify your charset specification.\n')
- exit(1)
- except LookupError:
- # Unknown encoding. Use iconv instead.
- try:
- call = subprocess.Popen(['iconv', '-t', 'UTF-8', '-f', charset],
- stdin=subprocess.PIPE,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
- if call.poll() is not None:
- (_, error) = call.communicate(input='')
- print('Error: `iconv` ended with error "{}".\n'.format(error))
- exit(1)
- (uchar, _) = call.communicate(input=bytes([cp]))
- uchar = uchar.decode('UTF-8')
- except FileNotFoundError:
- print('Error: "{}" is not a supported charset by python and `iconv` is not installed.\n')
- exit(1)
- if len(uchar) == 0:
- print('TypeError: iconv failed to return a unicode character for codepoint "{}" in charset {}.\n'.format(hex(cp), charset))
- exit(1)
- #if lang.case_mapping and uchar.isupper() and \
- #len(unicodedata.normalize('NFC', uchar.lower())) == 1:
- # Unless we encounter special cases of characters with no
- # composed lowercase, we lowercase it.
- if lang.case_mapping or lang.custom_case_mapping is not None:
- uchar = local_lowercase(uchar, lang)
- if lang.alphabet_mapping is not None and uchar in lang.alphabet_mapping:
- uchar = lang.alphabet_mapping[uchar]
- for order, (char, ratio) in enumerate(sorted_ratios):
- if char == ord(uchar):
- CTOM_str += '{:3},'.format(min(249, order))
- break
- else:
- # XXX: we must make sure the character order does not go
- # over the special characters (250 currently). This may
- # actually happen when building a model for a language
- # writable with many different encoding. So let's just
- # ceil the order value at 249 max.
- # It may be an interesting alternative to add another
- # constant for any character with an order > freqCharCount.
- # Maybe IRR (irrelevant character) or simply CHR.
- CTOM_str += '{:3},'.format(min(249, n_char))
- n_char += 1
- CTOM_str += ' /* {:X}X */'.format(line)
- CTOM_str += '\n};\n/*'
- CTOM_str += 'X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF'
- CTOM_str += ' */\n\n'
- c_code += CTOM_str
-
-## UNICODE frequency.
-
-# Since we can't map the full character table from encoding to order,
-# just create a list from the most common characters from the language.
-# The list is ordered by unicode code points (hence can be used
-# generically for various encoding scheme as it is not encoding
-# specific) allowing to search from code points efficiently by a divide
-# and conqueer search algorithm.
-# Each code point is immediately followed by its order.
-
-# Keep the freq_count more frequent characters.
-sorted_chars = [(char, freq, order) for order, (char, freq) in
- enumerate(sorted_ratios)][:freq_count]
-max_order = len(sorted_chars)
-
-# Add equivalency characters.
-equivalent = []
-if lang.case_mapping:
- for char, ratio, order in sorted_chars:
- uppercased = chr(char).upper()
- try:
- if char != ord(uppercased):
- equivalent += [(ord(uppercased), ratio, order)]
- except TypeError:
- # This happens for some case such as 'SS' as uppercase of 'ß'.
- # Just ignore such cases.
- sys.stderr.write("Ignoring '{}' as uppercase equivalent of '{}'.\n".format(uppercased, char))
-
-if lang.alphabet_mapping is not None:
- for alt_c in lang.alphabet_mapping:
- for char, ratio, order in sorted_chars:
- if alt_c == chr(char):
- sys.stderr.write("ALREADY {}\n".format(alt_c))
+ sys.stderr.write("Base equivalent for {} not found in frequent characters!\n".format(alt_c))
exit(1)
- elif char == ord(lang.alphabet_mapping[alt_c]):
- equivalent += [(ord(alt_c), ratio, order)]
- break
- else:
- sys.stderr.write("Base equivalent for {} not found in frequent characters!\n".format(alt_c))
- exit(1)
-sorted_chars += equivalent
-
-# Order by code point.
-sorted_chars = sorted(sorted_chars, key=operator.itemgetter(0))
-
-CTOM_str = 'static const int Unicode_Char_size = {};\n'.format(len(sorted_chars))
-
-CTOM_str += 'static const unsigned int Unicode_CharOrder[]'
-CTOM_str += ' =\n{'
-column = 0
-
-max_char_width = math.floor(math.log10(sorted_chars[-1][0])) + 1
-max_order_width = math.floor(math.log10(max_order)) + 1
-
-for char, ratio, order in sorted_chars:
- if column % 8 == 0:
- CTOM_str += '\n '
- column += 1
- CTOM_str += '{}{:>{width}}, '.format('' if column % 8 == 0 else ' ', char, width=max_char_width)
- CTOM_str += '{:>{width}},'.format(order, width=max_order_width)
-
-CTOM_str += '\n};\n\n'
-c_code += CTOM_str
-
-########### SEQUENCES ###########
-
-ratios = {}
-occurrences = sum(sequences.values())
-
-accumulated_seq_count = 0
-order_3 = -1
-order_2 = -1
-ratio_3 = -1
-ratio_2 = -1
-count_512 = -1
-count_1024 = -1
-sorted_seqs = sorted(sequences.items(), key=operator.itemgetter(1),
- reverse=True)
-for order, ((c1, c2), count) in enumerate(sorted_seqs):
- accumulated_seq_count += count
- if order_3 == -1 and accumulated_seq_count / occurrences >= 0.995:
- order_3 = order
- ratio_3 = accumulated_seq_count / occurrences
- elif order_2 == -1 and accumulated_seq_count / occurrences >= 0.999:
- order_2 = order
- ratio_2 = accumulated_seq_count / occurrences
- if order < 512:
- count_512 += count
- elif order < 1024:
- count_1024 += count
-
- if order_3 != -1 and order_2 != -1:
- break
-
-if order_3 == -1 or order_2 == -1:
- # This would probably never happens. It would require a language with
- # very few possible sequences and each of the sequences are widely
- # used. Just add this code for completio, but it won't likely ever be
- # run.
- order_2 = 512
- order_3 = 1024
- ratio_2 = count_512 / occurrences
- ratio_3 = count_1024 / occurrences
-
-logfd.write("\n{} sequences found.\n".format(len(sorted_seqs)))
-
-c_code += """
-/* Model Table:
- * Total considered sequences: {} / {}
- * - Positive sequences: first {} ({})
- * - Probable sequences: next {} ({}-{}) ({})
- * - Neutral sequences: last {} ({})
- * - Negative sequences: {} (off-ratio)
- * Negative sequences: TODO""".format(len(sorted_seqs),
- freq_count * freq_count,
- order_3, ratio_3,
- order_2 - order_3,
- order_2, order_3,
- ratio_2 - ratio_3,
- freq_count * freq_count - order_2,
- 1 - ratio_2,
- freq_count * freq_count - len(sorted_seqs))
-
-logfd.write("\nFirst {} (typical positive ratio): {}".format(order_3, ratio_3))
-logfd.write("\nNext {} ({}-{}): {}".format(order_2 - order_3,
- order_2, order_3,
- ratio_2 - ratio_3))
-logfd.write("\nRest: {}".format(1 - ratio_2))
-
-c_code += "\n */\n"
-
-LM_str = 'static const PRUint8 {}LangModel[]'.format(language_c)
-LM_str += ' =\n{'
-for line in range(0, freq_count):
- LM_str += '\n '
- for column in range(0, freq_count):
- # Let's not make too long lines.
- if freq_count > 40 and column == int(freq_count / 2):
- LM_str += '\n '
- first_order = int(line)
- second_order = column
- if first_order < len(sorted_ratios) and second_order < len(sorted_ratios):
- (first_char, _) = sorted_ratios[first_order]
- (second_char, _) = sorted_ratios[second_order]
- if (first_char, second_char) in sequences:
- for order, (seq, _) in enumerate(sorted_seqs):
- if seq == (first_char, second_char):
- if order < order_3:
- LM_str += '3,'
- elif order < order_2:
- LM_str += '2,'
- else:
- LM_str += '1,'
- break
- else:
- pass # impossible!
- LM_str += '0,'
- else:
- LM_str += '0,'
- else:
- # It may indeed happen that we find less than 64 letters used for a
- # given language.
- LM_str += '0,'
-LM_str += '\n};\n'
-c_code += LM_str
-
-for charset in charsets:
- charset_c = charset.replace('-', '_').title()
- SM_str = '\n\nconst SequenceModel {}{}Model ='.format(charset_c, language_c)
- SM_str += '\n{\n '
- SM_str += '{}_CharToOrderMap,\n {}LangModel,'.format(charset_c, language_c)
- SM_str += '\n {},'.format(freq_count)
- SM_str += '\n (float){},'.format(ratio_2)
- SM_str += '\n {},'.format('PR_TRUE' if lang.use_ascii else 'PR_FALSE')
- SM_str += '\n "{}",'.format(charset)
- SM_str += '\n "{}"'.format(lang.code)
- SM_str += '\n};'
- c_code += SM_str
-
-SM_str = '\n\nconst LanguageModel {}Model ='.format(language_c)
-SM_str += '\n{'
-SM_str += '\n "{}",'.format(lang.code)
-SM_str += '\n Unicode_CharOrder,'
-SM_str += '\n {},'.format(len(sorted_chars)) # Order is wrong!
-SM_str += '\n {}LangModel,'.format(language_c)
-SM_str += '\n {},'.format(freq_count)
-SM_str += '\n {},'.format(very_freq_count)
-SM_str += '\n (float){},'.format(very_freq_ratio)
-SM_str += '\n {},'.format(low_freq_order)
-SM_str += '\n (float){},'.format(low_freq_ratio)
-SM_str += '\n};'
-c_code += SM_str
-
-c_code += '\n'
-
-lang_model_file = current_dir + '/../src/LangModels/Lang{}Model.cpp'.format(language_c)
-with open(lang_model_file, 'w') as cpp_fd:
- cpp_fd.write(c_code)
-
-logfd.write('\n\n- Processing end: {}\n'.format(str(datetime.datetime.now())))
-logfd.close()
-
-print("The following language model file has been generated: {}"
- "\nThe build log is available in: {}"
- "\nTest them and commit them.".format(lang_model_file, build_log))
+ sorted_chars += equivalent
+
+ # Order by code point.
+ sorted_chars = sorted(sorted_chars, key=operator.itemgetter(0))
+
+ CTOM_str = 'static const int Unicode_Char_size = {};\n'.format(len(sorted_chars))
+
+ CTOM_str += 'static const unsigned int Unicode_CharOrder[]'
+ CTOM_str += ' =\n{'
+ column = 0
+
+ max_char_width = math.floor(math.log10(sorted_chars[-1][0])) + 1
+ max_order_width = math.floor(math.log10(max_order)) + 1
+
+ for char, ratio, order in sorted_chars:
+ if column % 8 == 0:
+ CTOM_str += '\n '
+ column += 1
+ CTOM_str += '{}{:>{width}}, '.format('' if column % 8 == 0 else ' ', char, width=max_char_width)
+ CTOM_str += '{:>{width}},'.format(order, width=max_order_width)
+
+ CTOM_str += '\n};\n\n'
+ c_code += CTOM_str
+
+ ########### SEQUENCES ###########
+
+ ratios = {}
+ occurrences = sum(sequences.values())
+
+ accumulated_seq_count = 0
+ order_3 = -1
+ order_2 = -1
+ ratio_3 = -1
+ ratio_2 = -1
+ count_512 = -1
+ count_1024 = -1
+ sorted_seqs = sorted(sequences.items(), key=operator.itemgetter(1),
+ reverse=True)
+ for order, ((c1, c2), count) in enumerate(sorted_seqs):
+ accumulated_seq_count += count
+ if order_3 == -1 and accumulated_seq_count / occurrences >= 0.995:
+ order_3 = order
+ ratio_3 = accumulated_seq_count / occurrences
+ elif order_2 == -1 and accumulated_seq_count / occurrences >= 0.999:
+ order_2 = order
+ ratio_2 = accumulated_seq_count / occurrences
+ if order < 512:
+ count_512 += count
+ elif order < 1024:
+ count_1024 += count
+
+ if order_3 != -1 and order_2 != -1:
+ break
+
+ if order_3 == -1 or order_2 == -1:
+        # This should probably never happen. It would require a language with
+        # very few possible sequences, each of which is widely used. This
+        # code is only here for completeness and will likely never run.
+ order_2 = 512
+ order_3 = 1024
+ ratio_2 = count_512 / occurrences
+ ratio_3 = count_1024 / occurrences
+
+ logfd.write("\n{} sequences found.\n".format(len(sorted_seqs)))
+
+ c_code += """
+ /* Model Table:
+ * Total considered sequences: {} / {}
+ * - Positive sequences: first {} ({})
+ * - Probable sequences: next {} ({}-{}) ({})
+ * - Neutral sequences: last {} ({})
+ * - Negative sequences: {} (off-ratio)
+ * Negative sequences: TODO""".format(len(sorted_seqs),
+ freq_count * freq_count,
+ order_3, ratio_3,
+ order_2 - order_3,
+ order_2, order_3,
+ ratio_2 - ratio_3,
+ freq_count * freq_count - order_2,
+ 1 - ratio_2,
+ freq_count * freq_count - len(sorted_seqs))
+
+ logfd.write("\nFirst {} (typical positive ratio): {}".format(order_3, ratio_3))
+ logfd.write("\nNext {} ({}-{}): {}".format(order_2 - order_3,
+ order_2, order_3,
+ ratio_2 - ratio_3))
+ logfd.write("\nRest: {}".format(1 - ratio_2))
+
+ c_code += "\n */\n"
+
+ LM_str = 'static const PRUint8 {}LangModel[]'.format(language_c)
+ LM_str += ' =\n{'
+ for line in range(0, freq_count):
+ LM_str += '\n '
+ for column in range(0, freq_count):
+            # Let's not make the lines too long.
+ if freq_count > 40 and column == int(freq_count / 2):
+ LM_str += '\n '
+ first_order = int(line)
+ second_order = column
+ if first_order < len(sorted_ratios) and second_order < len(sorted_ratios):
+ (first_char, _) = sorted_ratios[first_order]
+ (second_char, _) = sorted_ratios[second_order]
+ if (first_char, second_char) in sequences:
+ for order, (seq, _) in enumerate(sorted_seqs):
+ if seq == (first_char, second_char):
+ if order < order_3:
+ LM_str += '3,'
+ elif order < order_2:
+ LM_str += '2,'
+ else:
+ LM_str += '1,'
+ break
+ else:
+ pass # impossible!
+ LM_str += '0,'
+ else:
+ LM_str += '0,'
+ else:
+            # It may indeed happen that we find fewer than 64 letters used for a
+ # given language.
+ LM_str += '0,'
+ LM_str += '\n};\n'
+ c_code += LM_str
+
+ for charset in lang_charsets:
+ charset_c = charset.replace('-', '_').title()
+ SM_str = '\n\nconst SequenceModel {}{}Model ='.format(charset_c, language_c)
+ SM_str += '\n{\n '
+ SM_str += '{}_CharToOrderMap,\n {}LangModel,'.format(charset_c, language_c)
+ SM_str += '\n {},'.format(freq_count)
+ SM_str += '\n (float){},'.format(ratio_2)
+ SM_str += '\n {},'.format('PR_TRUE' if lang.use_ascii else 'PR_FALSE')
+ SM_str += '\n "{}",'.format(charset)
+ SM_str += '\n "{}"'.format(lang.code)
+ SM_str += '\n};'
+ c_code += SM_str
+
+ SM_str = '\n\nconst LanguageModel {}Model ='.format(language_c)
+ SM_str += '\n{'
+ SM_str += '\n "{}",'.format(lang.code)
+ SM_str += '\n Unicode_CharOrder,'
+ SM_str += '\n {},'.format(len(sorted_chars)) # Order is wrong!
+ SM_str += '\n {}LangModel,'.format(language_c)
+ SM_str += '\n {},'.format(freq_count)
+ SM_str += '\n {},'.format(very_freq_count)
+ SM_str += '\n (float){},'.format(very_freq_ratio)
+ SM_str += '\n {},'.format(low_freq_order)
+ SM_str += '\n (float){},'.format(low_freq_ratio)
+ SM_str += '\n};'
+ c_code += SM_str
+
+ c_code += '\n'
+
+ lang_model_file = current_dir + '/../src/LangModels/Lang{}Model.cpp'.format(language_c)
+ with open(lang_model_file, 'w') as cpp_fd:
+ cpp_fd.write(c_code)
+
+ logfd.write('\n\n- Processing end: {}\n'.format(str(datetime.datetime.now())))
+ logfd.close()
+
+ generated_files += [ (lang_model_file, build_log) ]
+
+charset_cpp = os.path.join(current_dir, '../src', 'nsSBCharSetProber-generated.h')
+print("\nGenerating {}…".format(charset_cpp))
+
+with open(charset_cpp, 'w') as cpp_fd:
+ with open(current_dir + '/header-template.cpp', 'r') as header_fd:
+ cpp_fd.write(header_fd.read())
+
+ cpp_fd.write('\n#ifndef nsSingleByteCharSetProber_generated_h__')
+ cpp_fd.write('\n#define nsSingleByteCharSetProber_generated_h__\n')
+
+ all_extern_declarations = ''
+ n_sequence_models = 0
+ for l in all_langs:
+ l = l.lower()
+ # Load the language data.
+ sys_path_backup = sys.path
+ sys.path = [current_dir + '/langs']
+ try:
+ lang = importlib.import_module(l)
+ except ImportError:
+ sys.stderr.write('Unknown language code "{}": '
+                             'file "langs/{}.py" does not exist.\n'.format(l, l))
+ exit(1)
+ sys.path = sys_path_backup
+
+ language_c = lang.name.replace('-', '_').title()
+ lang_charsets = charsets.db.load(lang.charsets)
+ for charset in lang_charsets:
+ charset_c = charset.replace('-', '_').title()
+ all_extern_declarations += '\nextern const SequenceModel {}{}Model;'.format(charset_c, language_c)
+ n_sequence_models += 1
+ all_extern_declarations += '\n'
+
+ cpp_fd.write('\n#define NUM_OF_SEQUENCE_MODELS {}\n'.format(n_sequence_models))
+ cpp_fd.write('{}'.format(all_extern_declarations))
+ cpp_fd.write('\n#endif /* nsSingleByteCharSetProber_generated_h__ */')
+
+print("Done!")
+
+language_cpp = os.path.join(current_dir, '../src', 'nsLanguageDetector-generated.h')
+print("\nGenerating {}…".format(language_cpp))
+
+with open(language_cpp, 'w') as cpp_fd:
+ with open(current_dir + '/header-template.cpp', 'r') as header_fd:
+ cpp_fd.write(header_fd.read())
+
+ cpp_fd.write('\n#ifndef nsLanguageDetector_h_generated_h__')
+ cpp_fd.write('\n#define nsLanguageDetector_h_generated_h__\n')
+
+ all_extern_declarations = ''
+ n_language_models = 0
+ for l in all_langs:
+ l = l.lower()
+ # Load the language data.
+ sys_path_backup = sys.path
+ sys.path = [current_dir + '/langs']
+ try:
+ lang = importlib.import_module(l)
+ except ImportError:
+ sys.stderr.write('Unknown language code "{}": '
+                             'file "langs/{}.py" does not exist.\n'.format(l, l))
+ exit(1)
+ sys.path = sys_path_backup
+
+ language_c = lang.name.replace('-', '_').title()
+ all_extern_declarations += '\nextern const LanguageModel {}Model;'.format(language_c)
+ n_language_models += 1
+
+ cpp_fd.write('\n#define NUM_OF_LANGUAGE_MODELS {}\n'.format(n_language_models))
+ cpp_fd.write('{}'.format(all_extern_declarations))
+ cpp_fd.write('\n\n#endif /* nsLanguageDetector_h_generated_h__ */')
+
+print("Done!")
+if len(generated_files) > 0:
+    print("\nThe following language files have been generated:")
+ for (lang_model_file, build_log) in generated_files:
+ print("\n- Language file: {}".format(lang_model_file))
+ print("\n Build log: {}".format(build_log))
+
+print("\nTODO:")
+print("- edit nsSBCSGroupProber::nsSBCSGroupProber() in src/nsSBCSGroupProber.cpp manually to test new sequence models;")
+print("- edit nsMBCSGroupProber::nsMBCSGroupProber() in src/nsMBCSGroupProber.cpp manually to test new language models;")
+print("- add any new language files to src/CMakeLists.txt;")
+print("- commit generated files if tests are successful.")
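Aside: the generated Unicode_CharOrder array above flattens
(codepoint, order) pairs sorted by codepoint, which is what enables the
divide-and-conquer lookup mentioned in the comments. A minimal Python
sketch of such a lookup (char_order is a hypothetical helper for
illustration; the actual consumer is the C++ prober code):

    from bisect import bisect_left

    def char_order(table, codepoint):
        """Find codepoint in a flattened, codepoint-sorted
        [cp0, order0, cp1, order1, ...] table; return its order or None."""
        codepoints = table[0::2]       # even slots hold the codepoints
        i = bisect_left(codepoints, codepoint)
        if i < len(codepoints) and codepoints[i] == codepoint:
            return table[2 * i + 1]    # each order follows its codepoint
        return None

    # Tiny example: 'a' has order 0, 'b' order 2, 'é' order 1.
    assert char_order([97, 0, 98, 2, 233, 1], ord('é')) == 1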
diff --git a/script/header-template.cpp b/script/header-template.cpp
index 286078a..c354fe6 100644
--- a/script/header-template.cpp
+++ b/script/header-template.cpp
@@ -34,6 +34,3 @@
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
-
-#include "../nsSBCharSetProber.h"
-#include "../nsLanguageDetector.h"
diff --git a/script/support.txt b/script/support.txt
new file mode 100644
index 0000000..d52051e
--- /dev/null
+++ b/script/support.txt
@@ -0,0 +1,36 @@
+ar
+be
+bg
+cs
+da
+de
+el
+en
+eo
+es
+et
+fi
+fr
+ga
+he
+hi
+hr
+hu
+it
+lt
+lv
+mk
+mt
+no
+pl
+pt
+ro
+ru
+sk
+sl
+sr
+sv
+th
+tr
+uk
+vi
diff --git a/src/LangModels/LangArabicModel.cpp b/src/LangModels/LangArabicModel.cpp
index a84e3e4..dab0d00 100644
--- a/src/LangModels/LangArabicModel.cpp
+++ b/src/LangModels/LangArabicModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Arabic *********/
diff --git a/src/LangModels/LangBelarusianModel.cpp b/src/LangModels/LangBelarusianModel.cpp
index f013abe..b610376 100644
--- a/src/LangModels/LangBelarusianModel.cpp
+++ b/src/LangModels/LangBelarusianModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Belarusian *********/
diff --git a/src/LangModels/LangBulgarianModel.cpp b/src/LangModels/LangBulgarianModel.cpp
index 32bba1c..7361a7e 100644
--- a/src/LangModels/LangBulgarianModel.cpp
+++ b/src/LangModels/LangBulgarianModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Bulgarian *********/
diff --git a/src/LangModels/LangCroatianModel.cpp b/src/LangModels/LangCroatianModel.cpp
index 4bb6480..5abc994 100644
--- a/src/LangModels/LangCroatianModel.cpp
+++ b/src/LangModels/LangCroatianModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Croatian *********/
diff --git a/src/LangModels/LangCzechModel.cpp b/src/LangModels/LangCzechModel.cpp
index caaab7e..8ed5a0b 100644
--- a/src/LangModels/LangCzechModel.cpp
+++ b/src/LangModels/LangCzechModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Czech *********/
diff --git a/src/LangModels/LangDanishModel.cpp b/src/LangModels/LangDanishModel.cpp
index d60f2b9..9426c5a 100644
--- a/src/LangModels/LangDanishModel.cpp
+++ b/src/LangModels/LangDanishModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Danish *********/
diff --git a/src/LangModels/LangEnglishModel.cpp b/src/LangModels/LangEnglishModel.cpp
index 682c1b8..faca79b 100644
--- a/src/LangModels/LangEnglishModel.cpp
+++ b/src/LangModels/LangEnglishModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: English *********/
diff --git a/src/LangModels/LangEsperantoModel.cpp b/src/LangModels/LangEsperantoModel.cpp
index f948abe..c1da2ec 100644
--- a/src/LangModels/LangEsperantoModel.cpp
+++ b/src/LangModels/LangEsperantoModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Esperanto *********/
diff --git a/src/LangModels/LangEstonianModel.cpp b/src/LangModels/LangEstonianModel.cpp
index da5177f..5cdf9d4 100644
--- a/src/LangModels/LangEstonianModel.cpp
+++ b/src/LangModels/LangEstonianModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Estonian *********/
diff --git a/src/LangModels/LangFinnishModel.cpp b/src/LangModels/LangFinnishModel.cpp
index f7e5a57..ccbbd2d 100644
--- a/src/LangModels/LangFinnishModel.cpp
+++ b/src/LangModels/LangFinnishModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Finnish *********/
diff --git a/src/LangModels/LangFrenchModel.cpp b/src/LangModels/LangFrenchModel.cpp
index 6e49ab6..f01e250 100644
--- a/src/LangModels/LangFrenchModel.cpp
+++ b/src/LangModels/LangFrenchModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: French *********/
diff --git a/src/LangModels/LangGermanModel.cpp b/src/LangModels/LangGermanModel.cpp
index 3ed2684..c722fb7 100644
--- a/src/LangModels/LangGermanModel.cpp
+++ b/src/LangModels/LangGermanModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: German *********/
diff --git a/src/LangModels/LangGreekModel.cpp b/src/LangModels/LangGreekModel.cpp
index 29e5b1d..4825977 100644
--- a/src/LangModels/LangGreekModel.cpp
+++ b/src/LangModels/LangGreekModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Greek *********/
diff --git a/src/LangModels/LangHebrewModel.cpp b/src/LangModels/LangHebrewModel.cpp
index 91327ec..c19791e 100644
--- a/src/LangModels/LangHebrewModel.cpp
+++ b/src/LangModels/LangHebrewModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Hebrew *********/
diff --git a/src/LangModels/LangHindiModel.cpp b/src/LangModels/LangHindiModel.cpp
index ab7ecd8..93da9d5 100644
--- a/src/LangModels/LangHindiModel.cpp
+++ b/src/LangModels/LangHindiModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Hindi *********/
diff --git a/src/LangModels/LangHungarianModel.cpp b/src/LangModels/LangHungarianModel.cpp
index 230eff0..c9c17c4 100644
--- a/src/LangModels/LangHungarianModel.cpp
+++ b/src/LangModels/LangHungarianModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Hungarian *********/
diff --git a/src/LangModels/LangIrishModel.cpp b/src/LangModels/LangIrishModel.cpp
index c3d3282..7bcbcbc 100644
--- a/src/LangModels/LangIrishModel.cpp
+++ b/src/LangModels/LangIrishModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Irish *********/
diff --git a/src/LangModels/LangItalianModel.cpp b/src/LangModels/LangItalianModel.cpp
index 297bd97..e49f148 100644
--- a/src/LangModels/LangItalianModel.cpp
+++ b/src/LangModels/LangItalianModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Italian *********/
diff --git a/src/LangModels/LangLatvianModel.cpp b/src/LangModels/LangLatvianModel.cpp
index 581daee..a5248d4 100644
--- a/src/LangModels/LangLatvianModel.cpp
+++ b/src/LangModels/LangLatvianModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Latvian *********/
diff --git a/src/LangModels/LangLithuanianModel.cpp b/src/LangModels/LangLithuanianModel.cpp
index 9c18ece..7f68804 100644
--- a/src/LangModels/LangLithuanianModel.cpp
+++ b/src/LangModels/LangLithuanianModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Lithuanian *********/
diff --git a/src/LangModels/LangMacedonianModel.cpp b/src/LangModels/LangMacedonianModel.cpp
index bae13ad..f3d1526 100644
--- a/src/LangModels/LangMacedonianModel.cpp
+++ b/src/LangModels/LangMacedonianModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Macedonian *********/
diff --git a/src/LangModels/LangMalteseModel.cpp b/src/LangModels/LangMalteseModel.cpp
index 52d30a1..a345ad5 100644
--- a/src/LangModels/LangMalteseModel.cpp
+++ b/src/LangModels/LangMalteseModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Maltese *********/
diff --git a/src/LangModels/LangNorwegianModel.cpp b/src/LangModels/LangNorwegianModel.cpp
index 1fe232b..e894ba9 100644
--- a/src/LangModels/LangNorwegianModel.cpp
+++ b/src/LangModels/LangNorwegianModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Norwegian *********/
diff --git a/src/LangModels/LangPolishModel.cpp b/src/LangModels/LangPolishModel.cpp
index b742d6b..71f196d 100644
--- a/src/LangModels/LangPolishModel.cpp
+++ b/src/LangModels/LangPolishModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Polish *********/
diff --git a/src/LangModels/LangPortugueseModel.cpp b/src/LangModels/LangPortugueseModel.cpp
index 33af46e..d90f255 100644
--- a/src/LangModels/LangPortugueseModel.cpp
+++ b/src/LangModels/LangPortugueseModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Portuguese *********/
diff --git a/src/LangModels/LangRomanianModel.cpp b/src/LangModels/LangRomanianModel.cpp
index ca091a7..c7ac4fc 100644
--- a/src/LangModels/LangRomanianModel.cpp
+++ b/src/LangModels/LangRomanianModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Romanian *********/
diff --git a/src/LangModels/LangRussianModel.cpp b/src/LangModels/LangRussianModel.cpp
index 32a5e87..a51dcb1 100644
--- a/src/LangModels/LangRussianModel.cpp
+++ b/src/LangModels/LangRussianModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Russian *********/
diff --git a/src/LangModels/LangSerbianModel.cpp b/src/LangModels/LangSerbianModel.cpp
index a1a40a3..ccb3189 100644
--- a/src/LangModels/LangSerbianModel.cpp
+++ b/src/LangModels/LangSerbianModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Serbian *********/
diff --git a/src/LangModels/LangSlovakModel.cpp b/src/LangModels/LangSlovakModel.cpp
index 221ba98..57f7765 100644
--- a/src/LangModels/LangSlovakModel.cpp
+++ b/src/LangModels/LangSlovakModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Slovak *********/
diff --git a/src/LangModels/LangSloveneModel.cpp b/src/LangModels/LangSloveneModel.cpp
index 4bb6f93..100a2de 100644
--- a/src/LangModels/LangSloveneModel.cpp
+++ b/src/LangModels/LangSloveneModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Slovene *********/
diff --git a/src/LangModels/LangSpanishModel.cpp b/src/LangModels/LangSpanishModel.cpp
index 5a789bb..f182612 100644
--- a/src/LangModels/LangSpanishModel.cpp
+++ b/src/LangModels/LangSpanishModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Spanish *********/
diff --git a/src/LangModels/LangSwedishModel.cpp b/src/LangModels/LangSwedishModel.cpp
index f8188f5..6326c74 100644
--- a/src/LangModels/LangSwedishModel.cpp
+++ b/src/LangModels/LangSwedishModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Swedish *********/
diff --git a/src/LangModels/LangThaiModel.cpp b/src/LangModels/LangThaiModel.cpp
index fb409b2..4a08478 100644
--- a/src/LangModels/LangThaiModel.cpp
+++ b/src/LangModels/LangThaiModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Thai *********/
diff --git a/src/LangModels/LangTurkishModel.cpp b/src/LangModels/LangTurkishModel.cpp
index e6ac9cc..4996b7e 100644
--- a/src/LangModels/LangTurkishModel.cpp
+++ b/src/LangModels/LangTurkishModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Turkish *********/
diff --git a/src/LangModels/LangUkrainianModel.cpp b/src/LangModels/LangUkrainianModel.cpp
index 9114842..8c62599 100644
--- a/src/LangModels/LangUkrainianModel.cpp
+++ b/src/LangModels/LangUkrainianModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Ukrainian *********/
diff --git a/src/LangModels/LangVietnameseModel.cpp b/src/LangModels/LangVietnameseModel.cpp
index 0cd43ee..efd2561 100644
--- a/src/LangModels/LangVietnameseModel.cpp
+++ b/src/LangModels/LangVietnameseModel.cpp
@@ -36,7 +36,9 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+#include "../nsLanguageDetector-generated.h"
/********* Language model for: Vietnamese *********/
diff --git a/src/nsLanguageDetector-generated.h b/src/nsLanguageDetector-generated.h
new file mode 100644
index 0000000..4285e1d
--- /dev/null
+++ b/src/nsLanguageDetector-generated.h
@@ -0,0 +1,80 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla Communicator client code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 1998
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#ifndef nsLanguageDetector_h_generated_h__
+#define nsLanguageDetector_h_generated_h__
+
+#define NUM_OF_LANGUAGE_MODELS 36
+
+extern const LanguageModel ArabicModel;
+extern const LanguageModel BelarusianModel;
+extern const LanguageModel BulgarianModel;
+extern const LanguageModel CzechModel;
+extern const LanguageModel DanishModel;
+extern const LanguageModel GermanModel;
+extern const LanguageModel GreekModel;
+extern const LanguageModel EnglishModel;
+extern const LanguageModel EsperantoModel;
+extern const LanguageModel SpanishModel;
+extern const LanguageModel EstonianModel;
+extern const LanguageModel FinnishModel;
+extern const LanguageModel FrenchModel;
+extern const LanguageModel IrishModel;
+extern const LanguageModel HebrewModel;
+extern const LanguageModel HindiModel;
+extern const LanguageModel CroatianModel;
+extern const LanguageModel HungarianModel;
+extern const LanguageModel ItalianModel;
+extern const LanguageModel LithuanianModel;
+extern const LanguageModel LatvianModel;
+extern const LanguageModel MacedonianModel;
+extern const LanguageModel MalteseModel;
+extern const LanguageModel NorwegianModel;
+extern const LanguageModel PolishModel;
+extern const LanguageModel PortugueseModel;
+extern const LanguageModel RomanianModel;
+extern const LanguageModel RussianModel;
+extern const LanguageModel SlovakModel;
+extern const LanguageModel SloveneModel;
+extern const LanguageModel SerbianModel;
+extern const LanguageModel SwedishModel;
+extern const LanguageModel ThaiModel;
+extern const LanguageModel TurkishModel;
+extern const LanguageModel UkrainianModel;
+extern const LanguageModel VietnameseModel;
+
+#endif /* nsLanguageDetector_h_generated_h__ */
\ No newline at end of file
diff --git a/src/nsLanguageDetector.h b/src/nsLanguageDetector.h
index 45d2af2..17868d5 100644
--- a/src/nsLanguageDetector.h
+++ b/src/nsLanguageDetector.h
@@ -125,41 +125,4 @@ private:
int GetOrderFromCodePoint(int codePoint);
};
-extern const LanguageModel ArabicModel;
-extern const LanguageModel BelarusianModel;
-extern const LanguageModel BulgarianModel;
-extern const LanguageModel CroatianModel;
-extern const LanguageModel CzechModel;
-extern const LanguageModel DanishModel;
-extern const LanguageModel EnglishModel;
-extern const LanguageModel EsperantoModel;
-extern const LanguageModel EstonianModel;
-extern const LanguageModel FinnishModel;
-extern const LanguageModel FrenchModel;
-extern const LanguageModel GermanModel;
-extern const LanguageModel GreekModel;
-extern const LanguageModel HebrewModel;
-extern const LanguageModel HindiModel;
-extern const LanguageModel HungarianModel;
-extern const LanguageModel IrishModel;
-extern const LanguageModel ItalianModel;
-extern const LanguageModel LatvianModel;
-extern const LanguageModel LithuanianModel;
-extern const LanguageModel MacedonianModel;
-extern const LanguageModel MalteseModel;
-extern const LanguageModel NorwegianModel;
-extern const LanguageModel PolishModel;
-extern const LanguageModel PortugueseModel;
-extern const LanguageModel RomanianModel;
-extern const LanguageModel RussianModel;
-extern const LanguageModel SerbianModel;
-extern const LanguageModel SlovakModel;
-extern const LanguageModel SloveneModel;
-extern const LanguageModel SpanishModel;
-extern const LanguageModel SwedishModel;
-extern const LanguageModel ThaiModel;
-extern const LanguageModel TurkishModel;
-extern const LanguageModel UkrainianModel;
-extern const LanguageModel VietnameseModel;
-
#endif /* nsLanguageDetector_h__ */
diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h
index 60522e0..db0b51c 100644
--- a/src/nsMBCSGroupProber.h
+++ b/src/nsMBCSGroupProber.h
@@ -48,8 +48,11 @@
#include "nsBig5Prober.h"
#include "nsEUCTWProber.h"
+#include "nsLanguageDetector-generated.h"
+
#define NUM_OF_PROBERS 8
-#define NUM_OF_LANGUAGES 37
+/* All the generated language models + the CJK detector. */
+#define NUM_OF_LANGUAGES (NUM_OF_LANGUAGE_MODELS + 1)
class nsMBCSGroupProber: public nsCharSetProber {
public:
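As a quick sanity check on this change, the derived value matches the hard-coded count it replaces; a tiny sketch, with a stand-in for the macro that really comes from nsLanguageDetector-generated.h:

    #include <cstdio>

    /* Stand-in; normally provided by nsLanguageDetector-generated.h. */
    #define NUM_OF_LANGUAGE_MODELS 36
    /* All the generated language models + the CJK detector. */
    #define NUM_OF_LANGUAGES (NUM_OF_LANGUAGE_MODELS + 1)

    int main()
    {
      /* Prints 37, the value previously hard-coded in nsMBCSGroupProber.h. */
      std::printf("%d\n", NUM_OF_LANGUAGES);
      return 0;
    }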
diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp
index 49e5303..b0aa01a 100644
--- a/src/nsSBCSGroupProber.cpp
+++ b/src/nsSBCSGroupProber.cpp
@@ -36,10 +36,12 @@
*
* ***** END LICENSE BLOCK ***** */
+#include <assert.h>
#include <stdio.h>
#include "prmem.h"
#include "nsSBCharSetProber.h"
+#include "nsSBCharSetProber-generated.h"
#include "nsSBCSGroupProber.h"
#include "nsHebrewProber.h"
@@ -50,6 +52,14 @@ nsSBCSGroupProber::nsSBCSGroupProber()
PRUint32 heb_prober_idx;
PRUint32 n = 0;
+  /* We create more probers than sequence models because of the Hebrew
+   * handling: Windows_1255HebrewModel and Ibm862HebrewModel are each used
+   * twice, while Iso_8859_8HebrewModel is currently unused.
+   */
+ n_sbcs_probers = NUM_OF_SEQUENCE_MODELS + 2;
+ mProbers = new nsCharSetProber*[n_sbcs_probers];
+ mIsActive = new PRBool[n_sbcs_probers];
+
mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1251RussianModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Koi8_RRussianModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5RussianModel);
@@ -226,15 +236,19 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[n++] = new nsSingleByteCharSetProber(&Ibm855MacedonianModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5MacedonianModel);
+ assert (n_sbcs_probers == n);
+
Reset();
}
nsSBCSGroupProber::~nsSBCSGroupProber()
{
- for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
+ for (PRUint32 i = 0; i < n_sbcs_probers; i++)
{
delete mProbers[i];
}
+  delete [] mProbers;
+  delete [] mIsActive;
}
@@ -266,7 +280,7 @@ const char* nsSBCSGroupProber::GetLanguage(int candidate)
void nsSBCSGroupProber::Reset(void)
{
mActiveNum = 0;
- for (PRUint32 i = 0; i < NUM_OF_SBCS_PROBERS; i++)
+ for (PRUint32 i = 0; i < n_sbcs_probers; i++)
{
if (mProbers[i]) // not null
{
@@ -303,7 +317,7 @@ nsProbingState nsSBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen,
if (newLen1 == 0)
goto done; // Nothing to see here, move on.
- for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
+ for (i = 0; i < n_sbcs_probers; i++)
{
if (!mIsActive[i])
continue;
@@ -344,7 +358,7 @@ float nsSBCSGroupProber::GetConfidence(int candidate)
case eNotMe:
return (float)0.01; //sure no
default:
- for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
+ for (i = 0; i < n_sbcs_probers; i++)
{
if (!mIsActive[i])
continue;
@@ -367,7 +381,7 @@ void nsSBCSGroupProber::DumpStatus()
cf = GetConfidence(0);
printf(" SBCS Group Prober --------begin status \r\n");
- for (i = 0; i < NUM_OF_SBCS_PROBERS; i++)
+ for (i = 0; i < n_sbcs_probers; i++)
{
if (!mIsActive[i])
printf(" inactive: [%s] (i.e. confidence is too low).\r\n", mProbers[i]->GetCharSetName(0));
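Since the prober tables are now heap-allocated with new[], sized at run time from the generated model count, and verified with an assert, here is a minimal self-contained sketch of that allocation pattern under simplified stand-in types; note the delete[] pairing the arrays need in the destructor:

    #include <cassert>

    /* Stand-in; normally provided by nsSBCharSetProber-generated.h. */
    #define NUM_OF_SEQUENCE_MODELS 115

    struct Prober { virtual ~Prober() { } };

    int main()
    {
      /* +2 mirrors the Hebrew special-casing described in the hunk above. */
      const unsigned int n_sbcs_probers = NUM_OF_SEQUENCE_MODELS + 2;
      Prober **mProbers  = new Prober*[n_sbcs_probers];
      bool    *mIsActive = new bool[n_sbcs_probers];

      unsigned int n = 0;
      while (n < n_sbcs_probers)
      {
        mProbers[n]  = new Prober();
        mIsActive[n] = true;
        n++;
      }
      assert(n_sbcs_probers == n); /* catches a model/prober count mismatch */

      for (unsigned int i = 0; i < n_sbcs_probers; i++)
        delete mProbers[i];
      delete [] mProbers;  /* arrays from new[] require delete[] */
      delete [] mIsActive;
      return 0;
    }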
diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h
index d782732..d61efe9 100644
--- a/src/nsSBCSGroupProber.h
+++ b/src/nsSBCSGroupProber.h
@@ -39,9 +39,6 @@
#ifndef nsSBCSGroupProber_h__
#define nsSBCSGroupProber_h__
-
-#define NUM_OF_SBCS_PROBERS 117
-
class nsCharSetProber;
class nsSBCSGroupProber: public nsCharSetProber {
public:
@@ -63,12 +60,12 @@ public:
#endif
protected:
- nsProbingState mState;
- nsCharSetProber* mProbers[NUM_OF_SBCS_PROBERS];
- PRBool mIsActive[NUM_OF_SBCS_PROBERS];
- PRInt32 mBestGuess;
- PRUint32 mActiveNum;
+ nsProbingState mState;
+ nsCharSetProber **mProbers;
+ PRBool *mIsActive;
+ PRInt32 mBestGuess;
+ PRUint32 mActiveNum;
+ PRUint32 n_sbcs_probers;
};
#endif /* nsSBCSGroupProber_h__ */
-
diff --git a/src/nsSBCharSetProber-generated.h b/src/nsSBCharSetProber-generated.h
new file mode 100644
index 0000000..fa54561
--- /dev/null
+++ b/src/nsSBCharSetProber-generated.h
@@ -0,0 +1,194 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla Communicator client code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 1998
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#ifndef nsSingleByteCharSetProber_generated_h__
+#define nsSingleByteCharSetProber_generated_h__
+
+#define NUM_OF_SEQUENCE_MODELS 115
+
+extern const SequenceModel Iso_8859_6ArabicModel;
+extern const SequenceModel Windows_1256ArabicModel;
+
+extern const SequenceModel Windows_1251BelarusianModel;
+extern const SequenceModel Iso_8859_5BelarusianModel;
+
+extern const SequenceModel Windows_1251BulgarianModel;
+extern const SequenceModel Iso_8859_5BulgarianModel;
+
+extern const SequenceModel Iso_8859_2CzechModel;
+extern const SequenceModel Windows_1250CzechModel;
+extern const SequenceModel Ibm852CzechModel;
+extern const SequenceModel Mac_CentraleuropeCzechModel;
+
+extern const SequenceModel Iso_8859_15DanishModel;
+extern const SequenceModel Iso_8859_1DanishModel;
+extern const SequenceModel Windows_1252DanishModel;
+extern const SequenceModel Ibm865DanishModel;
+
+extern const SequenceModel Iso_8859_1GermanModel;
+extern const SequenceModel Windows_1252GermanModel;
+
+extern const SequenceModel Iso_8859_7GreekModel;
+extern const SequenceModel Windows_1253GreekModel;
+
+extern const SequenceModel Iso_8859_1EnglishModel;
+extern const SequenceModel Windows_1252EnglishModel;
+
+extern const SequenceModel Iso_8859_3EsperantoModel;
+
+extern const SequenceModel Iso_8859_15SpanishModel;
+extern const SequenceModel Iso_8859_1SpanishModel;
+extern const SequenceModel Windows_1252SpanishModel;
+
+extern const SequenceModel Iso_8859_4EstonianModel;
+extern const SequenceModel Iso_8859_13EstonianModel;
+extern const SequenceModel Iso_8859_15EstonianModel;
+extern const SequenceModel Windows_1252EstonianModel;
+extern const SequenceModel Windows_1257EstonianModel;
+
+extern const SequenceModel Iso_8859_1FinnishModel;
+extern const SequenceModel Iso_8859_4FinnishModel;
+extern const SequenceModel Iso_8859_9FinnishModel;
+extern const SequenceModel Iso_8859_13FinnishModel;
+extern const SequenceModel Iso_8859_15FinnishModel;
+extern const SequenceModel Windows_1252FinnishModel;
+
+extern const SequenceModel Iso_8859_15FrenchModel;
+extern const SequenceModel Iso_8859_1FrenchModel;
+extern const SequenceModel Windows_1252FrenchModel;
+
+extern const SequenceModel Iso_8859_15IrishModel;
+extern const SequenceModel Iso_8859_1IrishModel;
+extern const SequenceModel Iso_8859_9IrishModel;
+extern const SequenceModel Windows_1252IrishModel;
+
+extern const SequenceModel Iso_8859_8HebrewModel;
+extern const SequenceModel Windows_1255HebrewModel;
+extern const SequenceModel Ibm862HebrewModel;
+
+
+extern const SequenceModel Iso_8859_2CroatianModel;
+extern const SequenceModel Iso_8859_13CroatianModel;
+extern const SequenceModel Iso_8859_16CroatianModel;
+extern const SequenceModel Windows_1250CroatianModel;
+extern const SequenceModel Ibm852CroatianModel;
+extern const SequenceModel Mac_CentraleuropeCroatianModel;
+
+extern const SequenceModel Iso_8859_2HungarianModel;
+extern const SequenceModel Windows_1250HungarianModel;
+
+extern const SequenceModel Iso_8859_1ItalianModel;
+extern const SequenceModel Iso_8859_3ItalianModel;
+extern const SequenceModel Iso_8859_9ItalianModel;
+extern const SequenceModel Iso_8859_15ItalianModel;
+extern const SequenceModel Windows_1252ItalianModel;
+
+extern const SequenceModel Iso_8859_4LithuanianModel;
+extern const SequenceModel Iso_8859_10LithuanianModel;
+extern const SequenceModel Iso_8859_13LithuanianModel;
+
+extern const SequenceModel Iso_8859_4LatvianModel;
+extern const SequenceModel Iso_8859_10LatvianModel;
+extern const SequenceModel Iso_8859_13LatvianModel;
+
+extern const SequenceModel Windows_1251MacedonianModel;
+extern const SequenceModel Ibm855MacedonianModel;
+extern const SequenceModel Iso_8859_5MacedonianModel;
+
+extern const SequenceModel Iso_8859_3MalteseModel;
+
+extern const SequenceModel Ibm865NorwegianModel;
+extern const SequenceModel Iso_8859_15NorwegianModel;
+extern const SequenceModel Iso_8859_1NorwegianModel;
+extern const SequenceModel Windows_1252NorwegianModel;
+
+extern const SequenceModel Iso_8859_2PolishModel;
+extern const SequenceModel Iso_8859_13PolishModel;
+extern const SequenceModel Iso_8859_16PolishModel;
+extern const SequenceModel Windows_1250PolishModel;
+extern const SequenceModel Ibm852PolishModel;
+extern const SequenceModel Mac_CentraleuropePolishModel;
+
+extern const SequenceModel Iso_8859_15PortugueseModel;
+extern const SequenceModel Iso_8859_1PortugueseModel;
+extern const SequenceModel Windows_1252PortugueseModel;
+extern const SequenceModel Iso_8859_9PortugueseModel;
+
+extern const SequenceModel Iso_8859_2RomanianModel;
+extern const SequenceModel Iso_8859_16RomanianModel;
+extern const SequenceModel Windows_1250RomanianModel;
+extern const SequenceModel Ibm852RomanianModel;
+
+extern const SequenceModel Windows_1251RussianModel;
+extern const SequenceModel Iso_8859_5RussianModel;
+extern const SequenceModel Koi8_RRussianModel;
+extern const SequenceModel Ibm855RussianModel;
+extern const SequenceModel Ibm866RussianModel;
+extern const SequenceModel Mac_CyrillicRussianModel;
+
+extern const SequenceModel Iso_8859_2SlovakModel;
+extern const SequenceModel Windows_1250SlovakModel;
+extern const SequenceModel Ibm852SlovakModel;
+extern const SequenceModel Mac_CentraleuropeSlovakModel;
+
+extern const SequenceModel Iso_8859_2SloveneModel;
+extern const SequenceModel Iso_8859_16SloveneModel;
+extern const SequenceModel Windows_1250SloveneModel;
+extern const SequenceModel Ibm852SloveneModel;
+extern const SequenceModel Mac_CentraleuropeSloveneModel;
+
+extern const SequenceModel Windows_1251SerbianModel;
+extern const SequenceModel Iso_8859_5SerbianModel;
+
+extern const SequenceModel Iso_8859_1SwedishModel;
+extern const SequenceModel Iso_8859_4SwedishModel;
+extern const SequenceModel Iso_8859_9SwedishModel;
+extern const SequenceModel Iso_8859_15SwedishModel;
+extern const SequenceModel Windows_1252SwedishModel;
+
+extern const SequenceModel Iso_8859_11ThaiModel;
+extern const SequenceModel Tis_620ThaiModel;
+
+extern const SequenceModel Iso_8859_3TurkishModel;
+extern const SequenceModel Iso_8859_9TurkishModel;
+
+extern const SequenceModel Windows_1251UkrainianModel;
+
+extern const SequenceModel Windows_1258VietnameseModel;
+extern const SequenceModel VisciiVietnameseModel;
+
+#endif /* nsSingleByteCharSetProber_generated_h__ */
\ No newline at end of file
diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h
index 767d266..f5eb5b3 100644
--- a/src/nsSBCharSetProber.h
+++ b/src/nsSBCharSetProber.h
@@ -131,154 +131,4 @@ protected:
};
-extern const SequenceModel Windows_1256ArabicModel;
-extern const SequenceModel Iso_8859_6ArabicModel;
-
-extern const SequenceModel Koi8_RRussianModel;
-extern const SequenceModel Windows_1251RussianModel;
-extern const SequenceModel Iso_8859_5RussianModel;
-extern const SequenceModel Mac_CyrillicRussianModel;
-extern const SequenceModel Ibm866RussianModel;
-extern const SequenceModel Ibm855RussianModel;
-
-extern const SequenceModel Iso_8859_7GreekModel;
-extern const SequenceModel Windows_1253GreekModel;
-
-extern const SequenceModel Iso_8859_5BelarusianModel;
-extern const SequenceModel Windows_1251BelarusianModel;
-
-extern const SequenceModel Iso_8859_5BulgarianModel;
-extern const SequenceModel Windows_1251BulgarianModel;
-
-extern const SequenceModel Iso_8859_2HungarianModel;
-extern const SequenceModel Windows_1250HungarianModel;
-
-extern const SequenceModel Windows_1255HebrewModel;
-extern const SequenceModel Ibm862HebrewModel;
-
-extern const SequenceModel Tis_620ThaiModel;
-extern const SequenceModel Iso_8859_11ThaiModel;
-
-extern const SequenceModel Iso_8859_15FrenchModel;
-extern const SequenceModel Iso_8859_1FrenchModel;
-extern const SequenceModel Windows_1252FrenchModel;
-
-extern const SequenceModel Iso_8859_15SpanishModel;
-extern const SequenceModel Iso_8859_1SpanishModel;
-extern const SequenceModel Windows_1252SpanishModel;
-
-extern const SequenceModel Iso_8859_1GermanModel;
-extern const SequenceModel Windows_1252GermanModel;
-
-extern const SequenceModel Iso_8859_3EsperantoModel;
-
-extern const SequenceModel Iso_8859_3TurkishModel;
-extern const SequenceModel Iso_8859_9TurkishModel;
-
-extern const SequenceModel VisciiVietnameseModel;
-extern const SequenceModel Windows_1258VietnameseModel;
-
-extern const SequenceModel Iso_8859_15DanishModel;
-extern const SequenceModel Iso_8859_1DanishModel;
-extern const SequenceModel Windows_1252DanishModel;
-extern const SequenceModel Ibm865DanishModel;
-
-extern const SequenceModel Iso_8859_1EnglishModel;
-extern const SequenceModel Windows_1252EnglishModel;
-
-extern const SequenceModel Iso_8859_13LithuanianModel;
-extern const SequenceModel Iso_8859_10LithuanianModel;
-extern const SequenceModel Iso_8859_4LithuanianModel;
-
-extern const SequenceModel Iso_8859_13LatvianModel;
-extern const SequenceModel Iso_8859_10LatvianModel;
-extern const SequenceModel Iso_8859_4LatvianModel;
-
-extern const SequenceModel Iso_8859_1PortugueseModel;
-extern const SequenceModel Iso_8859_9PortugueseModel;
-extern const SequenceModel Iso_8859_15PortugueseModel;
-extern const SequenceModel Windows_1252PortugueseModel;
-
-extern const SequenceModel Iso_8859_3MalteseModel;
-
-extern const SequenceModel Windows_1250CzechModel;
-extern const SequenceModel Iso_8859_2CzechModel;
-extern const SequenceModel Ibm852CzechModel;
-extern const SequenceModel Mac_CentraleuropeCzechModel;
-
-extern const SequenceModel Windows_1250SlovakModel;
-extern const SequenceModel Iso_8859_2SlovakModel;
-extern const SequenceModel Ibm852SlovakModel;
-extern const SequenceModel Mac_CentraleuropeSlovakModel;
-
-extern const SequenceModel Windows_1250PolishModel;
-extern const SequenceModel Iso_8859_2PolishModel;
-extern const SequenceModel Iso_8859_13PolishModel;
-extern const SequenceModel Iso_8859_16PolishModel;
-extern const SequenceModel Ibm852PolishModel;
-extern const SequenceModel Mac_CentraleuropePolishModel;
-
-extern const SequenceModel Iso_8859_1FinnishModel;
-extern const SequenceModel Iso_8859_4FinnishModel;
-extern const SequenceModel Iso_8859_9FinnishModel;
-extern const SequenceModel Iso_8859_13FinnishModel;
-extern const SequenceModel Iso_8859_15FinnishModel;
-extern const SequenceModel Windows_1252FinnishModel;
-
-extern const SequenceModel Iso_8859_1ItalianModel;
-extern const SequenceModel Iso_8859_3ItalianModel;
-extern const SequenceModel Iso_8859_9ItalianModel;
-extern const SequenceModel Iso_8859_15ItalianModel;
-extern const SequenceModel Windows_1252ItalianModel;
-
-extern const SequenceModel Windows_1250CroatianModel;
-extern const SequenceModel Iso_8859_2CroatianModel;
-extern const SequenceModel Iso_8859_13CroatianModel;
-extern const SequenceModel Iso_8859_16CroatianModel;
-extern const SequenceModel Ibm852CroatianModel;
-extern const SequenceModel Mac_CentraleuropeCroatianModel;
-
-extern const SequenceModel Windows_1252EstonianModel;
-extern const SequenceModel Windows_1257EstonianModel;
-extern const SequenceModel Iso_8859_4EstonianModel;
-extern const SequenceModel Iso_8859_13EstonianModel;
-extern const SequenceModel Iso_8859_15EstonianModel;
-
-extern const SequenceModel Iso_8859_15IrishModel;
-extern const SequenceModel Iso_8859_9IrishModel;
-extern const SequenceModel Iso_8859_1IrishModel;
-extern const SequenceModel Windows_1252IrishModel;
-
-extern const SequenceModel Windows_1250RomanianModel;
-extern const SequenceModel Iso_8859_2RomanianModel;
-extern const SequenceModel Iso_8859_16RomanianModel;
-extern const SequenceModel Ibm852RomanianModel;
-
-extern const SequenceModel Windows_1250SloveneModel;
-extern const SequenceModel Iso_8859_2SloveneModel;
-extern const SequenceModel Iso_8859_16SloveneModel;
-extern const SequenceModel Ibm852SloveneModel;
-extern const SequenceModel Mac_CentraleuropeSloveneModel;
-
-extern const SequenceModel Iso_8859_1SwedishModel;
-extern const SequenceModel Iso_8859_4SwedishModel;
-extern const SequenceModel Iso_8859_9SwedishModel;
-extern const SequenceModel Iso_8859_15SwedishModel;
-extern const SequenceModel Windows_1252SwedishModel;
-
-extern const SequenceModel Iso_8859_15NorwegianModel;
-extern const SequenceModel Iso_8859_1NorwegianModel;
-extern const SequenceModel Windows_1252NorwegianModel;
-extern const SequenceModel Ibm865NorwegianModel;
-
-extern const SequenceModel Windows_1251UkrainianModel;
-
-extern const SequenceModel Windows_1251SerbianModel;
-extern const SequenceModel Iso_8859_5SerbianModel;
-
-extern const SequenceModel Windows_1251MacedonianModel;
-extern const SequenceModel Ibm855MacedonianModel;
-extern const SequenceModel Iso_8859_5MacedonianModel;
-
-
#endif /* nsSingleByteCharSetProber_h__ */