diff options
author | Jehan <jehan@girinstud.io> | 2021-11-09 22:18:11 +0100 |
---|---|---|
committer | Jehan <jehan@girinstud.io> | 2022-12-14 00:24:53 +0100 |
commit | 6365cad4fd1a463571bb172ec3e90be5c7bd4864 (patch) | |
tree | 92947adb2da464960536e82358c70c0dcb82c5db | |
parent | 81b83fffa9b0fa044878fdd154f4e6adf9aa4e68 (diff) |
script: slightly improve the handling of the use_ascii option.
-rwxr-xr-x | script/BuildLangModel.py | 12 |
1 files changed, 5 insertions, 7 deletions
```diff
diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py
index faf28bd..c95d286 100755
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@@ -139,11 +139,14 @@ def local_lowercase(text, lang):
             lowercased += l
     return lowercased
 
+if lang.use_ascii:
+    if lang.alphabet is None:
+        lang.alphabet = [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
+    else:
+        lang.alphabet += [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
 if lang.alphabet is not None:
     # Allowing to provide an alphabet in string format rather than list.
     lang.alphabet = list(lang.alphabet)
-    if lang.use_ascii:
-        lang.alphabet += [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
     if lang.case_mapping or lang.custom_case_mapping is not None:
         lang.alphabet = [local_lowercase(l, lang) for l in lang.alphabet]
     #alphabet = []
@@ -242,11 +245,6 @@ def process_text(content, lang):
         if unicode_value in characters:
             characters[unicode_value] += 1
             is_letter = True
-        elif lang.use_ascii and \
-             ((unicode_value >= 65 and unicode_value <= 90) or \
-              (unicode_value >= 97 and unicode_value <= 122)):
-            characters[unicode_value] = 1
-            is_letter = True
         elif lang.unicode_ranges is not None:
             for start, end in lang.unicode_ranges:
                 if unicode_value >= start and unicode_value <= end:
```