summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jehan <jehan@girinstud.io> 2021-11-09 22:18:11 +0100
committer: Jehan <jehan@girinstud.io> 2022-12-14 00:24:53 +0100
commit: 6365cad4fd1a463571bb172ec3e90be5c7bd4864 (patch)
tree: 92947adb2da464960536e82358c70c0dcb82c5db
parent: 81b83fffa9b0fa044878fdd154f4e6adf9aa4e68 (diff)
script: improve a bit the management of use_ascii option.
-rwxr-xr-x script/BuildLangModel.py 12
1 file changed, 5 insertions, 7 deletions
diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py
index faf28bd..c95d286 100755
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@@ -139,11 +139,14 @@ def local_lowercase(text, lang):
lowercased += l
return lowercased
+if lang.use_ascii:
+ if lang.alphabet is None:
+ lang.alphabet = [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
+ else:
+ lang.alphabet += [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
if lang.alphabet is not None:
# Allowing to provide an alphabet in string format rather than list.
lang.alphabet = list(lang.alphabet)
- if lang.use_ascii:
- lang.alphabet += [chr(l) for l in range(65, 91)] + [chr(l) for l in range(97, 123)]
if lang.case_mapping or lang.custom_case_mapping is not None:
lang.alphabet = [local_lowercase(l, lang) for l in lang.alphabet]
#alphabet = []
@@ -242,11 +245,6 @@ def process_text(content, lang):
if unicode_value in characters:
characters[unicode_value] += 1
is_letter = True
- elif lang.use_ascii and \
- ((unicode_value >= 65 and unicode_value <= 90) or \
- (unicode_value >= 97 and unicode_value <= 122)):
- characters[unicode_value] = 1
- is_letter = True
elif lang.unicode_ranges is not None:
for start, end in lang.unicode_ranges:
if unicode_value >= start and unicode_value <= end: