script: update the model-building script to produce the 2 new ratios

These ratios were introduced in the previous commit.
author: Jehan <jehan@girinstud.io> 2022-12-14 20:15:01 +0100
committer: Jehan <jehan@girinstud.io> 2022-12-14 20:15:34 +0100
commit: e311b64cd9d7255365cf35f07f4d4ac768c500cc (patch)
tree: ef04e31ac24095a485dab8e5c164cc4c37549ade
parent: 401eb55dfc792dad1dd34f259e414ff3bf2cd38e (diff)
1 files changed, 26 insertions, 1 deletions
diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py
index f3d81e1..7049ee7 100755
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@@ -392,6 +392,8 @@ accumulated_ratios = 0
 # frequent list, and we stop then. There may therefore be more or less than
 # 64 frequent characters depending on the language.
 logfd.write('\nMost Frequent characters:')
+very_freq_count = 0
+very_freq_ratio = 0
 if lang.alphabet is None and lang.frequent_ranges is None:
     freq_count = min(64, len(sorted_ratios))
     for order, (char, ratio) in enumerate(sorted_ratios):
@@ -399,6 +401,9 @@ if lang.alphabet is None and lang.frequent_ranges is None:
             break
         logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
         accumulated_ratios += ratio
+        if very_freq_ratio < 0.4:
+          very_freq_count += 1
+          very_freq_ratio += ratio
 elif lang.alphabet is not None:
     freq_count = 0
     for order, (char, ratio) in enumerate(sorted_ratios):
@@ -409,6 +414,9 @@ elif lang.alphabet is not None:
         logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
         accumulated_ratios += ratio
         freq_count += 1
+        if accumulated_ratios < 0.4:
+          very_freq_count += 1
+          very_freq_ratio += ratio
     else:
         if len(lang.alphabet) > 0:
             print("Error: alphabet characters are absent from data collection"
@@ -432,6 +440,9 @@ elif lang.frequent_ranges is not None:
           logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
           frequent_ranges_size -= 1
           break
+        if accumulated_ratios < 0.4:
+          very_freq_count += 1
+          very_freq_ratio += ratio
       else:
         # A frequent character in the non-frequent range.
         logfd.write("\n[{:2}] Char {}: {} %".format(order, chr(char), ratio * 100))
@@ -440,7 +451,18 @@ elif lang.frequent_ranges is not None:
       if frequent_ranges_size <= 0:
         break
 
+low_freq_order = freq_count - 1
+low_freq_ratio = 0
+for back_order, (char, ratio) in enumerate(reversed(sorted_ratios[:freq_count])):
+  if low_freq_ratio < 0.03:
+    low_freq_ratio += ratio
+    low_freq_order -= 1
+  else:
+    break
+
 logfd.write("\n\nThe first {} characters have an accumulated ratio of {}.\n".format(freq_count, accumulated_ratios))
+logfd.write("The first {} characters have an accumulated ratio of {}.\n".format(very_freq_count, very_freq_ratio))
+logfd.write("All characters whose order is over {} have an accumulated ratio of {}.\n".format(low_freq_order, low_freq_ratio))
 
 with open(current_dir + '/header-template.cpp', 'r') as header_fd:
     c_code = header_fd.read()
@@ -729,7 +751,10 @@ SM_str += '\n  Unicode_CharOrder,'
 SM_str += '\n  {},'.format(len(sorted_chars)) # Order is wrong!
 SM_str += '\n  {}LangModel,'.format(language_c)
 SM_str += '\n  {},'.format(freq_count)
-SM_str += '\n  (float){},'.format(accumulated_ratios)
+SM_str += '\n  {},'.format(very_freq_count)
+SM_str += '\n  (float){},'.format(very_freq_ratio)
+SM_str += '\n  {},'.format(low_freq_order)
+SM_str += '\n  (float){},'.format(low_freq_ratio)
 SM_str += '\n};'
 c_code += SM_str
author	Jehan <jehan@girinstud.io>	2022-12-14 20:15:01 +0100
committer	Jehan <jehan@girinstud.io>	2022-12-14 20:15:34 +0100
commit	e311b64cd9d7255365cf35f07f4d4ac768c500cc (patch)
tree	ef04e31ac24095a485dab8e5c164cc4c37549ade
parent	401eb55dfc792dad1dd34f259e414ff3bf2cd38e (diff)