summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJehan <jehan@girinstud.io>2022-12-17 18:30:55 +0100
committerJehan <jehan@girinstud.io>2022-12-17 18:41:00 +0100
commitffb94e4a9d4da3fb60cd022d3eeffe12301f96bf (patch)
tree4fda6281236e6391aab02557752a48b7c0525a74
parent5e25e93da795c22265befcdc72d1ffd0daed6934 (diff)
script, src, test: Bulgarian language models added.
Not sure why we had the Bulgarian support but haven't recently updated it (i.e. never with the model generation script, or so it seems), especially with generic language models, which make UTF-8/Bulgarian support possible. Maybe I tested it some time ago and it was getting bad results? Anyway, now with all the recent updates to the confidence computation, I get very good detection scores. So adding support for UTF-8/Bulgarian and rebuilding other models too. Also adding a test for ISO-8859-5/Bulgarian (we already had support, but no test files). The 2 new test files are text from the page 'Мармоти' on Wikipedia in the Bulgarian language.
-rw-r--r--README.md1
-rw-r--r--script/BuildLangModelLogs/LangBulgarianModel.log263
-rw-r--r--script/charsets/iso-8859-5.py72
-rw-r--r--script/charsets/windows-1251.py75
-rw-r--r--script/langs/bg.py58
-rw-r--r--src/LangModels/LangBulgarianModel.cpp406
-rw-r--r--src/nsLanguageDetector.h1
-rw-r--r--src/nsMBCSGroupProber.cpp1
-rw-r--r--src/nsMBCSGroupProber.h2
-rw-r--r--src/nsSBCSGroupProber.cpp4
-rw-r--r--src/nsSBCharSetProber.h4
-rw-r--r--test/bg/iso-8859-5.txt3
-rw-r--r--test/bg/utf-8.txt3
13 files changed, 700 insertions, 193 deletions
diff --git a/README.md b/README.md
index 288e0b3..28c085a 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ uchardet started as a C language binding of the original C++ implementation of t
* ISO-8859-6
* WINDOWS-1256
* Bulgarian
+ * UTF-8
* ISO-8859-5
* WINDOWS-1251
* Chinese
diff --git a/script/BuildLangModelLogs/LangBulgarianModel.log b/script/BuildLangModelLogs/LangBulgarianModel.log
new file mode 100644
index 0000000..452dda4
--- /dev/null
+++ b/script/BuildLangModelLogs/LangBulgarianModel.log
@@ -0,0 +1,263 @@
+= Logs of language model for Bulgarian (bg) =
+
+- Generated by BuildLangModel.py
+- Started: 2022-12-17 18:13:39.705509
+- Maximum depth: 4
+- Max number of pages: 200
+
+== Parsed pages ==
+
+Амурски_леопард (revision 11479353)
+Пектусан (revision 11051736)
+Тъкан (revision 11413541)
+Растителноядно животно (revision 9401552)
+Козмин (залив) (revision 10801896)
+Око (revision 11307426)
+Руска кухня (revision 8912349)
+Обединена система за таксономична информация (revision 10952587)
+Лисица (revision 11570875)
+Сихоте Алин (revision 10913633)
+Шриланкски леопард (revision 11478652)
+Фазан (revision 11554738)
+Северна Корея (revision 11596651)
+Протисти (revision 11599945)
+Калдера (revision 10605482)
+Месо (revision 11396435)
+Мезозойска ера (revision 11406482)
+Тамилски (revision 11536357)
+Птици (revision 11599947)
+Паразитизъм (revision 10905879)
+Череп (revision 11382448)
+Домати (revision 11568692)
+Гъби (revision 11575731)
+Връх (revision 11560584)
+Хабаровски край (revision 11326255)
+Слъзна жлеза (revision 9848117)
+Клетка (биология) (revision 11599652)
+Чанбайшан (revision 11436397)
+Усури (revision 11485897)
+Нормативен контрол (revision 11218813)
+Phasianus (revision 11554738)
+Перм (период) (revision 10376629)
+Въздух (revision 11586473)
+Растения (revision 11599967)
+Лов (revision 11549760)
+Култ към личността (revision 11309525)
+Биология (revision 11597684)
+Азиатска късоноктеста видра (revision 11530864)
+Ротатории (revision 10164408)
+Торонто (revision 11500811)
+Епител (revision 11544065)
+Животни (revision 11599450)
+Animal Diversity Web (revision 11280365)
+Главоноги (revision 11321675)
+Новозеландски морски лъв (revision 11531150)
+Общомедия (revision 11583644)
+Яйцеклетка (revision 11574210)
+Риба (revision 11602135)
+Ялуцзян (revision 11616897)
+Водорасли (revision 11589165)
+Тигрова генета (revision 11532904)
+Карбон (revision 11440434)
+Енотовидно куче (revision 11530902)
+Пинин (revision 10953442)
+Морска видра (revision 11022765)
+Коткови (revision 11296822)
+Сметана (revision 10602821)
+Просо (revision 10908234)
+Корейски полуостров (revision 11532552)
+Уикивидове (revision 9824200)
+Източна Азия (revision 10984512)
+Злато (revision 11601280)
+Лист (revision 11417909)
+Уикиданни (revision 10288984)
+Персийски леопард (revision 10731068)
+Vormela (revision 11531190)
+Африкански леопард (revision 10671790)
+Далечен изток (revision 10098481)
+Индийски леопард (revision 10949302)
+Червен списък на световнозастрашените видове (revision 10923987)
+Елда (revision 11398540)
+Латински език (revision 11610275)
+Николай Пржевалски (revision 11378214)
+Корейски език (revision 11585784)
+Цитоплазма (revision 10815311)
+Силур (revision 10913196)
+Дърво (revision 11599411)
+Амур (revision 11232524)
+Оцет (revision 10974969)
+Индийски солонгой (revision 11530605)
+Креда (revision 11194691)
+BBC News (revision 11556539)
+Ендодерма (revision 10159731)
+Система на Маккюн-Райшауер (revision 10199499)
+Вол (revision 11486361)
+Тумънцзян (revision 11405669)
+Тайга (revision 11596057)
+Паренхим (revision 9238563)
+Бикин (река) (revision 10416126)
+Национален център за биотехнологична информация на САЩ (revision 10901368)
+Кокошоподобни (revision 11377806)
+Телевизор (revision 11587645)
+Влажност (revision 11587428)
+Анатолийски леопард (revision 10986842)
+Синайски леопард (revision 10737955)
+Акомодация (revision 9073034)
+Бульон (revision 9265335)
+Мляко (revision 11599803)
+Хранителна верига (revision 9990974)
+Китайски език (revision 11315056)
+Мъжки (revision 11120791)
+Камбрий (revision 10117802)
+Зигота (revision 10544543)
+Листо (revision 11417909)
+Кромид лук (revision 10698110)
+Хрян (revision 11494398)
+Ектодерма (revision 10806725)
+Храст (revision 11500525)
+Геология (revision 11598573)
+Дългоопашат скункс (revision 11531277)
+Лигавица (revision 10894252)
+Горчица (revision 8753833)
+Подковонос на Мехели (revision 10377709)
+Бозайници (revision 11597688)
+Кванмьонсон-1 (revision 11507924)
+Азиатска палмова цивета (revision 11531312)
+Хранителни вещества (revision 11590475)
+Дмитрий Орлов (revision 10880810)
+Въглероден диоксид (revision 10769242)
+Ракообразни (revision 11349934)
+Испански език (revision 11599556)
+Уикиречник (revision 9194836)
+Уретра (revision 11600909)
+ISO 639 (revision 10477132)
+Биологична система (revision 10872761)
+Палеозой (revision 10972967)
+Розетка (revision 11250355)
+Ихтиозаври (revision 11141622)
+Хабаровск (revision 11427125)
+Хавайски тюлен монах (revision 11531012)
+Кодкод (revision 11480480)
+Южна Европа (revision 10119488)
+Вода (revision 11606762)
+URL (revision 11283400)
+Ивичест зурлест скункс (revision 11476684)
+Храносмилателна система (revision 11298271)
+Триас (revision 10657489)
+ООН (revision 11599875)
+Alexa Internet (revision 11547819)
+Псориазис (revision 11607604)
+Партеногенеза (revision 11201489)
+Картоф (revision 11611083)
+Коприва (revision 11416720)
+Воден плъх (revision 11351201)
+Прилепи (revision 11566273)
+Odobenidae (revision 11032101)
+Гондвана (revision 11074999)
+Домашна муха (revision 11484479)
+Трахея (revision 11408131)
+Безполово размножаване (revision 10972108)
+Карибски регион (revision 10503045)
+Географска координатна система (revision 10929840)
+Entoprocta (revision 10346607)
+Бадем (revision 11339812)
+Удил (revision 10422385)
+Южноафриканска морска котка (revision 11476346)
+Библиотечно дело (revision 11477309)
+Организъм (revision 11079762)
+Животно (revision 11599450)
+Донг Фанг Хонг I (revision 11537199)
+Палеоген (revision 9895031)
+Триптофан (revision 11566722)
+Боливия (revision 11584461)
+Суспензия (revision 11306702)
+Chlorophyceae (revision 11097610)
+Тетраподоморфи (revision 10796558)
+Wayback Machine (revision 11423066)
+Mustelidae (revision 10988654)
+Епителна тъкан (revision 11544065)
+Чернолапа котка (revision 11545586)
+Уралски федерален окръг (revision 11412555)
+Северна Африка (revision 11617946)
+Корейски архипелаг (revision 11436736)
+Златна палмова цивета (revision 11530618)
+Макроелемент (revision 11151625)
+Международен съюз за защита на природата (revision 11546091)
+Пролетен горицвет (revision 11560104)
+Име (revision 11387941)
+Neophoca (revision 11552636)
+Алвеола (revision 10429710)
+Лападови (revision 9926969)
+
+== End of Parsed pages ==
+
+- Wikipedia parsing ended at: 2022-12-17 18:16:58.793948
+
+59 characters appeared 866927 times.
+
+Most Frequent characters:
+[ 0] Char а: 11.195290952986813 %
+[ 1] Char и: 9.90394808328729 %
+[ 2] Char о: 8.887830232533997 %
+[ 3] Char е: 8.05834862681633 %
+[ 4] Char т: 7.773895610587743 %
+[ 5] Char н: 7.376976377480457 %
+[ 6] Char р: 5.300561638984598 %
+[ 7] Char с: 4.85496472021289 %
+[ 8] Char в: 4.23022930419747 %
+[ 9] Char л: 3.41978044287466 %
+[10] Char к: 3.3481481139703804 %
+[11] Char д: 2.8882477994110234 %
+[12] Char п: 2.700227354783044 %
+[13] Char з: 2.255207185841484 %
+[14] Char м: 2.1408953695063135 %
+[15] Char я: 1.6356625182973883 %
+[16] Char ъ: 1.4382987264210252 %
+[17] Char г: 1.3491332026802718 %
+[18] Char ч: 1.2814227726209935 %
+[19] Char у: 1.267234726799373 %
+[20] Char б: 1.132852016375081 %
+[21] Char ж: 0.7340871838113243 %
+[22] Char ц: 0.6595711057563094 %
+[23] Char х: 0.5456053393192275 %
+[24] Char й: 0.5091547500539261 %
+[25] Char a: 0.437522421149647 %
+[26] Char ф: 0.37927068830478233 %
+[27] Char щ: 0.3754641394258109 %
+[28] Char i: 0.342589399107422 %
+[29] Char e: 0.3205575555957999 %
+[30] Char o: 0.3129444578378571 %
+[31] Char ш: 0.27326406952373156 %
+[32] Char r: 0.25757647414372836 %
+[33] Char n: 0.24073537910343085 %
+[34] Char s: 0.236006030496224 %
+[35] Char t: 0.23069993205887002 %
+[36] Char c: 0.2030159402118056 %
+[37] Char l: 0.19990149112901087 %
+[38] Char m: 0.16322020193165054 %
+[39] Char u: 0.1605671527129735 %
+[40] Char ю: 0.1558378041057667 %
+[41] Char p: 0.12861521212282004 %
+[42] Char d: 0.12065606446678902 %
+[43] Char h: 0.11258156684472856 %
+[44] Char b: 0.07832262693398637 %
+[45] Char y: 0.07059417921001422 %
+[46] Char g: 0.07047882924398478 %
+[47] Char k: 0.053637734203687275 %
+[48] Char f: 0.052368884577363495 %
+[49] Char v: 0.04060318804236112 %
+[50] Char w: 0.024108142900151914 %
+[51] Char x: 0.022493243375739824 %
+[52] Char ь: 0.01799459470059186 %
+
+The first 53 characters have an accumulated ratio of 0.9996920155907014.
+The first 5 characters have an accumulated ratio of 0.4581931350621217.
+All characters whose order is over 29 have an accumulated ratio of 0.03226223199877268.
+
+1236 sequences found.
+
+First 720 (typical positive ratio): 0.9950164618425456
+Next 201 (921-720): 0.003986830525963603
+Rest: 0.0009967076314908452
+
+- Processing end: 2022-12-17 18:16:58.922580
diff --git a/script/charsets/iso-8859-5.py b/script/charsets/iso-8859-5.py
new file mode 100644
index 0000000..953a437
--- /dev/null
+++ b/script/charsets/iso-8859-5.py
@@ -0,0 +1,72 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+from codepoints import *
+
+name = 'ISO-8859-5'
+aliases = ['ISO_8859-5:1988', 'ISO_8859-5', 'iso-ir-144',
+ 'cyrillic', 'csISOLatinCyrillic']
+
+language = \
+{
+ 'complete': [ 'bg', 'be', 'ru', 'sr', 'mk' ],
+ 'incomplete': [ 'uk' ]
+}
+
+# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
+charmap = \
+[
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 8X
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 9X
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET, # AX
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # BX
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET, # FX
+]
diff --git a/script/charsets/windows-1251.py b/script/charsets/windows-1251.py
new file mode 100644
index 0000000..8ab389f
--- /dev/null
+++ b/script/charsets/windows-1251.py
@@ -0,0 +1,75 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+from codepoints import *
+
+name = 'WINDOWS-1251'
+aliases = ['CP-1251', 'cswindows1251']
+
+language = \
+{
+ # Windows-1251 is a popular 8-bit character encoding, designed to cover
+ # languages that use the Cyrillic script such as Russian, Bulgarian, Serbian
+ # Cyrillic and other languages. It is the most widely used encoding for the
+ # Bulgarian, Serbian and Macedonian languages.
+ 'complete': [ 'ru', 'uk', 'be', 'bg', 'sr', 'mk' ],
+ 'incomplete': []
+}
+
+# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
+charmap = \
+[
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
+ LET,LET,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,LET,LET,LET, # 8X
+ LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,LET,SYM,LET,LET,LET,LET, # 9X
+ SYM,LET,LET,LET,SYM,LET,SYM,SYM,LET,SYM,LET,SYM,SYM,SYM,SYM,LET, # AX
+ SYM,SYM,LET,LET,LET,SYM,SYM,SYM,LET,SYM,LET,SYM,LET,LET,LET,LET, # BX
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX
+]
diff --git a/script/langs/bg.py b/script/langs/bg.py
new file mode 100644
index 0000000..bce517e
--- /dev/null
+++ b/script/langs/bg.py
@@ -0,0 +1,58 @@
+#!/bin/python3
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+import re
+
+## Mandatory Properties ##
+
+name = 'Bulgarian'
+code = 'bg'
+use_ascii = False
+charsets = [ 'WINDOWS-1251', 'ISO-8859-5' ]
+
+## Optional Properties ##
+
+# Alphabet characters.
+alphabet = 'абвгдежзийклмнопрстуфхцчшщъьюя'
+# A featured article which was showcased on the main page when I created
+# the data.
+start_pages = ['Амурски_леопард']
+wikipedia_code = code
+case_mapping = True
diff --git a/src/LangModels/LangBulgarianModel.cpp b/src/LangModels/LangBulgarianModel.cpp
index 1120054..32bba1c 100644
--- a/src/LangModels/LangBulgarianModel.cpp
+++ b/src/LangModels/LangBulgarianModel.cpp
@@ -36,214 +36,244 @@
* ***** END LICENSE BLOCK ***** */
#include "../nsSBCharSetProber.h"
-/****************************************************************
-CTR: Control characters that usually does not exist in any text
-RET: Carriage/Return
-SYM: symbol (punctuation) that does not belong to word
-NUM: 0 - 9
+#include "../nsLanguageDetector.h"
-*****************************************************************/
+/********* Language model for: Bulgarian *********/
-//Character Mapping Table:
-//this talbe is modified base on win1251BulgarianCharToOrderMap, so
-//only number <64 is sure valid
+/**
+ * Generated by BuildLangModel.py
+ * On: 2022-12-17 18:16:58.794613
+ **/
-static const unsigned char Latin5_BulgarianCharToOrderMap[] =
+/* Character Mapping Table:
+ * ILL: illegal character.
+ * CTR: control character specific to the charset.
+ * RET: carriage/return.
+ * SYM: symbol (punctuation) that does not belong to word.
+ * NUM: 0 - 9.
+ *
+ * Other characters are ordered by probabilities
+ * (0 is the most common character in the language).
+ *
+ * Orders are generic to a language. So the codepoint with order X in
+ * CHARSET1 maps to the same character as the codepoint with the same
+ * order X in CHARSET2 for the same language.
+ * As such, it is possible to get missing orders. For instance the
+ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
+ * even though they are both used for French. Same for the euro sign.
+ */
+static const unsigned char Windows_1251_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 25, 44, 36, 42, 29, 48, 46, 43, 28, 54, 47, 37, 38, 33, 30, /* 4X */
+ 41, 55, 32, 34, 35, 39, 49, 50, 51, 45, 53,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 25, 44, 36, 42, 29, 48, 46, 43, 28, 54, 47, 37, 38, 33, 30, /* 6X */
+ 41, 55, 32, 34, 35, 39, 49, 50, 51, 45, 53,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ 59, 60,SYM, 61,SYM,SYM,SYM,SYM,SYM,SYM, 62,SYM, 63, 64, 65, 66, /* 8X */
+ 67,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 68,SYM, 69, 70, 71, 72, /* 9X */
+ SYM, 73, 74, 75,SYM, 76,SYM,SYM, 77,SYM, 78,SYM,SYM,SYM,SYM, 79, /* AX */
+ SYM,SYM, 57, 57, 80,SYM,SYM,SYM, 81,SYM, 82,SYM, 83, 84, 85, 86, /* BX */
+ 0, 20, 8, 17, 11, 3, 21, 13, 1, 24, 10, 9, 14, 5, 2, 12, /* CX */
+ 6, 7, 4, 19, 26, 23, 22, 18, 31, 27, 16, 56, 52, 58, 40, 15, /* DX */
+ 0, 20, 8, 17, 11, 3, 21, 13, 1, 24, 10, 9, 14, 5, 2, 12, /* EX */
+ 6, 7, 4, 19, 26, 23, 22, 18, 31, 27, 16, 56, 52, 58, 40, 15, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const unsigned char Iso_8859_5_CharToOrderMap[] =
{
-CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00
-CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10
-SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20
-NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30
-SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40
-110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, //50
-SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60
-116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, //70
-194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209, //80
-210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225, //90
- 81,226,227,228,229,230,105,231,232,233,234,235,236, 45,237,238, //a0
- 31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, //b0
- 39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,239, 67,240, 60, 56, //c0
- 1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, //d0
- 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,241, 42, 16, //e0
- 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,NUM,SYM, //f0
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 25, 44, 36, 42, 29, 48, 46, 43, 28, 54, 47, 37, 38, 33, 30, /* 4X */
+ 41, 55, 32, 34, 35, 39, 49, 50, 51, 45, 53,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 25, 44, 36, 42, 29, 48, 46, 43, 28, 54, 47, 37, 38, 33, 30, /* 6X */
+ 41, 55, 32, 34, 35, 39, 49, 50, 51, 45, 53,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
+ SYM, 87, 88, 89, 90, 91, 57, 92, 93, 94, 95, 96, 97,SYM, 98, 99, /* AX */
+ 0, 20, 8, 17, 11, 3, 21, 13, 1, 24, 10, 9, 14, 5, 2, 12, /* BX */
+ 6, 7, 4, 19, 26, 23, 22, 18, 31, 27, 16, 56, 52, 58, 40, 15, /* CX */
+ 0, 20, 8, 17, 11, 3, 21, 13, 1, 24, 10, 9, 14, 5, 2, 12, /* DX */
+ 6, 7, 4, 19, 26, 23, 22, 18, 31, 27, 16, 56, 52, 58, 40, 15, /* EX */
+ SYM,100,101,102,103,104, 57,105,106,107,108,109,110,SYM,111,112, /* FX */
};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
-static const unsigned char win1251BulgarianCharToOrderMap[] =
+static const int Unicode_Char_size = 106;
+static const unsigned int Unicode_CharOrder[] =
{
-CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00
-CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10
-SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20
-NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30
-SYM, 77, 90, 99,100, 72,109,107,101, 79,185, 81,102, 76, 94, 82, //40
-110,186,108, 91, 74,119, 84, 96,111,187,115,SYM,SYM,SYM,SYM,SYM, //50
-SYM, 65, 69, 70, 66, 63, 68,112,103, 92,194,104, 95, 86, 87, 71, //60
-116,195, 85, 93, 97,113,196,197,198,199,200,SYM,SYM,SYM,SYM,SYM, //70
-206,207,208,209,210,211,212,213,120,214,215,216,217,218,219,220, //80
-221, 78, 64, 83,121, 98,117,105,ILL,223,224,225,226,227,228,229, //90
- 88,230,231,232,233,122, 89,106,234,235,236,237,238, 45,239,240, //a0
- 73, 80,118,114,241,242,243,244,245, 62, 58,246,247,248,249,250, //b0
- 31, 32, 35, 43, 37, 44, 55, 47, 40, 59, 33, 46, 38, 36, 41, 30, //c0
- 39, 28, 34, 51, 48, 49, 53, 50, 54, 57, 61,251, 67,NUM, 60, 56, //d0
- 1, 18, 9, 20, 11, 3, 23, 15, 2, 26, 12, 10, 14, 6, 4, 13, //e0
- 7, 8, 5, 19, 29, 25, 22, 21, 27, 24, 17, 75, 52,SYM, 42, 16, //f0
+ 65, 25, 66, 44, 67, 36, 68, 42, 69, 29, 70, 48, 71, 46, 72, 43,
+ 73, 28, 75, 47, 76, 37, 77, 38, 78, 33, 79, 30, 80, 41, 82, 32,
+ 83, 34, 84, 35, 85, 39, 86, 49, 87, 50, 88, 51, 89, 45, 97, 25,
+ 98, 44, 99, 36, 100, 42, 101, 29, 102, 48, 103, 46, 104, 43, 105, 28,
+ 107, 47, 108, 37, 109, 38, 110, 33, 111, 30, 112, 41, 114, 32, 115, 34,
+ 116, 35, 117, 39, 118, 49, 119, 50, 120, 51, 121, 45, 1040, 0,1041, 20,
+ 1042, 8, 1043, 17, 1044, 11, 1045, 3, 1046, 21, 1047, 13, 1048, 1,1049, 24,
+ 1050, 10, 1051, 9, 1052, 14, 1053, 5, 1054, 2, 1055, 12, 1056, 6,1057, 7,
+ 1058, 4, 1059, 19, 1060, 26, 1061, 23, 1062, 22, 1063, 18, 1064, 31,1065, 27,
+ 1066, 16, 1068, 52, 1070, 40, 1071, 15, 1072, 0, 1073, 20, 1074, 8,1075, 17,
+ 1076, 11, 1077, 3, 1078, 21, 1079, 13, 1080, 1, 1081, 24, 1082, 10,1083, 9,
+ 1084, 14, 1085, 5, 1086, 2, 1087, 12, 1088, 6, 1089, 7, 1090, 4,1091, 19,
+ 1092, 26, 1093, 23, 1094, 22, 1095, 18, 1096, 31, 1097, 27, 1098, 16,1100, 52,
+ 1102, 40, 1103, 15,
};
-//Model Table:
-//total sequences: 100%
-//first 512 sequences: 96.9392%
-//first 1024 sequences:3.0618%
-//rest sequences: 0.2992%
-//negative sequences: 0.0020%
-static const PRUint8 BulgarianLangModel[] =
+
+/* Model Table:
+ * Total considered sequences: 1236 / 2809
+ * - Positive sequences: first 720 (0.9950164618425456)
+ * - Probable sequences: next 201 (921-720) (0.003986830525963603)
+ * - Neutral sequences: last 1888 (0.0009967076314908452)
+ * - Negative sequences: 1573 (off-ratio)
+ * Negative sequences: TODO
+ */
+static const PRUint8 BulgarianLangModel[] =
{
-0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
-3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
-3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,0,1,
-0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,3,0,3,1,0,
-0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
-3,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3,3,3,3,0,3,0,0,
-0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-3,2,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3,3,3,3,0,3,0,0,
-0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,1,3,3,3,3,2,2,2,1,1,2,0,1,0,1,0,0,
-0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,3,3,3,3,2,3,2,2,3,3,1,1,2,3,3,2,3,3,3,3,2,1,2,0,2,0,3,0,0,
-0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,3,3,3,3,1,3,3,3,3,3,2,3,2,3,3,3,3,3,2,3,3,1,3,0,3,0,2,0,0,
-0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,3,3,3,3,3,1,3,3,2,3,3,3,1,3,3,2,3,2,2,2,0,0,2,0,2,0,2,0,0,
-0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,3,3,1,2,2,3,2,1,1,2,0,2,0,0,0,0,
-1,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,3,3,3,3,2,3,3,1,2,3,2,2,2,3,3,3,3,3,2,2,3,1,2,0,2,1,2,0,0,
-0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,3,3,1,3,3,3,3,3,2,3,3,3,2,3,3,2,3,2,2,2,3,1,2,0,1,0,1,0,0,
-0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,3,3,3,3,3,3,3,3,1,1,1,2,2,1,3,1,3,2,2,3,0,0,1,0,1,0,1,0,0,
-0,0,0,1,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,3,3,2,2,3,2,2,3,1,2,1,1,1,2,3,1,3,1,2,2,0,1,1,1,1,0,1,0,0,
-0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,3,3,1,3,2,2,3,3,1,2,3,1,1,3,3,3,3,1,2,2,1,1,1,0,2,0,2,0,1,
-0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,2,2,3,3,3,2,2,1,1,2,0,2,0,1,0,0,
-0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
-3,0,1,2,1,3,3,2,3,3,3,3,3,2,3,2,1,0,3,1,2,1,2,1,2,3,2,1,0,1,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-1,1,1,2,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,1,3,3,2,3,3,2,2,2,0,1,0,0,
-0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-2,3,3,3,3,0,3,3,3,3,3,2,1,1,2,1,3,3,0,3,1,1,1,1,3,2,0,1,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
-3,3,2,2,2,3,3,3,3,3,3,3,3,3,3,3,1,1,3,1,3,3,2,3,2,2,2,3,0,2,0,0,
-0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-3,3,3,3,3,2,3,3,2,2,3,2,1,1,1,1,1,3,1,3,1,1,0,0,0,1,0,0,0,1,0,0,
-0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
-3,3,3,3,3,2,3,2,0,3,2,0,3,0,2,0,0,2,1,3,1,0,0,1,0,0,0,1,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,3,2,1,1,1,1,2,1,1,2,1,1,1,2,2,1,2,1,1,1,0,1,1,0,1,0,1,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,3,2,1,3,1,1,2,1,3,2,1,1,0,1,2,3,2,1,1,1,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-2,3,3,3,3,2,2,1,0,1,0,0,1,0,0,0,2,1,0,3,0,0,1,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,2,3,2,3,3,1,3,2,1,1,1,2,1,1,2,1,3,0,1,0,0,0,1,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-3,1,1,2,2,3,3,2,3,2,2,2,3,1,2,2,1,1,2,1,1,2,2,0,1,1,0,1,0,2,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-3,3,3,3,2,1,3,1,0,2,2,1,3,2,1,0,0,2,0,2,0,1,0,0,0,0,0,0,0,1,0,0,
-0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,
-3,3,3,3,3,3,1,2,0,2,3,1,2,3,2,0,1,3,1,2,1,1,1,0,0,1,0,0,2,2,2,3,
-2,2,2,2,1,2,1,1,2,2,1,1,2,0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1,1,0,1,
-3,3,3,3,3,2,1,2,2,1,2,0,2,0,1,0,1,2,1,2,1,1,0,0,0,1,0,1,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,
-3,3,2,3,3,1,1,3,1,0,3,2,1,0,0,0,1,2,0,2,0,1,0,0,0,1,0,1,2,1,2,2,
-1,1,1,1,1,1,1,2,2,2,1,1,1,1,1,1,1,0,1,2,1,1,1,0,0,0,0,0,1,1,0,0,
-3,1,0,1,0,2,3,2,2,2,3,2,2,2,2,2,1,0,2,1,2,1,1,1,0,1,2,1,2,2,2,1,
-1,1,2,2,2,2,1,2,1,1,0,1,2,1,2,2,2,1,1,1,0,1,1,1,1,2,0,1,0,0,0,0,
-2,3,2,3,3,0,0,2,1,0,2,1,0,0,0,0,2,3,0,2,0,0,0,0,0,1,0,0,2,0,1,2,
-2,1,2,1,2,2,1,1,1,2,1,1,1,0,1,2,2,1,1,1,1,1,0,1,1,1,0,0,1,2,0,0,
-3,3,2,2,3,0,2,3,1,1,2,0,0,0,1,0,0,2,0,2,0,0,0,1,0,1,0,1,2,0,2,2,
-1,1,1,1,2,1,0,1,2,2,2,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,1,0,0,
-2,3,2,3,3,0,0,3,0,1,1,0,1,0,0,0,2,2,1,2,0,0,0,0,0,0,0,0,2,0,1,2,
-2,2,1,1,1,1,1,2,2,2,1,0,2,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,
-3,3,3,3,2,2,2,2,2,0,2,1,1,1,1,2,1,2,1,1,0,2,0,1,0,1,0,0,2,0,1,2,
-1,1,1,1,1,1,1,2,2,1,1,0,2,0,1,0,2,0,0,1,1,1,0,0,2,0,0,0,1,1,0,0,
-2,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,2,0,0,1,1,0,0,0,0,0,0,1,2,0,1,2,
-2,2,2,1,1,2,1,1,2,2,2,1,2,0,1,1,1,1,1,1,0,1,1,1,1,0,0,1,1,1,0,0,
-2,3,3,3,3,0,2,2,0,2,1,0,0,0,1,1,1,2,0,2,0,0,0,3,0,0,0,0,2,0,2,2,
-1,1,1,2,1,2,1,1,2,2,2,1,2,0,1,1,1,0,1,1,1,1,0,2,1,0,0,0,1,1,0,0,
-2,3,3,3,3,0,2,1,0,0,2,0,0,0,0,0,1,2,0,2,0,0,0,0,0,0,0,0,2,0,1,2,
-1,1,1,2,1,1,1,1,2,2,2,0,1,0,1,1,1,0,0,1,1,1,0,0,1,0,0,0,0,1,0,0,
-3,3,2,2,3,0,1,0,1,0,0,0,0,0,0,0,1,1,0,3,0,0,0,0,0,0,0,0,1,0,2,2,
-1,1,1,1,1,2,1,1,2,2,1,2,2,1,0,1,1,1,1,1,0,1,0,0,1,0,0,0,1,1,0,0,
-3,1,0,1,0,2,2,2,2,3,2,1,1,1,2,3,0,0,1,0,2,1,1,0,1,1,1,1,2,1,1,1,
-1,2,2,1,2,1,2,2,1,1,0,1,2,1,2,2,1,1,1,0,0,1,1,1,2,1,0,1,0,0,0,0,
-2,1,0,1,0,3,1,2,2,2,2,1,2,2,1,1,1,0,2,1,2,2,1,1,2,1,1,0,2,1,1,1,
-1,2,2,2,2,2,2,2,1,2,0,1,1,0,2,1,1,1,1,1,0,0,1,1,1,1,0,1,0,0,0,0,
-2,1,1,1,1,2,2,2,2,1,2,2,2,1,2,2,1,1,2,1,2,3,2,2,1,1,1,1,0,1,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-2,2,2,3,2,0,1,2,0,1,2,1,1,0,1,0,1,2,1,2,0,0,0,1,1,0,0,0,1,0,0,2,
-1,1,0,0,1,1,0,1,1,1,1,0,2,0,1,1,1,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,
-2,0,0,0,0,1,2,2,2,2,2,2,2,1,2,1,1,1,1,1,1,1,0,1,1,1,1,1,2,1,1,1,
-1,2,2,2,2,1,1,2,1,2,1,1,1,0,2,1,2,1,1,1,0,2,1,1,1,1,0,1,0,0,0,0,
-3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,
-1,1,0,1,0,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-2,2,2,3,2,0,0,0,0,1,0,0,0,0,0,0,1,1,0,2,0,0,0,0,0,0,0,0,1,0,1,2,
-1,1,1,1,1,1,0,0,2,2,2,2,2,0,1,1,0,1,1,1,1,1,0,0,1,0,0,0,1,1,0,1,
-2,3,1,2,1,0,1,1,0,2,2,2,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0,0,1,0,1,2,
-1,1,1,1,2,1,1,1,1,1,1,1,1,0,1,1,0,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,
-2,2,2,2,2,0,0,2,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,0,2,2,
-1,1,1,1,1,0,0,1,2,1,1,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
-1,2,2,2,2,0,0,2,0,1,1,0,0,0,1,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,1,1,
-0,0,0,1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
-1,2,2,3,2,0,0,1,0,0,1,0,0,0,0,0,0,1,0,2,0,0,0,1,0,0,0,0,0,0,0,2,
-1,1,0,0,1,0,0,0,1,1,0,0,1,0,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
-2,1,2,2,2,1,2,1,2,2,1,1,2,1,1,1,0,1,1,1,1,2,0,1,0,1,1,1,1,0,1,1,
-1,1,2,1,1,1,1,1,1,0,0,1,2,1,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,
-1,0,0,1,3,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-2,2,2,2,1,0,0,1,0,2,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,2,0,0,1,
-0,2,0,1,0,0,1,1,2,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
-1,2,2,2,2,0,1,1,0,2,1,0,1,1,1,0,0,1,0,2,0,1,0,0,0,0,0,0,0,0,0,1,
-0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
-2,2,2,2,2,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
-0,1,0,1,1,1,0,0,1,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,
-2,0,1,0,0,1,2,1,1,1,1,1,1,2,2,1,0,0,1,0,1,0,0,0,0,1,1,1,1,0,0,0,
-1,1,2,1,1,1,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-2,2,1,2,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
-0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-1,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
-0,1,1,0,1,1,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,
-1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,2,0,0,2,0,1,0,0,1,0,0,1,
-1,1,0,0,1,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,
-1,1,1,1,1,1,1,2,0,0,0,0,0,0,2,1,0,1,1,0,0,1,1,1,0,1,0,0,0,0,0,0,
-2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,1,1,0,1,0,0,
-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
+ 1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,0,
+ 3,3,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,1,
+ 3,3,0,0,0,3,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,0,
+ 3,3,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,0,
+ 3,3,1,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,0,1,2,0,1,
+ 2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,2,
+ 3,3,3,3,3,3,2,3,3,3,3,3,2,3,2,3,3,3,3,3,2,2,3,3,1,0,
+ 3,1,0,0,0,3,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,
+ 3,0,0,1,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,1,3,1,3,3,3,2,3,3,3,0,2,3,0,0,
+ 3,0,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
+ 3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,0,1,3,1,0,0,
+ 1,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
+ 3,3,3,3,3,3,1,3,3,1,3,3,2,3,3,3,3,3,3,3,3,3,3,2,0,0,
+ 3,0,0,0,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,3,3,3,3,3,3,3,3,3,2,1,1,3,3,1,3,2,2,3,0,0,3,1,0,0,
+ 1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,2,
+ 3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,2,3,0,0,
+ 1,0,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
+ 3,3,3,3,3,3,3,3,3,3,3,1,1,0,1,3,3,0,3,3,1,0,2,2,0,0,
+ 0,0,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,1,3,3,0,0,
+ 2,0,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
+ 3,3,3,3,2,3,3,3,3,3,3,1,3,1,1,3,3,2,2,3,3,1,2,1,0,0,
+ 2,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,
+ 1,1,1,2,3,3,3,3,3,3,3,3,3,3,3,2,0,3,3,1,3,1,2,3,3,1,
+ 0,3,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,1,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,3,0,3,3,3,3,3,0,
+ 2,3,0,0,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,2,3,3,1,3,3,1,2,0,0,3,1,3,0,1,3,1,0,0,0,1,0,
+ 0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
+ 3,3,3,3,3,3,3,0,3,3,3,0,0,0,0,0,2,0,0,3,0,2,0,3,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,1,3,3,3,3,2,0,
+ 1,3,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,2,3,3,3,3,3,2,2,2,1,3,3,3,1,2,3,0,0,2,3,0,0,
+ 0,3,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,
+ 3,3,3,3,3,3,1,0,3,3,3,3,0,0,1,0,3,0,0,3,3,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,2,3,1,1,1,0,3,0,3,0,1,2,0,3,3,0,0,2,0,0,0,1,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,1,3,3,2,0,0,0,1,1,3,0,1,3,0,0,0,2,0,0,
+ 0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,
+ 1,0,3,3,3,3,2,3,3,3,3,3,1,2,2,0,2,2,3,0,3,0,3,1,0,0,
+ 1,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
+ 0,0,3,3,2,0,3,3,3,3,3,3,3,3,0,3,3,1,2,3,3,1,2,2,2,2,0,
+ 3,3,3,3,3,2,3,3,0,3,1,0,0,0,0,3,2,1,0,3,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
+ 3,3,3,3,3,3,2,0,1,0,0,0,0,0,0,3,1,0,0,3,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,3,3,0,3,3,3,3,3,3,3,3,0,3,3,1,2,1,3,2,3,3,1,2,0,
+ 0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,3,
+ 0,0,3,2,3,0,3,3,3,3,3,3,3,3,0,3,3,1,2,2,3,2,1,3,2,3,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,2,3,0,3,3,3,3,3,3,3,3,0,3,3,2,3,1,3,2,3,2,2,2,0,
+ 3,3,3,3,0,3,2,0,3,3,3,0,1,0,1,0,2,0,2,2,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,3,3,0,2,3,3,3,3,3,3,3,0,3,3,2,2,3,3,2,2,3,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,3,3,0,1,3,3,3,3,1,1,3,0,1,3,2,2,3,3,2,2,2,0,2,0,
+ 0,1,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,3,
+ 0,0,3,3,3,0,1,2,3,3,3,1,2,3,0,3,1,3,3,2,0,1,1,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,3,3,0,3,1,3,2,2,1,1,3,0,0,1,3,1,3,0,1,1,1,1,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,3,3,0,3,3,2,3,2,3,3,3,0,1,2,3,2,2,1,3,1,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,3,3,0,1,1,2,2,3,3,2,3,0,2,3,1,2,3,2,1,0,2,1,0,0,
+ 0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,3,3,0,1,2,2,1,1,1,3,3,0,3,0,1,3,3,2,0,0,0,0,1,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,3,1,0,3,3,3,3,3,3,3,1,0,2,2,0,2,1,2,2,2,2,0,2,0,
+ 1,2,0,0,3,3,3,3,2,3,2,3,2,3,2,0,0,3,3,0,2,3,3,1,2,0,
+ 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,3,3,0,3,1,2,3,0,3,2,3,0,2,2,3,1,2,1,1,1,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,3,3,0,2,1,2,2,1,2,1,3,0,0,2,1,1,2,1,1,2,1,1,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,3,3,0,2,2,1,2,2,2,1,2,0,1,1,0,1,3,1,1,1,0,1,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,3,3,0,3,3,1,0,3,3,1,2,0,1,1,1,3,2,0,0,1,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
+ 0,0,2,1,3,0,2,3,3,3,3,3,1,1,0,2,2,1,3,1,2,0,0,0,0,3,0,
+ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,3,3,0,3,3,1,1,1,2,1,3,0,1,2,2,0,2,2,1,1,0,1,1,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,2,2,0,2,1,2,1,1,1,3,1,0,0,0,1,0,1,3,0,1,0,1,0,0,
+ 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
+ 0,0,3,3,3,0,3,0,0,1,1,2,0,3,0,0,1,0,0,0,0,0,2,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,3,3,3,0,0,0,0,0,0,1,0,2,0,0,1,1,0,1,0,1,0,1,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 0,0,2,3,2,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,2,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
+ 0,0,2,1,2,0,0,0,0,2,1,1,1,2,0,1,0,0,0,1,0,0,1,2,0,2,0,
+ 0,1,3,0,0,1,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
-const SequenceModel Latin5BulgarianModel =
+
+const SequenceModel Windows_1251BulgarianModel =
{
- Latin5_BulgarianCharToOrderMap,
+ Windows_1251_CharToOrderMap,
BulgarianLangModel,
- 64,
- (float)0.969392,
+ 53,
+ (float)0.9990032923685092,
PR_FALSE,
- "ISO-8859-5",
+ "WINDOWS-1251",
"bg"
};
-const SequenceModel Win1251BulgarianModel =
+const SequenceModel Iso_8859_5BulgarianModel =
{
- win1251BulgarianCharToOrderMap,
+ Iso_8859_5_CharToOrderMap,
BulgarianLangModel,
- 64,
- (float)0.969392,
+ 53,
+ (float)0.9990032923685092,
PR_FALSE,
- "WINDOWS-1251",
+ "ISO-8859-5",
"bg"
};
+
+const LanguageModel BulgarianModel =
+{
+ "bg",
+ Unicode_CharOrder,
+ 106,
+ BulgarianLangModel,
+ 53,
+ 5,
+ (float)0.4581931350621217,
+ 29,
+ (float)0.03226223199877268,
+};
diff --git a/src/nsLanguageDetector.h b/src/nsLanguageDetector.h
index 5feb726..ff7cb4a 100644
--- a/src/nsLanguageDetector.h
+++ b/src/nsLanguageDetector.h
@@ -126,6 +126,7 @@ private:
};
extern const LanguageModel ArabicModel;
+extern const LanguageModel BulgarianModel;
extern const LanguageModel CroatianModel;
extern const LanguageModel CzechModel;
extern const LanguageModel DanishModel;
diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp
index 1006359..53f4c3a 100644
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@@ -93,6 +93,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
int j = 0;
langDetectors[i][j++] = new nsLanguageDetector(&ArabicModel);
+ langDetectors[i][j++] = new nsLanguageDetector(&BulgarianModel);
langDetectors[i][j++] = new nsLanguageDetector(&CroatianModel);
langDetectors[i][j++] = new nsLanguageDetector(&CzechModel);
langDetectors[i][j++] = new nsLanguageDetector(&DanishModel);
diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h
index 1dea490..f36e820 100644
--- a/src/nsMBCSGroupProber.h
+++ b/src/nsMBCSGroupProber.h
@@ -49,7 +49,7 @@
#include "nsEUCTWProber.h"
#define NUM_OF_PROBERS 8
-#define NUM_OF_LANGUAGES 31
+#define NUM_OF_LANGUAGES 32
class nsMBCSGroupProber: public nsCharSetProber {
public:
diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp
index 04b8c67..1b7da06 100644
--- a/src/nsSBCSGroupProber.cpp
+++ b/src/nsSBCSGroupProber.cpp
@@ -60,8 +60,8 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1253GreekModel);
- mProbers[n++] = new nsSingleByteCharSetProber(&Latin5BulgarianModel);
- mProbers[n++] = new nsSingleByteCharSetProber(&Win1251BulgarianModel);
+ mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5BulgarianModel);
+ mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1251BulgarianModel);
heb_prober_idx = n;
mProbers[n++] = hebprober;
diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h
index bccb9e1..d804b93 100644
--- a/src/nsSBCharSetProber.h
+++ b/src/nsSBCharSetProber.h
@@ -144,8 +144,8 @@ extern const SequenceModel Ibm855RussianModel;
extern const SequenceModel Iso_8859_7GreekModel;
extern const SequenceModel Windows_1253GreekModel;
-extern const SequenceModel Latin5BulgarianModel;
-extern const SequenceModel Win1251BulgarianModel;
+extern const SequenceModel Iso_8859_5BulgarianModel;
+extern const SequenceModel Windows_1251BulgarianModel;
extern const SequenceModel Iso_8859_2HungarianModel;
extern const SequenceModel Windows_1250HungarianModel;
diff --git a/test/bg/iso-8859-5.txt b/test/bg/iso-8859-5.txt
new file mode 100644
index 0000000..d1c7734
--- /dev/null
+++ b/test/bg/iso-8859-5.txt
@@ -0,0 +1,3 @@
+ (Marmota) - (Sciuridae), 14 , (Spermophilus citellus).
+
+ , .
diff --git a/test/bg/utf-8.txt b/test/bg/utf-8.txt
new file mode 100644
index 0000000..048b7ac
--- /dev/null
+++ b/test/bg/utf-8.txt
@@ -0,0 +1,3 @@
+Мармотите (Marmota) са бозайници - род гризачи от семейство катерицови (Sciuridae), включващ 14 вида, включващи групата на лалугерите (Spermophilus citellus).
+
+За разлика от родствената катерица, мармотът и лалугерът водят наземен начин на живот.