summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJehan <jehan@girinstud.io>2022-12-17 19:13:03 +0100
committerJehan <jehan@girinstud.io>2022-12-17 19:13:03 +0100
commit0fffc109b5fda11973c25617e71a57867c81f984 (patch)
treee344e1b915e3e74fd91eef836bff7cddb142dbc0
parentffb94e4a9d4da3fb60cd022d3eeffe12301f96bf (diff)
script, src, test: adding Belarusian support.
Support for UTF-8, Windows-1251 and ISO-8859-5. The test content comes from the page 'Суркі' on the Belarusian Wikipedia.
-rw-r--r--README.md4
-rw-r--r--script/BuildLangModelLogs/LangBelarusianModel.log240
-rw-r--r--script/langs/be.py58
-rw-r--r--src/CMakeLists.txt1
-rw-r--r--src/LangModels/LangBelarusianModel.cpp202
-rw-r--r--src/nsLanguageDetector.h1
-rw-r--r--src/nsMBCSGroupProber.cpp1
-rw-r--r--src/nsMBCSGroupProber.h2
-rw-r--r--src/nsSBCSGroupProber.cpp3
-rw-r--r--src/nsSBCSGroupProber.h2
-rw-r--r--src/nsSBCharSetProber.h3
-rw-r--r--test/be/iso-8859-5.txt3
-rw-r--r--test/be/utf-8.txt3
-rw-r--r--test/be/windows-1251.txt3
14 files changed, 524 insertions, 2 deletions
diff --git a/README.md b/README.md
index 28c085a..ddc0b45 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,10 @@ uchardet started as a C language binding of the original C++ implementation of t
* UTF-8
* ISO-8859-6
* WINDOWS-1256
+ * Belarusian
+ * UTF-8
+ * ISO-8859-5
+ * WINDOWS-1251
* Bulgarian
* UTF-8
* ISO-8859-5
diff --git a/script/BuildLangModelLogs/LangBelarusianModel.log b/script/BuildLangModelLogs/LangBelarusianModel.log
new file mode 100644
index 0000000..66bdc14
--- /dev/null
+++ b/script/BuildLangModelLogs/LangBelarusianModel.log
@@ -0,0 +1,240 @@
+= Logs of language model for Belarusian (be) =
+
+- Generated by BuildLangModel.py
+- Started: 2022-12-17 18:45:44.158196
+- Maximum depth: 4
+- Max number of pages: 200
+
+== Parsed pages ==
+
+Максім_Танк (revision 4282144)
+Польская Рэспубліка (1918—1939) (revision 4280541)
+Літаратурная прэмія імя Янкі Купалы (revision 4017964)
+Мядзел (revision 4262814)
+Каралеўская бібліятэка Швецыі (revision 4114661)
+Калоссе (1935) (revision 3858825)
+Наша Ніва (1991) (revision 4100218)
+Леанід Уладзіміравіч Маракоў (revision 4053060)
+Дзмітрый Браніслававіч Смольскі (revision 4282791)
+Васіль Філімонавіч Шавура (revision 3397335)
+Леанід Дранько-Майсюк (revision 4280504)
+Рэферэндум у Беларусі, 1995 (revision 4133742)
+Дзіцячая літаратура (revision 4215153)
+1990 (revision 3826851)
+Барысаў (паэма) (revision 3675556)
+Часопіс (revision 4062833)
+1940 (revision 4141940)
+Літаратурная прэмія імя Якуба Коласа (revision 3790577)
+Мікалай Дамашкевіч (revision 4124871)
+Паўночная Інгрыя (revision 4022023)
+Антон Браніслававіч Насілоўскі (revision 3575651)
+Джэймс Фенімар Купер (revision 3516371)
+Ханс Крысціян Андэрсен (revision 3845458)
+Virtual International Authority File (revision 4119042)
+1912 (revision 4201938)
+Кампазітар (revision 4086673)
+Парламенцкія выбары ў Беларусі (2012) (revision 4056679)
+Янка Купала (revision 4297880)
+Уладзімір Някляеў (revision 4061577)
+TUT.BY (revision 4254319)
+Гарады Мінскай вобласці (revision 4290488)
+Вільня (revision 4260328)
+Брэсцкая вобласць (revision 4095450)
+Украінская мова (revision 4281826)
+Сацыяльная сетка (revision 3501794)
+Джонатан Свіфт (revision 4047140)
+Мікалай Аляксеевіч Някрасаў (revision 4054879)
+Наша ніва (1920) (revision 3648798)
+Белсат (revision 4295169)
+Рэспубліка Сярэдняй Літвы (revision 4287459)
+Беларусь (revision 4283834)
+Віктар Дзмітрыевіч Смольскі (revision 3829868)
+Генадзь Пятровіч Пашкоў (revision 4254449)
+Знешняя палітыка Беларусі (revision 4258993)
+2012 (revision 4181555)
+Саюз пісьменнікаў СССР (revision 4039027)
+1995 (revision 3568939)
+Саюз Савецкіх Сацыялістычных Рэспублік (revision 3433404)
+Сістэма абазначэння аб’ектаў адміністрацыйна-тэрытарыяльнага падзелу (revision 2873336)
+XVIII (revision 4286695)
+Я. Шутовіч (revision 4063897)
+Заходняя Беларусь (revision 4189742)
+1958 (revision 4179116)
+Санкт-Пецярбург (revision 4297788)
+Аляксандр Паўлавіч Мацвееў (revision 4002064)
+Жодзіна (revision 4204566)
+Узда (revision 4263659)
+Анатоль Васільевіч Багатыроў (revision 4045167)
+Людміла Рублеўская (revision 4109306)
+Беларускі ПЭН-цэнтр (revision 4256051)
+Дзяржаўны літаратурны музей Янкі Купалы (revision 4258654)
+Пётр Паўлавіч Яршоў (revision 3212130)
+Выбары ў мясцовыя Саветы дэпутатаў Беларусі (1995) (revision 3419938)
+Беластоцкае ваяводства (1919—1939) (revision 4167163)
+Уладзімір Аляксеевіч Арлоў (revision 3996298)
+1948 (revision 4298274)
+1998 (revision 4169162)
+Андрэй Катлярчук (revision 4045257)
+Аляксей Камай (revision 4004900)
+БелаПАН (revision 4114047)
+Літаратура і мастацтва (1932) (revision 4226134)
+18 стагоддзе (revision 4286695)
+Якуб Колас (revision 4276306)
+6 лістапада (revision 4241889)
+Масква (revision 4293280)
+Столінскі раён (revision 4126133)
+БелТА (revision 4114101)
+Беларуская Энцыклапедыя імя Петруся Броўкі (revision 4131649)
+11 красавіка (revision 4257199)
+Беларуская мова (revision 4156511)
+Маскоўская кансерваторыя (revision 3240079)
+Ягор Аляксандравіч Марціновіч (revision 4224289)
+Фінляндыя (revision 4214425)
+Мікола Мятліцкі (revision 4283301)
+Народны артыст Беларусі (revision 4275698)
+Курган (паэма) (revision 4086218)
+Старыя Дарогі (revision 4204749)
+Слуцк (revision 4278680)
+Каралеўства Польскае, 1916—1918 (revision 4288202)
+Залаты апостраф (revision 4111782)
+Уладзімір Андрэевіч Калеснік (revision 4074048)
+Залатая літара (revision 4085127)
+Нарматыўны кантроль (revision 4228063)
+Вышэйшы Гаспадарчы суд Рэспублікі Беларусь (revision 4215415)
+Euronews (revision 4165755)
+2003 (revision 4206607)
+Л. Маракоў (revision 4053060)
+1957 (revision 4170762)
+Слуцкі раён (revision 4145373)
+Дзяржаўная прэмія БССР (revision 3316889)
+Энцыклапедыя гісторыі Беларусі (revision 4030685)
+Звязда (1917) (revision 4008703)
+2018 (revision 4289036)
+1986 (revision 3316291)
+Бухарская Народная Савецкая Рэспубліка (revision 2623266)
+1939 (revision 4148673)
+Дзеяслоў (2002) (revision 4049427)
+Руская мова (revision 4214240)
+Рафаэла Джаваньёлі (revision 2633449)
+Фёдар Анісімавіч Сурганаў (revision 4188740)
+Саюз пісьменнікаў Беларусі (2005) (revision 4262467)
+Беларуская дзяржаўная кансерваторыя (revision 4216964)
+2001 (revision 4204274)
+Ігнацы Масціцкі (revision 4002826)
+29 верасня (revision 4261890)
+Іван Андрэевіч Крылоў (revision 3874970)
+М. Шкялёнак (revision 4197856)
+1935 (revision 3316357)
+Тутэйшыя (фільм, 1993) (revision 3952769)
+Майскі пераварот (Польшча) (revision 2832232)
+Лацвянскі сельсавет (revision 3562080)
+Генадзь Пашкоў (revision 4254449)
+Сырмежскі сельсавет (revision 4077910)
+13 красавіка (revision 4201937)
+1994 (revision 4170911)
+Рабінавая ноч (revision 4262182)
+5 ліпеня (revision 4148432)
+Беларускае Палессе (revision 4277908)
+Віцебская вобласць (revision 4257032)
+Саюз вызвалення Беларусі (справа) (revision 4152192)
+Навагрудак (revision 4283306)
+Расійская дзяржава (1918—1920) (revision 4070494)
+Утварэнне Вялікага Княства Літоўскага (revision 4281842)
+Коўна (revision 4264967)
+1989 (revision 4170810)
+7 сакавіка (revision 4272372)
+Советская Белоруссия (revision 3941699)
+Саксафон (revision 4119817)
+Вікісховішча (revision 4276248)
+1915 (revision 4204295)
+Канстытуцыя Рэспублікі Беларусь (revision 4051195)
+Расійская імперыя (revision 4273900)
+1930-я (revision 3508427)
+Доктар гістарычных навук (revision 4036548)
+Віленскае ваяводства, 1926—1939 (revision 4010285)
+1767 (revision 4119132)
+Салігорск (revision 4285782)
+Горад (revision 4154288)
+Заходні Берлін (revision 4273163)
+Калійныя солі (revision 3812964)
+Паштовы індэкс (revision 2680497)
+1714 (revision 3317887)
+Джордж Харысан (revision 4129049)
+Аўстра-Венгрыя (revision 3868613)
+Антарктыка (revision 3997579)
+1956 (revision 4169991)
+Люфтвафэ (revision 3726645)
+Канстытуцыя Украіны (revision 2683533)
+Васіль Уладзіміравіч Быкаў (revision 4288405)
+1698 (revision 3448249)
+Бяларучы (revision 4294726)
+1950-я (revision 4204989)
+Выбаргскі раён (Ленінградская вобласць) (revision 3641710)
+Нацыянальная парламенцкая бібліятэка Японіі (revision 4020527)
+1934 (revision 4275604)
+XIX стагоддзе (revision 4286738)
+24 студзеня (revision 4268404)
+Вільнюскае гарадское самакіраванне (revision 3492972)
+Залаты Купідон (revision 4267601)
+Мінская вобласць (revision 4296852)
+Кантрольны нумар Бібліятэкі Кангрэса (revision 3491858)
+Пінск (revision 4286890)
+Нацыянальная бібліятэка Францыі (revision 4267432)
+10 студзеня (revision 3935845)
+Аляксандр Іванавіч Якімовіч (revision 4085685)
+Лужаснянскі дзяржаўны аграрны каледж імя Ф. А. Сурганава (revision 4171547)
+Рыта Леві-Мантальчыні (revision 4058476)
+
+== End of Parsed pages ==
+
+- Wikipedia parsing ended at: 2022-12-17 18:49:26.830622
+
+65 characters appeared 853773 times.
+
+Most Frequent characters:
+[ 0] Char а: 15.572991884259633 %
+[ 1] Char н: 6.632324985681206 %
+[ 2] Char і: 5.7941630855039925 %
+[ 3] Char р: 5.325888731548082 %
+[ 4] Char с: 5.02124100902699 %
+[ 5] Char к: 4.3536162422564315 %
+[ 6] Char ы: 4.066654719697156 %
+[ 7] Char л: 4.051428189928704 %
+[ 8] Char е: 3.6824776609239227 %
+[ 9] Char т: 3.4540797143971527 %
+[10] Char я: 3.1694607348791775 %
+[11] Char в: 3.116285007841663 %
+[12] Char д: 3.1063291999161367 %
+[13] Char о: 2.9540639022316237 %
+[14] Char у: 2.9458650015870727 %
+[15] Char м: 2.709385281567817 %
+[16] Char п: 2.6671023796723485 %
+[17] Char з: 2.22483025347487 %
+[18] Char ц: 1.998657722837335 %
+[19] Char г: 1.9463018858642753 %
+[20] Char ў: 1.9429051984543901 %
+[21] Char б: 1.610732595198021 %
+[22] Char э: 1.3249423441593962 %
+[23] Char ч: 1.3172119521231052 %
+[24] Char й: 1.1151676148109626 %
+[25] Char ь: 1.0356382785588207 %
+[26] Char х: 0.9220249410557607 %
+[27] Char ш: 0.7558215122755112 %
+[28] Char ж: 0.5403075524758923 %
+[29] Char ю: 0.4688599897162361 %
+[30] Char ф: 0.3941328666987595 %
+[31] Char i: 0.36391406146598687 %
+[32] Char e: 0.2897725742088354 %
+[33] Char ё: 0.28508748812623497 %
+
+The first 34 characters have an accumulated ratio of 0.9715966656242353.
+The first 6 characters have an accumulated ratio of 0.42700225938276326.
+All characters whose order is over 26 have an accumulated ratio of 0.030978960449674565.
+
+1518 sequences found.
+
+First 893 (typical positive ratio): 0.9950100888151092
+Next 272 (1165-893): 0.003995003102100991
+Rest: 0.0009949080827897916
+
+- Processing end: 2022-12-17 18:49:26.928946
diff --git a/script/langs/be.py b/script/langs/be.py
new file mode 100644
index 0000000..3dea248
--- /dev/null
+++ b/script/langs/be.py
@@ -0,0 +1,58 @@
+#!/bin/python3
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+import re
+
+## Mandatory Properties ##
+
+name = 'Belarusian'
+code = 'be'
+use_ascii = False
+charsets = [ 'WINDOWS-1251', 'ISO-8859-5' ]
+
+## Optional Properties ##
+
+# Alphabet characters.
+alphabet = 'абвгдеёжзійклмнопрстуўфхцчшыьэюя'
+# A starred page that was featured on the main page when I created
+# the data.
+start_pages = ['Максім_Танк']
+wikipedia_code = code
+case_mapping = True
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a0b607c..50cf70b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,6 +8,7 @@ set(
CharDistribution.cpp
JpCntx.cpp
LangModels/LangArabicModel.cpp
+ LangModels/LangBelarusianModel.cpp
LangModels/LangBulgarianModel.cpp
LangModels/LangCroatianModel.cpp
LangModels/LangCzechModel.cpp
diff --git a/src/LangModels/LangBelarusianModel.cpp b/src/LangModels/LangBelarusianModel.cpp
new file mode 100644
index 0000000..f013abe
--- /dev/null
+++ b/src/LangModels/LangBelarusianModel.cpp
@@ -0,0 +1,202 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla Communicator client code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 1998
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "../nsSBCharSetProber.h"
+#include "../nsLanguageDetector.h"
+
+/********* Language model for: Belarusian *********/
+
+/**
+ * Generated by BuildLangModel.py
+ * On: 2022-12-17 18:49:26.830966
+ **/
+
+/* Character Mapping Table:
+ * ILL: illegal character.
+ * CTR: control character specific to the charset.
+ * RET: carriage return or line feed (0x0D, 0x0A).
+ * SYM: symbol (punctuation) that does not belong to word.
+ * NUM: 0 - 9.
+ *
+ * Other characters are ordered by probabilities
+ * (0 is the most common character in the language).
+ *
+ * Orders are generic to a language. So the codepoint with order X in
+ * CHARSET1 maps to the same character as the codepoint with the same
+ * order X in CHARSET2 for the same language.
+ * As such, some orders may be missing from a given charset. For instance the
+ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
+ * even though they are both used for French. Same for the euro sign.
+ */
+static const unsigned char Windows_1251_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 34, 46, 42, 43, 32, 55, 49, 45, 31, 57, 51, 41, 48, 35, 36, /* 4X */
+ 47, 62, 38, 37, 39, 44, 50, 54, 53, 52, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 34, 46, 42, 43, 32, 55, 49, 45, 31, 57, 51, 41, 48, 35, 36, /* 6X */
+ 47, 62, 38, 37, 39, 44, 50, 54, 53, 52, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ 65, 66,SYM, 67,SYM,SYM,SYM,SYM,SYM,SYM, 68,SYM, 69, 70, 71, 64, /* 8X */
+ 72,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 73,SYM, 74, 75, 76, 64, /* 9X */
+ SYM, 20, 20, 77,SYM, 63,SYM,SYM, 33,SYM, 61,SYM,SYM,SYM,SYM, 60, /* AX */
+ SYM,SYM, 2, 2, 63,SYM,SYM,SYM, 33,SYM, 61,SYM, 78, 79, 80, 60, /* BX */
+ 0, 21, 11, 19, 12, 8, 28, 17, 40, 24, 5, 7, 15, 1, 13, 16, /* CX */
+ 3, 4, 9, 14, 30, 26, 18, 23, 27, 59, 58, 6, 25, 22, 29, 10, /* DX */
+ 0, 21, 11, 19, 12, 8, 28, 17, 40, 24, 5, 7, 15, 1, 13, 16, /* EX */
+ 3, 4, 9, 14, 30, 26, 18, 23, 27, 59, 58, 6, 25, 22, 29, 10, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const unsigned char Iso_8859_5_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 34, 46, 42, 43, 32, 55, 49, 45, 31, 57, 51, 41, 48, 35, 36, /* 4X */
+ 47, 62, 38, 37, 39, 44, 50, 54, 53, 52, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 34, 46, 42, 43, 32, 55, 49, 45, 31, 57, 51, 41, 48, 35, 36, /* 6X */
+ 47, 62, 38, 37, 39, 44, 50, 54, 53, 52, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
+ SYM, 33, 81, 82, 61, 83, 2, 60, 84, 85, 86, 87, 88,SYM, 20, 64, /* AX */
+ 0, 21, 11, 19, 12, 8, 28, 17, 40, 24, 5, 7, 15, 1, 13, 16, /* BX */
+ 3, 4, 9, 14, 30, 26, 18, 23, 27, 59, 58, 6, 25, 22, 29, 10, /* CX */
+ 0, 21, 11, 19, 12, 8, 28, 17, 40, 24, 5, 7, 15, 1, 13, 16, /* DX */
+ 3, 4, 9, 14, 30, 26, 18, 23, 27, 59, 58, 6, 25, 22, 29, 10, /* EX */
+ SYM, 33, 89, 90, 61, 91, 2, 60, 92, 93, 94, 95, 96,SYM, 20, 64, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const int Unicode_Char_size = 68;
+static const unsigned int Unicode_CharOrder[] =
+{
+ 69, 32, 73, 31, 101, 32, 105, 31, 1025, 33, 1030, 2, 1038, 20,1040, 0,
+ 1041, 21, 1042, 11, 1043, 19, 1044, 12, 1045, 8, 1046, 28, 1047, 17,1049, 24,
+ 1050, 5, 1051, 7, 1052, 15, 1053, 1, 1054, 13, 1055, 16, 1056, 3,1057, 4,
+ 1058, 9, 1059, 14, 1060, 30, 1061, 26, 1062, 18, 1063, 23, 1064, 27,1067, 6,
+ 1068, 25, 1069, 22, 1070, 29, 1071, 10, 1072, 0, 1073, 21, 1074, 11,1075, 19,
+ 1076, 12, 1077, 8, 1078, 28, 1079, 17, 1081, 24, 1082, 5, 1083, 7,1084, 15,
+ 1085, 1, 1086, 13, 1087, 16, 1088, 3, 1089, 4, 1090, 9, 1091, 14,1092, 30,
+ 1093, 26, 1094, 18, 1095, 23, 1096, 27, 1099, 6, 1100, 25, 1101, 22,1102, 29,
+ 1103, 10, 1105, 33, 1110, 2, 1118, 20,
+};
+
+
+/* Model Table:
+ * Total considered sequences: 1518 / 1156
+ * - Positive sequences: first 893 (0.9950100888151092)
+ * - Probable sequences: next 272 (1165-893) (0.003995003102100991)
+ * - Neutral sequences: last -9 (0.0009949080827897916)
+ * - Negative sequences: -362 (off-ratio)
+ * Negative sequences: TODO
+ */
+static const PRUint8 BelarusianLangModel[] =
+{
+ 3,3,3,3,3,3,0,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,1,0,3,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,0,2,3,3,0,3,2,3,3,3,3,2,0,3,
+ 3,3,3,3,3,3,1,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,1,3,3,0,3,3,3,3,3,1,0,3,
+ 3,3,2,2,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,0,3,3,3,1,2,3,3,3,1,3,0,0,1,
+ 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,3,2,0,3,3,3,0,3,3,2,0,3,3,1,0,3,
+ 3,3,3,3,3,3,1,3,3,3,2,3,2,3,3,3,3,3,3,2,0,2,1,1,0,0,0,2,0,2,1,2,0,2,
+ 2,3,3,3,3,3,0,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,1,3,3,0,3,3,3,3,3,1,0,3,
+ 3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,2,2,2,2,3,0,2,3,2,0,3,1,1,1,3,3,2,0,3,
+ 3,3,3,3,3,3,0,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,0,3,3,0,3,3,3,2,3,0,0,0,
+ 3,3,2,3,3,3,3,3,3,3,1,3,1,3,3,3,1,1,3,2,0,3,3,3,0,3,2,0,1,2,2,0,0,0,
+ 1,3,1,3,3,3,0,3,3,3,3,3,3,2,1,3,3,3,3,3,3,3,1,3,3,0,3,3,3,3,3,0,0,0,
+ 3,3,3,2,3,3,3,2,3,3,3,1,2,3,3,1,3,1,1,2,0,2,2,0,0,2,0,1,1,3,1,2,0,3,
+ 3,3,2,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,0,3,3,3,0,2,2,2,3,1,1,0,0,0,
+ 1,3,3,3,3,3,0,3,3,3,3,3,3,1,2,3,3,3,3,3,3,3,2,3,3,0,3,3,3,3,3,0,0,0,
+ 3,3,3,3,3,3,0,3,3,3,2,3,3,2,2,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,2,0,0,1,
+ 3,3,3,2,3,3,3,3,3,1,3,3,2,3,3,3,3,3,3,1,0,3,3,2,0,0,1,1,0,2,3,1,0,3,
+ 3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,1,2,2,3,0,0,3,3,3,0,0,1,3,0,0,1,1,0,3,
+ 3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,1,2,2,3,0,3,3,2,0,3,0,3,1,3,1,1,0,3,
+ 3,3,3,1,2,3,3,3,3,3,3,3,1,3,3,2,1,1,3,2,0,1,3,0,1,3,1,1,0,3,0,1,0,3,
+ 3,3,3,3,3,3,1,3,3,2,0,3,3,3,3,3,1,0,0,3,0,2,3,3,0,0,0,1,0,1,0,1,0,1,
+ 0,3,2,3,3,3,0,3,3,3,3,3,3,0,1,3,3,3,3,3,0,3,1,3,0,0,2,3,3,0,2,0,0,1,
+ 3,3,3,3,3,3,3,3,3,2,3,3,2,3,3,3,2,1,2,1,0,0,2,2,0,1,2,2,1,3,1,0,0,1,
+ 3,3,3,3,3,3,0,3,2,3,3,3,3,3,1,3,3,3,3,3,3,3,1,3,3,0,3,3,3,2,3,0,0,0,
+ 3,3,0,0,1,3,3,3,3,1,0,3,1,3,3,2,1,0,1,0,1,2,3,3,0,2,1,0,1,1,0,0,0,1,
+ 1,3,1,2,3,3,0,3,1,3,1,3,3,1,1,3,2,2,3,3,0,3,1,3,0,0,3,3,0,1,0,0,0,0,
+ 0,3,2,1,3,3,0,1,3,3,3,3,3,2,1,3,3,1,3,3,0,3,0,2,0,1,2,3,0,2,3,0,0,3,
+ 3,3,3,3,3,1,0,3,3,3,1,3,1,3,3,3,1,1,2,2,0,2,2,1,0,0,2,0,0,1,0,0,0,1,
+ 3,3,1,2,2,3,3,3,2,3,0,3,1,3,3,3,3,0,2,1,0,0,3,3,0,0,1,3,0,0,0,0,0,0,
+ 3,3,0,2,3,3,3,1,2,0,0,3,3,3,3,0,1,1,3,1,0,3,3,2,0,1,0,0,3,0,0,0,0,0,
+ 1,3,2,3,3,3,0,3,2,3,0,2,3,0,0,2,0,3,3,2,0,3,1,3,0,0,2,3,2,3,3,0,0,0,
+ 3,2,3,3,3,2,2,3,3,3,2,0,0,3,3,1,0,1,1,1,0,0,3,0,0,0,0,1,0,2,2,1,0,3,
+ 0,2,1,1,1,2,0,1,0,1,0,0,1,0,0,1,0,1,1,1,0,0,0,2,0,0,0,0,0,0,0,3,3,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,0,
+ 0,3,0,3,3,3,0,3,0,3,0,3,3,0,0,3,2,3,1,3,3,1,1,0,3,0,0,0,1,2,0,0,0,0,
+};
+
+
+const SequenceModel Windows_1251BelarusianModel =
+{
+ Windows_1251_CharToOrderMap,
+ BelarusianLangModel,
+ 34,
+ (float)0.9990050919172102,
+ PR_FALSE,
+ "WINDOWS-1251",
+ "be"
+};
+
+const SequenceModel Iso_8859_5BelarusianModel =
+{
+ Iso_8859_5_CharToOrderMap,
+ BelarusianLangModel,
+ 34,
+ (float)0.9990050919172102,
+ PR_FALSE,
+ "ISO-8859-5",
+ "be"
+};
+
+const LanguageModel BelarusianModel =
+{
+ "be",
+ Unicode_CharOrder,
+ 68,
+ BelarusianLangModel,
+ 34,
+ 6,
+ (float)0.42700225938276326,
+ 26,
+ (float)0.030978960449674565,
+};
diff --git a/src/nsLanguageDetector.h b/src/nsLanguageDetector.h
index ff7cb4a..8cff540 100644
--- a/src/nsLanguageDetector.h
+++ b/src/nsLanguageDetector.h
@@ -126,6 +126,7 @@ private:
};
extern const LanguageModel ArabicModel;
+extern const LanguageModel BelarusianModel;
extern const LanguageModel BulgarianModel;
extern const LanguageModel CroatianModel;
extern const LanguageModel CzechModel;
diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp
index 53f4c3a..72e907a 100644
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@@ -93,6 +93,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
int j = 0;
langDetectors[i][j++] = new nsLanguageDetector(&ArabicModel);
+ langDetectors[i][j++] = new nsLanguageDetector(&BelarusianModel);
langDetectors[i][j++] = new nsLanguageDetector(&BulgarianModel);
langDetectors[i][j++] = new nsLanguageDetector(&CroatianModel);
langDetectors[i][j++] = new nsLanguageDetector(&CzechModel);
diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h
index f36e820..b508eb4 100644
--- a/src/nsMBCSGroupProber.h
+++ b/src/nsMBCSGroupProber.h
@@ -49,7 +49,7 @@
#include "nsEUCTWProber.h"
#define NUM_OF_PROBERS 8
-#define NUM_OF_LANGUAGES 32
+#define NUM_OF_LANGUAGES 33
class nsMBCSGroupProber: public nsCharSetProber {
public:
diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp
index 1b7da06..e6ce015 100644
--- a/src/nsSBCSGroupProber.cpp
+++ b/src/nsSBCSGroupProber.cpp
@@ -214,6 +214,9 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_1EnglishModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1252EnglishModel);
+ mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1251BelarusianModel);
+ mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5BelarusianModel);
+
Reset();
}
diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h
index 57102a0..3d42110 100644
--- a/src/nsSBCSGroupProber.h
+++ b/src/nsSBCSGroupProber.h
@@ -40,7 +40,7 @@
#define nsSBCSGroupProber_h__
-#define NUM_OF_SBCS_PROBERS 109
+#define NUM_OF_SBCS_PROBERS 111
class nsCharSetProber;
class nsSBCSGroupProber: public nsCharSetProber {
diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h
index d804b93..b274390 100644
--- a/src/nsSBCharSetProber.h
+++ b/src/nsSBCharSetProber.h
@@ -144,6 +144,9 @@ extern const SequenceModel Ibm855RussianModel;
extern const SequenceModel Iso_8859_7GreekModel;
extern const SequenceModel Windows_1253GreekModel;
+extern const SequenceModel Iso_8859_5BelarusianModel;
+extern const SequenceModel Windows_1251BelarusianModel;
+
extern const SequenceModel Iso_8859_5BulgarianModel;
extern const SequenceModel Windows_1251BulgarianModel;
diff --git a/test/be/iso-8859-5.txt b/test/be/iso-8859-5.txt
new file mode 100644
index 0000000..afec761
--- /dev/null
+++ b/test/be/iso-8859-5.txt
@@ -0,0 +1,3 @@
+ (Marmota), , .
+
+ 15 , . -- . , . , , . , , .
diff --git a/test/be/utf-8.txt b/test/be/utf-8.txt
new file mode 100644
index 0000000..7fc7177
--- /dev/null
+++ b/test/be/utf-8.txt
@@ -0,0 +1,3 @@
+Суркі (Marmota), сысуны, прадстаўнікі атраду грызуноў.
+
+На Зямлі існуе 15 відаў суркоў, якія маюць агульнага продка. Прарадзіма суркоў — Амерыка. У той час як большасць жывёл рухалася з Еўразіі ў Амерыку, суркі з Амерыкі перабіраліся ў Азію. Розныя віды абасобіліся ў розных геаграфічных зонах і адрозніваюцца асаблівасцямі паводзін, але захавалі знешнюю падобнасць, неабходнасць упадаць у спячку і жыццё ў калоніях. Усе суркі траваядныя, жывуць у норах, маюць цёплае футра.
diff --git a/test/be/windows-1251.txt b/test/be/windows-1251.txt
new file mode 100644
index 0000000..1d82ec8
--- /dev/null
+++ b/test/be/windows-1251.txt
@@ -0,0 +1,3 @@
+ (Marmota), , .
+
+ 15 , .  . Ţ糳 , . , , . , , .