diff options
author | Jehan <jehan@girinstud.io> | 2022-12-17 19:13:03 +0100 |
---|---|---|
committer | Jehan <jehan@girinstud.io> | 2022-12-17 19:13:03 +0100 |
commit | 0fffc109b5fda11973c25617e71a57867c81f984 (patch) | |
tree | e344e1b915e3e74fd91eef836bff7cddb142dbc0 | |
parent | ffb94e4a9d4da3fb60cd022d3eeffe12301f96bf (diff) |
script, src, test: adding Belarusian support.
Support for UTF-8, Windows-1251 and ISO-8859-5.
The test contents come from the page 'Суркі' on the Belarusian Wikipedia.
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | script/BuildLangModelLogs/LangBelarusianModel.log | 240 | ||||
-rw-r--r-- | script/langs/be.py | 58 | ||||
-rw-r--r-- | src/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/LangModels/LangBelarusianModel.cpp | 202 | ||||
-rw-r--r-- | src/nsLanguageDetector.h | 1 | ||||
-rw-r--r-- | src/nsMBCSGroupProber.cpp | 1 | ||||
-rw-r--r-- | src/nsMBCSGroupProber.h | 2 | ||||
-rw-r--r-- | src/nsSBCSGroupProber.cpp | 3 | ||||
-rw-r--r-- | src/nsSBCSGroupProber.h | 2 | ||||
-rw-r--r-- | src/nsSBCharSetProber.h | 3 | ||||
-rw-r--r-- | test/be/iso-8859-5.txt | 3 | ||||
-rw-r--r-- | test/be/utf-8.txt | 3 | ||||
-rw-r--r-- | test/be/windows-1251.txt | 3 |
14 files changed, 524 insertions, 2 deletions
@@ -17,6 +17,10 @@ uchardet started as a C language binding of the original C++ implementation of t * UTF-8 * ISO-8859-6 * WINDOWS-1256 + * Belarusian + * UTF-8 + * ISO-8859-5 + * WINDOWS-1251 * Bulgarian * UTF-8 * ISO-8859-5 diff --git a/script/BuildLangModelLogs/LangBelarusianModel.log b/script/BuildLangModelLogs/LangBelarusianModel.log new file mode 100644 index 0000000..66bdc14 --- /dev/null +++ b/script/BuildLangModelLogs/LangBelarusianModel.log @@ -0,0 +1,240 @@ += Logs of language model for Belarusian (be) = + +- Generated by BuildLangModel.py +- Started: 2022-12-17 18:45:44.158196 +- Maximum depth: 4 +- Max number of pages: 200 + +== Parsed pages == + +Максім_Танк (revision 4282144) +Польская Рэспубліка (1918—1939) (revision 4280541) +Літаратурная прэмія імя Янкі Купалы (revision 4017964) +Мядзел (revision 4262814) +Каралеўская бібліятэка Швецыі (revision 4114661) +Калоссе (1935) (revision 3858825) +Наша Ніва (1991) (revision 4100218) +Леанід Уладзіміравіч Маракоў (revision 4053060) +Дзмітрый Браніслававіч Смольскі (revision 4282791) +Васіль Філімонавіч Шавура (revision 3397335) +Леанід Дранько-Майсюк (revision 4280504) +Рэферэндум у Беларусі, 1995 (revision 4133742) +Дзіцячая літаратура (revision 4215153) +1990 (revision 3826851) +Барысаў (паэма) (revision 3675556) +Часопіс (revision 4062833) +1940 (revision 4141940) +Літаратурная прэмія імя Якуба Коласа (revision 3790577) +Мікалай Дамашкевіч (revision 4124871) +Паўночная Інгрыя (revision 4022023) +Антон Браніслававіч Насілоўскі (revision 3575651) +Джэймс Фенімар Купер (revision 3516371) +Ханс Крысціян Андэрсен (revision 3845458) +Virtual International Authority File (revision 4119042) +1912 (revision 4201938) +Кампазітар (revision 4086673) +Парламенцкія выбары ў Беларусі (2012) (revision 4056679) +Янка Купала (revision 4297880) +Уладзімір Някляеў (revision 4061577) +TUT.BY (revision 4254319) +Гарады Мінскай вобласці (revision 4290488) +Вільня (revision 4260328) +Брэсцкая вобласць (revision 4095450) 
+Украінская мова (revision 4281826) +Сацыяльная сетка (revision 3501794) +Джонатан Свіфт (revision 4047140) +Мікалай Аляксеевіч Някрасаў (revision 4054879) +Наша ніва (1920) (revision 3648798) +Белсат (revision 4295169) +Рэспубліка Сярэдняй Літвы (revision 4287459) +Беларусь (revision 4283834) +Віктар Дзмітрыевіч Смольскі (revision 3829868) +Генадзь Пятровіч Пашкоў (revision 4254449) +Знешняя палітыка Беларусі (revision 4258993) +2012 (revision 4181555) +Саюз пісьменнікаў СССР (revision 4039027) +1995 (revision 3568939) +Саюз Савецкіх Сацыялістычных Рэспублік (revision 3433404) +Сістэма абазначэння аб’ектаў адміністрацыйна-тэрытарыяльнага падзелу (revision 2873336) +XVIII (revision 4286695) +Я. Шутовіч (revision 4063897) +Заходняя Беларусь (revision 4189742) +1958 (revision 4179116) +Санкт-Пецярбург (revision 4297788) +Аляксандр Паўлавіч Мацвееў (revision 4002064) +Жодзіна (revision 4204566) +Узда (revision 4263659) +Анатоль Васільевіч Багатыроў (revision 4045167) +Людміла Рублеўская (revision 4109306) +Беларускі ПЭН-цэнтр (revision 4256051) +Дзяржаўны літаратурны музей Янкі Купалы (revision 4258654) +Пётр Паўлавіч Яршоў (revision 3212130) +Выбары ў мясцовыя Саветы дэпутатаў Беларусі (1995) (revision 3419938) +Беластоцкае ваяводства (1919—1939) (revision 4167163) +Уладзімір Аляксеевіч Арлоў (revision 3996298) +1948 (revision 4298274) +1998 (revision 4169162) +Андрэй Катлярчук (revision 4045257) +Аляксей Камай (revision 4004900) +БелаПАН (revision 4114047) +Літаратура і мастацтва (1932) (revision 4226134) +18 стагоддзе (revision 4286695) +Якуб Колас (revision 4276306) +6 лістапада (revision 4241889) +Масква (revision 4293280) +Столінскі раён (revision 4126133) +БелТА (revision 4114101) +Беларуская Энцыклапедыя імя Петруся Броўкі (revision 4131649) +11 красавіка (revision 4257199) +Беларуская мова (revision 4156511) +Маскоўская кансерваторыя (revision 3240079) +Ягор Аляксандравіч Марціновіч (revision 4224289) +Фінляндыя (revision 4214425) +Мікола Мятліцкі (revision 
4283301) +Народны артыст Беларусі (revision 4275698) +Курган (паэма) (revision 4086218) +Старыя Дарогі (revision 4204749) +Слуцк (revision 4278680) +Каралеўства Польскае, 1916—1918 (revision 4288202) +Залаты апостраф (revision 4111782) +Уладзімір Андрэевіч Калеснік (revision 4074048) +Залатая літара (revision 4085127) +Нарматыўны кантроль (revision 4228063) +Вышэйшы Гаспадарчы суд Рэспублікі Беларусь (revision 4215415) +Euronews (revision 4165755) +2003 (revision 4206607) +Л. Маракоў (revision 4053060) +1957 (revision 4170762) +Слуцкі раён (revision 4145373) +Дзяржаўная прэмія БССР (revision 3316889) +Энцыклапедыя гісторыі Беларусі (revision 4030685) +Звязда (1917) (revision 4008703) +2018 (revision 4289036) +1986 (revision 3316291) +Бухарская Народная Савецкая Рэспубліка (revision 2623266) +1939 (revision 4148673) +Дзеяслоў (2002) (revision 4049427) +Руская мова (revision 4214240) +Рафаэла Джаваньёлі (revision 2633449) +Фёдар Анісімавіч Сурганаў (revision 4188740) +Саюз пісьменнікаў Беларусі (2005) (revision 4262467) +Беларуская дзяржаўная кансерваторыя (revision 4216964) +2001 (revision 4204274) +Ігнацы Масціцкі (revision 4002826) +29 верасня (revision 4261890) +Іван Андрэевіч Крылоў (revision 3874970) +М. 
Шкялёнак (revision 4197856) +1935 (revision 3316357) +Тутэйшыя (фільм, 1993) (revision 3952769) +Майскі пераварот (Польшча) (revision 2832232) +Лацвянскі сельсавет (revision 3562080) +Генадзь Пашкоў (revision 4254449) +Сырмежскі сельсавет (revision 4077910) +13 красавіка (revision 4201937) +1994 (revision 4170911) +Рабінавая ноч (revision 4262182) +5 ліпеня (revision 4148432) +Беларускае Палессе (revision 4277908) +Віцебская вобласць (revision 4257032) +Саюз вызвалення Беларусі (справа) (revision 4152192) +Навагрудак (revision 4283306) +Расійская дзяржава (1918—1920) (revision 4070494) +Утварэнне Вялікага Княства Літоўскага (revision 4281842) +Коўна (revision 4264967) +1989 (revision 4170810) +7 сакавіка (revision 4272372) +Советская Белоруссия (revision 3941699) +Саксафон (revision 4119817) +Вікісховішча (revision 4276248) +1915 (revision 4204295) +Канстытуцыя Рэспублікі Беларусь (revision 4051195) +Расійская імперыя (revision 4273900) +1930-я (revision 3508427) +Доктар гістарычных навук (revision 4036548) +Віленскае ваяводства, 1926—1939 (revision 4010285) +1767 (revision 4119132) +Салігорск (revision 4285782) +Горад (revision 4154288) +Заходні Берлін (revision 4273163) +Калійныя солі (revision 3812964) +Паштовы індэкс (revision 2680497) +1714 (revision 3317887) +Джордж Харысан (revision 4129049) +Аўстра-Венгрыя (revision 3868613) +Антарктыка (revision 3997579) +1956 (revision 4169991) +Люфтвафэ (revision 3726645) +Канстытуцыя Украіны (revision 2683533) +Васіль Уладзіміравіч Быкаў (revision 4288405) +1698 (revision 3448249) +Бяларучы (revision 4294726) +1950-я (revision 4204989) +Выбаргскі раён (Ленінградская вобласць) (revision 3641710) +Нацыянальная парламенцкая бібліятэка Японіі (revision 4020527) +1934 (revision 4275604) +XIX стагоддзе (revision 4286738) +24 студзеня (revision 4268404) +Вільнюскае гарадское самакіраванне (revision 3492972) +Залаты Купідон (revision 4267601) +Мінская вобласць (revision 4296852) +Кантрольны нумар Бібліятэкі Кангрэса (revision 
3491858) +Пінск (revision 4286890) +Нацыянальная бібліятэка Францыі (revision 4267432) +10 студзеня (revision 3935845) +Аляксандр Іванавіч Якімовіч (revision 4085685) +Лужаснянскі дзяржаўны аграрны каледж імя Ф. А. Сурганава (revision 4171547) +Рыта Леві-Мантальчыні (revision 4058476) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2022-12-17 18:49:26.830622 + +65 characters appeared 853773 times. + +Most Frequent characters: +[ 0] Char а: 15.572991884259633 % +[ 1] Char н: 6.632324985681206 % +[ 2] Char і: 5.7941630855039925 % +[ 3] Char р: 5.325888731548082 % +[ 4] Char с: 5.02124100902699 % +[ 5] Char к: 4.3536162422564315 % +[ 6] Char ы: 4.066654719697156 % +[ 7] Char л: 4.051428189928704 % +[ 8] Char е: 3.6824776609239227 % +[ 9] Char т: 3.4540797143971527 % +[10] Char я: 3.1694607348791775 % +[11] Char в: 3.116285007841663 % +[12] Char д: 3.1063291999161367 % +[13] Char о: 2.9540639022316237 % +[14] Char у: 2.9458650015870727 % +[15] Char м: 2.709385281567817 % +[16] Char п: 2.6671023796723485 % +[17] Char з: 2.22483025347487 % +[18] Char ц: 1.998657722837335 % +[19] Char г: 1.9463018858642753 % +[20] Char ў: 1.9429051984543901 % +[21] Char б: 1.610732595198021 % +[22] Char э: 1.3249423441593962 % +[23] Char ч: 1.3172119521231052 % +[24] Char й: 1.1151676148109626 % +[25] Char ь: 1.0356382785588207 % +[26] Char х: 0.9220249410557607 % +[27] Char ш: 0.7558215122755112 % +[28] Char ж: 0.5403075524758923 % +[29] Char ю: 0.4688599897162361 % +[30] Char ф: 0.3941328666987595 % +[31] Char i: 0.36391406146598687 % +[32] Char e: 0.2897725742088354 % +[33] Char ё: 0.28508748812623497 % + +The first 34 characters have an accumulated ratio of 0.9715966656242353. +The first 6 characters have an accumulated ratio of 0.42700225938276326. +All characters whose order is over 26 have an accumulated ratio of 0.030978960449674565. + +1518 sequences found. 
+ +First 893 (typical positive ratio): 0.9950100888151092 +Next 272 (1165-893): 0.003995003102100991 +Rest: 0.0009949080827897916 + +- Processing end: 2022-12-17 18:49:26.928946 diff --git a/script/langs/be.py b/script/langs/be.py new file mode 100644 index 0000000..3dea248 --- /dev/null +++ b/script/langs/be.py @@ -0,0 +1,58 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. 
If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Belarusian' +code = 'be' +use_ascii = False +charsets = [ 'WINDOWS-1251', 'ISO-8859-5' ] + +## Optional Properties ## + +# Alphabet characters. +alphabet = 'абвгдеёжзійклмнопрстуўфхцчшыьэюя' +# A starred page which was rewarded on the main page when I created +# the data. +start_pages = ['Максім_Танк'] +wikipedia_code = code +case_mapping = True diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a0b607c..50cf70b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,6 +8,7 @@ set( CharDistribution.cpp JpCntx.cpp LangModels/LangArabicModel.cpp + LangModels/LangBelarusianModel.cpp LangModels/LangBulgarianModel.cpp LangModels/LangCroatianModel.cpp LangModels/LangCzechModel.cpp diff --git a/src/LangModels/LangBelarusianModel.cpp b/src/LangModels/LangBelarusianModel.cpp new file mode 100644 index 0000000..f013abe --- /dev/null +++ b/src/LangModels/LangBelarusianModel.cpp @@ -0,0 +1,202 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. 
+ * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" +#include "../nsLanguageDetector.h" + +/********* Language model for: Belarusian *********/ + +/** + * Generated by BuildLangModel.py + * On: 2022-12-17 18:49:26.830966 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Windows_1251_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 34, 46, 42, 43, 32, 55, 49, 45, 31, 57, 51, 41, 48, 35, 36, /* 4X */ + 47, 62, 38, 37, 39, 44, 50, 54, 53, 52, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 34, 46, 42, 43, 32, 55, 49, 45, 31, 57, 51, 41, 48, 35, 36, /* 6X */ + 47, 62, 38, 37, 39, 44, 50, 54, 53, 52, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 65, 66,SYM, 67,SYM,SYM,SYM,SYM,SYM,SYM, 68,SYM, 69, 70, 71, 64, /* 8X */ + 72,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 73,SYM, 74, 75, 76, 64, /* 9X */ + SYM, 20, 20, 77,SYM, 63,SYM,SYM, 33,SYM, 61,SYM,SYM,SYM,SYM, 60, /* AX */ + SYM,SYM, 2, 2, 63,SYM,SYM,SYM, 33,SYM, 61,SYM, 78, 79, 80, 60, /* BX */ + 0, 21, 11, 19, 12, 8, 28, 17, 40, 24, 5, 7, 15, 1, 13, 16, /* CX */ + 3, 4, 9, 14, 30, 26, 18, 23, 27, 59, 58, 6, 25, 22, 29, 10, /* DX */ + 0, 21, 11, 19, 12, 8, 28, 17, 40, 24, 5, 7, 15, 1, 13, 16, /* EX */ + 3, 4, 9, 14, 30, 26, 18, 23, 27, 59, 58, 6, 25, 22, 29, 10, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_5_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 34, 46, 42, 43, 32, 55, 49, 45, 31, 57, 51, 41, 48, 35, 36, /* 4X */ + 47, 62, 38, 37, 39, 44, 50, 54, 53, 52, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 34, 46, 42, 43, 32, 55, 49, 45, 31, 57, 51, 41, 48, 35, 36, /* 6X */ + 47, 62, 38, 37, 39, 44, 50, 54, 53, 52, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 33, 81, 82, 61, 83, 2, 60, 84, 85, 86, 87, 88,SYM, 20, 64, /* AX */ + 0, 21, 11, 19, 12, 8, 28, 17, 40, 24, 5, 7, 15, 1, 13, 16, /* BX */ + 3, 4, 9, 14, 30, 26, 18, 23, 27, 59, 58, 6, 25, 22, 29, 10, /* CX */ + 0, 21, 11, 19, 12, 8, 28, 17, 40, 24, 5, 7, 15, 1, 13, 16, /* DX */ + 3, 4, 9, 14, 30, 26, 18, 23, 27, 59, 58, 6, 25, 22, 29, 10, /* EX */ + SYM, 33, 89, 90, 61, 91, 2, 60, 92, 93, 94, 95, 96,SYM, 20, 64, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const int Unicode_Char_size = 68; +static const unsigned int Unicode_CharOrder[] = +{ + 69, 32, 73, 31, 101, 32, 105, 31, 1025, 33, 1030, 2, 1038, 20,1040, 0, + 1041, 21, 1042, 11, 1043, 19, 1044, 12, 1045, 8, 1046, 28, 1047, 17,1049, 24, + 1050, 5, 1051, 7, 1052, 15, 1053, 1, 1054, 13, 1055, 16, 1056, 3,1057, 4, + 1058, 9, 1059, 14, 1060, 30, 1061, 26, 1062, 18, 1063, 23, 1064, 27,1067, 6, + 1068, 25, 1069, 22, 1070, 29, 1071, 10, 1072, 0, 1073, 21, 1074, 11,1075, 19, + 1076, 12, 1077, 8, 1078, 28, 1079, 17, 1081, 24, 1082, 5, 1083, 7,1084, 15, + 1085, 1, 1086, 13, 1087, 16, 1088, 3, 1089, 4, 1090, 9, 1091, 14,1092, 30, + 1093, 26, 1094, 18, 1095, 23, 1096, 27, 1099, 6, 1100, 25, 1101, 22,1102, 29, + 1103, 10, 1105, 33, 1110, 2, 1118, 20, +}; + + +/* Model Table: + * Total considered sequences: 1518 / 1156 + * - Positive sequences: first 893 (0.9950100888151092) + * - Probable sequences: next 272 (1165-893) (0.003995003102100991) + * - Neutral sequences: last -9 (0.0009949080827897916) + * - Negative sequences: -362 (off-ratio) + * Negative sequences: TODO + */ +static const PRUint8 BelarusianLangModel[] = +{ + 3,3,3,3,3,3,0,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,1,0,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,0,2,3,3,0,3,2,3,3,3,3,2,0,3, + 3,3,3,3,3,3,1,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,1,3,3,0,3,3,3,3,3,1,0,3, + 
3,3,2,2,3,3,3,3,3,3,2,3,3,3,3,3,3,2,3,3,0,3,3,3,1,2,3,3,3,1,3,0,0,1, + 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,3,2,0,3,3,3,0,3,3,2,0,3,3,1,0,3, + 3,3,3,3,3,3,1,3,3,3,2,3,2,3,3,3,3,3,3,2,0,2,1,1,0,0,0,2,0,2,1,2,0,2, + 2,3,3,3,3,3,0,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,1,3,3,0,3,3,3,3,3,1,0,3, + 3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,2,2,2,2,3,0,2,3,2,0,3,1,1,1,3,3,2,0,3, + 3,3,3,3,3,3,0,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,0,3,3,0,3,3,3,2,3,0,0,0, + 3,3,2,3,3,3,3,3,3,3,1,3,1,3,3,3,1,1,3,2,0,3,3,3,0,3,2,0,1,2,2,0,0,0, + 1,3,1,3,3,3,0,3,3,3,3,3,3,2,1,3,3,3,3,3,3,3,1,3,3,0,3,3,3,3,3,0,0,0, + 3,3,3,2,3,3,3,2,3,3,3,1,2,3,3,1,3,1,1,2,0,2,2,0,0,2,0,1,1,3,1,2,0,3, + 3,3,2,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,0,3,3,3,0,2,2,2,3,1,1,0,0,0, + 1,3,3,3,3,3,0,3,3,3,3,3,3,1,2,3,3,3,3,3,3,3,2,3,3,0,3,3,3,3,3,0,0,0, + 3,3,3,3,3,3,0,3,3,3,2,3,3,2,2,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,2,0,0,1, + 3,3,3,2,3,3,3,3,3,1,3,3,2,3,3,3,3,3,3,1,0,3,3,2,0,0,1,1,0,2,3,1,0,3, + 3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,1,2,2,3,0,0,3,3,3,0,0,1,3,0,0,1,1,0,3, + 3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3,1,2,2,3,0,3,3,2,0,3,0,3,1,3,1,1,0,3, + 3,3,3,1,2,3,3,3,3,3,3,3,1,3,3,2,1,1,3,2,0,1,3,0,1,3,1,1,0,3,0,1,0,3, + 3,3,3,3,3,3,1,3,3,2,0,3,3,3,3,3,1,0,0,3,0,2,3,3,0,0,0,1,0,1,0,1,0,1, + 0,3,2,3,3,3,0,3,3,3,3,3,3,0,1,3,3,3,3,3,0,3,1,3,0,0,2,3,3,0,2,0,0,1, + 3,3,3,3,3,3,3,3,3,2,3,3,2,3,3,3,2,1,2,1,0,0,2,2,0,1,2,2,1,3,1,0,0,1, + 3,3,3,3,3,3,0,3,2,3,3,3,3,3,1,3,3,3,3,3,3,3,1,3,3,0,3,3,3,2,3,0,0,0, + 3,3,0,0,1,3,3,3,3,1,0,3,1,3,3,2,1,0,1,0,1,2,3,3,0,2,1,0,1,1,0,0,0,1, + 1,3,1,2,3,3,0,3,1,3,1,3,3,1,1,3,2,2,3,3,0,3,1,3,0,0,3,3,0,1,0,0,0,0, + 0,3,2,1,3,3,0,1,3,3,3,3,3,2,1,3,3,1,3,3,0,3,0,2,0,1,2,3,0,2,3,0,0,3, + 3,3,3,3,3,1,0,3,3,3,1,3,1,3,3,3,1,1,2,2,0,2,2,1,0,0,2,0,0,1,0,0,0,1, + 3,3,1,2,2,3,3,3,2,3,0,3,1,3,3,3,3,0,2,1,0,0,3,3,0,0,1,3,0,0,0,0,0,0, + 3,3,0,2,3,3,3,1,2,0,0,3,3,3,3,0,1,1,3,1,0,3,3,2,0,1,0,0,3,0,0,0,0,0, + 1,3,2,3,3,3,0,3,2,3,0,2,3,0,0,2,0,3,3,2,0,3,1,3,0,0,2,3,2,3,3,0,0,0, + 3,2,3,3,3,2,2,3,3,3,2,0,0,3,3,1,0,1,1,1,0,0,3,0,0,0,0,1,0,2,2,1,0,3, + 
0,2,1,1,1,2,0,1,0,1,0,0,1,0,0,1,0,1,1,1,0,0,0,2,0,0,0,0,0,0,0,3,3,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,0, + 0,3,0,3,3,3,0,3,0,3,0,3,3,0,0,3,2,3,1,3,3,1,1,0,3,0,0,0,1,2,0,0,0,0, +}; + + +const SequenceModel Windows_1251BelarusianModel = +{ + Windows_1251_CharToOrderMap, + BelarusianLangModel, + 34, + (float)0.9990050919172102, + PR_FALSE, + "WINDOWS-1251", + "be" +}; + +const SequenceModel Iso_8859_5BelarusianModel = +{ + Iso_8859_5_CharToOrderMap, + BelarusianLangModel, + 34, + (float)0.9990050919172102, + PR_FALSE, + "ISO-8859-5", + "be" +}; + +const LanguageModel BelarusianModel = +{ + "be", + Unicode_CharOrder, + 68, + BelarusianLangModel, + 34, + 6, + (float)0.42700225938276326, + 26, + (float)0.030978960449674565, +}; diff --git a/src/nsLanguageDetector.h b/src/nsLanguageDetector.h index ff7cb4a..8cff540 100644 --- a/src/nsLanguageDetector.h +++ b/src/nsLanguageDetector.h @@ -126,6 +126,7 @@ private: }; extern const LanguageModel ArabicModel; +extern const LanguageModel BelarusianModel; extern const LanguageModel BulgarianModel; extern const LanguageModel CroatianModel; extern const LanguageModel CzechModel; diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp index 53f4c3a..72e907a 100644 --- a/src/nsMBCSGroupProber.cpp +++ b/src/nsMBCSGroupProber.cpp @@ -93,6 +93,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) int j = 0; langDetectors[i][j++] = new nsLanguageDetector(&ArabicModel); + langDetectors[i][j++] = new nsLanguageDetector(&BelarusianModel); langDetectors[i][j++] = new nsLanguageDetector(&BulgarianModel); langDetectors[i][j++] = new nsLanguageDetector(&CroatianModel); langDetectors[i][j++] = new nsLanguageDetector(&CzechModel); diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h index f36e820..b508eb4 100644 --- a/src/nsMBCSGroupProber.h +++ b/src/nsMBCSGroupProber.h @@ -49,7 +49,7 @@ #include "nsEUCTWProber.h" #define NUM_OF_PROBERS 8 -#define NUM_OF_LANGUAGES 32 +#define 
NUM_OF_LANGUAGES 33 class nsMBCSGroupProber: public nsCharSetProber { public: diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 1b7da06..e6ce015 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -214,6 +214,9 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_1EnglishModel); mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1252EnglishModel); + mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1251BelarusianModel); + mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5BelarusianModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 57102a0..3d42110 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 109 +#define NUM_OF_SBCS_PROBERS 111 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index d804b93..b274390 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -144,6 +144,9 @@ extern const SequenceModel Ibm855RussianModel; extern const SequenceModel Iso_8859_7GreekModel; extern const SequenceModel Windows_1253GreekModel; +extern const SequenceModel Iso_8859_5BelarusianModel; +extern const SequenceModel Windows_1251BelarusianModel; + extern const SequenceModel Iso_8859_5BulgarianModel; extern const SequenceModel Windows_1251BulgarianModel; diff --git a/test/be/iso-8859-5.txt b/test/be/iso-8859-5.txt new file mode 100644 index 0000000..afec761 --- /dev/null +++ b/test/be/iso-8859-5.txt @@ -0,0 +1,3 @@ + (Marmota), , . + + 15 , . -- . , . , , . , , . diff --git a/test/be/utf-8.txt b/test/be/utf-8.txt new file mode 100644 index 0000000..7fc7177 --- /dev/null +++ b/test/be/utf-8.txt @@ -0,0 +1,3 @@ +Суркі (Marmota), сысуны, прадстаўнікі атраду грызуноў. + +На Зямлі існуе 15 відаў суркоў, якія маюць агульнага продка. 
Прарадзіма суркоў — Амерыка. У той час як большасць жывёл рухалася з Еўразіі ў Амерыку, суркі з Амерыкі перабіраліся ў Азію. Розныя віды абасобіліся ў розных геаграфічных зонах і адрозніваюцца асаблівасцямі паводзін, але захавалі знешнюю падобнасць, неабходнасць упадаць у спячку і жыццё ў калоніях. Усе суркі траваядныя, жывуць у норах, маюць цёплае футра. diff --git a/test/be/windows-1251.txt b/test/be/windows-1251.txt new file mode 100644 index 0000000..1d82ec8 --- /dev/null +++ b/test/be/windows-1251.txt @@ -0,0 +1,3 @@ + (Marmota), , . + + 15 , . . Ţ糳 , . , , . , , . |