From 41d309e8a28407372317b048342e2bb23d9c8959 Mon Sep 17 00:00:00 2001 From: Jehan Date: Sat, 17 Dec 2022 21:32:24 +0100 Subject: script, src: regenerate Russian models and add UTF-8/Russian support. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the broken Russian test in Windows-1251 which once again gets a much better score with Russian. Also this adds UTF-8 support. Same as Bulgarian, I wonder why I had not regenerated this earlier. The new UTF-8 test comes from the 'Сурки' page of Wikipedia in Russian. Note that now this broke the test zh:gb18030 (the score for KOI8-R / ru (0.766388) beats GB18030 / zh (0.700000)). I think I'll have to look a bit closer at our GB18030 dedicated prober. --- README.md | 1 + script/BuildLangModelLogs/LangRussianModel.log | 270 ++++++++++++ script/charsets/ibm855.py | 75 ++++ script/charsets/ibm866.py | 72 ++++ script/charsets/koi8-r.py | 74 ++++ script/charsets/mac-cyrillic.py | 72 ++++ script/langs/ru.py | 58 +++ src/LangModels/LangRussianModel.cpp | 576 +++++++++++++------------ src/nsLanguageDetector.h | 1 + src/nsMBCSGroupProber.cpp | 1 + src/nsMBCSGroupProber.h | 2 +- src/nsSBCSGroupProber.cpp | 8 +- src/nsSBCharSetProber.h | 8 +- test/ru/utf-8.txt | 1 + 14 files changed, 943 insertions(+), 276 deletions(-) create mode 100644 script/BuildLangModelLogs/LangRussianModel.log create mode 100644 script/charsets/ibm855.py create mode 100644 script/charsets/ibm866.py create mode 100644 script/charsets/koi8-r.py create mode 100644 script/charsets/mac-cyrillic.py create mode 100644 script/langs/ru.py create mode 100644 test/ru/utf-8.txt diff --git a/README.md b/README.md index 7501f91..07018e0 100644 --- a/README.md +++ b/README.md @@ -156,6 +156,7 @@ uchardet started as a C language binding of the original C++ implementation of t * Windows-1250 * IBM852 * Russian + * UTF-8 * ISO-8859-5 * KOI8-R * WINDOWS-1251 diff --git a/script/BuildLangModelLogs/LangRussianModel.log b/script/BuildLangModelLogs/LangRussianModel.log new file mode 100644 index 0000000..82d9804 --- /dev/null +++ b/script/BuildLangModelLogs/LangRussianModel.log @@ -0,0 +1,270 @@ += Logs of language model for Russian (ru) = + +- Generated by BuildLangModel.py +- Started: 2022-12-17 19:53:30.416132 +- Maximum depth: 4 +- Max number of pages: 200 + +== Parsed pages == + +Пулмен (рабочий посёлок) (revision 127314030) +Водонапорная башня (revision 123368499) +Обама, Барак (revision 127312814) +Историзм (искусство) (revision 125199154) +Насосная станция (revision 126671775) +Школьный округ (revision 118138873) +Конденсат (revision 97819205) +1880-е годы (revision 124959394) +Линкольн, Роберт Тодд (revision 126851305) +Габарит подвижного состава (revision 127265050) +Межвоенный период (revision 123201828) +Гражданская война в США (revision 127311614) +История евреев в США (revision 123703208) +Англо-занзибарская война (revision 127263956) +Линкольн, Джесси Харлан (revision 87795509) +Бенкен, Герман (revision 120809711) +УралГАХУ (revision 126489964) +Великобритания (revision 127175319) +Фленсбург (revision 126961771) +Мещанство (revision 127304945) +Прохоров, Александр Михайлович (revision 127233579) +VIAF (revision 122626337) +Национальная библиотека Чешской Республики (revision 124152023) +Регулирующая арматура (revision 116046805) +Раннее Средневековье (revision 126932807) +Европейская интеграция (revision 125721443) +Бойл, Уиллард (revision 120835257) +Бут, Эдвин (revision 126437526) +Московский трамвай (revision 127184149) +Лондонский метрополитен (revision 126810923) +F-18 (revision 127113399) +Сацумско-британская война (revision 124671983) +Луизианская покупка (revision 123941200) +Община (Германия) (revision 125007479) +Запорная арматура (revision 121220496) +Новая Англия (revision 125214368) +Берни Сандерс (revision 126983575) +Бак (резервуар) (revision 126670363) +Хемингуэй, Эрнест (revision 126959711) +2021 год (revision 127125948) +1951 год (revision 126285688) +Жидкость (revision 127133343) +Большая советская энциклопедия (revision 127144085) +Россия (revision 127297047) +CSS Virginia (revision 121318647) +Школа реки Гудзон (revision 123627995) +Водозаборные сооружения (revision 123836554) +Ривера, Диего (revision 125976771) +Квантовая физика (revision 126896053) +Рочестер (Нью-Йорк) (revision 126016553) +Конденсация (теплотехника) (revision 123837631) +Средиземноморская Антанта (revision 125156636) +Историография (revision 121180824) +Гбови, Лейма (revision 124860814) +Премудрый пискарь (revision 121359555) +Люнебургская водонапорная башня (revision 117681965) +XVIII век (revision 126913825) +Сислей, Альфред (revision 127063100) +Средние века (revision 127154753) +Энциклопедический словарь Брокгауза и Ефрона (revision 125357601) +Нефтепровод (revision 123810227) +Нефть (revision 126997759) +Вентиляция (revision 126675588) +Цилиндр (revision 126783664) +Английский язык (revision 127275941) +Бензин (revision 126966322) +Министр по делам ветеранов США (revision 124072400) +Первобытное общество (revision 127057340) +Пикассо, Пабло (revision 126869217) +Рисунок в разрезе (revision 121960314) +Междупутье (revision 125745955) +Битва при Форт-Генри (revision 123999672) +Канал (водный) (revision 123736265) +Белорусская народная республика (revision 126958885) +25 апреля (revision 127246597) +Насос (revision 126768788) +Теннесси (revision 124804069) +Локомотив (revision 127032264) +Габарит погрузки (revision 123372556) +Вебби (revision 121964659) +Алегзандрия (Виргиния) (revision 126338837) +Война Фаррапус (revision 125765352) +Образование в США (revision 126788195) +Пресс-конференция (revision 127075029) +Рио-де-Жанейро (revision 127002708) +Габарит приближения строений (revision 117538368) +Международный идентификатор стандартных наименований (revision 120216410) +Мопассан, Ги де (revision 127086462) +История Европейского союза (revision 123952687) +Прусский социализм (revision 127165836) +Библиотека Александрина (revision 126093192) +Тэйкан-дзукури (revision 124877986) +1883 год (revision 125476166) +Конфликт на Китайско-Восточной железной дороге (revision 122499702) +Энергетический уровень (revision 119322956) +Алюминий (revision 126861293) +Санкт-петербургский трамвай (revision 127306763) +Национальная библиотека Франции (revision 127015965) +12 мая (revision 127207333) +Граммофон (revision 126498827) +Маккьяйоли (revision 126836176) +Канализационная установка (revision 123736401) +Газ (revision 126950046) +Луизиана (revision 127312945) +Память Парижской Коммуны (revision 126960401) +Сталь (revision 127216605) +Семья Барака Обамы (revision 124529726) +Поверхностный насос (revision 121146223) +Каразин, Николай Николаевич (revision 127097562) +Кирпичная готика (revision 125337841) +The Century Magazine (revision 127098805) +Контрольный номер Библиотеки Конгресса (revision 113360170) +Русско-персидская война (1804—1813) (revision 126999654) +Берн (revision 122913269) +Поздняя античность (revision 127266287) +Гарвардский университет (revision 127033732) +Бои на Халхин-Голе (revision 126542980) +Алый знак доблести (фильм, 1951) (revision 120728355) +Водопровод (revision 127182411) +Пар (revision 126003244) +1971 год (revision 127068279) +Искусство Древнего Египта (revision 125737336) +Пенсильванский университет (Индиана) (revision 123963620) +Национальная библиотека Израиля (revision 126108080) +1884 год (revision 125476122) +Проезд снаружи поездов (revision 127239100) +Норвегия (revision 126986958) +Барбур, Джеймс (revision 126851158) +Французская интервенция в Испанию (revision 119666106) +Англия (revision 127268120) +Галлатин, Альберт (revision 127160198) +Калифорния (revision 127027363) +Роял, Кеннет Клайборн (revision 110605693) +США (revision 126887888) +Федеральная архитектура (revision 116000492) +Конденсат Бозе — Эйнштейна (revision 125188375) +Колонна (revision 126876842) +1907 год (revision 127134918) +13 сентября (revision 125587404) +Генрих Лев (revision 126407574) +Этрусское искусство (revision 123158050) +Амальрик, Андрей Алексеевич (revision 126033545) +9 декабря (revision 127201233) +Селищи (22712000298) (revision 124521248) +1798 год (revision 125783094) +Мюледорф (Берн) (revision 121861015) +Большая игра (revision 126891168) +Битва (revision 124395796) +Война не-персе (revision 127189710) +Президентские выборы в США (2020) (revision 126639368) +Площадь Карла Фаберже (revision 123223942) +Банкрофт, Джордж (revision 126851184) +Кобаяси, Макото (revision 121939251) +Газойль (revision 123647640) +Ватиканская апостольская библиотека (revision 124986491) +Общественная собственность (revision 125722109) +Славная революция (revision 122270271) +Золя (revision 127092383) +Офицер (revision 126230098) +Метастабильное состояние (revision 118552209) +Лыжные гонки (revision 124233040) +Средиземное море (revision 126980465) +Защитная арматура (revision 124665168) +Президент Турции (revision 123861767) +Макдональд, Артур (revision 123992590) +Песок (revision 126799930) +Сублимация (физика) (revision 127108939) +Новицкий, Василий Фёдорович (revision 126350745) +Список султанов Занзибара (revision 94020222) +Туман (revision 124866163) +2005 год (revision 127291761) +Исламская Республика Афганистан (revision 126605442) +Викисловарь (revision 126840626) +22 января (revision 126465130) +Российская национальная библиотека (revision 126055277) +Наука в США (revision 124150312) +Екатеринбургский завод (revision 125779202) +Океания (revision 125374219) +Нидершерли (revision 116230829) +Война за австрийское наследство (revision 126874381) +Доминиканская Республика (revision 127046641) +Военный паровоз (revision 124117506) +Подземные воды (revision 126705165) +5 сентября (revision 126628763) +Кафка, Франц (revision 127130321) +Двухванная печь (revision 123510834) +Чертаново Южное (revision 122081039) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2022-12-17 19:57:30.506110 + +63 characters appeared 2343890 times. + +Most Frequent characters: +[ 0] Char о: 10.136567842347551 % +[ 1] Char и: 8.217151828797427 % +[ 2] Char а: 7.941797609956098 % +[ 3] Char е: 7.781337861418411 % +[ 4] Char н: 6.689093771465385 % +[ 5] Char с: 5.755304216494801 % +[ 6] Char р: 5.58695160609073 % +[ 7] Char т: 5.486136294791991 % +[ 8] Char в: 4.621547939536412 % +[ 9] Char л: 4.156039745892512 % +[10] Char к: 3.458694733967891 % +[11] Char м: 2.899666793236884 % +[12] Char д: 2.856064064439884 % +[13] Char п: 2.69799350652121 % +[14] Char у: 2.0648579924825823 % +[15] Char я: 1.9596482770095867 % +[16] Char г: 1.812798382176638 % +[17] Char ы: 1.7729500957809452 % +[18] Char б: 1.5043794717328884 % +[19] Char з: 1.4936707780655236 % +[20] Char й: 1.4190938994577391 % +[21] Char ь: 1.2650764327677493 % +[22] Char ч: 1.0549983147673312 % +[23] Char х: 1.0016255029032932 % +[24] Char ж: 0.7652236239755279 % +[25] Char ц: 0.5965297006258826 % +[26] Char ю: 0.5917513193878552 % +[27] Char ш: 0.5520310253467526 % +[28] Char ф: 0.4393977533075358 % +[29] Char щ: 0.3068403380704726 % +[30] Char э: 0.3063710327703092 % +[31] Char i: 0.25978181569954223 % +[32] Char ё: 0.24984107615971737 % +[33] Char e: 0.2357619171548153 % +[34] Char a: 0.21839762104876934 % +[35] Char n: 0.18004257878996027 % +[36] Char r: 0.1703151598411188 % +[37] Char t: 0.16216631326555428 % +[38] Char s: 0.15969179441014725 % +[39] Char o: 0.1568759626091668 % +[40] Char l: 0.1263711180985456 % +[41] Char c: 0.09795681537956133 % +[42] Char d: 0.08571221345711616 % +[43] Char h: 0.07956858043679524 % +[44] Char m: 0.07009714619713382 % +[45] Char u: 0.0688598867694303 % +[46] Char x: 0.05725524661993524 % +[47] Char p: 0.05644462837419845 % +[48] Char b: 0.05482339188272487 % +[49] Char g: 0.051111613599614324 % +[50] Char f: 0.05038632359027087 % +[51] Char y: 0.04923439239896071 % +[52] Char v: 0.0470158582527337 % +[53] Char ъ: 0.03617917223077875 % + +The first 54 characters have an accumulated ratio of 0.9991548238185242. +The first 5 characters have an accumulated ratio of 0.40765948913984873. +All characters whose order is over 29 have an accumulated ratio of 0.030302616590369. + +1554 sequences found. + +First 819 (typical positive ratio): 0.9950050289366638 +Next 260 (1079-819): 0.003999322715788067 +Rest: 0.0009956483475481726 + +- Processing end: 2022-12-17 19:57:30.653466 diff --git a/script/charsets/ibm855.py b/script/charsets/ibm855.py new file mode 100644 index 0000000..451e938 --- /dev/null +++ b/script/charsets/ibm855.py @@ -0,0 +1,75 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'IBM855' +aliases = ['CP855', 'OEM 855', 'MS-DOS Cyrillic'] + +language = \ +{ + # Wikipedia tells us: At one time it was widely used in Serbia, Macedonia + # and Bulgaria, but it never caught on in Russia, where Code page 866 was more + # common. This code page is not used much. + 'complete': [ 'sr', 'mk', 'bg', 'ru' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM, # AX + SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,SYM,SYM,SYM,SYM,LET,LET,SYM, # BX + SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX + LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,LET,LET,SYM, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM, # EX + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM, # FX +] diff --git a/script/charsets/ibm866.py b/script/charsets/ibm866.py new file mode 100644 index 0000000..9ed7bc5 --- /dev/null +++ b/script/charsets/ibm866.py @@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'IBM866' +aliases = ['CP866', 'DOS Cyrillic Russian'] + +language = \ +{ + 'complete': [ 'bg', 'ru' ], + 'incomplete': [ 'uk', 'be' ] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # AX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # FX +] diff --git a/script/charsets/koi8-r.py b/script/charsets/koi8-r.py new file mode 100644 index 0000000..8abbc04 --- /dev/null +++ b/script/charsets/koi8-r.py @@ -0,0 +1,74 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'KOI8-R' +aliases = ['csKOI8R'] + +language = \ +{ + # KOI8-R is an 8-bit character encoding, designed to cover Russian, which + # uses a Cyrillic alphabet. It also happens to cover Bulgarian, but has not + # been used for that purpose since CP1251 was accepted. + 'complete': [ 'ru', 'bg' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 8X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 9X + SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # AX + SYM,SYM,SYM,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # FX +] diff --git a/script/charsets/mac-cyrillic.py b/script/charsets/mac-cyrillic.py new file mode 100644 index 0000000..a967519 --- /dev/null +++ b/script/charsets/mac-cyrillic.py @@ -0,0 +1,72 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +name = 'MAC-CYRILLIC' +aliases = ['x-mac-cyrillic' ] + +language = \ +{ + 'complete': [ 'bg', 'ru' ], + 'incomplete': [ 'uk', 'be' ] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X + SYM,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,SYM,SYM,LET,LET,SYM,LET,LET, # AX + SYM,SYM,SYM,SYM,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # BX + LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,LET, # CX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,LET,LET,LET,SYM,LET,LET,LET, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM, # FX +] diff --git a/script/langs/ru.py b/script/langs/ru.py new file mode 100644 index 0000000..9d330e1 --- /dev/null +++ b/script/langs/ru.py @@ -0,0 +1,58 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Russian' +code = 'ru' +use_ascii = False +charsets = [ 'WINDOWS-1251', 'ISO-8859-5', 'KOI8-R', 'IBM855', 'IBM866', 'MAC-CYRILLIC' ] + +## Optional Properties ## + +# Alphabet characters. +alphabet = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя' +# A starred page which was rewarded on the main page when I created +# the data. +start_pages = ['Пулмен (рабочий посёлок)'] +wikipedia_code = code +case_mapping = True diff --git a/src/LangModels/LangRussianModel.cpp b/src/LangModels/LangRussianModel.cpp index 50631df..32a5e87 100644 --- a/src/LangModels/LangRussianModel.cpp +++ b/src/LangModels/LangRussianModel.cpp @@ -36,332 +36,374 @@ * ***** END LICENSE BLOCK ***** */ #include "../nsSBCharSetProber.h" +#include "../nsLanguageDetector.h" +/********* Language model for: Russian *********/ +/** + * Generated by BuildLangModel.py + * On: 2022-12-17 19:57:30.506433 + **/ -//KOI8-R language model -//Character Mapping Table: -static const unsigned char KOI8R_CharToOrderMap[] = +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ +static const unsigned char Windows_1251_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 34, 48, 41, 42, 33, 50, 49, 43, 31, 56, 54, 40, 44, 35, 39, /* 4X */ + 47, 58, 36, 38, 37, 45, 52, 55, 46, 51, 57,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 34, 48, 41, 42, 33, 50, 49, 43, 31, 56, 54, 40, 44, 35, 39, /* 6X */ + 47, 58, 36, 38, 37, 45, 52, 55, 46, 51, 57,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 63, 64,SYM, 65,SYM,SYM,SYM,SYM,SYM,SYM, 66,SYM, 67, 68, 69, 70, /* 8X */ + 71,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 72,SYM, 73, 74, 75, 76, /* 9X */ + SYM, 60, 60, 77,SYM, 62,SYM,SYM, 32,SYM, 78,SYM,SYM,SYM,SYM, 61, /* AX */ + SYM,SYM, 59, 59, 62,SYM,SYM,SYM, 32,SYM, 79,SYM, 80, 81, 82, 61, /* BX */ + 2, 18, 8, 16, 12, 3, 24, 19, 1, 20, 10, 9, 11, 4, 0, 13, /* CX */ + 6, 5, 7, 14, 28, 23, 25, 22, 27, 29, 53, 17, 21, 30, 26, 15, /* DX */ + 2, 18, 8, 16, 12, 3, 24, 19, 1, 20, 10, 9, 11, 4, 0, 13, /* EX */ + 6, 5, 7, 14, 28, 23, 25, 22, 27, 29, 53, 17, 21, 30, 26, 15, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_5_CharToOrderMap[] = { -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 -SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 -NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 -SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 -SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 -191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, //80 -207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, //90 -223,224,225, 68,226,227,228,229,230,231,232,233,234,235,236,237, //a0 -238,239,240,241,242,243,244,245,246,247,248,249,250,251,NUM,SYM, //b0 - 27, 3, 21, 28, 13, 2, 39, 19, 26, 4, 23, 11, 8, 12, 5, 1, //c0 - 15, 16, 9, 7, 6, 14, 24, 10, 17, 18, 20, 25, 30, 29, 22, 54, //d0 - 59, 37, 44, 58, 41, 48, 53, 46, 55, 42, 60, 36, 49, 38, 31, 34, //e0 - 35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0 + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 34, 48, 41, 42, 33, 50, 49, 43, 31, 56, 54, 40, 44, 35, 39, /* 4X */ + 47, 58, 36, 38, 37, 45, 52, 55, 46, 51, 57,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 34, 48, 41, 42, 33, 50, 49, 43, 31, 56, 54, 40, 44, 35, 39, /* 6X */ + 47, 58, 36, 38, 37, 45, 52, 55, 46, 51, 57,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM, 32, 83, 84, 85, 86, 59, 61, 87, 88, 89, 90, 91,SYM, 60, 92, /* AX */ + 2, 18, 8, 16, 12, 3, 24, 19, 1, 20, 10, 9, 11, 4, 0, 13, /* BX */ + 6, 5, 7, 14, 28, 23, 25, 22, 27, 29, 53, 17, 21, 30, 26, 15, /* CX */ + 2, 18, 8, 16, 12, 3, 24, 19, 1, 20, 10, 9, 11, 4, 0, 13, /* DX */ + 6, 5, 7, 14, 28, 23, 25, 22, 27, 29, 53, 17, 21, 30, 26, 15, /* EX */ + SYM, 32, 93, 94, 95, 96, 59, 61, 97, 98, 99,100,101,SYM, 60,102, /* FX */ }; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ -static const unsigned char win1251_CharToOrderMap[] = +static const unsigned char Koi8_R_CharToOrderMap[] = { -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 -SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 -NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 -SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 -SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 -191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, -207,208,209,210,211,212,213,214,ILL,216,217,218,219,220,221,222, -223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, -239,240,241,242,243,244,245,246, 68,247,248,249,250,251,NUM,SYM, - 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, - 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, - 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, - 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 34, 48, 41, 42, 33, 50, 49, 43, 31, 56, 54, 40, 44, 35, 39, /* 4X */ + 47, 58, 36, 38, 37, 45, 52, 55, 46, 51, 57,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 34, 48, 41, 42, 33, 50, 49, 43, 31, 56, 54, 40, 44, 35, 39, /* 6X */ + 47, 58, 36, 38, 37, 45, 52, 55, 46, 51, 57,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 8X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 9X */ + SYM,SYM,SYM, 32,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM, 32,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 26, 2, 18, 25, 12, 3, 28, 16, 23, 1, 20, 10, 9, 11, 4, 0, /* CX */ + 13, 15, 6, 5, 7, 14, 24, 8, 21, 17, 19, 27, 30, 29, 22, 53, /* DX */ + 26, 2, 18, 25, 12, 3, 28, 16, 23, 1, 20, 10, 9, 11, 4, 0, /* EX */ + 13, 15, 6, 5, 7, 14, 24, 8, 21, 17, 19, 27, 30, 29, 22, 53, /* FX */ }; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ -static const unsigned char latin5_CharToOrderMap[] = +static const unsigned char Ibm855_CharToOrderMap[] = { -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 -SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 -NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 -SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 -SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 -191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, -207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, -223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, - 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, - 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, - 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, - 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, -239, 68,240,241,242,243,244,245,246,247,248,249,250,251,NUM,CTR, + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 34, 48, 41, 42, 33, 50, 49, 43, 31, 56, 54, 40, 44, 35, 39, /* 4X */ + 47, 58, 36, 38, 37, 45, 52, 55, 46, 51, 57,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 34, 48, 41, 42, 33, 50, 49, 43, 31, 56, 54, 40, 44, 35, 39, /* 6X */ + 47, 58, 36, 38, 37, 45, 52, 55, 46, 51, 57,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 103,104,105,106, 32, 32,107,108,109,110, 59, 59, 61, 61,111,112, /* 8X */ + 113,114,115,116,117,118,119,120, 60, 60,121,122, 26, 26, 53, 53, /* 9X */ + 2, 2, 18, 18, 25, 25, 12, 12, 3, 3, 28, 28, 16, 16,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 23, 23, 1, 1,SYM,SYM,SYM,SYM, 20, 20,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM, 10, 10,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + 9, 9, 11, 11, 4, 4, 0, 0, 13,SYM,SYM,SYM,SYM, 13, 15,SYM, /* DX */ + 15, 6, 6, 5, 5, 7, 7, 14, 14, 24, 24, 8, 8, 21, 21,SYM, /* EX */ + SYM, 17, 17, 19, 19, 27, 27, 30, 30, 29, 29, 22, 22,SYM,SYM,SYM, /* FX */ }; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ -static const unsigned char macCyrillic_CharToOrderMap[] = +static const unsigned char Ibm866_CharToOrderMap[] = { -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 -SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 -NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 -SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 -SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 - 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, - 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, -191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, -207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, -223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, -239,240,241,242,243,244,245,246,247,248,249,250,251,NUM, 68, 16, - 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, - 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,CTR, + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 34, 48, 41, 42, 33, 50, 49, 43, 31, 56, 54, 40, 44, 35, 39, /* 4X */ + 47, 58, 36, 38, 37, 45, 52, 55, 46, 51, 57,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 34, 48, 41, 42, 33, 50, 49, 43, 31, 56, 54, 40, 44, 35, 39, /* 6X */ + 47, 58, 36, 38, 37, 45, 52, 55, 46, 51, 57,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 2, 18, 8, 16, 12, 3, 24, 19, 1, 20, 10, 9, 11, 4, 0, 13, /* 8X */ + 6, 5, 7, 14, 28, 23, 25, 22, 27, 29, 53, 17, 21, 30, 26, 15, /* 9X */ + 2, 18, 8, 16, 12, 3, 24, 19, 1, 20, 10, 9, 11, 4, 0, 13, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* DX */ + 6, 5, 7, 14, 28, 23, 25, 22, 27, 29, 53, 17, 21, 30, 26, 15, /* EX */ + 32, 32,123,124, 61, 61, 60, 60,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* FX */ }; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ -static const unsigned char IBM855_CharToOrderMap[] = +static const unsigned char Mac_Cyrillic_CharToOrderMap[] = { -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 -SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 -NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 -SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 -SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 -191,192,193,194, 68,195,196,197,198,199,200,201,202,203,204,205, -206,207,208,209,210,211,212,213,214,215,216,217, 27, 59, 54, 70, - 3, 37, 21, 44, 28, 58, 13, 41, 2, 48, 39, 53, 19, 46,218,219, -220,221,222,223,224, 26, 55, 4, 42,225,226,227,228, 23, 60,229, -230,231,232,233,234,235, 11, 36,236,237,238,239,240,241,242,243, - 8, 49, 12, 38, 5, 31, 1, 34, 15,244,245,246,247, 35, 16,248, - 43, 9, 45, 7, 32, 6, 40, 14, 52, 24, 56, 10, 33, 17, 61,249, -250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,NUM,CTR, + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 34, 48, 41, 42, 33, 50, 49, 43, 31, 56, 54, 40, 44, 35, 39, /* 4X */ + 47, 58, 36, 38, 37, 45, 52, 55, 46, 51, 57,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 34, 48, 41, 42, 33, 50, 49, 43, 31, 56, 54, 40, 44, 35, 39, /* 6X */ + 47, 58, 36, 38, 37, 45, 52, 55, 46, 51, 57,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 2, 18, 8, 16, 12, 3, 24, 19, 1, 20, 10, 9, 11, 4, 0, 13, /* 8X */ + 6, 5, 7, 14, 28, 23, 25, 22, 27, 29, 53, 17, 21, 30, 26, 15, /* 9X */ + SYM,SYM, 62,SYM,SYM,SYM,SYM, 59,SYM,SYM,SYM,125,126,SYM,127,128, /* AX */ + SYM,SYM,SYM,SYM, 59,SYM, 62,129,130,131, 61, 61,132,133,134,135, /* BX */ + 136,137,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,138,139,140,141,142, /* CX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 60, 60,143,144,SYM, 32, 32, 15, /* DX */ + 2, 18, 8, 16, 12, 3, 24, 19, 1, 20, 10, 9, 11, 4, 0, 13, /* EX */ + 6, 5, 7, 14, 28, 23, 25, 22, 27, 29, 53, 17, 21, 30, 26,SYM, /* FX */ }; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ -static const unsigned char IBM866_CharToOrderMap[] = +static const int Unicode_Char_size = 108; +static const unsigned int Unicode_CharOrder[] = { -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, //00 -CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, //10 -SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, //20 -NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, //30 -SYM,142,143,144,145,146,147,148,149,150,151,152, 74,153, 75,154, //40 -155,156,157,158,159,160,161,162,163,164,165,SYM,SYM,SYM,SYM,SYM, //50 -SYM, 71,172, 66,173, 65,174, 76,175, 64,176,177, 77, 72,178, 69, //60 - 67,179, 78, 73,180,181, 79,182,183,184,185,SYM,SYM,SYM,SYM,SYM, //70 - 37, 44, 33, 46, 41, 48, 56, 51, 42, 60, 36, 49, 38, 31, 34, 35, - 45, 32, 40, 52, 53, 55, 58, 50, 57, 63, 70, 62, 61, 47, 59, 43, - 3, 21, 10, 19, 13, 2, 24, 20, 4, 23, 11, 8, 12, 5, 1, 15, -191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206, -207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222, -223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238, - 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, -239, 68,240,241,242,243,244,245,246,247,248,249,250,251,NUM,CTR, + 65, 34, 66, 48, 67, 41, 68, 42, 69, 33, 70, 50, 71, 49, 72, 43, + 73, 31, 76, 40, 77, 44, 78, 35, 79, 39, 80, 47, 82, 36, 83, 38, + 84, 37, 85, 45, 86, 52, 88, 46, 89, 51, 97, 34, 98, 48, 99, 41, + 100, 42, 101, 33, 102, 50, 103, 49, 104, 43, 105, 31, 108, 40, 109, 44, + 110, 35, 111, 39, 112, 47, 114, 36, 115, 38, 116, 37, 117, 45, 118, 52, + 120, 46, 121, 51, 1025, 32, 1040, 2, 1041, 18, 1042, 8, 1043, 16,1044, 12, + 1045, 3, 1046, 24, 1047, 19, 1048, 1, 1049, 20, 1050, 10, 1051, 9,1052, 11, + 1053, 4, 1054, 0, 1055, 13, 1056, 6, 1057, 5, 1058, 7, 1059, 14,1060, 28, + 1061, 23, 1062, 25, 1063, 22, 1064, 27, 1065, 29, 1066, 53, 1067, 17,1068, 21, + 1069, 30, 1070, 26, 1071, 15, 1072, 2, 1073, 18, 1074, 8, 1075, 16,1076, 12, + 1077, 3, 1078, 24, 1079, 19, 1080, 1, 1081, 20, 1082, 10, 1083, 9,1084, 11, + 1085, 4, 1086, 0, 1087, 13, 1088, 6, 1089, 5, 1090, 7, 1091, 14,1092, 28, + 1093, 23, 1094, 25, 1095, 22, 1096, 27, 1097, 29, 1098, 53, 1099, 17,1100, 21, + 1101, 30, 1102, 26, 1103, 15, 1105, 32, }; -//Model Table: -//total sequences: 100% -//first 512 sequences: 97.6601% -//first 1024 sequences: 2.3389% -//rest sequences: 0.1237% -//negative sequences: 0.0009% -static const PRUint8 RussianLangModel[] = + +/* Model Table: + * Total considered sequences: 1554 / 2916 + * - Positive sequences: first 819 (0.9950050289366638) + * - Probable sequences: next 260 (1079-819) (0.003999322715788067) + * - Neutral sequences: last 1837 (0.0009956483475481726) + * - Negative sequences: 1362 (off-ratio) + * Negative sequences: TODO + */ +static const PRUint8 RussianLangModel[] = { -0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2, -3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,2,3,2,0, -0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,2,2,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,2,3,3,1,0, -0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,2,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,3,3,3,3,3,3,3,3,3,3,2,1, -0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,0,3,3,3,3,3,3,3,3,3,3,3,2,1, -0,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,3,3,2,2,2,3,1,3,3,1,3,3,3,3,2,2,3,0,2,2,2,3,3,2,1,0, -0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,2,3,3,3,3,3,2,2,3,2,3,3,3,2,1,2,2,0,1,2,2,2,2,2,2,0, -0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,0,2,2,3,3,2,1,2,0, -0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,2,3,3,1,2,3,2,2,3,2,3,3,3,3,2,2,3,0,3,2,2,3,1,1,1,0, -0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,2,3,3,3,3,2,2,2,0,3,3,3,2,2,2,2,0, -0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,2,3,2,2,0,1,3,2,1,2,2,1,0, -0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,3,3,3,3,3,2,1,1,3,0,1,1,1,1,2,1,1,0,2,2,2,1,2,0,1,0, -0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,2,3,3,2,2,2,2,1,3,2,3,2,3,2,1,2,2,0,1,1,2,1,2,1,2,0, -0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,2,2,2,2,0,2,2,2,2,3,1,1,0, -0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, -3,2,3,2,2,3,3,3,3,3,3,3,3,3,1,3,2,0,0,3,3,3,3,2,3,3,3,3,2,3,2,0, -0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,3,3,3,3,3,2,2,3,3,0,2,1,0,3,2,3,2,3,0,0,1,2,0,0,1,0,1,2,1,1,0, -0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,0,3,0,2,3,3,3,3,2,3,3,3,3,1,2,2,0,0,2,3,2,2,2,3,2,3,2,2,3,0,0, -0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,2,3,0,2,3,2,3,0,1,2,3,3,2,0,2,3,0,0,2,3,2,2,0,1,3,1,3,2,2,1,0, -0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,1,3,0,2,3,3,3,3,3,3,3,3,2,1,3,2,0,0,2,2,3,3,3,2,3,3,0,2,2,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,2,2,3,3,2,2,2,3,3,0,0,1,1,1,1,1,2,0,0,1,1,1,1,0,1,0, -0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,2,2,3,3,3,3,3,3,3,0,3,2,3,3,2,3,2,0,2,1,0,1,1,0,1,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,3,2,3,3,3,2,2,2,2,3,1,3,2,3,1,1,2,1,0,2,2,2,2,1,3,1,0, -0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, -2,2,3,3,3,3,3,1,2,2,1,3,1,0,3,0,0,3,0,0,0,1,1,0,1,2,1,0,0,0,0,0, -0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,2,2,1,1,3,3,3,2,2,1,2,2,3,1,1,2,0,0,2,2,1,3,0,0,2,1,1,2,1,1,0, -0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,2,3,3,3,3,1,2,2,2,1,2,1,3,3,1,1,2,1,2,1,2,2,0,2,0,0,1,1,0,1,0, -0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,3,3,3,3,3,2,1,3,2,2,3,2,0,3,2,0,3,0,1,0,1,1,0,0,1,1,1,1,0,1,0, -0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,2,3,3,3,2,2,2,3,3,1,2,1,2,1,0,1,0,1,1,0,1,0,0,2,1,1,1,0,1,0, -0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0, -3,1,1,2,1,2,3,3,2,2,1,2,2,3,0,2,1,0,0,2,2,3,2,1,2,2,2,2,2,3,1,0, -0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -3,3,3,3,3,1,1,0,1,1,2,2,1,1,3,0,0,1,3,1,1,1,0,0,0,1,0,1,1,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,1,3,3,3,2,0,0,0,2,1,0,1,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,0,1,0,0,2,3,2,2,2,1,2,2,2,1,2,1,0,0,1,1,1,0,2,0,1,1,1,0,0,1,1, -1,0,0,0,0,0,1,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, -2,3,3,3,3,0,0,0,0,1,0,0,0,0,3,0,1,2,1,0,0,0,0,0,0,0,1,1,0,0,1,1, -1,0,1,0,1,2,0,0,1,1,2,1,0,1,1,1,1,0,1,1,1,1,0,1,0,0,1,0,0,1,1,0, -2,2,3,2,2,2,3,1,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,0,1,0,1,1,1,0,2,1, -1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,0,1,1,0, -3,3,3,2,2,2,2,3,2,2,1,1,2,2,2,2,1,1,3,1,2,1,2,0,0,1,1,0,1,0,2,1, -1,1,1,1,1,2,1,0,1,1,1,1,0,1,0,0,1,1,0,0,1,0,1,0,0,1,0,0,0,1,1,0, -2,0,0,1,0,3,2,2,2,2,1,2,1,2,1,2,0,0,0,2,1,2,2,1,1,2,2,0,1,1,0,2, -1,1,1,1,1,0,1,1,1,2,1,1,1,2,1,0,1,2,1,1,1,1,0,1,1,1,0,0,1,0,0,1, -1,3,2,2,2,1,1,1,2,3,0,0,0,0,2,0,2,2,1,0,0,0,0,0,0,1,0,0,0,0,1,1, -1,0,1,1,0,1,0,1,1,0,1,1,0,2,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0, -2,3,2,3,2,1,2,2,2,2,1,0,0,0,2,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,2,1, -1,1,2,1,0,2,0,0,1,0,1,0,0,1,0,0,1,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0, -3,0,0,1,0,2,2,2,3,2,2,2,2,2,2,2,0,0,0,2,1,2,1,1,1,2,2,0,0,0,1,2, -1,1,1,1,1,0,1,2,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0,0,1, -2,3,2,3,3,2,0,1,1,1,0,0,1,0,2,0,1,1,3,1,0,0,0,0,0,0,0,1,0,0,2,1, -1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,1,1,0,0,1,1,0,1,0,0,0,0,0,0,1,0, -2,3,3,3,3,1,2,2,2,2,0,1,1,0,2,1,1,1,2,1,0,1,1,0,0,1,0,1,0,0,2,0, -0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -2,3,3,3,2,0,0,1,1,2,2,1,0,0,2,0,1,1,3,0,0,1,0,0,0,0,0,1,0,1,2,1, -1,1,2,0,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,0,1,0,0,0,0,0,0,1,0,1,1,0, -1,3,2,3,2,1,0,0,2,2,2,0,1,0,2,0,1,1,1,0,1,0,0,0,3,0,1,1,0,0,2,1, -1,1,1,0,1,1,0,0,0,0,1,1,0,1,0,0,2,1,1,0,1,0,0,0,1,0,1,0,0,1,1,0, -3,1,2,1,1,2,2,2,2,2,2,1,2,2,1,1,0,0,0,2,2,2,0,0,0,1,2,1,0,1,0,1, -2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,2,1,1,1,0,1,0,1,1,0,1,1,1,0,0,1, -3,0,0,0,0,2,0,1,1,1,1,1,1,1,0,1,0,0,0,1,1,1,0,1,0,1,1,0,0,1,0,1, -1,1,0,0,1,0,0,0,1,0,1,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1, -1,3,3,2,2,0,0,0,2,2,0,0,0,1,2,0,1,1,2,0,0,0,0,0,0,0,0,1,0,0,2,1, -0,1,1,0,0,1,1,0,0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0, -2,3,2,3,2,0,0,0,0,1,1,0,0,0,2,0,2,0,2,0,0,0,0,0,1,0,0,1,0,0,1,1, -1,1,2,0,1,2,1,0,1,1,2,1,1,1,1,1,2,1,1,0,1,0,0,1,1,1,1,1,0,1,1,0, -1,3,2,2,2,1,0,0,2,2,1,0,1,2,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1, -0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0, -1,0,0,1,0,2,3,1,2,2,2,2,2,2,1,1,0,0,0,1,0,1,0,2,1,1,1,0,0,0,0,1, -1,1,0,1,1,0,1,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0, -2,0,2,0,0,1,0,3,2,1,2,1,2,2,0,1,0,0,0,2,1,0,0,2,1,1,1,1,0,2,0,2, -2,1,1,1,1,1,1,1,1,1,1,1,1,2,1,0,1,1,1,1,0,0,0,1,1,1,1,0,1,0,0,1, -1,2,2,2,2,1,0,0,1,0,0,0,0,0,2,0,1,1,1,1,0,0,0,0,1,0,1,2,0,0,2,0, -1,0,1,1,1,2,1,0,1,0,1,1,0,0,1,0,1,1,1,0,1,0,0,0,1,0,0,1,0,1,1,0, -2,1,2,2,2,0,3,0,1,1,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1, -0,0,0,1,1,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0, -1,2,2,3,2,2,0,0,1,1,2,0,1,2,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1, -0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0, -2,2,1,1,2,1,2,2,2,2,2,1,2,2,0,1,0,0,0,1,2,2,2,1,2,1,1,1,1,1,2,1, -1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,1,1,1,0,0,0,0,1,1,1,0,1,1,0,0,1, -1,2,2,2,2,0,1,0,2,2,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0, -0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0, -0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0, -0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -1,2,2,2,2,0,0,0,2,2,2,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1, -0,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -1,2,2,2,2,0,0,0,0,1,0,0,1,1,2,0,0,0,0,1,0,1,0,0,1,0,0,2,0,0,0,1, -0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0, -1,2,2,2,1,1,2,0,2,1,1,1,1,0,2,2,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1, -0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0, -1,0,2,1,2,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0, -0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0, -1,0,0,0,0,2,0,1,2,1,0,1,1,1,0,1,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,1, -0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, -2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, -1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0, -2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, -1,1,1,0,1,0,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0, -1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, -1,1,0,1,1,0,1,0,1,0,0,0,0,1,1,0,1,1,0,0,0,0,0,1,0,1,1,0,1,0,0,0, -0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,3,3,3,3,3, + 3,3,3,3,0,3,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,3,3,3,3,3, + 3,3,3,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,3,3,3,3,3, + 3,3,3,3,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,3,3,3,3,3, + 3,3,3,2,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,2,3,2,3,2,3,3,3,3,3,3,0,3,3,3,3,3,3, + 3,3,3,2,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,3,3,3,3,3,3, + 3,3,1,3,0,3,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,3,3,3, + 3,3,2,3,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,1,0,3,3,2,1,3,3, + 1,3,2,2,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,3,2,3,2,2,1, + 3,1,3,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,2,2,0,3,2,3,3,1,3, + 1,2,2,2,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,1,3,1,1,3,0,2,1,1,3,3,1, + 2,2,0,2,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,2,2,3,3,3,3,2,3,3,3,2,3,3,2,0,3,2,1,1,2,2, + 1,2,1,3,1,3,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,3,3,2,3,3,3, + 3,1,0,3,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 3,3,3,3,3,3,3,3,1,3,3,1,2,3,3,3,1,3,3,1,0,3,2,1,0,3,1, + 2,2,1,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,0,3,3,2,0,3,3,3,3,3, + 3,3,3,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 1,1,1,3,3,3,3,3,3,3,3,3,3,3,1,3,3,0,3,3,3,0,3,3,3,3,3, + 2,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,3,1,3,1,3,1,2,2,0,1,2,1,1,1,2, + 2,1,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,2,1,3,3,3,3,3,3,3,3,3,3,3,1,2,3,0,3,3,3,0,3,3,3,2,0, + 3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,2,3,3,3,3,2,1,3,3,1,3,3,2,0,2,2,3,3,2,3, + 3,1,3,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 3,3,3,3,3,3,3,1,3,3,3,3,3,1,3,3,3,3,3,2,0,3,2,0,3,2,2, + 2,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, + 3,2,2,2,3,3,3,3,2,3,3,3,3,2,1,2,2,0,3,2,0,0,3,1,1,3,1, + 3,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,0,3,3,3,2,3,3,1,3,3,3,3,0,3,3,0,3,3,0,0,3,1,1,3,3, + 3,3,2,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,3,3,3,3,1,3,3,2,3,3,2,0,1,3,0,0,1,0,0,0,3,1,0,3,0,1, + 3,1,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,3,2,1,3,0,1,1,1,1,0,2,1,2,0,1,1, + 1,1,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,3,3,1,1,2,1,3,1,3,2,3,0,1,1,3,1,0,3,2,0,1,2,1, + 1,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,3,1,1,1,1,3,2,3,1,2,1,3,1,1,3,2,2,0,1,0,1,0,1,1, + 1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,2,2,1,3,3,3,3,2,3,2,3,3,2,1,1,3,0,3,3,1,0,3,2,3,3,2, + 1,1,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,2,3,3,3,3,3,2,1,2,3,0,1,0,1,0,0,3,1,1,0,0,1, + 1,1,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,2,3,3,3,0,3,3,2,1,1,3,1,3,2,1,1,0,3,0,1,0,1,2, + 1,3,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,3,3,3,3,0,1,0,0,0,0,0,0,0,3,0,0,0,0,0,0,3,0,0,0,0,0, + 0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,3,3,3,3,3,3,3,3,3,3,1,2,2,0,2,2,3,0,1,1,1,2,1, + 2,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,3,3,3,3,3,3,3,3,1,3,2,3,2,3,3,2,1,3,0, + 0,0,0,1,3,3,3,3,3,3,2,3,2,2,1,1,3,0,2,3,2,0,1,3,2,1,0, + 0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,1,0,0,1,1,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,3,3,3,3,2,3,3,3,2,3,3,2,2,2,3,2,2,3,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, + 0,0,0,0,3,0,2,2,3,3,3,3,2,3,3,3,2,3,3,1,3,2,3,3,3,2,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,3,2,3,3,3,2,3,3,1,1,2,1,1,2,3,2,2,1,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,3,3,3,3,3,3,3,3,2,3,3,0,2,2,3,2,3,2,0, + 0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,1,3,3,3,3,2,2,1,3,1,3,0,1,1,1,2,3,1,0, + 0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,2,1,3,3,3,2,3,1,3,2,2,0,3,3,1,2,2,1,0, + 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,2,0,2,2,3,3,3,3,3,3,3,3,2,3,3,2,3,3,2,3,2,3,0, + 1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,1,1,3,2,3,3,2,2,1,2,3,0,1,2,1,2,2,1,0, + 1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,2,2,3,2,3,3,2,1,3,1,2,0,1,1,1,1,2,1,0, + 0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,1,2,1,2,3,2,1,2,1,1,2,0,1,1,2,1,2,1,0, + 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,2,2,3,2,3,2,1,1,1,1,3,0,1,1,1,1,2,0,0, + 0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,2,1,1,2,3,1,2,1,1,3,3,0,3,3,2,1,2,1,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,2,0,3,2,3,3,3,3,1,3,2,2,1,3,0,1,2,3,2,2,1,1,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,1,1,0,1,1,1,1,0,1,0,1,0,0,3,1,1,0,1,1,3,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,1,3,2,2,3,3,1,1,3,1,3,0,2,1,0,1,1,1,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,3,3,1,1,3,3,2,1,1,1,2,1,1,2,0,1,2,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,2,3,1,2,3,3,1,1,3,1,2,0,1,1,1,0,1,1,0, + 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,1,3,2,1,3,2,1,1,0,1,2,1,0,1,1,2,1,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,1,0,2,2,2,1,1,3,2,3,2,1,1,1,1,0,1,3,1,1,1,1,0, + 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,3,0,3,3,1,1,0,1,2,0,1,1,0,1,1,1,0,1,0,1,1,0,0, + 0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1, + 0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, }; -const SequenceModel Koi8rRussianModel = +const SequenceModel Windows_1251RussianModel = { - KOI8R_CharToOrderMap, + Windows_1251_CharToOrderMap, RussianLangModel, - 64, - (float)0.976601, + 54, + (float)0.9990043516524518, PR_FALSE, - "KOI8-R", + "WINDOWS-1251", "ru" }; -const SequenceModel Win1251RussianModel = +const SequenceModel Iso_8859_5RussianModel = { - win1251_CharToOrderMap, + Iso_8859_5_CharToOrderMap, RussianLangModel, - 64, - (float)0.976601, + 54, + (float)0.9990043516524518, PR_FALSE, - "WINDOWS-1251", + "ISO-8859-5", "ru" }; -const SequenceModel Latin5RussianModel = +const SequenceModel Koi8_RRussianModel = { - latin5_CharToOrderMap, + Koi8_R_CharToOrderMap, RussianLangModel, - 64, - (float)0.976601, + 54, + (float)0.9990043516524518, PR_FALSE, - "ISO-8859-5", + "KOI8-R", "ru" }; -const SequenceModel MacCyrillicRussianModel = +const SequenceModel Ibm855RussianModel = { - macCyrillic_CharToOrderMap, + Ibm855_CharToOrderMap, RussianLangModel, - 64, - (float)0.976601, + 54, + (float)0.9990043516524518, PR_FALSE, - "MAC-CYRILLIC", + "IBM855", "ru" }; const SequenceModel Ibm866RussianModel = { - IBM866_CharToOrderMap, + Ibm866_CharToOrderMap, RussianLangModel, - 64, - (float)0.976601, + 54, + (float)0.9990043516524518, PR_FALSE, "IBM866", "ru" }; -const SequenceModel Ibm855RussianModel = +const SequenceModel Mac_CyrillicRussianModel = { - IBM855_CharToOrderMap, + Mac_Cyrillic_CharToOrderMap, RussianLangModel, - 64, - (float)0.976601, + 54, + (float)0.9990043516524518, PR_FALSE, - "IBM855", + "MAC-CYRILLIC", "ru" }; + +const LanguageModel RussianModel = +{ + "ru", + Unicode_CharOrder, + 108, + RussianLangModel, + 54, + 5, + (float)0.40765948913984873, + 29, + (float)0.030302616590369, +}; diff --git a/src/nsLanguageDetector.h b/src/nsLanguageDetector.h index 6ac7ffc..01eedfb 100644 --- a/src/nsLanguageDetector.h +++ b/src/nsLanguageDetector.h @@ -150,6 +150,7 @@ extern const LanguageModel NorwegianModel; extern const LanguageModel PolishModel; extern const LanguageModel PortugueseModel; extern const LanguageModel RomanianModel; +extern const LanguageModel RussianModel; extern const LanguageModel SlovakModel; extern const LanguageModel SloveneModel; extern const LanguageModel SpanishModel; diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp index 874e182..04c628d 100644 --- a/src/nsMBCSGroupProber.cpp +++ b/src/nsMBCSGroupProber.cpp @@ -117,6 +117,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) langDetectors[i][j++] = new nsLanguageDetector(&PolishModel); langDetectors[i][j++] = new nsLanguageDetector(&PortugueseModel); langDetectors[i][j++] = new nsLanguageDetector(&RomanianModel); + langDetectors[i][j++] = new nsLanguageDetector(&RussianModel); langDetectors[i][j++] = new nsLanguageDetector(&SlovakModel); langDetectors[i][j++] = new nsLanguageDetector(&SloveneModel); langDetectors[i][j++] = new nsLanguageDetector(&SpanishModel); diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h index c1b2367..17c3f66 100644 --- a/src/nsMBCSGroupProber.h +++ b/src/nsMBCSGroupProber.h @@ -49,7 +49,7 @@ #include "nsEUCTWProber.h" #define NUM_OF_PROBERS 8 -#define NUM_OF_LANGUAGES 34 +#define NUM_OF_LANGUAGES 35 class nsMBCSGroupProber: public nsCharSetProber { public: diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 8344b9b..7d474ca 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -50,10 +50,10 @@ nsSBCSGroupProber::nsSBCSGroupProber() PRUint32 heb_prober_idx; PRUint32 n = 0; - mProbers[n++] = new nsSingleByteCharSetProber(&Win1251RussianModel); - mProbers[n++] = new nsSingleByteCharSetProber(&Koi8rRussianModel); - mProbers[n++] = new nsSingleByteCharSetProber(&Latin5RussianModel); - mProbers[n++] = new nsSingleByteCharSetProber(&MacCyrillicRussianModel); + mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1251RussianModel); + mProbers[n++] = new nsSingleByteCharSetProber(&Koi8_RRussianModel); + mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5RussianModel); + mProbers[n++] = new nsSingleByteCharSetProber(&Mac_CyrillicRussianModel); mProbers[n++] = new nsSingleByteCharSetProber(&Ibm866RussianModel); mProbers[n++] = new nsSingleByteCharSetProber(&Ibm855RussianModel); diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 5eb8f12..3ab5830 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -134,10 +134,10 @@ protected: extern const SequenceModel Windows_1256ArabicModel; extern const SequenceModel Iso_8859_6ArabicModel; -extern const SequenceModel Koi8rRussianModel; -extern const SequenceModel Win1251RussianModel; -extern const SequenceModel Latin5RussianModel; -extern const SequenceModel MacCyrillicRussianModel; +extern const SequenceModel Koi8_RRussianModel; +extern const SequenceModel Windows_1251RussianModel; +extern const SequenceModel Iso_8859_5RussianModel; +extern const SequenceModel Mac_CyrillicRussianModel; extern const SequenceModel Ibm866RussianModel; extern const SequenceModel Ibm855RussianModel; diff --git a/test/ru/utf-8.txt b/test/ru/utf-8.txt new file mode 100644 index 0000000..11fb7c5 --- /dev/null +++ b/test/ru/utf-8.txt @@ -0,0 +1 @@ +Сурки образуют отчётливо выраженную группу из 14 или 15 видов (статус лесостепного сурка как отдельного вида является предметом обсуждения), в рамках семейства беличьих. Это относительно крупные, весом в несколько килограммов, животные, обитающие в открытых ландшафтах, в сооружаемых самостоятельно норах. Прародина сурков — Северная Америка, откуда они распространились через Берингию в Азию, и дальше — в Европу. В Евразии большинство исследователей выделяет 8 видов сурков: три или четыре вида, объединяемых в группу bobak (степной сурок, лесостепной сурок, серый сурок и монгольский сурок), населяющие широкую полосу степей и гор от Украины на западе до северо-западного Китая на востоке, единственный чисто европейский вид — альпийский сурок, три вида гор Центральной Азии — сурок Мензбира, длиннохвостый, или красный сурок и гималайский сурок и обособленный северо-восточный вид — черношапочный сурок. Разные виды сурков обособились в различных географических зонах и отличаются друг от друга особенностями поведения, но сохранили внешнее сходство и необходимость впадать в зимнюю спячку. Все сурки травоядны, селятся в норах, имеют тёплый мех и почти все живут колониями. Различаются равнинные сурки (байбаки) и сурки горные, живущие в суровых условиях альпийских гор, куда летнее тепло приходит поздно, а зима является рано. Сурки встречают свистом восход солнца[3]. Сурки иногда храпят[3]. -- cgit v1.2.3