summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJehan <jehan@girinstud.io>2022-12-17 22:46:13 +0100
committerJehan <jehan@girinstud.io>2022-12-17 22:47:54 +0100
commitabd123e07da34a88255c17ff470ed70c0745de43 (patch)
treea490532311618af744e9a49a19b4340438efa878
parentd00d4d52b7b389d970685491eccb10228ffccdbd (diff)
script, src, test: add Serbian support.
For UTF-8, ISO-8859-5 and WINDOWS-1251. Test files' contents come from page 'Мрмот' on Wikipedia in Serbian.
-rw-r--r--README.md4
-rw-r--r--script/BuildLangModelLogs/LangSerbianModel.log251
-rw-r--r--script/langs/sr.py58
-rw-r--r--src/CMakeLists.txt1
-rw-r--r--src/LangModels/LangSerbianModel.cpp274
-rw-r--r--src/nsLanguageDetector.h1
-rw-r--r--src/nsMBCSGroupProber.cpp1
-rw-r--r--src/nsMBCSGroupProber.h2
-rw-r--r--src/nsSBCSGroupProber.cpp3
-rw-r--r--src/nsSBCSGroupProber.h2
-rw-r--r--src/nsSBCharSetProber.h3
-rw-r--r--test/sr/iso-8859-5.txt1
-rw-r--r--test/sr/utf-8.txt1
-rw-r--r--test/sr/windows-1251.txt1
14 files changed, 601 insertions, 2 deletions
diff --git a/README.md b/README.md
index 301c6a3..fe02758 100644
--- a/README.md
+++ b/README.md
@@ -168,6 +168,10 @@ uchardet started as a C language binding of the original C++ implementation of t
* MAC-CYRILLIC
* IBM866
* IBM855
+ * Serbian
+ * UTF-8
+ * ISO-8859-5
+ * WINDOWS-1251
* Slovak
* UTF-8
* Windows-1250
diff --git a/script/BuildLangModelLogs/LangSerbianModel.log b/script/BuildLangModelLogs/LangSerbianModel.log
new file mode 100644
index 0000000..426e0b2
--- /dev/null
+++ b/script/BuildLangModelLogs/LangSerbianModel.log
@@ -0,0 +1,251 @@
+= Logs of language model for Serbian (sr) =
+
+- Generated by BuildLangModel.py
+- Started: 2022-12-17 22:32:34.945303
+- Maximum depth: 4
+- Max number of pages: 200
+
+== Parsed pages ==
+
+Ратно_ваздухопловство_и_противваздушна_одбрана_Војске_Републике_Српске (revision 24582261)
+Рат у Босни и Херцеговини (revision 25415566)
+Момчило Крајишник (revision 25271534)
+Предсједник Републике Српске (revision 25359856)
+Драган Вуковић (revision 25343578)
+М53/59 Прага (revision 25379553)
+2. батаљон Војне полиције (revision 23891722)
+Никола Делић (revision 24741676)
+Источнобосански корпус Војске Републике Српске (revision 24462780)
+5. козарска лака пјешадијска бригада (revision 25381013)
+Радомир Чавић (revision 24221757)
+25. јануар (revision 25269630)
+6. санска лака пјешадијска бригада (revision 24929928)
+17. кључка лака пјешадијска бригада (revision 24355928)
+Орлови Грмеча (revision 23891727)
+Мићо Влаисављевић (revision 24915097)
+Цвјетко Савић (revision 25199639)
+Патриотска лига (БиХ) (revision 21762882)
+Упала плућа (revision 24386531)
+Милорад Кутлешић (revision 25169318)
+92. мјешовита авијацијска бригада (revision 24461739)
+Историја Републике Српске (1992—1995) (revision 25396800)
+17. август (revision 25162661)
+Социјалистичка партија (Република Српска) (revision 25339684)
+Сима Лозанић (revision 25227668)
+Орден за војне заслуге (revision 24140948)
+Живомир Нинковић (revision 24589570)
+2. подрињска лака пјешадијска бригада (revision 24490712)
+1. которварошка лака пјешадијска бригада (revision 23884431)
+Анте Марковић (revision 25363253)
+15. новембар (revision 25387946)
+Радивоје Милетић (revision 24232926)
+Policija Republike Srpske (revision 24293827)
+Стаменко Новаковић (revision 24933201)
+Самоопредељење (revision 25423052)
+18. децембар (revision 25259258)
+2. оклопна бригада (revision 24369237)
+Славко Лисица (revision 25424390)
+УНКРО (revision 24399325)
+25. децембар (revision 25394962)
+Београд (revision 25435556)
+Епархија захумско-херцеговачка и приморска (revision 25193025)
+Будимир Гаврић (revision 25205770)
+Први батаљон војне полиције 1.КК (revision 23998235)
+1950 (revision 25396634)
+Радивоје Томанић (revision 24051372)
+Јован Марић (revision 24589591)
+Мило Ђукановић (revision 25437661)
+Џон Херт (revision 24441263)
+Дринска бановина (revision 25409388)
+43. приједорска моторизована бригада (revision 25347814)
+Југословенска народна армија (revision 25388846)
+Питер Фајт (revision 25425148)
+2. сарајевска лака пјешадијска бригада (revision 24591755)
+2. теслићка лака пјешадијска бригада (revision 24027353)
+Драгиша Масал (revision 25302857)
+Карађорђева звијезда (revision 24721051)
+Перо Млађеновић (revision 25424880)
+Татра (revision 25171753)
+1831 (revision 24734221)
+Карингтон-Кутиљеров план (revision 25391410)
+Poli(metil metakrilat) (revision 20647284)
+3. сарајевска пјешадијска бригада (revision 24591756)
+Милутин Скочајић (revision 24619682)
+Владимир Арсић (revision 24238327)
+Гарда Пантери (revision 24200236)
+4. август (revision 25139596)
+Владо Спремо (revision 25339001)
+Манојло Миловановић (revision 25368228)
+Мировни планови прије и током Рата у БиХ (revision 24482990)
+Вашингтонски споразум (1994) (revision 22769830)
+Чедо Сладоје (revision 24464996)
+24. фебруар (revision 25270517)
+Сарајевско-романијски корпус Војске Републике Српске (revision 24591773)
+Момир Зец (revision 25274792)
+ЈНА (revision 25388846)
+Momir Talić (revision 24773518)
+Вељко Стојановић (revision 24591774)
+Здравко Толимир (revision 24593446)
+Новак Ђукић (revision 24724367)
+Оклоп (revision 23883339)
+Представништва Републике Српске у иностранству (revision 25423590)
+Словачка (revision 25283209)
+Ваздухопловни завод Космос (revision 25214359)
+Спасоје Орашанин (revision 24464248)
+Битка за Возућу (revision 25351440)
+Операција Намјерна сила (revision 25416485)
+Дејтонски мировни споразум (revision 25403210)
+1. херцеговачка моторизована бригада (Требиње) (revision 25162762)
+Москва (revision 25329265)
+Богдан Суботић (revision 24318915)
+Чехословачка (revision 25210184)
+11. мркоњићка лака пешадијска бригада (revision 23887232)
+13. новембар (revision 25357481)
+Светозар Андрић (revision 25255141)
+Мате Бобан (revision 24220533)
+Блаж Краљевић (revision 24737190)
+Логор Узамница (revision 24525764)
+Абасиди (revision 25422122)
+19. јануар (revision 25314987)
+Предсједник Владе Републике Српске (revision 25340553)
+Милован Станковић (политичар, 1958) (revision 25152054)
+Топ (revision 25201602)
+Емил Влајки (revision 24038256)
+7. извиђачко-диверзантски одред (revision 24073614)
+Никола Мишковић (revision 25228450)
+Инцидент код Мркоњић Града (revision 25389261)
+БОВ (оклопни транспортер) (revision 25252351)
+Мићо Грубор (revision 24289250)
+Социјалистичка Република Босна и Херцеговина (revision 24573038)
+13. фебруар (revision 25259071)
+Маринко Шиљеговић (revision 24589619)
+Европски ратови (revision 25166321)
+1991 (revision 25356221)
+1999 (revision 25425404)
+Станислав Галић (revision 24775466)
+Самостални пјешадијски батаљон Скелани (revision 24236929)
+Бијело Брдо (Дервента) (revision 23651156)
+Војска Републике Српске (revision 25349210)
+Радомир Лукић (revision 24268767)
+1961 (revision 24417631)
+Орден слободе (revision 25287659)
+Ослободилачка национална армија (revision 24118083)
+2. семберска лака пјешадијска бригада (revision 24461729)
+Операција Спреча 95 (revision 24403645)
+Драгомир Милошевић (revision 24780575)
+26. фебруар (revision 25147680)
+Operacija Una (revision 24725456)
+Божо Новак (revision 25300274)
+Херцеговачки корпус Војске Републике Српске (revision 24479790)
+2. крајишки корпус Војске Републике Српске (revision 25162755)
+Операција Звезда (revision 24403718)
+Војно медицински центар (revision 23886998)
+Бошко Келечевић (revision 23631478)
+Предузетник (revision 24933587)
+Бошко Гвозден (revision 25269514)
+15. март (revision 25314965)
+Територијална одбрана Српске Крајине (revision 24437954)
+Вукови са Вучијака (revision 25371082)
+4. јануар (revision 25388556)
+Република Босна и Херцеговина (revision 25139818)
+16. октобар (revision 25396928)
+Немања Недовић (revision 25359086)
+Операција Штит (revision 25375240)
+Митко Стојковски (revision 25232528)
+Српска академија наука и уметности (revision 25413602)
+Француска револуција (revision 25223439)
+Србија (revision 25433539)
+7. лаки артиљеријски пук ПВО Херцеговачког корпуса (revision 23891985)
+Рајко Балаћ (revision 24926978)
+Битка за Купрес (1994) (revision 25241940)
+Паравојска (revision 25390961)
+Саво Сокановић (revision 23929682)
+Брег (река) (revision 24516962)
+7. купрешка моторизована бригада (revision 23891702)
+Народна партија Српске (revision 25339710)
+Radislav Krstić (revision 25169830)
+Центар војних школа ВРС „Генерал Рајко Балаћ” (revision 25160226)
+Генерал (ЈНА) (revision 24124119)
+Ратно ваздухопловство и противваздушна одбрана Војске Републике Српске (revision 24582261)
+15. век (revision 25356672)
+Богдан Сладојевић (revision 25312818)
+Карађорђе Петровић (revision 25412434)
+Херцеговачки санџак (revision 23536553)
+7. март (revision 25315286)
+НАТО бомбардовање Републике Српске (revision 25416485)
+Владо Лиздек (revision 25116123)
+1998 (revision 25434166)
+1. добојска лака пјешадијска бригада (revision 25258891)
+Сан Марино (revision 25357929)
+Карловачка митрополија (revision 25413663)
+
+== End of Parsed pages ==
+
+- Wikipedia parsing ended at: 2022-12-17 22:36:36.028806
+
+64 characters appeared 1001054 times.
+
+Most Frequent characters:
+[ 0] Char а: 11.390294629460548 %
+[ 1] Char и: 8.957958311939217 %
+[ 2] Char о: 8.472270227180552 %
+[ 3] Char е: 8.20595092772218 %
+[ 4] Char р: 6.002373498332758 %
+[ 5] Char н: 5.695996419773559 %
+[ 6] Char с: 5.0323958547690735 %
+[ 7] Char к: 4.062318316494415 %
+[ 8] Char у: 4.039442427681224 %
+[ 9] Char т: 3.779216705592306 %
+[10] Char в: 3.5658416029504902 %
+[11] Char ј: 3.2442805283231473 %
+[12] Char д: 3.23958547690734 %
+[13] Char п: 3.0462892111714255 %
+[14] Char л: 2.7947543289373 %
+[15] Char м: 2.7568942334779143 %
+[16] Char б: 1.946947916895592 %
+[17] Char г: 1.7272794474623747 %
+[18] Char з: 1.4506709927736166 %
+[19] Char ц: 0.9910554275793314 %
+[20] Char ч: 0.8583952514050192 %
+[21] Char ш: 0.8464078860880633 %
+[22] Char х: 0.7302303372245653 %
+[23] Char њ: 0.5863819534210941 %
+[24] Char i: 0.5255460744375429 %
+[25] Char a: 0.5215502859985576 %
+[26] Char ћ: 0.4614136699918286 %
+[27] Char љ: 0.4285483100811744 %
+[28] Char e: 0.38309621658771653 %
+[29] Char o: 0.3636167479476632 %
+[30] Char ж: 0.3585221176879569 %
+[31] Char ф: 0.33494696589794354 %
+[32] Char n: 0.3133697083274229 %
+[33] Char ђ: 0.2959880286178368 %
+[34] Char r: 0.2686168778107874 %
+[35] Char s: 0.25932666969014656 %
+[36] Char t: 0.2007883690590118 %
+[37] Char u: 0.19029942440667535 %
+[38] Char j: 0.18690300423353784 %
+[39] Char k: 0.1803099533092121 %
+[40] Char l: 0.1803099533092121 %
+[41] Char p: 0.16782311443738301 %
+[42] Char m: 0.16312806302157526 %
+[43] Char c: 0.1390534376766888 %
+[44] Char v: 0.1382542799888917 %
+[45] Char d: 0.1287642824463016 %
+[46] Char b: 0.09040471343204262 %
+[47] Char g: 0.06942682412736975 %
+[48] Char z: 0.061834826093297664 %
+[49] Char h: 0.05254461797265682 %
+[50] Char џ: 0.04285483100811745 %
+
+The first 51 characters have an accumulated ratio of 0.9993047328116168.
+The first 5 characters have an accumulated ratio of 0.43028847594635256.
+All characters whose order is over 31 have an accumulated ratio of 0.03130000978968167.
+
+1174 sequences found.
+
+First 658 (typical positive ratio): 0.9950262953064305
+Next 193 (851-658): 0.0039828494616473975
+Rest: 0.0009908552319221053
+
+- Processing end: 2022-12-17 22:36:36.141687
diff --git a/script/langs/sr.py b/script/langs/sr.py
new file mode 100644
index 0000000..4981db1
--- /dev/null
+++ b/script/langs/sr.py
@@ -0,0 +1,58 @@
+#!/bin/python3
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+import re
+
+## Mandatory Properties ##
+
+name = 'Serbian'
+code = 'sr'  # language code; reused below as the Wikipedia code
+use_ascii = False
+charsets = [ 'WINDOWS-1251', 'ISO-8859-5' ]  # single-byte charsets listed for this language
+
+## Optional Properties ##
+
+# Alphabet characters: the 30 lowercase letters of the Serbian Cyrillic alphabet.
+alphabet = 'абвгдђежзијклљмнњопрстћуфхцчџш'
+# A starred (featured) page that was showcased on the main page when
+# the data was created.
+start_pages = ['Ратно_ваздухопловство_и_противваздушна_одбрана_Војске_Републике_Српске']
+wikipedia_code = code
+case_mapping = True  # NOTE(review): presumably folds upper/lower case onto one order - confirm in BuildLangModel.py
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index cec12ab..c017642 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -33,6 +33,7 @@ set(
LangModels/LangPortugueseModel.cpp
LangModels/LangRomanianModel.cpp
LangModels/LangRussianModel.cpp
+ LangModels/LangSerbianModel.cpp
LangModels/LangSlovakModel.cpp
LangModels/LangSloveneModel.cpp
LangModels/LangSwedishModel.cpp
diff --git a/src/LangModels/LangSerbianModel.cpp b/src/LangModels/LangSerbianModel.cpp
new file mode 100644
index 0000000..a1a40a3
--- /dev/null
+++ b/src/LangModels/LangSerbianModel.cpp
@@ -0,0 +1,274 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla Communicator client code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 1998
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "../nsSBCharSetProber.h"
+#include "../nsLanguageDetector.h"
+
+/********* Language model for: Serbian *********/
+
+/**
+ * Generated by BuildLangModel.py
+ * On: 2022-12-17 22:36:36.029761
+ **/
+
+/* Character Mapping Table:
+ * ILL: illegal character.
+ * CTR: control character specific to the charset.
+ * RET: line feed (0x0A) or carriage return (0x0D).
+ * SYM: symbol (punctuation) that does not belong to a word.
+ * NUM: digit, 0 - 9.
+ *
+ * Other characters are ordered by decreasing probability
+ * (0 is the most common character in the language).
+ *
+ * Orders are generic to a language. So the codepoint with order X in
+ * CHARSET1 maps to the same character as the codepoint with the same
+ * order X in CHARSET2 for the same language.
+ * As such, some orders may be missing from a given charset. For instance the
+ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
+ * even though they are both used for French. Same for the euro sign.
+ */
+static const unsigned char Windows_1251_CharToOrderMap[] = /* 16 x 16 entries, one per byte value 0x00-0xFF */
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 25, 46, 43, 45, 28, 51, 47, 49, 24, 38, 39, 40, 42, 32, 29, /* 4X */
+ 41, 55, 34, 35, 36, 37, 44, 53, 54, 52, 48,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 25, 46, 43, 45, 28, 51, 47, 49, 24, 38, 39, 40, 42, 32, 29, /* 6X */
+ 41, 55, 34, 35, 36, 37, 44, 53, 54, 52, 48,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ 33, 64,SYM, 65,SYM,SYM,SYM,SYM,SYM,SYM, 27,SYM, 23, 66, 26, 50, /* 8X */
+ 33,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, 27,SYM, 23, 67, 26, 50, /* 9X */
+ SYM, 68, 69, 11,SYM, 70,SYM,SYM, 71,SYM, 72,SYM,SYM,SYM,SYM, 73, /* AX */
+ SYM,SYM, 62, 62, 74,SYM,SYM,SYM, 75,SYM, 76,SYM, 11, 77, 78, 79, /* BX */
+ 0, 16, 10, 17, 12, 3, 30, 18, 1, 57, 7, 14, 15, 5, 2, 13, /* CX */
+ 4, 6, 9, 8, 31, 22, 19, 20, 21, 80, 60, 59, 56, 63, 61, 58, /* DX */
+ 0, 16, 10, 17, 12, 3, 30, 18, 1, 57, 7, 14, 15, 5, 2, 13, /* EX */
+ 4, 6, 9, 8, 31, 22, 19, 20, 21, 81, 60, 59, 56, 63, 61, 58, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const unsigned char Iso_8859_5_CharToOrderMap[] = /* 16 x 16 entries, one per byte value 0x00-0xFF; same order numbering as the WINDOWS-1251 map */
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 25, 46, 43, 45, 28, 51, 47, 49, 24, 38, 39, 40, 42, 32, 29, /* 4X */
+ 41, 55, 34, 35, 36, 37, 44, 53, 54, 52, 48,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 25, 46, 43, 45, 28, 51, 47, 49, 24, 38, 39, 40, 42, 32, 29, /* 6X */
+ 41, 55, 34, 35, 36, 37, 44, 53, 54, 52, 48,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
+ SYM, 82, 33, 83, 84, 85, 62, 86, 11, 27, 23, 26, 87,SYM, 88, 50, /* AX */
+ 0, 16, 10, 17, 12, 3, 30, 18, 1, 57, 7, 14, 15, 5, 2, 13, /* BX */
+ 4, 6, 9, 8, 31, 22, 19, 20, 21, 89, 60, 59, 56, 63, 61, 58, /* CX */
+ 0, 16, 10, 17, 12, 3, 30, 18, 1, 57, 7, 14, 15, 5, 2, 13, /* DX */
+ 4, 6, 9, 8, 31, 22, 19, 20, 21, 90, 60, 59, 56, 63, 61, 58, /* EX */
+ SYM, 91, 33, 92, 93, 94, 62, 95, 11, 27, 23, 26, 96,SYM, 97, 50, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const int Unicode_Char_size = 102; /* number of (codepoint, order) pairs in Unicode_CharOrder */
+static const unsigned int Unicode_CharOrder[] = /* flat list of pairs: Unicode codepoint, frequency order; ascending by codepoint */
+{
+ 65, 25, 66, 46, 67, 43, 68, 45, 69, 28, 71, 47, 72, 49, 73, 24, /* upper- and lower-case forms of a letter share one order */
+ 74, 38, 75, 39, 76, 40, 77, 42, 78, 32, 79, 29, 80, 41, 82, 34,
+ 83, 35, 84, 36, 85, 37, 86, 44, 90, 48, 97, 25, 98, 46, 99, 43,
+ 100, 45, 101, 28, 103, 47, 104, 49, 105, 24, 106, 38, 107, 39, 108, 40,
+ 109, 42, 110, 32, 111, 29, 112, 41, 114, 34, 115, 35, 116, 36, 117, 37,
+ 118, 44, 122, 48, 1026, 33, 1032, 11, 1033, 27, 1034, 23, 1035, 26,1039, 50,
+ 1040, 0, 1041, 16, 1042, 10, 1043, 17, 1044, 12, 1045, 3, 1046, 30,1047, 18,
+ 1048, 1, 1050, 7, 1051, 14, 1052, 15, 1053, 5, 1054, 2, 1055, 13,1056, 4,
+ 1057, 6, 1058, 9, 1059, 8, 1060, 31, 1061, 22, 1062, 19, 1063, 20,1064, 21,
+ 1072, 0, 1073, 16, 1074, 10, 1075, 17, 1076, 12, 1077, 3, 1078, 30,1079, 18,
+ 1080, 1, 1082, 7, 1083, 14, 1084, 15, 1085, 5, 1086, 2, 1087, 13,1088, 4,
+ 1089, 6, 1090, 9, 1091, 8, 1092, 31, 1093, 22, 1094, 19, 1095, 20,1096, 21,
+ 1106, 33, 1112, 11, 1113, 27, 1114, 23, 1115, 26, 1119, 50,
+};
+
+
+/* Model Table:
+ * Total considered sequences: 1174 / 2601 (2601 = 51 x 51 possible pairs of the 51 frequent characters)
+ * - Positive sequences: first 658 (0.9950262953064305)
+ * - Probable sequences: next 193 (851-658) (0.0039828494616473975)
+ * - Neutral sequences: last 1750 (0.0009908552319221053)
+ * - Negative sequences: 1427 (off-ratio)
+ * Negative sequences: TODO
+ */
+static const PRUint8 SerbianLangModel[] =
+{
+ 1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,
+ 0,3,3,0,1,3,3,0,3,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,3,
+ 3,1,3,2,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,
+ 0,3,3,0,0,3,3,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,
+ 0,3,3,0,0,3,3,1,3,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,2,
+ 3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,
+ 0,3,3,0,0,3,3,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,
+ 3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,
+ 1,1,1,0,0,3,2,0,3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,
+ 3,3,3,3,3,1,3,3,3,3,3,2,3,3,2,1,3,3,3,3,3,1,2,0,0,
+ 1,0,1,0,0,3,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,1,0,3,0,0,2,0,0,
+ 1,0,3,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,2,3,3,3,2,2,2,3,3,1,1,0,3,1,2,1,3,0,
+ 1,2,3,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,
+ 0,3,3,0,0,3,3,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,3,3,3,3,3,3,3,3,1,3,3,2,3,3,2,3,1,2,2,1,0,3,3,0,
+ 1,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,2,1,3,1,3,2,3,0,1,0,
+ 0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,2,3,3,3,3,3,3,2,3,3,3,3,2,1,3,3,3,2,2,2,0,
+ 0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
+ 3,3,3,3,3,3,3,1,3,1,3,3,0,2,3,3,3,3,3,1,0,1,1,3,0,
+ 1,0,2,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,2,3,3,3,3,1,1,3,1,1,1,1,2,2,3,1,0,0,
+ 0,1,3,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,1,3,3,3,3,3,2,0,3,2,1,3,3,2,1,2,2,3,2,1,0,
+ 0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
+ 3,3,3,3,3,3,3,2,3,1,1,3,2,3,3,3,3,1,2,3,2,1,0,2,0,
+ 1,1,3,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,2,1,3,2,2,3,2,1,3,1,0,1,3,1,1,1,2,1,0,
+ 0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,1,1,3,2,3,0,3,1,3,2,1,1,2,0,0,1,0,2,0,
+ 0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,1,1,3,0,3,3,3,1,3,3,3,3,0,0,0,0,0,1,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,1,1,2,3,1,2,3,0,1,1,1,1,1,0,0,0,0,1,1,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,0,3,1,3,3,0,2,3,0,0,3,1,0,1,0,2,0,0,0,1,0,
+ 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,2,3,0,3,3,3,3,2,1,3,3,1,1,0,0,2,2,1,0,3,0,
+ 0,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,2,1,3,3,3,1,2,0,2,2,1,0,0,0,0,1,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,0,0,3,1,3,0,0,0,1,0,0,0,0,0,0,2,1,1,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,2,3,0,0,3,0,3,3,3,1,3,3,3,2,3,3,3,3,2,3,3,3,0,
+ 0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
+ 1,0,1,3,3,0,0,3,0,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,1,0,
+ 3,3,3,3,0,3,0,3,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
+ 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,0,3,3,3,3,2,0,0,1,0,0,0,2,0,0,2,1,2,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,
+ 3,1,0,2,2,0,0,3,0,3,3,3,2,2,3,3,3,3,3,3,3,3,3,3,2,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
+ 1,0,0,0,2,0,0,3,0,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,1,0,
+ 3,3,3,3,2,3,0,0,3,0,1,3,3,0,0,1,3,0,0,0,0,0,0,3,0,
+ 0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,2,3,1,3,2,0,1,0,0,3,1,1,0,0,1,0,1,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,3,3,0,0,2,0,1,3,3,3,3,2,1,1,1,3,0,3,1,3,2,0,0,
+ 3,3,3,3,0,0,2,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,3,3,0,0,3,0,2,3,3,3,1,2,2,3,3,3,3,2,2,3,2,1,0,
+ 0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,3,3,0,0,3,0,3,2,3,3,1,3,3,3,2,2,3,1,3,1,0,2,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,3,
+ 3,0,0,3,3,0,0,2,0,3,3,2,3,1,2,2,1,1,1,3,1,1,0,1,3,0,
+ 0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
+ 2,0,0,2,1,0,0,3,0,3,3,3,0,3,3,3,3,3,2,1,3,3,3,2,2,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,3,3,0,0,3,0,0,3,1,3,0,2,1,0,1,2,1,1,1,0,1,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,3,3,0,0,2,0,3,2,3,3,0,1,3,0,3,2,1,0,1,1,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,3,3,0,0,3,0,0,2,2,3,3,2,3,0,0,1,1,1,1,1,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
+ 3,0,0,3,3,0,0,2,0,3,3,2,3,1,1,3,2,0,1,0,2,0,0,0,2,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,3,3,0,0,1,0,2,1,1,2,1,0,2,2,3,1,0,1,2,1,0,0,0,
+ 0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,3,3,0,0,0,0,2,1,3,3,1,3,2,0,1,2,1,0,0,1,2,3,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,3,3,0,0,3,0,3,2,0,3,1,1,3,1,1,1,0,1,0,1,1,0,0,
+ 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,3,3,0,0,3,0,3,2,0,3,2,0,2,0,2,1,2,0,2,2,1,1,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,3,3,0,0,3,0,3,1,0,3,2,0,3,0,0,0,0,0,1,1,1,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,3,3,0,0,2,0,3,1,1,3,0,0,3,0,0,0,1,1,0,0,1,1,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,2,3,0,0,2,0,1,1,0,2,1,0,2,0,2,0,3,1,3,1,0,0,0,
+ 0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,0,0,3,3,0,0,2,0,3,0,1,2,0,0,1,0,0,0,2,0,1,0,0,0,0,
+ 3,3,3,3,0,0,0,0,2,0,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+};
+
+
+const SequenceModel Windows_1251SerbianModel = /* Serbian sequence model for WINDOWS-1251 encoded text */
+{
+ Windows_1251_CharToOrderMap, /* byte value -> frequency order / character class */
+ SerbianLangModel, /* sequence category matrix shared by every Serbian model in this file */
+ 51, /* count of frequent characters; the generation log lists 51 */
+ (float)0.9990091447680779, /* equals 1 minus the "Rest" ratio reported in the generation log */
+ PR_FALSE, /* NOTE(review): flag meaning is not visible here - confirm against the SequenceModel declaration */
+ "WINDOWS-1251", /* charset name */
+ "sr" /* language code */
+};
+
+const SequenceModel Iso_8859_5SerbianModel = /* Serbian sequence model for ISO-8859-5 encoded text */
+{
+ Iso_8859_5_CharToOrderMap, /* byte value -> frequency order / character class */
+ SerbianLangModel, /* sequence category matrix shared by every Serbian model in this file */
+ 51, /* count of frequent characters; the generation log lists 51 */
+ (float)0.9990091447680779, /* equals 1 minus the "Rest" ratio reported in the generation log */
+ PR_FALSE, /* NOTE(review): flag meaning is not visible here - confirm against the SequenceModel declaration */
+ "ISO-8859-5", /* charset name */
+ "sr" /* language code */
+};
+
+const LanguageModel SerbianModel = /* Serbian model keyed by Unicode codepoint (see Unicode_CharOrder) */
+{
+ "sr", /* language code */
+ Unicode_CharOrder, /* (codepoint, order) pair table */
+ Unicode_Char_size, /* pair count of Unicode_CharOrder; named constant instead of a repeated literal 102 */
+ SerbianLangModel, /* sequence category matrix shared with the single-byte models */
+ 51, /* count of frequent characters; the generation log lists 51 */
+ 5, /* the generation log reports the accumulated ratio of the first 5 characters ... */
+ (float)0.43028847594635256, /* ... which is this value */
+ 31, /* the generation log reports the accumulated ratio of orders over 31 ... */
+ (float)0.03130000978968167, /* ... which is this value */
+};
diff --git a/src/nsLanguageDetector.h b/src/nsLanguageDetector.h
index 01bba14..45d2af2 100644
--- a/src/nsLanguageDetector.h
+++ b/src/nsLanguageDetector.h
@@ -152,6 +152,7 @@ extern const LanguageModel PolishModel;
extern const LanguageModel PortugueseModel;
extern const LanguageModel RomanianModel;
extern const LanguageModel RussianModel;
+extern const LanguageModel SerbianModel;
extern const LanguageModel SlovakModel;
extern const LanguageModel SloveneModel;
extern const LanguageModel SpanishModel;
diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp
index 9a0680a..cbce483 100644
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@@ -119,6 +119,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
langDetectors[i][j++] = new nsLanguageDetector(&PortugueseModel);
langDetectors[i][j++] = new nsLanguageDetector(&RomanianModel);
langDetectors[i][j++] = new nsLanguageDetector(&RussianModel);
+ langDetectors[i][j++] = new nsLanguageDetector(&SerbianModel);
langDetectors[i][j++] = new nsLanguageDetector(&SlovakModel);
langDetectors[i][j++] = new nsLanguageDetector(&SloveneModel);
langDetectors[i][j++] = new nsLanguageDetector(&SpanishModel);
diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h
index 1374786..60522e0 100644
--- a/src/nsMBCSGroupProber.h
+++ b/src/nsMBCSGroupProber.h
@@ -49,7 +49,7 @@
#include "nsEUCTWProber.h"
#define NUM_OF_PROBERS 8
-#define NUM_OF_LANGUAGES 36
+#define NUM_OF_LANGUAGES 37
class nsMBCSGroupProber: public nsCharSetProber {
public:
diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp
index 93dac13..49e5303 100644
--- a/src/nsSBCSGroupProber.cpp
+++ b/src/nsSBCSGroupProber.cpp
@@ -219,6 +219,9 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1251UkrainianModel);
+ mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1251SerbianModel);
+ mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5SerbianModel);
+
mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1251MacedonianModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Ibm855MacedonianModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5MacedonianModel);
diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h
index 2a25e1f..d782732 100644
--- a/src/nsSBCSGroupProber.h
+++ b/src/nsSBCSGroupProber.h
@@ -40,7 +40,7 @@
#define nsSBCSGroupProber_h__
-#define NUM_OF_SBCS_PROBERS 115
+#define NUM_OF_SBCS_PROBERS 117
class nsCharSetProber;
class nsSBCSGroupProber: public nsCharSetProber {
diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h
index 942d3ec..767d266 100644
--- a/src/nsSBCharSetProber.h
+++ b/src/nsSBCharSetProber.h
@@ -273,6 +273,9 @@ extern const SequenceModel Ibm865NorwegianModel;
extern const SequenceModel Windows_1251UkrainianModel;
+extern const SequenceModel Windows_1251SerbianModel;
+extern const SequenceModel Iso_8859_5SerbianModel;
+
extern const SequenceModel Windows_1251MacedonianModel;
extern const SequenceModel Ibm855MacedonianModel;
extern const SequenceModel Iso_8859_5MacedonianModel;
diff --git a/test/sr/iso-8859-5.txt b/test/sr/iso-8859-5.txt
new file mode 100644
index 0000000..e282bc8
--- /dev/null
+++ b/test/sr/iso-8859-5.txt
@@ -0,0 +1 @@
+ , . , . , . , , . , . . , .[2]
diff --git a/test/sr/utf-8.txt b/test/sr/utf-8.txt
new file mode 100644
index 0000000..5312f1c
--- /dev/null
+++ b/test/sr/utf-8.txt
@@ -0,0 +1 @@
+Мрмот је дугачак отприлике педесет сантиметара, заједно с репом који сам износи двадесет сантиметара. Тежак је четири до пет килограма, ретко више. Има прекрасно крзно сивкасте боје, које на светлости добија златносмеђе преливе. Као мишеви, зечеви и остали глодари, мрмот има секутиће који непрестано расту па мора често нешто да глође да би их трошио. На предњим ногама има четири прста наоружана повијеним канџама помоћу којих животиња узима траву и гранчице, приноси храну устима и копа земљу. На задњим ногама има пет прстију с прилично кратким канџама. Мрмот има веома оштар вид, а уши му хватају сваки и најслабији звук.[2]
diff --git a/test/sr/windows-1251.txt b/test/sr/windows-1251.txt
new file mode 100644
index 0000000..b1e4380
--- /dev/null
+++ b/test/sr/windows-1251.txt
@@ -0,0 +1 @@
+ , . , . , . , , . , . . , .[2]