summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJehan <jehan@girinstud.io>2022-12-20 14:23:24 +0100
committerJehan <jehan@girinstud.io>2022-12-20 14:28:29 +0100
commit7875272a8c61fdccba1db6b3c29ce248cc5fd65f (patch)
treeb51d6fca83d3a9978b89ca38de834767cf50665b
parentc843d23a17eebaa69be56565c5963471d5f1295f (diff)
script, src, test: new Georgian support.
For charsets UTF-8, GEORGIAN-ACADEMY and GEORGIAN-PS. The 2 GEORGIAN-* charset tables were generated with the new create-table.py script. The test text comes from the 'ვირზაზუნა' page of the Georgian Wikipedia.
-rw-r--r--README.md4
-rw-r--r--script/BuildLangModelLogs/LangGeorgianModel.log240
-rw-r--r--script/charsets/georgian-academy.py87
-rw-r--r--script/charsets/georgian-ps.py87
-rw-r--r--script/langs/ka.py58
-rw-r--r--script/support.txt1
-rw-r--r--src/CMakeLists.txt1
-rw-r--r--src/LangModels/LangGeorgianModel.cpp288
-rw-r--r--src/nsLanguageDetector-generated.h3
-rw-r--r--src/nsMBCSGroupProber.cpp1
-rw-r--r--src/nsSBCSGroupProber.cpp3
-rw-r--r--src/nsSBCharSetProber-generated.h5
-rw-r--r--test/ka/georgian-academy.txt1
-rw-r--r--test/ka/georgian-ps.txt1
-rw-r--r--test/ka/utf-8.txt1
15 files changed, 779 insertions, 2 deletions
diff --git a/README.md b/README.md
index 8b71ba9..a3537f8 100644
--- a/README.md
+++ b/README.md
@@ -86,6 +86,10 @@ uchardet started as a C language binding of the original C++ implementation of t
* UTF-8
* ISO-8859-1
* WINDOWS-1252
+ * Georgian
+ * UTF-8
+ * GEORGIAN-ACADEMY
+ * GEORGIAN-PS
* Greek
* UTF-8
* ISO-8859-7
diff --git a/script/BuildLangModelLogs/LangGeorgianModel.log b/script/BuildLangModelLogs/LangGeorgianModel.log
new file mode 100644
index 0000000..62ce1d6
--- /dev/null
+++ b/script/BuildLangModelLogs/LangGeorgianModel.log
@@ -0,0 +1,240 @@
+= Logs of language model for Georgian (ka) =
+
+- Generated by BuildLangModel.py
+- Started: 2022-12-20 12:39:32.203539
+- Maximum depth: 4
+- Max number of pages: 200
+
+== Parsed pages ==
+
+არნოლდ შონბერგი (revision 4450208)
+ნიკოლოზ საკვირველთმოქმედი (revision 4452640)
+ინგლისური (revision 4330414)
+აშშ (revision 4451590)
+პროკოპი კესარიელი (revision 4424699)
+მთვარის პიერო (revision 4429979)
+კომპოზიტორი (revision 4257818)
+ტეტრარქია (revision 4308810)
+ახალი ზელანდია (revision 4416034)
+ფილოსოფია (revision 4288867)
+კონსტანტინე I დიდი (revision 4375061)
+სული (revision 4451196)
+ჯონ კეიჯი (revision 4417221)
+ადოლფ ჰიტლერი (revision 4442491)
+იუდაიზმი (revision 4355617)
+15 მარტი (revision 4265284)
+350 (revision 3827298)
+ჟორჟ ბიზე (revision 4430611)
+ვატიკანის მუზეუმები (revision 4287947)
+რიხარდ ვაგნერი (revision 4435391)
+ნაცისტური პარტია (revision 4387148)
+ნიკეის პირველი საეკლესიო კრება (revision 4393312)
+III საუკუნე (revision 3158968)
+წმინდა მიწა (revision 4053366)
+1991 (revision 4405448)
+ჯაზი (revision 4433861)
+იოანე მალალა (revision 3928608)
+1975 (revision 4167940)
+ლაიბნიცი, გოტფრიდ ვილჰელმ (revision 4406460)
+XIII საუკუნე (revision 3158955)
+საბუნებისმეტყველო მეცნიერებები (revision 784852)
+კომპიუტერი (revision 4291207)
+იოანე II (ბიზანტია) (revision 4245444)
+ოსიანი (revision 4275863)
+რომის კურია (revision 4093740)
+ანგლო-საქსური პერიოდი (revision 4212229)
+პოეტი (revision 4273738)
+ევროკავშირის ენები (revision 4137186)
+ბიორკი (revision 4422616)
+სვასტიკა (revision 4104250)
+3 ივნისი (revision 4314533)
+1933 (revision 4275988)
+1925 (revision 4054131)
+Wayback Machine (revision 4393565)
+ჰიტლერი (revision 4442491)
+რეპი (revision 4164120)
+რელიგია (revision 4445299)
+ჟერარ დე ნერვალი (revision 4430603)
+ფრანგული ენა (revision 4329255)
+პაპის სახელმწიფო საქმეების ვატიკანის კომისია (revision 4256091)
+აშშ-ის ვიცე-პრეზიდენტი (revision 4337709)
+ვიკისაწყობი (revision 4350741)
+მიხეილ ლერმონტოვი (revision 4098634)
+სეკულარიზმი (revision 4387207)
+რუჯერო ლეონკავალო (revision 4193014)
+საბჭოთა კავშირი (revision 4378005)
+რუდოლფ ჰესი (revision 4345664)
+კომპოზიტორები (revision 4314363)
+სს (revision 4167932)
+რუსეთი (revision 4452792)
+მუსიკა (revision 4135531)
+იუსტინიანე II (ბიზანტია) (revision 3597768)
+ანგლიკანიზმი (revision 4401729)
+ვიოლინო (revision 4146868)
+პარსიფალი (revision 4435526)
+291 (revision 3826103)
+ქრისტიანობა (revision 4446624)
+ლიბერალიზმი (revision 4383197)
+22 მარტი (revision 3887060)
+27 თებერვალი (revision 4313421)
+ბლუზი (revision 4109362)
+ლიბანი (revision 4331748)
+205 (revision 3050594)
+რომის კათოლიკური ეკლესია (revision 4175965)
+ეფესოს საეკლესიო კრება (revision 4042351)
+რიხარდ ფონ ვაიცზეკერი (revision 4331853)
+325 (revision 3827050)
+ტრევორ ჰოვარდი (revision 3560113)
+1895 (revision 4276008)
+პალესტინის სახელმწიფო (revision 4357268)
+პილიგრიმი (revision 4352139)
+გერმანული ენა (revision 4418082)
+ბჰუტანი (revision 4380636)
+კინიკოსები (revision 2833240)
+ევაგრიოს სქოლასტიკოსი (revision 3929102)
+ჯონ კიტსი (revision 4107589)
+ქალი (revision 4372485)
+სიქსტეს კაპელა (revision 4389181)
+მიხეილ VIII (ბიზანტია) (revision 4021585)
+2 იანვარი (revision 4451458)
+201 (revision 3824700)
+ვიკიციტატა (revision 4393663)
+სირია (revision 4331625)
+კონსტანტინე IV (ბიზანტია) (revision 4302882)
+ჯვაროსნული ლაშქრობები (revision 4395332)
+ბგერა (revision 4436502)
+DMOZ (revision 4386077)
+ივნისი (revision 3753237)
+ღირებულება (revision 4250301)
+კატეგორია (ფილოსოფია) (revision 2381896)
+ავიცენა (revision 4327548)
+MusicBrainz (revision 4411515)
+იდეალიზმი (revision 4245343)
+210 (revision 3050588)
+ბიზანტია (revision 4440485)
+258 (revision 3825790)
+20 მაისი (revision 4434926)
+ძვ. წ. 44 (revision 2356607)
+ალექსანდრია (revision 4427155)
+ინდუიზმი (revision 4448864)
+კოლორადოს შტატი (revision 3351421)
+კონსტანტინოპოლის მეორე საეკლესიო კრება (revision 4374923)
+მარტინ ბუბერი (revision 4440267)
+მიუზიკლი (revision 4356140)
+დიდი ბრიტანეთი (revision 4438930)
+ფელიქს მენდელსონი (revision 4108745)
+მოსახლეობა (revision 2789480)
+ISSN (revision 3500238)
+ქვეყნების სია (revision 4448427)
+ებრაული ენა (revision 4210619)
+ბელიზი (revision 4430794)
+რენი ჰარლინი (revision 3743470)
+1952 (revision 4278487)
+ძველი საბერძნეთი (revision 4446035)
+ფილოსოფოსი (revision 4288867)
+თორმეტი ტაბულის კანონები (revision 4310428)
+ისააკ I კომნენოსი (revision 4016717)
+სუბიექტი (revision 4137093)
+მესამე რაიხი (revision 4431825)
+281 (revision 3050510)
+დასავლეთ რომის იმპერია (revision 4418326)
+კლასიკური მუსიკა (revision 4448910)
+კავთისხევი (revision 4353780)
+2007 (revision 4441027)
+ნეოპლატონიზმი (revision 4336053)
+236 (revision 3825656)
+ფუგა (revision 3218315)
+პალესტინა (revision 4240018)
+მეორე მსოფლიო ომი (revision 4442511)
+დავიდ ბენ-გურიონი (revision 4428059)
+1948 (revision 4278428)
+ზაქარია რიტორი (revision 4021268)
+პერლისი (revision 4308212)
+211 (revision 3824746)
+ადამ მიცკევიჩი (revision 4261723)
+პეტრე ჩაიკოვსკი (revision 4441450)
+ქორონიკონი (revision 4451019)
+ებრ. (revision 4210619)
+ისლამი (revision 4302636)
+260 (revision 3991034)
+VII საუკუნე (revision 3938533)
+იოანე ანტიოქიელი (ისტორიკოსი) (revision 3657193)
+სახელმწიფო რელიგია (revision 4440560)
+არიანელობა (revision 4081875)
+ოტო მაისნერი (revision 3459961)
+
+== End of Parsed pages ==
+
+- Wikipedia parsing ended at: 2022-12-20 12:56:27.858987
+
+77 characters appeared 1115054 times.
+
+Most Frequent characters:
+[ 0] Char ა: 12.857762942422518 %
+[ 1] Char ი: 12.247478597449092 %
+[ 2] Char ე: 8.665768653356698 %
+[ 3] Char ს: 6.575466300286801 %
+[ 4] Char რ: 6.0818579189886774 %
+[ 5] Char ო: 5.1301551315003575 %
+[ 6] Char მ: 4.846133012392225 %
+[ 7] Char ლ: 4.529556416101821 %
+[ 8] Char ნ: 4.125181381350141 %
+[ 9] Char დ: 3.9241148859158392 %
+[10] Char ბ: 3.471311703289706 %
+[11] Char ვ: 2.726504725331688 %
+[12] Char უ: 2.657001364956316 %
+[13] Char თ: 2.120973513390383 %
+[14] Char გ: 1.9234046064136805 %
+[15] Char ტ: 1.9089658438066675 %
+[16] Char კ: 1.5684442188450065 %
+[17] Char შ: 1.4508714376164742 %
+[18] Char ხ: 1.2111521056379333 %
+[19] Char ც: 1.1790460372322775 %
+[20] Char პ: 0.9052476382309737 %
+[21] Char წ: 0.8995976876456208 %
+[22] Char ზ: 0.8015755290775155 %
+[23] Char ქ: 0.7613981026927844 %
+[24] Char ფ: 0.731354714659559 %
+[25] Char ყ: 0.57871636709971 %
+[26] Char i: 0.47576171198883643 %
+[27] Char e: 0.46858717156299157 %
+[28] Char ღ: 0.41737888927352396 %
+[29] Char a: 0.3600722476220882 %
+[30] Char ძ: 0.3447366674618449 %
+[31] Char n: 0.334512947355016 %
+[32] Char o: 0.3023171971940372 %
+[33] Char s: 0.2952323385235155 %
+[34] Char r: 0.2890442974062243 %
+[35] Char t: 0.27639916990567276 %
+[36] Char ჩ: 0.2525438229897386 %
+[37] Char ჰ: 0.21810602894568334 %
+[38] Char l: 0.21039339798790013 %
+[39] Char ჯ: 0.1890491402210117 %
+[40] Char h: 0.18286109910372056 %
+[41] Char c: 0.17218897022027632 %
+[42] Char d: 0.16752551894347717 %
+[43] Char u: 0.13550913229314454 %
+[44] Char m: 0.12743777431406908 %
+[45] Char b: 0.10340306388748885 %
+[46] Char p: 0.10017452069585868 %
+[47] Char g: 0.09282061675936772 %
+[48] Char ჭ: 0.09048889112096814 %
+[49] Char y: 0.08752939319530713 %
+[50] Char v: 0.07524299271604784 %
+[51] Char f: 0.06887558808811053 %
+[52] Char w: 0.0669025894710032 %
+[53] Char x: 0.056051097076912866 %
+[54] Char k: 0.05273287212995962 %
+[55] Char ჟ: 0.04735196681057599 %
+
+The first 56 characters have an accumulated ratio of 0.9994027195095485.
+The first 4 characters have an accumulated ratio of 0.4034647649351511.
+All characters whose order is over 33 have an accumulated ratio of 0.03062631944282519.
+
+1485 sequences found.
+
+First 819 (typical positive ratio): 0.9950126614517769
+Next 240 (1059-819): 0.003988409500368384
+Rest: 0.000998929047854702
+
+- Processing end: 2022-12-20 12:56:28.396075
diff --git a/script/charsets/georgian-academy.py b/script/charsets/georgian-academy.py
new file mode 100644
index 0000000..24f3abc
--- /dev/null
+++ b/script/charsets/georgian-academy.py
@@ -0,0 +1,87 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+from codepoints import *
+
+name = 'GEORGIAN-ACADEMY'
+aliases = []
+
+language = \
+{
+ # Languages with complete coverage.
+ 'complete': [ 'ka' ],
+ 'incomplete': []
+}
+
+## Table generated by script/create-table.py with iconv ##
+# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
+charmap = \
+[
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
+ #' ' '!' '"' '#' '$' '%' '&' ''' '(' ')' '*' '+' ',' '-' '.' '/'
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
+ #'0' '1' '2' '3' '4' '5' '6' '7' '8' '9' ':' ';' '<' '=' '>' '?'
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
+ #'@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O'
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
+ #'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z' '[' '\' ']' '^' '_'
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
+ #'`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
+ #'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' '{' '|' '}' '~' CTR
+ CTR,CTR,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,LET,SYM,LET,CTR,CTR,CTR, # 8X
+ #CTR CTR '‚' 'ƒ' '„' '…' '†' '‡' 'ˆ' '‰' 'Š' '‹' 'Œ' CTR CTR CTR
+ CTR,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,CTR,CTR,LET, # 9X
+ #CTR '‘' '’' '“' '”' '•' '–' '—' '˜' '™' 'š' '›' 'œ' CTR CTR 'Ÿ'
+ CTR,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,CTR,SYM,SYM, # AX
+ #CTR '¡' '¢' '£' '¤' '¥' '¦' '§' '¨' '©' 'ª' '«' '¬' CTR '®' '¯'
+ SYM,SYM,NUM,NUM,SYM,LET,SYM,SYM,SYM,NUM,LET,SYM,SYM,SYM,SYM,SYM, # BX
+ #'°' '±' '²' '³' '´' 'µ' '¶' '·' '¸' '¹' 'º' '»' '¼' '½' '¾' '¿'
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
+ #'ა' 'ბ' 'გ' 'დ' 'ე' 'ვ' 'ზ' 'თ' 'ი' 'კ' 'ლ' 'მ' 'ნ' 'ო' 'პ' 'ჟ'
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
+ #'რ' 'ს' 'ტ' 'უ' 'ფ' 'ქ' 'ღ' 'ყ' 'შ' 'ჩ' 'ც' 'ძ' 'წ' 'ჭ' 'ხ' 'ჯ'
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
+ #'ჰ' 'ჱ' 'ჲ' 'ჳ' 'ჴ' 'ჵ' 'ჶ' 'ç' 'è' 'é' 'ê' 'ë' 'ì' 'í' 'î' 'ï'
+ LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # FX
+ #'ð' 'ñ' 'ò' 'ó' 'ô' 'õ' 'ö' '÷' 'ø' 'ù' 'ú' 'û' 'ü' 'ý' 'þ' 'ÿ'
+]
diff --git a/script/charsets/georgian-ps.py b/script/charsets/georgian-ps.py
new file mode 100644
index 0000000..8c00c28
--- /dev/null
+++ b/script/charsets/georgian-ps.py
@@ -0,0 +1,87 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+from codepoints import *
+
+name = 'GEORGIAN-PS'
+aliases = []
+
+language = \
+{
+ # Languages with complete coverage.
+ 'complete': [ 'ka' ],
+ 'incomplete': []
+}
+
+## Table generated by script/create-table.py with iconv ##
+# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
+charmap = \
+[
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
+ #' ' '!' '"' '#' '$' '%' '&' ''' '(' ')' '*' '+' ',' '-' '.' '/'
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
+ #'0' '1' '2' '3' '4' '5' '6' '7' '8' '9' ':' ';' '<' '=' '>' '?'
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
+ #'@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O'
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
+ #'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z' '[' '\' ']' '^' '_'
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
+ #'`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o'
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
+ #'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z' '{' '|' '}' '~' CTR
+ CTR,CTR,SYM,LET,SYM,SYM,SYM,SYM,LET,SYM,LET,SYM,LET,CTR,CTR,CTR, # 8X
+ #CTR CTR '‚' 'ƒ' '„' '…' '†' '‡' 'ˆ' '‰' 'Š' '‹' 'Œ' CTR CTR CTR
+ CTR,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,LET,CTR,CTR,LET, # 9X
+ #CTR '‘' '’' '“' '”' '•' '–' '—' '˜' '™' 'š' '›' 'œ' CTR CTR 'Ÿ'
+ CTR,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,LET,SYM,SYM,CTR,SYM,SYM, # AX
+ #CTR '¡' '¢' '£' '¤' '¥' '¦' '§' '¨' '©' 'ª' '«' '¬' CTR '®' '¯'
+ SYM,SYM,NUM,NUM,SYM,LET,SYM,SYM,SYM,NUM,LET,SYM,SYM,SYM,SYM,SYM, # BX
+ #'°' '±' '²' '³' '´' 'µ' '¶' '·' '¸' '¹' 'º' '»' '¼' '½' '¾' '¿'
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # CX
+ #'ა' 'ბ' 'გ' 'დ' 'ე' 'ვ' 'ზ' 'ჱ' 'თ' 'ი' 'კ' 'ლ' 'მ' 'ნ' 'ჲ' 'ო'
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # DX
+ #'პ' 'ჟ' 'რ' 'ს' 'ტ' 'ჳ' 'უ' 'ფ' 'ქ' 'ღ' 'ყ' 'შ' 'ჩ' 'ც' 'ძ' 'წ'
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
+ #'ჭ' 'ხ' 'ჴ' 'ჯ' 'ჰ' 'ჵ' 'æ' 'ç' 'è' 'é' 'ê' 'ë' 'ì' 'í' 'î' 'ï'
+ LET,LET,LET,LET,LET,LET,LET,SYM,LET,LET,LET,LET,LET,LET,LET,LET, # FX
+ #'ð' 'ñ' 'ò' 'ó' 'ô' 'õ' 'ö' '÷' 'ø' 'ù' 'ú' 'û' 'ü' 'ý' 'þ' 'ÿ'
+]
diff --git a/script/langs/ka.py b/script/langs/ka.py
new file mode 100644
index 0000000..8db67fb
--- /dev/null
+++ b/script/langs/ka.py
@@ -0,0 +1,58 @@
+#!/bin/python3
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+import re
+
+## Mandatory Properties ##
+
+name = 'Georgian'
+code = 'ka'
+use_ascii = False
+charsets = [ 'GEORGIAN-ACADEMY', 'GEORGIAN-PS' ]
+
+## Optional Properties ##
+
+# Alphabet characters.
+alphabet = 'აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ'
+# A starred page which was featured on the main page when I created
+# the data.
+start_pages = ['არნოლდ შონბერგი', 'ნიკოლოზ საკვირველთმოქმედი']
+wikipedia_code = code
+case_mapping = True
diff --git a/script/support.txt b/script/support.txt
index 9b10026..dc978f1 100644
--- a/script/support.txt
+++ b/script/support.txt
@@ -18,6 +18,7 @@ hi
hr
hu
it
+ka
lt
lv
mk
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 17fd980..1226a4f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -20,6 +20,7 @@ set(
LangModels/LangFrenchModel.cpp
LangModels/LangDanishModel.cpp
LangModels/LangGermanModel.cpp
+ LangModels/LangGeorgianModel.cpp
LangModels/LangGreekModel.cpp
LangModels/LangHungarianModel.cpp
LangModels/LangHebrewModel.cpp
diff --git a/src/LangModels/LangGeorgianModel.cpp b/src/LangModels/LangGeorgianModel.cpp
new file mode 100644
index 0000000..7da31ee
--- /dev/null
+++ b/src/LangModels/LangGeorgianModel.cpp
@@ -0,0 +1,288 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla Communicator client code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 1998
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
+#include "../nsLanguageDetector.h"
+
+#include "../nsLanguageDetector-generated.h"
+
+/********* Language model for: Georgian *********/
+
+/**
+ * Generated by BuildLangModel.py
+ * On: 2022-12-20 12:56:27.859568
+ **/
+
+ /* Character Mapping Table:
+ * ILL: illegal character.
+ * CTR: control character specific to the charset.
+ * RET: carriage return or line feed.
+ * SYM: symbol (punctuation) that does not belong to word.
+ * NUM: 0 - 9.
+ *
+ * Other characters are ordered by probabilities
+ * (0 is the most common character in the language).
+ *
+ * Orders are generic to a language. So the codepoint with order X in
+ * CHARSET1 maps to the same character as the codepoint with the same
+ * order X in CHARSET2 for the same language.
+ * As such, some orders may be missing in a given charset. For instance the
+ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
+ * even though they are both used for French. Same for the euro sign.
+ */
+ static const unsigned char Georgian_Academy_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 29, 45, 41, 42, 27, 51, 47, 40, 26, 57, 54, 38, 44, 31, 32, /* 4X */
+ 46, 59, 34, 33, 35, 43, 50, 52, 53, 49, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 29, 45, 41, 42, 27, 51, 47, 40, 26, 57, 54, 38, 44, 31, 32, /* 6X */
+ 46, 59, 34, 33, 35, 43, 50, 52, 53, 49, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ CTR,CTR,SYM, 77,SYM,SYM,SYM,SYM, 78,SYM, 79,SYM, 68,CTR,CTR,CTR, /* 8X */
+ CTR,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 80,SYM, 68,CTR,CTR, 81, /* 9X */
+ CTR,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 82,SYM,SYM,CTR,SYM,SYM, /* AX */
+ SYM,SYM,NUM,NUM,SYM, 83,SYM,SYM,SYM,NUM, 84,SYM,SYM,SYM,SYM,SYM, /* BX */
+ 0, 10, 14, 9, 2, 11, 22, 13, 1, 16, 7, 6, 8, 5, 20, 55, /* CX */
+ 4, 3, 15, 12, 24, 23, 28, 25, 17, 36, 19, 30, 21, 48, 18, 39, /* DX */
+ 37, 85, 86, 87, 88, 89, 76, 67, 71, 61, 74, 90, 73, 66, 72, 91, /* EX */
+ 65, 92, 93, 64, 94, 75, 60,SYM, 70, 95, 69, 96, 58, 97, 63, 98, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const unsigned char Georgian_Ps_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 29, 45, 41, 42, 27, 51, 47, 40, 26, 57, 54, 38, 44, 31, 32, /* 4X */
+ 46, 59, 34, 33, 35, 43, 50, 52, 53, 49, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 29, 45, 41, 42, 27, 51, 47, 40, 26, 57, 54, 38, 44, 31, 32, /* 6X */
+ 46, 59, 34, 33, 35, 43, 50, 52, 53, 49, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ CTR,CTR,SYM, 99,SYM,SYM,SYM,SYM,100,SYM,101,SYM, 68,CTR,CTR,CTR, /* 8X */
+ CTR,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,102,SYM, 68,CTR,CTR,103, /* 9X */
+ CTR,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,104,SYM,SYM,CTR,SYM,SYM, /* AX */
+ SYM,SYM,NUM,NUM,SYM,105,SYM,SYM,SYM,NUM,106,SYM,SYM,SYM,SYM,SYM, /* BX */
+ 0, 10, 14, 9, 2, 11, 22,107, 13, 1, 16, 7, 6, 8,108, 5, /* CX */
+ 20, 55, 4, 3, 15,109, 12, 24, 23, 28, 25, 17, 36, 19, 30, 21, /* DX */
+ 48, 18,110, 39, 37,111, 62, 67, 71, 61, 74,112, 73, 66, 72,113, /* EX */
+ 65,114,115, 64,116, 75, 60,SYM, 70,117, 69,118, 58,119, 63,120, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const int Unicode_Char_size = 112;
+static const unsigned int Unicode_CharOrder[] =
+{
+ 65, 29, 66, 45, 67, 41, 68, 42, 69, 27, 70, 51, 71, 47, 72, 40,
+ 73, 26, 75, 54, 76, 38, 77, 44, 78, 31, 79, 32, 80, 46, 82, 34,
+ 83, 33, 84, 35, 85, 43, 86, 50, 87, 52, 88, 53, 89, 49, 97, 29,
+ 98, 45, 99, 41, 100, 42, 101, 27, 102, 51, 103, 47, 104, 40, 105, 26,
+ 107, 54, 108, 38, 109, 44, 110, 31, 111, 32, 112, 46, 114, 34, 115, 33,
+ 116, 35, 117, 43, 118, 50, 119, 52, 120, 53, 121, 49, 4304, 0,4305, 10,
+ 4306, 14, 4307, 9, 4308, 2, 4309, 11, 4310, 22, 4311, 13, 4312, 1,4313, 16,
+ 4314, 7, 4315, 6, 4316, 8, 4317, 5, 4318, 20, 4319, 55, 4320, 4,4321, 3,
+ 4322, 15, 4323, 12, 4324, 24, 4325, 23, 4326, 28, 4327, 25, 4328, 17,4329, 36,
+ 4330, 19, 4331, 30, 4332, 21, 4333, 48, 4334, 18, 4335, 39, 4336, 37,7312, 0,
+ 7313, 10, 7314, 14, 7315, 9, 7316, 2, 7317, 11, 7318, 22, 7319, 13,7320, 1,
+ 7321, 16, 7322, 7, 7323, 6, 7324, 8, 7325, 5, 7326, 20, 7327, 55,7328, 4,
+ 7329, 3, 7330, 15, 7331, 12, 7332, 24, 7333, 23, 7334, 28, 7335, 25,7336, 17,
+ 7337, 36, 7338, 19, 7339, 30, 7340, 21, 7341, 48, 7342, 18, 7343, 39,7344, 37,
+};
+
+
+ /* Model Table:
+ * Total considered sequences: 1485 / 3136
+ * - Positive sequences: first 819 (0.9950126614517769)
+ * - Probable sequences: next 240 (1059-819) (0.003988409500368384)
+ * - Neutral sequences: last 2077 (0.000998929047854702)
+ * - Negative sequences: 1651 (off-ratio)
+ * Negative sequences: TODO
+ */
+static const PRUint8 GeorgianLangModel[] =
+{
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,
+ 3,0,3,0,0,0,0,0,3,3,0,3,0,0,0,1,0,0,0,0,3,0,0,0,0,0,0,3,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,
+ 3,0,3,0,0,0,0,0,3,2,0,3,0,0,0,0,0,0,0,0,3,0,0,0,1,0,0,3,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,
+ 3,0,3,0,1,0,0,0,3,3,0,3,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,3,
+ 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,0,
+ 1,0,3,0,0,0,0,0,1,2,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
+ 3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,
+ 3,0,3,0,0,0,0,0,3,2,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,
+ 3,0,3,0,0,0,0,0,3,3,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,
+ 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,
+ 3,0,3,0,0,0,0,0,2,1,0,3,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,3,
+ 3,3,3,3,2,3,3,1,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,0,0,
+ 2,0,1,0,0,0,0,0,2,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,
+ 3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,0,0,
+ 2,0,3,0,0,0,0,0,2,3,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3,
+ 3,3,3,3,3,3,3,3,3,3,1,3,3,2,3,2,1,3,2,0,0,2,2,1,0,1,0,0,
+ 3,0,1,0,0,0,0,0,1,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
+ 3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,1,3,0,0,1,0,3,1,0,0,0,0,
+ 1,0,0,0,0,0,0,0,1,3,0,1,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,3,3,3,2,1,1,3,1,2,2,0,0,
+ 2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,
+ 3,0,3,0,0,0,0,0,3,3,0,2,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,2,3,3,1,2,2,3,3,0,0,0,0,
+ 1,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,2,3,3,2,1,3,1,3,0,1,1,1,3,1,2,0,0,0,
+ 0,0,0,0,0,0,0,0,1,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,2,2,3,3,3,3,2,2,3,3,1,1,1,2,3,0,2,3,0,0,
+ 0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,2,2,3,3,2,1,1,1,3,1,0,1,0,2,1,2,0,0,0,
+ 0,0,1,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,2,3,3,3,3,3,0,0,3,3,3,0,3,3,3,2,0,3,0,1,3,2,0,0,0,
+ 0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,3,1,3,1,1,0,1,3,0,0,2,0,0,
+ 0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
+ 3,3,3,2,3,3,3,3,3,3,2,3,3,2,1,0,2,2,3,0,0,1,2,0,1,0,0,0,
+ 0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,2,3,1,0,1,1,3,1,0,3,1,1,1,1,1,0,2,0,1,3,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,0,3,3,3,3,1,0,0,3,3,0,0,0,1,0,0,0,0,3,0,1,1,3,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,1,3,2,2,3,3,1,2,1,0,3,0,0,0,0,2,0,0,0,0,0,
+ 3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,2,3,1,0,3,3,2,1,3,0,2,0,3,1,0,0,1,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,2,3,3,2,1,1,3,2,2,2,2,2,3,3,0,0,0,1,1,0,0,0,
+ 1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,1,3,3,3,3,3,2,1,3,3,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,3,3,3,3,3,0,0,3,0,2,3,3,2,3,3,2,3,0,2,3,3,1,3,3,0,
+ 1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3,3,
+ 0,3,0,3,3,3,3,3,0,0,3,0,2,3,3,3,3,3,3,3,0,3,3,2,3,3,2,0,
+ 3,3,3,3,3,3,3,3,3,3,1,3,3,2,1,1,1,2,0,0,0,3,2,1,2,0,0,0,
+ 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,
+ 0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,2,
+ 0,2,1,3,1,3,3,3,0,0,3,0,2,3,3,3,3,3,3,3,0,3,3,3,2,2,2,0,
+ 3,3,3,1,3,3,3,3,3,0,0,3,3,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,
+ 3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,3,3,3,2,3,0,0,2,0,2,3,3,3,1,3,1,3,0,2,1,2,1,1,2,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,2,0,3,3,3,3,3,0,0,3,0,3,3,3,3,3,3,3,3,0,2,3,3,3,3,3,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,2,3,3,2,3,0,0,3,0,3,3,2,2,3,3,3,2,0,3,1,1,2,0,3,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,3,3,3,3,3,0,0,3,0,2,3,3,3,3,3,2,3,0,3,2,2,2,0,3,0,
+ 0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,1,3,3,3,3,0,0,3,1,3,2,1,3,2,1,1,1,0,3,1,2,2,0,1,0,
+ 3,3,3,1,3,3,1,1,3,1,1,3,3,1,0,0,1,1,2,0,1,0,0,2,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,0,1,3,3,2,1,1,1,1,3,1,2,0,2,0,0,1,2,0,1,3,1,3,0,0,
+ 0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,1,3,3,1,3,0,0,3,0,2,2,3,3,2,2,2,2,0,3,1,2,1,0,2,0,
+ 3,3,3,2,2,3,3,1,2,2,0,3,3,1,3,0,0,2,0,0,0,0,2,0,0,0,0,0,
+ 1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,3,3,2,3,3,0,0,2,0,1,0,2,3,2,1,1,2,0,3,1,1,1,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,0,3,2,3,3,0,0,3,0,3,2,2,3,1,2,1,1,0,3,0,1,1,0,3,0,
+ 0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,2,3,3,2,1,0,0,2,0,1,1,2,3,2,1,1,2,0,2,1,2,1,0,0,0,
+ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,3,2,3,3,3,0,0,3,0,1,3,3,1,3,3,2,3,0,0,1,2,1,1,2,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,1,3,2,0,2,0,0,2,0,1,2,0,3,3,3,3,0,0,2,0,1,0,0,1,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,3,3,2,3,1,0,0,3,0,1,2,0,3,0,2,1,0,0,3,0,1,0,0,1,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,2,3,2,3,2,0,0,3,0,3,0,2,3,1,1,3,0,0,2,0,1,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,2,3,3,3,1,0,0,3,0,3,0,1,3,0,1,0,2,0,2,0,1,0,0,0,0,
+ 3,3,3,1,3,3,2,1,1,3,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,3,
+ 0,2,0,2,3,3,2,2,0,0,2,0,0,3,1,1,2,3,2,0,0,1,1,1,1,0,1,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,0,3,1,2,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,1,3,0,3,2,0,0,2,0,1,0,0,2,1,1,1,1,0,0,0,2,1,0,1,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,2,3,2,2,1,0,0,1,0,3,0,0,1,1,0,1,0,0,0,1,1,2,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,
+ 0,2,0,0,1,0,0,2,0,0,0,0,1,1,0,1,0,0,2,0,0,1,3,2,0,3,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,
+ 0,3,0,2,2,3,2,1,0,0,2,0,1,0,0,2,1,1,0,1,0,2,1,1,1,0,1,0,
+ 3,3,3,1,0,3,1,1,2,0,0,0,3,1,0,1,1,1,0,0,0,0,1,0,0,0,0,0,
+ 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+};
+
+
+const SequenceModel Georgian_AcademyGeorgianModel =
+{
+ Georgian_Academy_CharToOrderMap,
+ GeorgianLangModel,
+ 56,
+ (float)0.9990010709521453,
+ PR_FALSE,
+ "GEORGIAN-ACADEMY",
+ "ka"
+};
+
+const SequenceModel Georgian_PsGeorgianModel =
+{
+ Georgian_Ps_CharToOrderMap,
+ GeorgianLangModel,
+ 56,
+ (float)0.9990010709521453,
+ PR_FALSE,
+ "GEORGIAN-PS",
+ "ka"
+};
+
+const LanguageModel GeorgianModel =
+{
+ "ka",
+ Unicode_CharOrder,
+ 112,
+ GeorgianLangModel,
+ 56,
+ 4,
+ (float)0.4034647649351511,
+ 33,
+ (float)0.03062631944282519,
+};
diff --git a/src/nsLanguageDetector-generated.h b/src/nsLanguageDetector-generated.h
index 39e0936..64054fb 100644
--- a/src/nsLanguageDetector-generated.h
+++ b/src/nsLanguageDetector-generated.h
@@ -38,7 +38,7 @@
#ifndef nsLanguageDetector_h_generated_h__
#define nsLanguageDetector_h_generated_h__
-#define NUM_OF_LANGUAGE_MODELS 37
+#define NUM_OF_LANGUAGE_MODELS 38
extern const LanguageModel ArabicModel;
extern const LanguageModel BelarusianModel;
@@ -60,6 +60,7 @@ extern const LanguageModel HindiModel;
extern const LanguageModel CroatianModel;
extern const LanguageModel HungarianModel;
extern const LanguageModel ItalianModel;
+extern const LanguageModel GeorgianModel;
extern const LanguageModel LithuanianModel;
extern const LanguageModel LatvianModel;
extern const LanguageModel MacedonianModel;
diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp
index e9d7548..9512f3a 100644
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@@ -105,6 +105,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
langDetectors[i][j++] = new nsLanguageDetector(&FinnishModel);
langDetectors[i][j++] = new nsLanguageDetector(&FrenchModel);
langDetectors[i][j++] = new nsLanguageDetector(&GermanModel);
+ langDetectors[i][j++] = new nsLanguageDetector(&GeorgianModel);
langDetectors[i][j++] = new nsLanguageDetector(&GreekModel);
langDetectors[i][j++] = new nsLanguageDetector(&HebrewModel);
langDetectors[i][j++] = new nsLanguageDetector(&HindiModel);
diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp
index ba054c8..74340e3 100644
--- a/src/nsSBCSGroupProber.cpp
+++ b/src/nsSBCSGroupProber.cpp
@@ -240,6 +240,9 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_1CatalanModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1252CatalanModel);
+ mProbers[n++] = new nsSingleByteCharSetProber(&Georgian_AcademyGeorgianModel);
+ mProbers[n++] = new nsSingleByteCharSetProber(&Georgian_PsGeorgianModel);
+
assert (n_sbcs_probers == n);
Reset();
diff --git a/src/nsSBCharSetProber-generated.h b/src/nsSBCharSetProber-generated.h
index e110f08..ef15b75 100644
--- a/src/nsSBCharSetProber-generated.h
+++ b/src/nsSBCharSetProber-generated.h
@@ -38,7 +38,7 @@
#ifndef nsSingleByteCharSetProber_generated_h__
#define nsSingleByteCharSetProber_generated_h__
-#define NUM_OF_SEQUENCE_MODELS 118
+#define NUM_OF_SEQUENCE_MODELS 120
extern const SequenceModel Iso_8859_6ArabicModel;
extern const SequenceModel Windows_1256ArabicModel;
@@ -121,6 +121,9 @@ extern const SequenceModel Iso_8859_9ItalianModel;
extern const SequenceModel Iso_8859_15ItalianModel;
extern const SequenceModel Windows_1252ItalianModel;
+extern const SequenceModel Georgian_AcademyGeorgianModel;
+extern const SequenceModel Georgian_PsGeorgianModel;
+
extern const SequenceModel Iso_8859_4LithuanianModel;
extern const SequenceModel Iso_8859_10LithuanianModel;
extern const SequenceModel Iso_8859_13LithuanianModel;
diff --git a/test/ka/georgian-academy.txt b/test/ka/georgian-academy.txt
new file mode 100644
index 0000000..a13ceea
--- /dev/null
+++ b/test/ka/georgian-academy.txt
@@ -0,0 +1 @@
+ (. Marmota) . 15 . . . , , .
diff --git a/test/ka/georgian-ps.txt b/test/ka/georgian-ps.txt
new file mode 100644
index 0000000..1307154
--- /dev/null
+++ b/test/ka/georgian-ps.txt
@@ -0,0 +1 @@
+ (. Marmota) . 15 . . . , , .
diff --git a/test/ka/utf-8.txt b/test/ka/utf-8.txt
new file mode 100644
index 0000000..218d10a
--- /dev/null
+++ b/test/ka/utf-8.txt
@@ -0,0 +1 @@
+ვირზაზუნა (ლათ. Marmota) — ძუძუმწოვრების გვარი მღრღნელების რიგისა. მსოფლიოში ვირზაზუნათა 15 სახეობაა ცნობილი. მათ სამშობლოდ ამერიკა ითვლება. ვირზაზუნები ძალიან განსხვავდებიან სხვა ძუძუმწოვრებისაგან. იმ დროს როდესაც ცხოველები აზიიდან და ევროპიდან ამერიკისაკენ მიემართებოდნენ, ვირზაზუნები პირიქით, ამერიკიდან გავრცელდნენ მთელ მსოფლიოში.