summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Jehan <jehan@girinstud.io> 2021-03-19 22:34:55 +0100
committer: Jehan <jehan@girinstud.io> 2022-12-14 00:23:13 +0100
commit: a1b186fa8ba92a4e07e725df9f4a55cb8c5badd3 (patch)
tree: 4d64e83ce3f198ef4e8e80bbb870615300c643a6
parent: 9736950227be69a96a3d149db8de56432b44f821 (diff)
src: add Hindi/UTF-8 support.
-rw-r--r-- script/BuildLangModelLogs/LangHindiModel.log 190
-rw-r--r-- script/langs/hi.py 75
-rw-r--r-- src/CMakeLists.txt 1
-rw-r--r-- src/LangModels/LangHindiModel.cpp 228
-rw-r--r-- src/nsLanguageDetector.h 1
-rw-r--r-- src/nsMBCSGroupProber.cpp 3
-rw-r--r-- src/nsMBCSGroupProber.h 2
-rw-r--r-- test/hi/utf-8.txt 3
8 files changed, 501 insertions, 2 deletions
diff --git a/script/BuildLangModelLogs/LangHindiModel.log b/script/BuildLangModelLogs/LangHindiModel.log
new file mode 100644
index 0000000..d91de96
--- /dev/null
+++ b/script/BuildLangModelLogs/LangHindiModel.log
@@ -0,0 +1,190 @@
+= Logs of language model for Hindi (hi) =
+
+- Generated by BuildLangModel.py
+- Started: 2021-03-19 22:26:39.897264
+- Maximum depth: 4
+- Max number of pages: 100
+
+== Parsed pages ==
+
+मुखपृष्ठ (revision 5072364)
+2020 विशाखपट्नम गैस रिसाव (revision 4964453)
+अंग्रेज़ी विकिपीडिया (revision 4812878)
+अंशुन बस दुर्घटना (revision 5080233)
+अभिनव बिंद्रा (revision 5066925)
+अम्फान महाचक्रवात (revision 4974141)
+अल्बर्टा (revision 4815865)
+अस्तित्ववाद (revision 5095575)
+आज का आलेख १८ मार्च २०२१ (revision 5119127)
+इंडोनेशिया (revision 5137321)
+उत्तर प्रदेश (revision 5137498)
+एयर इंडिया एक्सप्रेस उड़ान 1344 (revision 4958774)
+कतिकी मेला (revision 4822652)
+कनाडा (revision 5092431)
+कश्मीर (revision 5101264)
+कार्तिक पूर्णिमा (revision 5039499)
+कालिंजर दुर्ग (revision 5070202)
+कालीकट अंतर्राष्ट्रीय विमानक्षेत्र (revision 5053097)
+किलर व्हेल (revision 4922729)
+कोड़िकोड (revision 5106489)
+कोरोनावायरस महामारी (revision 5118212)
+कोसला (revision 4901745)
+खजुराहो (revision 5123204)
+गायक (revision 5128003)
+गुइझोऊ (revision 4579091)
+गुप्त वंश (revision 5101672)
+ग्लेशियर नेशनल पार्क (revision 5135892)
+घुम रेलवे स्टेशन (revision 4973755)
+जापान के प्रधानमंत्री (revision 4960597)
+जावा सागर (revision 3325350)
+जुलाई (revision 4367726)
+डल झील (revision 5109477)
+तिरहुत (revision 5056746)
+दरभंगा (revision 5139550)
+धर्मराय स्वामी मंदिर (revision 5080095)
+नेपाल (revision 5119140)
+पक्षी (revision 4905207)
+पाकिस्तान इंटरनेशनल एयरलाइंस उड़ान 8303 (revision 4972163)
+पारिस्थितिकी तंत्र (revision 4960487)
+पूर्वी भारत (revision 5008841)
+पृथ्वीराज चौहान (revision 5141495)
+प्रजातियां (revision 5084721)
+प्रणब मुखर्जी (revision 5026060)
+फल (revision 4887332)
+फूल (revision 4985648)
+बज्जिका (revision 5052110)
+बांदा जिला (revision 5066719)
+बिहार (revision 5141557)
+बुन्देलखण्ड (revision 5142668)
+बुलबुल (revision 4959703)
+बेयरूत धमाका 2020 (revision 5128013)
+बैटमैन (revision 5093040)
+ब्रिटिश कोलम्बिया (revision 5134714)
+भारत (revision 5112806)
+भारत की स्वतंत्रता (revision 5102484)
+भारत में कोरोनावायरस महामारी का आर्थिक प्रभाव (revision 5105591)
+भालचंद्र नेमाडे (revision 5069123)
+मधुबनी (revision 5127643)
+मुखपृष्ठ/अन्य भाषाओं में (revision 4949624)
+मुखपृष्ठ/आज का आलेख (revision 5072364)
+मुखपृष्ठ/पिछला आलेख (revision 5072364)
+मुखपृष्ठ/पूर्व प्रदर्शित (revision 4427327)
+मुखपृष्ठ/बन्धु प्रकल्प (revision 4786332)
+मुखपृष्ठ/वर्तमान (revision 5072364)
+मुजफ्फरपुर (revision 5112424)
+मैथिली (revision 5048285)
+मोन्टाना (revision 4530336)
+मोहम्मद ग़ोरी (revision 5053528)
+योशिहिडे सुगा (revision 5131517)
+राजपूत (revision 5142481)
+राजीव गांधी खेल रत्न (revision 5035806)
+रॉकी पर्वत शृंखला (revision 5063055)
+लेबनान (revision 5087028)
+विंध्य पर्वत (revision 4998895)
+विधु विनोद चोपड़ा (revision 4920989)
+विश्व धरोहर स्थल (revision 5050725)
+वैशाली (revision 5134349)
+शिंजो अबे (revision 4959991)
+शिकारा (revision 4959995)
+शिकारा (२०२० फ़िल्म) (revision 5110981)
+शिवहर (revision 5002252)
+श्रीनगर, जम्मू और कश्मीर (revision 5103394)
+श्रीविजय एयर उड़ान 182 (revision 5112969)
+संयुक्त राज्य अमेरिका (revision 5122291)
+समस्तीपुर (revision 5137150)
+सीतामढी (revision 5125137)
+स्पेन के फ़िलिप पंचम (revision 4969865)
+हाल की घटनाएँ (revision 2774346)
+हिन्दी विकिपीडिया (revision 5131026)
+हैब्सबर्ग राजवंश (revision 5036757)
+२०२१ में निधन (revision 5077158)
+2020 कोरोनावायरस महामारी (revision 5118212)
+Chevron Phillips Chemical (revision 4964453)
+Deccan Chronicle (revision 4976246)
+Indian Standard Time (revision 5132803)
+The Hindu (revision 5046686)
+UTC+05:30 (revision 4947123)
+अन्तर्राष्ट्रीय मानक क्रम संख्या (revision 4951625)
+आंध्र प्रदेश (revision 5118459)
+
+== End of Parsed pages ==
+
+- Wikipedia parsing ended at: 2021-03-19 22:31:01.818311
+
+80 characters appeared 80 times.
+
+Most Frequent characters:
+[ 0] Char व: 1.25 %
+[ 1] Char ि: 1.25 %
+[ 2] Char श: 1.25 %
+[ 3] Char ा: 1.25 %
+[ 4] Char ख: 1.25 %
+[ 5] Char प: 1.25 %
+[ 6] Char त: 1.25 %
+[ 7] Char ्: 1.25 %
+[ 8] Char न: 1.25 %
+[ 9] Char म: 1.25 %
+[10] Char ग: 1.25 %
+[11] Char ै: 1.25 %
+[12] Char स: 1.25 %
+[13] Char र: 1.25 %
+[14] Char ज: 1.25 %
+[15] Char े: 1.25 %
+[16] Char भ: 1.25 %
+[17] Char ी: 1.25 %
+[18] Char क: 1.25 %
+[19] Char ह: 1.25 %
+[20] Char ई: 1.25 %
+[21] Char ो: 1.25 %
+[22] Char आ: 1.25 %
+[23] Char ध: 1.25 %
+[24] Char द: 1.25 %
+[25] Char ं: 1.25 %
+[26] Char ट: 1.25 %
+[27] Char ु: 1.25 %
+[28] Char ए: 1.25 %
+[29] Char ल: 1.25 %
+[30] Char ॉ: 1.25 %
+[31] Char उ: 1.25 %
+[32] Char य: 1.25 %
+[33] Char ष: 1.25 %
+[34] Char घ: 1.25 %
+[35] Char थ: 1.25 %
+[36] Char ।: 1.25 %
+[37] Char इ: 1.25 %
+[38] Char ौ: 1.25 %
+[39] Char ृ: 1.25 %
+[40] Char औ: 1.25 %
+[41] Char ँ: 1.25 %
+[42] Char फ: 1.25 %
+[43] Char ू: 1.25 %
+[44] Char ठ: 1.25 %
+[45] Char ड: 1.25 %
+[46] Char ब: 1.25 %
+[47] Char च: 1.25 %
+[48] Char अ: 1.25 %
+[49] Char ण: 1.25 %
+[50] Char छ: 1.25 %
+[51] Char ़: 1.25 %
+[52] Char ऊ: 1.25 %
+[53] Char ऐ: 1.25 %
+[54] Char ढ: 1.25 %
+[55] Char ञ: 1.25 %
+[56] Char ओ: 1.25 %
+[57] Char ः: 1.25 %
+[58] Char ऑ: 1.25 %
+[59] Char १: 1.25 %
+[60] Char ५: 1.25 %
+[61] Char २: 1.25 %
+[62] Char ०: 1.25 %
+[63] Char ७: 1.25 %
+
+The first 64 characters have an accumulated ratio of 0.7999999999999992.
+
+2113 sequences found.
+
+First 1356 (typical positive ratio): 0.9950083796268726
+Next 397 (1753-1356): 0.00399414702204226
+Rest: 0.000997473351085132
+
+- Processing end: 2021-03-19 22:31:02.178353
diff --git a/script/langs/hi.py b/script/langs/hi.py
new file mode 100644
index 0000000..e4981d5
--- /dev/null
+++ b/script/langs/hi.py
@@ -0,0 +1,75 @@
+#!/bin/python3
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+import re
+
+## Mandatory Properties ##
+
+name = 'Hindi'
+aliases = [ 'Devanagari' ]
+code = 'hi'
+use_ascii = False
+# ISCII is a collection of single-byte encodings (10 variants).
+# Unfortunately it looks like neither iconv nor Python (probably based off
+# iconv?) knows any of the ISCII encodings. Therefore I cannot generate any
+# ISCII data and, as a consequence, cannot process this charset.
+# See:
+# http://stackoverflow.com/questions/27143365/unicode-to-iscii-conversion
+# https://en.wikipedia.org/wiki/Indian_Script_Code_for_Information_Interchange
+# Anyway according to Wikipedia, these encodings don't seem much used. UTF-8 is
+# mostly used in India, it would appear.
+#charsets = ['ISCII']
+charsets = []
+
+## Optional Properties ##
+
+# Devanagari script, see:
+# https://en.wikipedia.org/wiki/Devanagari
+# 11 vowels and 33 consonants with independent and diacritic forms, etc.
+# To keep it simple, I don't list the alphabet and let the statistics work
+# their magic.
+unicode_ranges = [(0x900, 0x97F), # Devanagari
+ (0xA8E0, 0xA8FF), # Devanagari Extended
+ (0x1CD0, 0x1CFF), # Vedic Extensions
+ ]
+
+start_pages = ['मुखपृष्ठ']
+wikipedia_code = code
+case_mapping = False
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 72104a8..bb68b4c 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -20,6 +20,7 @@ set(
LangModels/LangGreekModel.cpp
LangModels/LangHungarianModel.cpp
LangModels/LangHebrewModel.cpp
+ LangModels/LangHindiModel.cpp
LangModels/LangIrishModel.cpp
LangModels/LangItalianModel.cpp
LangModels/LangKoreanModel.cpp
diff --git a/src/LangModels/LangHindiModel.cpp b/src/LangModels/LangHindiModel.cpp
new file mode 100644
index 0000000..f34a273
--- /dev/null
+++ b/src/LangModels/LangHindiModel.cpp
@@ -0,0 +1,228 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla Communicator client code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 1998
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "../nsSBCharSetProber.h"
+#include "../nsLanguageDetector.h"
+
+/********* Language model for: Hindi *********/
+
+/**
+ * Generated by BuildLangModel.py
+ * On: 2021-03-19 22:31:01.819135
+ **/
+
+/* Character Mapping Table:
+ * ILL: illegal character.
+ * CTR: control character specific to the charset.
+ * RET: carriage/return.
+ * SYM: symbol (punctuation) that does not belong to word.
+ * NUM: 0 - 9.
+ *
+ * Other characters are ordered by probabilities
+ * (0 is the most common character in the language).
+ *
+ * Orders are generic to a language. So the codepoint with order X in
+ * CHARSET1 maps to the same character as the codepoint with the same
+ * order X in CHARSET2 for the same language.
+ * As such, some orders may be missing from a given charset. For instance
+ * the ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
+ * even though both charsets are used for French. Same for the euro sign.
+ */
+static const int Unicode_Char_size = 64;
+static const unsigned int Unicode_CharOrder[] =
+{
+ 2305, 41, 2306, 25, 2307, 57, 2309, 48, 2310, 22, 2311, 37, 2312, 20,2313, 31,
+ 2314, 52, 2319, 28, 2320, 53, 2321, 58, 2323, 56, 2324, 40, 2325, 18,2326, 4,
+ 2327, 10, 2328, 34, 2330, 47, 2331, 50, 2332, 14, 2334, 55, 2335, 26,2336, 44,
+ 2337, 45, 2338, 54, 2339, 49, 2340, 6, 2341, 35, 2342, 24, 2343, 23,2344, 8,
+ 2346, 5, 2347, 42, 2348, 46, 2349, 16, 2350, 9, 2351, 32, 2352, 13,2354, 29,
+ 2357, 0, 2358, 2, 2359, 33, 2360, 12, 2361, 19, 2364, 51, 2366, 3,2367, 1,
+ 2368, 17, 2369, 27, 2370, 43, 2371, 39, 2375, 15, 2376, 11, 2377, 30,2379, 21,
+ 2380, 38, 2381, 7, 2404, 36, 2406, 62, 2407, 59, 2408, 61, 2411, 60,2413, 63,
+};
+
+
+/* Model Table:
+ * Total considered sequences: 2113 / 4096
+ * - Positive sequences: first 1356 (0.9950083796268726)
+ * - Probable sequences: next 397 (1753-1356) (0.00399414702204226)
+ * - Neutral sequences: last 2343 (0.000997473351085132)
+ * - Negative sequences: 1983 (off-ratio)
+ * Negative sequences: TODO
+ */
+static const PRUint8 HindiLangModel[] =
+{
+ 3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,0,3,3,3,3,3,0,3,3,0,
+ 3,0,2,1,0,1,0,3,0,0,1,2,0,1,1,3,0,2,1,0,1,0,0,0,0,1,0,0,0,0,0,0,
+ 3,0,3,0,3,3,3,0,3,3,3,0,3,3,3,0,3,0,3,3,0,0,1,3,3,3,3,0,3,3,0,1,
+ 3,3,2,3,3,1,0,0,0,0,3,0,3,3,3,3,2,3,3,2,2,1,1,0,2,2,0,0,0,0,1,0,
+ 3,3,2,3,0,2,3,3,3,3,2,3,3,3,3,3,3,3,3,3,0,3,0,0,2,3,2,3,0,3,2,0,
+ 3,0,0,0,1,0,3,2,0,0,0,3,0,0,3,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,
+ 3,0,3,0,3,3,3,0,3,3,3,0,3,3,3,1,3,0,3,3,3,0,1,3,3,3,3,0,3,3,0,3,
+ 3,3,3,3,3,3,0,0,1,3,3,0,3,3,3,3,0,3,2,2,3,2,3,2,3,1,0,0,0,0,0,0,
+ 2,3,0,3,0,3,3,3,3,3,2,2,2,3,3,3,3,3,3,0,0,3,0,0,3,3,2,3,0,3,0,0,
+ 0,0,0,0,3,0,2,0,0,0,0,3,0,3,2,2,0,3,0,3,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,1,3,3,3,3,2,3,3,0,
+ 3,3,1,3,1,0,3,3,0,0,0,3,3,3,3,3,0,3,2,0,0,0,3,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,0,3,0,1,3,3,3,3,1,3,0,0,
+ 3,0,0,3,2,0,3,3,0,0,0,3,0,1,3,3,0,1,0,1,0,0,0,1,0,3,0,0,0,0,0,0,
+ 3,0,3,0,3,3,3,0,3,3,3,0,3,3,3,1,3,0,3,3,0,0,0,3,3,0,3,1,0,3,0,0,
+ 3,3,3,3,0,0,0,0,0,0,3,0,3,3,3,3,0,3,3,0,0,0,2,3,0,0,0,0,0,0,0,0,
+ 3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,3,3,3,3,3,0,
+ 3,3,2,1,3,1,3,3,0,0,3,3,0,3,3,3,0,1,0,0,3,1,0,0,1,3,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,1,3,3,3,3,3,2,3,3,1,
+ 3,0,0,3,1,1,3,3,0,2,1,3,3,3,2,3,0,3,3,0,2,0,3,0,0,1,0,0,0,0,1,0,
+ 3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,3,3,3,2,0,
+ 3,0,0,0,2,0,3,3,0,1,3,3,3,3,3,0,0,3,0,3,0,0,3,0,0,2,0,0,0,0,0,0,
+ 3,0,3,0,0,3,3,0,3,3,3,1,3,3,3,0,2,0,3,2,0,0,0,3,3,3,3,0,0,3,0,0,
+ 3,3,0,3,3,0,0,0,0,0,2,0,3,3,3,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,2,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,0,0,3,3,3,3,2,3,3,1,
+ 2,0,2,0,0,1,3,3,0,2,3,3,0,3,3,3,0,0,0,0,1,0,1,0,0,2,0,0,0,0,1,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,1,3,3,3,2,3,3,3,3,1,
+ 3,3,2,3,3,0,3,3,1,0,3,3,3,2,3,3,2,3,2,0,0,0,0,0,0,2,0,0,0,0,0,0,
+ 3,3,3,3,2,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,1,3,0,3,3,3,3,3,0,3,3,0,
+ 3,0,1,0,2,0,3,0,0,0,3,3,0,3,3,1,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,
+ 3,0,3,0,3,3,3,0,3,3,3,0,3,3,3,2,0,0,3,3,3,0,0,2,3,3,3,0,0,3,0,1,
+ 3,3,2,2,3,2,0,0,0,1,3,0,0,3,3,3,2,3,0,0,1,0,2,0,0,0,0,0,0,0,0,0,
+ 3,3,0,3,0,1,2,3,0,1,3,3,1,3,2,3,1,3,3,0,0,3,0,1,3,3,3,3,0,3,0,0,
+ 3,0,0,0,2,1,3,0,0,0,0,3,1,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,0,3,0,3,3,3,0,3,3,3,0,3,3,3,0,2,0,3,2,2,0,3,3,3,3,3,0,3,3,0,0,
+ 3,3,3,2,3,2,0,0,1,0,3,0,3,3,3,3,0,3,3,0,0,0,3,0,3,1,2,0,0,1,1,0,
+ 3,3,3,3,0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,3,3,3,3,0,3,3,0,
+ 2,2,1,3,3,0,3,3,0,0,2,3,3,3,3,2,0,3,2,3,0,0,1,0,0,0,0,0,0,1,0,0,
+ 3,3,2,3,1,2,3,3,3,3,3,3,3,3,3,3,1,3,3,2,1,3,0,0,3,3,3,3,0,3,3,0,
+ 3,0,0,3,2,0,2,2,0,1,2,3,1,3,3,3,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,0,3,0,0,1,0,0,3,2,2,0,3,3,1,0,0,0,3,0,0,0,2,0,2,3,3,0,2,3,0,0,
+ 3,0,0,0,3,0,0,0,0,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,
+ 3,0,3,0,3,3,3,0,3,3,3,0,3,3,3,0,2,0,3,3,3,0,2,3,3,3,3,0,3,3,0,2,
+ 3,3,2,3,3,3,0,0,0,1,3,0,2,3,3,3,1,3,0,0,2,0,3,0,1,0,0,0,0,0,0,0,
+ 3,0,3,0,3,3,3,0,3,3,3,0,3,3,3,1,2,0,3,3,3,0,0,3,3,3,2,0,3,3,0,1,
+ 3,1,1,0,3,3,0,0,0,3,1,0,3,1,3,3,1,3,1,0,0,0,0,0,1,0,0,0,0,0,0,0,
+ 3,3,2,3,0,2,3,3,3,2,3,0,3,3,1,3,0,3,3,0,1,3,0,1,0,1,0,3,0,1,0,0,
+ 1,0,0,0,1,0,1,0,0,0,0,3,0,2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,0,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,1,3,0,0,3,3,0,3,0,3,0,0,
+ 3,0,0,0,1,0,3,3,0,0,2,3,0,3,3,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,0,3,0,3,3,3,0,3,1,3,0,3,3,3,0,3,0,3,3,0,0,2,3,3,2,3,1,1,3,0,1,
+ 3,0,3,3,3,0,0,0,0,0,2,0,3,3,3,3,0,1,2,0,0,0,2,0,0,0,0,1,0,0,2,0,
+ 3,3,1,3,1,3,3,3,3,3,2,3,3,3,2,3,2,3,3,2,1,3,0,0,1,2,0,3,1,3,3,0,
+ 2,0,0,0,1,0,2,0,0,0,2,3,0,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,0,3,0,3,3,3,1,3,3,3,0,3,3,3,1,3,0,3,3,3,0,3,3,3,3,3,1,3,3,0,1,
+ 3,3,2,2,0,3,0,0,0,3,3,0,2,3,3,3,2,3,3,1,0,1,2,1,3,2,0,0,0,0,0,0,
+ 3,0,3,0,0,2,2,0,3,3,3,0,3,1,3,0,0,0,3,2,1,0,1,2,0,3,3,0,1,3,0,0,
+ 3,0,0,2,3,0,0,0,0,3,3,0,0,3,1,3,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
+ 3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,1,2,3,3,3,3,0,3,3,1,
+ 3,0,3,1,2,1,3,0,0,0,1,3,0,3,3,3,2,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 2,0,2,0,0,2,0,0,3,3,3,0,3,3,3,0,0,0,3,0,0,0,0,0,0,3,3,0,0,3,0,2,
+ 3,0,0,0,0,2,0,0,0,0,2,0,0,3,3,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,1,3,3,0,3,3,3,0,3,3,2,0,3,0,3,1,0,0,0,2,3,3,2,0,0,3,0,0,
+ 0,3,0,2,0,0,0,0,0,2,2,0,3,3,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,1,3,3,3,3,3,3,0,3,3,3,3,3,3,3,3,0,3,0,3,3,3,3,3,0,3,2,0,
+ 3,0,1,3,2,1,3,0,0,0,2,3,1,2,3,3,1,3,0,1,0,0,0,0,0,3,0,0,0,0,0,0,
+ 2,3,1,3,0,2,3,3,2,3,2,2,3,3,3,3,2,3,3,1,0,3,0,3,3,0,2,2,0,0,0,0,
+ 3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 2,3,0,3,0,0,0,2,3,2,0,0,0,3,0,2,0,3,2,1,0,3,0,0,1,3,3,3,0,2,0,0,
+ 0,0,0,0,0,0,1,1,0,0,0,3,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,2,3,0,2,2,3,3,3,1,0,2,3,0,3,0,3,3,2,2,3,0,1,0,2,0,3,0,3,2,0,
+ 2,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
+ 3,0,2,0,1,3,2,0,2,3,2,0,3,2,3,0,2,0,3,3,0,0,2,0,3,0,0,0,2,2,0,2,
+ 3,0,1,0,1,3,0,0,0,0,2,0,0,1,3,2,2,0,1,0,0,0,0,0,1,0,0,2,0,2,0,0,
+ 3,0,2,0,0,3,3,0,3,3,2,0,3,3,3,0,0,0,3,0,0,0,0,1,0,3,3,0,2,3,0,0,
+ 3,1,0,2,1,0,0,0,0,1,2,0,0,3,3,3,0,3,1,1,0,0,0,0,1,0,1,0,0,0,0,0,
+ 2,0,2,0,3,3,3,0,3,3,3,0,3,3,3,0,1,0,3,3,0,0,0,3,3,3,3,0,0,3,0,0,
+ 0,1,0,3,1,0,0,0,0,1,1,0,0,3,2,1,0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,0,3,0,1,3,3,0,2,0,2,0,1,0,3,0,2,0,3,3,0,0,0,0,3,3,0,0,0,0,0,0,
+ 0,3,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,3,0,0,0,0,0,0,3,3,1,0,0,0,1,0,0,0,0,1,3,0,1,0,0,0,0,0,
+ 0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,0,1,1,3,2,2,0,1,0,3,0,3,0,0,0,0,0,3,2,0,0,0,3,3,0,3,0,0,0,0,1,
+ 1,0,0,0,1,0,0,0,0,3,1,0,1,2,1,3,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,
+ 2,3,1,3,0,0,2,3,2,0,3,3,3,3,0,3,0,3,1,1,0,3,1,0,0,2,2,3,0,3,3,1,
+ 0,0,0,0,1,0,2,0,0,0,0,3,0,3,2,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,0,2,0,3,3,3,0,3,3,3,0,3,3,3,0,3,0,3,3,0,0,2,2,3,3,3,0,3,3,0,1,
+ 2,3,1,2,0,0,0,0,0,3,3,0,3,3,3,3,1,3,2,0,0,0,3,0,2,0,0,0,0,0,1,0,
+ 3,3,2,3,0,1,3,2,3,3,1,0,1,1,0,3,3,3,3,3,0,3,0,0,0,3,0,1,0,2,0,0,
+ 1,0,0,0,0,0,1,0,0,0,1,3,0,0,3,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,0,3,0,3,2,3,3,3,2,2,2,3,0,3,1,3,3,1,0,3,0,0,0,2,2,3,0,3,3,0,
+ 2,0,0,0,0,0,2,0,0,0,0,3,0,1,3,2,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,
+ 2,3,2,3,2,0,3,3,3,2,3,3,3,3,3,3,0,3,3,3,3,3,0,2,3,3,3,3,0,3,3,0,
+ 2,0,2,2,1,1,3,3,0,3,0,3,1,3,2,3,0,0,1,0,0,0,3,0,0,0,0,0,0,0,0,0,
+ 3,3,2,3,0,3,3,3,3,3,0,3,3,3,1,3,2,3,3,2,0,3,0,0,0,3,3,3,1,3,1,0,
+ 3,0,0,0,0,1,3,0,0,1,0,3,0,2,2,0,0,2,0,0,0,0,3,0,3,0,0,0,0,0,0,0,
+ 3,0,3,1,3,3,3,0,3,3,3,0,3,3,3,0,3,0,3,3,0,0,0,3,3,3,3,0,0,3,0,0,
+ 3,3,2,3,0,0,0,0,0,1,3,0,1,3,3,3,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 3,3,1,3,0,3,3,3,3,2,2,0,3,3,1,3,1,3,3,0,0,3,0,2,2,0,0,3,0,0,0,0,
+ 3,0,0,1,1,0,1,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,0,3,0,2,3,1,1,1,0,0,0,1,2,3,0,3,1,3,0,3,0,0,2,1,2,3,0,3,0,0,
+ 0,0,0,0,0,0,1,0,0,0,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,
+ 3,3,1,3,1,2,3,3,3,3,3,3,3,3,3,3,1,3,3,2,2,3,0,1,2,1,0,3,0,3,2,0,
+ 2,0,0,0,2,0,3,0,0,1,3,3,0,1,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,2,3,1,0,1,1,0,0,2,2,0,0,0,0,0,0,0,0,0,0,2,3,1,0,0,1,0,0,
+ 0,1,0,0,0,0,0,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,0,0,0,0,0,3,0,2,0,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,2,0,0,
+ 0,0,0,0,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,0,3,0,0,2,0,2,0,0,0,0,0,0,3,0,3,2,2,0,0,0,0,0,3,0,0,0,3,0,0,
+ 0,0,0,0,0,0,0,0,0,2,0,2,0,0,1,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,1,0,3,0,0,1,2,0,0,0,0,0,0,0,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,0,2,0,0,1,1,0,1,2,1,0,2,3,1,0,0,0,2,0,2,0,1,1,1,3,2,0,0,3,0,0,
+ 0,0,0,1,0,1,0,0,0,2,0,0,0,2,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,2,0,2,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,
+ 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,2,0,0,2,0,0,0,3,3,0,0,0,0,3,0,0,0,0,0,0,0,1,0,0,1,0,0,
+ 1,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,3,3,3,3,
+ 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,2,3,3,
+ 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,3,3,2,
+ 1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,3,3,3,
+ 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,3,3,3,
+};
+
+
+const LanguageModel HindiModel =
+{
+ "hi",
+ Unicode_CharOrder,
+ 64,
+ HindiLangModel,
+ 64,
+ (float)0.7999999999999992,
+};
diff --git a/src/nsLanguageDetector.h b/src/nsLanguageDetector.h
index f89d077..a2139a6 100644
--- a/src/nsLanguageDetector.h
+++ b/src/nsLanguageDetector.h
@@ -123,6 +123,7 @@ extern const LanguageModel FrenchModel;
extern const LanguageModel GermanModel;
extern const LanguageModel GreekModel;
extern const LanguageModel HebrewModel;
+extern const LanguageModel HindiModel;
extern const LanguageModel HungarianModel;
extern const LanguageModel IrishModel;
extern const LanguageModel ItalianModel;
diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp
index 418a1c1..1b99da1 100644
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@@ -102,9 +102,11 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
langDetectors[i][j++] = new nsLanguageDetector(&GermanModel);
langDetectors[i][j++] = new nsLanguageDetector(&GreekModel);
langDetectors[i][j++] = new nsLanguageDetector(&HebrewModel);
+ langDetectors[i][j++] = new nsLanguageDetector(&HindiModel);
langDetectors[i][j++] = new nsLanguageDetector(&HungarianModel);
langDetectors[i][j++] = new nsLanguageDetector(&IrishModel);
langDetectors[i][j++] = new nsLanguageDetector(&ItalianModel);
+ langDetectors[i][j++] = new nsLanguageDetector(&KoreanModel);
langDetectors[i][j++] = new nsLanguageDetector(&LatvianModel);
langDetectors[i][j++] = new nsLanguageDetector(&LithuanianModel);
langDetectors[i][j++] = new nsLanguageDetector(&MalteseModel);
@@ -118,7 +120,6 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
langDetectors[i][j++] = new nsLanguageDetector(&ThaiModel);
langDetectors[i][j++] = new nsLanguageDetector(&TurkishModel);
langDetectors[i][j++] = new nsLanguageDetector(&VietnameseModel);
- langDetectors[i][j++] = new nsLanguageDetector(&KoreanModel);
}
else
{
diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h
index 96f0dc7..a6bfc59 100644
--- a/src/nsMBCSGroupProber.h
+++ b/src/nsMBCSGroupProber.h
@@ -49,7 +49,7 @@
#include "nsEUCTWProber.h"
#define NUM_OF_PROBERS 8
-#define NUM_OF_LANGUAGES 28
+#define NUM_OF_LANGUAGES 29
class nsMBCSGroupProber: public nsCharSetProber {
public:
diff --git a/test/hi/utf-8.txt b/test/hi/utf-8.txt
new file mode 100644
index 0000000..4b4d738
--- /dev/null
+++ b/test/hi/utf-8.txt
@@ -0,0 +1,3 @@
+ग्लेशियर नेशनल पार्क (अंग्रेज़ी: Glacier National Park; उच्चा.: ग्लेशियर नेशनल पार्क) अमेरिकी राष्ट्रीय उद्यान है, जो कि कनाडा-संयुक्त राज्य अमेरिका की सीमा पर स्थित है। उद्यान संयुक्त राज्य के उत्तर-पश्चिमी मोंटाना राज्य में स्थित है और कनाडा की ओर अल्बर्टा और ब्रिटिश कोलम्बिया प्रांतों से सटा हुआ है। उद्यान दस लाख एकड़ (4,000 किमी2) से अधिक क्षेत्र में फैला हुआ है और इसमें दो पर्वत श्रृंखला (रॉकी पर्वत की उप-श्रेणियाँ), 130 से अधिक नामित झीलें, 1,000 से अधिक विभिन्न पौधों की प्रजातियां और सैकड़ों वन्यजीवों की प्रजातियां शामिल हैं। इस विशाल प्राचीन पारिस्थितिकी तंत्र को जो कि 16,000 वर्ग मील (41,000 किमी2) में शामिल संरक्षित भूमि का भाग है, "क्राउन ऑफ़ द कॉन्टिनेंट इकोसिस्टम" के रूप में संदर्भित किया गया है।[1]
+
+ग्लेशियर नेशनल पार्क में लगभग सभी मूल स्थानीय पादप और जीव-जन्तु प्रजातियां हैं। बड़े स्तनधारी जैसे कि भूरा भालू, मूस, और पहाड़ी बकरियों के साथ-साथ दुर्लभ या लुप्तप्राय प्रजातियां जैसे कि वूल्वरिन और कनाडाई लिनेक्स, उद्यान में निवास करते हैं। यहां से पक्षियों की सैकड़ों प्रजातियां, एक दर्जन से अधिक मछलियों की प्रजातियां और कुछ सरीसृप और उभयचर प्रजातियों को प्रलेखित किया गया है। उद्यान में प्रेरी से टुंड्रा तक कई पारिस्थितिकी तंत्र हैं। उद्यान के दक्षिण-पश्चिम हिस्से में पश्चिमी रेडेकार्डर और हेमलॉक के जंगल पाये जाते हैं। उद्यान के जंगलों में आग लगना आम है। 1964 को छोड़कर उद्यान में हर साल आग लगती है। 1936 में 64 बार आग लगी थी जो कि रिकॉर्ड में सबसे अधिक है।[2][3] 2003 में लगी छह आग ने लगभग 136,000 एकड़ (550 किमी2), उद्यान के 13% से अधिक हिस्से को जला डाला था।[4]