diff options
author | Jehan <jehan@girinstud.io> | 2021-05-23 19:33:36 +0200 |
---|---|---|
committer | Jehan <jehan@girinstud.io> | 2022-12-14 00:24:53 +0100 |
commit | bfa4b10d4db7273005d0ae0466e2980ef25feeae (patch) | |
tree | dc2ce5b4ab14e0ad226b4645fab258a5e7422938 | |
parent | bed459c6e75e8a5be59ccd9bc80ac76c0bb8dbeb (diff) |
script, src: add English language model.
English detection is still quite poor, so I am not adding a unit test yet.
I believe the bad detection is mostly caused by the many shortcuts we
take in order to go "fast". I should probably review this whole part of
the logic as well.
-rw-r--r-- | script/BuildLangModelLogs/LangEnglishModel.log | 181 | ||||
-rw-r--r-- | script/langs/en.py | 64 | ||||
-rw-r--r-- | src/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/LangModels/LangEnglishModel.cpp | 289 | ||||
-rw-r--r-- | src/nsLanguageDetector.h | 1 | ||||
-rw-r--r-- | src/nsMBCSGroupProber.cpp | 1 | ||||
-rw-r--r-- | src/nsMBCSGroupProber.h | 2 | ||||
-rw-r--r-- | src/nsSBCSGroupProber.cpp | 3 | ||||
-rw-r--r-- | src/nsSBCSGroupProber.h | 2 | ||||
-rw-r--r-- | src/nsSBCharSetProber.h | 3 |
10 files changed, 545 insertions, 2 deletions
diff --git a/script/BuildLangModelLogs/LangEnglishModel.log b/script/BuildLangModelLogs/LangEnglishModel.log new file mode 100644 index 0000000..22f3ede --- /dev/null +++ b/script/BuildLangModelLogs/LangEnglishModel.log @@ -0,0 +1,181 @@ += Logs of language model for English (en) = + +- Generated by BuildLangModel.py +- Started: 2021-03-19 23:26:14.143096 +- Maximum depth: 4 +- Max number of pages: 100 + +== Parsed pages == + +Marmot (revision 1000529225) +Alashan ground squirrel (revision 1010437381) +Alaska (revision 1012870556) +Alaska marmot (revision 1010409368) +Allen's chipmunk (revision 1010890232) +Alpine chipmunk (revision 1010409470) +Alpine marmot (revision 1012720679) +Alps (revision 1007908369) +Altai Mountains (revision 1006577543) +Ancient Greece (revision 1012778875) +Animal (revision 1013060732) +Animal Diversity Web (revision 996899740) +Antelope squirrel (revision 1010441265) +Apennine Mountains (revision 1009656710) +Arctic ground squirrel (revision 1010409925) +Asia Minor ground squirrel (revision 1010437585) +BNF (identifier) (revision 1010501260) +Baja California rock squirrel (revision 1010410301) +Barcode of Life Data System (revision 997241036) +Bat (revision 1012442106) +Bear (revision 1012937821) +Belding's ground squirrel (revision 1010410588) +Bibcode (identifier) (revision 1009103296) +Black-capped marmot (revision 992988317) +Black-tailed prairie dog (revision 1010411000) +Black Hills (revision 1011995885) +Bobak marmot (revision 1010411082) +Brokpa (revision 1001820104) +Brooks Range (revision 1009930357) +Buller's chipmunk (revision 1010411572) +California chipmunk (revision 1010411807) +California ground squirrel (revision 1010411812) +Callospermophilus (revision 1010416079) +Carpathian Mountains (revision 1011395807) +Cascade Range (revision 1011474213) +Cascade golden-mantled ground squirrel (revision 1010416079) +Chordate (revision 1008964469) +Cliff chipmunk (revision 1010412814) +Colorado chipmunk (revision 1010412919) 
+Daurian ground squirrel (revision 1010413422) +Deosai National Park (revision 1006913741) +Doi (identifier) (revision 1010427488) +Durango chipmunk (revision 1010413819) +EPPO Code (revision 998151320) +Eastern chipmunk (revision 999177830) +Encyclopedia of Life (revision 994178741) +Espíritu Santo antelope squirrel (revision 1010414324) +Ethnology (revision 1011057083) +Eulipotyphla (revision 1012652578) +Eurasian Steppe (revision 1013064344) +European ground squirrel (revision 1010414381) +Eutamias (revision 1010406609) +Extinction (revision 1011028396) +Fauna Europaea (revision 963073975) +Flower (revision 1010385350) +Forest-steppe marmot (revision 1010436539) +Forrest's rock squirrel (revision 1010437668) +France (revision 1012524494) +Franklin's ground squirrel (revision 1010415067) +French Alps (revision 1006041101) +GND (identifier) (revision 1010440981) +Gallo-Romance languages (revision 1012668074) +Genus (revision 1007184632) +Global Biodiversity Information Facility (revision 1010489511) +Gold (revision 1012856700) +Gold-digging ant (revision 1007959560) +Golden-mantled ground squirrel (revision 1010416079) +Gray-collared chipmunk (revision 1010416642) +Gray-footed chipmunk (revision 1010416658) +Gray marmot (revision 1010416479) +Ground squirrel (revision 1010442953) +Groundhog Day (revision 1012802985) +Gunnison's prairie dog (revision 1010416998) +Harris's antelope squirrel (revision 1010417210) +Herbivore (revision 1006902225) +Herodotus (revision 1012927818) +Hibernate (revision 1009048926) +Hibernation (revision 1009048926) +Himalayan marmot (revision 1010417424) +Hoary marmot (revision 1010417525) +Hopi chipmunk (revision 1010417623) +INaturalist (revision 1009815294) +ISBN (identifier) (revision 1009586768) +Ictidomys (revision 1010406819) +Ictidomys parvidens (revision 1010426310) +Integrated Taxonomic Information System (revision 999235988) +Interim Register of Marine and Nonmarine Genera (revision 995182351) +JSTOR (identifier) (revision 
1011078319) +Jacopo Ligozzi (revision 1006687935) +Johann Friedrich Blumenbach (revision 1006564504) +Kazakhstan (revision 1012748504) +LCCN (identifier) (revision 1006934344) +Ladakh (revision 1010799326) +Latin (revision 1012971392) +Least chipmunk (revision 1010419221) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2021-03-19 23:29:33.380471 + +59 characters appeared 59 times. + +Most Frequent characters: +[ 0] Char m: 1.694915254237288 % +[ 1] Char a: 1.694915254237288 % +[ 2] Char r: 1.694915254237288 % +[ 3] Char o: 1.694915254237288 % +[ 4] Char t: 1.694915254237288 % +[ 5] Char s: 1.694915254237288 % +[ 6] Char e: 1.694915254237288 % +[ 7] Char l: 1.694915254237288 % +[ 8] Char i: 1.694915254237288 % +[ 9] Char v: 1.694915254237288 % +[10] Char y: 1.694915254237288 % +[11] Char g: 1.694915254237288 % +[12] Char u: 1.694915254237288 % +[13] Char n: 1.694915254237288 % +[14] Char d: 1.694915254237288 % +[15] Char q: 1.694915254237288 % +[16] Char h: 1.694915254237288 % +[17] Char w: 1.694915254237288 % +[18] Char p: 1.694915254237288 % +[19] Char c: 1.694915254237288 % +[20] Char b: 1.694915254237288 % +[21] Char f: 1.694915254237288 % +[22] Char k: 1.694915254237288 % +[23] Char x: 1.694915254237288 % +[24] Char z: 1.694915254237288 % +[25] Char j: 1.694915254237288 % +[26] Char á: 1.694915254237288 % +[27] Char ö: 1.694915254237288 % +[28] Char ä: 1.694915254237288 % +[29] Char í: 1.694915254237288 % +[30] Char ç: 1.694915254237288 % +[31] Char ô: 1.694915254237288 % +[32] Char à: 1.694915254237288 % +[33] Char ü: 1.694915254237288 % +[34] Char æ: 1.694915254237288 % +[35] Char é: 1.694915254237288 % +[36] Char ï: 1.694915254237288 % +[37] Char û: 1.694915254237288 % +[38] Char ó: 1.694915254237288 % +[39] Char µ: 1.694915254237288 % +[40] Char è: 1.694915254237288 % +[41] Char ì: 1.694915254237288 % +[42] Char î: 1.694915254237288 % +[43] Char ë: 1.694915254237288 % +[44] Char ð: 1.694915254237288 % +[45] Char ý: 1.694915254237288 % +[46] 
Char š: 1.694915254237288 % +[47] Char ñ: 1.694915254237288 % +[48] Char œ: 1.694915254237288 % +[49] Char ê: 1.694915254237288 % +[50] Char â: 1.694915254237288 % +[51] Char ø: 1.694915254237288 % +[52] Char þ: 1.694915254237288 % +[53] Char å: 1.694915254237288 % +[54] Char ß: 1.694915254237288 % +[55] Char ã: 1.694915254237288 % +[56] Char ž: 1.694915254237288 % +[57] Char õ: 1.694915254237288 % +[58] Char ú: 1.694915254237288 % + +The first 59 characters have an accumulated ratio of 0.9999999999999989. + +920 sequences found. + +First 378 (typical positive ratio): 0.9950109024233114 +Next 182 (560-378): 0.003993012537786833 +Rest: 0.000996085038901806 + +- Processing end: 2021-03-19 23:29:33.474226 diff --git a/script/langs/en.py b/script/langs/en.py new file mode 100644 index 0000000..967483c --- /dev/null +++ b/script/langs/en.py @@ -0,0 +1,64 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. 
+# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. +name = 'English' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'en' +# ASCII characters are used in English. +use_ascii = True +# The charsets we want to support and create data for. +charsets = ['ISO-8859-1', 'WINDOWS-1252'] + +## Optional Properties ## + +# The start page. Though optional, it is advised to choose one yourself. +start_pages = ['Marmot'] +# Gives the possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python's algorithm to determine upper/lower-case of a given +# character.
+case_mapping = True diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c5dd54e..a0b607c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,6 +11,7 @@ set( LangModels/LangBulgarianModel.cpp LangModels/LangCroatianModel.cpp LangModels/LangCzechModel.cpp + LangModels/LangEnglishModel.cpp LangModels/LangEsperantoModel.cpp LangModels/LangEstonianModel.cpp LangModels/LangFinnishModel.cpp diff --git a/src/LangModels/LangEnglishModel.cpp b/src/LangModels/LangEnglishModel.cpp new file mode 100644 index 0000000..dfe86f3 --- /dev/null +++ b/src/LangModels/LangEnglishModel.cpp @@ -0,0 +1,289 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" +#include "../nsLanguageDetector.h" + +/********* Language model for: English *********/ + +/** + * Generated by BuildLangModel.py + * On: 2021-03-19 23:29:33.380823 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Iso_8859_1_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 4X */ + 18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 6X */ + 18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 39,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* CX */ + 44, 47, 59, 38, 31, 57, 27,SYM, 51, 60, 58, 37, 33, 45, 52, 54, /* DX */ + 32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* EX */ + 44, 47, 61, 38, 31, 57, 27,SYM, 51, 62, 58, 37, 33, 45, 52, 63, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 4X */ + 18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 6X */ + 18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 
64,SYM,SYM,SYM,SYM,SYM,SYM, 46,SYM, 48,ILL, 56,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 46,SYM, 48,ILL, 56, 65, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 39,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* CX */ + 44, 47, 66, 38, 31, 57, 27,SYM, 51, 67, 58, 37, 33, 45, 52, 54, /* DX */ + 32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* EX */ + 44, 47, 68, 38, 31, 57, 27,SYM, 51, 69, 58, 37, 33, 45, 52, 70, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const int Unicode_Char_size = 117; +static const unsigned int Unicode_CharOrder[] = +{ + 65, 1, 66, 20, 67, 19, 68, 14, 69, 6, 70, 21, 71, 11, 72, 16, + 73, 8, 74, 25, 75, 22, 76, 7, 77, 0, 78, 13, 79, 3, 80, 18, + 81, 15, 82, 2, 83, 5, 84, 4, 85, 12, 86, 9, 87, 17, 88, 23, + 89, 10, 90, 24, 97, 1, 98, 20, 99, 19, 100, 14, 101, 6,102, 21, + 103, 11, 104, 16, 105, 8, 106, 25, 107, 22, 108, 7, 109, 0,110, 13, + 111, 3, 112, 18, 113, 15, 114, 2, 115, 5, 116, 4, 117, 12,118, 9, + 119, 17, 120, 23, 121, 10, 122, 24, 181, 39, 192, 32, 193, 26,194, 50, + 195, 55, 196, 28, 197, 53, 198, 34, 199, 30, 200, 40, 201, 35,202, 49, + 203, 43, 204, 41, 205, 29, 206, 42, 207, 36, 208, 44, 209, 47,211, 38, + 212, 31, 213, 57, 214, 27, 216, 51, 218, 58, 219, 37, 220, 33,221, 45, + 222, 52, 223, 54, 224, 32, 225, 26, 226, 50, 227, 55, 228, 28,229, 53, + 230, 34, 231, 30, 232, 40, 233, 35, 234, 49, 235, 43, 236, 41,237, 29, + 238, 42, 239, 36, 240, 44, 241, 47, 243, 38, 244, 31, 245, 57,246, 27, + 248, 51, 250, 58, 251, 37, 252, 33, 253, 45, 254, 52, 338, 48,339, 48, + 352, 46, 353, 46, 381, 56, 382, 56, 924, 39, +}; + + +/* Model Table: + * Total considered sequences: 920 / 3481 + * - Positive sequences: first 378 (0.9950109024233114) + * - Probable sequences: next 182 (560-378) (0.003993012537786833) + * - Neutral sequences: last 2921 
(0.000996085038901806) + * - Negative sequences: 2561 (off-ratio) + * Negative sequences: TODO + */ +static const PRUint8 EnglishLangModel[] = +{ + 3,3,1,3,2,3,3,2,3,1,3,2,3,3,2,1,2,1,3,2,3,2,1,1,1,1,2,1,1, + 1,0,0,1,1,1,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0, + 3,2,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,0,0, + 0,1,0,0,0,0,1,1,0,0,0,0,0,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,1,2,1,1,1,1, + 1,0,1,0,0,0,2,0,1,1,0,1,0,1,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,1, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,0,0, + 0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,2,3,2,3,3,1,2,3,3,2,3,2,2,2,0,3,0,1,0,0, + 1,0,0,1,1,0,2,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1, + 3,3,3,3,3,3,3,3,3,1,3,2,3,3,2,3,3,3,3,3,3,3,3,1,1,1,1,1,0, + 1,0,0,0,0,0,2,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,1,0,0, + 1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,3,3,3,3,3,3,0,2,1,1,0,1, + 1,0,1,0,0,0,2,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,1,3,3,3,3,3,3,3,2,1,0,0, + 1,0,0,0,0,0,1,0,0,1,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, + 1,3,2,3,1,2,3,2,3,2,2,1,2,1,1,0,1,0,1,1,0,0,1,0,0,1,1,0,0, + 1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,1,2,2,3,3,3,0,1,2,3,3,2,1,2,2,2,1,1,0,0, + 0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,1,3,2,2,1,2,2,1,0,2,1,1,2,0, + 1,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,1,2,1,3,3,3,3,3,3,2,2,1,0,0, + 1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,2,2,3,1,1,0, + 2,2,1,0,0,1,2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, + 3,3,3,3,2,3,3,3,3,2,3,3,3,2,3,2,3,3,3,2,2,3,1,1,2,2,0,0,0, + 
1,0,1,0,1,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0, + 0,2,0,0,1,1,0,0,2,1,1,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,1,3,2,3,3,2,2,1,3,1,2,2,1,1,0,1,0,1,1,1, + 1,0,1,0,1,1,2,0,0,1,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1, + 1,3,3,3,3,3,3,3,3,0,2,1,1,3,2,0,3,2,1,2,2,2,2,0,0,0,0,0,0, + 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,0,3,2,3,1,2,0,3,2,3,2,2,0,2,1,1,0,1,0,0, + 2,0,1,0,0,2,2,0,0,1,0,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,0,3,2,3,2,2,2,3,0,0,3,1,2,3,0,2,0,1,0,0, + 1,0,1,1,0,0,2,0,1,0,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, + 2,3,3,3,3,3,3,3,3,1,3,2,3,3,2,0,2,1,2,3,3,2,1,0,1,3,0,0,0, + 0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0, + 2,3,3,3,3,2,3,3,3,0,3,1,3,1,2,0,0,1,1,2,1,3,1,0,0,1,0,1,0, + 1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0, + 3,3,3,3,2,3,3,3,3,0,3,2,3,3,1,1,3,2,2,2,2,1,1,0,1,1,1,2,0, + 0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0, + 1,3,0,3,3,2,3,0,3,2,2,0,3,0,1,1,2,1,3,3,0,2,0,2,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,3,1,3,2,2,3,2,3,1,2,0,2,2,1,0,2,1,0,0,2,0,1,0,2,1,0,0,0, + 0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,3,2,3,0,3,3,1,3,1,1,0,3,1,0,0,1,1,1,0,0,0,1,0,0,2,1,1,0, + 0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,0,1,1,0,2,0,1,0,1,1,2,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,0,1,0,2,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, + 0,2,1,0,2,2,0,1,0,1,0,0,0,2,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,2,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, + 1,0,0,0,2,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,1,0,0,1,0,1,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,1,0,1,1,0,0,0,0,0,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,1,2,1,2,2,2,2,1,2,0,2,1,2,2,1,0,0,2,2,1,0,0,0,1,1,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,0,1,0,0,2,0,1,0,1,0,0,0,2,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,0,2,0,0,1,0,0,0,1,0,1,0,1,1,2,0,0,0,1,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,1,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, + 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,1,0,2,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,0,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_1EnglishModel = +{ + Iso_8859_1_CharToOrderMap, + EnglishLangModel, + 59, + (float)0.9990039149610982, + PR_TRUE, + "ISO-8859-1", + "en" +}; + +const SequenceModel Windows_1252EnglishModel = +{ + Windows_1252_CharToOrderMap, + EnglishLangModel, + 59, + (float)0.9990039149610982, + PR_TRUE, + "WINDOWS-1252", + "en" +}; + +const LanguageModel EnglishModel = +{ + "en", + Unicode_CharOrder, + 117, + 
EnglishLangModel, + 59, + (float)0.9999999999999989, +}; diff --git a/src/nsLanguageDetector.h b/src/nsLanguageDetector.h index 8a5c74d..30b935a 100644 --- a/src/nsLanguageDetector.h +++ b/src/nsLanguageDetector.h @@ -116,6 +116,7 @@ extern const LanguageModel ArabicModel; extern const LanguageModel CroatianModel; extern const LanguageModel CzechModel; extern const LanguageModel DanishModel; +extern const LanguageModel EnglishModel; extern const LanguageModel EsperantoModel; extern const LanguageModel EstonianModel; extern const LanguageModel FinnishModel; diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp index 9aeb2f7..51c268f 100644 --- a/src/nsMBCSGroupProber.cpp +++ b/src/nsMBCSGroupProber.cpp @@ -96,6 +96,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) langDetectors[i][j++] = new nsLanguageDetector(&CroatianModel); langDetectors[i][j++] = new nsLanguageDetector(&CzechModel); langDetectors[i][j++] = new nsLanguageDetector(&DanishModel); + langDetectors[i][j++] = new nsLanguageDetector(&EnglishModel); langDetectors[i][j++] = new nsLanguageDetector(&EsperantoModel); langDetectors[i][j++] = new nsLanguageDetector(&EstonianModel); langDetectors[i][j++] = new nsLanguageDetector(&FinnishModel); diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h index a6bfc59..9596ac0 100644 --- a/src/nsMBCSGroupProber.h +++ b/src/nsMBCSGroupProber.h @@ -49,7 +49,7 @@ #include "nsEUCTWProber.h" #define NUM_OF_PROBERS 8 -#define NUM_OF_LANGUAGES 29 +#define NUM_OF_LANGUAGES 30 class nsMBCSGroupProber: public nsCharSetProber { public: diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index ca54911..fd4f2d6 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -197,6 +197,9 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[103] = new nsSingleByteCharSetProber(&Windows_1252NorwegianModel); mProbers[104] = new nsSingleByteCharSetProber(&Ibm865NorwegianModel); + mProbers[105] = new
nsSingleByteCharSetProber(&Iso_8859_1EnglishModel); + mProbers[106] = new nsSingleByteCharSetProber(&Windows_1252EnglishModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index 96f8c22..a68a2a4 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 105 +#define NUM_OF_SBCS_PROBERS 107 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index f6173a8..2fad476 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -179,6 +179,9 @@ extern const SequenceModel Iso_8859_1DanishModel; extern const SequenceModel Windows_1252DanishModel; extern const SequenceModel Ibm865DanishModel; +extern const SequenceModel Iso_8859_1EnglishModel; +extern const SequenceModel Windows_1252EnglishModel; + extern const SequenceModel Iso_8859_13LithuanianModel; extern const SequenceModel Iso_8859_10LithuanianModel; extern const SequenceModel Iso_8859_4LithuanianModel; |