summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJehan <jehan@girinstud.io>2021-05-23 19:33:36 +0200
committerJehan <jehan@girinstud.io>2022-12-14 00:24:53 +0100
commitbfa4b10d4db7273005d0ae0466e2980ef25feeae (patch)
treedc2ce5b4ab14e0ad226b4645fab258a5e7422938
parentbed459c6e75e8a5be59ccd9bc80ac76c0bb8dbeb (diff)
script, src: add English language model.
English detection is still quite crappy, so I am not adding a unit test yet. I believe the detection is bad mostly because of the many shortcuts we take to go "fast", so I should probably review this whole part of the logic as well.
-rw-r--r--script/BuildLangModelLogs/LangEnglishModel.log181
-rw-r--r--script/langs/en.py64
-rw-r--r--src/CMakeLists.txt1
-rw-r--r--src/LangModels/LangEnglishModel.cpp289
-rw-r--r--src/nsLanguageDetector.h1
-rw-r--r--src/nsMBCSGroupProber.cpp1
-rw-r--r--src/nsMBCSGroupProber.h2
-rw-r--r--src/nsSBCSGroupProber.cpp3
-rw-r--r--src/nsSBCSGroupProber.h2
-rw-r--r--src/nsSBCharSetProber.h3
10 files changed, 545 insertions, 2 deletions
diff --git a/script/BuildLangModelLogs/LangEnglishModel.log b/script/BuildLangModelLogs/LangEnglishModel.log
new file mode 100644
index 0000000..22f3ede
--- /dev/null
+++ b/script/BuildLangModelLogs/LangEnglishModel.log
@@ -0,0 +1,181 @@
+= Logs of language model for English (en) =
+
+- Generated by BuildLangModel.py
+- Started: 2021-03-19 23:26:14.143096
+- Maximum depth: 4
+- Max number of pages: 100
+
+== Parsed pages ==
+
+Marmot (revision 1000529225)
+Alashan ground squirrel (revision 1010437381)
+Alaska (revision 1012870556)
+Alaska marmot (revision 1010409368)
+Allen's chipmunk (revision 1010890232)
+Alpine chipmunk (revision 1010409470)
+Alpine marmot (revision 1012720679)
+Alps (revision 1007908369)
+Altai Mountains (revision 1006577543)
+Ancient Greece (revision 1012778875)
+Animal (revision 1013060732)
+Animal Diversity Web (revision 996899740)
+Antelope squirrel (revision 1010441265)
+Apennine Mountains (revision 1009656710)
+Arctic ground squirrel (revision 1010409925)
+Asia Minor ground squirrel (revision 1010437585)
+BNF (identifier) (revision 1010501260)
+Baja California rock squirrel (revision 1010410301)
+Barcode of Life Data System (revision 997241036)
+Bat (revision 1012442106)
+Bear (revision 1012937821)
+Belding's ground squirrel (revision 1010410588)
+Bibcode (identifier) (revision 1009103296)
+Black-capped marmot (revision 992988317)
+Black-tailed prairie dog (revision 1010411000)
+Black Hills (revision 1011995885)
+Bobak marmot (revision 1010411082)
+Brokpa (revision 1001820104)
+Brooks Range (revision 1009930357)
+Buller's chipmunk (revision 1010411572)
+California chipmunk (revision 1010411807)
+California ground squirrel (revision 1010411812)
+Callospermophilus (revision 1010416079)
+Carpathian Mountains (revision 1011395807)
+Cascade Range (revision 1011474213)
+Cascade golden-mantled ground squirrel (revision 1010416079)
+Chordate (revision 1008964469)
+Cliff chipmunk (revision 1010412814)
+Colorado chipmunk (revision 1010412919)
+Daurian ground squirrel (revision 1010413422)
+Deosai National Park (revision 1006913741)
+Doi (identifier) (revision 1010427488)
+Durango chipmunk (revision 1010413819)
+EPPO Code (revision 998151320)
+Eastern chipmunk (revision 999177830)
+Encyclopedia of Life (revision 994178741)
+Espíritu Santo antelope squirrel (revision 1010414324)
+Ethnology (revision 1011057083)
+Eulipotyphla (revision 1012652578)
+Eurasian Steppe (revision 1013064344)
+European ground squirrel (revision 1010414381)
+Eutamias (revision 1010406609)
+Extinction (revision 1011028396)
+Fauna Europaea (revision 963073975)
+Flower (revision 1010385350)
+Forest-steppe marmot (revision 1010436539)
+Forrest's rock squirrel (revision 1010437668)
+France (revision 1012524494)
+Franklin's ground squirrel (revision 1010415067)
+French Alps (revision 1006041101)
+GND (identifier) (revision 1010440981)
+Gallo-Romance languages (revision 1012668074)
+Genus (revision 1007184632)
+Global Biodiversity Information Facility (revision 1010489511)
+Gold (revision 1012856700)
+Gold-digging ant (revision 1007959560)
+Golden-mantled ground squirrel (revision 1010416079)
+Gray-collared chipmunk (revision 1010416642)
+Gray-footed chipmunk (revision 1010416658)
+Gray marmot (revision 1010416479)
+Ground squirrel (revision 1010442953)
+Groundhog Day (revision 1012802985)
+Gunnison's prairie dog (revision 1010416998)
+Harris's antelope squirrel (revision 1010417210)
+Herbivore (revision 1006902225)
+Herodotus (revision 1012927818)
+Hibernate (revision 1009048926)
+Hibernation (revision 1009048926)
+Himalayan marmot (revision 1010417424)
+Hoary marmot (revision 1010417525)
+Hopi chipmunk (revision 1010417623)
+INaturalist (revision 1009815294)
+ISBN (identifier) (revision 1009586768)
+Ictidomys (revision 1010406819)
+Ictidomys parvidens (revision 1010426310)
+Integrated Taxonomic Information System (revision 999235988)
+Interim Register of Marine and Nonmarine Genera (revision 995182351)
+JSTOR (identifier) (revision 1011078319)
+Jacopo Ligozzi (revision 1006687935)
+Johann Friedrich Blumenbach (revision 1006564504)
+Kazakhstan (revision 1012748504)
+LCCN (identifier) (revision 1006934344)
+Ladakh (revision 1010799326)
+Latin (revision 1012971392)
+Least chipmunk (revision 1010419221)
+
+== End of Parsed pages ==
+
+- Wikipedia parsing ended at: 2021-03-19 23:29:33.380471
+
+59 characters appeared 59 times.
+
+Most Frequent characters:
+[ 0] Char m: 1.694915254237288 %
+[ 1] Char a: 1.694915254237288 %
+[ 2] Char r: 1.694915254237288 %
+[ 3] Char o: 1.694915254237288 %
+[ 4] Char t: 1.694915254237288 %
+[ 5] Char s: 1.694915254237288 %
+[ 6] Char e: 1.694915254237288 %
+[ 7] Char l: 1.694915254237288 %
+[ 8] Char i: 1.694915254237288 %
+[ 9] Char v: 1.694915254237288 %
+[10] Char y: 1.694915254237288 %
+[11] Char g: 1.694915254237288 %
+[12] Char u: 1.694915254237288 %
+[13] Char n: 1.694915254237288 %
+[14] Char d: 1.694915254237288 %
+[15] Char q: 1.694915254237288 %
+[16] Char h: 1.694915254237288 %
+[17] Char w: 1.694915254237288 %
+[18] Char p: 1.694915254237288 %
+[19] Char c: 1.694915254237288 %
+[20] Char b: 1.694915254237288 %
+[21] Char f: 1.694915254237288 %
+[22] Char k: 1.694915254237288 %
+[23] Char x: 1.694915254237288 %
+[24] Char z: 1.694915254237288 %
+[25] Char j: 1.694915254237288 %
+[26] Char á: 1.694915254237288 %
+[27] Char ö: 1.694915254237288 %
+[28] Char ä: 1.694915254237288 %
+[29] Char í: 1.694915254237288 %
+[30] Char ç: 1.694915254237288 %
+[31] Char ô: 1.694915254237288 %
+[32] Char à: 1.694915254237288 %
+[33] Char ü: 1.694915254237288 %
+[34] Char æ: 1.694915254237288 %
+[35] Char é: 1.694915254237288 %
+[36] Char ï: 1.694915254237288 %
+[37] Char û: 1.694915254237288 %
+[38] Char ó: 1.694915254237288 %
+[39] Char µ: 1.694915254237288 %
+[40] Char è: 1.694915254237288 %
+[41] Char ì: 1.694915254237288 %
+[42] Char î: 1.694915254237288 %
+[43] Char ë: 1.694915254237288 %
+[44] Char ð: 1.694915254237288 %
+[45] Char ý: 1.694915254237288 %
+[46] Char š: 1.694915254237288 %
+[47] Char ñ: 1.694915254237288 %
+[48] Char œ: 1.694915254237288 %
+[49] Char ê: 1.694915254237288 %
+[50] Char â: 1.694915254237288 %
+[51] Char ø: 1.694915254237288 %
+[52] Char þ: 1.694915254237288 %
+[53] Char å: 1.694915254237288 %
+[54] Char ß: 1.694915254237288 %
+[55] Char ã: 1.694915254237288 %
+[56] Char ž: 1.694915254237288 %
+[57] Char õ: 1.694915254237288 %
+[58] Char ú: 1.694915254237288 %
+
+The first 59 characters have an accumulated ratio of 0.9999999999999989.
+
+920 sequences found.
+
+First 378 (typical positive ratio): 0.9950109024233114
+Next 182 (560-378): 0.003993012537786833
+Rest: 0.000996085038901806
+
+- Processing end: 2021-03-19 23:29:33.474226
diff --git a/script/langs/en.py b/script/langs/en.py
new file mode 100644
index 0000000..967483c
--- /dev/null
+++ b/script/langs/en.py
@@ -0,0 +1,64 @@
+#!/bin/python3
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+import re
+
+## Mandatory Properties ##
+
+# The human name for the language, in English.
+name = 'English'
+# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
+# or use another catalog as a last resort.
+code = 'en'
+# ASCII characters are used in English.
+use_ascii = True
+# The charsets we want to support and create data for.
+charsets = ['ISO-8859-1', 'WINDOWS-1252']
+
+## Optional Properties ##
+
+# The start page. Though optional, it is advised to choose one yourself.
+start_pages = ['Marmot']
+# give possibility to select another code for the Wikipedia URL.
+wikipedia_code = code
+# 'a' and 'A' will be considered the same character, and so on.
+# This uses Python algorithm to determine upper/lower-case of a given
+# character.
+case_mapping = True
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c5dd54e..a0b607c 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -11,6 +11,7 @@ set(
LangModels/LangBulgarianModel.cpp
LangModels/LangCroatianModel.cpp
LangModels/LangCzechModel.cpp
+ LangModels/LangEnglishModel.cpp
LangModels/LangEsperantoModel.cpp
LangModels/LangEstonianModel.cpp
LangModels/LangFinnishModel.cpp
diff --git a/src/LangModels/LangEnglishModel.cpp b/src/LangModels/LangEnglishModel.cpp
new file mode 100644
index 0000000..dfe86f3
--- /dev/null
+++ b/src/LangModels/LangEnglishModel.cpp
@@ -0,0 +1,289 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla Communicator client code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 1998
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "../nsSBCharSetProber.h"
+#include "../nsLanguageDetector.h"
+
+/********* Language model for: English *********/
+
+/**
+ * Generated by BuildLangModel.py
+ * On: 2021-03-19 23:29:33.380823
+ **/
+
+/* Character Mapping Table:
+ * ILL: illegal character.
+ * CTR: control character specific to the charset.
+ * RET: line feed or carriage return (0x0A, 0x0D).
+ * SYM: symbol (punctuation) that does not belong to word.
+ * NUM: 0 - 9.
+ *
+ * Other characters are ordered by probabilities
+ * (0 is the most common character in the language).
+ *
+ * Orders are generic to a language. So the codepoint with order X in
+ * CHARSET1 maps to the same character as the codepoint with the same
+ * order X in CHARSET2 for the same language.
+ * As such, it is possible to get missing order. For instance the
+ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
+ * even though they are both used for French. Same for the euro sign.
+ */
+static const unsigned char Iso_8859_1_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 4X */
+ 18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 6X */
+ 18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
+ SYM,SYM,SYM,SYM,SYM, 39,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ 32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* CX */
+ 44, 47, 59, 38, 31, 57, 27,SYM, 51, 60, 58, 37, 33, 45, 52, 54, /* DX */
+ 32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* EX */
+ 44, 47, 61, 38, 31, 57, 27,SYM, 51, 62, 58, 37, 33, 45, 52, 63, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const unsigned char Windows_1252_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 4X */
+ 18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 1, 20, 19, 14, 6, 21, 11, 16, 8, 25, 22, 7, 0, 13, 3, /* 6X */
+ 18, 15, 2, 5, 4, 12, 9, 17, 23, 10, 24,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ SYM,ILL,SYM, 64,SYM,SYM,SYM,SYM,SYM,SYM, 46,SYM, 48,ILL, 56,ILL, /* 8X */
+ ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 46,SYM, 48,ILL, 56, 65, /* 9X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
+ SYM,SYM,SYM,SYM,SYM, 39,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ 32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* CX */
+ 44, 47, 66, 38, 31, 57, 27,SYM, 51, 67, 58, 37, 33, 45, 52, 54, /* DX */
+ 32, 26, 50, 55, 28, 53, 34, 30, 40, 35, 49, 43, 41, 29, 42, 36, /* EX */
+ 44, 47, 68, 38, 31, 57, 27,SYM, 51, 69, 58, 37, 33, 45, 52, 70, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const int Unicode_Char_size = 117; /* Number of (code point, order) pairs stored in Unicode_CharOrder. */
+static const unsigned int Unicode_CharOrder[] =
+{ /* Flattened pairs sorted by code point: Unicode code point, then its order (e.g. 65 'A' -> 1, 77 'M' -> 0). */
+ 65, 1, 66, 20, 67, 19, 68, 14, 69, 6, 70, 21, 71, 11, 72, 16,
+ 73, 8, 74, 25, 75, 22, 76, 7, 77, 0, 78, 13, 79, 3, 80, 18,
+ 81, 15, 82, 2, 83, 5, 84, 4, 85, 12, 86, 9, 87, 17, 88, 23,
+ 89, 10, 90, 24, 97, 1, 98, 20, 99, 19, 100, 14, 101, 6,102, 21,
+ 103, 11, 104, 16, 105, 8, 106, 25, 107, 22, 108, 7, 109, 0,110, 13,
+ 111, 3, 112, 18, 113, 15, 114, 2, 115, 5, 116, 4, 117, 12,118, 9,
+ 119, 17, 120, 23, 121, 10, 122, 24, 181, 39, 192, 32, 193, 26,194, 50,
+ 195, 55, 196, 28, 197, 53, 198, 34, 199, 30, 200, 40, 201, 35,202, 49,
+ 203, 43, 204, 41, 205, 29, 206, 42, 207, 36, 208, 44, 209, 47,211, 38,
+ 212, 31, 213, 57, 214, 27, 216, 51, 218, 58, 219, 37, 220, 33,221, 45,
+ 222, 52, 223, 54, 224, 32, 225, 26, 226, 50, 227, 55, 228, 28,229, 53,
+ 230, 34, 231, 30, 232, 40, 233, 35, 234, 49, 235, 43, 236, 41,237, 29,
+ 238, 42, 239, 36, 240, 44, 241, 47, 243, 38, 244, 31, 245, 57,246, 27,
+ 248, 51, 250, 58, 251, 37, 252, 33, 253, 45, 254, 52, 338, 48,339, 48,
+ 352, 46, 353, 46, 381, 56, 382, 56, 924, 39,
+};
+
+
+/* Model Table:
+ * Total considered sequences: 920 / 3481
+ * - Positive sequences: first 378 (0.9950109024233114)
+ * - Probable sequences: next 182 (560-378) (0.003993012537786833)
+ * - Neutral sequences: last 2921 (0.000996085038901806)
+ * - Negative sequences: 2561 (off-ratio)
+ * Negative sequences: TODO
+ */
+static const PRUint8 EnglishLangModel[] =
+{
+ 3,3,1,3,2,3,3,2,3,1,3,2,3,3,2,1,2,1,3,2,3,2,1,1,1,1,2,1,1,
+ 1,0,0,1,1,1,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,
+ 3,2,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,0,0,
+ 0,1,0,0,0,0,1,1,0,0,0,0,0,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,1,2,1,1,1,1,
+ 1,0,1,0,0,0,2,0,1,1,0,1,0,1,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,1,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,0,0,0,
+ 0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,2,3,2,3,3,1,2,3,3,2,3,2,2,2,0,3,0,1,0,0,
+ 1,0,0,1,1,0,2,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,
+ 3,3,3,3,3,3,3,3,3,1,3,2,3,3,2,3,3,3,3,3,3,3,3,1,1,1,1,1,0,
+ 1,0,0,0,0,0,2,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,1,0,0,
+ 1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,3,3,3,3,3,3,0,2,1,1,0,1,
+ 1,0,1,0,0,0,2,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,2,
+ 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,2,1,3,3,3,3,3,3,3,2,1,0,0,
+ 1,0,0,0,0,0,1,0,0,1,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
+ 1,3,2,3,1,2,3,2,3,2,2,1,2,1,1,0,1,0,1,1,0,0,1,0,0,1,1,0,0,
+ 1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,1,2,2,3,3,3,0,1,2,3,3,2,1,2,2,2,1,1,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,1,3,2,2,1,2,2,1,0,2,1,1,2,0,
+ 1,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,1,2,1,3,3,3,3,3,3,2,2,1,0,0,
+ 1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,2,2,3,1,1,0,
+ 2,2,1,0,0,1,2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
+ 3,3,3,3,2,3,3,3,3,2,3,3,3,2,3,2,3,3,3,2,2,3,1,1,2,2,0,0,0,
+ 1,0,1,0,1,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
+ 0,2,0,0,1,1,0,0,2,1,1,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,1,3,2,3,3,2,2,1,3,1,2,2,1,1,0,1,0,1,1,1,
+ 1,0,1,0,1,1,2,0,0,1,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,
+ 1,3,3,3,3,3,3,3,3,0,2,1,1,3,2,0,3,2,1,2,2,2,2,0,0,0,0,0,0,
+ 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,0,3,2,3,1,2,0,3,2,3,2,2,0,2,1,1,0,1,0,0,
+ 2,0,1,0,0,2,2,0,0,1,0,1,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,0,3,2,3,2,2,2,3,0,0,3,1,2,3,0,2,0,1,0,0,
+ 1,0,1,1,0,0,2,0,1,0,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
+ 2,3,3,3,3,3,3,3,3,1,3,2,3,3,2,0,2,1,2,3,3,2,1,0,1,3,0,0,0,
+ 0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,
+ 2,3,3,3,3,2,3,3,3,0,3,1,3,1,2,0,0,1,1,2,1,3,1,0,0,1,0,1,0,
+ 1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,2,3,3,3,3,0,3,2,3,3,1,1,3,2,2,2,2,1,1,0,1,1,1,2,0,
+ 0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,
+ 1,3,0,3,3,2,3,0,3,2,2,0,3,0,1,1,2,1,3,3,0,2,0,2,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,3,1,3,2,2,3,2,3,1,2,0,2,2,1,0,2,1,0,0,2,0,1,0,2,1,0,0,0,
+ 0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,3,2,3,0,3,3,1,3,1,1,0,3,1,0,0,1,1,1,0,0,0,1,0,0,2,1,1,0,
+ 0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,0,1,1,0,2,0,1,0,1,1,2,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,0,1,0,2,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,1,0,2,2,0,1,0,1,0,0,0,2,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,2,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
+ 1,0,0,0,2,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,1,0,0,1,0,1,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,1,0,1,1,0,0,0,0,0,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 2,1,2,1,2,2,2,2,1,2,0,2,1,2,2,1,0,0,2,2,1,0,0,0,1,1,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,0,1,0,0,2,0,1,0,1,0,0,0,2,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,0,2,0,0,1,0,0,0,1,0,1,0,1,1,2,0,0,0,1,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,1,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,1,0,2,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,0,0,0,2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 2,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+};
+
+
+const SequenceModel Iso_8859_1EnglishModel = /* English single-byte model bound to the ISO-8859-1 byte-to-order table. */
+{
+ Iso_8859_1_CharToOrderMap, /* 256-entry byte -> order table defined above. */
+ EnglishLangModel, /* Sequence table shared by all English models in this file. */
+ 59, /* Number of ordered characters covered by EnglishLangModel (59 x 59 = 3481 sequences). */
+ (float)0.9990039149610982, /* Sum of the positive and probable ratios listed in the Model Table comment. */
+ PR_TRUE, /* NOTE(review): presumably a "keep ASCII letters" flag; confirm against SequenceModel in nsSBCharSetProber.h. */
+ "ISO-8859-1", /* Charset name. */
+ "en" /* Language code. */
+};
+
+const SequenceModel Windows_1252EnglishModel = /* English single-byte model bound to the WINDOWS-1252 byte-to-order table. */
+{
+ Windows_1252_CharToOrderMap, /* 256-entry byte -> order table defined above. */
+ EnglishLangModel, /* Sequence table shared by all English models in this file. */
+ 59, /* Number of ordered characters covered by EnglishLangModel (59 x 59 = 3481 sequences). */
+ (float)0.9990039149610982, /* Sum of the positive and probable ratios listed in the Model Table comment. */
+ PR_TRUE, /* NOTE(review): presumably a "keep ASCII letters" flag; confirm against SequenceModel in nsSBCharSetProber.h. */
+ "WINDOWS-1252", /* Charset name. */
+ "en" /* Language code. */
+};
+
+const LanguageModel EnglishModel = /* Charset-independent English model keyed by Unicode code point. */
+{
+ "en", /* Language code. */
+ Unicode_CharOrder, /* Flattened (code point, order) pairs defined above. */
+ 117, /* Number of pairs in Unicode_CharOrder; same value as Unicode_Char_size. */
+ EnglishLangModel, /* Sequence table shared with the single-byte English models. */
+ 59, /* Number of ordered characters covered by EnglishLangModel (59 x 59 = 3481 sequences). */
+ (float)0.9999999999999989, /* Matches the accumulated ratio of the first 59 characters reported in LangEnglishModel.log. */
+};
diff --git a/src/nsLanguageDetector.h b/src/nsLanguageDetector.h
index 8a5c74d..30b935a 100644
--- a/src/nsLanguageDetector.h
+++ b/src/nsLanguageDetector.h
@@ -116,6 +116,7 @@ extern const LanguageModel ArabicModel;
extern const LanguageModel CroatianModel;
extern const LanguageModel CzechModel;
extern const LanguageModel DanishModel;
+extern const LanguageModel EnglishModel;
extern const LanguageModel EsperantoModel;
extern const LanguageModel EstonianModel;
extern const LanguageModel FinnishModel;
diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp
index 9aeb2f7..51c268f 100644
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@@ -96,6 +96,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
langDetectors[i][j++] = new nsLanguageDetector(&CroatianModel);
langDetectors[i][j++] = new nsLanguageDetector(&CzechModel);
langDetectors[i][j++] = new nsLanguageDetector(&DanishModel);
+ langDetectors[i][j++] = new nsLanguageDetector(&EnglishModel);
langDetectors[i][j++] = new nsLanguageDetector(&EsperantoModel);
langDetectors[i][j++] = new nsLanguageDetector(&EstonianModel);
langDetectors[i][j++] = new nsLanguageDetector(&FinnishModel);
diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h
index a6bfc59..9596ac0 100644
--- a/src/nsMBCSGroupProber.h
+++ b/src/nsMBCSGroupProber.h
@@ -49,7 +49,7 @@
#include "nsEUCTWProber.h"
#define NUM_OF_PROBERS 8
-#define NUM_OF_LANGUAGES 29
+#define NUM_OF_LANGUAGES 30
class nsMBCSGroupProber: public nsCharSetProber {
public:
diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp
index ca54911..fd4f2d6 100644
--- a/src/nsSBCSGroupProber.cpp
+++ b/src/nsSBCSGroupProber.cpp
@@ -197,6 +197,9 @@ nsSBCSGroupProber::nsSBCSGroupProber()
 mProbers[103] = new nsSingleByteCharSetProber(&Windows_1252NorwegianModel);
 mProbers[104] = new nsSingleByteCharSetProber(&Ibm865NorwegianModel);
+ mProbers[105] = new nsSingleByteCharSetProber(&Iso_8859_1EnglishModel); /* Slot 104 already holds the IBM865 Norwegian prober; reusing it would leak and drop that prober. */
+ mProbers[106] = new nsSingleByteCharSetProber(&Windows_1252EnglishModel); /* Highest index used: NUM_OF_SBCS_PROBERS must be 107. */
+
 Reset();
 }
diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h
index 96f8c22..a68a2a4 100644
--- a/src/nsSBCSGroupProber.h
+++ b/src/nsSBCSGroupProber.h
@@ -40,7 +40,7 @@
 #define nsSBCSGroupProber_h__
-#define NUM_OF_SBCS_PROBERS 105
+#define NUM_OF_SBCS_PROBERS 107
 class nsCharSetProber;
 class nsSBCSGroupProber: public nsCharSetProber {
diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h
index f6173a8..2fad476 100644
--- a/src/nsSBCharSetProber.h
+++ b/src/nsSBCharSetProber.h
@@ -179,6 +179,9 @@ extern const SequenceModel Iso_8859_1DanishModel;
extern const SequenceModel Windows_1252DanishModel;
extern const SequenceModel Ibm865DanishModel;
+extern const SequenceModel Iso_8859_1EnglishModel;
+extern const SequenceModel Windows_1252EnglishModel;
+
extern const SequenceModel Iso_8859_13LithuanianModel;
extern const SequenceModel Iso_8859_10LithuanianModel;
extern const SequenceModel Iso_8859_4LithuanianModel;