summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJehan <jehan@girinstud.io>2020-04-23 18:24:12 +0200
committerJehan <jehan@girinstud.io>2020-04-23 18:39:49 +0200
commit4e967c9e882b8857e3a059698768cb0b2bff14a6 (patch)
tree8327c6dae337e25573fcbcfa0f1e85707b80ca23
parent94736d1565c06e8a871e8cd724b2c812672d7365 (diff)
src: new API to get the detected language.
This doesn't work for all probers yet, in particular not for the most generic probers (such as UTF-8) or WINDOWS-1252. These will return NULL. It's still a good first step. Right now, it returns the 2-character language code from ISO 639-1. A project using this library could easily get the English language name from the XML/JSON files provided by the iso-codes project. The iso-codes project also makes it easy to localize the language name into other languages through gettext (this is what we do in GIMP, for instance). I don't add any dependency though, and leave it to downstream projects to implement this. I was also wondering whether we want to support region information in cases where it would make sense. I especially wondered about it for Chinese encodings, as some of them seem quite specific to a region (according to Wikipedia at least). For the time being though, these just return "zh". We'll see later if it makes sense to be more accurate (maybe depending on reports?).
-rw-r--r--src/LangModels/LangArabicModel.cpp6
-rw-r--r--src/LangModels/LangBulgarianModel.cpp6
-rw-r--r--src/LangModels/LangCroatianModel.cpp18
-rw-r--r--src/LangModels/LangCzechModel.cpp12
-rw-r--r--src/LangModels/LangDanishModel.cpp9
-rw-r--r--src/LangModels/LangEsperantoModel.cpp3
-rw-r--r--src/LangModels/LangEstonianModel.cpp15
-rw-r--r--src/LangModels/LangFinnishModel.cpp18
-rw-r--r--src/LangModels/LangFrenchModel.cpp9
-rw-r--r--src/LangModels/LangGermanModel.cpp6
-rw-r--r--src/LangModels/LangGreekModel.cpp6
-rw-r--r--src/LangModels/LangHebrewModel.cpp4
-rw-r--r--src/LangModels/LangHungarianModel.cpp6
-rw-r--r--src/LangModels/LangIrishModel.cpp12
-rw-r--r--src/LangModels/LangItalianModel.cpp15
-rw-r--r--src/LangModels/LangLatvianModel.cpp9
-rw-r--r--src/LangModels/LangLithuanianModel.cpp9
-rw-r--r--src/LangModels/LangMalteseModel.cpp3
-rw-r--r--src/LangModels/LangPolishModel.cpp18
-rw-r--r--src/LangModels/LangPortugueseModel.cpp12
-rw-r--r--src/LangModels/LangRomanianModel.cpp12
-rw-r--r--src/LangModels/LangRussianModel.cpp18
-rw-r--r--src/LangModels/LangSlovakModel.cpp12
-rw-r--r--src/LangModels/LangSloveneModel.cpp15
-rw-r--r--src/LangModels/LangSpanishModel.cpp9
-rw-r--r--src/LangModels/LangSwedishModel.cpp15
-rw-r--r--src/LangModels/LangThaiModel.cpp6
-rw-r--r--src/LangModels/LangTurkishModel.cpp6
-rw-r--r--src/LangModels/LangVietnameseModel.cpp6
-rw-r--r--src/nsBig5Prober.h1
-rw-r--r--src/nsCharSetProber.h1
-rw-r--r--src/nsEUCJPProber.h1
-rw-r--r--src/nsEUCKRProber.h1
-rw-r--r--src/nsEUCTWProber.h1
-rw-r--r--src/nsEscCharsetProber.h3
-rw-r--r--src/nsGB2312Prober.h1
-rw-r--r--src/nsHebrewProber.h3
-rw-r--r--src/nsLatin1Prober.h3
-rw-r--r--src/nsMBCSGroupProber.cpp12
-rw-r--r--src/nsMBCSGroupProber.h1
-rw-r--r--src/nsSBCSGroupProber.cpp11
-rw-r--r--src/nsSBCSGroupProber.h1
-rw-r--r--src/nsSBCharSetProber.cpp7
-rw-r--r--src/nsSBCharSetProber.h2
-rw-r--r--src/nsSJISProber.h1
-rw-r--r--src/nsUTF8Prober.h2
-rw-r--r--src/nsUniversalDetector.cpp6
-rw-r--r--src/nsUniversalDetector.h3
-rw-r--r--src/symbols.cmake1
-rw-r--r--src/uchardet.cpp21
-rw-r--r--src/uchardet.h2
51 files changed, 276 insertions, 104 deletions
diff --git a/src/LangModels/LangArabicModel.cpp b/src/LangModels/LangArabicModel.cpp
index 6ac80f3..0a6d654 100644
--- a/src/LangModels/LangArabicModel.cpp
+++ b/src/LangModels/LangArabicModel.cpp
@@ -251,7 +251,8 @@ const SequenceModel Iso_8859_6ArabicModel =
64,
(float)0.9696025116913417,
PR_FALSE,
- "ISO-8859-6"
+ "ISO-8859-6",
+ "ar"
};
const SequenceModel Windows_1256ArabicModel =
@@ -261,5 +262,6 @@ const SequenceModel Windows_1256ArabicModel =
64,
(float)0.9696025116913417,
PR_FALSE,
- "WINDOWS-1256"
+ "WINDOWS-1256",
+ "ar"
};
diff --git a/src/LangModels/LangBulgarianModel.cpp b/src/LangModels/LangBulgarianModel.cpp
index 18c58ee..1120054 100644
--- a/src/LangModels/LangBulgarianModel.cpp
+++ b/src/LangModels/LangBulgarianModel.cpp
@@ -233,7 +233,8 @@ const SequenceModel Latin5BulgarianModel =
64,
(float)0.969392,
PR_FALSE,
- "ISO-8859-5"
+ "ISO-8859-5",
+ "bg"
};
const SequenceModel Win1251BulgarianModel =
@@ -243,5 +244,6 @@ const SequenceModel Win1251BulgarianModel =
64,
(float)0.969392,
PR_FALSE,
- "WINDOWS-1251"
+ "WINDOWS-1251",
+ "bg"
};
diff --git a/src/LangModels/LangCroatianModel.cpp b/src/LangModels/LangCroatianModel.cpp
index 58f882e..961bd0e 100644
--- a/src/LangModels/LangCroatianModel.cpp
+++ b/src/LangModels/LangCroatianModel.cpp
@@ -238,7 +238,8 @@ const SequenceModel Windows_1250CroatianModel =
31,
(float)0.9989731099787131,
PR_TRUE,
- "WINDOWS-1250"
+ "WINDOWS-1250",
+ "hr"
};
const SequenceModel Iso_8859_2CroatianModel =
@@ -248,7 +249,8 @@ const SequenceModel Iso_8859_2CroatianModel =
31,
(float)0.9989731099787131,
PR_TRUE,
- "ISO-8859-2"
+ "ISO-8859-2",
+ "hr"
};
const SequenceModel Iso_8859_16CroatianModel =
@@ -258,7 +260,8 @@ const SequenceModel Iso_8859_16CroatianModel =
31,
(float)0.9989731099787131,
PR_TRUE,
- "ISO-8859-16"
+ "ISO-8859-16",
+ "hr"
};
const SequenceModel Mac_CentraleuropeCroatianModel =
@@ -268,7 +271,8 @@ const SequenceModel Mac_CentraleuropeCroatianModel =
31,
(float)0.9989731099787131,
PR_TRUE,
- "MAC-CENTRALEUROPE"
+ "MAC-CENTRALEUROPE",
+ "hr"
};
const SequenceModel Iso_8859_13CroatianModel =
@@ -278,7 +282,8 @@ const SequenceModel Iso_8859_13CroatianModel =
31,
(float)0.9989731099787131,
PR_TRUE,
- "ISO-8859-13"
+ "ISO-8859-13",
+ "hr"
};
const SequenceModel Ibm852CroatianModel =
@@ -288,5 +293,6 @@ const SequenceModel Ibm852CroatianModel =
31,
(float)0.9989731099787131,
PR_TRUE,
- "IBM852"
+ "IBM852",
+ "hr"
};
diff --git a/src/LangModels/LangCzechModel.cpp b/src/LangModels/LangCzechModel.cpp
index 2557376..c12c07e 100644
--- a/src/LangModels/LangCzechModel.cpp
+++ b/src/LangModels/LangCzechModel.cpp
@@ -247,7 +247,8 @@ const SequenceModel Windows_1250CzechModel =
41,
(float)0.9786035192432675,
PR_TRUE,
- "WINDOWS-1250"
+ "WINDOWS-1250",
+ "cs"
};
const SequenceModel Mac_CentraleuropeCzechModel =
@@ -257,7 +258,8 @@ const SequenceModel Mac_CentraleuropeCzechModel =
41,
(float)0.9786035192432675,
PR_TRUE,
- "MAC-CENTRALEUROPE"
+ "MAC-CENTRALEUROPE",
+ "cs"
};
const SequenceModel Ibm852CzechModel =
@@ -267,7 +269,8 @@ const SequenceModel Ibm852CzechModel =
41,
(float)0.9786035192432675,
PR_TRUE,
- "IBM852"
+ "IBM852",
+ "cs"
};
const SequenceModel Iso_8859_2CzechModel =
@@ -277,5 +280,6 @@ const SequenceModel Iso_8859_2CzechModel =
41,
(float)0.9786035192432675,
PR_TRUE,
- "ISO-8859-2"
+ "ISO-8859-2",
+ "cs"
};
diff --git a/src/LangModels/LangDanishModel.cpp b/src/LangModels/LangDanishModel.cpp
index c60e7b2..cb99e9b 100644
--- a/src/LangModels/LangDanishModel.cpp
+++ b/src/LangModels/LangDanishModel.cpp
@@ -174,7 +174,8 @@ const SequenceModel Iso_8859_15DanishModel =
30,
(float)0.9968082796759031,
PR_TRUE,
- "ISO-8859-15"
+ "ISO-8859-15",
+ "da"
};
const SequenceModel Iso_8859_1DanishModel =
@@ -184,7 +185,8 @@ const SequenceModel Iso_8859_1DanishModel =
30,
(float)0.9968082796759031,
PR_TRUE,
- "ISO-8859-1"
+ "ISO-8859-1",
+ "da"
};
const SequenceModel Windows_1252DanishModel =
@@ -194,5 +196,6 @@ const SequenceModel Windows_1252DanishModel =
30,
(float)0.9968082796759031,
PR_TRUE,
- "WINDOWS-1252"
+ "WINDOWS-1252",
+ "da"
};
diff --git a/src/LangModels/LangEsperantoModel.cpp b/src/LangModels/LangEsperantoModel.cpp
index 0884cd2..1d55ec7 100644
--- a/src/LangModels/LangEsperantoModel.cpp
+++ b/src/LangModels/LangEsperantoModel.cpp
@@ -137,5 +137,6 @@ const SequenceModel Iso_8859_3EsperantoModel =
35,
(float)0.9942980632768038,
PR_FALSE,
- "ISO-8859-3"
+ "ISO-8859-3",
+ "eo"
};
diff --git a/src/LangModels/LangEstonianModel.cpp b/src/LangModels/LangEstonianModel.cpp
index c5fa9b3..71d9c66 100644
--- a/src/LangModels/LangEstonianModel.cpp
+++ b/src/LangModels/LangEstonianModel.cpp
@@ -219,7 +219,8 @@ const SequenceModel Iso_8859_4EstonianModel =
33,
(float)0.9972721312183132,
PR_TRUE,
- "ISO-8859-4"
+ "ISO-8859-4",
+ "et"
};
const SequenceModel Windows_1252EstonianModel =
@@ -229,7 +230,8 @@ const SequenceModel Windows_1252EstonianModel =
33,
(float)0.9972721312183132,
PR_TRUE,
- "WINDOWS-1252"
+ "WINDOWS-1252",
+ "et"
};
const SequenceModel Iso_8859_15EstonianModel =
@@ -239,7 +241,8 @@ const SequenceModel Iso_8859_15EstonianModel =
33,
(float)0.9972721312183132,
PR_TRUE,
- "ISO-8859-15"
+ "ISO-8859-15",
+ "et"
};
const SequenceModel Iso_8859_13EstonianModel =
@@ -249,7 +252,8 @@ const SequenceModel Iso_8859_13EstonianModel =
33,
(float)0.9972721312183132,
PR_TRUE,
- "ISO-8859-13"
+ "ISO-8859-13",
+ "et"
};
const SequenceModel Windows_1257EstonianModel =
@@ -259,5 +263,6 @@ const SequenceModel Windows_1257EstonianModel =
33,
(float)0.9972721312183132,
PR_TRUE,
- "WINDOWS-1257"
+ "WINDOWS-1257",
+ "et"
};
diff --git a/src/LangModels/LangFinnishModel.cpp b/src/LangModels/LangFinnishModel.cpp
index ee91e14..cbc9528 100644
--- a/src/LangModels/LangFinnishModel.cpp
+++ b/src/LangModels/LangFinnishModel.cpp
@@ -237,7 +237,8 @@ const SequenceModel Iso_8859_15FinnishModel =
30,
(float)0.9985378147555799,
PR_TRUE,
- "ISO-8859-15"
+ "ISO-8859-15",
+ "fi"
};
const SequenceModel Windows_1252FinnishModel =
@@ -247,7 +248,8 @@ const SequenceModel Windows_1252FinnishModel =
30,
(float)0.9985378147555799,
PR_TRUE,
- "WINDOWS-1252"
+ "WINDOWS-1252",
+ "fi"
};
const SequenceModel Iso_8859_4FinnishModel =
@@ -257,7 +259,8 @@ const SequenceModel Iso_8859_4FinnishModel =
30,
(float)0.9985378147555799,
PR_TRUE,
- "ISO-8859-4"
+ "ISO-8859-4",
+ "fi"
};
const SequenceModel Iso_8859_13FinnishModel =
@@ -267,7 +270,8 @@ const SequenceModel Iso_8859_13FinnishModel =
30,
(float)0.9985378147555799,
PR_TRUE,
- "ISO-8859-13"
+ "ISO-8859-13",
+ "fi"
};
const SequenceModel Iso_8859_9FinnishModel =
@@ -277,7 +281,8 @@ const SequenceModel Iso_8859_9FinnishModel =
30,
(float)0.9985378147555799,
PR_TRUE,
- "ISO-8859-9"
+ "ISO-8859-9",
+ "fi"
};
const SequenceModel Iso_8859_1FinnishModel =
@@ -287,5 +292,6 @@ const SequenceModel Iso_8859_1FinnishModel =
30,
(float)0.9985378147555799,
PR_TRUE,
- "ISO-8859-1"
+ "ISO-8859-1",
+ "fi"
};
diff --git a/src/LangModels/LangFrenchModel.cpp b/src/LangModels/LangFrenchModel.cpp
index cd458cb..5baaf31 100644
--- a/src/LangModels/LangFrenchModel.cpp
+++ b/src/LangModels/LangFrenchModel.cpp
@@ -182,7 +182,8 @@ const SequenceModel Windows_1252FrenchModel =
38,
(float)0.997057879992383,
PR_TRUE,
- "WINDOWS-1252"
+ "WINDOWS-1252",
+ "fr"
};
const SequenceModel Iso_8859_1FrenchModel =
@@ -192,7 +193,8 @@ const SequenceModel Iso_8859_1FrenchModel =
38,
(float)0.997057879992383,
PR_TRUE,
- "ISO-8859-1"
+ "ISO-8859-1",
+ "fr"
};
const SequenceModel Iso_8859_15FrenchModel =
@@ -202,5 +204,6 @@ const SequenceModel Iso_8859_15FrenchModel =
38,
(float)0.997057879992383,
PR_TRUE,
- "ISO-8859-15"
+ "ISO-8859-15",
+ "fr"
};
diff --git a/src/LangModels/LangGermanModel.cpp b/src/LangModels/LangGermanModel.cpp
index feeda8e..dd4228c 100644
--- a/src/LangModels/LangGermanModel.cpp
+++ b/src/LangModels/LangGermanModel.cpp
@@ -154,7 +154,8 @@ const SequenceModel Windows_1252GermanModel =
31,
(float)0.9934041448127945,
PR_TRUE,
- "WINDOWS-1252"
+ "WINDOWS-1252",
+ "de"
};
const SequenceModel Iso_8859_1GermanModel =
@@ -164,5 +165,6 @@ const SequenceModel Iso_8859_1GermanModel =
31,
(float)0.9934041448127945,
PR_TRUE,
- "ISO-8859-1"
+ "ISO-8859-1",
+ "de"
};
diff --git a/src/LangModels/LangGreekModel.cpp b/src/LangModels/LangGreekModel.cpp
index 499affe..28951e6 100644
--- a/src/LangModels/LangGreekModel.cpp
+++ b/src/LangModels/LangGreekModel.cpp
@@ -215,7 +215,8 @@ const SequenceModel Windows_1253GreekModel =
46,
(float)0.958419074626211,
PR_FALSE,
- "WINDOWS-1253"
+ "WINDOWS-1253",
+ "el"
};
const SequenceModel Iso_8859_7GreekModel =
@@ -225,5 +226,6 @@ const SequenceModel Iso_8859_7GreekModel =
46,
(float)0.958419074626211,
PR_FALSE,
- "ISO-8859-7"
+ "ISO-8859-7",
+ "el"
};
diff --git a/src/LangModels/LangHebrewModel.cpp b/src/LangModels/LangHebrewModel.cpp
index af9ac2b..811c048 100644
--- a/src/LangModels/LangHebrewModel.cpp
+++ b/src/LangModels/LangHebrewModel.cpp
@@ -215,6 +215,6 @@ const SequenceModel Win1255Model =
64,
(float)0.984004,
PR_FALSE,
- "WINDOWS-1255"
+ "WINDOWS-1255",
+ "he"
};
-
diff --git a/src/LangModels/LangHungarianModel.cpp b/src/LangModels/LangHungarianModel.cpp
index 83e6eaa..22f0de6 100644
--- a/src/LangModels/LangHungarianModel.cpp
+++ b/src/LangModels/LangHungarianModel.cpp
@@ -155,7 +155,8 @@ const SequenceModel Iso_8859_2HungarianModel =
32,
(float)0.9748272224933486,
PR_FALSE,
- "ISO-8859-2"
+ "ISO-8859-2",
+ "hu"
};
const SequenceModel Windows_1250HungarianModel =
@@ -165,5 +166,6 @@ const SequenceModel Windows_1250HungarianModel =
32,
(float)0.9748272224933486,
PR_FALSE,
- "WINDOWS-1250"
+ "WINDOWS-1250",
+ "hu"
};
diff --git a/src/LangModels/LangIrishModel.cpp b/src/LangModels/LangIrishModel.cpp
index af3a16d..bbd9500 100644
--- a/src/LangModels/LangIrishModel.cpp
+++ b/src/LangModels/LangIrishModel.cpp
@@ -196,7 +196,8 @@ const SequenceModel Iso_8859_1IrishModel =
31,
(float)0.9974076651249096,
PR_TRUE,
- "ISO-8859-1"
+ "ISO-8859-1",
+ "ga"
};
const SequenceModel Windows_1252IrishModel =
@@ -206,7 +207,8 @@ const SequenceModel Windows_1252IrishModel =
31,
(float)0.9974076651249096,
PR_TRUE,
- "WINDOWS-1252"
+ "WINDOWS-1252",
+ "ga"
};
const SequenceModel Iso_8859_15IrishModel =
@@ -216,7 +218,8 @@ const SequenceModel Iso_8859_15IrishModel =
31,
(float)0.9974076651249096,
PR_TRUE,
- "ISO-8859-15"
+ "ISO-8859-15",
+ "ga"
};
const SequenceModel Iso_8859_9IrishModel =
@@ -226,5 +229,6 @@ const SequenceModel Iso_8859_9IrishModel =
31,
(float)0.9974076651249096,
PR_TRUE,
- "ISO-8859-9"
+ "ISO-8859-9",
+ "ga"
};
diff --git a/src/LangModels/LangItalianModel.cpp b/src/LangModels/LangItalianModel.cpp
index 0a9565c..4bb5dc5 100644
--- a/src/LangModels/LangItalianModel.cpp
+++ b/src/LangModels/LangItalianModel.cpp
@@ -220,7 +220,8 @@ const SequenceModel Iso_8859_3ItalianModel =
34,
(float)0.9989484485502651,
PR_TRUE,
- "ISO-8859-3"
+ "ISO-8859-3",
+ "it"
};
const SequenceModel Iso_8859_15ItalianModel =
@@ -230,7 +231,8 @@ const SequenceModel Iso_8859_15ItalianModel =
34,
(float)0.9989484485502651,
PR_TRUE,
- "ISO-8859-15"
+ "ISO-8859-15",
+ "it"
};
const SequenceModel Iso_8859_9ItalianModel =
@@ -240,7 +242,8 @@ const SequenceModel Iso_8859_9ItalianModel =
34,
(float)0.9989484485502651,
PR_TRUE,
- "ISO-8859-9"
+ "ISO-8859-9",
+ "it"
};
const SequenceModel Iso_8859_1ItalianModel =
@@ -250,7 +253,8 @@ const SequenceModel Iso_8859_1ItalianModel =
34,
(float)0.9989484485502651,
PR_TRUE,
- "ISO-8859-1"
+ "ISO-8859-1",
+ "it"
};
const SequenceModel Windows_1252ItalianModel =
@@ -260,5 +264,6 @@ const SequenceModel Windows_1252ItalianModel =
34,
(float)0.9989484485502651,
PR_TRUE,
- "WINDOWS-1252"
+ "WINDOWS-1252",
+ "it"
};
diff --git a/src/LangModels/LangLatvianModel.cpp b/src/LangModels/LangLatvianModel.cpp
index b62d414..fcccc82 100644
--- a/src/LangModels/LangLatvianModel.cpp
+++ b/src/LangModels/LangLatvianModel.cpp
@@ -183,7 +183,8 @@ const SequenceModel Iso_8859_4LatvianModel =
39,
(float)0.9904102202220861,
PR_TRUE,
- "ISO-8859-4"
+ "ISO-8859-4",
+ "lv"
};
const SequenceModel Iso_8859_10LatvianModel =
@@ -193,7 +194,8 @@ const SequenceModel Iso_8859_10LatvianModel =
39,
(float)0.9904102202220861,
PR_TRUE,
- "ISO-8859-10"
+ "ISO-8859-10",
+ "lv"
};
const SequenceModel Iso_8859_13LatvianModel =
@@ -203,5 +205,6 @@ const SequenceModel Iso_8859_13LatvianModel =
39,
(float)0.9904102202220861,
PR_TRUE,
- "ISO-8859-13"
+ "ISO-8859-13",
+ "lv"
};
diff --git a/src/LangModels/LangLithuanianModel.cpp b/src/LangModels/LangLithuanianModel.cpp
index af65db3..686014a 100644
--- a/src/LangModels/LangLithuanianModel.cpp
+++ b/src/LangModels/LangLithuanianModel.cpp
@@ -182,7 +182,8 @@ const SequenceModel Iso_8859_10LithuanianModel =
38,
(float)0.9928710196247589,
PR_TRUE,
- "ISO-8859-10"
+ "ISO-8859-10",
+ "lt"
};
const SequenceModel Iso_8859_4LithuanianModel =
@@ -192,7 +193,8 @@ const SequenceModel Iso_8859_4LithuanianModel =
38,
(float)0.9928710196247589,
PR_TRUE,
- "ISO-8859-4"
+ "ISO-8859-4",
+ "lt"
};
const SequenceModel Iso_8859_13LithuanianModel =
@@ -202,5 +204,6 @@ const SequenceModel Iso_8859_13LithuanianModel =
38,
(float)0.9928710196247589,
PR_TRUE,
- "ISO-8859-13"
+ "ISO-8859-13",
+ "lt"
};
diff --git a/src/LangModels/LangMalteseModel.cpp b/src/LangModels/LangMalteseModel.cpp
index dd82ef6..e253539 100644
--- a/src/LangModels/LangMalteseModel.cpp
+++ b/src/LangModels/LangMalteseModel.cpp
@@ -133,5 +133,6 @@ const SequenceModel Iso_8859_3MalteseModel =
31,
(float)0.9959115850692665,
PR_TRUE,
- "ISO-8859-3"
+ "ISO-8859-3",
+ "mt"
};
diff --git a/src/LangModels/LangPolishModel.cpp b/src/LangModels/LangPolishModel.cpp
index cb62bdc..38791de 100644
--- a/src/LangModels/LangPolishModel.cpp
+++ b/src/LangModels/LangPolishModel.cpp
@@ -244,7 +244,8 @@ const SequenceModel Ibm852PolishModel =
37,
(float)0.9894531815946438,
PR_TRUE,
- "IBM852"
+ "IBM852",
+ "pl"
};
const SequenceModel Iso_8859_16PolishModel =
@@ -254,7 +255,8 @@ const SequenceModel Iso_8859_16PolishModel =
37,
(float)0.9894531815946438,
PR_TRUE,
- "ISO-8859-16"
+ "ISO-8859-16",
+ "pl"
};
const SequenceModel Iso_8859_2PolishModel =
@@ -264,7 +266,8 @@ const SequenceModel Iso_8859_2PolishModel =
37,
(float)0.9894531815946438,
PR_TRUE,
- "ISO-8859-2"
+ "ISO-8859-2",
+ "pl"
};
const SequenceModel Mac_CentraleuropePolishModel =
@@ -274,7 +277,8 @@ const SequenceModel Mac_CentraleuropePolishModel =
37,
(float)0.9894531815946438,
PR_TRUE,
- "MAC-CENTRALEUROPE"
+ "MAC-CENTRALEUROPE",
+ "pl"
};
const SequenceModel Iso_8859_13PolishModel =
@@ -284,7 +288,8 @@ const SequenceModel Iso_8859_13PolishModel =
37,
(float)0.9894531815946438,
PR_TRUE,
- "ISO-8859-13"
+ "ISO-8859-13",
+ "pl"
};
const SequenceModel Windows_1250PolishModel =
@@ -294,5 +299,6 @@ const SequenceModel Windows_1250PolishModel =
37,
(float)0.9894531815946438,
PR_TRUE,
- "WINDOWS-1250"
+ "WINDOWS-1250",
+ "pl"
};
diff --git a/src/LangModels/LangPortugueseModel.cpp b/src/LangModels/LangPortugueseModel.cpp
index 8d4bc4a..0b2dd1b 100644
--- a/src/LangModels/LangPortugueseModel.cpp
+++ b/src/LangModels/LangPortugueseModel.cpp
@@ -203,7 +203,8 @@ const SequenceModel Iso_8859_1PortugueseModel =
38,
(float)0.9953179582313172,
PR_TRUE,
- "ISO-8859-1"
+ "ISO-8859-1",
+ "pt"
};
const SequenceModel Iso_8859_9PortugueseModel =
@@ -213,7 +214,8 @@ const SequenceModel Iso_8859_9PortugueseModel =
38,
(float)0.9953179582313172,
PR_TRUE,
- "ISO-8859-9"
+ "ISO-8859-9",
+ "pt"
};
const SequenceModel Iso_8859_15PortugueseModel =
@@ -223,7 +225,8 @@ const SequenceModel Iso_8859_15PortugueseModel =
38,
(float)0.9953179582313172,
PR_TRUE,
- "ISO-8859-15"
+ "ISO-8859-15",
+ "pt"
};
const SequenceModel Windows_1252PortugueseModel =
@@ -233,5 +236,6 @@ const SequenceModel Windows_1252PortugueseModel =
38,
(float)0.9953179582313172,
PR_TRUE,
- "WINDOWS-1252"
+ "WINDOWS-1252",
+ "pt"
};
diff --git a/src/LangModels/LangRomanianModel.cpp b/src/LangModels/LangRomanianModel.cpp
index 154c03f..cfb1b8d 100644
--- a/src/LangModels/LangRomanianModel.cpp
+++ b/src/LangModels/LangRomanianModel.cpp
@@ -198,7 +198,8 @@ const SequenceModel Iso_8859_16RomanianModel =
33,
(float)0.997762564143313,
PR_TRUE,
- "ISO-8859-16"
+ "ISO-8859-16",
+ "ro"
};
const SequenceModel Iso_8859_2RomanianModel =
@@ -208,7 +209,8 @@ const SequenceModel Iso_8859_2RomanianModel =
33,
(float)0.997762564143313,
PR_TRUE,
- "ISO-8859-2"
+ "ISO-8859-2",
+ "ro"
};
const SequenceModel Windows_1250RomanianModel =
@@ -218,7 +220,8 @@ const SequenceModel Windows_1250RomanianModel =
33,
(float)0.997762564143313,
PR_TRUE,
- "WINDOWS-1250"
+ "WINDOWS-1250",
+ "ro"
};
const SequenceModel Ibm852RomanianModel =
@@ -228,5 +231,6 @@ const SequenceModel Ibm852RomanianModel =
33,
(float)0.997762564143313,
PR_TRUE,
- "IBM852"
+ "IBM852",
+ "ro"
};
diff --git a/src/LangModels/LangRussianModel.cpp b/src/LangModels/LangRussianModel.cpp
index a532049..50631df 100644
--- a/src/LangModels/LangRussianModel.cpp
+++ b/src/LangModels/LangRussianModel.cpp
@@ -307,7 +307,8 @@ const SequenceModel Koi8rRussianModel =
64,
(float)0.976601,
PR_FALSE,
- "KOI8-R"
+ "KOI8-R",
+ "ru"
};
const SequenceModel Win1251RussianModel =
@@ -317,7 +318,8 @@ const SequenceModel Win1251RussianModel =
64,
(float)0.976601,
PR_FALSE,
- "WINDOWS-1251"
+ "WINDOWS-1251",
+ "ru"
};
const SequenceModel Latin5RussianModel =
@@ -327,7 +329,8 @@ const SequenceModel Latin5RussianModel =
64,
(float)0.976601,
PR_FALSE,
- "ISO-8859-5"
+ "ISO-8859-5",
+ "ru"
};
const SequenceModel MacCyrillicRussianModel =
@@ -337,7 +340,8 @@ const SequenceModel MacCyrillicRussianModel =
64,
(float)0.976601,
PR_FALSE,
- "MAC-CYRILLIC"
+ "MAC-CYRILLIC",
+ "ru"
};
const SequenceModel Ibm866RussianModel =
@@ -347,7 +351,8 @@ const SequenceModel Ibm866RussianModel =
64,
(float)0.976601,
PR_FALSE,
- "IBM866"
+ "IBM866",
+ "ru"
};
const SequenceModel Ibm855RussianModel =
@@ -357,5 +362,6 @@ const SequenceModel Ibm855RussianModel =
64,
(float)0.976601,
PR_FALSE,
- "IBM855"
+ "IBM855",
+ "ru"
};
diff --git a/src/LangModels/LangSlovakModel.cpp b/src/LangModels/LangSlovakModel.cpp
index cfa94aa..480b4b5 100644
--- a/src/LangModels/LangSlovakModel.cpp
+++ b/src/LangModels/LangSlovakModel.cpp
@@ -255,7 +255,8 @@ const SequenceModel Ibm852SlovakModel =
45,
(float)0.9733303573968434,
PR_TRUE,
- "IBM852"
+ "IBM852",
+ "sk"
};
const SequenceModel Iso_8859_2SlovakModel =
@@ -265,7 +266,8 @@ const SequenceModel Iso_8859_2SlovakModel =
45,
(float)0.9733303573968434,
PR_TRUE,
- "ISO-8859-2"
+ "ISO-8859-2",
+ "sk"
};
const SequenceModel Mac_CentraleuropeSlovakModel =
@@ -275,7 +277,8 @@ const SequenceModel Mac_CentraleuropeSlovakModel =
45,
(float)0.9733303573968434,
PR_TRUE,
- "MAC-CENTRALEUROPE"
+ "MAC-CENTRALEUROPE",
+ "sk"
};
const SequenceModel Windows_1250SlovakModel =
@@ -285,5 +288,6 @@ const SequenceModel Windows_1250SlovakModel =
45,
(float)0.9733303573968434,
PR_TRUE,
- "WINDOWS-1250"
+ "WINDOWS-1250",
+ "sk"
};
diff --git a/src/LangModels/LangSloveneModel.cpp b/src/LangModels/LangSloveneModel.cpp
index da28d86..160f054 100644
--- a/src/LangModels/LangSloveneModel.cpp
+++ b/src/LangModels/LangSloveneModel.cpp
@@ -215,7 +215,8 @@ const SequenceModel Iso_8859_2SloveneModel =
29,
(float)0.9983524317161332,
PR_TRUE,
- "ISO-8859-2"
+ "ISO-8859-2",
+ "sl"
};
const SequenceModel Iso_8859_16SloveneModel =
@@ -225,7 +226,8 @@ const SequenceModel Iso_8859_16SloveneModel =
29,
(float)0.9983524317161332,
PR_TRUE,
- "ISO-8859-16"
+ "ISO-8859-16",
+ "sl"
};
const SequenceModel Windows_1250SloveneModel =
@@ -235,7 +237,8 @@ const SequenceModel Windows_1250SloveneModel =
29,
(float)0.9983524317161332,
PR_TRUE,
- "WINDOWS-1250"
+ "WINDOWS-1250",
+ "sl"
};
const SequenceModel Mac_CentraleuropeSloveneModel =
@@ -245,7 +248,8 @@ const SequenceModel Mac_CentraleuropeSloveneModel =
29,
(float)0.9983524317161332,
PR_TRUE,
- "MAC-CENTRALEUROPE"
+ "MAC-CENTRALEUROPE",
+ "sl"
};
const SequenceModel Ibm852SloveneModel =
@@ -255,5 +259,6 @@ const SequenceModel Ibm852SloveneModel =
29,
(float)0.9983524317161332,
PR_TRUE,
- "IBM852"
+ "IBM852",
+ "sl"
};
diff --git a/src/LangModels/LangSpanishModel.cpp b/src/LangModels/LangSpanishModel.cpp
index 18c400a..6c3f3a9 100644
--- a/src/LangModels/LangSpanishModel.cpp
+++ b/src/LangModels/LangSpanishModel.cpp
@@ -177,7 +177,8 @@ const SequenceModel Iso_8859_1SpanishModel =
33,
(float)0.9970385677528184,
PR_TRUE,
- "ISO-8859-1"
+ "ISO-8859-1",
+ "es"
};
const SequenceModel Iso_8859_15SpanishModel =
@@ -187,7 +188,8 @@ const SequenceModel Iso_8859_15SpanishModel =
33,
(float)0.9970385677528184,
PR_TRUE,
- "ISO-8859-15"
+ "ISO-8859-15",
+ "es"
};
const SequenceModel Windows_1252SpanishModel =
@@ -197,5 +199,6 @@ const SequenceModel Windows_1252SpanishModel =
33,
(float)0.9970385677528184,
PR_TRUE,
- "WINDOWS-1252"
+ "WINDOWS-1252",
+ "es"
};
diff --git a/src/LangModels/LangSwedishModel.cpp b/src/LangModels/LangSwedishModel.cpp
index 0d2dadf..3dca8e8 100644
--- a/src/LangModels/LangSwedishModel.cpp
+++ b/src/LangModels/LangSwedishModel.cpp
@@ -217,7 +217,8 @@ const SequenceModel Windows_1252SwedishModel =
31,
(float)0.997323508584682,
PR_TRUE,
- "WINDOWS-1252"
+ "WINDOWS-1252",
+ "sv"
};
const SequenceModel Iso_8859_9SwedishModel =
@@ -227,7 +228,8 @@ const SequenceModel Iso_8859_9SwedishModel =
31,
(float)0.997323508584682,
PR_TRUE,
- "ISO-8859-9"
+ "ISO-8859-9",
+ "sv"
};
const SequenceModel Iso_8859_1SwedishModel =
@@ -237,7 +239,8 @@ const SequenceModel Iso_8859_1SwedishModel =
31,
(float)0.997323508584682,
PR_TRUE,
- "ISO-8859-1"
+ "ISO-8859-1",
+ "sv"
};
const SequenceModel Iso_8859_4SwedishModel =
@@ -247,7 +250,8 @@ const SequenceModel Iso_8859_4SwedishModel =
31,
(float)0.997323508584682,
PR_TRUE,
- "ISO-8859-4"
+ "ISO-8859-4",
+ "sv"
};
const SequenceModel Iso_8859_15SwedishModel =
@@ -257,5 +261,6 @@ const SequenceModel Iso_8859_15SwedishModel =
31,
(float)0.997323508584682,
PR_TRUE,
- "ISO-8859-15"
+ "ISO-8859-15",
+ "sv"
};
diff --git a/src/LangModels/LangThaiModel.cpp b/src/LangModels/LangThaiModel.cpp
index 091fb8d..9880e09 100644
--- a/src/LangModels/LangThaiModel.cpp
+++ b/src/LangModels/LangThaiModel.cpp
@@ -251,7 +251,8 @@ const SequenceModel Tis_620ThaiModel =
64,
(float)0.8815720594354438,
PR_FALSE,
- "TIS-620"
+ "TIS-620",
+ "th"
};
const SequenceModel Iso_8859_11ThaiModel =
@@ -261,5 +262,6 @@ const SequenceModel Iso_8859_11ThaiModel =
64,
(float)0.8815720594354438,
PR_FALSE,
- "ISO-8859-11"
+ "ISO-8859-11",
+ "th"
};
diff --git a/src/LangModels/LangTurkishModel.cpp b/src/LangModels/LangTurkishModel.cpp
index 71d72c5..16c133f 100644
--- a/src/LangModels/LangTurkishModel.cpp
+++ b/src/LangModels/LangTurkishModel.cpp
@@ -159,7 +159,8 @@ const SequenceModel Iso_8859_3TurkishModel =
36,
(float)0.991865243864388,
PR_FALSE,
- "ISO-8859-3"
+ "ISO-8859-3",
+ "tr"
};
const SequenceModel Iso_8859_9TurkishModel =
@@ -169,5 +170,6 @@ const SequenceModel Iso_8859_9TurkishModel =
36,
(float)0.991865243864388,
PR_FALSE,
- "ISO-8859-9"
+ "ISO-8859-9",
+ "tr"
};
diff --git a/src/LangModels/LangVietnameseModel.cpp b/src/LangModels/LangVietnameseModel.cpp
index 288a525..0569887 100644
--- a/src/LangModels/LangVietnameseModel.cpp
+++ b/src/LangModels/LangVietnameseModel.cpp
@@ -233,7 +233,8 @@ const SequenceModel Windows_1258VietnameseModel =
55,
(float)0.9321889118082535,
PR_FALSE,
- "WINDOWS-1258"
+ "WINDOWS-1258",
+ "vi"
};
const SequenceModel VisciiVietnameseModel =
@@ -243,5 +244,6 @@ const SequenceModel VisciiVietnameseModel =
55,
(float)0.9321889118082535,
PR_FALSE,
- "VISCII"
+ "VISCII",
+ "vi"
};
diff --git a/src/nsBig5Prober.h b/src/nsBig5Prober.h
index 7d13be8..4b5d9fa 100644
--- a/src/nsBig5Prober.h
+++ b/src/nsBig5Prober.h
@@ -51,6 +51,7 @@ public:
virtual ~nsBig5Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "BIG5";}
+ const char* GetLanguage() {return "zh";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
diff --git a/src/nsCharSetProber.h b/src/nsCharSetProber.h
index c078ccf..c13afb8 100644
--- a/src/nsCharSetProber.h
+++ b/src/nsCharSetProber.h
@@ -54,6 +54,7 @@ class nsCharSetProber {
public:
virtual ~nsCharSetProber() {}
virtual const char* GetCharSetName() = 0;
+ virtual const char* GetLanguage() = 0;
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0;
virtual nsProbingState GetState(void) = 0;
virtual void Reset(void) = 0;
diff --git a/src/nsEUCJPProber.h b/src/nsEUCJPProber.h
index a7a2f51..a74c779 100644
--- a/src/nsEUCJPProber.h
+++ b/src/nsEUCJPProber.h
@@ -57,6 +57,7 @@ public:
virtual ~nsEUCJPProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "EUC-JP";}
+ const char* GetLanguage() {return "ja";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
diff --git a/src/nsEUCKRProber.h b/src/nsEUCKRProber.h
index 954c038..8ce9eb2 100644
--- a/src/nsEUCKRProber.h
+++ b/src/nsEUCKRProber.h
@@ -57,6 +57,7 @@ public:
* Korean documents are actually created with this character set.
*/
const char* GetCharSetName() {return "UHC";}
+ const char* GetLanguage() {return "ko";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
diff --git a/src/nsEUCTWProber.h b/src/nsEUCTWProber.h
index ee6376e..6701027 100644
--- a/src/nsEUCTWProber.h
+++ b/src/nsEUCTWProber.h
@@ -51,6 +51,7 @@ public:
virtual ~nsEUCTWProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "EUC-TW";}
+ const char* GetLanguage() {return "zh";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
diff --git a/src/nsEscCharsetProber.h b/src/nsEscCharsetProber.h
index 4b648e0..eab3080 100644
--- a/src/nsEscCharsetProber.h
+++ b/src/nsEscCharsetProber.h
@@ -38,6 +38,8 @@
#ifndef nsEscCharSetProber_h__
#define nsEscCharSetProber_h__
+#include <cstddef>
+
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
@@ -49,6 +51,7 @@ public:
virtual ~nsEscCharSetProber(void);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return mDetectedCharset;}
+ const char* GetLanguage() {return NULL;}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void){return (float)0.99;}
diff --git a/src/nsGB2312Prober.h b/src/nsGB2312Prober.h
index 26ebf84..a35e585 100644
--- a/src/nsGB2312Prober.h
+++ b/src/nsGB2312Prober.h
@@ -53,6 +53,7 @@ public:
virtual ~nsGB18030Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "GB18030";}
+ const char* GetLanguage() {return "zh";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
diff --git a/src/nsHebrewProber.h b/src/nsHebrewProber.h
index eedfed4..8442aab 100644
--- a/src/nsHebrewProber.h
+++ b/src/nsHebrewProber.h
@@ -49,7 +49,8 @@ public:
virtual ~nsHebrewProber(void) {}
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
- virtual const char* GetCharSetName();
+ virtual const char *GetCharSetName();
+ virtual const char *GetLanguage(void) { return "he"; }
virtual void Reset(void);
virtual nsProbingState GetState(void);
diff --git a/src/nsLatin1Prober.h b/src/nsLatin1Prober.h
index 59118a7..bd3a9d5 100644
--- a/src/nsLatin1Prober.h
+++ b/src/nsLatin1Prober.h
@@ -39,6 +39,8 @@
#ifndef nsLatin1Prober_h__
#define nsLatin1Prober_h__
+#include <cstddef>
+
#include "nsCharSetProber.h"
#define FREQ_CAT_NUM 4
@@ -49,6 +51,7 @@ public:
virtual ~nsLatin1Prober(void){}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "WINDOWS-1252";}
+ const char* GetLanguage() {return NULL;}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp
index 057ddb1..68c896a 100644
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@@ -97,6 +97,18 @@ const char* nsMBCSGroupProber::GetCharSetName()
return mProbers[mBestGuess]->GetCharSetName();
}
+const char* nsMBCSGroupProber::GetLanguage(void)
+{
+ if (mBestGuess == -1)
+ {
+ GetConfidence();
+ }
+ if (mBestGuess == -1)
+ return NULL;
+ else
+ return mProbers[mBestGuess]->GetLanguage();
+}
+
void nsMBCSGroupProber::Reset(void)
{
mActiveNum = 0;
diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h
index c4e9964..0e55221 100644
--- a/src/nsMBCSGroupProber.h
+++ b/src/nsMBCSGroupProber.h
@@ -55,6 +55,7 @@ public:
virtual ~nsMBCSGroupProber();
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName();
+ const char* GetLanguage();
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp
index f956d25..6a3ef4f 100644
--- a/src/nsSBCSGroupProber.cpp
+++ b/src/nsSBCSGroupProber.cpp
@@ -217,6 +217,17 @@ const char* nsSBCSGroupProber::GetCharSetName()
return mProbers[mBestGuess]->GetCharSetName();
}
+/* Language reported by the best-guess sub-prober. When no guess can be
+ * established, the first sub-prober (index 0) is used as the default. */
+const char* nsSBCSGroupProber::GetLanguage()
+{
+  /* Fast path: a best guess is already recorded. */
+  if (mBestGuess != -1)
+    return mProbers[mBestGuess]->GetLanguage();
+
+  /* No answer yet: GetConfidence() is expected to refresh mBestGuess
+   * (TODO confirm against its definition). */
+  GetConfidence();
+  if (mBestGuess == -1)
+    mBestGuess = 0; /* still undecided: fall back to the first prober */
+  return mProbers[mBestGuess]->GetLanguage();
+}
+
void nsSBCSGroupProber::Reset(void)
{
mActiveNum = 0;
diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h
index ec72324..d07e16f 100644
--- a/src/nsSBCSGroupProber.h
+++ b/src/nsSBCSGroupProber.h
@@ -49,6 +49,7 @@ public:
virtual ~nsSBCSGroupProber();
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName();
+ const char* GetLanguage();
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
diff --git a/src/nsSBCharSetProber.cpp b/src/nsSBCharSetProber.cpp
index 001529f..7832f11 100644
--- a/src/nsSBCharSetProber.cpp
+++ b/src/nsSBCharSetProber.cpp
@@ -145,6 +145,13 @@ const char* nsSingleByteCharSetProber::GetCharSetName()
return mNameProber->GetCharSetName();
}
+/* Language code for this prober: delegated to mNameProber when one is
+ * set, otherwise read from the sequence model's langName field. */
+const char* nsSingleByteCharSetProber::GetLanguage()
+{
+  return mNameProber ? mNameProber->GetLanguage() : mModel->langName;
+}
+
#ifdef DEBUG_chardet
void nsSingleByteCharSetProber::DumpStatus()
{
diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h
index 42d21b2..2cd4409 100644
--- a/src/nsSBCharSetProber.h
+++ b/src/nsSBCharSetProber.h
@@ -75,6 +75,7 @@ typedef struct
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
const char* const charsetName;
+ const char* const langName;
} SequenceModel;
@@ -86,6 +87,7 @@ public:
:mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
virtual const char* GetCharSetName();
+ virtual const char* GetLanguage();
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
virtual nsProbingState GetState(void) {return mState;}
virtual void Reset(void);
diff --git a/src/nsSJISProber.h b/src/nsSJISProber.h
index f326ded..61e6352 100644
--- a/src/nsSJISProber.h
+++ b/src/nsSJISProber.h
@@ -58,6 +58,7 @@ public:
virtual ~nsSJISProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "SHIFT_JIS";}
+ const char* GetLanguage() {return "ja";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
diff --git a/src/nsUTF8Prober.h b/src/nsUTF8Prober.h
index 21c91c4..a2cf4ee 100644
--- a/src/nsUTF8Prober.h
+++ b/src/nsUTF8Prober.h
@@ -38,6 +38,7 @@
#ifndef nsUTF8Prober_h__
#define nsUTF8Prober_h__
+#include <cstddef>
#include "nsCharSetProber.h"
#include "nsCodingStateMachine.h"
@@ -49,6 +50,7 @@ public:
virtual ~nsUTF8Prober(){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "UTF-8";}
+ const char* GetLanguage() {return NULL;}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
index 2da4b4b..bc9e9b2 100644
--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
@@ -305,7 +305,7 @@ void nsUniversalDetector::DataEnd()
* when finding them.
*/
mDone = PR_TRUE;
- Report(mDetectedCharset, 1.0);
+ Report(mDetectedCharset, NULL, 1.0);
return;
}
@@ -323,7 +323,9 @@ void nsUniversalDetector::DataEnd()
if (proberConfidence > MINIMUM_THRESHOLD)
/* Only report what we are confident in. */
- Report(mCharSetProbers[i]->GetCharSetName(), proberConfidence);
+ Report(mCharSetProbers[i]->GetCharSetName(),
+ mCharSetProbers[i]->GetLanguage(),
+ proberConfidence);
}
}
}
diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h
index eecdea6..702a9fe 100644
--- a/src/nsUniversalDetector.h
+++ b/src/nsUniversalDetector.h
@@ -69,7 +69,8 @@ public:
virtual void DataEnd(void);
protected:
- virtual void Report(const char* aCharset,
+ virtual void Report(const char *encoding,
+ const char *language,
float confidence) = 0;
virtual void Reset();
nsInputState mInputState;
diff --git a/src/symbols.cmake b/src/symbols.cmake
index a6690ff..e66bfa0 100644
--- a/src/symbols.cmake
+++ b/src/symbols.cmake
@@ -9,6 +9,7 @@ set(
uchardet_get_candidates
uchardet_get_encoding
uchardet_get_confidence
+ uchardet_get_language
)
set (LINK_FLAGS "")
diff --git a/src/uchardet.cpp b/src/uchardet.cpp
index f5391ea..19a73f0 100644
--- a/src/uchardet.cpp
+++ b/src/uchardet.cpp
@@ -65,6 +65,7 @@ public:
}
virtual void Report(const char *encoding,
+ const char *language,
float confidence)
{
std::vector<UChardetCandidate>::iterator it;
@@ -72,7 +73,8 @@ public:
for (it = candidates.begin(); it != candidates.end(); it++)
{
- if (strcmp(it->encoding, encoding) == 0)
+ if (strcmp(it->encoding, encoding) == 0 &&
+ it->language && language && strcmp(it->language, language) == 0)
{
/* Already reported. Bail out or update the confidence
* when needed.
@@ -91,6 +93,7 @@ public:
candidate = UChardetCandidate();
candidate.encoding = strdup(encoding);
+ candidate.language = language ? strdup(language) : NULL;
candidate.confidence = confidence;
for (it = candidates.begin(); it != candidates.end(); it++)
@@ -107,7 +110,11 @@ public:
nsUniversalDetector::Reset();
for (it = candidates.begin(); it != candidates.end(); it++)
+ {
free(it->encoding);
+ if (it->language)
+ free(it->language);
+ }
candidates.clear();
}
@@ -125,6 +132,12 @@ public:
{
return (candidates.size() > i) ? candidates[i].confidence : 0.0;
}
+
+ /* Language stored for the i-th candidate. Returns NULL when i is out
+  * of range; the stored value itself may also be NULL. */
+ const char* GetLanguage(size_t i) const
+ {
+     if (i >= candidates.size())
+         return NULL;
+     return candidates[i].language;
+ }
+
};
uchardet_t uchardet_new(void)
@@ -178,3 +191,9 @@ const char * uchardet_get_encoding (uchardet_t ud,
{
return reinterpret_cast<HandleUniversalDetector*>(ud)->GetCharset(candidate);
}
+
+/* C entry point: forwards to HandleUniversalDetector::GetLanguage() on
+ * the detector object behind the opaque handle. */
+const char * uchardet_get_language (uchardet_t ud,
+                                    size_t     candidate)
+{
+    HandleUniversalDetector *detector =
+        reinterpret_cast<HandleUniversalDetector*>(ud);
+
+    return detector->GetLanguage(candidate);
+}
diff --git a/src/uchardet.h b/src/uchardet.h
index c452a69..df1387e 100644
--- a/src/uchardet.h
+++ b/src/uchardet.h
@@ -120,6 +120,8 @@ UCHARDET_INTERFACE float uchardet_get_confidence (uchardet_t ud,
size_t candidate);
UCHARDET_INTERFACE const char * uchardet_get_encoding (uchardet_t ud,
size_t candidate);
+/**
+ * Get the language detected for one of the candidates.
+ * @param ud [in] handle of an instance of uchardet
+ * @param candidate [in] index of the candidate to query
+ * @return the language as a 2-character ISO 639-1 code (e.g. "ja"),
+ *         or NULL when the index is out of range or when the prober
+ *         behind this candidate does not report a language (e.g. UTF-8
+ *         or WINDOWS-1252). The string belongs to the detector: do not
+ *         free it. NOTE(review): it appears to be released when the
+ *         candidate list is cleared -- confirm before holding on to it.
+ */
+UCHARDET_INTERFACE const char * uchardet_get_language (uchardet_t ud,
+ size_t candidate);
#ifdef __cplusplus