diff options
author | Jehan <jehan@girinstud.io> | 2020-04-23 18:24:12 +0200 |
---|---|---|
committer | Jehan <jehan@girinstud.io> | 2020-04-23 18:39:49 +0200 |
commit | 4e967c9e882b8857e3a059698768cb0b2bff14a6 (patch) | |
tree | 8327c6dae337e25573fcbcfa0f1e85707b80ca23 | |
parent | 94736d1565c06e8a871e8cd724b2c812672d7365 (diff) |
src: new API to get the detected language.
This doesn't work for all probers yet, in particular not for the most
generic probers (such as UTF-8) or WINDOWS-1252. These will return NULL.
It's still a good first step.
Right now, it returns the 2-character language code from ISO 639-1. A
project using uchardet could easily get the English language name from the
XML/JSON files provided by the iso-codes project. That project also makes
it easy to localize the language name into other languages through
gettext (this is what we do in GIMP, for instance). I am not adding any
dependency, though, and leave it to downstream projects to implement this.
I was also wondering if we want to support region information for cases
when it would make sense. I especially wondered about it for Chinese
encodings as some of them seem quite specific to a region (according to
Wikipedia at least). For the time being though, these just return "zh".
We'll see later if it makes sense to be more accurate (maybe depending
on reports?).
51 files changed, 276 insertions, 104 deletions
diff --git a/src/LangModels/LangArabicModel.cpp b/src/LangModels/LangArabicModel.cpp index 6ac80f3..0a6d654 100644 --- a/src/LangModels/LangArabicModel.cpp +++ b/src/LangModels/LangArabicModel.cpp @@ -251,7 +251,8 @@ const SequenceModel Iso_8859_6ArabicModel = 64, (float)0.9696025116913417, PR_FALSE, - "ISO-8859-6" + "ISO-8859-6", + "ar" }; const SequenceModel Windows_1256ArabicModel = @@ -261,5 +262,6 @@ const SequenceModel Windows_1256ArabicModel = 64, (float)0.9696025116913417, PR_FALSE, - "WINDOWS-1256" + "WINDOWS-1256", + "ar" }; diff --git a/src/LangModels/LangBulgarianModel.cpp b/src/LangModels/LangBulgarianModel.cpp index 18c58ee..1120054 100644 --- a/src/LangModels/LangBulgarianModel.cpp +++ b/src/LangModels/LangBulgarianModel.cpp @@ -233,7 +233,8 @@ const SequenceModel Latin5BulgarianModel = 64, (float)0.969392, PR_FALSE, - "ISO-8859-5" + "ISO-8859-5", + "bg" }; const SequenceModel Win1251BulgarianModel = @@ -243,5 +244,6 @@ const SequenceModel Win1251BulgarianModel = 64, (float)0.969392, PR_FALSE, - "WINDOWS-1251" + "WINDOWS-1251", + "bg" }; diff --git a/src/LangModels/LangCroatianModel.cpp b/src/LangModels/LangCroatianModel.cpp index 58f882e..961bd0e 100644 --- a/src/LangModels/LangCroatianModel.cpp +++ b/src/LangModels/LangCroatianModel.cpp @@ -238,7 +238,8 @@ const SequenceModel Windows_1250CroatianModel = 31, (float)0.9989731099787131, PR_TRUE, - "WINDOWS-1250" + "WINDOWS-1250", + "hr" }; const SequenceModel Iso_8859_2CroatianModel = @@ -248,7 +249,8 @@ const SequenceModel Iso_8859_2CroatianModel = 31, (float)0.9989731099787131, PR_TRUE, - "ISO-8859-2" + "ISO-8859-2", + "hr" }; const SequenceModel Iso_8859_16CroatianModel = @@ -258,7 +260,8 @@ const SequenceModel Iso_8859_16CroatianModel = 31, (float)0.9989731099787131, PR_TRUE, - "ISO-8859-16" + "ISO-8859-16", + "hr" }; const SequenceModel Mac_CentraleuropeCroatianModel = @@ -268,7 +271,8 @@ const SequenceModel Mac_CentraleuropeCroatianModel = 31, (float)0.9989731099787131, PR_TRUE, - 
"MAC-CENTRALEUROPE" + "MAC-CENTRALEUROPE", + "hr" }; const SequenceModel Iso_8859_13CroatianModel = @@ -278,7 +282,8 @@ const SequenceModel Iso_8859_13CroatianModel = 31, (float)0.9989731099787131, PR_TRUE, - "ISO-8859-13" + "ISO-8859-13", + "hr" }; const SequenceModel Ibm852CroatianModel = @@ -288,5 +293,6 @@ const SequenceModel Ibm852CroatianModel = 31, (float)0.9989731099787131, PR_TRUE, - "IBM852" + "IBM852", + "hr" }; diff --git a/src/LangModels/LangCzechModel.cpp b/src/LangModels/LangCzechModel.cpp index 2557376..c12c07e 100644 --- a/src/LangModels/LangCzechModel.cpp +++ b/src/LangModels/LangCzechModel.cpp @@ -247,7 +247,8 @@ const SequenceModel Windows_1250CzechModel = 41, (float)0.9786035192432675, PR_TRUE, - "WINDOWS-1250" + "WINDOWS-1250", + "cs" }; const SequenceModel Mac_CentraleuropeCzechModel = @@ -257,7 +258,8 @@ const SequenceModel Mac_CentraleuropeCzechModel = 41, (float)0.9786035192432675, PR_TRUE, - "MAC-CENTRALEUROPE" + "MAC-CENTRALEUROPE", + "cs" }; const SequenceModel Ibm852CzechModel = @@ -267,7 +269,8 @@ const SequenceModel Ibm852CzechModel = 41, (float)0.9786035192432675, PR_TRUE, - "IBM852" + "IBM852", + "cs" }; const SequenceModel Iso_8859_2CzechModel = @@ -277,5 +280,6 @@ const SequenceModel Iso_8859_2CzechModel = 41, (float)0.9786035192432675, PR_TRUE, - "ISO-8859-2" + "ISO-8859-2", + "cs" }; diff --git a/src/LangModels/LangDanishModel.cpp b/src/LangModels/LangDanishModel.cpp index c60e7b2..cb99e9b 100644 --- a/src/LangModels/LangDanishModel.cpp +++ b/src/LangModels/LangDanishModel.cpp @@ -174,7 +174,8 @@ const SequenceModel Iso_8859_15DanishModel = 30, (float)0.9968082796759031, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "da" }; const SequenceModel Iso_8859_1DanishModel = @@ -184,7 +185,8 @@ const SequenceModel Iso_8859_1DanishModel = 30, (float)0.9968082796759031, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "da" }; const SequenceModel Windows_1252DanishModel = @@ -194,5 +196,6 @@ const SequenceModel Windows_1252DanishModel = 30, 
(float)0.9968082796759031, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "da" }; diff --git a/src/LangModels/LangEsperantoModel.cpp b/src/LangModels/LangEsperantoModel.cpp index 0884cd2..1d55ec7 100644 --- a/src/LangModels/LangEsperantoModel.cpp +++ b/src/LangModels/LangEsperantoModel.cpp @@ -137,5 +137,6 @@ const SequenceModel Iso_8859_3EsperantoModel = 35, (float)0.9942980632768038, PR_FALSE, - "ISO-8859-3" + "ISO-8859-3", + "eo" }; diff --git a/src/LangModels/LangEstonianModel.cpp b/src/LangModels/LangEstonianModel.cpp index c5fa9b3..71d9c66 100644 --- a/src/LangModels/LangEstonianModel.cpp +++ b/src/LangModels/LangEstonianModel.cpp @@ -219,7 +219,8 @@ const SequenceModel Iso_8859_4EstonianModel = 33, (float)0.9972721312183132, PR_TRUE, - "ISO-8859-4" + "ISO-8859-4", + "et" }; const SequenceModel Windows_1252EstonianModel = @@ -229,7 +230,8 @@ const SequenceModel Windows_1252EstonianModel = 33, (float)0.9972721312183132, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "et" }; const SequenceModel Iso_8859_15EstonianModel = @@ -239,7 +241,8 @@ const SequenceModel Iso_8859_15EstonianModel = 33, (float)0.9972721312183132, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "et" }; const SequenceModel Iso_8859_13EstonianModel = @@ -249,7 +252,8 @@ const SequenceModel Iso_8859_13EstonianModel = 33, (float)0.9972721312183132, PR_TRUE, - "ISO-8859-13" + "ISO-8859-13", + "et" }; const SequenceModel Windows_1257EstonianModel = @@ -259,5 +263,6 @@ const SequenceModel Windows_1257EstonianModel = 33, (float)0.9972721312183132, PR_TRUE, - "WINDOWS-1257" + "WINDOWS-1257", + "et" }; diff --git a/src/LangModels/LangFinnishModel.cpp b/src/LangModels/LangFinnishModel.cpp index ee91e14..cbc9528 100644 --- a/src/LangModels/LangFinnishModel.cpp +++ b/src/LangModels/LangFinnishModel.cpp @@ -237,7 +237,8 @@ const SequenceModel Iso_8859_15FinnishModel = 30, (float)0.9985378147555799, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "fi" }; const SequenceModel Windows_1252FinnishModel = @@ -247,7 
+248,8 @@ const SequenceModel Windows_1252FinnishModel = 30, (float)0.9985378147555799, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "fi" }; const SequenceModel Iso_8859_4FinnishModel = @@ -257,7 +259,8 @@ const SequenceModel Iso_8859_4FinnishModel = 30, (float)0.9985378147555799, PR_TRUE, - "ISO-8859-4" + "ISO-8859-4", + "fi" }; const SequenceModel Iso_8859_13FinnishModel = @@ -267,7 +270,8 @@ const SequenceModel Iso_8859_13FinnishModel = 30, (float)0.9985378147555799, PR_TRUE, - "ISO-8859-13" + "ISO-8859-13", + "fi" }; const SequenceModel Iso_8859_9FinnishModel = @@ -277,7 +281,8 @@ const SequenceModel Iso_8859_9FinnishModel = 30, (float)0.9985378147555799, PR_TRUE, - "ISO-8859-9" + "ISO-8859-9", + "fi" }; const SequenceModel Iso_8859_1FinnishModel = @@ -287,5 +292,6 @@ const SequenceModel Iso_8859_1FinnishModel = 30, (float)0.9985378147555799, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "fi" }; diff --git a/src/LangModels/LangFrenchModel.cpp b/src/LangModels/LangFrenchModel.cpp index cd458cb..5baaf31 100644 --- a/src/LangModels/LangFrenchModel.cpp +++ b/src/LangModels/LangFrenchModel.cpp @@ -182,7 +182,8 @@ const SequenceModel Windows_1252FrenchModel = 38, (float)0.997057879992383, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "fr" }; const SequenceModel Iso_8859_1FrenchModel = @@ -192,7 +193,8 @@ const SequenceModel Iso_8859_1FrenchModel = 38, (float)0.997057879992383, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "fr" }; const SequenceModel Iso_8859_15FrenchModel = @@ -202,5 +204,6 @@ const SequenceModel Iso_8859_15FrenchModel = 38, (float)0.997057879992383, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "fr" }; diff --git a/src/LangModels/LangGermanModel.cpp b/src/LangModels/LangGermanModel.cpp index feeda8e..dd4228c 100644 --- a/src/LangModels/LangGermanModel.cpp +++ b/src/LangModels/LangGermanModel.cpp @@ -154,7 +154,8 @@ const SequenceModel Windows_1252GermanModel = 31, (float)0.9934041448127945, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "de" }; const 
SequenceModel Iso_8859_1GermanModel = @@ -164,5 +165,6 @@ const SequenceModel Iso_8859_1GermanModel = 31, (float)0.9934041448127945, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "de" }; diff --git a/src/LangModels/LangGreekModel.cpp b/src/LangModels/LangGreekModel.cpp index 499affe..28951e6 100644 --- a/src/LangModels/LangGreekModel.cpp +++ b/src/LangModels/LangGreekModel.cpp @@ -215,7 +215,8 @@ const SequenceModel Windows_1253GreekModel = 46, (float)0.958419074626211, PR_FALSE, - "WINDOWS-1253" + "WINDOWS-1253", + "el" }; const SequenceModel Iso_8859_7GreekModel = @@ -225,5 +226,6 @@ const SequenceModel Iso_8859_7GreekModel = 46, (float)0.958419074626211, PR_FALSE, - "ISO-8859-7" + "ISO-8859-7", + "el" }; diff --git a/src/LangModels/LangHebrewModel.cpp b/src/LangModels/LangHebrewModel.cpp index af9ac2b..811c048 100644 --- a/src/LangModels/LangHebrewModel.cpp +++ b/src/LangModels/LangHebrewModel.cpp @@ -215,6 +215,6 @@ const SequenceModel Win1255Model = 64, (float)0.984004, PR_FALSE, - "WINDOWS-1255" + "WINDOWS-1255", + "he" }; - diff --git a/src/LangModels/LangHungarianModel.cpp b/src/LangModels/LangHungarianModel.cpp index 83e6eaa..22f0de6 100644 --- a/src/LangModels/LangHungarianModel.cpp +++ b/src/LangModels/LangHungarianModel.cpp @@ -155,7 +155,8 @@ const SequenceModel Iso_8859_2HungarianModel = 32, (float)0.9748272224933486, PR_FALSE, - "ISO-8859-2" + "ISO-8859-2", + "hu" }; const SequenceModel Windows_1250HungarianModel = @@ -165,5 +166,6 @@ const SequenceModel Windows_1250HungarianModel = 32, (float)0.9748272224933486, PR_FALSE, - "WINDOWS-1250" + "WINDOWS-1250", + "hu" }; diff --git a/src/LangModels/LangIrishModel.cpp b/src/LangModels/LangIrishModel.cpp index af3a16d..bbd9500 100644 --- a/src/LangModels/LangIrishModel.cpp +++ b/src/LangModels/LangIrishModel.cpp @@ -196,7 +196,8 @@ const SequenceModel Iso_8859_1IrishModel = 31, (float)0.9974076651249096, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "ga" }; const SequenceModel Windows_1252IrishModel = @@ -206,7 
+207,8 @@ const SequenceModel Windows_1252IrishModel = 31, (float)0.9974076651249096, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "ga" }; const SequenceModel Iso_8859_15IrishModel = @@ -216,7 +218,8 @@ const SequenceModel Iso_8859_15IrishModel = 31, (float)0.9974076651249096, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "ga" }; const SequenceModel Iso_8859_9IrishModel = @@ -226,5 +229,6 @@ const SequenceModel Iso_8859_9IrishModel = 31, (float)0.9974076651249096, PR_TRUE, - "ISO-8859-9" + "ISO-8859-9", + "ga" }; diff --git a/src/LangModels/LangItalianModel.cpp b/src/LangModels/LangItalianModel.cpp index 0a9565c..4bb5dc5 100644 --- a/src/LangModels/LangItalianModel.cpp +++ b/src/LangModels/LangItalianModel.cpp @@ -220,7 +220,8 @@ const SequenceModel Iso_8859_3ItalianModel = 34, (float)0.9989484485502651, PR_TRUE, - "ISO-8859-3" + "ISO-8859-3", + "it" }; const SequenceModel Iso_8859_15ItalianModel = @@ -230,7 +231,8 @@ const SequenceModel Iso_8859_15ItalianModel = 34, (float)0.9989484485502651, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "it" }; const SequenceModel Iso_8859_9ItalianModel = @@ -240,7 +242,8 @@ const SequenceModel Iso_8859_9ItalianModel = 34, (float)0.9989484485502651, PR_TRUE, - "ISO-8859-9" + "ISO-8859-9", + "it" }; const SequenceModel Iso_8859_1ItalianModel = @@ -250,7 +253,8 @@ const SequenceModel Iso_8859_1ItalianModel = 34, (float)0.9989484485502651, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "it" }; const SequenceModel Windows_1252ItalianModel = @@ -260,5 +264,6 @@ const SequenceModel Windows_1252ItalianModel = 34, (float)0.9989484485502651, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "it" }; diff --git a/src/LangModels/LangLatvianModel.cpp b/src/LangModels/LangLatvianModel.cpp index b62d414..fcccc82 100644 --- a/src/LangModels/LangLatvianModel.cpp +++ b/src/LangModels/LangLatvianModel.cpp @@ -183,7 +183,8 @@ const SequenceModel Iso_8859_4LatvianModel = 39, (float)0.9904102202220861, PR_TRUE, - "ISO-8859-4" + "ISO-8859-4", + "lv" }; const 
SequenceModel Iso_8859_10LatvianModel = @@ -193,7 +194,8 @@ const SequenceModel Iso_8859_10LatvianModel = 39, (float)0.9904102202220861, PR_TRUE, - "ISO-8859-10" + "ISO-8859-10", + "lv" }; const SequenceModel Iso_8859_13LatvianModel = @@ -203,5 +205,6 @@ const SequenceModel Iso_8859_13LatvianModel = 39, (float)0.9904102202220861, PR_TRUE, - "ISO-8859-13" + "ISO-8859-13", + "lv" }; diff --git a/src/LangModels/LangLithuanianModel.cpp b/src/LangModels/LangLithuanianModel.cpp index af65db3..686014a 100644 --- a/src/LangModels/LangLithuanianModel.cpp +++ b/src/LangModels/LangLithuanianModel.cpp @@ -182,7 +182,8 @@ const SequenceModel Iso_8859_10LithuanianModel = 38, (float)0.9928710196247589, PR_TRUE, - "ISO-8859-10" + "ISO-8859-10", + "lt" }; const SequenceModel Iso_8859_4LithuanianModel = @@ -192,7 +193,8 @@ const SequenceModel Iso_8859_4LithuanianModel = 38, (float)0.9928710196247589, PR_TRUE, - "ISO-8859-4" + "ISO-8859-4", + "lt" }; const SequenceModel Iso_8859_13LithuanianModel = @@ -202,5 +204,6 @@ const SequenceModel Iso_8859_13LithuanianModel = 38, (float)0.9928710196247589, PR_TRUE, - "ISO-8859-13" + "ISO-8859-13", + "lt" }; diff --git a/src/LangModels/LangMalteseModel.cpp b/src/LangModels/LangMalteseModel.cpp index dd82ef6..e253539 100644 --- a/src/LangModels/LangMalteseModel.cpp +++ b/src/LangModels/LangMalteseModel.cpp @@ -133,5 +133,6 @@ const SequenceModel Iso_8859_3MalteseModel = 31, (float)0.9959115850692665, PR_TRUE, - "ISO-8859-3" + "ISO-8859-3", + "mt" }; diff --git a/src/LangModels/LangPolishModel.cpp b/src/LangModels/LangPolishModel.cpp index cb62bdc..38791de 100644 --- a/src/LangModels/LangPolishModel.cpp +++ b/src/LangModels/LangPolishModel.cpp @@ -244,7 +244,8 @@ const SequenceModel Ibm852PolishModel = 37, (float)0.9894531815946438, PR_TRUE, - "IBM852" + "IBM852", + "pl" }; const SequenceModel Iso_8859_16PolishModel = @@ -254,7 +255,8 @@ const SequenceModel Iso_8859_16PolishModel = 37, (float)0.9894531815946438, PR_TRUE, - "ISO-8859-16" + 
"ISO-8859-16", + "pl" }; const SequenceModel Iso_8859_2PolishModel = @@ -264,7 +266,8 @@ const SequenceModel Iso_8859_2PolishModel = 37, (float)0.9894531815946438, PR_TRUE, - "ISO-8859-2" + "ISO-8859-2", + "pl" }; const SequenceModel Mac_CentraleuropePolishModel = @@ -274,7 +277,8 @@ const SequenceModel Mac_CentraleuropePolishModel = 37, (float)0.9894531815946438, PR_TRUE, - "MAC-CENTRALEUROPE" + "MAC-CENTRALEUROPE", + "pl" }; const SequenceModel Iso_8859_13PolishModel = @@ -284,7 +288,8 @@ const SequenceModel Iso_8859_13PolishModel = 37, (float)0.9894531815946438, PR_TRUE, - "ISO-8859-13" + "ISO-8859-13", + "pl" }; const SequenceModel Windows_1250PolishModel = @@ -294,5 +299,6 @@ const SequenceModel Windows_1250PolishModel = 37, (float)0.9894531815946438, PR_TRUE, - "WINDOWS-1250" + "WINDOWS-1250", + "pl" }; diff --git a/src/LangModels/LangPortugueseModel.cpp b/src/LangModels/LangPortugueseModel.cpp index 8d4bc4a..0b2dd1b 100644 --- a/src/LangModels/LangPortugueseModel.cpp +++ b/src/LangModels/LangPortugueseModel.cpp @@ -203,7 +203,8 @@ const SequenceModel Iso_8859_1PortugueseModel = 38, (float)0.9953179582313172, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "pt" }; const SequenceModel Iso_8859_9PortugueseModel = @@ -213,7 +214,8 @@ const SequenceModel Iso_8859_9PortugueseModel = 38, (float)0.9953179582313172, PR_TRUE, - "ISO-8859-9" + "ISO-8859-9", + "pt" }; const SequenceModel Iso_8859_15PortugueseModel = @@ -223,7 +225,8 @@ const SequenceModel Iso_8859_15PortugueseModel = 38, (float)0.9953179582313172, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "pt" }; const SequenceModel Windows_1252PortugueseModel = @@ -233,5 +236,6 @@ const SequenceModel Windows_1252PortugueseModel = 38, (float)0.9953179582313172, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "pt" }; diff --git a/src/LangModels/LangRomanianModel.cpp b/src/LangModels/LangRomanianModel.cpp index 154c03f..cfb1b8d 100644 --- a/src/LangModels/LangRomanianModel.cpp +++ b/src/LangModels/LangRomanianModel.cpp @@ 
-198,7 +198,8 @@ const SequenceModel Iso_8859_16RomanianModel = 33, (float)0.997762564143313, PR_TRUE, - "ISO-8859-16" + "ISO-8859-16", + "ro" }; const SequenceModel Iso_8859_2RomanianModel = @@ -208,7 +209,8 @@ const SequenceModel Iso_8859_2RomanianModel = 33, (float)0.997762564143313, PR_TRUE, - "ISO-8859-2" + "ISO-8859-2", + "ro" }; const SequenceModel Windows_1250RomanianModel = @@ -218,7 +220,8 @@ const SequenceModel Windows_1250RomanianModel = 33, (float)0.997762564143313, PR_TRUE, - "WINDOWS-1250" + "WINDOWS-1250", + "ro" }; const SequenceModel Ibm852RomanianModel = @@ -228,5 +231,6 @@ const SequenceModel Ibm852RomanianModel = 33, (float)0.997762564143313, PR_TRUE, - "IBM852" + "IBM852", + "ro" }; diff --git a/src/LangModels/LangRussianModel.cpp b/src/LangModels/LangRussianModel.cpp index a532049..50631df 100644 --- a/src/LangModels/LangRussianModel.cpp +++ b/src/LangModels/LangRussianModel.cpp @@ -307,7 +307,8 @@ const SequenceModel Koi8rRussianModel = 64, (float)0.976601, PR_FALSE, - "KOI8-R" + "KOI8-R", + "ru" }; const SequenceModel Win1251RussianModel = @@ -317,7 +318,8 @@ const SequenceModel Win1251RussianModel = 64, (float)0.976601, PR_FALSE, - "WINDOWS-1251" + "WINDOWS-1251", + "ru" }; const SequenceModel Latin5RussianModel = @@ -327,7 +329,8 @@ const SequenceModel Latin5RussianModel = 64, (float)0.976601, PR_FALSE, - "ISO-8859-5" + "ISO-8859-5", + "ru" }; const SequenceModel MacCyrillicRussianModel = @@ -337,7 +340,8 @@ const SequenceModel MacCyrillicRussianModel = 64, (float)0.976601, PR_FALSE, - "MAC-CYRILLIC" + "MAC-CYRILLIC", + "ru" }; const SequenceModel Ibm866RussianModel = @@ -347,7 +351,8 @@ const SequenceModel Ibm866RussianModel = 64, (float)0.976601, PR_FALSE, - "IBM866" + "IBM866", + "ru" }; const SequenceModel Ibm855RussianModel = @@ -357,5 +362,6 @@ const SequenceModel Ibm855RussianModel = 64, (float)0.976601, PR_FALSE, - "IBM855" + "IBM855", + "ru" }; diff --git a/src/LangModels/LangSlovakModel.cpp b/src/LangModels/LangSlovakModel.cpp 
index cfa94aa..480b4b5 100644 --- a/src/LangModels/LangSlovakModel.cpp +++ b/src/LangModels/LangSlovakModel.cpp @@ -255,7 +255,8 @@ const SequenceModel Ibm852SlovakModel = 45, (float)0.9733303573968434, PR_TRUE, - "IBM852" + "IBM852", + "sk" }; const SequenceModel Iso_8859_2SlovakModel = @@ -265,7 +266,8 @@ const SequenceModel Iso_8859_2SlovakModel = 45, (float)0.9733303573968434, PR_TRUE, - "ISO-8859-2" + "ISO-8859-2", + "sk" }; const SequenceModel Mac_CentraleuropeSlovakModel = @@ -275,7 +277,8 @@ const SequenceModel Mac_CentraleuropeSlovakModel = 45, (float)0.9733303573968434, PR_TRUE, - "MAC-CENTRALEUROPE" + "MAC-CENTRALEUROPE", + "sk" }; const SequenceModel Windows_1250SlovakModel = @@ -285,5 +288,6 @@ const SequenceModel Windows_1250SlovakModel = 45, (float)0.9733303573968434, PR_TRUE, - "WINDOWS-1250" + "WINDOWS-1250", + "sk" }; diff --git a/src/LangModels/LangSloveneModel.cpp b/src/LangModels/LangSloveneModel.cpp index da28d86..160f054 100644 --- a/src/LangModels/LangSloveneModel.cpp +++ b/src/LangModels/LangSloveneModel.cpp @@ -215,7 +215,8 @@ const SequenceModel Iso_8859_2SloveneModel = 29, (float)0.9983524317161332, PR_TRUE, - "ISO-8859-2" + "ISO-8859-2", + "sl" }; const SequenceModel Iso_8859_16SloveneModel = @@ -225,7 +226,8 @@ const SequenceModel Iso_8859_16SloveneModel = 29, (float)0.9983524317161332, PR_TRUE, - "ISO-8859-16" + "ISO-8859-16", + "sl" }; const SequenceModel Windows_1250SloveneModel = @@ -235,7 +237,8 @@ const SequenceModel Windows_1250SloveneModel = 29, (float)0.9983524317161332, PR_TRUE, - "WINDOWS-1250" + "WINDOWS-1250", + "sl" }; const SequenceModel Mac_CentraleuropeSloveneModel = @@ -245,7 +248,8 @@ const SequenceModel Mac_CentraleuropeSloveneModel = 29, (float)0.9983524317161332, PR_TRUE, - "MAC-CENTRALEUROPE" + "MAC-CENTRALEUROPE", + "sl" }; const SequenceModel Ibm852SloveneModel = @@ -255,5 +259,6 @@ const SequenceModel Ibm852SloveneModel = 29, (float)0.9983524317161332, PR_TRUE, - "IBM852" + "IBM852", + "sl" }; diff --git 
a/src/LangModels/LangSpanishModel.cpp b/src/LangModels/LangSpanishModel.cpp index 18c400a..6c3f3a9 100644 --- a/src/LangModels/LangSpanishModel.cpp +++ b/src/LangModels/LangSpanishModel.cpp @@ -177,7 +177,8 @@ const SequenceModel Iso_8859_1SpanishModel = 33, (float)0.9970385677528184, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "es" }; const SequenceModel Iso_8859_15SpanishModel = @@ -187,7 +188,8 @@ const SequenceModel Iso_8859_15SpanishModel = 33, (float)0.9970385677528184, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "es" }; const SequenceModel Windows_1252SpanishModel = @@ -197,5 +199,6 @@ const SequenceModel Windows_1252SpanishModel = 33, (float)0.9970385677528184, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "es" }; diff --git a/src/LangModels/LangSwedishModel.cpp b/src/LangModels/LangSwedishModel.cpp index 0d2dadf..3dca8e8 100644 --- a/src/LangModels/LangSwedishModel.cpp +++ b/src/LangModels/LangSwedishModel.cpp @@ -217,7 +217,8 @@ const SequenceModel Windows_1252SwedishModel = 31, (float)0.997323508584682, PR_TRUE, - "WINDOWS-1252" + "WINDOWS-1252", + "sv" }; const SequenceModel Iso_8859_9SwedishModel = @@ -227,7 +228,8 @@ const SequenceModel Iso_8859_9SwedishModel = 31, (float)0.997323508584682, PR_TRUE, - "ISO-8859-9" + "ISO-8859-9", + "sv" }; const SequenceModel Iso_8859_1SwedishModel = @@ -237,7 +239,8 @@ const SequenceModel Iso_8859_1SwedishModel = 31, (float)0.997323508584682, PR_TRUE, - "ISO-8859-1" + "ISO-8859-1", + "sv" }; const SequenceModel Iso_8859_4SwedishModel = @@ -247,7 +250,8 @@ const SequenceModel Iso_8859_4SwedishModel = 31, (float)0.997323508584682, PR_TRUE, - "ISO-8859-4" + "ISO-8859-4", + "sv" }; const SequenceModel Iso_8859_15SwedishModel = @@ -257,5 +261,6 @@ const SequenceModel Iso_8859_15SwedishModel = 31, (float)0.997323508584682, PR_TRUE, - "ISO-8859-15" + "ISO-8859-15", + "sv" }; diff --git a/src/LangModels/LangThaiModel.cpp b/src/LangModels/LangThaiModel.cpp index 091fb8d..9880e09 100644 --- a/src/LangModels/LangThaiModel.cpp +++ 
b/src/LangModels/LangThaiModel.cpp @@ -251,7 +251,8 @@ const SequenceModel Tis_620ThaiModel = 64, (float)0.8815720594354438, PR_FALSE, - "TIS-620" + "TIS-620", + "th" }; const SequenceModel Iso_8859_11ThaiModel = @@ -261,5 +262,6 @@ const SequenceModel Iso_8859_11ThaiModel = 64, (float)0.8815720594354438, PR_FALSE, - "ISO-8859-11" + "ISO-8859-11", + "th" }; diff --git a/src/LangModels/LangTurkishModel.cpp b/src/LangModels/LangTurkishModel.cpp index 71d72c5..16c133f 100644 --- a/src/LangModels/LangTurkishModel.cpp +++ b/src/LangModels/LangTurkishModel.cpp @@ -159,7 +159,8 @@ const SequenceModel Iso_8859_3TurkishModel = 36, (float)0.991865243864388, PR_FALSE, - "ISO-8859-3" + "ISO-8859-3", + "tr" }; const SequenceModel Iso_8859_9TurkishModel = @@ -169,5 +170,6 @@ const SequenceModel Iso_8859_9TurkishModel = 36, (float)0.991865243864388, PR_FALSE, - "ISO-8859-9" + "ISO-8859-9", + "tr" }; diff --git a/src/LangModels/LangVietnameseModel.cpp b/src/LangModels/LangVietnameseModel.cpp index 288a525..0569887 100644 --- a/src/LangModels/LangVietnameseModel.cpp +++ b/src/LangModels/LangVietnameseModel.cpp @@ -233,7 +233,8 @@ const SequenceModel Windows_1258VietnameseModel = 55, (float)0.9321889118082535, PR_FALSE, - "WINDOWS-1258" + "WINDOWS-1258", + "vi" }; const SequenceModel VisciiVietnameseModel = @@ -243,5 +244,6 @@ const SequenceModel VisciiVietnameseModel = 55, (float)0.9321889118082535, PR_FALSE, - "VISCII" + "VISCII", + "vi" }; diff --git a/src/nsBig5Prober.h b/src/nsBig5Prober.h index 7d13be8..4b5d9fa 100644 --- a/src/nsBig5Prober.h +++ b/src/nsBig5Prober.h @@ -51,6 +51,7 @@ public: virtual ~nsBig5Prober(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "BIG5";} + const char* GetLanguage() {return "zh";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsCharSetProber.h b/src/nsCharSetProber.h index c078ccf..c13afb8 100644 --- 
a/src/nsCharSetProber.h +++ b/src/nsCharSetProber.h @@ -54,6 +54,7 @@ class nsCharSetProber { public: virtual ~nsCharSetProber() {} virtual const char* GetCharSetName() = 0; + virtual const char* GetLanguage() = 0; virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0; virtual nsProbingState GetState(void) = 0; virtual void Reset(void) = 0; diff --git a/src/nsEUCJPProber.h b/src/nsEUCJPProber.h index a7a2f51..a74c779 100644 --- a/src/nsEUCJPProber.h +++ b/src/nsEUCJPProber.h @@ -57,6 +57,7 @@ public: virtual ~nsEUCJPProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "EUC-JP";} + const char* GetLanguage() {return "ja";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsEUCKRProber.h b/src/nsEUCKRProber.h index 954c038..8ce9eb2 100644 --- a/src/nsEUCKRProber.h +++ b/src/nsEUCKRProber.h @@ -57,6 +57,7 @@ public: * Korean documents are actually created with this character set. 
*/ const char* GetCharSetName() {return "UHC";} + const char* GetLanguage() {return "ko";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsEUCTWProber.h b/src/nsEUCTWProber.h index ee6376e..6701027 100644 --- a/src/nsEUCTWProber.h +++ b/src/nsEUCTWProber.h @@ -51,6 +51,7 @@ public: virtual ~nsEUCTWProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "EUC-TW";} + const char* GetLanguage() {return "zh";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsEscCharsetProber.h b/src/nsEscCharsetProber.h index 4b648e0..eab3080 100644 --- a/src/nsEscCharsetProber.h +++ b/src/nsEscCharsetProber.h @@ -38,6 +38,8 @@ #ifndef nsEscCharSetProber_h__ #define nsEscCharSetProber_h__ +#include <cstddef> + #include "nsCharSetProber.h" #include "nsCodingStateMachine.h" @@ -49,6 +51,7 @@ public: virtual ~nsEscCharSetProber(void); nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return mDetectedCharset;} + const char* GetLanguage() {return NULL;} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void){return (float)0.99;} diff --git a/src/nsGB2312Prober.h b/src/nsGB2312Prober.h index 26ebf84..a35e585 100644 --- a/src/nsGB2312Prober.h +++ b/src/nsGB2312Prober.h @@ -53,6 +53,7 @@ public: virtual ~nsGB18030Prober(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "GB18030";} + const char* GetLanguage() {return "zh";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsHebrewProber.h b/src/nsHebrewProber.h index eedfed4..8442aab 100644 --- a/src/nsHebrewProber.h +++ b/src/nsHebrewProber.h @@ -49,7 +49,8 @@ public: virtual ~nsHebrewProber(void) {} virtual nsProbingState HandleData(const char* 
aBuf, PRUint32 aLen); - virtual const char* GetCharSetName(); + virtual const char *GetCharSetName(); + virtual const char *GetLanguage(void) { return "he"; } virtual void Reset(void); virtual nsProbingState GetState(void); diff --git a/src/nsLatin1Prober.h b/src/nsLatin1Prober.h index 59118a7..bd3a9d5 100644 --- a/src/nsLatin1Prober.h +++ b/src/nsLatin1Prober.h @@ -39,6 +39,8 @@ #ifndef nsLatin1Prober_h__ #define nsLatin1Prober_h__ +#include <cstddef> + #include "nsCharSetProber.h" #define FREQ_CAT_NUM 4 @@ -49,6 +51,7 @@ public: virtual ~nsLatin1Prober(void){} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "WINDOWS-1252";} + const char* GetLanguage() {return NULL;} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp index 057ddb1..68c896a 100644 --- a/src/nsMBCSGroupProber.cpp +++ b/src/nsMBCSGroupProber.cpp @@ -97,6 +97,18 @@ const char* nsMBCSGroupProber::GetCharSetName() return mProbers[mBestGuess]->GetCharSetName(); } +const char* nsMBCSGroupProber::GetLanguage(void) +{ + if (mBestGuess == -1) + { + GetConfidence(); + } + if (mBestGuess == -1) + return NULL; + else + return mProbers[mBestGuess]->GetLanguage(); +} + void nsMBCSGroupProber::Reset(void) { mActiveNum = 0; diff --git a/src/nsMBCSGroupProber.h b/src/nsMBCSGroupProber.h index c4e9964..0e55221 100644 --- a/src/nsMBCSGroupProber.h +++ b/src/nsMBCSGroupProber.h @@ -55,6 +55,7 @@ public: virtual ~nsMBCSGroupProber(); nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName(); + const char* GetLanguage(); nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index f956d25..6a3ef4f 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -217,6 +217,17 @@ const char* nsSBCSGroupProber::GetCharSetName() 
return mProbers[mBestGuess]->GetCharSetName(); } +const char* nsSBCSGroupProber::GetLanguage() +{ + if (mBestGuess == -1) + { + GetConfidence(); + if (mBestGuess == -1) + mBestGuess = 0; + } + return mProbers[mBestGuess]->GetLanguage(); +} + void nsSBCSGroupProber::Reset(void) { mActiveNum = 0; diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index ec72324..d07e16f 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -49,6 +49,7 @@ public: virtual ~nsSBCSGroupProber(); nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName(); + const char* GetLanguage(); nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsSBCharSetProber.cpp b/src/nsSBCharSetProber.cpp index 001529f..7832f11 100644 --- a/src/nsSBCharSetProber.cpp +++ b/src/nsSBCharSetProber.cpp @@ -145,6 +145,13 @@ const char* nsSingleByteCharSetProber::GetCharSetName() return mNameProber->GetCharSetName(); } +const char* nsSingleByteCharSetProber::GetLanguage() +{ + if (!mNameProber) + return mModel->langName; + return mNameProber->GetLanguage(); +} + #ifdef DEBUG_chardet void nsSingleByteCharSetProber::DumpStatus() { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index 42d21b2..2cd4409 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -75,6 +75,7 @@ typedef struct float mTypicalPositiveRatio; // = freqSeqs / totalSeqs PRBool keepEnglishLetter; // says if this script contains English characters (not implemented) const char* const charsetName; + const char* const langName; } SequenceModel; @@ -86,6 +87,7 @@ public: :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); } virtual const char* GetCharSetName(); + virtual const char* GetLanguage(); virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen); virtual nsProbingState GetState(void) {return mState;} virtual void Reset(void); diff --git a/src/nsSJISProber.h b/src/nsSJISProber.h 
index f326ded..61e6352 100644 --- a/src/nsSJISProber.h +++ b/src/nsSJISProber.h @@ -58,6 +58,7 @@ public: virtual ~nsSJISProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "SHIFT_JIS";} + const char* GetLanguage() {return "ja";} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsUTF8Prober.h b/src/nsUTF8Prober.h index 21c91c4..a2cf4ee 100644 --- a/src/nsUTF8Prober.h +++ b/src/nsUTF8Prober.h @@ -38,6 +38,7 @@ #ifndef nsUTF8Prober_h__ #define nsUTF8Prober_h__ +#include <cstddef> #include "nsCharSetProber.h" #include "nsCodingStateMachine.h" @@ -49,6 +50,7 @@ public: virtual ~nsUTF8Prober(){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName() {return "UTF-8";} + const char* GetLanguage() {return NULL;} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp index 2da4b4b..bc9e9b2 100644 --- a/src/nsUniversalDetector.cpp +++ b/src/nsUniversalDetector.cpp @@ -305,7 +305,7 @@ void nsUniversalDetector::DataEnd() * when finding them. */ mDone = PR_TRUE; - Report(mDetectedCharset, 1.0); + Report(mDetectedCharset, NULL, 1.0); return; } @@ -323,7 +323,9 @@ void nsUniversalDetector::DataEnd() if (proberConfidence > MINIMUM_THRESHOLD) /* Only report what we are confident in. 
*/ - Report(mCharSetProbers[i]->GetCharSetName(), proberConfidence); + Report(mCharSetProbers[i]->GetCharSetName(), + mCharSetProbers[i]->GetLanguage(), + proberConfidence); } } } diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h index eecdea6..702a9fe 100644 --- a/src/nsUniversalDetector.h +++ b/src/nsUniversalDetector.h @@ -69,7 +69,8 @@ public: virtual void DataEnd(void); protected: - virtual void Report(const char* aCharset, + virtual void Report(const char *encoding, + const char *language, float confidence) = 0; virtual void Reset(); nsInputState mInputState; diff --git a/src/symbols.cmake b/src/symbols.cmake index a6690ff..e66bfa0 100644 --- a/src/symbols.cmake +++ b/src/symbols.cmake @@ -9,6 +9,7 @@ set( uchardet_get_candidates uchardet_get_encoding uchardet_get_confidence + uchardet_get_language ) set (LINK_FLAGS "") diff --git a/src/uchardet.cpp b/src/uchardet.cpp index f5391ea..19a73f0 100644 --- a/src/uchardet.cpp +++ b/src/uchardet.cpp @@ -65,6 +65,7 @@ public: } virtual void Report(const char *encoding, + const char *language, float confidence) { std::vector<UChardetCandidate>::iterator it; @@ -72,7 +73,8 @@ public: for (it = candidates.begin(); it != candidates.end(); it++) { - if (strcmp(it->encoding, encoding) == 0) + if (strcmp(it->encoding, encoding) == 0 && + it->language && language && strcmp(it->language, language) == 0) { /* Already reported. Bail out or update the confidence * when needed. @@ -91,6 +93,7 @@ public: candidate = UChardetCandidate(); candidate.encoding = strdup(encoding); + candidate.language = language ? strdup(language) : NULL; candidate.confidence = confidence; for (it = candidates.begin(); it != candidates.end(); it++) @@ -107,7 +110,11 @@ public: nsUniversalDetector::Reset(); for (it = candidates.begin(); it != candidates.end(); it++) + { free(it->encoding); + if (it->language) + free(it->language); + } candidates.clear(); } @@ -125,6 +132,12 @@ public: { return (candidates.size() > i) ? 
candidates[i].confidence : 0.0; } + + const char* GetLanguage(size_t i) const + { + return (candidates.size() > i) ? candidates[i].language : NULL; + } + }; uchardet_t uchardet_new(void) @@ -178,3 +191,9 @@ const char * uchardet_get_encoding (uchardet_t ud, { return reinterpret_cast<HandleUniversalDetector*>(ud)->GetCharset(candidate); } + +const char * uchardet_get_language (uchardet_t ud, + size_t candidate) +{ + return reinterpret_cast<HandleUniversalDetector*>(ud)->GetLanguage(candidate); +} diff --git a/src/uchardet.h b/src/uchardet.h index c452a69..df1387e 100644 --- a/src/uchardet.h +++ b/src/uchardet.h @@ -120,6 +120,8 @@ UCHARDET_INTERFACE float uchardet_get_confidence (uchardet_t ud, size_t candidate); UCHARDET_INTERFACE const char * uchardet_get_encoding (uchardet_t ud, size_t candidate); +UCHARDET_INTERFACE const char * uchardet_get_language (uchardet_t ud, + size_t candidate); #ifdef __cplusplus |