summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jehan <jehan@girinstud.io> 2022-11-30 20:58:37 +0100
committer Jehan <jehan@girinstud.io> 2022-12-14 00:24:53 +0100
commit b5b75b81cec2fc0b2c131be60086062a8d3af1af (patch)
tree 9e4df7b45df1cb53df4103dcf24c03aa4dd2eaf1
parent 0be80a21db41321da0a33ffc6b5d272a712cbf6c (diff)
script, src: rebuild the Danish model.
Since IBM865 support landed on the main branch and I rebased onto it, this feature branch for the new API was broken as well.
-rw-r--r-- script/BuildLangModelLogs/LangDanishModel.log | 362
-rw-r--r-- src/LangModels/LangDanishModel.cpp | 196
-rw-r--r-- src/nsSBCSGroupProber.cpp | 4
-rw-r--r-- src/nsSBCSGroupProber.h | 2
4 files changed, 341 insertions, 223 deletions
diff --git a/script/BuildLangModelLogs/LangDanishModel.log b/script/BuildLangModelLogs/LangDanishModel.log
index 2408080..1396f6e 100644
--- a/script/BuildLangModelLogs/LangDanishModel.log
+++ b/script/BuildLangModelLogs/LangDanishModel.log
@@ -1,156 +1,240 @@
= Logs of language model for Danish (da) =
- Generated by BuildLangModel.py
-- Started: 2021-03-16 01:32:17.684746
-- Maximum depth: 4
-- Max number of pages: 100
+- Started: 2022-11-30 20:49:10.182568
+- Maximum depth: 2
+- Max number of pages: 200
== Parsed pages ==
Forside (revision 10000691)
-1. symfoni (Beethoven) (revision 10648993)
-15. marts (revision 8172123)
-1917 (revision 10645384)
-1930 (revision 10645389)
-1940 (revision 10648721)
-1951 (revision 10640371)
-1972 (revision 10641861)
+15. januar (revision 10515606)
+IC4 (revision 11317878)
+VM i fodbold 2022 (mænd) (revision 11344039)
+28. november (revision 9410945)
+Forenede Nationer (revision 11199108)
+Middelaldercentret (revision 11339897)
+Vilhelm Erobreren (revision 11279565)
+Casper & Mandrilaftalen (revision 11221713)
+Nikolaj Lie Kaas (revision 11322663)
+Stig Hoffmeyer (revision 11340274)
+Rock and Roll Hall of Fame (revision 8408189)
+Anwar Ibrahim (revision 11342876)
+Afrikamesterskabet i håndbold 2022 (kvinder) (revision 11341917)
+1940 (revision 11263756)
+1937 (revision 11303923)
+1934 (revision 11224625)
+Danmarksdemokraterne (revision 11335570)
+The Julekalender (revision 11341242)
+Ruslands invasion af Ukraine 2022 (revision 11335164)
+25. november (revision 10378454)
+The Jimi Hendrix Experience (revision 10497780)
+24. november (revision 6877891)
+Vikingetidens rustning og våben (revision 11332607)
+Torben Rechendorff (revision 11342962)
+Thomas Edison (revision 11052704)
+1947 (revision 11252357)
+Eurovision Song Contest 2014 (revision 11333950)
+29. november (revision 6877900)
+Ukraine (revision 11334630)
+1990 (revision 11340072)
+Maurice Norman (revision 11342318)
+Sergej Sjojgu (revision 11309097)
+Færøerne (revision 11333678)
+Fonograf (revision 11032483)
+Folketingsvalget 2022 (revision 11339557)
+Hans Magnus Enzensberger (revision 11341046)
+Moderaterne (revision 11305861)
+Hawaii (revision 11317011)
+Mandan (indfødte amerikanere) (revision 11336303)
+SI-præfiks (revision 11332802)
+Encyklopædi (revision 11315276)
+Storbritannien (revision 11329834)
+1991 (revision 11250037)
+Det Konservative Folkeparti (revision 11313857)
+Wandsworth-skjoldet (revision 11341402)
+Angolas håndboldlandshold (damer) (revision 11331888)
+Shu-bi-dua (revision 11324736)
+1877 (revision 11224901)
+Kon-Tiki (revision 10615971)
+Socialdemokratiet (revision 11325315)
+Donatan (revision 10586146)
+Adolf Hitler (revision 11317375)
+Procent (revision 10764365)
+1. juni (revision 10206137)
+1863 (revision 11081613)
+ISO 3166-1 alpha-3 (revision 11250626)
+Senegals håndboldlandshold (herrer) (revision 8621578)
+Billion (revision 11039345)
+Lørdag (revision 11159889)
+Sachsen (revision 11299889)
+Vestindien (revision 11330329)
+Folketingsvalget 1988 (revision 10970017)
+Dogme 95 (revision 10973606)
+Encyclopédie (revision 11314734)
+Afrikamesterskabet i håndbold 2018 (mænd) (revision 11131830)
+Mew (revision 11308840)
2. marts (revision 9423344)
-2003 (revision 10654209)
-44 f.Kr. (revision 7242128)
-7. marts (revision 9423388)
-9. marts (revision 10601197)
-Abdikation (revision 10197388)
-Afsnit af Badehotellet (revision 10654331)
-Agnes Slott-Møller (revision 10648962)
-Australian Open-mesterskabet i damesingle 2021 (revision 10630904)
-Australian Open-mesterskabet i herresingle 2021 (revision 10630887)
-Australian Open 2021 (revision 10630544)
-Casper & Mandrilaftalen (revision 10444147)
-Coronaviruspandemien (revision 10652415)
-Cykling under sommer-OL 2012 – Linjeløb (kvinder) (revision 10651872)
-Dansk (sprog) (revision 10633727)
-Den danske Treårsekspedition til Østgrønland 1931-34 (revision 10654093)
-Dnepr (revision 10635465)
-Donald Trump (revision 10653185)
-Døde i 2021 (revision 10653976)
-Encyklopædi (revision 10590147)
-Eurovision Song Contest 2014 (revision 10592331)
-Folkerepublikken Kina (revision 10634829)
-Folketinget (revision 10643927)
-Fram-ekspeditionen 1910-1912 (revision 10630146)
-Frankrig (revision 10648749)
-Frankrigs præsidenter (revision 10477099)
-Geologi (revision 10631000)
-Geoteknik (revision 10603548)
-Greater London (revision 10380043)
-Hortus Botanicus Amsterdam (revision 8854568)
-Hu Jintao (revision 10610855)
-IC4 (revision 10577458)
-Idus martius (revision 10652897)
-Inger Støjberg (revision 10643259)
-Italiens premierministre (revision 10625575)
-John Polkinghorne (revision 10654447)
-Julius Cæsar (revision 10653812)
-Korruption (revision 10401686)
-Lars Göran Petrov (revision 10650013)
-London Underground (revision 10635531)
-Marge Simpson (revision 10640942)
-Mario Draghi (revision 10652699)
-Matilde af Skotland (revision 10648200)
-Metrosystemer i verden (revision 10510595)
-Middelaldercentret (revision 10574228)
-Naomi Osaka (revision 10478959)
-Nederlandene (revision 10642742)
-Nicolas Sarkozy (revision 10639376)
-Nikolaj 2. af Rusland (revision 10639924)
-Novak Djokovic (revision 10479710)
-Outlaw Gentlemen & Shady Ladies (revision 10492201)
-Paris-Nice 2021 (revision 10653019)
-Rigsretssagen mod Donald Trump 2021 (revision 10653875)
-Rigsretssagen mod Inger Støjberg (revision 10643260)
-Rusland (revision 10631140)
-Sanja Ilić (revision 10645645)
-Senat (revision 10429780)
-Senatet (USA) (revision 10624834)
-Shu-bi-dua (revision 10630614)
-Svend Johansen (skuespiller) (revision 10643631)
-Tennis (revision 10651841)
-Tommy Troelsen (revision 10648382)
-Træsko (revision 10626215)
-USA's præsidenter (revision 10639768)
-Undergrundsbane (revision 10541653)
-Vilhelm Erobreren (revision 10631208)
-Wikimedia (revision 10260889)
-Wikipedia (revision 10627445)
-Zar (revision 10557166)
-1800 (revision 10645359)
-2. april (revision 9568657)
-Burgtheater (revision 9296862)
-C-dur (revision 10513719)
-Cello (revision 10641506)
-Coda (revision 9298442)
-Dominant (revision 9513277)
-Dynamik (musik) (revision 9504157)
-F-dur (revision 8135200)
-Fagot (revision 10578018)
-Fløjte (revision 10329382)
-Harmonik (revision 10577145)
-International Music Score Library Project (revision 10115839)
-Italienske og franske musikudtryk (revision 10352094)
-Johann Georg Albrechtsberger (revision 10289540)
-Joseph Haydn (revision 10289602)
-Klarinet (revision 10490230)
-Klassicisme (musik) (revision 10436811)
-Kontrabas (revision 10147393)
-Kontrapunkt (musikteori) (revision 10184029)
-Leipzig (revision 10611798)
-Ludwig van Beethoven (revision 10642134)
+Rajon (revision 11185598)
+TheTVDB (revision 10969052)
+Skueproces (revision 11322041)
+New York Times (revision 10236433)
+2006 (revision 11271490)
+Jacinda Ardern (revision 11243495)
+8. maj (revision 9423405)
+7. juni (revision 10287352)
+Ray Charles (revision 10893843)
+Dansk Rock - fra pigtråd til punk (revision 10970784)
+1950'erne (revision 10917112)
+John Wesley Hyatt (revision 9405508)
+Landsdel Hovedstaden (revision 10723037)
+Zar-Rusland (revision 11328111)
+1816 (revision 11198312)
+Engelsk litteratur (revision 10817139)
+22. november (revision 10203064)
+Maj (revision 11288718)
+Progressiv rock (revision 11259601)
+Maurice Setters (revision 10936371)
+Minkkommissionen (revision 11337058)
+Ragnhild Hveger (revision 11072132)
+1961 (revision 11224941)
+Montenegro (revision 11340028)
+Socialkonservatisme (revision 8745187)
+TV 2 (revision 11339141)
+7. februar (revision 9423377)
+Ar (enhed) (revision 11309905)
+1881 (revision 11144791)
+Etiopisk kalender (revision 9931290)
+Ethelbert Nevin (revision 10591854)
+The Moscow Times (revision 11329355)
+1960'erne (revision 11261802)
+15. november (revision 6877873)
+Politikens Forlag (revision 11322941)
+Island (revision 11219029)
+Danmark (revision 11313400)
+Det Kongelige Teater (revision 11319106)
+20. juni (revision 10232768)
+VM i fodbold 1958 (revision 11014260)
+Næste folketingsvalg (revision 11338101)
+Virtual International Authority File (revision 8702589)
+Marmor (revision 11309004)
+Oslo (revision 11290885)
+1938 (revision 11336099)
+Frie Grønne (revision 11294501)
+Lottorp (revision 11223312)
+1931 (revision 11236350)
+1930 (revision 11252037)
+Albanien (revision 11309379)
+Holger Begtrup (revision 10289352)
+1887 (revision 11250123)
+Kristen Helveg Petersen (revision 10505239)
+Benito Mussolini (revision 11311831)
+Tamilrapporten (revision 10672604)
+Internationale Valutafond (revision 10871884)
+Ron Flowers (revision 10999963)
+Scud-missil (revision 11072276)
+1860'erne (revision 8151963)
+11. november (revision 10903885)
+10. november (revision 9286344)
+1697 (revision 10865232)
+Det Humanistiske Parti (revision 10898925)
+1998 (revision 11342743)
+Centrum-Demokraterne (revision 11201902)
+Præstens Urskov (revision 10261164)
+Kraghave (Tingsted Sogn) (revision 11124871)
+Burkina Faso (revision 11309150)
+Johannes Peter Frederik Königsfeldt (revision 10942128)
+John Bardeen (revision 10622362)
+Retsforbundet (revision 11333888)
+Mykolaiv oblast (revision 11215109)
+Folketingsvalget 1932 (revision 10529645)
+Atassut (revision 11250468)
+1780 (revision 10879041)
+Pokalvindernes Europa Cup (revision 10533322)
+Harmonium (revision 10648166)
+Litra MA (revision 10707516)
+14. oktober (revision 9764309)
+Letland i Eurovision Song Contest (revision 11273114)
+Den røde tråd (sang) (revision 11117198)
+Peter A.G. Nielsen (revision 11311663)
+Internationalt Standardbognummer (revision 11037702)
+Denys Sjmyhal (revision 11184932)
+Souvenir (revision 10530474)
+Kristendemokraterne (revision 11310458)
+Edward Gibbon (revision 11316150)
+19. november (revision 10910432)
+Aarhus Hovedbanegård (revision 11254458)
+Grækere (revision 11277065)
+Moderaterna (revision 11275745)
+Margrethe 2. (revision 11264709)
+1978 (revision 11340075)
+Demokratiske Republik Congos håndboldlandshold (damer) (revision 11330801)
+Philip af Storbritannien (revision 11307679)
+21. århundrede (revision 9838559)
+Jørgen Christensen (handelsminister) (revision 9548745)
+Holger Juul Hansen (revision 11316843)
+Fodboldspiller (revision 11234361)
+Parliamo italiano (revision 11322505)
+Borgerlig (revision 10930991)
+Mail (revision 10885336)
+Disko (revision 10767773)
+Tunesiens fodboldlandshold (revision 11334411)
+6. december (revision 10378463)
+Erhvervspartiet (1978-79) (revision 8449157)
+Sovjetunionen (revision 11333771)
+1567 (revision 10818742)
+1875 (revision 11198318)
+Hubble-teleskopet (revision 11304842)
+Hærulfstenen (revision 11317806)
+Frankrig (revision 11235194)
+Coney Island (revision 11211594)
+1952 (revision 11243498)
== End of Parsed pages ==
-- Wikipedia parsing ended at: 2021-03-16 01:36:49.098009
+- Wikipedia parsing ended at: 2022-11-30 20:52:37.002648
-57 characters appeared 1058523 times.
+63 characters appeared 1374958 times.
-First 30 characters:
-[ 0] Char e: 15.118707859914238 %
-[ 1] Char r: 8.552388564065213 %
-[ 2] Char n: 7.6833474567864855 %
-[ 3] Char t: 7.125305732610439 %
-[ 4] Char a: 6.351302711419591 %
-[ 5] Char i: 6.265806222443915 %
-[ 6] Char s: 6.152629654716997 %
-[ 7] Char d: 5.90341447469729 %
-[ 8] Char o: 5.144999211164992 %
-[ 9] Char l: 5.1253491893893655 %
-[10] Char g: 3.907992551885977 %
-[11] Char m: 3.3046990948708723 %
-[12] Char k: 3.0474538578755492 %
-[13] Char f: 2.586434116216653 %
-[14] Char v: 2.2680659749481116 %
-[15] Char u: 1.9654745338551927 %
-[16] Char b: 1.7524418458550264 %
-[17] Char p: 1.6338804163915193 %
-[18] Char h: 1.5844719481768466 %
-[19] Char ø: 0.7598323324103491 %
-[20] Char æ: 0.7542585281566863 %
-[21] Char å: 0.728278932059105 %
-[22] Char y: 0.6751860847615027 %
-[23] Char c: 0.6527963964883143 %
-[24] Char j: 0.5847770903419198 %
-[25] Char w: 0.17241004682940286 %
-[26] Char z: 0.0783166733268904 %
-[27] Char x: 0.05602145631223884 %
-[28] Char é: 0.019177665482941794 %
-[29] Char q: 0.016626941502452003 %
+Most Frequent characters:
+[ 0] Char e: 14.79056087531401 %
+[ 1] Char r: 8.641427592697378 %
+[ 2] Char n: 7.613105273033795 %
+[ 3] Char t: 6.915483963873806 %
+[ 4] Char a: 6.583692010955971 %
+[ 5] Char i: 6.462524673480935 %
+[ 6] Char s: 6.347902990491345 %
+[ 7] Char d: 5.849924143137463 %
+[ 8] Char l: 5.1523755634717565 %
+[ 9] Char o: 4.9496784629057755 %
+[10] Char g: 3.827389636628901 %
+[11] Char m: 3.251226582921078 %
+[12] Char k: 3.2378443559730554 %
+[13] Char f: 2.605170485207548 %
+[14] Char v: 2.205303725641074 %
+[15] Char u: 1.978242244490377 %
+[16] Char b: 1.8278376503136822 %
+[17] Char p: 1.5923395478261881 %
+[18] Char h: 1.5512473835564433 %
+[19] Char ø: 0.88409973250092 %
+[20] Char æ: 0.7078761678538544 %
+[21] Char å: 0.7005304889312983 %
+[22] Char y: 0.6576200873044848 %
+[23] Char c: 0.648019794059164 %
+[24] Char j: 0.646928851644923 %
+[25] Char w: 0.14465896412835882 %
+[26] Char z: 0.06814753614292218 %
+[27] Char x: 0.03643747663564996 %
+[28] Char é: 0.020946094353427522 %
+[29] Char ó: 0.013600415430871343 %
+[30] Char q: 0.013018579476609468 %
-The first 30 characters have an accumulated ratio of 0.9997184756495605.
+The first 31 characters have an accumulated ratio of 0.9992516135038306.
-936 sequences found.
+1079 sequences found.
-First 512 (typical positive ratio): 0.9962304038307248
-Next 512 (512-1024): 0.007598323324103491
-Rest: -5.2909066017292616e-17
+First 508 (typical positive ratio): 0.995012453333286
+Next 198 (706-508): 0.003993410296057376
+Rest: 0.0009941363706565953
-- Processing end: 2021-03-16 01:36:49.182013
+- Processing end: 2022-11-30 20:52:37.084319
diff --git a/src/LangModels/LangDanishModel.cpp b/src/LangModels/LangDanishModel.cpp
index 1ce75cb..0abdd97 100644
--- a/src/LangModels/LangDanishModel.cpp
+++ b/src/LangModels/LangDanishModel.cpp
@@ -42,7 +42,7 @@
/**
* Generated by BuildLangModel.py
- * On: 2021-03-16 01:36:49.098484
+ * On: 2022-11-30 20:52:37.003457
**/
/* Character Mapping Table:
@@ -68,18 +68,18 @@ static const unsigned char Iso_8859_15_CharToOrderMap[] =
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
- SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 4X */
- 17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
- SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 6X */
- 17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
- SYM,SYM,SYM,SYM,SYM,SYM, 40,SYM, 40,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
- SYM,SYM,SYM,SYM, 56, 52,SYM,SYM, 56,SYM,SYM,SYM, 57, 58, 59,SYM, /* BX */
- 41, 32, 48, 60, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* CX */
- 49, 54, 47, 35, 42, 61, 30,SYM, 19, 55, 38, 62, 31, 51, 50, 44, /* DX */
- 41, 32, 48, 63, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* EX */
- 49, 54, 47, 35, 42, 64, 30,SYM, 19, 55, 38, 65, 31, 51, 50, 66, /* FX */
+ SYM,SYM,SYM,SYM,SYM,SYM, 44,SYM, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
+ SYM,SYM,SYM,SYM, 55, 56,SYM,SYM, 55,SYM,SYM,SYM, 63, 64, 65,SYM, /* BX */
+ 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 66, 35, 67, 62, /* CX */
+ 32, 49, 60, 29, 48, 68, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 45, /* DX */
+ 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 69, 35, 70, 62, /* EX */
+ 32, 49, 60, 29, 48, 71, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 72, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
@@ -89,18 +89,18 @@ static const unsigned char Iso_8859_1_CharToOrderMap[] =
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
- SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 4X */
- 17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
- SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 6X */
- 17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
- SYM,SYM,SYM,SYM,SYM, 52,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
- 41, 32, 48, 67, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* CX */
- 49, 54, 47, 35, 42, 68, 30,SYM, 19, 55, 38, 69, 31, 51, 50, 44, /* DX */
- 41, 32, 48, 70, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* EX */
- 49, 54, 47, 35, 42, 71, 30,SYM, 19, 55, 38, 72, 31, 51, 50, 73, /* FX */
+ SYM,SYM,SYM,SYM,SYM, 56,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 73, 35, 74, 62, /* CX */
+ 32, 49, 60, 29, 48, 75, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 45, /* DX */
+ 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 76, 35, 77, 62, /* EX */
+ 32, 49, 60, 29, 48, 78, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 79, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
@@ -110,74 +110,97 @@ static const unsigned char Windows_1252_CharToOrderMap[] =
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
- SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 4X */
- 17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
- SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 6X */
- 17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
- SYM,ILL,SYM, 74,SYM,SYM,SYM,SYM,SYM,SYM, 40,SYM, 75,ILL, 56,ILL, /* 8X */
- ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 40,SYM, 76,ILL, 56, 77, /* 9X */
+ SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ SYM,ILL,SYM, 80,SYM,SYM,SYM,SYM,SYM,SYM, 44,SYM, 81,ILL, 55,ILL, /* 8X */
+ ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 44,SYM, 82,ILL, 55, 83, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
- SYM,SYM,SYM,SYM,SYM, 52,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
- 41, 32, 48, 78, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* CX */
- 49, 54, 47, 35, 42, 79, 30,SYM, 19, 55, 38, 80, 31, 51, 50, 44, /* DX */
- 41, 32, 48, 81, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* EX */
- 49, 54, 47, 35, 42, 82, 30,SYM, 19, 55, 38, 83, 31, 51, 50, 84, /* FX */
+ SYM,SYM,SYM,SYM,SYM, 56,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 84, 35, 85, 62, /* CX */
+ 32, 49, 60, 29, 48, 86, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 45, /* DX */
+ 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 87, 35, 88, 62, /* EX */
+ 32, 49, 60, 29, 48, 89, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 90, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
-static const int Unicode_Char_size = 60;
+static const unsigned char Ibm865_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */
+ 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ 40, 31, 28, 52, 38, 46, 21, 40, 43, 36, 39, 62, 91, 92, 38, 21, /* 8X */
+ 28, 20, 20, 48, 33, 60, 59, 61, 93, 33, 31, 19,SYM, 19,SYM,SYM, /* 9X */
+ 34, 35, 29, 37, 49, 49,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* DX */
+ 47, 45, 94, 54, 57, 57, 56, 58, 50, 95, 96, 97, 98, 50, 51,SYM, /* EX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const int Unicode_Char_size = 62;
static const unsigned int Unicode_CharOrder[] =
{
65, 4, 66, 16, 67, 23, 68, 7, 69, 0, 70, 13, 71, 10, 72, 18,
- 73, 5, 74, 24, 75, 12, 76, 9, 77, 11, 78, 2, 79, 8, 80, 17,
- 81, 29, 82, 1, 83, 6, 84, 3, 85, 15, 86, 14, 87, 25, 88, 27,
+ 73, 5, 74, 24, 75, 12, 76, 8, 77, 11, 78, 2, 79, 9, 80, 17,
+ 81, 30, 82, 1, 83, 6, 84, 3, 85, 15, 86, 14, 87, 25, 88, 27,
89, 22, 90, 26, 97, 4, 98, 16, 99, 23, 100, 7, 101, 0,102, 13,
- 103, 10, 104, 18, 105, 5, 106, 24, 107, 12, 108, 9, 109, 11,110, 2,
- 111, 8, 112, 17, 113, 29, 114, 1, 115, 6, 116, 3, 117, 15,118, 14,
- 119, 25, 120, 27, 121, 22, 122, 26, 197, 21, 198, 20, 201, 28,216, 19,
- 229, 21, 230, 20, 233, 28, 248, 19,
+ 103, 10, 104, 18, 105, 5, 106, 24, 107, 12, 108, 8, 109, 11,110, 2,
+ 111, 9, 112, 17, 113, 30, 114, 1, 115, 6, 116, 3, 117, 15,118, 14,
+ 119, 25, 120, 27, 121, 22, 122, 26, 197, 21, 198, 20, 201, 28,211, 29,
+ 216, 19, 229, 21, 230, 20, 233, 28, 243, 29, 248, 19,
};
/* Model Table:
- * Total sequences: 936
- * First 512 sequences: 0.9962304038307248
- * Next 512 sequences (512-1024): 0.003769596169275244
- * Rest: -5.2909066017292616e-17
+ * Total considered sequences: 1079 / 961
+ * - Positive sequences: first 508 (0.995012453333286)
+ * - Probable sequences: next 198 (706-508) (0.003993410296057376)
+ * - Neutral sequences: last 255 (0.0009941363706565953)
+ * - Negative sequences: -118 (off-ratio)
* Negative sequences: TODO
*/
static const PRUint8 DanishLangModel[] =
{
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,3,3,2,3,0,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,
- 3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,0,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,0,3,3,3,3,3,3,0,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,2,3,3,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,2,0,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,3,3,3,3,3,2,0,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,3,2,3,2,2,0,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,0,2,0,
- 3,3,3,3,3,3,3,2,3,3,2,3,3,2,3,3,2,2,3,3,3,3,3,2,2,2,0,0,2,0,
- 3,3,2,3,3,3,3,2,3,3,3,2,2,3,3,3,3,2,3,3,3,3,3,2,3,2,0,3,2,0,
- 3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,3,3,2,3,2,3,3,2,0,3,0,2,0,0,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,2,0,2,3,2,2,2,2,0,2,
- 3,3,3,3,3,3,3,3,3,3,0,2,2,2,0,3,3,2,2,3,3,3,3,2,3,2,2,0,2,0,
- 3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,3,2,3,3,2,2,2,0,2,0,2,0,
- 3,3,3,3,3,3,3,2,3,2,2,3,2,2,3,3,2,2,2,3,3,3,3,2,3,2,0,0,2,0,
- 3,3,3,3,2,2,3,3,0,3,3,3,3,3,3,2,3,2,2,0,0,0,2,2,3,0,0,0,0,0,
- 2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,2,2,0,0,0,0,2,0,0,0,0,0,0,
- 3,3,3,3,0,0,3,3,2,3,2,2,3,2,3,0,3,2,2,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,2,3,3,2,2,0,0,0,2,2,2,2,0,0,0,
- 3,3,2,3,3,3,2,2,3,3,2,2,3,2,2,3,2,2,3,0,3,0,3,3,0,0,2,0,2,2,
- 3,3,2,3,3,3,3,3,3,3,2,2,2,2,2,3,3,2,2,3,3,2,3,0,0,2,2,0,2,0,
- 3,2,2,2,3,3,2,2,3,2,0,2,2,2,0,2,2,0,3,0,2,0,2,2,0,2,0,0,0,0,
- 3,2,2,2,3,3,2,2,3,0,2,2,2,0,2,2,2,2,2,0,0,0,2,2,2,2,2,2,0,0,
- 3,2,2,2,3,3,2,0,2,2,0,0,0,2,2,2,2,2,0,0,0,0,0,2,0,2,0,2,0,0,
- 2,2,3,2,2,0,2,2,2,2,2,2,0,0,2,0,2,2,0,0,0,0,0,2,0,0,2,0,0,0,
- 0,2,0,0,2,2,0,2,2,2,0,0,2,2,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,1,2,3,3,3,3,2,3,1,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,1,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,2,2,1,
+ 3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,2,3,0,
+ 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,0,1,0,3,3,3,3,3,3,0,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,1,3,3,1,3,3,1,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,1,1,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,1,0,2,2,1,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,1,0,2,1,1,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,1,3,3,3,3,2,2,1,0,1,
+ 3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,3,3,3,3,2,3,2,0,0,1,1,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,1,1,1,2,1,0,
+ 3,3,3,3,3,3,3,2,3,3,2,3,3,2,3,3,2,2,3,3,3,3,3,1,3,1,1,1,1,1,0,
+ 3,3,3,3,3,3,3,2,3,3,3,2,2,3,2,3,2,2,3,3,3,3,3,2,3,1,1,2,1,1,0,
+ 3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,3,2,2,2,3,3,3,2,1,3,0,0,0,1,0,0,
+ 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,1,1,0,2,3,2,2,2,3,1,0,0,
+ 3,3,3,3,3,3,3,2,3,3,0,2,1,1,1,3,3,1,2,3,3,3,3,2,3,1,1,0,1,2,0,
+ 3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,3,2,3,3,3,1,1,0,1,0,2,1,0,
+ 3,3,3,3,3,3,3,2,3,3,1,3,2,2,3,3,1,1,2,3,3,3,3,1,3,3,0,1,1,1,2,
+ 3,3,3,3,1,2,3,3,3,1,3,3,3,2,3,1,3,2,1,0,0,0,2,0,3,0,0,0,0,0,0,
+ 2,3,3,3,1,3,3,3,3,3,3,3,3,3,3,2,3,2,1,0,0,0,0,2,0,0,0,0,0,0,0,
+ 3,3,3,3,0,0,3,3,3,1,2,1,3,2,3,0,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,2,3,3,2,1,0,0,0,2,2,1,2,1,0,0,0,
+ 3,3,1,3,3,3,2,3,3,3,1,2,3,1,1,3,1,1,3,0,1,0,3,3,1,1,1,0,1,0,1,
+ 3,3,3,3,3,3,3,3,3,3,2,1,2,1,1,3,2,1,2,3,3,1,3,0,0,0,0,0,1,2,0,
+ 3,2,2,2,3,3,2,1,2,3,0,1,1,1,0,2,2,0,2,0,1,0,1,1,1,2,1,0,0,0,0,
+ 3,1,1,1,3,3,1,1,1,3,1,2,1,1,0,2,1,1,1,0,0,0,2,1,2,1,3,0,0,0,0,
+ 2,1,1,1,2,3,1,1,2,2,0,0,1,1,1,1,1,2,2,1,0,0,1,2,0,1,0,1,0,0,0,
+ 2,2,3,2,1,0,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0,1,0,0,0,0,
+ 0,3,2,2,1,0,1,0,2,0,1,0,2,0,1,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,
+ 2,0,0,0,2,2,0,0,0,0,0,0,0,1,1,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,
};
@@ -185,8 +208,8 @@ const SequenceModel Iso_8859_15DanishModel =
{
Iso_8859_15_CharToOrderMap,
DanishLangModel,
- 30,
- (float)0.9962304038307248,
+ 31,
+ (float)0.9990058636293434,
PR_TRUE,
"ISO-8859-15",
"da"
@@ -196,8 +219,8 @@ const SequenceModel Iso_8859_1DanishModel =
{
Iso_8859_1_CharToOrderMap,
DanishLangModel,
- 30,
- (float)0.9962304038307248,
+ 31,
+ (float)0.9990058636293434,
PR_TRUE,
"ISO-8859-1",
"da"
@@ -207,19 +230,30 @@ const SequenceModel Windows_1252DanishModel =
{
Windows_1252_CharToOrderMap,
DanishLangModel,
- 30,
- (float)0.9962304038307248,
+ 31,
+ (float)0.9990058636293434,
PR_TRUE,
"WINDOWS-1252",
"da"
};
+const SequenceModel Ibm865DanishModel =
+{
+ Ibm865_CharToOrderMap,
+ DanishLangModel,
+ 31,
+ (float)0.9990058636293434,
+ PR_TRUE,
+ "IBM865",
+ "da"
+};
+
const LanguageModel DanishModel =
{
"da",
Unicode_CharOrder,
- 60,
+ 62,
DanishLangModel,
- 30,
- (float)0.9962304038307248,
+ 31,
+ (float)0.9992516135038306,
};
diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp
index fd4f2d6..ef8da36 100644
--- a/src/nsSBCSGroupProber.cpp
+++ b/src/nsSBCSGroupProber.cpp
@@ -197,8 +197,8 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[103] = new nsSingleByteCharSetProber(&Windows_1252NorwegianModel);
mProbers[104] = new nsSingleByteCharSetProber(&Ibm865NorwegianModel);
- mProbers[104] = new nsSingleByteCharSetProber(&Iso_8859_1EnglishModel);
- mProbers[105] = new nsSingleByteCharSetProber(&Windows_1252EnglishModel);
+ mProbers[105] = new nsSingleByteCharSetProber(&Iso_8859_1EnglishModel);
+ mProbers[106] = new nsSingleByteCharSetProber(&Windows_1252EnglishModel);
Reset();
}
diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h
index a68a2a4..1c473f2 100644
--- a/src/nsSBCSGroupProber.h
+++ b/src/nsSBCSGroupProber.h
@@ -40,7 +40,7 @@
#define nsSBCSGroupProber_h__
-#define NUM_OF_SBCS_PROBERS 106
+#define NUM_OF_SBCS_PROBERS 107
class nsCharSetProber;
class nsSBCSGroupProber: public nsCharSetProber {