diff options
author | Jehan <jehan@girinstud.io> | 2022-11-30 20:58:37 +0100 |
---|---|---|
committer | Jehan <jehan@girinstud.io> | 2022-12-14 00:24:53 +0100 |
commit | b5b75b81cec2fc0b2c131be60086062a8d3af1af (patch) | |
tree | 9e4df7b45df1cb53df4103dcf24c03aa4dd2eaf1 | |
parent | 0be80a21db41321da0a33ffc6b5d272a712cbf6c (diff) |
script, src: rebuild the Danish model.
Now that it has IBM865 support on the main branch and I have rebased,
this feature branch for the new API was broken as well.
-rw-r--r-- | script/BuildLangModelLogs/LangDanishModel.log | 362 | ||||
-rw-r--r-- | src/LangModels/LangDanishModel.cpp | 196 | ||||
-rw-r--r-- | src/nsSBCSGroupProber.cpp | 4 | ||||
-rw-r--r-- | src/nsSBCSGroupProber.h | 2 |
4 files changed, 341 insertions, 223 deletions
diff --git a/script/BuildLangModelLogs/LangDanishModel.log b/script/BuildLangModelLogs/LangDanishModel.log index 2408080..1396f6e 100644 --- a/script/BuildLangModelLogs/LangDanishModel.log +++ b/script/BuildLangModelLogs/LangDanishModel.log @@ -1,156 +1,240 @@ = Logs of language model for Danish (da) = - Generated by BuildLangModel.py -- Started: 2021-03-16 01:32:17.684746 -- Maximum depth: 4 -- Max number of pages: 100 +- Started: 2022-11-30 20:49:10.182568 +- Maximum depth: 2 +- Max number of pages: 200 == Parsed pages == Forside (revision 10000691) -1. symfoni (Beethoven) (revision 10648993) -15. marts (revision 8172123) -1917 (revision 10645384) -1930 (revision 10645389) -1940 (revision 10648721) -1951 (revision 10640371) -1972 (revision 10641861) +15. januar (revision 10515606) +IC4 (revision 11317878) +VM i fodbold 2022 (mænd) (revision 11344039) +28. november (revision 9410945) +Forenede Nationer (revision 11199108) +Middelaldercentret (revision 11339897) +Vilhelm Erobreren (revision 11279565) +Casper & Mandrilaftalen (revision 11221713) +Nikolaj Lie Kaas (revision 11322663) +Stig Hoffmeyer (revision 11340274) +Rock and Roll Hall of Fame (revision 8408189) +Anwar Ibrahim (revision 11342876) +Afrikamesterskabet i håndbold 2022 (kvinder) (revision 11341917) +1940 (revision 11263756) +1937 (revision 11303923) +1934 (revision 11224625) +Danmarksdemokraterne (revision 11335570) +The Julekalender (revision 11341242) +Ruslands invasion af Ukraine 2022 (revision 11335164) +25. november (revision 10378454) +The Jimi Hendrix Experience (revision 10497780) +24. november (revision 6877891) +Vikingetidens rustning og våben (revision 11332607) +Torben Rechendorff (revision 11342962) +Thomas Edison (revision 11052704) +1947 (revision 11252357) +Eurovision Song Contest 2014 (revision 11333950) +29. 
november (revision 6877900) +Ukraine (revision 11334630) +1990 (revision 11340072) +Maurice Norman (revision 11342318) +Sergej Sjojgu (revision 11309097) +Færøerne (revision 11333678) +Fonograf (revision 11032483) +Folketingsvalget 2022 (revision 11339557) +Hans Magnus Enzensberger (revision 11341046) +Moderaterne (revision 11305861) +Hawaii (revision 11317011) +Mandan (indfødte amerikanere) (revision 11336303) +SI-præfiks (revision 11332802) +Encyklopædi (revision 11315276) +Storbritannien (revision 11329834) +1991 (revision 11250037) +Det Konservative Folkeparti (revision 11313857) +Wandsworth-skjoldet (revision 11341402) +Angolas håndboldlandshold (damer) (revision 11331888) +Shu-bi-dua (revision 11324736) +1877 (revision 11224901) +Kon-Tiki (revision 10615971) +Socialdemokratiet (revision 11325315) +Donatan (revision 10586146) +Adolf Hitler (revision 11317375) +Procent (revision 10764365) +1. juni (revision 10206137) +1863 (revision 11081613) +ISO 3166-1 alpha-3 (revision 11250626) +Senegals håndboldlandshold (herrer) (revision 8621578) +Billion (revision 11039345) +Lørdag (revision 11159889) +Sachsen (revision 11299889) +Vestindien (revision 11330329) +Folketingsvalget 1988 (revision 10970017) +Dogme 95 (revision 10973606) +Encyclopédie (revision 11314734) +Afrikamesterskabet i håndbold 2018 (mænd) (revision 11131830) +Mew (revision 11308840) 2. marts (revision 9423344) -2003 (revision 10654209) -44 f.Kr. (revision 7242128) -7. marts (revision 9423388) -9. 
marts (revision 10601197) -Abdikation (revision 10197388) -Afsnit af Badehotellet (revision 10654331) -Agnes Slott-Møller (revision 10648962) -Australian Open-mesterskabet i damesingle 2021 (revision 10630904) -Australian Open-mesterskabet i herresingle 2021 (revision 10630887) -Australian Open 2021 (revision 10630544) -Casper & Mandrilaftalen (revision 10444147) -Coronaviruspandemien (revision 10652415) -Cykling under sommer-OL 2012 – Linjeløb (kvinder) (revision 10651872) -Dansk (sprog) (revision 10633727) -Den danske Treårsekspedition til Østgrønland 1931-34 (revision 10654093) -Dnepr (revision 10635465) -Donald Trump (revision 10653185) -Døde i 2021 (revision 10653976) -Encyklopædi (revision 10590147) -Eurovision Song Contest 2014 (revision 10592331) -Folkerepublikken Kina (revision 10634829) -Folketinget (revision 10643927) -Fram-ekspeditionen 1910-1912 (revision 10630146) -Frankrig (revision 10648749) -Frankrigs præsidenter (revision 10477099) -Geologi (revision 10631000) -Geoteknik (revision 10603548) -Greater London (revision 10380043) -Hortus Botanicus Amsterdam (revision 8854568) -Hu Jintao (revision 10610855) -IC4 (revision 10577458) -Idus martius (revision 10652897) -Inger Støjberg (revision 10643259) -Italiens premierministre (revision 10625575) -John Polkinghorne (revision 10654447) -Julius Cæsar (revision 10653812) -Korruption (revision 10401686) -Lars Göran Petrov (revision 10650013) -London Underground (revision 10635531) -Marge Simpson (revision 10640942) -Mario Draghi (revision 10652699) -Matilde af Skotland (revision 10648200) -Metrosystemer i verden (revision 10510595) -Middelaldercentret (revision 10574228) -Naomi Osaka (revision 10478959) -Nederlandene (revision 10642742) -Nicolas Sarkozy (revision 10639376) -Nikolaj 2. 
af Rusland (revision 10639924) -Novak Djokovic (revision 10479710) -Outlaw Gentlemen & Shady Ladies (revision 10492201) -Paris-Nice 2021 (revision 10653019) -Rigsretssagen mod Donald Trump 2021 (revision 10653875) -Rigsretssagen mod Inger Støjberg (revision 10643260) -Rusland (revision 10631140) -Sanja Ilić (revision 10645645) -Senat (revision 10429780) -Senatet (USA) (revision 10624834) -Shu-bi-dua (revision 10630614) -Svend Johansen (skuespiller) (revision 10643631) -Tennis (revision 10651841) -Tommy Troelsen (revision 10648382) -Træsko (revision 10626215) -USA's præsidenter (revision 10639768) -Undergrundsbane (revision 10541653) -Vilhelm Erobreren (revision 10631208) -Wikimedia (revision 10260889) -Wikipedia (revision 10627445) -Zar (revision 10557166) -1800 (revision 10645359) -2. april (revision 9568657) -Burgtheater (revision 9296862) -C-dur (revision 10513719) -Cello (revision 10641506) -Coda (revision 9298442) -Dominant (revision 9513277) -Dynamik (musik) (revision 9504157) -F-dur (revision 8135200) -Fagot (revision 10578018) -Fløjte (revision 10329382) -Harmonik (revision 10577145) -International Music Score Library Project (revision 10115839) -Italienske og franske musikudtryk (revision 10352094) -Johann Georg Albrechtsberger (revision 10289540) -Joseph Haydn (revision 10289602) -Klarinet (revision 10490230) -Klassicisme (musik) (revision 10436811) -Kontrabas (revision 10147393) -Kontrapunkt (musikteori) (revision 10184029) -Leipzig (revision 10611798) -Ludwig van Beethoven (revision 10642134) +Rajon (revision 11185598) +TheTVDB (revision 10969052) +Skueproces (revision 11322041) +New York Times (revision 10236433) +2006 (revision 11271490) +Jacinda Ardern (revision 11243495) +8. maj (revision 9423405) +7. 
juni (revision 10287352) +Ray Charles (revision 10893843) +Dansk Rock - fra pigtråd til punk (revision 10970784) +1950'erne (revision 10917112) +John Wesley Hyatt (revision 9405508) +Landsdel Hovedstaden (revision 10723037) +Zar-Rusland (revision 11328111) +1816 (revision 11198312) +Engelsk litteratur (revision 10817139) +22. november (revision 10203064) +Maj (revision 11288718) +Progressiv rock (revision 11259601) +Maurice Setters (revision 10936371) +Minkkommissionen (revision 11337058) +Ragnhild Hveger (revision 11072132) +1961 (revision 11224941) +Montenegro (revision 11340028) +Socialkonservatisme (revision 8745187) +TV 2 (revision 11339141) +7. februar (revision 9423377) +Ar (enhed) (revision 11309905) +1881 (revision 11144791) +Etiopisk kalender (revision 9931290) +Ethelbert Nevin (revision 10591854) +The Moscow Times (revision 11329355) +1960'erne (revision 11261802) +15. november (revision 6877873) +Politikens Forlag (revision 11322941) +Island (revision 11219029) +Danmark (revision 11313400) +Det Kongelige Teater (revision 11319106) +20. juni (revision 10232768) +VM i fodbold 1958 (revision 11014260) +Næste folketingsvalg (revision 11338101) +Virtual International Authority File (revision 8702589) +Marmor (revision 11309004) +Oslo (revision 11290885) +1938 (revision 11336099) +Frie Grønne (revision 11294501) +Lottorp (revision 11223312) +1931 (revision 11236350) +1930 (revision 11252037) +Albanien (revision 11309379) +Holger Begtrup (revision 10289352) +1887 (revision 11250123) +Kristen Helveg Petersen (revision 10505239) +Benito Mussolini (revision 11311831) +Tamilrapporten (revision 10672604) +Internationale Valutafond (revision 10871884) +Ron Flowers (revision 10999963) +Scud-missil (revision 11072276) +1860'erne (revision 8151963) +11. november (revision 10903885) +10. 
november (revision 9286344) +1697 (revision 10865232) +Det Humanistiske Parti (revision 10898925) +1998 (revision 11342743) +Centrum-Demokraterne (revision 11201902) +Præstens Urskov (revision 10261164) +Kraghave (Tingsted Sogn) (revision 11124871) +Burkina Faso (revision 11309150) +Johannes Peter Frederik Königsfeldt (revision 10942128) +John Bardeen (revision 10622362) +Retsforbundet (revision 11333888) +Mykolaiv oblast (revision 11215109) +Folketingsvalget 1932 (revision 10529645) +Atassut (revision 11250468) +1780 (revision 10879041) +Pokalvindernes Europa Cup (revision 10533322) +Harmonium (revision 10648166) +Litra MA (revision 10707516) +14. oktober (revision 9764309) +Letland i Eurovision Song Contest (revision 11273114) +Den røde tråd (sang) (revision 11117198) +Peter A.G. Nielsen (revision 11311663) +Internationalt Standardbognummer (revision 11037702) +Denys Sjmyhal (revision 11184932) +Souvenir (revision 10530474) +Kristendemokraterne (revision 11310458) +Edward Gibbon (revision 11316150) +19. november (revision 10910432) +Aarhus Hovedbanegård (revision 11254458) +Grækere (revision 11277065) +Moderaterna (revision 11275745) +Margrethe 2. (revision 11264709) +1978 (revision 11340075) +Demokratiske Republik Congos håndboldlandshold (damer) (revision 11330801) +Philip af Storbritannien (revision 11307679) +21. århundrede (revision 9838559) +Jørgen Christensen (handelsminister) (revision 9548745) +Holger Juul Hansen (revision 11316843) +Fodboldspiller (revision 11234361) +Parliamo italiano (revision 11322505) +Borgerlig (revision 10930991) +Mail (revision 10885336) +Disko (revision 10767773) +Tunesiens fodboldlandshold (revision 11334411) +6. 
december (revision 10378463) +Erhvervspartiet (1978-79) (revision 8449157) +Sovjetunionen (revision 11333771) +1567 (revision 10818742) +1875 (revision 11198318) +Hubble-teleskopet (revision 11304842) +Hærulfstenen (revision 11317806) +Frankrig (revision 11235194) +Coney Island (revision 11211594) +1952 (revision 11243498) == End of Parsed pages == -- Wikipedia parsing ended at: 2021-03-16 01:36:49.098009 +- Wikipedia parsing ended at: 2022-11-30 20:52:37.002648 -57 characters appeared 1058523 times. +63 characters appeared 1374958 times. -First 30 characters: -[ 0] Char e: 15.118707859914238 % -[ 1] Char r: 8.552388564065213 % -[ 2] Char n: 7.6833474567864855 % -[ 3] Char t: 7.125305732610439 % -[ 4] Char a: 6.351302711419591 % -[ 5] Char i: 6.265806222443915 % -[ 6] Char s: 6.152629654716997 % -[ 7] Char d: 5.90341447469729 % -[ 8] Char o: 5.144999211164992 % -[ 9] Char l: 5.1253491893893655 % -[10] Char g: 3.907992551885977 % -[11] Char m: 3.3046990948708723 % -[12] Char k: 3.0474538578755492 % -[13] Char f: 2.586434116216653 % -[14] Char v: 2.2680659749481116 % -[15] Char u: 1.9654745338551927 % -[16] Char b: 1.7524418458550264 % -[17] Char p: 1.6338804163915193 % -[18] Char h: 1.5844719481768466 % -[19] Char ø: 0.7598323324103491 % -[20] Char æ: 0.7542585281566863 % -[21] Char å: 0.728278932059105 % -[22] Char y: 0.6751860847615027 % -[23] Char c: 0.6527963964883143 % -[24] Char j: 0.5847770903419198 % -[25] Char w: 0.17241004682940286 % -[26] Char z: 0.0783166733268904 % -[27] Char x: 0.05602145631223884 % -[28] Char é: 0.019177665482941794 % -[29] Char q: 0.016626941502452003 % +Most Frequent characters: +[ 0] Char e: 14.79056087531401 % +[ 1] Char r: 8.641427592697378 % +[ 2] Char n: 7.613105273033795 % +[ 3] Char t: 6.915483963873806 % +[ 4] Char a: 6.583692010955971 % +[ 5] Char i: 6.462524673480935 % +[ 6] Char s: 6.347902990491345 % +[ 7] Char d: 5.849924143137463 % +[ 8] Char l: 5.1523755634717565 % +[ 9] Char o: 4.9496784629057755 % +[10] Char g: 
3.827389636628901 % +[11] Char m: 3.251226582921078 % +[12] Char k: 3.2378443559730554 % +[13] Char f: 2.605170485207548 % +[14] Char v: 2.205303725641074 % +[15] Char u: 1.978242244490377 % +[16] Char b: 1.8278376503136822 % +[17] Char p: 1.5923395478261881 % +[18] Char h: 1.5512473835564433 % +[19] Char ø: 0.88409973250092 % +[20] Char æ: 0.7078761678538544 % +[21] Char å: 0.7005304889312983 % +[22] Char y: 0.6576200873044848 % +[23] Char c: 0.648019794059164 % +[24] Char j: 0.646928851644923 % +[25] Char w: 0.14465896412835882 % +[26] Char z: 0.06814753614292218 % +[27] Char x: 0.03643747663564996 % +[28] Char é: 0.020946094353427522 % +[29] Char ó: 0.013600415430871343 % +[30] Char q: 0.013018579476609468 % -The first 30 characters have an accumulated ratio of 0.9997184756495605. +The first 31 characters have an accumulated ratio of 0.9992516135038306. -936 sequences found. +1079 sequences found. -First 512 (typical positive ratio): 0.9962304038307248 -Next 512 (512-1024): 0.007598323324103491 -Rest: -5.2909066017292616e-17 +First 508 (typical positive ratio): 0.995012453333286 +Next 198 (706-508): 0.003993410296057376 +Rest: 0.0009941363706565953 -- Processing end: 2021-03-16 01:36:49.182013 +- Processing end: 2022-11-30 20:52:37.084319 diff --git a/src/LangModels/LangDanishModel.cpp b/src/LangModels/LangDanishModel.cpp index 1ce75cb..0abdd97 100644 --- a/src/LangModels/LangDanishModel.cpp +++ b/src/LangModels/LangDanishModel.cpp @@ -42,7 +42,7 @@ /** * Generated by BuildLangModel.py - * On: 2021-03-16 01:36:49.098484 + * On: 2022-11-30 20:52:37.003457 **/ /* Character Mapping Table: @@ -68,18 +68,18 @@ static const unsigned char Iso_8859_15_CharToOrderMap[] = CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ - SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 4X */ - 17, 29, 1, 
6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ - SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 6X */ - 17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ - SYM,SYM,SYM,SYM,SYM,SYM, 40,SYM, 40,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ - SYM,SYM,SYM,SYM, 56, 52,SYM,SYM, 56,SYM,SYM,SYM, 57, 58, 59,SYM, /* BX */ - 41, 32, 48, 60, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* CX */ - 49, 54, 47, 35, 42, 61, 30,SYM, 19, 55, 38, 62, 31, 51, 50, 44, /* DX */ - 41, 32, 48, 63, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* EX */ - 49, 54, 47, 35, 42, 64, 30,SYM, 19, 55, 38, 65, 31, 51, 50, 66, /* FX */ + SYM,SYM,SYM,SYM,SYM,SYM, 44,SYM, 44,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM, 55, 56,SYM,SYM, 55,SYM,SYM,SYM, 63, 64, 65,SYM, /* BX */ + 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 66, 35, 67, 62, /* CX */ + 32, 49, 60, 29, 48, 68, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 45, /* DX */ + 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 69, 35, 70, 62, /* EX */ + 32, 49, 60, 29, 48, 71, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 72, /* FX */ }; /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ @@ -89,18 +89,18 @@ static const unsigned char Iso_8859_1_CharToOrderMap[] = CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ - SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 4X */ - 17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 
26,SYM,SYM,SYM,SYM,SYM, /* 5X */ - SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 6X */ - 17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ - SYM,SYM,SYM,SYM,SYM, 52,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ - 41, 32, 48, 67, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* CX */ - 49, 54, 47, 35, 42, 68, 30,SYM, 19, 55, 38, 69, 31, 51, 50, 44, /* DX */ - 41, 32, 48, 70, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* EX */ - 49, 54, 47, 35, 42, 71, 30,SYM, 19, 55, 38, 72, 31, 51, 50, 73, /* FX */ + SYM,SYM,SYM,SYM,SYM, 56,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 73, 35, 74, 62, /* CX */ + 32, 49, 60, 29, 48, 75, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 45, /* DX */ + 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 76, 35, 77, 62, /* EX */ + 32, 49, 60, 29, 48, 78, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 79, /* FX */ }; /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ @@ -110,74 +110,97 @@ static const unsigned char Windows_1252_CharToOrderMap[] = CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ - SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 4X */ - 17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ - SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 9, 11, 2, 8, /* 6X */ 
- 17, 29, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ - SYM,ILL,SYM, 74,SYM,SYM,SYM,SYM,SYM,SYM, 40,SYM, 75,ILL, 56,ILL, /* 8X */ - ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 40,SYM, 76,ILL, 56, 77, /* 9X */ + SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 4X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 80,SYM,SYM,SYM,SYM,SYM,SYM, 44,SYM, 81,ILL, 55,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 44,SYM, 82,ILL, 55, 83, /* 9X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ - SYM,SYM,SYM,SYM,SYM, 52,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ - 41, 32, 48, 78, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* CX */ - 49, 54, 47, 35, 42, 79, 30,SYM, 19, 55, 38, 80, 31, 51, 50, 44, /* DX */ - 41, 32, 48, 81, 33, 21, 20, 37, 34, 28, 39, 46, 43, 36, 53, 45, /* EX */ - 49, 54, 47, 35, 42, 82, 30,SYM, 19, 55, 38, 83, 31, 51, 50, 84, /* FX */ + SYM,SYM,SYM,SYM,SYM, 56,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 84, 35, 85, 62, /* CX */ + 32, 49, 60, 29, 48, 86, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 45, /* DX */ + 46, 34, 52, 53, 38, 21, 20, 40, 39, 28, 43, 36, 87, 35, 88, 62, /* EX */ + 32, 49, 60, 29, 48, 89, 33,SYM, 19, 61, 37, 59, 31, 42, 41, 90, /* FX */ }; /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ -static const int Unicode_Char_size = 60; +static const unsigned char Ibm865_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 
9, /* 4X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 4, 16, 23, 7, 0, 13, 10, 18, 5, 24, 12, 8, 11, 2, 9, /* 6X */ + 17, 30, 1, 6, 3, 15, 14, 25, 27, 22, 26,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 40, 31, 28, 52, 38, 46, 21, 40, 43, 36, 39, 62, 91, 92, 38, 21, /* 8X */ + 28, 20, 20, 48, 33, 60, 59, 61, 93, 33, 31, 19,SYM, 19,SYM,SYM, /* 9X */ + 34, 35, 29, 37, 49, 49,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* DX */ + 47, 45, 94, 54, 57, 57, 56, 58, 50, 95, 96, 97, 98, 50, 51,SYM, /* EX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const int Unicode_Char_size = 62; static const unsigned int Unicode_CharOrder[] = { 65, 4, 66, 16, 67, 23, 68, 7, 69, 0, 70, 13, 71, 10, 72, 18, - 73, 5, 74, 24, 75, 12, 76, 9, 77, 11, 78, 2, 79, 8, 80, 17, - 81, 29, 82, 1, 83, 6, 84, 3, 85, 15, 86, 14, 87, 25, 88, 27, + 73, 5, 74, 24, 75, 12, 76, 8, 77, 11, 78, 2, 79, 9, 80, 17, + 81, 30, 82, 1, 83, 6, 84, 3, 85, 15, 86, 14, 87, 25, 88, 27, 89, 22, 90, 26, 97, 4, 98, 16, 99, 23, 100, 7, 101, 0,102, 13, - 103, 10, 104, 18, 105, 5, 106, 24, 107, 12, 108, 9, 109, 11,110, 2, - 111, 8, 112, 17, 113, 29, 114, 1, 115, 6, 116, 3, 117, 15,118, 14, - 119, 25, 120, 27, 121, 22, 122, 26, 197, 21, 198, 20, 201, 28,216, 19, - 229, 21, 230, 20, 233, 28, 248, 19, + 103, 10, 104, 18, 105, 5, 106, 24, 107, 12, 108, 8, 109, 11,110, 2, + 111, 9, 112, 17, 113, 30, 114, 1, 115, 6, 116, 3, 117, 15,118, 14, + 119, 25, 120, 27, 121, 22, 122, 26, 197, 21, 198, 20, 201, 28,211, 29, + 216, 19, 229, 21, 230, 20, 233, 28, 243, 29, 248, 19, }; /* Model Table: - * Total sequences: 936 - * First 512 sequences: 0.9962304038307248 - * Next 512 sequences (512-1024): 
0.003769596169275244 - * Rest: -5.2909066017292616e-17 + * Total considered sequences: 1079 / 961 + * - Positive sequences: first 508 (0.995012453333286) + * - Probable sequences: next 198 (706-508) (0.003993410296057376) + * - Neutral sequences: last 255 (0.0009941363706565953) + * - Negative sequences: -118 (off-ratio) * Negative sequences: TODO */ static const PRUint8 DanishLangModel[] = { - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,3,3,2,3,0,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,2, - 3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,0, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,0,3,3,3,3,3,3,0,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,3,2,3,3,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,0,2,0, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,3,3,3,3,3,2,0,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,3,2,3,2,2,0,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,0,2,0, - 3,3,3,3,3,3,3,2,3,3,2,3,3,2,3,3,2,2,3,3,3,3,3,2,2,2,0,0,2,0, - 3,3,2,3,3,3,3,2,3,3,3,2,2,3,3,3,3,2,3,3,3,3,3,2,3,2,0,3,2,0, - 3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,3,3,2,3,2,3,3,2,0,3,0,2,0,0,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,2,0,2,3,2,2,2,2,0,2, - 3,3,3,3,3,3,3,3,3,3,0,2,2,2,0,3,3,2,2,3,3,3,3,2,3,2,2,0,2,0, - 3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,3,2,3,3,2,2,2,0,2,0,2,0, - 3,3,3,3,3,3,3,2,3,2,2,3,2,2,3,3,2,2,2,3,3,3,3,2,3,2,0,0,2,0, - 3,3,3,3,2,2,3,3,0,3,3,3,3,3,3,2,3,2,2,0,0,0,2,2,3,0,0,0,0,0, - 2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,2,2,0,0,0,0,2,0,0,0,0,0,0, - 3,3,3,3,0,0,3,3,2,3,2,2,3,2,3,0,3,2,2,0,0,0,0,0,0,0,0,0,0,0, - 3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,2,3,3,2,2,0,0,0,2,2,2,2,0,0,0, - 3,3,2,3,3,3,2,2,3,3,2,2,3,2,2,3,2,2,3,0,3,0,3,3,0,0,2,0,2,2, - 3,3,2,3,3,3,3,3,3,3,2,2,2,2,2,3,3,2,2,3,3,2,3,0,0,2,2,0,2,0, - 
3,2,2,2,3,3,2,2,3,2,0,2,2,2,0,2,2,0,3,0,2,0,2,2,0,2,0,0,0,0, - 3,2,2,2,3,3,2,2,3,0,2,2,2,0,2,2,2,2,2,0,0,0,2,2,2,2,2,2,0,0, - 3,2,2,2,3,3,2,0,2,2,0,0,0,2,2,2,2,2,0,0,0,0,0,2,0,2,0,2,0,0, - 2,2,3,2,2,0,2,2,2,2,2,2,0,0,2,0,2,2,0,0,0,0,0,2,0,0,2,0,0,0, - 0,2,0,0,2,2,0,2,2,2,0,0,2,2,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,1,2,3,3,3,3,2,3,1,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,1, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,2,2,1, + 3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,2,3,0, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,0,1,0,3,3,3,3,3,3,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,1,3,3,1,3,3,1,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,0,1,1,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,1,0,2,2,1, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,1,0,2,1,1, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,1,3,3,3,3,2,2,1,0,1, + 3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,3,3,3,3,2,3,2,0,0,1,1,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,1,1,1,2,1,0, + 3,3,3,3,3,3,3,2,3,3,2,3,3,2,3,3,2,2,3,3,3,3,3,1,3,1,1,1,1,1,0, + 3,3,3,3,3,3,3,2,3,3,3,2,2,3,2,3,2,2,3,3,3,3,3,2,3,1,1,2,1,1,0, + 3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,3,2,2,2,3,3,3,2,1,3,0,0,0,1,0,0, + 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,1,1,0,2,3,2,2,2,3,1,0,0, + 3,3,3,3,3,3,3,2,3,3,0,2,1,1,1,3,3,1,2,3,3,3,3,2,3,1,1,0,1,2,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,3,2,3,3,3,3,2,3,3,3,1,1,0,1,0,2,1,0, + 3,3,3,3,3,3,3,2,3,3,1,3,2,2,3,3,1,1,2,3,3,3,3,1,3,3,0,1,1,1,2, + 3,3,3,3,1,2,3,3,3,1,3,3,3,2,3,1,3,2,1,0,0,0,2,0,3,0,0,0,0,0,0, + 2,3,3,3,1,3,3,3,3,3,3,3,3,3,3,2,3,2,1,0,0,0,0,2,0,0,0,0,0,0,0, + 3,3,3,3,0,0,3,3,3,1,2,1,3,2,3,0,3,1,1,1,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,2,3,3,2,1,0,0,0,2,2,1,2,1,0,0,0, + 3,3,1,3,3,3,2,3,3,3,1,2,3,1,1,3,1,1,3,0,1,0,3,3,1,1,1,0,1,0,1, + 3,3,3,3,3,3,3,3,3,3,2,1,2,1,1,3,2,1,2,3,3,1,3,0,0,0,0,0,1,2,0, + 
3,2,2,2,3,3,2,1,2,3,0,1,1,1,0,2,2,0,2,0,1,0,1,1,1,2,1,0,0,0,0, + 3,1,1,1,3,3,1,1,1,3,1,2,1,1,0,2,1,1,1,0,0,0,2,1,2,1,3,0,0,0,0, + 2,1,1,1,2,3,1,1,2,2,0,0,1,1,1,1,1,2,2,1,0,0,1,2,0,1,0,1,0,0,0, + 2,2,3,2,1,0,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0,1,0,0,0,0, + 0,3,2,2,1,0,1,0,2,0,1,0,2,0,1,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0, + 2,0,0,0,2,2,0,0,0,0,0,0,0,1,1,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1, }; @@ -185,8 +208,8 @@ const SequenceModel Iso_8859_15DanishModel = { Iso_8859_15_CharToOrderMap, DanishLangModel, - 30, - (float)0.9962304038307248, + 31, + (float)0.9990058636293434, PR_TRUE, "ISO-8859-15", "da" @@ -196,8 +219,8 @@ const SequenceModel Iso_8859_1DanishModel = { Iso_8859_1_CharToOrderMap, DanishLangModel, - 30, - (float)0.9962304038307248, + 31, + (float)0.9990058636293434, PR_TRUE, "ISO-8859-1", "da" @@ -207,19 +230,30 @@ const SequenceModel Windows_1252DanishModel = { Windows_1252_CharToOrderMap, DanishLangModel, - 30, - (float)0.9962304038307248, + 31, + (float)0.9990058636293434, PR_TRUE, "WINDOWS-1252", "da" }; +const SequenceModel Ibm865DanishModel = +{ + Ibm865_CharToOrderMap, + DanishLangModel, + 31, + (float)0.9990058636293434, + PR_TRUE, + "IBM865", + "da" +}; + const LanguageModel DanishModel = { "da", Unicode_CharOrder, - 60, + 62, DanishLangModel, - 30, - (float)0.9962304038307248, + 31, + (float)0.9992516135038306, }; diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index fd4f2d6..ef8da36 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -197,8 +197,8 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[103] = new nsSingleByteCharSetProber(&Windows_1252NorwegianModel); mProbers[104] = new nsSingleByteCharSetProber(&Ibm865NorwegianModel); - mProbers[104] = new nsSingleByteCharSetProber(&Iso_8859_1EnglishModel); - mProbers[105] = new nsSingleByteCharSetProber(&Windows_1252EnglishModel); + mProbers[105] = new nsSingleByteCharSetProber(&Iso_8859_1EnglishModel); + mProbers[106] = new 
nsSingleByteCharSetProber(&Windows_1252EnglishModel); Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index a68a2a4..1c473f2 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 106 +#define NUM_OF_SBCS_PROBERS 107 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { |