diff options
author | Jehan <jehan@girinstud.io> | 2016-09-28 22:35:40 +0200 |
---|---|---|
committer | Jehan <jehan@girinstud.io> | 2016-09-28 22:42:13 +0200 |
commit | 119fed7e8dcb7b9e72457ff2b268a61d2264f12d (patch) | |
tree | 24ef4c950ddd03d05a9658b944507ac192c5e01d | |
parent | d62154bd6ed1eaeca2e40f36673a3e32acd445d7 (diff) |
LangModels: add Swedish support.
Encodings: ISO-8859-1, ISO-8859-4, ISO-8859-9, ISO-8859-15 and
WINDOWS-1252.
Test text from https://sv.wikipedia.org/wiki/Mölle
-rw-r--r-- | README.md | 6 | ||||
-rw-r--r-- | script/BuildLangModelLogs/LangSwedishModel.log | 151 | ||||
-rw-r--r-- | script/langs/sv.py | 56 | ||||
-rw-r--r-- | src/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/LangModels/LangSwedishModel.cpp | 261 | ||||
-rw-r--r-- | src/nsSBCSGroupProber.cpp | 6 | ||||
-rw-r--r-- | src/nsSBCSGroupProber.h | 2 | ||||
-rw-r--r-- | src/nsSBCharSetProber.h | 6 | ||||
-rw-r--r-- | test/sv/iso-8859-1.txt | 10 | ||||
-rw-r--r-- | test/sv/utf-8.txt | 10 | ||||
-rw-r--r-- | test/sv/windows-1252.txt | 10 |
11 files changed, 518 insertions, 1 deletions
@@ -142,6 +142,12 @@ Techniques used by universalchardet are described at http://www.mozilla.org/proj * ISO-8859-1 * ISO-8859-15 * WINDOWS-1252 + * Swedish + * ISO-8859-1 + * ISO-8859-4 + * ISO-8859-9 + * ISO-8859-15 + * WINDOWS-1252 * Thai * TIS-620 * ISO-8859-11 diff --git a/script/BuildLangModelLogs/LangSwedishModel.log b/script/BuildLangModelLogs/LangSwedishModel.log new file mode 100644 index 0000000..029e510 --- /dev/null +++ b/script/BuildLangModelLogs/LangSwedishModel.log @@ -0,0 +1,151 @@ += Logs of language model for Swedish (sv) = + +- Generated by BuildLangModel.py +- Started: 2016-09-28 22:26:37.221506 +- Maximum depth: 5 +- Max number of pages: 100 + +== Parsed pages == + +Kakapo (revision 36509929) +Akut hotad (revision 32517788) +Aotearoa (revision 36575359) +Art (revision 36771341) +Artepitet (revision 36771341) +Auckland (revision 35752058) +Auktorsnamn (revision 35976965) +BBC (revision 36508743) +Basalomsättning (revision 30567523) +Beilschmiedia tawa (revision 29101923) +Berguv (revision 36295501) +Betesmark (revision 34292168) +Biotop (revision 35528052) +BirdLife International (revision 36124283) +Bonaparte (revision 37325183) +British Museum (revision 36420244) +Bröstben (revision 30602527) +Dacrydium cupressinum (revision 32986501) +Digital object identifier (revision 27637223) +Djur (revision 37300775) +Djurpark (revision 37147093) +Domän (biologi) (revision 33377709) +Don Merton (revision 36509929) +Douglas Adams (revision 36556245) +Däggdjur (revision 37328286) +Ekologisk nisch (revision 33898643) +Ekosystem (revision 36598266) +Endemisk (revision 30647109) +Eukaryoter (revision 37095313) +Evolution (revision 37093592) +Familj (biologi) (revision 30280200) +Femininum (revision 30597527) +Fjäder (biologi) (revision 36364943) +Fjäderdräkt (revision 36364943) +Fladdermöss (revision 37307257) +Flygg (revision 36479633) +Frukter (revision 34088588) +Frö (revision 37333131) +Fågelläte (revision 34034723) +Fåglar (revision 37387306) +Fåglarnas 
liv (revision 36509929) +Genitiv (revision 37388438) +George Edward Grey (revision 36509929) +George Robert Gray (revision 20426710) +Haasts örn (revision 29175076) +Hauturu/Little Barrier Island (revision 36509929) +Hermelin (revision 36578682) +Hertz (revision 37104488) +Hjortdjur (revision 36493550) +Hund (revision 37351832) +Husdjur (revision 37384850) +Huskatt (revision 32922967) +Hāngi (revision 29609696) +IUCN (revision 30570280) +Iller (revision 30663158) +Infraröd (revision 36770733) +Internationella naturvårdsunionen (revision 30570280) +Jordbruk (revision 37352625) +Kahurangi National Park (revision 35956142) +Kamouflage (revision 36579595) +Kaniner (revision 36877621) +Kapiti Island (revision 37395588) +Katt (revision 36734686) +Kelp (revision 30312471) +Kivier (revision 36373234) +Klass (biologi) (revision 30280201) +Kroppsfett (revision 35066611) +Könsdimorfism (revision 30816932) +Könsfördelning (revision 24769321) +Lamm- och fårkött (revision 36187205) +Lek (fortplantningsbeteende) (revision 30508235) +Mandel (revision 36577529) +Maori (revision 32560474) +Maorier (revision 35862066) +Maoripapegojor (revision 36545138) +Mark Carwardine (revision 20375916) +Markpapegoja (revision 36295722) +Maskulinum (revision 32704551) +Masterton (revision 29859631) +Metrosideros umbellata (revision 29071212) +Milford Sound (revision 20284758) +Morrhår (revision 36533839) +Muskelmage (revision 31196380) +Mustela (revision 20934105) +Mårddjur (revision 37306347) +Māori (revision 32560474) +NHNZ (revision 36509929) +Nattpapegoja (revision 33486517) +Nordön (revision 24810231) +Nya Zeeland (revision 36575359) +Näbb (revision 23648463) +Ollonår (revision 36509929) +Ordning (biologi) (revision 30280196) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2016-09-28 22:29:21.480287 + +48 characters appeared 594415 times. 
+ +First 31 characters: +[ 0] Char a: 10.070741821791172 % +[ 1] Char e: 9.737136512369304 % +[ 2] Char r: 9.110638190489809 % +[ 3] Char n: 8.378826240925951 % +[ 4] Char t: 7.481305148759705 % +[ 5] Char s: 5.828587771169974 % +[ 6] Char i: 5.359891658184939 % +[ 7] Char l: 5.173489901836259 % +[ 8] Char o: 4.694195133029954 % +[ 9] Char d: 4.597293136949774 % +[10] Char k: 3.297359588839447 % +[11] Char m: 3.1898589369379975 % +[12] Char g: 3.004466576381821 % +[13] Char v: 2.2324470277499726 % +[14] Char f: 2.1988005013332437 % +[15] Char p: 2.06017681249632 % +[16] Char u: 2.0499146219392173 % +[17] Char ä: 2.0475593650900468 % +[18] Char h: 2.028380845032511 % +[19] Char å: 1.5443755625278637 % +[20] Char c: 1.442594820117258 % +[21] Char ö: 1.3515809661600062 % +[22] Char b: 1.268642278542769 % +[23] Char j: 0.7302978558751041 % +[24] Char y: 0.6699023409570755 % +[25] Char x: 0.2111319532649748 % +[26] Char w: 0.10262190557102362 % +[27] Char z: 0.09151855185350302 % +[28] Char é: 0.021197311642539303 % +[29] Char ā: 0.011103353717520588 % +[30] Char q: 0.007570468443764037 % + +The first 31 characters have an accumulated ratio of 0.999936071599808. + +748 sequences found. + +First 512 (typical positive ratio): 0.997323508584682 +Next 512 (512-1024): 1.6823263208364526e-06 +Rest: 1.7780915628762273e-17 + +- Processing end: 2016-09-28 22:29:21.590354 diff --git a/script/langs/sv.py b/script/langs/sv.py new file mode 100644 index 0000000..9dc7570 --- /dev/null +++ b/script/langs/sv.py @@ -0,0 +1,56 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. 
+# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +name = 'Swedish' +code = 'sv' +use_ascii = True +charsets = ['ISO-8859-1', 'ISO-8859-4', 'ISO-8859-9', + 'ISO-8859-15', 'WINDOWS-1252'] + +## Optional Properties ## + +alphabet = 'åäö' +start_pages = ['Kakapo'] +wikipedia_code = code +case_mapping = True diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 67e76b1..952b594 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -31,6 +31,7 @@ set( LangModels/LangRussianModel.cpp LangModels/LangSlovakModel.cpp LangModels/LangSloveneModel.cpp + LangModels/LangSwedishModel.cpp LangModels/LangSpanishModel.cpp LangModels/LangThaiModel.cpp LangModels/LangTurkishModel.cpp diff --git a/src/LangModels/LangSwedishModel.cpp b/src/LangModels/LangSwedishModel.cpp new file mode 100644 index 0000000..0d2dadf --- /dev/null +++ b/src/LangModels/LangSwedishModel.cpp @@ -0,0 +1,261 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" + +/********* Language model for: Swedish *********/ + +/** + * Generated by BuildLangModel.py + * On: 2016-09-28 22:29:21.480940 + **/ + +/* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ +static const unsigned char Windows_1252_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 4X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 6X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 34,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM, 49,ILL, 50,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 51,SYM, 52,ILL, 53, 54, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 55,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 56, 44, 57, 58, 17, 19, 38, 40, 32, 28, 45, 59, 60, 61, 47, 62, /* CX */ + 63, 64, 65, 66, 35, 67, 21,SYM, 37, 68, 69, 70, 31, 71, 72, 73, /* DX */ + 74, 44, 75, 76, 17, 19, 38, 40, 32, 28, 45, 77, 78, 79, 47, 80, /* EX */ + 81, 82, 83, 84, 35, 85, 21,SYM, 37, 86, 87, 88, 31, 89, 90, 91, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_9_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 4X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 6X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 92,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 93, 44, 94, 95, 17, 19, 38, 40, 32, 28, 45, 96, 97, 98, 47, 99, /* CX */ + 100,101,102,103, 35,104, 21,SYM, 37,105,106,107, 31,108,109,110, /* DX */ + 111, 44,112,113, 17, 19, 38, 40, 32, 28, 45,114,115,116, 47,117, /* EX */ + 118,119,120,121, 35,122, 21,SYM, 37,123,124,125, 31, 42,126,127, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_1_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 4X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 6X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM,128,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 129, 44,130,131, 17, 19, 38, 40, 32, 28, 45,132,133,134, 47,135, /* CX */ + 136,137,138,139, 35,140, 21,SYM, 37,141,142,143, 31,144,145,146, /* DX */ + 147, 44,148,149, 17, 19, 38, 40, 32, 28, 45,150,151,152, 47,153, /* EX */ + 154,155,156,157, 35,158, 21,SYM, 37,159,160,161, 31,162,163,164, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const 
unsigned char Iso_8859_4_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 4X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 6X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,165,166,167,SYM,168,169,SYM,SYM,170,171,172,173,SYM,174,SYM, /* AX */ + SYM,175,SYM,176,SYM,177,178,SYM,SYM,179,180,181,182, 43,183, 43, /* BX */ + 29, 44,184,185, 17, 19, 38,186,187, 28,188,189, 39,190, 47, 41, /* CX */ + 191,192, 33,193, 35,194, 21,SYM, 37, 36,195,196, 31,197, 46,198, /* DX */ + 29, 44,199,200, 17, 19, 38,201,202, 28,203,204, 39,205, 47, 41, /* EX */ + 206,207, 33,208, 35,209, 21,SYM, 37, 36,210,211, 31,212, 46,SYM, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Iso_8859_15_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 4X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 0, 22, 20, 9, 1, 14, 12, 18, 6, 23, 10, 7, 11, 3, 8, /* 6X */ + 15, 30, 2, 5, 4, 16, 13, 26, 25, 24, 27,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,213,SYM,214,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,215,216,SYM,SYM,217,SYM,SYM,SYM,218,219,220,SYM, /* BX */ + 221, 44,222,223, 17, 19, 38, 40, 32, 28, 45,224,225,226, 47,227, /* CX */ + 228,229,230,231, 35,232, 21,SYM, 37,233,234,235, 31,236,237,238, /* DX */ + 239, 44,240,241, 17, 19, 38, 40, 32, 28, 45,242,243,244, 47,245, /* EX */ + 246,247,248,249, 35,249, 21,SYM, 37,249,249,249, 31,249,249,249, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + + +/* Model Table: + * Total sequences: 748 + * First 512 sequences: 0.997323508584682 + * Next 512 sequences (512-1024): 0.0026764914153179875 + * Rest: 1.7780915628762273e-17 + * Negative sequences: TODO + */ +static const PRUint8 SwedishLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,3,2,3,3,3,3,3,2,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,2,3,3,3,3,3,3,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,2,0,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,2,2,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,2,2,3,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,2,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,3,3,2,3,3,2,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,0,2,0,2,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,0,2,0,2,2,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,2,0,3,3,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,0,2,2,0,2,0, + 3,3,3,3,3,3,3,3,3,3,3,2,2,2,3,2,3,3,3,3,0,2,3,2,0,0,0,2,0,0,0, + 3,3,3,2,3,2,3,3,3,2,0,2,2,2,3,2,3,3,0,3,2,3,0,3,3,0,0,0,2,0,0, + 3,3,3,3,3,3,3,3,3,2,3,2,3,3,3,3,3,3,3,3,2,2,2,2,3,2,0,2,3,2,0, + 
3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,0,2,0,3,2,3,2,0,3,0,0,0,2,0, + 2,2,3,3,3,3,0,3,0,3,3,3,3,3,3,3,2,2,0,0,3,0,3,0,0,3,0,0,0,0,0, + 3,3,3,3,3,2,3,2,3,2,2,2,2,0,0,0,3,3,2,3,2,3,2,3,3,0,0,3,0,2,0, + 2,3,3,3,3,3,2,3,0,3,3,3,3,3,2,0,0,0,2,0,0,2,3,0,0,0,0,0,0,0,0, + 3,3,3,3,3,2,3,3,3,2,3,2,2,2,2,0,3,0,3,0,3,2,2,0,3,0,0,2,2,0,2, + 3,3,3,3,3,3,2,3,2,3,3,3,3,3,2,3,0,2,2,0,3,2,2,3,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,2,2,2,0,2,2,2,3,3,2,3,3,3,3,3,3,0,0,2,2,0,0, + 3,3,0,2,2,3,2,3,3,3,2,0,0,0,2,0,3,3,0,0,0,3,2,0,0,0,0,0,2,0,0, + 3,2,3,3,3,3,2,3,3,3,3,3,3,2,3,3,2,0,2,0,3,0,3,2,0,3,0,2,0,0,0, + 3,3,0,3,3,0,3,2,3,0,2,2,0,0,2,3,2,0,2,0,0,0,2,0,2,2,0,0,0,0,0, + 3,3,2,2,2,3,3,2,3,2,2,0,0,0,0,0,2,0,2,0,0,0,0,0,2,0,2,2,0,0,0, + 3,3,0,2,2,0,2,0,3,0,2,0,0,0,0,0,2,0,2,0,0,0,2,0,2,0,0,2,0,0,0, + 0,3,2,2,0,2,0,2,2,2,0,0,0,2,0,2,0,0,2,0,0,2,2,0,0,0,0,0,0,0,0, + 0,0,0,2,0,0,2,0,3,0,2,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Windows_1252SwedishModel = +{ + Windows_1252_CharToOrderMap, + SwedishLangModel, + 31, + (float)0.997323508584682, + PR_TRUE, + "WINDOWS-1252" +}; + +const SequenceModel Iso_8859_9SwedishModel = +{ + Iso_8859_9_CharToOrderMap, + SwedishLangModel, + 31, + (float)0.997323508584682, + PR_TRUE, + "ISO-8859-9" +}; + +const SequenceModel Iso_8859_1SwedishModel = +{ + Iso_8859_1_CharToOrderMap, + SwedishLangModel, + 31, + (float)0.997323508584682, + PR_TRUE, + "ISO-8859-1" +}; + +const SequenceModel Iso_8859_4SwedishModel = +{ + Iso_8859_4_CharToOrderMap, + SwedishLangModel, + 31, + (float)0.997323508584682, + PR_TRUE, + "ISO-8859-4" +}; + +const SequenceModel Iso_8859_15SwedishModel = +{ + Iso_8859_15_CharToOrderMap, + SwedishLangModel, + 31, + (float)0.997323508584682, + PR_TRUE, + "ISO-8859-15" +}; diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 161129d..66738ea 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -185,6 +185,12 @@ 
nsSBCSGroupProber::nsSBCSGroupProber() mProbers[92] = new nsSingleByteCharSetProber(&Mac_CentraleuropeSloveneModel); mProbers[93] = new nsSingleByteCharSetProber(&Ibm852SloveneModel); + mProbers[94] = new nsSingleByteCharSetProber(&Iso_8859_1SwedishModel); + mProbers[95] = new nsSingleByteCharSetProber(&Iso_8859_4SwedishModel); + mProbers[96] = new nsSingleByteCharSetProber(&Iso_8859_9SwedishModel); + mProbers[97] = new nsSingleByteCharSetProber(&Iso_8859_15SwedishModel); + mProbers[98] = new nsSingleByteCharSetProber(&Windows_1252SwedishModel); + Reset(); } diff --git a/src/nsSBCSGroupProber.h b/src/nsSBCSGroupProber.h index b22f46e..64c021b 100644 --- a/src/nsSBCSGroupProber.h +++ b/src/nsSBCSGroupProber.h @@ -40,7 +40,7 @@ #define nsSBCSGroupProber_h__ -#define NUM_OF_SBCS_PROBERS 94 +#define NUM_OF_SBCS_PROBERS 99 class nsCharSetProber; class nsSBCSGroupProber: public nsCharSetProber { diff --git a/src/nsSBCharSetProber.h b/src/nsSBCharSetProber.h index dd29b90..42d21b2 100644 --- a/src/nsSBCharSetProber.h +++ b/src/nsSBCharSetProber.h @@ -246,5 +246,11 @@ extern const SequenceModel Iso_8859_16SloveneModel; extern const SequenceModel Ibm852SloveneModel; extern const SequenceModel Mac_CentraleuropeSloveneModel; +extern const SequenceModel Iso_8859_1SwedishModel; +extern const SequenceModel Iso_8859_4SwedishModel; +extern const SequenceModel Iso_8859_9SwedishModel; +extern const SequenceModel Iso_8859_15SwedishModel; +extern const SequenceModel Windows_1252SwedishModel; + #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/test/sv/iso-8859-1.txt b/test/sv/iso-8859-1.txt new file mode 100644 index 0000000..fcf070c --- /dev/null +++ b/test/sv/iso-8859-1.txt @@ -0,0 +1,10 @@ +Mölle är en tätort på Kullahalvön i Brunnby socken i Höganäs kommun, Skåne län. + +Samhället var från början ett fiskeläge, men kom att spela en stor roll i den +framväxande turismen i Sverige i slutet av 1800-talet.
Till detta bidrog - och +bidrar - Mölles natursköna läge invid Öresunds norra utlopp, med Kullaberg som +bakgrund. Gemensamhetsbad för män och kvinnor introducerades i Ransvik i början +av 1900-talet. Storhetstiden som turistort inträffade strax före första +världskriget, men även under mellankrigstiden var turistströmmarna stora. +Fortfarande är Mölle en populär turistort med en tredubbling av invånarantalet +under sommarmånaderna. diff --git a/test/sv/utf-8.txt b/test/sv/utf-8.txt new file mode 100644 index 0000000..d66be04 --- /dev/null +++ b/test/sv/utf-8.txt @@ -0,0 +1,10 @@ +Mölle är en tätort på Kullahalvön i Brunnby socken i Höganäs kommun, Skåne län. + +Samhället var från början ett fiskeläge, men kom att spela en stor roll i den +framväxande turismen i Sverige i slutet av 1800-talet. Till detta bidrog – och +bidrar – Mölles natursköna läge invid Öresunds norra utlopp, med Kullaberg som +bakgrund. Gemensamhetsbad för män och kvinnor introducerades i Ransvik i början +av 1900-talet. Storhetstiden som turistort inträffade strax före första +världskriget, men även under mellankrigstiden var turistströmmarna stora. +Fortfarande är Mölle en populär turistort med en tredubbling av invånarantalet +under sommarmånaderna. diff --git a/test/sv/windows-1252.txt b/test/sv/windows-1252.txt new file mode 100644 index 0000000..94f15c6 --- /dev/null +++ b/test/sv/windows-1252.txt @@ -0,0 +1,10 @@ +Mölle är en tätort på Kullahalvön i Brunnby socken i Höganäs kommun, Skåne län. + +Samhället var från början ett fiskeläge, men kom att spela en stor roll i den +framväxande turismen i Sverige i slutet av 1800-talet. Till detta bidrog – och +bidrar – Mölles natursköna läge invid Öresunds norra utlopp, med Kullaberg som +bakgrund. Gemensamhetsbad för män och kvinnor introducerades i Ransvik i början +av 1900-talet. Storhetstiden som turistort inträffade strax före första +världskriget, men även under mellankrigstiden var turistströmmarna stora.
+Fortfarande är Mölle en populär turistort med en tredubbling av invånarantalet +under sommarmånaderna. |