diff options
author | Jehan <jehan@girinstud.io> | 2022-12-20 01:46:15 +0100 |
---|---|---|
committer | Jehan <jehan@girinstud.io> | 2022-12-20 01:46:15 +0100 |
commit | d40e5868d5ec1f08f1e6e0d25e04dae68c586ba1 (patch) | |
tree | 95cab0573b4160c7d0b5223f7466014309e62d5a | |
parent | cec8817d799e1f0bdc86031d4e20433016fcf85f (diff) |
script, src, test: adding Catalan support.
This adds support for the UTF-8, ISO-8859-1 and WINDOWS-1252 charsets.
The test for UTF-8 and ISO-8859-1 is taken from 'Marmota' page on
Wikipedia in Catalan. The test for WINDOWS-1252 is taken from the
'Unió_Europea' page. Since ISO-8859-1 and WINDOWS-1252 are very similar
for most letters (in particular the ones used in Catalan), I
differentiated the tests by using a text containing the '€' symbol, which
sits at a position that holds no printable character in ISO-8859-1.
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | script/BuildLangModelLogs/LangCatalanModel.log | 238 | ||||
-rw-r--r-- | script/langs/ca.py | 79 | ||||
-rw-r--r-- | script/support.txt | 1 | ||||
-rw-r--r-- | src/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/LangModels/LangCatalanModel.cpp | 207 | ||||
-rw-r--r-- | src/nsLanguageDetector-generated.h | 3 | ||||
-rw-r--r-- | src/nsMBCSGroupProber.cpp | 1 | ||||
-rw-r--r-- | src/nsSBCSGroupProber.cpp | 3 | ||||
-rw-r--r-- | src/nsSBCharSetProber-generated.h | 5 | ||||
-rw-r--r-- | test/ca/iso-8859-1.txt | 1 | ||||
-rw-r--r-- | test/ca/utf-8.txt | 1 | ||||
-rw-r--r-- | test/ca/windows-1252.txt | 1 |
13 files changed, 543 insertions, 2 deletions
@@ -25,6 +25,10 @@ uchardet started as a C language binding of the original C++ implementation of t * UTF-8 * ISO-8859-5 * WINDOWS-1251 + * Catalan + * UTF-8 + * ISO-8859-1 + * WINDOWS-1252 * Chinese * UTF-8 * ISO-2022-CN diff --git a/script/BuildLangModelLogs/LangCatalanModel.log b/script/BuildLangModelLogs/LangCatalanModel.log new file mode 100644 index 0000000..f00ab58 --- /dev/null +++ b/script/BuildLangModelLogs/LangCatalanModel.log @@ -0,0 +1,238 @@ += Logs of language model for Catalan (ca) = + +- Generated by BuildLangModel.py +- Started: 2022-12-20 01:31:40.290803 +- Maximum depth: 4 +- Max number of pages: 200 + +== Parsed pages == + +Parlament_Europeu (revision 31056370) +Genji Monogatari (revision 31007904) +Bundestag (revision 30742728) +Kana (revision 29176811) +Jun'ichirō Tanizaki (revision 30750244) +Representació proporcional amb llista de partit (revision 22086795) +Agències de la Unió Europea (revision 30276199) +Poder executiu (revision 30290834) +Edicions Atalanta (revision 26048077) +Animació (revision 30865051) +Pressupost de la Unió Europea (revision 30231577) +Jorge Luis Borges (revision 30783720) +Universitat de Pittsburgh (revision 25411555) +Satiricó (revision 31019009) +Dramatis personae (revision 30858787) +Corpus lingüístic (revision 28600087) +Genji Monogatari Emaki (revision 30520718) +Era Keichō (revision 27881416) +Període Heian (revision 30351338) +Uji (revision 26298733) +Clan Minamoto (revision 29218047) +Ventafocs (revision 30167478) +わ (revision 28487155) +Japó (revision 30980338) +Agència Europea dels Sistemes Globals de Navegació per Satèl·lit (revision 28777516) +Període Shōwa (revision 30351346) +ム (revision 25190709) +Premi Balzan (revision 30321993) +Germans Grimm (revision 30104486) +Europol (revision 25369380) +Unió Europea (revision 30730061) +Kyoto (revision 30706119) +Incendi del Reichstag (revision 30894126) +Processament de llenguatge natural (revision 29016655) +794 (revision 29283769) +CANTIC (revision 
30488826) +Casa de la Història Europea (revision 30703943) +VP:VER (revision 30232565) +Katakana (revision 29937701) +Shogunat Kamakura (revision 28808156) +Eleccions (revision 30449311) +Noam Chomsky (revision 30552025) +Eleccions federals alemanyes de 1994 (revision 28337358) +Conceptes d'unitat europea abans del 1945 (revision 30927921) +Era Heian (revision 30351338) +Gemeinsame Normdatei (revision 30883432) +La Bella Dorment (pel·lícula de 1959) (revision 30982067) +Població (revision 30352350) +Obra literària (revision 31011396) +も (revision 25190714) +Istituto Centrale per il Catalogo Unico (revision 28786509) +Política (revision 31014511) +ハ (revision 31071577) +Vot (revision 27865452) +Clan Taira (revision 26323649) +Permís de conducció europeu (revision 27672810) +Mandala (revision 30940608) +Campània antiga (revision 29855854) +ゐ (revision 28487156) +Consell de la Unió Europea (revision 30308594) +24 de juliol (revision 31063555) +Kyōto (revision 30706119) +Alfons X de Castella (revision 30535714) +VIAF (revision 28927187) +1975 (revision 31057077) +モ (revision 25190714) +Sistema Galileo (revision 30880731) +Densitat de població (revision 30174278) +Autodesk Maya (revision 30989692) +Nàpols (revision 31028649) +Memòria de traducció (revision 30341759) +Ryukyu (revision 29922259) +Agència Europea per a la Seguretat i la Salut en el Treball (revision 29049313) +ISNI (revision 30824306) +PDF (revision 29442049) +Eleccions federals alemanyes de 1972 (revision 30271501) +Sistema presidencialista (revision 30596011) +Primer ministre (revision 27174693) +Coeducació (revision 31048027) +Ko Tazawa (revision 30932179) +Poliomielitis (revision 30976061) +18 de setembre (revision 31063494) +Campanya electoral (revision 27935270) +Kōbō Abe (revision 30016508) +Rodopis (revision 28014188) +Política Agrària Comunitària (revision 30353551) +21 d'octubre (revision 30980460) +1984 (revision 31063521) +South Park (revision 31024165) +Hiragana (revision 29920075) +Associació 
de Votants de Schleswig Meridional (revision 30753058) +ひ (revision 31071564) +Lingüística (revision 31037031) +Blauet comú (revision 28729161) +Autodeterminació (revision 29349294) +Xina (revision 31007838) +Control d'autoritats (revision 29854505) +Guillermo de Torre (revision 30765552) +Unesco (revision 30129516) +Romanització Hepburn (revision 29144432) +Tanka (revision 30478859) +Clientelisme (revision 30811663) +Corpus Textual Informatitzat de la Llengua Catalana (revision 29876775) +Secessió (revision 29980781) +Fada protectora (revision 29175001) +を (revision 28487157) +Ōtsu (revision 30010938) +Gran Enciclopèdia Catalana (revision 30724375) +LCCN (revision 30638965) +Universitat privada (revision 28518823) +Robert Louis Stevenson (revision 30728093) +Kioto (revision 30706119) +7 de setembre (revision 30503878) +Aardman Animations (revision 30216975) +Llibertinatge (revision 29597307) +Bibliothèque nationale de France (revision 30715383) +Alemanya Occidental (revision 30239917) +National Library of Australia (revision 30977078) +Diccionari Descriptiu de la Llengua Catalana (revision 27017217) +1969 (revision 31060188) +Separació de poders (revision 30362225) +Isaac Titsingh (revision 29748956) +Adolf Hitler (revision 30951478) +Període Kamakura (revision 28808156) +Societas Europaea (revision 28857120) +Invasions japoneses a Corea (revision 30978745) +Agència de la Unió Europea (revision 30276199) +Sistema polític (revision 30713673) +1606 (revision 26237152) +Universitat Rovira i Virgili (revision 30865280) +IVA (revision 30328630) +Patricis (revision 30923152) +Els barrufets (revision 31008031) +Lapislàtzuli Editorial (revision 30176117) +Internet (revision 30894405) +BIBSYS (revision 30255267) +Agència Europea de Seguretat Marítima (revision 28888118) +National Diet Library (revision 30669422) +Grup Enciclopèdia Catalana (revision 31077222) +Competència comunicativa (revision 30307632) +Castell Fushimi (revision 30610308) +Walter Gropius (revision 
30790098) +Biblioteca Nacional de España (revision 31071591) +Diccionari Normatiu Valencià (revision 29882403) +Oscar Wilde (revision 31078983) +Hampshire (revision 30823098) +Clan Fujiwara (revision 30894950) +Speedy Gonzales (revision 30151280) +Tlön, Uqbar, Orbis Tertius (revision 29688246) +Japó ocupat (revision 28083159) +Garbancito de la Mancha (revision 30219073) +SUDOC (revision 29231585) +Gerardo Diego (revision 29912471) +Universitat (revision 29907980) +Foliscopi (revision 29903436) +1980 (revision 31063457) +Infart de miocardi (revision 30894255) +Encyclopædia Britannica (revision 28347959) +Petroni (revision 29790499) +Horari de màxima audiència (revision 27872454) +Sutra (revision 23458427) +Medicina (revision 31002196) +ホ (revision 25190705) +Luci Appuleu (revision 30336717) +Novel·la (revision 30386814) +Kimba, el lleó blanc (revision 30273901) +UTC+09:00 (revision 25182859) +Arquitectura neogòtica (revision 30347122) +Segle I (revision 30953541) +Emperador del Japó (revision 27799841) +Biblioteca Nacional de la República Txeca (revision 29847950) +Gran Diccionari de la Llengua Catalana (revision 29063719) +Període Reiwa (revision 29227861) + +== End of Parsed pages == + +- Wikipedia parsing ended at: 2022-12-20 01:34:38.734771 + +57 characters appeared 1339831 times. 
+ +Most Frequent characters: +[ 0] Char e: 12.524042211293812 % +[ 1] Char a: 11.715955221218199 % +[ 2] Char i: 7.815090112111155 % +[ 3] Char s: 7.809940208877089 % +[ 4] Char r: 6.866686917976969 % +[ 5] Char n: 6.706069646097157 % +[ 6] Char l: 6.58105387918327 % +[ 7] Char t: 6.268850325152949 % +[ 8] Char o: 5.046308079153267 % +[ 9] Char c: 4.242027539294135 % +[10] Char d: 4.013192708632656 % +[11] Char u: 3.5825413802188484 % +[12] Char m: 3.048966623402504 % +[13] Char p: 2.778783294310999 % +[14] Char g: 1.4824257686230575 % +[15] Char v: 1.3498717375549603 % +[16] Char b: 1.2941184373253045 % +[17] Char f: 0.975943980994618 % +[18] Char q: 0.7455417884792933 % +[19] Char h: 0.6949383914837021 % +[20] Char ó: 0.5910446914573555 % +[21] Char x: 0.5195431364104875 % +[22] Char é: 0.4443097674258918 % +[23] Char à: 0.3875115592936721 % +[24] Char j: 0.36474749427353154 % +[25] Char y: 0.3636279500922131 % +[26] Char è: 0.3583287743006394 % +[27] Char í: 0.3250409939761059 % +[28] Char k: 0.2481656268589098 % +[29] Char ò: 0.21577348187943107 % +[30] Char z: 0.17778361599336034 % +[31] Char w: 0.11673113997213082 % +[32] Char ç: 0.11016314744172959 % +[33] Char ú: 0.08792153637287091 % +[34] Char ü: 0.06709801460034885 % +[35] Char ï: 0.05448448349082832 % + +The first 36 characters have an accumulated ratio of 0.9997462366522347. +The first 5 characters have an accumulated ratio of 0.4673171467147723. +All characters whose order is over 21 have an accumulated ratio of 0.03321687585971664. + +1083 sequences found. 
+ +First 517 (typical positive ratio): 0.9950067888087288 +Next 195 (712-517): 0.003994192320077694 +Rest: 0.0009990188711934689 + +- Processing end: 2022-12-20 01:34:38.859159 diff --git a/script/langs/ca.py b/script/langs/ca.py new file mode 100644 index 0000000..535ed66 --- /dev/null +++ b/script/langs/ca.py @@ -0,0 +1,79 @@ +#!/bin/python3 +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. 
If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +import re + +## Mandatory Properties ## + +# The human name for the language, in English. +name = 'Catalan' +# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise, +# or use another catalog as a last resort. +code = 'ca' +# ASCII characters are also used in Catalan. +use_ascii = True +# The charsets we want to support and create data for. +charsets = ['ISO-8859-1', 'WINDOWS-1252'] + +## Optional Properties ## + +# Alphabet characters. +# If use_ascii=True, there is no need to add any ASCII characters. +# If case_mapping=True, there is no need to add several cases of a same +# character (provided Python algorithms know the right cases). +alphabet = ['à', 'è', 'é', 'í', 'ï', 'ó', 'ò', 'ú', 'ü', 'ç'] +# The start page. Though optional, it is advised to choose one yourself. +start_pages = ['Parlament_Europeu', 'Genji Monogatari'] +# give possibility to select another code for the Wikipedia URL. +wikipedia_code = code +# 'a' and 'A' will be considered the same character, and so on. +# This uses Python algorithm to determine upper/lower-case of a given +# character. +case_mapping = True + +# A function to clean content returned by the `wikipedia` python lib, +# in case some unwanted data has been overlooked. +# Note that we are already cleaning away the '=' from the title syntax +# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in +# some language may return weird syntax or UI text which should be +# discarded. If you encounter one of these cases, use this function. +def clean_wikipedia_content(content): + # Do your garbage text cleaning here. 
+ return content diff --git a/script/support.txt b/script/support.txt index d52051e..9b10026 100644 --- a/script/support.txt +++ b/script/support.txt @@ -1,6 +1,7 @@ ar be bg +ca cs da de diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c017642..17fd980 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,6 +10,7 @@ set( LangModels/LangArabicModel.cpp LangModels/LangBelarusianModel.cpp LangModels/LangBulgarianModel.cpp + LangModels/LangCatalanModel.cpp LangModels/LangCroatianModel.cpp LangModels/LangCzechModel.cpp LangModels/LangEnglishModel.cpp diff --git a/src/LangModels/LangCatalanModel.cpp b/src/LangModels/LangCatalanModel.cpp new file mode 100644 index 0000000..5e0f4f6 --- /dev/null +++ b/src/LangModels/LangCatalanModel.cpp @@ -0,0 +1,207 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Communicator client code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "../nsSBCharSetProber.h" +#include "../nsSBCharSetProber-generated.h" +#include "../nsLanguageDetector.h" + +#include "../nsLanguageDetector-generated.h" + +/********* Language model for: Catalan *********/ + +/** + * Generated by BuildLangModel.py + * On: 2022-12-20 01:34:38.735681 + **/ + + /* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. 
+ */ + static const unsigned char Iso_8859_1_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 16, 9, 10, 0, 17, 14, 19, 2, 24, 28, 6, 12, 5, 8, /* 4X */ + 13, 18, 4, 3, 7, 11, 15, 31, 21, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 16, 9, 10, 0, 17, 14, 19, 2, 24, 28, 6, 12, 5, 8, /* 6X */ + 13, 18, 4, 3, 7, 11, 15, 31, 21, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 54,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 23, 36, 45, 41, 39, 57, 40, 32, 26, 22, 49, 42, 50, 27, 58, 35, /* CX */ + 53, 38, 29, 20, 43, 52, 37,SYM, 56, 51, 33, 55, 34, 59, 60, 44, /* DX */ + 23, 36, 45, 41, 39, 61, 40, 32, 26, 22, 49, 42, 50, 27, 62, 35, /* EX */ + 53, 38, 29, 20, 43, 52, 37,SYM, 56, 51, 33, 55, 34, 63, 64, 65, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Windows_1252_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 1, 16, 9, 10, 0, 17, 14, 19, 2, 24, 28, 6, 12, 5, 8, /* 4X */ + 13, 18, 4, 3, 7, 11, 15, 31, 21, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 1, 16, 9, 10, 0, 17, 14, 19, 2, 24, 28, 6, 12, 5, 8, /* 6X */ + 13, 18, 4, 3, 7, 11, 15, 31, 21, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM,ILL,SYM, 
66,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM, 47,ILL, 46,ILL, /* 8X */ + ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM, 47,ILL, 46, 67, /* 9X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */ + SYM,SYM,SYM,SYM,SYM, 54,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + 23, 36, 45, 41, 39, 68, 40, 32, 26, 22, 49, 42, 50, 27, 69, 35, /* CX */ + 53, 38, 29, 20, 43, 52, 37,SYM, 56, 51, 33, 55, 34, 70, 71, 44, /* DX */ + 23, 36, 45, 41, 39, 72, 40, 32, 26, 22, 49, 42, 50, 27, 73, 35, /* EX */ + 53, 38, 29, 20, 43, 52, 37,SYM, 56, 51, 33, 55, 34, 74, 75, 76, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const int Unicode_Char_size = 72; +static const unsigned int Unicode_CharOrder[] = +{ + 65, 1, 66, 16, 67, 9, 68, 10, 69, 0, 70, 17, 71, 14, 72, 19, + 73, 2, 74, 24, 75, 28, 76, 6, 77, 12, 78, 5, 79, 8, 80, 13, + 81, 18, 82, 4, 83, 3, 84, 7, 85, 11, 86, 15, 87, 31, 88, 21, + 89, 25, 90, 30, 97, 1, 98, 16, 99, 9, 100, 10, 101, 0,102, 17, + 103, 14, 104, 19, 105, 2, 106, 24, 107, 28, 108, 6, 109, 12,110, 5, + 111, 8, 112, 13, 113, 18, 114, 4, 115, 3, 116, 7, 117, 11,118, 15, + 119, 31, 120, 21, 121, 25, 122, 30, 192, 23, 199, 32, 200, 26,201, 22, + 205, 27, 207, 35, 210, 29, 211, 20, 218, 33, 220, 34, 224, 23,231, 32, + 232, 26, 233, 22, 237, 27, 239, 35, 242, 29, 243, 20, 250, 33,252, 34, +}; + + + /* Model Table: + * Total considered sequences: 1083 / 1296 + * - Positive sequences: first 517 (0.9950067888087288) + * - Probable sequences: next 195 (712-517) (0.003994192320077694) + * - Neutral sequences: last 584 (0.0009990188711934689) + * - Negative sequences: 213 (off-ratio) + * Negative sequences: TODO + */ +static const PRUint8 CatalanLangModel[] = +{ + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,1,3,3,3,0,1,3,2,3,3,2,0,2,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,0,3,3,2,3,3,1,3,3,3,1,1,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,3,3,3,3,2,0,0,1, + 
3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,1,2,3,0,3,3,3,3,2,1,3,0,2,1,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,3,2,2,3,3,2,1, + 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,3,3,0,0, + 3,3,3,3,2,2,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,2,3,3,1,3,3,3,3,3,3,1,2,2,1,0, + 3,3,3,3,3,3,3,3,3,3,1,3,3,2,3,2,2,2,0,3,2,3,3,3,3,3,3,3,1,3,3,2,0,2,1,0, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,1,2,3,3,2,0,3,1,2,3,1,0,0,2, + 3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,1,0,1,2,3,2,1,3,3,1,3,3,3,3,3,1,1,0,2,0,0, + 3,3,3,3,3,2,3,2,3,2,3,3,3,2,3,2,1,3,3,2,2,0,3,3,2,3,3,3,1,2,2,3,0,3,0,0, + 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,2,1,3,3,2,3,2,3,3,3,0,2,1,0,0,0,3, + 3,3,3,3,2,3,2,3,3,3,1,3,3,3,1,1,3,3,0,1,3,1,3,3,2,2,3,3,0,3,1,1,0,3,1,0, + 3,3,3,3,3,2,3,3,3,3,3,3,1,3,1,2,2,2,0,3,3,0,2,3,0,2,3,3,1,3,0,1,2,3,0,0, + 3,3,3,3,3,3,3,2,3,1,2,3,3,2,3,1,2,1,0,3,2,1,1,2,2,3,3,2,1,2,1,2,0,2,3,0, + 3,3,3,1,2,1,1,1,3,0,1,3,0,1,0,1,1,1,0,0,0,1,3,3,0,1,2,3,1,1,0,1,0,0,0,0, + 3,3,3,3,3,3,3,3,3,2,3,3,2,2,2,2,3,0,0,1,2,1,3,3,3,2,3,2,1,2,1,0,0,2,1,0, + 3,3,3,2,3,1,3,3,3,1,1,3,1,1,2,1,0,3,0,0,2,0,2,3,0,1,2,3,1,3,0,0,0,1,2,0, + 0,1,1,1,1,0,0,1,0,0,0,3,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0, + 3,3,3,2,3,3,3,3,3,1,1,3,2,1,0,0,1,1,1,1,1,0,1,2,1,2,1,1,2,1,1,2,0,1,1,0, + 1,0,0,3,2,3,1,1,0,1,1,0,1,2,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0, + 3,3,3,1,1,0,1,3,3,3,1,2,1,3,0,3,1,2,1,2,1,3,1,2,0,0,3,3,1,3,0,1,0,1,0,0, + 2,1,2,3,2,3,2,2,1,1,1,2,2,1,1,1,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0, + 0,0,2,3,3,3,3,3,0,3,2,3,3,3,3,2,2,3,2,1,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,1, + 3,3,3,1,1,1,0,0,3,1,1,3,1,1,1,1,0,1,0,0,1,0,1,3,0,0,1,0,1,0,1,1,0,1,1,0, + 3,3,3,3,2,2,3,2,3,2,2,3,2,2,1,1,3,0,0,1,1,0,1,2,0,1,0,0,1,0,0,1,0,1,0,0, + 0,0,3,3,3,3,3,3,0,3,3,1,3,3,3,2,1,2,1,0,0,3,0,0,1,0,0,0,1,0,0,0,0,0,0,0, + 2,3,0,3,2,3,3,3,3,3,3,0,3,2,3,2,2,3,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, + 3,3,3,3,2,2,2,2,3,1,1,3,2,2,1,1,1,2,0,2,1,1,0,1,1,3,0,1,2,0,0,2,0,1,0,0, + 
0,0,0,3,3,3,3,3,1,3,3,0,3,3,3,1,2,2,2,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0, + 3,3,3,1,1,1,2,1,3,1,0,3,1,1,1,0,1,0,1,2,1,1,2,2,0,1,1,0,1,1,2,1,0,0,1,0, + 3,3,3,2,2,2,1,1,3,0,1,1,1,1,0,0,1,0,0,2,0,0,1,0,0,1,0,0,1,0,0,2,0,0,1,0, + 1,3,0,0,0,0,0,0,3,0,0,2,1,0,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0, + 0,0,1,3,2,3,3,2,0,1,1,0,3,1,0,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, + 3,0,3,0,2,2,2,1,0,1,1,0,1,0,0,0,1,0,0,2,0,0,1,0,0,0,3,3,0,0,0,0,0,0,0,0, + 2,2,0,3,1,3,2,3,0,2,3,0,1,0,0,0,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +}; + + +const SequenceModel Iso_8859_1CatalanModel = +{ + Iso_8859_1_CharToOrderMap, + CatalanLangModel, + 36, + (float)0.9990009811288065, + PR_TRUE, + "ISO-8859-1", + "ca" +}; + +const SequenceModel Windows_1252CatalanModel = +{ + Windows_1252_CharToOrderMap, + CatalanLangModel, + 36, + (float)0.9990009811288065, + PR_TRUE, + "WINDOWS-1252", + "ca" +}; + +const LanguageModel CatalanModel = +{ + "ca", + Unicode_CharOrder, + 72, + CatalanLangModel, + 36, + 5, + (float)0.4673171467147723, + 21, + (float)0.03321687585971664, +}; diff --git a/src/nsLanguageDetector-generated.h b/src/nsLanguageDetector-generated.h index 4285e1d..39e0936 100644 --- a/src/nsLanguageDetector-generated.h +++ b/src/nsLanguageDetector-generated.h @@ -38,11 +38,12 @@ #ifndef nsLanguageDetector_h_generated_h__ #define nsLanguageDetector_h_generated_h__ -#define NUM_OF_LANGUAGE_MODELS 36 +#define NUM_OF_LANGUAGE_MODELS 37 extern const LanguageModel ArabicModel; extern const LanguageModel BelarusianModel; extern const LanguageModel BulgarianModel; +extern const LanguageModel CatalanModel; extern const LanguageModel CzechModel; extern const LanguageModel DanishModel; extern const LanguageModel GermanModel; diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp index cbce483..e9d7548 100644 --- a/src/nsMBCSGroupProber.cpp +++ b/src/nsMBCSGroupProber.cpp @@ -95,6 +95,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) langDetectors[i][j++] = new 
nsLanguageDetector(&ArabicModel); langDetectors[i][j++] = new nsLanguageDetector(&BelarusianModel); langDetectors[i][j++] = new nsLanguageDetector(&BulgarianModel); + langDetectors[i][j++] = new nsLanguageDetector(&CatalanModel); langDetectors[i][j++] = new nsLanguageDetector(&CroatianModel); langDetectors[i][j++] = new nsLanguageDetector(&CzechModel); langDetectors[i][j++] = new nsLanguageDetector(&DanishModel); diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index 9bf3ad3..ba054c8 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -237,6 +237,9 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[n++] = new nsSingleByteCharSetProber(&Ibm855MacedonianModel); mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5MacedonianModel); + mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_1CatalanModel); + mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1252CatalanModel); + assert (n_sbcs_probers == n); Reset(); diff --git a/src/nsSBCharSetProber-generated.h b/src/nsSBCharSetProber-generated.h index 86dbae5..e110f08 100644 --- a/src/nsSBCharSetProber-generated.h +++ b/src/nsSBCharSetProber-generated.h @@ -38,7 +38,7 @@ #ifndef nsSingleByteCharSetProber_generated_h__ #define nsSingleByteCharSetProber_generated_h__ -#define NUM_OF_SEQUENCE_MODELS 116 +#define NUM_OF_SEQUENCE_MODELS 118 extern const SequenceModel Iso_8859_6ArabicModel; extern const SequenceModel Windows_1256ArabicModel; @@ -49,6 +49,9 @@ extern const SequenceModel Iso_8859_5BelarusianModel; extern const SequenceModel Windows_1251BulgarianModel; extern const SequenceModel Iso_8859_5BulgarianModel; +extern const SequenceModel Iso_8859_1CatalanModel; +extern const SequenceModel Windows_1252CatalanModel; + extern const SequenceModel Iso_8859_2CzechModel; extern const SequenceModel Windows_1250CzechModel; extern const SequenceModel Ibm852CzechModel; diff --git a/test/ca/iso-8859-1.txt b/test/ca/iso-8859-1.txt new file mode 100644 index 0000000..e052082 --- 
/dev/null +++ b/test/ca/iso-8859-1.txt @@ -0,0 +1 @@ +Les marmotes (Marmota) són un gènere de mamífers de la família dels esciúrids.[1] Viuen a l'alta muntanya a l'hemisferi nord. Són rosegadors de mida mitjana, una mica més grans que els gats domèstics, de potes curtes i cos ample que els proporcionen un aspecte força rabassut. diff --git a/test/ca/utf-8.txt b/test/ca/utf-8.txt new file mode 100644 index 0000000..a6ea831 --- /dev/null +++ b/test/ca/utf-8.txt @@ -0,0 +1 @@ +Les marmotes (Marmota) són un gènere de mamífers de la família dels esciúrids.[1] Viuen a l'alta muntanya a l'hemisferi nord. Són rosegadors de mida mitjana, una mica més grans que els gats domèstics, de potes curtes i cos ample que els proporcionen un aspecte força rabassut. diff --git a/test/ca/windows-1252.txt b/test/ca/windows-1252.txt new file mode 100644 index 0000000..d630e58 --- /dev/null +++ b/test/ca/windows-1252.txt @@ -0,0 +1 @@ +Les especials relacions econòmiques es fonamenten en la llibertat de trànsit de mercaderies, treballadors i capitals, així com en l'establiment d'una moneda única, l'euro (€) per tots els estats membres (la denominada Eurozona). |