summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jehan <jehan@girinstud.io> 2022-12-20 01:46:15 +0100
committer Jehan <jehan@girinstud.io> 2022-12-20 01:46:15 +0100
commit d40e5868d5ec1f08f1e6e0d25e04dae68c586ba1 (patch)
tree 95cab0573b4160c7d0b5223f7466014309e62d5a
parent cec8817d799e1f0bdc86031d4e20433016fcf85f (diff)
script, src, test: adding Catalan support.
For UTF-8, ISO-8859-1 and WINDOWS-1252 support. The test for UTF-8 and ISO-8859-1 is taken from 'Marmota' page on Wikipedia in Catalan. The test for WINDOWS-1252 is taken from the 'Unió_Europea' page. ISO-8859-1 and WINDOWS-1252 being very similar, regarding most letters (in particular the ones used in Catalan), I differentiated the test with a text containing the '€' symbol, which is on an unused spot in ISO-8859-1.
-rw-r--r-- README.md 4
-rw-r--r-- script/BuildLangModelLogs/LangCatalanModel.log 238
-rw-r--r-- script/langs/ca.py 79
-rw-r--r-- script/support.txt 1
-rw-r--r-- src/CMakeLists.txt 1
-rw-r--r-- src/LangModels/LangCatalanModel.cpp 207
-rw-r--r-- src/nsLanguageDetector-generated.h 3
-rw-r--r-- src/nsMBCSGroupProber.cpp 1
-rw-r--r-- src/nsSBCSGroupProber.cpp 3
-rw-r--r-- src/nsSBCharSetProber-generated.h 5
-rw-r--r-- test/ca/iso-8859-1.txt 1
-rw-r--r-- test/ca/utf-8.txt 1
-rw-r--r-- test/ca/windows-1252.txt 1
13 files changed, 543 insertions, 2 deletions
diff --git a/README.md b/README.md
index ab200bb..8b71ba9 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,10 @@ uchardet started as a C language binding of the original C++ implementation of t
* UTF-8
* ISO-8859-5
* WINDOWS-1251
+ * Catalan
+ * UTF-8
+ * ISO-8859-1
+ * WINDOWS-1252
* Chinese
* UTF-8
* ISO-2022-CN
diff --git a/script/BuildLangModelLogs/LangCatalanModel.log b/script/BuildLangModelLogs/LangCatalanModel.log
new file mode 100644
index 0000000..f00ab58
--- /dev/null
+++ b/script/BuildLangModelLogs/LangCatalanModel.log
@@ -0,0 +1,238 @@
+= Logs of language model for Catalan (ca) =
+
+- Generated by BuildLangModel.py
+- Started: 2022-12-20 01:31:40.290803
+- Maximum depth: 4
+- Max number of pages: 200
+
+== Parsed pages ==
+
+Parlament_Europeu (revision 31056370)
+Genji Monogatari (revision 31007904)
+Bundestag (revision 30742728)
+Kana (revision 29176811)
+Jun'ichirō Tanizaki (revision 30750244)
+Representació proporcional amb llista de partit (revision 22086795)
+Agències de la Unió Europea (revision 30276199)
+Poder executiu (revision 30290834)
+Edicions Atalanta (revision 26048077)
+Animació (revision 30865051)
+Pressupost de la Unió Europea (revision 30231577)
+Jorge Luis Borges (revision 30783720)
+Universitat de Pittsburgh (revision 25411555)
+Satiricó (revision 31019009)
+Dramatis personae (revision 30858787)
+Corpus lingüístic (revision 28600087)
+Genji Monogatari Emaki (revision 30520718)
+Era Keichō (revision 27881416)
+Període Heian (revision 30351338)
+Uji (revision 26298733)
+Clan Minamoto (revision 29218047)
+Ventafocs (revision 30167478)
+わ (revision 28487155)
+Japó (revision 30980338)
+Agència Europea dels Sistemes Globals de Navegació per Satèl·lit (revision 28777516)
+Període Shōwa (revision 30351346)
+ム (revision 25190709)
+Premi Balzan (revision 30321993)
+Germans Grimm (revision 30104486)
+Europol (revision 25369380)
+Unió Europea (revision 30730061)
+Kyoto (revision 30706119)
+Incendi del Reichstag (revision 30894126)
+Processament de llenguatge natural (revision 29016655)
+794 (revision 29283769)
+CANTIC (revision 30488826)
+Casa de la Història Europea (revision 30703943)
+VP:VER (revision 30232565)
+Katakana (revision 29937701)
+Shogunat Kamakura (revision 28808156)
+Eleccions (revision 30449311)
+Noam Chomsky (revision 30552025)
+Eleccions federals alemanyes de 1994 (revision 28337358)
+Conceptes d'unitat europea abans del 1945 (revision 30927921)
+Era Heian (revision 30351338)
+Gemeinsame Normdatei (revision 30883432)
+La Bella Dorment (pel·lícula de 1959) (revision 30982067)
+Població (revision 30352350)
+Obra literària (revision 31011396)
+も (revision 25190714)
+Istituto Centrale per il Catalogo Unico (revision 28786509)
+Política (revision 31014511)
+ハ (revision 31071577)
+Vot (revision 27865452)
+Clan Taira (revision 26323649)
+Permís de conducció europeu (revision 27672810)
+Mandala (revision 30940608)
+Campània antiga (revision 29855854)
+ゐ (revision 28487156)
+Consell de la Unió Europea (revision 30308594)
+24 de juliol (revision 31063555)
+Kyōto (revision 30706119)
+Alfons X de Castella (revision 30535714)
+VIAF (revision 28927187)
+1975 (revision 31057077)
+モ (revision 25190714)
+Sistema Galileo (revision 30880731)
+Densitat de població (revision 30174278)
+Autodesk Maya (revision 30989692)
+Nàpols (revision 31028649)
+Memòria de traducció (revision 30341759)
+Ryukyu (revision 29922259)
+Agència Europea per a la Seguretat i la Salut en el Treball (revision 29049313)
+ISNI (revision 30824306)
+PDF (revision 29442049)
+Eleccions federals alemanyes de 1972 (revision 30271501)
+Sistema presidencialista (revision 30596011)
+Primer ministre (revision 27174693)
+Coeducació (revision 31048027)
+Ko Tazawa (revision 30932179)
+Poliomielitis (revision 30976061)
+18 de setembre (revision 31063494)
+Campanya electoral (revision 27935270)
+Kōbō Abe (revision 30016508)
+Rodopis (revision 28014188)
+Política Agrària Comunitària (revision 30353551)
+21 d'octubre (revision 30980460)
+1984 (revision 31063521)
+South Park (revision 31024165)
+Hiragana (revision 29920075)
+Associació de Votants de Schleswig Meridional (revision 30753058)
+ひ (revision 31071564)
+Lingüística (revision 31037031)
+Blauet comú (revision 28729161)
+Autodeterminació (revision 29349294)
+Xina (revision 31007838)
+Control d'autoritats (revision 29854505)
+Guillermo de Torre (revision 30765552)
+Unesco (revision 30129516)
+Romanització Hepburn (revision 29144432)
+Tanka (revision 30478859)
+Clientelisme (revision 30811663)
+Corpus Textual Informatitzat de la Llengua Catalana (revision 29876775)
+Secessió (revision 29980781)
+Fada protectora (revision 29175001)
+を (revision 28487157)
+Ōtsu (revision 30010938)
+Gran Enciclopèdia Catalana (revision 30724375)
+LCCN (revision 30638965)
+Universitat privada (revision 28518823)
+Robert Louis Stevenson (revision 30728093)
+Kioto (revision 30706119)
+7 de setembre (revision 30503878)
+Aardman Animations (revision 30216975)
+Llibertinatge (revision 29597307)
+Bibliothèque nationale de France (revision 30715383)
+Alemanya Occidental (revision 30239917)
+National Library of Australia (revision 30977078)
+Diccionari Descriptiu de la Llengua Catalana (revision 27017217)
+1969 (revision 31060188)
+Separació de poders (revision 30362225)
+Isaac Titsingh (revision 29748956)
+Adolf Hitler (revision 30951478)
+Període Kamakura (revision 28808156)
+Societas Europaea (revision 28857120)
+Invasions japoneses a Corea (revision 30978745)
+Agència de la Unió Europea (revision 30276199)
+Sistema polític (revision 30713673)
+1606 (revision 26237152)
+Universitat Rovira i Virgili (revision 30865280)
+IVA (revision 30328630)
+Patricis (revision 30923152)
+Els barrufets (revision 31008031)
+Lapislàtzuli Editorial (revision 30176117)
+Internet (revision 30894405)
+BIBSYS (revision 30255267)
+Agència Europea de Seguretat Marítima (revision 28888118)
+National Diet Library (revision 30669422)
+Grup Enciclopèdia Catalana (revision 31077222)
+Competència comunicativa (revision 30307632)
+Castell Fushimi (revision 30610308)
+Walter Gropius (revision 30790098)
+Biblioteca Nacional de España (revision 31071591)
+Diccionari Normatiu Valencià (revision 29882403)
+Oscar Wilde (revision 31078983)
+Hampshire (revision 30823098)
+Clan Fujiwara (revision 30894950)
+Speedy Gonzales (revision 30151280)
+Tlön, Uqbar, Orbis Tertius (revision 29688246)
+Japó ocupat (revision 28083159)
+Garbancito de la Mancha (revision 30219073)
+SUDOC (revision 29231585)
+Gerardo Diego (revision 29912471)
+Universitat (revision 29907980)
+Foliscopi (revision 29903436)
+1980 (revision 31063457)
+Infart de miocardi (revision 30894255)
+Encyclopædia Britannica (revision 28347959)
+Petroni (revision 29790499)
+Horari de màxima audiència (revision 27872454)
+Sutra (revision 23458427)
+Medicina (revision 31002196)
+ホ (revision 25190705)
+Luci Appuleu (revision 30336717)
+Novel·la (revision 30386814)
+Kimba, el lleó blanc (revision 30273901)
+UTC+09:00 (revision 25182859)
+Arquitectura neogòtica (revision 30347122)
+Segle I (revision 30953541)
+Emperador del Japó (revision 27799841)
+Biblioteca Nacional de la República Txeca (revision 29847950)
+Gran Diccionari de la Llengua Catalana (revision 29063719)
+Període Reiwa (revision 29227861)
+
+== End of Parsed pages ==
+
+- Wikipedia parsing ended at: 2022-12-20 01:34:38.734771
+
+57 characters appeared 1339831 times.
+
+Most Frequent characters:
+[ 0] Char e: 12.524042211293812 %
+[ 1] Char a: 11.715955221218199 %
+[ 2] Char i: 7.815090112111155 %
+[ 3] Char s: 7.809940208877089 %
+[ 4] Char r: 6.866686917976969 %
+[ 5] Char n: 6.706069646097157 %
+[ 6] Char l: 6.58105387918327 %
+[ 7] Char t: 6.268850325152949 %
+[ 8] Char o: 5.046308079153267 %
+[ 9] Char c: 4.242027539294135 %
+[10] Char d: 4.013192708632656 %
+[11] Char u: 3.5825413802188484 %
+[12] Char m: 3.048966623402504 %
+[13] Char p: 2.778783294310999 %
+[14] Char g: 1.4824257686230575 %
+[15] Char v: 1.3498717375549603 %
+[16] Char b: 1.2941184373253045 %
+[17] Char f: 0.975943980994618 %
+[18] Char q: 0.7455417884792933 %
+[19] Char h: 0.6949383914837021 %
+[20] Char ó: 0.5910446914573555 %
+[21] Char x: 0.5195431364104875 %
+[22] Char é: 0.4443097674258918 %
+[23] Char à: 0.3875115592936721 %
+[24] Char j: 0.36474749427353154 %
+[25] Char y: 0.3636279500922131 %
+[26] Char è: 0.3583287743006394 %
+[27] Char í: 0.3250409939761059 %
+[28] Char k: 0.2481656268589098 %
+[29] Char ò: 0.21577348187943107 %
+[30] Char z: 0.17778361599336034 %
+[31] Char w: 0.11673113997213082 %
+[32] Char ç: 0.11016314744172959 %
+[33] Char ú: 0.08792153637287091 %
+[34] Char ü: 0.06709801460034885 %
+[35] Char ï: 0.05448448349082832 %
+
+The first 36 characters have an accumulated ratio of 0.9997462366522347.
+The first 5 characters have an accumulated ratio of 0.4673171467147723.
+All characters whose order is over 21 have an accumulated ratio of 0.03321687585971664.
+
+1083 sequences found.
+
+First 517 (typical positive ratio): 0.9950067888087288
+Next 195 (712-517): 0.003994192320077694
+Rest: 0.0009990188711934689
+
+- Processing end: 2022-12-20 01:34:38.859159
diff --git a/script/langs/ca.py b/script/langs/ca.py
new file mode 100644
index 0000000..535ed66
--- /dev/null
+++ b/script/langs/ca.py
@@ -0,0 +1,79 @@
+#!/bin/python3
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+import re
+
+## Mandatory Properties ##
+
+# The human name for the language, in English.
+name = 'Catalan'
+# Use 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
+# or use another catalog as a last resort.
+code = 'ca'
+# ASCII characters are also used in Catalan.
+use_ascii = True
+# The charsets we want to support and create data for.
+charsets = ['ISO-8859-1', 'WINDOWS-1252']
+
+## Optional Properties ##
+
+# Alphabet characters.
+# If use_ascii=True, there is no need to add any ASCII characters.
+# If case_mapping=True, there is no need to add several cases of a same
+# character (provided Python algorithms know the right cases).
+alphabet = ['à', 'è', 'é', 'í', 'ï', 'ó', 'ò', 'ú', 'ü', 'ç']
+# The start page. Though optional, it is advised to choose one yourself.
+start_pages = ['Parlament_Europeu', 'Genji Monogatari']
+# give possibility to select another code for the Wikipedia URL.
+wikipedia_code = code
+# 'a' and 'A' will be considered the same character, and so on.
+# This uses Python algorithm to determine upper/lower-case of a given
+# character.
+case_mapping = True
+
+# A function to clean content returned by the `wikipedia` python lib,
+# in case some unwanted data has been overlooked.
+# Note that we are already cleaning away the '=' from the title syntax
+# of Wikipedia, as well as double spaces. But sometimes, Wikipedia in
+# some language may return weird syntax or UI text which should be
+# discarded. If you encounter one of these cases, use this function.
+def clean_wikipedia_content(content):
+ # Hook for garbage text cleaning; currently a no-op that returns the content unchanged.
+ return content
diff --git a/script/support.txt b/script/support.txt
index d52051e..9b10026 100644
--- a/script/support.txt
+++ b/script/support.txt
@@ -1,6 +1,7 @@
ar
be
bg
+ca
cs
da
de
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c017642..17fd980 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,6 +10,7 @@ set(
LangModels/LangArabicModel.cpp
LangModels/LangBelarusianModel.cpp
LangModels/LangBulgarianModel.cpp
+ LangModels/LangCatalanModel.cpp
LangModels/LangCroatianModel.cpp
LangModels/LangCzechModel.cpp
LangModels/LangEnglishModel.cpp
diff --git a/src/LangModels/LangCatalanModel.cpp b/src/LangModels/LangCatalanModel.cpp
new file mode 100644
index 0000000..5e0f4f6
--- /dev/null
+++ b/src/LangModels/LangCatalanModel.cpp
@@ -0,0 +1,207 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is Mozilla Communicator client code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 1998
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#include "../nsSBCharSetProber.h"
+#include "../nsSBCharSetProber-generated.h"
+#include "../nsLanguageDetector.h"
+
+#include "../nsLanguageDetector-generated.h"
+
+/********* Language model for: Catalan *********/
+
+/**
+ * Generated by BuildLangModel.py
+ * On: 2022-12-20 01:34:38.735681
+ **/
+
+ /* Character Mapping Table:
+ * ILL: illegal character.
+ * CTR: control character specific to the charset.
+ * RET: line feed (0x0A) or carriage return (0x0D).
+ * SYM: symbol (punctuation) that does not belong to a word.
+ * NUM: 0 - 9.
+ *
+ * Other characters are ordered by frequency
+ * (0 is the most common character in the language).
+ *
+ * Orders are generic to a language. So the codepoint with order X in
+ * CHARSET1 maps to the same character as the codepoint with the same
+ * order X in CHARSET2 for the same language.
+ * As such, some orders may be missing from a given charset. For instance the
+ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
+ * even though they are both used for French. Same for the euro sign.
+ */
+ static const unsigned char Iso_8859_1_CharToOrderMap[] = /* 16 x 16 entries, indexed by byte value; rows 4X/5X and 6X/7X give upper and lower case the same order */
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 1, 16, 9, 10, 0, 17, 14, 19, 2, 24, 28, 6, 12, 5, 8, /* 4X */
+ 13, 18, 4, 3, 7, 11, 15, 31, 21, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 1, 16, 9, 10, 0, 17, 14, 19, 2, 24, 28, 6, 12, 5, 8, /* 6X */
+ 13, 18, 4, 3, 7, 11, 15, 31, 21, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
+ SYM,SYM,SYM,SYM,SYM, 54,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ 23, 36, 45, 41, 39, 57, 40, 32, 26, 22, 49, 42, 50, 27, 58, 35, /* CX */
+ 53, 38, 29, 20, 43, 52, 37,SYM, 56, 51, 33, 55, 34, 59, 60, 44, /* DX */
+ 23, 36, 45, 41, 39, 61, 40, 32, 26, 22, 49, 42, 50, 27, 62, 35, /* EX */
+ 53, 38, 29, 20, 43, 52, 37,SYM, 56, 51, 33, 55, 34, 63, 64, 65, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const unsigned char Windows_1252_CharToOrderMap[] = /* 16 x 16 entries, indexed by byte value; unlike the ISO-8859-1 table, rows 8X/9X hold symbols and letters rather than CTR */
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 1, 16, 9, 10, 0, 17, 14, 19, 2, 24, 28, 6, 12, 5, 8, /* 4X */
+ 13, 18, 4, 3, 7, 11, 15, 31, 21, 25, 30,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 1, 16, 9, 10, 0, 17, 14, 19, 2, 24, 28, 6, 12, 5, 8, /* 6X */
+ 13, 18, 4, 3, 7, 11, 15, 31, 21, 25, 30,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ SYM,ILL,SYM, 66,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM, 47,ILL, 46,ILL, /* 8X */
+ ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, 48,SYM, 47,ILL, 46, 67, /* 9X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* AX */
+ SYM,SYM,SYM,SYM,SYM, 54,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ 23, 36, 45, 41, 39, 68, 40, 32, 26, 22, 49, 42, 50, 27, 69, 35, /* CX */
+ 53, 38, 29, 20, 43, 52, 37,SYM, 56, 51, 33, 55, 34, 70, 71, 44, /* DX */
+ 23, 36, 45, 41, 39, 72, 40, 32, 26, 22, 49, 42, 50, 27, 73, 35, /* EX */
+ 53, 38, 29, 20, 43, 52, 37,SYM, 56, 51, 33, 55, 34, 74, 75, 76, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const int Unicode_Char_size = 72; /* number of pairs in Unicode_CharOrder (144 values / 2) */
+static const unsigned int Unicode_CharOrder[] = /* flat list of pairs: Unicode code point, then its frequency order; sorted by ascending code point */
+{
+ 65, 1, 66, 16, 67, 9, 68, 10, 69, 0, 70, 17, 71, 14, 72, 19,
+ 73, 2, 74, 24, 75, 28, 76, 6, 77, 12, 78, 5, 79, 8, 80, 13,
+ 81, 18, 82, 4, 83, 3, 84, 7, 85, 11, 86, 15, 87, 31, 88, 21,
+ 89, 25, 90, 30, 97, 1, 98, 16, 99, 9, 100, 10, 101, 0,102, 17,
+ 103, 14, 104, 19, 105, 2, 106, 24, 107, 28, 108, 6, 109, 12,110, 5,
+ 111, 8, 112, 13, 113, 18, 114, 4, 115, 3, 116, 7, 117, 11,118, 15,
+ 119, 31, 120, 21, 121, 25, 122, 30, 192, 23, 199, 32, 200, 26,201, 22,
+ 205, 27, 207, 35, 210, 29, 211, 20, 218, 33, 220, 34, 224, 23,231, 32,
+ 232, 26, 233, 22, 237, 27, 239, 35, 242, 29, 243, 20, 250, 33,252, 34,
+};
+
+
+ /* Model Table:
+ * Total considered sequences: 1083 / 1296 (1296 = 36 x 36)
+ * - Positive sequences: first 517 (0.9950067888087288)
+ * - Probable sequences: next 195 (712-517) (0.003994192320077694)
+ * - Neutral sequences: last 584 (0.0009990188711934689)
+ * - Negative sequences: 213 (off-ratio)
+ * Negative sequences: TODO
+ */
+static const PRUint8 CatalanLangModel[] = /* 36 rows of 36 entries, each in the range 0..3 */
+{
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,1,3,3,3,0,1,3,2,3,3,2,0,2,3,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,1,0,3,3,2,3,3,1,3,3,3,1,1,3,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,0,3,3,3,3,2,0,0,1,
+ 3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,1,2,3,0,3,3,3,3,2,1,3,0,2,1,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,3,2,2,3,3,2,1,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,3,3,0,0,
+ 3,3,3,3,2,2,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,2,3,3,1,3,3,3,3,3,3,1,2,2,1,0,
+ 3,3,3,3,3,3,3,3,3,3,1,3,3,2,3,2,2,2,0,3,2,3,3,3,3,3,3,3,1,3,3,2,0,2,1,0,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,1,2,3,3,2,0,3,1,2,3,1,0,0,2,
+ 3,3,3,3,3,3,3,3,3,3,2,3,2,2,2,1,0,1,2,3,2,1,3,3,1,3,3,3,3,3,1,1,0,2,0,0,
+ 3,3,3,3,3,2,3,2,3,2,3,3,3,2,3,2,1,3,3,2,2,0,3,3,2,3,3,3,1,2,2,3,0,3,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,2,1,3,3,2,3,2,3,3,3,0,2,1,0,0,0,3,
+ 3,3,3,3,2,3,2,3,3,3,1,3,3,3,1,1,3,3,0,1,3,1,3,3,2,2,3,3,0,3,1,1,0,3,1,0,
+ 3,3,3,3,3,2,3,3,3,3,3,3,1,3,1,2,2,2,0,3,3,0,2,3,0,2,3,3,1,3,0,1,2,3,0,0,
+ 3,3,3,3,3,3,3,2,3,1,2,3,3,2,3,1,2,1,0,3,2,1,1,2,2,3,3,2,1,2,1,2,0,2,3,0,
+ 3,3,3,1,2,1,1,1,3,0,1,3,0,1,0,1,1,1,0,0,0,1,3,3,0,1,2,3,1,1,0,1,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,2,3,3,2,2,2,2,3,0,0,1,2,1,3,3,3,2,3,2,1,2,1,0,0,2,1,0,
+ 3,3,3,2,3,1,3,3,3,1,1,3,1,1,2,1,0,3,0,0,2,0,2,3,0,1,2,3,1,3,0,0,0,1,2,0,
+ 0,1,1,1,1,0,0,1,0,0,0,3,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,
+ 3,3,3,2,3,3,3,3,3,1,1,3,2,1,0,0,1,1,1,1,1,0,1,2,1,2,1,1,2,1,1,2,0,1,1,0,
+ 1,0,0,3,2,3,1,1,0,1,1,0,1,2,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,
+ 3,3,3,1,1,0,1,3,3,3,1,2,1,3,0,3,1,2,1,2,1,3,1,2,0,0,3,3,1,3,0,1,0,1,0,0,
+ 2,1,2,3,2,3,2,2,1,1,1,2,2,1,1,1,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,
+ 0,0,2,3,3,3,3,3,0,3,2,3,3,3,3,2,2,3,2,1,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,1,
+ 3,3,3,1,1,1,0,0,3,1,1,3,1,1,1,1,0,1,0,0,1,0,1,3,0,0,1,0,1,0,1,1,0,1,1,0,
+ 3,3,3,3,2,2,3,2,3,2,2,3,2,2,1,1,3,0,0,1,1,0,1,2,0,1,0,0,1,0,0,1,0,1,0,0,
+ 0,0,3,3,3,3,3,3,0,3,3,1,3,3,3,2,1,2,1,0,0,3,0,0,1,0,0,0,1,0,0,0,0,0,0,0,
+ 2,3,0,3,2,3,3,3,3,3,3,0,3,2,3,2,2,3,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
+ 3,3,3,3,2,2,2,2,3,1,1,3,2,2,1,1,1,2,0,2,1,1,0,1,1,3,0,1,2,0,0,2,0,1,0,0,
+ 0,0,0,3,3,3,3,3,1,3,3,0,3,3,3,1,2,2,2,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
+ 3,3,3,1,1,1,2,1,3,1,0,3,1,1,1,0,1,0,1,2,1,1,2,2,0,1,1,0,1,1,2,1,0,0,1,0,
+ 3,3,3,2,2,2,1,1,3,0,1,1,1,1,0,0,1,0,0,2,0,0,1,0,0,1,0,0,1,0,0,2,0,0,1,0,
+ 1,3,0,0,0,0,0,0,3,0,0,2,1,0,0,0,0,0,0,0,2,0,0,3,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,0,1,3,2,3,3,2,0,1,1,0,3,1,0,1,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
+ 3,0,3,0,2,2,2,1,0,1,1,0,1,0,0,0,1,0,0,2,0,0,1,0,0,0,3,3,0,0,0,0,0,0,0,0,
+ 2,2,0,3,1,3,2,3,0,2,3,0,1,0,0,0,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+};
+
+
+const SequenceModel Iso_8859_1CatalanModel = /* Catalan sequence model for ISO-8859-1 encoded input */
+{
+ Iso_8859_1_CharToOrderMap,
+ CatalanLangModel,
+ 36, /* side of the CatalanLangModel table (36 x 36 = 1296 entries) */
+ (float)0.9990009811288065, /* equals the positive + probable sequence ratios listed in the Model Table comment */
+ PR_TRUE, /* NOTE(review): meaning of this flag is defined by SequenceModel, which is not visible here */
+ "ISO-8859-1",
+ "ca"
+};
+
+const SequenceModel Windows_1252CatalanModel = /* Catalan sequence model for WINDOWS-1252 encoded input */
+{
+ Windows_1252_CharToOrderMap,
+ CatalanLangModel,
+ 36, /* side of the CatalanLangModel table (36 x 36 = 1296 entries) */
+ (float)0.9990009811288065, /* equals the positive + probable sequence ratios listed in the Model Table comment */
+ PR_TRUE, /* NOTE(review): meaning of this flag is defined by SequenceModel, which is not visible here */
+ "WINDOWS-1252",
+ "ca"
+};
+
+const LanguageModel CatalanModel = /* charset-independent Catalan model keyed by Unicode code point */
+{
+ "ca",
+ Unicode_CharOrder,
+ 72, /* same value as Unicode_Char_size: number of (code point, order) pairs */
+ CatalanLangModel,
+ 36, /* side of the CatalanLangModel table (36 x 36) */
+ 5, /* build log: "The first 5 characters have an accumulated ratio of ..." */
+ (float)0.4673171467147723, /* ... that accumulated ratio */
+ 21, /* build log: "All characters whose order is over 21 have an accumulated ratio of ..." */
+ (float)0.03321687585971664, /* ... that accumulated ratio */
+};
diff --git a/src/nsLanguageDetector-generated.h b/src/nsLanguageDetector-generated.h
index 4285e1d..39e0936 100644
--- a/src/nsLanguageDetector-generated.h
+++ b/src/nsLanguageDetector-generated.h
@@ -38,11 +38,12 @@
#ifndef nsLanguageDetector_h_generated_h__
#define nsLanguageDetector_h_generated_h__
-#define NUM_OF_LANGUAGE_MODELS 36
+#define NUM_OF_LANGUAGE_MODELS 37
extern const LanguageModel ArabicModel;
extern const LanguageModel BelarusianModel;
extern const LanguageModel BulgarianModel;
+extern const LanguageModel CatalanModel;
extern const LanguageModel CzechModel;
extern const LanguageModel DanishModel;
extern const LanguageModel GermanModel;
diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp
index cbce483..e9d7548 100644
--- a/src/nsMBCSGroupProber.cpp
+++ b/src/nsMBCSGroupProber.cpp
@@ -95,6 +95,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
langDetectors[i][j++] = new nsLanguageDetector(&ArabicModel);
langDetectors[i][j++] = new nsLanguageDetector(&BelarusianModel);
langDetectors[i][j++] = new nsLanguageDetector(&BulgarianModel);
+ langDetectors[i][j++] = new nsLanguageDetector(&CatalanModel);
langDetectors[i][j++] = new nsLanguageDetector(&CroatianModel);
langDetectors[i][j++] = new nsLanguageDetector(&CzechModel);
langDetectors[i][j++] = new nsLanguageDetector(&DanishModel);
diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp
index 9bf3ad3..ba054c8 100644
--- a/src/nsSBCSGroupProber.cpp
+++ b/src/nsSBCSGroupProber.cpp
@@ -237,6 +237,9 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[n++] = new nsSingleByteCharSetProber(&Ibm855MacedonianModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5MacedonianModel);
+ mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_1CatalanModel);
+ mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1252CatalanModel);
+
assert (n_sbcs_probers == n);
Reset();
diff --git a/src/nsSBCharSetProber-generated.h b/src/nsSBCharSetProber-generated.h
index 86dbae5..e110f08 100644
--- a/src/nsSBCharSetProber-generated.h
+++ b/src/nsSBCharSetProber-generated.h
@@ -38,7 +38,7 @@
#ifndef nsSingleByteCharSetProber_generated_h__
#define nsSingleByteCharSetProber_generated_h__
-#define NUM_OF_SEQUENCE_MODELS 116
+#define NUM_OF_SEQUENCE_MODELS 118
extern const SequenceModel Iso_8859_6ArabicModel;
extern const SequenceModel Windows_1256ArabicModel;
@@ -49,6 +49,9 @@ extern const SequenceModel Iso_8859_5BelarusianModel;
extern const SequenceModel Windows_1251BulgarianModel;
extern const SequenceModel Iso_8859_5BulgarianModel;
+extern const SequenceModel Iso_8859_1CatalanModel;
+extern const SequenceModel Windows_1252CatalanModel;
+
extern const SequenceModel Iso_8859_2CzechModel;
extern const SequenceModel Windows_1250CzechModel;
extern const SequenceModel Ibm852CzechModel;
diff --git a/test/ca/iso-8859-1.txt b/test/ca/iso-8859-1.txt
new file mode 100644
index 0000000..e052082
--- /dev/null
+++ b/test/ca/iso-8859-1.txt
@@ -0,0 +1 @@
+Les marmotes (Marmota) són un gènere de mamífers de la família dels esciúrids.[1] Viuen a l'alta muntanya a l'hemisferi nord. Són rosegadors de mida mitjana, una mica més grans que els gats domèstics, de potes curtes i cos ample que els proporcionen un aspecte força rabassut.
diff --git a/test/ca/utf-8.txt b/test/ca/utf-8.txt
new file mode 100644
index 0000000..a6ea831
--- /dev/null
+++ b/test/ca/utf-8.txt
@@ -0,0 +1 @@
+Les marmotes (Marmota) són un gènere de mamífers de la família dels esciúrids.[1] Viuen a l'alta muntanya a l'hemisferi nord. Són rosegadors de mida mitjana, una mica més grans que els gats domèstics, de potes curtes i cos ample que els proporcionen un aspecte força rabassut.
diff --git a/test/ca/windows-1252.txt b/test/ca/windows-1252.txt
new file mode 100644
index 0000000..d630e58
--- /dev/null
+++ b/test/ca/windows-1252.txt
@@ -0,0 +1 @@
+Les especials relacions econòmiques es fonamenten en la llibertat de trànsit de mercaderies, treballadors i capitals, així com en l'establiment d'una moneda única, l'euro (€) per tots els estats membres (la denominada Eurozona).