summaryrefslogtreecommitdiff
path: root/langclass/fpdb.conf
blob: 428074b0877f0bdb5efbef3339c6cad120d5c1e5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# This file has been modified (for OOo, by Jocelyn MERAND joc.mer@gmail.com) to
# include country and encoding.
#
# TO-DO: convert to BCP-47
#
# Guess strings are formed as follows: language-country-encoding
# (the country part may be left empty, e.g. en--utf8).
#
# Based on a sample config file for the language models provided with Gertjan
# van Noord's language guesser (http://odur.let.rug.nl/~vannoord/TextCat/)
#
# Notes:
# - Putting the most probable languages at the top of the list
# improves performance, because this will raise the threshold for
# likely candidates more quickly.
#
# Top 10 http://www.ethnologue.com/ethno_docs/distribution.asp?by=size
zh-Hans.lm  zh-CN-utf8  #zh-Hans
es.lm       es--utf8
en.lm       en--utf8
ar.lm       ar--utf8
hi.lm       hi--utf8
bn.lm       bn--utf8
pt.lm       pt--utf8
ru.lm       ru--utf8
ja.lm       ja--utf8
de.lm       de--utf8

ab.lm       ab--utf8
ace.lm      ace--utf8
ada.lm      ada--utf8
af.lm       af--utf8
ak.lm       ak--utf8
alt.lm      alt--utf8
am.lm       am--utf8
arn.lm      arn--utf8
ast.lm      ast--utf8
az.lm       az--utf8    #az-Latn
az-Cyrl.lm  az-cyrillic-utf8
ay.lm       ay--utf8
ban.lm      ban--utf8
be.lm       be--utf8
bem.lm      bem--utf8
bg.lm       bg--utf8
bho.lm      bho--utf8
bi.lm       bi--utf8
bik.lm      bik--utf8
bm.lm       bm--utf8
bo.lm       bo--utf8
br.lm       br--utf8
bs.lm       bs--utf8    #Suppress-Script: Latn
buc.lm      buc--utf8
ca.lm       ca--utf8
ckb.lm      ckb--utf8
cs.lm       cs--utf8
cv.lm       cv--utf8
cy.lm       cy--utf8
da.lm       da--utf8
dv.lm       dv--utf8
dz.lm       dz--utf8
ee.lm       ee--utf8
el.lm       el--utf8
emk-Latn.lm emk-Latn-utf8
eo.lm       eo--utf8
et.lm       et--utf8
eu.lm       eu--utf8
fa.lm       fa--utf8
fi.lm       fi--utf8
fj.lm       fj--utf8
fkv.lm      fkv--utf8
fo.lm       fo--utf8
fr.lm       fr--utf8
fur.lm      fur--utf8
fy.lm       fy--utf8
ga.lm       ga--utf8
gd.lm       gd--utf8
gl.lm       gl--utf8
grc.lm      grc--utf8
gu.lm       gu--utf8
gug.lm      gug--utf8
gv.lm       gv--utf8
ha-NG.lm    ha-NG-utf8
haw.lm      haw--utf8
he.lm       he--utf8
hil.lm      hil--utf8
hr.lm       hr--utf8    #Suppress-Script: Latn
hsb.lm      hsb--utf8
ht.lm       ht--utf8
hu.lm       hu--utf8
hy.lm       hy--utf8
ia.lm       ia--utf8
id.lm       id--utf8
is.lm       is--utf8
it.lm       it--utf8
ka.lm       ka--utf8
kk.lm       kk--utf8
kl.lm       kl--utf8
km.lm       km--utf8
kn.lm       kn--utf8
kng.lm      kng--utf8
ko.lm       ko--utf8
ktu.lm      ktu--utf8
ky.lm       ky--utf8
la.lm       la--utf8
lb.lm       lb--utf8
lg.lm       lg--utf8
ln.lm       ln--utf8
lo.lm       lo--utf8
lt.lm       lt--utf8
lv.lm       lv--utf8
mai.lm      mai--utf8
mi.lm       mi--utf8
mk.lm       mk--utf8
ml.lm       ml--utf8
mn.lm       mn--utf8    #mn-Cyrl
mos.lm      mos--utf8
mr.lm       mr--utf8
ms.lm       ms--utf8    #ms-Latn
mt.lm       mt--utf8
my.lm       my--utf8
nb.lm       nb--utf8
nds.lm      nds--utf8
ne.lm       ne--utf8
nl.lm       nl--utf8
nn.lm       nn--utf8
nr.lm       nr--utf8
nso.lm      nso--utf8
ny.lm       ny--utf8
oc.lm       oc--utf8
om.lm       om--utf8
pa.lm       pa--utf8
pl.lm       pl--utf8
plt.lm      plt--utf8
quz.lm      quz--utf8
qxa.lm      qxa--utf8
rm.lm       rm--utf8
ro.lm       ro--utf8
rue.lm      rue--utf8
rw.lm       rw--utf8
sa.lm       sa--utf8
sc.lm       sc--utf8
sco.lm      sco--utf8
sd.lm       sd--utf8    #sd-Arab
se.lm       se--utf8
sg.lm       sg--utf8
shs.lm      shs--utf8
si.lm       si--utf8
sk.lm       sk--utf8
sl.lm       sl--utf8
so.lm       so--utf8
sq.lm       sq--utf8
sr-Cyrl.lm  sr--utf8    #sr-Cyrl
sr-Latn.lm  sh--utf8    #sr-Latn
ss.lm       ss--utf8
st.lm       st--utf8
sv.lm       sv--utf8
sw.lm       sw--utf8
ta.lm       ta--utf8
tet.lm      tet--utf8
tg.lm       tg--utf8
th.lm       th--utf8
ti.lm       ti--utf8
tk.lm       tk--utf8    #tk-Latn
tl.lm       tl--utf8
tn.lm       tn--utf8
tpi.lm      tpi--utf8
tr.lm       tr--utf8
ts.lm       ts--utf8
tt.lm       tt--utf8
ty.lm       ty--utf8
tzm-Latn.lm tzm-Latn-utf8
ug.lm       ug--utf8    #ug-Arab
uk.lm       uk--utf8
ur.lm       ur--utf8
uz.lm       uz--utf8    #uz-Latn
uz-Cyrl.lm  uz-Cyrl-utf8
ve.lm       ve--utf8
vep.lm      vep--utf8
vi.lm       vi--utf8
wa.lm       wa--utf8
xh.lm       xh--utf8
yi.lm       yi--utf8
yo.lm       yo--utf8
zh-Hant.lm  zh-TW-utf8  #zh-Hant
zu.lm       zu--utf8