1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
|
#!/usr/bin/env python
# coding=utf-8
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# This utility translates a normal dictionary (in this case English/Czech)
# into a thesaurus for one of the languages (in this case Czech).
#
# Based on an idea by Zdenek Zabokrtsky <zabokrtsky@ufal.mff.cuni.cz>, big
# thanks! :-)
import os
import re
import sys
def usage():
    """Write a short command-line usage summary to standard error.

    The program name shown in the synopsis is the basename of
    ``sys.argv[0]``.
    """
    # The synopsis has to name the second argument exactly like the
    # description line below it ("blacklist.txt"), otherwise the help text
    # contradicts itself.
    message = """Usage: {program} slovnik_data_utf8.txt blacklist.txt
slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php
blacklist.txt: List of words that should be ignored when generating the thesaurus
"""
    sys.stderr.write(message.format(program=os.path.basename(sys.argv[0])))
def classify(typ):
    """Translate a part-of-speech tag into the label used in the output.

    The tags 'adj', 'adv', 'n' and 'v' map to their abbreviated labels;
    an empty or unrecognized tag yields an empty string.
    """
    labels = {
        'adj': '(příd. jm.)',
        'adv': '(přísl.)',
        'n': '(podst. jm.)',
        'v': '(slov.)',
    }
    return labels.get(typ, '')
def parse(filename, blacklistname):
    """Read the dictionary and the blacklist and build the lookup tables.

    Parameters:
        filename: path of the tab-separated dictionary file.  Column 1 is
            the index term, column 2 the word, and the optional column 3
            holds the type / usage notes.
        blacklistname: path of the blacklist file, one entry per line.  An
            entry matches as "index<TAB>word", "index", "index<TAB>" or
            "<TAB>word".

    Lines starting with '#' are skipped in both files.

    Returns:
        A tuple (synonyms, meanings, classification) of plain dicts:
        synonyms maps index -> list of (word, type label) pairs,
        meanings maps word -> list of indexes it appeared under,
        classification maps word -> list of distinct non-empty type labels.
    """
    blacklist = set()
    with open(blacklistname, "r") as fp:
        for line in fp:
            if line == '' or line[0] == '#':
                continue
            # Only spaces and newlines are stripped: leading/trailing tabs
            # are significant because they select which column is matched.
            blacklist.add(line.strip(' \n'))

    synonyms = {}
    meanings = {}
    classification = {}

    # Raw strings keep the backslash escapes intact for the regex engine.
    match_ignore = re.compile(r'(\[neprav\.\]|\[vulg\.\])')
    match_cleanup = re.compile(r'(\[.*\]|\*|:.*)')

    with open(filename, "r") as fp:
        for line in fp:
            if line == '' or line[0] == '#':
                continue

            terms = line.split('\t')
            if len(terms) < 2:
                continue

            index = terms[0].strip()
            if index == '':
                continue

            word = terms[1].strip()
            if word != '' and word[0] == '"' and word[-1] == '"':
                word = word.strip('" ')
            if word == '':
                continue

            if (index + '\t' + word in blacklist or
                    index in blacklist or
                    index + '\t' in blacklist or
                    '\t' + word in blacklist):
                continue

            typ = ''
            # The type lives in the third column, so it may only be read
            # when at least three columns exist; two-column lines simply
            # have no type.
            if len(terms) >= 3:
                raw_typ = terms[2]
                # ignore non-translations
                if match_ignore.search(raw_typ) is not None:
                    continue
                typ = classify(match_cleanup.sub('', raw_typ).strip())

            synonyms.setdefault(index, []).append((word, typ))
            meanings.setdefault(word, []).append(index)

            if typ != '':
                known_types = classification.setdefault(word, [])
                if typ not in known_types:
                    known_types.append(typ)

    return (synonyms, meanings, classification)
def buildThesaurus(synonyms, meanings, classification):
    """Print the thesaurus entries for all words to standard output.

    Parameters:
        synonyms: dict mapping index -> list of (word, type label) pairs.
        meanings: dict mapping word -> list of indexes.
        classification: dict mapping word -> list of type labels.

    For every word (in sorted order) each of its indexes is treated as one
    meaning, and the other words listed under that index are its synonyms.
    A word with at least one output line is printed as

        word|<number of lines>
        <type label>|syn1|syn2...

    The type label is left out of the lines when the word itself has
    exactly one known type; in that case synonyms of a different non-empty
    type are dropped instead.
    """
    for word in sorted(meanings):
        # we assume that various indexes (english words here) are various
        # meanings; not generally true, but...
        indexes = meanings[word]

        # only limit the words if the type is unambiguous
        typ = ''
        known_types = classification.get(word, [])
        if len(known_types) == 1:
            typ = known_types[0]

        # we want to output each word just once; a set keeps the
        # membership test cheap
        used_this_round = {word}
        output_lines = []

        for index in indexes:
            types = []      # distinct types of this index, first-seen order
            grouped = {}    # type -> synonyms accepted for this index
            for (w, t) in synonyms[index]:
                if t not in types:
                    types.append(t)
                if typ != '' and t != '' and typ != t:
                    continue
                if w in used_this_round:
                    continue
                used_this_round.add(w)
                grouped.setdefault(t, []).append(w)

            # build the various thesaurus lines, keeping the type order
            for t in types:
                if t in grouped:
                    output_lines.append((t, '|' + '|'.join(grouped[t])))

        if output_lines:
            # A single-argument print() call behaves the same under
            # Python 2 and Python 3.
            print(word + '|' + str(len(output_lines)))

            # those with existing classification are probably a better fit,
            # put them to the front (even if we don't output the
            # classification in the end)
            for want_typed in (True, False):
                for (t, line) in output_lines:
                    if (t != '') == want_typed:
                        print(t + line if typ == '' else line)
def main(args):
    """Run the conversion for an argv-style argument list.

    Parameters:
        args: list of exactly three items: the program name, the dictionary
            file path and the blacklist file path.

    With any other number of arguments the usage text is printed and the
    process exits with status 1.  Otherwise the literal line "UTF-8" is
    printed, followed by the generated thesaurus entries.
    """
    if len(args) != 3:
        usage()
        sys.exit(1)

    synonyms, meanings, classification = parse(args[1], args[2])

    # A single-argument print() call behaves the same under Python 2 and
    # Python 3, unlike the print statement.
    print("UTF-8")
    buildThesaurus(synonyms, meanings, classification)
# Script entry point: run the conversion only when this file is executed
# directly, passing the full argument vector (program name included).
if __name__ == "__main__":
    main(sys.argv)
# vim:set shiftwidth=4 softtabstop=4 expandtab:
|