summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJan Holesovsky <kendy@collabora.com>2016-02-26 08:38:58 +0100
committerJan Holesovsky <kendy@collabora.com>2016-02-26 08:46:32 +0100
commitc32de9bba681bcc7becc64f0ea3b605ff2b38266 (patch)
tree1afe71589e1e970bc4c5f15ec27de7a5db32c6b5
parent8442e91f9d62d25d478f891d940a56551b2fd484 (diff)
dictionary-to-thesaurus.py: Only output the same class of word.
When the class of the word is unambiguous, limit the output to that class only; this gives more precise and expected results. [It is interesting to see the other possibilities too, but fewer, more focused choices are probably preferred.] Change-Id: I2876fbb4fa02c00fc7e65189812365f77b9a5ed6
-rwxr-xr-xcs_CZ/thesaurus/dictionary-to-thesaurus.py32
1 files changed, 27 insertions, 5 deletions
diff --git a/cs_CZ/thesaurus/dictionary-to-thesaurus.py b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
index 8ee022c..63f906a 100755
--- a/cs_CZ/thesaurus/dictionary-to-thesaurus.py
+++ b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
@@ -54,6 +54,7 @@ def parse(filename, blacklistname):
synonyms = {}
meanings = {}
+ classification = {}
match_ignore = re.compile('(\[neprav\.\]|\[vulg\.\])')
match_cleanup = re.compile('(\[.*\]|\*|:.*)')
@@ -109,9 +110,16 @@ def parse(filename, blacklistname):
else:
meanings[word] = [ index ]
- return (synonyms, meanings)
+ if typ != '':
+ if word in classification:
+ if not typ in classification[word]:
+ classification[word].append(typ)
+ else:
+ classification[word] = [ typ ]
+
+ return (synonyms, meanings, classification)
-def buildThesaurus(synonyms, meanings):
+def buildThesaurus(synonyms, meanings, classification):
# for every word:
# find all the indexes, and then again map the indexes to words - these are the synonyms
for word in sorted(meanings.keys()):
@@ -119,6 +127,11 @@ def buildThesaurus(synonyms, meanings):
# meanings; not generally true, but...
indexes = meanings[word]
+ # only limit the words if the type is unambiguous
+ typ = ''
+ if word in classification and len(classification[word]) == 1:
+ typ = classification[word][0]
+
# we want to output each word just once
used_this_round = [ word ]
@@ -132,9 +145,14 @@ def buildThesaurus(synonyms, meanings):
if not t in types:
types.append(t)
+ # build the various thesaurus lines
line = {}
for syn in syns:
(w, t) = syn
+
+ if typ != '' and t != '' and typ != t:
+ continue
+
if not w in used_this_round:
if t in line:
line[t] += '|' + w
@@ -145,7 +163,11 @@ def buildThesaurus(synonyms, meanings):
if len(line) != 0:
for t in types:
if t in line:
- output_lines.append(t + line[t])
+ if typ == '':
+ # classification is ambiguous, output the type too
+ output_lines.append(t + line[t])
+ else:
+ output_lines.append(line[t])
if len(output_lines) > 0:
print word + '|' + str(len(output_lines))
@@ -157,10 +179,10 @@ def main(args):
usage()
sys.exit(1)
- (synonyms, meanings) = parse(args[1], args[2])
+ (synonyms, meanings, classification) = parse(args[1], args[2])
print "UTF-8"
- buildThesaurus(synonyms, meanings)
+ buildThesaurus(synonyms, meanings, classification)
if __name__ == "__main__":
main(sys.argv)