diff options
author | Jan Holesovsky <kendy@collabora.com> | 2016-02-26 08:38:58 +0100 |
---|---|---|
committer | Jan Holesovsky <kendy@collabora.com> | 2016-02-26 08:46:32 +0100 |
commit | c32de9bba681bcc7becc64f0ea3b605ff2b38266 (patch) | |
tree | 1afe71589e1e970bc4c5f15ec27de7a5db32c6b5 | |
parent | 8442e91f9d62d25d478f891d940a56551b2fd484 (diff) |
dictionary-to-thesaurus.py: Only output the same class of word.
When the class of the word is unambiguous, limit the output to that class
only; this gives more precise and more expected results.
[It is interesting to see the other possibilities too, but I guess fewer,
more focused choices are preferred.]
Change-Id: I2876fbb4fa02c00fc7e65189812365f77b9a5ed6
-rwxr-xr-x | cs_CZ/thesaurus/dictionary-to-thesaurus.py | 32 |
1 file changed, 27 insertions, 5 deletions
diff --git a/cs_CZ/thesaurus/dictionary-to-thesaurus.py b/cs_CZ/thesaurus/dictionary-to-thesaurus.py index 8ee022c..63f906a 100755 --- a/cs_CZ/thesaurus/dictionary-to-thesaurus.py +++ b/cs_CZ/thesaurus/dictionary-to-thesaurus.py @@ -54,6 +54,7 @@ def parse(filename, blacklistname): synonyms = {} meanings = {} + classification = {} match_ignore = re.compile('(\[neprav\.\]|\[vulg\.\])') match_cleanup = re.compile('(\[.*\]|\*|:.*)') @@ -109,9 +110,16 @@ def parse(filename, blacklistname): else: meanings[word] = [ index ] - return (synonyms, meanings) + if typ != '': + if word in classification: + if not typ in classification[word]: + classification[word].append(typ) + else: + classification[word] = [ typ ] + + return (synonyms, meanings, classification) -def buildThesaurus(synonyms, meanings): +def buildThesaurus(synonyms, meanings, classification): # for every word: # find all the indexes, and then again map the indexes to words - these are the synonyms for word in sorted(meanings.keys()): @@ -119,6 +127,11 @@ def buildThesaurus(synonyms, meanings): # meanings; not generally true, but... 
indexes = meanings[word] + # only limit the words if the type is unambiguous + typ = '' + if word in classification and len(classification[word]) == 1: + typ = classification[word][0] + # we want to output each word just once used_this_round = [ word ] @@ -132,9 +145,14 @@ def buildThesaurus(synonyms, meanings): if not t in types: types.append(t) + # build the various thesaurus lines line = {} for syn in syns: (w, t) = syn + + if typ != '' and t != '' and typ != t: + continue + if not w in used_this_round: if t in line: line[t] += '|' + w @@ -145,7 +163,11 @@ def buildThesaurus(synonyms, meanings): if len(line) != 0: for t in types: if t in line: - output_lines.append(t + line[t]) + if typ == '': + # classification is abmiguous, output the type too + output_lines.append(t + line[t]) + else: + output_lines.append(line[t]) if len(output_lines) > 0: print word + '|' + str(len(output_lines)) @@ -157,10 +179,10 @@ def main(args): usage() sys.exit(1) - (synonyms, meanings) = parse(args[1], args[2]) + (synonyms, meanings, classification) = parse(args[1], args[2]) print "UTF-8" - buildThesaurus(synonyms, meanings) + buildThesaurus(synonyms, meanings, classification) if __name__ == "__main__": main(sys.argv) |