diff options
author | Daniel Korostil <ted.korostiled@gmail.com> | 2012-06-29 13:28:59 +0300 |
---|---|---|
committer | Daniel Korostil <ted.korostiled@gmail.com> | 2012-06-29 13:28:59 +0300 |
commit | 8ad29fd4c0883da4effe8df7bda32a7456d5be08 (patch) | |
tree | 5da96b5ac2ee23a1395836bcd26c8e013675b6de | |
parent | 5666081ea845ee8c9b538310bcee3c3db24fa16c (diff) |
Added a marker handler and some fixers for common mistakes, such as a space before a comma
-rw-r--r-- | src/converter/xml2reexp.py | 64 |
1 files changed, 44 insertions, 20 deletions
diff --git a/src/converter/xml2reexp.py b/src/converter/xml2reexp.py index 8c599bc..b35c6ef 100644 --- a/src/converter/xml2reexp.py +++ b/src/converter/xml2reexp.py @@ -1,23 +1,47 @@ import xml.etree.ElementTree as ET +import string +import re -tree = ET.parse("test.xml") # parsing grammar.xml into an ElementTree instance +tree = ET.parse("grammar.xml") + +def poscorr(s, markerpos): # increase group positions in the replacement string, if needed + if markerpos: + return re.sub(r"(?<=\\)([%d-9])" % markerpos, lambda r: str(int(r.group(0)) + 1), s) + return s + +def cleaner(a): + a = ' '.join(a) + a = string.replace(a," ,",",") + a = string.replace(a," .",".") + a = string.replace(a," ???","") + return a + +for rule in tree.iter("rule"): + pattern = [] + for markertrue in rule.findall('pattern/marker'): + if markertrue.tag == "marker": + continue +# if rule.find('pattern/marker') is None: # check only rules with <marker> +# continue + markerpos = 0 + n = 0 + for item in rule.findall('pattern/*'): + n += 1 + if item.attrib and item.attrib.keys() != ["regexp"]: + pattern = [] + break + if item.tag == 'marker': + markerpos = n + for i in item.iter('token'): + pattern += ["%s" % (i.text or "???")] + pattern[n - 1] = "(" + pattern[n - 1] + pattern[-1] += ')' + else: + pattern += ["%s" % (item.text or "???")] + if pattern: + try: + print pattern + print cleaner(pattern), "-%d>" % markerpos, poscorr(rule.find('message').find('suggestion').text, markerpos), "# Did you mean?" + except: + print "# Did you mean?" 
-# list all rules with simple tokens -for rule in tree.iter("rule"): # cycle for all <rule> elements of grammar.xml, variable rule contains the data of the actual element - simple = True # simple rule is a rule with tokens without attributes (see documentaton of LanguageTool grammar.xml) - for token in rule.iter("token"): # cycle for all tokens in the actual rule, variable token contains the data of the actual <token> element - if token.attrib and token.attrib.keys() != ["regexp"]: # if attrib is not an empty dict (attrib is the Python dict of attributes of the XML element, see ElementTree doc), regexp is supported by the parethesized tokens in the output - simple = False # the rule is not simple - if simple: - for token in rule.iter("token"): - for child in rule: - if child.tag == 'token': - print "%s" % child.text - elif child.tag == 'marker': - print "(" - for subchild in child: - if subchild.tag == 'token': - print "%s" % subchild.text - print ")" - print "%s" % token.text, - print "->", rule.find('message').find('suggestion').text, "# Did you mean?"
\ No newline at end of file |