Added a marker handler and some fixers for common mistakes, such as a space before a comma

author: Daniel Korostil <ted.korostiled@gmail.com> 2012-06-29 13:28:59 +0300
committer: Daniel Korostil <ted.korostiled@gmail.com> 2012-06-29 13:28:59 +0300
commit: 8ad29fd4c0883da4effe8df7bda32a7456d5be08 (patch)
tree: 5da96b5ac2ee23a1395836bcd26c8e013675b6de
parent: 5666081ea845ee8c9b538310bcee3c3db24fa16c (diff)
1 files changed, 44 insertions, 20 deletions
diff --git a/src/converter/xml2reexp.py b/src/converter/xml2reexp.py
index 8c599bc..b35c6ef 100644
--- a/src/converter/xml2reexp.py
+++ b/src/converter/xml2reexp.py
@@ -1,23 +1,47 @@
 import xml.etree.ElementTree as ET
+import string
+import re
 
-tree = ET.parse("test.xml") # parsing grammar.xml into an ElementTree instance
+tree = ET.parse("grammar.xml")
+
+def poscorr(s, markerpos): # increase group positions in the replacement string, if needed
+  if markerpos:
+    return re.sub(r"(?<=\\)([%d-9])" % markerpos, lambda r: str(int(r.group(0)) + 1), s)
+  return s
+  
+def cleaner(a):
+  a = ' '.join(a)
+  a = string.replace(a," ,",",")
+  a = string.replace(a," .",".")
+  a = string.replace(a," ???","")
+  return a
+
+for rule in tree.iter("rule"):
+  pattern = []
+  for markertrue in rule.findall('pattern/marker'):
+      if markertrue.tag == "marker":
+	continue
+#  if rule.find('pattern/marker') is None:  # check only rules with <marker>
+#    continue
+  markerpos = 0
+  n = 0
+  for item in rule.findall('pattern/*'):
+    n += 1
+    if item.attrib and item.attrib.keys() != ["regexp"]: 
+      pattern = []
+      break
+    if item.tag == 'marker':
+	markerpos = n
+	for i in item.iter('token'):
+	    pattern += ["%s" % (i.text or "???")]
+	pattern[n - 1] = "(" + pattern[n - 1]
+	pattern[-1] += ')'
+    else:
+    	pattern += ["%s" % (item.text or "???")]
+  if pattern:
+    try:
+      print pattern
+      print cleaner(pattern), "-%d>" % markerpos, poscorr(rule.find('message').find('suggestion').text, markerpos), "# Did you mean?"
+    except:
+      print "# Did you mean?"
 
-# list all rules with simple tokens
-for rule in tree.iter("rule"):  # cycle for all <rule> elements of grammar.xml, variable rule contains the data of the actual element
-  simple = True  # simple rule is a rule with tokens without attributes (see documentaton of LanguageTool grammar.xml)
-  for token in rule.iter("token"): # cycle for all tokens in the actual rule, variable token contains the data of the actual <token> element
-    if token.attrib and token.attrib.keys() != ["regexp"]: # if attrib is not an empty dict (attrib is the Python dict of attributes of the XML element, see ElementTree doc), regexp is supported by the parethesized tokens in the output
-      simple = False  # the rule is not simple
-  if simple:
-    for token in rule.iter("token"):
-      for child in rule:
-        if child.tag == 'token':
-            print "%s" % child.text
-        elif child.tag == 'marker':
-            print "("
-            for subchild in child:
-                if subchild.tag == 'token':
-                    print "%s" % subchild.text
-            print ")"
-      print "%s" % token.text,
-    print "->", rule.find('message').find('suggestion').text, "# Did you mean?"
-\ No newline at end of file
author	Daniel Korostil <ted.korostiled@gmail.com>	2012-06-29 13:28:59 +0300
committer	Daniel Korostil <ted.korostiled@gmail.com>	2012-06-29 13:28:59 +0300
commit	8ad29fd4c0883da4effe8df7bda32a7456d5be08 (patch)
tree	5da96b5ac2ee23a1395836bcd26c8e013675b6de
parent	5666081ea845ee8c9b538310bcee3c3db24fa16c (diff)