Added new features to converter

author: Daniel Korostil <ted.korostiled@gmail.com> 2012-07-16 15:35:24 +0300
committer: Daniel Korostil <ted.korostiled@gmail.com> 2012-07-16 15:35:24 +0300
commit: d021ca49317fc607064b04437e9b2d03b97cde5c (patch)
tree: c98da7034ab519b372eef1f5592bb0897c8ea877
parent: d74eb78c70ac29667a7b06017a646e22d1caf6e4 (diff)
2 files changed, 530 insertions, 63 deletions
diff --git a/src/converter/test.xml b/src/converter/test.xml
deleted file mode 100644
index b40f7b3..0000000
--- a/src/converter/test.xml
+++ /dev/null
@@ -1,16 +0,0 @@
-<rule>
-        <pattern>
-          <token>xxx</token>
-          <token>To</token>
-	<marker>
-          <token>being</token>
-          <token>with</token>
-	</marker>
-          <token>,</token>
-        </pattern>
-        <message>Did you mean <suggestion>begin</suggestion>?</message>
-        <short>Possible typo</short>
-        <example correction="begin" type="incorrect">To
-<marker>being</marker> with, she is a Russian spy.</example>
-        <example type="correct">To begin with, she's a spy.</example>
-      </rule>
diff --git a/src/converter/xml2reexp.py b/src/converter/xml2reexp.py
index 00252dd..eea3fd5 100644
--- a/src/converter/xml2reexp.py
+++ b/src/converter/xml2reexp.py
@@ -1,48 +1,531 @@
-import xml.etree.ElementTree as ET
-import string
-import re
-
-tree = ET.parse("grammar.xml")
-
-def poscorr(s, markerpos): # increase group positions in the replacement string, if needed
-  if markerpos:
-    return re.sub(r"(?<=\\)([%d-9])" % markerpos, lambda r: str(int(r.group(0)) + 1), s)
-  return s
-  
-def cleaner(a):
-  a = ' '.join(a)
-  a = string.replace(a," ,",",")
-  a = string.replace(a," .",".")
-  a = string.replace(a," ???","")
-  a = string.replace(a," (???)","")
-  a = string.replace(a,"??? ","")
-  return a
-
-for rule in tree.iter("rule"):
-  pattern = []
-  for markertrue in rule.findall('pattern/marker'):
-      if markertrue.tag == "marker":
-	continue
-#  if rule.find('pattern/marker') is None:  # check only rules with <marker>
-#    continue
-  markerpos = 0
-  n = 0
-  for item in rule.findall('pattern/*'):
-    n += 1
-    if item.attrib and item.attrib.keys() != ["regexp"]: 
-      pattern = []
-      break
-    if item.tag == 'marker':
-	markerpos = n
-	for i in item.iter('token'):
-	    pattern += ["%s" % (i.text or "???")]
-	pattern[n - 1] = "(" + pattern[n - 1]
-	pattern[-1] += ')'
-    else:
-    	pattern += ["%s" % (item.text or "???")]
-  if pattern:
-    try:
-      print cleaner(pattern), "-%d>" % markerpos, poscorr(rule.find('message').find('suggestion').text, markerpos), "# Did you mean?"
-    except:
-      print "# Did you mean?"
+# -*- coding: UTF-8 -*-
+import lxml.etree as ET
+import sys
+import os
+import codecs
+import copy
 
+did_you_mean = False
+number = 0
+amount = 0
+to_implement = 0
+no_support = 0
+
+
+class RuleModel(object):
+    '''parse() method returns rule'''
+    def __init__(self, rule):
+        self.rule = rule
+        self.pattern = rule.find('pattern')
+        self.pattern_descendants = RuleModel.element_descendants(self.pattern)
+        self.message = rule.find('message')
+        self.message_descendants = RuleModel.element_descendants(self.message)
+        self.items = RuleModel.nesting_list_killer(
+                    [i.items() for i in self.message.iterdescendants()]
+                    + [i.items() for i in self.pattern.iterdescendants()])
+        self.keys = RuleModel.nesting_list_killer(
+                    [i.keys() for i in self.message.iterdescendants()]
+                    + [i.keys() for i in self.pattern.iterdescendants()])
+
+    @staticmethod
+    def nesting_list_killer(lst):
+        result = []
+        for element in lst:
+            if isinstance(element, (str, unicode, tuple)):
+                result.append(element)
+            else:
+                result += RuleModel.nesting_list_killer(element)
+        return set(result)
+
+    @staticmethod
+    def key_remover(iterable, keys_list):
+        keys = copy.copy(keys_list)
+        it = list(copy.copy(iterable))
+        for key in keys:
+            try:
+                it.remove(key)
+            except ValueError:
+                pass
+        return it
+
+    @staticmethod
+    def space_killer(old_result):
+        '''space killer, char dealer'''
+        result = [i.strip() for i in old_result[1:]]
+
+        for item in result:
+            if (item in ('.', ',', ':', '!', '?', "'", '"', '([.])')
+                and item in result
+                or item.startswith('\\n')
+                ):
+                indx = result.index(item)
+                if indx != 0:
+                    result[indx - 1] += item
+                    result.remove(item)
+                else:
+                    result = [(item + result[:][1]), ] + result[2:]
+
+        for item in result:
+            if '\\n' in item:
+                result[result.index(item)] = '\\n'.join(i.strip()
+                                                    for i in item.split('\\n'))
+
+            if item.startswith('([.])') or item.endswith('([.]))'):
+                indx = result.index(item)
+                if indx != 0:
+                    result[indx - 1] += item
+                    result.remove(item)
+
+        for item in result:
+            if item.endswith('([.]))'):
+                indx = result.index(item)
+                if indx != 0:
+                    result[indx - 1] += item
+                    result.remove(item)
+#            if '|' in item:
+#                result[result.index(item)] = '\\n'.join(i.strip()
+#                                                    for i in item.split('|'))
+
+        for item in result:
+            if item.endswith("'") and (len(result) - result.index(item) >= 2):
+                indx = result.index(item)
+                result[indx] += result[indx + 1]
+                result.remove(result[indx + 1])
+
+        result.insert(0, old_result[0])
+        return result
+
+    @staticmethod
+    def element_descendants(element):
+        return ((i, i.tag, tuple(i.keys()), i.getchildren(), i.text, i.tail)
+                                            for i in element.iterdescendants())
+
+    def parse(self, attr):
+        result = []
+        suggestions = []
+
+        if attr == 'pattern':
+            target = self.pattern_descendants
+        elif attr == 'message':
+            target = self.message_descendants
+        else:
+            raise AttributeError("wrong attr for parse()'s target ")
+
+        for (element, tag, keys, _children, text,
+                                            tail) in target:
+# start for #
+            '''unsupported pattern keys'''
+            if (self.keys
+                and 'postag' in self.keys
+                or 'postag_regexp' in self.keys
+                or 'negate_pos' in self.keys
+                ):
+                if ('postag', 'SENT_START') in self.items:
+                    r = copy.copy(self.items)
+                    r.remove(('postag', 'SENT_START'))
+                    r = [i[0] for i in RuleModel.nesting_list_killer(r)]
+                    if ('postag' in r
+                        or 'postag_regexp' in r
+                        or 'negate_pos' in r):
+                        result = [[], 'Not supported', ]
+                        return result
+                    else:
+                        pass
+                else:
+                    result = [[], 'Not supported', ]
+                    return result
+
+            '''not implemented features'''
+            if (self.keys
+                and 'regexp_match' in self.keys
+                or 'regexp_replace' in self.keys
+                or 'regexp_replace' in self.keys
+                or 'case_conversion' in self.keys
+#                or tag == 'match'
+#                or 'skip' in self.keys
+#                or 'negate' in keys
+#                or 'spacebefore' in keys
+#                or 'inflected' in keys
+#                or tag == 'exception'
+                or ('postag', 'SENT_START') in self.items
+                ):
+                if not 'Not supported' in result:
+                    result = [[], 'Not implemented', ]
+                    return result
+
+            if not len(result):
+                result.append(element.items())
+            else:
+                result[0] += element.items()
+
+            #exception tag with scope attr rule
+            if (tag == 'exception' and 'scope' in keys
+                and not RuleModel.key_remover(keys, ['scope', ])):
+                ''' exception tag with only the scope key'''
+                if element.get('scope') == 'previous':
+                    result.append('(?<!%s)' % text)
+                if element.get('scope') == 'next':
+                    result.append('(?!%s)' % text)
+                if element.get('scope') == "current":
+                    result.append('(?=%s)' % text)
+                if tail is not None:
+                    result[-1] += ' (%s)' % tail
+
+            # regexp attr rule
+            if ('regexp' in keys
+                and not RuleModel.key_remover(keys, ['regexp',
+                                                     'inflected',
+                                                     'skip'
+                                                     ])
+                and not element.getparent() == 'marker'):
+                '''elements with only the regexp attr'''
+                if text and text.strip():
+                    result.append('(%s)' % text.strip())
+
+            # token tag rule
+            if tag == 'token':
+                if not RuleModel.key_remover(keys, ['inflected',
+                                                    'skip'
+                                                    ]):
+                    '''common tokens with no attrs'''
+                    if text and text.strip():
+                        if text is not '.':
+                            result.append('(%s)' % text.rstrip())
+                        else:
+                            result.append('([.])')
+                    else:
+                        if not element.getchildren():
+                            result.append('()')
+
+            # marked tag rules
+            if (element.tag == 'marker' and not element.getchildren()):
+                '''markers without children -> error'''
+                raise TypeError("'markered' elements in %s"
+                                    % self.rule.get('id'))
+            if element.getparent().tag == 'marker':
+                children_list = element.getparent().getchildren()
+                if len(children_list) == 1:
+                    pass
+                    #result[-1] = '(' + result[-1] + ')'
+                else:
+                    if children_list[0] == element:
+                        if "regexp" not in element.keys():
+                            if result[-1][0] != '(':
+                                result[-1] = '((' + result[-1] + ')'
+                            else:
+                                result[-1] = '(' + result[-1]
+                    elif len(result) > 2:
+                        if (element in children_list[1:-1]
+                              and "regexp" not in element.keys()):
+                            if not result[-1][0] == '(':
+                                result[-1] = '(' + result[-1] + ')'
+
+                        elif children_list[-1] == element:
+                            """ ')' after last element"""
+                            if result[-1][0] != '(':
+                                result[-1] = '(' + result[-1] + '))'
+                            elif result[-1][0] == '(':
+                                result[-1] += ')'
+
+            #negate attr rule
+            if (('negate', 'yes') in element.items()
+                and not RuleModel.key_remover(keys, ['regexp',
+                                                     'default',
+                                                     'skip',
+                                                     'negate'])):
+                '''negate="yes" attribute'''
+                if not RuleModel.key_remover([i for i in text],
+                                        ('.', ',', ':', '!', '?', "'", '"',)):
+                    result.append('([^%s])' % text)
+                elif 'regexp' in keys:
+                    result.append('(^(%s))' % text)
+                else:
+                    result.append('(^%s)' % text)
+
+            #spacebefore attr rule
+            if ('spacebefore' in keys
+                and not RuleModel.key_remover(keys, ['default',
+                                                     'spacebefore',
+                                                     'skip',
+                                                     'inflected'])):
+                if text and text.strip():
+                    if not self.pattern.getchildren()[0] == element:
+                        if element.get('spacebefore') == 'no':
+                            result[-1] += text.strip()
+                        else:
+                            result[-1] = result[-1] + ' ' + text.strip()
+                    else:
+                        if element.get('spacebefore') == 'no':
+                            result.append('(w*)%s' % text.strip())
+                        else:
+                            result.append('(w*) %s' % text.strip())
+            #spacebefore + regexp attr rule
+            if ('spacebefore' in keys
+                and 'regexp' in keys
+                and not RuleModel.key_remover(keys, ['default',
+                                                     'spacebefore',
+                                                     'regexp',
+                                                     'skip',
+                                                     'inflected'])):
+                if text and text.strip():
+                    if not self.pattern.getchildren()[0] == element:
+                        if element.get('spacebefore') == 'no':
+                            result[-1] = ('(%s%s)'
+                                          % (result[-1].translate(None, '()'),
+                                             text.strip()))
+                        else:
+                            result[-1] = ('(%s %s)'
+                                          % (result[-1].translate(None, '()'),
+                                             text.strip()))
+                    else:
+                        if element.get('spacebefore') == 'no':
+                            result.append('()%s' % text.strip())
+                        else:
+                            result.append('(() %s)' % text.strip())
+            # skip attr rule
+            if 'skip' in keys:
+                if element.get('skip') == '-1':
+                    result.append('(\w+ )*')
+                else:
+                    result.append('(\w+ ){0,%d}' % int(element.get('skip')))
+
+            # rules for message
+            # suggestion tag with no attrs or suggestion match tag with 1 attr
+            if tag == 'suggestion' and not RuleModel.key_remover(keys, 'no'):
+                if text and text.strip() and not keys:
+                    '''message suggestions appearance'''
+                    if [item.startswith('=') for item in result[1:]]:
+                        result.append('\\n%s' % text.strip())
+                        suggestions.append('%s' % text.strip())
+                    else:
+                        result.append('%s' % text.strip())
+                        suggestions.append('%s' % text.strip())
+
+                # suggestion match rules
+                for el in element.getchildren():
+                    if el.tag == "match":
+                        match_no = int(el.get('no'))
+#                       match_el = self.pattern.getchildren()[match_no- 1]
+#                       print 'ZZZ', match_el.text
+                        result.append('\\%s' % (match_no))
+                        suggestions.append('\\%s' % (match_no))
+                        if el.tail and el.tail.strip():
+                            result.append('%s' % el.tail.strip())
+                            suggestions.append('%s' % el.tail.strip())
+                if tail and tail.strip():
+                    suggestions.append(tail.strip())
+
+# end for #
+        if attr == 'message':
+            if self.message.getchildren() and not result:
+                '''in case of empty message'''
+                raise TypeError('empty message!')
+
+            '''message verbosity'''
+            global did_you_mean
+            if did_you_mean:
+                result.append('#Did you mean?')
+            else:
+                result.append('#%s' % self.message.text.strip())
+                if self.message.tail and self.message.tail.strip():
+                    result.append('%s' % self.message.tail.strip())
+
+                if suggestions is not None:
+                    for suggestion in suggestions:
+                        if suggestion is not None:
+                            result.append('%s' % suggestion)
+
+        result = RuleModel.space_killer(result)
+        return result
+
+
+class RuleView(object):
+    '''adaptes rule's API, printer() method prints rule'''
+    def __init__(self, rule, category):
+        self.parse_pattern = RuleModel(rule).parse('pattern')
+        self.pref = self.parse_pattern[0]
+        self.pattern = self.parse_pattern[1:]
+        self.parse_message = RuleModel(rule).parse('message')
+        self.ix = self.parse_message[0]
+        self.message = self.parse_message[1:]
+        self.prefix = list(self.pref) + list(self.ix)
+        self.id = rule.get('id')[0:15] if rule.get('id') else 'Unknown'
+        self.rule = rule
+        self.category = category
+        self.supported_keys = ['regexp', 'scope', 'spacebefore', 'negate']
+        self.key_to_print = None
+        self.keys = RuleModel.nesting_list_killer([i.keys()
+                                        for i in self.rule.iterdescendants()])
+
+    @staticmethod
+    def aggregate(iter1, iter2):
+        set1 = set(iter1)
+        set2 = set(iter2)
+        aggregate = set1 & set2
+        return list(aggregate)
+
+    def sys_argv(self):
+        supported_args = {'-short': 'Print \"Did you mean?\". '
+                                        'Default: Print full message',
+                          '-file': 'Dump rules to \'rules.txt\' file',
+                          '-attr': 'Print <rule> tags\'s attr list '
+                          'before rule if any',
+                          '-show=attr': 'Incompatible with other '
+                          ' keys except maybe -short :)'}
+        if len(sys.argv) > 1:
+            if '-short' in sys.argv:
+                global did_you_mean
+                did_you_mean = True
+            if '-file' in sys.argv:
+                sys.stdout = codecs.open('rules.txt',
+                                         encoding='utf-8', mode='a+')
+            if '-attr' in sys.argv:
+                if self.prefix:
+                    print sorted(list(set(self.prefix))), '=>',
+
+            show_attr = [i for i in sys.argv if i.startswith('-show=')]
+            if show_attr:
+                if len(show_attr) == 1:
+                    self.key_to_print = show_attr[0][6:]
+                else:
+                    raise TypeError('Too many "show" arguments')
+
+            if False in [el in supported_args.keys() for el in sys.argv[1:]
+                                            if not el.startswith('-show=')]:
+                print '\n-*-sys.argv Error-*-'
+                print '\nYou printed:  python',
+                for i in sys.argv:
+                    print i,
+                print ('\nUsage: python %s [ Keys ]\n\n Keys:'
+                       % sys.argv[0])
+                for i, j in sorted(supported_args.items()):
+                    print '{0:<20}{1}'.format(i, j)
+                sys.exit(1)
+
+    @staticmethod
+    def print_element(element):
+        for el in element:
+            print el,
+
+    def printer(self):
+        if (self.pattern and self.message
+            and 'Not supported' not in self.pattern
+            and 'Not supported' not in self.message
+            and 'Not implemented' not in self.pattern
+            and 'Not implemented' not in self.message):
+            sys.stdout = open('ok.txt', 'a+')
+            ET.dump(self.rule)
+            sys.stdout = sys.__stdout__
+
+            RuleView.sys_argv(self)
+            '''
+            if not did_you_mean:
+                print (self.rule.get('id') if self.rule.get('id')
+                       else self.rule.getparent().get('id'))
+            '''
+            if self.key_to_print:
+                if self.key_to_print in self.prefix:
+                    if self.key_to_print in self.supported_keys:
+                        sys.stdout = sys.__stdout__
+                else:
+                    sys.stdout = codecs.open(os.devnull, encoding='utf-8',
+                                                                    mode='w')
+
+            global number
+            global amount
+            number += 1
+###############
+            ''' API's "marker" difference handler'''
+            if ('marker' in [el.tag
+                        for el in self.rule.find('pattern').getchildren()]):
+
+                ''' regexp in marker => suggestion_counters +=1'''
+
+                for string in self.message:
+                    if '\\' in string:
+                        try:
+                            parts = string.split('\\')
+                            rest = []
+                            for el in parts[1:]:
+                                el = str(int(el[0]) + 1) + el[1:]
+                                rest.append(el)
+                            result = '\\'.join([parts[0], ] + rest)
+                            self.message[self.message.index(string)] = result
+                        except ValueError:
+                            pass
+            self.message.append('\n')
+###############
+            RuleView.print_element(self.pattern)
+###############
+            ''' API's '<- option[...] ->' handler'''
+            print '<- option("%s")' % self.category.get('name'),
+            try:
+                ''' marker in pattern => -%d> in result'''
+                marker = [el.tag for el
+            in self.rule.find('pattern').getchildren()].index('marker') + 1
+                if not 'skip' in self.keys:
+                    print '-%d>' % marker,
+                else:
+                    print '-%d>' % (marker + 1),
+            except ValueError:
+                print '->',
+###############
+            RuleView.print_element(self.message)
+############### For debug purposes only.
+        elif ('Not implemented' in self.pattern
+              or 'Not implemented' in self.message
+              and 'Not supported' not in self.pattern
+              and 'Not supported' not in self.message):
+            global to_implement
+            to_implement += 1
+#            print ET.dump(self.rule)
+            sys.stdout = open('notimplemented.txt', 'a+')
+            ET.dump(self.rule)
+            sys.stdout = sys.__stdout__
+        elif ('Not supported' in self.pattern
+              or 'Not supported' in self.message
+              and 'Not implemented' not in self.pattern
+              and 'Not implemented' not in self.message):
+            global no_support
+            no_support += 1
+            sys.stdout = open('unsupported.txt', 'a+')
+            ET.dump(self.rule)
+            sys.stdout = sys.__stdout__
+
+
+class RuleController(object):
+    '''flow control'''
+    def __init__(self):
+        try:
+            os.remove(os.path.join(os.getcwd(), 'unsupported.txt'))
+        except OSError:
+            pass
+        try:
+            os.remove(os.path.join(os.getcwd(), 'notimplemented.txt'))
+        except OSError:
+            pass
+        try:
+            os.remove(os.path.join(os.getcwd(), 'rules.txt'))
+        except OSError:
+            pass
+
+        self.categories = ET.iterparse("grammar.xml", events=('end',),
+                                       tag='category')
+
+    def process(self):
+        global number
+        global amount
+        for _event, category in self.categories:
+            for rule in category.iter('rule'):
+                amount += 1
+                RuleView(rule, category).printer()
+        if not did_you_mean:
+            percent = float(number) / amount * 100
+            print '\n%.3f %s of rules covered (%s/%s)' % (percent, chr(37),
+                                                         number, amount)
+            global to_implement
+            print '%s rules left to cover' % to_implement
+            global no_support
+            print '%s unsupported rules' % no_support
+
+RuleController().process()
author	Daniel Korostil <ted.korostiled@gmail.com>	2012-07-16 15:35:24 +0300
committer	Daniel Korostil <ted.korostiled@gmail.com>	2012-07-16 15:35:24 +0300
commit	d021ca49317fc607064b04437e9b2d03b97cde5c (patch)
tree	c98da7034ab519b372eef1f5592bb0897c8ea877
parent	d74eb78c70ac29667a7b06017a646e22d1caf6e4 (diff)