summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Korostil <ted.korostiled@gmail.com>2012-07-16 15:35:24 +0300
committerDaniel Korostil <ted.korostiled@gmail.com>2012-07-16 15:35:24 +0300
commitd021ca49317fc607064b04437e9b2d03b97cde5c (patch)
treec98da7034ab519b372eef1f5592bb0897c8ea877
parentd74eb78c70ac29667a7b06017a646e22d1caf6e4 (diff)
Added new features to converter
-rw-r--r--src/converter/test.xml16
-rw-r--r--src/converter/xml2reexp.py577
2 files changed, 530 insertions, 63 deletions
diff --git a/src/converter/test.xml b/src/converter/test.xml
deleted file mode 100644
index b40f7b3..0000000
--- a/src/converter/test.xml
+++ /dev/null
@@ -1,16 +0,0 @@
-<rule>
- <pattern>
- <token>xxx</token>
- <token>To</token>
- <marker>
- <token>being</token>
- <token>with</token>
- </marker>
- <token>,</token>
- </pattern>
- <message>Did you mean <suggestion>begin</suggestion>?</message>
- <short>Possible typo</short>
- <example correction="begin" type="incorrect">To
-<marker>being</marker> with, she is a Russian spy.</example>
- <example type="correct">To begin with, she's a spy.</example>
- </rule>
diff --git a/src/converter/xml2reexp.py b/src/converter/xml2reexp.py
index 00252dd..eea3fd5 100644
--- a/src/converter/xml2reexp.py
+++ b/src/converter/xml2reexp.py
@@ -1,48 +1,531 @@
-import xml.etree.ElementTree as ET
-import string
-import re
-
-tree = ET.parse("grammar.xml")
-
-def poscorr(s, markerpos): # increase group positions in the replacement string, if needed
- if markerpos:
- return re.sub(r"(?<=\\)([%d-9])" % markerpos, lambda r: str(int(r.group(0)) + 1), s)
- return s
-
-def cleaner(a):
- a = ' '.join(a)
- a = string.replace(a," ,",",")
- a = string.replace(a," .",".")
- a = string.replace(a," ???","")
- a = string.replace(a," (???)","")
- a = string.replace(a,"??? ","")
- return a
-
-for rule in tree.iter("rule"):
- pattern = []
- for markertrue in rule.findall('pattern/marker'):
- if markertrue.tag == "marker":
- continue
-# if rule.find('pattern/marker') is None: # check only rules with <marker>
-# continue
- markerpos = 0
- n = 0
- for item in rule.findall('pattern/*'):
- n += 1
- if item.attrib and item.attrib.keys() != ["regexp"]:
- pattern = []
- break
- if item.tag == 'marker':
- markerpos = n
- for i in item.iter('token'):
- pattern += ["%s" % (i.text or "???")]
- pattern[n - 1] = "(" + pattern[n - 1]
- pattern[-1] += ')'
- else:
- pattern += ["%s" % (item.text or "???")]
- if pattern:
- try:
- print cleaner(pattern), "-%d>" % markerpos, poscorr(rule.find('message').find('suggestion').text, markerpos), "# Did you mean?"
- except:
- print "# Did you mean?"
+# -*- coding: UTF-8 -*-
+import lxml.etree as ET
+import sys
+import os
+import codecs
+import copy
+did_you_mean = False
+number = 0
+amount = 0
+to_implement = 0
+no_support = 0
+
+
+class RuleModel(object):
+ '''parse() method returns rule'''
+ def __init__(self, rule):
+ self.rule = rule
+ self.pattern = rule.find('pattern')
+ self.pattern_descendants = RuleModel.element_descendants(self.pattern)
+ self.message = rule.find('message')
+ self.message_descendants = RuleModel.element_descendants(self.message)
+ self.items = RuleModel.nesting_list_killer(
+ [i.items() for i in self.message.iterdescendants()]
+ + [i.items() for i in self.pattern.iterdescendants()])
+ self.keys = RuleModel.nesting_list_killer(
+ [i.keys() for i in self.message.iterdescendants()]
+ + [i.keys() for i in self.pattern.iterdescendants()])
+
+ @staticmethod
+ def nesting_list_killer(lst):
+ result = []
+ for element in lst:
+ if isinstance(element, (str, unicode, tuple)):
+ result.append(element)
+ else:
+ result += RuleModel.nesting_list_killer(element)
+ return set(result)
+
+ @staticmethod
+ def key_remover(iterable, keys_list):
+ keys = copy.copy(keys_list)
+ it = list(copy.copy(iterable))
+ for key in keys:
+ try:
+ it.remove(key)
+ except ValueError:
+ pass
+ return it
+
+ @staticmethod
+ def space_killer(old_result):
+ '''space killer, char dealer'''
+ result = [i.strip() for i in old_result[1:]]
+
+ for item in result:
+ if (item in ('.', ',', ':', '!', '?', "'", '"', '([.])')
+ and item in result
+ or item.startswith('\\n')
+ ):
+ indx = result.index(item)
+ if indx != 0:
+ result[indx - 1] += item
+ result.remove(item)
+ else:
+ result = [(item + result[:][1]), ] + result[2:]
+
+ for item in result:
+ if '\\n' in item:
+ result[result.index(item)] = '\\n'.join(i.strip()
+ for i in item.split('\\n'))
+
+ if item.startswith('([.])') or item.endswith('([.]))'):
+ indx = result.index(item)
+ if indx != 0:
+ result[indx - 1] += item
+ result.remove(item)
+
+ for item in result:
+ if item.endswith('([.]))'):
+ indx = result.index(item)
+ if indx != 0:
+ result[indx - 1] += item
+ result.remove(item)
+# if '|' in item:
+# result[result.index(item)] = '\\n'.join(i.strip()
+# for i in item.split('|'))
+
+ for item in result:
+ if item.endswith("'") and (len(result) - result.index(item) >= 2):
+ indx = result.index(item)
+ result[indx] += result[indx + 1]
+ result.remove(result[indx + 1])
+
+ result.insert(0, old_result[0])
+ return result
+
+ @staticmethod
+ def element_descendants(element):
+ return ((i, i.tag, tuple(i.keys()), i.getchildren(), i.text, i.tail)
+ for i in element.iterdescendants())
+
+ def parse(self, attr):
+ result = []
+ suggestions = []
+
+ if attr == 'pattern':
+ target = self.pattern_descendants
+ elif attr == 'message':
+ target = self.message_descendants
+ else:
+ raise AttributeError("wrong attr for parse()'s target ")
+
+ for (element, tag, keys, _children, text,
+ tail) in target:
+# start for #
+ '''unsupported pattern keys'''
+ if (self.keys
+ and 'postag' in self.keys
+ or 'postag_regexp' in self.keys
+ or 'negate_pos' in self.keys
+ ):
+ if ('postag', 'SENT_START') in self.items:
+ r = copy.copy(self.items)
+ r.remove(('postag', 'SENT_START'))
+ r = [i[0] for i in RuleModel.nesting_list_killer(r)]
+ if ('postag' in r
+ or 'postag_regexp' in r
+ or 'negate_pos' in r):
+ result = [[], 'Not supported', ]
+ return result
+ else:
+ pass
+ else:
+ result = [[], 'Not supported', ]
+ return result
+
+ '''not implemented features'''
+ if (self.keys
+ and 'regexp_match' in self.keys
+ or 'regexp_replace' in self.keys
+ or 'regexp_replace' in self.keys
+ or 'case_conversion' in self.keys
+# or tag == 'match'
+# or 'skip' in self.keys
+# or 'negate' in keys
+# or 'spacebefore' in keys
+# or 'inflected' in keys
+# or tag == 'exception'
+ or ('postag', 'SENT_START') in self.items
+ ):
+ if not 'Not supported' in result:
+ result = [[], 'Not implemented', ]
+ return result
+
+ if not len(result):
+ result.append(element.items())
+ else:
+ result[0] += element.items()
+
+ #exception tag with scope attr rule
+ if (tag == 'exception' and 'scope' in keys
+ and not RuleModel.key_remover(keys, ['scope', ])):
+ ''' exception tag with only the scope key'''
+ if element.get('scope') == 'previous':
+ result.append('(?<!%s)' % text)
+ if element.get('scope') == 'next':
+ result.append('(?!%s)' % text)
+ if element.get('scope') == "current":
+ result.append('(?=%s)' % text)
+ if tail is not None:
+ result[-1] += ' (%s)' % tail
+
+ # regexp attr rule
+ if ('regexp' in keys
+ and not RuleModel.key_remover(keys, ['regexp',
+ 'inflected',
+ 'skip'
+ ])
+ and not element.getparent() == 'marker'):
+ '''elements with only the regexp attr'''
+ if text and text.strip():
+ result.append('(%s)' % text.strip())
+
+ # token tag rule
+ if tag == 'token':
+ if not RuleModel.key_remover(keys, ['inflected',
+ 'skip'
+ ]):
+ '''common tokens with no attrs'''
+ if text and text.strip():
+ if text is not '.':
+ result.append('(%s)' % text.rstrip())
+ else:
+ result.append('([.])')
+ else:
+ if not element.getchildren():
+ result.append('()')
+
+ # marked tag rules
+ if (element.tag == 'marker' and not element.getchildren()):
+ '''markers without children -> error'''
+ raise TypeError("'markered' elements in %s"
+ % self.rule.get('id'))
+ if element.getparent().tag == 'marker':
+ children_list = element.getparent().getchildren()
+ if len(children_list) == 1:
+ pass
+ #result[-1] = '(' + result[-1] + ')'
+ else:
+ if children_list[0] == element:
+ if "regexp" not in element.keys():
+ if result[-1][0] != '(':
+ result[-1] = '((' + result[-1] + ')'
+ else:
+ result[-1] = '(' + result[-1]
+ elif len(result) > 2:
+ if (element in children_list[1:-1]
+ and "regexp" not in element.keys()):
+ if not result[-1][0] == '(':
+ result[-1] = '(' + result[-1] + ')'
+
+ elif children_list[-1] == element:
+ """ ')' after last element"""
+ if result[-1][0] != '(':
+ result[-1] = '(' + result[-1] + '))'
+ elif result[-1][0] == '(':
+ result[-1] += ')'
+
+ #negate attr rule
+ if (('negate', 'yes') in element.items()
+ and not RuleModel.key_remover(keys, ['regexp',
+ 'default',
+ 'skip',
+ 'negate'])):
+ '''negate="yes" attribute'''
+ if not RuleModel.key_remover([i for i in text],
+ ('.', ',', ':', '!', '?', "'", '"',)):
+ result.append('([^%s])' % text)
+ elif 'regexp' in keys:
+ result.append('(^(%s))' % text)
+ else:
+ result.append('(^%s)' % text)
+
+ #spacebefore attr rule
+ if ('spacebefore' in keys
+ and not RuleModel.key_remover(keys, ['default',
+ 'spacebefore',
+ 'skip',
+ 'inflected'])):
+ if text and text.strip():
+ if not self.pattern.getchildren()[0] == element:
+ if element.get('spacebefore') == 'no':
+ result[-1] += text.strip()
+ else:
+ result[-1] = result[-1] + ' ' + text.strip()
+ else:
+ if element.get('spacebefore') == 'no':
+ result.append('(w*)%s' % text.strip())
+ else:
+ result.append('(w*) %s' % text.strip())
+ #spacebefore + regexp attr rule
+ if ('spacebefore' in keys
+ and 'regexp' in keys
+ and not RuleModel.key_remover(keys, ['default',
+ 'spacebefore',
+ 'regexp',
+ 'skip',
+ 'inflected'])):
+ if text and text.strip():
+ if not self.pattern.getchildren()[0] == element:
+ if element.get('spacebefore') == 'no':
+ result[-1] = ('(%s%s)'
+ % (result[-1].translate(None, '()'),
+ text.strip()))
+ else:
+ result[-1] = ('(%s %s)'
+ % (result[-1].translate(None, '()'),
+ text.strip()))
+ else:
+ if element.get('spacebefore') == 'no':
+ result.append('()%s' % text.strip())
+ else:
+ result.append('(() %s)' % text.strip())
+ # skip attr rule
+ if 'skip' in keys:
+ if element.get('skip') == '-1':
+ result.append('(\w+ )*')
+ else:
+ result.append('(\w+ ){0,%d}' % int(element.get('skip')))
+
+ # rules for message
+ # suggestion tag with no attrs or suggestion match tag with 1 attr
+ if tag == 'suggestion' and not RuleModel.key_remover(keys, 'no'):
+ if text and text.strip() and not keys:
+ '''message suggestions appearance'''
+ if [item.startswith('=') for item in result[1:]]:
+ result.append('\\n%s' % text.strip())
+ suggestions.append('%s' % text.strip())
+ else:
+ result.append('%s' % text.strip())
+ suggestions.append('%s' % text.strip())
+
+ # suggestion match rules
+ for el in element.getchildren():
+ if el.tag == "match":
+ match_no = int(el.get('no'))
+# match_el = self.pattern.getchildren()[match_no- 1]
+# print 'ZZZ', match_el.text
+ result.append('\\%s' % (match_no))
+ suggestions.append('\\%s' % (match_no))
+ if el.tail and el.tail.strip():
+ result.append('%s' % el.tail.strip())
+ suggestions.append('%s' % el.tail.strip())
+ if tail and tail.strip():
+ suggestions.append(tail.strip())
+
+# end for #
+ if attr == 'message':
+ if self.message.getchildren() and not result:
+ '''in case of empty message'''
+ raise TypeError('empty message!')
+
+ '''message verbosity'''
+ global did_you_mean
+ if did_you_mean:
+ result.append('#Did you mean?')
+ else:
+ result.append('#%s' % self.message.text.strip())
+ if self.message.tail and self.message.tail.strip():
+ result.append('%s' % self.message.tail.strip())
+
+ if suggestions is not None:
+ for suggestion in suggestions:
+ if suggestion is not None:
+ result.append('%s' % suggestion)
+
+ result = RuleModel.space_killer(result)
+ return result
+
+
+class RuleView(object):
+ '''adaptes rule's API, printer() method prints rule'''
+ def __init__(self, rule, category):
+ self.parse_pattern = RuleModel(rule).parse('pattern')
+ self.pref = self.parse_pattern[0]
+ self.pattern = self.parse_pattern[1:]
+ self.parse_message = RuleModel(rule).parse('message')
+ self.ix = self.parse_message[0]
+ self.message = self.parse_message[1:]
+ self.prefix = list(self.pref) + list(self.ix)
+ self.id = rule.get('id')[0:15] if rule.get('id') else 'Unknown'
+ self.rule = rule
+ self.category = category
+ self.supported_keys = ['regexp', 'scope', 'spacebefore', 'negate']
+ self.key_to_print = None
+ self.keys = RuleModel.nesting_list_killer([i.keys()
+ for i in self.rule.iterdescendants()])
+
+ @staticmethod
+ def aggregate(iter1, iter2):
+ set1 = set(iter1)
+ set2 = set(iter2)
+ aggregate = set1 & set2
+ return list(aggregate)
+
+ def sys_argv(self):
+ supported_args = {'-short': 'Print \"Did you mean?\". '
+ 'Default: Print full message',
+ '-file': 'Dump rules to \'rules.txt\' file',
+ '-attr': 'Print <rule> tags\'s attr list '
+ 'before rule if any',
+ '-show=attr': 'Incompatible with other '
+ ' keys except maybe -short :)'}
+ if len(sys.argv) > 1:
+ if '-short' in sys.argv:
+ global did_you_mean
+ did_you_mean = True
+ if '-file' in sys.argv:
+ sys.stdout = codecs.open('rules.txt',
+ encoding='utf-8', mode='a+')
+ if '-attr' in sys.argv:
+ if self.prefix:
+ print sorted(list(set(self.prefix))), '=>',
+
+ show_attr = [i for i in sys.argv if i.startswith('-show=')]
+ if show_attr:
+ if len(show_attr) == 1:
+ self.key_to_print = show_attr[0][6:]
+ else:
+ raise TypeError('Too many "show" arguments')
+
+ if False in [el in supported_args.keys() for el in sys.argv[1:]
+ if not el.startswith('-show=')]:
+ print '\n-*-sys.argv Error-*-'
+ print '\nYou printed: python',
+ for i in sys.argv:
+ print i,
+ print ('\nUsage: python %s [ Keys ]\n\n Keys:'
+ % sys.argv[0])
+ for i, j in sorted(supported_args.items()):
+ print '{0:<20}{1}'.format(i, j)
+ sys.exit(1)
+
+ @staticmethod
+ def print_element(element):
+ for el in element:
+ print el,
+
+ def printer(self):
+ if (self.pattern and self.message
+ and 'Not supported' not in self.pattern
+ and 'Not supported' not in self.message
+ and 'Not implemented' not in self.pattern
+ and 'Not implemented' not in self.message):
+ sys.stdout = open('ok.txt', 'a+')
+ ET.dump(self.rule)
+ sys.stdout = sys.__stdout__
+
+ RuleView.sys_argv(self)
+ '''
+ if not did_you_mean:
+ print (self.rule.get('id') if self.rule.get('id')
+ else self.rule.getparent().get('id'))
+ '''
+ if self.key_to_print:
+ if self.key_to_print in self.prefix:
+ if self.key_to_print in self.supported_keys:
+ sys.stdout = sys.__stdout__
+ else:
+ sys.stdout = codecs.open(os.devnull, encoding='utf-8',
+ mode='w')
+
+ global number
+ global amount
+ number += 1
+###############
+ ''' API's "marker" difference handler'''
+ if ('marker' in [el.tag
+ for el in self.rule.find('pattern').getchildren()]):
+
+ ''' regexp in marker => suggestion_counters +=1'''
+
+ for string in self.message:
+ if '\\' in string:
+ try:
+ parts = string.split('\\')
+ rest = []
+ for el in parts[1:]:
+ el = str(int(el[0]) + 1) + el[1:]
+ rest.append(el)
+ result = '\\'.join([parts[0], ] + rest)
+ self.message[self.message.index(string)] = result
+ except ValueError:
+ pass
+ self.message.append('\n')
+###############
+ RuleView.print_element(self.pattern)
+###############
+ ''' API's '<- option[...] ->' handler'''
+ print '<- option("%s")' % self.category.get('name'),
+ try:
+ ''' marker in pattern => -%d> in result'''
+ marker = [el.tag for el
+ in self.rule.find('pattern').getchildren()].index('marker') + 1
+ if not 'skip' in self.keys:
+ print '-%d>' % marker,
+ else:
+ print '-%d>' % (marker + 1),
+ except ValueError:
+ print '->',
+###############
+ RuleView.print_element(self.message)
+############### For debug purposes only.
+ elif ('Not implemented' in self.pattern
+ or 'Not implemented' in self.message
+ and 'Not supported' not in self.pattern
+ and 'Not supported' not in self.message):
+ global to_implement
+ to_implement += 1
+# print ET.dump(self.rule)
+ sys.stdout = open('notimplemented.txt', 'a+')
+ ET.dump(self.rule)
+ sys.stdout = sys.__stdout__
+ elif ('Not supported' in self.pattern
+ or 'Not supported' in self.message
+ and 'Not implemented' not in self.pattern
+ and 'Not implemented' not in self.message):
+ global no_support
+ no_support += 1
+ sys.stdout = open('unsupported.txt', 'a+')
+ ET.dump(self.rule)
+ sys.stdout = sys.__stdout__
+
+
+class RuleController(object):
+ '''flow control'''
+ def __init__(self):
+ try:
+ os.remove(os.path.join(os.getcwd(), 'unsupported.txt'))
+ except OSError:
+ pass
+ try:
+ os.remove(os.path.join(os.getcwd(), 'notimplemented.txt'))
+ except OSError:
+ pass
+ try:
+ os.remove(os.path.join(os.getcwd(), 'rules.txt'))
+ except OSError:
+ pass
+
+ self.categories = ET.iterparse("grammar.xml", events=('end',),
+ tag='category')
+
+ def process(self):
+ global number
+ global amount
+ for _event, category in self.categories:
+ for rule in category.iter('rule'):
+ amount += 1
+ RuleView(rule, category).printer()
+ if not did_you_mean:
+ percent = float(number) / amount * 100
+ print '\n%.3f %s of rules covered (%s/%s)' % (percent, chr(37),
+ number, amount)
+ global to_implement
+ print '%s rules left to cover' % to_implement
+ global no_support
+ print '%s unsupported rules' % no_support
+
+RuleController().process()