1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
|
import re,string
class reason_counter:
    """Track one reason's display wording and how many times it was seen.

    Attributes:
        wording: the text currently used to display the reason.
        num: number of occurrences recorded so far (starts at 1).
    """

    def __init__(self, wording):
        # A counter is only created when a reason is first met,
        # so the count starts at one rather than zero.
        self.num = 1
        self.wording = wording

    def update(self, new_wording):
        """Record one more occurrence and switch to ``new_wording``."""
        self.wording = new_wording
        self.num += 1

    def html(self):
        """Return the display text for this reason.

        A reason recorded exactly once is shown as its wording alone;
        otherwise the occurrence count is appended as ``(N+)``.
        """
        if self.num != 1:
            return "%s (%d+)" % (self.wording, self.num)
        return self.wording
def numbers_are_irrelevant(txt):
    """Tell whether numbers inside the reason ``txt`` should be replaced by NN.

    The answer is currently always ``True`` and ``txt`` is not inspected.
    If/when some categories of reasons choose to keep their numbers,
    this function shall return ``False`` for such categories.
    """
    return True
def aggregate_reason_fields(reasons_list):
    """Group similar reasons and return their display strings, most frequent first.

    Each item of ``reasons_list`` may itself be a combination of several
    reasons separated by ``|``; all items are expanded into one flat list.
    Every reason is then reduced to a generic form (surrounding whitespace
    stripped, inner whitespace runs collapsed to a single space and, when
    ``numbers_are_irrelevant`` allows it, digit runs replaced by ``NN``).
    Reasons sharing the same generic form are counted together.

    Args:
        reasons_list: iterable of reason strings.

    Returns:
        A list of strings produced by ``reason_counter.html()``, one per
        generic reason, sorted by descending number of occurrences.
    """
    # Expand the '|'-separated combinations into one flat list of reasons.
    reasons = '|'.join(reasons_list).split('|')

    # Compile the patterns once instead of looking them up per reason.
    whitespace_run = re.compile(r"\s+")
    digit_run = re.compile(r"\d+")

    reason_htable = {}
    for reason in reasons:
        # Reduce whitespace.
        reason_reduced = whitespace_run.sub(" ", reason.strip())
        if reason_reduced == '':
            continue  # ignore empty reasons
        if numbers_are_irrelevant(reason_reduced):
            # Reduce numbers included in the reason descriptor
            # by replacing them with the generic NN.
            reason_reduced = digit_run.sub("NN", reason_reduced)
        if reason_reduced not in reason_htable:
            # reason_counter keeps the original (non-reduced) reason
            # as long as it occurred only once ...
            reason_htable[reason_reduced] = reason_counter(reason)
        else:
            # ... and switches to the reduced/generalized form as soon
            # as the reason occurred more than once.
            reason_htable[reason_reduced].update(reason_reduced)

    # sorted() and a list comprehension behave the same on Python 2 and 3;
    # dict.keys().sort() and a bare map() only gave a sorted list on Python 2.
    counters = sorted(reason_htable.values(),
                      key=lambda counter: counter.num,
                      reverse=True)
    return [counter.html() for counter in counters]
|