import re
import string

# Patterns are compiled once at import time instead of on every loop pass.
_WHITESPACE_RUN = re.compile(r"\s+")
_DIGIT_RUN = re.compile(r"\d+")


class reason_counter:
    """Occurrence counter for one (generalized) reason.

    Holds the wording to display and how many times the reason was seen.
    """

    def __init__(self, wording):
        """Start counting at 1 with ``wording`` as the display text."""
        self.wording = wording
        self.num = 1

    def update(self, new_wording):
        """Record one more occurrence and replace the display wording."""
        self.num += 1
        self.wording = new_wording

    def html(self):
        """Return the display text for this reason.

        A reason seen once is returned as-is; otherwise the count is
        appended as ``"<wording> (<num>+)"``.  The wording is inserted
        verbatim -- no HTML escaping is performed here.
        """
        if self.num == 1:
            return self.wording
        return "%s (%d+)" % (self.wording, self.num)


def numbers_are_irrelevant(txt):
    """Tell whether numbers in reason ``txt`` should be replaced with NN.

    Currently always returns True.  If some categories of reasons ever
    need to keep their numbers, this function should return False for
    those categories.
    """
    return True


def aggregate_reason_fields(reasons_list):
    """Group similar reasons and return their display strings.

    Each entry of ``reasons_list`` may itself hold several reasons
    separated by ``|``.  Reasons are compared after trimming, collapsing
    whitespace runs to a single space and (when
    ``numbers_are_irrelevant`` allows it) replacing digit runs with
    ``NN``.  Empty reasons are ignored.

    Returns a list of strings (see ``reason_counter.html``) ordered by
    descending occurrence count.
    """
    # Expand the '|'-separated combinations into one flat list.
    reasons = '|'.join(reasons_list).split('|')

    reason_htable = {}
    for reason in reasons:
        # Trim, then reduce whitespace runs to a single space.
        reason_reduced = _WHITESPACE_RUN.sub(" ", reason.strip())
        if reason_reduced == '':
            continue  # ignore empty reasons

        if numbers_are_irrelevant(reason_reduced):
            # Generalize the reason by replacing numbers with NN.
            reason_reduced = _DIGIT_RUN.sub("NN", reason_reduced)

        counter = reason_htable.get(reason_reduced)
        if counter is None:
            # A reason that occurred once keeps its original
            # (non-reduced) wording.
            reason_htable[reason_reduced] = reason_counter(reason)
        else:
            # A reason that occurred more than once is shown in its
            # reduced/generalized form.
            counter.update(reason_reduced)

    # sorted() accepts any iterable, unlike calling .sort() on the result
    # of dict.keys(), which is a view without .sort() on Python 3.  The
    # sort is stable, so ties keep the dict's iteration order.
    ranked = sorted(reason_htable.values(),
                    key=lambda counter: counter.num,
                    reverse=True)
    # Build a real list: map() would be a lazy iterator on Python 3.
    return [counter.html() for counter in ranked]