From 7dded42472968783f5c90a0aa9e497ee3dfb247f Mon Sep 17 00:00:00 2001
From: Dávid Vastag
Date: Mon, 4 Feb 2013 22:05:01 +0100
Subject: help-to-wiki.py now uses .po files as source of translations. Plus
 added error handling

Reviewed-by: Jan Holesovsky
Tested-by: Jan Holesovsky
Signed-off-by: Andras Timar
---
 help-to-wiki.py      |   8 +--
 to-wiki/wikiconv2.py | 151 ++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 112 insertions(+), 47 deletions(-)

diff --git a/help-to-wiki.py b/help-to-wiki.py
index 4a15f95b21..0ee17ba677 100755
--- a/help-to-wiki.py
+++ b/help-to-wiki.py
@@ -95,13 +95,13 @@ print "Generating the titles..."
 os.system( "python to-wiki/getalltitles.py source/text > alltitles.csv" )
 
 try:
-    sdf_path = args[0]
+    po_path = args[0]
 except:
-    sdf_path = '../../translations/unxlngx6.pro/misc/sdf-l10n'
-    sys.stderr.write('Path to the .sdf files not provided, using "%s"\n'% sdf_path)
+    po_path = '../translations/source'
+    sys.stderr.write('Path to the .po files not provided, using "%s"\n'% po_path)
 
 # do the work
 for lang in langs:
-    wikiconv2.convert(generate_redirects, lang, '%s/%s.sdf'% (sdf_path, lang))
+    wikiconv2.convert(generate_redirects, lang, '%s/%s/helpcontent2/source'% (po_path, lang))
 
 # vim:set shiftwidth=4 softtabstop=4 expandtab:
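
Note: wikiconv2.convert() now receives the root of a per-language .po tree,
"<po_path>/<lang>/helpcontent2/source", instead of a single .sdf file. A
minimal sketch of a pre-flight check for that layout; the check_po_root
helper and its warning text are illustrative only, not part of this patch:

    import os, sys

    def check_po_root(po_path, langs):
        # Warn about languages whose .po subtree is missing; converting
        # such a language would find no localization data at all.
        for lang in langs:
            root = '%s/%s/helpcontent2/source' % (po_path, lang)
            if not os.path.isdir(root):
                sys.stderr.write('Warning: no .po tree for "%s" at "%s"\n'
                                 % (lang, root))
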
", ""]: + escapethistag = True + if escapethistag: + escaped_tag = ("\\<" + tag[1:-1] + "\\>") + text = text.replace(tag, escaped_tag) + return text - for line in file: - line = line.strip() - if line[0] == '#': - continue - spl = line.split("\t") - - # the form of the key is like - # source/text/shared/explorer/database/02010100.xhp#hd_id3149233 - # otherwise we are getting duplicates - key = '%s#%s'% (spl[1].replace('\\', '/'), spl[4]) - try: - localization_data[key] = spl[10] - except: - sys.stderr.write('Warning: Ignored line "%s"\n'% line.encode('utf-8')) - file.close() +def load_localization_data(po_root): + global localization_data + localization_data = {} + for root, dirs, files in os.walk(po_root): + for file in files: + if re.search(r'\.po$', file) == None: + continue + path = "%s/%s" % (root, file) + sock = xopen(path, "r", encoding='utf-8') + hashKey = None + transCollecting = False + trans = "" + it = iter(sock) + line = next(it, None) + while line != None: + line=line.decode("utf-8") + if line.startswith('msgctxt ""'): # constructing the hashKey + key=[] + allGood = True + i=0 + while i<2 and allGood: + msgctxt_line = next(it, None); + if msgctxt_line != None and msgctxt_line.strip().startswith('"'): + key.append( msgctxt_line[1:-4] ) #-4 cuts \\n"\n from the end of the line + i=i+1 + else: + allGood = False + if i==2: #hash key is allowed to be constructed + hashKey = '#'.join( (re.sub(r'^.*helpcontent2/source/', r'source/', path[:-3]) + '/' + key[0] , key[1]) ) + else: + hashKey = None + elif hashKey != None: # constructing trans value for hashKey + if transCollecting: + if line.startswith('"'): + trans= trans + line.strip()[1:-1] + else: + transCollecting = False + localization_data[hashKey] = escape_help_text(trans) + hashKey = None + elif line.startswith('msgstr '): + trans = line.strip()[8:-1] + if trans == '': # possibly multiline + transCollecting = True + else: + localization_data[hashKey] = escape_help_text(trans) + hashKey = None + line = next(it, None) return True def unescape(str): @@ -250,6 +298,9 @@ def href_to_fname_id(href): return [fname, id] +# Exception classes +class UnhandledItemType(Exception): + pass # Base class for all the elements # # self.name - name of the element, to drop the self.child_parsing flag @@ -500,7 +551,7 @@ class TableCell(ElementBase): if parser.follow_embed: self.embed_href(parser, fname, id) elif name == 'paragraph': - parser.parse_localized_paragraph(TableContentParagraph(attrs, self), attrs, self) + parser.parse_localized_paragraph(TableContentParagraph, attrs, self) elif name == 'section': self.parse_child(Section(attrs, self)) elif name == 'bascode': @@ -541,7 +592,7 @@ class BasicCode(ElementBase): def start_element(self, parser, name, attrs): if name == 'paragraph': - parser.parse_localized_paragraph(BasicCodeParagraph(attrs, self), attrs, self) + parser.parse_localized_paragraph(BasicCodeParagraph, attrs, self) else: self.unhandled_element(parser, name) @@ -580,7 +631,7 @@ class ListItem(ElementBase): if parser.follow_embed: self.embed_href(parser, fname, id) elif name == 'paragraph': - parser.parse_localized_paragraph(ListItemParagraph(attrs, self), attrs, self) + parser.parse_localized_paragraph(ListItemParagraph, attrs, self) elif name == 'list': self.parse_child(List(attrs, self)) else: @@ -683,7 +734,7 @@ class Meta(ElementBase): class Section(ElementBase): def __init__(self, attrs, parent): ElementBase.__init__(self, 'section', parent) - self.id = attrs['id'] + self.id = attrs[ 'id' ] def start_element(self, parser, 
@@ -250,6 +298,9 @@ def href_to_fname_id(href):
 
     return [fname, id]
 
+# Exception classes
+class UnhandledItemType(Exception):
+    pass
 # Base class for all the elements
 #
 # self.name - name of the element, to drop the self.child_parsing flag
@@ -500,7 +551,7 @@ class TableCell(ElementBase):
             if parser.follow_embed:
                 self.embed_href(parser, fname, id)
         elif name == 'paragraph':
-            parser.parse_localized_paragraph(TableContentParagraph(attrs, self), attrs, self)
+            parser.parse_localized_paragraph(TableContentParagraph, attrs, self)
         elif name == 'section':
             self.parse_child(Section(attrs, self))
         elif name == 'bascode':
@@ -541,7 +592,7 @@ class BasicCode(ElementBase):
 
     def start_element(self, parser, name, attrs):
         if name == 'paragraph':
-            parser.parse_localized_paragraph(BasicCodeParagraph(attrs, self), attrs, self)
+            parser.parse_localized_paragraph(BasicCodeParagraph, attrs, self)
         else:
             self.unhandled_element(parser, name)
 
@@ -580,7 +631,7 @@ class ListItem(ElementBase):
             if parser.follow_embed:
                 self.embed_href(parser, fname, id)
         elif name == 'paragraph':
-            parser.parse_localized_paragraph(ListItemParagraph(attrs, self), attrs, self)
+            parser.parse_localized_paragraph(ListItemParagraph, attrs, self)
         elif name == 'list':
             self.parse_child(List(attrs, self))
         else:
@@ -683,7 +734,7 @@ class Meta(ElementBase):
 class Section(ElementBase):
     def __init__(self, attrs, parent):
         ElementBase.__init__(self, 'section', parent)
-        self.id = attrs['id']
+        self.id = attrs[ 'id' ]
 
     def start_element(self, parser, name, attrs):
         if name == 'bookmark':
@@ -970,8 +1021,12 @@ class Item(ElementBase):
                     text + \
                     self.replace_type['end'][self.type]
         except:
-            sys.stderr.write('Unhandled item type "%s".\n'% self.type)
-
+            try:
+                sys.stderr.write('Unhandled item type "%s".\n'% self.type)
+            except:
+                sys.stderr.write('Unhandled item type. Possibly type has been localized.\n')
+            finally:
+                raise UnhandledItemType
         return replace_text(self.text)
 
@@ -1062,7 +1117,10 @@ class Paragraph(ElementBase):
             role = 'tablenextpara'
 
         # the text itself
-        children = ElementBase.get_all(self)
+        try:
+            children = ElementBase.get_all(self)
+        except UnhandledItemType:
+            raise UnhandledItemType('Paragraph id: '+str(self.id))
         if self.role != 'emph' and self.role != 'bascode' and self.role != 'logocode':
             children = children.strip()
 
@@ -1196,23 +1254,30 @@ class ParserBase:
     def get_variable(self, id):
         return self.head_obj.get_variable(id)
 
-    def parse_localized_paragraph(self, paragraph, attrs, obj):
+    def parse_localized_paragraph(self, Paragraph_type, attrs, obj):
         localized_text = ''
         try:
             localized_text = get_localized_text(self.filename, attrs['id'])
         except:
             pass
+        paragraph = Paragraph_type(attrs, obj)
         if localized_text != '':
             # parse the localized text
             text = u'<paragraph>' + localized_text + '</paragraph>'
-            ParserBase(self.filename, self.follow_embed, self.embedding_app, \
-                    self.current_app, self.wiki_page_name, self.lang, \
-                    paragraph, text.encode('utf-8'))
-            # add it to the overall structure
-            obj.objects.append(paragraph)
-            # and ignore the original text
-            obj.parse_child(Ignore(attrs, obj, 'paragraph'))
+            try:
+                ParserBase(self.filename, self.follow_embed, self.embedding_app, \
+                        self.current_app, self.wiki_page_name, self.lang, \
+                        paragraph, text.encode('utf-8'))
+            except xml.parsers.expat.ExpatError:
+                sys.stderr.write( 'Invalid XML in translated text. Using the original text. Error location:\n'\
+                        + 'Current xhp: ' + self.filename + '\nParagraph id: ' + attrs['id'] + '\n')
+                obj.parse_child(Paragraph_type(attrs, obj)) # new paragraph must be created because "paragraph" is corrupted by "ParserBase"
+            else:
+                # add it to the overall structure
+                obj.objects.append(paragraph)
+                # and ignore the original text
+                obj.parse_child(Ignore(attrs, obj, 'paragraph'))
         else:
             obj.parse_child(paragraph)
 
@@ -1229,7 +1294,7 @@ class ParserBase:
             if ignore_this:
                 obj.parse_child(Ignore(attrs, obj, 'paragraph'))
             else:
-                self.parse_localized_paragraph(Paragraph(attrs, obj), attrs, obj)
+                self.parse_localized_paragraph(Paragraph, attrs, obj)
 
 class XhpParser(ParserBase):
     def __init__(self, filename, follow_embed, embedding_app, wiki_page_name, lang):
@@ -1329,7 +1394,7 @@ def write_redirects():
             write_link(r, target)
 
 # Main Function
-def convert(generate_redirects, lang, sdf_file):
+def convert(generate_redirects, lang, po_root):
     if lang == '':
         print 'Generating the main wiki pages...'
     else:
@@ -1343,8 +1408,8 @@ def convert(generate_redirects, lang, sdf_file):
         loadallfiles("alltitles.csv")
 
     if lang != '':
-        sys.stderr.write('Using localizations from "%s"\n'% sdf_file)
-        if not load_localization_data(sdf_file):
+        sys.stderr.write('Using localizations from "%s"\n'% po_root)
+        if not load_localization_data(po_root):
             return
 
     for title in titles:
-- 
cgit v1.2.3
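
Note: the error handling added by this patch has two prongs. Malformed XML
in a translation (xml.parsers.expat.ExpatError) makes
parse_localized_paragraph fall back to the untranslated paragraph, and an
unknown, possibly localized, item type raises UnhandledItemType, which
Paragraph re-raises with the paragraph id attached. A minimal sketch of the
first pattern outside the class machinery; translated_or_original is
illustrative, not part of the patch:

    import xml.parsers.expat

    def translated_or_original(snippet, original):
        # Wrap the translated snippet the way parse_localized_paragraph
        # does and try to parse it; fall back to the original on error.
        parser = xml.parsers.expat.ParserCreate()
        try:
            parser.Parse(('<paragraph>%s</paragraph>' % snippet).encode('utf-8'), True)
        except xml.parsers.expat.ExpatError:
            return original
        return snippet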