diff options
author | Kohei Yoshida <kyoshida@novell.com> | 2009-12-23 17:36:59 -0500 |
---|---|---|
committer | Kohei Yoshida <kyoshida@novell.com> | 2009-12-23 17:36:59 -0500 |
commit | 2e2fd65997b0029571905f07ecd9ad7c77ca10ca (patch) | |
tree | 8e91b38eba8e95f2ab151046eab949adbf95f935 | |
parent | 7fe4aea75e8d7174b710713ba53f8fa8744a0c3c (diff) |
More work on xhp to docbook conversion. Started working on parsing tree structure.
-rwxr-xr-x | ooo-help-parser.py | 45 | ||||
-rw-r--r-- | source/docbook.py | 126 | ||||
-rw-r--r-- | source/expatimpl.py | 36 | ||||
-rw-r--r-- | source/node.py | 26 |
4 files changed, 206 insertions, 27 deletions
diff --git a/ooo-help-parser.py b/ooo-help-parser.py index 5b0e785..165b511 100755 --- a/ooo-help-parser.py +++ b/ooo-help-parser.py @@ -5,10 +5,39 @@ sys.path.append(sys.path[0]+"/source") import globals, expatimpl, docbook, node +def processTreeFiles (tree_dir): + if not os.path.isdir(tree_dir): + globals.error("Specified tree directory is invalid") + sys.exit(1) + + if tree_dir[-1] == '/': + tree_dir = tree_dir[:-1] + + rootNodes = {} + for entry in os.listdir(tree_dir): + entry = tree_dir + '/' + entry + if not os.path.isfile(entry): + continue + + name, ext = os.path.splitext(entry) + if ext != '.tree': + continue + + basename = os.path.basename(name) + + file = open(entry, 'r') + strm = file.read() + file.close() + p = expatimpl.TreeParser(strm) + p.parse() + rootNodes[basename] = p.root + node.prettyPrint(sys.stdout, p.root) + def main (): parser = optparse.OptionParser() parser.set_defaults(output=None) parser.add_option("-o", "--output", dest="output", help="write output to FILE", metavar="FILE") + parser.add_option("-t", "--tree-dir", dest="tree_dir", help="Directory where the tree files are located. Tree files are expected to have .tree extension.") parser.add_option("--no-convert", action="store_false", dest="convert", help="Don't convert to docbook but simply output the parsed raw xhp structure", default=True) options, args = parser.parse_args() @@ -16,6 +45,15 @@ def main (): parser.print_help() sys.exit(1) + if options.tree_dir == None: + globals.error("Tree file directory is not provided.") + parser.print_help() + sys.exit(1) + + # Process the tree files first + processTreeFiles(options.tree_dir) + sys.exit(0) + filepaths = [] for fpath in args: if os.path.isdir(fpath): @@ -41,9 +79,14 @@ def main (): file = open(fpath, 'r') strm = file.read() file.close() - p = expatimpl.Parser(strm) + p = expatimpl.XHPParser(strm) p.parse() if p.filename != None: + if p.filename[0] == '/': + # Remove leading '/' if exists. We do this because some of the + # file names don't begin with '/' while the majority of them do. + # We need to make this consistent. + p.filename = p.filename[1:] rootNodes[p.filename] = p.root filesParsed += 1 diff --git a/source/docbook.py b/source/docbook.py index 8363b9e..34f5f25 100644 --- a/source/docbook.py +++ b/source/docbook.py @@ -1,3 +1,4 @@ +import sys import globals, node chapterNames = [ @@ -9,31 +10,142 @@ chapterNames = [ '/text/schart/main0000.xhp' ] + +class FilePathSorter: + + def __init__ (self, filepaths): + self.filepaths = filepaths + self.root = node.Root() + + def buildPaths (self): + for filepath in self.filepaths: + # NOTE: we assume that none of the file names begin with '/'. + hier = filepath.split('/') + curnode = self.root + for _dir in hier[:-1]: + temp = curnode.firstChildByName(_dir) + if temp == None: + # new directory node. + curnode = curnode.appendElement(_dir) + else: + # directory node already exists. + curnode = temp + + # append file as a content node. + curnode.appendContent(hier[-1]) + + def sortPaths (self): + self.__sortNode(self.root) + + def __sortNode (self, _node): + + # sort the files based on the file-1st-directory-2nd rule. + contents = {} + elements = {} + for child in _node.getChildNodes(): + if child.nodeType == node.NodeType.Content: + contents[child.content] = child + elif child.nodeType == node.NodeType.Element: + elements[child.name] = child + self.__sortNode(child) + + # build a new set of child node list. + children = [] + + # contents first. + contentNames = contents.keys() + contentNames.sort() + for name in contentNames: + children.append(contents[name]) + + # elements next. + elementNames = elements.keys() + elementNames.sort() + if _node.nodeType == node.NodeType.Element and _node.name == 'text': + # 'swriter', 'scalc', 'simpress', 'sdraw', 'schart' are ranked higher in this order. + l = ['swriter', 'scalc', 'simpress', 'sdraw', 'smath', 'schart'] + newElemNames = [] + for elem in l: + if elem in elementNames: + pos = elementNames.index(elem) + poped = elementNames.pop(pos) + newElemNames.append(poped) + newElemNames.extend(elementNames) + elementNames = newElemNames + + for name in elementNames: + children.append(elements[name]) + _node.setChildNodes(children) + + def sort (self): + self.buildPaths() + self.sortPaths() + return self.getFilePaths() + + def getFilePaths (self): + """Return a list of sorted file names.""" + self.filepaths = [] # empty the existing list first. + self.__walkToContent(self.root) + return self.filepaths + + def __walkToContent (self, _node): + if _node.nodeType == node.NodeType.Content: + s = _node.content + n = _node.parent + while n.nodeType == node.NodeType.Element: + s = n.name + '/' + s + n = n.parent + self.filepaths.append(s) + elif _node.nodeType == node.NodeType.Element or _node.nodeType == node.NodeType.Root: + for child in _node.getChildNodes(): + self.__walkToContent(child) + class DocBookConverter: def __init__ (self, xhproots): self.__xhproots = xhproots self.root = node.Root() def convert (self): + filenames = self.__xhproots.keys() + sorter = FilePathSorter(filenames) + filenames = sorter.sort() +# for filename in filenames: +# print (filename) +# return + book = self.root.appendElement('book') bookinfo = book.appendElement('bookinfo') title = bookinfo.appendElement('title') title.appendContent("OpenOffice.org Help") - for chapterName in chapterNames: - if not self.__xhproots.has_key(chapterName): + sectionStack = [book] + for filename in filenames: + if not self.__xhproots.has_key(filename): continue - xhproot = self.__xhproots[chapterName] + # 0 - book, 1 - chapter, 2 - sect1, 3 - sect2, and so on... + # file name is expected to be like 'text/swriter/...' + level = filename.count('/') - 1 + + xhproot = self.__xhproots[filename] # go to helpdocument/meta/topic/title to get the title text. - chapter = book.appendElement('chapter') - titleText = xhproot.firstChild().firstChildByName('meta').firstChildByName('topic').firstChildByName('title').getContent() - chapter.appendElement('title').appendContent(titleText) + cursect = None + if level == 1: + cursect = book.appendElement('chapter') + chapter = cursect + elif level == 2: + cursect = chapter.appendElement('sect1') + sect1 = cursect + if cursect == None: + continue + + titleText = xhproot.firstChild().firstChildByName('meta').firstChildByName('topic').firstChildByName('filename').getContent() + cursect.appendElement('title').appendContent(titleText) # convert all paragraphs. xhpbody = xhproot.firstChild().firstChildByName('body') for xhppara in xhpbody.getChildByName('paragraph'): - para = chapter.appendElement('para') + para = cursect.appendElement('para') para.appendContent(xhppara.getContent()) def prettyPrint (self, fd): diff --git a/source/expatimpl.py b/source/expatimpl.py index 55a98d8..de0125a 100644 --- a/source/expatimpl.py +++ b/source/expatimpl.py @@ -1,17 +1,16 @@ import xml.parsers.expat, sys import globals, node -class Parser: - - class ParseFailed(globals.Exception): - def __init__ (self): - globals.Exception.__init__(self, "parse failed") +class ParseFailed(globals.Exception): + def __init__ (self): + globals.Exception.__init__(self, "parse failed") +class ParserBase: def __init__ (self, strm): self.strm = strm self.root = node.Root() self.nodestack = [self.root] - self.filename = None + self.char = None def startElement(self, name, attrs): n = node.Element(name, attrs) @@ -20,7 +19,7 @@ class Parser: def endElement(self, name): if name != self.nodestack[-1].name: - raise Parser.ParseFailed() + raise ParseFailed() self.nodestack.pop() def character(self, data): @@ -35,9 +34,7 @@ class Parser: s = s.replace('$[officename]', 'OpenOffice.org') s = s.replace('%PRODUCTNAME', 'OpenOffice.org') self.nodestack[-1].appendChild(node.Content(s)) - if self.nodestack[-1].name == 'filename': - # For now, I just assume that the filename element is always at the correct position. - self.filename = s + self.char = s # store current character. def parse (self): p = xml.parsers.expat.ParserCreate() @@ -46,3 +43,22 @@ class Parser: p.CharacterDataHandler = self.character p.Parse(self.strm, 1) + +class XHPParser(ParserBase): + + def __init__ (self, strm): + ParserBase.__init__(self, strm) + self.filename = None + + def character(self, data): + ParserBase.character(self, data) + if self.nodestack[-1].name == 'filename': + # For now, I just assume that the filename element is always at the correct position. + self.filename = self.char + +class TreeParser(ParserBase): + + def __init__ (self, strm): + ParserBase.__init__(self, strm) + + diff --git a/source/node.py b/source/node.py index dcda870..0296993 100644 --- a/source/node.py +++ b/source/node.py @@ -7,11 +7,13 @@ class NodeType: class NodeBase: def __init__ (self, nodeType = NodeType.Unknown): - self.children = [] + self.parent = None + self.__children = [] self.nodeType = nodeType def appendChild (self, node): - self.children.append(node) + self.__children.append(node) + node.parent = self def appendElement (self, name): node = Element(name) @@ -24,17 +26,23 @@ class NodeBase: return node def firstChild (self): - return self.children[0] + return self.__children[0] + + def setChildNodes (self, children): + self.__children = children + + def getChildNodes (self): + return self.__children def firstChildByName (self, name): - for child in self.children: + for child in self.__children: if child.nodeType == NodeType.Element and child.name == name: return child return None def getChildByName (self, name): children = [] - for child in self.children: + for child in self.__children: if child.nodeType == NodeType.Element and child.name == name: children.append(child) return children @@ -57,7 +65,7 @@ class Element(NodeBase): def getContent (self): text = '' first = True - for child in self.children: + for child in self.getChildNodes(): if first: first = False else: @@ -103,10 +111,10 @@ def printNode (fd, node, level): indent = singleIndent*level if node.nodeType == NodeType.Root: # root node itself only contains child nodes. - for child in node.children: + for child in node.getChildNodes(): printNode(fd, child, level) elif node.nodeType == NodeType.Element: - hasChildren = len(node.children) > 0 + hasChildren = len(node.getChildNodes()) > 0 # We add '<' and '>' (or '/>') after the element content gets # encoded. @@ -120,7 +128,7 @@ def printNode (fd, node, level): line = encodeString(line) line = "<%s>\n"%line fd.write (indent + line) - for child in node.children: + for child in node.getChildNodes(): printNode(fd, child, level+1) line = "</%s>\n"%node.name fd.write (indent + line) |