More work on xhp to docbook conversion. Started working on parsing tree structure.

author: Kohei Yoshida <kyoshida@novell.com> 2009-12-23 17:36:59 -0500
committer: Kohei Yoshida <kyoshida@novell.com> 2009-12-23 17:36:59 -0500
commit: 2e2fd65997b0029571905f07ecd9ad7c77ca10ca (patch)
tree: 8e91b38eba8e95f2ab151046eab949adbf95f935
parent: 7fe4aea75e8d7174b710713ba53f8fa8744a0c3c (diff)
4 files changed, 206 insertions, 27 deletions
diff --git a/ooo-help-parser.py b/ooo-help-parser.py
index 5b0e785..165b511 100755
--- a/ooo-help-parser.py
+++ b/ooo-help-parser.py
@@ -5,10 +5,39 @@ sys.path.append(sys.path[0]+"/source")
 
 import globals, expatimpl, docbook, node
 
+def processTreeFiles (tree_dir):
+    if not os.path.isdir(tree_dir):
+        globals.error("Specified tree directory is invalid")
+        sys.exit(1)
+
+    if tree_dir[-1] == '/':
+        tree_dir = tree_dir[:-1]
+
+    rootNodes = {}
+    for entry in os.listdir(tree_dir):
+        entry = tree_dir + '/' + entry
+        if not os.path.isfile(entry):
+            continue
+
+        name, ext = os.path.splitext(entry)
+        if ext != '.tree':
+            continue
+
+        basename = os.path.basename(name)
+
+        file = open(entry, 'r')
+        strm = file.read()
+        file.close()
+        p = expatimpl.TreeParser(strm)
+        p.parse()
+        rootNodes[basename] = p.root
+        node.prettyPrint(sys.stdout, p.root)
+
 def main ():
     parser = optparse.OptionParser()
     parser.set_defaults(output=None)
     parser.add_option("-o", "--output", dest="output", help="write output to FILE", metavar="FILE")
+    parser.add_option("-t", "--tree-dir", dest="tree_dir", help="Directory where the tree files are located.  Tree files are expected to have .tree extension.")
     parser.add_option("--no-convert", action="store_false", dest="convert", 
         help="Don't convert to docbook but simply output the parsed raw xhp structure", default=True)
     options, args = parser.parse_args()
@@ -16,6 +45,15 @@ def main ():
         parser.print_help()
         sys.exit(1)
 
+    if options.tree_dir == None:
+        globals.error("Tree file directory is not provided.")
+        parser.print_help()
+        sys.exit(1)
+
+    # Process the tree files first
+    processTreeFiles(options.tree_dir)
+    sys.exit(0)
+
     filepaths = []
     for fpath in args:
         if os.path.isdir(fpath):
@@ -41,9 +79,14 @@ def main ():
         file = open(fpath, 'r')
         strm = file.read()
         file.close()
-        p = expatimpl.Parser(strm)
+        p = expatimpl.XHPParser(strm)
         p.parse()
         if p.filename != None:
+            if p.filename[0] == '/':
+                # Remove leading '/' if it exists.  We do this because some of the
+                # file names don't begin with '/' while the majority of them do.
+                # We need to make this consistent.
+                p.filename = p.filename[1:]
             rootNodes[p.filename] = p.root
         filesParsed += 1
 
diff --git a/source/docbook.py b/source/docbook.py
index 8363b9e..34f5f25 100644
--- a/source/docbook.py
+++ b/source/docbook.py
@@ -1,3 +1,4 @@
+import sys
 import globals, node
 
 chapterNames = [
@@ -9,31 +10,142 @@ chapterNames = [
     '/text/schart/main0000.xhp'
 ]
 
+
+class FilePathSorter:
+
+    def __init__ (self, filepaths):
+        self.filepaths = filepaths
+        self.root = node.Root()
+
+    def buildPaths (self):
+        for filepath in self.filepaths:
+            # NOTE: we assume that none of the file names begin with '/'.
+            hier = filepath.split('/')
+            curnode = self.root
+            for _dir in hier[:-1]:
+                temp = curnode.firstChildByName(_dir)
+                if temp == None:
+                    # new directory node.
+                    curnode = curnode.appendElement(_dir)
+                else:
+                    # directory node already exists.
+                    curnode = temp
+
+            # append file as a content node.
+            curnode.appendContent(hier[-1])
+            
+    def sortPaths (self):
+        self.__sortNode(self.root)
+
+    def __sortNode (self, _node):
+
+        # sort the files based on the file-1st-directory-2nd rule.
+        contents = {}
+        elements = {}
+        for child in _node.getChildNodes():
+            if child.nodeType == node.NodeType.Content:
+                contents[child.content] = child
+            elif child.nodeType == node.NodeType.Element:
+                elements[child.name] = child
+                self.__sortNode(child)
+
+        # build a new list of child nodes.
+        children = []
+
+        # contents first.
+        contentNames = contents.keys()
+        contentNames.sort()
+        for name in contentNames:
+            children.append(contents[name])
+
+        # elements next.
+        elementNames = elements.keys()
+        elementNames.sort()
+        if _node.nodeType == node.NodeType.Element and _node.name == 'text':
+            # 'swriter', 'scalc', 'simpress', 'sdraw', 'smath', 'schart' are ranked higher in this order.
+            l = ['swriter', 'scalc', 'simpress', 'sdraw', 'smath', 'schart']
+            newElemNames = []
+            for elem in l:
+                if elem in elementNames:
+                    pos = elementNames.index(elem)
+                    poped = elementNames.pop(pos)
+                    newElemNames.append(poped)
+            newElemNames.extend(elementNames)
+            elementNames = newElemNames
+
+        for name in elementNames:
+            children.append(elements[name])
+        _node.setChildNodes(children)
+
+    def sort (self):
+        self.buildPaths()
+        self.sortPaths()
+        return self.getFilePaths()
+
+    def getFilePaths (self):
+        """Return a list of sorted file names."""
+        self.filepaths = [] # empty the existing list first.
+        self.__walkToContent(self.root)
+        return self.filepaths
+
+    def __walkToContent (self, _node):
+        if _node.nodeType == node.NodeType.Content:
+            s = _node.content
+            n = _node.parent
+            while n.nodeType == node.NodeType.Element:
+                s = n.name + '/' + s
+                n = n.parent
+            self.filepaths.append(s)
+        elif _node.nodeType == node.NodeType.Element or _node.nodeType == node.NodeType.Root:
+            for child in _node.getChildNodes():
+                self.__walkToContent(child)
+
 class DocBookConverter:
     def __init__ (self, xhproots):
         self.__xhproots = xhproots
         self.root = node.Root()
 
     def convert (self):
+        filenames = self.__xhproots.keys()
+        sorter = FilePathSorter(filenames)
+        filenames = sorter.sort()
+#       for filename in filenames:
+#           print (filename)
+#       return
+
         book = self.root.appendElement('book')
         bookinfo = book.appendElement('bookinfo')
         title = bookinfo.appendElement('title')
         title.appendContent("OpenOffice.org Help")
 
-        for chapterName in chapterNames:
-            if not self.__xhproots.has_key(chapterName):
+        sectionStack = [book]
+        for filename in filenames:
+            if not self.__xhproots.has_key(filename):
                 continue
 
-            xhproot = self.__xhproots[chapterName]
+            # 0 - book, 1 - chapter, 2 - sect1, 3 - sect2, and so on...
+            # file name is expected to be like 'text/swriter/...'
+            level = filename.count('/') - 1
+
+            xhproot = self.__xhproots[filename]
             # go to helpdocument/meta/topic/title to get the title text.
-            chapter = book.appendElement('chapter')
-            titleText = xhproot.firstChild().firstChildByName('meta').firstChildByName('topic').firstChildByName('title').getContent()
-            chapter.appendElement('title').appendContent(titleText)
+            cursect = None
+            if level == 1:
+                cursect = book.appendElement('chapter')
+                chapter = cursect
+            elif level == 2:
+                cursect = chapter.appendElement('sect1')
+                sect1 = cursect
+            if cursect == None:
+                continue
+
+            titleText = xhproot.firstChild().firstChildByName('meta').firstChildByName('topic').firstChildByName('filename').getContent()
+            cursect.appendElement('title').appendContent(titleText)
 
             # convert all paragraphs.
             xhpbody = xhproot.firstChild().firstChildByName('body')
             for xhppara in xhpbody.getChildByName('paragraph'):
-                para = chapter.appendElement('para')
+                para = cursect.appendElement('para')
                 para.appendContent(xhppara.getContent())
 
     def prettyPrint (self, fd):
diff --git a/source/expatimpl.py b/source/expatimpl.py
index 55a98d8..de0125a 100644
--- a/source/expatimpl.py
+++ b/source/expatimpl.py
@@ -1,17 +1,16 @@
 import xml.parsers.expat, sys
 import globals, node
 
-class Parser:
-
-    class ParseFailed(globals.Exception):
-        def __init__ (self):
-            globals.Exception.__init__(self, "parse failed")
+class ParseFailed(globals.Exception):
+    def __init__ (self):
+        globals.Exception.__init__(self, "parse failed")
 
+class ParserBase:
     def __init__ (self, strm):
         self.strm = strm
         self.root = node.Root()
         self.nodestack = [self.root]
-        self.filename = None
+        self.char = None
 
     def startElement(self, name, attrs):
         n = node.Element(name, attrs)
@@ -20,7 +19,7 @@ class Parser:
 
     def endElement(self, name):
         if name != self.nodestack[-1].name:
-            raise Parser.ParseFailed()
+            raise ParseFailed()
         self.nodestack.pop()
 
     def character(self, data):
@@ -35,9 +34,7 @@ class Parser:
         s = s.replace('$[officename]', 'OpenOffice.org')
         s = s.replace('%PRODUCTNAME', 'OpenOffice.org')
         self.nodestack[-1].appendChild(node.Content(s))
-        if self.nodestack[-1].name == 'filename':
-            # For now, I just assume that the filename element is always at the correct position.
-            self.filename = s
+        self.char = s # store current character.
 
     def parse (self):
         p = xml.parsers.expat.ParserCreate()
@@ -46,3 +43,22 @@ class Parser:
         p.CharacterDataHandler = self.character
         p.Parse(self.strm, 1)
 
+
+class XHPParser(ParserBase):
+
+    def __init__ (self, strm):
+        ParserBase.__init__(self, strm)
+        self.filename = None
+
+    def character(self, data):
+        ParserBase.character(self, data)
+        if self.nodestack[-1].name == 'filename':
+            # For now, I just assume that the filename element is always at the correct position.
+            self.filename = self.char
+
+class TreeParser(ParserBase):
+
+    def __init__ (self, strm):
+        ParserBase.__init__(self, strm)
+
+
diff --git a/source/node.py b/source/node.py
index dcda870..0296993 100644
--- a/source/node.py
+++ b/source/node.py
@@ -7,11 +7,13 @@ class NodeType:
 
 class NodeBase:
     def __init__ (self, nodeType = NodeType.Unknown):
-        self.children = []
+        self.parent = None
+        self.__children = []
         self.nodeType = nodeType
 
     def appendChild (self, node):
-        self.children.append(node)
+        self.__children.append(node)
+        node.parent = self
 
     def appendElement (self, name):
         node = Element(name)
@@ -24,17 +26,23 @@ class NodeBase:
         return node
 
     def firstChild (self):
-        return self.children[0]
+        return self.__children[0]
+
+    def setChildNodes (self, children):
+        self.__children = children
+
+    def getChildNodes (self):
+        return self.__children
 
     def firstChildByName (self, name):
-        for child in self.children:
+        for child in self.__children:
             if child.nodeType == NodeType.Element and child.name == name:
                 return child
         return None
 
     def getChildByName (self, name):
         children = []
-        for child in self.children:
+        for child in self.__children:
             if child.nodeType == NodeType.Element and child.name == name:
                 children.append(child)
         return children
@@ -57,7 +65,7 @@ class Element(NodeBase):
     def getContent (self):
         text = ''
         first = True
-        for child in self.children:
+        for child in self.getChildNodes():
             if first:
                 first = False
             else:
@@ -103,10 +111,10 @@ def printNode (fd, node, level):
     indent = singleIndent*level
     if node.nodeType == NodeType.Root:
         # root node itself only contains child nodes.
-        for child in node.children:
+        for child in node.getChildNodes():
             printNode(fd, child, level)
     elif node.nodeType == NodeType.Element:
-        hasChildren = len(node.children) > 0
+        hasChildren = len(node.getChildNodes()) > 0
 
         # We add '<' and '>' (or '/>') after the element content gets 
         # encoded.
@@ -120,7 +128,7 @@ def printNode (fd, node, level):
             line = encodeString(line)
             line = "<%s>\n"%line
             fd.write (indent + line)
-            for child in node.children:
+            for child in node.getChildNodes():
                 printNode(fd, child, level+1)
             line = "</%s>\n"%node.name
             fd.write (indent + line)
author	Kohei Yoshida <kyoshida@novell.com>	2009-12-23 17:36:59 -0500
committer	Kohei Yoshida <kyoshida@novell.com>	2009-12-23 17:36:59 -0500
commit	2e2fd65997b0029571905f07ecd9ad7c77ca10ca (patch)
tree	8e91b38eba8e95f2ab151046eab949adbf95f935
parent	7fe4aea75e8d7174b710713ba53f8fa8744a0c3c (diff)