summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Kohei Yoshida <kyoshida@novell.com>, 2009-12-23 17:36:59 -0500
committer: Kohei Yoshida <kyoshida@novell.com>, 2009-12-23 17:36:59 -0500
commit: 2e2fd65997b0029571905f07ecd9ad7c77ca10ca (patch)
tree: 8e91b38eba8e95f2ab151046eab949adbf95f935
parent: 7fe4aea75e8d7174b710713ba53f8fa8744a0c3c (diff)
More work on xhp to docbook conversion. Started working on parsing tree structure.
-rwxr-xr-x ooo-help-parser.py 45
-rw-r--r-- source/docbook.py 126
-rw-r--r-- source/expatimpl.py 36
-rw-r--r-- source/node.py 26
4 files changed, 206 insertions, 27 deletions
diff --git a/ooo-help-parser.py b/ooo-help-parser.py
index 5b0e785..165b511 100755
--- a/ooo-help-parser.py
+++ b/ooo-help-parser.py
@@ -5,10 +5,39 @@ sys.path.append(sys.path[0]+"/source")
import globals, expatimpl, docbook, node
+def processTreeFiles (tree_dir):
+ if not os.path.isdir(tree_dir):
+ globals.error("Specified tree directory is invalid")
+ sys.exit(1)
+
+ if tree_dir[-1] == '/':
+ tree_dir = tree_dir[:-1]
+
+ rootNodes = {}
+ for entry in os.listdir(tree_dir):
+ entry = tree_dir + '/' + entry
+ if not os.path.isfile(entry):
+ continue
+
+ name, ext = os.path.splitext(entry)
+ if ext != '.tree':
+ continue
+
+ basename = os.path.basename(name)
+
+ file = open(entry, 'r')
+ strm = file.read()
+ file.close()
+ p = expatimpl.TreeParser(strm)
+ p.parse()
+ rootNodes[basename] = p.root
+ node.prettyPrint(sys.stdout, p.root)
+
def main ():
parser = optparse.OptionParser()
parser.set_defaults(output=None)
parser.add_option("-o", "--output", dest="output", help="write output to FILE", metavar="FILE")
+ parser.add_option("-t", "--tree-dir", dest="tree_dir", help="Directory where the tree files are located. Tree files are expected to have .tree extension.")
parser.add_option("--no-convert", action="store_false", dest="convert",
help="Don't convert to docbook but simply output the parsed raw xhp structure", default=True)
options, args = parser.parse_args()
@@ -16,6 +45,15 @@ def main ():
parser.print_help()
sys.exit(1)
+ if options.tree_dir == None:
+ globals.error("Tree file directory is not provided.")
+ parser.print_help()
+ sys.exit(1)
+
+ # Process the tree files first
+ processTreeFiles(options.tree_dir)
+ sys.exit(0)
+
filepaths = []
for fpath in args:
if os.path.isdir(fpath):
@@ -41,9 +79,14 @@ def main ():
file = open(fpath, 'r')
strm = file.read()
file.close()
- p = expatimpl.Parser(strm)
+ p = expatimpl.XHPParser(strm)
p.parse()
if p.filename != None:
+ if p.filename[0] == '/':
+ # Remove leading '/' if exists. We do this because some of the
+ # file names don't begin with '/' while the majority of them do.
+ # We need to make this consistent.
+ p.filename = p.filename[1:]
rootNodes[p.filename] = p.root
filesParsed += 1
diff --git a/source/docbook.py b/source/docbook.py
index 8363b9e..34f5f25 100644
--- a/source/docbook.py
+++ b/source/docbook.py
@@ -1,3 +1,4 @@
+import sys
import globals, node
chapterNames = [
@@ -9,31 +10,142 @@ chapterNames = [
'/text/schart/main0000.xhp'
]
+
+class FilePathSorter:
+
+ def __init__ (self, filepaths):
+ self.filepaths = filepaths
+ self.root = node.Root()
+
+ def buildPaths (self):
+ for filepath in self.filepaths:
+ # NOTE: we assume that none of the file names begin with '/'.
+ hier = filepath.split('/')
+ curnode = self.root
+ for _dir in hier[:-1]:
+ temp = curnode.firstChildByName(_dir)
+ if temp == None:
+ # new directory node.
+ curnode = curnode.appendElement(_dir)
+ else:
+ # directory node already exists.
+ curnode = temp
+
+ # append file as a content node.
+ curnode.appendContent(hier[-1])
+
+ def sortPaths (self):
+ self.__sortNode(self.root)
+
+ def __sortNode (self, _node):
+
+ # sort the files based on the file-1st-directory-2nd rule.
+ contents = {}
+ elements = {}
+ for child in _node.getChildNodes():
+ if child.nodeType == node.NodeType.Content:
+ contents[child.content] = child
+ elif child.nodeType == node.NodeType.Element:
+ elements[child.name] = child
+ self.__sortNode(child)
+
+ # build a new set of child node list.
+ children = []
+
+ # contents first.
+ contentNames = contents.keys()
+ contentNames.sort()
+ for name in contentNames:
+ children.append(contents[name])
+
+ # elements next.
+ elementNames = elements.keys()
+ elementNames.sort()
+ if _node.nodeType == node.NodeType.Element and _node.name == 'text':
+ # 'swriter', 'scalc', 'simpress', 'sdraw', 'schart' are ranked higher in this order.
+ l = ['swriter', 'scalc', 'simpress', 'sdraw', 'smath', 'schart']
+ newElemNames = []
+ for elem in l:
+ if elem in elementNames:
+ pos = elementNames.index(elem)
+ poped = elementNames.pop(pos)
+ newElemNames.append(poped)
+ newElemNames.extend(elementNames)
+ elementNames = newElemNames
+
+ for name in elementNames:
+ children.append(elements[name])
+ _node.setChildNodes(children)
+
+ def sort (self):
+ self.buildPaths()
+ self.sortPaths()
+ return self.getFilePaths()
+
+ def getFilePaths (self):
+ """Return a list of sorted file names."""
+ self.filepaths = [] # empty the existing list first.
+ self.__walkToContent(self.root)
+ return self.filepaths
+
+ def __walkToContent (self, _node):
+ if _node.nodeType == node.NodeType.Content:
+ s = _node.content
+ n = _node.parent
+ while n.nodeType == node.NodeType.Element:
+ s = n.name + '/' + s
+ n = n.parent
+ self.filepaths.append(s)
+ elif _node.nodeType == node.NodeType.Element or _node.nodeType == node.NodeType.Root:
+ for child in _node.getChildNodes():
+ self.__walkToContent(child)
+
class DocBookConverter:
def __init__ (self, xhproots):
self.__xhproots = xhproots
self.root = node.Root()
def convert (self):
+ filenames = self.__xhproots.keys()
+ sorter = FilePathSorter(filenames)
+ filenames = sorter.sort()
+# for filename in filenames:
+# print (filename)
+# return
+
book = self.root.appendElement('book')
bookinfo = book.appendElement('bookinfo')
title = bookinfo.appendElement('title')
title.appendContent("OpenOffice.org Help")
- for chapterName in chapterNames:
- if not self.__xhproots.has_key(chapterName):
+ sectionStack = [book]
+ for filename in filenames:
+ if not self.__xhproots.has_key(filename):
continue
- xhproot = self.__xhproots[chapterName]
+ # 0 - book, 1 - chapter, 2 - sect1, 3 - sect2, and so on...
+ # file name is expected to be like 'text/swriter/...'
+ level = filename.count('/') - 1
+
+ xhproot = self.__xhproots[filename]
# go to helpdocument/meta/topic/title to get the title text.
- chapter = book.appendElement('chapter')
- titleText = xhproot.firstChild().firstChildByName('meta').firstChildByName('topic').firstChildByName('title').getContent()
- chapter.appendElement('title').appendContent(titleText)
+ cursect = None
+ if level == 1:
+ cursect = book.appendElement('chapter')
+ chapter = cursect
+ elif level == 2:
+ cursect = chapter.appendElement('sect1')
+ sect1 = cursect
+ if cursect == None:
+ continue
+
+ titleText = xhproot.firstChild().firstChildByName('meta').firstChildByName('topic').firstChildByName('filename').getContent()
+ cursect.appendElement('title').appendContent(titleText)
# convert all paragraphs.
xhpbody = xhproot.firstChild().firstChildByName('body')
for xhppara in xhpbody.getChildByName('paragraph'):
- para = chapter.appendElement('para')
+ para = cursect.appendElement('para')
para.appendContent(xhppara.getContent())
def prettyPrint (self, fd):
diff --git a/source/expatimpl.py b/source/expatimpl.py
index 55a98d8..de0125a 100644
--- a/source/expatimpl.py
+++ b/source/expatimpl.py
@@ -1,17 +1,16 @@
import xml.parsers.expat, sys
import globals, node
-class Parser:
-
- class ParseFailed(globals.Exception):
- def __init__ (self):
- globals.Exception.__init__(self, "parse failed")
+class ParseFailed(globals.Exception):
+ def __init__ (self):
+ globals.Exception.__init__(self, "parse failed")
+class ParserBase:
def __init__ (self, strm):
self.strm = strm
self.root = node.Root()
self.nodestack = [self.root]
- self.filename = None
+ self.char = None
def startElement(self, name, attrs):
n = node.Element(name, attrs)
@@ -20,7 +19,7 @@ class Parser:
def endElement(self, name):
if name != self.nodestack[-1].name:
- raise Parser.ParseFailed()
+ raise ParseFailed()
self.nodestack.pop()
def character(self, data):
@@ -35,9 +34,7 @@ class Parser:
s = s.replace('$[officename]', 'OpenOffice.org')
s = s.replace('%PRODUCTNAME', 'OpenOffice.org')
self.nodestack[-1].appendChild(node.Content(s))
- if self.nodestack[-1].name == 'filename':
- # For now, I just assume that the filename element is always at the correct position.
- self.filename = s
+ self.char = s # store current character.
def parse (self):
p = xml.parsers.expat.ParserCreate()
@@ -46,3 +43,22 @@ class Parser:
p.CharacterDataHandler = self.character
p.Parse(self.strm, 1)
+
+class XHPParser(ParserBase):
+
+ def __init__ (self, strm):
+ ParserBase.__init__(self, strm)
+ self.filename = None
+
+ def character(self, data):
+ ParserBase.character(self, data)
+ if self.nodestack[-1].name == 'filename':
+ # For now, I just assume that the filename element is always at the correct position.
+ self.filename = self.char
+
+class TreeParser(ParserBase):
+
+ def __init__ (self, strm):
+ ParserBase.__init__(self, strm)
+
+
diff --git a/source/node.py b/source/node.py
index dcda870..0296993 100644
--- a/source/node.py
+++ b/source/node.py
@@ -7,11 +7,13 @@ class NodeType:
class NodeBase:
def __init__ (self, nodeType = NodeType.Unknown):
- self.children = []
+ self.parent = None
+ self.__children = []
self.nodeType = nodeType
def appendChild (self, node):
- self.children.append(node)
+ self.__children.append(node)
+ node.parent = self
def appendElement (self, name):
node = Element(name)
@@ -24,17 +26,23 @@ class NodeBase:
return node
def firstChild (self):
- return self.children[0]
+ return self.__children[0]
+
+ def setChildNodes (self, children):
+ self.__children = children
+
+ def getChildNodes (self):
+ return self.__children
def firstChildByName (self, name):
- for child in self.children:
+ for child in self.__children:
if child.nodeType == NodeType.Element and child.name == name:
return child
return None
def getChildByName (self, name):
children = []
- for child in self.children:
+ for child in self.__children:
if child.nodeType == NodeType.Element and child.name == name:
children.append(child)
return children
@@ -57,7 +65,7 @@ class Element(NodeBase):
def getContent (self):
text = ''
first = True
- for child in self.children:
+ for child in self.getChildNodes():
if first:
first = False
else:
@@ -103,10 +111,10 @@ def printNode (fd, node, level):
indent = singleIndent*level
if node.nodeType == NodeType.Root:
# root node itself only contains child nodes.
- for child in node.children:
+ for child in node.getChildNodes():
printNode(fd, child, level)
elif node.nodeType == NodeType.Element:
- hasChildren = len(node.children) > 0
+ hasChildren = len(node.getChildNodes()) > 0
# We add '<' and '>' (or '/>') after the element content gets
# encoded.
@@ -120,7 +128,7 @@ def printNode (fd, node, level):
line = encodeString(line)
line = "<%s>\n"%line
fd.write (indent + line)
- for child in node.children:
+ for child in node.getChildNodes():
printNode(fd, child, level+1)
line = "</%s>\n"%node.name
fd.write (indent + line)