diff options
author | Thorsten Behrens <tbehrens@novell.com> | 2010-05-26 09:25:18 +0200 |
---|---|---|
committer | Thorsten Behrens <tbehrens@novell.com> | 2010-05-26 09:25:18 +0200 |
commit | c40781c051cf6f37a489f7aeda7294cec756e535 (patch) | |
tree | 11b343ea5f075e799a8457939dd61185f7174090 |
Initial import of pyx tools
* xml2pyx.py: Convert from xml to PYX line-based format
* pyx2xml.py: Convert back from PYX to xml
PYX is a line-based representation for (much of) the xml info set,
see http://www.ibm.com/developerworks/xml/library/x-matters17.html
for an introduction
-rwxr-xr-x | pyx2xml.py | 79 | ||||
-rwxr-xr-x | xml2pyx.py | 62 |
2 files changed, 141 insertions, 0 deletions
# diff --git a/pyx2xml.py b/pyx2xml.py new file mode 100755 index 0000000..3e478d7
# --- /dev/null +++ b/pyx2xml.py @@ -0,0 +1,79 @@
#!/usr/bin/env python
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain a
# copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Inspired by Sean McGrath's pyx2xml, and David Mertz' XML matters
# Cobbled together by Thorsten Behrens <thb@openoffice.org>
#
"""Convert PYX (a line-based XML representation) read from stdin to XML.

Usage: pyx2xml.py [-ns] < input.pyx > output.xml

Each PYX line starts with a one-character event marker:

    (name        start of element
    Aname value  attribute of the element just opened
    -text        character data (backslash-escaped: \\n, \\t, \\\\)
    ?target data processing instruction
    )name        end of element

Names may be written as ``{namespace-uri}local``; such names get a generated
``nsN`` prefix.  With ``-ns`` the matching ``xmlns:nsN`` declarations are
added to the root element (this requires buffering the whole output).
"""
import os
import re
import sys
from xml.sax.saxutils import escape, quoteattr

# The 'xml' prefix is bound to this namespace by definition and no other
# prefix may be bound to it, so it must never get a generated nsN prefix.
XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'

# Namespace bookkeeping shared by grabNS(): number of prefixes handed out so
# far, and the mapping from namespace URI to its generated prefix.
num_ns = 0
uris = {}

_ESCAPE_SEQUENCE = re.compile(r'\\(.)')
_ESCAPED_CHARS = {'n': '\n', 't': '\t', '\\': '\\'}

# Start of an opening tag: '<' followed by a name.  The name stops at
# whitespace, '>' or '/', and must not begin a '<?...?>' construct.
_OPENING_TAG = re.compile(r'\s*<\s*[^?\s>/]+')


def grabNS(qname):
    """Translate a PYX name into the prefixed name used in the XML output.

    ``{uri}local`` becomes ``nsN:local``, allocating a new prefix the first
    time a URI is seen.  Names without a ``{uri}`` part, and names whose URI
    is empty or the literal ``None`` (what a SAX-based producer prints for
    un-namespaced names), are returned as the bare local name.
    """
    global num_ns, uris
    if not qname.startswith('{'):
        return qname
    delimit = qname.find('}')
    if delimit < 0:
        # Malformed '{...' without closing brace: pass through untouched.
        return qname
    uri = qname[1:delimit]
    name = qname[delimit + 1:]
    if uri in ('', 'None'):
        return name
    if uri == XML_NAMESPACE:
        return 'xml:' + name
    if uri not in uris:
        num_ns += 1
        uris[uri] = 'ns' + str(num_ns)
    return uris[uri] + ':' + name


def unescape(s):
    """Undo PYX backslash escaping (``\\n``, ``\\t``, ``\\\\``) in one pass.

    A single left-to-right pass is required: chained ``str.replace`` calls
    would misread an escaped backslash followed by ``t`` or ``n``.  Unknown
    escape sequences are left as they are.
    """
    return _ESCAPE_SEQUENCE.sub(
        lambda match: _ESCAPED_CHARS.get(match.group(1), match.group(0)), s)


def convert_lines(pyx_lines):
    """Yield XML output lines (without trailing newline) for PYX input lines.

    Output is flushed after every processing instruction and after every
    character-data event that consists of exactly one newline, so the XML
    keeps roughly the line structure of the original document.
    """
    yield '<?xml version="1.0" encoding="UTF-8"?>'
    curr_line = ''
    in_attrs = False
    for line in pyx_lines:
        kind = line[:1]
        # Drop the event marker and the line terminator (which may be
        # missing on the very last line of the input).
        payload = line[1:-1] if line.endswith('\n') else line[1:]
        if in_attrs and kind != 'A':
            # Attribute section ends here: close the opening tag.
            in_attrs = False
            curr_line += '>'
        if kind == '?':
            yield curr_line + '<?%s?>' % unescape(payload)
            curr_line = ''
        elif kind == '(':
            curr_line += '<%s' % grabNS(payload)
            in_attrs = True
        elif kind == 'A':
            # Split on the first space only, so empty values and values
            # with leading whitespace survive.
            name, _sep, val = payload.partition(' ')
            curr_line += ' %s=%s' % (grabNS(name), quoteattr(unescape(val)))
        elif kind == '-':
            if payload == r'\n':
                # A lone newline is emitted by ending the output line.
                yield curr_line
                curr_line = ''
            else:
                curr_line += escape(unescape(payload))
        elif kind == ')':
            curr_line += '</%s>' % grabNS(payload)
    if in_attrs:
        # Input ended right after an opening tag; keep the output balanced.
        curr_line += '>'
    if curr_line:
        yield curr_line


def declare_namespaces(xml_lines):
    """Return xml_lines with xmlns declarations added to the root element.

    The declarations cover every URI registered by grabNS() so far and are
    inserted directly after the name of the first opening tag.
    """
    xml_lines = list(xml_lines)
    bindings = sorted(uris.items(),
                      key=lambda item: (len(item[1]), item[1]))
    decls = ''.join(' xmlns:%s=%s' % (prefix, quoteattr(uri))
                    for uri, prefix in bindings)
    if not decls:
        return xml_lines
    for index, line in enumerate(xml_lines):
        match = _OPENING_TAG.match(line)
        if match:
            end = match.end()
            xml_lines[index] = line[:end] + decls + line[end:]
            break
    return xml_lines


def pyx_to_xml(pyx_lines, ns_handling=False):
    """Convert an iterable of PYX lines into one XML document string.

    Namespace bookkeeping is reset first, so prefixes always start at ns1.
    With ns_handling set, xmlns declarations are added to the root element.
    """
    global num_ns
    num_ns = 0
    uris.clear()
    xml_lines = list(convert_lines(pyx_lines))
    if ns_handling:
        xml_lines = declare_namespaces(xml_lines)
    return ''.join(xml_line + '\n' for xml_line in xml_lines)


def main(argv=None):
    """Command line entry point: filter stdin (PYX) to stdout (XML)."""
    if argv is None:
        argv = sys.argv
    ns_handling = len(argv) > 1 and argv[1] == '-ns'
    if ns_handling:
        # Declarations are only known once all input was seen: buffer.
        sys.stdout.write(pyx_to_xml(sys.stdin, ns_handling=True))
    else:
        for xml_line in convert_lines(sys.stdin):
            sys.stdout.write(xml_line + '\n')


if __name__ == "__main__":
    main()

# diff --git a/xml2pyx.py b/xml2pyx.py new file mode 100755 index 0000000..4b16bac
# --- /dev/null +++ b/xml2pyx.py @@ -0,0 +1,62 @@
#!/usr/bin/env python
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain a
# copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Inspired by Sean McGrath's html2pyx, and Peter A.
# Bigot's saxutils
# Cobbled together by Thorsten Behrens <thb@openoffice.org>
#
"""Convert an XML file into PYX, a line-based representation of XML.

Usage: xml2pyx.py FILE.xml > output.pyx

Element and attribute names are written as ``{namespace-uri}local``;
names that are in no namespace are written with the URI ``None``.
"""
import io
import string
import sys
import xml.sax
import xml.sax.handler


class DummyResolver:
    """Resolve every external entity to an empty stream, i.e. ignore it."""

    def resolveEntity (self, p, s):
        # An empty byte stream makes the parser skip the entity content
        # instead of trying to fetch it.
        return io.BytesIO(b'')


class PyxConverter (xml.sax.handler.ContentHandler):
    """SAX handler class that transforms xml into pyx.

    One PYX line is written per SAX event.  Lines go to *out*, a text
    stream with a ``write`` method; it defaults to ``sys.stdout``.
    """

    def __init__ (self, out=None):
        xml.sax.handler.ContentHandler.__init__(self)
        self._out = sys.stdout if out is None else out

    def _emit (self, text):
        """Write one PYX line."""
        self._out.write(text + "\n")

    def setDocumentLocator (self, locator):
        # Position information is not needed for the conversion.
        pass

    def encode (self,s):
        """Return s with backslash, newline and tab backslash-escaped.

        The backslash itself has to be escaped first, otherwise the
        backslashes introduced for newline and tab would be doubled.
        """
        s = s.replace("\\", "\\\\")
        s = s.replace("\n", "\\n")
        s = s.replace("\t", "\\t")
        return s

    def startElementNS (self, name, qname, attrs):
        # name is a (namespace-uri, localname) pair.
        self._emit("({%s}%s" % name)
        for (n,v) in attrs.items():
            self._emit("A{%s}%s %s" % (n[0], n[1], (self.encode(v))))

    def endElementNS (self, name, qname):
        self._emit("){%s}%s" % name)

    def characters (self, content):
        self._emit("-%s" % self.encode(content))

    def processingInstruction (self, target, data):
        # SAX passes target and data separately; PYX wants "?target data".
        self._emit("?%s %s" % (target, self.encode(data)))


def convert (source, out=None):
    """Parse *source* (file name or file object) and write PYX to *out*.

    *out* defaults to ``sys.stdout``.  Parsing is namespace-aware and
    external entities are ignored.
    """
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, True)
    parser.setFeature(xml.sax.handler.feature_namespace_prefixes, False)
    parser.setContentHandler(PyxConverter(out))
    parser.setEntityResolver(DummyResolver())
    parser.parse(source)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: xml2pyx.py FILE.xml")

    # Binary mode: let the XML parser handle the document's own encoding.
    with open(sys.argv[1], "rb") as xml_file:
        convert(xml_file)