diff options
author | Fridrich Štrba <fridrich.strba@bluewin.ch> | 2013-11-29 14:39:13 +0100 |
---|---|---|
committer | Fridrich Štrba <fridrich.strba@bluewin.ch> | 2013-11-29 14:39:13 +0100 |
commit | 99ffff90905f4c6f7a8a9b4e0ea60b58aa5d42a8 (patch) | |
tree | 05335c672499191c5c4d67815cae82edfe2a2510 | |
parent | f84d26c809970b727dd66452965762c138878401 (diff) |
Some more xml-parsing based on the awml.dtd
-rw-r--r-- | awml.dtd | 221 | ||||
-rw-r--r-- | src/lib/ABWParser.cpp | 390 | ||||
-rw-r--r-- | src/lib/ABWParser.h | 19 | ||||
-rw-r--r-- | src/lib/AbiDocument.cpp | 7 | ||||
-rw-r--r-- | src/lib/Makefile.am | 2 |
5 files changed, 635 insertions, 4 deletions
diff --git a/awml.dtd b/awml.dtd new file mode 100644 index 0000000..d465fff --- /dev/null +++ b/awml.dtd @@ -0,0 +1,221 @@ +<!-- + + AbiWord Markup Language DTD + Last updated January 17, 2007 + (but still far from complete and correct) + + Currently, this document is intended for reference purposes only, + and should not be interpreted as an absolute guide to the + AbiWord file format. A more comprehensive view can be gained + from the code in files ie_exp_AbiWord_1.cpp and + ie_imp_AbiWord_1.cpp. These are currently definitive. Other + documentation is available in docs/AbiWord_DocumentFormat.abw + + Sam Tobin-Hochstadt + Christian Biesinger - Updated in August 2002 + Marc Maurer - Updated in January 2007 + +--> + + +<!ELEMENT abiword (metadata?, history?, revisions?, ignoredwords?, styles?, lists?, pagesize*, section+, data?)> +<!ATTLIST abiword + version CDATA "unnumbered" + fileformat CDATA ""> +<!-- This is the root element, with an attribute + representing the version number, and an + attribute representing the file format + version. --> + +<!-- awml is an alias for abiword. copying above declaration. --> +<!ELEMENT awml (metadata?, history?, revisions?, ignoredwords?, styles?, lists?, pagesize*, section+, data?)> +<!ATTLIST awml + version CDATA "unnumbered" + fileformat CDATA ""> + +<!ELEMENT metadata (m+)> +<!ELEMENT m (#PCDATA)> +<!ATTLIST m + key CDATA #REQUIRED> + +<!ELEMENT ignoredwords (iw+)> +<!ELEMENT iw (#PCDATA)> +<!-- Each iw element contains a single word which + is to be ignored when spell-checking the + document. --> + +<!ELEMENT history (version+)> +<!ATTLIST history + version CDATA #REQUIRED + edit-time CDATA #REQUIRED + last-saved CDATA #REQUIRED + uid CDATA #REQUIRED> +<!-- A history keeps track of when the document was saved. 
--> + +<!ELEMENT version EMPTY> +<!ATTLIST version + id CDATA #REQUIRED + started CDATA #REQUIRED + uid CDATA #REQUIRED + auto CDATA #REQUIRED + top-xid CDATA #REQUIRED> +<!-- A version represents a single safe-point in time. --> + +<!ELEMENT revisions (r+)> +<!ATTLIST revisions + show CDATA #REQUIRED + mark CDATA #REQUIRED + show-level CDATA #REQUIRED + auto CDATA #REQUIRED> +<!-- Revisions store all document changes made within a + single editing session. --> + +<!ELEMENT styles (s*)> +<!ELEMENT s EMPTY> +<!ATTLIST s + basedon CDATA #IMPLIED + name CDATA #REQUIRED + type CDATA #IMPLIED + parentid CDATA #IMPLIED + level CDATA #IMPLIED + style CDATA #IMPLIED + props CDATA #IMPLIED> +<!-- Styles allow for paragraph level formatting. + The basedon attribute specifies inheritance + and the props attribute specifies features + of the style. --> + +<!ELEMENT lists (l*)> +<!ELEMENT l EMPTY> +<!ATTLIST l + id CDATA #REQUIRED + parentid CDATA #REQUIRED + type CDATA #REQUIRED + start-value CDATA #REQUIRED + list-decimal CDATA #REQUIRED + list-delim CDATA #REQUIRED> +<!-- Lists are collections of l elements. + Each l element describes a single list + through its id, parentid, type, start-value, + list-decimal and list-delim attributes. --> + +<!ELEMENT pagesize EMPTY> +<!ATTLIST pagesize + pagetype CDATA #REQUIRED + orientation CDATA #REQUIRED + width CDATA #REQUIRED + height CDATA #REQUIRED + units CDATA #REQUIRED + page-scale CDATA #REQUIRED> +<!-- pagesizes specify different page sizes that can + be used in the document. --> + +<!ELEMENT section ((p | table | frame | toc)+)> +<!ATTLIST section + props CDATA #IMPLIED + type (footer | CDATA) #IMPLIED + id CDATA #IMPLIED + header CDATA #IMPLIED + footer CDATA #IMPLIED + num_columns CDATA #IMPLIED + column_gap CDATA #IMPLIED + xid CDATA #IMPLIED> +<!-- Sections are collections of paragraphs, tables, + Table of Contents and frames. + They may specify footers to be attached, + or formatting properties. 
--> + +<!ELEMENT p (#PCDATA | c | field | f | image | i | cbr | pbr | br | bookmark | a)*> +<!ATTLIST p + props CDATA #IMPLIED + level (0|1|2|3|4|5|6|7|8|9|10 | CDATA) #IMPLIED + style CDATA #IMPLIED + listid CDATA #IMPLIED + parentid CDATA #IMPLIED + xid CDATA #IMPLIED + id ID #IMPLIED> +<!-- p elements contain text, images, fields or character + spans. Level is used for lists, style for styles + and props for formatting. listid and parentid are + both used in lists. --> + +<!ELEMENT c (#PCDATA | cbr | pbr | br)*> +<!ATTLIST c + props CDATA #IMPLIED + type (list_label | CDATA) #IMPLIED + style CDATA #IMPLIED> +<!-- c is used to specify character based formatting + which is done with the props attribute. + type is used for lists. style can specify + predefined styles. --> + +<!ELEMENT field EMPTY> +<!ATTLIST field + type CDATA #REQUIRED + xid CDATA #IMPLIED> +<!-- fields represent calculated data on the page. --> +<!ELEMENT f EMPTY> +<!ATTLIST f + type CDATA #REQUIRED> +<!-- f is an alias for field --> + +<!ELEMENT image EMPTY> +<!ATTLIST image + dataid CDATA #REQUIRED + props CDATA #IMPLIED + title CDATA #IMPLIED + alt CDATA #IMPLIED + xid CDATA #IMPLIED> +<!-- image refers to an image stored in + a d element. It is used much + the same as in HTML. --> + +<!ELEMENT i EMPTY> +<!ATTLIST i + dataid CDATA #REQUIRED + props CDATA #IMPLIED> +<!-- i is an alias for image --> + +<!ELEMENT br EMPTY> +<!ELEMENT pbr EMPTY> +<!ELEMENT cbr EMPTY> +<!-- These represent line, + page and column breaks. --> + +<!ELEMENT data (d+)> +<!ELEMENT d (#PCDATA)> +<!ATTLIST d + name CDATA #REQUIRED> +<!-- d is used to store actual + data, which currently means + images. 
--> + +<!ELEMENT bookmark EMPTY> +<!ATTLIST bookmark type (start|end) #REQUIRED + name CDATA #REQUIRED + xid CDATA #IMPLIED> + +<!ELEMENT a (c)> +<!ATTLIST a href CDATA #REQUIRED> +<!-- link to bookmark --> + +<!ELEMENT table (cell+)> +<!ATTLIST table + props CDATA #IMPLIED + xid CDATA #IMPLIED> +<!-- A table is really a special type of section. + Tables are collections of cells. --> + +<!ELEMENT cell (p | table)+> +<!ATTLIST cell + props CDATA #IMPLIED + xid CDATA #IMPLIED> +<!-- Cells are collections of paragraphs + or embedded tables --> + +<!ELEMENT frame (p | table)*> +<!ATTLIST frame + props CDATA #IMPLIED + xid CDATA #IMPLIED> +<!-- A frame is a container similar to a section. --> + diff --git a/src/lib/ABWParser.cpp b/src/lib/ABWParser.cpp index 13fff6b..1ccdebd 100644 --- a/src/lib/ABWParser.cpp +++ b/src/lib/ABWParser.cpp @@ -60,7 +60,9 @@ bool libabw::ABWParser::processXmlDocument(librevenge::RVNGInputStream *input) int ret = xmlTextReaderRead(reader); while (1 == ret) { - processXmlNode(reader); + int tokenType = xmlTextReaderNodeType(reader); + if (XML_READER_TYPE_SIGNIFICANT_WHITESPACE != tokenType) + processXmlNode(reader); ret = xmlTextReaderRead(reader); } @@ -77,6 +79,42 @@ void libabw::ABWParser::processXmlNode(xmlTextReaderPtr reader) int tokenType = xmlTextReaderNodeType(reader); switch (tokenId) { + case XML_METADATA: + if (XML_READER_TYPE_ELEMENT == tokenType) + readMetadata(reader); + break; + case XML_HISTORY: + if (XML_READER_TYPE_ELEMENT == tokenType) + readHistory(reader); + break; + case XML_REVISIONS: + if (XML_READER_TYPE_ELEMENT == tokenType) + readRevisions(reader); + break; + case XML_IGNOREDWORDS: + if (XML_READER_TYPE_ELEMENT == tokenType) + readIgnoredWords(reader); + break; + case XML_STYLES: + if (XML_READER_TYPE_ELEMENT == tokenType) + readStyles(reader); + break; + case XML_LISTS: + if (XML_READER_TYPE_ELEMENT == tokenType) + readLists(reader); + break; + case XML_PAGESIZE: + if (XML_READER_TYPE_ELEMENT == tokenType) + 
readPageSize(reader); + break; + case XML_SECTION: + if (XML_READER_TYPE_ELEMENT == tokenType) + readSection(reader); + break; + case XML_DATA: + if (XML_READER_TYPE_ELEMENT == tokenType) + readData(reader); + break; default: break; } @@ -111,4 +149,354 @@ int libabw::ABWParser::getElementToken(xmlTextReaderPtr reader) return ABWXMLTokenMap::getTokenId(xmlTextReaderConstName(reader)); } + +void libabw::ABWParser::readMetadata(xmlTextReaderPtr reader) +{ + int ret = 1; + int tokenId = XML_TOKEN_INVALID; + int tokenType = -1; + do + { + ret = xmlTextReaderRead(reader); + tokenId = getElementToken(reader); + if (XML_TOKEN_INVALID == tokenId) + { + ABW_DEBUG_MSG(("VDXParser::readMetadata: unknown token %s\n", xmlTextReaderConstName(reader))); + } + tokenType = xmlTextReaderNodeType(reader); + switch (tokenId) + { + case XML_M: + if (XML_READER_TYPE_ELEMENT == tokenType) + readM(reader); + break; + default: + break; + } + } + while ((XML_METADATA != tokenId || XML_READER_TYPE_END_ELEMENT != tokenType) && 1 == ret); +} + +void libabw::ABWParser::readHistory(xmlTextReaderPtr reader) +{ + int ret = 1; + int tokenId = XML_TOKEN_INVALID; + int tokenType = -1; + do + { + ret = xmlTextReaderRead(reader); + tokenId = getElementToken(reader); + if (XML_TOKEN_INVALID == tokenId) + { + ABW_DEBUG_MSG(("VDXParser::readHistory: unknown token %s\n", xmlTextReaderConstName(reader))); + } + tokenType = xmlTextReaderNodeType(reader); + switch (tokenId) + { + case XML_VERSION: + if (XML_READER_TYPE_ELEMENT == tokenType) + readVersion(reader); + break; + default: + break; + } + } + while ((XML_HISTORY != tokenId || XML_READER_TYPE_END_ELEMENT != tokenType) && 1 == ret); +} + +void libabw::ABWParser::readRevisions(xmlTextReaderPtr reader) +{ + int ret = 1; + int tokenId = XML_TOKEN_INVALID; + int tokenType = -1; + do + { + ret = xmlTextReaderRead(reader); + tokenId = getElementToken(reader); + if (XML_TOKEN_INVALID == tokenId) + { + ABW_DEBUG_MSG(("VDXParser::readRevisions: unknown token 
%s\n", xmlTextReaderConstName(reader))); + } + tokenType = xmlTextReaderNodeType(reader); + (void)tokenType; + switch (tokenId) + { + default: + break; + } + } + while ((XML_REVISIONS != tokenId || XML_READER_TYPE_END_ELEMENT != tokenType) && 1 == ret); +} + +void libabw::ABWParser::readIgnoredWords(xmlTextReaderPtr reader) +{ + int ret = 1; + int tokenId = XML_TOKEN_INVALID; + int tokenType = -1; + do + { + ret = xmlTextReaderRead(reader); + tokenId = getElementToken(reader); + if (XML_TOKEN_INVALID == tokenId) + { + ABW_DEBUG_MSG(("VDXParser::readIgnoreWords: unknown token %s\n", xmlTextReaderConstName(reader))); + } + tokenType = xmlTextReaderNodeType(reader); + switch (tokenId) + { + case XML_IW: + if (XML_READER_TYPE_ELEMENT == tokenType) + readIw(reader); + break; + default: + break; + } + } + while ((XML_IGNOREDWORDS != tokenId || XML_READER_TYPE_END_ELEMENT != tokenType) && 1 == ret); +} + +void libabw::ABWParser::readStyles(xmlTextReaderPtr reader) +{ + int ret = 1; + int tokenId = XML_TOKEN_INVALID; + int tokenType = -1; + do + { + ret = xmlTextReaderRead(reader); + tokenId = getElementToken(reader); + if (XML_TOKEN_INVALID == tokenId) + { + ABW_DEBUG_MSG(("VDXParser::readStyles: unknown token %s\n", xmlTextReaderConstName(reader))); + } + tokenType = xmlTextReaderNodeType(reader); + switch (tokenId) + { + case XML_S: + if (XML_READER_TYPE_ELEMENT == tokenType) + readS(reader); + break; + default: + break; + } + } + while ((XML_STYLES != tokenId || XML_READER_TYPE_END_ELEMENT != tokenType) && 1 == ret); +} + +void libabw::ABWParser::readLists(xmlTextReaderPtr reader) +{ + int ret = 1; + int tokenId = XML_TOKEN_INVALID; + int tokenType = -1; + do + { + ret = xmlTextReaderRead(reader); + tokenId = getElementToken(reader); + if (XML_TOKEN_INVALID == tokenId) + { + ABW_DEBUG_MSG(("VDXParser::readLists: unknown token %s\n", xmlTextReaderConstName(reader))); + } + tokenType = xmlTextReaderNodeType(reader); + switch (tokenId) + { + case XML_L: + if 
(XML_READER_TYPE_ELEMENT == tokenType) + readL(reader); + break; + default: + break; + } + } + while ((XML_LISTS != tokenId || XML_READER_TYPE_END_ELEMENT != tokenType) && 1 == ret); +} + +void libabw::ABWParser::readPageSize(xmlTextReaderPtr reader) +{ + xmlTextReaderRead(reader); +} + +void libabw::ABWParser::readSection(xmlTextReaderPtr reader) +{ + int ret = 1; + int tokenId = XML_TOKEN_INVALID; + int tokenType = -1; + do + { + ret = xmlTextReaderRead(reader); + tokenId = getElementToken(reader); + if (XML_TOKEN_INVALID == tokenId) + { + ABW_DEBUG_MSG(("VDXParser::readSection: unknown token %s\n", xmlTextReaderConstName(reader))); + } + tokenType = xmlTextReaderNodeType(reader); + switch (tokenId) + { + default: + break; + } + } + while ((XML_SECTION != tokenId || XML_READER_TYPE_END_ELEMENT != tokenType) && 1 == ret); +} + +void libabw::ABWParser::readData(xmlTextReaderPtr reader) +{ + int ret = 1; + int tokenId = XML_TOKEN_INVALID; + int tokenType = -1; + do + { + ret = xmlTextReaderRead(reader); + tokenId = getElementToken(reader); + if (XML_TOKEN_INVALID == tokenId) + { + ABW_DEBUG_MSG(("VDXParser::readData: unknown token %s\n", xmlTextReaderConstName(reader))); + } + tokenType = xmlTextReaderNodeType(reader); + switch (tokenId) + { + case XML_D: + if (XML_READER_TYPE_ELEMENT == tokenType) + readD(reader); + break; + default: + break; + } + } + while ((XML_DATA != tokenId || XML_READER_TYPE_END_ELEMENT != tokenType) && 1 == ret); +} + +void libabw::ABWParser::readM(xmlTextReaderPtr reader) +{ + int ret = 1; + int tokenId = XML_TOKEN_INVALID; + int tokenType = -1; + do + { + ret = xmlTextReaderRead(reader); + tokenId = getElementToken(reader); + if (XML_TOKEN_INVALID == tokenId) + { + ABW_DEBUG_MSG(("VDXParser::readM: unknown token %s\n", xmlTextReaderConstName(reader))); + } + tokenType = xmlTextReaderNodeType(reader); + (void)tokenType; + switch (tokenId) + { + default: + break; + } + } + while ((XML_M != tokenId || XML_READER_TYPE_END_ELEMENT != 
tokenType) && 1 == ret); +} + +void libabw::ABWParser::readIw(xmlTextReaderPtr reader) +{ + int ret = 1; + int tokenId = XML_TOKEN_INVALID; + int tokenType = -1; + do + { + ret = xmlTextReaderRead(reader); + tokenId = getElementToken(reader); + if (XML_TOKEN_INVALID == tokenId) + { + ABW_DEBUG_MSG(("VDXParser::readIw: unknown token %s\n", xmlTextReaderConstName(reader))); + } + tokenType = xmlTextReaderNodeType(reader); + (void)tokenType; + switch (tokenId) + { + default: + break; + } + } + while ((XML_IW != tokenId || XML_READER_TYPE_END_ELEMENT != tokenType) && 1 == ret); +} + +void libabw::ABWParser::readVersion(xmlTextReaderPtr reader) +{ + xmlTextReaderRead(reader); +} + +void libabw::ABWParser::readS(xmlTextReaderPtr reader) +{ + xmlTextReaderRead(reader); +} + +void libabw::ABWParser::readL(xmlTextReaderPtr reader) +{ + xmlTextReaderRead(reader); +} + +void libabw::ABWParser::readP(xmlTextReaderPtr reader) +{ + int ret = 1; + int tokenId = XML_TOKEN_INVALID; + int tokenType = -1; + do + { + ret = xmlTextReaderRead(reader); + tokenId = getElementToken(reader); + if (XML_TOKEN_INVALID == tokenId) + { + ABW_DEBUG_MSG(("VDXParser::readP: unknown token %s\n", xmlTextReaderConstName(reader))); + } + tokenType = xmlTextReaderNodeType(reader); + (void)tokenType; + switch (tokenId) + { + default: + break; + } + } + while ((XML_P != tokenId || XML_READER_TYPE_END_ELEMENT != tokenType) && 1 == ret); +} + +void libabw::ABWParser::readC(xmlTextReaderPtr reader) +{ + int ret = 1; + int tokenId = XML_TOKEN_INVALID; + int tokenType = -1; + do + { + ret = xmlTextReaderRead(reader); + tokenId = getElementToken(reader); + if (XML_TOKEN_INVALID == tokenId) + { + ABW_DEBUG_MSG(("VDXParser::readC: unknown token %s\n", xmlTextReaderConstName(reader))); + } + tokenType = xmlTextReaderNodeType(reader); + (void)tokenType; + switch (tokenId) + { + default: + break; + } + } + while ((XML_C != tokenId || XML_READER_TYPE_END_ELEMENT != tokenType) && 1 == ret); +} + +void 
libabw::ABWParser::readD(xmlTextReaderPtr reader) +{ + int ret = 1; + int tokenId = XML_TOKEN_INVALID; + int tokenType = -1; + do + { + ret = xmlTextReaderRead(reader); + tokenId = getElementToken(reader); + if (XML_TOKEN_INVALID == tokenId) + { + ABW_DEBUG_MSG(("VDXParser::readD: unknown token %s\n", xmlTextReaderConstName(reader))); + } + tokenType = xmlTextReaderNodeType(reader); + (void)tokenType; + switch (tokenId) + { + default: + break; + } + } + while ((XML_D != tokenId || XML_READER_TYPE_END_ELEMENT != tokenType) && 1 == ret); +} + /* vim:set shiftwidth=2 softtabstop=2 expandtab: */ diff --git a/src/lib/ABWParser.h b/src/lib/ABWParser.h index 0df83b0..eab5238 100644 --- a/src/lib/ABWParser.h +++ b/src/lib/ABWParser.h @@ -39,6 +39,25 @@ private: bool processXmlDocument(librevenge::RVNGInputStream *input); void processXmlNode(xmlTextReaderPtr reader); + void readMetadata(xmlTextReaderPtr reader); + void readHistory(xmlTextReaderPtr reader); + void readRevisions(xmlTextReaderPtr reader); + void readIgnoredWords(xmlTextReaderPtr reader); + void readStyles(xmlTextReaderPtr reader); + void readLists(xmlTextReaderPtr reader); + void readPageSize(xmlTextReaderPtr reader); + void readSection(xmlTextReaderPtr reader); + void readData(xmlTextReaderPtr reader); + + void readM(xmlTextReaderPtr reader); + void readIw(xmlTextReaderPtr reader); + void readVersion(xmlTextReaderPtr reader); + void readS(xmlTextReaderPtr reader); + void readL(xmlTextReaderPtr reader); + void readP(xmlTextReaderPtr reader); + void readC(xmlTextReaderPtr reader); + void readD(xmlTextReaderPtr reader); + librevenge::RVNGInputStream *m_input; librevenge::RVNGTextInterface *m_iface; ABWCollector *m_collector; diff --git a/src/lib/AbiDocument.cpp b/src/lib/AbiDocument.cpp index 6b10bbe..3ae5ce8 100644 --- a/src/lib/AbiDocument.cpp +++ b/src/lib/AbiDocument.cpp @@ -61,8 +61,11 @@ bool AbiDocument::isFileFormatSupported(librevenge::RVNGInputStream *input) } if (!xmlStrEqual(name, 
BAD_CAST("abiword"))) { - xmlFreeTextReader(reader); - return false; + if (!xmlStrEqual(name, BAD_CAST("awml"))) + { + xmlFreeTextReader(reader); + return false; + } } // Checking the namespace of AbiWord documents. diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am index 2a6830b..401fad0 100644 --- a/src/lib/Makefile.am +++ b/src/lib/Makefile.am @@ -45,7 +45,7 @@ libabw_@ABW_MAJOR_VERSION@_@ABW_MINOR_VERSION@_la_SOURCES = \ $(generated_files) ABWXMLTokenMap.lo : $(generated_files) -ABWXParser.lo : $(generated_files) +ABWParser.lo : $(generated_files) $(top_builddir)/src/lib/tokens.h : $(top_builddir)/src/lib/tokens.gperf |