summaryrefslogtreecommitdiff
path: root/sax/source
diff options
context:
space:
mode:
authorStephan Bergmann <sbergman@redhat.com>2022-02-03 13:50:55 +0100
committerStephan Bergmann <sbergman@redhat.com>2022-02-04 09:25:31 +0100
commit84b79376d7bc57a3967d7203a7d51466625588e8 (patch)
tree562d8070f1b89d60b3ed1a4004ac80d4bb460b9b /sax/source
parent3c2192c840f8bc86a677dd1394e9e1b1b52f86f3 (diff)
Add a clarifying comment regarding the use of UTF-8
...justifying that 2f3a0bfbfe110c0837b3c7e04f9ad0969d6e56e4 "tdf#147088: Also handle U+FFFE, U+FFFF invalid XML 1.0 characters" added code that assumes `string` is UTF-8 while carelessly removing the "assuming we're writing UTF-8" disclaimer comment that had been added with 8b25b67d5268abbb260da968cc23b6f6c8dd31af "escape invalid XML characters with _xHHHH_ when writing escaped" Change-Id: I0866da2bbbc536b2feb977c35b164459b745d918 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/129422 Tested-by: Jenkins Reviewed-by: Stephan Bergmann <sbergman@redhat.com>
Diffstat (limited to 'sax/source')
-rw-r--r--sax/source/tools/fastserializer.cxx11
1 files changed, 11 insertions, 0 deletions
diff --git a/sax/source/tools/fastserializer.cxx b/sax/source/tools/fastserializer.cxx
index 8dcc308a836e..f4763dfa7c5d 100644
--- a/sax/source/tools/fastserializer.cxx
+++ b/sax/source/tools/fastserializer.cxx
@@ -106,6 +106,17 @@ namespace sax_fastparser {
/** Characters not allowed in XML 1.0
XML 1.1 would exclude only U+0000
+
+ This assumes that `string` is UTF-8, which appears to generally be the case: The only
+ user of this FastSaxSerializer code is FastSerializerHelper, and when its constructor
+ (sax/source/tools/fshelper.cxx) is called with bWriteHeader being true, it calls
+ FastSaxSerializer::startDocument, which writes sXmlHeader claiming encoding="UTF-8". The
+ only place that constructs FastSerializerHelper appears to be
+ XmlFilterBase::openFragmentStreamWithSerializer (oox/source/core/xmlfilterbase.cxx), and it
+ only passes false for bWriteHeader when the given rMediaType contains "vml" but not "+xml"
+ (see <https://git.libreoffice.org/core/+/6a11add2c4ea975356cfb7bab02301788c79c904%5E!/>
+ "XLSX VML Export fixes", stating "Don't write xml headers for vml files"). But let's assume
+ that even such Vector Markup Language files are written as UTF-8.
*/
template<typename Int> static std::optional<std::pair<unsigned, Int>> invalidChar(
char const * string, Int length, Int index )