diff options
author | Luboš Luňák <l.lunak@collabora.com> | 2021-04-29 20:10:34 +0200 |
---|---|---|
committer | Andras Timar <andras.timar@collabora.com> | 2021-05-05 11:10:02 +0200 |
commit | deb349b129c5ea6706a30005a1eb42bae3849fda (patch) | |
tree | 778f8de79bd0261d38c677ebe26daa860e6866b3 | |
parent | 481f144b71be1df89d9306f92ec59cbe1ae0e1a3 (diff) |
allow utf-8 in xml names (liborcus) (tdf#141672)
Change-Id: Ib150d55b588a572e4352396f18de2331983b2aae
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/114892
Tested-by: Jenkins
Reviewed-by: Luboš Luňák <l.lunak@collabora.com>
-rw-r--r-- | external/liborcus/UnpackedTarball_liborcus.mk | 4 | ||||
-rw-r--r-- | external/liborcus/allow-utf-8-in-xml-names.patch | 263 |
2 files changed, 267 insertions, 0 deletions
diff --git a/external/liborcus/UnpackedTarball_liborcus.mk b/external/liborcus/UnpackedTarball_liborcus.mk index 94ef29f80462..aecf528e364b 100644 --- a/external/liborcus/UnpackedTarball_liborcus.mk +++ b/external/liborcus/UnpackedTarball_liborcus.mk @@ -26,6 +26,10 @@ $(eval $(call gb_UnpackedTarball_add_patches,liborcus,\ external/liborcus/0001-Prevent-unsigned-integer-underflow.patch \ )) +$(eval $(call gb_UnpackedTarball_add_patches,liborcus,\ + external/liborcus/allow-utf-8-in-xml-names.patch \ +)) + ifeq ($(OS),WNT) $(eval $(call gb_UnpackedTarball_add_patches,liborcus,\ external/liborcus/windows-constants-hack.patch \ diff --git a/external/liborcus/allow-utf-8-in-xml-names.patch b/external/liborcus/allow-utf-8-in-xml-names.patch new file mode 100644 index 000000000000..d77957d9dae1 --- /dev/null +++ b/external/liborcus/allow-utf-8-in-xml-names.patch @@ -0,0 +1,263 @@ +From eda114350863a2543a835321fc3fe55e7858400b Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Lubo=C5=A1=20Lu=C5=88=C3=A1k?= <l.lunak@centrum.cz> +Date: Thu, 29 Apr 2021 19:12:20 +0200 +Subject: [PATCH] allow utf-8 in xml names (#137) + +https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar +has a list of all allowed characters. 
// Decode a single UTF-8 encoded character starting at p.
//
// p_end is treated as one past the last readable byte (half-open range).
// NOTE(review): confirm this matches how parser_base maintains mp_end.
//
// On success stores the decoded value in code_point and returns the number
// of bytes the character occupies (1-4). Returns 0 if the range is empty,
// the sequence is truncated, a continuation byte is malformed, the encoding
// is overlong, or the value is a surrogate or lies above U+10FFFF.
static int decode_utf8_sequence(const char* p, const char* p_end, unsigned long& code_point)
{
    if (p >= p_end)
        return 0;

    const unsigned char lead = static_cast<unsigned char>(p[0]);
    if (lead < 0x80)
    {
        code_point = lead;
        return 1;
    }

    int length = 0;
    unsigned long smallest = 0; // smallest value that legitimately needs 'length' bytes
    if (lead >= 0xc2 && lead <= 0xdf)
    {
        // 0xc0 and 0xc1 could only start overlong encodings of ASCII.
        length = 2;
        code_point = lead & 0x1f;
        smallest = 0x80;
    }
    else if (lead >= 0xe0 && lead <= 0xef)
    {
        length = 3;
        code_point = lead & 0x0f;
        smallest = 0x800;
    }
    else if (lead >= 0xf0 && lead <= 0xf4)
    {
        length = 4;
        code_point = lead & 0x07;
        smallest = 0x10000;
    }
    else
    {
        // Stray continuation byte (0x80-0xbf) or a byte that never occurs in UTF-8.
        return 0;
    }

    // All bytes of the sequence must be inside the buffer before any is read.
    if (p_end - p < length)
        return 0;

    for (int i = 1; i < length; ++i)
    {
        const unsigned char c = static_cast<unsigned char>(p[i]);
        if ((c & 0xc0) != 0x80) // every trailing byte must look like 10xxxxxx
            return 0;
        code_point = (code_point << 6) | (c & 0x3f);
    }

    if (code_point < smallest)
        return 0; // overlong encoding
    if (code_point >= 0xd800 && code_point <= 0xdfff)
        return 0; // UTF-16 surrogates are not characters
    if (code_point > 0x10ffff)
        return 0;

    return length;
}

// Non-ASCII part of the NameStartChar production:
// https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
static bool is_non_ascii_name_start_code_point(unsigned long cp)
{
    return (cp >= 0xc0 && cp <= 0xd6)
        || (cp >= 0xd8 && cp <= 0xf6)
        || (cp >= 0xf8 && cp <= 0x2ff)
        || (cp >= 0x370 && cp <= 0x37d)
        || (cp >= 0x37f && cp <= 0x1fff)
        || (cp >= 0x200c && cp <= 0x200d)
        || (cp >= 0x2070 && cp <= 0x218f)
        || (cp >= 0x2c00 && cp <= 0x2fef)
        || (cp >= 0x3001 && cp <= 0xd7ff)
        || (cp >= 0xf900 && cp <= 0xfdcf)
        || (cp >= 0xfdf0 && cp <= 0xfffd)
        || (cp >= 0x10000 && cp <= 0xeffff);
}

// Non-ASCII characters that NameChar allows in addition to NameStartChar:
// #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
static bool is_non_ascii_name_only_code_point(unsigned long cp)
{
    return cp == 0xb7
        || (cp >= 0x300 && cp <= 0x36f)
        || (cp >= 0x203f && cp <= 0x2040);
}

// https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
// Return length of the character in bytes, otherwise 0.
//
// With only_start_name == true the character at mp_char is checked against
// NameStartChar, otherwise against NameChar. The input is decoded as UTF-8;
// anything that is not one complete, well-formed sequence inside
// [mp_char, mp_end) yields 0, so a name never extends over malformed bytes
// or past the end of the buffer.
template< bool only_start_name >
static
int is_name_char_helper(const char* mp_char, const char* mp_end)
{
    unsigned long code_point = 0;
    const int length = decode_utf8_sequence(mp_char, mp_end, code_point);
    if (length == 0)
        return 0;

    if (length == 1)
    {
        const unsigned char first = static_cast<unsigned char>(code_point);
        // Note that ':' technically is an allowed name character, but it is handled separately
        // e.g. in element_name(), so here pretend it isn't.
        if (/*first == ':' ||*/ first == '_' || (first >= 'A' && first <= 'Z') || (first >= 'a' && first <= 'z'))
            return 1;
        if (!only_start_name && (first == '-' || first == '.' || (first >= '0' && first <= '9')))
            return 1;
        return 0; // other ascii characters are not allowed
    }

    if (is_non_ascii_name_start_code_point(code_point))
        return length;
    if (!only_start_name && is_non_ascii_name_only_code_point(code_point))
        return length;

    return 0;
}
jméno="Žužla"> ++ <Nožičky>4</Nožičky> ++</Myšička> +-- +2.26.2 + |