diff options
author | Tor Lillqvist <tml@collabora.com> | 2021-01-15 03:03:41 +0200 |
---|---|---|
committer | Tor Lillqvist <tml@collabora.com> | 2021-01-15 08:54:12 +0100 |
commit | 5aee16cf09f9d4ba50feaf804b2a7a649af276bc (patch) | |
tree | 9fc4c9e3b4a9f1f201b00fbe142de93a2b24b0bd /tools | |
parent | 84a6cfd9bb6532602ca811b0d5daf016bb9b4578 (diff) |
Make JsonWriter::writeEscapedOUString() handle surrogate pairs properly
It is wrong to iterate over UTF-16 code units one by one. We have
OUString::iterateCodePoints() to iterate over Unicode code points.
The two UTF-16 code units of a surrogate pair (for a non-BMP code
point) should not be encoded separately to UTF-8 bytes. It is the code
point that should be encoded (to four bytes).
Change-Id: Ica4341308deb6618c9c2da8dcee8a11ef4e8238d
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/109318
Tested-by: Jenkins
Reviewed-by: Noel Grandin <noel.grandin@collabora.co.uk>
Reviewed-by: Tor Lillqvist <tml@collabora.com>
Diffstat (limited to 'tools')
-rw-r--r-- | tools/source/misc/json_writer.cxx | 18 |
1 files changed, 15 insertions, 3 deletions
diff --git a/tools/source/misc/json_writer.cxx b/tools/source/misc/json_writer.cxx
index a50e2ada967f..c326201eb9e5 100644
--- a/tools/source/misc/json_writer.cxx
+++ b/tools/source/misc/json_writer.cxx
@@ -123,9 +123,10 @@ void JsonWriter::endStruct()
 void JsonWriter::writeEscapedOUString(const OUString& rPropVal)
 {
     // Convert from UTF-16 to UTF-8 and perform escaping
-    for (int i = 0; i < rPropVal.getLength(); ++i)
+    sal_Int32 i = 0;
+    while (i < rPropVal.getLength())
     {
-        sal_Unicode ch = rPropVal[i];
+        sal_uInt32 ch = rPropVal.iterateCodePoints(&i);
         if (ch == '\\')
         {
             *mPos = static_cast<char>(ch);
@@ -173,7 +174,7 @@ void JsonWriter::writeEscapedOUString(const OUString& rPropVal)
             *mPos = 0x80 | (ch & 0x3F); /* 10xxxxxx */
             ++mPos;
         }
-        else
+        else if (ch <= 0xFFFF)
         {
             *mPos = 0xE0 | (ch >> 12); /* 1110xxxx */
             ++mPos;
@@ -182,6 +183,17 @@ void JsonWriter::writeEscapedOUString(const OUString& rPropVal)
             *mPos = 0x80 | (ch & 0x3F); /* 10xxxxxx */
             ++mPos;
         }
+        else
+        {
+            *mPos = 0xF0 | (ch >> 18); /* 11110xxx */
+            ++mPos;
+            *mPos = 0x80 | ((ch >> 12) & 0x3F); /* 10xxxxxx */
+            ++mPos;
+            *mPos = 0x80 | ((ch >> 6) & 0x3F); /* 10xxxxxx */
+            ++mPos;
+            *mPos = 0x80 | (ch & 0x3F); /* 10xxxxxx */
+            ++mPos;
+        }
     }
 }