summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Luo Jinghua <sunmoon1997@gmail.com> 2010-01-09 00:30:19 +0800
committer Luo Jinghua <sunmoon1997@gmail.com> 2010-01-09 00:30:19 +0800
commit 03da7d1dadfe9f3f2ed080e29adfc448c21fbff9 (patch)
tree 39b8767e12f154bc7518cf4d10e163fe47f9eee5
parent 3aeaf3c5450f27e87cf9d887f0935ee369a6cd1f (diff)
utfconvert: fix utf-16-be encoding and handle the BOM while decoding
-rw-r--r-- utfconverter.c 28
1 file changed, 24 insertions, 4 deletions
diff --git a/utfconverter.c b/utfconverter.c
index b2384d3..e031cba 100644
--- a/utfconverter.c
+++ b/utfconverter.c
@@ -8,8 +8,8 @@
#include <stdio.h>
#include <assert.h>
-#define FLAG_USE_BOM_ENDIAN 1 << 0
-
+#define FLAG_USE_BOM_ENDIAN (1 << 0)
+#define FLAG_DONE_BOM_ENDIAN (1 << 1)
struct utfconverter {
struct converter base;
@@ -131,12 +131,13 @@ utf16_encode(struct converter *conv,
static int
utf16_decode(struct converter *conv,
- const char **inbuf,
+ const char **sinbuf,
size_t inleft,
uc_char_t **outbuf,
size_t outleft)
{
struct utfconverter *uc = (struct utfconverter*)conv;
+ const uc_uint8_t **inbuf = (const uc_uint8_t **)sinbuf;
while (inleft) {
uc_char_t unichar;
@@ -170,6 +171,13 @@ utf16_decode(struct converter *conv,
if (!outleft)
return UNICONV_E2BIG;
+ /* BOM */
+ if (unichar == 0xfffe && uc->flags & FLAG_USE_BOM_ENDIAN &&
+ !(uc->flags & FLAG_DONE_BOM_ENDIAN)) {
+ uc->flags &= ~FLAG_DONE_BOM_ENDIAN;
+ uc->little_endian ^= 1;
+ unichar = 0xfeff;
+ }
**outbuf = unichar;
(*outbuf) += 1;
outleft -= 1;
@@ -219,12 +227,13 @@ utf32_encode(struct converter *conv,
static int
utf32_decode(struct converter *conv,
- const char **inbuf,
+ const char **sinbuf,
size_t inleft,
uc_char_t **outbuf,
size_t outleft)
{
struct utfconverter *uc = (struct utfconverter*)conv;
+ const uc_uint8_t **inbuf = (const uc_uint8_t **)sinbuf;
if (inleft & 3)
return UNICONV_EINVAL;
@@ -244,6 +253,15 @@ utf32_decode(struct converter *conv,
((*inbuf)[2] << 8) |
((*inbuf)[1] << 16) |
((*inbuf)[0] << 24);
+
+ /* BOM */
+ if (**outbuf == 0xfffe && uc->flags & FLAG_USE_BOM_ENDIAN &&
+ !(uc->flags & FLAG_DONE_BOM_ENDIAN)) {
+ uc->flags &= ~FLAG_DONE_BOM_ENDIAN;
+ uc->little_endian ^= 1;
+ **outbuf = 0xfeff;
+ }
+
(*inbuf) += 4;
(*outbuf) += 1;
inleft -= 4;
@@ -267,6 +285,7 @@ utfconverter_reset(struct converter *suc)
/* default to host endian, should be big endian? */
if (uc->flags & FLAG_USE_BOM_ENDIAN)
uc->little_endian = !is_big_endian();
+ uc->flags &= ~FLAG_DONE_BOM_ENDIAN;
}
struct converter *
@@ -302,6 +321,7 @@ utfconverter_open(const char *charset)
} else if (!strcmp(charset, "utf_16_be")) {
conv->base.encode = utf16_encode;
conv->base.decode = utf16_decode;
+ conv->little_endian = 0;
} else if (!strcmp(charset, "utf_32")) {
conv->base.encode = utf32_encode;
conv->base.decode = utf32_decode;