diff options
author | Luo Jinghua <sunmoon1997@gmail.com> | 2010-01-09 00:30:19 +0800 |
---|---|---|
committer | Luo Jinghua <sunmoon1997@gmail.com> | 2010-01-09 00:30:19 +0800 |
commit | 03da7d1dadfe9f3f2ed080e29adfc448c21fbff9 (patch) | |
tree | 39b8767e12f154bc7518cf4d10e163fe47f9eee5 | |
parent | 3aeaf3c5450f27e87cf9d887f0935ee369a6cd1f (diff) |
utfconvert: fix UTF-16-BE encoding and handle the BOM while decoding
-rw-r--r-- | utfconverter.c | 28 |
1 files changed, 24 insertions, 4 deletions
diff --git a/utfconverter.c b/utfconverter.c index b2384d3..e031cba 100644 --- a/utfconverter.c +++ b/utfconverter.c @@ -8,8 +8,8 @@ #include <stdio.h> #include <assert.h> -#define FLAG_USE_BOM_ENDIAN 1 << 0 - +#define FLAG_USE_BOM_ENDIAN (1 << 0) +#define FLAG_DONE_BOM_ENDIAN (1 << 1) struct utfconverter { struct converter base; @@ -131,12 +131,13 @@ utf16_encode(struct converter *conv, static int utf16_decode(struct converter *conv, - const char **inbuf, + const char **sinbuf, size_t inleft, uc_char_t **outbuf, size_t outleft) { struct utfconverter *uc = (struct utfconverter*)conv; + const uc_uint8_t **inbuf = (const uc_uint8_t **)sinbuf; while (inleft) { uc_char_t unichar; @@ -170,6 +171,13 @@ utf16_decode(struct converter *conv, if (!outleft) return UNICONV_E2BIG; + /* BOM */ + if (unichar == 0xfffe && uc->flags & FLAG_USE_BOM_ENDIAN && + !(uc->flags & FLAG_DONE_BOM_ENDIAN)) { + uc->flags &= ~FLAG_DONE_BOM_ENDIAN; + uc->little_endian ^= 1; + unichar = 0xfeff; + } **outbuf = unichar; (*outbuf) += 1; outleft -= 1; @@ -219,12 +227,13 @@ utf32_encode(struct converter *conv, static int utf32_decode(struct converter *conv, - const char **inbuf, + const char **sinbuf, size_t inleft, uc_char_t **outbuf, size_t outleft) { struct utfconverter *uc = (struct utfconverter*)conv; + const uc_uint8_t **inbuf = (const uc_uint8_t **)sinbuf; if (inleft & 3) return UNICONV_EINVAL; @@ -244,6 +253,15 @@ utf32_decode(struct converter *conv, ((*inbuf)[2] << 8) | ((*inbuf)[1] << 16) | ((*inbuf)[0] << 24); + + /* BOM */ + if (**outbuf == 0xfffe && uc->flags & FLAG_USE_BOM_ENDIAN && + !(uc->flags & FLAG_DONE_BOM_ENDIAN)) { + uc->flags &= ~FLAG_DONE_BOM_ENDIAN; + uc->little_endian ^= 1; + **outbuf = 0xfeff; + } + (*inbuf) += 4; (*outbuf) += 1; inleft -= 4; @@ -267,6 +285,7 @@ utfconverter_reset(struct converter *suc) /* default to host endian, should be big endian? 
*/ if (uc->flags & FLAG_USE_BOM_ENDIAN) uc->little_endian = !is_big_endian(); + uc->flags &= ~FLAG_DONE_BOM_ENDIAN; } struct converter * @@ -302,6 +321,7 @@ utfconverter_open(const char *charset) } else if (!strcmp(charset, "utf_16_be")) { conv->base.encode = utf16_encode; conv->base.decode = utf16_decode; + conv->little_endian = 0; } else if (!strcmp(charset, "utf_32")) { conv->base.encode = utf32_encode; conv->base.decode = utf32_decode; |