diff options
author | Luo Jinghua <sunmoon1997@gmail.com> | 2010-01-08 22:54:04 +0800 |
---|---|---|
committer | Luo Jinghua <sunmoon1997@gmail.com> | 2010-01-08 22:54:04 +0800 |
commit | 3aeaf3c5450f27e87cf9d887f0935ee369a6cd1f (patch) | |
tree | 37fcdf7bc1f391205973c490feae1bc1db33b608 | |
parent | 5b798c4039eae1e8bfdba96f3b3e7be736ea8da8 (diff) |
utfconverter: support utf_16_le, utf_16_be, utf_32_le and utf_32_be
-rw-r--r-- | utfconverter.c | 172 |
1 file changed, 147 insertions, 25 deletions
diff --git a/utfconverter.c b/utfconverter.c index 04a8199..b2384d3 100644 --- a/utfconverter.c +++ b/utfconverter.c @@ -8,11 +8,27 @@ #include <stdio.h> #include <assert.h> +#define FLAG_USE_BOM_ENDIAN 1 << 0 + struct utfconverter { struct converter base; + + unsigned int flags; + int little_endian; }; static int +is_big_endian() +{ + static const union { + int iv; + char cv[4]; + } u = { 0x12345678 }; + + return u.cv[0] == 0x12; +} + +static int utf8_encode(struct converter *conv, const uc_char_t **inbuf, size_t inleft, @@ -75,23 +91,39 @@ utf16_encode(struct converter *conv, char **outbuf, size_t outleft) { + struct utfconverter *uc = (struct utfconverter*)conv; size_t i; - uc_uint16_t **soutbuf = (uc_uint16_t **)outbuf; if (!inbuf) return UNICONV_SUCCESS; for (i = 0; i < inleft; i++) { int seqlen = ucs4toutf16(**inbuf, NULL); + uc_uint16_t utf16[2]; if (seqlen < 0) return UNICONV_EILSEQ; - if (seqlen * sizeof(uc_uint16_t) > outleft) + if (seqlen * 2 > outleft) return UNICONV_E2BIG; - ucs4toutf16(**inbuf, *soutbuf); + ucs4toutf16(**inbuf, utf16); + if (uc->little_endian) { + (*outbuf)[0] = utf16[0] & 0xff; + (*outbuf)[1] = utf16[0] >> 8; + if (seqlen == 2) { + (*outbuf)[2] = utf16[1] & 0xff; + (*outbuf)[3] = utf16[1] >> 8; + } + } else { + (*outbuf)[0] = utf16[0] >> 8; + (*outbuf)[1] = utf16[0] & 0xff; + if (seqlen == 2) { + (*outbuf)[2] = utf16[1] >> 8; + (*outbuf)[3] = utf16[1] & 0xff; + } + } (*inbuf) += 1; - (*soutbuf) += seqlen; - outleft -= seqlen; + (*outbuf) += seqlen * 2; + outleft -= seqlen * 2; } return UNICONV_SUCCESS; @@ -104,12 +136,33 @@ utf16_decode(struct converter *conv, uc_char_t **outbuf, size_t outleft) { - const uc_uint16_t **sinbuf = (const uc_uint16_t**)inbuf; + struct utfconverter *uc = (struct utfconverter*)conv; while (inleft) { uc_char_t unichar; - int seqlen = ucs4fromutf16(*sinbuf, &unichar, - inleft / sizeof(uc_uint16_t)); + uc_uint16_t utf16[2]; + int seqlen; + + if (inleft < 2) + return UNICONV_EINVAL; + + if (uc->little_endian) 
+ utf16[0] = (*inbuf)[1] << 8 | (*inbuf)[0]; + else + utf16[0] = (*inbuf)[0] << 8 | (*inbuf)[1]; + if (utf16[0] >= 0xd800 && utf16[0] <= 0xbeff) + seqlen = 2; + else + seqlen = 1; + if (inleft < seqlen * 2) + return UNICONV_EINVAL; + if (seqlen == 2) { + if (uc->little_endian) + utf16[1] = (*inbuf)[3] << 8 | (*inbuf)[2]; + else + utf16[1] = (*inbuf)[2] << 8 | (*inbuf)[3]; + } + seqlen = ucs4fromutf16(utf16, &unichar, seqlen); if (seqlen == -2) return UNICONV_EINVAL; else if (seqlen < 0) @@ -121,8 +174,8 @@ utf16_decode(struct converter *conv, (*outbuf) += 1; outleft -= 1; - (*sinbuf) += seqlen; - inleft -= seqlen * sizeof(uc_uint16_t); + (*inbuf) += seqlen * 2; + inleft -= seqlen * 2; } return UNICONV_SUCCESS; @@ -135,33 +188,68 @@ utf32_encode(struct converter *conv, char **outbuf, size_t outleft) { + struct utfconverter *uc = (struct utfconverter*)conv; + if (!inbuf) return UNICONV_SUCCESS; - if (inleft * sizeof(uc_char_t) > outleft) + if (inleft * 4 > outleft) return UNICONV_E2BIG; - memcpy(*outbuf, *inbuf, inleft * sizeof(uc_char_t)); - (*inbuf) += inleft; - (*outbuf) += inleft * sizeof(uc_char_t); + while (inleft) { + if (uc->little_endian) { + (*outbuf)[0] = ((**inbuf) & 0x000000ff) >> 0; + (*outbuf)[1] = ((**inbuf) & 0x0000ff00) >> 8; + (*outbuf)[2] = ((**inbuf) & 0x00ff0000) >> 16; + (*outbuf)[3] = ((**inbuf) & 0xff000000) >> 24; + } else { + (*outbuf)[3] = ((**inbuf) & 0x000000ff) >> 0; + (*outbuf)[2] = ((**inbuf) & 0x0000ff00) >> 8; + (*outbuf)[1] = ((**inbuf) & 0x00ff0000) >> 16; + (*outbuf)[0] = ((**inbuf) & 0xff000000) >> 24; + } + (*inbuf) += 1; + (*outbuf) += 4; + inleft -= 1; + outleft -= 4; + } + return UNICONV_SUCCESS; } static int utf32_decode(struct converter *conv, - const char **inbuf, - size_t inleft, - uc_char_t **outbuf, - size_t outleft) + const char **inbuf, + size_t inleft, + uc_char_t **outbuf, + size_t outleft) { + struct utfconverter *uc = (struct utfconverter*)conv; + if (inleft & 3) return UNICONV_EINVAL; - if (inleft > outleft) + 
if (inleft / 4 > outleft) return UNICONV_E2BIG; - memcpy(*outbuf, *inbuf, inleft); - (*inbuf) += inleft; - (*outbuf) += inleft / sizeof(uc_char_t); + while (inleft) { + if (uc->little_endian) + **outbuf = + ((*inbuf)[0] << 0) | + ((*inbuf)[1] << 8) | + ((*inbuf)[2] << 16) | + ((*inbuf)[3] << 24); + else + **outbuf = + ((*inbuf)[3] << 0) | + ((*inbuf)[2] << 8) | + ((*inbuf)[1] << 16) | + ((*inbuf)[0] << 24); + (*inbuf) += 4; + (*outbuf) += 1; + inleft -= 4; + outleft -= 1; + } + return UNICONV_SUCCESS; } @@ -171,30 +259,64 @@ utfconverter_close(struct converter *conv) free(conv); } +static void +utfconverter_reset(struct converter *suc) +{ + struct utfconverter *uc = (struct utfconverter*)suc; + + /* default to host endian, should be big endian? */ + if (uc->flags & FLAG_USE_BOM_ENDIAN) + uc->little_endian = !is_big_endian(); +} + struct converter * utfconverter_open(const char *charset) { struct utfconverter *conv; - if (strcmp(charset, "utf_8") && strcmp(charset, "utf_16") && - strcmp(charset, "utf_32")) + if (strcmp(charset, "utf_8") && + strcmp(charset, "utf_16") && + strcmp(charset, "utf_16_le") && + strcmp(charset, "utf_16_be") && + strcmp(charset, "utf_32") && + strcmp(charset, "utf_32_le") && + strcmp(charset, "utf_32_be")) return NULL; conv = malloc(sizeof(struct utfconverter)); if (!conv) return NULL; + conv->flags = 0; + conv->little_endian = !is_big_endian(); if (!strcmp(charset, "utf_8")) { conv->base.encode = utf8_encode; conv->base.decode = utf8_decode; } else if (!strcmp(charset, "utf_16")) { conv->base.encode = utf16_encode; conv->base.decode = utf16_decode; + conv->flags |= FLAG_USE_BOM_ENDIAN; + } else if (!strcmp(charset, "utf_16_le")) { + conv->base.encode = utf16_encode; + conv->base.decode = utf16_decode; + conv->little_endian = 1; + } else if (!strcmp(charset, "utf_16_be")) { + conv->base.encode = utf16_encode; + conv->base.decode = utf16_decode; } else if (!strcmp(charset, "utf_32")) { conv->base.encode = utf32_encode; conv->base.decode = 
utf32_decode; + conv->flags |= FLAG_USE_BOM_ENDIAN; + } else if (!strcmp(charset, "utf_32_le")) { + conv->base.encode = utf32_encode; + conv->base.decode = utf32_decode; + conv->little_endian = 1; + } else if (!strcmp(charset, "utf_32_be")) { + conv->base.encode = utf32_encode; + conv->base.decode = utf32_decode; + conv->little_endian = 0; } conv->base.close = utfconverter_close; - conv->base.reset = NULL; + conv->base.reset = utfconverter_reset; return &conv->base; } |