summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Luo Jinghua <sunmoon1997@gmail.com> 2010-01-08 22:54:04 +0800
committer Luo Jinghua <sunmoon1997@gmail.com> 2010-01-08 22:54:04 +0800
commit 3aeaf3c5450f27e87cf9d887f0935ee369a6cd1f (patch)
tree 37fcdf7bc1f391205973c490feae1bc1db33b608
parent 5b798c4039eae1e8bfdba96f3b3e7be736ea8da8 (diff)
utfconverter: support utf_16_le, utf_16_be, utf_32_le and utf_32_be
-rw-r--r--utfconverter.c172
1 files changed, 147 insertions, 25 deletions
diff --git a/utfconverter.c b/utfconverter.c
index 04a8199..b2384d3 100644
--- a/utfconverter.c
+++ b/utfconverter.c
@@ -8,11 +8,27 @@
#include <stdio.h>
#include <assert.h>
+#define FLAG_USE_BOM_ENDIAN (1 << 0) /* set for "utf_16"/"utf_32"; utfconverter_reset() then restores host byte order */
+
struct utfconverter {
struct converter base;
+
+ unsigned int flags; /* FLAG_* bits; cleared in utfconverter_open() before the charset is matched */
+ int little_endian; /* nonzero: encode/decode put the least significant byte of each code unit first */
};
static int
+is_big_endian(void)
+{
+ /*
+ * Host byte-order probe: store a known int and inspect the byte that
+ * lands first in memory. Returns nonzero when that byte is the most
+ * significant one. Assumes int is 4 bytes wide.
+ */
+ static const union {
+ int iv;
+ char cv[4];
+ } u = { 0x12345678 };
+
+ return u.cv[0] == 0x12;
+}
+
+static int
utf8_encode(struct converter *conv,
const uc_char_t **inbuf,
size_t inleft,
@@ -75,23 +91,39 @@ utf16_encode(struct converter *conv,
char **outbuf,
size_t outleft)
{
+ struct utfconverter *uc = (struct utfconverter*)conv; /* conv is the base member that utfconverter_open() returns */
size_t i;
- uc_uint16_t **soutbuf = (uc_uint16_t **)outbuf;
if (!inbuf)
return UNICONV_SUCCESS;
for (i = 0; i < inleft; i++) {
int seqlen = ucs4toutf16(**inbuf, NULL);
+ uc_uint16_t utf16[2]; /* scratch for the one or two code units of this character */
if (seqlen < 0)
return UNICONV_EILSEQ;
- if (seqlen * sizeof(uc_uint16_t) > outleft)
+ if (seqlen * 2 > outleft) /* every code unit is written as two bytes */
return UNICONV_E2BIG;
- ucs4toutf16(**inbuf, *soutbuf);
+ ucs4toutf16(**inbuf, utf16); /* encode into scratch, then serialize byte by byte below */
+ if (uc->little_endian) { /* low byte of each code unit first */
+ (*outbuf)[0] = utf16[0] & 0xff;
+ (*outbuf)[1] = utf16[0] >> 8;
+ if (seqlen == 2) {
+ (*outbuf)[2] = utf16[1] & 0xff;
+ (*outbuf)[3] = utf16[1] >> 8;
+ }
+ } else { /* high byte of each code unit first */
+ (*outbuf)[0] = utf16[0] >> 8;
+ (*outbuf)[1] = utf16[0] & 0xff;
+ if (seqlen == 2) {
+ (*outbuf)[2] = utf16[1] >> 8;
+ (*outbuf)[3] = utf16[1] & 0xff;
+ }
+ }
(*inbuf) += 1;
- (*soutbuf) += seqlen;
- outleft -= seqlen;
+ (*outbuf) += seqlen * 2; /* advance past the bytes just written */
+ outleft -= seqlen * 2;
}
return UNICONV_SUCCESS;
@@ -104,12 +136,33 @@ utf16_decode(struct converter *conv,
uc_char_t **outbuf,
size_t outleft)
{
- const uc_uint16_t **sinbuf = (const uc_uint16_t**)inbuf;
+ struct utfconverter *uc = (struct utfconverter*)conv;
while (inleft) {
uc_char_t unichar;
- int seqlen = ucs4fromutf16(*sinbuf, &unichar,
- inleft / sizeof(uc_uint16_t));
+ uc_uint16_t utf16[2];
+ int seqlen;
+
+ /* a code unit is two bytes; fewer left means truncated input */
+ if (inleft < 2)
+ return UNICONV_EINVAL;
+
+ /*
+ * Assemble code units from unsigned bytes: plain char may be
+ * signed, and a byte >= 0x80 would sign-extend into the other
+ * half of the code unit.
+ */
+ if (uc->little_endian)
+ utf16[0] = (unsigned char)(*inbuf)[1] << 8 | (unsigned char)(*inbuf)[0];
+ else
+ utf16[0] = (unsigned char)(*inbuf)[0] << 8 | (unsigned char)(*inbuf)[1];
+ /* high (leading) surrogates are 0xd800..0xdbff and need a second unit */
+ if (utf16[0] >= 0xd800 && utf16[0] <= 0xdbff)
+ seqlen = 2;
+ else
+ seqlen = 1;
+ if (inleft < seqlen * 2)
+ return UNICONV_EINVAL;
+ if (seqlen == 2) {
+ if (uc->little_endian)
+ utf16[1] = (unsigned char)(*inbuf)[3] << 8 | (unsigned char)(*inbuf)[2];
+ else
+ utf16[1] = (unsigned char)(*inbuf)[2] << 8 | (unsigned char)(*inbuf)[3];
+ }
+ /* seqlen is reused: in = units available, out = units consumed or a negative error */
+ seqlen = ucs4fromutf16(utf16, &unichar, seqlen);
if (seqlen == -2)
return UNICONV_EINVAL;
else if (seqlen < 0)
@@ -121,8 +174,8 @@ utf16_decode(struct converter *conv,
(*outbuf) += 1;
outleft -= 1;
- (*sinbuf) += seqlen;
- inleft -= seqlen * sizeof(uc_uint16_t);
+ (*inbuf) += seqlen * 2;
+ inleft -= seqlen * 2;
}
return UNICONV_SUCCESS;
@@ -135,33 +188,68 @@ utf32_encode(struct converter *conv,
char **outbuf,
size_t outleft)
{
+ struct utfconverter *uc = (struct utfconverter*)conv; /* conv is the base member that utfconverter_open() returns */
+
if (!inbuf)
return UNICONV_SUCCESS;
- if (inleft * sizeof(uc_char_t) > outleft)
+ if (inleft * 4 > outleft) /* each input character is written as four bytes */
return UNICONV_E2BIG;
- memcpy(*outbuf, *inbuf, inleft * sizeof(uc_char_t));
- (*inbuf) += inleft;
- (*outbuf) += inleft * sizeof(uc_char_t);
+ while (inleft) { /* serialize one character per iteration */
+ if (uc->little_endian) { /* least significant byte at offset 0 */
+ (*outbuf)[0] = ((**inbuf) & 0x000000ff) >> 0;
+ (*outbuf)[1] = ((**inbuf) & 0x0000ff00) >> 8;
+ (*outbuf)[2] = ((**inbuf) & 0x00ff0000) >> 16;
+ (*outbuf)[3] = ((**inbuf) & 0xff000000) >> 24;
+ } else { /* most significant byte at offset 0 */
+ (*outbuf)[3] = ((**inbuf) & 0x000000ff) >> 0;
+ (*outbuf)[2] = ((**inbuf) & 0x0000ff00) >> 8;
+ (*outbuf)[1] = ((**inbuf) & 0x00ff0000) >> 16;
+ (*outbuf)[0] = ((**inbuf) & 0xff000000) >> 24;
+ }
+ (*inbuf) += 1;
+ (*outbuf) += 4;
+ inleft -= 1;
+ outleft -= 4;
+ }
+
return UNICONV_SUCCESS;
}
+/*
+ * Decode UTF-32 bytes from *inbuf into characters at *outbuf, using the
+ * converter's byte order. inleft is a byte count, outleft a character count.
+ * Returns UNICONV_EINVAL when inleft is not a multiple of 4, UNICONV_E2BIG
+ * when the output cannot hold inleft / 4 characters, else UNICONV_SUCCESS.
+ * Both buffer pointers are advanced past what was consumed and produced.
+ */
static int
utf32_decode(struct converter *conv,
- const char **inbuf,
- size_t inleft,
- uc_char_t **outbuf,
- size_t outleft)
+ const char **inbuf,
+ size_t inleft,
+ uc_char_t **outbuf,
+ size_t outleft)
{
+ struct utfconverter *uc = (struct utfconverter*)conv;
+
if (inleft & 3)
return UNICONV_EINVAL;
- if (inleft > outleft)
+ if (inleft / 4 > outleft)
return UNICONV_E2BIG;
- memcpy(*outbuf, *inbuf, inleft);
- (*inbuf) += inleft;
- (*outbuf) += inleft / sizeof(uc_char_t);
+ while (inleft) {
+ /*
+ * Read bytes as unsigned and widen before shifting: plain char
+ * may be signed (a byte >= 0x80 would sign-extend over the other
+ * bytes), and shifting a promoted int left by 24 can overflow.
+ */
+ const unsigned char *in = (const unsigned char *)*inbuf;
+
+ if (uc->little_endian)
+ **outbuf =
+ ((unsigned long)in[0] << 0) |
+ ((unsigned long)in[1] << 8) |
+ ((unsigned long)in[2] << 16) |
+ ((unsigned long)in[3] << 24);
+ else
+ **outbuf =
+ ((unsigned long)in[3] << 0) |
+ ((unsigned long)in[2] << 8) |
+ ((unsigned long)in[1] << 16) |
+ ((unsigned long)in[0] << 24);
+ (*inbuf) += 4;
+ (*outbuf) += 1;
+ inleft -= 4;
+ outleft -= 1;
+ }
+
return UNICONV_SUCCESS;
}
@@ -171,30 +259,64 @@ utfconverter_close(struct converter *conv)
free(conv);
}
+static void
+utfconverter_reset(struct converter *suc)
+{
+ struct utfconverter *self = (struct utfconverter*)suc;
+
+ /* converters without FLAG_USE_BOM_ENDIAN keep the byte order they have */
+ if (!(self->flags & FLAG_USE_BOM_ENDIAN))
+ return;
+
+ /* fall back to the host byte order (open question: should this be big endian?) */
+ self->little_endian = !is_big_endian();
+}
+
+/*
+ * Create a converter for one of the charset names "utf_8", "utf_16",
+ * "utf_16_le", "utf_16_be", "utf_32", "utf_32_le" or "utf_32_be".
+ * Returns NULL for any other name or when allocation fails. The result
+ * is released through its base.close hook (utfconverter_close).
+ */
struct converter *
utfconverter_open(const char *charset)
{
struct utfconverter *conv;
- if (strcmp(charset, "utf_8") && strcmp(charset, "utf_16") &&
- strcmp(charset, "utf_32"))
+ if (strcmp(charset, "utf_8") &&
+ strcmp(charset, "utf_16") &&
+ strcmp(charset, "utf_16_le") &&
+ strcmp(charset, "utf_16_be") &&
+ strcmp(charset, "utf_32") &&
+ strcmp(charset, "utf_32_le") &&
+ strcmp(charset, "utf_32_be"))
return NULL;
conv = malloc(sizeof(struct utfconverter));
if (!conv)
return NULL;
+ conv->flags = 0;
+ conv->little_endian = !is_big_endian(); /* default: host byte order; the _le/_be branches override it */
if (!strcmp(charset, "utf_8")) {
conv->base.encode = utf8_encode;
conv->base.decode = utf8_decode;
} else if (!strcmp(charset, "utf_16")) {
conv->base.encode = utf16_encode;
conv->base.decode = utf16_decode;
+ conv->flags |= FLAG_USE_BOM_ENDIAN;
+ } else if (!strcmp(charset, "utf_16_le")) {
+ conv->base.encode = utf16_encode;
+ conv->base.decode = utf16_decode;
+ conv->little_endian = 1;
+ } else if (!strcmp(charset, "utf_16_be")) {
+ conv->base.encode = utf16_encode;
+ conv->base.decode = utf16_decode;
+ /* must be forced: the host default above is little endian on LE machines */
+ conv->little_endian = 0;
} else if (!strcmp(charset, "utf_32")) {
conv->base.encode = utf32_encode;
conv->base.decode = utf32_decode;
+ conv->flags |= FLAG_USE_BOM_ENDIAN;
+ } else if (!strcmp(charset, "utf_32_le")) {
+ conv->base.encode = utf32_encode;
+ conv->base.decode = utf32_decode;
+ conv->little_endian = 1;
+ } else if (!strcmp(charset, "utf_32_be")) {
+ conv->base.encode = utf32_encode;
+ conv->base.decode = utf32_decode;
+ conv->little_endian = 0;
}
conv->base.close = utfconverter_close;
- conv->base.reset = NULL;
+ conv->base.reset = utfconverter_reset;
return &conv->base;
}