utfconverter: support utf_16_le, utf_16_be, utf_32_le and utf_32_be

author: Luo Jinghua <sunmoon1997@gmail.com> 2010-01-08 22:54:04 +0800
committer: Luo Jinghua <sunmoon1997@gmail.com> 2010-01-08 22:54:04 +0800
commit: 3aeaf3c5450f27e87cf9d887f0935ee369a6cd1f (patch)
tree: 37fcdf7bc1f391205973c490feae1bc1db33b608
parent: 5b798c4039eae1e8bfdba96f3b3e7be736ea8da8 (diff)
1 files changed, 147 insertions, 25 deletions
diff --git a/utfconverter.c b/utfconverter.c
index 04a8199..b2384d3 100644
--- a/utfconverter.c
+++ b/utfconverter.c
@@ -8,11 +8,27 @@
 #include <stdio.h>
 #include <assert.h>
 
+#define FLAG_USE_BOM_ENDIAN (1 << 0) /* set for utf_16 / utf_32: charset name fixes no byte order */
+
 struct utfconverter {
     struct converter base;
+
+    unsigned int flags;		/* bitmask of FLAG_* values */
+    int little_endian;		/* nonzero: UTF-16/32 bytes are read/written low byte first */
 };
 
 static int
+is_big_endian(void)
+{
+    static const union {
+	int iv;
+	char cv[4];	/* byte view of iv; assumes a 4-byte int */
+    } u = { 0x12345678 };
+
+    return u.cv[0] == 0x12; /* most significant byte stored first => big endian */
+}
+
+static int
 utf8_encode(struct converter *conv,
 	    const uc_char_t **inbuf,
 	    size_t inleft,
@@ -75,23 +91,39 @@ utf16_encode(struct converter *conv,
 	     char **outbuf,
 	     size_t outleft)
 {
+    struct utfconverter *uc = (struct utfconverter*)conv;
     size_t i;
-    uc_uint16_t **soutbuf = (uc_uint16_t **)outbuf;
 
     if (!inbuf)
 	return UNICONV_SUCCESS;
 
     for (i = 0; i < inleft; i++) {
 	int seqlen = ucs4toutf16(**inbuf, NULL);
+	uc_uint16_t utf16[2];
 	if (seqlen < 0)
 	    return UNICONV_EILSEQ;
-	if (seqlen * sizeof(uc_uint16_t) > outleft)
+	if (seqlen * 2 > outleft)
 	    return UNICONV_E2BIG;
 
-	ucs4toutf16(**inbuf, *soutbuf);
+	ucs4toutf16(**inbuf, utf16); /* encode into a local buffer, then serialize byte by byte */
+	if (uc->little_endian) { /* low byte of each code unit first */
+	    (*outbuf)[0] = utf16[0] & 0xff;
+	    (*outbuf)[1] = utf16[0] >> 8;
+	    if (seqlen == 2) { /* a second code unit was produced */
+		(*outbuf)[2] = utf16[1] & 0xff;
+		(*outbuf)[3] = utf16[1] >> 8;
+	    }
+	} else { /* high byte of each code unit first */
+	    (*outbuf)[0] = utf16[0] >> 8;
+	    (*outbuf)[1] = utf16[0] & 0xff;
+	    if (seqlen == 2) { /* a second code unit was produced */
+		(*outbuf)[2] = utf16[1] >> 8;
+		(*outbuf)[3] = utf16[1] & 0xff;
+	    }
+	}
 	(*inbuf) += 1;
-	(*soutbuf) += seqlen;
-	outleft -= seqlen;
+	(*outbuf) += seqlen * 2;
+	outleft -= seqlen * 2;
     }
 
     return UNICONV_SUCCESS;
@@ -104,12 +136,33 @@ utf16_decode(struct converter *conv,
 	     uc_char_t **outbuf,
 	     size_t outleft)
 {
-    const uc_uint16_t **sinbuf = (const uc_uint16_t**)inbuf;
+    struct utfconverter *uc = (struct utfconverter*)conv;
 
     while (inleft) {
 	uc_char_t unichar;
-	int seqlen = ucs4fromutf16(*sinbuf, &unichar,
-				   inleft / sizeof(uc_uint16_t));
+	uc_uint16_t utf16[2];
+	int seqlen;
+
+	if (inleft < 2)
+	    return UNICONV_EINVAL;
+	/* Read bytes as unsigned char: a signed char >= 0x80 would sign-extend. */
+	if (uc->little_endian)
+	    utf16[0] = (unsigned char)(*inbuf)[1] << 8 | (unsigned char)(*inbuf)[0];
+	else
+	    utf16[0] = (unsigned char)(*inbuf)[0] << 8 | (unsigned char)(*inbuf)[1];
+	if (utf16[0] >= 0xd800 && utf16[0] <= 0xdbff) /* high surrogate: a second unit follows */
+	    seqlen = 2;
+	else
+	    seqlen = 1;
+	if (inleft < seqlen * 2)
+	    return UNICONV_EINVAL;
+	if (seqlen == 2) {
+	    if (uc->little_endian)
+		utf16[1] = (unsigned char)(*inbuf)[3] << 8 | (unsigned char)(*inbuf)[2];
+	    else
+		utf16[1] = (unsigned char)(*inbuf)[2] << 8 | (unsigned char)(*inbuf)[3];
+	}
+	seqlen = ucs4fromutf16(utf16, &unichar, seqlen);
 	if (seqlen == -2)
 	    return UNICONV_EINVAL;
 	else if (seqlen < 0)
@@ -121,8 +174,8 @@ utf16_decode(struct converter *conv,
 	(*outbuf) += 1;
 	outleft -= 1;
 
-	(*sinbuf) += seqlen;
-	inleft -= seqlen * sizeof(uc_uint16_t);
+	(*inbuf) += seqlen * 2;
+	inleft -= seqlen * 2;
     }
 
     return UNICONV_SUCCESS;
@@ -135,33 +188,68 @@ utf32_encode(struct converter *conv,
 	     char **outbuf,
 	     size_t outleft)
 {
+    struct utfconverter *uc = (struct utfconverter*)conv;
+
     if (!inbuf)
 	return UNICONV_SUCCESS;
 
-    if (inleft * sizeof(uc_char_t) > outleft)
+    if (inleft * 4 > outleft)
 	return UNICONV_E2BIG;
 
-    memcpy(*outbuf, *inbuf, inleft * sizeof(uc_char_t));
-    (*inbuf) += inleft;
-    (*outbuf) += inleft * sizeof(uc_char_t);
+    while (inleft) { /* one input character in, four bytes out per iteration */
+	if (uc->little_endian) { /* least significant byte first */
+	    (*outbuf)[0] = ((**inbuf) & 0x000000ff) >>  0;
+	    (*outbuf)[1] = ((**inbuf) & 0x0000ff00) >>  8;
+	    (*outbuf)[2] = ((**inbuf) & 0x00ff0000) >> 16;
+	    (*outbuf)[3] = ((**inbuf) & 0xff000000) >> 24;
+	} else { /* most significant byte first */
+	    (*outbuf)[3] = ((**inbuf) & 0x000000ff) >>  0;
+	    (*outbuf)[2] = ((**inbuf) & 0x0000ff00) >>  8;
+	    (*outbuf)[1] = ((**inbuf) & 0x00ff0000) >> 16;
+	    (*outbuf)[0] = ((**inbuf) & 0xff000000) >> 24;
+	}
+	(*inbuf) += 1;
+	(*outbuf) += 4;
+	inleft -= 1;
+	outleft -= 4;
+    }
+
     return UNICONV_SUCCESS;
 }
 
 static int
 utf32_decode(struct converter *conv,
-	    const char **inbuf,
-	    size_t inleft,
-	    uc_char_t **outbuf,
-	    size_t outleft)
+	     const char **inbuf,
+	     size_t inleft,
+	     uc_char_t **outbuf,
+	     size_t outleft)
 {
+    struct utfconverter *uc = (struct utfconverter*)conv;
+
     if (inleft & 3)
 	return UNICONV_EINVAL;
-    if (inleft > outleft)
+    if (inleft / 4 > outleft)
 	return UNICONV_E2BIG;
 
-    memcpy(*outbuf, *inbuf, inleft);
-    (*inbuf) += inleft;
-    (*outbuf) += inleft / sizeof(uc_char_t);
+    while (inleft) {
+	const unsigned char *b = (const unsigned char *)*inbuf;
+	/* unsigned bytes: a signed char >= 0x80 would sign-extend into the result */
+	if (uc->little_endian)
+	    **outbuf = (uc_char_t)b[0] |
+		((uc_char_t)b[1] <<  8) |
+		((uc_char_t)b[2] << 16) |
+		((uc_char_t)b[3] << 24);
+	else
+	    **outbuf = (uc_char_t)b[3] |
+		((uc_char_t)b[2] <<  8) |
+		((uc_char_t)b[1] << 16) |
+		((uc_char_t)b[0] << 24);
+	(*inbuf) += 4;
+	(*outbuf) += 1;
+	inleft -= 4;
+	outleft -= 1;
+    }
+
     return UNICONV_SUCCESS;
 }
 
@@ -171,30 +259,64 @@ utfconverter_close(struct converter *conv)
     free(conv);
 }
 
+static void
+utfconverter_reset(struct converter *suc)
+{
+    struct utfconverter *self = (struct utfconverter *)suc;
+
+    /* no fixed byte order: back to host endian (should it be big endian?) */
+    if ((self->flags & FLAG_USE_BOM_ENDIAN) != 0)
+	self->little_endian = is_big_endian() ? 0 : 1;
+}
+
 struct converter *
 utfconverter_open(const char *charset)
 {
     struct utfconverter *conv;
 
-    if (strcmp(charset, "utf_8") && strcmp(charset, "utf_16") &&
-	strcmp(charset, "utf_32"))
+    if (strcmp(charset, "utf_8") &&
+	strcmp(charset, "utf_16") &&
+	strcmp(charset, "utf_16_le") &&
+	strcmp(charset, "utf_16_be") &&
+	strcmp(charset, "utf_32") &&
+	strcmp(charset, "utf_32_le") &&
+	strcmp(charset, "utf_32_be"))
 	return NULL;
 
     conv = malloc(sizeof(struct utfconverter));
     if (!conv)
 	return NULL;
+    conv->flags = 0;
+    conv->little_endian = !is_big_endian();
     if (!strcmp(charset, "utf_8")) {
 	conv->base.encode = utf8_encode;
 	conv->base.decode = utf8_decode;
     } else if (!strcmp(charset, "utf_16")) {
 	conv->base.encode = utf16_encode;
 	conv->base.decode = utf16_decode;
+	conv->flags |= FLAG_USE_BOM_ENDIAN;
+    } else if (!strcmp(charset, "utf_16_le") ||
+	       !strcmp(charset, "utf_16_be")) {
+	/* Explicit byte order in the name: set it unconditionally so
+	 * "utf_16_be" is not left at the host order chosen above. */
+	conv->base.encode = utf16_encode;
+	conv->base.decode = utf16_decode;
+	conv->little_endian = !strcmp(charset, "utf_16_le");
     } else if (!strcmp(charset, "utf_32")) {
 	conv->base.encode = utf32_encode;
 	conv->base.decode = utf32_decode;
+	conv->flags |= FLAG_USE_BOM_ENDIAN;
+    } else if (!strcmp(charset, "utf_32_le")) {
+	conv->base.encode = utf32_encode;
+	conv->base.decode = utf32_decode;
+	conv->little_endian = 1;
+    } else if (!strcmp(charset, "utf_32_be")) {
+	conv->base.encode = utf32_encode;
+	conv->base.decode = utf32_decode;
+	conv->little_endian = 0;
     }
     conv->base.close = utfconverter_close;
-    conv->base.reset = NULL;
+    conv->base.reset = utfconverter_reset;
 
     return &conv->base;
 }
author	Luo Jinghua <sunmoon1997@gmail.com>	2010-01-08 22:54:04 +0800
committer	Luo Jinghua <sunmoon1997@gmail.com>	2010-01-08 22:54:04 +0800
commit	3aeaf3c5450f27e87cf9d887f0935ee369a6cd1f (patch)
tree	37fcdf7bc1f391205973c490feae1bc1db33b608
parent	5b798c4039eae1e8bfdba96f3b3e7be736ea8da8 (diff)