summaryrefslogtreecommitdiff
path: root/xc/lib/X11/lcUTF8.c
diff options
context:
space:
mode:
Diffstat (limited to 'xc/lib/X11/lcUTF8.c')
-rw-r--r--xc/lib/X11/lcUTF8.c247
1 files changed, 79 insertions, 168 deletions
diff --git a/xc/lib/X11/lcUTF8.c b/xc/lib/X11/lcUTF8.c
index 994580e4d..585593be1 100644
--- a/xc/lib/X11/lcUTF8.c
+++ b/xc/lib/X11/lcUTF8.c
@@ -24,7 +24,7 @@ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
******************************************************************/
-/* $XFree86: xc/lib/X11/lcUTF8.c,v 1.2 2000/02/29 03:09:04 dawes Exp $ */
+/* $XFree86: xc/lib/X11/lcUTF8.c,v 1.5 2000/06/30 18:27:00 dawes Exp $ */
/*
* This file contains:
@@ -120,7 +120,8 @@ close_converter(conv)
*/
typedef wchar_t original_wchar_t;
-#define wchar_t unsigned int
+typedef unsigned int local_wchar_t;
+#define wchar_t local_wchar_t
#define conv_t XlcConv
typedef struct _Utf8ConvRec {
@@ -142,12 +143,13 @@ typedef struct _Utf8ConvRec {
* int xxx_cstowc (XlcConv conv, wchar_t *pwc, unsigned char const *s, int n)
* converts the byte sequence starting at s to a wide character. Up to n bytes
* are available at s. n is >= 1.
- * Result is number of bytes consumed, or -1 if invalid, or 0 if n too small.
+ * Result is number of bytes consumed (if a wide character was read),
+ * or 0 if invalid, or -1 if n too small.
*
* int xxx_wctocs (XlcConv conv, unsigned char *r, wchar_t wc, int n)
* converts the wide character wc to the character set xxx, and stores the
* result beginning at r. Up to n bytes may be written at r. n is >= 1.
- * Result is number of bytes written, or -1 if invalid, or 0 if n too small.
+ * Result is number of bytes written, or 0 if invalid, or -1 if n too small.
*/
/* Return code if invalid. (xxx_mbtowc, xxx_wctomb) */
@@ -164,6 +166,7 @@ typedef struct _Utf8ConvRec {
* when the current locale is not an UTF-8 locale.
*/
+#include "lcUniConv/utf8.h"
#ifdef notused
#include "lcUniConv/ascii.h"
#endif
@@ -177,6 +180,7 @@ typedef struct _Utf8ConvRec {
#include "lcUniConv/iso8859_8.h"
#include "lcUniConv/iso8859_9.h"
#include "lcUniConv/iso8859_10.h"
+#include "lcUniConv/iso8859_13.h"
#include "lcUniConv/iso8859_14.h"
#include "lcUniConv/iso8859_15.h"
#include "lcUniConv/iso8859_16.h"
@@ -206,6 +210,12 @@ typedef struct {
#endif
static Utf8ConvRec all_charsets[] = {
+ /* The ISO10646-1/UTF-8 entry occurs twice, once at the beginning
+ (for lookup speed), once at the end (as a fallback). */
+ { "ISO10646-1", NULLQUARK,
+ utf8_mbtowc, utf8_wctomb
+ },
+
{ "ISO8859-1", NULLQUARK,
iso8859_1_mbtowc, iso8859_1_wctomb
},
@@ -236,6 +246,9 @@ static Utf8ConvRec all_charsets[] = {
{ "ISO8859-10", NULLQUARK,
iso8859_10_mbtowc, iso8859_10_wctomb
},
+ { "ISO8859-13", NULLQUARK,
+ iso8859_13_mbtowc, iso8859_13_wctomb
+ },
{ "ISO8859-14", NULLQUARK,
iso8859_14_mbtowc, iso8859_14_wctomb
},
@@ -295,6 +308,12 @@ static Utf8ConvRec all_charsets[] = {
big5_mbtowc, big5_wctomb
},
#endif
+
+ /* The ISO10646-1/UTF-8 entry occurs twice, once at the beginning
+ (for lookup speed), once at the end (as a fallback). */
+ { "ISO10646-1", NULLQUARK,
+ utf8_mbtowc, utf8_wctomb
+ },
};
#define all_charsets_count (sizeof(all_charsets)/sizeof(all_charsets[0]))
@@ -315,119 +334,6 @@ init_all_charsets()
init_all_charsets(); \
} while (0)
-/*
- * UTF-8 itself
- */
-
-static int
-utf8_cstowc(pwc, src, n)
- wchar_t *pwc;
- unsigned char const *src;
- int n;
-{
- unsigned char c = src[0];
-
- if (c < 0x80) {
- *pwc = c;
- return 1;
- } else if (c < 0xc2) {
- return -1;
- } else if (c < 0xe0) {
- if (n < 2)
- return 0;
- if (!((src[1] ^ 0x80) < 0x40))
- return -1;
- *pwc = ((wchar_t) (c & 0x1f) << 6)
- | (wchar_t) (src[1] ^ 0x80);
- return 2;
- } else if (c < 0xf0) {
- if (n < 3)
- return 0;
- if (!((src[1] ^ 0x80) < 0x40 && (src[2] ^ 0x80) < 0x40
- && (c >= 0xe1 || src[1] >= 0xa0)))
- return -1;
- *pwc = ((wchar_t) (c & 0x0f) << 12)
- | ((wchar_t) (src[1] ^ 0x80) << 6)
- | (wchar_t) (src[2] ^ 0x80);
- return 3;
- } else if (c < 0xf8 && sizeof(wchar_t)*8 >= 32) {
- if (n < 4)
- return 0;
- if (!((src[1] ^ 0x80) < 0x40 && (src[2] ^ 0x80) < 0x40
- && (src[3] ^ 0x80) < 0x40
- && (c >= 0xf1 || src[1] >= 0x90)))
- return -1;
- *pwc = ((wchar_t) (c & 0x07) << 18)
- | ((wchar_t) (src[1] ^ 0x80) << 12)
- | ((wchar_t) (src[2] ^ 0x80) << 6)
- | (wchar_t) (src[3] ^ 0x80);
- return 4;
- } else if (c < 0xfc && sizeof(wchar_t)*8 >= 32) {
- if (n < 5)
- return 0;
- if (!((src[1] ^ 0x80) < 0x40 && (src[2] ^ 0x80) < 0x40
- && (src[3] ^ 0x80) < 0x40 && (src[4] ^ 0x80) < 0x40
- && (c >= 0xf9 || src[1] >= 0x88)))
- return -1;
- *pwc = ((wchar_t) (c & 0x03) << 24)
- | ((wchar_t) (src[1] ^ 0x80) << 18)
- | ((wchar_t) (src[2] ^ 0x80) << 12)
- | ((wchar_t) (src[3] ^ 0x80) << 6)
- | (wchar_t) (src[4] ^ 0x80);
- return 5;
- } else if (c < 0xfe && sizeof(wchar_t)*8 >= 32) {
- if (n < 6)
- return 0;
- if (!((src[1] ^ 0x80) < 0x40 && (src[2] ^ 0x80) < 0x40
- && (src[3] ^ 0x80) < 0x40 && (src[4] ^ 0x80) < 0x40
- && (src[5] ^ 0x80) < 0x40
- && (c >= 0xfd || src[1] >= 0x84)))
- return -1;
- *pwc = ((wchar_t) (c & 0x01) << 30)
- | ((wchar_t) (src[1] ^ 0x80) << 24)
- | ((wchar_t) (src[2] ^ 0x80) << 18)
- | ((wchar_t) (src[3] ^ 0x80) << 12)
- | ((wchar_t) (src[4] ^ 0x80) << 6)
- | (wchar_t) (src[5] ^ 0x80);
- return 6;
- } else
- return -1;
-}
-
-static int
-utf8_wctocs(r, wc, n)
- unsigned char *r;
- wchar_t wc;
- int n; /* n == 0 is acceptable */
-{
- int count;
- if ((unsigned int) wc < 0x80)
- count = 1;
- else if ((unsigned int) wc < 0x800)
- count = 2;
- else if ((unsigned int) wc < 0x10000)
- count = 3;
- else if ((unsigned int) wc < 0x200000)
- count = 4;
- else if ((unsigned int) wc < 0x4000000)
- count = 5;
- else if ((unsigned int) wc <= 0x7fffffff)
- count = 6;
- else
- return -1;
- if (n < count)
- return 0;
- switch (count) { /* note: code falls through cases! */
- case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000;
- case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000;
- case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
- case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
- case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
- case 1: r[0] = wc;
- }
- return count;
-}
-
/* from XlcNCharSet to XlcNUtf8String */
static int
@@ -460,7 +366,7 @@ cstoutf8(conv, from, from_left, to, to_left, args, num_args)
name = charset->encoding_name;
/* not charset->name because the latter has a ":GL"/":GR" suffix */
- for (convptr = all_charsets, i = all_charsets_count; i > 0; convptr++, i--)
+ for (convptr = all_charsets, i = all_charsets_count-1; i > 0; convptr++, i--)
if (!strcmp(convptr->name, name))
break;
if (i == 0)
@@ -478,17 +384,17 @@ cstoutf8(conv, from, from_left, to, to_left, args, num_args)
int count;
consumed = convptr->cstowc(conv, &wc, src, srcend-src);
- if (consumed < 0)
+ if (consumed == RET_ILSEQ)
return -1;
- if (consumed == 0)
+ if (consumed == RET_TOOFEW(0))
break;
- count = utf8_wctocs(dst, wc, dstend-dst);
- if (count == 0)
+ count = utf8_wctomb(NULL, dst, wc, dstend-dst);
+ if (count == RET_TOOSMALL)
break;
- if (count < 0) {
- count = utf8_wctocs(dst, BAD_WCHAR, dstend-dst);
- if (count == 0)
+ if (count == RET_ILSEQ) {
+ count = utf8_wctomb(NULL, dst, BAD_WCHAR, dstend-dst);
+ if (count == RET_TOOSMALL)
break;
unconv_num++;
}
@@ -547,8 +453,8 @@ create_tocs_conv(lcd, methods)
charset_num = 0;
for (i = 0; i < codeset_num; i++)
charset_num += codeset_list[i]->num_charsets;
- if (charset_num > all_charsets_count)
- charset_num = all_charsets_count;
+ if (charset_num > all_charsets_count-1)
+ charset_num = all_charsets_count-1;
preferred = (Utf8Conv *) Xmalloc((charset_num + 1) * sizeof(Utf8Conv));
if (preferred == (Utf8Conv *) NULL) {
Xfree((char *) conv);
@@ -568,7 +474,7 @@ create_tocs_conv(lcd, methods)
break;
if (k < 0) {
/* Look it up in all_charsets[]. */
- for (k = 0; k < all_charsets_count; k++)
+ for (k = 0; k < all_charsets_count-1; k++)
if (!strcmp(all_charsets[k].name, name)) {
/* Add it to the preferred set. */
preferred[charset_num++] = &all_charsets[k];
@@ -612,28 +518,28 @@ charset_wctocs(preferred, charsetp, sidep, conv, r, wc, n)
Utf8Conv convptr;
int i;
- while (*preferred != (Utf8Conv) NULL) {
+ for (; *preferred != (Utf8Conv) NULL; preferred++) {
convptr = *preferred;
count = convptr->wctocs(conv, r, wc, n);
- if (count == 0)
- return 0;
- if (count > 0) {
+ if (count == RET_TOOSMALL)
+ return RET_TOOSMALL;
+ if (count != RET_ILSEQ) {
*charsetp = convptr;
*sidep = (*r < 0x80 ? XlcGL : XlcGR);
return count;
}
}
- for (convptr = all_charsets, i = all_charsets_count; i > 0; convptr++, i--) {
+ for (convptr = all_charsets+1, i = all_charsets_count-1; i > 0; convptr++, i--) {
count = convptr->wctocs(conv, r, wc, n);
- if (count == 0)
- return 0;
- if (count > 0) {
+ if (count == RET_TOOSMALL)
+ return RET_TOOSMALL;
+ if (count != RET_ILSEQ) {
*charsetp = convptr;
*sidep = (*r < 0x80 ? XlcGL : XlcGR);
return count;
}
}
- return -1;
+ return RET_ILSEQ;
}
static int
@@ -671,19 +577,19 @@ utf8tocs(conv, from, from_left, to, to_left, args, num_args)
int consumed;
int count;
- consumed = utf8_cstowc(&wc, src, srcend-src);
- if (consumed == 0)
+ consumed = utf8_mbtowc(NULL, &wc, src, srcend-src);
+ if (consumed == RET_TOOFEW(0))
break;
- if (consumed < 0) {
+ if (consumed == RET_ILSEQ) {
src++;
unconv_num++;
continue;
}
count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst);
- if (count == 0)
+ if (count == RET_TOOSMALL)
break;
- if (count < 0) {
+ if (count == RET_ILSEQ) {
src += consumed;
unconv_num++;
continue;
@@ -774,19 +680,19 @@ utf8tocs1(conv, from, from_left, to, to_left, args, num_args)
int consumed;
int count;
- consumed = utf8_cstowc(&wc, src, srcend-src);
- if (consumed == 0)
+ consumed = utf8_mbtowc(NULL, &wc, src, srcend-src);
+ if (consumed == RET_TOOFEW(0))
break;
- if (consumed < 0) {
+ if (consumed == RET_ILSEQ) {
src++;
unconv_num++;
continue;
}
count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst);
- if (count == 0)
+ if (count == RET_TOOSMALL)
break;
- if (count < 0) {
+ if (count == RET_ILSEQ) {
src += consumed;
unconv_num++;
continue;
@@ -873,12 +779,12 @@ utf8tostr(conv, from, from_left, to, to_left, args, num_args)
wchar_t wc;
int consumed;
- consumed = utf8_cstowc(&wc, src, srcend-src);
- if (consumed == 0)
+ consumed = utf8_mbtowc(NULL, &wc, src, srcend-src);
+ if (consumed == RET_TOOFEW(0))
break;
if (dst == dstend)
break;
- if (consumed < 0) {
+ if (consumed == RET_ILSEQ) {
consumed = 1;
c = BAD_CHAR;
unconv_num++;
@@ -943,8 +849,8 @@ strtoutf8(conv, from, from_left, to, to_left, args, num_args)
dstend = dst + *to_left;
while (src < srcend) {
- int count = utf8_wctocs(dst, *src, dstend-dst);
- if (count == 0)
+ int count = utf8_wctomb(NULL, dst, *src, dstend-dst);
+ if (count == RET_TOOSMALL)
break;
dst += count;
src++;
@@ -1023,13 +929,17 @@ utf8towcs(conv, from, from_left, to, to_left, args, num_args)
unconv_num = 0;
while (src < srcend && dst < dstend) {
- int consumed = utf8_cstowc(dst, src, srcend-src);
- if (consumed < 0) {
+ local_wchar_t wc;
+ int consumed = utf8_mbtowc(NULL, &wc, src, srcend-src);
+ if (consumed == RET_TOOFEW(0))
+ break;
+ if (consumed == RET_ILSEQ) {
src++;
*dst = BAD_WCHAR;
unconv_num++;
} else {
src += consumed;
+ *dst = wc;
}
dst++;
}
@@ -1086,12 +996,12 @@ wcstoutf8(conv, from, from_left, to, to_left, args, num_args)
unconv_num = 0;
while (src < srcend) {
- int count = utf8_wctocs(dst, *src, dstend-dst);
- if (count == 0)
+ int count = utf8_wctomb(NULL, dst, *src, dstend-dst);
+ if (count == RET_TOOSMALL)
break;
- if (count < 0) {
- count = utf8_wctocs(dst, BAD_WCHAR, dstend-dst);
- if (count == 0)
+ if (count == RET_ILSEQ) {
+ count = utf8_wctomb(NULL, dst, BAD_WCHAR, dstend-dst);
+ if (count == RET_TOOSMALL)
break;
unconv_num++;
}
@@ -1266,10 +1176,11 @@ cstowcs(conv, from, from_left, to, to_left, args, num_args)
return -1;
charset = (XlcCharSet) args[0];
- name = charset->name;
+ name = charset->encoding_name;
+ /* not charset->name because the latter has a ":GL"/":GR" suffix */
- for (convptr = all_charsets, i = all_charsets_count; i > 0; convptr++, i--)
- if (!strcmp(convptr->name, name)) /* FIXME: charset->side */
+ for (convptr = all_charsets, i = all_charsets_count-1; i > 0; convptr++, i--)
+ if (!strcmp(convptr->name, name))
break;
if (i == 0)
return -1;
@@ -1285,9 +1196,9 @@ cstowcs(conv, from, from_left, to, to_left, args, num_args)
int consumed;
consumed = convptr->cstowc(conv, &wc, src, srcend-src);
- if (consumed < 0)
+ if (consumed == RET_ILSEQ)
return -1;
- if (consumed == 0)
+ if (consumed == RET_TOOFEW(0))
break;
*dst++ = wc;
@@ -1356,9 +1267,9 @@ wcstocs(conv, from, from_left, to, to_left, args, num_args)
int count;
count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst);
- if (count == 0)
+ if (count == RET_TOOSMALL)
break;
- if (count < 0) {
+ if (count == RET_ILSEQ) {
src++;
unconv_num++;
continue;
@@ -1449,9 +1360,9 @@ wcstocs1(conv, from, from_left, to, to_left, args, num_args)
int count;
count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst);
- if (count == 0)
+ if (count == RET_TOOSMALL)
break;
- if (count < 0) {
+ if (count == RET_ILSEQ) {
src++;
unconv_num++;
continue;