diff options
Diffstat (limited to 'xc/lib/X11/lcUTF8.c')
-rw-r--r-- | xc/lib/X11/lcUTF8.c | 247 |
1 files changed, 79 insertions, 168 deletions
diff --git a/xc/lib/X11/lcUTF8.c b/xc/lib/X11/lcUTF8.c index 994580e4d..585593be1 100644 --- a/xc/lib/X11/lcUTF8.c +++ b/xc/lib/X11/lcUTF8.c @@ -24,7 +24,7 @@ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ******************************************************************/ -/* $XFree86: xc/lib/X11/lcUTF8.c,v 1.2 2000/02/29 03:09:04 dawes Exp $ */ +/* $XFree86: xc/lib/X11/lcUTF8.c,v 1.5 2000/06/30 18:27:00 dawes Exp $ */ /* * This file contains: @@ -120,7 +120,8 @@ close_converter(conv) */ typedef wchar_t original_wchar_t; -#define wchar_t unsigned int +typedef unsigned int local_wchar_t; +#define wchar_t local_wchar_t #define conv_t XlcConv typedef struct _Utf8ConvRec { @@ -142,12 +143,13 @@ typedef struct _Utf8ConvRec { * int xxx_cstowc (XlcConv conv, wchar_t *pwc, unsigned char const *s, int n) * converts the byte sequence starting at s to a wide character. Up to n bytes * are available at s. n is >= 1. - * Result is number of bytes consumed, or -1 if invalid, or 0 if n too small. + * Result is number of bytes consumed (if a wide character was read), + * or 0 if invalid, or -1 if n too small. * * int xxx_wctocs (XlcConv conv, unsigned char *r, wchar_t wc, int n) * converts the wide character wc to the character set xxx, and stores the * result beginning at r. Up to n bytes may be written at r. n is >= 1. - * Result is number of bytes written, or -1 if invalid, or 0 if n too small. + * Result is number of bytes written, or 0 if invalid, or -1 if n too small. */ /* Return code if invalid. (xxx_mbtowc, xxx_wctomb) */ @@ -164,6 +166,7 @@ typedef struct _Utf8ConvRec { * when the current locale is not an UTF-8 locale. */ +#include "lcUniConv/utf8.h" #ifdef notused #include "lcUniConv/ascii.h" #endif @@ -177,6 +180,7 @@ typedef struct _Utf8ConvRec { #include "lcUniConv/iso8859_8.h" #include "lcUniConv/iso8859_9.h" #include "lcUniConv/iso8859_10.h" +#include "lcUniConv/iso8859_13.h" #include "lcUniConv/iso8859_14.h" #include "lcUniConv/iso8859_15.h" #include "lcUniConv/iso8859_16.h" @@ -206,6 +210,12 @@ typedef struct { #endif static Utf8ConvRec all_charsets[] = { + /* The ISO10646-1/UTF-8 entry occurs twice, once at the beginning + (for lookup speed), once at the end (as a fallback). */ + { "ISO10646-1", NULLQUARK, + utf8_mbtowc, utf8_wctomb + }, + { "ISO8859-1", NULLQUARK, iso8859_1_mbtowc, iso8859_1_wctomb }, @@ -236,6 +246,9 @@ static Utf8ConvRec all_charsets[] = { { "ISO8859-10", NULLQUARK, iso8859_10_mbtowc, iso8859_10_wctomb }, + { "ISO8859-13", NULLQUARK, + iso8859_13_mbtowc, iso8859_13_wctomb + }, { "ISO8859-14", NULLQUARK, iso8859_14_mbtowc, iso8859_14_wctomb }, @@ -295,6 +308,12 @@ static Utf8ConvRec all_charsets[] = { big5_mbtowc, big5_wctomb }, #endif + + /* The ISO10646-1/UTF-8 entry occurs twice, once at the beginning + (for lookup speed), once at the end (as a fallback). */ + { "ISO10646-1", NULLQUARK, + utf8_mbtowc, utf8_wctomb + }, }; #define all_charsets_count (sizeof(all_charsets)/sizeof(all_charsets[0])) @@ -315,119 +334,6 @@ init_all_charsets() init_all_charsets(); \ } while (0) -/* - * UTF-8 itself - */ - -static int -utf8_cstowc(pwc, src, n) - wchar_t *pwc; - unsigned char const *src; - int n; -{ - unsigned char c = src[0]; - - if (c < 0x80) { - *pwc = c; - return 1; - } else if (c < 0xc2) { - return -1; - } else if (c < 0xe0) { - if (n < 2) - return 0; - if (!((src[1] ^ 0x80) < 0x40)) - return -1; - *pwc = ((wchar_t) (c & 0x1f) << 6) - | (wchar_t) (src[1] ^ 0x80); - return 2; - } else if (c < 0xf0) { - if (n < 3) - return 0; - if (!((src[1] ^ 0x80) < 0x40 && (src[2] ^ 0x80) < 0x40 - && (c >= 0xe1 || src[1] >= 0xa0))) - return -1; - *pwc = ((wchar_t) (c & 0x0f) << 12) - | ((wchar_t) (src[1] ^ 0x80) << 6) - | (wchar_t) (src[2] ^ 0x80); - return 3; - } else if (c < 0xf8 && sizeof(wchar_t)*8 >= 32) { - if (n < 4) - return 0; - if (!((src[1] ^ 0x80) < 0x40 && (src[2] ^ 0x80) < 0x40 - && (src[3] ^ 0x80) < 0x40 - && (c >= 0xf1 || src[1] >= 0x90))) - return -1; - *pwc = ((wchar_t) (c & 0x07) << 18) - | ((wchar_t) (src[1] ^ 0x80) << 12) - | ((wchar_t) (src[2] ^ 0x80) << 6) - | (wchar_t) (src[3] ^ 0x80); - return 4; - } else if (c < 0xfc && sizeof(wchar_t)*8 >= 32) { - if (n < 5) - return 0; - if (!((src[1] ^ 0x80) < 0x40 && (src[2] ^ 0x80) < 0x40 - && (src[3] ^ 0x80) < 0x40 && (src[4] ^ 0x80) < 0x40 - && (c >= 0xf9 || src[1] >= 0x88))) - return -1; - *pwc = ((wchar_t) (c & 0x03) << 24) - | ((wchar_t) (src[1] ^ 0x80) << 18) - | ((wchar_t) (src[2] ^ 0x80) << 12) - | ((wchar_t) (src[3] ^ 0x80) << 6) - | (wchar_t) (src[4] ^ 0x80); - return 5; - } else if (c < 0xfe && sizeof(wchar_t)*8 >= 32) { - if (n < 6) - return 0; - if (!((src[1] ^ 0x80) < 0x40 && (src[2] ^ 0x80) < 0x40 - && (src[3] ^ 0x80) < 0x40 && (src[4] ^ 0x80) < 0x40 - && (src[5] ^ 0x80) < 0x40 - && (c >= 0xfd || src[1] >= 0x84))) - return -1; - *pwc = ((wchar_t) (c & 0x01) << 30) - | ((wchar_t) (src[1] ^ 0x80) << 24) - | ((wchar_t) (src[2] ^ 0x80) << 18) - | ((wchar_t) (src[3] ^ 0x80) << 12) - | ((wchar_t) (src[4] ^ 0x80) << 6) - | (wchar_t) (src[5] ^ 0x80); - return 6; - } else - return -1; -} - -static int -utf8_wctocs(r, wc, n) - unsigned char *r; - wchar_t wc; - int n; /* n == 0 is acceptable */ -{ - int count; - if ((unsigned int) wc < 0x80) - count = 1; - else if ((unsigned int) wc < 0x800) - count = 2; - else if ((unsigned int) wc < 0x10000) - count = 3; - else if ((unsigned int) wc < 0x200000) - count = 4; - else if ((unsigned int) wc < 0x4000000) - count = 5; - else if ((unsigned int) wc <= 0x7fffffff) - count = 6; - else - return -1; - if (n < count) - return 0; - switch (count) { /* note: code falls through cases! */ - case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000; - case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000; - case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000; - case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800; - case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0; - case 1: r[0] = wc; - } - return count; -} - /* from XlcNCharSet to XlcNUtf8String */ static int @@ -460,7 +366,7 @@ cstoutf8(conv, from, from_left, to, to_left, args, num_args) name = charset->encoding_name; /* not charset->name because the latter has a ":GL"/":GR" suffix */ - for (convptr = all_charsets, i = all_charsets_count; i > 0; convptr++, i--) + for (convptr = all_charsets, i = all_charsets_count-1; i > 0; convptr++, i--) if (!strcmp(convptr->name, name)) break; if (i == 0) @@ -478,17 +384,17 @@ cstoutf8(conv, from, from_left, to, to_left, args, num_args) int count; consumed = convptr->cstowc(conv, &wc, src, srcend-src); - if (consumed < 0) + if (consumed == RET_ILSEQ) return -1; - if (consumed == 0) + if (consumed == RET_TOOFEW(0)) break; - count = utf8_wctocs(dst, wc, dstend-dst); - if (count == 0) + count = utf8_wctomb(NULL, dst, wc, dstend-dst); + if (count == RET_TOOSMALL) break; - if (count < 0) { - count = utf8_wctocs(dst, BAD_WCHAR, dstend-dst); - if (count == 0) + if (count == RET_ILSEQ) { + count = utf8_wctomb(NULL, dst, BAD_WCHAR, dstend-dst); + if (count == RET_TOOSMALL) break; unconv_num++; } @@ -547,8 +453,8 @@ create_tocs_conv(lcd, methods) charset_num = 0; for (i = 0; i < codeset_num; i++) charset_num += codeset_list[i]->num_charsets; - if (charset_num > all_charsets_count) - charset_num = all_charsets_count; + if (charset_num > all_charsets_count-1) + charset_num = all_charsets_count-1; preferred = (Utf8Conv *) Xmalloc((charset_num + 1) * sizeof(Utf8Conv)); if (preferred == (Utf8Conv *) NULL) { Xfree((char *) conv); @@ -568,7 +474,7 @@ create_tocs_conv(lcd, methods) break; if (k < 0) { /* Look it up in all_charsets[]. */ - for (k = 0; k < all_charsets_count; k++) + for (k = 0; k < all_charsets_count-1; k++) if (!strcmp(all_charsets[k].name, name)) { /* Add it to the preferred set. */ preferred[charset_num++] = &all_charsets[k]; @@ -612,28 +518,28 @@ charset_wctocs(preferred, charsetp, sidep, conv, r, wc, n) Utf8Conv convptr; int i; - while (*preferred != (Utf8Conv) NULL) { + for (; *preferred != (Utf8Conv) NULL; preferred++) { convptr = *preferred; count = convptr->wctocs(conv, r, wc, n); - if (count == 0) - return 0; - if (count > 0) { + if (count == RET_TOOSMALL) + return RET_TOOSMALL; + if (count != RET_ILSEQ) { *charsetp = convptr; *sidep = (*r < 0x80 ? XlcGL : XlcGR); return count; } } - for (convptr = all_charsets, i = all_charsets_count; i > 0; convptr++, i--) { + for (convptr = all_charsets+1, i = all_charsets_count-1; i > 0; convptr++, i--) { count = convptr->wctocs(conv, r, wc, n); - if (count == 0) - return 0; - if (count > 0) { + if (count == RET_TOOSMALL) + return RET_TOOSMALL; + if (count != RET_ILSEQ) { *charsetp = convptr; *sidep = (*r < 0x80 ? XlcGL : XlcGR); return count; } } - return -1; + return RET_ILSEQ; } static int @@ -671,19 +577,19 @@ utf8tocs(conv, from, from_left, to, to_left, args, num_args) int consumed; int count; - consumed = utf8_cstowc(&wc, src, srcend-src); - if (consumed == 0) + consumed = utf8_mbtowc(NULL, &wc, src, srcend-src); + if (consumed == RET_TOOFEW(0)) break; - if (consumed < 0) { + if (consumed == RET_ILSEQ) { src++; unconv_num++; continue; } count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst); - if (count == 0) + if (count == RET_TOOSMALL) break; - if (count < 0) { + if (count == RET_ILSEQ) { src += consumed; unconv_num++; continue; @@ -774,19 +680,19 @@ utf8tocs1(conv, from, from_left, to, to_left, args, num_args) int consumed; int count; - consumed = utf8_cstowc(&wc, src, srcend-src); - if (consumed == 0) + consumed = utf8_mbtowc(NULL, &wc, src, srcend-src); + if (consumed == RET_TOOFEW(0)) break; - if (consumed < 0) { + if (consumed == RET_ILSEQ) { src++; unconv_num++; continue; } count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst); - if (count == 0) + if (count == RET_TOOSMALL) break; - if (count < 0) { + if (count == RET_ILSEQ) { src += consumed; unconv_num++; continue; @@ -873,12 +779,12 @@ utf8tostr(conv, from, from_left, to, to_left, args, num_args) wchar_t wc; int consumed; - consumed = utf8_cstowc(&wc, src, srcend-src); - if (consumed == 0) + consumed = utf8_mbtowc(NULL, &wc, src, srcend-src); + if (consumed == RET_TOOFEW(0)) break; if (dst == dstend) break; - if (consumed < 0) { + if (consumed == RET_ILSEQ) { consumed = 1; c = BAD_CHAR; unconv_num++; @@ -943,8 +849,8 @@ strtoutf8(conv, from, from_left, to, to_left, args, num_args) dstend = dst + *to_left; while (src < srcend) { - int count = utf8_wctocs(dst, *src, dstend-dst); - if (count == 0) + int count = utf8_wctomb(NULL, dst, *src, dstend-dst); + if (count == RET_TOOSMALL) break; dst += count; src++; @@ -1023,13 +929,17 @@ utf8towcs(conv, from, from_left, to, to_left, args, num_args) unconv_num = 0; while (src < srcend && dst < dstend) { - int consumed = utf8_cstowc(dst, src, srcend-src); - if (consumed < 0) { + local_wchar_t wc; + int consumed = utf8_mbtowc(NULL, &wc, src, srcend-src); + if (consumed == RET_TOOFEW(0)) + break; + if (consumed == RET_ILSEQ) { src++; *dst = BAD_WCHAR; unconv_num++; } else { src += consumed; + *dst = wc; } dst++; } @@ -1086,12 +996,12 @@ wcstoutf8(conv, from, from_left, to, to_left, args, num_args) unconv_num = 0; while (src < srcend) { - int count = utf8_wctocs(dst, *src, dstend-dst); - if (count == 0) + int count = utf8_wctomb(NULL, dst, *src, dstend-dst); + if (count == RET_TOOSMALL) break; - if (count < 0) { - count = utf8_wctocs(dst, BAD_WCHAR, dstend-dst); - if (count == 0) + if (count == RET_ILSEQ) { + count = utf8_wctomb(NULL, dst, BAD_WCHAR, dstend-dst); + if (count == RET_TOOSMALL) break; unconv_num++; } @@ -1266,10 +1176,11 @@ cstowcs(conv, from, from_left, to, to_left, args, num_args) return -1; charset = (XlcCharSet) args[0]; - name = charset->name; + name = charset->encoding_name; + /* not charset->name because the latter has a ":GL"/":GR" suffix */ - for (convptr = all_charsets, i = all_charsets_count; i > 0; convptr++, i--) - if (!strcmp(convptr->name, name)) /* FIXME: charset->side */ + for (convptr = all_charsets, i = all_charsets_count-1; i > 0; convptr++, i--) + if (!strcmp(convptr->name, name)) break; if (i == 0) return -1; @@ -1285,9 +1196,9 @@ cstowcs(conv, from, from_left, to, to_left, args, num_args) int consumed; consumed = convptr->cstowc(conv, &wc, src, srcend-src); - if (consumed < 0) + if (consumed == RET_ILSEQ) return -1; - if (consumed == 0) + if (consumed == RET_TOOFEW(0)) break; *dst++ = wc; @@ -1356,9 +1267,9 @@ wcstocs(conv, from, from_left, to, to_left, args, num_args) int count; count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst); - if (count == 0) + if (count == RET_TOOSMALL) break; - if (count < 0) { + if (count == RET_ILSEQ) { src++; unconv_num++; continue; @@ -1449,9 +1360,9 @@ wcstocs1(conv, from, from_left, to, to_left, args, num_args) int count; count = charset_wctocs(preferred_charsets, &chosen_charset, &chosen_side, conv, dst, wc, dstend-dst); - if (count == 0) + if (count == RET_TOOSMALL) break; - if (count < 0) { + if (count == RET_ILSEQ) { src++; unconv_num++; continue; |