/* * TSM - Unicode Handling * * Copyright (c) 2011 David Herrmann * Copyright (c) 2011-2012 University of Tuebingen * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files * (the "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* * The tsm-utf8-state-machine is based on the wayland-compositor demos: * * Copyright © 2008 Kristian Høgsberg * * Permission to use, copy, modify, distribute, and sell this software and * its documentation for any purpose is hereby granted without fee, provided * that the above copyright notice appear in all copies and that both that * copyright notice and this permission notice appear in supporting * documentation, and that the name of the copyright holders not be used in * advertising or publicity pertaining to distribution of the software * without specific, written prior permission. The copyright holders make * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF * CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* * Unicode Helpers * This implements several helpers for Unicode/UTF8/UCS4 input and output. See * below for comments on each helper. */ #include #include #include #include #include "external/wcwidth.h" #include "shl_array.h" #include "shl_hashtable.h" #include "shl_misc.h" #include "tsm_unicode.h" /* * Unicode Symbol Handling * The main goal of the tsm_symbol_* functions is to provide a datatype which * can contain the representation of any printable character. This includes all * basic Unicode characters but also combined characters. * To avoid all the memory management we still represent a character as a single * integer value (tsm_symbol_t) but internally we allocate a string which is * represented by this value. * * A tsm_symbol_t is an integer which represents a single character point. * For most Unicode characters this is simply the UCS4 representation. In fact, * every UCS4 characters is a valid tsm_symbol_t object. * However, Unicode standard allows combining marks. Therefore, some characters * consists of more than one Unicode character. * A global symbol-table provides all those combined characters as single * integers. You simply create a valid base character and append your combining * marks and the table will return a new valid tsm_symbol_t. It is no longer * a valid UCS4 value, though. But no memory management is needed as all * tsm_symbol_t objects are simple integers. * * The symbol table contains two-way * references. The Hash Table contains all the symbols with the symbol ucs4 * string as key and the symbol ID as value. * The index array contains the symbol ID as key and a pointer to the ucs4 * string as value. But the hash table owns the ucs4 string. * This allows fast implementations of *_get() and *_append() without long * search intervals. * * When creating a new symbol, we simply return the UCS4 value as new symbol. We * do not add it to our symbol table as it is only one character. However, if a * character is appended to an existing symbol, we create a new ucs4 string and * push the new symbol into the symbol table. */ SHL_EXPORT const tsm_symbol_t tsm_symbol_default = 0; struct tsm_symbol_table { unsigned long ref; uint32_t next_id; struct shl_array *index; struct shl_hashtable *symbols; }; /* TODO: remove the default context */ static struct tsm_symbol_table *tsm_symbol_table_default; static unsigned int hash_ucs4(const void *key) { unsigned int val = 5381; size_t i; const uint32_t *ucs4 = key; i = 0; while (ucs4[i] <= TSM_UCS4_MAX) { val = val * 33 + ucs4[i]; ++i; } return val; } static bool cmp_ucs4(const void *a, const void *b) { size_t i; const uint32_t *v1, *v2; v1 = a; v2 = b; i = 0; while (1) { if (v1[i] > TSM_UCS4_MAX && v2[i] > TSM_UCS4_MAX) return true; if (v1[i] > TSM_UCS4_MAX && v2[i] <= TSM_UCS4_MAX) return false; if (v1[i] <= TSM_UCS4_MAX && v2[i] > TSM_UCS4_MAX) return false; if (v1[i] != v2[i]) return false; ++i; } } SHL_EXPORT int tsm_symbol_table_new(struct tsm_symbol_table **out) { struct tsm_symbol_table *tbl; int ret; static const uint32_t *val = NULL; /* we need a valid lvalue */ if (!out) return -EINVAL; tbl = malloc(sizeof(*tbl)); if (!tbl) return -ENOMEM; memset(tbl, 0, sizeof(*tbl)); tbl->ref = 1; tbl->next_id = TSM_UCS4_MAX + 2; ret = shl_array_new(&tbl->index, sizeof(uint32_t*), 4); if (ret) goto err_free; /* first entry is not used so add dummy */ shl_array_push(tbl->index, &val); ret = shl_hashtable_new(&tbl->symbols, hash_ucs4, cmp_ucs4, free, NULL); if (ret) goto err_array; *out = tbl; return 0; err_array: shl_array_free(tbl->index); err_free: free(tbl); return ret; } SHL_EXPORT void tsm_symbol_table_ref(struct tsm_symbol_table *tbl) { if (!tbl || !tbl->ref) return; ++tbl->ref; } SHL_EXPORT void tsm_symbol_table_unref(struct tsm_symbol_table *tbl) { if (!tbl || !tbl->ref || --tbl->ref) return; shl_hashtable_free(tbl->symbols); shl_array_free(tbl->index); free(tbl); } SHL_EXPORT tsm_symbol_t tsm_symbol_make(uint32_t ucs4) { if (ucs4 > TSM_UCS4_MAX) return 0; else return ucs4; } /* * This decomposes a symbol into a ucs4 string and a size value. If \sym is a * valid UCS4 character, this returns a pointer to \sym and writes 1 into \size. * Therefore, the returned value may get destroyed if your \sym argument gets * destroyed. * If \sym is a composed ucs4 string, then the returned value points into the * hash table of the symbol table and lives as long as the symbol table does. * * This always returns a valid value. If an error happens, the default character * is returned. If \size is NULL, then the size value is omitted. */ SHL_EXPORT const uint32_t *tsm_symbol_get(struct tsm_symbol_table *tbl, tsm_symbol_t *sym, size_t *size) { uint32_t *ucs4, idx; int ret; if (*sym <= TSM_UCS4_MAX) { if (size) *size = 1; return sym; } if (!tbl) tbl = tsm_symbol_table_default; if (!tbl) { ret = tsm_symbol_table_new(&tbl); if (ret) { if (size) *size = 1; return &tsm_symbol_default; } tsm_symbol_table_default = tbl; } idx = *sym - (TSM_UCS4_MAX + 1); if (idx >= shl_array_get_length(tbl->index)) ucs4 = NULL; else ucs4 = *SHL_ARRAY_AT(tbl->index, uint32_t*, idx); if (!ucs4) { if (size) *size = 1; return &tsm_symbol_default; } if (size) { *size = 0; while (ucs4[*size] <= TSM_UCS4_MAX) ++*size; } return ucs4; } SHL_EXPORT tsm_symbol_t tsm_symbol_append(struct tsm_symbol_table *tbl, tsm_symbol_t sym, uint32_t ucs4) { uint32_t buf[TSM_UCS4_MAXLEN + 1], nsym, *nval; const uint32_t *ptr; size_t s; void *tmp; bool res; int ret; if (!tbl) tbl = tsm_symbol_table_default; if (!tbl) { ret = tsm_symbol_table_new(&tbl); if (ret) return sym; tsm_symbol_table_default = tbl; } if (ucs4 > TSM_UCS4_MAX) return sym; ptr = tsm_symbol_get(tbl, &sym, &s); if (s >= TSM_UCS4_MAXLEN) return sym; memcpy(buf, ptr, s * sizeof(uint32_t)); buf[s++] = ucs4; buf[s++] = TSM_UCS4_MAX + 1; res = shl_hashtable_find(tbl->symbols, &tmp, buf); if (res) return (uint32_t)(long)tmp; nval = malloc(sizeof(uint32_t) * s); if (!nval) return sym; memcpy(nval, buf, s * sizeof(uint32_t)); nsym = tbl->next_id + 1; /* Out of IDs; we actually have 2 Billion IDs so this seems * very unlikely but lets be safe here */ if (nsym <= tbl->next_id++) goto err_id; ret = shl_hashtable_insert(tbl->symbols, nval, (void*)(long)nsym); if (ret) goto err_id; ret = shl_array_push(tbl->index, &nval); if (ret) goto err_symbol; return nsym; err_symbol: shl_hashtable_remove(tbl->symbols, nval); err_id: --tbl->next_id; free(nval); return sym; } SHL_EXPORT unsigned int tsm_symbol_get_width(struct tsm_symbol_table *tbl, tsm_symbol_t sym) { int ret; const uint32_t *ch; size_t len; if (!tbl) tbl = tsm_symbol_table_default; if (!tbl) { ret = tsm_symbol_table_new(&tbl); if (ret) return sym; tsm_symbol_table_default = tbl; } ch = tsm_symbol_get(tbl, &sym, &len); if (len == 0) return 0; return tsm_ucs4_get_width(*ch); } /* * Convert UCS4 character to UTF-8. This creates one of: * 0xxxxxxx * 110xxxxx 10xxxxxx * 1110xxxx 10xxxxxx 10xxxxxx * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx * This is based on the same function from "terminology" from the Enlightenment * project. See COPYING for more information. * * @txt must point to a 4 byte-buffer. A number between 0 and 4 is returned and * indicates how long the written UTF8 string is. * * Please note @g is a real UCS4 code and not a tsm_symbol_t object! * * Unicode symbols between 0xD800 and 0xDFFF are not assigned and reserved for * UTF16 compatibility. It is an error to encode them. Same applies to numbers * greater than 0x10FFFF, the range 0xFDD0-0xFDEF and codepoints ending with * 0xFFFF or 0xFFFE. */ SHL_EXPORT unsigned int tsm_ucs4_get_width(uint32_t ucs4) { int ret; ret = mk_wcwidth(ucs4); if (ret <= 0) return 0; return ret; } SHL_EXPORT size_t tsm_ucs4_to_utf8(uint32_t g, char *txt) { if (g >= 0xd800 && g <= 0xdfff) return 0; if (g > 0x10ffff || (g & 0xffff) == 0xffff || (g & 0xffff) == 0xfffe) return 0; if (g >= 0xfdd0 && g <= 0xfdef) return 0; if (g < (1 << 7)) { txt[0] = g & 0x7f; return 1; } else if (g < (1 << (5 + 6))) { txt[0] = 0xc0 | ((g >> 6) & 0x1f); txt[1] = 0x80 | ((g ) & 0x3f); return 2; } else if (g < (1 << (4 + 6 + 6))) { txt[0] = 0xe0 | ((g >> 12) & 0x0f); txt[1] = 0x80 | ((g >> 6) & 0x3f); txt[2] = 0x80 | ((g ) & 0x3f); return 3; } else if (g < (1 << (3 + 6 + 6 + 6))) { txt[0] = 0xf0 | ((g >> 18) & 0x07); txt[1] = 0x80 | ((g >> 12) & 0x3f); txt[2] = 0x80 | ((g >> 6) & 0x3f); txt[3] = 0x80 | ((g ) & 0x3f); return 4; } else { return 0; } } SHL_EXPORT char *tsm_ucs4_to_utf8_alloc(const uint32_t *ucs4, size_t len, size_t *len_out) { char *val; size_t i, pos; val = malloc(4 * len); if (!val) return NULL; pos = 0; for (i = 0; i < len; ++i) pos += tsm_ucs4_to_utf8(ucs4[i], &val[pos]); if (!pos) { free(val); return NULL; } if (len_out) *len_out = pos; return val; } /* * UTF8 State Machine * This state machine parses UTF8 and converts it into a stream of Unicode * characters (UCS4 values). A state-machine is represented by a * "struct tsm_utf8_mach" object. It has no global state and all functions are * re-entrant if called with different state-machine objects. * * tsm_utf8_mach_new(): This creates a new state-machine and resets it to its * initial state. Returns 0 on success. * * tsm_uft8_mach_free(): This destroys a state-machine and frees all internally * allocated memory. * * tsm_utf8_mach_reset(): Reset a given state-machine to its initial state. This * is the same state the machine is in after it got created. * * tsm_uft8_mach_feed(): Feed one byte of the UTF8 input stream into the * state-machine. This function returns the new state of the state-machine after * this character has been parsed. If it is TSM_UTF8_ACCEPT or TSM_UTF8_REJECT, * then there is a pending UCS4 character that you should retrieve via * tsm_utf8_mach_get(). If it is TSM_UTF8_ACCEPT, then a character was * successfully parsed. If it is TSM_UTF8_REJECT, the input was invalid UTF8 and * some error recovery was tried or a replacement character was choosen. All * other states mean that the machine needs more input to parse the stream. * * tsm_utf8_mach_get(): Returns the last parsed character. It has no effect on * the state machine so you can call it multiple times. * * Internally, we use TSM_UTF8_START whenever the state-machine is reset. This * can be used to ignore the last read input or to simply reset the machine. * TSM_UTF8_EXPECT* is used to remember how many bytes are still to be read to * get a full UTF8 sequence. * If an error occurs during reading, we go to state TSM_UTF8_REJECT and the * user will read a replacement character. If further errors occur, we go to * state TSM_UTF8_START to avoid printing multiple replacement characters for a * single misinterpreted UTF8 sequence. However, under some circumstances it may * happen that we stay in TSM_UTF8_REJECT and a next replacement character is * returned. * It is difficult to decide how to interpret wrong input but this machine seems * to be quite good at deciding what to do. Generally, we prefer discarding or * replacing input instead of trying to decipher ASCII values from the invalid * data. This guarantees that we do not send wrong values to the terminal * emulator. Some might argue that an ASCII fallback would be better. However, * this means that we might send very weird escape-sequences to the VTE layer. * Especially with C1 codes applications can really break many terminal features * so we avoid any non-ASCII+non-UTF8 input to prevent this. */ struct tsm_utf8_mach { int state; uint32_t ch; }; SHL_EXPORT int tsm_utf8_mach_new(struct tsm_utf8_mach **out) { struct tsm_utf8_mach *mach; if (!out) return -EINVAL; mach = malloc(sizeof(*mach)); if (!mach) return -ENOMEM; memset(mach, 0, sizeof(*mach)); mach->state = TSM_UTF8_START; *out = mach; return 0; } SHL_EXPORT void tsm_utf8_mach_free(struct tsm_utf8_mach *mach) { if (!mach) return; free(mach); } SHL_EXPORT int tsm_utf8_mach_feed(struct tsm_utf8_mach *mach, char ci) { uint32_t c; if (!mach) return TSM_UTF8_START; c = ci; switch (mach->state) { case TSM_UTF8_START: case TSM_UTF8_ACCEPT: case TSM_UTF8_REJECT: if (c == 0xC0 || c == 0xC1) { /* overlong encoding for ASCII, reject */ mach->state = TSM_UTF8_REJECT; } else if ((c & 0x80) == 0) { /* single byte, accept */ mach->ch = c; mach->state = TSM_UTF8_ACCEPT; } else if ((c & 0xC0) == 0x80) { /* parser out of sync, ignore byte */ mach->state = TSM_UTF8_START; } else if ((c & 0xE0) == 0xC0) { /* start of two byte sequence */ mach->ch = (c & 0x1F) << 6; mach->state = TSM_UTF8_EXPECT1; } else if ((c & 0xF0) == 0xE0) { /* start of three byte sequence */ mach->ch = (c & 0x0F) << 12; mach->state = TSM_UTF8_EXPECT2; } else if ((c & 0xF8) == 0xF0) { /* start of four byte sequence */ mach->ch = (c & 0x07) << 18; mach->state = TSM_UTF8_EXPECT3; } else { /* overlong encoding, reject */ mach->state = TSM_UTF8_REJECT; } break; case TSM_UTF8_EXPECT3: mach->ch |= (c & 0x3F) << 12; if ((c & 0xC0) == 0x80) mach->state = TSM_UTF8_EXPECT2; else mach->state = TSM_UTF8_REJECT; break; case TSM_UTF8_EXPECT2: mach->ch |= (c & 0x3F) << 6; if ((c & 0xC0) == 0x80) mach->state = TSM_UTF8_EXPECT1; else mach->state = TSM_UTF8_REJECT; break; case TSM_UTF8_EXPECT1: mach->ch |= c & 0x3F; if ((c & 0xC0) == 0x80) mach->state = TSM_UTF8_ACCEPT; else mach->state = TSM_UTF8_REJECT; break; default: mach->state = TSM_UTF8_REJECT; break; } return mach->state; } SHL_EXPORT uint32_t tsm_utf8_mach_get(struct tsm_utf8_mach *mach) { if (!mach || mach->state != TSM_UTF8_ACCEPT) return TSM_UCS4_REPLACEMENT; return mach->ch; } SHL_EXPORT void tsm_utf8_mach_reset(struct tsm_utf8_mach *mach) { if (!mach) return; mach->state = TSM_UTF8_START; }