/* * utf8.c * * Part of gwm, the Gratuitous Window Manager, * by Gary Wong, . * * Copyright (C) 2009 Gary Wong * * This program is free software: you can redistribute it and/or modify * it under the terms of version 3 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * * $Id$ */ #include #include #if HAVE_ICONV_H #include #endif #include #include #include "gwm.h" #include "utf8.h" #if HAVE_ICONV iconv_t iso2022; int tried_iso2022; #endif extern char *to_utf8( enum gwm_encoding encoding, const char *in, size_t len ) { size_t outlen; char *out, *outp; if( len < 0 ) len = strlen( in ); #if HAVE_ICONV outlen = len << ( encoding == ENCODING_COMPOUND ? 2 : 1 ); #else outlen = len << 1; #endif outp = out = xmalloc( outlen + 1 ); #if HAVE_ICONV if( encoding == ENCODING_COMPOUND ) { if( !tried_iso2022 ) { iso2022 = iconv_open( "UTF-8", "ISO-2022-JP-2" ); tried_iso2022 = TRUE; } if( iso2022 != (iconv_t) -1 ) { static const char resetseq[ 3 ] = "\x1B\x2D\x41"; const char *inp; size_t resetlen = 3; /* Reset the decoder to the Compound Text initial state (G1 = ASCII, G3 = ISO 8859-1). */ inp = resetseq; iconv( iso2022, NULL, NULL, NULL, NULL ); /* Bah. Several old implementations of iconv() declared the inbuf parameter as (const char **), but SUS says it's simply (char **). We cast the thing to (void *), which will keep them both happy. */ iconv( iso2022, (void *) &inp, &resetlen, &outp, &outlen ); iconv( iso2022, (void *) &in, &len, &outp, &outlen ); *outp++ = 0; assert( !utf8_illegal( (unsigned char *) out ) ); return xrealloc( out, outp - out ); } } #endif for( ; len; len-- ) if( *in & 0x80 ) { *outp++ = 0xC0 | ( (const unsigned char) *in >> 6 ); *outp++ = 0x80 | ( *in++ & 0x3F ); } else *outp++ = *in++; *outp++ = 0; assert( !utf8_illegal( (unsigned char *) out ) ); return xrealloc( out, outp - out ); } extern PURE unsigned char *utf8_illegal( const unsigned char *str ) { for(;;) if( !*str ) /* End of string. Everything was legal. */ return NULL; else if( !( str[ 0 ] & 0x80 ) ) /* Legal single byte character. */ str++; else if( ( str[ 0 ] >= 0x80 && str[ 0 ] <= 0xC1 ) || ( str[ 0 ] > 0xF4 ) ) /* Illegal continuation byte, long representation of single byte character, or overly long sequence. */ return (unsigned char *) str; else if( str[ 0 ] >= 0xC2 && str[ 0 ] <= 0xDF ) { /* Two byte sequence... */ if( str[ 1 ] < 0x80 || str[ 1 ] > 0xBF ) /* ...where byte 2 is illegal. */ return (unsigned char *) str + 1; else /* ...which is fully legal. */ str += 2; } else if( str[ 0 ] >= 0xE0 && str[ 0 ] <= 0xEF ) { /* Three byte sequence... */ if( str[ 1 ] < 0x80 || str[ 1 ] > 0xBF || ( str[ 0 ] == 0xE0 && str[ 1 ] < 0xA0 ) || ( str[ 0 ] == 0xED && str[ 1 ] > 0x9F ) ) /* ...where byte 2 is illegal. */ return (unsigned char *) str + 1; else if( str[ 2 ] < 0x80 || str[ 2 ] > 0xBF ) /* ...where byte 3 is illegal. */ return (unsigned char *) str + 2; else /* ...which is fully legal. */ str += 3; } else { assert( str[ 0 ] >= 0xF0 && str[ 0 ] <= 0xF4 ); /* Four byte sequence... */ if( str[ 1 ] < 0x80 || str[ 1 ] > 0xBF || ( str[ 0 ] == 0xF0 && str[ 1 ] < 0x90 ) || ( str[ 0 ] == 0xF4 && str[ 1 ] > 0x8F ) ) /* ...where byte 2 is illegal. */ return (unsigned char *) str + 1; else if( str[ 2 ] < 0x80 || str[ 2 ] > 0xBF ) /* ...where byte 3 is illegal. */ return (unsigned char *) str + 2; else if( str[ 3 ] < 0x80 || str[ 3 ] > 0xBF ) /* ...where byte 4 is illegal. */ return (unsigned char *) str + 3; else /* ...which is fully legal. */ str += 4; } } extern PURE int utf8_length( const unsigned char *str ) { int len; assert( !utf8_illegal( str ) ); for( len = 0; *str; len++ ) if( *str < 0x80 ) str++; else if( *str < 0xE0 ) str += 2; else if( *str < 0xF0 ) str += 3; else str += 4; return len; } static MALLOC unsigned char *dup_valid_common( const unsigned char *str, int len ) { const unsigned char *p; unsigned char *out, *outp; outp = out = xmalloc( len + 1 ); p = str; for(;;) if( outp == out + len ) { /* End of string. */ *outp = 0; assert( !utf8_illegal( out ) ); return out; } else if( !( p[ 0 ] & 0x80 ) ) /* Legal single byte character. */ *outp++ = *p++; else if( *p >= 0xC2 && *p <= 0xDF && p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF ) { /* Legal two byte character. */ *outp++ = *p++; *outp++ = *p++; } else if( *p >= 0xE0 && *p <= 0xEF && p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF && p[ 2 ] >= 0x80 && p[ 2 ] <= 0xBF && ( *p > 0xE0 || p[ 1 ] > 0x9F ) && ( *p != 0xED || p[ 1 ] < 0xA0 ) ) { /* Legal three byte character. */ *outp++ = *p++; *outp++ = *p++; *outp++ = *p++; } else if( *p >= 0xF0 && *p <= 0xF4 && p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF && p[ 2 ] >= 0x80 && p[ 2 ] <= 0xBF && p[ 3 ] >= 0x80 && p[ 3 ] <= 0xBF && ( *p > 0xF0 || p[ 1 ] > 0x8F ) && ( *p != 0xF4 || p[ 1 ] < 0x90 ) ) { /* Legal four byte character. */ *outp++ = *p++; *outp++ = *p++; *outp++ = *p++; *outp++ = *p++; } else /* Illegal character: ignore this byte and continue. */ p++; } extern MALLOC unsigned char *utf8_dup_valid_len( const unsigned char *str, int num_bytes ) { int len; const unsigned char *p; len = 0; p = str; for(;;) if( !num_bytes ) /* End of string. */ break; else if( !( p[ 0 ] & 0x80 ) ) { /* Legal single byte character. */ len++; p++; num_bytes--; } else if( num_bytes >= 2 && *p >= 0xC2 && *p <= 0xDF && p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF ) { /* Legal two byte character. */ len += 2; p += 2; num_bytes -= 2; } else if( num_bytes >= 3 && *p >= 0xE0 && *p <= 0xEF && p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF && p[ 2 ] >= 0x80 && p[ 2 ] <= 0xBF && ( *p > 0xE0 || p[ 1 ] > 0x9F ) && ( *p != 0xED || p[ 1 ] < 0xA0 ) ) { /* Legal three byte character. */ len += 3; p += 3; num_bytes -= 3; } else if( num_bytes >= 4 && *p >= 0xF0 && *p <= 0xF4 && p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF && p[ 2 ] >= 0x80 && p[ 2 ] <= 0xBF && p[ 3 ] >= 0x80 && p[ 3 ] <= 0xBF && ( *p > 0xF0 || p[ 1 ] > 0x8F ) && ( *p != 0xF4 || p[ 1 ] < 0x90 ) ) { /* Legal four byte character. */ len += 4; p += 4; num_bytes -= 4; } else { /* Illegal character: ignore this byte and continue. */ p++; num_bytes--; } return dup_valid_common( str, len ); } extern MALLOC unsigned char *utf8_dup_valid( const unsigned char *str ) { int len; const unsigned char *p; len = 0; p = str; for(;;) if( !p[ 0 ] ) /* End of string. */ break; else if( !( p[ 0 ] & 0x80 ) ) { /* Legal single byte character. */ len++; p++; } else if( *p >= 0xC2 && *p <= 0xDF && p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF ) { /* Legal two byte character. */ len += 2; p += 2; } else if( *p >= 0xE0 && *p <= 0xEF && p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF && p[ 2 ] >= 0x80 && p[ 2 ] <= 0xBF && ( *p > 0xE0 || p[ 1 ] > 0x9F ) && ( *p != 0xED || p[ 1 ] < 0xA0 ) ) { /* Legal three byte character. */ len += 3; p += 3; } else if( *p >= 0xF0 && *p <= 0xF4 && p[ 1 ] >= 0x80 && p[ 1 ] <= 0xBF && p[ 2 ] >= 0x80 && p[ 2 ] <= 0xBF && p[ 3 ] >= 0x80 && p[ 3 ] <= 0xBF && ( *p > 0xF0 || p[ 1 ] > 0x8F ) && ( *p != 0xF4 || p[ 1 ] < 0x90 ) ) { /* Legal four byte character. */ len += 4; p += 4; } else /* Illegal character: ignore this byte and continue. */ p++; return dup_valid_common( str, len ); } extern uint32_t utf8_next( const unsigned char **p ) { uint32_t n; const unsigned char *c = *p; assert( c[ 0 ] < 0x80 || ( c[ 0 ] >= 0xC2 && c[ 0 ] <= 0xDF && c[ 1 ] >= 0x80 && c[ 1 ] <= 0xBF ) || ( c[ 0 ] >= 0xE0 && c[ 0 ] <= 0xEF && c[ 1 ] >= 0x80 && c[ 1 ] <= 0xBF && c[ 2 ] >= 0x80 && c[ 2 ] <= 0xBF && ( c[ 0 ] > 0xE0 || c[ 1 ] > 0x9F ) && ( c[ 0 ] != 0xED || c[ 1 ] < 0xA0 ) ) || ( c[ 0 ] >= 0xF0 && c[ 0 ] <= 0xF4 && c[ 1 ] >= 0x80 && c[ 1 ] <= 0xBF && c[ 2 ] >= 0x80 && c[ 2 ] <= 0xBF && c[ 3 ] >= 0x80 && c[ 3 ] <= 0xBF && ( c[ 0 ] > 0xF0 || c[ 1 ] > 0x8F ) && ( c[ 0 ] != 0xF4 || c[ 1 ] < 0x90 ) ) ); if( !*c ) return 0; if( c[ 0 ] < 0x80 ) { n = c[ 0 ]; ( *p )++; } else if( c[ 0 ] < 0xE0 ) { n = ( ( c[ 0 ] & 0x1F ) << 6 ) | ( c[ 1 ] & 0x3F ); *p += 2; } else if( c[ 0 ] < 0xF0 ) { n = ( ( c[ 0 ] & 0x0F ) << 12 ) | ( ( c[ 1 ] & 0x3F ) << 6 ) | ( c[ 2 ] & 0x3F ); *p += 3; } else { n = ( ( c[ 0 ] & 0x07 ) << 18 ) | ( ( c[ 1 ] & 0x3F ) << 12 ) | ( ( c[ 2 ] & 0x3F ) << 6 ) | ( c[ 3 ] & 0x3F ); *p += 4; } return n; } extern void cleanup_utf8( void ) { #if HAVE_ICONV if( tried_iso2022 && iso2022 != (iconv_t) -1 ) iconv_close( iso2022 ); #endif }