/*
 * utf8.c
 *
 * Part of gwm, the Gratuitous Window Manager,
 *     by Gary Wong, <gtw@gnu.org>.
 *
 * Copyright (C) 2009  Gary Wong
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of version 3 of the GNU General Public License as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * $Id$
 */

#include <config.h>

#include <assert.h>
#if HAVE_ICONV_H
#include <iconv.h>
#endif
#include <string.h>
#include <xcb/xcb.h>

#include "gwm.h"

#include "utf8.h"

#if HAVE_ICONV
/* Conversion descriptor used by to_utf8() to decode ISO-2022-JP-2 input
   into UTF-8.  It is opened on the first Compound Text conversion and
   holds (iconv_t) -1 if that iconv_open() call failed; cleanup_utf8()
   closes it. */
iconv_t iso2022;
/* Nonzero once to_utf8() has attempted to open iso2022 (whether or not
   the attempt succeeded), so that the open is tried only once. */
int tried_iso2022;
#endif

/* Convert the LEN bytes at IN from ENCODING to UTF-8, returning a newly
   allocated NUL-terminated string.  Passing (size_t) -1 as LEN means
   that IN is NUL-terminated and should be measured with strlen().

   When iconv is available and ENCODING is ENCODING_COMPOUND, the input
   is decoded with the shared ISO-2022-JP-2 descriptor (opened on first
   use).  In every other case -- including when that descriptor could
   not be opened -- each input byte is treated as an ISO 8859-1
   character.

   The result comes from xmalloc()/xrealloc(); presumably the caller is
   responsible for releasing it. */
extern char *to_utf8( enum gwm_encoding encoding, const char *in,
		      size_t len ) {

    size_t outlen;
    char *out, *outp;
    
    /* len is unsigned, so a request to measure the string arrives as
       (size_t) -1; a "len < 0" test could never be true. */
    if( len == (size_t) -1 )
	len = strlen( in );

    /* Budget four output bytes per input byte for Compound Text, and
       two (the ISO 8859-1 worst case) for everything else. */
#if HAVE_ICONV
    outlen = len << ( encoding == ENCODING_COMPOUND ? 2 : 1 );
#else
    outlen = len << 1;
#endif
    
    /* One extra byte for the terminating NUL. */
    outp = out = xmalloc( outlen + 1 );
    
#if HAVE_ICONV
    if( encoding == ENCODING_COMPOUND ) {
	if( !tried_iso2022 ) {
	    /* Open the descriptor once only, even if the attempt fails. */
	    iso2022 = iconv_open( "UTF-8", "ISO-2022-JP-2" );
	    tried_iso2022 = TRUE;
	}

	if( iso2022 != (iconv_t) -1 ) {
	    static const char resetseq[ 3 ] = "\x1B\x2D\x41";
	    const char *inp;
	    size_t resetlen = 3;

	    /* Reset the decoder to the Compound Text initial state
	       (G1 = ASCII, G3 = ISO 8859-1). */
	    inp = resetseq;
	    iconv( iso2022, NULL, NULL, NULL, NULL );
	    /* Bah.  Several old implementations of iconv() declared
	       the inbuf parameter as (const char **), but SUS says
	       it's simply (char **).  We cast the thing to (void *),
	       which will keep them both happy. */
	    iconv( iso2022, (void *) &inp, &resetlen, &outp, &outlen );

	    /* The return value is deliberately not inspected: if the
	       conversion stops early, whatever was produced up to that
	       point is what gets returned. */
	    iconv( iso2022, (void *) &in, &len, &outp, &outlen );

	    *outp++ = 0;

	    assert( !utf8_illegal( (unsigned char *) out ) );
	    
	    /* Trim the buffer to the bytes actually written. */
	    return xrealloc( out, outp - out );
	}
    }
#endif

    /* ISO 8859-1 fallback: bytes with the top bit set are U+0080 to
       U+00FF and become a two byte sequence; all others are copied. */
    for( ; len; len-- )
	if( *in & 0x80 ) {
	    *outp++ = 0xC0 | ( (const unsigned char) *in >> 6 );
	    *outp++ = 0x80 | ( *in++ & 0x3F );
	} else
	    *outp++ = *in++;

    *outp++ = 0;

    assert( !utf8_illegal( (unsigned char *) out ) );
    
    /* Trim the buffer to the bytes actually written. */
    return xrealloc( out, outp - out );
}

/* Look for the first byte that stops the NUL-terminated string STR from
   being well-formed UTF-8.  Returns a pointer to the offending byte, or
   NULL if the whole string is legal. */
extern PURE unsigned char *utf8_illegal( const unsigned char *str ) {

    const unsigned char *s = str;

    while( *s ) {
	unsigned char lead = *s;
	/* Permitted range for the second byte; narrowed below for the
	   lead bytes which have extra restrictions. */
	unsigned char lo = 0x80, hi = 0xBF;
	int trail, i;

	if( lead < 0x80 ) {
	    /* Legal single byte character. */
	    s++;
	    continue;
	}

	if( lead <= 0xC1 || lead > 0xF4 )
	    /* Stray continuation byte, long representation of a single
	       byte character, or overly long sequence. */
	    return (unsigned char *) s;

	/* How many continuation bytes must follow the lead byte? */
	if( lead <= 0xDF )
	    trail = 1;
	else if( lead <= 0xEF )
	    trail = 2;
	else {
	    assert( lead >= 0xF0 && lead <= 0xF4 );
	    trail = 3;
	}

	switch( lead ) {
	case 0xE0:
	    /* Anything lower would be an overlong three byte form. */
	    lo = 0xA0;
	    break;
	case 0xED:
	    /* Anything higher would encode a surrogate. */
	    hi = 0x9F;
	    break;
	case 0xF0:
	    /* Anything lower would be an overlong four byte form. */
	    lo = 0x90;
	    break;
	case 0xF4:
	    /* Anything higher would lie beyond U+10FFFF. */
	    hi = 0x8F;
	    break;
	default:
	    break;
	}

	if( s[ 1 ] < lo || s[ 1 ] > hi )
	    /* Byte 2 is illegal. */
	    return (unsigned char *) s + 1;

	/* Each later byte is examined only once its predecessors have
	   passed, so we never look beyond a terminating NUL. */
	for( i = 2; i <= trail; i++ )
	    if( s[ i ] < 0x80 || s[ i ] > 0xBF )
		return (unsigned char *) s + i;

	/* The sequence is fully legal. */
	s += trail + 1;
    }

    /* End of string.  Everything was legal. */
    return NULL;
}

/* Count the characters (not bytes) in the NUL-terminated string STR,
   which must already be legal UTF-8 (this is asserted). */
extern PURE int utf8_length( const unsigned char *str ) {

    const unsigned char *cursor = str;
    int count = 0;

    assert( !utf8_illegal( str ) );

    while( *cursor ) {
	unsigned char lead = *cursor;

	/* The lead byte alone tells us how long the sequence is. */
	cursor += lead < 0x80 ? 1 : lead < 0xE0 ? 2 : lead < 0xF0 ? 3 : 4;
	count++;
    }

    return count;
}

/* Copy the well-formed UTF-8 sequences found in STR into a newly
   allocated, NUL-terminated buffer, skipping every byte that does not
   belong to one.  LEN is the number of bytes the copy will contain and
   must have been counted by the caller using the same rules: reaching
   exactly LEN output bytes is the only thing which ends the scan. */
static MALLOC unsigned char *dup_valid_common( const unsigned char *str,
					       int len ) {

    unsigned char *copy = xmalloc( len + 1 );
    unsigned char *dst = copy;
    unsigned char *const end = copy + len;
    const unsigned char *src = str;

    while( dst != end ) {
	int n;

	if( src[ 0 ] < 0x80 )
	    /* Legal single byte character. */
	    n = 1;
	else if( src[ 0 ] >= 0xC2 && src[ 0 ] <= 0xDF &&
		 src[ 1 ] >= 0x80 && src[ 1 ] <= 0xBF )
	    /* Legal two byte character. */
	    n = 2;
	else if( src[ 0 ] >= 0xE0 && src[ 0 ] <= 0xEF &&
		 src[ 1 ] >= 0x80 && src[ 1 ] <= 0xBF &&
		 src[ 2 ] >= 0x80 && src[ 2 ] <= 0xBF &&
		 !( src[ 0 ] == 0xE0 && src[ 1 ] < 0xA0 ) &&
		 !( src[ 0 ] == 0xED && src[ 1 ] > 0x9F ) )
	    /* Legal three byte character. */
	    n = 3;
	else if( src[ 0 ] >= 0xF0 && src[ 0 ] <= 0xF4 &&
		 src[ 1 ] >= 0x80 && src[ 1 ] <= 0xBF &&
		 src[ 2 ] >= 0x80 && src[ 2 ] <= 0xBF &&
		 src[ 3 ] >= 0x80 && src[ 3 ] <= 0xBF &&
		 !( src[ 0 ] == 0xF0 && src[ 1 ] < 0x90 ) &&
		 !( src[ 0 ] == 0xF4 && src[ 1 ] > 0x8F ) )
	    /* Legal four byte character. */
	    n = 4;
	else {
	    /* Illegal character: ignore this byte and continue. */
	    src++;
	    continue;
	}

	while( n-- )
	    *dst++ = *src++;
    }

    /* End of string. */
    *dst = 0;
    assert( !utf8_illegal( copy ) );

    return copy;
}

/* Return a newly allocated, NUL-terminated copy of the first NUM_BYTES
   bytes of STR, with every byte that is not part of a well-formed
   UTF-8 sequence removed.  A sequence which would run past NUM_BYTES
   is treated as ill-formed.  NUL bytes inside the range count as
   ordinary single byte characters and are copied too.

   A zero or negative NUM_BYTES yields an empty string. */
extern MALLOC unsigned char *utf8_dup_valid_len( const unsigned char *str,
						 int num_bytes ) {

    const unsigned char *src = str;
    int remaining = num_bytes;
    int valid = 0;

    /* First pass: count the bytes which will survive.  Testing for
       "> 0" rather than "!= 0" keeps a negative count from sending
       the scan off the end of the input. */
    while( remaining > 0 ) {
	int n;

	if( src[ 0 ] < 0x80 )
	    /* Legal single byte character. */
	    n = 1;
	else if( remaining >= 2 &&
		 src[ 0 ] >= 0xC2 && src[ 0 ] <= 0xDF &&
		 src[ 1 ] >= 0x80 && src[ 1 ] <= 0xBF )
	    /* Legal two byte character. */
	    n = 2;
	else if( remaining >= 3 &&
		 src[ 0 ] >= 0xE0 && src[ 0 ] <= 0xEF &&
		 src[ 1 ] >= 0x80 && src[ 1 ] <= 0xBF &&
		 src[ 2 ] >= 0x80 && src[ 2 ] <= 0xBF &&
		 !( src[ 0 ] == 0xE0 && src[ 1 ] < 0xA0 ) &&
		 !( src[ 0 ] == 0xED && src[ 1 ] > 0x9F ) )
	    /* Legal three byte character. */
	    n = 3;
	else if( remaining >= 4 &&
		 src[ 0 ] >= 0xF0 && src[ 0 ] <= 0xF4 &&
		 src[ 1 ] >= 0x80 && src[ 1 ] <= 0xBF &&
		 src[ 2 ] >= 0x80 && src[ 2 ] <= 0xBF &&
		 src[ 3 ] >= 0x80 && src[ 3 ] <= 0xBF &&
		 !( src[ 0 ] == 0xF0 && src[ 1 ] < 0x90 ) &&
		 !( src[ 0 ] == 0xF4 && src[ 1 ] > 0x8F ) )
	    /* Legal four byte character. */
	    n = 4;
	else {
	    /* Illegal character: ignore this byte and continue. */
	    src++;
	    remaining--;
	    continue;
	}

	valid += n;
	src += n;
	remaining -= n;
    }

    /* Second pass: allocate and copy exactly that many bytes. */
    return dup_valid_common( str, valid );
}

/* Return a newly allocated copy of the NUL-terminated string STR with
   every byte that is not part of a well-formed UTF-8 sequence
   removed. */
extern MALLOC unsigned char *utf8_dup_valid( const unsigned char *str ) {

    const unsigned char *scan = str;
    int valid = 0;

    /* First pass: count the bytes which will survive.  A NUL can never
       satisfy a continuation byte test, so the look-ahead below stops
       at the terminator. */
    while( scan[ 0 ] ) {
	int n;

	if( scan[ 0 ] < 0x80 )
	    /* Legal single byte character. */
	    n = 1;
	else if( scan[ 0 ] >= 0xC2 && scan[ 0 ] <= 0xDF &&
		 scan[ 1 ] >= 0x80 && scan[ 1 ] <= 0xBF )
	    /* Legal two byte character. */
	    n = 2;
	else if( scan[ 0 ] >= 0xE0 && scan[ 0 ] <= 0xEF &&
		 scan[ 1 ] >= 0x80 && scan[ 1 ] <= 0xBF &&
		 scan[ 2 ] >= 0x80 && scan[ 2 ] <= 0xBF &&
		 !( scan[ 0 ] == 0xE0 && scan[ 1 ] < 0xA0 ) &&
		 !( scan[ 0 ] == 0xED && scan[ 1 ] > 0x9F ) )
	    /* Legal three byte character. */
	    n = 3;
	else if( scan[ 0 ] >= 0xF0 && scan[ 0 ] <= 0xF4 &&
		 scan[ 1 ] >= 0x80 && scan[ 1 ] <= 0xBF &&
		 scan[ 2 ] >= 0x80 && scan[ 2 ] <= 0xBF &&
		 scan[ 3 ] >= 0x80 && scan[ 3 ] <= 0xBF &&
		 !( scan[ 0 ] == 0xF0 && scan[ 1 ] < 0x90 ) &&
		 !( scan[ 0 ] == 0xF4 && scan[ 1 ] > 0x8F ) )
	    /* Legal four byte character. */
	    n = 4;
	else {
	    /* Illegal character: ignore this byte and continue. */
	    scan++;
	    continue;
	}

	valid += n;
	scan += n;
    }

    /* Second pass: allocate and copy exactly that many bytes. */
    return dup_valid_common( str, valid );
}

/* Decode the UTF-8 character at *P and return its code point, moving
   *P on to the start of the following character.  At the terminating
   NUL, 0 is returned and *P is left where it is.  The input must be
   well-formed UTF-8; this is asserted but not otherwise checked. */
extern uint32_t utf8_next( const unsigned char **p ) {

    const unsigned char *s = *p;
    unsigned char lead = s[ 0 ];
    uint32_t code;
    int trail, i;

    assert( lead < 0x80 ||
	    ( lead >= 0xC2 && lead <= 0xDF &&
	      s[ 1 ] >= 0x80 && s[ 1 ] <= 0xBF ) ||
	    ( lead >= 0xE0 && lead <= 0xEF &&
	      s[ 1 ] >= 0x80 && s[ 1 ] <= 0xBF &&
	      s[ 2 ] >= 0x80 && s[ 2 ] <= 0xBF &&
	      !( lead == 0xE0 && s[ 1 ] < 0xA0 ) &&
	      !( lead == 0xED && s[ 1 ] > 0x9F ) ) ||
	    ( lead >= 0xF0 && lead <= 0xF4 &&
	      s[ 1 ] >= 0x80 && s[ 1 ] <= 0xBF &&
	      s[ 2 ] >= 0x80 && s[ 2 ] <= 0xBF &&
	      s[ 3 ] >= 0x80 && s[ 3 ] <= 0xBF &&
	      !( lead == 0xF0 && s[ 1 ] < 0x90 ) &&
	      !( lead == 0xF4 && s[ 1 ] > 0x8F ) ) );

    if( lead == 0 )
	/* End of string: do not step over the terminator. */
	return 0;

    /* The lead byte supplies the high order payload bits and fixes the
       number of continuation bytes. */
    if( lead < 0x80 ) {
	code = lead;
	trail = 0;
    } else if( lead < 0xE0 ) {
	code = lead & 0x1F;
	trail = 1;
    } else if( lead < 0xF0 ) {
	code = lead & 0x0F;
	trail = 2;
    } else {
	code = lead & 0x07;
	trail = 3;
    }

    /* Every continuation byte contributes six more bits. */
    for( i = 1; i <= trail; i++ )
	code = ( code << 6 ) | ( s[ i ] & 0x3F );

    *p = s + trail + 1;

    return code;
}

/* Release the iconv descriptor which to_utf8() may have opened, and
   return the module to its initial state.  This makes a repeated call
   harmless and lets a later to_utf8() reopen the descriptor instead of
   using one which has already been closed. */
extern void cleanup_utf8( void ) {
    
#if HAVE_ICONV
    if( tried_iso2022 && iso2022 != (iconv_t) -1 ) {
	iconv_close( iso2022 );
	/* Never leave a closed descriptor lying around. */
	iso2022 = (iconv_t) -1;
    }

    /* Forget the earlier attempt, so the next Compound Text conversion
       will try iconv_open() afresh. */
    tried_iso2022 = 0;
#endif
}