/* Oort
 * Copyright 2007, Soren Sandmann Pedersen
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#include <glib.h>
#include <string.h>
#include <stdlib.h>
#include "ast.h"

/* One entry of the fixed-spelling table: the exact text of a reserved
 * word or operator, and the token type emitted when that text is matched.
 * The spelling is stored inline, so it is limited to 11 characters plus
 * the terminating NUL.
 */
typedef struct keyword_t
{
    const char		keyword[12];
    token_type_t	type;
} keyword_t;

/* Table of every fixed spelling the scanner recognizes (reserved words,
 * punctuation and operators) together with the token type it produces.
 *
 * scan() tries each entry in turn with check_prefix(), which only accepts
 * a match that is strictly longer than the best one found so far.  The
 * longest matching spelling therefore wins regardless of where it sits in
 * the table (e.g. "<<=" beats "<<", which beats "<").
 */
const keyword_t keywords[] =
{
    { "while",		TOKEN_WHILE },
    { "{",		TOKEN_LBRACE },
    { "}",		TOKEN_RBRACE },
    { "=",		TOKEN_ASSIGN },
    { ";",		TOKEN_SEMICOLON },
    { "(",		TOKEN_LPAREN },
    { ")",		TOKEN_RPAREN },
    { ".",		TOKEN_DOT },
    { "..",		TOKEN_DOT_DOT },
    { ",",		TOKEN_COMMA },
    { "for",		TOKEN_FOR },
    { "goto",		TOKEN_GOTO },
    { "break",		TOKEN_BREAK },
    { "continue",	TOKEN_CONTINUE },
    { "class",		TOKEN_CLASS },
    { "this",		TOKEN_THIS },
    { "virtual",	TOKEN_VIRTUAL },
    { "match",		TOKEN_MATCH },
    { "struct",		TOKEN_STRUCT },
    { "new",		TOKEN_NEW },
    { "+",		TOKEN_PLUS },
    { "-",		TOKEN_MINUS },
    { "*",		TOKEN_TIMES },
    { "/",		TOKEN_DIVIDE },
    { "+=",		TOKEN_PLUS_EQ },
    { "-=",		TOKEN_MINUS_EQ },
    { "*=",		TOKEN_TIMES_EQ },
    { "%=",		TOKEN_MOD_EQ },
    { "/=",		TOKEN_DIVIDE_EQ },
    { ">>=",		TOKEN_RSHIFT_EQ },
    { "<<=",		TOKEN_LSHIFT_EQ },
    { "&=",		TOKEN_BAND_EQ },
    { "|=",		TOKEN_BOR_EQ },
    { "^=",		TOKEN_BXOR_EQ },
    { "print",		TOKEN_PRINT },
    { "<<",		TOKEN_LSHIFT },
    { ">>",		TOKEN_RSHIFT },
    { "[",		TOKEN_LBRACK },
    { "]",		TOKEN_RBRACK },
    { ":",		TOKEN_COLON },
    { "if",		TOKEN_IF },
    { "else",		TOKEN_ELSE },
    { "return",		TOKEN_RETURN },
    { "&&",		TOKEN_AND },
    { "||",		TOKEN_OR },
    { "^^",		TOKEN_XOR },
    { "!",		TOKEN_NOT },
    { "&",		TOKEN_BAND },
    { "|",		TOKEN_BOR },
    { "^",		TOKEN_BXOR },
    { "~",		TOKEN_BNEG },
    { "null",		TOKEN_NULL },
    { "enum",		TOKEN_ENUM },
    { "const",		TOKEN_CONST },
    { ">",		TOKEN_GT },
    { "<",		TOKEN_LT },
    { ">=",		TOKEN_GTE },
    { "<=",		TOKEN_LTE },
    { "!=",		TOKEN_NOT_EQ },
    { "==",		TOKEN_EQ },
    { "int64",		TOKEN_INT64 },
    { "uint64",		TOKEN_UINT64 },
    { "int32",		TOKEN_INT32 },
    { "uint32",		TOKEN_UINT32 },
    { "int16",		TOKEN_INT16 },
    { "uint16",		TOKEN_UINT16 },
    { "int8",		TOKEN_INT8 },
    { "uint8",		TOKEN_UINT8 },
    { "double",		TOKEN_DOUBLE },
    { "float",		TOKEN_FLOAT },
    { "string",		TOKEN_STRING },
    { "->",		TOKEN_RARROW },
    { "<-",		TOKEN_LARROW },
    { "++",		TOKEN_INC },
    { "--",		TOKEN_DEC },
    { "%",		TOKEN_MOD },
    { "?",		TOKEN_QUESTION },
    { "true",		TOKEN_TRUE },
    { "false",		TOKEN_FALSE },
    { "throw",		TOKEN_THROW },
    { "try",		TOKEN_TRY },
    { "catch",		TOKEN_CATCH },
    { "finally",	TOKEN_FINALLY },
    { "typedef",	TOKEN_TYPEDEF },
    { "do",		TOKEN_DO },
    { "bool",		TOKEN_BOOL },
    { "switch",		TOKEN_SWITCH },
    { "case",		TOKEN_CASE },
    { "default",	TOKEN_DEFAULT },
    { "with",		TOKEN_WITH },
    { "fn",		TOKEN_FN },
    { "as",		TOKEN_AS },
    { "is",		TOKEN_IS },
    { "in",		TOKEN_IN },
    { "@",		TOKEN_AT },
    { "var",		TOKEN_VAR },
    { "void",		TOKEN_VOID },
    { "operator",	TOKEN_OPERATOR },
    { "public",		TOKEN_PUBLIC },
    { "private",	TOKEN_PRIVATE },
    { "array",		TOKEN_ARRAY },
    { "yield",		TOKEN_YIELD },
    { "override",	TOKEN_OVERRIDE },
    { "label",		TOKEN_LABEL },
};

/* Record INPUT as the new best match end if it lies strictly beyond
 * the current one.
 *
 * input: candidate end-of-match pointer
 * best:  in/out, furthest end-of-match found so far
 *
 * Returns TRUE when *best was advanced, FALSE when it was left alone
 * (including when the candidate merely ties the current best).
 */
static gboolean
check_best (const char *input, const char **best)
{
    gboolean improved = input > *best;

    if (improved)
	*best = input;

    return improved;
}

/* Try to match the literal text PREFIX at the start of INPUT.
 *
 * On a match the candidate end is the position just after the prefix;
 * otherwise the candidate is INPUT itself, which check_best() accepts
 * only if it lies beyond *best.
 *
 * Returns TRUE when *best was advanced.
 */
static gboolean
check_prefix (const char *input, const char *prefix, const char **best)
{
    int len = strlen (prefix);
    const char *after = input;

    if (strncmp (input, prefix, len) == 0)
	after = input + len;

    return check_best (after, best);
}

/* Try to match an identifier at IN: one ASCII letter followed by any
 * number of ASCII letters, digits and underscores.  (A leading
 * underscore or digit does not start an identifier.)
 *
 * Returns TRUE when the match ends beyond *best, which is then updated.
 */
static gboolean
check_identifier (const char *in, const char **best)
{
    const char *cursor = in;

    if (g_ascii_isalpha (*cursor))
    {
	/* Consume the first letter, then the tail characters. */
	do
	    ++cursor;
	while (g_ascii_isalnum (*cursor) || *cursor == '_');
    }

    return check_best (cursor, best);
}

/* Try to match a run of ASCII whitespace at IN.
 *
 * Returns TRUE when the run ends beyond *best, which is then updated.
 */
static gboolean
check_whitespace (const char *in, const char **best)
{
    const char *cursor;

    for (cursor = in; g_ascii_isspace (*cursor); ++cursor)
	;

    return check_best (cursor, best);
}

/* Try to match a comment at IN.
 *
 * Two forms are recognized:
 *   - C comments, from the opening slash-star up to and including the
 *     first star-slash.  An unterminated C comment is not matched.
 *   - C++ comments, from "//" up to and including the next newline, or
 *     up to the end of the input when the last line has no newline.
 *
 * Returns TRUE when a comment was matched and it ends beyond *best,
 * which is then updated; FALSE otherwise.
 */
static gboolean
check_comment (const char *in, const char **best)
{
    const char *end;

    /* C comment */
    if (strncmp (in, "/*", 2) == 0)
    {
	end = strstr (in + 2, "*/");

	if (end)
	    return check_best (end + 2, best);
    }

    /* C++ comment */
    if (strncmp (in, "//", 2) == 0)
    {
	end = strchr (in + 2, '\n');

	if (end)
	    end += 1;
	else
	    /* Comment on the final line of a file that lacks a trailing
	     * newline: it runs to the end of the input instead of being
	     * tokenized as code.
	     */
	    end = in + strlen (in);

	return check_best (end, best);
    }

    return FALSE;
}

/* Try to match a double-quoted string literal at IN.
 *
 * A backslash escapes the character that follows it, so an escaped
 * quote does not end the literal.  A literal that is still open at
 * the end of the input is not matched.
 *
 * Returns TRUE when a complete literal was found and it ends beyond
 * *best, which is then updated; FALSE otherwise.
 */
static gboolean
check_string_literal (const char *in, const char **best)
{
    const char *cursor = in;
    gboolean escaped = FALSE;

    if (*cursor != '"')
	return FALSE;

    for (++cursor; *cursor != '\0'; ++cursor)
    {
	if (escaped)
	    escaped = FALSE;
	else if (*cursor == '\\')
	    escaped = TRUE;
	else if (*cursor == '"')
	    return check_best (cursor + 1, best);	/* past closing quote */
    }

    return FALSE;
}

/* Digit alphabets for the integer literal forms; scan() passes one of
 * these to check_number() as the set of characters allowed after the
 * literal's prefix.
 */
#define HEX_DIGITS "0123456789abcdefABCDEF"
#define DECIMAL_DIGITS "0123456789"
#define BINARY_DIGITS "01"
#define OCTAL_DIGITS "01234567"

/* Try to match an integer literal at IN: the literal text PREFIX
 * (possibly empty) followed by one or more characters from DIGITS.
 *
 * in:     NUL-terminated input positioned at the candidate literal
 * prefix: required leading text, e.g. "0x"
 * digits: NUL-terminated set of characters accepted as digits
 * best:   in/out, furthest end-of-match found so far
 *
 * Returns TRUE when a literal was matched and it ends beyond *best,
 * which is then updated; FALSE otherwise.
 */
static gboolean
check_number (const char *in, const char *prefix,
	      const char *digits, const char **best)
{
    size_t prefix_len = strlen (prefix);

    if (strncmp (in, prefix, prefix_len) != 0)
	return FALSE;

    in += prefix_len;

    /* strchr() treats the terminating NUL as part of the searched
     * string, so strchr (digits, '\0') is non-NULL.  The end of the
     * input must be tested explicitly or the scan would run past it.
     */
    if (*in == '\0' || !strchr (digits, *in))
	return FALSE;

    do
	in++;
    while (*in != '\0' && strchr (digits, *in));

    return check_best (in, best);
}

/* Try to match a floating point literal at IN: optional ASCII digits,
 * a mandatory '.', then optional ASCII digits.  Both "1." and ".5" have
 * this shape; so does a lone ".", which spans a single character.
 *
 * Returns FALSE when there is no '.' after the leading digits.
 * Otherwise returns TRUE when the match ends beyond *best, which is
 * then updated.
 */
static gboolean
check_float_literal (const char *in, const char **best)
{
    const char *cursor;

    /* Integer part */
    for (cursor = in; g_ascii_isdigit (*cursor); ++cursor)
	;

    if (*cursor != '.')
	return FALSE;

    /* Fractional part, starting just after the '.' */
    for (++cursor; g_ascii_isdigit (*cursor); ++cursor)
	;

    return check_best (cursor, best);
}

/* Decode the body of a string literal.
 *
 * start: points at the opening quote of the literal
 * end:   points just past the closing quote
 *
 * The characters strictly between the two quotes are copied, with
 * backslash escapes translated:
 *   \t        -> tab
 *   \n        -> newline
 *   \\        -> backslash
 *   \<newline> -> nothing (line continuation)
 *   \<other>  -> <other> unchanged (this covers \")
 *
 * Returns a newly allocated NUL-terminated string produced through a
 * GString; the caller owns it.  A span too short to hold both quotes
 * yields an empty string.
 */
static char *
unescape (const char *start, const char *end)
{
    GString *unescaped = g_string_new (NULL);
    gboolean escape = FALSE;
    const char *s;

    /* Nothing lies between the quotes unless the span is at least
     * two characters long.
     */
    if (end - start < 2)
	return g_string_free (unescaped, FALSE);

    for (s = start + 1; s < end - 1; ++s)
    {
	char c = *s;

	if (!escape)
	{
	    if (c == '\\')
		escape = TRUE;
	    else
		g_string_append_c (unescaped, c);

	    continue;
	}

	/* The escape applies to this character only, so clear the flag
	 * before any early exit below.
	 */
	escape = FALSE;

	switch (c)
	{
	case 't':
	    c = '\t';
	    break;

	case 'n':
	    c = '\n';
	    break;

	case '\n':
	    /* An escaped newline is simply stripped out */
	    continue;

	default:
	    /* Includes '\\' and '"': the character stands for itself */
	    break;
	}

	g_string_append_c (unescaped, c);
    }

    return g_string_free (unescaped, FALSE);
}

/* A location in the scanned text: LINE counts the newlines consumed so
 * far, POS counts the characters consumed since the last newline.
 */
typedef struct position_t
{
    int	line;
    int	pos;
} position_t;

/* Advance POS over the characters in [start, end).
 *
 * Each newline increments the line number and resets the column to 0;
 * every other character increments the column.
 */
static void
token_skip (const char *start, const char *end, position_t *pos)
{
    const char *cursor;

    for (cursor = start; cursor < end; ++cursor)
    {
	if (*cursor != '\n')
	{
	    pos->pos++;
	    continue;
	}

	pos->line++;
	pos->pos = 0;
    }
}

/* Build a token of TYPE from the input characters [begin, end).
 *
 * type:  token type chosen by the scanner
 * begin: first character of the token in the input
 * end:   one past the last character of the token
 * pos:   in/out, current location; advanced over the token's text
 *
 * The token's text is duplicated into common.string, its line and
 * begin/end columns are recorded, and common.err is cleared.  Depending
 * on TYPE a payload is filled in as well:
 *   - identifier:      name aliases common.string
 *   - integer literal: value parsed as hex ("0x"), binary ("0b"),
 *                      octal (leading "0") or decimal
 *   - float literal:   value parsed with strtod()
 *   - string literal:  value is the newly allocated unescaped contents
 *   - end of file:     common.string is replaced by "[EOF]"
 *
 * Numeric payloads are parsed from the NUL-terminated copy of the token
 * text rather than from the input, so the conversion cannot run past
 * the token (strtol() skips leading whitespace and strtod() accepts an
 * exponent, both of which could otherwise pull in following characters).
 */
static token_t
token_init (token_type_t type, const char *begin, const char *end,
	    position_t *pos)
{
    token_t token;
    const char *text;

    token.common.type = type;
    token.common.string = g_strndup (begin, end - begin);
    text = token.common.string;

    token.common.line = pos->line;
    token.common.begin_pos = pos->pos;

    token.common.err = 0;

    token_skip (begin, end, pos);

    token.common.end_pos = pos->pos;

    if (type == TOKEN_IDENTIFIER)
    {
	token.identifier.name = token.common.string;
    }
    else if (type == TOKEN_INT_LITERAL)
    {
	if (strncmp (text, "0x", 2) == 0)
	    token.int_literal.value = strtol (text + 2, NULL, 16);
	else if (strncmp (text, "0b", 2) == 0)
	    token.int_literal.value = strtol (text + 2, NULL, 2);
	else if (strncmp (text, "0", 1) == 0)
	    token.int_literal.value = strtol (text + 1, NULL, 8);
	else
	    token.int_literal.value = strtol (text, NULL, 10);
    }
    else if (type == TOKEN_FLOAT_LITERAL)
    {
	token.float_literal.value = strtod (text, NULL);
    }
    else if (type == TOKEN_STRING_LITERAL)
    {
	token.string_literal.value = unescape (begin, end);
    }
    else if (type == TOKEN_END_OF_FILE)
    {
	/* Release the copy made above before substituting the
	 * placeholder text, otherwise it is leaked.
	 */
	g_free (token.common.string);
	token.common.string = g_strdup ("[EOF]");
    }

    return token;
}

token_t *
scan (const char *input)
{
    GArray *tokens = g_array_new (TRUE, TRUE, sizeof (token_t));
    token_t token;
    position_t pos = { 0, 0 };

    while (*input)
    {
	token_type_t best_type = 0;
	const char *best = input;
	int i;

	/* Whitespace */
	if (check_whitespace (input, &best))
	    best_type = TOKEN_WHITESPACE;

	/* Comments */
	if (check_comment (input, &best))
	    best_type = TOKEN_WHITESPACE;

	/* Keywords */
	for (i = 0; i < G_N_ELEMENTS (keywords); ++i)
	{
	    if (check_prefix (input, keywords[i].keyword, &best))
		best_type = keywords[i].type;
	}

	/* Identifier */
	if (check_identifier (input, &best))
	    best_type = TOKEN_IDENTIFIER;

	/* String literal */
	if (check_string_literal (input, &best))
	    best_type = TOKEN_STRING_LITERAL;

	/* Integer literal */
	if (check_number (input, "0x", HEX_DIGITS, &best))
	    best_type = TOKEN_INT_LITERAL;

	if (check_number (input, "0b", BINARY_DIGITS, &best))
	    best_type = TOKEN_INT_LITERAL;

	/* Must be checked before decimal */
	if (check_number (input, "0", OCTAL_DIGITS, &best))
	    best_type = TOKEN_INT_LITERAL;

	if (check_number (input, "", DECIMAL_DIGITS, &best))
	    best_type = TOKEN_INT_LITERAL;

	/* Floating point literal */
	if (check_float_literal (input, &best))
	    best_type = TOKEN_FLOAT_LITERAL;

	/* Make token */
	if (best != input)
	{
	    if (best_type != TOKEN_WHITESPACE)
	    {
		token = token_init (best_type, input, best, &pos);

		g_array_append_val (tokens, token);
	    }
	    else
	    {
		token_skip (input, best, &pos);
	    }

	    input = best;
	}
	else
	{
	    report_error ("%d %d: Invalid input at %s\n",
			  pos.line, pos.pos, input);
	    return NULL;
	}
    }

    /* End of file token */
    token = token_init (TOKEN_END_OF_FILE, input, input + 1, &pos);
    g_array_append_val (tokens, token);

    return (token_t *)g_array_free (tokens, FALSE);
}