/* Oort * Copyright 2007, Soren Sandmann Pedersen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include "ast.h" typedef struct keyword_t keyword_t; struct keyword_t { const char keyword[12]; token_type_t type; }; const keyword_t keywords[] = { { "while", TOKEN_WHILE }, { "{", TOKEN_LBRACE }, { "}", TOKEN_RBRACE }, { "=", TOKEN_ASSIGN }, { ";", TOKEN_SEMICOLON }, { "(", TOKEN_LPAREN }, { ")", TOKEN_RPAREN }, { ".", TOKEN_DOT }, { "..", TOKEN_DOT_DOT }, { ",", TOKEN_COMMA }, { "for", TOKEN_FOR }, { "goto", TOKEN_GOTO }, { "break", TOKEN_BREAK }, { "continue", TOKEN_CONTINUE }, { "class", TOKEN_CLASS }, { "this", TOKEN_THIS }, { "virtual", TOKEN_VIRTUAL }, { "match", TOKEN_MATCH }, { "struct", TOKEN_STRUCT }, { "new", TOKEN_NEW }, { "+", TOKEN_PLUS }, { "-", TOKEN_MINUS }, { "*", TOKEN_TIMES }, { "/", TOKEN_DIVIDE }, { "+=", TOKEN_PLUS_EQ }, { "-=", TOKEN_MINUS_EQ }, { "*=", TOKEN_TIMES_EQ }, { "%=", TOKEN_MOD_EQ }, { "/=", TOKEN_DIVIDE_EQ }, { ">>=", TOKEN_RSHIFT_EQ }, { "<<=", TOKEN_LSHIFT_EQ }, { "&=", TOKEN_BAND_EQ }, { "|=", TOKEN_BOR_EQ }, { "^=", TOKEN_BXOR_EQ }, { "print", TOKEN_PRINT }, { "<<", TOKEN_LSHIFT }, { ">>", TOKEN_RSHIFT }, { "[", TOKEN_LBRACK }, { "]", TOKEN_RBRACK }, { ":", TOKEN_COLON }, { "if", TOKEN_IF }, { "else", TOKEN_ELSE }, { "return", TOKEN_RETURN }, { "&&", TOKEN_AND }, { "||", TOKEN_OR }, { "^^", TOKEN_XOR }, { "!", TOKEN_NOT }, { "&", TOKEN_BAND }, { "|", TOKEN_BOR }, { "^", TOKEN_BXOR }, { "~", TOKEN_BNEG }, { "null", TOKEN_NULL }, { "enum", TOKEN_ENUM }, { "const", TOKEN_CONST }, { ">", TOKEN_GT }, { "<", TOKEN_LT }, { ">=", TOKEN_GTE }, { "<=", TOKEN_LTE }, { "!=", TOKEN_NOT_EQ }, { "==", TOKEN_EQ }, { "int64", TOKEN_INT64 }, { "uint64", TOKEN_UINT64 }, { "int32", TOKEN_INT32 }, { "uint32", TOKEN_UINT32 }, { "int16", TOKEN_INT16 }, { "uint16", TOKEN_UINT16 }, { "int8", TOKEN_INT8 }, { "uint8", TOKEN_UINT8 }, { "double", TOKEN_DOUBLE }, { "float", TOKEN_FLOAT }, { "string", TOKEN_STRING }, { "->", TOKEN_RARROW }, { "<-", TOKEN_LARROW }, { "++", TOKEN_INC }, { "--", TOKEN_DEC }, { "%", TOKEN_MOD }, { "?", TOKEN_QUESTION }, { "true", TOKEN_TRUE }, { "false", TOKEN_FALSE }, { "throw", TOKEN_THROW }, { "try", TOKEN_TRY }, { "catch", TOKEN_CATCH }, { "finally", TOKEN_FINALLY }, { "typedef", TOKEN_TYPEDEF }, { "do", TOKEN_DO }, { "bool", TOKEN_BOOL }, { "switch", TOKEN_SWITCH }, { "case", TOKEN_CASE }, { "default", TOKEN_DEFAULT }, { "with", TOKEN_WITH }, { "fn", TOKEN_FN }, { "as", TOKEN_AS }, { "is", TOKEN_IS }, { "in", TOKEN_IN }, { "@", TOKEN_AT }, { "var", TOKEN_VAR }, { "void", TOKEN_VOID }, { "operator", TOKEN_OPERATOR }, { "public", TOKEN_PUBLIC }, { "private", TOKEN_PRIVATE }, { "array", TOKEN_ARRAY }, { "yield", TOKEN_YIELD }, { "override", TOKEN_OVERRIDE }, { "label", TOKEN_LABEL }, }; static gboolean check_best (const char *input, const char **best) { if (input > *best) { *best = input; return TRUE; } return FALSE; } static gboolean check_prefix (const char *input, const char *prefix, const char **best) { int n = strlen (prefix); if (strncmp (input, prefix, n) == 0) input += n; return check_best (input, best); } static gboolean check_identifier (const char *in, const char **best) { /* An identifier is an ascii letter followed by a * sequence of ascii letters, underscores and digits */ if (g_ascii_isalpha (*in)) { ++in; while (g_ascii_isalnum (*in) || *in == '_') ++in; } return check_best (in, best); } static gboolean check_whitespace (const char *in, const char **best) { while (g_ascii_isspace (*in)) in++; return check_best (in, best); } static gboolean check_comment (const char *in, const char **best) { const char *tmp; /* C comment */ tmp = in; if (strncmp (tmp, "/*", 2) == 0) { tmp += 2; if ((tmp = strstr (tmp, "*/"))) { tmp += 2; return check_best (tmp, best); } } /* C++ comment */ tmp = in; if (strncmp (tmp, "//", 2) == 0) { tmp += 2; if ((tmp = strstr (tmp, "\n"))) { tmp += 1; return check_best (tmp, best); } } return FALSE; } static gboolean check_string_literal (const char *in, const char **best) { gboolean escape = FALSE; char c; if (*in++ == '"') { while ((c = *in++)) { if (escape) { escape = FALSE; } else { if (c == '\\') escape = TRUE; else if (c == '"') return check_best (in, best); } } } return FALSE; } #define HEX_DIGITS "0123456789abcdefABCDEF" #define DECIMAL_DIGITS "0123456789" #define BINARY_DIGITS "01" #define OCTAL_DIGITS "01234567" static gboolean check_number (const char *in, const char *prefix, const char *digits, const char **best) { if (strncmp (in, prefix, strlen (prefix)) == 0) { in += strlen (prefix); if (strchr (digits, *in++)) { while (strchr (digits, *in)) in++; return check_best (in, best); } } return FALSE; } static gboolean check_float_literal (const char *in, const char **best) { while (g_ascii_isdigit (*in)) in++; if (*in++ != '.') return FALSE; while (g_ascii_isdigit (*in)) in++; return check_best (in, best); } static char * unescape (const char *start, const char *end) { const char *s; GString *unescaped = g_string_new (NULL); gboolean escape = FALSE; for (s = start + 1; s != end - 1; ++s) { if (*s == '\\' && !escape) { escape = TRUE; } else { char c = *s; if (escape) { switch (c) { case 't': c = '\t'; break; case 'n': c = '\n'; break; case '\\': c = '\\'; break; case '\n': /* An escaped newline is simply stripped out */ continue; default: break; } escape = FALSE; } g_string_append_c (unescaped, c); } } return g_string_free (unescaped, FALSE); } typedef struct position_t position_t; struct position_t { int line; int pos; }; static void token_skip (const char *start, const char *end, position_t *pos) { while (start < end) { if (*start == '\n') { pos->line++; pos->pos = 0; } else { pos->pos++; } start++; } } static token_t token_init (token_type_t type, const char *begin, const char *end, position_t *pos) { token_t token; token.common.type = type; token.common.string = g_strndup (begin, end - begin); token.common.line = pos->line; token.common.begin_pos = pos->pos; token.common.err = 0; token_skip (begin, end, pos); token.common.end_pos = pos->pos; if (type == TOKEN_IDENTIFIER) { token.identifier.name = token.common.string; } else if (type == TOKEN_INT_LITERAL) { if (strncmp (begin, "0x", 2) == 0) token.int_literal.value = strtol (begin + 2, NULL, 16); else if (strncmp (begin, "0b", 2) == 0) token.int_literal.value = strtol (begin + 2, NULL, 2); else if (strncmp (begin, "0", 1) == 0) token.int_literal.value = strtol (begin + 1, NULL, 8); else token.int_literal.value = strtol (begin, NULL, 10); } else if (type == TOKEN_FLOAT_LITERAL) { token.float_literal.value = strtod (begin, NULL); } else if (type == TOKEN_STRING_LITERAL) { token.string_literal.value = unescape (begin, end); } else if (type == TOKEN_END_OF_FILE) { token.common.string = g_strdup ("[EOF]"); } return token; } token_t * scan (const char *input) { GArray *tokens = g_array_new (TRUE, TRUE, sizeof (token_t)); token_t token; position_t pos = { 0, 0 }; while (*input) { token_type_t best_type = 0; const char *best = input; int i; /* Whitespace */ if (check_whitespace (input, &best)) best_type = TOKEN_WHITESPACE; /* Comments */ if (check_comment (input, &best)) best_type = TOKEN_WHITESPACE; /* Keywords */ for (i = 0; i < G_N_ELEMENTS (keywords); ++i) { if (check_prefix (input, keywords[i].keyword, &best)) best_type = keywords[i].type; } /* Identifier */ if (check_identifier (input, &best)) best_type = TOKEN_IDENTIFIER; /* String literal */ if (check_string_literal (input, &best)) best_type = TOKEN_STRING_LITERAL; /* Integer literal */ if (check_number (input, "0x", HEX_DIGITS, &best)) best_type = TOKEN_INT_LITERAL; if (check_number (input, "0b", BINARY_DIGITS, &best)) best_type = TOKEN_INT_LITERAL; /* Must be checked before decimal */ if (check_number (input, "0", OCTAL_DIGITS, &best)) best_type = TOKEN_INT_LITERAL; if (check_number (input, "", DECIMAL_DIGITS, &best)) best_type = TOKEN_INT_LITERAL; /* Floating point literal */ if (check_float_literal (input, &best)) best_type = TOKEN_FLOAT_LITERAL; /* Make token */ if (best != input) { if (best_type != TOKEN_WHITESPACE) { token = token_init (best_type, input, best, &pos); g_array_append_val (tokens, token); } else { token_skip (input, best, &pos); } input = best; } else { report_error ("%d %d: Invalid input at %s\n", pos.line, pos.pos, input); return NULL; } } /* End of file token */ token = token_init (TOKEN_END_OF_FILE, input, input + 1, &pos); g_array_append_val (tokens, token); return (token_t *)g_array_free (tokens, FALSE); }