summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthias Kramm <kramm@quiss.org>2009-12-12 19:03:19 -0800
committerMatthias Kramm <kramm@quiss.org>2009-12-12 19:03:19 -0800
commit5f639e1d63bbb39eb0e9cb6d4aca28aff1ac21e3 (patch)
treefc46855cb425e37f66f3977d731887681f780eb0
parentf6c0cbfa8e2ce312453954cf21a984927a1dc498 (diff)
fixed utf8 handling
-rw-r--r--lib/as3/Makefile10
-rw-r--r--lib/as3/main.c4
-rw-r--r--lib/as3/tokenizer.lex21
3 files changed, 28 insertions, 7 deletions
diff --git a/lib/as3/Makefile b/lib/as3/Makefile
index be189016..b3ecd23d 100644
--- a/lib/as3/Makefile
+++ b/lib/as3/Makefile
@@ -5,17 +5,23 @@ tests: testwrite testrewrite testpaths testreadwrite
D=-g -pg
+#BISONDEBUG=yes
+
MODULES = abc.o opcodes.o code.o pool.o scripts.o expr.o common.o initcode.o
SOURCES = abc.c abc.h pool.c pool.h files.c files.h code.c code.h registry.c registry.h opcodes.c opcodes.h builtin.c builtin.h compiler.c compiler.h parser.tab.h parser.tab.c tokenizer.yy.c scripts.c import.c import.h expr.c expr.h common.c common.h initcode.c initcode.h
tokenizer.yy.c: tokenizer.lex tokenizer.h
flex -Pas3_ -8 -B -otokenizer.yy.c tokenizer.lex
+ifeq "$(BISONDEBUG)" "yes"
+BISONDEBUGFLAGS=-t
+BISONDEBUGDEFINE=-DBISONDEBUG
+endif
parser.tab.h parser.tab.c: parser.y parser.h skeleton.m4 Makefile
- bison -S ./skeleton.m4 -v --defines -pa3_ parser.y -o parser.tab.c
+ bison $(BISONDEBUGFLAGS) -S ./skeleton.m4 -v --defines -pa3_ parser.y -o parser.tab.c
main.o: main.c parser.tab.h parser.h
- $(C) main.c -o main.o
+ $(C) $(BISONDEBUGDEFINE) main.c -o main.o
mklib.o: mklib.c parser.tab.h parser.h
$(C) mklib.c -o mklib.o
diff --git a/lib/as3/main.c b/lib/as3/main.c
index 2835556f..85b36a62 100644
--- a/lib/as3/main.c
+++ b/lib/as3/main.c
@@ -64,7 +64,9 @@ int main(int argn, char*argv[])
}
filename=argv[argn-1];
- //a3_debug = 1; //if bison was called with -t
+#ifdef BISONDEBUG
+ a3_debug = 1; //if bison was called with -t
+#endif
as3_add_include_dir(getcwd(buf, 512));
diff --git a/lib/as3/tokenizer.lex b/lib/as3/tokenizer.lex
index 4e0495d5..05fd4074 100644
--- a/lib/as3/tokenizer.lex
+++ b/lib/as3/tokenizer.lex
@@ -481,6 +481,7 @@ void tokenizer_unregister_namespace(const char*id)
}*/
static inline char tokenizer_is_namespace(const char*id)
{
+ if(!active_namespaces) return 0;
return trie_contains(active_namespaces, (const unsigned char*)id);
}
@@ -514,8 +515,20 @@ static int tokenerror();
%x XMLTEXT
%x XML
-NAME [a-zA-Z_\x80-\xff][a-zA-Z0-9_\\\x80-\xff]*
-_ [^a-zA-Z0-9_\\\x80-\xff]
+X1 parsing identifiers with a non unicode lexer is a nightmare; we have to skip all possible
+X2 combinations of byte order markers or utf8 space chars and i dont quite like the fact that
+X3 lex doesnt support proper comments in this section either...
+X4 {NAME_HEAD}{NAME_TAIL}
+
+NAME_NOC2EF [a-zA-Z_\x80-\xc1\xc3-\xee\xf0-\xff]
+NAME_EF [\xef][a-zA-Z0-9_\\\x80-\xba\xbc-\xff]
+NAME_C2 [\xc2][a-zA-Z0-9_\\\x80-\x9f\xa1-\xff]
+NAME_EFBB [\xef][\xbb][a-zA-Z0-9_\\\x80-\xbe\xc0-\xff]
+NAME_TAIL [a-zA-Z_0-9\\\x80-\xff]*
+NAME_HEAD (({NAME_NOC2EF})|({NAME_EF})|({NAME_C2})|({NAME_EFBB}))
+NAME {NAME_HEAD}{NAME_TAIL}
+
+_ [^a-zA-Z0-9_\\\x80-\xff]
HEXINT 0x[a-zA-Z0-9]+
HEXFLOAT 0x[a-zA-Z0-9]*\.[a-zA-Z0-9]*
@@ -534,7 +547,7 @@ XMLID [A-Za-z0-9_\x80-\xff]+([:][A-Za-z0-9_\x80-\xff]+)?
XMLSTRING ["][^"]*["]
STRING ["](\\[\x00-\xff]|[^\\"\n])*["]|['](\\[\x00-\xff]|[^\\'\n])*[']
-S ([ \n\r\t\xa0]|\xc2\xa0)
+S ([ \n\r\t\xa0]|[\xc2][\xa0])
MULTILINE_COMMENT [/][*]+([*][^/]|[^/*]|[^*][/]|[\x00-\x1f])*[*]+[/]
SINGLELINE_COMMENT \/\/[^\n\r]*[\n\r]
REGEXP [/]([^/\n]|\\[/])*[/][a-zA-Z]*
@@ -587,7 +600,7 @@ REGEXP [/]([^/\n]|\\[/])*[/][a-zA-Z]*
<REGEXPOK>[\{] {c(); BEGIN(REGEXPOK);return m(T_DICTSTART);}
[\{] {c(); BEGIN(DEFAULT); return m('{');}
-\xef\xbb\xbf {/* utf 8 bom */}
+\xef\xbb\xbf {/* utf 8 bom (0xfeff) */}
{S} {l();}
{HEXINT}/{_} {c(); BEGIN(DEFAULT);return handlehex();}