From: Siraaj Khandkar Date: Wed, 23 May 2018 03:46:42 +0000 (-0400) Subject: Complete 1.02.p.1: Tiger lexer X-Git-Url: https://git.xandkar.net/?a=commitdiff_plain;h=78c9eca51ebc5150d79f84e255a57bb9df9f82fc;p=tiger.ml.git Complete 1.02.p.1: Tiger lexer --- diff --git a/README.md b/README.md index abe952e..cda2fd7 100644 --- a/README.md +++ b/README.md @@ -22,14 +22,14 @@ Project Plan | [x] | 1.01.e.1.c | ---- demo unbalanced behaviour | --- | -- | -- | 2018-04-18 | 2018-04-18 | | [x] | 1.01.e.1.d | ---- find functional balanced tree | --- | -- | -- | 2018-04-19 | 2018-04-20 | | ------ | ---------- | ---------------------------------------- | ----- | -------- | ------ | ---------- | ---------- | -| [ ] | 1.02 | -- Lexical Analysis | 024 | 01 | -- | ---------- | ---------- | +| [ ] | 1.02 | -- Lexical Analysis | 024 | 01 | 01 | 2018-05-22 | 2018-05-22 | | [ ] | 1.02.1 | --- Lexical tokens | 001 | -- | -- | ---------- | ---------- | | [ ] | 1.02.2 | --- Regular expressions | 003 | -- | -- | ---------- | ---------- | | [ ] | 1.02.3 | --- Finite automata | 003 | -- | -- | ---------- | ---------- | | [ ] | 1.02.4 | --- Nondeterministic finite automata | 006 | -- | -- | ---------- | ---------- | | [ ] | 1.02.5 | --- ML-Lex: a lexical analyzer generator | 003 | -- | -- | ---------- | ---------- | | [ ] | 1.02.p | --- Program | 002 | -- | -- | ---------- | ---------- | -| [ ] | 1.02.p.1 | ---- Tiger lexer | --- | -- | -- | ---------- | ---------- | +| [ ] | 1.02.p.1 | ---- Tiger lexer | --- | 01 | 01 | 2018-05-22 | 2018-05-22 | | [ ] | 1.02.e | --- Exercises | 004 | -- | -- | ---------- | ---------- | | [ ] | 1.02.e.01 | ---- regexes | --- | -- | -- | ---------- | ---------- | | [ ] | 1.02.e.02 | ---- why no regexes | --- | -- | -- | ---------- | ---------- | diff --git a/tiger/Makefile b/tiger/Makefile index 9917366..20eedaa 100644 --- a/tiger/Makefile +++ b/tiger/Makefile @@ -1,9 +1,9 @@ MAKEFLAGS := --no-builtin-rules -EXE_TYPE := byte # byte | native -EXECUTABLES := tiger +EXE_TYPE := byte # byte | native +EXECUTABLES := tigerc tiger_tests OCAMLBUILD_FLAGS := -I src/exe -I src/lib/tiger -OCAMLBUILD := ocamlbuild $(OCAMLBUILD_FLAGS) +OCAMLBUILD := ocamlbuild $(OCAMLBUILD_FLAGS) .PHONY: \ all \ @@ -12,19 +12,19 @@ OCAMLBUILD := ocamlbuild $(OCAMLBUILD_FLAGS) test all: - $(MAKE) clean - $(MAKE) build - $(MAKE) test + @$(MAKE) -s clean + @$(MAKE) -s build + @$(MAKE) -s test build: - $(OCAMLBUILD) $(addsuffix _main.$(EXE_TYPE),$(EXECUTABLES)) - mkdir -p bin/exe - $(foreach exe,$(EXECUTABLES),cp _build/src/exe/$(exe)_main.$(EXE_TYPE) bin/exe/$(exe)) - rm $(addsuffix _main.$(EXE_TYPE),$(EXECUTABLES)) + @$(OCAMLBUILD) $(addsuffix .$(EXE_TYPE),$(EXECUTABLES)) + @mkdir -p bin/exe + $(foreach exe,$(EXECUTABLES),cp _build/src/exe/$(exe).$(EXE_TYPE) bin/exe/$(exe); ) + @rm $(addsuffix .$(EXE_TYPE),$(EXECUTABLES)) clean: - $(OCAMLBUILD) -clean - rm -rf ./bin + @$(OCAMLBUILD) -clean + @rm -rf ./bin -test: bin/exe/tiger - ./$< +test: bin/exe/tiger_tests + @./$< diff --git a/tiger/src/exe/tiger_main.ml b/tiger/src/exe/tiger_main.ml deleted file mode 100644 index bd1a443..0000000 --- a/tiger/src/exe/tiger_main.ml +++ /dev/null @@ -1,3 +0,0 @@ -let () = - let bar = String.make 80 '-' in - Printf.printf "%s\nTiger says: %S\n%s\n" bar Tiger.Growl.text bar; diff --git a/tiger/src/exe/tiger_tests.ml b/tiger/src/exe/tiger_tests.ml new file mode 100644 index 0000000..314eece --- /dev/null +++ b/tiger/src/exe/tiger_tests.ml @@ -0,0 +1,46 @@ +open Printf + +module List = ListLabels + +let test_01 = + let code = + " + /* an array type and an array variable */ + let + type arrtype = array of int + var arr1:arrtype := arrtype [10] of 0 + in + arr1 + end + " + in + let tokens = + let open Tiger.Parser.Token in + [ LET; + TYPE; ID "arrtype"; EQ; ARRAY; OF; ID "int"; + VAR; ID "arr1"; COLON; ID "arrtype"; ASSIGN; + ID "arrtype"; LBRACK; INT 10; RBRACK; OF; INT 0; + IN; + ID "arr1"; + END + ] + in + (code, tokens) + +let tokens_of_code code = + let lexbuf = Lexing.from_string code in + let rec tokens () = + match Tiger.Lexer.token lexbuf with + | Tiger.Parser.Token.EOF -> [] + | token -> token :: tokens () + in + tokens () + +let tests = + [ test_01 + ] + +let () = + List.iter tests ~f:(fun (code, tokens_expected) -> + assert ((tokens_of_code code) = tokens_expected) + ) diff --git a/tiger/src/exe/tiger_main.mli b/tiger/src/exe/tiger_tests.mli similarity index 100% copy from tiger/src/exe/tiger_main.mli copy to tiger/src/exe/tiger_tests.mli diff --git a/tiger/src/exe/tigerc.ml b/tiger/src/exe/tigerc.ml new file mode 100644 index 0000000..77eafb4 --- /dev/null +++ b/tiger/src/exe/tigerc.ml @@ -0,0 +1,15 @@ +open Printf + +let () = + let path_to_program_file = Sys.argv.(1) in + let ic = open_in path_to_program_file in + let lexbuf = Lexing.from_channel ic in + let rec parse_and_print () = + let token = Tiger.Lexer.token lexbuf in + printf "%s\n" (Tiger.Parser.Token.to_string token); + match token with + | Tiger.Parser.Token.EOF -> () + | _ -> parse_and_print () + in + parse_and_print (); + close_in ic; diff --git a/tiger/src/exe/tiger_main.mli b/tiger/src/exe/tigerc.mli similarity index 100% rename from tiger/src/exe/tiger_main.mli rename to tiger/src/exe/tigerc.mli diff --git a/tiger/src/lib/tiger/tiger.ml b/tiger/src/lib/tiger/tiger.ml index 9844f26..a9a2799 100644 --- a/tiger/src/lib/tiger/tiger.ml +++ b/tiger/src/lib/tiger/tiger.ml @@ -1 +1,2 @@ -module Growl = struct let text = "Grrrrrrrrr!" end +module Lexer = Tiger_lexer +module Parser = Tiger_parser diff --git a/tiger/src/lib/tiger/tiger_lexer.mli b/tiger/src/lib/tiger/tiger_lexer.mli new file mode 100644 index 0000000..fd30dce --- /dev/null +++ b/tiger/src/lib/tiger/tiger_lexer.mli @@ -0,0 +1 @@ +val token : Lexing.lexbuf -> Tiger_parser.Token.t diff --git a/tiger/src/lib/tiger/tiger_lexer.mll b/tiger/src/lib/tiger/tiger_lexer.mll new file mode 100644 index 0000000..86bf0f8 --- /dev/null +++ b/tiger/src/lib/tiger/tiger_lexer.mll @@ -0,0 +1,139 @@ +{ + open Tiger_parser.Token + + let comment_level = ref 0 + let string_buf = Buffer.create 100 +} + +let alpha = ['a'-'z' 'A'-'Z'] +let num = ['0'-'9'] +let newline = '\n' | '\r' | "\n\r" + +rule token = parse + | eof { + EOF + } + + (* Track line number *) + | newline { + Lexing.new_line lexbuf; + token lexbuf + } + + (* Comment *) + | "/*" { + incr comment_level; + comment lexbuf + } + + | ":=" {ASSIGN} + | "<=" {LE} + | ">=" {GE} + | "<>" {NEQ} + | '&' {AND} + | '(' {LPAREN} + | ')' {RPAREN} + | '*' {TIMES} + | '+' {PLUS} + | '-' {MINUS} + | '/' {DIVIDE} + | ',' {COMMA} + | '.' {DOT} + | ':' {COLON} + | ';' {SEMICOLON} + | '>' {GT} + | '<' {LT} + | '=' {EQ} + | '[' {LBRACK} + | ']' {RBRACK} + | '{' {LBRACE} + | '}' {RBRACE} + | '|' {OR} + + (* String literal *) + | '"' { + string_literal lexbuf + } + + (* Drop whitespace *) + | [' ' '\t'] { + token lexbuf + } + + | (num+ as int) { + INT (int_of_string int) + } + + | (alpha (alpha | num | '_')* as id) { + match id with + | "array" -> ARRAY + | "break" -> BREAK + | "do" -> DO + | "else" -> ELSE + | "end" -> END + | "for" -> FOR + | "function" -> FUNCTION + | "if" -> IF + | "in" -> IN + | "let" -> LET + | "nil" -> NIL + | "of" -> OF + | "then" -> THEN + | "to" -> TO + | "type" -> TYPE + | "var" -> VAR + | "while" -> WHILE + | _ -> ID id + } + + (* Eat unimplemented. FIXME: stop indiscriminate eating *) + | _ { + token lexbuf + } +and string_literal = parse + (* Keep escaped quote marks as part of the string literal *) + | '\\' '"' { + Buffer.add_char string_buf '"'; + string_literal lexbuf + } + + | '"' { + let string = Buffer.contents string_buf in + Buffer.reset string_buf; + STRING string + } + + + | (_ as c) { + Buffer.add_char string_buf c; + string_literal lexbuf + } +and comment = parse + | eof { + (* TODO: Error: unterminated comment? or we don't care? *) + EOF + } + + (* Track line number *) + | newline { + Lexing.new_line lexbuf; + comment lexbuf + } + + | "/*" { + incr comment_level; + comment lexbuf + } + + | "*/" { + decr comment_level; + match !comment_level with + | 0 -> token lexbuf + | n when n > 0 -> comment lexbuf + | _ -> assert false + } + + (* Drop comment contents *) + | _ { + comment lexbuf + } diff --git a/tiger/src/lib/tiger/tiger_parser.ml b/tiger/src/lib/tiger/tiger_parser.ml new file mode 100644 index 0000000..191f8a3 --- /dev/null +++ b/tiger/src/lib/tiger/tiger_parser.ml @@ -0,0 +1,95 @@ +open Printf + +module Token = struct + type t = + | AND + | ARRAY + | ASSIGN + | BREAK + | COLON + | COMMA + | DIVIDE + | DO + | DOT + | ELSE + | END + | EOF + | EQ + | FOR + | FUNCTION + | GE + | GT + | ID of string + | IF + | IN + | INT of int + | LBRACE + | LBRACK + | LE + | LET + | LPAREN + | LT + | MINUS + | NEQ + | NIL + | OF + | OR + | PLUS + | RBRACE + | RBRACK + | RPAREN + | SEMICOLON + | STRING of string + | THEN + | TIMES + | TO + | TYPE + | VAR + | WHILE + + let to_string = function + | TYPE -> "TYPE" + | VAR -> "VAR" + | FUNCTION -> "FUNCTION" + | BREAK -> "BREAK" + | OF -> "OF" + | END -> "END" + | IN -> "IN" + | NIL -> "NIL" + | LET -> "LET" + | DO -> "DO" + | TO -> "TO" + | FOR -> "FOR" + | WHILE -> "WHILE" + | ELSE -> "ELSE" + | THEN -> "THEN" + | IF -> "IF" + | ARRAY -> "ARRAY" + | ASSIGN -> "ASSIGN" + | OR -> "OR" + | AND -> "AND" + | GE -> "GE" + | GT -> "GT" + | LE -> "LE" + | LT -> "LT" + | NEQ -> "NEQ" + | EQ -> "EQ" + | DIVIDE -> "DIVIDE" + | TIMES -> "TIMES" + | MINUS -> "MINUS" + | PLUS -> "PLUS" + | DOT -> "DOT" + | RBRACE -> "RBRACE" + | LBRACE -> "LBRACE" + | RBRACK -> "RBRACK" + | LBRACK -> "LBRACK" + | RPAREN -> "RPAREN" + | LPAREN -> "LPAREN" + | SEMICOLON -> "SEMICOLON" + | COLON -> "COLON" + | COMMA -> "COMMA" + | STRING s -> sprintf "STRING (%S)" s + | INT i -> sprintf "INT (%d)" i + | ID id -> sprintf "ID (%s)" id + | EOF -> "EOF" +end diff --git a/tiger/src/lib/tiger/tiger_parser.mli b/tiger/src/lib/tiger/tiger_parser.mli new file mode 100644 index 0000000..3eb635e --- /dev/null +++ b/tiger/src/lib/tiger/tiger_parser.mli @@ -0,0 +1,49 @@ +module Token : sig + type t = + | AND + | ARRAY + | ASSIGN + | BREAK + | COLON + | COMMA + | DIVIDE + | DO + | DOT + | ELSE + | END + | EOF + | EQ + | FOR + | FUNCTION + | GE + | GT + | ID of string + | IF + | IN + | INT of int + | LBRACE + | LBRACK + | LE + | LET + | LPAREN + | LT + | MINUS + | NEQ + | NIL + | OF + | OR + | PLUS + | RBRACE + | RBRACK + | RPAREN + | SEMICOLON + | STRING of string + | THEN + | TIMES + | TO + | TYPE + | VAR + | WHILE + + val to_string : t -> string +end