From: Siraaj Khandkar Date: Sat, 26 May 2018 01:55:20 +0000 (-0400) Subject: Define initial grammar X-Git-Url: https://git.xandkar.net/?a=commitdiff_plain;h=543d34200923ddd3fedc247b7d663ee46733f0f1;p=tiger.ml.git Define initial grammar FIXME: 85 shift/reduce conflicts, 1 reduce/reduce conflict. --- diff --git a/tiger/src/exe/tiger_tests.ml b/tiger/src/exe/tiger_tests.ml index 3b1a0c2..266797e 100644 --- a/tiger/src/exe/tiger_tests.ml +++ b/tiger/src/exe/tiger_tests.ml @@ -1,3 +1,5 @@ +open Printf + module List = ListLabels let test_01 = @@ -15,7 +17,7 @@ let test_01 = " in let tokens = - let open Tiger.Parser.Token in + let open Tiger.Parser in [ LET; TYPE; ID "arrtype"; EQ; ARRAY; OF; ID "int"; VAR; ID "arr1"; COLON; ID "arrtype"; ASSIGN; @@ -43,7 +45,7 @@ let test_02 = " in let tokens = - let open Tiger.Parser.Token in + let open Tiger.Parser in [ LET; TYPE; ID "myint"; EQ; ID "int"; TYPE; ID "arrtype"; EQ; ARRAY; OF; ID "myint"; @@ -78,7 +80,7 @@ let test_03 = " in let tokens = - let open Tiger.Parser.Token in + let open Tiger.Parser in [ LET; TYPE; ID "rectype"; EQ; LBRACE; ID "name"; COLON; ID "string"; @@ -116,7 +118,7 @@ let test_04 = " in let tokens = - let open Tiger.Parser.Token in + let open Tiger.Parser in [ LET; FUNCTION; ID "nfactor"; LPAREN; ID "n"; COLON; ID "int"; RPAREN; COLON; ID "int"; EQ; IF; ID "n"; EQ; INT 0; @@ -132,14 +134,13 @@ let test_04 = let test_09 = let name = "error : types of then - else differ" in let code = - " - /* "^name^" */ - - if (5>4) then 13 else \" \" + " \ + /* "^name^" */ \ + if (5>4) then 13 else \" \" \ " in let tokens = - let open Tiger.Parser.Token in + let open Tiger.Parser in [ IF; LPAREN; INT 5; GT; INT 4; RPAREN; THEN; INT 13; ELSE; STRING " " ] in @@ -149,12 +150,24 @@ let test_09 = let tokens_of_code code = let lexbuf = Lexing.from_string code in let rec tokens () = - match Tiger.Lexer.token lexbuf with - | None -> [] - | Some token -> token :: tokens () + let token = Tiger.Lexer.token lexbuf in + (* Avoiding fragile pattern-matching *) + if token = Tiger.Parser.EOF then [] else token :: tokens () in tokens () +let parsetree_of_code code = + let lb = Lexing.from_string code in + (match Tiger.Parser.program Tiger.Lexer.token lb with + | exception Parsing.Parse_error -> + let module L = Lexing in + let L.({lex_curr_p = {pos_lnum=l; pos_bol=b; pos_cnum=c; _}; _}) = lb in + let msg = sprintf "Syntax error around line: %d, column: %d" l (c - b) in + Error msg + | parsetree -> + Ok parsetree + ) + let tests = [ test_01 ; test_02 @@ -166,22 +179,38 @@ let tests = let () = let bar_sep = String.make 80 '-' in let bar_end = String.make 80 '=' in + let indent n = String.make (2 * n) ' ' in + let color_on_green = "\027[0;32m" in + let color_on_red = "\027[1;31m" in + let color_off = "\027[0m" in List.iteri tests ~f:(fun i (name, code, tokens_expected) -> let i = i + 1 in (* Because iteri starts with 0 *) - let open Printf in - printf "%s\n Test %d : %S\n" bar_sep i name; + printf "%s\n%sTest %d : %S\n" bar_sep (indent 0) i name; + + printf "%sLexing : " (indent 1); let tokens_emitted = tokens_of_code code in (try assert (tokens_emitted = tokens_expected); - print_endline " ---> OK"; + printf "%sOK%s\n" color_on_green color_off; with Assert_failure _ -> let tokens_to_string tokens = - String.concat "; " (List.map ~f:Tiger.Parser.Token.to_string tokens) + String.concat "; " (List.map ~f:Tiger.Parser_token.to_string tokens) in printf - " ---> ERROR\n Expected: %s\n Emitted : %s\n\n" + "%sERROR%s\n%sExpected: %s\n%sEmitted : %s\n\n" + color_on_red + color_off + (indent 2) (tokens_to_string tokens_expected) + (indent 2) (tokens_to_string tokens_emitted) ); + + printf "%sParsing: " (indent 1); + (match parsetree_of_code code with + | Error errmsg -> printf "%sERROR:%s %s\n" color_on_red color_off errmsg + | Ok parsetree -> printf "%sOK:%s %s\n" color_on_green color_off parsetree + ); + ); print_endline bar_end; diff --git a/tiger/src/exe/tigerc.ml b/tiger/src/exe/tigerc.ml index 9c901a7..fd58a83 100644 --- a/tiger/src/exe/tigerc.ml +++ b/tiger/src/exe/tigerc.ml @@ -1,16 +1,16 @@ -open Printf - let () = let path_to_program_file = Sys.argv.(1) in let ic = open_in path_to_program_file in let lexbuf = Lexing.from_channel ic in - let rec parse_and_print () = - match Tiger.Lexer.token lexbuf with - | None -> - () - | Some token -> - printf "%s\n" (Tiger.Parser.Token.to_string token); - parse_and_print () - in - parse_and_print (); + (match Tiger.Parser.program Tiger.Lexer.token lexbuf with + | exception Parsing.Parse_error -> + let + Lexing.({lex_curr_p = {pos_lnum; pos_bol; pos_cnum; _}; _}) = lexbuf + in + Printf.printf + "Syntax error in file %S, around line: %d, column: %d\n" + path_to_program_file pos_lnum (pos_cnum - pos_bol) + | program -> + print_endline program + ); close_in ic; diff --git a/tiger/src/lib/tiger/tiger.ml b/tiger/src/lib/tiger/tiger.ml index a9a2799..577e25d 100644 --- a/tiger/src/lib/tiger/tiger.ml +++ b/tiger/src/lib/tiger/tiger.ml @@ -1,2 +1,3 @@ module Lexer = Tiger_lexer module Parser = Tiger_parser +module Parser_token = Tiger_parser_token diff --git a/tiger/src/lib/tiger/tiger_lexer.mli b/tiger/src/lib/tiger/tiger_lexer.mli index dceeee5..da2cb11 100644 --- a/tiger/src/lib/tiger/tiger_lexer.mli +++ b/tiger/src/lib/tiger/tiger_lexer.mli @@ -1 +1 @@ -val token : Lexing.lexbuf -> Tiger_parser.Token.t option +val token : Lexing.lexbuf -> Tiger_parser.token diff --git a/tiger/src/lib/tiger/tiger_lexer.mll b/tiger/src/lib/tiger/tiger_lexer.mll index 9e08685..359366f 100644 --- a/tiger/src/lib/tiger/tiger_lexer.mll +++ b/tiger/src/lib/tiger/tiger_lexer.mll @@ -1,5 +1,5 @@ { - open Tiger_parser.Token + open Tiger_parser let comment_level = ref 0 let string_buf = Buffer.create 100 @@ -10,9 +10,7 @@ let num = ['0'-'9'] let newline = '\n' | '\r' | "\n\r" rule token = parse - | eof { - None - } + | eof {EOF} (* Track line number *) | newline { @@ -26,29 +24,29 @@ rule token = parse comment lexbuf } - | ":=" {Some ASSIGN} - | "<=" {Some LE} - | ">=" {Some GE} - | "<>" {Some NEQ} - | '&' {Some AND} - | '(' {Some LPAREN} - | ')' {Some RPAREN} - | '*' {Some TIMES} - | '+' {Some PLUS} - | '-' {Some MINUS} - | '/' {Some DIVIDE} - | ',' {Some COMMA} - | '.' {Some DOT} - | ':' {Some COLON} - | ';' {Some SEMICOLON} - | '>' {Some GT} - | '<' {Some LT} - | '=' {Some EQ} - | '[' {Some LBRACK} - | ']' {Some RBRACK} - | '{' {Some LBRACE} - | '}' {Some RBRACE} - | '|' {Some OR} + | ":=" {ASSIGN} + | "<=" {LE} + | ">=" {GE} + | "<>" {NEQ} + | '&' {AND} + | '(' {LPAREN} + | ')' {RPAREN} + | '*' {TIMES} + | '+' {PLUS} + | '-' {MINUS} + | '/' {DIVIDE} + | ',' {COMMA} + | '.' {DOT} + | ':' {COLON} + | ';' {SEMICOLON} + | '>' {GT} + | '<' {LT} + | '=' {EQ} + | '[' {LBRACK} + | ']' {RBRACK} + | '{' {LBRACE} + | '}' {RBRACE} + | '|' {OR} (* String literal *) | '"' { @@ -61,29 +59,29 @@ rule token = parse } | (num+ as int) { - Some (INT (int_of_string int)) + INT (int_of_string int) } | (alpha (alpha | num | '_')* as id) { match id with - | "array" -> Some ARRAY - | "break" -> Some BREAK - | "do" -> Some DO - | "else" -> Some ELSE - | "end" -> Some END - | "for" -> Some FOR - | "function" -> Some FUNCTION - | "if" -> Some IF - | "in" -> Some IN - | "let" -> Some LET - | "nil" -> Some NIL - | "of" -> Some OF - | "then" -> Some THEN - | "to" -> Some TO - | "type" -> Some TYPE - | "var" -> Some VAR - | "while" -> Some WHILE - | _ -> Some (ID id) + | "array" -> ARRAY + | "break" -> BREAK + | "do" -> DO + | "else" -> ELSE + | "end" -> END + | "for" -> FOR + | "function" -> FUNCTION + | "if" -> IF + | "in" -> IN + | "let" -> LET + | "nil" -> NIL + | "of" -> OF + | "then" -> THEN + | "to" -> TO + | "type" -> TYPE + | "var" -> VAR + | "while" -> WHILE + | _ -> (ID id) } and string_literal = parse (* Keep escaped quote marks as part of the string literal *) @@ -95,7 +93,7 @@ and string_literal = parse | '"' { let string = Buffer.contents string_buf in Buffer.reset string_buf; - Some (STRING string) + STRING string } @@ -104,10 +102,8 @@ and string_literal = parse string_literal lexbuf } and comment = parse - | eof { - (* TODO: Error: unterminated comment? or we don't care? *) - None - } + (* TODO: Error: unterminated comment? or we don't care? *) + | eof {EOF} (* Track line number *) | newline { diff --git a/tiger/src/lib/tiger/tiger_parser.ml b/tiger/src/lib/tiger/tiger_parser.ml deleted file mode 100644 index 191f8a3..0000000 --- a/tiger/src/lib/tiger/tiger_parser.ml +++ /dev/null @@ -1,95 +0,0 @@ -open Printf - -module Token = struct - type t = - | AND - | ARRAY - | ASSIGN - | BREAK - | COLON - | COMMA - | DIVIDE - | DO - | DOT - | ELSE - | END - | EOF - | EQ - | FOR - | FUNCTION - | GE - | GT - | ID of string - | IF - | IN - | INT of int - | LBRACE - | LBRACK - | LE - | LET - | LPAREN - | LT - | MINUS - | NEQ - | NIL - | OF - | OR - | PLUS - | RBRACE - | RBRACK - | RPAREN - | SEMICOLON - | STRING of string - | THEN - | TIMES - | TO - | TYPE - | VAR - | WHILE - - let to_string = function - | TYPE -> "TYPE" - | VAR -> "VAR" - | FUNCTION -> "FUNCTION" - | BREAK -> "BREAK" - | OF -> "OF" - | END -> "END" - | IN -> "IN" - | NIL -> "NIL" - | LET -> "LET" - | DO -> "DO" - | TO -> "TO" - | FOR -> "FOR" - | WHILE -> "WHILE" - | ELSE -> "ELSE" - | THEN -> "THEN" - | IF -> "IF" - | ARRAY -> "ARRAY" - | ASSIGN -> "ASSIGN" - | OR -> "OR" - | AND -> "AND" - | GE -> "GE" - | GT -> "GT" - | LE -> "LE" - | LT -> "LT" - | NEQ -> "NEQ" - | EQ -> "EQ" - | DIVIDE -> "DIVIDE" - | TIMES -> "TIMES" - | MINUS -> "MINUS" - | PLUS -> "PLUS" - | DOT -> "DOT" - | RBRACE -> "RBRACE" - | LBRACE -> "LBRACE" - | RBRACK -> "RBRACK" - | LBRACK -> "LBRACK" - | RPAREN -> "RPAREN" - | LPAREN -> "LPAREN" - | SEMICOLON -> "SEMICOLON" - | COLON -> "COLON" - | COMMA -> "COMMA" - | STRING s -> sprintf "STRING (%S)" s - | INT i -> sprintf "INT (%d)" i - | ID id -> sprintf "ID (%s)" id - | EOF -> "EOF" -end diff --git a/tiger/src/lib/tiger/tiger_parser.mli b/tiger/src/lib/tiger/tiger_parser.mli deleted file mode 100644 index 3eb635e..0000000 --- a/tiger/src/lib/tiger/tiger_parser.mli +++ /dev/null @@ -1,49 +0,0 @@ -module Token : sig - type t = - | AND - | ARRAY - | ASSIGN - | BREAK - | COLON - | COMMA - | DIVIDE - | DO - | DOT - | ELSE - | END - | EOF - | EQ - | FOR - | FUNCTION - | GE - | GT - | ID of string - | IF - | IN - | INT of int - | LBRACE - | LBRACK - | LE - | LET - | LPAREN - | LT - | MINUS - | NEQ - | NIL - | OF - | OR - | PLUS - | RBRACE - | RBRACK - | RPAREN - | SEMICOLON - | STRING of string - | THEN - | TIMES - | TO - | TYPE - | VAR - | WHILE - - val to_string : t -> string -end diff --git a/tiger/src/lib/tiger/tiger_parser.mly b/tiger/src/lib/tiger/tiger_parser.mly new file mode 100644 index 0000000..157a8a6 --- /dev/null +++ b/tiger/src/lib/tiger/tiger_parser.mly @@ -0,0 +1,334 @@ +%{ + open Printf +%} + +/* Declarations */ +%token AND +%token ARRAY +%token ASSIGN +%token BREAK +%token COLON +%token COMMA +%token DIVIDE +%token DO +%token DOT +%token ELSE +%token END +%token EOF +%token EQ +%token FOR +%token FUNCTION +%token GE +%token GT +%token ID +%token IF +%token IN +%token INT +%token LBRACE +%token LBRACK +%token LE +%token LET +%token LPAREN +%token LT +%token MINUS +%token NEQ +%token NIL +%token OF +%token OR +%token PLUS +%token RBRACE +%token RBRACK +%token RPAREN +%token SEMICOLON +%token STRING +%token THEN +%token TIMES +%token TO +%token TYPE +%token VAR +%token WHILE + +/* from lowest precedence */ +%left OR +%left AND +%nonassoc EQ NEQ GT LT GE LE +%left PLUS MINUS +%left TIMES DIVIDE +%nonassoc UMINUS +/* to highest precedence */ + +%type program + +%start program + +%% + +program: + | exp EOF + { + sprintf "program[%s]" $1 + } + +exp: + | NIL + { + "nil[]" + } + | INT + { + sprintf "int[%d]" $1 + } + | MINUS exp %prec UMINUS + { + sprintf "negation[%s]" $2 + } + | type_id LBRACK exp RBRACK OF exp + { + sprintf "array[type[%s], size[%s], val[%s]]" $1 $3 $6 + } + | type_id LBRACE rec_field_assignments RBRACE + { + sprintf "record[type[%s], rec_field_assignments[%s]]" $1 $3 + } + | lvalue + { + $1 + } + | lvalue ASSIGN exp + { + sprintf "assign[%s := %s]" $1 $3 + } + | STRING + { + sprintf "string[%S]" $1 + } + | fun_call + { + $1 + } + | exp op exp + { + sprintf "op[%s %s %s]" $1 $2 $3 + } + | IF exp THEN exp ELSE exp + { + let e1 = $2 in + let e2 = $4 in + let e3 = $6 in + sprintf "if_then_else[%s, then[%s], else[%s]]" e1 e2 e3 + } + | IF exp THEN exp + { + sprintf "if_then[%s, then[%s]]" $2 $4 + } + | WHILE exp DO exp + { + sprintf "while[%s, do[%s]]" $2 $4 + } + | FOR id ASSIGN exp TO exp DO exp + { + let id = $2 in + let e1 = $4 in + let e2 = $6 in + let e3 = $8 in + sprintf "for[%s := %s, to[%s], do[%s]]" id e1 e2 e3 + } + | BREAK + { + "break[]" + } + | LPAREN seq RPAREN + { + sprintf "seq[%s]" $2 + } + | LET decs IN seq END + { + let decs = $2 in + let seq = $4 in + sprintf "let[decs[%s], in[seq[%s]]]" decs seq + } + | unit + { + $1 + } + +seq: + | exp + { + sprintf "%s" $1 + } + | exp SEMICOLON seq + { + sprintf "%s; %s" $1 $3 + } + +decs: + | dec + { + sprintf "%s" $1 + } + | dec decs + { + sprintf "%s %s" $1 $2 + } + +dec: + | tydec {$1} + | vardec {$1} + | fundec {$1} + +fundec: + | FUNCTION id LPAREN tyfields RPAREN EQ exp + { + let id = $2 in + let tyfields = $4 in + let exp = $7 in + sprintf "fundec[%s, tyfields[%s], exp[%s]]" id tyfields exp + } + | FUNCTION id LPAREN tyfields RPAREN COLON type_id EQ exp + { + let id = $2 in + let tyfields = $4 in + let type_id = $7 in + let exp = $9 in + sprintf + "fundec[%s, tyfields[%s], type_id[%s], exp[%s]]" + id tyfields type_id exp + } + +vardec: + | VAR id ASSIGN exp + { + let id = $2 in + let exp = $4 in + sprintf "vardec[%s, exp[%s]]" id exp + } + | VAR id COLON type_id ASSIGN exp + { + let id = $2 in + let tyid = $4 in + let exp = $6 in + sprintf "vardec[%s, type_id[%s], exp[%s]]" id tyid exp + } + +tydec: + | TYPE type_id EQ ty + { + sprintf "tydec[%s, %s]" $2 $4 + } + +ty: + | type_id + {$1} + | LBRACE RBRACE + { + "record[]" + } + | LBRACE tyfields RBRACE + { + let tyfields = $2 in + sprintf "record[%s]" tyfields + } + | ARRAY OF type_id + { + let type_id = $3 in + sprintf "array_of_type[%s]" type_id + } + +tyfields: +/*| epsilon */ + | tyfield + {$1} + | tyfield COMMA tyfields + { + let tyfield = $1 in + let tyfields = $3 in + sprintf "%s, %s" tyfield tyfields + } + +tyfield: + | id COLON type_id + { + let id = $1 in + let type_id = $3 in + sprintf "tyfield[%s, %s]" id type_id + } + +id: + | ID + { + sprintf "id[%S]" $1 + } + +/* Perhaps "void"? */ +unit: + | LPAREN RPAREN + { + "unit[]" + } + +type_id: + | id + { + sprintf "type_id[%S]" $1 + } + +rec_field_assignments: + | id EQ exp + { + sprintf "%S = %s" $1 $3 + } + | id EQ exp COMMA rec_field_assignments + { + sprintf "%S = %s, %s" $1 $3 $5 + } + +fun_call: + | id unit + { + sprintf "fun_call[%s, %s]" $1 $2 + } + | id LPAREN fun_args RPAREN + { + sprintf "fun_call[%s, %s]" $1 $3 + } + +fun_args: + | exp + { + $1 + } + | exp COMMA fun_args + { + sprintf "%s, %s" $1 $3 + } + +op: + | PLUS {"+"} + | MINUS {"-"} + | TIMES {"*"} + | DIVIDE {"/"} + | EQ {"="} + | NEQ {"<>"} + | GT {">"} + | LT {"<"} + | GE {">="} + | LE {"<="} + | AND {"&"} + | OR {"|"} + +lvalue: + | id + { + sprintf "lvalue[%s]" $1 + } + | lvalue DOT id + { + sprintf "get_record_field[%s, %s]" $1 $3 + } + | lvalue LBRACK exp RBRACK + { + sprintf "get_array_subscript[%s, %s]" $1 $3 + } + +%% diff --git a/tiger/src/lib/tiger/tiger_parser_token.ml b/tiger/src/lib/tiger/tiger_parser_token.ml new file mode 100644 index 0000000..b78440c --- /dev/null +++ b/tiger/src/lib/tiger/tiger_parser_token.ml @@ -0,0 +1,51 @@ +open Printf + +module T = Tiger_parser + +type t = T.token + +let to_string = function + | T.TYPE -> "TYPE" + | T.VAR -> "VAR" + | T.FUNCTION -> "FUNCTION" + | T.BREAK -> "BREAK" + | T.OF -> "OF" + | T.END -> "END" + | T.IN -> "IN" + | T.NIL -> "NIL" + | T.LET -> "LET" + | T.DO -> "DO" + | T.TO -> "TO" + | T.FOR -> "FOR" + | T.WHILE -> "WHILE" + | T.ELSE -> "ELSE" + | T.THEN -> "THEN" + | T.IF -> "IF" + | T.ARRAY -> "ARRAY" + | T.ASSIGN -> "ASSIGN" + | T.OR -> "OR" + | T.AND -> "AND" + | T.GE -> "GE" + | T.GT -> "GT" + | T.LE -> "LE" + | T.LT -> "LT" + | T.NEQ -> "NEQ" + | T.EQ -> "EQ" + | T.DIVIDE -> "DIVIDE" + | T.TIMES -> "TIMES" + | T.MINUS -> "MINUS" + | T.PLUS -> "PLUS" + | T.DOT -> "DOT" + | T.RBRACE -> "RBRACE" + | T.LBRACE -> "LBRACE" + | T.RBRACK -> "RBRACK" + | T.LBRACK -> "LBRACK" + | T.RPAREN -> "RPAREN" + | T.LPAREN -> "LPAREN" + | T.SEMICOLON -> "SEMICOLON" + | T.COLON -> "COLON" + | T.COMMA -> "COMMA" + | T.STRING s -> sprintf "STRING (%S)" s + | T.INT i -> sprintf "INT (%d)" i + | T.ID id -> sprintf "ID (%s)" id + | T.EOF -> "EOF" diff --git a/tiger/src/lib/tiger/tiger_parser_token.mli b/tiger/src/lib/tiger/tiger_parser_token.mli new file mode 100644 index 0000000..c888529 --- /dev/null +++ b/tiger/src/lib/tiger/tiger_parser_token.mli @@ -0,0 +1,3 @@ +type t = Tiger_parser.token + +val to_string : t -> string