diff --git a/core/c/frontend/preprocessor/const_expr.odin b/core/c/frontend/preprocessor/const_expr.odin new file mode 100644 index 000000000..5f52fff9c --- /dev/null +++ b/core/c/frontend/preprocessor/const_expr.odin @@ -0,0 +1,25 @@ +package c_frontend_preprocess + +import "core:c/frontend/tokenizer" + +const_expr :: proc(rest: ^^Token, tok: ^Token) -> i64 { + // TODO(bill): Handle const_expr correctly + // This is effectively a mini-parser + + assert(rest != nil); + assert(tok != nil); + rest^ = tokenizer.new_eof(tok); + switch v in tok.val { + case i64: + return v; + case f64: + return i64(v); + case string: + return 0; + case []u16: + // TODO + case []u32: + // TODO + } + return 0; +} diff --git a/core/c/frontend/preprocessor/preprocess.odin b/core/c/frontend/preprocessor/preprocess.odin new file mode 100644 index 000000000..3af2c2339 --- /dev/null +++ b/core/c/frontend/preprocessor/preprocess.odin @@ -0,0 +1,1498 @@ +package c_frontend_preprocess + +import "../tokenizer" + +import "core:strings" +import "core:strconv" +import "core:path/filepath" +import "core:unicode/utf8" +import "core:unicode/utf16" +import "core:os" + +@(private) +Tokenizer :: tokenizer.Tokenizer; +@(private) +Token :: tokenizer.Token; + +Error_Handler :: tokenizer.Error_Handler; + +Macro_Param :: struct { + next: ^Macro_Param, + name: string, +} + +Macro_Arg :: struct { + next: ^Macro_Arg, + name: string, + tok: ^Token, + is_va_args: bool, +} + +Macro_Kind :: enum u8 { + Function_Like, + Value_Like, +} + +Macro_Handler :: #type proc(^Preprocessor, ^Token) -> ^Token; + +Macro :: struct { + name: string, + kind: Macro_Kind, + params: ^Macro_Param, + va_args_name: string, + body: ^Token, + handler: Macro_Handler, +} + +Cond_Incl_State :: enum u8 { + In_Then, + In_Elif, + In_Else, +} + +Cond_Incl :: struct { + next: ^Cond_Incl, + tok: ^Token, + state: Cond_Incl_State, + included: bool, +} + +Pragma_Handler :: #type proc(^Preprocessor, ^Token); + +Preprocessor :: struct { + // Lookup tables + 
macros: map[string]^Macro, + pragma_once: map[string]bool, + include_guards: map[string]string, + filepath_cache: map[string]string, + + // Include path data + include_paths: []string, + + // Counter for __COUNTER__ macro + counter: i64, + + // Include information + cond_incl: ^Cond_Incl, + include_level: int, + include_next_index: int, + + wide_char_size: int, + + // Mutable data + err: Error_Handler, + warn: Error_Handler, + pragma_handler: Pragma_Handler, + error_count: int, + warning_count: int, +} + +MAX_INCLUDE_LEVEL :: 1024; + +error :: proc(cpp: ^Preprocessor, tok: ^Token, msg: string, args: ..any) { + if cpp.err != nil { + cpp.err(tok.pos, msg, ..args); + } + cpp.error_count += 1; +} + +warn :: proc(cpp: ^Preprocessor, tok: ^Token, msg: string, args: ..any) { + if cpp.warn != nil { + cpp.warn(tok.pos, msg, ..args); + } + cpp.warning_count += 1; +} + +is_hash :: proc(tok: ^Token) -> bool { + return tok.at_bol && tok.lit == "#"; +} + +skip_line :: proc(cpp: ^Preprocessor, tok: ^Token) -> ^Token { + tok := tok; + if tok.at_bol { + return tok; + } + warn(cpp, tok, "extra token"); + for tok.at_bol { + tok = tok.next; + } + return tok; +} + + +append_token :: proc(a, b: ^Token) -> ^Token { + if a.kind == .EOF { + return b; + } + + head: Token; + curr := &head; + + for tok := a; tok.kind != .EOF; tok = tok.next { + curr.next = tokenizer.copy_token(tok); + curr = curr.next; + } + curr.next = b; + return head.next; +} + + +is_hex_digit :: proc(x: byte) -> bool { + switch x { + case '0'..'9', 'a'..'f', 'A'..'F': + return true; + } + return false; +} +from_hex :: proc(x: byte) -> i32 { + switch x { + case '0'..'9': + return i32(x) - '0'; + case 'a'..'f': + return i32(x) - 'a' + 10; + case 'A'..'F': + return i32(x) - 'A' + 10; + } + return 16; +} + + +convert_pp_number :: proc(tok: ^Token) { + convert_pp_int :: proc(tok: ^Token) -> bool { + p := tok.lit; + base := 10; + if len(p) > 2 { + if strings.equal_fold(p[:2], "0x") && is_hex_digit(p[2]) { + p = p[2:]; + base = 
16; + } else if strings.equal_fold(p[:2], "0b") && p[2] == '0' || p[2] == '1' { + p = p[2:]; + base = 2; + } + } + if base == 10 && p[0] == '0' { + base = 8; + } + + + tok.val, _ = strconv.parse_i64_of_base(p, base); + + l, u: int; + + suf: [3]byte; + suf_n := 0; + i := len(p)-1; + for /**/; i >= 0 && suf_n < len(suf); i -= 1 { + switch p[i] { + case 'l', 'L': + suf[suf_n] = 'l'; + l += 1; + suf_n += 1; + case 'u', 'U': + suf[suf_n] = 'u'; + u += 1; + suf_n += 1; + } + } + if i < len(p) { + if !is_hex_digit(p[i]) && p[i] != '.' { + return false; + } + } + if u > 1 { + return false; + } + + if l > 2 { + return false; + } + + if u == 1 { + switch l { + case 0: tok.type_hint = .Unsigned_Int; + case 1: tok.type_hint = .Unsigned_Long; + case 2: tok.type_hint = .Unsigned_Long_Long; + } + } else { + switch l { + case 0: tok.type_hint = .Int; + case 1: tok.type_hint = .Long; + case 2: tok.type_hint = .Long_Long; + } + } + return true; + } + + if convert_pp_int(tok) { + return; + } + + fval, _ := strconv.parse_f64(tok.lit); + tok.val = fval; + + end := tok.lit[len(tok.lit)-1]; + switch end { + case 'f', 'F': + tok.type_hint = .Float; + case 'l', 'L': + tok.type_hint = .Long_Double; + case: + tok.type_hint = .Double; + } + +} + +convert_pp_char :: proc(tok: ^Token) { + assert(len(tok.lit) >= 2); + r, _, _, _ := unquote_char(tok.lit, tok.lit[0]); + tok.val = i64(r); + + tok.type_hint = .Int; + switch tok.prefix { + case "u": tok.type_hint = .UTF_16; + case "U": tok.type_hint = .UTF_32; + case "L": tok.type_hint = .UTF_Wide; + } +} + +wide_char_size :: proc(cpp: ^Preprocessor) -> int { + char_size := 4; + if cpp.wide_char_size > 0 { + char_size = clamp(cpp.wide_char_size, 1, 4); + assert(char_size & (char_size-1) == 0); + } + return char_size; +} + +convert_pp_string :: proc(cpp: ^Preprocessor, tok: ^Token) { + assert(len(tok.lit) >= 2); + str, _, _ := unquote_string(tok.lit); + tok.val = str; + + char_size := 1; + + switch tok.prefix { + case "u8": + tok.type_hint = .UTF_8; + 
char_size = 1; + case "u": + tok.type_hint = .UTF_16; + char_size = 2; + case "U": + tok.type_hint = .UTF_32; + char_size = 4; + case "L": + tok.type_hint = .UTF_Wide; + char_size = wide_char_size(cpp); + } + + switch char_size { + case 2: + n: int; + buf := make([]u16, len(str)); + for c in str { + ch := c; + if ch < 0x10000 { + buf[n] = u16(ch); + n += 1; + } else { + ch -= 0x10000; + buf[n+0] = 0xd800 + u16((ch >> 10) & 0x3ff); + buf[n+1] = 0xdc00 + u16(ch & 0x3ff); + n += 2; + } + } + tok.val = buf[:n]; + case 4: + n: int; + buf := make([]u32, len(str)); + for ch in str { + buf[n] = u32(ch); + n += 1; + } + tok.val = buf[:n]; + } + +} + +convert_pp_token :: proc(cpp: ^Preprocessor, t: ^Token, is_keyword: tokenizer.Is_Keyword_Proc) { + switch { + case t.kind == .Char: + convert_pp_char(t); + case t.kind == .String: + convert_pp_string(cpp, t); + case is_keyword != nil && is_keyword(t): + t.kind = .Keyword; + case t.kind == .PP_Number: + convert_pp_number(t); + } +} +convert_pp_tokens :: proc(cpp: ^Preprocessor, tok: ^Token, is_keyword: tokenizer.Is_Keyword_Proc) { + for t := tok; t != nil && t.kind != .EOF; t = t.next { + convert_pp_token(cpp, tok, is_keyword); + } +} + +join_adjacent_string_literals :: proc(cpp: ^Preprocessor, initial_tok: ^Token) { + for tok1 := initial_tok; tok1.kind != .EOF; /**/ { + if tok1.kind != .String || tok1.next.kind != .String { + tok1 = tok1.next; + continue; + } + + type_hint := tokenizer.Token_Type_Hint.None; + char_size := 1; + + start := tok1; + for t := tok1; t != nil && t.kind == .String; t = t.next { + if t.val == nil { + convert_pp_string(cpp, t); + } + tok1 = t.next; + if type_hint != t.type_hint { + if t.type_hint != .None && type_hint != .None { + error(cpp, t, "unsupported non-standard concatenation of string literals of different types"); + } + prev_char_size := char_size; + + #partial switch type_hint { + case .UTF_8: char_size = max(char_size, 1); + case .UTF_16: char_size = max(char_size, 2); + case .UTF_32: 
char_size = max(char_size, 4); + case .UTF_Wide: char_size = max(char_size, wide_char_size(cpp)); + } + + if type_hint == .None || prev_char_size < char_size { + type_hint = t.type_hint; + } + } + } + + // NOTE(bill): Verbose logic in order to correctly concantenate strings, even if they different in type + max_len := 0; + switch char_size { + case 1: + for t := start; t != nil && t.kind == .String; t = t.next { + #partial switch v in t.val { + case string: max_len += len(v); + case []u16: max_len += 2*len(v); + case []u32: max_len += 4*len(v); + } + } + n := 0; + buf := make([]byte, max_len); + for t := start; t != nil && t.kind == .String; t = t.next { + #partial switch v in t.val { + case string: + n += copy(buf[n:], v); + case []u16: + for i := 0; i < len(v); /**/ { + c1 := v[i]; + r: rune; + if !utf16.is_surrogate(rune(c1)) { + r = rune(c1); + i += 1; + } else if i+1 == len(v) { + r = utf16.REPLACEMENT_CHAR; + i += 1; + } else { + c2 := v[i+1]; + i += 2; + r = utf16.decode_surrogate_pair(rune(c1), rune(c2)); + } + + b, w := utf8.encode_rune(r); + n += copy(buf[n:], b[:w]); + } + case []u32: + for r in v { + b, w := utf8.encode_rune(rune(r)); + n += copy(buf[n:], b[:w]); + } + } + } + + new_tok := tokenizer.copy_token(start); + new_tok.lit = ""; + new_tok.val = string(buf[:n]); + new_tok.next = tok1; + new_tok.type_hint = type_hint; + start^ = new_tok^; + case 2: + for t := start; t != nil && t.kind == .String; t = t.next { + #partial switch v in t.val { + case string: max_len += len(v); + case []u16: max_len += len(v); + case []u32: max_len += 2*len(v); + } + } + n := 0; + buf := make([]u16, max_len); + for t := start; t != nil && t.kind == .String; t = t.next { + #partial switch v in t.val { + case string: + for r in v { + if r >= 0x10000 { + c1, c2 := utf16.encode_surrogate_pair(r); + buf[n+0] = u16(c1); + buf[n+1] = u16(c2); + n += 2; + } else { + buf[n] = u16(r); + n += 1; + } + } + case []u16: + n += copy(buf[n:], v); + case []u32: + for r in v { + if r 
>= 0x10000 { + c1, c2 := utf16.encode_surrogate_pair(rune(r)); + buf[n+0] = u16(c1); + buf[n+1] = u16(c2); + n += 2; + } else { + buf[n] = u16(r); + n += 1; + } + } + } + } + + new_tok := tokenizer.copy_token(start); + new_tok.lit = ""; + new_tok.val = buf[:n]; + new_tok.next = tok1; + new_tok.type_hint = type_hint; + start^ = new_tok^; + case 4: + for t := start; t != nil && t.kind == .String; t = t.next { + #partial switch v in t.val { + case string: max_len += len(v); + case []u16: max_len += len(v); + case []u32: max_len += len(v); + } + } + n := 0; + buf := make([]u32, max_len); + for t := start; t != nil && t.kind == .String; t = t.next { + #partial switch v in t.val { + case string: + for r in v { + buf[n] = u32(r); + n += 1; + } + case []u16: + for i := 0; i < len(v); /**/ { + c1 := v[i]; + if !utf16.is_surrogate(rune(c1)) { + buf[n] = u32(c1); + n += 1; + i += 1; + } else if i+1 == len(v) { + buf[n] = utf16.REPLACEMENT_CHAR; + n += 1; + i += 1; + } else { + c2 := v[i+1]; + i += 2; + r := utf16.decode_surrogate_pair(rune(c1), rune(c2)); + buf[n] = u32(r); + n += 1; + } + } + case []u32: + n += copy(buf[n:], v); + } + } + + new_tok := tokenizer.copy_token(start); + new_tok.lit = ""; + new_tok.val = buf[:n]; + new_tok.next = tok1; + new_tok.type_hint = type_hint; + start^ = new_tok^; + } + } +} + + +quote_string :: proc(s: string) -> []byte { + b := &strings.Builder{}; + strings.init_builder(b, 0, len(s)+2); + strings.write_quoted_string(b, s, '"'); + return b.buf[:]; +} + + +_init_tokenizer_from_preprocessor :: proc(t: ^Tokenizer, cpp: ^Preprocessor) -> ^Tokenizer { + t.warn = cpp.warn; + t.err = cpp.err; + return t; +} + +new_string_token :: proc(cpp: ^Preprocessor, str: string, tok: ^Token) -> ^Token { + assert(tok != nil); + assert(str != ""); + t := _init_tokenizer_from_preprocessor(&Tokenizer{}, cpp); + src := quote_string(str); + return tokenizer.inline_tokenize(t, tok, src); +} + +stringize :: proc(cpp: ^Preprocessor, hash, arg: ^Token) -> ^Token { + 
s := join_tokens(arg, nil); + return new_string_token(cpp, s, hash); +} + + +new_number_token :: proc(cpp: ^Preprocessor, i: i64, tok: ^Token) -> ^Token { + t := _init_tokenizer_from_preprocessor(&Tokenizer{}, cpp); + buf: [32]byte; + n := len(strconv.append_int(buf[:], i, 10)); + src := make([]byte, n); + copy(src, buf[:n]); + return tokenizer.inline_tokenize(t, tok, src); +} + + +find_macro :: proc(cpp: ^Preprocessor, tok: ^Token) -> ^Macro { + if tok.kind != .Ident { + return nil; + } + return cpp.macros[tok.lit]; +} + +add_macro :: proc(cpp: ^Preprocessor, name: string, kind: Macro_Kind, body: ^Token) -> ^Macro { + m := new(Macro); + m.name = name; + m.kind = kind; + m.body = body; + cpp.macros[name] = m; + return m; +} + + +undef_macro :: proc(cpp: ^Preprocessor, name: string) { + delete_key(&cpp.macros, name); +} + +add_builtin :: proc(cpp: ^Preprocessor, name: string, handler: Macro_Handler) -> ^Macro { + m := add_macro(cpp, name, .Value_Like, nil); + m.handler = handler; + return m; +} + + +skip :: proc(cpp: ^Preprocessor, tok: ^Token, op: string) -> ^Token { + if tok.lit != op { + error(cpp, tok, "expected '%q'", op); + } + return tok.next; +} + +consume :: proc(rest: ^^Token, tok: ^Token, lit: string) -> bool { + if tok.lit == lit { + rest^ = tok.next; + return true; + } + rest^ = tok; + return false; +} + +read_macro_params :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token) -> (param: ^Macro_Param, va_args_name: string) { + head: Macro_Param; + curr := &head; + + tok := tok; + for tok.lit != ")" && tok.kind != .EOF { + if curr != &head { + tok = skip(cpp, tok, ","); + } + + if tok.lit == "..." { + va_args_name = "__VA_ARGS__"; + rest^ = skip(cpp, tok.next, ")"); + param = head.next; + return; + } + + if tok.kind != .Ident { + error(cpp, tok, "expected an identifier"); + } + + if tok.next.lit == "..." 
{ + va_args_name = tok.lit; + rest^ = skip(cpp, tok.next.next, ")"); + param = head.next; + return; + } + + m := new(Macro_Param); + m.name = tok.lit; + curr.next = m; + curr = curr.next; + tok = tok.next; + } + + + rest^ = tok.next; + param = head.next; + return; +} + +copy_line :: proc(rest: ^^Token, tok: ^Token) -> ^Token { + head: Token; + curr := &head; + + tok := tok; + for ; !tok.at_bol; tok = tok.next { + curr.next = tokenizer.copy_token(tok); + curr = curr.next; + } + curr.next = tokenizer.new_eof(tok); + rest^ = tok; + return head.next; +} + +read_macro_definition :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token) { + tok := tok; + if tok.kind != .Ident { + error(cpp, tok, "macro name must be an identifier"); + } + name := tok.lit; + tok = tok.next; + + if !tok.has_space && tok.lit == "(" { + params, va_args_name := read_macro_params(cpp, &tok, tok.next); + + m := add_macro(cpp, name, .Function_Like, copy_line(rest, tok)); + m.params = params; + m.va_args_name = va_args_name; + } else { + add_macro(cpp, name, .Value_Like, copy_line(rest, tok)); + } +} + + +join_tokens :: proc(tok, end: ^Token) -> string { + n := 1; + for t := tok; t != end && t.kind != .EOF; t = t.next { + if t != tok && t.has_space { + n += 1; + } + n += len(t.lit); + } + + buf := make([]byte, n); + + pos := 0; + for t := tok; t != end && t.kind != .EOF; t = t.next { + if t != tok && t.has_space { + buf[pos] = ' '; + pos += 1; + } + copy(buf[pos:], t.lit); + pos += len(t.lit); + } + + return string(buf[:pos]); +} + +read_include_filename :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token) -> (filename: string, is_quote: bool) { + tok := tok; + + if tok.kind == .String { + rest^ = skip_line(cpp, tok.next); + filename = tok.lit[1:len(tok.lit)-1]; + is_quote = true; + return; + } + + if tok.lit == "<" { + start := tok; + for ; tok.kind != .EOF; tok = tok.next { + if tok.at_bol || tok.kind == .EOF { + error(cpp, tok, "expected '>'"); + } + is_quote = false; + if tok.lit == ">" { + 
break; + } + } + rest^ = skip_line(cpp, tok.next); + filename = join_tokens(start.next, tok); + return; + } + + if tok.kind == .Ident { + tok2 := preprocess_internal(cpp, copy_line(rest, tok)); + return read_include_filename(cpp, &tok2, tok2); + } + + error(cpp, tok, "expected a filename"); + return; +} + +skip_cond_incl :: proc(tok: ^Token) -> ^Token { + next_skip :: proc(tok: ^Token) -> ^Token { + tok := tok; + for tok.kind != .EOF { + if is_hash(tok) { + switch tok.next.lit { + case "if", "ifdef", "ifndef": + tok = next_skip(tok.next.next); + continue; + + case "endif": + return tok.next.next; + } + } + tok = tok.next; + } + return tok; + } + + tok := tok; + + loop: for tok.kind != .EOF { + if is_hash(tok) { + switch tok.next.lit { + case "if", "ifdef", "ifndef": + tok = next_skip(tok.next.next); + continue loop; + + case "elif", "else", "endif": + break loop; + } + } + + tok = tok.next; + } + return tok; +} + +check_for_include_guard :: proc(tok: ^Token) -> (guard: string, ok: bool) { + if !is_hash(tok) || tok.next.lit != "ifndef" { + return; + } + tok := tok; + tok = tok.next.next; + + if tok.kind != .Ident { + return; + } + + m := tok.lit; + tok = tok.next; + + if !is_hash(tok) || tok.next.lit != "define" || tok.next.lit != "macro" { + return; + } + + for tok.kind != .EOF { + if !is_hash(tok) { + tok = tok.next; + continue; + } + + if tok.next.lit == "endif" && tok.next.next.kind == .EOF { + return m, true; + } + + switch tok.lit { + case "if", "ifdef", "ifndef": + tok = skip_cond_incl(tok.next); + case: + tok = tok.next; + } + } + return; +} + +include_file :: proc(cpp: ^Preprocessor, tok: ^Token, path: string, filename_tok: ^Token) -> ^Token { + if cpp.pragma_once[path] { + return tok; + } + + guard_name, guard_name_found := cpp.include_guards[path]; + if guard_name_found && cpp.macros[guard_name] != nil { + return tok; + } + + if !os.exists(path) { + error(cpp, filename_tok, "%s: cannot open file", path); + return tok; + } + + cpp.include_level += 1; + if 
cpp.include_level > MAX_INCLUDE_LEVEL { + error(cpp, tok, "exceeded maximum nest amount: %d", MAX_INCLUDE_LEVEL); + return tok; + } + + t := _init_tokenizer_from_preprocessor(&Tokenizer{}, cpp); + tok2 := tokenizer.tokenize_file(t, path, /*file.id*/1); + if tok2 == nil { + error(cpp, filename_tok, "%s: cannot open file", path); + } + cpp.include_level -= 1; + + guard_name, guard_name_found = check_for_include_guard(tok2); + if guard_name_found { + cpp.include_guards[path] = guard_name; + } + + return append_token(tok2, tok); +} + +find_arg :: proc(args: ^Macro_Arg, tok: ^Token) -> ^Macro_Arg { + for ap := args; ap != nil; ap = ap.next { + if tok.lit == ap.name { + return ap; + } + } + return nil; +} + +paste :: proc(cpp: ^Preprocessor, lhs, rhs: ^Token) -> ^Token { + buf := strings.concatenate({lhs.lit, rhs.lit}); + t := _init_tokenizer_from_preprocessor(&Tokenizer{}, cpp); + tok := tokenizer.inline_tokenize(t, lhs, transmute([]byte)buf); + if tok.next.kind != .EOF { + error(cpp, lhs, "pasting forms '%s', an invalid token", buf); + } + return tok; +} + +has_varargs :: proc(args: ^Macro_Arg) -> bool { + for ap := args; ap != nil; ap = ap.next { + if ap.name == "__VA_ARGS__" { + return ap.tok.kind != .EOF; + } + } + return false; +} + +substitute_token :: proc(cpp: ^Preprocessor, tok: ^Token, args: ^Macro_Arg) -> ^Token { + head: Token; + curr := &head; + tok := tok; + for tok.kind != .EOF { + if tok.lit == "#" { + arg := find_arg(args, tok.next); + if arg == nil { + error(cpp, tok.next, "'#' is not followed by a macro parameter"); + } + arg_tok := arg.tok if arg != nil else tok.next; + curr.next = stringize(cpp, tok, arg_tok); + curr = curr.next; + tok = tok.next.next; + continue; + } + + if tok.lit == "," && tok.next.lit == "##" { + if arg := find_arg(args, tok.next.next); arg != nil && arg.is_va_args { + if arg.tok.kind == .EOF { + tok = tok.next.next.next; + } else { + curr.next = tokenizer.copy_token(tok); + curr = curr.next; + tok = tok.next.next; + } + 
continue; + } + } + + if tok.lit == "##" { + if curr == &head { + error(cpp, tok, "'##' cannot appear at start of macro expansion"); + } + if tok.next.kind == .EOF { + error(cpp, tok, "'##' cannot appear at end of macro expansion"); + } + + if arg := find_arg(args, tok.next); arg != nil { + if arg.tok.kind != .EOF { + curr^ = paste(cpp, curr, arg.tok)^; + for t := arg.tok.next; t.kind != .EOF; t = t.next { + curr.next = tokenizer.copy_token(t); + curr = curr.next; + } + } + tok = tok.next.next; + continue; + } + + curr^ = paste(cpp, curr, tok.next)^; + tok = tok.next.next; + continue; + } + + arg := find_arg(args, tok); + + if arg != nil && tok.next.lit == "##" { + rhs := tok.next.next; + + if arg.tok.kind == .EOF { + args2 := find_arg(args, rhs); + if args2 != nil { + for t := args.tok; t.kind != .EOF; t = t.next { + curr.next = tokenizer.copy_token(t); + curr = curr.next; + } + } else { + curr.next = tokenizer.copy_token(rhs); + curr = curr.next; + } + tok = rhs.next; + continue; + } + + for t := arg.tok; t.kind != .EOF; t = t.next { + curr.next = tokenizer.copy_token(t); + curr = curr.next; + } + tok = tok.next; + continue; + } + + if tok.lit == "__VA__OPT__" && tok.next.lit == "(" { + opt_arg := read_macro_arg_one(cpp, &tok, tok.next.next, true); + if has_varargs(args) { + for t := opt_arg.tok; t.kind != .EOF; t = t.next { + curr.next = t; + curr = curr.next; + } + } + tok = skip(cpp, tok, ")"); + continue; + } + + if arg != nil { + t := preprocess_internal(cpp, arg.tok); + t.at_bol = tok.at_bol; + t.has_space = tok.has_space; + for ; t.kind != .EOF; t = t.next { + curr.next = tokenizer.copy_token(t); + curr = curr.next; + } + tok = tok.next; + continue; + } + + curr.next = tokenizer.copy_token(tok); + curr = curr.next; + tok = tok.next; + continue; + } + + curr.next = tok; + return head.next; +} + +read_macro_arg_one :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token, read_rest: bool) -> ^Macro_Arg { + tok := tok; + head: Token; + curr := &head; + level := 
0; + for { + if level == 0 && tok.lit == ")" { + break; + } + if level == 0 && !read_rest && tok.lit == "," { + break; + } + + if tok.kind == .EOF { + error(cpp, tok, "premature end of input"); + } + + switch tok.lit { + case "(": level += 1; + case ")": level -= 1; + } + + curr.next = tokenizer.copy_token(tok); + curr = curr.next; + tok = tok.next; + } + curr.next = tokenizer.new_eof(tok); + + arg := new(Macro_Arg); + arg.tok = head.next; + rest^ = tok; + return arg; +} + +read_macro_args :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token, params: ^Macro_Param, va_args_name: string) -> ^Macro_Arg { + tok := tok; + start := tok; + tok = tok.next.next; + + head: Macro_Arg; + curr := &head; + + pp := params; + for ; pp != nil; pp = pp.next { + if curr != &head { + tok = skip(cpp, tok, ","); + } + curr.next = read_macro_arg_one(cpp, &tok, tok, false); + curr = curr.next; + curr.name = pp.name; + } + + if va_args_name != "" { + arg: ^Macro_Arg; + if tok.lit == ")" { + arg = new(Macro_Arg); + arg.tok = tokenizer.new_eof(tok); + } else { + if pp != params { + tok = skip(cpp, tok, ","); + } + arg = read_macro_arg_one(cpp, &tok, tok, true); + } + arg.name = va_args_name; + arg.is_va_args = true; + curr.next = arg; + curr = curr.next; + } else if pp != nil { + error(cpp, start, "too many arguments"); + } + + skip(cpp, tok, ")"); + rest^ = tok; + return head.next; +} + +expand_macro :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token) -> bool { + if tokenizer.hide_set_contains(tok.hide_set, tok.lit) { + return false; + } + tok := tok; + m := find_macro(cpp, tok); + if m == nil { + return false; + } + + if m.handler != nil { + rest^ = m.handler(cpp, tok); + rest^.next = tok.next; + return true; + } + + if m.kind == .Value_Like { + hs := tokenizer.hide_set_union(tok.hide_set, tokenizer.new_hide_set(m.name)); + body := tokenizer.add_hide_set(m.body, hs); + for t := body; t.kind != .EOF; t = t.next { + t.origin = tok; + } + rest^ = append_token(body, tok.next); + 
rest^.at_bol = tok.at_bol; + rest^.has_space = tok.has_space; + return true; + } + + if tok.next.lit != "(" { + return false; + } + + macro_token := tok; + args := read_macro_args(cpp, &tok, tok, m.params, m.va_args_name); + close_paren := tok; + + hs := tokenizer.hide_set_intersection(macro_token.hide_set, close_paren.hide_set); + hs = tokenizer.hide_set_union(hs, tokenizer.new_hide_set(m.name)); + + body := substitute_token(cpp, m.body, args); + body = tokenizer.add_hide_set(body, hs); + for t := body; t.kind != .EOF; t = t.next { + t.origin = macro_token; + } + rest^ = append_token(body, tok.next); + rest^.at_bol = macro_token.at_bol; + rest^.has_space = macro_token.has_space; + return true; +} + +search_include_next :: proc(cpp: ^Preprocessor, filename: string) -> (path: string, ok: bool) { + for ; cpp.include_next_index < len(cpp.include_paths); cpp.include_next_index += 1 { + tpath := filepath.join(elems={cpp.include_paths[cpp.include_next_index], filename}, allocator=context.temp_allocator); + if os.exists(tpath) { + return strings.clone(tpath), true; + } + } + return; +} + +search_include_paths :: proc(cpp: ^Preprocessor, filename: string) -> (path: string, ok: bool) { + if filepath.is_abs(filename) { + return filename, true; + } + + if path, ok = cpp.filepath_cache[filename]; ok { + return; + } + + for include_path in cpp.include_paths { + tpath := filepath.join(elems={include_path, filename}, allocator=context.temp_allocator); + if os.exists(tpath) { + path, ok = strings.clone(tpath), true; + cpp.filepath_cache[filename] = path; + return; + } + } + + return; +} + +read_const_expr :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token) -> ^Token { + tok := tok; + tok = copy_line(rest, tok); + head: Token; + curr := &head; + for tok.kind != .EOF { + if tok.lit == "defined" { + start := tok; + has_paren := consume(&tok, tok.next, "("); + if tok.kind != .Ident { + error(cpp, start, "macro name must be an identifier"); + } + m := find_macro(cpp, tok); + tok = 
tok.next; + + if has_paren { + tok = skip(cpp, tok, ")"); + } + + curr.next = new_number_token(cpp, 1 if m != nil else 0, start); + curr = curr.next; + continue; + } + + curr.next = tok; + curr = curr.next; + tok = tok.next; + } + + curr.next = tok; + return head.next; +} + +eval_const_expr :: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token) -> (val: i64) { + tok := tok; + start := tok; + expr := read_const_expr(cpp, rest, tok.next); + expr = preprocess_internal(cpp, expr); + + if expr.kind == .EOF { + error(cpp, start, "no expression"); + } + + for t := expr; t.kind != .EOF; t = t.next { + if t.kind == .Ident { + next := t.next; + t^ = new_number_token(cpp, 0, t)^; + t.next = next; + } + } + + val = 1; + convert_pp_tokens(cpp, expr, tokenizer.default_is_keyword); + + rest2: ^Token; + val = const_expr(&rest2, expr); + if rest2 != nil && rest2.kind != .EOF { + error(cpp, rest2, "extra token"); + } + return; +} + +push_cond_incl :: proc(cpp: ^Preprocessor, tok: ^Token, included: bool) -> ^Cond_Incl { + ci := new(Cond_Incl); + ci.next = cpp.cond_incl; + ci.state = .In_Then; + ci.tok = tok; + ci.included = included; + cpp.cond_incl = ci; + return ci; +} + +read_line_marker:: proc(cpp: ^Preprocessor, rest: ^^Token, tok: ^Token) { + tok := tok; + start := tok; + tok = preprocess(cpp, copy_line(rest, tok)); + if tok.kind != .Number { + error(cpp, tok, "invalid line marker"); + } + ival, _ := tok.val.(i64); + start.file.line_delta = int(ival - i64(start.pos.line)); + tok = tok.next; + if tok.kind == .EOF { + return; + } + + if tok.kind != .String { + error(cpp, tok, "filename expected"); + } + start.file.display_name = tok.lit; +} + +preprocess_internal :: proc(cpp: ^Preprocessor, tok: ^Token) -> ^Token { + head: Token; + curr := &head; + + tok := tok; + for tok != nil && tok.kind != .EOF { + if expand_macro(cpp, &tok, tok) { + continue; + } + + if !is_hash(tok) { + if tok.file != nil { + tok.line_delta = tok.file.line_delta; + } + curr.next = tok; + curr = curr.next; 
+ tok = tok.next; + continue; + } + + start := tok; + tok = tok.next; + + switch tok.lit { + case "include": + filename, is_quote := read_include_filename(cpp, &tok, tok.next); + is_absolute := filepath.is_abs(filename); + if is_absolute { + tok = include_file(cpp, tok, filename, start.next.next); + continue; + } + + if is_quote { + dir := ""; + if start.file != nil { + dir = filepath.dir(start.file.name); + } + path := filepath.join(dir, filename); + if os.exists(path) { + tok = include_file(cpp, tok, path, start.next.next); + continue; + } + } + + path, ok := search_include_paths(cpp, filename); + if !ok { + path = filename; + } + tok = include_file(cpp, tok, path, start.next.next); + continue; + + case "include_next": + filename, _ := read_include_filename(cpp, &tok, tok.next); + path, ok := search_include_next(cpp, filename); + if !ok { + path = filename; + } + tok = include_file(cpp, tok, path, start.next.next); + continue; + + case "define": + read_macro_definition(cpp, &tok, tok.next); + continue; + + case "undef": + tok = tok.next; + if tok.kind != .Ident { + error(cpp, tok, "macro name must be an identifier"); + } + undef_macro(cpp, tok.lit); + tok = skip_line(cpp, tok.next); + continue; + + case "if": + val := eval_const_expr(cpp, &tok, tok); + push_cond_incl(cpp, start, val != 0); + if val == 0 { + tok = skip_cond_incl(tok); + } + continue; + + case "ifdef": + defined := find_macro(cpp, tok.next); + push_cond_incl(cpp, tok, defined != nil); + tok = skip_line(cpp, tok.next.next); + if defined == nil { + tok = skip_cond_incl(tok); + } + continue; + + case "ifndef": + defined := find_macro(cpp, tok.next); + push_cond_incl(cpp, tok, defined != nil); + tok = skip_line(cpp, tok.next.next); + if !(defined == nil) { + tok = skip_cond_incl(tok); + } + continue; + + case "elif": + if cpp.cond_incl == nil || cpp.cond_incl.state == .In_Else { + error(cpp, start, "stray #elif"); + } + if cpp.cond_incl != nil { + cpp.cond_incl.state = .In_Elif; + } + + if 
(cpp.cond_incl != nil && !cpp.cond_incl.included) && eval_const_expr(cpp, &tok, tok) != 0 { + cpp.cond_incl.included = true; + } else { + tok = skip_cond_incl(tok); + } + continue; + + case "else": + if cpp.cond_incl == nil || cpp.cond_incl.state == .In_Else { + error(cpp, start, "stray #else"); + } + if cpp.cond_incl != nil { + cpp.cond_incl.state = .In_Else; + } + tok = skip_line(cpp, tok.next); + + if cpp.cond_incl != nil { + tok = skip_cond_incl(tok); + } + continue; + + case "endif": + if cpp.cond_incl == nil { + error(cpp, start, "stray #endif"); + } else { + cpp.cond_incl = cpp.cond_incl.next; + } + tok = skip_line(cpp, tok.next); + continue; + + case "line": + read_line_marker(cpp, &tok, tok.next); + continue; + + case "pragma": + if tok.next.lit == "once" { + cpp.pragma_once[tok.pos.file] = true; + tok = skip_line(cpp, tok.next.next); + continue; + } + + pragma_tok, pragma_end := tok, tok; + + for tok != nil && tok.kind != .EOF { + pragma_end = tok; + tok = tok.next; + if tok.at_bol { + break; + } + } + pragma_end.next = tokenizer.new_eof(tok); + if cpp.pragma_handler != nil { + cpp.pragma_handler(cpp, pragma_tok.next); + continue; + } + + continue; + + case "error": + error(cpp, tok, "error"); + } + + if tok.kind == .PP_Number { + read_line_marker(cpp, &tok, tok); + continue; + } + + if !tok.at_bol { + error(cpp, tok, "invalid preprocessor directive"); + } + } + + curr.next = tok; + return head.next; +} + + +preprocess :: proc(cpp: ^Preprocessor, tok: ^Token) -> ^Token { + tok := tok; + tok = preprocess_internal(cpp, tok); + if cpp.cond_incl != nil { + error(cpp, tok, "unterminated conditional directive"); + } + convert_pp_tokens(cpp, tok, tokenizer.default_is_keyword); + join_adjacent_string_literals(cpp, tok); + for t := tok; t != nil; t = t.next { + t.pos.line += t.line_delta; + } + return tok; +} + + +define_macro :: proc(cpp: ^Preprocessor, name, def: string) { + src := transmute([]byte)def; + + file := new(tokenizer.File); + file.id = -1; + file.src 
= src; + file.name = ""; + file.display_name = file.name; + + + t := _init_tokenizer_from_preprocessor(&Tokenizer{}, cpp); + tok := tokenizer.tokenize(t, file); + add_macro(cpp, name, .Value_Like, tok); +} + + +file_macro :: proc(cpp: ^Preprocessor, tok: ^Token) -> ^Token { + tok := tok; + for tok.origin != nil { + tok = tok.origin; + } + i := i64(tok.pos.line + tok.file.line_delta); + return new_number_token(cpp, i, tok); +} +line_macro :: proc(cpp: ^Preprocessor, tok: ^Token) -> ^Token { + tok := tok; + for tok.origin != nil { + tok = tok.origin; + } + return new_string_token(cpp, tok.file.display_name, tok); +} +counter_macro :: proc(cpp: ^Preprocessor, tok: ^Token) -> ^Token { + i := cpp.counter; + cpp.counter += 1; + return new_number_token(cpp, i, tok); +} + +init_default_macros :: proc(cpp: ^Preprocessor) { + define_macro(cpp, "__C99_MACRO_WITH_VA_ARGS", "1"); + define_macro(cpp, "__alignof__", "_Alignof"); + define_macro(cpp, "__const__", "const"); + define_macro(cpp, "__inline__", "inline"); + define_macro(cpp, "__signed__", "signed"); + define_macro(cpp, "__typeof__", "typeof"); + define_macro(cpp, "__volatile__", "volatile"); + + add_builtin(cpp, "__FILE__", file_macro); + add_builtin(cpp, "__LINE__", line_macro); + add_builtin(cpp, "__COUNTER__", counter_macro); +} + +init_lookup_tables :: proc(cpp: ^Preprocessor, allocator := context.allocator) { + context.allocator = allocator; + reserve(&cpp.macros, max(16, cap(cpp.macros))); + reserve(&cpp.pragma_once, max(16, cap(cpp.pragma_once))); + reserve(&cpp.include_guards, max(16, cap(cpp.include_guards))); + reserve(&cpp.filepath_cache, max(16, cap(cpp.filepath_cache))); +} diff --git a/core/c/frontend/preprocessor/unquote.odin b/core/c/frontend/preprocessor/unquote.odin new file mode 100644 index 000000000..4aa4926e7 --- /dev/null +++ b/core/c/frontend/preprocessor/unquote.odin @@ -0,0 +1,154 @@ +package c_frontend_preprocess + +import "core:unicode/utf8" + +unquote_char :: proc(str: string, quote: byte) 
-> (r: rune, multiple_bytes: bool, tail_string: string, success: bool) { + hex_to_int :: proc(c: byte) -> int { + switch c { + case '0'..'9': return int(c-'0'); + case 'a'..'f': return int(c-'a')+10; + case 'A'..'F': return int(c-'A')+10; + } + return -1; + } + w: int; + + if str[0] == quote && quote == '"' { + return; + } else if str[0] >= 0x80 { + r, w = utf8.decode_rune_in_string(str); + return r, true, str[w:], true; + } else if str[0] != '\\' { + return rune(str[0]), false, str[1:], true; + } + + if len(str) <= 1 { + return; + } + s := str; + c := s[1]; + s = s[2:]; + + switch c { + case: r = rune(c); + + case 'a': r = '\a'; + case 'b': r = '\b'; + case 'e': r = '\e'; + case 'f': r = '\f'; + case 'n': r = '\n'; + case 'r': r = '\r'; + case 't': r = '\t'; + case 'v': r = '\v'; + case '\\': r = '\\'; + + case '"': r = '"'; + case '\'': r = '\''; + + case '0'..'7': + v := int(c-'0'); + if len(s) < 2 { + return; + } + for i in 0.. 7 { + return; + } + v = (v<<3) | d; + } + s = s[2:]; + if v > 0xff { + return; + } + r = rune(v); + + case 'x', 'u', 'U': + count: int; + switch c { + case 'x': count = 2; + case 'u': count = 4; + case 'U': count = 8; + } + + if len(s) < count { + return; + } + + for i in 0.. 
utf8.MAX_RUNE { + return; + } + multiple_bytes = true; + } + + success = true; + tail_string = s; + return; +} + +// unquote_string decodes the double-quoted literal `lit` into its unescaped contents. +// res is the decoded text; allocated reports whether res was newly allocated with `allocator` +// (when false, res is either empty or a slice of lit); success is false when lit contains a +// newline or an escape that unquote_char rejects. +unquote_string :: proc(lit: string, allocator := context.allocator) -> (res: string, allocated, success: bool) { + contains_rune :: proc(s: string, r: rune) -> int { + for c, offset in s { + if c == r { + return offset; + } + } + return -1; + } + + assert(len(lit) >= 2); + + s := lit; + quote := '"'; + + if s == `""` { + return "", false, true; + } + + if contains_rune(s, '\n') >= 0 { + return s, false, false; + } + + // NOTE(review): s still includes the surrounding quotes at this point, so for a quoted + // literal this fast path looks unreachable - confirm the intended order. + if contains_rune(s, '\\') < 0 && contains_rune(s, quote) < 0 { + if quote == '"' { + return s, false, true; + } + } + s = s[1:len(s)-1]; + + + // Worst case is an invalid UTF-8 byte decoding to utf8.RUNE_ERROR, which re-encodes as + // 3 bytes, so 3*len(s) is the upper bound on the decoded size. + buf_len := 3*len(s); + buf := make([]byte, buf_len, allocator); + offset := 0; + for len(s) > 0 { + r, multiple_bytes, tail_string, ok := unquote_char(s, byte(quote)); + if !ok { + // buf was made with `allocator`, so it must be released with the same one. + delete(buf, allocator); + return s, false, false; + } + s = tail_string; + if r < 0x80 || !multiple_bytes { + buf[offset] = byte(r); + offset += 1; + } else { + b, w := utf8.encode_rune(r); + copy(buf[offset:], b[:w]); + offset += w; + } + } + + new_string := string(buf[:offset]); + + return new_string, true, true; +} diff --git a/core/c/frontend/tokenizer/doc.odin b/core/c/frontend/tokenizer/doc.odin new file mode 100644 index 000000000..bfac6e6fa --- /dev/null +++ b/core/c/frontend/tokenizer/doc.odin @@ -0,0 +1,35 @@ +/* +package demo + +import tokenizer "core:c/frontend/tokenizer" +import preprocessor "core:c/frontend/preprocessor" +import "core:fmt" +import "core:path/filepath" + +main :: proc() { + t := &tokenizer.Tokenizer{}; + tokenizer.init_defaults(t); + + cpp := &preprocessor.Preprocessor{}; + cpp.warn, cpp.err = t.warn, t.err; + preprocessor.init_lookup_tables(cpp); + preprocessor.init_default_macros(cpp); + cpp.include_paths = {"W:/Odin/core/c/frontend/include"}; + + tok := tokenizer.tokenize_file(t, match_path, 1); + + tok = preprocessor.preprocess(cpp, tok); + if tok != nil { + for t := tok; t.kind != .EOF; t = 
t.next { + fmt.println(t.lit); + } + } + + fmt.println("[Done]"); +} +*/ + + +package c_frontend_tokenizer + + diff --git a/core/c/frontend/tokenizer/hide_set.odin b/core/c/frontend/tokenizer/hide_set.odin new file mode 100644 index 000000000..9cf3b4aea --- /dev/null +++ b/core/c/frontend/tokenizer/hide_set.odin @@ -0,0 +1,68 @@ +package c_frontend_tokenizer + +// NOTE(bill): This is a really dumb approach for a hide set, +// but it's really simple and probably fast enough in practice + + +// Hide_Set is one node of a singly linked list of names; a nil pointer is the empty set. +Hide_Set :: struct { + next: ^Hide_Set, + name: string, +} + + +// new_hide_set allocates a single-node set holding `name`. +new_hide_set :: proc(name: string) -> ^Hide_Set { + hs := new(Hide_Set); + hs.name = name; + return hs; +} + +// hide_set_contains reports whether `name` occurs in the list starting at `hs` (linear scan). +hide_set_contains :: proc(hs: ^Hide_Set, name: string) -> bool { + for h := hs; h != nil; h = h.next { + if h.name == name { + return true; + } + } + return false; +} + + +// hide_set_union returns a list made of fresh copies of a's nodes followed by b itself. +// b is linked in rather than copied, and names present in both lists are not de-duplicated. +hide_set_union :: proc(a, b: ^Hide_Set) -> ^Hide_Set { + head: Hide_Set; + curr := &head; + + for h := a; h != nil; h = h.next { + curr.next = new_hide_set(h.name); + curr = curr.next; + } + curr.next = b; + return head.next; +} + + +// hide_set_intersection returns a new list with a copy of every node of a whose name also +// occurs in b; the result is nil when there is no common name. +hide_set_intersection :: proc(a, b: ^Hide_Set) -> ^Hide_Set { + head: Hide_Set; + curr := &head; + + for h := a; h != nil; h = h.next { + if hide_set_contains(b, h.name) { + curr.next = new_hide_set(h.name); + curr = curr.next; + } + } + return head.next; +} + + +// add_hide_set returns a copy of the token list starting at `tok` in which every copied +// token's hide_set is the union of its previous hide_set and `hs`; the input tokens are +// left unmodified. +add_hide_set :: proc(tok: ^Token, hs: ^Hide_Set) -> ^Token { + head: Token; + curr := &head; + + tok := tok; + for ; tok != nil; tok = tok.next { + t := copy_token(tok); + t.hide_set = hide_set_union(t.hide_set, hs); + curr.next = t; + curr = curr.next; + } + return head.next; +} diff --git a/core/c/frontend/tokenizer/token.odin b/core/c/frontend/tokenizer/token.odin new file mode 100644 index 000000000..a85468e6a --- /dev/null +++ b/core/c/frontend/tokenizer/token.odin @@ -0,0 +1,169 @@ +package c_frontend_tokenizer + + +Pos :: struct { + file: string, + line: int, + column: int, + offset: int, +} + +Token_Kind :: enum { + Invalid, + Ident, + Punct, + 
Keyword, + Char, + String, + Number, + PP_Number, + Comment, + EOF, +} + +File :: struct { + name: string, + id: int, + src: []byte, + + display_name: string, + line_delta: int, +} + + +Token_Type_Hint :: enum u8 { + None, + + Int, + Long, + Long_Long, + + Unsigned_Int, + Unsigned_Long, + Unsigned_Long_Long, + + Float, + Double, + Long_Double, + + UTF_8, + UTF_16, + UTF_32, + UTF_Wide, +} + +Token_Value :: union { + i64, + f64, + string, + []u16, + []u32, +} + +Token :: struct { + kind: Token_Kind, + next: ^Token, + lit: string, + + pos: Pos, + file: ^File, + line_delta: int, + at_bol: bool, + has_space: bool, + + type_hint: Token_Type_Hint, + val: Token_Value, + prefix: string, + + // Preprocessor values + hide_set: ^Hide_Set, + origin: ^Token, +} + +Is_Keyword_Proc :: #type proc(tok: ^Token) -> bool; + +copy_token :: proc(tok: ^Token) -> ^Token { + t := new_clone(tok^); + t.next = nil; + return t; +} + +new_eof :: proc(tok: ^Token) -> ^Token { + t := new_clone(tok^); + t.kind = .EOF; + t.lit = ""; + return t; +} + +default_is_keyword :: proc(tok: ^Token) -> bool { + if tok.kind == .Keyword { + return true; + } + if len(tok.lit) > 0 { + return default_keyword_set[tok.lit]; + } + return false; +} + + +token_name := [Token_Kind]string { + .Invalid = "invalid", + .Ident = "ident", + .Punct = "punct", + .Keyword = "keyword", + .Char = "char", + .String = "string", + .Number = "number", + .PP_Number = "preprocessor number", + .Comment = "comment", + .EOF = "eof", +}; + +default_keyword_set := map[string]bool{ + "auto" = true, + "break" = true, + "case" = true, + "char" = true, + "const" = true, + "continue" = true, + "default" = true, + "do" = true, + "double" = true, + "else" = true, + "enum" = true, + "extern" = true, + "float" = true, + "for" = true, + "goto" = true, + "if" = true, + "int" = true, + "long" = true, + "register" = true, + "restrict" = true, + "return" = true, + "short" = true, + "signed" = true, + "sizeof" = true, + "static" = true, + "struct" = true, 
+ "switch" = true, + "typedef" = true, + "union" = true, + "unsigned" = true, + "void" = true, + "volatile" = true, + "while" = true, + "_Alignas" = true, + "_Alignof" = true, + "_Atomic" = true, + "_Bool" = true, + "_Generic" = true, + "_Noreturn" = true, + "_Thread_local" = true, + "__restrict" = true, + "typeof" = true, + "asm" = true, + "__restrict__" = true, + "__thread" = true, + "__attribute__" = true, +}; diff --git a/core/c/frontend/tokenizer/tokenizer.odin b/core/c/frontend/tokenizer/tokenizer.odin new file mode 100644 index 000000000..d65a8cd4e --- /dev/null +++ b/core/c/frontend/tokenizer/tokenizer.odin @@ -0,0 +1,667 @@ +package c_frontend_tokenizer + +import "core:fmt" +import "core:os" +import "core:strings" +import "core:unicode/utf8" + + +Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any); + + +Tokenizer :: struct { + // Immutable data + path: string, + src: []byte, + + + // Tokenizing state + ch: rune, + offset: int, + read_offset: int, + line_offset: int, + line_count: int, + + // Extra information for tokens + at_bol: bool, + has_space: bool, + + // Mutable data + err: Error_Handler, + warn: Error_Handler, + error_count: int, + warning_count: int, +} + +init_defaults :: proc(t: ^Tokenizer, err: Error_Handler = default_error_handler, warn: Error_Handler = default_warn_handler) { + t.err = err; + t.warn = warn; +} + + +@(private) +offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> (pos: Pos) { + pos.file = t.path; + pos.offset = offset; + pos.line = t.line_count; + pos.column = offset - t.line_offset + 1; + return; +} + +default_error_handler :: proc(pos: Pos, msg: string, args: ..any) { + fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column); + fmt.eprintf(msg, ..args); + fmt.eprintf("\n"); +} + +default_warn_handler :: proc(pos: Pos, msg: string, args: ..any) { + fmt.eprintf("%s(%d:%d) warning: ", pos.file, pos.line, pos.column); + fmt.eprintf(msg, ..args); + fmt.eprintf("\n"); +} + +error_offset :: proc(t: ^Tokenizer, offset: 
int, msg: string, args: ..any) { + pos := offset_to_pos(t, offset); + if t.err != nil { + t.err(pos, msg, ..args); + } + t.error_count += 1; +} + +warn_offset :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) { + pos := offset_to_pos(t, offset); + if t.warn != nil { + t.warn(pos, msg, ..args); + } + t.warning_count += 1; +} + +error :: proc(t: ^Tokenizer, tok: ^Token, msg: string, args: ..any) { + pos := tok.pos; + if t.err != nil { + t.err(pos, msg, ..args); + } + t.error_count += 1; +} + +warn :: proc(t: ^Tokenizer, tok: ^Token, msg: string, args: ..any) { + pos := tok.pos; + if t.warn != nil { + t.warn(pos, msg, ..args); + } + t.warning_count += 1; +} + + +advance_rune :: proc(t: ^Tokenizer) { + if t.read_offset < len(t.src) { + t.offset = t.read_offset; + if t.ch == '\n' { + t.at_bol = true; + t.line_offset = t.offset; + t.line_count += 1; + } + r, w := rune(t.src[t.read_offset]), 1; + switch { + case r == 0: + error_offset(t, t.offset, "illegal character NUL"); + case r >= utf8.RUNE_SELF: + r, w = utf8.decode_rune(t.src[t.read_offset:]); + if r == utf8.RUNE_ERROR && w == 1 { + error_offset(t, t.offset, "illegal UTF-8 encoding"); + } else if r == utf8.RUNE_BOM && t.offset > 0 { + error_offset(t, t.offset, "illegal byte order mark"); + } + } + t.read_offset += w; + t.ch = r; + } else { + t.offset = len(t.src); + if t.ch == '\n' { + t.at_bol = true; + t.line_offset = t.offset; + t.line_count += 1; + } + t.ch = -1; + } +} + +advance_rune_n :: proc(t: ^Tokenizer, n: int) { + for in 0.. 
bool { + return '0' <= r && r <= '9'; +} + +skip_whitespace :: proc(t: ^Tokenizer) { + for { + switch t.ch { + case ' ', '\t', '\r', '\v', '\f', '\n': + t.has_space = true; + advance_rune(t); + case: + return; + } + } +} + +scan_comment :: proc(t: ^Tokenizer) -> string { + offset := t.offset-1; + next := -1; + general: { + if t.ch == '/'{ // line comments + advance_rune(t); + for t.ch != '\n' && t.ch >= 0 { + advance_rune(t); + } + + next = t.offset; + if t.ch == '\n' { + next += 1; + } + break general; + } + + /* style comment */ + advance_rune(t); + for t.ch >= 0 { + ch := t.ch; + advance_rune(t); + if ch == '*' && t.ch == '/' { + advance_rune(t); + next = t.offset; + break general; + } + } + + error_offset(t, offset, "comment not terminated"); + } + + lit := t.src[offset : t.offset]; + + // NOTE(bill): Strip CR for line comments + for len(lit) > 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' { + lit = lit[:len(lit)-1]; + } + + + return string(lit); +} + +scan_identifier :: proc(t: ^Tokenizer) -> string { + offset := t.offset; + + for is_ident1(t.ch) { + advance_rune(t); + } + + return string(t.src[offset : t.offset]); +} + +scan_string :: proc(t: ^Tokenizer) -> string { + offset := t.offset-1; + + for { + ch := t.ch; + if ch == '\n' || ch < 0 { + error_offset(t, offset, "string literal was not terminated"); + break; + } + advance_rune(t); + if ch == '"' { + break; + } + if ch == '\\' { + scan_escape(t); + } + } + + return string(t.src[offset : t.offset]); +} + +digit_val :: proc(r: rune) -> int { + switch r { + case '0'..'9': + return int(r-'0'); + case 'A'..'F': + return int(r-'A' + 10); + case 'a'..'f': + return int(r-'a' + 10); + } + return 16; +} + +scan_escape :: proc(t: ^Tokenizer) -> bool { + offset := t.offset; + + esc := t.ch; + n: int; + base, max: u32; + switch esc { + case 'a', 'b', 'e', 'f', 'n', 't', 'v', 'r', '\\', '\'', '\"': + advance_rune(t); + return true; + + case '0'..'7': + for digit_val(t.ch) < 8 { + advance_rune(t); + } + return true; + 
case 'x': + advance_rune(t); + for digit_val(t.ch) < 16 { + advance_rune(t); + } + return true; + case 'u': + advance_rune(t); + n, base, max = 4, 16, utf8.MAX_RUNE; + case 'U': + advance_rune(t); + n, base, max = 8, 16, utf8.MAX_RUNE; + case: + if t.ch < 0 { + error_offset(t, offset, "escape sequence was not terminated"); + } else { + break; + } + return false; + } + + x: u32; + main_loop: for n > 0 { + d := u32(digit_val(t.ch)); + if d >= base { + if t.ch == '"' || t.ch == '\'' { + break main_loop; + } + if t.ch < 0 { + error_offset(t, t.offset, "escape sequence was not terminated"); + } else { + error_offset(t, t.offset, "illegal character '%r' : %d in escape sequence", t.ch, t.ch); + } + return false; + } + + x = x*base + d; + advance_rune(t); + n -= 1; + } + + // Surrogates are U+D800..U+DFFF only; U+E000 is a valid code point and must be accepted. + if x > max || 0xd800 <= x && x < 0xe000 { + error_offset(t, offset, "escape sequence is an invalid Unicode code point"); + return false; + } + return true; +} + +// scan_rune scans a character literal starting one byte before t.offset (the opening quote) +// and returns its source text up to and including the closing quote. It reports an error +// when the literal hits a newline or end of input, or when it does not hold exactly one +// character or escape sequence. +scan_rune :: proc(t: ^Tokenizer) -> string { + offset := t.offset-1; + valid := true; + n := 0; + for { + ch := t.ch; + if ch == '\n' || ch < 0 { + if valid { + error_offset(t, offset, "rune literal not terminated"); + valid = false; + } + break; + } + advance_rune(t); + if ch == '\'' { + break; + } + n += 1; + if ch == '\\' { + if !scan_escape(t) { + valid = false; + } + } + } + + if valid && n != 1 { + error_offset(t, offset, "illegal rune literal"); + } + + return string(t.src[offset : t.offset]); +} + +scan_number :: proc(t: ^Tokenizer, seen_decimal_point: bool) -> (Token_Kind, string) { + scan_mantissa :: proc(t: ^Tokenizer, base: int) { + for digit_val(t.ch) < base { + advance_rune(t); + } + } + scan_exponent :: proc(t: ^Tokenizer) { + if t.ch == 'e' || t.ch == 'E' || t.ch == 'p' || t.ch == 'P' { + advance_rune(t); + if t.ch == '-' || t.ch == '+' { + advance_rune(t); + } + if digit_val(t.ch) < 10 { + scan_mantissa(t, 10); + } else { + error_offset(t, t.offset, "illegal floating-point exponent"); + } + } + } + scan_fraction :: proc(t: 
^Tokenizer) -> (early_exit: bool) { + if t.ch == '.' && peek(t) == '.' { + return true; + } + if t.ch == '.' { + advance_rune(t); + scan_mantissa(t, 10); + } + return false; + } + + check_end := true; + + + offset := t.offset; + seen_point := seen_decimal_point; + + if seen_point { + offset -= 1; + scan_mantissa(t, 10); + scan_exponent(t); + } else { + if t.ch == '0' { + int_base :: inline proc(t: ^Tokenizer, base: int, msg: string) { + prev := t.offset; + advance_rune(t); + scan_mantissa(t, base); + if t.offset - prev <= 1 { + error_offset(t, t.offset, msg); + } + } + + advance_rune(t); + switch t.ch { + case 'b', 'B': + int_base(t, 2, "illegal binary integer"); + case 'x', 'X': + int_base(t, 16, "illegal hexadecimal integer"); + case: + seen_point = false; + scan_mantissa(t, 10); + if t.ch == '.' { + seen_point = true; + if scan_fraction(t) { + check_end = false; + } + } + if check_end { + scan_exponent(t); + check_end = false; + } + } + } + } + + if check_end { + scan_mantissa(t, 10); + + if !scan_fraction(t) { + scan_exponent(t); + } + } + + return .Number, string(t.src[offset : t.offset]); +} + +scan_punct :: proc(t: ^Tokenizer, ch: rune) -> (kind: Token_Kind) { + kind = .Punct; + switch ch { + case: + kind = .Invalid; + + case '<', '>': + if t.ch == ch { + advance_rune(t); + } + if t.ch == '=' { + advance_rune(t); + } + case '!', '+', '-', '*', '/', '%', '^', '=': + if t.ch == '=' { + advance_rune(t); + } + case '#': + if t.ch == '#' { + advance_rune(t); + } + case '&': + if t.ch == '=' || t.ch == '&' { + advance_rune(t); + } + case '|': + if t.ch == '=' || t.ch == '|' { + advance_rune(t); + } + case '(', ')', '[', ']', '{', '}': + // okay + case '~', ',', ':', ';', '?': + // okay + case '`': + // okay + case '.': + if t.ch == '.' && peek(t) == '.' { + advance_rune(t); + advance_rune(t); // consume last '.' 
+ } + } + return; +} + +peek :: proc(t: ^Tokenizer) -> byte { + if t.read_offset < len(t.src) { + return t.src[t.read_offset]; + } + return 0; +} +peek_str :: proc(t: ^Tokenizer, str: string) -> bool { + if t.read_offset < len(t.src) { + return strings.has_prefix(string(t.src[t.offset:]), str); + } + return false; +} + +scan_literal_prefix :: proc(t: ^Tokenizer, str: string, prefix: ^string) -> bool { + if peek_str(t, str) { + offset := t.offset; + for _ in str { + advance_rune(t); + } + prefix^ = string(t.src[offset:][:len(str)-1]); + return true; + } + return false; +} + + +allow_next_to_be_newline :: proc(t: ^Tokenizer) -> bool { + if t.ch == '\n' { + advance_rune(t); + return true; + } else if t.ch == '\r' && peek(t) == '\n' { // allow for MS-DOS style line endings + advance_rune(t); // \r + advance_rune(t); // \n + return true; + } + return false; +} + +scan :: proc(t: ^Tokenizer, f: ^File) -> ^Token { + skip_whitespace(t); + + offset := t.offset; + + kind: Token_Kind; + lit: string; + prefix: string; + + switch ch := t.ch; { + case scan_literal_prefix(t, `u8"`, &prefix): + kind = .String; + lit = scan_string(t); + case scan_literal_prefix(t, `u"`, &prefix): + kind = .String; + lit = scan_string(t); + case scan_literal_prefix(t, `L"`, &prefix): + kind = .String; + lit = scan_string(t); + case scan_literal_prefix(t, `U"`, &prefix): + kind = .String; + lit = scan_string(t); + case scan_literal_prefix(t, `u'`, &prefix): + kind = .Char; + lit = scan_rune(t); + case scan_literal_prefix(t, `L'`, &prefix): + kind = .Char; + lit = scan_rune(t); + case scan_literal_prefix(t, `U'`, &prefix): + kind = .Char; + lit = scan_rune(t); + + case is_ident0(ch): + lit = scan_identifier(t); + kind = .Ident; + case '0' <= ch && ch <= '9': + kind, lit = scan_number(t, false); + case: + advance_rune(t); + switch ch { + case -1: + kind = .EOF; + case '\\': + kind = .Punct; + if allow_next_to_be_newline(t) { + t.at_bol = true; + t.has_space = false; + return scan(t, f); + } + + case 
'.': + if is_digit(t.ch) { + kind, lit = scan_number(t, true); + } else { + kind = scan_punct(t, ch); + } + case '"': + kind = .String; + lit = scan_string(t); + case '\'': + kind = .Char; + lit = scan_rune(t); + case '/': + if t.ch == '/' || t.ch == '*' { + kind = .Comment; + lit = scan_comment(t); + t.has_space = true; + break; + } + fallthrough; + case: + kind = scan_punct(t, ch); + if kind == .Invalid && ch != utf8.RUNE_BOM { + error_offset(t, t.offset, "illegal character '%r': %d", ch, ch); + } + } + } + + if lit == "" { + lit = string(t.src[offset : t.offset]); + } + + if kind == .Comment { + return scan(t, f); + } + + tok := new(Token); + tok.kind = kind; + tok.lit = lit; + tok.pos = offset_to_pos(t, offset); + tok.file = f; + tok.prefix = prefix; + tok.at_bol = t.at_bol; + tok.has_space = t.has_space; + + t.at_bol, t.has_space = false, false; + + return tok; +} + +tokenize :: proc(t: ^Tokenizer, f: ^File) -> ^Token { + setup_tokenizer: { + t.src = f.src; + t.ch = ' '; + t.offset = 0; + t.read_offset = 0; + t.line_offset = 0; + t.line_count = len(t.src) > 0 ? 
1 : 0; + t.error_count = 0; + t.path = f.name; + + + advance_rune(t); + if t.ch == utf8.RUNE_BOM { + advance_rune(t); + } + } + + + t.at_bol = true; + t.has_space = false; + + head: Token; + curr := &head; + for { + tok := scan(t, f); + if tok == nil { + break; + } + curr.next = tok; + curr = curr.next; + if tok.kind == .EOF { + break; + } + } + + return head.next; +} + +// add_new_file allocates a File with the given id, source bytes and name; display_name +// starts out equal to name. The tokenizer argument is not used. +add_new_file :: proc(t: ^Tokenizer, name: string, src: []byte, id: int) -> ^File { + file := new(File); + file.id = id; + file.src = src; + file.name = name; + file.display_name = name; + return file; +} + +// tokenize_file reads the whole file at `path` and tokenizes it as a new File with the given id. +// Returns nil when the file cannot be read. The `loc` argument is not used. +tokenize_file :: proc(t: ^Tokenizer, path: string, id: int, loc := #caller_location) -> ^Token { + src, ok := os.read_entire_file(path); + if !ok { + return nil; + } + return tokenize(t, add_new_file(t, path, src, id)); +} + + +// inline_tokenize tokenizes `src` as a new File; when tok.file is set, the new File takes its +// id and name from it, and display_name is also set to tok.file.name. +inline_tokenize :: proc(t: ^Tokenizer, tok: ^Token, src: []byte) -> ^Token { + file := new(File); + file.src = src; + if tok.file != nil { + file.id = tok.file.id; + file.name = tok.file.name; + file.display_name = tok.file.name; + } + + return tokenize(t, file); +} diff --git a/core/c/frontend/tokenizer/unicode.odin b/core/c/frontend/tokenizer/unicode.odin new file mode 100644 index 000000000..41c48eb73 --- /dev/null +++ b/core/c/frontend/tokenizer/unicode.odin @@ -0,0 +1,116 @@ +package c_frontend_tokenizer + + +// in_range reports whether c lies inside any of the inclusive [lo, hi] pairs stored flat in +// `range`. The table must end with a -1 sentinel: bounds checking is disabled here and the +// loop stops only when it reads that sentinel at a pair boundary. +in_range :: proc(range: []rune, c: rune) -> bool #no_bounds_check { + for i := 0; range[i] != -1; i += 2 { + if range[i] <= c && c <= range[i+1] { + return true; + } + } + return false; +} + + +// [https://www.sigbus.info/n1570#D] C11 allows ASCII and some multibyte characters in certain Unicode ranges to be used in an identifier. +// +// is_ident0 returns true if a given character is acceptable as the first character of an identifier. +is_ident0 :: proc(c: rune) -> bool { + return in_range(_range_ident0, c); +} +// is_ident1 returns true if a given character is acceptable as a non-first character of an identifier. 
+is_ident1 :: proc(c: rune) -> bool { + return is_ident0(c) || in_range(_range_ident1, c); +} + +// Returns the number of columns needed to display a given character in a fixed-width font. +// Based on https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c +char_width :: proc(c: rune) -> int { + switch { + case in_range(_range_width0, c): + return 0; + case in_range(_range_width2, c): + return 2; + } + return 1; +} + +display_width :: proc(str: string) -> (w: int) { + for c in str { + w += char_width(c); + } + return; +} + + + +_range_ident0 := []rune{ + '_', '_', 'a', 'z', 'A', 'Z', '$', '$', + 0x00A8, 0x00A8, 0x00AA, 0x00AA, 0x00AD, 0x00AD, 0x00AF, 0x00AF, + 0x00B2, 0x00B5, 0x00B7, 0x00BA, 0x00BC, 0x00BE, 0x00C0, 0x00D6, + 0x00D8, 0x00F6, 0x00F8, 0x00FF, 0x0100, 0x02FF, 0x0370, 0x167F, + 0x1681, 0x180D, 0x180F, 0x1DBF, 0x1E00, 0x1FFF, 0x200B, 0x200D, + 0x202A, 0x202E, 0x203F, 0x2040, 0x2054, 0x2054, 0x2060, 0x206F, + 0x2070, 0x20CF, 0x2100, 0x218F, 0x2460, 0x24FF, 0x2776, 0x2793, + 0x2C00, 0x2DFF, 0x2E80, 0x2FFF, 0x3004, 0x3007, 0x3021, 0x302F, + 0x3031, 0x303F, 0x3040, 0xD7FF, 0xF900, 0xFD3D, 0xFD40, 0xFDCF, + 0xFDF0, 0xFE1F, 0xFE30, 0xFE44, 0xFE47, 0xFFFD, + 0x10000, 0x1FFFD, 0x20000, 0x2FFFD, 0x30000, 0x3FFFD, 0x40000, 0x4FFFD, + 0x50000, 0x5FFFD, 0x60000, 0x6FFFD, 0x70000, 0x7FFFD, 0x80000, 0x8FFFD, + 0x90000, 0x9FFFD, 0xA0000, 0xAFFFD, 0xB0000, 0xBFFFD, 0xC0000, 0xCFFFD, + 0xD0000, 0xDFFFD, 0xE0000, 0xEFFFD, + -1, +}; + +_range_ident1 := []rune{ + '0', '9', '$', '$', 0x0300, 0x036F, 0x1DC0, 0x1DFF, 0x20D0, 0x20FF, 0xFE20, 0xFE2F, + -1, +}; + + +_range_width0 := []rune{ + 0x0000, 0x001F, 0x007f, 0x00a0, 0x0300, 0x036F, 0x0483, 0x0486, + 0x0488, 0x0489, 0x0591, 0x05BD, 0x05BF, 0x05BF, 0x05C1, 0x05C2, + 0x05C4, 0x05C5, 0x05C7, 0x05C7, 0x0600, 0x0603, 0x0610, 0x0615, + 0x064B, 0x065E, 0x0670, 0x0670, 0x06D6, 0x06E4, 0x06E7, 0x06E8, + 0x06EA, 0x06ED, 0x070F, 0x070F, 0x0711, 0x0711, 0x0730, 0x074A, + 0x07A6, 0x07B0, 0x07EB, 0x07F3, 0x0901, 0x0902, 0x093C, 0x093C, + 0x0941, 
0x0948, 0x094D, 0x094D, 0x0951, 0x0954, 0x0962, 0x0963, + 0x0981, 0x0981, 0x09BC, 0x09BC, 0x09C1, 0x09C4, 0x09CD, 0x09CD, + 0x09E2, 0x09E3, 0x0A01, 0x0A02, 0x0A3C, 0x0A3C, 0x0A41, 0x0A42, + 0x0A47, 0x0A48, 0x0A4B, 0x0A4D, 0x0A70, 0x0A71, 0x0A81, 0x0A82, + 0x0ABC, 0x0ABC, 0x0AC1, 0x0AC5, 0x0AC7, 0x0AC8, 0x0ACD, 0x0ACD, + 0x0AE2, 0x0AE3, 0x0B01, 0x0B01, 0x0B3C, 0x0B3C, 0x0B3F, 0x0B3F, + 0x0B41, 0x0B43, 0x0B4D, 0x0B4D, 0x0B56, 0x0B56, 0x0B82, 0x0B82, + 0x0BC0, 0x0BC0, 0x0BCD, 0x0BCD, 0x0C3E, 0x0C40, 0x0C46, 0x0C48, + 0x0C4A, 0x0C4D, 0x0C55, 0x0C56, 0x0CBC, 0x0CBC, 0x0CBF, 0x0CBF, + 0x0CC6, 0x0CC6, 0x0CCC, 0x0CCD, 0x0CE2, 0x0CE3, 0x0D41, 0x0D43, + 0x0D4D, 0x0D4D, 0x0DCA, 0x0DCA, 0x0DD2, 0x0DD4, 0x0DD6, 0x0DD6, + 0x0E31, 0x0E31, 0x0E34, 0x0E3A, 0x0E47, 0x0E4E, 0x0EB1, 0x0EB1, + 0x0EB4, 0x0EB9, 0x0EBB, 0x0EBC, 0x0EC8, 0x0ECD, 0x0F18, 0x0F19, + 0x0F35, 0x0F35, 0x0F37, 0x0F37, 0x0F39, 0x0F39, 0x0F71, 0x0F7E, + 0x0F80, 0x0F84, 0x0F86, 0x0F87, 0x0F90, 0x0F97, 0x0F99, 0x0FBC, + 0x0FC6, 0x0FC6, 0x102D, 0x1030, 0x1032, 0x1032, 0x1036, 0x1037, + 0x1039, 0x1039, 0x1058, 0x1059, 0x1160, 0x11FF, 0x135F, 0x135F, + 0x1712, 0x1714, 0x1732, 0x1734, 0x1752, 0x1753, 0x1772, 0x1773, + 0x17B4, 0x17B5, 0x17B7, 0x17BD, 0x17C6, 0x17C6, 0x17C9, 0x17D3, + 0x17DD, 0x17DD, 0x180B, 0x180D, 0x18A9, 0x18A9, 0x1920, 0x1922, + 0x1927, 0x1928, 0x1932, 0x1932, 0x1939, 0x193B, 0x1A17, 0x1A18, + 0x1B00, 0x1B03, 0x1B34, 0x1B34, 0x1B36, 0x1B3A, 0x1B3C, 0x1B3C, + 0x1B42, 0x1B42, 0x1B6B, 0x1B73, 0x1DC0, 0x1DCA, 0x1DFE, 0x1DFF, + 0x200B, 0x200F, 0x202A, 0x202E, 0x2060, 0x2063, 0x206A, 0x206F, + 0x20D0, 0x20EF, 0x302A, 0x302F, 0x3099, 0x309A, 0xA806, 0xA806, + 0xA80B, 0xA80B, 0xA825, 0xA826, 0xFB1E, 0xFB1E, 0xFE00, 0xFE0F, + 0xFE20, 0xFE23, 0xFEFF, 0xFEFF, 0xFFF9, 0xFFFB, 0x10A01, 0x10A03, + 0x10A05, 0x10A06, 0x10A0C, 0x10A0F, 0x10A38, 0x10A3A, 0x10A3F, 0x10A3F, + 0x1D167, 0x1D169, 0x1D173, 0x1D182, 0x1D185, 0x1D18B, 0x1D1AA, 0x1D1AD, + 0x1D242, 0x1D244, 0xE0001, 0xE0001, 0xE0020, 0xE007F, 0xE0100, 0xE01EF, + 
-1, +}; + +_range_width2 := []rune{ + 0x1100, 0x115F, 0x2329, 0x2329, 0x232A, 0x232A, 0x2E80, 0x303E, + 0x3040, 0xA4CF, 0xAC00, 0xD7A3, 0xF900, 0xFAFF, 0xFE10, 0xFE19, + 0xFE30, 0xFE6F, 0xFF00, 0xFF60, 0xFFE0, 0xFFE6, 0x1F000, 0x1F644, + 0x20000, 0x2FFFD, 0x30000, 0x3FFFD, + -1, +};