From a39921aa6a6086739fa087c28864dfc449217d0c Mon Sep 17 00:00:00 2001
From: gingerBill
Date: Wed, 18 Nov 2020 23:33:16 +0000
Subject: [PATCH] Add `package text/scanner`

---
 core/text/scanner/scanner.odin | 653 +++++++++++++++++++++++++++++++++
 1 file changed, 653 insertions(+)
 create mode 100644 core/text/scanner/scanner.odin

diff --git a/core/text/scanner/scanner.odin b/core/text/scanner/scanner.odin
new file mode 100644
index 000000000..ddfd0bb5f
--- /dev/null
+++ b/core/text/scanner/scanner.odin
@@ -0,0 +1,653 @@
+package text_scanner
+
+import "core:fmt"
+import "core:strings"
+import "core:unicode"
+import "core:unicode/utf8"
+
+// Position describes a location in the source text.
+// A Position is valid when its line is greater than zero.
+Position :: struct {
+	filename: string, // filename, if present
+	offset:   int,    // byte offset, starting @ 0
+	line:     int,    // line number, starting @ 1
+	column:   int,    // column number, starting @ 1 (character count per line)
+}
+
+// position_is_valid reports whether `pos` refers to an actual line.
+position_is_valid :: proc(pos: Position) -> bool {
+	return pos.line > 0;
+}
+
+// position_to_string formats `pos` as "filename(line:column)", or as just the
+// filename when the position is not valid. An empty filename is shown as
+// "<input>". The result is allocated with `allocator`.
+position_to_string :: proc(pos: Position, allocator := context.temp_allocator) -> string {
+	name := pos.filename;
+	if name == "" {
+		name = "<input>";
+	}
+
+	context.allocator = allocator;
+	if position_is_valid(pos) {
+		return fmt.aprintf("%s(%d:%d)", name, pos.line, pos.column);
+	}
+	return strings.clone(name);
+}
+
+// Token kinds returned by `scan`. They are negative so that they cannot be
+// confused with a character, which `scan` returns unchanged for everything else.
+EOF        :: -1;
+Ident      :: -2;
+Int        :: -3;
+Float      :: -4;
+Char       :: -5;
+String     :: -6;
+Raw_String :: -7;
+Comment    :: -8;
+
+// Scan_Flag selects which token kinds `scan` recognizes.
+Scan_Flag :: enum u32 {
+	Scan_Idents,
+	Scan_Ints,
+	Scan_C_Int_Prefixes, // only 0b and 0x prefixes; any other leading 0 means octal
+	Scan_Floats,
+	Scan_Chars,
+	Scan_Strings,
+	Scan_Raw_Strings,
+	Scan_Comments,
+	Skip_Comments,       // with Scan_Comments set, comments are skipped instead of returned
+}
+Scan_Flags :: bit_set[Scan_Flag; u32];
+
+Odin_Like_Tokens :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments};
+C_Like_Tokens    :: Scan_Flags{.Scan_Idents, .Scan_Ints, .Scan_C_Int_Prefixes, .Scan_Floats, .Scan_Chars, .Scan_Strings, .Scan_Raw_Strings, .Scan_Comments, .Skip_Comments};
+
+// Whitespace sets: bit `c` is set when the character with value `c` is
+// treated as white space. Only characters below 64 can be represented.
+Odin_Whitespace :: 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' ';
+C_Whitespace    :: 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<'\v' | 1<<'\f' | 1<<' ';
+
+// Scanner holds the scanning state for one source string.
+// Set it up with `init` before calling any other procedure on it.
+Scanner :: struct {
+	src: string, // text being scanned
+
+	src_pos: int, // byte offset in `src` of the next character to decode
+	src_end: int,
+
+	tok_pos: int, // byte offset of the token start; negative when no token text is available
+	tok_end: int, // byte offset just past the token
+
+	ch: rune, // look-ahead character; -2 means that nothing has been read yet
+
+	line:          int, // current line number, starting @ 1
+	column:        int, // characters read on the current line
+	prev_line_len: int, // value of `column` when the last newline was read
+	prev_char_len: int, // byte width of the last character read (0 at the end of input)
+
+	error:       proc(s: ^Scanner, msg: string), // optional handler; when nil, errors are printed to stderr
+	error_count: int,                            // incremented for every reported error
+
+	flags:      Scan_Flags, // token kinds to recognize
+	whitespace: u64,        // characters to skip between tokens (see Odin_Whitespace)
+
+	is_ident_rune: proc(ch: rune, i: int) -> bool, // optional predicate for the i'th rune of an identifier
+
+	pos: Position, // start of the token returned by the last `scan`; made invalid by `next`
+}
+
+// init resets `s` to scan `src` with Odin-like tokens and white space.
+// `filename` is only used for positions and error messages. Returns `s`.
+init :: proc(s: ^Scanner, src: string, filename := "") -> ^Scanner {
+	s^ = {};
+
+	s.src = src;
+	s.pos.filename = filename;
+
+	s.tok_pos = -1;
+
+	s.ch = -2; // no char read yet, not an EOF
+
+	s.line = 1;
+
+	s.flags = Odin_Like_Tokens;
+	s.whitespace = Odin_Whitespace;
+
+	return s;
+}
+
+// advance decodes and consumes the next character of the source, keeping the
+// line/column bookkeeping up to date. It returns EOF at the end of the input
+// and reports invalid UTF-8 and NUL characters as errors.
+@(private)
+advance :: proc(s: ^Scanner) -> rune {
+	if s.src_pos >= len(s.src) {
+		s.prev_char_len = 0;
+		return EOF;
+	}
+	ch, width := rune(s.src[s.src_pos]), 1;
+
+	if ch >= utf8.RUNE_SELF {
+		ch, width = utf8.decode_rune_in_string(s.src[s.src_pos:]);
+		if ch == utf8.RUNE_ERROR && width == 1 {
+			s.src_pos += width;
+			s.prev_char_len = width;
+			s.column += 1;
+			error(s, "invalid UTF-8 encoding");
+			return ch;
+		}
+	}
+
+	s.src_pos += width;
+	s.prev_char_len = width;
+	s.column += 1;
+
+	switch ch {
+	case 0:
+		error(s, "invalid character NUL");
+	case '\n':
+		s.line += 1;
+		s.prev_line_len = s.column;
+		s.column = 0;
+	}
+
+	return ch;
+}
+
+// next reads and returns the next character, or EOF at the end of the input.
+// It invalidates the token text and position of the previous `scan`.
+next :: proc(s: ^Scanner) -> rune {
+	s.tok_pos = -1;
+	s.pos.line = 0;
+	ch := peek(s);
+	if ch != EOF {
+		s.ch = advance(s);
+	}
+	return ch;
+}
+
+// peek returns the next character without consuming it, or EOF at the end of
+// the input. On the first read a leading byte order mark is skipped.
+peek :: proc(s: ^Scanner) -> rune {
+	if s.ch == -2 {
+		s.ch = advance(s);
+		if s.ch == '\ufeff' { // Ignore BOM
+			s.ch = advance(s);
+		}
+	}
+	return s.ch;
+}
+
+// error counts an error and reports `msg`. When `s.error` is set it is called,
+// otherwise the message is printed to stderr prefixed with the position of
+// the current token or, outside of a token, the current scanner position.
+error :: proc(s: ^Scanner, msg: string) {
+	s.error_count += 1;
+	if s.error != nil {
+		s.error(s, msg);
+		return;
+	}
+	p := s.pos;
+	if !position_is_valid(p) {
+		p = position(s);
+	}
+
+	name := p.filename;
+	if name == "" {
+		name = "<input>";
+	}
+
+	if position_is_valid(p) {
+		fmt.eprintf("%s(%d:%d): %s\n", name, p.line, p.column, msg);
+	} else {
+		fmt.eprintf("%s: %s\n", name, msg);
+	}
+}
+
+// errorf is like `error` but builds the message with `fmt.tprintf`.
+errorf :: proc(s: ^Scanner, format: string, args: ..any) {
+	error(s, fmt.tprintf(format, ..args));
+}
+
+// is_ident_rune reports whether `ch` may be the i'th rune of an identifier,
+// using `s.is_ident_rune` when it is set. By default letters and '_' are
+// accepted anywhere and digits everywhere but at the start.
+@(private)
+is_ident_rune :: proc(s: ^Scanner, ch: rune, i: int) -> bool {
+	if s.is_ident_rune != nil {
+		return s.is_ident_rune(ch, i);
+	}
+	return ch == '_' || unicode.is_letter(ch) || unicode.is_digit(ch) && i > 0;
+}
+
+// scan_identifier consumes the rest of an identifier whose first rune has
+// already been accepted and returns the first character after it.
+@(private)
+scan_identifier :: proc(s: ^Scanner) -> rune {
+	ch := advance(s);
+	for i := 1; is_ident_rune(s, ch, i); i += 1 {
+		ch = advance(s);
+	}
+	return ch;
+}
+
+// ASCII helpers; the result of `lower` is only meaningful for letters.
+@(private) lower      :: proc(ch: rune) -> rune { return ('a' - 'A') | ch; }
+@(private) is_decimal :: proc(ch: rune) -> bool { return '0' <= ch && ch <= '9'; }
+@(private) is_hex     :: proc(ch: rune) -> bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f'; }
+
+// scan_number scans an integer or floating-point literal. `ch` is the first
+// character of the literal, or the first digit after the '.' when `seen_dot`
+// is set. It returns the token kind (Int or Float) and the first character
+// after the literal; malformed literals are reported through `error`.
+@(private)
+scan_number :: proc(s: ^Scanner, ch: rune, seen_dot: bool) -> (rune, rune) {
+	lit_name :: proc(prefix: rune) -> string {
+		switch prefix {
+		case 'b':      return "binary literal";
+		case 'o', '0': return "octal literal";
+		case 'z':      return "dozenal literal";
+		case 'x':      return "hexadecimal literal";
+		}
+		return "decimal literal";
+	}
+
+	// digits consumes a run of digits and '_' separators. Bit 0 of `digsep`
+	// is set when a digit was seen and bit 1 when a separator was seen. For
+	// bases up to 10 the first digit that is too large for the base is
+	// stored in `invalid^`; `invalid` may only be nil when base is 10.
+	digits :: proc(s: ^Scanner, ch0: rune, base: int, invalid: ^rune) -> (ch: rune, digsep: int) {
+		ch = ch0;
+		if base <= 10 {
+			max := rune('0' + base);
+			for is_decimal(ch) || ch == '_' {
+				ds := 1;
+				if ch == '_' {
+					ds = 2;
+				} else if ch >= max && invalid^ == 0 {
+					invalid^ = ch;
+				}
+				digsep |= ds;
+				ch = advance(s);
+			}
+		} else {
+			for is_hex(ch) || ch == '_' {
+				ds := 1;
+				if ch == '_' {
+					ds = 2;
+				}
+				digsep |= ds;
+				ch = advance(s);
+			}
+		}
+		return;
+	}
+
+	ch, seen_dot := ch, seen_dot;
+
+	base    := 10;
+	prefix  := rune(0); // 0 (none), '0' (C-style leading zero), 'b', 'o', 'd', 'z' or 'x'
+	digsep  := 0;       // bit 0: digit present, bit 1: '_' present
+	invalid := rune(0); // first invalid digit in the literal, or 0
+
+	tok: rune;
+	ds: int;
+
+	if !seen_dot {
+		tok = Int;
+		if ch == '0' {
+			ch = advance(s);
+
+			if .Scan_C_Int_Prefixes in s.flags {
+				switch lower(ch) {
+				case 'b':
+					ch = advance(s);
+					base, prefix = 2, 'b';
+				case 'x':
+					ch = advance(s);
+					base, prefix = 16, 'x';
+				case:
+					// A bare leading zero is octal for integers only; the
+					// distinct '0' prefix keeps "0.5" and "0e1" valid floats.
+					base, prefix = 8, '0';
+					digsep = 1; // Leading zero
+				}
+			} else {
+				switch lower(ch) {
+				case 'b':
+					ch = advance(s);
+					base, prefix = 2, 'b';
+				case 'o':
+					ch = advance(s);
+					base, prefix = 8, 'o';
+				case 'd':
+					ch = advance(s);
+					base, prefix = 10, 'd';
+				case 'z':
+					ch = advance(s);
+					base, prefix = 12, 'z';
+				case 'x':
+					ch = advance(s);
+					base, prefix = 16, 'x';
+				case:
+					digsep = 1; // Leading zero
+				}
+			}
+		}
+
+		ch, ds = digits(s, ch, base, &invalid);
+		digsep |= ds;
+		if ch == '.' && .Scan_Floats in s.flags {
+			ch = advance(s);
+			seen_dot = true;
+		}
+	}
+
+	if seen_dot {
+		tok = Float;
+		if prefix != 0 && prefix != '0' && prefix != 'x' {
+			errorf(s, "invalid radix point in %s", lit_name(prefix));
+		}
+		ch, ds = digits(s, ch, base, &invalid);
+		digsep |= ds;
+	}
+
+	if digsep&1 == 0 {
+		errorf(s, "%s has no digits", lit_name(prefix));
+	}
+
+	if e := lower(ch); (e == 'e' || e == 'p') && .Scan_Floats in s.flags {
+		switch {
+		case e == 'e' && prefix != 0 && prefix != '0':
+			errorf(s, "%q exponent requires decimal mantissa", ch);
+		case e == 'p' && prefix != 'x':
+			errorf(s, "%q exponent requires hexadecimal mantissa", ch);
+		}
+		ch = advance(s);
+		tok = Float;
+		if ch == '+' || ch == '-' {
+			ch = advance(s);
+		}
+		ch, ds = digits(s, ch, 10, nil);
+		digsep |= ds;
+		if ds&1 == 0 {
+			error(s, "exponent has no digits");
+		}
+	} else if prefix == 'x' && tok == Float {
+		error(s, "hexadecimal mantissa requires a 'p' exponent");
+	}
+
+	if tok == Int && invalid != 0 {
+		errorf(s, "invalid digit %q in %s", invalid, lit_name(prefix));
+	}
+
+	if digsep&2 != 0 {
+		s.tok_end = s.src_pos - s.prev_char_len;
+	}
+	return tok, ch;
+}
+
+// scan_string scans the remainder of a string or character literal whose
+// opening `quote` has already been read, up to and including the closing
+// quote. It returns the number of characters in the literal (an escape
+// sequence counts as one) and reports unterminated literals and bad escapes.
+@(private)
+scan_string :: proc(s: ^Scanner, quote: rune) -> (n: int) {
+	// digit_val returns the value of `ch` as a digit, or 16 (larger than
+	// any valid digit value) when `ch` is not a hexadecimal digit.
+	digit_val :: proc(ch: rune) -> int {
+		switch {
+		case '0' <= ch && ch <= '9':
+			return int(ch - '0');
+		case 'a' <= lower(ch) && lower(ch) <= 'f':
+			return int(lower(ch) - 'a') + 10;
+		}
+		return 16;
+	}
+
+	// scan_digits consumes up to `n` digits of `base` starting at `ch` and
+	// reports an error when fewer than `n` digits are present.
+	scan_digits :: proc(s: ^Scanner, ch: rune, base, n: int) -> rune {
+		ch, n := ch, n;
+		for n > 0 && digit_val(ch) < base {
+			ch = advance(s);
+			n -= 1;
+		}
+		if n > 0 {
+			error(s, "invalid char escape");
+		}
+		return ch;
+	}
+
+	ch := advance(s);
+	for ch != quote {
+		if ch == '\n' || ch < 0 {
+			error(s, "literal not terminated");
+			return;
+		}
+		if ch == '\\' {
+			ch = advance(s);
+			switch ch {
+			case quote, 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v', '\\':
+				ch = advance(s);
+			case '0'..'7':
+				ch = scan_digits(s, ch, 8, 3);
+			case 'x': // the digits start after the 'x', 'u' or 'U' introducer
+				ch = scan_digits(s, advance(s), 16, 2);
+			case 'u':
+				ch = scan_digits(s, advance(s), 16, 4);
+			case 'U':
+				ch = scan_digits(s, advance(s), 16, 8);
+			case:
+				error(s, "invalid char escape");
+			}
+		} else {
+			ch = advance(s);
+		}
+		n += 1;
+	}
+	return;
+}
+
+// scan_raw_string scans the remainder of a raw string literal, up to and
+// including the closing '`'. The opening '`' has already been read.
+@(private)
+scan_raw_string :: proc(s: ^Scanner) {
+	ch := advance(s);
+	for ch != '`' {
+		if ch < 0 {
+			error(s, "literal not terminated");
+			return;
+		}
+		ch = advance(s);
+	}
+}
+
+// scan_char scans the remainder of a character literal and reports an error
+// unless it holds exactly one character.
+@(private)
+scan_char :: proc(s: ^Scanner) {
+	if scan_string(s, '\'') != 1 {
+		error(s, "invalid char literal");
+	}
+}
+
+// scan_comment scans the remainder of a comment. `ch` is the second
+// character of the comment opener: '/' starts a line comment, anything else
+// is handled as a block comment. It returns the first character after the
+// comment; for a line comment that is the terminating newline or EOF.
+@(private)
+scan_comment :: proc(s: ^Scanner, ch: rune) -> rune {
+	ch := ch;
+	if ch == '/' { // line comment
+		ch = advance(s);
+		for ch != '\n' && ch >= 0 {
+			ch = advance(s);
+		}
+		return ch;
+	}
+
+	// block /**/ comment
+	ch = advance(s);
+	for {
+		if ch < 0 {
+			error(s, "comment not terminated");
+			break;
+		}
+		ch0 := ch;
+		ch = advance(s);
+		if ch0 == '*' && ch == '/' {
+			return advance(s);
+		}
+	}
+	return ch;
+}
+
+// scan reads the next token and returns its kind: one of the negative token
+// constants (EOF, Ident, Int, ...) or, for anything not recognized under
+// `s.flags`, the character itself. White space in `s.whitespace` is skipped
+// first. Afterwards `s.pos` is the start of the token and `token_text`
+// returns its text.
+scan :: proc(s: ^Scanner) -> (tok: rune) {
+	ch := peek(s);
+	if ch == EOF {
+		return ch;
+	}
+
+	// reset position
+	s.tok_pos = -1;
+	s.pos.line = 0;
+
+	redo: for {
+		// skip white space; only characters below 64 fit in the set
+		for 0 <= ch && ch < 64 && s.whitespace & (u64(1)<<uint(ch)) != 0 {
+			ch = advance(s);
+		}
+
+		// start collecting token text
+		s.tok_pos = s.src_pos - s.prev_char_len;
+		s.pos.offset = s.tok_pos;
+
+		if s.column > 0 {
+			s.pos.line = s.line;
+			s.pos.column = s.column;
+		} else {
+			// previous character was newline
+			s.pos.line = s.line - 1;
+			s.pos.column = s.prev_line_len;
+		}
+
+		tok = ch;
+		if is_ident_rune(s, ch, 0) {
+			if .Scan_Idents in s.flags {
+				tok = Ident;
+				ch = scan_identifier(s);
+			} else {
+				ch = advance(s);
+			}
+		} else if is_decimal(ch) {
+			// Either flag enables number scanning; scan_number itself only
+			// produces floats when Scan_Floats is set.
+			if .Scan_Ints in s.flags || .Scan_Floats in s.flags {
+				tok, ch = scan_number(s, ch, false);
+			} else {
+				ch = advance(s);
+			}
+		} else {
+			switch ch {
+			case EOF:
+				break;
+			case '"':
+				if .Scan_Strings in s.flags {
+					scan_string(s, '"');
+					tok = String;
+				}
+				ch = advance(s);
+			case '\'':
+				if .Scan_Chars in s.flags {
+					scan_char(s);
+					tok = Char;
+				}
+				ch = advance(s);
+			case '`':
+				if .Scan_Raw_Strings in s.flags {
+					scan_raw_string(s);
+					tok = Raw_String;
+				}
+				ch = advance(s);
+			case '.':
+				ch = advance(s);
+				if is_decimal(ch) && .Scan_Floats in s.flags {
+					tok, ch = scan_number(s, ch, true);
+				}
+			case '/':
+				ch = advance(s);
+				if (ch == '/' || ch == '*') && .Scan_Comments in s.flags {
+					if .Skip_Comments in s.flags {
+						s.tok_pos = -1;
+						ch = scan_comment(s, ch);
+						continue redo;
+					}
+					ch = scan_comment(s, ch);
+					tok = Comment;
+				}
+			case:
+				ch = advance(s);
+			}
+		}
+
+		break redo;
+	}
+
+	s.tok_end = s.src_pos - s.prev_char_len;
+
+	s.ch = ch;
+	return tok;
+}
+
+// position returns the position of the character immediately after the
+// character or token returned by the last call to `next` or `scan`.
+position :: proc(s: ^Scanner) -> Position {
+	pos: Position;
+	pos.filename = s.pos.filename;
+	pos.offset = s.src_pos - s.prev_char_len;
+	switch {
+	case s.column > 0:
+		pos.line = s.line;
+		pos.column = s.column;
+	case s.prev_line_len > 0:
+		pos.line = s.line-1;
+		pos.column = s.prev_line_len;
+	case:
+		pos.line = 1;
+		pos.column = 1;
+	}
+	return pos;
+}
+
+// token_text returns the text of the most recently scanned token as a slice
+// of the source, or "" when no token text is available.
+token_text :: proc(s: ^Scanner) -> string {
+	if s.tok_pos < 0 {
+		return "";
+	}
+	return string(s.src[s.tok_pos:s.tok_end]);
+}
+
+// token_string returns a printable name for the token kind `tok`, or the
+// quoted character for any other value. The result uses `allocator`.
+token_string :: proc(tok: rune, allocator := context.temp_allocator) -> string {
+	context.allocator = allocator;
+	switch tok {
+	case EOF:        return strings.clone("EOF");
+	case Ident:      return strings.clone("Ident");
+	case Int:        return strings.clone("Int");
+	case Float:      return strings.clone("Float");
+	case Char:       return strings.clone("Char");
+	case String:     return strings.clone("String");
+	case Raw_String: return strings.clone("Raw_String");
+	case Comment:    return strings.clone("Comment");
+	}
+	return fmt.aprintf("%q", tok);
+}