package xml

import "core:fmt"
import "core:unicode"
import "core:unicode/utf8"

/*
	Callback invoked for every error the tokenizer reports.
	`fmt` is a format string and `args` are its arguments.
*/
Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)

/*
	A single lexical token: its kind, the text it covers, and where it starts.
*/
Token :: struct {
	kind: Token_Kind,
	text: string,
	pos:  Pos,
}

/*
	A position within the source text.
*/
Pos :: struct {
	file:   string,
	offset: int, // starting at 0
	line:   int, // starting at 1
	column: int, // starting at 1
}

Token_Kind :: enum {
	Invalid,

	Ident,
	Literal,
	Rune,
	String,

	Double_Quote,  // "
	Single_Quote,  // '
	Colon,         // :

	Eq,            // =
	Lt,            // <
	Gt,            // >
	Exclaim,       // !
	Question,      // ?
	Hash,          // #
	Slash,         // /
	Dash,          // -

	Open_Bracket,  // [
	Close_Bracket, // ]

	EOF,
}

/*
	Markers that open and close a CDATA section.
	While inside one, `scan_string` does not treat its `close` rune as a terminator.
*/
CDATA_START :: "<![CDATA["
CDATA_END   :: "]]>"

Tokenizer :: struct {
	// Immutable data
	path: string,
	src:  string,
	err:  Error_Handler,

	// Tokenizing state
	ch:          rune, // current rune, -1 once the end of `src` is reached
	offset:      int,  // byte offset of `ch`
	read_offset: int,  // byte offset of the rune following `ch`
	line_offset: int,  // byte offset at which the current line starts
	line_count:  int,  // current line number

	// Mutable data
	error_count: int,
}

/*
	Resets `t` to tokenize `src`, reads the first rune and skips a leading byte order mark.

	`path` is only used to fill in `Pos.file`.
	`err` receives every reported error; passing `nil` silences reporting, but errors are still counted.
*/
init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
	t.src = src
	t.err = err
	t.ch = ' '
	t.offset = 0
	t.read_offset = 0
	t.line_offset = 0
	t.line_count = len(src) > 0 ? 1 : 0
	t.error_count = 0
	t.path = path

	advance_rune(t)
	if t.ch == utf8.RUNE_BOM {
		advance_rune(t)
	}
}

/*
	Converts a byte offset into a `Pos`.

	The line is always the tokenizer's current `line_count` and the column is measured
	from the start of the current line, so the result is only accurate for offsets
	that lie on the line currently being scanned.
*/
@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
	line := t.line_count
	column := offset - t.line_offset + 1

	return Pos {
		file = t.path,
		offset = offset,
		line = line,
		column = column,
	}
}

/*
	Default `Error_Handler`: writes `file(line:column) message` followed by a newline to stderr.
*/
default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintf("\n")
}

/*
	Reports an error at `offset` through the tokenizer's handler (if any) and bumps `error_count`.
*/
error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	pos := offset_to_pos(t, offset)
	if t.err != nil {
		t.err(pos, msg, ..args)
	}
	t.error_count += 1
}

/*
	Moves the tokenizer to the next rune of `src`.

	Updates `offset`, `read_offset` and `ch`, and starts a new line when the rune being
	left behind was a newline. Reports NUL bytes, invalid UTF-8 and a byte order mark
	anywhere but at offset 0. At the end of input `ch` becomes -1.
*/
advance_rune :: proc(using t: ^Tokenizer) {
	if read_offset < len(src) {
		offset = read_offset
		if ch == '\n' {
			// The previous rune ended a line, so the new rune is the first of the next one.
			line_offset = offset
			line_count += 1
		}
		// Fast path: assume a single-byte rune and only decode when the high bit is set.
		r, w := rune(src[read_offset]), 1
		switch {
		case r == 0:
			error(t, t.offset, "illegal character NUL")
		case r >= utf8.RUNE_SELF:
			r, w = utf8.decode_rune_in_string(src[read_offset:])
			if r == utf8.RUNE_ERROR && w == 1 {
				error(t, t.offset, "illegal UTF-8 encoding")
			} else if r == utf8.RUNE_BOM && offset > 0 {
				error(t, t.offset, "illegal byte order mark")
			}
		}
		read_offset += w
		ch = r
	} else {
		offset = len(src)
		if ch == '\n' {
			line_offset = offset
			line_count += 1
		}
		ch = -1
	}
}

/*
	Returns the byte `offset` bytes past `read_offset` without consuming anything,
	or 0 when that position lies beyond the end of the source.
*/
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	if t.read_offset+offset < len(t.src) {
		return t.src[t.read_offset+offset]
	}
	return 0
}

/*
	Consumes spaces, tabs, carriage returns and newlines.
*/
skip_whitespace :: proc(t: ^Tokenizer) {
	for {
		switch t.ch {
		case ' ', '\t', '\r', '\n':
			advance_rune(t)
		case:
			return
		}
	}
}

/*
	Reports whether `r` may start an identifier: `_`, an ASCII letter, or any Unicode letter.
*/
is_letter :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_':
			return true
		case 'A'..='Z', 'a'..='z':
			return true
		}
	}
	return unicode.is_letter(r)
}

/*
	Reports whether `r` may appear inside an identifier:
	`_`, `-`, `:`, ASCII letters and digits, or any Unicode letter or digit.
*/
is_valid_identifier_rune :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_', '-', ':':
			return true
		case 'A'..='Z', 'a'..='z':
			return true
		case '0'..='9':
			return true
		}
	}

	if unicode.is_letter(r) || unicode.is_digit(r) {
		return true
	}
	return false
}

/*
	Scans an identifier starting at the current rune and returns its text.

	Scanning stops at the first rune that is not valid in an identifier, or just before
	a second `:` so that a name has at most one namespace separator.
*/
scan_identifier :: proc(t: ^Tokenizer) -> string {
	offset := t.offset
	namespaced := false

	for is_valid_identifier_rune(t.ch) {
		advance_rune(t)
		if t.ch == ':' {
			/*
				A namespaced attr can have at most two parts, `namespace:ident`.
			*/
			if namespaced {
				break
			}
			namespaced = true
		}
	}
	return string(t.src[offset : t.offset])
}

/*
	Scans text from `offset` up to (not including) the first `close` rune that is
	not inside a CDATA section, and returns it with trailing whitespace stripped.

	Inputs:
	- `offset`:        byte offset at which the returned text starts.
	- `close`:         rune that terminates the text.
	- `consume_close`: when true, the tokenizer is advanced past the terminator.
	- `multiline`:     when false, a newline outside CDATA is an error.

	Returns:
	- `value`: the scanned text; empty when the end of input is hit first.
	- `err`:   `.None`, `.Premature_EOF`, or `.Invalid_Tag_Value` for an unterminated single-line text.
*/
scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) {
	err = .None
	in_cdata := false

	loop: for {
		ch := t.ch

		switch ch {
		case -1:
			error(t, t.offset, "[scan_string] Premature end of file.\n")
			return "", .Premature_EOF

		case '<':
			/*
				Might be the start of a CDATA tag.
				The bound is checked against `t.offset`, where the marker would begin.
			*/
			if t.offset + len(CDATA_START) <= len(t.src) {
				if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
					in_cdata = true
				}
			}

		case ']':
			/*
				Might be the end of a CDATA tag.
			*/
			if t.offset + len(CDATA_END) <= len(t.src) {
				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
					in_cdata = false
				}
			}

		case '\n':
			if !(multiline || in_cdata) {
				/*
					Report the offending text as an argument rather than as the format
					string, so a `%` in the document is not treated as a formatting verb.
				*/
				error(t, offset, "%s", string(t.src[offset : t.offset]))
				error(t, offset, "[scan_string] Not terminated\n")
				err = .Invalid_Tag_Value
				break loop
			}
		}

		if ch == close && !in_cdata {
			/*
				If it's not a CDATA tag, it's the end of this body.
			*/
			break loop
		}

		advance_rune(t)
	}

	/*
		Strip trailing whitespace.
	*/
	lit := string(t.src[offset : t.offset])

	end := len(lit)
	eat: for ; end > 0; end -= 1 {
		ch := lit[end - 1]
		switch ch {
		case ' ', '\t', '\r', '\n':
		case:
			break eat
		}
	}
	lit = lit[:end]

	if consume_close {
		advance_rune(t)
	}

	/*
		TODO: Handle decoding escape characters and unboxing CDATA.
	*/

	return lit, err
}

/*
	Returns the next token without consuming it.

	The whole tokenizer state is saved and restored around the scan, which also
	restores `error_count`; the error handler is still invoked for errors found while peeking.
*/
peek :: proc(t: ^Tokenizer) -> (token: Token) {
	old := t^
	token = scan(t)
	t^ = old
	return token
}

/*
	Skips whitespace and returns the next token.

	Identifiers yield `.Ident`; quoted values yield `.String` with the text between
	the quotes (or `.Invalid` when the value is not terminated on the same line);
	the end of input yields `.EOF`. Any rune without a dedicated kind yields `.Invalid`.
*/
scan :: proc(t: ^Tokenizer) -> Token {
	skip_whitespace(t)

	offset := t.offset

	kind: Token_Kind
	err:  Error
	lit:  string
	pos := offset_to_pos(t, offset)

	switch ch := t.ch; true {
	case is_letter(ch):
		lit = scan_identifier(t)
		kind = .Ident

	case:
		advance_rune(t)
		switch ch {
		case -1:
			kind = .EOF

		case '<': kind = .Lt
		case '>': kind = .Gt
		case '!': kind = .Exclaim
		case '?': kind = .Question
		case '=': kind = .Eq
		case '#': kind = .Hash
		case '/': kind = .Slash
		case '-': kind = .Dash
		case ':': kind = .Colon

		case '"', '\'':
			// The opening quote is already consumed; the value runs up to the matching quote.
			lit, err = scan_string(t, t.offset, ch, true, false)
			if err == .None {
				kind = .String
			} else {
				kind = .Invalid
			}

		case '\n':
			// NOTE(review): skip_whitespace above consumes newlines, so this looks unreachable - confirm.
			lit = "\n"

		case:
			kind = .Invalid
		}
	}

	/*
		Tokens without their own text take the source slice they cover.
		A `.String` is exempt, so an empty quoted value stays empty instead of
		turning into its two quote characters.
	*/
	if kind != .String && lit == "" {
		lit = string(t.src[offset : t.offset])
	}
	return Token{kind, lit, pos}
}