mirror of
https://github.com/Ed94/Odin.git
synced 2026-06-13 01:21:38 -07:00
Update XML reader to normalize whitespace, part 1.
This commit is contained in:
@@ -218,9 +218,7 @@ scan_identifier :: proc(t: ^Tokenizer) -> string {
|
||||
for is_valid_identifier_rune(t.ch) {
|
||||
advance_rune(t)
|
||||
if t.ch == ':' {
|
||||
/*
|
||||
A namespaced attr can have at most two parts, `namespace:ident`.
|
||||
*/
|
||||
// A namespaced attr can have at most two parts, `namespace:ident`.
|
||||
if namespaced {
|
||||
break
|
||||
}
|
||||
@@ -268,14 +266,10 @@ scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
|
||||
return string(t.src[offset : t.offset - 1]), .None
|
||||
}
|
||||
|
||||
/*
|
||||
Skip CDATA
|
||||
*/
|
||||
// Skip CDATA
|
||||
skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
|
||||
if t.read_offset + len(CDATA_START) >= len(t.src) {
|
||||
/*
|
||||
Can't be the start of a CDATA tag.
|
||||
*/
|
||||
// Can't be the start of a CDATA tag.
|
||||
return .None
|
||||
}
|
||||
|
||||
@@ -290,9 +284,7 @@ skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
|
||||
return .Premature_EOF
|
||||
}
|
||||
|
||||
/*
|
||||
Scan until the end of a CDATA tag.
|
||||
*/
|
||||
// Scan until the end of a CDATA tag.
|
||||
if t.read_offset + len(CDATA_END) < len(t.src) {
|
||||
if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
|
||||
t.read_offset += len(CDATA_END)
|
||||
@@ -319,14 +311,10 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
|
||||
case '<':
|
||||
if peek_byte(t) == '!' {
|
||||
if peek_byte(t, 1) == '[' {
|
||||
/*
|
||||
Might be the start of a CDATA tag.
|
||||
*/
|
||||
// Might be the start of a CDATA tag.
|
||||
skip_cdata(t) or_return
|
||||
} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
|
||||
/*
|
||||
Comment start. Eat comment.
|
||||
*/
|
||||
// Comment start. Eat comment.
|
||||
t.read_offset += 3
|
||||
_ = scan_comment(t) or_return
|
||||
}
|
||||
@@ -342,17 +330,13 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
|
||||
}
|
||||
|
||||
if t.ch == close {
|
||||
/*
|
||||
If it's not a CDATA or comment, it's the end of this body.
|
||||
*/
|
||||
// If it's not a CDATA or comment, it's the end of this body.
|
||||
break loop
|
||||
}
|
||||
advance_rune(t)
|
||||
}
|
||||
|
||||
/*
|
||||
Strip trailing whitespace.
|
||||
*/
|
||||
// Strip trailing whitespace.
|
||||
lit := string(t.src[offset : t.offset])
|
||||
|
||||
end := len(lit)
|
||||
@@ -369,11 +353,6 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
|
||||
if consume_close {
|
||||
advance_rune(t)
|
||||
}
|
||||
|
||||
/*
|
||||
TODO: Handle decoding escape characters and unboxing CDATA.
|
||||
*/
|
||||
|
||||
return lit, err
|
||||
}
|
||||
|
||||
@@ -384,7 +363,7 @@ peek :: proc(t: ^Tokenizer) -> (token: Token) {
|
||||
return token
|
||||
}
|
||||
|
||||
scan :: proc(t: ^Tokenizer) -> Token {
|
||||
scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
|
||||
skip_whitespace(t)
|
||||
|
||||
offset := t.offset
|
||||
@@ -418,7 +397,7 @@ scan :: proc(t: ^Tokenizer) -> Token {
|
||||
case '"', '\'':
|
||||
kind = .Invalid
|
||||
|
||||
lit, err = scan_string(t, t.offset, ch, true, false)
|
||||
lit, err = scan_string(t, t.offset, ch, true, multiline_string)
|
||||
if err == .None {
|
||||
kind = .String
|
||||
}
|
||||
@@ -435,4 +414,4 @@ scan :: proc(t: ^Tokenizer) -> Token {
|
||||
lit = string(t.src[offset : t.offset])
|
||||
}
|
||||
return Token{kind, lit, pos}
|
||||
}
|
||||
}
|
||||
@@ -203,9 +203,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
||||
|
||||
doc.elements = make([dynamic]Element, 1024, 1024, allocator)
|
||||
|
||||
// strings.intern_init(&doc.intern, allocator, allocator)
|
||||
|
||||
err = .Unexpected_Token
|
||||
err = .Unexpected_Token
|
||||
element, parent: Element_ID
|
||||
open: Token
|
||||
|
||||
@@ -259,8 +257,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
||||
case .Slash:
|
||||
// Empty tag. Close it.
|
||||
expect(t, .Gt) or_return
|
||||
parent = doc.elements[element].parent
|
||||
element = parent
|
||||
parent = doc.elements[element].parent
|
||||
element = parent
|
||||
|
||||
case:
|
||||
error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
|
||||
@@ -276,8 +274,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
||||
error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
|
||||
return doc, .Mismatched_Closing_Tag
|
||||
}
|
||||
parent = doc.elements[element].parent
|
||||
element = parent
|
||||
parent = doc.elements[element].parent
|
||||
element = parent
|
||||
|
||||
} else if open.kind == .Exclaim {
|
||||
// <!
|
||||
@@ -463,8 +461,8 @@ validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
|
||||
return validated, .None
|
||||
}
|
||||
|
||||
expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
|
||||
tok = scan(t)
|
||||
expect :: proc(t: ^Tokenizer, kind: Token_Kind, multiline_string := false) -> (tok: Token, err: Error) {
|
||||
tok = scan(t, multiline_string=multiline_string)
|
||||
if tok.kind == kind { return tok, .None }
|
||||
|
||||
error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
|
||||
@@ -480,7 +478,13 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: E
|
||||
offset = t.offset - len(key.text)
|
||||
|
||||
_ = expect(t, .Eq) or_return
|
||||
value := expect(t, .String) or_return
|
||||
value := expect(t, .String, multiline_string=true) or_return
|
||||
|
||||
normalized, normalize_err := entity.decode_xml(value.text, {.Normalize_Whitespace}, doc.allocator)
|
||||
if normalize_err == .None {
|
||||
append(&doc.strings_to_free, normalized)
|
||||
value.text = normalized
|
||||
}
|
||||
|
||||
attr.key = key.text
|
||||
attr.val = value.text
|
||||
|
||||
Reference in New Issue
Block a user