mirror of
https://github.com/Ed94/Odin.git
synced 2026-06-13 01:21:38 -07:00
[xml] Improve CDATA + comment handling in tag body.
This commit is contained in:
@@ -46,8 +46,11 @@ Token_Kind :: enum {
|
||||
EOF,
|
||||
}
|
||||
|
||||
CDATA_START :: "<![CDATA["
|
||||
CDATA_END :: "]]>"
|
||||
CDATA_START :: "<![CDATA["
|
||||
CDATA_END :: "]]>"
|
||||
|
||||
COMMENT_START :: "<!--"
|
||||
COMMENT_END :: "-->"
|
||||
|
||||
Tokenizer :: struct {
|
||||
// Immutable data
|
||||
@@ -214,10 +217,83 @@ scan_identifier :: proc(t: ^Tokenizer) -> string {
|
||||
return string(t.src[offset : t.offset])
|
||||
}
|
||||
|
||||
/*
	Scan a comment body up to and including its closing `-->`.

	A comment ends when we see -->, preceded by a character that's not a dash.
	"For compatibility, the string "--" (double-hyphen) must not occur within comments."

	See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment

	Thanks to the length (4) of the comment start, we also have enough lookback,
	and the peek at the next byte asserts that there's at least one more character
	that's a `>`.

	Returns:
	- comment: the source text from the tokenizer offset at entry up to, but not
	           including, the terminating `--`.
	- err:     `.Unclosed_Comment` if the input ends before `-->` is seen,
	           `.Invalid_Sequence_In_Comment` if `--` is not followed by `>`,
	           `.None` otherwise.
*/
scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
	offset := t.offset

	for {
		advance_rune(t)
		ch := t.ch

		if ch < 0 {
			error(t, offset, "[parse] Comment was not terminated\n")
			return "", .Unclosed_Comment
		}

		// Look back one byte: are the previous and current bytes both dashes?
		if string(t.src[t.offset - 1:][:2]) == "--" {
			if peek_byte(t) == '>' {
				break
			} else {
				error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
				return "", .Invalid_Sequence_In_Comment
			}
		}
	}

	/*
		`t.offset` is on the second dash of the terminator here, so `t.offset - 1`
		is the first dash, i.e. the end of the comment text.

		Record it *before* consuming the terminator. Slicing with `t.offset`
		afterwards would leak the trailing `--` into the returned comment.
		NOTE(review): assumes `expect` advances `t.offset` past the token it reads.
	*/
	end := t.offset - 1

	expect(t, .Dash)
	expect(t, .Gt)

	return string(t.src[offset : end]), .None
}
|
||||
|
||||
/*
	Skip a CDATA section.

	If the bytes starting at `t.offset` are not `<![CDATA[`, nothing is consumed
	and `.None` is returned. Otherwise the tokenizer is advanced until the matching
	`]]>` is found, and `t.read_offset` is left on the first byte after it.

	Returns `.Premature_EOF` if the input ends before `]]>` is seen.
*/
skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
	/*
		The comparison below slices from `t.offset`, so the bounds check has to be
		made against `t.offset` as well. A marker that ends exactly at the end of
		the input is still a valid slice, hence `>` rather than `>=`.
	*/
	if t.offset + len(CDATA_START) > len(t.src) {
		/*
			Can't be the start of a CDATA tag.
		*/
		return .None
	}

	if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
		offset := t.offset

		/*
			Resume reading at the first byte after the start marker.
			The marker begins at `t.offset`, not at `t.read_offset`, so adding its
			full length to `t.read_offset` would skip the first byte of the payload
			and make an empty `<![CDATA[]]>` impossible to terminate.
			NOTE(review): assumes `advance_rune` reads the next rune at `t.read_offset`.
		*/
		t.read_offset = offset + len(CDATA_START)

		cdata_scan: for {
			advance_rune(t)
			if t.ch < 0 {
				error(t, offset, "[scan_string] CDATA was not terminated\n")
				return .Premature_EOF
			}

			/*
				Scan until the end of a CDATA tag.
			*/
			if t.offset + len(CDATA_END) <= len(t.src) {
				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
					/*
						Leave the reader on the byte right after `]]>`, so the caller's
						next advance sees it (it is very often the `<` of a closing tag).
					*/
					t.read_offset = t.offset + len(CDATA_END)
					break cdata_scan
				}
			}
		}
	}
	return
}
|
||||
|
||||
@(optimization_mode="speed")
|
||||
scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) {
|
||||
err = .None
|
||||
in_cdata := false
|
||||
|
||||
loop: for {
|
||||
ch := t.ch
|
||||
@@ -228,27 +304,23 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
|
||||
return "", .Premature_EOF
|
||||
|
||||
case '<':
|
||||
/*
|
||||
Might be the start of a CDATA tag.
|
||||
*/
|
||||
if t.read_offset + len(CDATA_START) < len(t.src) {
|
||||
if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
|
||||
in_cdata = true
|
||||
}
|
||||
}
|
||||
|
||||
case ']':
|
||||
/*
|
||||
Might be the end of a CDATA tag.
|
||||
*/
|
||||
if t.read_offset + len(CDATA_END) < len(t.src) {
|
||||
if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
|
||||
in_cdata = false
|
||||
if peek_byte(t) == '!' {
|
||||
if peek_byte(t, 1) == '[' {
|
||||
/*
|
||||
Might be the start of a CDATA tag.
|
||||
*/
|
||||
skip_cdata(t) or_return
|
||||
} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
|
||||
/*
|
||||
Comment start. Eat comment.
|
||||
*/
|
||||
t.read_offset += 3
|
||||
_ = scan_comment(t) or_return
|
||||
}
|
||||
}
|
||||
|
||||
case '\n':
|
||||
if !(multiline || in_cdata) {
|
||||
if !multiline {
|
||||
error(t, offset, string(t.src[offset : t.offset]))
|
||||
error(t, offset, "[scan_string] Not terminated\n")
|
||||
err = .Invalid_Tag_Value
|
||||
@@ -256,13 +328,12 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
|
||||
}
|
||||
}
|
||||
|
||||
if ch == close && !in_cdata {
|
||||
if t.ch == close {
|
||||
/*
|
||||
If it's not a CDATA tag, it's the end of this body.
|
||||
If it's not a CDATA or comment, it's the end of this body.
|
||||
*/
|
||||
break loop
|
||||
}
|
||||
|
||||
advance_rune(t)
|
||||
}
|
||||
|
||||
|
||||
@@ -307,39 +307,10 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
|
||||
The grammar does not allow a comment to end in --->
|
||||
*/
|
||||
expect(t, .Dash)
|
||||
offset := t.offset
|
||||
|
||||
for {
|
||||
advance_rune(t)
|
||||
ch := t.ch
|
||||
|
||||
/*
|
||||
A comment ends when we see -->, preceded by a character that's not a dash.
|
||||
"For compatibility, the string "--" (double-hyphen) must not occur within comments."
|
||||
|
||||
See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment
|
||||
|
||||
Thanks to the length (4) of the comment start, we also have enough lookback,
|
||||
and the peek at the next byte asserts that there's at least one more character
|
||||
that's a `>`.
|
||||
*/
|
||||
if ch < 0 {
|
||||
error(t, offset, "[parse] Comment was not terminated\n")
|
||||
return doc, .Unclosed_Comment
|
||||
}
|
||||
|
||||
if string(t.src[t.offset - 1:][:2]) == "--" {
|
||||
if peek_byte(t) == '>' {
|
||||
break
|
||||
} else {
|
||||
error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
|
||||
return doc, .Invalid_Sequence_In_Comment
|
||||
}
|
||||
}
|
||||
}
|
||||
comment := scan_comment(t) or_return
|
||||
|
||||
if .Intern_Comments in opts.flags {
|
||||
comment := strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1]))
|
||||
comment = strings.intern_get(&doc.intern, comment)
|
||||
|
||||
if doc.root == nil {
|
||||
append(&doc.comments, comment)
|
||||
@@ -352,9 +323,6 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
|
||||
}
|
||||
}
|
||||
|
||||
expect(t, .Dash)
|
||||
expect(t, .Gt)
|
||||
|
||||
case:
|
||||
error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user