[xml] Improve CDATA + comment handling in tag body.

This commit is contained in:
Jeroen van Rijn
2021-12-01 22:01:19 +01:00
parent beff90e1d1
commit 23baf56c87
2 changed files with 96 additions and 57 deletions
+94 -23
View File
@@ -46,8 +46,11 @@ Token_Kind :: enum {
EOF,
}
CDATA_START :: "<![CDATA["
CDATA_END :: "]]>"
CDATA_START :: "<![CDATA["
CDATA_END :: "]]>"
COMMENT_START :: "<!--"
COMMENT_END :: "-->"
Tokenizer :: struct {
// Immutable data
@@ -214,10 +217,83 @@ scan_identifier :: proc(t: ^Tokenizer) -> string {
return string(t.src[offset : t.offset])
}
/*
A comment ends when we see -->, preceded by a character that's not a dash.
"For compatibility, the string "--" (double-hyphen) must not occur within comments."
See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment
Thanks to the length (4) of the comment start, we also have enough lookback,
and the peek at the next byte asserts that there's at least one more character
that's a `>`.
*/
scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
offset := t.offset
for {
advance_rune(t)
ch := t.ch
if ch < 0 {
error(t, offset, "[parse] Comment was not terminated\n")
return "", .Unclosed_Comment
}
if string(t.src[t.offset - 1:][:2]) == "--" {
if peek_byte(t) == '>' {
break
} else {
error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
return "", .Invalid_Sequence_In_Comment
}
}
}
expect(t, .Dash)
expect(t, .Gt)
return string(t.src[offset : t.offset - 1]), .None
}
/*
Skip CDATA
*/
skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
if t.read_offset + len(CDATA_START) >= len(t.src) {
/*
Can't be the start of a CDATA tag.
*/
return .None
}
if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
t.read_offset += len(CDATA_START)
offset := t.offset
cdata_scan: for {
advance_rune(t)
if t.ch < 0 {
error(t, offset, "[scan_string] CDATA was not terminated\n")
return .Premature_EOF
}
/*
Scan until the end of a CDATA tag.
*/
if t.read_offset + len(CDATA_END) < len(t.src) {
if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
t.read_offset += len(CDATA_END)
break cdata_scan
}
}
}
}
return
}
@(optimization_mode="speed")
scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) {
err = .None
in_cdata := false
loop: for {
ch := t.ch
@@ -228,27 +304,23 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
return "", .Premature_EOF
case '<':
/*
Might be the start of a CDATA tag.
*/
if t.read_offset + len(CDATA_START) < len(t.src) {
if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
in_cdata = true
}
}
case ']':
/*
Might be the end of a CDATA tag.
*/
if t.read_offset + len(CDATA_END) < len(t.src) {
if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
in_cdata = false
if peek_byte(t) == '!' {
if peek_byte(t, 1) == '[' {
/*
Might be the start of a CDATA tag.
*/
skip_cdata(t) or_return
} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
/*
Comment start. Eat comment.
*/
t.read_offset += 3
_ = scan_comment(t) or_return
}
}
case '\n':
if !(multiline || in_cdata) {
if !multiline {
error(t, offset, string(t.src[offset : t.offset]))
error(t, offset, "[scan_string] Not terminated\n")
err = .Invalid_Tag_Value
@@ -256,13 +328,12 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
}
}
if ch == close && !in_cdata {
if t.ch == close {
/*
If it's not a CDATA tag, it's the end of this body.
If it's not a CDATA or comment, it's the end of this body.
*/
break loop
}
advance_rune(t)
}
+2 -34
View File
@@ -307,39 +307,10 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
The grammar does not allow a comment to end in --->
*/
expect(t, .Dash)
offset := t.offset
for {
advance_rune(t)
ch := t.ch
/*
A comment ends when we see -->, preceded by a character that's not a dash.
"For compatibility, the string "--" (double-hyphen) must not occur within comments."
See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment
Thanks to the length (4) of the comment start, we also have enough lookback,
and the peek at the next byte asserts that there's at least one more character
that's a `>`.
*/
if ch < 0 {
error(t, offset, "[parse] Comment was not terminated\n")
return doc, .Unclosed_Comment
}
if string(t.src[t.offset - 1:][:2]) == "--" {
if peek_byte(t) == '>' {
break
} else {
error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
return doc, .Invalid_Sequence_In_Comment
}
}
}
comment := scan_comment(t) or_return
if .Intern_Comments in opts.flags {
comment := strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1]))
comment = strings.intern_get(&doc.intern, comment)
if doc.root == nil {
append(&doc.comments, comment)
@@ -352,9 +323,6 @@ parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", err
}
}
expect(t, .Dash)
expect(t, .Gt)
case:
error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
return