mirror of
https://github.com/Ed94/Odin.git
synced 2026-07-04 19:01:36 -07:00
@@ -108,7 +108,7 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
|
||||
it couldn't have been part of an XML tag body to be decoded here.
|
||||
|
||||
Keep in mind that we could already *be* inside a CDATA tag.
|
||||
If so, write `>` as a literal and continue.
|
||||
If so, write `<` as a literal and continue.
|
||||
*/
|
||||
if in_data {
|
||||
write_rune(&builder, '<')
|
||||
@@ -119,11 +119,9 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
|
||||
case ']':
|
||||
// If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag.
|
||||
if in_data {
|
||||
if t.read_offset + len(CDATA_END) < len(t.src) {
|
||||
if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
|
||||
in_data = false
|
||||
t.read_offset += len(CDATA_END) - 1
|
||||
}
|
||||
if strings.has_prefix(t.src[t.offset:], CDATA_END) {
|
||||
in_data = false
|
||||
t.read_offset += len(CDATA_END) - 1
|
||||
}
|
||||
continue
|
||||
} else {
|
||||
@@ -297,40 +295,40 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
|
||||
assert(t != nil && t.r == '<')
|
||||
if t.read_offset + len(CDATA_START) >= len(t.src) { return false, .None }
|
||||
|
||||
if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
|
||||
t.read_offset += len(CDATA_START) - 1
|
||||
|
||||
s := string(t.src[t.offset:])
|
||||
if strings.has_prefix(s, CDATA_START) {
|
||||
if .Unbox_CDATA in options && .Decode_CDATA in options {
|
||||
// We're unboxing _and_ decoding CDATA
|
||||
t.read_offset += len(CDATA_START) - 1
|
||||
return true, .None
|
||||
}
|
||||
|
||||
// CDATA is passed through.
|
||||
offset := t.offset
|
||||
|
||||
// Scan until end of CDATA.
|
||||
// CDATA is passed through. Scan until end of CDATA.
|
||||
start_offset := t.offset
|
||||
t.read_offset += len(CDATA_START)
|
||||
for {
|
||||
advance(t) or_return
|
||||
if t.r < 0 { return true, .CDATA_Not_Terminated }
|
||||
advance(t)
|
||||
if t.r < 0 {
|
||||
// error(t, offset, "[scan_string] CDATA was not terminated\n")
|
||||
return true, .CDATA_Not_Terminated
|
||||
}
|
||||
|
||||
if t.read_offset + len(CDATA_END) < len(t.src) {
|
||||
if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
|
||||
t.read_offset += len(CDATA_END) - 1
|
||||
// Scan until the end of a CDATA tag.
|
||||
if s = string(t.src[t.read_offset:]); strings.has_prefix(s, CDATA_END) {
|
||||
t.read_offset += len(CDATA_END)
|
||||
cdata := string(t.src[start_offset:t.read_offset])
|
||||
|
||||
cdata := string(t.src[offset : t.read_offset])
|
||||
|
||||
if .Unbox_CDATA in options {
|
||||
cdata = cdata[len(CDATA_START):]
|
||||
cdata = cdata[:len(cdata) - len(CDATA_END)]
|
||||
}
|
||||
|
||||
write_string(builder, cdata)
|
||||
return false, .None
|
||||
if .Unbox_CDATA in options {
|
||||
cdata = cdata[len(CDATA_START):]
|
||||
cdata = cdata[:len(cdata) - len(CDATA_END)]
|
||||
}
|
||||
write_string(builder, cdata)
|
||||
return false, .None
|
||||
}
|
||||
}
|
||||
|
||||
} else if string(t.src[t.offset:][:len(COMMENT_START)]) == COMMENT_START {
|
||||
|
||||
} else if strings.has_prefix(s, COMMENT_START) {
|
||||
t.read_offset += len(COMMENT_START)
|
||||
// Comment is passed through by default.
|
||||
offset := t.offset
|
||||
|
||||
@@ -16,6 +16,7 @@ package encoding_xml
|
||||
import "core:fmt"
|
||||
import "core:unicode"
|
||||
import "core:unicode/utf8"
|
||||
import "core:strings"
|
||||
|
||||
Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)
|
||||
|
||||
@@ -121,7 +122,7 @@ default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
|
||||
error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
|
||||
pos := offset_to_pos(t, offset)
|
||||
if t.err != nil {
|
||||
t.err(pos, msg, ..args)
|
||||
t.err(pos=pos, fmt=msg, args=args)
|
||||
}
|
||||
t.error_count += 1
|
||||
}
|
||||
@@ -268,32 +269,27 @@ scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
|
||||
|
||||
// Skip CDATA
|
||||
skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
|
||||
if t.read_offset + len(CDATA_START) >= len(t.src) {
|
||||
// Can't be the start of a CDATA tag.
|
||||
if s := string(t.src[t.offset:]); !strings.has_prefix(s, CDATA_START) {
|
||||
return .None
|
||||
}
|
||||
|
||||
if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
|
||||
t.read_offset += len(CDATA_START)
|
||||
offset := t.offset
|
||||
t.read_offset += len(CDATA_START)
|
||||
offset := t.offset
|
||||
|
||||
cdata_scan: for {
|
||||
advance_rune(t)
|
||||
if t.ch < 0 {
|
||||
error(t, offset, "[scan_string] CDATA was not terminated\n")
|
||||
return .Premature_EOF
|
||||
}
|
||||
cdata_scan: for {
|
||||
advance_rune(t)
|
||||
if t.ch < 0 {
|
||||
error(t, offset, "[scan_string] CDATA was not terminated\n")
|
||||
return .Premature_EOF
|
||||
}
|
||||
|
||||
// Scan until the end of a CDATA tag.
|
||||
if t.read_offset + len(CDATA_END) < len(t.src) {
|
||||
if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
|
||||
t.read_offset += len(CDATA_END)
|
||||
break cdata_scan
|
||||
}
|
||||
}
|
||||
// Scan until the end of a CDATA tag.
|
||||
if s := string(t.src[t.read_offset:]); strings.has_prefix(s, CDATA_END) {
|
||||
t.read_offset += len(CDATA_END)
|
||||
break cdata_scan
|
||||
}
|
||||
}
|
||||
return
|
||||
return .None
|
||||
}
|
||||
|
||||
@(optimization_mode="favor_size")
|
||||
@@ -393,6 +389,8 @@ scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
|
||||
case '/': kind = .Slash
|
||||
case '-': kind = .Dash
|
||||
case ':': kind = .Colon
|
||||
case '[': kind = .Open_Bracket
|
||||
case ']': kind = .Close_Bracket
|
||||
|
||||
case '"', '\'':
|
||||
kind = .Invalid
|
||||
|
||||
@@ -56,7 +56,7 @@ Option_Flag :: enum {
|
||||
Option_Flags :: bit_set[Option_Flag; u16]
|
||||
|
||||
Document :: struct {
|
||||
elements: [dynamic]Element,
|
||||
elements: [dynamic]Element `fmt:"v,element_count"`,
|
||||
element_count: Element_ID,
|
||||
|
||||
prologue: Attributes,
|
||||
@@ -70,15 +70,15 @@ Document :: struct {
|
||||
|
||||
// If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
|
||||
// Otherwise they'll be in the element tree.
|
||||
comments: [dynamic]string,
|
||||
comments: [dynamic]string `fmt:"-"`,
|
||||
|
||||
// Internal
|
||||
tokenizer: ^Tokenizer,
|
||||
allocator: mem.Allocator,
|
||||
tokenizer: ^Tokenizer `fmt:"-"`,
|
||||
allocator: mem.Allocator `fmt:"-"`,
|
||||
|
||||
// Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
|
||||
input: []u8,
|
||||
strings_to_free: [dynamic]string,
|
||||
input: []u8 `fmt:"-"`,
|
||||
strings_to_free: [dynamic]string `fmt:"-"`,
|
||||
}
|
||||
|
||||
Element :: struct {
|
||||
@@ -195,7 +195,6 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
||||
|
||||
loop: for {
|
||||
skip_whitespace(t)
|
||||
// NOTE(Jeroen): This is faster as a switch.
|
||||
switch t.ch {
|
||||
case '<':
|
||||
// Consume peeked `<`
|
||||
@@ -306,9 +305,13 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
||||
}
|
||||
}
|
||||
|
||||
case .Open_Bracket:
|
||||
// This could be a CDATA tag part of a tag's body. Unread the `<![`
|
||||
t.offset -= 3
|
||||
parse_body(doc, element, opts) or_return
|
||||
|
||||
case:
|
||||
error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
|
||||
return
|
||||
error(t, t.offset, "Unexpected Token after <!: %#v", next)
|
||||
}
|
||||
|
||||
} else if open.kind == .Question {
|
||||
@@ -341,38 +344,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
|
||||
|
||||
case:
|
||||
// This should be a tag's body text.
|
||||
body_text := scan_string(t, t.offset) or_return
|
||||
needs_processing := .Unbox_CDATA in opts.flags
|
||||
needs_processing |= .Decode_SGML_Entities in opts.flags
|
||||
|
||||
if !needs_processing {
|
||||
append(&doc.elements[element].value, body_text)
|
||||
continue
|
||||
}
|
||||
|
||||
decode_opts := entity.XML_Decode_Options{}
|
||||
if .Keep_Tag_Body_Comments not_in opts.flags {
|
||||
decode_opts += { .Comment_Strip }
|
||||
}
|
||||
|
||||
if .Decode_SGML_Entities not_in opts.flags {
|
||||
decode_opts += { .No_Entity_Decode }
|
||||
}
|
||||
|
||||
if .Unbox_CDATA in opts.flags {
|
||||
decode_opts += { .Unbox_CDATA }
|
||||
if .Decode_SGML_Entities in opts.flags {
|
||||
decode_opts += { .Decode_CDATA }
|
||||
}
|
||||
}
|
||||
|
||||
decoded, decode_err := entity.decode_xml(body_text, decode_opts)
|
||||
if decode_err == .None {
|
||||
append(&doc.elements[element].value, decoded)
|
||||
append(&doc.strings_to_free, decoded)
|
||||
} else {
|
||||
append(&doc.elements[element].value, body_text)
|
||||
}
|
||||
parse_body(doc, element, opts) or_return
|
||||
}
|
||||
}
|
||||
|
||||
@@ -457,8 +429,6 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: E
|
||||
t := doc.tokenizer
|
||||
|
||||
key := expect(t, .Ident) or_return
|
||||
offset = t.offset - len(key.text)
|
||||
|
||||
_ = expect(t, .Eq) or_return
|
||||
value := expect(t, .String, multiline_string=true) or_return
|
||||
|
||||
@@ -591,6 +561,47 @@ parse_doctype :: proc(doc: ^Document) -> (err: Error) {
|
||||
return .None
|
||||
}
|
||||
|
||||
parse_body :: proc(doc: ^Document, element: Element_ID, opts: Options) -> (err: Error) {
|
||||
assert(doc != nil)
|
||||
context.allocator = doc.allocator
|
||||
t := doc.tokenizer
|
||||
|
||||
body_text := scan_string(t, t.offset) or_return
|
||||
needs_processing := .Unbox_CDATA in opts.flags
|
||||
needs_processing |= .Decode_SGML_Entities in opts.flags
|
||||
|
||||
if !needs_processing {
|
||||
append(&doc.elements[element].value, body_text)
|
||||
return
|
||||
}
|
||||
|
||||
decode_opts := entity.XML_Decode_Options{}
|
||||
if .Keep_Tag_Body_Comments not_in opts.flags {
|
||||
decode_opts += { .Comment_Strip }
|
||||
}
|
||||
|
||||
if .Decode_SGML_Entities not_in opts.flags {
|
||||
decode_opts += { .No_Entity_Decode }
|
||||
}
|
||||
|
||||
if .Unbox_CDATA in opts.flags {
|
||||
decode_opts += { .Unbox_CDATA }
|
||||
if .Decode_SGML_Entities in opts.flags {
|
||||
decode_opts += { .Decode_CDATA }
|
||||
}
|
||||
}
|
||||
|
||||
decoded, decode_err := entity.decode_xml(body_text, decode_opts)
|
||||
if decode_err == .None {
|
||||
append(&doc.elements[element].value, decoded)
|
||||
append(&doc.strings_to_free, decoded)
|
||||
} else {
|
||||
append(&doc.elements[element].value, body_text)
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
Element_ID :: u32
|
||||
|
||||
new_element :: proc(doc: ^Document) -> (id: Element_ID) {
|
||||
@@ -609,4 +620,4 @@ new_element :: proc(doc: ^Document) -> (id: Element_ID) {
|
||||
cur := doc.element_count
|
||||
doc.element_count += 1
|
||||
return cur
|
||||
}
|
||||
}
|
||||
@@ -25,5 +25,10 @@
|
||||
<div>
|
||||
| | | fj ` \ ® ϱ ∳ ⁏
|
||||
</div>
|
||||
<div> H<![CDATA[ellope!]]>Hellope!</div>
|
||||
<div> H<![CDATA[ellope!]]>Hellope!</div>
|
||||
<div> H<![CDATA[ellope!]]>Hellope! </div>
|
||||
<div> H<![CDATA[ellope!]]>Hellope! </div>
|
||||
<div>H<![CDATA[ellope!]]>Hellope! </div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -114,7 +114,7 @@ xml_test_entities :: proc(t: ^testing.T) {
|
||||
},
|
||||
expected_doctype = "html",
|
||||
},
|
||||
crc32 = 0x05373317,
|
||||
crc32 = 0x48f41216,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -128,7 +128,7 @@ xml_test_entities_unbox :: proc(t: ^testing.T) {
|
||||
},
|
||||
expected_doctype = "html",
|
||||
},
|
||||
crc32 = 0x350ca83e,
|
||||
crc32 = 0xd0567818,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -142,7 +142,7 @@ xml_test_entities_unbox_decode :: proc(t: ^testing.T) {
|
||||
},
|
||||
expected_doctype = "html",
|
||||
},
|
||||
crc32 = 0x7f58db7d,
|
||||
crc32 = 0x68d2571e,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -191,7 +191,7 @@ xml_test_unicode :: proc(t: ^testing.T) {
|
||||
}
|
||||
|
||||
@(private)
|
||||
run_test :: proc(t: ^testing.T, test: TEST) {
|
||||
run_test :: proc(t: ^testing.T, test: TEST, loc := #caller_location) {
|
||||
path := strings.concatenate({TEST_SUITE_PATH, test.filename})
|
||||
defer delete(path)
|
||||
|
||||
@@ -205,10 +205,10 @@ run_test :: proc(t: ^testing.T, test: TEST) {
|
||||
crc32 := hash.crc32(tree_bytes)
|
||||
|
||||
failed := err != test.err
|
||||
testing.expectf(t, err == test.err, "%v: Expected return value %v, got %v", test.filename, test.err, err)
|
||||
testing.expectf(t, err == test.err, "%v: Expected return value %v, got %v", test.filename, test.err, err, loc=loc)
|
||||
|
||||
failed |= crc32 != test.crc32
|
||||
testing.expectf(t, crc32 == test.crc32, "%v: Expected CRC 0x%08x, got 0x%08x, with options %v", test.filename, test.crc32, crc32, test.options)
|
||||
testing.expectf(t, crc32 == test.crc32, "%v: Expected CRC 0x%08x, got 0x%08x, with options %v", test.filename, test.crc32, crc32, test.options, loc=loc)
|
||||
|
||||
if failed {
|
||||
// Don't fully print big trees.
|
||||
|
||||
Reference in New Issue
Block a user