mirror of
https://github.com/Ed94/Odin.git
synced 2026-06-24 06:35:00 -07:00
[xml] Initial implementation of core:encoding/xml.
A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816). Features: - Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage. - Simple to understand and use. Small. Caveats: - We do NOT support HTML in this package, as that may or may not be valid XML. If it works, great. If it doesn't, that's not considered a bug. - We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences. - <[!ELEMENT and <[!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options. TODO: - Optional CDATA unboxing. - Optional `>`, ` `, ` ` and other escape substitution in tag bodies. - Test suite MAYBE: - XML writer? - Serialize/deserialize Odin types?
This commit is contained in:
@@ -0,0 +1,651 @@
|
||||
package xml
|
||||
/*
|
||||
An XML 1.0 / 1.1 parser
|
||||
|
||||
Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
|
||||
Made available under Odin's BSD-3 license.
|
||||
|
||||
A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
|
||||
|
||||
Features:
|
||||
- Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
|
||||
- Simple to understand and use. Small.
|
||||
|
||||
Caveats:
|
||||
- We do NOT support HTML in this package, as that may or may not be valid XML.
|
||||
If it works, great. If it doesn't, that's not considered a bug.
|
||||
|
||||
- We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
|
||||
- <[!ELEMENT and <[!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
|
||||
|
||||
TODO:
|
||||
- Optional CDATA unboxing.
|
||||
- Optional `>`, ` `, ` ` and other escape substitution in tag bodies.
|
||||
|
||||
MAYBE:
|
||||
- XML writer?
|
||||
- Serialize/deserialize Odin types?
|
||||
|
||||
List of contributors:
|
||||
Jeroen van Rijn: Initial implementation.
|
||||
*/
|
||||
|
||||
import "core:strings"
|
||||
import "core:mem"
|
||||
import "core:os"
|
||||
|
||||
DEFAULT_Options :: Options{
|
||||
flags = {
|
||||
.Ignore_Unsupported,
|
||||
},
|
||||
expected_doctype = "",
|
||||
}
|
||||
|
||||
Option_Flag :: enum {
|
||||
/*
|
||||
Document MUST start with `<?xml` prolog.
|
||||
*/
|
||||
Must_Have_Prolog,
|
||||
|
||||
/*
|
||||
Document MUST have a `<!DOCTYPE`.
|
||||
*/
|
||||
Must_Have_DocType,
|
||||
|
||||
/*
|
||||
By default we skip comments. Use this option to intern a comment on a parented Element.
|
||||
*/
|
||||
Intern_Comments,
|
||||
|
||||
/*
|
||||
How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
|
||||
*/
|
||||
Error_on_Unsupported,
|
||||
Ignore_Unsupported,
|
||||
|
||||
/*
|
||||
By default CDATA tags are passed-through as-is.
|
||||
This option unwraps them when encountered.
|
||||
*/
|
||||
Unbox_CDATA,
|
||||
|
||||
/*
|
||||
By default SGML entities like `>`, ` ` and ` ` are passed-through as-is.
|
||||
This option decodes them when encountered.
|
||||
*/
|
||||
Decode_SGML_Entities,
|
||||
}
|
||||
|
||||
Document :: struct {
|
||||
root: ^Element,
|
||||
prolog: Attributes,
|
||||
encoding: Encoding,
|
||||
|
||||
doctype: struct {
|
||||
/*
|
||||
We only scan the <!DOCTYPE IDENT part and skip the rest.
|
||||
*/
|
||||
ident: string,
|
||||
rest: string,
|
||||
},
|
||||
|
||||
/*
|
||||
Internal
|
||||
*/
|
||||
tokenizer: ^Tokenizer,
|
||||
allocator: mem.Allocator,
|
||||
intern: strings.Intern,
|
||||
}
|
||||
|
||||
Element :: struct {
|
||||
ident: string,
|
||||
value: string,
|
||||
attribs: Attributes,
|
||||
|
||||
kind: enum {
|
||||
Element = 0,
|
||||
Comment,
|
||||
},
|
||||
|
||||
parent: ^Element,
|
||||
children: [dynamic]^Element,
|
||||
}
|
||||
|
||||
Attr :: struct {
|
||||
key: string,
|
||||
val: string,
|
||||
}
|
||||
|
||||
Attributes :: [dynamic]Attr
|
||||
|
||||
Options :: struct {
|
||||
flags: Option_Flags,
|
||||
expected_doctype: string,
|
||||
}
|
||||
Option_Flags :: bit_set[Option_Flag]
|
||||
|
||||
Encoding :: enum {
|
||||
Unknown,
|
||||
|
||||
UTF_8,
|
||||
ISO_8859_1,
|
||||
|
||||
/*
|
||||
Aliases
|
||||
*/
|
||||
LATIN_1 = ISO_8859_1,
|
||||
}
|
||||
|
||||
Error :: enum {
|
||||
/*
|
||||
General return values.
|
||||
*/
|
||||
None = 0,
|
||||
General_Error,
|
||||
Unexpected_Token,
|
||||
Invalid_Token,
|
||||
|
||||
/*
|
||||
Couldn't find, open or read file.
|
||||
*/
|
||||
File_Error,
|
||||
|
||||
/*
|
||||
File too short.
|
||||
*/
|
||||
Premature_EOF,
|
||||
|
||||
/*
|
||||
XML-specific errors.
|
||||
*/
|
||||
No_Prolog,
|
||||
Invalid_Prolog,
|
||||
Too_Many_Prologs,
|
||||
|
||||
No_DocType,
|
||||
Too_Many_DocTypes,
|
||||
DocType_Must_Proceed_Elements,
|
||||
|
||||
/*
|
||||
If a DOCTYPE is present _or_ the caller
|
||||
asked for a specific DOCTYPE and the DOCTYPE
|
||||
and root tag don't match, we return `.Invalid_DocType`.
|
||||
*/
|
||||
Invalid_DocType,
|
||||
|
||||
Invalid_Tag_Value,
|
||||
Mismatched_Closing_Tag,
|
||||
|
||||
Unclosed_Comment,
|
||||
Comment_Before_Root_Element,
|
||||
Invalid_Sequence_In_Comment,
|
||||
|
||||
Unsupported_Version,
|
||||
Unsupported_Encoding,
|
||||
|
||||
/*
|
||||
<!FOO are usually skipped.
|
||||
*/
|
||||
Unhandled_Bang,
|
||||
|
||||
Duplicate_Attribute,
|
||||
Conflicting_Options,
|
||||
|
||||
/*
|
||||
Unhandled TODO:
|
||||
*/
|
||||
Unhandled_CDATA_Unboxing,
|
||||
Unhandled_SGML_Entity_Decoding,
|
||||
}
|
||||
|
||||
/*
|
||||
Implementation starts here.
|
||||
*/
|
||||
parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
|
||||
context.allocator = allocator
|
||||
|
||||
opts := validate_options(options) or_return
|
||||
|
||||
t := &Tokenizer{}
|
||||
init(t, string(data), path, error_handler)
|
||||
|
||||
doc = new(Document)
|
||||
doc.allocator = allocator
|
||||
doc.tokenizer = t
|
||||
|
||||
strings.intern_init(&doc.intern, allocator, allocator)
|
||||
|
||||
err = .Unexpected_Token
|
||||
element, parent: ^Element
|
||||
|
||||
/*
|
||||
If a DOCTYPE is present, the root tag has to match.
|
||||
If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
|
||||
*/
|
||||
expected_doctype := options.expected_doctype
|
||||
|
||||
loop: for {
|
||||
tok := scan(t)
|
||||
#partial switch tok.kind {
|
||||
|
||||
case .Lt:
|
||||
open := scan(t)
|
||||
#partial switch open.kind {
|
||||
|
||||
case .Question:
|
||||
/*
|
||||
<?xml
|
||||
*/
|
||||
next := scan(t)
|
||||
#partial switch next.kind {
|
||||
case .Ident:
|
||||
if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
|
||||
parse_prolog(doc) or_return
|
||||
} else if len(doc.prolog) > 0 {
|
||||
/*
|
||||
We've already seen a prolog.
|
||||
*/
|
||||
return doc, .Too_Many_Prologs
|
||||
} else {
|
||||
error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", tok.text)
|
||||
return
|
||||
}
|
||||
case:
|
||||
error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", tok.text)
|
||||
return
|
||||
}
|
||||
|
||||
case .Exclaim:
|
||||
/*
|
||||
<!
|
||||
*/
|
||||
next := scan(t)
|
||||
#partial switch next.kind {
|
||||
case .Ident:
|
||||
switch next.text {
|
||||
case "DOCTYPE":
|
||||
if len(doc.doctype.ident) > 0 {
|
||||
return doc, .Too_Many_DocTypes
|
||||
}
|
||||
if doc.root != nil {
|
||||
return doc, .DocType_Must_Proceed_Elements
|
||||
}
|
||||
parse_doctype(doc) or_return
|
||||
|
||||
if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
|
||||
error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
|
||||
return doc, .Invalid_DocType
|
||||
}
|
||||
expected_doctype = doc.doctype.ident
|
||||
|
||||
case:
|
||||
if .Error_on_Unsupported in opts.flags {
|
||||
error(t, t.offset, "Unhandled: <!%v\n", next.text)
|
||||
err = .Unhandled_Bang
|
||||
return
|
||||
}
|
||||
skip_element(t) or_return
|
||||
}
|
||||
|
||||
case .Dash:
|
||||
/*
|
||||
Comment: <!-- -->.
|
||||
The grammar does not allow a comment to end in --->
|
||||
*/
|
||||
if doc.root == nil {
|
||||
return doc, .Comment_Before_Root_Element
|
||||
}
|
||||
|
||||
expect(t, .Dash)
|
||||
offset := t.offset
|
||||
|
||||
for {
|
||||
advance_rune(t)
|
||||
ch := t.ch
|
||||
|
||||
/*
|
||||
A comment ends when we see -->, preceded by a character that's not a dash.
|
||||
"For compatibility, the string "--" (double-hyphen) must not occur within comments."
|
||||
|
||||
See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment
|
||||
|
||||
Thanks to the length (4) of the comment start, we also have enough lookback,
|
||||
and the peek at the next byte asserts that there's at least one more character
|
||||
that's a `>`.
|
||||
*/
|
||||
if ch < 0 {
|
||||
error(t, offset, "[parse] Comment was not terminated\n")
|
||||
return doc, .Unclosed_Comment
|
||||
}
|
||||
|
||||
if string(t.src[t.offset - 1:][:2]) == "--" {
|
||||
if peek_byte(t) == '>' {
|
||||
break
|
||||
} else {
|
||||
error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
|
||||
return doc, .Invalid_Sequence_In_Comment
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if .Intern_Comments in opts.flags {
|
||||
el := new(Element)
|
||||
|
||||
el.parent = element
|
||||
el.kind = .Comment
|
||||
el.value = strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1]))
|
||||
append(&element.children, el)
|
||||
}
|
||||
|
||||
expect(t, .Dash)
|
||||
expect(t, .Gt)
|
||||
|
||||
case:
|
||||
error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
|
||||
return
|
||||
}
|
||||
|
||||
case .Ident:
|
||||
/*
|
||||
e.g. <odin - Start of new element.
|
||||
*/
|
||||
element = new(Element)
|
||||
|
||||
if doc.root == nil {
|
||||
/*
|
||||
First element.
|
||||
*/
|
||||
doc.root = element
|
||||
parent = element
|
||||
} else {
|
||||
append(&parent.children, element)
|
||||
}
|
||||
|
||||
element.parent = parent
|
||||
element.ident = strings.intern_get(&doc.intern, open.text)
|
||||
|
||||
parse_attributes(doc, &element.attribs) or_return
|
||||
|
||||
/*
|
||||
If a DOCTYPE is present _or_ the caller
|
||||
asked for a specific DOCTYPE and the DOCTYPE
|
||||
and root tag don't match, we return .Invalid_Root_Tag.
|
||||
*/
|
||||
if element == doc.root {
|
||||
if len(expected_doctype) > 0 && expected_doctype != open.text {
|
||||
error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
|
||||
return doc, .Invalid_DocType
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
One of these should follow:
|
||||
- `>`, which means we've just opened this tag and expect a later element to close it.
|
||||
- `/>`, which means this is an 'empty' or self-closing tag.
|
||||
*/
|
||||
end_token := scan(t)
|
||||
|
||||
#partial switch end_token.kind {
|
||||
case .Gt:
|
||||
/*
|
||||
We're now the new parent.
|
||||
*/
|
||||
parent = element
|
||||
|
||||
case .Slash:
|
||||
/*
|
||||
Empty tag?
|
||||
*/
|
||||
expect(t, .Gt) or_return
|
||||
|
||||
case:
|
||||
error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
|
||||
return
|
||||
}
|
||||
|
||||
case .Slash:
|
||||
/*
|
||||
Close tag.
|
||||
*/
|
||||
ident := expect(t, .Ident) or_return
|
||||
_ = expect(t, .Gt) or_return
|
||||
|
||||
if element.ident != ident.text {
|
||||
error(t, t.offset, "Mismatched Closing Tag: %v\n", ident.text)
|
||||
return doc, .Mismatched_Closing_Tag
|
||||
}
|
||||
parent = element.parent
|
||||
element = parent
|
||||
|
||||
case:
|
||||
error(t, t.offset, "Invalid Token after <: %#v\n", open)
|
||||
return
|
||||
}
|
||||
|
||||
case .EOF:
|
||||
break loop
|
||||
|
||||
case:
|
||||
/*
|
||||
This should be a tag's body text.
|
||||
*/
|
||||
element.value = scan_string(t, tok.pos.offset) or_return
|
||||
}
|
||||
}
|
||||
|
||||
if .Must_Have_Prolog in opts.flags && len(doc.prolog) == 0 {
|
||||
return doc, .No_Prolog
|
||||
}
|
||||
|
||||
if .Must_Have_DocType in opts.flags && len(doc.doctype.ident) == 0 {
|
||||
return doc, .No_DocType
|
||||
}
|
||||
|
||||
return doc, .None
|
||||
}
|
||||
|
||||
parse_from_file :: proc(filename: string, options := DEFAULT_Options, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
|
||||
context.allocator = allocator
|
||||
|
||||
data, data_ok := os.read_entire_file(filename)
|
||||
defer delete(data)
|
||||
|
||||
if !data_ok { return {}, .File_Error }
|
||||
|
||||
return parse_from_slice(data, options, filename, error_handler, allocator)
|
||||
}
|
||||
|
||||
parse :: proc { parse_from_file, parse_from_slice }
|
||||
|
||||
free_element :: proc(element: ^Element) {
|
||||
if element == nil { return }
|
||||
|
||||
for child in element.children {
|
||||
/*
|
||||
NOTE: Recursive.
|
||||
|
||||
Could be rewritten so it adds them to a list of pointers to free.
|
||||
*/
|
||||
free_element(child)
|
||||
}
|
||||
delete(element.attribs)
|
||||
delete(element.children)
|
||||
free(element)
|
||||
}
|
||||
|
||||
destroy :: proc(doc: ^Document) {
|
||||
if doc == nil { return }
|
||||
|
||||
free_element(doc.root)
|
||||
strings.intern_destroy(&doc.intern)
|
||||
|
||||
delete(doc.prolog)
|
||||
free(doc)
|
||||
}
|
||||
|
||||
/*
|
||||
Helpers.
|
||||
*/
|
||||
|
||||
validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
|
||||
validated = options
|
||||
|
||||
if .Error_on_Unsupported in validated.flags && .Ignore_Unsupported in validated.flags {
|
||||
return options, .Conflicting_Options
|
||||
}
|
||||
|
||||
if .Unbox_CDATA in validated.flags {
|
||||
return options, .Unhandled_CDATA_Unboxing
|
||||
}
|
||||
|
||||
if .Decode_SGML_Entities in validated.flags {
|
||||
return options, .Unhandled_SGML_Entity_Decoding
|
||||
}
|
||||
|
||||
return validated, .None
|
||||
}
|
||||
|
||||
expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
|
||||
tok = scan(t)
|
||||
if tok.kind == kind { return tok, .None }
|
||||
|
||||
error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
|
||||
return tok, .Unexpected_Token
|
||||
}
|
||||
|
||||
parse_attribute :: proc(doc: ^Document) -> (attr: Attr, offset: int, err: Error) {
|
||||
assert(doc != nil)
|
||||
context.allocator = doc.allocator
|
||||
t := doc.tokenizer
|
||||
|
||||
key := expect(t, .Ident) or_return
|
||||
offset = t.offset - len(key.text)
|
||||
|
||||
_ = expect(t, .Eq) or_return
|
||||
value := expect(t, .String) or_return
|
||||
|
||||
attr.key = strings.intern_get(&doc.intern, key.text)
|
||||
attr.val = strings.intern_get(&doc.intern, value.text)
|
||||
|
||||
err = .None
|
||||
return
|
||||
}
|
||||
|
||||
check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attr, offset: int) -> (err: Error) {
|
||||
for a in attribs {
|
||||
if attr.key == a.key {
|
||||
error(t, offset, "Duplicate attribute: %v\n", attr.key)
|
||||
return .Duplicate_Attribute
|
||||
}
|
||||
}
|
||||
return .None
|
||||
}
|
||||
|
||||
parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) {
|
||||
assert(doc != nil)
|
||||
context.allocator = doc.allocator
|
||||
t := doc.tokenizer
|
||||
|
||||
for peek(t).kind == .Ident {
|
||||
attr, offset := parse_attribute(doc) or_return
|
||||
check_duplicate_attributes(t, attribs^, attr, offset) or_return
|
||||
append(attribs, attr)
|
||||
}
|
||||
skip_whitespace(t)
|
||||
return .None
|
||||
}
|
||||
|
||||
parse_prolog :: proc(doc: ^Document) -> (err: Error) {
|
||||
assert(doc != nil)
|
||||
context.allocator = doc.allocator
|
||||
t := doc.tokenizer
|
||||
|
||||
offset := t.offset
|
||||
parse_attributes(doc, &doc.prolog) or_return
|
||||
|
||||
for attr in doc.prolog {
|
||||
switch attr.key {
|
||||
case "version":
|
||||
switch attr.val {
|
||||
case "1.0", "1.1":
|
||||
case:
|
||||
error(t, offset, "[parse_prolog] Warning: Unhandled XML version: %v\n", attr.val)
|
||||
}
|
||||
|
||||
case "encoding":
|
||||
switch strings.to_lower(attr.val, context.temp_allocator) {
|
||||
case "utf-8", "utf8":
|
||||
doc.encoding = .UTF_8
|
||||
|
||||
case "latin-1", "latin1", "iso-8859-1":
|
||||
doc.encoding = .LATIN_1
|
||||
|
||||
case:
|
||||
/*
|
||||
Unrecognized encoding, assume UTF-8.
|
||||
*/
|
||||
error(t, offset, "[parse_prolog] Warning: Unrecognized encoding: %v\n", attr.val)
|
||||
}
|
||||
|
||||
case:
|
||||
// Ignored.
|
||||
}
|
||||
}
|
||||
|
||||
_ = expect(t, .Question) or_return
|
||||
_ = expect(t, .Gt) or_return
|
||||
|
||||
return .None
|
||||
}
|
||||
|
||||
skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
|
||||
close := 1
|
||||
|
||||
loop: for {
|
||||
tok := scan(t)
|
||||
#partial switch tok.kind {
|
||||
case .EOF:
|
||||
error(t, t.offset, "[skip_element] Premature EOF\n")
|
||||
return .Premature_EOF
|
||||
|
||||
case .Lt:
|
||||
close += 1
|
||||
|
||||
case .Gt:
|
||||
close -= 1
|
||||
if close == 0 {
|
||||
break loop
|
||||
}
|
||||
|
||||
case:
|
||||
|
||||
}
|
||||
}
|
||||
return .None
|
||||
}
|
||||
|
||||
parse_doctype :: proc(doc: ^Document) -> (err: Error) {
|
||||
/*
|
||||
<!DOCTYPE greeting SYSTEM "hello.dtd">
|
||||
|
||||
<!DOCTYPE greeting [
|
||||
<!ELEMENT greeting (#PCDATA)>
|
||||
]>
|
||||
*/
|
||||
assert(doc != nil)
|
||||
context.allocator = doc.allocator
|
||||
t := doc.tokenizer
|
||||
|
||||
tok := expect(t, .Ident) or_return
|
||||
doc.doctype.ident = strings.intern_get(&doc.intern, tok.text)
|
||||
|
||||
skip_whitespace(t)
|
||||
offset := t.offset
|
||||
skip_element(t) or_return
|
||||
|
||||
/*
|
||||
-1 because the current offset is that of the closing tag, so the rest of the DOCTYPE tag ends just before it.
|
||||
*/
|
||||
doc.doctype.rest = strings.intern_get(&doc.intern, string(t.src[offset : t.offset - 1]))
|
||||
return .None
|
||||
}
|
||||
Reference in New Issue
Block a user