Merge pull request #1342 from Kelimion/xml

Initial version of `core:encoding/xml`.
2026-07-29 19:00:06 +00:00 · 2022-04-28 15:54:28 +02:00
parent 9fcba99ca2 127b0ba65e
commit 62139cb5a4
19 changed files with 10193 additions and 7 deletions
@@ -0,0 +1,21 @@
+# License
+
+By obtaining, using and/or copying this work, you (the licensee) agree that you have read, understood, and will comply with the following terms and conditions.
+
+Permission to copy, modify, and distribute this software and its documentation, with or without modification, for any purpose and without fee or royalty is hereby granted, provided that you include the following on ALL copies of the software and documentation or portions thereof, including modifications:
+
+The full text of this NOTICE in a location viewable to users of the redistributed or derivative work.
+Any pre-existing intellectual property disclaimers, notices, or terms and conditions. If none exist, the W3C Software Short Notice should be included (hypertext is preferred, text is permitted) within the body of any redistributed or derivative code.
+
+Notice of any changes or modifications to the files, including the date changes were made. (We recommend you provide URIs to the location from which the code is derived.)
+
+# Disclaimers
+
+THIS SOFTWARE AND DOCUMENTATION IS PROVIDED "AS IS," AND COPYRIGHT HOLDERS MAKE NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS.
+
+COPYRIGHT HOLDERS WILL NOT BE LIABLE FOR ANY DIRECT, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF ANY USE OF THE SOFTWARE OR DOCUMENTATION.
+
+The name and trademarks of copyright holders may NOT be used in advertising or publicity pertaining to the software without specific, written prior permission. Title to copyright in this software and any associated documentation will at all times remain with copyright holders.
+
+# Notes
+This version: http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
@@ -0,0 +1,374 @@
+package unicode_entity
+/*
+	A unicode entity encoder/decoder
+
+	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
+	Made available under Odin's BSD-3 license.
+
+	This code has several procedures to map unicode runes to/from different textual encodings.
+	- SGML/XML/HTML entity
+	-- &#<decimal>;
+	-- &#x<hexadecimal>;
+	-- &<entity name>;   (If the lookup tables are compiled in).
+	Reference: https://www.w3.org/2003/entities/2007xml/unicode.xml	
+
+	- URL encode / decode %hex entity
+	Reference: https://datatracker.ietf.org/doc/html/rfc3986/#section-2.1
+
+	List of contributors:
+		Jeroen van Rijn: Initial implementation.
+*/
+
+import "core:unicode/utf8"
+import "core:unicode"
+import "core:strings"
+
+// `unicode.MAX_RUNE` as an `int`. Numeric entities that exceed it are rejected by `xml_decode_entity`.
+MAX_RUNE_CODEPOINT :: int(unicode.MAX_RUNE)
+
+// Local shorthands for the `core:strings` builder write procedures used in this file.
+write_rune   :: strings.write_rune_builder
+write_string :: strings.write_string_builder
+
+/*
+	Errors reported by the entity decoder.
+*/
+Error :: enum u8 {
+	// No error.
+	None = 0,
+	// `advance` was handed a nil tokenizer.
+	Tokenizer_Is_Nil,
+
+	// Reported by `advance`: a NUL byte, a malformed UTF-8 sequence,
+	// or a byte order mark anywhere other than offset 0.
+	Illegal_NUL_Character,
+	Illegal_UTF_Encoding,
+	Illegal_BOM,
+
+	// Input ended before the closing `]]>` or `-->` was found.
+	CDATA_Not_Terminated,
+	Comment_Not_Terminated,
+	// An `&` was not followed by a closing `;` before the end of the input.
+	Invalid_Entity_Encoding,
+}
+
+/*
+	Minimal rune-at-a-time cursor over `src`, driven by `advance`.
+*/
+Tokenizer :: struct {
+	r:           rune,   // Rune read by the last `advance`; -1 once the end of `src` is reached.
+	w:           int,    // Width in bytes of `r`.
+
+	src:         string, // Text being decoded.
+	offset:      int,    // Byte offset of `r` within `src`.
+	read_offset: int,    // Byte offset at which the next `advance` will read.
+}
+
+// Markers that open and close a CDATA section.
+CDATA_START   :: "<![CDATA["
+CDATA_END     :: "]]>"
+
+// Markers that open and close a comment.
+COMMENT_START :: "<!--"
+COMMENT_END   :: "-->"
+
+/*
+	Options for `decode_xml`.
+
+	Default: entities are decoded, CDATA and comments are passed through unchanged.
+*/
+XML_Decode_Option :: enum u8 {
+	/*
+		Do not decode & entities. It decodes by default.
+		If given, overrides `Decode_CDATA`: entities are written back out literally.
+	*/
+	No_Entity_Decode,
+
+	/*
+		CDATA is unboxed: the `<![CDATA[` and `]]>` markers are removed and the content is kept.
+	*/
+	Unbox_CDATA,
+
+	/*
+		Unboxed CDATA is decoded as well.
+		Ignored if `.Unbox_CDATA` is not given.
+	*/
+	Decode_CDATA,
+
+	/*
+		Comments are stripped.
+	*/
+	Comment_Strip,
+}
+// Set of `XML_Decode_Option` flags, as accepted by `decode_xml`.
+XML_Decode_Options :: bit_set[XML_Decode_Option; u8]
+
+/*
+	Decode a string that may include SGML/XML/HTML entities.
+
+	Inputs:
+	- input:     The text to decode.
+	- options:   See `XML_Decode_Option`. By default entities are decoded and
+	             CDATA and comments are passed through unchanged.
+	- allocator: Used for the scratch builder and for the returned string.
+
+	Returns:
+	- decoded: The decoded text. Empty if `input` is empty or an error occurred.
+	- err:     `.None` on success, otherwise the error that stopped decoding.
+
+	The caller has to free the result.
+*/
+decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := context.allocator) -> (decoded: string, err: Error) {
+	context.allocator = allocator
+
+	l := len(input)
+	if l == 0 { return "", .None }
+
+	builder := strings.make_builder()
+	defer strings.destroy_builder(&builder)
+
+	t := Tokenizer{src=input}
+
+	/*
+		`in_data` is only ever true while we are inside a CDATA section that is
+		being unboxed _and_ decoded. Every other CDATA section and every comment
+		is consumed in one go by `_handle_xml_special`.
+	*/
+	in_data := false
+
+	loop: for {
+		advance(&t) or_return
+		if t.r < 0 { break loop }
+
+		switch t.r {
+		case '<':
+			/*
+				Might be the start of a CDATA tag or comment.
+
+				We don't need to check if we need to write a `<`, because if it isn't CDATA or a comment,
+				it couldn't have been part of an XML tag body to be decoded here.
+
+				Keep in mind that we could already *be* inside a CDATA tag.
+				If so, write `<` as a literal and continue.
+			*/
+			if in_data {
+				write_rune(&builder, '<')
+				continue
+			}
+			in_data = _handle_xml_special(&t, &builder, options) or_return
+
+		case ']':
+			/*
+				If we're unboxing _and_ decoding CDATA, we'll have to check for the end tag.
+				A `]` that doesn't start `]]>` is ordinary CDATA content and must be kept.
+			*/
+			if in_data {
+				if strings.has_prefix(t.src[t.offset:], CDATA_END) {
+					in_data = false
+					// Resume reading right after the `]]>`.
+					t.read_offset = t.offset + len(CDATA_END)
+				} else {
+					write_rune(&builder, ']')
+				}
+				continue
+			}
+			write_rune(&builder, ']')
+
+		case:
+			if in_data && .Decode_CDATA not_in options {
+				/*
+					Unboxed, but undecoded.
+				*/
+				write_rune(&builder, t.r)
+				continue
+			}
+
+			if t.r == '&' {
+				if entity, entity_err := _extract_xml_entity(&t); entity_err != .None {
+					/*
+						We read to the end of the string without closing the entity.
+						Pass through as-is. `entity` includes the leading `&` in this case.
+					*/
+					write_string(&builder, entity)
+				} else {
+
+					if .No_Entity_Decode not_in options {
+						if decoded_rune, ok := xml_decode_entity(entity); ok {
+							write_rune(&builder, decoded_rune)
+							continue
+						}
+					}
+
+					/*
+						Literal passthrough because the decode failed or we want entities not decoded.
+					*/
+					write_string(&builder, "&")
+					write_string(&builder, entity)
+					write_string(&builder, ";")
+				}
+			} else {
+				write_rune(&builder, t.r)
+			}
+		}
+	}
+
+	// The builder is destroyed on return, so hand the caller a copy.
+	return strings.clone(strings.to_string(builder), allocator), err
+}
+
+/*
+	Reads the next rune of `t.src` into `t.r`, recording its width in `t.w`.
+
+	At the end of the input `t.r` becomes -1 and `.None` is returned.
+	A NUL byte, a malformed UTF-8 sequence or a byte order mark past offset 0
+	returns the matching error without moving `t.read_offset` forward.
+*/
+advance :: proc(t: ^Tokenizer) -> (err: Error) {
+	if t == nil { return .Tokenizer_Is_Nil }
+
+	// End of input.
+	if t.read_offset >= len(t.src) {
+		t.offset = len(t.src)
+		t.r      = -1
+		return .None
+	}
+
+	t.offset = t.read_offset
+
+	// Assume a single-byte rune first; `read_offset` was bounds-checked above.
+	#no_bounds_check {
+		t.r = rune(t.src[t.read_offset])
+	}
+	t.w = 1
+
+	if t.r == 0 {
+		return .Illegal_NUL_Character
+	}
+
+	if t.r >= utf8.RUNE_SELF {
+		// Multi-byte sequence: decode it properly.
+		t.r, t.w = utf8.decode_rune_in_string(t.src[t.read_offset:])
+
+		if t.r == utf8.RUNE_ERROR && t.w == 1 {
+			return .Illegal_UTF_Encoding
+		}
+		if t.r == utf8.RUNE_BOM && t.offset > 0 {
+			return .Illegal_BOM
+		}
+	}
+
+	t.read_offset += t.w
+	return .None
+}
+
+/*
+	Decodes the body of an entity, i.e. the text between `&` and `;`.
+
+	- `#<decimal>` and `#x<hexadecimal>` (or `#X...`) are parsed as a code point.
+	  They are rejected if they have no digits, contain a digit that is invalid for the base,
+	  or exceed `MAX_RUNE_CODEPOINT`.
+	- Anything else is looked up via `named_xml_entity_to_rune`.
+
+	Returns the decoded rune and `true`, or `-1, false` for a rejected numeric entity.
+*/
+xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
+	entity := entity
+	if len(entity) == 0 { return -1, false }
+
+	switch entity[0] {
+	case '#':
+		base  := 10
+		val   := 0
+		entity = entity[1:]
+
+		if len(entity) == 0 { return -1, false }
+
+		if entity[0] == 'x' || entity[0] == 'X' {
+			base = 16
+			entity = entity[1:]
+
+			// `&#x;` has no digits. Without this check it would decode to rune 0.
+			if len(entity) == 0 { return -1, false }
+		}
+
+		for len(entity) > 0 {
+			r := entity[0]
+			switch r {
+			case '0'..'9':
+				val *= base
+				val += int(r - '0')
+
+			case 'a'..'f':
+				if base == 10 { return -1, false }
+				val *= base
+				val += int(r - 'a' + 10)
+
+			case 'A'..'F':
+				if base == 10 { return -1, false }
+				val *= base
+				val += int(r - 'A' + 10)
+
+			case:
+				return -1, false
+			}
+
+			// Checked on every digit, so `val` can't grow far enough to overflow.
+			if val > MAX_RUNE_CODEPOINT { return -1, false }
+			entity = entity[1:]
+		}
+		return rune(val), true
+
+	case:
+		/*
+			Named entity.
+		*/
+		return named_xml_entity_to_rune(entity)
+	}
+}
+
+/*
+	Private XML helper to extract an `&<stuff>;` entity.
+
+	Expects the tokenizer to sit on the `&`. Consumes bytes up to and including the next `;`
+	and returns the text between the two. If the input ends first, returns everything from
+	the `&` onward together with `.Invalid_Entity_Encoding`.
+
+	The scan is byte-wise: `;` is ASCII, so it doesn't matter whether the bytes in between are.
+*/
+@(private="file")
+_extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
+	assert(t != nil && t.r == '&')
+
+	start := t.offset
+
+	for t.read_offset < len(t.src) {
+		c := t.src[t.read_offset]
+		t.read_offset += 1
+
+		if c == ';' {
+			// Leave out the `&` and the `;`.
+			return string(t.src[start + 1 : t.read_offset - 1]), .None
+		}
+	}
+
+	// Ran off the end without finding a `;`.
+	return string(t.src[start : t.read_offset]), .Invalid_Entity_Encoding
+}
+
+/*
+	Private XML helper for CDATA and comments.
+
+	Expects the tokenizer to sit on a `<`.
+
+	- If a CDATA section starts here and both `.Unbox_CDATA` and `.Decode_CDATA` are set,
+	  the opening marker is skipped and `in_data = true` is returned, so that the caller
+	  decodes the content.
+	- Any other CDATA section is consumed whole and written to `builder`,
+	  without its markers if `.Unbox_CDATA` is set.
+	- A comment is consumed whole and written to `builder` unless `.Comment_Strip` is set.
+	- Anything else: nothing is consumed or written.
+
+	Returns `.CDATA_Not_Terminated` / `.Comment_Not_Terminated` if the input ends
+	before the closing marker, or whatever error `advance` reports while scanning.
+*/
+@(private="file")
+_handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: XML_Decode_Options) -> (in_data: bool, err: Error) {
+	assert(t != nil && t.r == '<')
+
+	/*
+		`has_prefix` does its own length check, so markers are also
+		recognized when they sit at the very end of the input.
+	*/
+	rest := t.src[t.offset:]
+
+	if strings.has_prefix(rest, CDATA_START) {
+		offset := t.offset
+
+		// Continue reading right after `<![CDATA[`.
+		t.read_offset = offset + len(CDATA_START)
+
+		if .Unbox_CDATA in options && .Decode_CDATA in options {
+			/*
+				We're unboxing _and_ decoding CDATA
+			*/
+			return true, .None
+		}
+
+		/*
+			CDATA is passed through. Scan until end of CDATA.
+		*/
+		for {
+			advance(t) or_return
+			if t.r < 0 { return true, .CDATA_Not_Terminated }
+
+			if strings.has_prefix(t.src[t.offset:], CDATA_END) {
+				t.read_offset = t.offset + len(CDATA_END)
+
+				cdata := string(t.src[offset : t.read_offset])
+
+				if .Unbox_CDATA in options {
+					cdata = cdata[len(CDATA_START):]
+					cdata = cdata[:len(cdata) - len(CDATA_END)]
+				}
+
+				write_string(builder, cdata)
+				return false, .None
+			}
+		}
+
+	} else if strings.has_prefix(rest, COMMENT_START) {
+		/*
+			Comment is passed through by default.
+		*/
+		offset := t.offset
+
+		/*
+			Continue reading right after `<!--`. Skipping any further could land
+			in the middle of a multi-byte rune or step over the start of `-->`.
+		*/
+		t.read_offset = offset + len(COMMENT_START)
+
+		/*
+			Scan until end of Comment.
+		*/
+		for {
+			advance(t) or_return
+			if t.r < 0 { return true, .Comment_Not_Terminated }
+
+			if strings.has_prefix(t.src[t.offset:], COMMENT_END) {
+				t.read_offset = t.offset + len(COMMENT_END)
+
+				if .Comment_Strip not_in options {
+					comment := string(t.src[offset : t.read_offset])
+					write_string(builder, comment)
+				}
+				return false, .None
+			}
+		}
+
+	}
+	return false, .None
+}
@@ -0,0 +1,76 @@
+package unicode_entity_example
+
+import "core:encoding/xml"
+import "core:strings"
+import "core:mem"
+import "core:fmt"
+import "core:time"
+
+/*
+	Renders `doc` with `xml.print` into a temporary string builder and prints the result.
+*/
+doc_print :: proc(doc: ^xml.Document) {
+	sb: strings.Builder
+	defer strings.destroy_builder(&sb)
+
+	xml.print(strings.to_writer(&sb), doc)
+	fmt.println(strings.to_string(sb))
+}
+
+/*
+	Parses the embedded `unicode.xml` test asset, prints the resulting document,
+	and reports how long the parse took along with the error it returned.
+*/
+_entities :: proc() {
+	DOC     :: #load("../../../../tests/core/assets/XML/unicode.xml")
+	OPTIONS :: xml.Options{
+		flags            = {.Ignore_Unsupported, .Intern_Comments},
+		expected_doctype = "",
+	}
+
+	doc:     ^xml.Document
+	err:     xml.Error
+	elapsed: time.Duration
+
+	{
+		// Times only this scope, i.e. just the parse.
+		time.SCOPED_TICK_DURATION(&elapsed)
+		doc, err = xml.parse(DOC, OPTIONS)
+	}
+	defer xml.destroy(doc)
+
+	doc_print(doc)
+
+	ms    := time.duration_milliseconds(elapsed)
+	speed := (f64(1000.0) / ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
+
+	fmt.printf("Parse time: %.2f ms (%.2f MiB/s).\n", ms, speed)
+	fmt.printf("Error: %v\n", err)
+}
+
+/*
+	Parses the embedded `test.html` with CDATA unboxing and SGML entity decoding
+	enabled, then prints the document. The parse error is ignored.
+*/
+_main :: proc() {
+	doc, _ := xml.parse(#load("test.html"), xml.Options{
+		flags = {.Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA, .Decode_SGML_Entities},
+	})
+	defer xml.destroy(doc)
+
+	doc_print(doc)
+}
+
+/*
+	Runs the entity example under a tracking allocator and lists any
+	allocations that are still live afterwards.
+*/
+main :: proc() {
+	track: mem.Tracking_Allocator
+	mem.tracking_allocator_init(&track, context.allocator)
+	context.allocator = mem.tracking_allocator(&track)
+
+	// _main()
+	_entities()
+
+	if len(track.allocation_map) > 0 {
+		fmt.println()
+		for _, leak in track.allocation_map {
+			fmt.printf("%v Leaked %v bytes.\n", leak.location, leak.size)
+		}
+	}
+}
@@ -0,0 +1,28 @@
+<html>
+	<head>
+		<title>Entity Reference Test</title>
+		<style>
+			body {
+				background: #000; color: #eee;
+				width: 40%;
+				margin-left:  auto;
+				margin-right: auto;
+				font-size: 14pt;
+			}
+		</style>
+	</head>
+	<body>
+		<h1>Entity Reference Test</h1>
+		<div id="test_cdata_in_comment" foo="">
+			Foozle]!&#32;&copy;&#x20;<!-- <![CDATA[&#32;&reg;&#x20;]]> -->42&;1234&
+		</div>
+		<!-- EXPECTED: Foozle]! © 42&;1234& -->
+		<div id="test_cdata_unwrap_and_passthrough">
+			Foozle]!&#32;&copy;&#x20;<![CDATA[BOX&#32;&reg;&#x20;/BOX]]>42&;1234&
+		</div>
+		<!-- EXPECTED: Foozle]! © BOX ® /BOX42&;1234& -->
+		<div>
+			&verbar; &vert; &VerticalLine; &fjlig; &grave; &bsol; &reg; &rhov; &CounterClockwiseContourIntegral; &bsemi;
+		</div>
+	</body>
+</html>
@@ -0,0 +1,86 @@
+/*
+	An XML 1.0 / 1.1 parser
+
+	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
+	Made available under Odin's BSD-3 license.
+
+	A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
+
+	List of contributors:
+		Jeroen van Rijn: Initial implementation.
+*/
+package xml
+
+import "core:io"
+import "core:fmt"
+
+/*
+	Just for debug purposes.
+
+	Writes a human-readable dump of `doc` to `writer`: the prolog attributes, the encoding,
+	the doctype (if any), the pre-root comments, and the element tree starting at element 0.
+
+	Returns the total of the counts reported by the individual write calls,
+	including those made by `print_element`. Does nothing if `doc` is nil.
+*/
+print :: proc(writer: io.Writer, doc: ^Document) -> (written: int, err: io.Error) {
+	if doc == nil { return }
+	using fmt
+
+	written += wprintf(writer, "[XML Prolog]\n")
+
+	for attr in doc.prolog {
+		written += wprintf(writer, "\t%v: %v\n", attr.key, attr.val)
+	}
+
+	written += wprintf(writer, "[Encoding] %v\n", doc.encoding)
+
+	if len(doc.doctype.ident) > 0 {
+		written += wprintf(writer, "[DOCTYPE]  %v\n", doc.doctype.ident)
+
+		if len(doc.doctype.rest) > 0 {
+			written += wprintf(writer, "\t%v\n", doc.doctype.rest)
+		}
+	}
+
+	for comment in doc.comments {
+		written += wprintf(writer, "[Pre-root comment]  %v\n", comment)
+	}
+
+	if len(doc.elements) > 0 {
+		written += wprintln(writer, " --- ")
+
+		// Element 0 is the root; its subtree is printed recursively.
+		n := print_element(writer, doc, 0) or_return
+		written += n
+
+		written += wprintln(writer, " --- ")
+	}
+
+	return written, .None
+}
+
+/*
+	Writes element `element_id` of `doc` and, recursively, all of its children to `writer`.
+
+	Each line is indented with `indent + 1` tabs; values, attributes and children sit one level deeper.
+	Elements are printed as `<ident>`, comments as `[COMMENT] ...`.
+
+	Returns the total of the counts reported by the individual write calls, including the recursive ones.
+*/
+print_element :: proc(writer: io.Writer, doc: ^Document, element_id: Element_ID, indent := 0) -> (written: int, err: io.Error) {
+	// Writes `indent + 1` tabs and returns the summed write counts.
+	tab :: proc(writer: io.Writer, indent: int) -> (n: int) {
+		for _ in 0..=indent {
+			n += fmt.wprintf(writer, "\t")
+		}
+		return
+	}
+
+	written += tab(writer, indent)
+
+	element := doc.elements[element_id]
+
+	if element.kind == .Element {
+		written += fmt.wprintf(writer, "<%v>\n", element.ident)
+		if len(element.value) > 0 {
+			written += tab(writer, indent + 1)
+			written += fmt.wprintf(writer, "[Value] %v\n", element.value)
+		}
+
+		for attr in element.attribs {
+			written += tab(writer, indent + 1)
+			written += fmt.wprintf(writer, "[Attr] %v: %v\n", attr.key, attr.val)
+		}
+
+		for child in element.children {
+			n := print_element(writer, doc, child, indent + 1) or_return
+			written += n
+		}
+	} else if element.kind == .Comment {
+		written += fmt.wprintf(writer, "[COMMENT] %v\n", element.value)
+	}
+
+	return written, .None
+}
@@ -0,0 +1,112 @@
+package xml_example
+
+import "core:encoding/xml"
+import "core:mem"
+import "core:fmt"
+import "core:time"
+import "core:strings"
+import "core:hash"
+
+N :: 1
+
+/*
+	Parses the embedded `unicode.xml` test asset `N` times, prints the fastest, slowest and
+	average parse times, then locates the top-level `<charlist>` element and compares a
+	CRC32 of the printed document against a known value.
+
+	Every parsed document is destroyed exactly once, by the deferred loop below.
+*/
+example :: proc() {
+	using fmt
+
+	docs:  [N]^xml.Document
+	errs:  [N]xml.Error
+	times: [N]time.Duration
+
+	// Single cleanup point; runs on every return path.
+	defer for round in 0..<N {
+		xml.destroy(docs[round])
+	}
+
+	DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")
+	input := DOC
+
+	for round in 0..<N {
+		start := time.tick_now()
+
+		docs[round], errs[round] = xml.parse(input, xml.Options{
+			flags={.Ignore_Unsupported},
+			expected_doctype = "",
+		})
+
+		end   := time.tick_now()
+		times[round] = time.tick_diff(start, end)
+	}
+
+	fastest := time.Duration(max(i64))
+	slowest := time.Duration(0)
+	total   := time.Duration(0)
+
+	for round in 0..<N {
+		fastest = min(fastest, times[round])
+		slowest = max(slowest, times[round])
+		total  += times[round]
+	}
+
+	fastest_ms := time.duration_milliseconds(fastest)
+	slowest_ms := time.duration_milliseconds(slowest)
+	average_ms := time.duration_milliseconds(time.Duration(f64(total) / f64(N)))
+
+	fastest_speed := (f64(1000.0) / fastest_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
+	slowest_speed := (f64(1000.0) / slowest_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
+	average_speed := (f64(1000.0) / average_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
+
+	fmt.printf("N = %v\n", N)
+	fmt.printf("[Fastest]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), fastest_ms, fastest_speed)
+	fmt.printf("[Slowest]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), slowest_ms, slowest_speed)
+	fmt.printf("[Average]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), average_ms, average_speed)
+
+	if errs[0] != .None {
+		printf("Load/Parse error: %v\n", errs[0])
+		if errs[0] == .File_Error {
+			println("\"unicode.xml\" not found. Did you run \"tests\\download_assets.py\"?")
+		}
+		return
+	}
+
+	charlist, charlist_ok := xml.find_child_by_ident(docs[0], 0, "charlist")
+	if !charlist_ok {
+		eprintln("Could not locate top-level `<charlist>` tag.")
+		return
+	}
+
+	printf("Found `<charlist>` with %v children, %v elements total\n", len(docs[0].elements[charlist].children), docs[0].element_count)
+
+	crc32 := doc_hash(docs[0])
+	printf("[%v] CRC32: 0x%08x\n", "🎉" if crc32 == 0xcaa042b9 else "🤬", crc32)
+
+	/*
+		No explicit destroy here: the deferred loop at the top already destroys
+		every document on return. Destroying them here too would free each one twice.
+	*/
+}
+
+/*
+	Renders `doc` with `xml.print` and returns the CRC32 of the rendered text.
+	If `print` is true, the rendered text is also written to stdout.
+*/
+doc_hash :: proc(doc: ^xml.Document, print := false) -> (crc32: u32) {
+	sb: strings.Builder
+	defer strings.destroy_builder(&sb)
+
+	xml.print(strings.to_writer(&sb), doc)
+
+	rendered := strings.to_string(sb)
+	if print {
+		fmt.println(rendered)
+	}
+
+	// Hash before the deferred destroy releases the builder's buffer.
+	crc32 = hash.crc32(transmute([]u8)rendered)
+	return
+}
+
+/*
+	Runs `example` under a tracking allocator, lists any allocations that
+	are still live afterwards, and prints a closing message.
+*/
+main :: proc() {
+	track: mem.Tracking_Allocator
+	mem.tracking_allocator_init(&track, context.allocator)
+	context.allocator = mem.tracking_allocator(&track)
+
+	example()
+
+	if len(track.allocation_map) > 0 {
+		fmt.println()
+		for _, leak in track.allocation_map {
+			fmt.printf("%v Leaked %v bytes.\n", leak.location, leak.size)
+		}
+	}
+	fmt.println("Done and cleaned up!")
+}
@@ -0,0 +1,45 @@
+/*
+	An XML 1.0 / 1.1 parser
+
+	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
+	Made available under Odin's BSD-3 license.
+
+	This file contains helper functions.
+*/
+package xml
+
+// Find parent's nth child with a given ident (`nth` is zero-based).
+// Returns `0, false` if there is no such child.
+find_child_by_ident :: proc(doc: ^Document, parent_id: Element_ID, ident: string, nth := 0) -> (res: Element_ID, found: bool) {
+	matches := 0
+
+	for child_id in doc.elements[parent_id].children {
+		child := doc.elements[child_id]
+
+		/*
+			Only elements are considered, so comments are skipped,
+			as is any element with a different ident.
+		*/
+		if child.kind != .Element || child.ident != ident { continue }
+
+		/*
+			The ident matches. If it's the nth such child, return it.
+		*/
+		if matches == nth { return child_id, true }
+		matches += 1
+	}
+	return 0, false
+}
+
+// Find an attribute of element `parent_id` by key and return its value.
+// Returns `"", false` if the element has no attribute with that key.
+find_attribute_val_by_key :: proc(doc: ^Document, parent_id: Element_ID, key: string) -> (val: string, found: bool) {
+	for attr in doc.elements[parent_id].attribs {
+		if attr.key != key { continue }
+
+		/*
+			The first match is returned; later attributes are not examined.
+		*/
+		return attr.val, true
+	}
+	return "", false
+}
@@ -0,0 +1,436 @@
+/*
+	An XML 1.0 / 1.1 parser
+
+	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
+	Made available under Odin's BSD-3 license.
+
+	A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
+
+	List of contributors:
+		Jeroen van Rijn: Initial implementation.
+*/
+package xml
+
+import "core:fmt"
+import "core:unicode"
+import "core:unicode/utf8"
+
+// Callback invoked by `error` with the position of the problem, a format string and its arguments.
+Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)
+
+// A single token as produced by `scan`.
+Token :: struct {
+	kind: Token_Kind, // What sort of token this is.
+	text: string,     // The token's text. For `.String` tokens, the scanned string value.
+	pos:  Pos,        // Position at which the token starts.
+}
+
+// A position in the source, as computed by `offset_to_pos`.
+Pos :: struct {
+	file:   string,
+	offset: int, // starting at 0
+	line:   int, // starting at 1
+	column: int, // starting at 1
+}
+
+/*
+	Kinds of token.
+
+	NOTE(review): `scan` in this file only ever produces `Invalid`, `Ident`, `String`, `Colon`, `Eq`, `Lt`,
+	`Gt`, `Exclaim`, `Question`, `Hash`, `Slash`, `Dash` and `EOF`. Whether the remaining kinds are produced
+	elsewhere can't be seen from here.
+*/
+Token_Kind :: enum {
+	Invalid,
+
+	Ident,
+	Literal,
+	Rune,
+	String,
+
+	Double_Quote,  // "
+	Single_Quote,  // '
+	Colon,         // :
+
+	Eq,            // =
+	Lt,            // <
+	Gt,            // >
+	Exclaim,       // !
+	Question,      // ?
+	Hash,          // #
+	Slash,         // /
+	Dash,          // -
+
+	Open_Bracket,  // [
+	Close_Bracket, // ]
+
+	EOF,
+}
+
+// Markers that open and close a CDATA section.
+CDATA_START   :: "<![CDATA["
+CDATA_END     :: "]]>"
+
+// Markers that open and close a comment.
+COMMENT_START :: "<!--"
+COMMENT_END   :: "-->"
+
+// Tokenizer state. Set up with `init`, advanced with `scan`.
+Tokenizer :: struct {
+	// Immutable data
+	path: string,        // Reported as the file name in error positions.
+	src:  string,        // The text being tokenized.
+	err:  Error_Handler, // Called for every reported error; may be nil.
+
+	// Tokenizing state
+	ch:          rune,   // Current rune; -1 once the end of `src` has been reached.
+	offset:      int,    // Byte offset of `ch`.
+	read_offset: int,    // Byte offset at which the next rune will be read.
+	line_offset: int,    // Byte offset at which the current line starts.
+	line_count:  int,    // Current line number, starting at 1 (0 for empty input).
+
+	// Mutable data
+	error_count: int,    // Number of errors reported through `error`.
+}
+
+/*
+	Resets `t` to tokenize `src`, reads the first rune and skips a leading byte order mark.
+
+	`path` is only used for error positions, and `err` is the callback that receives errors.
+*/
+init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
+	// Every field not named here starts out as zero.
+	t^ = Tokenizer{
+		path       = path,
+		src        = src,
+		err        = err,
+		ch         = ' ',
+		line_count = 1 if len(src) > 0 else 0,
+	}
+
+	advance_rune(t)
+	if t.ch == utf8.RUNE_BOM {
+		advance_rune(t)
+	}
+}
+
+/*
+	Builds a `Pos` for byte `offset`.
+
+	The line number is always the tokenizer's current line, and the column is
+	measured from the start of that line, whatever `offset` is passed in.
+*/
+@(private)
+offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
+	return Pos{
+		file   = t.path,
+		offset = offset,
+		line   = t.line_count,
+		column = offset - t.line_offset + 1,
+	}
+}
+
+/*
+	Default `Error_Handler`: prints `file(line:column) <formatted message>` and a newline to stderr.
+*/
+default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
+	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
+	fmt.eprintf(msg, ..args)
+	fmt.eprintln()
+}
+
+/*
+	Reports an error at byte `offset` to the tokenizer's handler, if it has one,
+	and increments `t.error_count` either way.
+*/
+error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
+	where := offset_to_pos(t, offset)
+
+	if handler := t.err; handler != nil {
+		handler(where, msg, ..args)
+	}
+	t.error_count += 1
+}
+
+/*
+	Reads the next rune of `src` into `ch`, keeping `offset`, `read_offset` and the line bookkeeping up to date.
+
+	At the end of the input `ch` becomes -1. A NUL byte, a malformed UTF-8 sequence and a byte order mark
+	past offset 0 are reported through `error`, but scanning continues.
+*/
+@(optimization_mode="speed")
+advance_rune :: proc(using t: ^Tokenizer) {
+	#no_bounds_check {
+		/*
+			Already bounds-checked here.
+		*/
+		if read_offset < len(src) {
+			offset = read_offset
+			// The previous rune was a newline, so a new line starts at this offset.
+			if ch == '\n' {
+				line_offset = offset
+				line_count += 1
+			}
+			// Assume a single-byte rune first.
+			r, w := rune(src[read_offset]), 1
+			switch {
+			case r == 0:
+				error(t, t.offset, "illegal character NUL")
+			case r >= utf8.RUNE_SELF:
+				// Multi-byte sequence: decode it properly.
+				r, w = #force_inline utf8.decode_rune_in_string(src[read_offset:])
+				if r == utf8.RUNE_ERROR && w == 1 {
+					error(t, t.offset, "illegal UTF-8 encoding")
+				} else if r == utf8.RUNE_BOM && offset > 0 {
+					error(t, t.offset, "illegal byte order mark")
+				}
+			}
+			read_offset += w
+			ch = r
+		} else {
+			// End of input.
+			offset = len(src)
+			if ch == '\n' {
+				line_offset = offset
+				line_count += 1
+			}
+			ch = -1
+		}
+	}
+}
+
+/*
+	Returns the byte `offset` positions past the next unread byte without consuming anything,
+	or 0 if that position lies beyond the end of the input.
+*/
+peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
+	idx := t.read_offset + offset
+	if idx >= len(t.src) {
+		return 0
+	}
+	#no_bounds_check return t.src[idx]
+}
+
+/*
+	Advances past any run of spaces, tabs, carriage returns and newlines.
+*/
+@(optimization_mode="speed")
+skip_whitespace :: proc(t: ^Tokenizer) {
+	for t.ch == ' ' || t.ch == '\t' || t.ch == '\r' || t.ch == '\n' {
+		advance_rune(t)
+	}
+}
+
+/*
+	Reports whether `r` can start an identifier: `_`, an ASCII letter,
+	or anything `unicode.is_letter` accepts.
+*/
+@(optimization_mode="speed")
+is_letter :: proc(r: rune) -> bool {
+	// ASCII fast path.
+	switch r {
+	case '_', 'A'..='Z', 'a'..='z':
+		return true
+	}
+	return unicode.is_letter(r)
+}
+
+/*
+	Reports whether `r` may appear inside an identifier: `_`, `-`, `:`, an ASCII letter or digit,
+	or anything `unicode.is_letter` or `unicode.is_digit` accepts. The end-of-input marker (-1) never qualifies.
+*/
+is_valid_identifier_rune :: proc(r: rune) -> bool {
+	// ASCII fast path, plus the end-of-input marker.
+	switch r {
+	case '_', '-', ':', 'A'..='Z', 'a'..='z', '0'..='9':
+		return true
+	case -1:
+		return false
+	}
+	return unicode.is_letter(r) || unicode.is_digit(r)
+}
+
+/*
+	Consumes an identifier starting at the current rune and returns its text.
+
+	Scanning stops at the first rune that isn't valid in an identifier, or at a second `:`.
+*/
+scan_identifier :: proc(t: ^Tokenizer) -> string {
+	start      := t.offset
+	seen_colon := false
+
+	for is_valid_identifier_rune(t.ch) {
+		advance_rune(t)
+
+		if t.ch != ':' { continue }
+
+		/*
+			A namespaced attr can have at most two parts, `namespace:ident`.
+		*/
+		if seen_colon { break }
+		seen_colon = true
+	}
+	return string(t.src[start : t.offset])
+}
+
+/*
+	A comment ends when we see -->, preceded by a character that's not a dash.
+	"For compatibility, the string "--" (double-hyphen) must not occur within comments."
+
+	See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment
+
+	Thanks to the length (4) of the comment start, we also have enough lookback,
+	and the peek at the next byte asserts that there's at least one more character
+	that's a `>`.
+
+	Returns the source text from the tokenizer's offset on entry up to the closing marker.
+	Returns `.Unclosed_Comment` if the input ends first, and `.Invalid_Sequence_In_Comment`
+	for a `--` that isn't followed by `>`.
+*/
+scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
+	offset := t.offset
+
+	for {
+		advance_rune(t)
+		ch := t.ch
+
+		if ch < 0 {
+			error(t, offset, "[parse] Comment was not terminated\n")
+			return "", .Unclosed_Comment
+		}
+
+		// Look at the previous byte together with the current one.
+		if string(t.src[t.offset - 1:][:2]) == "--" {
+			if peek_byte(t) == '>' {
+				break
+			} else {
+				error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
+				return "", .Invalid_Sequence_In_Comment
+			}
+		}
+	}
+
+	// NOTE(review): `expect` is defined elsewhere; presumably this consumes the closing `->`.
+	expect(t, .Dash)
+	expect(t, .Gt)
+
+	return string(t.src[offset : t.offset - 1]), .None
+}
+
+/*
+	Skip CDATA
+
+	Expects the tokenizer to sit on a `<`. If a CDATA section starts there, everything up to
+	and including its `]]>` is consumed, so that the next rune read is the one right after it.
+	If no CDATA section starts there, nothing is consumed.
+
+	Returns `.Premature_EOF` if the input ends before the closing `]]>`.
+*/
+skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
+	rest := t.src[t.offset:]
+	if len(rest) < len(CDATA_START) || rest[:len(CDATA_START)] != CDATA_START {
+		/*
+			Can't be the start of a CDATA tag.
+		*/
+		return .None
+	}
+
+	offset := t.offset
+
+	/*
+		Continue reading right after `<![CDATA[`. Skipping any further would step over the
+		first byte of the content, which breaks empty CDATA and multi-byte first characters.
+	*/
+	t.read_offset = offset + len(CDATA_START)
+
+	cdata_scan: for {
+		advance_rune(t)
+		if t.ch < 0 {
+			error(t, offset, "[scan_string] CDATA was not terminated\n")
+			return .Premature_EOF
+		}
+
+		/*
+			Scan until the end of a CDATA tag.
+		*/
+		rest = t.src[t.offset:]
+		if len(rest) >= len(CDATA_END) && rest[:len(CDATA_END)] == CDATA_END {
+			/*
+				Continue reading right after `]]>`, so that the character that follows it,
+				often the `<` of the closing tag, is not skipped.
+			*/
+			t.read_offset = t.offset + len(CDATA_END)
+			break cdata_scan
+		}
+	}
+	return
+}
+
+/*
+	Scans from the current rune up to, but not including, the `close` rune and returns the
+	source text from `offset` to that point with trailing whitespace removed.
+
+	- CDATA sections and comments that are encountered are skipped over as a whole.
+	- If `multiline` is false, a newline ends the scan with `.Invalid_Tag_Value`.
+	- If `consume_close` is true, the tokenizer is advanced once more after the scan ends.
+
+	Returns `.Premature_EOF` if the input ends first, or the error from skipping CDATA or a comment.
+*/
+@(optimization_mode="speed")
+scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) {
+	err = .None
+
+	loop: for {
+		ch := t.ch
+
+		switch ch {
+		case -1:
+			error(t, t.offset, "[scan_string] Premature end of file.\n")
+			return "", .Premature_EOF
+
+		case '<':
+			if peek_byte(t) == '!' {
+				if peek_byte(t, 1) == '[' {
+					/*
+						Might be the start of a CDATA tag.
+					*/
+					skip_cdata(t) or_return
+				} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
+					/*
+						Comment start. Eat comment.
+					*/
+					t.read_offset += 3
+					_ = scan_comment(t) or_return
+				}
+			}
+
+		case '\n':
+			if !multiline {
+				/*
+					Pass the text as an argument rather than as the format string,
+					so that a `%` in the source isn't interpreted as a format verb.
+				*/
+				error(t, offset, "%s", string(t.src[offset : t.offset]))
+				error(t, offset, "[scan_string] Not terminated\n")
+				err = .Invalid_Tag_Value
+				break loop
+			}
+		}
+
+		if t.ch == close {
+			/*
+				If it's not a CDATA or comment, it's the end of this body.
+			*/
+			break loop
+		}
+		advance_rune(t)
+	}
+
+	/*
+		Strip trailing whitespace.
+	*/
+	lit := string(t.src[offset : t.offset])
+
+	end := len(lit)
+	eat: for ; end > 0; end -= 1 {
+		ch := lit[end - 1]
+		switch ch {
+		case ' ', '\t', '\r', '\n':
+		case:
+			break eat
+		}
+	}
+	lit = lit[:end]
+
+	if consume_close {
+		advance_rune(t)
+	}
+
+	/*
+		TODO: Handle decoding escape characters and unboxing CDATA.
+	*/
+
+	return lit, err
+}
+
+/*
+	Returns the next token without consuming it: the complete tokenizer state,
+	error count included, is restored after scanning.
+*/
+peek :: proc(t: ^Tokenizer) -> (token: Token) {
+	saved := t^
+	defer t^ = saved
+
+	return scan(t)
+}
+
+/*
+	Skips whitespace and returns the next token.
+
+	Identifiers become `.Ident`, quoted strings `.String` (or `.Invalid` if the string could not be scanned),
+	the punctuation handled below gets its own kind, the end of the input is `.EOF`,
+	and everything else is `.Invalid`.
+*/
+scan :: proc(t: ^Tokenizer) -> Token {
+	skip_whitespace(t)
+
+	offset := t.offset
+
+	kind: Token_Kind
+	err:  Error
+	lit:  string
+	// The token's position is that of its first rune.
+	pos := offset_to_pos(t, offset)
+
+	switch ch := t.ch; true {
+	case is_letter(ch):
+		lit = scan_identifier(t)
+		kind = .Ident
+
+	case:
+		// Consume the rune; `ch` still holds it for the switch below.
+		advance_rune(t)
+		switch ch {
+		case -1:
+			kind = .EOF
+
+		case '<': kind = .Lt
+		case '>': kind = .Gt
+		case '!': kind = .Exclaim
+		case '?': kind = .Question
+		case '=': kind = .Eq
+		case '#': kind = .Hash
+		case '/': kind = .Slash
+		case '-': kind = .Dash
+		case ':': kind = .Colon
+
+		case '"', '\'':
+			kind = .Invalid
+
+			// Single-line string; the closing quote is the same rune as the opening one and is consumed.
+			lit, err = scan_string(t, t.offset, ch, true, false)
+			if err == .None {
+				kind = .String
+			}
+
+		case '\n':
+			lit = "\n"
+
+		case:
+			kind = .Invalid
+		}
+	}
+
+	// Anything that isn't a string and has no text yet gets the source text it spans.
+	if kind != .String && lit == "" {
+		lit = string(t.src[offset : t.offset])
+	}
+	return Token{kind, lit, pos}
+}
@@ -0,0 +1,709 @@
+/*
+	An XML 1.0 / 1.1 parser
+
+	Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
+	Made available under Odin's BSD-3 license.
+
+	A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
+
+	Features:
+		- Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
+		- Simple to understand and use. Small.
+
+	Caveats:
+		- We do NOT support HTML in this package, as that may or may not be valid XML.
+		  If it works, great. If it doesn't, that's not considered a bug.
+
+		- We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
+		- <!ELEMENT and <!ATTLIST are not supported, and will either be ignored or cause an error to be returned, depending on the parser options.
+
+	MAYBE:
+	- XML writer?
+	- Serialize/deserialize Odin types?
+
+	List of contributors:
+		Jeroen van Rijn: Initial implementation.
+*/
+package xml
+// An XML 1.0 / 1.1 parser
+
+import "core:bytes"
+import "core:encoding/entity"
+import "core:intrinsics"
+import "core:mem"
+import "core:os"
+import "core:strings"
+
+/*
+	Alias for `intrinsics.expect`; used by the parser on its most common branch.
+*/
+likely :: intrinsics.expect
+
+/*
+	The options the `parse_*` procedures use when the caller doesn't pass any:
+	unsupported constructs are ignored and no particular DOCTYPE is expected.
+*/
+DEFAULT_Options :: Options{
+	flags            = {
+		.Ignore_Unsupported,
+	},
+	expected_doctype = "",
+}
+
+/*
+	Individual switches that change how a document is parsed.
+	They are combined into an `Option_Flags` set and passed in `Options.flags`.
+*/
+Option_Flag :: enum {
+	/*
+		If the caller says that input may be modified, we can perform in-situ parsing.
+		If this flag isn't provided, the XML parser first duplicates the input so that it can.
+	*/
+	Input_May_Be_Modified,
+
+	/*
+		Document MUST start with `<?xml` prolog.
+	*/
+	Must_Have_Prolog,
+
+	/*
+		Document MUST have a `<!DOCTYPE`.
+	*/
+	Must_Have_DocType,
+
+	/*
+		By default we skip comments. Use this option to intern a comment on a parented Element.
+	*/
+	Intern_Comments,
+
+	/*
+		How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
+
+		These two are mutually exclusive: `validate_options` returns `.Conflicting_Options` if both are set.
+	*/
+	Error_on_Unsupported,
+	Ignore_Unsupported,
+
+	/*
+		By default CDATA tags are passed-through as-is.
+		This option unwraps them when encountered.
+	*/
+	Unbox_CDATA,
+
+	/*
+		By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
+		This option decodes them when encountered.
+	*/
+	Decode_SGML_Entities,
+
+	/*
+		If a tag body has a comment, it will be stripped unless this option is given.
+	*/
+	Keep_Tag_Body_Comments,
+
+}
+/*
+	A set of `Option_Flag`s, backed by a `u16`.
+*/
+Option_Flags :: bit_set[Option_Flag; u16]
+
+/*
+	A parsed XML document, as returned by the `parse_*` procedures and released with `destroy`.
+*/
+Document :: struct {
+	/*
+		Flat element storage, indexed by `Element_ID`.
+		While parsing, the array is kept larger than needed and `element_count` tracks how many entries are in use.
+	*/
+	elements:      [dynamic]Element,
+	element_count: Element_ID,
+
+	/*
+		The attributes of the `<?xml ... ?>` prolog, and the encoding derived from its `encoding` attribute.
+	*/
+	prolog:   Attributes,
+	encoding: Encoding,
+
+	doctype: struct {
+		/*
+			We only scan the <!DOCTYPE IDENT part and skip the rest.
+		*/
+		ident:   string,
+		rest:    string,
+	},
+
+	/*
+		If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
+		Otherwise they'll be in the element tree.
+	*/
+	comments: [dynamic]string,
+
+	/*
+		Internal
+	*/
+	tokenizer: ^Tokenizer,
+	allocator: mem.Allocator,
+
+	/*
+		Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
+	*/
+	input:           []u8,
+	/*
+		Decoded tag bodies that were allocated separately from `input`; `destroy` deletes each of them.
+	*/
+	strings_to_free: [dynamic]string,
+}
+
+/*
+	One node of the document: either a regular element or an interned comment.
+*/
+Element :: struct {
+	/*
+		`ident` is the tag name. `value` is the tag's body text, or the comment text for a `.Comment`.
+	*/
+	ident:   string,
+	value:   string,
+	attribs: Attributes,
+
+	kind: enum {
+		Element = 0,
+		Comment,
+	},
+
+	/*
+		Indices into `Document.elements`. The parser makes the first element its own parent.
+	*/
+	parent:   Element_ID,
+	children: [dynamic]Element_ID,
+}
+
+/*
+	A single `key="val"` attribute of a tag or of the prolog.
+*/
+Attr :: struct {
+	key: string,
+	val: string,
+}
+
+/*
+	The attributes of one tag, in the order they were parsed.
+*/
+Attributes :: [dynamic]Attr
+
+/*
+	Parser configuration.
+
+	flags            - See `Option_Flag`.
+	expected_doctype - If non-empty, the DOCTYPE (if present) and the root tag have to match it.
+*/
+Options :: struct {
+	flags:            Option_Flags,
+	expected_doctype: string,
+}
+
+/*
+	The document encodings `parse_prolog` recognizes in the prolog's `encoding` attribute.
+	`.Unknown` is the zero value and is what remains when no recognized encoding was declared.
+*/
+Encoding :: enum {
+	Unknown,
+
+	UTF_8,
+	ISO_8859_1,
+
+	/*
+		Aliases
+	*/
+	LATIN_1 = ISO_8859_1,
+}
+
+/*
+	Everything that can go wrong while parsing. `.None` (the zero value) means success.
+*/
+Error :: enum {
+	/*
+		General return values.
+	*/
+	None = 0,
+	General_Error,
+	Unexpected_Token,
+	Invalid_Token,
+
+	/*
+		Couldn't find, open or read file.
+	*/
+	File_Error,
+
+	/*
+		File too short.
+	*/
+	Premature_EOF,
+
+	/*
+		XML-specific errors.
+	*/
+	No_Prolog,
+	Invalid_Prolog,
+	Too_Many_Prologs,
+
+	No_DocType,
+	Too_Many_DocTypes,
+	DocType_Must_Preceed_Elements,
+
+	/*
+		If a DOCTYPE is present _or_ the caller
+		asked for a specific DOCTYPE and the DOCTYPE
+		and root tag don't match, we return `.Invalid_DocType`.
+	*/
+	Invalid_DocType,
+
+	Invalid_Tag_Value,
+	Mismatched_Closing_Tag,
+
+	Unclosed_Comment,
+	Comment_Before_Root_Element,
+	Invalid_Sequence_In_Comment,
+
+	Unsupported_Version,
+	Unsupported_Encoding,
+
+	/*
+		<!FOO are usually skipped.
+	*/
+	Unhandled_Bang,
+
+	/*
+		The same attribute key appeared twice on one tag.
+	*/
+	Duplicate_Attribute,
+	/*
+		Both `.Error_on_Unsupported` and `.Ignore_Unsupported` were requested.
+	*/
+	Conflicting_Options,
+}
+
+/*
+	Implementation starts here.
+*/
+/*
+	Parses an XML document that is held in memory.
+
+	Inputs:
+		data          - The document's bytes. They are cloned first unless `.Input_May_Be_Modified` is in `options.flags`.
+		options       - Parser options. They are validated before anything is allocated.
+		path          - Handed to the tokenizer's `init`.
+		error_handler - Handed to the tokenizer's `init`.
+		allocator     - Used for the `Document` and everything the parser allocates for it.
+
+	Returns:
+		doc - The document. `nil` if the options were invalid; otherwise it is returned
+		      even when `err` is set, so that the caller can `destroy` it.
+		err - `.None` on success.
+*/
+parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
+	data := data
+	context.allocator = allocator
+
+	opts := validate_options(options) or_return
+
+	/*
+		If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place.
+	*/
+	if .Input_May_Be_Modified not_in opts.flags {
+		data = bytes.clone(data)
+	}
+
+	/*
+		NOTE(review): `t` points at a procedure-local Tokenizer, so `doc.tokenizer`
+		presumably must not be used once this procedure has returned - confirm.
+	*/
+	t := &Tokenizer{}
+	init(t, string(data), path, error_handler)
+
+	doc = new(Document)
+	doc.allocator = allocator
+	doc.tokenizer = t
+	doc.input     = data
+
+	/*
+		The element array starts out with 1024 slots. `doc.element_count` tracks how many
+		are in use, and the array is trimmed to that count on success.
+	*/
+	doc.elements = make([dynamic]Element, 1024, 1024, allocator)
+
+	// strings.intern_init(&doc.intern, allocator, allocator)
+
+	/*
+		A bare `return` below reports `.Unexpected_Token`.
+	*/
+	err =            .Unexpected_Token
+	element, parent: Element_ID
+
+	tag_is_open   := false
+	first_element := true
+	open: Token
+
+	/*
+		If a DOCTYPE is present, the root tag has to match.
+		If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
+	*/
+	expected_doctype := options.expected_doctype
+
+	loop: for {
+		skip_whitespace(t)
+		// NOTE(Jeroen): This is faster as a switch.
+		switch t.ch {
+		case '<':
+			/*
+				Consume peeked `<`
+			*/
+			advance_rune(t)
+
+			open = scan(t)
+			// NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
+			if likely(open.kind, Token_Kind.Ident) == .Ident {
+				/*
+					e.g. <odin - Start of new element.
+				*/
+				element = new_element(doc)
+				tag_is_open = true
+
+				if first_element {
+					/*
+						First element.
+					*/
+					parent   = element
+					first_element = false
+				} else {
+					append(&doc.elements[parent].children, element)
+				}
+
+				doc.elements[element].parent = parent
+				doc.elements[element].ident  = open.text
+
+				parse_attributes(doc, &doc.elements[element].attribs) or_return
+
+				/*
+					If a DOCTYPE is present _or_ the caller
+					asked for a specific DOCTYPE and the DOCTYPE
+					and root tag don't match, we return `.Invalid_DocType`.
+				*/
+				if element == 0 { // Root tag?
+					if len(expected_doctype) > 0 && expected_doctype != open.text {
+						error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
+						return doc, .Invalid_DocType
+					}
+				}
+
+				/*
+					One of these should follow:
+					- `>`,  which means we've just opened this tag and expect a later element to close it.
+					- `/>`, which means this is an 'empty' or self-closing tag.
+				*/
+				end_token := scan(t)
+				#partial switch end_token.kind {
+				case .Gt:
+					/*
+						We're now the new parent.
+					*/
+					parent = element
+
+				case .Slash:
+					/*
+						Empty tag. Close it.
+					*/
+					expect(t, .Gt) or_return
+					parent      = doc.elements[element].parent
+					element     = parent
+					tag_is_open = false
+
+				case:
+					error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
+					return
+				}
+
+			} else if open.kind == .Slash {
+				/*
+					Close tag.
+				*/
+				ident := expect(t, .Ident) or_return
+				_      = expect(t, .Gt)    or_return
+
+				if doc.elements[element].ident != ident.text {
+					error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
+					return doc, .Mismatched_Closing_Tag
+				}
+				parent      = doc.elements[element].parent
+				element     = parent
+				tag_is_open = false
+
+			} else if open.kind == .Exclaim {
+				/*
+					<!
+				*/
+				next := scan(t)
+				#partial switch next.kind {
+				case .Ident:
+					switch next.text {
+					case "DOCTYPE":
+						if len(doc.doctype.ident) > 0 {
+							return doc, .Too_Many_DocTypes
+						}
+						if doc.element_count > 0 {
+							return doc, .DocType_Must_Preceed_Elements
+						}
+						parse_doctype(doc) or_return
+
+						if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
+							error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
+							return doc, .Invalid_DocType
+						}
+						expected_doctype = doc.doctype.ident
+
+					case:
+						if .Error_on_Unsupported in opts.flags {
+							error(t, t.offset, "Unhandled: <!%v\n", next.text)
+							return doc, .Unhandled_Bang
+						}
+						skip_element(t) or_return
+					}
+
+				case .Dash:
+					/*
+						Comment: <!-- -->.
+						The grammar does not allow a comment to end in --->
+
+						The second dash is mandatory, so a failed `expect` is an error rather than something to ignore.
+					*/
+					expect(t, .Dash) or_return
+					comment := scan_comment(t) or_return
+
+					if .Intern_Comments in opts.flags {
+						/*
+							`len(doc.elements)` is the pre-sized capacity and never 0,
+							so "no element seen yet" has to be decided by `element_count`.
+						*/
+						if doc.element_count == 0 {
+							append(&doc.comments, comment)
+						} else {
+							el := new_element(doc)
+							doc.elements[el].parent = element
+							doc.elements[el].kind   = .Comment
+							doc.elements[el].value  = comment
+							append(&doc.elements[element].children, el)
+						}
+					}
+
+				case:
+					error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
+					return
+				}
+
+			} else if open.kind == .Question {
+				/*
+					<?xml
+				*/
+				next := scan(t)
+				#partial switch next.kind {
+				case .Ident:
+					if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
+						if len(doc.prolog) > 0 {
+							/*
+								We've already seen a prolog.
+							*/
+							return doc, .Too_Many_Prologs
+						}
+						parse_prolog(doc) or_return
+					} else {
+						/*
+							Could be `<?xml-stylesheet`, etc. Ignore it.
+							Processing instructions like these are fine after a prolog, too.
+						*/
+						skip_element(t) or_return
+					}
+				case:
+					error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
+					return
+				}
+
+			} else {
+				error(t, t.offset, "Invalid Token after <: %#v\n", open)
+				return
+			}
+
+		case -1:
+			/*
+				End of file.
+			*/
+			if tag_is_open {
+				return doc, .Premature_EOF
+			}
+			break loop
+
+		case:
+			/*
+				This should be a tag's body text.
+			*/
+			body_text        := scan_string(t, t.offset) or_return
+			needs_processing := .Unbox_CDATA          in opts.flags
+			needs_processing |= .Decode_SGML_Entities in opts.flags
+
+			if !needs_processing {
+				doc.elements[element].value = body_text
+				continue
+			}
+
+			decode_opts := entity.XML_Decode_Options{}
+			if .Keep_Tag_Body_Comments not_in opts.flags {
+				decode_opts += { .Comment_Strip }
+			}
+
+			if .Decode_SGML_Entities not_in opts.flags {
+				decode_opts += { .No_Entity_Decode }
+			}
+
+			if .Unbox_CDATA in opts.flags {
+				decode_opts += { .Unbox_CDATA }
+				if .Decode_SGML_Entities in opts.flags {
+					decode_opts += { .Decode_CDATA }
+				}
+			}
+
+			/*
+				A successfully decoded body is tracked in `strings_to_free` so that `destroy` can release it.
+				If decoding fails we fall back to the raw body text.
+			*/
+			decoded, decode_err := entity.decode_xml(body_text, decode_opts)
+			if decode_err == .None {
+				doc.elements[element].value = decoded
+				append(&doc.strings_to_free, decoded)
+			} else {
+				doc.elements[element].value = body_text
+			}
+		}
+	}
+
+	if .Must_Have_Prolog in opts.flags && len(doc.prolog) == 0 {
+		return doc, .No_Prolog
+	}
+
+	if .Must_Have_DocType in opts.flags && len(doc.doctype.ident) == 0 {
+		return doc, .No_DocType
+	}
+
+	/*
+		Trim the element array down to the elements actually in use.
+	*/
+	resize(&doc.elements, int(doc.element_count))
+	return doc, .None
+}
+
+/*
+	Reads `filename` in its entirety and parses it with `parse_from_slice`.
+
+	The file's contents are read using `allocator` and handed to the parser with
+	`.Input_May_Be_Modified` set, so they are parsed in place rather than cloned.
+
+	Returns:
+		doc - The document; `nil` if the file couldn't be read or the options were invalid.
+		err - `.File_Error` if the file couldn't be read, otherwise whatever `parse_from_slice` returned.
+*/
+parse_from_file :: proc(filename: string, options := DEFAULT_Options, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
+	context.allocator = allocator
+	options := options
+
+	data, data_ok := os.read_entire_file(filename)
+	if !data_ok { return {}, .File_Error }
+
+	options.flags += { .Input_May_Be_Modified }
+
+	doc, err = parse_from_slice(data, options, filename, error_handler, allocator)
+	if doc == nil {
+		/*
+			No document took ownership of the buffer, so nobody else can free it.
+		*/
+		delete(data)
+	}
+	return
+}
+
+/*
+	Overloaded entry point: accepts either a filename (`parse_from_file`) or a byte slice (`parse_from_slice`).
+*/
+parse :: proc { parse_from_file, parse_from_slice }
+
+/*
+	Releases a document and everything it owns: the per-element attribute and child lists,
+	the element array, the prolog, the interned comments, the input buffer, the separately
+	allocated decoded strings and finally the `Document` itself.
+
+	Passing `nil` is a no-op.
+
+	NOTE(review): `doc.input` is deleted here as well. For a document that came from
+	`parse_from_slice` with `.Input_May_Be_Modified`, that is the caller's own buffer - confirm callers expect this.
+*/
+destroy :: proc(doc: ^Document) {
+	if doc == nil { return }
+
+	/*
+		Free with the allocator the document was parsed with,
+		not with whichever allocator happens to be in the caller's context.
+	*/
+	if doc.allocator.procedure != nil {
+		context.allocator = doc.allocator
+	}
+
+	for el in doc.elements {
+		delete(el.attribs)
+		delete(el.children)
+	}
+	delete(doc.elements)
+
+	delete(doc.prolog)
+	delete(doc.comments)
+	delete(doc.input)
+
+	for s in doc.strings_to_free {
+		delete(s)
+	}
+	delete(doc.strings_to_free)
+
+	free(doc)
+}
+
+/*
+	Helpers.
+*/
+
+/*
+	Checks `options` for contradictions.
+
+	Returns the options unchanged, together with `.Conflicting_Options` if both
+	`.Error_on_Unsupported` and `.Ignore_Unsupported` are set, or `.None` otherwise.
+*/
+validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
+	CONFLICT :: Option_Flags{ .Error_on_Unsupported, .Ignore_Unsupported }
+
+	if (options.flags & CONFLICT) == CONFLICT {
+		return options, .Conflicting_Options
+	}
+	return options, .None
+}
+
+/*
+	Scans the next token and checks that it is of the given `kind`.
+
+	The scanned token is returned either way. If its kind differs, the mismatch is
+	reported through `error` and `err` is `.Unexpected_Token`.
+*/
+expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
+	tok = scan(t)
+
+	if tok.kind != kind {
+		error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
+		err = .Unexpected_Token
+	}
+	return
+}
+
+/*
+	Parses a single `key="value"` attribute at the tokenizer's current position.
+
+	Returns:
+		attr   - The attribute's key and value.
+		offset - The tokenizer offset after the key, minus the key's length; used by callers for error reporting.
+		err    - `.Unexpected_Token` (via `expect`) if the identifier, `=` or string is missing.
+*/
+parse_attribute :: proc(doc: ^Document) -> (attr: Attr, offset: int, err: Error) {
+	assert(doc != nil)
+	context.allocator = doc.allocator
+	t := doc.tokenizer
+
+	key_tok := expect(t, .Ident) or_return
+	offset   = t.offset - len(key_tok.text)
+
+	expect(t, .Eq) or_return
+	val_tok := expect(t, .String) or_return
+
+	attr = Attr{
+		key = key_tok.text,
+		val = val_tok.text,
+	}
+	return attr, offset, .None
+}
+
+/*
+	Returns `.Duplicate_Attribute` (and reports it at `offset`) if `attribs`
+	already contains an attribute with the same key as `attr`, `.None` otherwise.
+*/
+check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attr, offset: int) -> (err: Error) {
+	for i in 0..<len(attribs) {
+		if attribs[i].key != attr.key {
+			continue
+		}
+		error(t, offset, "Duplicate attribute: %v\n", attr.key)
+		return .Duplicate_Attribute
+	}
+	return .None
+}
+
+/*
+	Parses attributes into `attribs` for as long as the next token is an identifier,
+	rejecting duplicate keys, then skips any whitespace that follows.
+*/
+parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) {
+	assert(doc != nil)
+	context.allocator = doc.allocator
+	t := doc.tokenizer
+
+	for {
+		if peek(t).kind != .Ident {
+			break
+		}
+
+		parsed, at := parse_attribute(doc)                  or_return
+		check_duplicate_attributes(t, attribs^, parsed, at) or_return
+		append(attribs, parsed)
+	}
+
+	skip_whitespace(t)
+	return .None
+}
+
+/*
+	Parses the remainder of a `<?xml ... ?>` prolog, after the `xml` identifier.
+
+	The attributes are stored in `doc.prolog`. A recognized `encoding` sets `doc.encoding`;
+	an unhandled version or unrecognized encoding is only reported as a warning.
+	The prolog has to end in `?>`.
+*/
+parse_prolog :: proc(doc: ^Document) -> (err: Error) {
+	assert(doc != nil)
+	context.allocator = doc.allocator
+	t := doc.tokenizer
+
+	start := t.offset
+	parse_attributes(doc, &doc.prolog) or_return
+
+	for attr in doc.prolog {
+		if attr.key == "version" {
+			if attr.val != "1.0" && attr.val != "1.1" {
+				error(t, start, "[parse_prolog] Warning: Unhandled XML version: %v\n", attr.val)
+			}
+
+		} else if attr.key == "encoding" {
+			encoding := strings.to_lower(attr.val, context.temp_allocator)
+
+			switch encoding {
+			case "utf-8", "utf8":
+				doc.encoding = .UTF_8
+
+			case "latin-1", "latin1", "iso-8859-1":
+				doc.encoding = .LATIN_1
+
+			case:
+				/*
+					Unrecognized encoding, assume UTF-8.
+				*/
+				error(t, start, "[parse_prolog] Warning: Unrecognized encoding: %v\n", attr.val)
+			}
+		}
+		/*
+			Any other attribute is ignored.
+		*/
+	}
+
+	expect(t, .Question) or_return
+	expect(t, .Gt)       or_return
+
+	return .None
+}
+
+/*
+	Skips tokens up to and including the `>` that closes the tag we're currently in.
+	Nested `<` ... `>` pairs are balanced along the way.
+
+	Returns `.Premature_EOF` if the input ends first.
+*/
+skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
+	/*
+		We start out inside one open tag.
+	*/
+	depth := 1
+
+	for depth > 0 {
+		tok := scan(t)
+
+		#partial switch tok.kind {
+		case .EOF:
+			error(t, t.offset, "[skip_element] Premature EOF\n")
+			return .Premature_EOF
+
+		case .Lt:
+			depth += 1
+
+		case .Gt:
+			depth -= 1
+		}
+	}
+	return .None
+}
+
+/*
+	Parses a DOCTYPE declaration, after the `DOCTYPE` identifier.
+
+	Only the identifier is interpreted; it ends up in `doc.doctype.ident`.
+	Everything between it and the closing `>` is kept verbatim in `doc.doctype.rest`.
+*/
+parse_doctype :: proc(doc: ^Document) -> (err: Error) {
+	/*
+		<!DOCTYPE greeting SYSTEM "hello.dtd">
+
+		<!DOCTYPE greeting [
+			<!ELEMENT greeting (#PCDATA)>
+		]>
+	*/
+	assert(doc != nil)
+	context.allocator = doc.allocator
+	t := doc.tokenizer
+
+	ident := expect(t, .Ident) or_return
+	doc.doctype.ident = ident.text
+
+	skip_whitespace(t)
+	rest_start := t.offset
+	skip_element(t) or_return
+
+	/*
+		The tokenizer is now just past the closing `>`, so the rest of the DOCTYPE ends one before the current offset.
+	*/
+	rest_end := t.offset - 1
+
+	doc.doctype.rest = string(t.src[rest_start : rest_end])
+	return .None
+}
+
+Element_ID :: u32
+
+/*
+	Reserves the next slot in `doc.elements` and returns its ID.
+
+	When all slots are in use the array is grown first: it doubles while it holds
+	fewer than 65536 elements, and grows by 65536 at a time after that.
+*/
+new_element :: proc(doc: ^Document) -> (id: Element_ID) {
+	available := len(doc.elements)
+
+	if int(doc.element_count) >= available {
+		grow_by := 65536
+		if available < 65536 {
+			grow_by = available
+		}
+		resize(&doc.elements, available + grow_by)
+	}
+
+	id = doc.element_count
+	doc.element_count += 1
+	return
+}
@@ -0,0 +1,287 @@
+package xml_example
+
+import "core:encoding/xml"
+import "core:os"
+import "core:path"
+import "core:mem"
+import "core:strings"
+import "core:strconv"
+import "core:slice"
+import "core:fmt"
+
+/*
+	Silent error handler for the parser.
+*/
+Error_Handler :: proc(pos: xml.Pos, fmt: string, args: ..any) {}
+
+/*
+	Parser options for `unicode.xml`: ignore unsupported constructs and expect the `unicode` DOCTYPE.
+*/
+OPTIONS :: xml.Options{ flags = { .Ignore_Unsupported, }, expected_doctype = "unicode", }
+
+/*
+	One named entity collected from the XML file: its name, the codepoint it stands for,
+	and the description of the character it was listed under.
+*/
+Entity :: struct {
+	name:        string,
+	codepoint:   rune,
+	description: string,
+}
+
+/*
+	Generates `core/encoding/entity/generated.odin` from `tests/core/assets/XML/unicode.xml`.
+
+	Every `<character>` under the top-level `<charlist>` is visited, its `<entity id="...">` names
+	are collected (first occurrence of a name wins), and a `named_xml_entity_to_rune` lookup procedure
+	is written out as a switch on the name's first character followed by a switch on the full name.
+
+	Exits the process with status 1 if the file can't be parsed or doesn't have the expected structure.
+*/
+generate_encoding_entity_table :: proc() {
+	using fmt
+
+	filename := path.join(ODIN_ROOT, "tests", "core", "assets", "XML", "unicode.xml")
+	defer delete(filename)
+
+	generated_filename := path.join(ODIN_ROOT, "core", "encoding", "entity", "generated.odin")
+	defer delete(generated_filename)
+
+	doc, err := xml.parse(filename, OPTIONS, Error_Handler)
+	defer xml.destroy(doc)
+
+	if err != .None {
+		printf("Load/Parse error: %v\n", err)
+		if err == .File_Error {
+			printf("\"%v\" not found. Did you run \"tests\\download_assets.py\"?", filename)
+		}
+		os.exit(1)
+	}
+
+	printf("\"%v\" loaded and parsed.\n", filename)
+
+	generated_buf: strings.Builder
+	defer strings.destroy_builder(&generated_buf)
+	w := strings.to_writer(&generated_buf)
+
+	charlist, charlist_ok := xml.find_child_by_ident(doc.root, "charlist")
+	if !charlist_ok {
+		eprintln("Could not locate top-level `<charlist>` tag.")
+		os.exit(1)
+	}
+
+	printf("Found `<charlist>` with %v children.\n", len(charlist.children))
+
+	/*
+		`names` holds every unique entity name so that it can be sorted; `entity_map` maps a name to its entity.
+	*/
+	entity_map: map[string]Entity
+	names: [dynamic]string
+
+	min_name_length := max(int)
+	max_name_length := min(int)
+	shortest_name: string
+	longest_name:  string
+
+	count := 0
+	for char in charlist.children {
+		if char.ident != "character" {
+			eprintf("Expected `<character>`, got `<%v>`\n", char.ident)
+			os.exit(1)
+		}
+
+		/*
+			The codepoint is taken from the `dec` attribute.
+		*/
+		if codepoint_string, ok := xml.find_attribute_val_by_key(char, "dec"); !ok {
+			eprintln("`<character dec=\"...\">` attribute not found.")
+			os.exit(1)
+		} else {
+			codepoint := strconv.atoi(codepoint_string)
+
+			desc, desc_ok := xml.find_child_by_ident(char, "description")
+			description   := desc.value if desc_ok else ""
+
+			/*
+				For us to be interested in this codepoint, it has to have at least one entity.
+			*/
+
+			nth := 0
+			for {
+				character_entity, entity_ok := xml.find_child_by_ident(char, "entity", nth)
+				if !entity_ok { break }
+
+				nth   += 1
+				if name, name_ok := xml.find_attribute_val_by_key(character_entity, "id"); name_ok {
+
+					if len(name) == 0 {
+						/*
+							Invalid name. Skip.
+						*/
+						continue
+					}
+
+					if name == "\"\"" {
+						printf("%#v\n", char)
+						printf("%#v\n", character_entity)
+					}
+
+					if len(name) > max_name_length { longest_name  = name }
+					if len(name) < min_name_length { shortest_name = name }
+
+					min_name_length = min(min_name_length, len(name))
+					max_name_length = max(max_name_length, len(name))
+
+					e := Entity{
+						name        = name,
+						codepoint   = rune(codepoint),
+						description = description,
+					}
+
+					if _, seen := entity_map[name]; seen {
+						continue
+					}
+
+					entity_map[name] = e
+					append(&names, name)
+					count += 1
+				}
+			}
+		}
+	}
+
+	/*
+		Sort by name.
+	*/
+	slice.sort(names[:])
+
+	printf("Found %v unique `&name;` -> rune mappings.\n", count)
+	printf("Shortest name: %v (%v)\n", shortest_name, min_name_length)
+	printf("Longest name:  %v (%v)\n", longest_name,  max_name_length)
+
+	// println(rune_to_string(1234))
+
+	/*
+		Generate table.
+	*/
+	wprintln(w, "package unicode_entity")
+	wprintln(w, "")
+	wprintln(w, GENERATED)
+	wprintln(w, "")
+	wprintf (w, TABLE_FILE_PROLOG)
+	wprintln(w, "")
+
+	wprintf (w, "// `&%v;`\n", shortest_name)
+	wprintf (w, "XML_NAME_TO_RUNE_MIN_LENGTH :: %v\n", min_name_length)
+	wprintf (w, "// `&%v;`\n", longest_name)
+	wprintf (w, "XML_NAME_TO_RUNE_MAX_LENGTH :: %v\n", max_name_length)
+	wprintln(w, "")
+
+	wprintln(w,
+`
+/*
+	Input:
+		entity_name - a string, like "copy" that describes a user-encoded Unicode entity as used in XML.
+
+	Output:
+		"decoded" - The decoded rune if found by name, or -1 otherwise.
+		"ok"      - true if found, false if not.
+
+	IMPORTANT: XML processors (including browsers) treat these names as case-sensitive. So do we.
+*/
+named_xml_entity_to_rune :: proc(name: string) -> (decoded: rune, ok: bool) {
+	/*
+		Early out if the name is too short or too long.
+		max as a precaution in case the generated table has a bogus value; it also keeps name[0] below in bounds.
+	*/
+	if len(name) < max(1, XML_NAME_TO_RUNE_MIN_LENGTH) || len(name) > XML_NAME_TO_RUNE_MAX_LENGTH {
+		return -1, false
+	}
+
+	switch rune(name[0]) {
+`)
+
+	/*
+		`names` is sorted, so all names sharing a first character are adjacent.
+		Whenever the first character changes we close the previous inner switch and open a new case.
+	*/
+	prefix := '?'
+	should_close := false
+
+	for v in names {
+		if rune(v[0]) != prefix {
+			if should_close {
+				wprintln(w, "\t\t}\n")
+			}
+
+			prefix = rune(v[0])
+			wprintf (w, "\tcase '%v':\n", prefix)
+			wprintln(w, "\t\tswitch name {")
+		}
+
+		e := entity_map[v]
+
+		wprintf(w, "\t\t\tcase \"%v\": \n",     e.name)
+		wprintf(w, "\t\t\t\t// %v\n",           e.description)
+		wprintf(w, "\t\t\t\treturn %v, true\n", rune_to_string(e.codepoint))
+
+		should_close = true
+	}
+	wprintln(w, "\t\t}")
+	wprintln(w, "\t}")
+	wprintln(w, "\treturn -1, false")
+	wprintln(w, "}\n")
+	wprintln(w, GENERATED)
+
+	println()
+	println(strings.to_string(generated_buf))
+	println()
+
+	written := os.write_entire_file(generated_filename, transmute([]byte)strings.to_string(generated_buf))
+
+	if written {
+		fmt.printf("Successfully written generated \"%v\".", generated_filename)
+	} else {
+		fmt.printf("Failed to write generated \"%v\".", generated_filename)
+	}
+
+	/*
+		Only the containers are ours to free. The names themselves were handed to us by the
+		XML package and were never allocated here, so they must not be freed individually.
+	*/
+	delete(entity_map)
+	delete(names)
+}
+
+/*
+	Banner written near the top and at the very end of the generated file.
+*/
+GENERATED :: `/*
+	------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
+*/`
+
+/*
+	Header comment for the generated file: where the data came from, how to regenerate it, and its license notice.
+	It is passed to `wprintf` as the format string, so it must not contain any `%` verbs.
+*/
+TABLE_FILE_PROLOG :: `/*
+	This file is generated from "https://www.w3.org/2003/entities/2007xml/unicode.xml".
+	
+	UPDATE:
+		- Ensure the XML file was downloaded using "tests\core\download_assets.py".
+		- Run "core/unicode/tools/generate_entity_table.odin"
+
+	Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
+
+		Copyright © 2021 World Wide Web Consortium, (Massachusetts Institute of Technology,
+		European Research Consortium for Informatics and Mathematics, Keio University, Beihang).
+
+		All Rights Reserved.
+
+		This work is distributed under the W3C® Software License [1] in the hope that it will be useful,
+		but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+		[1] http://www.w3.org/Consortium/Legal/copyright-software
+
+	See also: LICENSE_table.md
+*/
+`
+
+/*
+	Formats a rune as the Odin expression `rune(0x...)`.
+
+	The value is printed as 8 hex digits, after which leading `00` pairs are
+	dropped for as long as more than two digits remain.
+*/
+rune_to_string :: proc(r: rune) -> (res: string) {
+	digits := fmt.tprintf("%08x", int(r))
+
+	skip := 0
+	for len(digits) - skip > 2 && digits[skip:skip + 2] == "00" {
+		skip += 2
+	}
+
+	res = fmt.tprintf("rune(0x%v)", digits[skip:])
+	return
+}
+
+/*
+	Reports whether `name` contains a `.` anywhere.
+*/
+is_dotted_name :: proc(name: string) -> (dotted: bool) {
+	/*
+		`.` is ASCII, so it can never be part of a multi-byte sequence and a byte-wise scan is enough.
+	*/
+	for i in 0..<len(name) {
+		if name[i] == '.' {
+			return true
+		}
+	}
+	return false
+}
+
+/*
+	Runs the table generator under a tracking allocator and lists every
+	allocation that is still live afterwards.
+*/
+main :: proc() {
+	tracker: mem.Tracking_Allocator
+	mem.tracking_allocator_init(&tracker, context.allocator)
+	context.allocator = mem.tracking_allocator(&tracker)
+
+	generate_encoding_entity_table()
+
+	if len(tracker.allocation_map) != 0 {
+		fmt.println()
+		for _, leak in tracker.allocation_map {
+			fmt.printf("%v Leaked %v bytes.\n", leak.location, leak.size)
+		}
+	}
+	fmt.println("Done and cleaned up!")
+}
@@ -0,0 +1,2 @@
+# This file will be downloaded by download_assets.py
+unicode.xml
@@ -0,0 +1,29 @@
+<html>
+	<head>
+		<title>Entity Reference Test</title>
+		<style>
+			body {
+				background: #000; color: #eee;
+				width: 40%;
+				margin-left:  auto;
+				margin-right: auto;
+				font-size: 14pt;
+			}
+		</style>
+	</head>
+	<body>
+		<h1>Entity Reference Test</h1>
+		<div id="test_cdata_in_comment" foo="">
+			Foozle]!&#32;&copy;&#x20;<!-- <![CDATA[&#32;&reg;&#x20;]]> -->42&;1234&
+		</div>
+		<!-- foo attribute should be empty but present -->
+		<!-- EXPECTED: Foozle]! © 42&;1234& -->
+		<div id="test_cdata_unwrap_and_passthrough">
+			Foozle]!&#32;&copy;&#x20;<![CDATA[BOX&#32;&reg;&#x20;/BOX]]>42&;1234&
+		</div>
+		<!-- EXPECTED: Foozle]! © BOX ® /BOX42&;1234& -->
+		<div>
+			&verbar; &vert; &VerticalLine; &fjlig; &grave; &bsol; &reg; &rhov; &CounterClockwiseContourIntegral; &bsemi;
+		</div>
+	</body>
+</html>
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE TS>
+<TS version="2.1" language="nl" sourcelanguage="en">
+<context>
+  <name>Page</name>
+  <message>
+    <source>Text for translation</source>
+    <comment>commenting</comment>
+    <translation type="obsolete">Tekst om te vertalen</translation>
+  </message>
+  <message>
+     <source>Also text to translate</source>
+     <extracomment>some text</extracomment>
+    <translation>Ook tekst om te vertalen</translation>
+  </message>
+</context>
+<context>
+  <name>installscript</name>
+  <message>
+    <source>99 bottles of beer on the wall</source>
+    <oldcomment>some new comments here</oldcomment>
+    <translation>99 flessen bier op de muur</translation>
+  </message>
+</context>
+<context>
+    <name>apple_count</name>
+    <message numerus="yes">
+      <source>%d apple(s)</source>
+      <translation>
+        <numerusform>%d appel</numerusform>
+        <numerusform>%d appels</numerusform>
+      </translation>
+    </message>
+  </context>
+</TS>
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">
+	<file id="42" original="Foozle.xml" source-language="en" target-language="nl-NL" datatype="plaintext">
+		<body>
+			<trans-unit id="874396" maxwidth="20" size-unit="char">
+				<source>text</source>
+				<target state="translated">tekst</target>
+				<note>Context</note>
+			</trans-unit>
+			<trans-unit id="874397" approved="yes">
+				<source>text 1</source>
+				<target state="translated">tekst 1</target>
+				<note>Context 1</note>
+			</trans-unit>
+			<trans-unit id="874398">
+				<source>text 2</source>
+				<target state="needs-translation"/>
+				<context context-type="context">Context of the segment 2</context>
+			</trans-unit>
+			<trans-unit id="874399" translate="no">
+				<source>text 3</source>
+				<target state="final">translation 3</target>
+				<note>Context 3</note>
+			</trans-unit>
+			<group restype="x-gettext-plurals">
+				<note>Plurals</note>
+				<trans-unit id="14343743[0]">
+					<source>%d month</source>
+					<target xml:lang="nl" state="translated">%d maand</target>
+				</trans-unit>
+				<trans-unit id="14343743[1]">
+					<source>%d months</source>
+					<target xml:lang="nl" state="translated">%d maanden</target>
+				</trans-unit>
+			</group>
+		</body>
+	</file>
+</xliff>
@@ -0,0 +1,52 @@
+<?xml version="1.0" encoding="utf-8"?>
+<xliff xmlns="urn:oasis:names:tc:xliff:document:2.0" version="2.0" srcLang="en" trgLang="nl">
+	<file id="f1">
+		<notes>
+			<note id="n1">Note for file</note>
+		</notes>
+		<unit id="u1">
+			<notes>
+				<note id="n1">Note for unit</note>
+			</notes>
+			<segment id="s1" state="initial">
+				<source>text</source>
+				<target></target>
+			</segment>
+		</unit>
+		<unit id="u2">
+			<notes>
+				<note id="n2">Note for unit 2</note>
+			</notes>
+			<segment id="s2" state="translated">
+				<source>text 2</source>
+				<target>translation 2</target>
+			</segment>
+		</unit>
+		<unit id="u3">
+			<notes>
+				<note id="n3">Note for unit 3</note>
+			</notes>
+			<segment id="s3" state="final">
+				<source>text 3</source>
+				<target>approved translation 3</target>
+			</segment>
+		</unit>
+		<group id="90290" type="x-gettext:plurals">
+			<unit id="90291" name="90290[0]">
+				<notes>
+					<note category="context">Plurals</note>
+				</notes>
+				<segment>
+					<source>%d month</source>
+					<target xml:lang="nl">%d maand</target>
+				</segment>
+			</unit>
+			<unit id="90292" name="90290[1]">
+				<segment>
+					<source>%d months</source>
+					<target xml:lang="nl">%d maanden</target>
+				</segment>
+			</unit>
+		</group>
+	</file>
+</xliff>
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE 恥ずべきフクロウ>
+<恥ずべきフクロウ 올빼미_id="Foozle&#32;<![CDATA[<greeting>Hello, world!"</greeting>]]>Barzle">
+<부끄러운:barzle>
+	<name foo:bar="birmese">ရှက်စရာ ဇီးကွက်</name>
+	<nickname>Owl of Shame</nickname>
+	<data>More CDATA <![CDATA[<greeting>Hello, world!</greeting><![CDATA] <$]]> Nonsense.</data>
+</부끄러운:barzle>
@@ -1,5 +1,6 @@
@echo off
-set COMMON=-show-timings -no-bounds-check -vet -strict-style -collection:tests=..
+set COMMON=-no-bounds-check -vet -strict-style
+set COLLECTION=-collection:tests=..
 set PATH_TO_ODIN==..\..\odin
 python3 download_assets.py
 echo ---
@@ -35,9 +36,10 @@ echo ---
 echo ---
 echo Running core:encoding tests
 echo ---
-%PATH_TO_ODIN% run encoding/hxa %COMMON% -out:test_hxa.exe
-%PATH_TO_ODIN% run encoding/json %COMMON% -out:test_json.exe
+%PATH_TO_ODIN% run encoding/hxa    %COMMON% %COLLECTION% -out:test_hxa.exe
+%PATH_TO_ODIN% run encoding/json   %COMMON% -out:test_json.exe
 %PATH_TO_ODIN% run encoding/varint %COMMON% -out:test_varint.exe
+%PATH_TO_ODIN% run encoding/xml    %COMMON% -out:test_xml.exe

 echo ---
 echo Running core:math/noise tests
@@ -47,19 +49,19 @@ echo ---
 echo ---
 echo Running core:math tests
 echo ---
-%PATH_TO_ODIN% run math %COMMON% -out:test_core_math.exe
+%PATH_TO_ODIN% run math %COMMON% %COLLECTION% -out:test_core_math.exe

 echo ---
 echo Running core:math/linalg/glsl tests
 echo ---
-%PATH_TO_ODIN% run math/linalg/glsl %COMMON% -out:test_linalg_glsl.exe
+%PATH_TO_ODIN% run math/linalg/glsl %COMMON% %COLLECTION% -out:test_linalg_glsl.exe

 echo ---
 echo Running core:path/filepath tests
 echo ---
-%PATH_TO_ODIN% run path/filepath %COMMON% -out:test_core_filepath.exe
+%PATH_TO_ODIN% run path/filepath %COMMON% %COLLECTION% -out:test_core_filepath.exe

 echo ---
 echo Running core:reflect tests
 echo ---
-%PATH_TO_ODIN% run reflect %COMMON% -out:test_core_reflect.exe
+%PATH_TO_ODIN% run reflect %COMMON% %COLLECTION% -out:test_core_reflect.exe
@@ -0,0 +1,353 @@
+package test_core_xml
+
+import "core:encoding/xml"
+import "core:testing"
+import "core:mem"
+import "core:strings"
+import "core:io"
+import "core:fmt"
+import "core:hash"
+import "core:os"
+
+/*
+	No-op callback handed to `xml.parse` by `run_tests`.
+	It accepts a position, a format string and arguments, and discards all of them.
+*/
+Silent :: proc(pos: xml.Pos, format: string, args: ..any) {
+	// Intentionally empty.
+}
+
+/*
+	Baseline options: the `.Ignore_Unsupported` and `.Intern_Comments` flags
+	together with an empty `expected_doctype`.
+*/
+OPTIONS :: xml.Options{
+	flags            = {.Ignore_Unsupported, .Intern_Comments},
+	expected_doctype = "",
+}
+
+/*
+	Score kept by the fallback `expect` (the one used when `ODIN_TEST` is not set):
+	`TEST_count` is the number of checks made, `TEST_fail` the number that failed.
+*/
+TEST_count: int = 0
+TEST_fail:  int = 0
+
+/*
+	One entry of the `TESTS` table: a file to parse, the options to parse it with,
+	and the results `run_tests` expects.
+*/
+TEST :: struct {
+	// File name; `test_file_path` turns it into a full path.
+	filename: string,
+	// Options passed to `xml.parse`.
+	options:  xml.Options,
+	// Error `xml.parse` is expected to return.
+	err:      xml.Error,
+	// Expected CRC-32 of the string produced by `doc_to_string` for the parsed document.
+	crc32:    u32,
+}
+
+/*
+	Directory that holds the XML test assets, relative to ODIN_ROOT.
+	`test_file_path` places it between ODIN_ROOT and the file name.
+*/
+TEST_FILE_PATH_PREFIX :: "tests/core/assets/XML"
+
+/*
+	Table of test cases consumed by `run_tests`.
+
+	Each entry names a file, the options to parse it with, the expected error (left unset
+	in entries where no error is expected) and the expected CRC-32 of the `doc_to_string` output.
+*/
+TESTS :: []TEST{
+	/*
+		First we test that certain files parse without error.
+	*/
+
+	{
+		/*
+		<?xml version="1.0" encoding="utf-8"?>
+		<!DOCTYPE 恥ずべきフクロウ>
+		<恥ずべきフクロウ 올빼미_id="Foozle&#32;<![CDATA[<greeting>Hello, world!"</greeting>]]>Barzle">
+		<부끄러운:barzle>
+			<name foo:bar="birmese">ရှက်စရာ ဇီးကွက်</name>
+			<nickname>Owl of Shame</nickname>
+			<data>More CDATA <![CDATA[<greeting>Hello, world!</greeting><![CDATA] <$]]> Nonsense.</data>
+		</부끄러운:barzle>
+		*/
+
+		/*
+			Tests UTF-8 idents and values.
+			Tests namespaced idents.
+			Tests that a nested partial CDATA start doesn't trip up the parser.
+		*/
+		filename  = "utf8.xml",
+		options   = {
+			flags = {
+				.Ignore_Unsupported, .Intern_Comments,
+			},
+			expected_doctype = "恥ずべきフクロウ",
+		},
+		crc32     = 0x30d82264,
+	},
+
+	{
+		/*
+			Same as above.
+			Unbox CDATA in data tag.
+		*/
+		filename  = "utf8.xml",
+		options   = {
+			flags = {
+				.Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA,
+			},
+			expected_doctype = "恥ずべきフクロウ",
+		},
+		crc32     = 0xad31d8e8,
+	},
+
+	{
+		/*
+			Simple Qt TS translation file.
+			`core:i18n` requires it to be parsed properly.
+		*/
+		filename  = "nl_NL-qt-ts.ts",
+		options   = {
+			flags = {
+				.Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA, .Decode_SGML_Entities,
+			},
+			expected_doctype = "TS",
+		},
+		crc32     = 0x7bce2630,
+	},
+
+	{
+		/*
+			Simple XLiff 1.2 file.
+			`core:i18n` requires it to be parsed properly.
+		*/
+		filename  = "nl_NL-xliff-1.2.xliff",
+		options   = {
+			flags = {
+				.Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA, .Decode_SGML_Entities,
+			},
+			expected_doctype = "xliff",
+		},
+		crc32     = 0x43f19d61,
+	},
+
+	{
+		/*
+			Simple XLiff 2.0 file.
+			`core:i18n` requires it to be parsed properly.
+		*/
+		filename  = "nl_NL-xliff-2.0.xliff",
+		options   = {
+			flags = {
+				.Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA, .Decode_SGML_Entities,
+			},
+			expected_doctype = "xliff",
+		},
+		crc32     = 0x961e7635,
+	},
+
+	{
+		/*
+			entities.html with neither `.Unbox_CDATA` nor `.Decode_SGML_Entities` set.
+		*/
+		filename  = "entities.html",
+		options   = {
+			flags = {
+				.Ignore_Unsupported, .Intern_Comments,
+			},
+			expected_doctype = "html",
+		},
+		crc32     = 0x573c1033,
+	},
+
+	{
+		/*
+			Same file, with `.Unbox_CDATA` added.
+		*/
+		filename  = "entities.html",
+		options   = {
+			flags = {
+				.Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA,
+			},
+			expected_doctype = "html",
+		},
+		crc32     = 0x82588917,
+	},
+
+	{
+		/*
+			Same file, with both `.Unbox_CDATA` and `.Decode_SGML_Entities` added.
+		*/
+		filename  = "entities.html",
+		options   = {
+			flags = {
+				.Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA, .Decode_SGML_Entities,
+			},
+			expected_doctype = "html",
+		},
+		crc32     = 0x5e74d8a6,
+	},
+
+	/*
+		Then we test that certain errors are returned as expected.
+	*/
+	{
+		/*
+			utf8.xml parsed with an expected DOCTYPE of "Odin", whereas the entries above
+			expect "恥ずべきフクロウ" for the same file; `.Invalid_DocType` is the expected result.
+		*/
+		filename  = "utf8.xml",
+		options   = {
+			flags            = {
+				.Ignore_Unsupported, .Intern_Comments,
+			},
+			expected_doctype = "Odin",
+		},
+		err       = .Invalid_DocType,
+		crc32     = 0x49b83d0a,
+	},
+
+	/*
+		Parse the 8.2 MiB unicode.xml for good measure.
+	*/
+	{
+		filename  = "unicode.xml",
+		options   = {
+			flags            = {
+				.Ignore_Unsupported,
+			},
+			expected_doctype = "",
+		},
+		err       = .None,
+		crc32     = 0xcaa042b9,
+	},
+}
+
+/*
+	When `ODIN_TEST` is set, `expect` and `log` are the `core:testing` procedures.
+	Otherwise local stand-ins are used that print through `fmt` and keep score
+	in `TEST_count` / `TEST_fail`.
+*/
+when ODIN_TEST {
+	expect :: testing.expect
+	log    :: testing.log
+} else {
+	// Counts every check; a failed check is additionally counted as a failure and printed with its call site.
+	expect :: proc(t: ^testing.T, condition: bool, message: string, loc := #caller_location) {
+		TEST_count += 1
+		if condition {
+			return
+		}
+		TEST_fail += 1
+		fmt.printf("[%v] %v\n", loc, message)
+	}
+
+	// Prints `v` together with the call site.
+	log :: proc(t: ^testing.T, v: any, loc := #caller_location) {
+		fmt.printf("[%v] LOG:\n\t%v\n", loc, v)
+	}
+}
+
+/*
+	Builds the path of a test asset: ODIN_ROOT, then TEST_FILE_PATH_PREFIX, a slash and `filename`,
+	with every backslash in the result replaced by a forward slash.
+
+	The string is produced by `fmt.tprintf` and then edited in place through a byte view of it.
+*/
+test_file_path :: proc(filename: string) -> (path: string) {
+	path = fmt.tprintf("%v%v/%v", ODIN_ROOT, TEST_FILE_PATH_PREFIX, filename)
+
+	// A backslash is a single ASCII byte, so scanning bytes finds exactly the backslash characters.
+	raw := transmute([]u8)path
+	for i in 0..<len(raw) {
+		if raw[i] == '\\' {
+			raw[i] = '/'
+		}
+	}
+
+	return path
+}
+
+/*
+	Renders a parsed document as text and returns it as a newly cloned string,
+	which the caller is expected to delete.
+
+	A nil `doc` renders as an empty string.
+*/
+doc_to_string :: proc(doc: ^xml.Document) -> (result: string) {
+	/*
+		Effectively a clone of the debug printer in the xml package.
+		We duplicate it here so that the way it prints an XML document to a string is stable.
+
+		This way we can hash the output. If it changes, it means that the document or how it was parsed changed,
+		not how it was printed. One less source of variability.
+
+		`written` is the sum of what every `fmt` write call returned, including the element tree.
+	*/
+	print :: proc(writer: io.Writer, doc: ^xml.Document) -> (written: int, err: io.Error) {
+		if doc == nil { return }
+
+		written += fmt.wprintf(writer, "[XML Prolog]\n")
+
+		for attr in doc.prolog {
+			written += fmt.wprintf(writer, "\t%v: %v\n", attr.key, attr.val)
+		}
+
+		written += fmt.wprintf(writer, "[Encoding] %v\n", doc.encoding)
+
+		if len(doc.doctype.ident) > 0 {
+			written += fmt.wprintf(writer, "[DOCTYPE]  %v\n", doc.doctype.ident)
+
+			if len(doc.doctype.rest) > 0 {
+				written += fmt.wprintf(writer, "\t%v\n", doc.doctype.rest)
+			}
+		}
+
+		for comment in doc.comments {
+			written += fmt.wprintf(writer, "[Pre-root comment]  %v\n", comment)
+		}
+
+		if doc.element_count > 0 {
+			written += fmt.wprintln(writer, " --- ")
+
+			// Element 0 is where printing of the tree starts.
+			tree_written, _ := print_element(writer, doc, 0)
+			written += tree_written
+
+			written += fmt.wprintln(writer, " --- ")
+		}
+
+		return written, .None
+	}
+
+	/*
+		Prints the element `element_id` and, recursively, its children, one indentation level deeper each.
+		Returns the sum of what the `fmt` write calls returned for this element and its subtree.
+	*/
+	print_element :: proc(writer: io.Writer, doc: ^xml.Document, element_id: xml.Element_ID, indent := 0) -> (written: int, err: io.Error) {
+		/*
+			Writes `indent + 1` tab characters (the range is inclusive).
+			Do not "fix" this: the expected CRCs in `TESTS` are computed over this exact output.
+		*/
+		tab :: proc(writer: io.Writer, indent: int) -> (written: int) {
+			for _ in 0..=indent {
+				written += fmt.wprintf(writer, "\t")
+			}
+			return written
+		}
+
+		written += tab(writer, indent)
+
+		element := doc.elements[element_id]
+
+		if element.kind == .Element {
+			written += fmt.wprintf(writer, "<%v>\n", element.ident)
+			if len(element.value) > 0 {
+				written += tab(writer, indent + 1)
+				written += fmt.wprintf(writer, "[Value] %v\n", element.value)
+			}
+
+			for attr in element.attribs {
+				written += tab(writer, indent + 1)
+				written += fmt.wprintf(writer, "[Attr] %v: %v\n", attr.key, attr.val)
+			}
+
+			for child in element.children {
+				child_written, _ := print_element(writer, doc, child, indent + 1)
+				written += child_written
+			}
+		} else if element.kind == .Comment {
+			written += fmt.wprintf(writer, "[COMMENT] %v\n", element.value)
+		}
+
+		return written, .None
+	}
+
+	buf: strings.Builder
+	defer strings.destroy_builder(&buf)
+
+	print(strings.to_writer(&buf), doc)
+
+	// The builder is destroyed on return, so hand back a clone of its contents.
+	return strings.clone(strings.to_string(buf))
+}
+
+/*
+	Runs every entry of `TESTS`: parses the file, renders the result with `doc_to_string`,
+	and checks both the returned error and the CRC-32 of the rendered text against the entry.
+
+	If either check fails, at most the first 2048 bytes of the rendered text are printed.
+*/
+@test
+run_tests :: proc(t: ^testing.T) {
+	for test in TESTS {
+		path := test_file_path(test.filename)
+		log(t, fmt.tprintf("Trying to parse %v", path))
+
+		doc, err := xml.parse(path, test.options, Silent)
+		defer xml.destroy(doc)
+
+		// The rendered tree is a fresh allocation; free it at the end of this iteration.
+		rendered       := doc_to_string(doc)
+		rendered_bytes := transmute([]u8)rendered
+		defer delete(rendered_bytes)
+
+		checksum := hash.crc32(rendered_bytes)
+
+		err_matches := err == test.err
+		expect(t, err_matches, fmt.tprintf("Expected return value %v, got %v", test.err, err))
+
+		crc_matches := checksum == test.crc32
+		expect(t, crc_matches, fmt.tprintf("Expected CRC 0x%08x, got 0x%08x, with options %v", test.crc32, checksum, test.options))
+
+		if !(err_matches && crc_matches) {
+			/*
+				Don't fully print big trees.
+			*/
+			preview := rendered[:min(2_048, len(rendered))]
+			fmt.println(preview)
+		}
+	}
+}
+
+/*
+	Entry point used when the suite is built as a regular program instead of through the test runner.
+
+	Runs `run_tests` under a tracking allocator, reports every allocation that is still live
+	afterwards as a failed check, prints the score, and exits with status 1 if any check failed
+	so that calling scripts can detect the failure.
+*/
+main :: proc() {
+	t := testing.T{}
+
+	track: mem.Tracking_Allocator
+	mem.tracking_allocator_init(&track, context.allocator)
+	context.allocator = mem.tracking_allocator(&track)
+
+	run_tests(&t)
+
+	// Whatever is still in the allocation map was never freed; iterating an empty map is a no-op.
+	for _, v in track.allocation_map {
+		err_msg := fmt.tprintf("%v Leaked %v bytes.", v.location, v.size)
+		expect(&t, false, err_msg)
+	}
+
+	fmt.printf("\n%v/%v tests successful.\n", TEST_count - TEST_fail, TEST_count)
+
+	// Without a non-zero exit status a failing run is indistinguishable from a passing one to the caller.
+	if TEST_fail > 0 {
+		os.exit(1)
+	}
+}