Merge pull request #1342 from Kelimion/xml

Initial version of `core:encoding/xml`.
This commit is contained in:
Jeroen van Rijn
2022-04-28 15:54:28 +02:00
committed by GitHub
19 changed files with 10193 additions and 7 deletions
+21
View File
@@ -0,0 +1,21 @@
# License
By obtaining, using and/or copying this work, you (the licensee) agree that you have read, understood, and will comply with the following terms and conditions.
Permission to copy, modify, and distribute this software and its documentation, with or without modification, for any purpose and without fee or royalty is hereby granted, provided that you include the following on ALL copies of the software and documentation or portions thereof, including modifications:
The full text of this NOTICE in a location viewable to users of the redistributed or derivative work.
Any pre-existing intellectual property disclaimers, notices, or terms and conditions. If none exist, the W3C Software Short Notice should be included (hypertext is preferred, text is permitted) within the body of any redistributed or derivative code.
Notice of any changes or modifications to the files, including the date changes were made. (We recommend you provide URIs to the location from which the code is derived.)
# Disclaimers
THIS SOFTWARE AND DOCUMENTATION IS PROVIDED "AS IS," AND COPYRIGHT HOLDERS MAKE NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE SOFTWARE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS.
COPYRIGHT HOLDERS WILL NOT BE LIABLE FOR ANY DIRECT, INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF ANY USE OF THE SOFTWARE OR DOCUMENTATION.
The name and trademarks of copyright holders may NOT be used in advertising or publicity pertaining to the software without specific, written prior permission. Title to copyright in this software and any associated documentation will at all times remain with copyright holders.
# Notes
This version: http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
+374
View File
@@ -0,0 +1,374 @@
package unicode_entity
/*
A unicode entity encoder/decoder
Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
This code has several procedures to map unicode runes to/from different textual encodings.
- SGML/XML/HTML entity
-- &#<decimal>;
-- &#x<hexadecimal>;
-- &<entity name>; (If the lookup tables are compiled in).
Reference: https://www.w3.org/2003/entities/2007xml/unicode.xml
- URL encode / decode %hex entity
Reference: https://datatracker.ietf.org/doc/html/rfc3986/#section-2.1
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
import "core:unicode/utf8"
import "core:unicode"
import "core:strings"
// Largest Unicode code point as an `int`; numeric entities that exceed it are rejected by `xml_decode_entity`.
MAX_RUNE_CODEPOINT :: int(unicode.MAX_RUNE)
// Local shorthands for the `core:strings` builder writers used throughout this file.
write_rune :: strings.write_rune_builder
write_string :: strings.write_string_builder
// Errors reported while scanning and decoding entity-encoded text.
Error :: enum u8 {
None = 0,
// `advance` was called with a nil tokenizer.
Tokenizer_Is_Nil,
// The input contains a NUL byte.
Illegal_NUL_Character,
// The input contains a byte sequence that does not decode as UTF-8.
Illegal_UTF_Encoding,
// A byte order mark was found at an offset other than 0.
Illegal_BOM,
// A `<![CDATA[` section was opened, but the input ended before `]]>` was found.
CDATA_Not_Terminated,
// A `<!--` comment was opened, but the input ended before `-->` was found.
Comment_Not_Terminated,
// An `&` was seen, but the input ended before a closing `;` was found.
Invalid_Entity_Encoding,
}
// Minimal rune-at-a-time cursor over `src`, driven by `advance`.
Tokenizer :: struct {
// Most recently read rune; `advance` sets it to -1 once the input is exhausted.
r: rune,
// Width in bytes of `r`.
w: int,
// The text being scanned.
src: string,
// Byte offset of `r` within `src`.
offset: int,
// Byte offset of the next rune `advance` will read.
read_offset: int,
}
// Delimiters that `decode_xml` recognises when it encounters `<` or `]`.
CDATA_START :: "<![CDATA["
CDATA_END :: "]]>"
COMMENT_START :: "<!--"
COMMENT_END :: "-->"
/*
Default: CDATA and comments are passed through unchanged.
*/
// Individual switches for `decode_xml`; combine them in an `XML_Decode_Options` bit set.
XML_Decode_Option :: enum u8 {
/*
Do not decode & entities. It decodes by default.
If given, overrides `Decode_CDATA`.
*/
No_Entity_Decode,
/*
CDATA is unboxed, i.e. the `<![CDATA[` and `]]>` wrappers are removed from the output.
*/
Unbox_CDATA,
/*
Unboxed CDATA is decoded as well.
Ignored if `.Unbox_CDATA` is not given.
*/
Decode_CDATA,
/*
Comments are stripped from the output instead of being passed through.
*/
Comment_Strip,
}
// Set of `XML_Decode_Option` flags; the empty set selects the defaults described above.
XML_Decode_Options :: bit_set[XML_Decode_Option; u8]
/*
	Decode a string that may include SGML/XML/HTML entities.

	Inputs:
	- input:     Text to decode, e.g. the body of an XML tag.
	- options:   See `XML_Decode_Option`. By default entities are decoded, and CDATA sections
	             and comments are passed through unchanged.
	- allocator: Used for the scratch builder and for the returned string.

	Returns:
	- decoded: A newly allocated string. The caller has to free the result.
	- err:     `.None` on success, otherwise the first tokenizer error or an unterminated CDATA/comment error.

	Entities that cannot be decoded (unknown name, malformed number, missing `;`) are passed through as-is.
*/
decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator := context.allocator) -> (decoded: string, err: Error) {
	context.allocator = allocator

	if len(input) == 0 { return "", .None }

	builder := strings.make_builder()
	defer strings.destroy_builder(&builder)

	t := Tokenizer{src=input}

	/*
		`in_data` is only true while we're inside a CDATA section that is being unboxed _and_ decoded.
		Every other CDATA section or comment is consumed in one go by `_handle_xml_special`.
	*/
	in_data := false

	loop: for {
		advance(&t) or_return
		if t.r < 0 { break loop }

		switch t.r {
		case '<':
			/*
				Might be the start of a CDATA tag or comment.

				We don't need to check if we need to write a `<`, because if it isn't CDATA or a comment,
				it couldn't have been part of an XML tag body to be decoded here.

				Keep in mind that we could already *be* inside a CDATA tag.
				If so, write `<` as a literal and continue.
			*/
			if in_data {
				write_rune(&builder, '<')
				continue
			}
			in_data = _handle_xml_special(&t, &builder, options) or_return

		case ']':
			if !in_data {
				write_rune(&builder, ']')
				continue
			}

			/*
				We're unboxing _and_ decoding CDATA, so check for the end tag.
				`has_prefix` is bounds-safe, so a `]]>` at the very end of the input is recognised too.
			*/
			if strings.has_prefix(t.src[t.offset:], CDATA_END) {
				in_data = false
				// `read_offset` is already one past the first `]`; skip the remaining `]>`.
				t.read_offset += len(CDATA_END) - 1
			} else {
				// A lone `]` inside CDATA is ordinary content and must not be dropped.
				write_rune(&builder, ']')
			}

		case:
			if in_data && .Decode_CDATA not_in options {
				/*
					Unboxed, but undecoded.
				*/
				write_rune(&builder, t.r)
				continue
			}

			if t.r != '&' {
				write_rune(&builder, t.r)
				continue
			}

			entity, entity_err := _extract_xml_entity(&t)
			if entity_err != .None {
				/*
					We read to the end of the string without closing the entity.
					`entity` is the remaining input, including the `&`. Pass through as-is.
				*/
				write_string(&builder, entity)
				continue
			}

			if .No_Entity_Decode not_in options {
				if decoded_rune, ok := xml_decode_entity(entity); ok {
					write_rune(&builder, decoded_rune)
					continue
				}
			}

			/*
				Literal passthrough because the decode failed or we want entities not decoded.
			*/
			write_string(&builder, "&")
			write_string(&builder, entity)
			write_string(&builder, ";")
		}
	}

	return strings.clone(strings.to_string(builder), allocator), err
}
/*
	Read the next rune of `t.src` into `t.r` (and its byte width into `t.w`).

	At the end of the input `t.r` is set to -1, `t.offset` to `len(t.src)`, and `.None` is returned.
	On a NUL byte, an undecodable UTF-8 sequence, or a byte order mark past offset 0, the matching
	error is returned and `read_offset` is left untouched.
*/
advance :: proc(t: ^Tokenizer) -> (err: Error) {
	if t == nil {
		return .Tokenizer_Is_Nil
	}

	// Nothing left to read: park the cursor at the end and signal it with a negative rune.
	if t.read_offset >= len(t.src) {
		t.offset = len(t.src)
		t.r      = -1
		return .None
	}

	t.offset = t.read_offset

	// Optimistically treat the next byte as a single-byte rune. Bounds were checked above.
	#no_bounds_check {
		t.r, t.w = rune(t.src[t.read_offset]), 1
	}

	if t.r == 0 {
		return .Illegal_NUL_Character
	}

	if t.r >= utf8.RUNE_SELF {
		// Multi-byte sequence: decode it properly.
		t.r, t.w = utf8.decode_rune_in_string(t.src[t.read_offset:])

		if t.r == utf8.RUNE_ERROR && t.w == 1 {
			return .Illegal_UTF_Encoding
		}
		if t.r == utf8.RUNE_BOM && t.offset > 0 {
			return .Illegal_BOM
		}
	}

	t.read_offset += t.w
	return .None
}
/*
	Decode the text between `&` and `;` of an entity reference to a rune.

	Inputs:
	- entity: The entity without its `&` and `;` delimiters, e.g. `#32`, `#x20` or `copy`.

	Returns:
	- decoded: The rune the entity stands for, or -1 if a numeric entity could not be decoded.
	- ok:      false if the entity is empty, malformed, out of range, or an unknown name.

	Numeric references need at least one digit, so `#` and `#x` on their own are rejected.
	Anything not starting with `#` is looked up with `named_xml_entity_to_rune`.
*/
xml_decode_entity :: proc(entity: string) -> (decoded: rune, ok: bool) {
	entity := entity
	if len(entity) == 0 { return -1, false }

	if entity[0] != '#' {
		/*
			Named entity.
		*/
		return named_xml_entity_to_rune(entity)
	}

	// Numeric entity: strip the `#`.
	entity = entity[1:]
	if len(entity) == 0 { return -1, false }

	base := 10
	if entity[0] == 'x' || entity[0] == 'X' {
		base   = 16
		entity = entity[1:]

		// `&#x;` has no digits. Without this check it would decode to rune 0.
		if len(entity) == 0 { return -1, false }
	}

	val := 0
	for i in 0..<len(entity) {
		r := entity[i]

		switch r {
		case '0'..='9':
			val *= base
			val += int(r - '0')

		case 'a'..='f':
			if base == 10 { return -1, false }
			val *= base
			val += int(r - 'a' + 10)

		case 'A'..='F':
			if base == 10 { return -1, false }
			val *= base
			val += int(r - 'A' + 10)

		case:
			return -1, false
		}

		// Checked after every digit, so `val` can never overflow on long inputs.
		if val > MAX_RUNE_CODEPOINT { return -1, false }
	}
	return rune(val), true
}
/*
Private XML helper to extract `&<stuff>;` entity.
*/
@(private="file")
_extract_xml_entity :: proc(t: ^Tokenizer) -> (entity: string, err: Error) {
	assert(t != nil && t.r == '&')

	/*
		We only look for `;`, which is a single ASCII byte, so a plain byte search is enough
		even if the text in between contains multi-byte runes.
	*/
	semicolon := strings.index_byte(t.src[t.read_offset:], ';')

	if semicolon < 0 {
		// No terminator: consume the rest of the input and hand it back, `&` included.
		t.read_offset = len(t.src)
		return string(t.src[t.offset : t.read_offset]), .Invalid_Entity_Encoding
	}

	// Absolute position of the `;`. Resume scanning right after it.
	terminator := t.read_offset + semicolon
	t.read_offset = terminator + 1

	// The entity is what sits between the `&` and the `;`.
	return string(t.src[t.offset + 1 : terminator]), .None
}
/*
Private XML helper for CDATA and comments.
*/
/*
	Called when the tokenizer sits on a `<`. Handles a CDATA section or comment starting there.

	Inputs:
	- t:       Tokenizer whose current rune is `<`.
	- builder: Receives whatever should end up in the decoded output.
	- options: See `XML_Decode_Option`.

	Returns:
	- in_data: true if a CDATA section was opened that the caller has to unbox _and_ decode itself.
	           It is also true when `err` reports an unterminated section or comment.
	- err:     `.CDATA_Not_Terminated`, `.Comment_Not_Terminated`, or an error from `advance`.

	If the `<` starts neither a CDATA section nor a comment, nothing is written and `false, .None` is returned.
*/
@(private="file")
_handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: XML_Decode_Options) -> (in_data: bool, err: Error) {
	assert(t != nil && t.r == '<')

	// `has_prefix` is bounds-safe, so short inputs and delimiters at the very end are handled correctly.
	rest := t.src[t.offset:]

	if strings.has_prefix(rest, CDATA_START) {
		// `read_offset` is already one past the `<`; skip the remainder of the start tag.
		t.read_offset += len(CDATA_START) - 1

		if .Unbox_CDATA in options && .Decode_CDATA in options {
			/*
				We're unboxing _and_ decoding CDATA. The caller scans its contents.
			*/
			return true, .None
		}

		/*
			CDATA is passed through, with or without its wrapper. Scan until its end.
		*/
		start := t.offset

		for {
			advance(t) or_return
			if t.r < 0 { return true, .CDATA_Not_Terminated }

			if strings.has_prefix(t.src[t.offset:], CDATA_END) {
				t.read_offset += len(CDATA_END) - 1

				cdata := string(t.src[start : t.read_offset])
				if .Unbox_CDATA in options {
					cdata = cdata[len(CDATA_START):]
					cdata = cdata[:len(cdata) - len(CDATA_END)]
				}
				write_string(builder, cdata)
				return false, .None
			}
		}

	} else if strings.has_prefix(rest, COMMENT_START) {
		// As above: only skip what follows the `<` we've already consumed, so `<!---->` still terminates.
		t.read_offset += len(COMMENT_START) - 1

		/*
			Comment is passed through by default. Scan until its end.
		*/
		start := t.offset

		for {
			advance(t) or_return
			if t.r < 0 { return true, .Comment_Not_Terminated }

			if strings.has_prefix(t.src[t.offset:], COMMENT_END) {
				t.read_offset += len(COMMENT_END) - 1

				if .Comment_Strip not_in options {
					write_string(builder, string(t.src[start : t.read_offset]))
				}
				return false, .None
			}
		}
	}
	return false, .None
}
@@ -0,0 +1,76 @@
package unicode_entity_example
import "core:encoding/xml"
import "core:strings"
import "core:mem"
import "core:fmt"
import "core:time"
// Render `doc` with `xml.print` into a temporary builder and write the result to stdout.
doc_print :: proc(doc: ^xml.Document) {
	builder: strings.Builder
	defer strings.destroy_builder(&builder)

	xml.print(strings.to_writer(&builder), doc)
	fmt.println(strings.to_string(builder))
}
// Parse the embedded `unicode.xml` test asset, print the resulting document, and report timing and the parse error.
_entities :: proc() {
	DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")

	OPTIONS :: xml.Options{
		flags            = { .Ignore_Unsupported, .Intern_Comments },
		expected_doctype = "",
	}

	document:  ^xml.Document
	parse_err: xml.Error
	elapsed:   time.Duration

	{
		// Only the parse itself is timed; the measurement ends with this scope.
		time.SCOPED_TICK_DURATION(&elapsed)
		document, parse_err = xml.parse(DOC, OPTIONS)
	}
	defer xml.destroy(document)

	doc_print(document)

	elapsed_ms := time.duration_milliseconds(elapsed)
	mib_per_s  := (f64(1000.0) / elapsed_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0

	fmt.printf("Parse time: %.2f ms (%.2f MiB/s).\n", elapsed_ms, mib_per_s)
	fmt.printf("Error: %v\n", parse_err)
}
// Parse the embedded `test.html` with CDATA unboxing and entity decoding enabled, then print the document.
_main :: proc() {
	parse_options := xml.Options{
		flags = { .Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA, .Decode_SGML_Entities },
	}

	// The parse error is deliberately ignored here.
	document, _ := xml.parse(#load("test.html"), parse_options)
	defer xml.destroy(document)

	doc_print(document)
}
// Run the entity example under a tracking allocator and list any allocations that were never freed.
main :: proc() {
	tracker: mem.Tracking_Allocator
	mem.tracking_allocator_init(&tracker, context.allocator)
	context.allocator = mem.tracking_allocator(&tracker)

	// _main()
	_entities()

	// Nothing outstanding means nothing to report.
	if len(tracker.allocation_map) == 0 {
		return
	}

	fmt.println()
	for _, leak in tracker.allocation_map {
		fmt.printf("%v Leaked %v bytes.\n", leak.location, leak.size)
	}
}
+28
View File
@@ -0,0 +1,28 @@
<html>
<head>
<title>Entity Reference Test</title>
<style>
body {
background: #000; color: #eee;
width: 40%;
margin-left: auto;
margin-right: auto;
font-size: 14pt;
}
</style>
</head>
<body>
<h1>Entity Reference Test</h1>
<div id="test_cdata_in_comment" foo="">
Foozle]!&#32;&copy;&#x20;<!-- <![CDATA[&#32;&reg;&#x20;]]> -->42&;1234&
</div>
<!-- EXPECTED: Foozle]! © 42&;1234& -->
<div id="test_cdata_unwrap_and_passthrough">
Foozle]!&#32;&copy;&#x20;<![CDATA[BOX&#32;&reg;&#x20;/BOX]]>42&;1234&
</div>
<!-- EXPECTED: Foozle]! © BOX ® /BOX42&;1234& -->
<div>
&verbar; &vert; &VerticalLine; &fjlig; &grave; &bsol; &reg; &rhov; &CounterClockwiseContourIntegral; &bsemi;
</div>
</body>
</html>
File diff suppressed because it is too large Load Diff
+86
View File
@@ -0,0 +1,86 @@
/*
An XML 1.0 / 1.1 parser
Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
package xml
import "core:io"
import "core:fmt"
/*
Just for debug purposes.
*/
/*
	Write a human-readable dump of `doc` to `writer`: prolog attributes, encoding, DOCTYPE,
	pre-root comments, and finally the element tree starting at element 0.

	Returns:
	- written: The sum of the counts returned by every `fmt` write call, including the element tree.
	- err:     The error reported by `print_element`, otherwise `.None`.

	A nil `doc` writes nothing and returns `0, .None`.
*/
print :: proc(writer: io.Writer, doc: ^Document) -> (written: int, err: io.Error) {
	if doc == nil { return }
	using fmt

	written += wprintf(writer, "[XML Prolog]\n")

	for attr in doc.prolog {
		written += wprintf(writer, "\t%v: %v\n", attr.key, attr.val)
	}

	written += wprintf(writer, "[Encoding] %v\n", doc.encoding)

	if len(doc.doctype.ident) > 0 {
		written += wprintf(writer, "[DOCTYPE] %v\n", doc.doctype.ident)

		if len(doc.doctype.rest) > 0 {
			written += wprintf(writer, "\t%v\n", doc.doctype.rest)
		}
	}

	for comment in doc.comments {
		written += wprintf(writer, "[Pre-root comment] %v\n", comment)
	}

	if len(doc.elements) > 0 {
		written += wprintln(writer, " --- ")

		tree_written := print_element(writer, doc, 0) or_return
		written += tree_written

		written += wprintln(writer, " --- ")
	}

	return written, .None
}
/*
	Write element `element_id` of `doc` and, recursively, its children to `writer`.

	Inputs:
	- indent: Nesting depth. A line is prefixed with `indent + 1` tabs.

	Returns:
	- written: The sum of the counts returned by every `fmt` write call for this element and its descendants.
	- err:     The error of the first child that failed to print, otherwise `.None`.
*/
print_element :: proc(writer: io.Writer, doc: ^Document, element_id: Element_ID, indent := 0) -> (written: int, err: io.Error) {
	using fmt

	// Writes `indent + 1` tabs (the range is inclusive) and returns the count reported by `fmt`.
	tab :: proc(writer: io.Writer, indent: int) -> (written: int) {
		for _ in 0..=indent {
			written += fmt.wprintf(writer, "\t")
		}
		return
	}

	written += tab(writer, indent)

	element := doc.elements[element_id]

	if element.kind == .Element {
		written += wprintf(writer, "<%v>\n", element.ident)

		if len(element.value) > 0 {
			written += tab(writer, indent + 1)
			written += wprintf(writer, "[Value] %v\n", element.value)
		}

		for attr in element.attribs {
			written += tab(writer, indent + 1)
			written += wprintf(writer, "[Attr] %v: %v\n", attr.key, attr.val)
		}

		for child in element.children {
			child_written := print_element(writer, doc, child, indent + 1) or_return
			written += child_written
		}
	} else if element.kind == .Comment {
		written += wprintf(writer, "[COMMENT] %v\n", element.value)
	}

	return written, .None
}
+112
View File
@@ -0,0 +1,112 @@
package xml_example
import "core:encoding/xml"
import "core:mem"
import "core:fmt"
import "core:time"
import "core:strings"
import "core:hash"
// Number of times `example` parses the document when measuring parse speed.
N :: 1
/*
	Parse the embedded `unicode.xml` test asset `N` times, report the fastest, slowest and average
	parse speed, then locate the top-level `<charlist>` tag and print a CRC32 of the printed tree.

	All parsed documents are destroyed exactly once, by the `defer` below, on every exit path.
*/
example :: proc() {
	using fmt

	docs:  [N]^xml.Document
	errs:  [N]xml.Error
	times: [N]time.Duration

	// Single owner of the cleanup. Destroying the documents again at the end would free them twice.
	defer for round in 0..<N {
		xml.destroy(docs[round])
	}

	DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")
	input := DOC

	for round in 0..<N {
		start := time.tick_now()

		docs[round], errs[round] = xml.parse(input, xml.Options{
			flags={.Ignore_Unsupported},
			expected_doctype = "",
		})

		end := time.tick_now()
		times[round] = time.tick_diff(start, end)
	}

	fastest := time.Duration(max(i64))
	slowest := time.Duration(0)
	total   := time.Duration(0)

	for round in 0..<N {
		fastest = min(fastest, times[round])
		slowest = max(slowest, times[round])
		total  += times[round]
	}

	fastest_ms := time.duration_milliseconds(fastest)
	slowest_ms := time.duration_milliseconds(slowest)
	average_ms := time.duration_milliseconds(time.Duration(f64(total) / f64(N)))

	// Throughput in MiB/s: documents per second times document size.
	fastest_speed := (f64(1000.0) / fastest_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
	slowest_speed := (f64(1000.0) / slowest_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0
	average_speed := (f64(1000.0) / average_ms) * f64(len(DOC)) / 1_024.0 / 1_024.0

	fmt.printf("N = %v\n", N)
	fmt.printf("[Fastest]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), fastest_ms, fastest_speed)
	fmt.printf("[Slowest]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), slowest_ms, slowest_speed)
	fmt.printf("[Average]: %v bytes in %.2f ms (%.2f MiB/s).\n", len(input), average_ms, average_speed)

	if errs[0] != .None {
		printf("Load/Parse error: %v\n", errs[0])
		if errs[0] == .File_Error {
			println("\"unicode.xml\" not found. Did you run \"tests\\download_assets.py\"?")
		}
		return
	}

	charlist, charlist_ok := xml.find_child_by_ident(docs[0], 0, "charlist")
	if !charlist_ok {
		eprintln("Could not locate top-level `<charlist>` tag.")
		return
	}

	printf("Found `<charlist>` with %v children, %v elements total\n", len(docs[0].elements[charlist].children), docs[0].element_count)

	crc32 := doc_hash(docs[0])
	printf("[%v] CRC32: 0x%08x\n", "🎉" if crc32 == 0xcaa042b9 else "🤬", crc32)
}
// Print `doc` into a temporary builder and return the CRC32 of that text; optionally echo the text to stdout.
doc_hash :: proc(doc: ^xml.Document, print := false) -> (crc32: u32) {
	builder: strings.Builder
	defer strings.destroy_builder(&builder)

	xml.print(strings.to_writer(&builder), doc)

	rendered := strings.to_string(builder)
	if print {
		fmt.println(rendered)
	}

	// The hash is computed before the deferred destroy releases the builder's buffer.
	crc32 = hash.crc32(transmute([]u8)rendered)
	return
}
// Run `example` under a tracking allocator, list any allocations that were never freed, then say goodbye.
main :: proc() {
	tracker: mem.Tracking_Allocator
	mem.tracking_allocator_init(&tracker, context.allocator)
	context.allocator = mem.tracking_allocator(&tracker)

	example()

	leaked := len(tracker.allocation_map) > 0
	if leaked {
		fmt.println()
		for _, leak in tracker.allocation_map {
			fmt.printf("%v Leaked %v bytes.\n", leak.location, leak.size)
		}
	}
	fmt.println("Done and cleaned up!")
}
+45
View File
@@ -0,0 +1,45 @@
/*
An XML 1.0 / 1.1 parser
Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
This file contains helper functions.
*/
package xml
// Find parent's nth child with a given ident.
find_child_by_ident :: proc(doc: ^Document, parent_id: Element_ID, ident: string, nth := 0) -> (res: Element_ID, found: bool) {
	parent := doc.elements[parent_id]

	// Number of children with a matching ident seen so far.
	matches_seen := 0

	for candidate_id in parent.children {
		candidate := doc.elements[candidate_id]

		// Comments have no name, so only real elements can match.
		if candidate.kind == .Element && candidate.ident == ident {
			// Return the nth (0-based) match.
			if matches_seen == nth { return candidate_id, true }
			matches_seen += 1
		}
	}
	return 0, false
}
// Find an attribute by key.
find_attribute_val_by_key :: proc(doc: ^Document, parent_id: Element_ID, key: string) -> (val: string, found: bool) {
	for attribute in doc.elements[parent_id].attribs {
		if attribute.key != key { continue }

		// The first match is returned; the original comment notes a key can only occur once per element.
		return attribute.val, true
	}
	return "", false
}
+436
View File
@@ -0,0 +1,436 @@
/*
An XML 1.0 / 1.1 parser
Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
A from-scratch XML implementation, loosely modeled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
package xml
import "core:fmt"
import "core:unicode"
import "core:unicode/utf8"
// Callback invoked by `error` with the position of the problem, a format string and its arguments.
Error_Handler :: #type proc(pos: Pos, fmt: string, args: ..any)
// A single token as produced by `scan`.
Token :: struct {
// What was recognised.
kind: Token_Kind,
// The token's text. For `.String` tokens this is the value scanned after the opening quote.
text: string,
// Position of the token's first character.
pos: Pos,
}
// A location in the input, as computed by `offset_to_pos`.
Pos :: struct {
// The path the tokenizer was initialised with.
file: string,
offset: int, // starting at 0
line: int, // starting at 1
column: int, // starting at 1
}
/*
The kinds of token the tokenizer distinguishes.
NOTE(review): `scan` as visible in this file only emits `Invalid`, `Ident`, `String`, `EOF` and the
single-character kinds `Lt`, `Gt`, `Exclaim`, `Question`, `Eq`, `Hash`, `Slash`, `Dash` and `Colon`.
The remaining kinds are presumably reserved for future use - confirm before relying on them.
*/
Token_Kind :: enum {
Invalid,
Ident,
Literal,
Rune,
String,
Double_Quote, // "
Single_Quote, // '
Colon, // :
Eq, // =
Lt, // <
Gt, // >
Exclaim, // !
Question, // ?
Hash, // #
Slash, // /
Dash, // -
Open_Bracket, // [
Close_Bracket, // ]
EOF,
}
// Delimiters of CDATA sections and comments, as matched by `skip_cdata` and `scan_string`.
CDATA_START :: "<![CDATA["
CDATA_END :: "]]>"
COMMENT_START :: "<!--"
COMMENT_END :: "-->"
// Scanner state for one XML input. Set up with `init`, advanced with `advance_rune` and `scan`.
Tokenizer :: struct {
// Immutable data
path: string, // Reported as `Pos.file` in error positions.
src: string, // The text being tokenized.
err: Error_Handler, // Called by `error`; may be nil, in which case errors are only counted.
// Tokenizing state
ch: rune, // Current rune; -1 once the end of `src` has been reached.
offset: int, // Byte offset of `ch`.
read_offset: int, // Byte offset of the next rune to read.
line_offset: int, // Offset recorded when the current line started; used to compute columns.
line_count: int, // Current line number; `init` sets it to 1 for non-empty input, else 0.
// Mutable data
error_count: int, // Number of times `error` has been called.
}
/*
	Reset `t` to tokenize `src`, reporting errors for `path` through `err`.
	Reads the first rune, and steps over it if it is a byte order mark.
*/
init :: proc(t: ^Tokenizer, src: string, path: string, err: Error_Handler = default_error_handler) {
	// Input and error reporting.
	t.path = path
	t.src  = src
	t.err  = err

	// Fresh scanning state. Line numbers start at 1, but an empty input has no lines.
	t.ch          = ' '
	t.offset      = 0
	t.read_offset = 0
	t.line_offset = 0
	t.error_count = 0
	t.line_count  = 1 if len(src) > 0 else 0

	advance_rune(t)
	if t.ch == utf8.RUNE_BOM {
		advance_rune(t)
	}
}
// Build a `Pos` for `offset`, using the tokenizer's current line number and line start.
@(private)
offset_to_pos :: proc(t: ^Tokenizer, offset: int) -> Pos {
	return Pos{
		file   = t.path,
		offset = offset,
		line   = t.line_count,
		// Columns are 1-based, relative to where the current line began.
		column = offset - t.line_offset + 1,
	}
}
// Default `Error_Handler`: writes `file(line:column) message` followed by a newline to stderr.
default_error_handler :: proc(pos: Pos, msg: string, args: ..any) {
	fmt.eprintf("%s(%d:%d) ", pos.file, pos.line, pos.column)
	fmt.eprintf(msg, ..args)
	fmt.eprintln()
}
// Report an error at `offset` through the tokenizer's handler (if any) and count it.
error :: proc(t: ^Tokenizer, offset: int, msg: string, args: ..any) {
	// Counted whether or not a handler is installed; runs after the handler has been called.
	defer t.error_count += 1

	if t.err == nil {
		return
	}
	t.err(offset_to_pos(t, offset), msg, ..args)
}
/*
Read the next rune of `src` into `ch` and keep the line bookkeeping up to date.
At the end of the input `ch` becomes -1 and `offset` becomes `len(src)`.
Illegal input (NUL, invalid UTF-8, a byte order mark past offset 0) is reported through `error`,
but scanning continues with the rune as read.
*/
@(optimization_mode="speed")
advance_rune :: proc(using t: ^Tokenizer) {
#no_bounds_check {
/*
Already bounds-checked here.
*/
if read_offset < len(src) {
offset = read_offset
// `ch` still holds the previous rune: if that was a newline, a new line starts here.
if ch == '\n' {
line_offset = offset
line_count += 1
}
// Assume a single-byte rune first; decode properly below if it isn't.
r, w := rune(src[read_offset]), 1
switch {
case r == 0:
error(t, t.offset, "illegal character NUL")
case r >= utf8.RUNE_SELF:
r, w = #force_inline utf8.decode_rune_in_string(src[read_offset:])
if r == utf8.RUNE_ERROR && w == 1 {
error(t, t.offset, "illegal UTF-8 encoding")
} else if r == utf8.RUNE_BOM && offset > 0 {
error(t, t.offset, "illegal byte order mark")
}
}
read_offset += w
ch = r
} else {
// End of input.
offset = len(src)
if ch == '\n' {
line_offset = offset
line_count += 1
}
ch = -1
}
}
}
// Return the byte `offset` positions past the next unread byte without consuming it, or 0 past the end of the input.
peek_byte :: proc(t: ^Tokenizer, offset := 0) -> byte {
	index := t.read_offset + offset
	if index >= len(t.src) {
		return 0
	}
	#no_bounds_check return t.src[index]
}
// Advance past any run of spaces, tabs, carriage returns and newlines.
@(optimization_mode="speed")
skip_whitespace :: proc(t: ^Tokenizer) {
	for t.ch == ' ' || t.ch == '\t' || t.ch == '\r' || t.ch == '\n' {
		advance_rune(t)
	}
}
// Report whether `r` may start an identifier: `_`, an ASCII letter, or any Unicode letter.
@(optimization_mode="speed")
is_letter :: proc(r: rune) -> bool {
	// Fast path for the ASCII range.
	is_ascii := r < utf8.RUNE_SELF
	if is_ascii && (r == '_' || ('A' <= r && r <= 'Z') || ('a' <= r && r <= 'z')) {
		return true
	}
	return unicode.is_letter(r)
}
/*
	Report whether `r` may appear inside an identifier:
	`_`, `-`, `:`, ASCII letters and digits, or any Unicode letter or digit.
	The end-of-input marker (-1) is never valid.
*/
is_valid_identifier_rune :: proc(r: rune) -> bool {
	if r < utf8.RUNE_SELF {
		switch r {
		case '_', '-', ':':        return true
		case 'A'..='Z', 'a'..='z': return true
		// Inclusive range spelled `..=`, matching the letter ranges above.
		case '0'..='9':            return true
		case -1:                   return false
		}
	}

	return unicode.is_letter(r) || unicode.is_digit(r)
}
/*
Consume runes for as long as they are valid identifier runes and return the text scanned,
starting at the tokenizer's current offset. Scanning also stops at a second `:`.
*/
scan_identifier :: proc(t: ^Tokenizer) -> string {
offset := t.offset
// Set once the first `:` has been seen.
namespaced := false
for is_valid_identifier_rune(t.ch) {
advance_rune(t)
// Note that this inspects the rune we just advanced *to*.
if t.ch == ':' {
/*
A namespaced attr can have at most two parts, `namespace:ident`.
*/
if namespaced {
break
}
namespaced = true
}
}
return string(t.src[offset : t.offset])
}
/*
A comment ends when we see -->, preceded by a character that's not a dash.
"For compatibility, the string "--" (double-hyphen) must not occur within comments."
See: https://www.w3.org/TR/2006/REC-xml11-20060816/#dt-comment
Thanks to the length (4) of the comment start, we also have enough lookback,
and the peek at the next byte asserts that there's at least one more character
that's a `>`.
*/
/*
Scan to the end of a comment and return the text from the starting offset.
Returns `.Unclosed_Comment` if the input ends first, and `.Invalid_Sequence_In_Comment`
if `--` occurs without being directly followed by `>`.
*/
scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
offset := t.offset
for {
advance_rune(t)
ch := t.ch
if ch < 0 {
error(t, offset, "[parse] Comment was not terminated\n")
return "", .Unclosed_Comment
}
// Previous and current byte form `--`: either the comment ends here, or it is malformed.
if string(t.src[t.offset - 1:][:2]) == "--" {
if peek_byte(t) == '>' {
break
} else {
error(t, t.offset - 1, "Invalid -- sequence in comment.\n")
return "", .Invalid_Sequence_In_Comment
}
}
}
// NOTE(review): `expect` is defined outside this view; presumably it consumes the closing `-` and `>`. Its results are ignored here.
expect(t, .Dash)
expect(t, .Gt)
return string(t.src[offset : t.offset - 1]), .None
}
/*
Skip CDATA
*/
/*
	If a CDATA section starts at the tokenizer's current offset, skip to its end.

	On return `t.ch` is still the first `]` of the closing `]]>`, and `t.read_offset` points at the
	byte directly after the `]]>`. The next `advance_rune` therefore reads the first rune that
	follows the section, so a closing `<` directly after `]]>` is not missed.

	Returns `.Premature_EOF` if the section is never closed, `.None` otherwise, including when
	no CDATA section starts here.
*/
skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
	if t.offset + len(CDATA_START) > len(t.src) {
		/*
			Can't be the start of a CDATA tag.
		*/
		return .None
	}

	if string(t.src[t.offset:][:len(CDATA_START)]) != CDATA_START {
		return .None
	}

	start := t.offset

	// Continue reading at the first byte of the CDATA contents, which may already be the `]` of `]]>`.
	t.read_offset = t.offset + len(CDATA_START)

	for {
		advance_rune(t)
		if t.ch < 0 {
			error(t, start, "[scan_string] CDATA was not terminated\n")
			return .Premature_EOF
		}

		/*
			Scan until the end of a CDATA tag. `<=` so that a `]]>` ending the input is found as well.
		*/
		if t.offset + len(CDATA_END) <= len(t.src) {
			if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
				// Continue reading at the byte directly after `]]>`.
				t.read_offset = t.offset + len(CDATA_END)
				break
			}
		}
	}
	return .None
}
/*
	Scan from the current rune up to, but not including, the `close` rune and return `t.src[offset:]`
	up to that point with trailing whitespace stripped.

	Inputs:
	- offset:        Start of the returned text.
	- close:         Rune that ends the string. `<` for tag bodies, the quote character for attribute values.
	- consume_close: If true, the `close` rune itself is consumed as well.
	- multiline:     If false, a newline before `close` ends the scan with `.Invalid_Tag_Value`.

	CDATA sections and comments inside the string are skipped over, so a `<` inside them doesn't end it.

	Returns `.Premature_EOF` if the input ends before `close` is found.
*/
@(optimization_mode="speed")
scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close := false, multiline := true) -> (value: string, err: Error) {
	err = .None

	loop: for {
		ch := t.ch

		switch ch {
		case -1:
			error(t, t.offset, "[scan_string] Premature end of file.\n")
			return "", .Premature_EOF

		case '<':
			if peek_byte(t) == '!' {
				if peek_byte(t, 1) == '[' {
					/*
						Might be the start of a CDATA tag.
					*/
					skip_cdata(t) or_return
				} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
					/*
						Comment start. Eat comment.
					*/
					t.read_offset += 3
					_ = scan_comment(t) or_return
				}
			}

		case '\n':
			if !multiline {
				// The offending text comes from the document, so it must not be used as the format string.
				error(t, offset, "%s", string(t.src[offset : t.offset]))
				error(t, offset, "[scan_string] Not terminated\n")
				err = .Invalid_Tag_Value
				break loop
			}
		}

		if t.ch == close {
			/*
				If it's not a CDATA or comment, it's the end of this body.
			*/
			break loop
		}
		advance_rune(t)
	}

	/*
		Strip trailing whitespace.
	*/
	lit := string(t.src[offset : t.offset])

	end := len(lit)
	eat: for ; end > 0; end -= 1 {
		ch := lit[end - 1]
		switch ch {
		case ' ', '\t', '\r', '\n':
		case:
			break eat
		}
	}
	lit = lit[:end]

	if consume_close {
		advance_rune(t)
	}

	/*
		TODO: Handle decoding escape characters and unboxing CDATA.
	*/
	return lit, err
}
// Return the next token without consuming it: the complete tokenizer state is restored afterwards.
peek :: proc(t: ^Tokenizer) -> (token: Token) {
	snapshot := t^
	defer t^ = snapshot

	return scan(t)
}
/*
Skip leading whitespace and return the next token.
Identifiers become `.Ident`, quoted values become `.String` (or `.Invalid` if `scan_string` fails),
the listed punctuation maps to its own kind, the end of input to `.EOF`, and anything else to `.Invalid`.
*/
scan :: proc(t: ^Tokenizer) -> Token {
skip_whitespace(t)
offset := t.offset
kind: Token_Kind
err: Error
lit: string
// Position of the token's first character, captured before anything is consumed.
pos := offset_to_pos(t, offset)
switch ch := t.ch; true {
case is_letter(ch):
lit = scan_identifier(t)
kind = .Ident
case:
// Consume the character; `ch` still holds it for the switch below.
advance_rune(t)
switch ch {
case -1:
kind = .EOF
case '<': kind = .Lt
case '>': kind = .Gt
case '!': kind = .Exclaim
case '?': kind = .Question
case '=': kind = .Eq
case '#': kind = .Hash
case '/': kind = .Slash
case '-': kind = .Dash
case ':': kind = .Colon
case '"', '\'':
// Quoted value: scan up to the matching quote, consume it, and don't allow newlines.
kind = .Invalid
lit, err = scan_string(t, t.offset, ch, true, false)
if err == .None {
kind = .String
}
case '\n':
lit = "\n"
case:
kind = .Invalid
}
}
// Anything that didn't produce its own text gets the source text it covers.
if kind != .String && lit == "" {
lit = string(t.src[offset : t.offset])
}
return Token{kind, lit, pos}
}
+709
View File
@@ -0,0 +1,709 @@
/*
An XML 1.0 / 1.1 parser
Copyright 2021-2022 Jeroen van Rijn <nom@duclavier.com>.
Made available under Odin's BSD-3 license.
A from-scratch XML implementation, loosely modelled on the [spec](https://www.w3.org/TR/2006/REC-xml11-20060816).
Features:
- Supports enough of the XML 1.0/1.1 spec to handle the 99.9% of XML documents in common current usage.
- Simple to understand and use. Small.
Caveats:
- We do NOT support HTML in this package, as that may or may not be valid XML.
If it works, great. If it doesn't, that's not considered a bug.
- We do NOT support UTF-16. If you have a UTF-16 XML file, please convert it to UTF-8 first. Also, our condolences.
- <!ELEMENT and <!ATTLIST are not supported, and will be either ignored or return an error depending on the parser options.
MAYBE:
- XML writer?
- Serialize/deserialize Odin types?
List of contributors:
Jeroen van Rijn: Initial implementation.
*/
package xml
// An XML 1.0 / 1.1 parser
import "core:bytes"
import "core:encoding/entity"
import "core:intrinsics"
import "core:mem"
import "core:os"
import "core:strings"
// Branch prediction hint, used to mark the most common token kind in the parse loop.
likely :: intrinsics.expect
// Options used by `parse_from_slice` when the caller doesn't supply any.
DEFAULT_Options :: Options{
flags = {
.Ignore_Unsupported,
},
expected_doctype = "",
}
// Individual parser switches; combine them in an `Option_Flags` bit set.
Option_Flag :: enum {
/*
If the caller says that input may be modified, we can perform in-situ parsing.
If this flag isn't provided, the XML parser first duplicates the input so that it can.
*/
Input_May_Be_Modified,
/*
Document MUST start with `<?xml` prolog.
*/
Must_Have_Prolog,
/*
Document MUST have a `<!DOCTYPE`.
*/
Must_Have_DocType,
/*
By default we skip comments. Use this option to intern a comment on a parented Element.
*/
Intern_Comments,
/*
How to handle unsupported parts of the specification, like <! other than <!DOCTYPE and <![CDATA[
With `Error_on_Unsupported` the parser reports `.Unhandled_Bang`; otherwise the element is skipped.
*/
Error_on_Unsupported,
Ignore_Unsupported,
/*
By default CDATA tags are passed-through as-is.
This option unwraps them when encountered.
*/
Unbox_CDATA,
/*
By default SGML entities like `&gt;`, `&#32;` and `&#x20;` are passed-through as-is.
This option decodes them when encountered.
*/
Decode_SGML_Entities,
/*
If a tag body has a comment, it will be stripped unless this option is given.
*/
Keep_Tag_Body_Comments,
}
// Set of `Option_Flag` values, as stored in `Options.flags`.
Option_Flags :: bit_set[Option_Flag; u16]
// A parsed XML document. Elements live in one flat array and refer to each other by `Element_ID`.
Document :: struct {
// All elements and interned comments. The root element has ID 0.
elements: [dynamic]Element,
// NOTE(review): presumably the number of entries of `elements` in use; it is maintained outside this view.
element_count: Element_ID,
// Attributes of the `<?xml` prolog, if there was one.
prolog: Attributes,
encoding: Encoding,
doctype: struct {
/*
We only scan the <!DOCTYPE IDENT part and skip the rest.
*/
ident: string,
rest: string,
},
/*
If we encounter comments before the root node, and the option to intern comments is given, this is where they'll live.
Otherwise they'll be in the element tree.
*/
comments: [dynamic]string,
/*
Internal
*/
tokenizer: ^Tokenizer,
allocator: mem.Allocator,
/*
Input. Either the original buffer, or a copy if `.Input_May_Be_Modified` isn't specified.
*/
input: []u8,
strings_to_free: [dynamic]string,
}
// One node of the document tree: either a real element or an interned comment.
Element :: struct {
// Tag name. Left empty for comments.
ident: string,
// For comments, the comment text; printed as `[Value]` for elements that have one.
value: string,
attribs: Attributes,
// Distinguishes real elements from interned comments.
kind: enum {
Element = 0,
Comment,
},
// ID of the enclosing element. The root element is its own parent.
parent: Element_ID,
// IDs of child elements and interned comments, in document order.
children: [dynamic]Element_ID,
}
// A single `key="val"` attribute.
Attr :: struct {
key: string,
val: string,
}
// The attributes of an element or of the prolog.
Attributes :: [dynamic]Attr
// Parser configuration, see `DEFAULT_Options`.
Options :: struct {
flags: Option_Flags,
// If non-empty, the DOCTYPE (if present) and the root tag have to match this.
expected_doctype: string,
}
// Document encodings the parser knows by name; stored in `Document.encoding`.
Encoding :: enum {
Unknown,
UTF_8,
ISO_8859_1,
/*
Aliases
*/
LATIN_1 = ISO_8859_1,
}
// Errors returned by the XML tokenizer and parser.
Error :: enum {
/*
General return values.
*/
None = 0,
General_Error,
Unexpected_Token,
Invalid_Token,
/*
Couldn't find, open or read file.
*/
File_Error,
/*
File too short.
Also returned for an unterminated CDATA section or tag body.
*/
Premature_EOF,
/*
XML-specific errors.
*/
No_Prolog,
Invalid_Prolog,
Too_Many_Prologs,
No_DocType,
Too_Many_DocTypes,
DocType_Must_Preceed_Elements,
/*
If a DOCTYPE is present _or_ the caller
asked for a specific DOCTYPE and the DOCTYPE
and root tag don't match, we return `.Invalid_DocType`.
*/
Invalid_DocType,
Invalid_Tag_Value,
Mismatched_Closing_Tag,
Unclosed_Comment,
Comment_Before_Root_Element,
Invalid_Sequence_In_Comment,
Unsupported_Version,
Unsupported_Encoding,
/*
<!FOO are usually skipped.
Returned instead when `.Error_on_Unsupported` is set.
*/
Unhandled_Bang,
Duplicate_Attribute,
Conflicting_Options,
}
/*
Implementation starts here.
*/
parse_from_slice :: proc(data: []u8, options := DEFAULT_Options, path := "", error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
data := data
context.allocator = allocator
opts := validate_options(options) or_return
/*
If `.Input_May_Be_Modified` is not specified, we duplicate the input so that we can modify it in-place.
*/
if .Input_May_Be_Modified not_in opts.flags {
data = bytes.clone(data)
}
t := &Tokenizer{}
init(t, string(data), path, error_handler)
doc = new(Document)
doc.allocator = allocator
doc.tokenizer = t
doc.input = data
doc.elements = make([dynamic]Element, 1024, 1024, allocator)
// strings.intern_init(&doc.intern, allocator, allocator)
err = .Unexpected_Token
element, parent: Element_ID
tag_is_open := false
first_element := true
open: Token
/*
If a DOCTYPE is present, the root tag has to match.
If an expected DOCTYPE is given in options (i.e. it's non-empty), the DOCTYPE (if present) and root tag have to match.
*/
expected_doctype := options.expected_doctype
loop: for {
skip_whitespace(t)
// NOTE(Jeroen): This is faster as a switch.
switch t.ch {
case '<':
/*
Consume peeked `<`
*/
advance_rune(t)
open = scan(t)
// NOTE(Jeroen): We're not using a switch because this if-else chain ordered by likelihood is 2.5% faster at -o:size and -o:speed.
if likely(open.kind, Token_Kind.Ident) == .Ident {
/*
e.g. <odin - Start of new element.
*/
element = new_element(doc)
tag_is_open = true
if first_element {
/*
First element.
*/
parent = element
first_element = false
} else {
append(&doc.elements[parent].children, element)
}
doc.elements[element].parent = parent
doc.elements[element].ident = open.text
parse_attributes(doc, &doc.elements[element].attribs) or_return
/*
If a DOCTYPE is present _or_ the caller
asked for a specific DOCTYPE and the DOCTYPE
and root tag don't match, we return `.Invalid_DocType`.
*/
if element == 0 { // Root tag?
if len(expected_doctype) > 0 && expected_doctype != open.text {
error(t, t.offset, "Root Tag doesn't match DOCTYPE. Expected: %v, got: %v\n", expected_doctype, open.text)
return doc, .Invalid_DocType
}
}
/*
One of these should follow:
- `>`, which means we've just opened this tag and expect a later element to close it.
- `/>`, which means this is an 'empty' or self-closing tag.
*/
end_token := scan(t)
#partial switch end_token.kind {
case .Gt:
/*
We're now the new parent.
*/
parent = element
case .Slash:
/*
Empty tag. Close it.
*/
expect(t, .Gt) or_return
parent = doc.elements[element].parent
element = parent
tag_is_open = false
case:
error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
return
}
} else if open.kind == .Slash {
/*
Close tag.
*/
ident := expect(t, .Ident) or_return
_ = expect(t, .Gt) or_return
if doc.elements[element].ident != ident.text {
error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
return doc, .Mismatched_Closing_Tag
}
parent = doc.elements[element].parent
element = parent
tag_is_open = false
} else if open.kind == .Exclaim {
/*
<!
*/
next := scan(t)
#partial switch next.kind {
case .Ident:
switch next.text {
case "DOCTYPE":
if len(doc.doctype.ident) > 0 {
return doc, .Too_Many_DocTypes
}
if doc.element_count > 0 {
return doc, .DocType_Must_Preceed_Elements
}
parse_doctype(doc) or_return
if len(expected_doctype) > 0 && expected_doctype != doc.doctype.ident {
error(t, t.offset, "Invalid DOCTYPE. Expected: %v, got: %v\n", expected_doctype, doc.doctype.ident)
return doc, .Invalid_DocType
}
expected_doctype = doc.doctype.ident
case:
if .Error_on_Unsupported in opts.flags {
error(t, t.offset, "Unhandled: <!%v\n", next.text)
return doc, .Unhandled_Bang
}
skip_element(t) or_return
}
case .Dash:
/*
Comment: <!-- -->.
The grammar does not allow a comment to end in --->
*/
expect(t, .Dash)
comment := scan_comment(t) or_return
if .Intern_Comments in opts.flags {
if len(doc.elements) == 0 {
append(&doc.comments, comment)
} else {
el := new_element(doc)
doc.elements[el].parent = element
doc.elements[el].kind = .Comment
doc.elements[el].value = comment
append(&doc.elements[element].children, el)
}
}
case:
error(t, t.offset, "Invalid Token after <!. Expected .Ident, got %#v\n", next)
return
}
} else if open.kind == .Question {
/*
<?xml
*/
next := scan(t)
#partial switch next.kind {
case .Ident:
if len(next.text) == 3 && strings.to_lower(next.text, context.temp_allocator) == "xml" {
parse_prolog(doc) or_return
} else if len(doc.prolog) > 0 {
/*
We've already seen a prolog.
*/
return doc, .Too_Many_Prologs
} else {
/*
Could be `<?xml-stylesheet`, etc. Ignore it.
*/
skip_element(t) or_return
}
case:
error(t, t.offset, "Expected \"<?xml\", got \"<?%v\".", next.text)
return
}
} else {
error(t, t.offset, "Invalid Token after <: %#v\n", open)
return
}
case -1:
/*
End of file.
*/
if tag_is_open {
return doc, .Premature_EOF
}
break loop
case:
/*
This should be a tag's body text.
*/
body_text := scan_string(t, t.offset) or_return
needs_processing := .Unbox_CDATA in opts.flags
needs_processing |= .Decode_SGML_Entities in opts.flags
if !needs_processing {
doc.elements[element].value = body_text
continue
}
decode_opts := entity.XML_Decode_Options{}
if .Keep_Tag_Body_Comments not_in opts.flags {
decode_opts += { .Comment_Strip }
}
if .Decode_SGML_Entities not_in opts.flags {
decode_opts += { .No_Entity_Decode }
}
if .Unbox_CDATA in opts.flags {
decode_opts += { .Unbox_CDATA }
if .Decode_SGML_Entities in opts.flags {
decode_opts += { .Decode_CDATA }
}
}
decoded, decode_err := entity.decode_xml(body_text, decode_opts)
if decode_err == .None {
doc.elements[element].value = decoded
append(&doc.strings_to_free, decoded)
} else {
doc.elements[element].value = body_text
}
}
}
if .Must_Have_Prolog in opts.flags && len(doc.prolog) == 0 {
return doc, .No_Prolog
}
if .Must_Have_DocType in opts.flags && len(doc.doctype.ident) == 0 {
return doc, .No_DocType
}
resize(&doc.elements, int(doc.element_count))
return doc, .None
}
/*
	Reads `filename` in full and hands its contents to `parse_from_slice`.

	Inputs:
		filename      - path of the file to read.
		options       - parser options; `.Input_May_Be_Modified` is added to a local copy before parsing.
		error_handler - forwarded unchanged to `parse_from_slice`.
		allocator     - installed as `context.allocator` and forwarded to `parse_from_slice`.

	Returns:
		`{}, .File_Error` when the file cannot be read, otherwise whatever `parse_from_slice` returns.
*/
parse_from_file :: proc(filename: string, options := DEFAULT_Options, error_handler := default_error_handler, allocator := context.allocator) -> (doc: ^Document, err: Error) {
	context.allocator = allocator

	contents, read_ok := os.read_entire_file(filename)
	if !read_ok {
		return {}, .File_Error
	}

	// The buffer was read by this procedure, so the parser is told it may modify it.
	file_options := options
	file_options.flags += { .Input_May_Be_Modified }

	return parse_from_slice(contents, file_options, filename, error_handler, allocator)
}
// Procedure group: `parse` resolves to `parse_from_file` or `parse_from_slice` depending on the arguments passed.
parse :: proc { parse_from_file, parse_from_slice }
/*
	Releases everything this file attaches to a `Document`, then the document itself.
	Passing `nil` is a no-op.
*/
destroy :: proc(doc: ^Document) {
	if doc == nil {
		return
	}

	// Per-element containers first, then the element array that holds them.
	for i in 0..<len(doc.elements) {
		delete(doc.elements[i].attribs)
		delete(doc.elements[i].children)
	}
	delete(doc.elements)

	delete(doc.prolog)
	delete(doc.comments)
	delete(doc.input)

	// Strings the parser registered for freeing (e.g. decoded tag bodies).
	for i in 0..<len(doc.strings_to_free) {
		delete(doc.strings_to_free[i])
	}
	delete(doc.strings_to_free)

	free(doc)
}
/*
Helpers.
*/
/*
	Checks `options` for mutually exclusive flags.

	Returns the options unchanged together with `.Conflicting_Options` when both
	`.Error_on_Unsupported` and `.Ignore_Unsupported` are set, or `.None` otherwise.
*/
validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
	wants_error  := .Error_on_Unsupported in options.flags
	wants_ignore := .Ignore_Unsupported   in options.flags

	if wants_error && wants_ignore {
		return options, .Conflicting_Options
	}
	return options, .None
}
/*
	Scans the next token and requires it to be of `kind`.

	Returns the scanned token in both cases; on a mismatch the error handler is
	invoked and `err` is `.Unexpected_Token`.
*/
expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
	tok = scan(t)
	if tok.kind != kind {
		error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
		return tok, .Unexpected_Token
	}
	return tok, .None
}
/*
	Parses a single `key="value"` attribute from the document's tokenizer.

	Returns:
		attr   - the key and value text of the attribute.
		offset - tokenizer offset minus the key length, computed right after the key is scanned;
		         callers use it when reporting errors about this attribute.
		err    - `.None`, or the error of the first `expect` that failed.
*/
parse_attribute :: proc(doc: ^Document) -> (attr: Attr, offset: int, err: Error) {
	assert(doc != nil)
	context.allocator = doc.allocator
	tokenizer := doc.tokenizer

	name_tok := expect(tokenizer, .Ident) or_return
	offset = tokenizer.offset - len(name_tok.text)

	_ = expect(tokenizer, .Eq) or_return
	value_tok := expect(tokenizer, .String) or_return

	attr.key = name_tok.text
	attr.val = value_tok.text
	return attr, offset, .None
}
/*
	Reports `.Duplicate_Attribute` (and calls the error handler with `offset`)
	when `attr.key` already occurs in `attribs`; returns `.None` otherwise.
*/
check_duplicate_attributes :: proc(t: ^Tokenizer, attribs: Attributes, attr: Attr, offset: int) -> (err: Error) {
	for existing in attribs {
		if existing.key != attr.key {
			continue
		}
		error(t, offset, "Duplicate attribute: %v\n", attr.key)
		return .Duplicate_Attribute
	}
	return .None
}
/*
	Parses attributes for as long as the next token is an identifier and appends
	each one to `attribs`. A key that is already present stops parsing with the
	error from `check_duplicate_attributes`. Trailing whitespace is skipped on success.
*/
parse_attributes :: proc(doc: ^Document, attribs: ^Attributes) -> (err: Error) {
	assert(doc != nil)
	context.allocator = doc.allocator
	tokenizer := doc.tokenizer

	for {
		if peek(tokenizer).kind != .Ident {
			break
		}
		parsed, attr_offset := parse_attribute(doc) or_return
		check_duplicate_attributes(tokenizer, attribs^, parsed, attr_offset) or_return
		append(attribs, parsed)
	}

	skip_whitespace(tokenizer)
	return .None
}
/*
	Parses the attributes of an `<?xml ... ?>` prolog into `doc.prolog` and
	consumes the closing `?>`.

	- `version` values other than "1.0" and "1.1" only produce a warning through the error handler.
	- `encoding` is compared case-insensitively; UTF-8 and Latin-1 spellings set `doc.encoding`,
	  anything else produces a warning and leaves `doc.encoding` as it was.
	- Other attributes are ignored.
*/
parse_prolog :: proc(doc: ^Document) -> (err: Error) {
	assert(doc != nil)
	context.allocator = doc.allocator
	tokenizer := doc.tokenizer

	// Offset before the attributes; used as the position for the warnings below.
	start := tokenizer.offset

	parse_attributes(doc, &doc.prolog) or_return

	for attr in doc.prolog {
		if attr.key == "version" {
			if attr.val != "1.0" && attr.val != "1.1" {
				error(tokenizer, start, "[parse_prolog] Warning: Unhandled XML version: %v\n", attr.val)
			}
		} else if attr.key == "encoding" {
			switch strings.to_lower(attr.val, context.temp_allocator) {
			case "utf-8", "utf8":
				doc.encoding = .UTF_8

			case "latin-1", "latin1", "iso-8859-1":
				doc.encoding = .LATIN_1

			case:
				// Unrecognized encoding: warn only.
				error(tokenizer, start, "[parse_prolog] Warning: Unrecognized encoding: %v\n", attr.val)
			}
		}
	}

	_ = expect(tokenizer, .Question) or_return
	_ = expect(tokenizer, .Gt)       or_return

	return .None
}
/*
	Skips tokens until the `>` that balances an already consumed `<`.
	Every further `<` token raises the nesting depth and every `>` token lowers it.
	Reaching EOF first reports and returns `.Premature_EOF`.
*/
skip_element :: proc(t: ^Tokenizer) -> (err: Error) {
	// The caller has already consumed the opening `<`.
	depth := 1

	for depth > 0 {
		#partial switch scan(t).kind {
		case .EOF:
			error(t, t.offset, "[skip_element] Premature EOF\n")
			return .Premature_EOF

		case .Lt:
			depth += 1

		case .Gt:
			depth -= 1
		}
	}
	return .None
}
/*
	Parses the remainder of a `<!DOCTYPE` declaration, e.g.

		<!DOCTYPE greeting SYSTEM "hello.dtd">

		<!DOCTYPE greeting [
			<!ELEMENT greeting (#PCDATA)>
		]>

	The identifier is stored in `doc.doctype.ident`; everything between it (after
	whitespace) and the closing `>` is stored verbatim in `doc.doctype.rest`.
*/
parse_doctype :: proc(doc: ^Document) -> (err: Error) {
	assert(doc != nil)
	context.allocator = doc.allocator
	tokenizer := doc.tokenizer

	name := expect(tokenizer, .Ident) or_return
	doc.doctype.ident = name.text

	skip_whitespace(tokenizer)
	rest_start := tokenizer.offset
	skip_element(tokenizer) or_return

	// After `skip_element` the offset sits just past the closing `>`, so the rest ends one before it.
	rest_end := tokenizer.offset - 1
	doc.doctype.rest = string(tokenizer.src[rest_start:rest_end])

	return .None
}
// Index of an element within `Document.elements`; `new_element` hands these out sequentially.
Element_ID :: u32
/*
	Reserves the next slot in `doc.elements` and returns its id.

	When the array is full it is grown first: doubled while it holds fewer than
	65536 entries, extended by 65536 entries after that. `doc.element_count` is
	incremented; the returned id is its previous value.
*/
new_element :: proc(doc: ^Document) -> (id: Element_ID) {
	element_space := len(doc.elements)

	// Need to resize
	if int(doc.element_count) + 1 > element_space {
		if element_space < 65536 {
			// Doubling an empty array would leave it empty and the returned id
			// would index past the end, so always grow to at least one slot.
			element_space = max(element_space * 2, 1)
		} else {
			element_space += 65536
		}
		resize(&doc.elements, element_space)
	}

	cur := doc.element_count
	doc.element_count += 1
	return cur
}
@@ -0,0 +1,287 @@
package xml_example
import "core:encoding/xml"
import "core:os"
import "core:path"
import "core:mem"
import "core:strings"
import "core:strconv"
import "core:slice"
import "core:fmt"
/*
Silent error handler for the parser.
The body is empty, so every diagnostic passed to it is discarded.
*/
Error_Handler :: proc(pos: xml.Pos, fmt: string, args: ..any) {}
// Parser options for `unicode.xml`: sets `.Ignore_Unsupported` and expects the DOCTYPE `unicode`.
OPTIONS :: xml.Options{ flags = { .Ignore_Unsupported, }, expected_doctype = "unicode", }
// One named entity collected by `generate_encoding_entity_table`.
Entity :: struct {
// Entity name, taken from the `id` attribute of an `<entity>` child.
name: string,
// Code point, converted from the `dec` attribute of the enclosing `<character>`.
codepoint: rune,
// Value of the `<description>` child of the `<character>`, or "" when there is none.
description: string,
}
/*
	Generates `core/encoding/entity/generated.odin` from `tests/core/assets/XML/unicode.xml`.

	Steps:
	- Parse the XML file; exit with status 1 if it cannot be loaded or parsed.
	- Walk every `<character>` below `<charlist>` and collect each `<entity id="...">`
	  name together with the character's `dec` code point and `<description>`.
	  Only the first occurrence of a name is kept.
	- Sort the names and emit a `named_xml_entity_to_rune` procedure that switches on
	  the first byte of the name and then on the full name.
	- Print the generated source and write it to the output file.
*/
generate_encoding_entity_table :: proc() {
	using fmt

	filename := path.join(ODIN_ROOT, "tests", "core", "assets", "XML", "unicode.xml")
	defer delete(filename)

	generated_filename := path.join(ODIN_ROOT, "core", "encoding", "entity", "generated.odin")
	defer delete(generated_filename)

	doc, err := xml.parse(filename, OPTIONS, Error_Handler)
	defer xml.destroy(doc)

	if err != .None {
		printf("Load/Parse error: %v\n", err)
		if err == .File_Error {
			printf("\"%v\" not found. Did you run \"tests\\download_assets.py\"?", filename)
		}
		os.exit(1)
	}

	printf("\"%v\" loaded and parsed.\n", filename)

	generated_buf: strings.Builder
	defer strings.destroy_builder(&generated_buf)
	w := strings.to_writer(&generated_buf)

	charlist, charlist_ok := xml.find_child_by_ident(doc.root, "charlist")
	if !charlist_ok {
		eprintln("Could not locate top-level `<charlist>` tag.")
		os.exit(1)
	}

	printf("Found `<charlist>` with %v children.\n", len(charlist.children))

	entity_map: map[string]Entity
	names: [dynamic]string

	// Start from the extremes so the first name seen sets both bounds.
	min_name_length := max(int)
	max_name_length := min(int)
	shortest_name: string
	longest_name: string

	count := 0
	for char in charlist.children {
		if char.ident != "character" {
			eprintf("Expected `<character>`, got `<%v>`\n", char.ident)
			os.exit(1)
		}

		if codepoint_string, ok := xml.find_attribute_val_by_key(char, "dec"); !ok {
			// The attribute looked up above is `dec`, so that is what the message names.
			eprintln("`<character dec=\"...\">` attribute not found.")
			os.exit(1)
		} else {
			codepoint := strconv.atoi(codepoint_string)

			desc, desc_ok := xml.find_child_by_ident(char, "description")
			description := desc.value if desc_ok else ""

			/*
				For us to be interested in this codepoint, it has to have at least one entity.
			*/
			nth := 0
			for {
				character_entity, entity_ok := xml.find_child_by_ident(char, "entity", nth)
				if !entity_ok { break }
				nth += 1

				if name, name_ok := xml.find_attribute_val_by_key(character_entity, "id"); name_ok {
					if len(name) == 0 {
						/*
							Invalid name. Skip.
						*/
						continue
					}

					if name == "\"\"" {
						printf("%#v\n", char)
						printf("%#v\n", character_entity)
					}

					if len(name) > max_name_length { longest_name  = name }
					if len(name) < min_name_length { shortest_name = name }

					min_name_length = min(min_name_length, len(name))
					max_name_length = max(max_name_length, len(name))

					e := Entity{
						name        = name,
						codepoint   = rune(codepoint),
						description = description,
					}

					// Keep only the first mapping for a given name.
					if _, seen := entity_map[name]; seen {
						continue
					}

					entity_map[name] = e
					append(&names, name)
					count += 1
				}
			}
		}
	}

	/*
		Sort by name.
	*/
	slice.sort(names[:])

	printf("Found %v unique `&name;` -> rune mappings.\n", count)
	printf("Shortest name: %v (%v)\n", shortest_name, min_name_length)
	printf("Longest name:  %v (%v)\n", longest_name,  max_name_length)

	// println(rune_to_string(1234))

	/*
		Generate table.
	*/
	wprintln(w, "package unicode_entity")
	wprintln(w, "")
	wprintln(w, GENERATED)
	wprintln(w, "")
	wprintf (w, TABLE_FILE_PROLOG)
	wprintln(w, "")

	wprintf (w, "// `&%v;`\n", shortest_name)
	wprintf (w, "XML_NAME_TO_RUNE_MIN_LENGTH :: %v\n", min_name_length)
	wprintf (w, "// `&%v;`\n", longest_name)
	wprintf (w, "XML_NAME_TO_RUNE_MAX_LENGTH :: %v\n", max_name_length)
	wprintln(w, "")

	// NOTE(review): the emitted guard below uses `min(1, XML_NAME_TO_RUNE_MIN_LENGTH)`, which is
	// never larger than 1; `max` may have been intended. Confirm before changing the generated output.
	wprintln(w,
`
/*
Input:
entity_name - a string, like "copy" that describes a user-encoded Unicode entity as used in XML.
Output:
"decoded" - The decoded rune if found by name, or -1 otherwise.
"ok" - true if found, false if not.
IMPORTANT: XML processors (including browsers) treat these names as case-sensitive. So do we.
*/
named_xml_entity_to_rune :: proc(name: string) -> (decoded: rune, ok: bool) {
/*
Early out if the name is too short or too long.
min as a precaution in case the generated table has a bogus value.
*/
if len(name) < min(1, XML_NAME_TO_RUNE_MIN_LENGTH) || len(name) > XML_NAME_TO_RUNE_MAX_LENGTH {
return -1, false
}
switch rune(name[0]) {
`)

	// `names` is sorted, so entries sharing a first byte are adjacent: open a new
	// outer `case` whenever that byte changes and close the previous inner switch.
	prefix := '?'
	should_close := false

	for v in names {
		if rune(v[0]) != prefix {
			if should_close {
				wprintln(w, "\t\t}\n")
			}

			prefix = rune(v[0])
			wprintf (w, "\tcase '%v':\n", prefix)
			wprintln(w, "\t\tswitch name {")
		}

		e := entity_map[v]

		wprintf(w, "\t\t\tcase \"%v\": \n",    e.name)
		wprintf(w, "\t\t\t\t// %v\n",        e.description)
		wprintf(w, "\t\t\t\treturn %v, true\n", rune_to_string(e.codepoint))

		should_close = true
	}
	wprintln(w, "\t\t}")
	wprintln(w, "\t}")
	wprintln(w, "\treturn -1, false")
	wprintln(w, "}\n")
	wprintln(w, GENERATED)

	println()
	println(strings.to_string(generated_buf))
	println()

	written := os.write_entire_file(generated_filename, transmute([]byte)strings.to_string(generated_buf))

	if written {
		fmt.printf("Successfully written generated \"%v\".", generated_filename)
	} else {
		fmt.printf("Failed to write generated \"%v\".", generated_filename)
	}

	// Only the containers are released here. The strings stored in `names` come from
	// `xml.find_attribute_val_by_key` and are never allocated individually by this
	// procedure, so they must not be freed one by one (and never after `delete(names)`).
	delete(entity_map)
	delete(names)
}
// Banner comment written near the top and at the very end of the generated file.
GENERATED :: `/*
------ GENERATED ------ DO NOT EDIT ------ GENERATED ------ DO NOT EDIT ------ GENERATED ------
*/`
// Header comment written into the generated file: provenance of the data, update steps and the W3C license notice.
// It is passed to `wprintf` as the format string, so it must not contain formatting verbs.
TABLE_FILE_PROLOG :: `/*
This file is generated from "https://www.w3.org/2003/entities/2007xml/unicode.xml".
UPDATE:
- Ensure the XML file was downloaded using "tests\core\download_assets.py".
- Run "core/unicode/tools/generate_entity_table.odin"
Odin unicode generated tables: https://github.com/odin-lang/Odin/tree/master/core/encoding/entity
Copyright © 2021 World Wide Web Consortium, (Massachusetts Institute of Technology,
European Research Consortium for Informatics and Mathematics, Keio University, Beihang).
All Rights Reserved.
This work is distributed under the W3C® Software License [1] in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
[1] http://www.w3.org/Consortium/Legal/copyright-software
See also: LICENSE_table.md
*/
`
/*
	Formats `r` as an Odin expression of the form `rune(0x..)`.

	The value is printed as 8 hex digits, after which leading "00" pairs are
	dropped as long as more than two digits remain, e.g. 0x41 -> `rune(0x41)`
	and 0x100 -> `rune(0x0100)`. The result lives in the temporary allocator.
*/
rune_to_string :: proc(r: rune) -> (res: string) {
	hex := fmt.tprintf("%08x", int(r))

	// Advance past leading zero bytes two digits at a time, always keeping at least two digits.
	start := 0
	for len(hex) - start > 2 && hex[start] == '0' && hex[start + 1] == '0' {
		start += 2
	}

	return fmt.tprintf("rune(0x%v)", hex[start:])
}
/*
	Reports whether `name` contains a '.' character.
*/
is_dotted_name :: proc(name: string) -> (dotted: bool) {
	// '.' is a single-byte character, so a byte search finds exactly the same matches as a rune scan.
	return strings.index_byte(name, '.') >= 0
}
/*
	Runs the generator under a tracking allocator and lists every allocation
	that is still live afterwards.
*/
main :: proc() {
	track: mem.Tracking_Allocator
	mem.tracking_allocator_init(&track, context.allocator)
	context.allocator = mem.tracking_allocator(&track)

	generate_encoding_entity_table()

	if len(track.allocation_map) > 0 {
		fmt.println()
		for _, entry in track.allocation_map {
			fmt.printf("%v Leaked %v bytes.\n", entry.location, entry.size)
		}
	}
	fmt.println("Done and cleaned up!")
}
+2
View File
@@ -0,0 +1,2 @@
# This file will be downloaded by download_assets.py
unicode.xml
+29
View File
@@ -0,0 +1,29 @@
<html>
<head>
<title>Entity Reference Test</title>
<style>
body {
background: #000; color: #eee;
width: 40%;
margin-left: auto;
margin-right: auto;
font-size: 14pt;
}
</style>
</head>
<body>
<h1>Entity Reference Test</h1>
<div id="test_cdata_in_comment" foo="">
Foozle]!&#32;&copy;&#x20;<!-- <![CDATA[&#32;&reg;&#x20;]]> -->42&;1234&
</div>
<!-- foo attribute should be empty but present -->
<!-- EXPECTED: Foozle]! © 42&;1234& -->
<div id="test_cdata_unwrap_and_passthrough">
Foozle]!&#32;&copy;&#x20;<![CDATA[BOX&#32;&reg;&#x20;/BOX]]>42&;1234&
</div>
<!-- EXPECTED: Foozle]! © BOX ® /BOX42&;1234& -->
<div>
&verbar; &vert; &VerticalLine; &fjlig; &grave; &bsol; &reg; &rhov; &CounterClockwiseContourIntegral; &bsemi;
</div>
</body>
</html>
+35
View File
@@ -0,0 +1,35 @@
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE TS>
<TS version="2.1" language="nl" sourcelanguage="en">
<context>
<name>Page</name>
<message>
<source>Text for translation</source>
<comment>commenting</comment>
<translation type="obsolete">Tekst om te vertalen</translation>
</message>
<message>
<source>Also text to translate</source>
<extracomment>some text</extracomment>
<translation>Ook tekst om te vertalen</translation>
</message>
</context>
<context>
<name>installscript</name>
<message>
<source>99 bottles of beer on the wall</source>
<oldcomment>some new comments here</oldcomment>
<translation>99 flessen bier op de muur</translation>
</message>
</context>
<context>
<name>apple_count</name>
<message numerus="yes">
<source>%d apple(s)</source>
<translation>
<numerusform>%d appel</numerusform>
<numerusform>%d appels</numerusform>
</translation>
</message>
</context>
</TS>
@@ -0,0 +1,38 @@
<?xml version="1.0" encoding="UTF-8"?>
<xliff version="1.2" xmlns="urn:oasis:names:tc:xliff:document:1.2">
<file id="42" original="Foozle.xml" source-language="en" target-language="nl-NL" datatype="plaintext">
<body>
<trans-unit id="874396" maxwidth="20" size-unit="char">
<source>text</source>
<target state="translated">tekst</target>
<note>Context</note>
</trans-unit>
<trans-unit id="874397" approved="yes">
<source>text 1</source>
<target state="translated">tekst 1</target>
<note>Context 1</note>
</trans-unit>
<trans-unit id="874398">
<source>text 2</source>
<target state="needs-translation"/>
<context context-type="context">Context of the segment 2</context>
</trans-unit>
<trans-unit id="874399" translate="no">
<source>text 3</source>
<target state="final">translation 3</target>
<note>Context 3</note>
</trans-unit>
<group restype="x-gettext-plurals">
<note>Plurals</note>
<trans-unit id="14343743[0]">
<source>%d month</source>
<target xml:lang="nl" state="translated">%d maand</target>
</trans-unit>
<trans-unit id="14343743[1]">
<source>%d months</source>
<target xml:lang="nl" state="translated">%d maanden</target>
</trans-unit>
</group>
</body>
</file>
</xliff>
@@ -0,0 +1,52 @@
<?xml version="1.0" encoding="utf-8"?>
<xliff xmlns="urn:oasis:names:tc:xliff:document:2.0" version="2.0" srcLang="en" trgLang="nl">
<file id="f1">
<notes>
<note id="n1">Note for file</note>
</notes>
<unit id="u1">
<notes>
<note id="n1">Note for unit</note>
</notes>
<segment id="s1" state="initial">
<source>text</source>
<target></target>
</segment>
</unit>
<unit id="u2">
<notes>
<note id="n2">Note for unit 2</note>
</notes>
<segment id="s2" state="translated">
<source>text 2</source>
<target>translation 2</target>
</segment>
</unit>
<unit id="u3">
<notes>
<note id="n3">Note for unit 3</note>
</notes>
<segment id="s3" state="final">
<source>text 3</source>
<target>approved translation 3</target>
</segment>
</unit>
<group id="90290" type="x-gettext:plurals">
<unit id="90291" name="90290[0]">
<notes>
<note category="context">Plurals</note>
</notes>
<segment>
<source>%d month</source>
<target xml:lang="nl">%d maand</target>
</segment>
</unit>
<unit id="90292" name="90290[1]">
<segment>
<source>%d months</source>
<target xml:lang="nl">%d maanden</target>
</segment>
</unit>
</group>
</file>
</xliff>
+8
View File
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE 恥ずべきフクロウ>
<恥ずべきフクロウ 올빼미_id="Foozle&#32;<![CDATA[<greeting>Hello, world!"</greeting>]]>Barzle">
<부끄러운:barzle>
<name foo:bar="birmese">ရှက်စရာ ဇီးကွက်</name>
<nickname>Owl of Shame</nickname>
<data>More CDATA <![CDATA[<greeting>Hello, world!</greeting><![CDATA] <$]]> Nonsense.</data>
</부끄러운:barzle>
+9 -7
View File
@@ -1,5 +1,6 @@
@echo off
set COMMON=-show-timings -no-bounds-check -vet -strict-style -collection:tests=..
set COMMON=-no-bounds-check -vet -strict-style
set COLLECTION=-collection:tests=..
set PATH_TO_ODIN==..\..\odin
python3 download_assets.py
echo ---
@@ -35,9 +36,10 @@ echo ---
echo ---
echo Running core:encoding tests
echo ---
%PATH_TO_ODIN% run encoding/hxa %COMMON% -out:test_hxa.exe
%PATH_TO_ODIN% run encoding/json %COMMON% -out:test_json.exe
%PATH_TO_ODIN% run encoding/hxa %COMMON% %COLLECTION% -out:test_hxa.exe
%PATH_TO_ODIN% run encoding/json %COMMON% -out:test_json.exe
%PATH_TO_ODIN% run encoding/varint %COMMON% -out:test_varint.exe
%PATH_TO_ODIN% run encoding/xml %COMMON% -out:test_xml.exe
echo ---
echo Running core:math/noise tests
@@ -47,19 +49,19 @@ echo ---
echo ---
echo Running core:math tests
echo ---
%PATH_TO_ODIN% run math %COMMON% -out:test_core_math.exe
%PATH_TO_ODIN% run math %COMMON% %COLLECTION% -out:test_core_math.exe
echo ---
echo Running core:math/linalg/glsl tests
echo ---
%PATH_TO_ODIN% run math/linalg/glsl %COMMON% -out:test_linalg_glsl.exe
%PATH_TO_ODIN% run math/linalg/glsl %COMMON% %COLLECTION% -out:test_linalg_glsl.exe
echo ---
echo Running core:path/filepath tests
echo ---
%PATH_TO_ODIN% run path/filepath %COMMON% -out:test_core_filepath.exe
%PATH_TO_ODIN% run path/filepath %COMMON% %COLLECTION% -out:test_core_filepath.exe
echo ---
echo Running core:reflect tests
echo ---
%PATH_TO_ODIN% run reflect %COMMON% -out:test_core_reflect.exe
%PATH_TO_ODIN% run reflect %COMMON% %COLLECTION% -out:test_core_reflect.exe
+353
View File
@@ -0,0 +1,353 @@
package test_core_xml
import "core:encoding/xml"
import "core:testing"
import "core:mem"
import "core:strings"
import "core:io"
import "core:fmt"
import "core:hash"
// Error handler with an empty body: parser diagnostics are discarded during the tests.
Silent :: proc(pos: xml.Pos, format: string, args: ..any) {}
// Baseline parser options: `.Ignore_Unsupported` and `.Intern_Comments`, with no expected DOCTYPE.
OPTIONS :: xml.Options{ flags = { .Ignore_Unsupported, .Intern_Comments, },
expected_doctype = "",
}
// Counters maintained by the fallback `expect` (used when `ODIN_TEST` is false) and reported by `main`.
// Number of expectations evaluated.
TEST_count := 0
// Number of expectations whose condition was false.
TEST_fail := 0
// One test case: a file to parse, the options to parse it with, and the expected outcome.
TEST :: struct {
// File name below `TEST_FILE_PATH_PREFIX`.
filename: string,
// Options passed to `xml.parse`.
options: xml.Options,
// Error `xml.parse` is expected to return (zero value when omitted).
err: xml.Error,
// Expected CRC-32 of the `doc_to_string` output for the parsed document.
crc32: u32,
}
/*
Directory containing the XML test assets, relative to ODIN_ROOT.
`test_file_path` inserts it between ODIN_ROOT and the file name.
*/
TEST_FILE_PATH_PREFIX :: "tests/core/assets/XML"
/*
Table of test cases executed by `run_tests`.
Each entry is parsed with its own options; the returned error and the CRC-32 of the
printed document tree are compared against `err` and `crc32`.
*/
TESTS :: []TEST{
/*
First we test that certain files parse without error.
*/
{
/*
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE 恥ずべきフクロウ>
<恥ずべきフクロウ 올빼미_id="Foozle&#32;<![CDATA[<greeting>Hello, world!"</greeting>]]>Barzle">
<부끄러운:barzle>
<name foo:bar="birmese">ရှက်စရာ ဇီးကွက်</name>
<nickname>Owl of Shame</nickname>
<data>More CDATA <![CDATA[<greeting>Hello, world!</greeting><![CDATA] <$]]> Nonsense.</data>
</부끄러운:barzle>
*/
/*
Tests UTF-8 idents and values.
Test namespaced ident.
Tests that nested partial CDATA start doesn't trip up parser.
*/
filename = "utf8.xml",
options = {
flags = {
.Ignore_Unsupported, .Intern_Comments,
},
expected_doctype = "恥ずべきフクロウ",
},
crc32 = 0x30d82264,
},
{
/*
Same as above.
Unbox CDATA in data tag.
*/
filename = "utf8.xml",
options = {
flags = {
.Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA,
},
expected_doctype = "恥ずべきフクロウ",
},
crc32 = 0xad31d8e8,
},
{
/*
Simple Qt TS translation file.
`core:i18n` requires it to be parsed properly.
*/
filename = "nl_NL-qt-ts.ts",
options = {
flags = {
.Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA, .Decode_SGML_Entities,
},
expected_doctype = "TS",
},
crc32 = 0x7bce2630,
},
{
/*
Simple XLiff 1.2 file.
`core:i18n` requires it to be parsed properly.
*/
filename = "nl_NL-xliff-1.2.xliff",
options = {
flags = {
.Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA, .Decode_SGML_Entities,
},
expected_doctype = "xliff",
},
crc32 = 0x43f19d61,
},
{
/*
Simple XLiff 2.0 file.
`core:i18n` requires it to be parsed properly.
*/
filename = "nl_NL-xliff-2.0.xliff",
options = {
flags = {
.Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA, .Decode_SGML_Entities,
},
expected_doctype = "xliff",
},
crc32 = 0x961e7635,
},
/*
The same HTML file parsed three times with progressively more decoding enabled.
*/
{
filename = "entities.html",
options = {
flags = {
.Ignore_Unsupported, .Intern_Comments,
},
expected_doctype = "html",
},
crc32 = 0x573c1033,
},
{
filename = "entities.html",
options = {
flags = {
.Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA,
},
expected_doctype = "html",
},
crc32 = 0x82588917,
},
{
filename = "entities.html",
options = {
flags = {
.Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA, .Decode_SGML_Entities,
},
expected_doctype = "html",
},
crc32 = 0x5e74d8a6,
},
/*
Then we test that certain errors are returned as expected.
*/
{
filename = "utf8.xml",
options = {
flags = {
.Ignore_Unsupported, .Intern_Comments,
},
expected_doctype = "Odin",
},
err = .Invalid_DocType,
crc32 = 0x49b83d0a,
},
/*
Parse the 8.2 MiB unicode.xml for good measure.
*/
{
filename = "unicode.xml",
options = {
flags = {
.Ignore_Unsupported,
},
expected_doctype = "",
},
err = .None,
crc32 = 0xcaa042b9,
},
}
/*
	Under `odin test` the `core:testing` procedures are used directly.
	Otherwise minimal stand-ins are defined: `expect` counts every check in
	`TEST_count`, counts and prints failures, and `log` prints its value.
*/
when ODIN_TEST {
	expect :: testing.expect
	log    :: testing.log
} else {
	expect :: proc(t: ^testing.T, condition: bool, message: string, loc := #caller_location) {
		TEST_count += 1
		if condition {
			return
		}
		TEST_fail += 1
		fmt.printf("[%v] %v\n", loc, message)
	}

	log :: proc(t: ^testing.T, v: any, loc := #caller_location) {
		fmt.printf("[%v] LOG:\n\t%v\n", loc, v)
	}
}
/*
	Builds the path of a test asset as ODIN_ROOT + TEST_FILE_PATH_PREFIX + "/" + filename,
	with every backslash replaced by a forward slash.
	The string is produced by `fmt.tprintf`, i.e. it lives in the temporary allocator.
*/
test_file_path :: proc(filename: string) -> (path: string) {
	path = fmt.tprintf("%v%v/%v", ODIN_ROOT, TEST_FILE_PATH_PREFIX, filename)

	// Rewrite the freshly formatted string in place through a byte view of it.
	bytes := transmute([]u8)path
	for b, i in bytes {
		if b == '\\' {
			bytes[i] = '/'
		}
	}
	return path
}
/*
Renders `doc` as text and returns a clone of that text, which the caller must delete.
`run_tests` hashes this output, so any change to the formatting here changes every expected CRC.
*/
doc_to_string :: proc(doc: ^xml.Document) -> (result: string) {
/*
Effectively a clone of the debug printer in the xml package.
We duplicate it here so that the way it prints an XML document to a string is stable.
This way we can hash the output. If it changes, it means that the document or how it was parsed changed,
not how it was printed. One less source of variability.
*/
// Writes the prolog, encoding, DOCTYPE and pre-root comments, then the element tree starting at element 0.
// A nil document produces no output at all.
print :: proc(writer: io.Writer, doc: ^xml.Document) -> (written: int, err: io.Error) {
if doc == nil { return }
using fmt
written += wprintf(writer, "[XML Prolog]\n")
for attr in doc.prolog {
written += wprintf(writer, "\t%v: %v\n", attr.key, attr.val)
}
written += wprintf(writer, "[Encoding] %v\n", doc.encoding)
if len(doc.doctype.ident) > 0 {
written += wprintf(writer, "[DOCTYPE] %v\n", doc.doctype.ident)
if len(doc.doctype.rest) > 0 {
// This write and the ones below it are not added to `written`.
wprintf(writer, "\t%v\n", doc.doctype.rest)
}
}
for comment in doc.comments {
written += wprintf(writer, "[Pre-root comment] %v\n", comment)
}
if doc.element_count > 0 {
wprintln(writer, " --- ")
print_element(writer, doc, 0)
wprintln(writer, " --- ")
}
return written, .None
}
// Prints one element and, recursively, its children, one indentation level deeper each.
// `written` is never updated in here, so the returned count is always 0.
print_element :: proc(writer: io.Writer, doc: ^xml.Document, element_id: xml.Element_ID, indent := 0) -> (written: int, err: io.Error) {
using fmt
// Writes `indent + 1` tab characters: the range `0..=indent` is inclusive.
tab :: proc(writer: io.Writer, indent: int) {
for _ in 0..=indent {
wprintf(writer, "\t")
}
}
tab(writer, indent)
element := doc.elements[element_id]
if element.kind == .Element {
wprintf(writer, "<%v>\n", element.ident)
if len(element.value) > 0 {
tab(writer, indent + 1)
wprintf(writer, "[Value] %v\n", element.value)
}
for attr in element.attribs {
tab(writer, indent + 1)
wprintf(writer, "[Attr] %v: %v\n", attr.key, attr.val)
}
for child in element.children {
print_element(writer, doc, child, indent + 1)
}
} else if element.kind == .Comment {
wprintf(writer, "[COMMENT] %v\n", element.value)
}
return written, .None
}
buf: strings.Builder
defer strings.destroy_builder(&buf)
print(strings.to_writer(&buf), doc)
// Clone before the deferred destroy_builder releases the builder's buffer.
return strings.clone(strings.to_string(buf))
}
/*
	Parses every entry of `TESTS` and checks two things per entry:
	the error returned by `xml.parse`, and the CRC-32 of the printed document tree.
	When either check fails, up to the first 2048 bytes of the tree are printed.
*/
@test
run_tests :: proc(t: ^testing.T) {
	for test in TESTS {
		path := test_file_path(test.filename)
		log(t, fmt.tprintf("Trying to parse %v", path))

		doc, err := xml.parse(path, test.options, Silent)
		defer xml.destroy(doc)

		tree_string := doc_to_string(doc)
		tree_bytes  := transmute([]u8)tree_string
		defer delete(tree_bytes)

		crc32 := hash.crc32(tree_bytes)

		err_matches := err == test.err
		expect(t, err_matches, fmt.tprintf("Expected return value %v, got %v", test.err, err))

		crc_matches := crc32 == test.crc32
		expect(t, crc_matches, fmt.tprintf("Expected CRC 0x%08x, got 0x%08x, with options %v", test.crc32, crc32, test.options))

		if !(err_matches && crc_matches) {
			/*
				Don't fully print big trees.
			*/
			limit := min(2_048, len(tree_string))
			fmt.println(tree_string[:limit])
		}
	}
}
/*
	Entry point when the file is run directly instead of through `odin test`.
	Runs the tests under a tracking allocator, reports every allocation that is
	still live as a failed expectation, and prints the success count.
*/
main :: proc() {
	t := testing.T{}

	track: mem.Tracking_Allocator
	mem.tracking_allocator_init(&track, context.allocator)
	context.allocator = mem.tracking_allocator(&track)

	run_tests(&t)

	// Iterating an empty map is a no-op, so no separate length check is needed.
	for _, leak in track.allocation_map {
		expect(&t, false, fmt.tprintf("%v Leaked %v bytes.", leak.location, leak.size))
	}

	fmt.printf("\n%v/%v tests successful.\n", TEST_count - TEST_fail, TEST_count)
}