[xml] Implement optional unboxing of CDATA and decoding of tag values.

This commit is contained in:
Jeroen van Rijn
2021-12-02 21:07:40 +01:00
parent 2dd67dba89
commit 3d72e80ccf
4 changed files with 56 additions and 93 deletions
+24 -15
View File
@@ -60,16 +60,22 @@ COMMENT_END :: "-->"
Default: CDATA and comments are passed through unchanged.
*/
XML_Decode_Option :: enum u8 {
/*
Do not decode `&entity;` references; they are decoded by default.
If given, overrides `Decode_CDATA`.
*/
No_Entity_Decode,
/*
CDATA is unboxed.
*/
CDATA_Unbox,
Unbox_CDATA,
/*
Unboxed CDATA is decoded as well.
Ignored if `.CDATA_Unbox` is not given.
Ignored if `.Unbox_CDATA` is not given.
*/
CDATA_Decode,
Decode_CDATA,
/*
Comments are stripped.
@@ -129,7 +135,7 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
}
case:
if in_data && .CDATA_Decode not_in options {
if in_data && .Decode_CDATA not_in options {
/*
Unboxed, but undecoded.
*/
@@ -145,17 +151,20 @@ decode_xml :: proc(input: string, options := XML_Decode_Options{}, allocator :=
*/
write_string(&builder, entity)
} else {
if decoded, ok := xml_decode_entity(entity); ok {
write_rune(&builder, decoded)
} else {
/*
Decode failed. Pass through original.
*/
write_string(&builder, "&")
write_string(&builder, entity)
write_string(&builder, ";")
if .No_Entity_Decode not_in options {
if decoded, ok := xml_decode_entity(entity); ok {
write_rune(&builder, decoded)
continue
}
}
/*
Pass the entity through literally: either decoding failed or `.No_Entity_Decode` was given.
*/
write_string(&builder, "&")
write_string(&builder, entity)
write_string(&builder, ";")
}
} else {
write_rune(&builder, t.r)
@@ -290,7 +299,7 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
if string(t.src[t.offset:][:len(CDATA_START)]) == CDATA_START {
t.read_offset += len(CDATA_START) - 1
if .CDATA_Unbox in options && .CDATA_Decode in options {
if .Unbox_CDATA in options && .Decode_CDATA in options {
/*
We're unboxing _and_ decoding CDATA
*/
@@ -315,7 +324,7 @@ _handle_xml_special :: proc(t: ^Tokenizer, builder: ^strings.Builder, options: X
cdata := string(t.src[offset : t.read_offset])
if .CDATA_Unbox in options {
if .Unbox_CDATA in options {
cdata = cdata[len(CDATA_START):]
cdata = cdata[:len(cdata) - len(CDATA_END)]
}
@@ -1,19 +1,11 @@
package unicode_entity_example
import "core:encoding/xml"
import "core:encoding/entity"
import "core:strings"
import "core:mem"
import "core:fmt"
import "core:time"
OPTIONS :: xml.Options{
flags = {
.Ignore_Unsupported, .Intern_Comments,
},
expected_doctype = "",
}
doc_print :: proc(doc: ^xml.Document) {
buf: strings.Builder
defer strings.destroy_builder(&buf)
@@ -29,6 +21,13 @@ _entities :: proc() {
DOC :: #load("../../../../tests/core/assets/XML/unicode.xml")
OPTIONS :: xml.Options{
flags = {
.Ignore_Unsupported, .Intern_Comments,
},
expected_doctype = "",
}
parse_duration: time.Duration
{
@@ -50,57 +49,11 @@ _entities :: proc() {
_main :: proc() {
using fmt
doc, err := xml.parse(#load("test.html"))
options := xml.Options{ flags = { .Ignore_Unsupported, .Intern_Comments, .Unbox_CDATA, .Decode_SGML_Entities }}
doc, _ := xml.parse(#load("test.html"), options)
defer xml.destroy(doc)
doc_print(doc)
if false {
val := doc.root.children[1].children[2].value
println()
replaced, ok := entity.decode_xml(val)
defer delete(replaced)
printf("Before: '%v', Err: %v\n", val, err)
printf("Passthrough: '%v'\nOK: %v\n", replaced, ok)
println()
}
if false {
val := doc.root.children[1].children[2].value
println()
replaced, ok := entity.decode_xml(val, { .CDATA_Unbox })
defer delete(replaced)
printf("Before: '%v', Err: %v\n", val, err)
printf("CDATA_Unbox: '%v'\nOK: %v\n", replaced, ok)
println()
}
if true {
val := doc.root.children[1].children[2].value
println()
replaced, ok := entity.decode_xml(val, { .CDATA_Unbox, .CDATA_Decode })
defer delete(replaced)
printf("Before: '%v', Err: %v\n", val, err)
printf("CDATA_Decode: '%v'\nOK: %v\n", replaced, ok)
println()
}
if true {
val := doc.root.children[1].children[1].value
println()
replaced, ok := entity.decode_xml(val, { .Comment_Strip })
defer delete(replaced)
printf("Before: '%v', Err: %v\n", val, err)
printf("Comment_Strip: '%v'\nOK: %v\n", replaced, ok)
println()
}
}
main :: proc() {
+2
View File
@@ -16,9 +16,11 @@
<div id="test_cdata_in_comment" foo="">
Foozle]!&#32;&copy;&#x20;<!-- <![CDATA[&#32;&reg;&#x20;]]> -->42&;1234&
</div>
<!-- EXPECTED: Foozle]! © 42&;1234& -->
<div id="test_cdata_unwrap_and_passthrough">
Foozle]!&#32;&copy;&#x20;<![CDATA[BOX&#32;&reg;&#x20;/BOX]]>42&;1234&
</div>
<!-- EXPECTED: Foozle]! © BOX ® /BOX42&;1234& -->
<div>
&verbar; &vert; &VerticalLine; &fjlig; &grave; &bsol; &reg; &rhov; &CounterClockwiseContourIntegral;
</div>