Update XML reader to normalize whitespace, part 1.

2026-07-29 02:40:05 +00:00 · 2024-06-12 12:52:48 +02:00
parent e87c5bca58
commit ebadff555d
4 changed files with 70 additions and 116 deletions
@@ -218,9 +218,7 @@ scan_identifier :: proc(t: ^Tokenizer) -> string {
 	for is_valid_identifier_rune(t.ch) {
 		advance_rune(t)
 		if t.ch == ':' {
-			/*
-				A namespaced attr can have at most two parts, `namespace:ident`.
-			*/
+			// A namespaced attr can have at most two parts, `namespace:ident`.
 			if namespaced {
 				break	
 			}
@@ -268,14 +266,10 @@ scan_comment :: proc(t: ^Tokenizer) -> (comment: string, err: Error) {
 	return string(t.src[offset : t.offset - 1]), .None
 }

-/*
-	Skip CDATA
-*/
+// Skip CDATA
 skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
 	if t.read_offset + len(CDATA_START) >= len(t.src) {
-		/*
-			Can't be the start of a CDATA tag.
-		*/
+		// Can't be the start of a CDATA tag.
 		return .None
 	}

@@ -290,9 +284,7 @@ skip_cdata :: proc(t: ^Tokenizer) -> (err: Error) {
 				return .Premature_EOF
 			}

-			/*
-				Scan until the end of a CDATA tag.
-			*/
+			// Scan until the end of a CDATA tag.
 			if t.read_offset + len(CDATA_END) < len(t.src) {
 				if string(t.src[t.offset:][:len(CDATA_END)]) == CDATA_END {
 					t.read_offset += len(CDATA_END)
@@ -319,14 +311,10 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
 		case '<':
 			if peek_byte(t) == '!' {
 				if peek_byte(t, 1) == '[' {
-					/*
-						Might be the start of a CDATA tag.
-					*/
+					// Might be the start of a CDATA tag.
 					skip_cdata(t) or_return
 				} else if peek_byte(t, 1) == '-' && peek_byte(t, 2) == '-' {
-					/*
-						Comment start. Eat comment.
-					*/
+					// Comment start. Eat comment.
 					t.read_offset += 3
 					_ = scan_comment(t) or_return
 				}
@@ -342,17 +330,13 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
 		}

 		if t.ch == close {
-			/*
-				If it's not a CDATA or comment, it's the end of this body.
-			*/
+			// If it's not a CDATA or comment, it's the end of this body.
 			break loop
 		}
 		advance_rune(t)
 	}

-	/*
-		Strip trailing whitespace.
-	*/
+	// Strip trailing whitespace.
 	lit := string(t.src[offset : t.offset])

 	end := len(lit)
@@ -369,11 +353,6 @@ scan_string :: proc(t: ^Tokenizer, offset: int, close: rune = '<', consume_close
 	if consume_close {
 		advance_rune(t)
 	}
-
-	/*
-		TODO: Handle decoding escape characters and unboxing CDATA.
-	*/
-
 	return lit, err
 }

@@ -384,7 +363,7 @@ peek :: proc(t: ^Tokenizer) -> (token: Token) {
 	return token
 }

-scan :: proc(t: ^Tokenizer) -> Token {
+scan :: proc(t: ^Tokenizer, multiline_string := false) -> Token {
 	skip_whitespace(t)

 	offset := t.offset
@@ -418,7 +397,7 @@ scan :: proc(t: ^Tokenizer) -> Token {
 		case '"', '\'':
 			kind = .Invalid

-			lit, err = scan_string(t, t.offset, ch, true, false)
+			lit, err = scan_string(t, t.offset, ch, true, multiline_string)
 			if err == .None {
 				kind = .String
 			}
@@ -435,4 +414,4 @@ scan :: proc(t: ^Tokenizer) -> Token {
 		lit = string(t.src[offset : t.offset])
 	}
 	return Token{kind, lit, pos}
-}
+}
@@ -203,9 +203,7 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha

 	doc.elements = make([dynamic]Element, 1024, 1024, allocator)

-	// strings.intern_init(&doc.intern, allocator, allocator)
-
-	err =            .Unexpected_Token
+	err = .Unexpected_Token
 	element, parent: Element_ID
 	open: Token

@@ -259,8 +257,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 				case .Slash:
 					// Empty tag. Close it.
 					expect(t, .Gt) or_return
-					parent      = doc.elements[element].parent
-					element     = parent
+					parent  = doc.elements[element].parent
+					element = parent

 				case:
 					error(t, t.offset, "Expected close tag, got: %#v\n", end_token)
@@ -276,8 +274,8 @@ parse_bytes :: proc(data: []u8, options := DEFAULT_OPTIONS, path := "", error_ha
 					error(t, t.offset, "Mismatched Closing Tag. Expected %v, got %v\n", doc.elements[element].ident, ident.text)
 					return doc, .Mismatched_Closing_Tag
 				}
-				parent      = doc.elements[element].parent
-				element     = parent
+				parent  = doc.elements[element].parent
+				element = parent

 			} else if open.kind == .Exclaim {
 				// <!
@@ -463,8 +461,8 @@ validate_options :: proc(options: Options) -> (validated: Options, err: Error) {
 	return validated, .None
 }

-expect :: proc(t: ^Tokenizer, kind: Token_Kind) -> (tok: Token, err: Error) {
-	tok = scan(t)
+expect :: proc(t: ^Tokenizer, kind: Token_Kind, multiline_string := false) -> (tok: Token, err: Error) {
+	tok = scan(t, multiline_string=multiline_string)
 	if tok.kind == kind { return tok, .None }

 	error(t, t.offset, "Expected \"%v\", got \"%v\".", kind, tok.kind)
@@ -480,7 +478,13 @@ parse_attribute :: proc(doc: ^Document) -> (attr: Attribute, offset: int, err: E
 	offset  = t.offset - len(key.text)

 	_       = expect(t, .Eq)     or_return
-	value  := expect(t, .String) or_return
+	value  := expect(t, .String, multiline_string=true) or_return
+
+	normalized, normalize_err := entity.decode_xml(value.text, {.Normalize_Whitespace}, doc.allocator)
+	if normalize_err == .None {
+		append(&doc.strings_to_free, normalized)
+		value.text = normalized
+	}

 	attr.key = key.text
 	attr.val = value.text