From 487ef06b4fecda9d8aea513ddb30397447662449 Mon Sep 17 00:00:00 2001 From: ryanfleury Date: Wed, 30 Jun 2021 15:15:41 -0600 Subject: [PATCH] get rid of digraph comments, use regular tests as ways of verifying that there is one node per symbolic character. docs pass #1 --- docs/metadesk_reference.md | 228 +++++++++++++++++++------------------ source/md.h | 7 +- tests/sanity_tests.c | 9 -- 3 files changed, 120 insertions(+), 124 deletions(-) diff --git a/docs/metadesk_reference.md b/docs/metadesk_reference.md index a972ac3..ef26316 100644 --- a/docs/metadesk_reference.md +++ b/docs/metadesk_reference.md @@ -126,31 +126,6 @@ main: """ } -@send(Strings) -@doc("This type is used to report the results of consuming one character from a unicode encoded stream.") -@see(MD_CodepointFromUtf8) -@see(MD_CodepointFromUtf16) -@struct MD_UnicodeConsume: { - @doc("The codepoint of the consumed character.") - codepoint: MD_u32, - - @doc("The size of the character in the encoded stream, measured in 'units'. 
A unit is one byte in UTF-8, two bytes in UTF-16, and four bytes in UTF-32.") - advance: MD_u32, -}; - -@send(Strings) -@doc("These constants control the @code `MD_StyledStringFromString` function.") -@enum MD_WordStyle: { - @doc("Creates identifiers that look like this @code `ExampleIdentifier`") - UpperCamelCase, - @doc("Creates identifiers that look like this @code `exampleIdentifier`") - LowerCamelCase, - @doc("Creates identifiers that look like this @code `Example_Identifier`") - UpperCase, - @doc("Creates identifiers that look like this @code `example_identifier`") - LowerCase, -}; - @send(Strings) @doc("These flags control matching rules in routines that perform matching on strings and Metadesk AST nodes.") @see(MD_Node) @@ -176,6 +151,31 @@ main: TagArguments, }; +@send(Strings) +@doc("This type is used to report the results of consuming one character from a unicode encoded stream.") +@see(MD_CodepointFromUtf8) +@see(MD_CodepointFromUtf16) +@struct MD_UnicodeConsume: { + @doc("The codepoint of the consumed character.") + codepoint: MD_u32, + + @doc("The size of the character in the encoded stream, measured in 'units'. A unit is one byte in UTF-8, two bytes in UTF-16, and four bytes in UTF-32.") + advance: MD_u32, +}; + +@send(Strings) +@doc("These constants control how MD_StyledStringFromString forms strings.") +@enum MD_WordStyle: { + @doc("Also known as @code 'PascalCase'. Creates identifiers that look like: @code `ExampleIdentifier`") + UpperCamelCase, + @doc("Creates identifiers that look like: @code `exampleIdentifier`") + LowerCamelCase, + @doc("Creates identifiers that look like: @code `Example_Identifier`") + UpperCase, + @doc("Creates identifiers that look like: @code `example_identifier`") + LowerCase, +}; + //////////////////////////////// //~ Node types that are used to build all ASTs. @@ -201,8 +201,11 @@ main: @doc("A Tag node represents a tag attached to a label node with the @code '@identifer' syntax. 
The children of a tag node represent the arguments placed in the tag.") Tag, + @doc("An ErrorMarker node is generated when reporting errors. It is used to record the location of an error that occurred in the lexing phase of a parse.") + ErrorMarker, + @doc("Not a real kind value given to nodes, this is always one larger than the largest enum value that can be given to a node.") - MAX, + COUNT, }; @send(Nodes) @@ -227,26 +230,33 @@ main: @doc("The delimiter between this node and its next sibling is a @code ';'") BeforeSemicolon, - @doc("The delimiter between this node and its next sibling is a @code ','") - BeforeComma, - @doc("The delimiter between this node and its previous sibling is a @code ';'") AfterSemicolon, + + @doc("The delimiter between this node and its next sibling is a @code ','") + BeforeComma, @doc("The delimiter between this node and its previous sibling is a @code ','") AfterComma, + @doc("This is a string literal, with @code `'` character(s) marking the boundaries.") + StringSingleQuote, + @doc("This is a string literal, with @code `"` character(s) marking the boundaries." "\"") + StringDoubleQuote, + @doc("This is a string literal, with @code '`' character(s) marking the boundaries." "\"") + StringTick, + @doc("This is a string literal that used triplets (three of its boundary characters in a row, on either side) to mark its boundaries, making it multiline.") + StringTriplet, + @doc("The label on this node comes from a token with the @code MD_TokenKind_NumericLiteral kind.") Numeric, @doc("The label on this node comes from a token with the @code MD_TokenKind_Identifier kind.") Identifier, @doc("The label on this node comes from a token with the @code MD_TokenKind_StringLiteral kind.") StringLiteral, - @doc("The label on this node comes from a token with the @code MD_TokenKind_CharLiteral kind.") - CharLiteral, }; @send(Nodes) -@doc("The @code `MD_Node` is the main 'lego-brick' for modeling the result of a metadesk parse. 
Also used in some auxiliary data structures.") +@doc("The @code `MD_Node` is the main 'lego-brick' for modeling the result of a Metadesk parse. Also used in some auxiliary data structures.") @struct MD_Node: { @doc("The next sibling in the hierarchy, or the next tag in a list of tags, or next node in an externally chained linked list.") next: *MD_Node, @@ -280,12 +290,8 @@ main: @doc("The raw string of the comment token after this node, if there is one.") comment_after: MD_String8, - @doc("The name of the file from which this node was parsed; or the name that was passed to the parse call.") - filename: MD_String8, - @doc("The pointer to the base of the raw string from which this node was parsed.") - file_contents: *MD_u8, - @doc("A pointer into the raw string from which this was parsed indicating the beginning of the text that generated this node.") - at: *MD_u8, + @doc("The byte-offset into the string from which this node was parsed. Used for producing data for an MD_CodeLoc.") + offset: MD_u64, @doc("The external pointer from an @code 'MD_NodeKind_Reference' kind node in an externally linked list.") ref_target: *MD_Node, @@ -311,11 +317,11 @@ main: @doc("An abstraction over the types of keys used in a MD_Map and the work of hashing those keys, can be constructed from an MD_String8 or an void*.") @struct MD_MapKey: { @doc("The hash of the key. The hash function used is determined from the key type.") - hash: MD_u64, + hash: MD_u64, @doc("For a non-empty MD_String8, the size of the string data. For a void*, zero.") - size: MD_u64, + size: MD_u64, @doc("For a non-empty MD_String8, points to the string data of the key. 
For a void*, the direct pointer value.") - ptr: *void, + ptr: *void, }; @send(Map) @@ -323,81 +329,79 @@ main: @struct MD_MapSlot: { @doc("The next slot in the same bucket of the MD_Map.") next: *MD_MapSlot, - @doc("For slots with a string key, the hash of the key.") - hash: MD_u64, - @doc("The key for slots with a pointer key.") - key: *void; + @doc("The key that maps to this slot.") + key: MD_MapKey; @doc("The value part of the pair.") value: *void; }; +@send(Map) +@doc("The data used to form a table in an MD_Map. Stores pointers that form a linked list of all MD_MapSlot instances that mapped to this bucket.") +@struct MD_MapBucket: +{ + first: *MD_MapSlot, + last: *MD_MapSlot, +} + @send(Map) @doc("The map is a chained hash table data structure. Data written to the map is a key-value pair. The key of a pair may either be a pointer, or a string. Both types may be mixed inside a single map. Keys stored with one type never match keys of the other type. The values of the pairs are pointers.") @struct MD_Map: { - table_size: MD_u64, - table: **MD_MapSlot, + buckets: *MD_MapBucket, + bucket_count: MD_u64, }; //////////////////////////////// //~ Tokens @send(Tokens) -@enum MD_TokenKind: { - Nil, +@doc("Flags encoding the kind of a token produced by the lexer.") +@see(MD_TokenFromString) +@flags MD_TokenKind: +{ + @doc("When this bit is set, the token follows C-like identifier rules. It may start with an alphabetic character or an underscore, and can contain alphanumeric characters or underscores inside it.") + Identifier, - RegularMin, + @doc("When this bit is set, the token follows C-like numeric literal rules.") + NumericLiteral, - // A group of characters that begins with an underscore or alphabetic character, - // and consists of numbers, alphabetic characters, or underscores after that. - Identifier, + @doc("When this bit is set, the token was recognized as a string literal. 
These may be formed with C-like rules, with a single-quote or double-quote around the string contents. They may also be formed with Metadesk's additional rules. These rules allow using @code '`' characters to mark the boundaries of the string, and also using triplets of any of these characters (@code '```This is a string```') to allow newlines within the string's contents.") + StringLiteral, - // A group of characters beginning with a numeric character or a '-', and then - // consisting of only numbers, alphabetic characters, or '.'s after that. - NumericLiteral, + @doc("When this bit is set, the token was recognized as a symbolic character. Whether a character is considered symbolic is determined by the MD_CharIsSymbol function.") + Symbol, - // A group of arbitrary characters, grouped together by a " character, OR by a - // """ symbol at the beginning and end of the group. String literals beginning with - // " are to only be specified on a single line, but """ strings can exist across - // many lines. - StringLiteral, + @doc("When this bit is set, the token is reserved for special uses by the Metadesk parser.") + Reserved, - // A group of arbitrary characters, grouped together by a ' character at the - // beginning, and a ' character at the end. - CharLiteral, + @doc("When this bit is set, the token was recognized as a comment. Comments can be formed in the traditional C-like ways, using @code '//' for single-line, or @code '/*' and @code '*/' for multiline. Metadesk differs, slightly, in that it allows nested multiline comments. So, every @code '/*' must be matched by a @code '*/'.") + Comment, - // A group of symbolic characters. The symbolic characters are: - // ~!@#$%^&*()-+=[{]}:;<>,./?|\ - // - // Groups of multiple characters are only allowed in specific circumstances. 
Most of these - // are only 1 character long, but some groups are allowed: - // - // "<<", ">>", "<=", ">=", "+=", "-=", "*=", "/=", "::", ":=", "==", "&=", "|=", "->" - Symbol, + @doc("When this bit is set, the token contains only whitespace.") + Whitespace, - RegularMax, + @doc("When this bit is set, the token is a newline character.") + Newline, - Comment, - Whitespace, - Newline, - BadCharacter, + @doc("When this bit is set, the token is a comment that was malformed syntactically.") + BrokenComment, - MAX, -}; + @doc("When this bit is set, the token is a string literal that was malformed syntactically.") + BrokenStringLiteral, + + @doc("When this bit is set, the token contains a character in an encoding that is not supported by the Metadesk parser.") + BadCharacter, +} @send(Tokens) +@doc("The type used for encoding data about any token produced by the lexer.") @struct MD_Token: { - kind: MD_TokenKind, - string: MD_String8, - outer_string: MD_String8, -}; - -@send(Tokens) -@prefix(MD_TokenGroup) -@base_type(MD_u32) -@flags MD_TokenGroups: { - Comment, - Whitespace, - Regular, + kind: MD_TokenKind; + @doc("Flags that should be attached to an MD_Node that uses this token to define its string. 
Only includes flags that can be understood by the lexer; is not the comprehensive set of node flags that a node needs.") + node_flags: MD_NodeFlags; + @doc("The contents of this token, not including any boundary characters.") + string: MD_String8; + @doc("The full contents of the string used to form this token, including all boundary characters.") + outer_string: MD_String8; }; //////////////////////////////// @@ -927,55 +931,55 @@ main: @send(Map) MD_MapMakeBucketCount: { - bucket_count: MD_u64, - return: MD_Map, + bucket_count: MD_u64, + return: MD_Map, }; @send(Map) MD_MapMake: { - return: MD_Map, + return: MD_Map, }; @send(Map) MD_MapKeyStr: { - string: MD_String8, - return: MD_MapKey, + string: MD_String8, + return: MD_MapKey, }; @send(Map) MD_MapKeyPtr: { - ptr: *void, - return: MD_MapKey, + ptr: *void, + return: MD_MapKey, }; @send(Map) MD_MapLookup: { - map: *MD_Map, - key: MD_MapKey, - return: *MD_MapSlot, + map: *MD_Map, + key: MD_MapKey, + return: *MD_MapSlot, }; @send(Map) MD_MapScan: { - first_slot: *MD_MapSlot, - key: MD_MapKey, - return: *MD_MapSlot, + first_slot: *MD_MapSlot, + key: MD_MapKey, + return: *MD_MapSlot, }; @send(Map) MD_MapInsert: { - map: *MD_Map, - key: MD_MapKey, - val: *void, - return: *MD_MapSlot, + map: *MD_Map, + key: MD_MapKey, + val: *void, + return: *MD_MapSlot, }; @send(Map) MD_MapOverwrite: { - map: *MD_Map, - key: MD_MapKey, - val: *void, - return: *MD_MapSlot, + map: *MD_Map, + key: MD_MapKey, + val: *void, + return: *MD_MapSlot, }; diff --git a/source/md.h b/source/md.h index 383bbea..0f418b8 100644 --- a/source/md.h +++ b/source/md.h @@ -12,7 +12,7 @@ // [ ] pass all tests // [ ] simplify error sorting and catastrophic error handling // [ ] integer -> string helpers -// [ ] {rjf} remove symbol digraphs (maybe a test for this or something) and remove from comments +// [x] {rjf} remove symbol digraphs (maybe a test for this or something) and remove from comments // [x] stb_snprintf included and modified for %S ~ MD_String8 
// [ ] naming pass // [ ] {rjf} get the branches/labels setup on Git for beta 0.1 and dev @@ -378,8 +378,8 @@ typedef struct MD_CodeLoc MD_CodeLoc; struct MD_CodeLoc { MD_String8 filename; - int line; - int column; + MD_u32 line; + MD_u32 column; }; //~ String-To-Ptr and Ptr-To-Ptr tables @@ -432,6 +432,7 @@ enum MD_TokenKind_BadCharacter = (1<<10), }; +typedef MD_u32 MD_TokenGroups; enum { MD_TokenGroup_Comment = MD_TokenKind_Comment, diff --git a/tests/sanity_tests.c b/tests/sanity_tests.c index 9bb26fb..80c6647 100644 --- a/tests/sanity_tests.c +++ b/tests/sanity_tests.c @@ -223,12 +223,8 @@ int main(void) MD_PushChild(size, MakeTestNode(MD_NodeKind_Label, MD_S8Lit("u64"))); MD_PushChild(params, size); MD_PushChild(tree, params); - // TODO(rjf): This test will fail once we have digraphs implemented. Adjust the separate - // "-" and ">" set members, and combine them to form a single "->" set member. - // { MD_PushChild(tree, MakeTestNode(MD_NodeKind_Label, MD_S8Lit("-"))); MD_PushChild(tree, MakeTestNode(MD_NodeKind_Label, MD_S8Lit(">"))); - // } MD_PushChild(tree, MakeTestNode(MD_NodeKind_Label, MD_S8Lit("*"))); MD_PushChild(tree, MakeTestNode(MD_NodeKind_Label, MD_S8Lit("void"))); TestResult(MatchParsedWithNode(string, tree)); @@ -287,11 +283,6 @@ int main(void) TestResult(parse.node->first_child->flags & MD_NodeFlag_BeforeSemicolon); TestResult(parse.node->first_child->next->flags & MD_NodeFlag_AfterSemicolon); } - { - // TODO(rjf): Enable this once we have digraphs. - // MD_ParseResult parse = MD_ParseOneNode(MD_S8Lit(""), MD_S8Lit("(a -> b)")); - // TestResult(parse.node->first_child->flags & MD_NodeFlag_BeforeArrow); - } } Test("Node Text Flags")