get rid of digraph comments, use regular tests as ways of verifying that there is one node per symbolic character. docs pass #1

This commit is contained in:
ryanfleury
2021-06-30 15:15:41 -06:00
parent a477709af3
commit 487ef06b4f
3 changed files with 120 additions and 124 deletions
+116 -112
View File
@@ -126,31 +126,6 @@ main:
"""
}
@send(Strings)
@doc("This type is used to report the results of consuming one character from a unicode encoded stream.")
@see(MD_CodepointFromUtf8)
@see(MD_CodepointFromUtf16)
@struct MD_UnicodeConsume: {
@doc("The codepoint of the consumed character.")
codepoint: MD_u32,
@doc("The size of the character in the encoded stream, measured in 'units'. A unit is one byte in UTF-8, two bytes in UTF-16, and four bytes in UTF-32.")
advance: MD_u32,
};
@send(Strings)
@doc("These constants control the @code `MD_StyledStringFromString` function.")
@enum MD_WordStyle: {
@doc("Creates identifiers that look like this @code `ExampleIdentifier`")
UpperCamelCase,
@doc("Creates identifiers that look like this @code `exampleIdentifier`")
LowerCamelCase,
@doc("Creates identifiers that look like this @code `Example_Identifier`")
UpperCase,
@doc("Creates identifiers that look like this @code `example_identifier`")
LowerCase,
};
@send(Strings)
@doc("These flags control matching rules in routines that perform matching on strings and Metadesk AST nodes.")
@see(MD_Node)
@@ -176,6 +151,31 @@ main:
TagArguments,
};
@send(Strings)
@doc("This type is used to report the results of consuming one character from a unicode encoded stream.")
@see(MD_CodepointFromUtf8)
@see(MD_CodepointFromUtf16)
@struct MD_UnicodeConsume: {
@doc("The codepoint of the consumed character.")
codepoint: MD_u32,
@doc("The size of the character in the encoded stream, measured in 'units'. A unit is one byte in UTF-8, two bytes in UTF-16, and four bytes in UTF-32.")
advance: MD_u32,
};
@send(Strings)
@doc("These constants control how MD_StyledStringFromString forms strings.")
@enum MD_WordStyle: {
@doc("Also known as @code 'PascalCase'. Creates identifiers that look like: @code `ExampleIdentifier`")
UpperCamelCase,
@doc("Creates identifiers that look like: @code `exampleIdentifier`")
LowerCamelCase,
@doc("Creates identifiers that look like: @code `Example_Identifier`")
UpperCase,
@doc("Creates identifiers that look like: @code `example_identifier`")
LowerCase,
};
////////////////////////////////
//~ Node types that are used to build all ASTs.
@@ -201,8 +201,11 @@ main:
@doc("A Tag node represents a tag attached to a label node with the @code '@identifer' syntax. The children of a tag node represent the arguments placed in the tag.")
Tag,
@doc("An ErrorMarker node is generated when reporting errors. It is used to record the location of an error that occurred in the lexing phase of a parse.")
ErrorMarker,
@doc("Not a real kind value given to nodes, this is always one larger than the largest enum value that can be given to a node.")
MAX,
COUNT,
};
@send(Nodes)
@@ -227,26 +230,33 @@ main:
@doc("The delimiter between this node and its next sibling is a @code ';'")
BeforeSemicolon,
@doc("The delimiter between this node and its next sibling is a @code ','")
BeforeComma,
@doc("The delimiter between this node and its previous sibling is a @code ';'")
AfterSemicolon,
@doc("The delimiter between this node and its next sibling is a @code ','")
BeforeComma,
@doc("The delimiter between this node and its previous sibling is a @code ','")
AfterComma,
@doc("This is a string literal, with @code `'` character(s) marking the boundaries.")
StringSingleQuote,
@doc("This is a string literal, with @code `"` character(s) marking the boundaries." "\"")
StringDoubleQuote,
@doc("This is a string literal, with @code '`' character(s) marking the boundaries." "\"")
StringTick,
@doc("This is a string literal that used triplets (three of its boundary characters in a row, on either side) to mark its boundaries, making it multiline.")
StringTriplet,
@doc("The label on this node comes from a token with the @code MD_TokenKind_NumericLiteral kind.")
Numeric,
@doc("The label on this node comes from a token with the @code MD_TokenKind_Identifier kind.")
Identifier,
@doc("The label on this node comes from a token with the @code MD_TokenKind_StringLiteral kind.")
StringLiteral,
@doc("The label on this node comes from a token with the @code MD_TokenKind_CharLiteral kind.")
CharLiteral,
};
@send(Nodes)
@doc("The @code `MD_Node` is the main 'lego-brick' for modeling the result of a metadesk parse. Also used in some auxiliary data structures.")
@doc("The @code `MD_Node` is the main 'lego-brick' for modeling the result of a Metadesk parse. Also used in some auxiliary data structures.")
@struct MD_Node: {
@doc("The next sibling in the hierarchy, or the next tag in a list of tags, or next node in an externally chained linked list.")
next: *MD_Node,
@@ -280,12 +290,8 @@ main:
@doc("The raw string of the comment token after this node, if there is one.")
comment_after: MD_String8,
@doc("The name of the file from which this node was parsed; or the name that was passed to the parse call.")
filename: MD_String8,
@doc("The pointer to the base of the raw string from which this node was parsed.")
file_contents: *MD_u8,
@doc("A pointer into the raw string from which this was parsed indicating the beginning of the text that generated this node.")
at: *MD_u8,
@doc("The byte-offset into the string from which this node was parsed. Used for producing data for an MD_CodeLoc.")
offset: MD_u64,
@doc("The external pointer from an @code 'MD_NodeKind_Reference' kind node in an externally linked list.")
ref_target: *MD_Node,
@@ -311,11 +317,11 @@ main:
@doc("An abstraction over the types of keys used in a MD_Map and the work of hashing those keys, can be constructed from an MD_String8 or an void*.")
@struct MD_MapKey: {
@doc("The hash of the key. The hash function used is determined from the key type.")
hash: MD_u64,
hash: MD_u64,
@doc("For a non-empty MD_String8, the size of the string data. For a void*, zero.")
size: MD_u64,
size: MD_u64,
@doc("For a non-empty MD_String8, points to the string data of the key. For a void*, the direct pointer value.")
ptr: *void,
ptr: *void,
};
@send(Map)
@@ -323,81 +329,79 @@ main:
@struct MD_MapSlot: {
@doc("The next slot in the same bucket of the MD_Map.")
next: *MD_MapSlot,
@doc("For slots with a string key, the hash of the key.")
hash: MD_u64,
@doc("The key for slots with a pointer key.")
key: *void;
@doc("The key that maps to this slot.")
key: MD_MapKey;
@doc("The value part of the pair.")
value: *void;
};
@send(Map)
@doc("The data used to form a table in an MD_Map. Stores pointers that form a linked list of all MD_MapSlot instances that mapped to this bucket.")
@struct MD_MapBucket:
{
first: *MD_MapSlot,
last: *MD_MapSlot,
}
@send(Map)
@doc("The map is a chained hash table data structure. Data written to the map is a key-value pair. The key of a pair may either be a pointer, or a string. Both types may be mixed inside a single map. Keys stored with one type never match keys of the other type. The values of the pairs are pointers.")
@struct MD_Map: {
table_size: MD_u64,
table: **MD_MapSlot,
buckets: *MD_MapBucket,
bucket_count: MD_u64,
};
////////////////////////////////
//~ Tokens
@send(Tokens)
@enum MD_TokenKind: {
Nil,
@doc("Flags encoding the kind of a token produced by the lexer.")
@see(MD_TokenFromString)
@flags MD_TokenKind:
{
@doc("When this bit is set, the token follows C-like identifier rules. It may start with an alphabetic character or an underscore, and can contain alphanumeric characters or underscores inside it.")
Identifier,
RegularMin,
@doc("When this bit is set, the token follows C-like numeric literal rules.")
NumericLiteral,
// A group of characters that begins with an underscore or alphabetic character,
// and consists of numbers, alphabetic characters, or underscores after that.
Identifier,
@doc("When this bit is set, the token was recognized as a string literal. These may be formed with C-like rules, with a single-quote or double-quote around the string contents. They may also be formed with Metadesk's additional rules. These rules allow using @code '`' characters to mark the boundaries of the string, and also using triplets of any of these characters (@code '```This is a string```') to allow newlines within the string's contents.")
StringLiteral,
// A group of characters beginning with a numeric character or a '-', and then
// consisting of only numbers, alphabetic characters, or '.'s after that.
NumericLiteral,
@doc("When this bit is set, the token was recognized as a symbolic character. Whether a character is considered symbolic is determined by the MD_CharIsSymbol function.")
Symbol,
// A group of arbitrary characters, grouped together by a " character, OR by a
// """ symbol at the beginning and end of the group. String literals beginning with
// " are to only be specified on a single line, but """ strings can exist across
// many lines.
StringLiteral,
@doc("When this bit is set, the token is reserved for special uses by the Metadesk parser.")
Reserved,
// A group of arbitrary characters, grouped together by a ' character at the
// beginning, and a ' character at the end.
CharLiteral,
@doc("When this bit is set, the token was recognized as a comment. Comments can be formed in the traditional C-like ways, using @code '//' for single-line, or @code '/*' and @code '*/' for multiline. Metadesk differs, slightly, in that it allows nested multiline comments. So, every @code '/*' must be matched by a @code '*/'.")
Comment,
// A group of symbolic characters. The symbolic characters are:
// ~!@#$%^&*()-+=[{]}:;<>,./?|\
//
// Groups of multiple characters are only allowed in specific circumstances. Most of these
// are only 1 character long, but some groups are allowed:
//
// "<<", ">>", "<=", ">=", "+=", "-=", "*=", "/=", "::", ":=", "==", "&=", "|=", "->"
Symbol,
@doc("When this bit is set, the token contains only whitespace.")
Whitespace,
RegularMax,
@doc("When this bit is set, the token is a newline character.")
Newline,
Comment,
Whitespace,
Newline,
BadCharacter,
@doc("When this bit is set, the token is a comment that was malformed syntactically.")
BrokenComment,
MAX,
};
@doc("When this bit is set, the token is a string literal that was malformed syntactically.")
BrokenStringLiteral,
@doc("When this bit is set, the token contains a character in an encoding that is not supported by the parser Metadesk.")
BadCharacter,
}
@send(Tokens)
@doc("The type used for encoding data about any token produced by the lexer.")
@struct MD_Token: {
kind: MD_TokenKind,
string: MD_String8,
outer_string: MD_String8,
};
@send(Tokens)
@prefix(MD_TokenGroup)
@base_type(MD_u32)
@flags MD_TokenGroups: {
Comment,
Whitespace,
Regular,
kind: MD_TokenKind;
@doc("Flags that should be attached to an MD_Node that uses this token to define its string. Only includes flags that can be understood by the lexer; is not the comprehensive set of node flags that a node needs.")
node_flags: MD_NodeFlags;
@doc("The contents of this token, not including any boundary characters.")
string: MD_String8;
@doc("The full contents of the string used to form this token, including all boundary characters.")
outer_string: MD_String8;
};
////////////////////////////////
@@ -927,55 +931,55 @@ main:
@send(Map)
MD_MapMakeBucketCount: {
bucket_count: MD_u64,
return: MD_Map,
bucket_count: MD_u64,
return: MD_Map,
};
@send(Map)
MD_MapMake: {
return: MD_Map,
return: MD_Map,
};
@send(Map)
MD_MapKeyStr: {
string: MD_String8,
return: MD_MapKey,
string: MD_String8,
return: MD_MapKey,
};
@send(Map)
MD_MapKeyPtr: {
ptr: *void,
return: MD_MapKey,
ptr: *void,
return: MD_MapKey,
};
@send(Map)
MD_MapLookup: {
map: *MD_Map,
key: MD_MapKey,
return: *MD_MapSlot,
map: *MD_Map,
key: MD_MapKey,
return: *MD_MapSlot,
};
@send(Map)
MD_MapScan: {
first_slot: *MD_MapSlot,
key: MD_MapKey,
return: *MD_MapSlot,
first_slot: *MD_MapSlot,
key: MD_MapKey,
return: *MD_MapSlot,
};
@send(Map)
MD_MapInsert: {
map: *MD_Map,
key: MD_MapKey,
val: *void,
return: *MD_MapSlot,
map: *MD_Map,
key: MD_MapKey,
val: *void,
return: *MD_MapSlot,
};
@send(Map)
MD_MapOverwrite: {
map: *MD_Map,
key: MD_MapKey,
val: *void,
return: *MD_MapSlot,
map: *MD_Map,
key: MD_MapKey,
val: *void,
return: *MD_MapSlot,
};
+4 -3
View File
@@ -12,7 +12,7 @@
// [ ] pass all tests
// [ ] simplify error sorting and catastrophic error handling
// [ ] integer -> string helpers
// [ ] {rjf} remove symbol digraphs (maybe a test for this or something) and remove from comments
// [x] {rjf} remove symbol digraphs (maybe a test for this or something) and remove from comments
// [x] stb_snprintf included and modified for %S ~ MD_String8
// [ ] naming pass
// [ ] {rjf} get the branches/labels setup on Git for beta 0.1 and dev
@@ -378,8 +378,8 @@ typedef struct MD_CodeLoc MD_CodeLoc;
struct MD_CodeLoc
{
MD_String8 filename;
int line;
int column;
MD_u32 line;
MD_u32 column;
};
//~ String-To-Ptr and Ptr-To-Ptr tables
@@ -432,6 +432,7 @@ enum
MD_TokenKind_BadCharacter = (1<<10),
};
typedef MD_u32 MD_TokenGroups;
enum
{
MD_TokenGroup_Comment = MD_TokenKind_Comment,
-9
View File
@@ -223,12 +223,8 @@ int main(void)
MD_PushChild(size, MakeTestNode(MD_NodeKind_Label, MD_S8Lit("u64")));
MD_PushChild(params, size);
MD_PushChild(tree, params);
// TODO(rjf): This test will fail once we have digraphs implemented. Adjust the separate
// "-" and ">" set members, and combine them to form a single "->" set member.
// {
MD_PushChild(tree, MakeTestNode(MD_NodeKind_Label, MD_S8Lit("-")));
MD_PushChild(tree, MakeTestNode(MD_NodeKind_Label, MD_S8Lit(">")));
// }
MD_PushChild(tree, MakeTestNode(MD_NodeKind_Label, MD_S8Lit("*")));
MD_PushChild(tree, MakeTestNode(MD_NodeKind_Label, MD_S8Lit("void")));
TestResult(MatchParsedWithNode(string, tree));
@@ -287,11 +283,6 @@ int main(void)
TestResult(parse.node->first_child->flags & MD_NodeFlag_BeforeSemicolon);
TestResult(parse.node->first_child->next->flags & MD_NodeFlag_AfterSemicolon);
}
{
// TODO(rjf): Enable this once we have digraphs.
// MD_ParseResult parse = MD_ParseOneNode(MD_S8Lit(""), MD_S8Lit("(a -> b)"));
// TestResult(parse.node->first_child->flags & MD_NodeFlag_BeforeArrow);
}
}
Test("Node Text Flags")