From adbcb2a83b7ffddf3896f24c1d1785c2ba36d19f Mon Sep 17 00:00:00 2001 From: Ed_ Date: Mon, 16 Dec 2024 14:57:01 -0500 Subject: [PATCH] Progress on parser documentation --- base/components/parser.cpp | 7 - base/components/parser_types.hpp | 4 - docs/Parser_Algo.md | 681 ++++++++++++++++++++++--------- docs/Parsing.md | 87 +++- 4 files changed, 567 insertions(+), 212 deletions(-) diff --git a/base/components/parser.cpp b/base/components/parser.cpp index 0fd59ad..f72c1b4 100644 --- a/base/components/parser.cpp +++ b/base/components/parser.cpp @@ -4133,13 +4133,6 @@ CodeFn parser_parse_function() } // - // Note(Ed): We're enforcing that using this codepath requires non-macro jank. - // Code macro_stmt = parse_macro_as_definiton(attributes, specifiers); - // if (macro_stmt) { - // parser_pop(& _ctx->parser); - // return macro_stmt; - // } - CodeTypename ret_type = parser_parse_type(parser_not_from_template, nullptr); if ( cast(Code, ret_type) == Code_Invalid ) { parser_pop(& _ctx->parser); diff --git a/base/components/parser_types.hpp b/base/components/parser_types.hpp index f98291e..ffe9b18 100644 --- a/base/components/parser_types.hpp +++ b/base/components/parser_types.hpp @@ -132,8 +132,6 @@ enum MacroType : u16 MT_Expression, // A macro is assumed to be a expression if not resolved. MT_Statement, MT_Typename, - MT_Attribute, // More of a note to the parser than anythign else (attributes should be defined in the user attribues def). - MT_Specifier, // More of a note to the parser than anythign else (specifiers should be defined in the user attribues def). 
MT_Block_Start, // Not Supported yet MT_Block_End, // Not Supported yet MT_Case_Statement, // Not Supported yet @@ -160,8 +158,6 @@ Str macrotype_to_str( MacroType type ) { "Statement", sizeof("Statement") - 1 }, { "Expression", sizeof("Expression") - 1 }, { "Typename", sizeof("Typename") - 1 }, - { "Attribute(Macro)", sizeof("Attribute(Macro)") - 1 }, - { "Specifier(Macro)", sizeof("Specifier(Macro)") - 1 }, { "Block_Start", sizeof("Block_Start") - 1 }, { "Block_End", sizeof("Block_End") - 1 }, { "Case_Statement", sizeof("Case_Statement") - 1 }, diff --git a/docs/Parser_Algo.md b/docs/Parser_Algo.md index 8180168..ec23954 100644 --- a/docs/Parser_Algo.md +++ b/docs/Parser_Algo.md @@ -12,7 +12,9 @@ gencpp uses a hand-written recursive descent parser. Both the lexer and parser c ### Lexer -The lex procedure does the lexical pass of content provided as a `Str` type. +File: [lexer.cpp](../base/components/lexer.cpp) + +The `lex` procedure does the lexical pass of content provided as a `Str` type. The tokens are stored (for now) in `Lexer_Tokens`. Fields: @@ -24,17 +26,15 @@ s32 Idx; What token types are supported can be found in [ETokType.csv](../base/enums/ETokType.csv) you can also find the token types in [ETokType.h](../base/components/gen/etoktype.cpp) , which is the generated enum from the csv file. -Tokens are defined with the struct `gen::parser::Token`: - -Fields: - ```cpp -char const* Text; -sptr Length; -TokType Type; -s32 Line; -s32 Column; -u32 Flags; +struct Token +{ + Str Text; + TokType Type; + s32 Line; + s32 Column; + u32 Flags; +} ``` Flags is a bitfield made up of TokFlags (Token Flags): @@ -52,25 +52,17 @@ Flags is a bitfield made up of TokFlags (Token Flags): * `TF_EndDefinition` : Can be interpreted as an end definition for a scope. * `TF_Formatting` : Considered a part of the formatting * `TF_Literal` : Anything considered a literal by C++. 
- -I plan to replace IsAssign with a general flags field and properly keep track of all operator types instead of abstracting it away to `ETokType::Operator`. - -Traversing the tokens is done with the following interface macros: - -| Macro | Description | -| --- | --- | -| `currtok_noskip` | Get the current token without skipping whitespace | -| `currtok` | Get the current token, skip any whitespace tokens | -| `prevtok` | Get the previous token (does not skip whitespace) | -| `nexttok` | Get the next token (does not skip whitespace) | -| `eat( Token Type )` | Check to see if the current token is of the given type, if so, advance Token's index to the next token | -| `left` | Get the number of tokens left in the token array | -| `check_noskip` | Check to see if the current token is of the given type, without skipping whitespace | -| `check` | Check to see if the current token is of the given type, skip any whitespace tokens | +* `TF_Macro_Functional` : Used to indicate macro token is flagged as `MF_Functional`. +* `TF_Macro_Expects_Body` : Used to indicate macro token is flagged as `MF_Expects_Body`. ### Parser -The parser has a limited user interface, only specific types of definitions or statements are expected to be provided by the user directly when using to construct an AST dynamically (See SOA for example). It however does attempt to provide capability to parse a full C/C++ from production codebases. +Files: + +* [interface.parsing.cpp](../base/components/interface.parsing.cpp) +* [parser.cpp](../base/components/parser.cpp) + +The parser has a limited user interface, only specific types of definitions or statements are expected to be provided by the user directly when using it to construct an AST dynamically. It however does attempt to provide capability to parse a full C/C++ from production codebases. 
Each public user interface procedure has the following format: @@ -89,8 +81,7 @@ Each public user interface procedure has the following format: } ``` -The most top-level parsing procedure used for C/C++ file parsing is `parse_global_body`: - +The most top-level parsing procedure used for C/C++ file parsing is `parse_global_body`. It uses a helper procedure called `parse_global_nspace`. Each internal procedure will have the following format: @@ -111,126 +102,300 @@ internal } ``` +The parsing implementation contains throughut the codeapths to indicate how far their contextual AST node has been resolved. + +Example: + +```c +internal +CodeFn parser_parse_function() +{ + push_scope(); + + Specifier specs_found[16] = { Spec_NumSpecifiers }; + s32 NumSpecifiers = 0; + + CodeAttributes attributes = { nullptr }; + CodeSpecifiers specifiers = { nullptr }; + ModuleFlag mflags = ModuleFlag_None; + + if ( check(Tok_Module_Export) ) { + mflags = ModuleFlag_Export; + eat( Tok_Module_Export ); + } + // + + attributes = parse_attributes(); + // + + while ( left && tok_is_specifier(currtok) ) + { + Specifier spec = str_to_specifier( tok_to_str(currtok) ); + + switch ( spec ) + { + GEN_PARSER_FUNCTION_ALLOWED_SPECIFIERS_CASES: + break; + + default: + log_failure( "Invalid specifier %S for functon\n%SB", spec_to_str(spec), parser_to_strbuilder(_ctx->parser) ); + parser_pop(& _ctx->parser); + return InvalidCode; + } + + if ( spec == Spec_Const ) + continue; + + specs_found[NumSpecifiers] = spec; + NumSpecifiers++; + eat( currtok.Type ); + } + + if ( NumSpecifiers ) { + specifiers = def_specifiers_arr( NumSpecifiers, specs_found ); + } + // + + CodeTypename ret_type = parser_parse_type(parser_not_from_template, nullptr); + if ( cast(Code, ret_type) == Code_Invalid ) { + parser_pop(& _ctx->parser); + return InvalidCode; + } + // + + Token name = parse_identifier(nullptr); + _ctx->parser.Scope->Name = name.Text; + if ( ! 
tok_is_valid(name) ) { + parser_pop(& _ctx->parser); + return InvalidCode; + } + // + + CodeFn result = parse_function_after_name( mflags, attributes, specifiers, ret_type, name ); + // ... + + parser_pop(& _ctx->parser); + return result; +} +``` + +In the above `parse_function` implementation: + +`// ...` + +Will be conventionlly used where by that point in time for the codepath: `` should be resolved for the AST. + +## Outline of parsing codepaths + Below is an outline of the general alogirithim used for these internal procedures. The intention is to provide a basic briefing to aid the user in traversing the actual code definitions. These appear in the same order as they are in the `parser.cpp` file ***NOTE: This is still heavily in an alpha state. A large swaph of this can change, make sure these docs are up to date before considering them 1:1 with the repo commit your considering.*** ## `parse_array_decl` -1. Check if its an array declaration with no expression. - 1. Consume and return empty array declaration -2. Opening square bracket -3. Consume expression -4. Closing square bracket -5. If adjacent opening bracket - 1. Repeat array declaration parse until no brackets remain - -## `parse_assignment_expression` - -1. Eat the assignment operator -2. Make sure there is content or at least an end statement after. -3. Flatten the assignment expression to an untyped Code string. +1. Push parser scope +2. Check for empty array `[]` + 1. Return untyped string with single space if found +3. Check for opening bracket `[` + 1. Validate parser not at EOF + 2. Reject empty array expression + 3. Capture expression tokens until closing bracket + 4. Calculate expression span length + 5. Convert to untyped string +4. Validate and consume closing bracket `]` +5. Handle multi-dimensional case + 1. If adjacent `[` detected, recursively parse + 2. Link array expressions via Next pointer +6. Pop parser scope +7. 
Return array expression or NullCode on failure ## `parse_attributes` -1. Check for standard attribute -2. Check for GNU attribute -3. Check for MSVC attribute -4. Check for a token registered as an attribute - a. Check and grab the arguments of a token registered of an attribute if it has any. -5. Repeat for chained attributes. Flatten them to a single attribute AST node. +1. Push parser scope and initialize tracking + 1. Store initial token position + 2. Initialize length counter +2. Process attributes while available + 1. Handle C++ style attributes `[[...]]` + 1. Consume opening `[[` + 2. Capture content until closing `]]` + 3. Calculate attribute span length + 2. Handle GNU style `__attribute__((...))` + 1. Consume `__attribute__` keyword and opening `((` + 2. Capture content until closing `))` + 3. Calculate attribute span length + 3. Handle MSVC style `__declspec(...)` + 1. Consume `__declspec` and opening `(` + 2. Capture content until closing `)` + 3. Calculate attribute span length + 4. Handle macro-style attributes + 1. Consume attribute token + 2. If followed by parentheses + 1. Handle nested parentheses tracking + 2. Capture content maintaining paren balance + 3. Calculate attribute span length +3. Generate attribute code if content captured + 1. Create attribute text span from start position and length + 2. Strip formatting from attribute text + 3. Construct Code node + 1. Set type to `CT_PlatformAttributes` + 2. Cache and set Name and Content fields + 4. Return as CodeAttributes +4. Pop parser scope +5. Return NullCode if no attributes found ## `parse_class_struct` -1. Check for export module specifier -2. class or struct keyword -3. `parse_attributes` -4. If identifier : `parse_identifier` -5. Parse inherited parent or interfaces -6. If opening curly brace : `parse_class_struct_body` -7. If not an inplace definition - 1. End statement - 2. Check for inline comment +1. Validate token type is class or struct + 1. 
Return InvalidCode if validation fails +2. Initialize class/struct metadata + 1. Access specifier (default) + 2. Parent class/struct reference + 3. Class/struct body + 4. Attributes + 5. Module flags +3. Parse module export flag if present + 1. Set ModuleFlag_Export + 2. Consume export token +4. Consume class/struct token +5. Parse attributes via `parse_attributes()` +6. Parse class/struct identifier + 1. Update parser scope name +7. Initialize interface array (4KB arena) +8. Parse inheritance/implementation + 1. If classifier token (`:`) present + 1. Parse access specifier if exists + 2. Parse parent class/struct name + 3. Parse additional interfaces + 1. Separated by commas + 2. Optional access specifiers + 3. Store in interface array +9. Parse class body if present + 1. Triggered by opening brace + 2. Parse via `parse_class_struct_body` +10. Handle statement termination + 1. Skip for inplace definitions + 2. Consume semicolon + 3. Parse inline comment if present +11. Construct result node + 1. Create class definition if Tok_Decl_Class + 2. Create struct definition if Tok_Decl_Struct + 3. Attach inline comment if exists +12. Cleanup interface array and return result ## `parse_class_struct_body` -1. Opening curly brace -2. Parse the body (Possible options): - 1. Ignore dangling end statements - 2. Newline : ast constant - 3. Comment : `parse_comment` - 4. Access_Public : ast constant - 5. Access_Protected : ast constant - 6. Access_Private : ast constant - 7. Decl_Class : `parse_complicated_definition` - 8. Decl_Enum : `parse_complicated_definition` - 9. Decl_Friend : `parse_friend` - 10. Decl_Operator : `parse_operator_cast` - 11. Decl_Struct : `parse_complicated_definition` - 12. Decl_Template : `parse_template` - 13. Decl_Typedef : `parse_typedef` - 14. Decl_Union : `parse_complicated_definition` - 15. Decl_Using : `parse_using` - 16. Operator == '~' - 1. `parse_destructor` - 17. Preprocess_Define : `parse_define` - 18. 
Preprocess_Include : `parse_include` - 19. Preprocess_Conditional (if, ifdef, ifndef, elif, else, endif) : `parse_preprocess_cond` or else/endif ast constant - 20. Preprocess_Macro : `parse_simple_preprocess` - 21. Preprocess_Pragma : `parse_pragma` - 22. Preprocess_Unsupported : `parse_simple_preprocess` - 23. StaticAssert : `parse_static_assert` - 24. The following compound into a resolved definition or declaration: - 1. Attributes (Standard, GNU, MSVC) : `parse_attributes` - 2. Specifiers (consteval, constexpr, constinit, explicit, forceinline, inline, mutable, neverinline, static, volatile, virtual) - 3. Possible Destructor : `parse_destructor` - 4. Possible User defined operator cast : `parse_operator_cast` - 5. Possible Constructor : `parse_constructor` - 6. Something that has the following: (identifier, const, unsigned, signed, short, long, bool, char, int, double) - 1. Possible Constructor `parse_constructor` - 2. Possible Operator, Function, or varaible : `parse_operator_function_or_variable` - 25. Something completely unknown (will just make untyped...) : `parse_untyped` +1. Initialize scope and body structure + 1. Push parser scope + 2. Consume opening brace + 3. Create code node with `CT_Class_Body` or `CT_Struct_Body` type +2. Parse body members while not at closing brace + 1. Initialize member parsing state + 1. Code member (InvalidCode) + 2. Attributes (null) + 3. Specifiers (null) + 4. Function expectation flag + 2. Handle preprocessor hash if present + 3. Process member by token type in switch statement + 1. Statement termination - warn and skip + 2. Newline - format member + 3. Comments - parse comment + 4. Access specifiers - `public/protected/private` + 5. Declarations - `class/enum/struct/union/typedef/using` + 6. Operators - `destructors/casts` + 7. Preprocessor directives - `define/include/conditionals/pragmas` + 8. Preprocessor statement macros + 9. Report naked preprocessor expression macros detected as an error. + 10. 
Static assertions + 11. Attributes and specifiers + 1. Parse attributes + 2. Parse valid member specifiers + 3. Handle `attribute-specifier-attribute` case + 12. Identifiers and types + 1. Check for constructors + 2. Parse `operator/function/variable` + 13. Default - capture unknown content until closing brace + 4. Validate member parsing + 1. Return InvalidCode if member invalid + 2. Append valid member to body +3. Finalize body + 1. Consume closing brace + 2. Pop parser scope + 3. Return completed CodeBody ## `parse_comment` 1. Just wrap the token into a cached string ( the lexer did the processing ) -## `parse_compilcated_definition` +## `parse_complicated_definition` -This is a helper function used by the following functions to help resolve a declaration or definition: +1. Initialize parsing context + 1. Push scope + 2. Set inplace flag false + 3. Get token array reference +2. Scan ahead for statement termination + 1. Track brace nesting level + 2. Find first semicolon at level 0 +3. Handle declaration variants + 1. Forward declaration case + 1. Check if only 2 tokens before semicolon + 2. Parse via `parse_forward_or_definition` + 2. Function with trailing specifiers + 1. Identify trailing specifiers + 2. Check for function pattern + 3. Parse as `operator/function/variable` + 4. Return `InvalidCode` if pattern invalid + 3. Identifier-based declarations + 1. Check identifier patterns + 1. Inplace definition `{...} id;` + 2. Namespace type variable `which id id;` + 3. Enum with class qualifier + 4. `Pointer/reference` types + 2. Parse as `operator/function/variable` if valid + 3. Return `InvalidCode` if pattern invalid + 4. Basic type declarations + 1. Validate enum class pattern + 2. Parse via `parser_parse_enum` + 3. Return `InvalidCode` if invalid + 5. Direct definitions + 1. Handle closing brace - `parse_forward_or_definition` + 2. Handle array definitions - `parse_operator_function_or_variable` + 3. 
Return InvalidCode for unknown patterns -* `parse_class_struct_body` -* `parse_global_nspace` -* `parse_union` +## `parse_assignment_expression` -A portion of the code in `parse_typedef` is very similar to this as both have to resolve a similar issue. - -1. Look ahead to the termination token (End statement) -2. Check to see if it fits the pattern for a forward declare -3. If the previous token was an identifier ( `token[-1]` ): - 1. Look back one more token : `[-2]` - 2. If the token has a closing brace its an inplace definition - 3. If the `token[-2]` is an identifier & `token[-3]` is the declaration type, its a variable using a namespaced type. - 4. If the `token[-2]` is an indirection, then its a variable using a namespaced/forwarded type. - 5. If the `token[-2]` is an assign classifier, and the starting tokens were the which type with possible `class` token after, its an enum forward declaration. - 6. If any of the above is the case, `parse_operator_function_or_variable` -4. If the `token[2]` is a vendor fundamental type (builtin) then it is an enum forward declaration. -5. If the previous token was a closing curly brace, its a definition : `parse_forward_or_definition` -6. If the previous token was a closing square brace, its an array definition : `parse_operator_function_or_variable` - -## `parse_define` - -1. Define directive -2. Get identifier -3. Get Content (Optional) +1. Initialize expression parsing + 1. Null expression pointer + 2. Consume assignment operator token + 3. Capture initial expression token +2. Validate expression presence + 1. Check for immediate termination + 2. Return `InvalidCode` if missing expression +3. Parse balanced expression + 1. Track nesting level for + 1. Curly braces + 2. Parentheses + 2. Continue until + 1. End of input, or + 2. Statement terminator, or + 3. Unnested comma + 3. Consume tokens sequentially +4. Generate expression code + 1. Calculate expression span length + 2. Convert to untyped string + 3. 
Return expression node ## `parse_forward_or_definition` -* Parse any of the following for either a forward declaration or definition: - 1. Decl_Class : `parse_class` - 2. Decl_Enum : `parse_enum` - 3. Decl_Struct : `parse_struct` - 4. Decl_Union : `parse_union` +1. Declaration type routing + 1. Class (`Tok_Decl_Class`) -> `parser_parse_class` + 2. Enum (`Tok_Decl_Enum`) -> `parser_parse_enum` + 3. Struct (`Tok_Decl_Struct`) -> `parser_parse_struct` + 4. Union (`Tok_Decl_Union`) -> `parser_parse_union` +2. Error handling + 1. Return `InvalidCode` for unsupported token types + 2. Log failure with parser context + +`is_inplace` flag propagates to specialized codepaths to maintain parsing context. ## `parse_function_after_name` @@ -239,80 +404,191 @@ after its been made ceratin that the type of declaration or definition is indeed By the point this function is called the following are known : export module flag, attributes, specifiers, return type, & name -1. `parse_parameters` -2. parse postfix specifiers (we do not check if the specifier here is correct or not to be here... yet) -3. If there is a body : `parse_body` -4. Otherwise : - 1. Statment end - 2. Check for inline comment +1. Parameter parsing + 1. Push scope + 2. Parse parameter list with parentheses +2. Post-parameter specifier processing + 1. Collect trailing specifiers + 2. Initialize or append to existing specifiers +3. Parse function termination + 1. Function body case + 1. Parse body if open brace found + 2. Validate body type (`CT_Function_Body` or `CT_Untyped`) + 2. Pure virtual case + 1. Handle "`= 0`" syntax + 2. Append pure specifier + 3. Forward declaration case + 1. Consume statement terminator + 4. Handle inline comments for all cases +4. Construct function node + 1. Strip whitespace from name + 2. Initialize `CodeFn` with base properties + 1. Name (cached, stripped) + 2. Module flags + 3. Set node type + 1. `CT_Function` if body present + 2. `CT_Function_Fwd` if declaration only + 4. 
Attach components + 1. Attributes if present + 2. Specifiers if present + 3. Return type + 4. Parameters if present + 5. Inline comment if present +5. Cleanup and return + 1. Pop scope + 2. Return completed function node ## `parse_function_body` Currently there is no actual parsing of the function body. Any content with the braces is shoved into an execution AST node. In the future statements and expressions will be parsed. -1. Open curly brace -2. Grab all tokens between the brace and the closing brace, shove them in a execution AST node. -3. Closing curly brace +1. Initialize body parsing + 1. Push scope + 2. Consume opening brace + 3. Create CodeBody with CT_Function_Body type +2. Capture function content + 1. Record start token position + 2. Track brace nesting level + 3. Consume tokens while + 1. Input remains AND + 2. Not at unmatched closing brace + 4. Update level counters + 1. Increment on open brace + 2. Decrement on closing brace when level > 0 +3. Process captured content + 1. Calculate content length via pointer arithmetic + 2. Create execution block if content exists + 1. Construct string span from start position and length + 2. Wrap in execution node + 3. Append to body +4. Finalize + 1. Consume closing brace + 2. Pop scope + 3. Return cast body node ## `parse_global_nspace` -1. Make sure this is being called for a valid type (namespace, global body, export body, linkage body) -2. If its not a global body, consume the opening curly brace -3. Parse the body (Possible options): - 1. Ignore dangling end statements - 2. NewLine : ast constant - 3. Comment : `parse_comment` - 4. Decl_Cass : `parse_complicated_definition` - 5. Decl_Enum : `parse_complicated_definition` - 6. Decl_Extern_Linkage : `parse_extern_link` - 7. Decl_Namespace : `parse_namespace` - 8. Decl_Struct : `parse_complicated_definition` - 9. Decl_Template : `parse_template` - 10. Decl_Typedef : `parse_typedef` - 11. Decl_Union : `parse_complicated_definition` - 12. 
Decl_Using : `parse_using` - 13. Preprocess_Define : `parse_define` - 14. Preprocess_Include : `parse_include` - 15. Preprocess_If, IfDef, IfNotDef, Elif : `parse_preprocess_cond` - 16. Preprocess_Else : ast constant - 17. Preprocess_Endif : ast constant - 18. Preprocess_Macro : `parse_simple_preprocess` - 19. Preprocess_Pragma : `parse_pragma` - 20. Preprocess_Unsupported : `parse_simple_preprocess` - 21. StaticAssert : `parse_static_assert` - 22. Module_Export : `parse_export_body` - 23. Module_Import : NOT_IMPLEMENTED - 24. The following compound into a resolved definition or declaration: - 1. Attributes ( Standard, GNU, MSVC, Macro ) : `parse_attributes` - 2. Specifiers ( consteval, constexpr, constinit, extern, forceinline, global, inline, internal_linkage, neverinline, static ) - 3. Is either ( identifier, const specifier, long, short, signed, unsigned, bool, char, double, int) - 1. Attempt to parse as construtor or destructor : `parse_global_nspace_constructor_destructor` - 2. If its an operator cast (definition outside class) : `parse_operator_cast` - 3. Its an operator, function, or varaible : `parse_operator_function_or_varaible` -4. If its not a global body, consume the closing curly brace +1. State initialization + 1. Push parser scope + 2. Validate namespace type (Global, Namespace, Export, Extern Linkage) + 3. Consume opening brace for non-global scopes + 4. Initialize `CodeBody` with specified body type: `which` +2. Member parsing loop (while not at closing brace) + 1. Reset parse state + * Member code + * Attributes + * Specifiers + * Function expectation flag + 2. Member type handling + 1. Declarations + * `Class/Struct/Union/Enum` via `parse_complicated_definition` + * `Template/Typedef/Using` via dedicated parsers + * `Namespace/Export/Extern` declarations + 2. 
Preprocessor directives + * Include/Define + * Conditionals `(if / ifdef / ifndef / elif / else / endif)` + * Pragmas + * Preprocessor statement macros + * Report naked preprocessor expression macros detected as an error. + 3. Comments/Formatting + * Newlines + * Comments + 4. Static assertions + 3. Attributes and specifiers + 1. Parse attributes if present + 2. Collect valid specifiers (max 16) + 3. Handle `consteval` for function expectation + 4. Identifier resolution + 1. Check `constructor/destructor` implementation + 2. Look ahead for user defined operator implementation outside of class + 3. Default to `operator/function/variable` parse +3. Member validation/storage + 1. Validate parsed member + 2. Append to body if valid + 3. Return `InvalidCode` on parse failure +4. Scope finalization + 1. Consume closing brace for non-global scopes + 2. Pop parser scope + 3. Return completed body ## `parse_global_nspace_constructor_destructor` -1. Look ahead for the start of the arguments for a possible constructor/destructor -2. Go back past the identifier -3. Check to see if its a destructor by checking for the `~` -4. Continue the next token should be a `::` -5. Determine if the next valid identifier (ignoring possible template parameters) is the same as the first identifier of the function. -6. If it is we have either a constructor or destructor so parse using their respective functions (`parse_constructor`, `parse_destructor`). +1. Forward Token Analysis + 1. Scan for parameter list opening parenthesis + 2. Template Expression Handling + * Track template nesting depth + * Account for nested parentheses within templates + * Skip until template closure or parameter start + +```cpp + // Valid patterns: + ClassName :: ClassName(...) + ClassName :: ~ClassName(...) + ClassName< T ... > :: ClassName(...) +``` + +2. Constructor/Destructor Identification + 1. 
Token Validation Sequence + * Verify identifier preceding parameters + * Check for destructor indicator (`~`) + * Validate scope resolution operator (`::`) + 2. Left-side Token Analysis + * Process nested template expressions + * Maintain template/capture level tracking + * Locate matching identifier token +3. Parser Resolution + 1. Name Pattern Validation + * Compare identifier tokens for exact match + 2. Specialized Parsing + * Route to `parser_parse_destructor` for '~' prefix + * Route to `parser_parse_constructor` for direct match + 3. Apply specifiers to resulting node +4. Return result (`NullCode` on pattern mismatch) + +### Implementation Constraints + +* Cannot definitively distinguish nested namespaces with identical names +* Return type detection requires parser enhancement +* Template parameter validation is syntax-based only +* Future enhancement: Implement type parsing with rollback capability ## `parse_identifier` This is going to get heavily changed down the line to have a more broken down "identifier expression" so that the qualifier, template args, etc, can be distinguished between the targeted identifier. The function can parse all of them, however the AST node compresses them all into a string. -1. Consume first identifier -2. `parse_template_args` -3. While there is a static symbol accessor ( `::` ) - 1. Consume `::` - 2. Consume member identifier - 3. `parse_template args` (for member identifier) - 4. If a `~` is encounted and the scope is for a destructor's identifier, do not consume it and return with what parsed. +1. Initialize identifier context + 1. Push parser scope + 2. Capture initial token as name + 3. Set scope name from token text +2. Process initial identifier component + 1. Consume identifier token + 2. Parse template arguments if present +3. Handle qualified identifiers (loop while `::` found) + 1. Consume static access operator + 2. Validate token sequence: + 1. 
Handle destructor operator (`~`) + * Validate destructor parsing context + * Update name span if valid + * Return invalid on context mismatch + 2. Process member function pointer (`*`) + * Set possible_member_function flag if context allows + * Return invalid if pointer unexpected + 3. Verify identifier token follows + 3. Update identifier span + 1. Extend name length to include new qualifier + 2. Consume identifier token + 3. Parse additional template arguments +4. Return completed identifier token + +Technical notes: + +* Current implementation treats identifier as single token span +* TODO: Refactor to AST-based identifier representation for: + * Distinct qualifier/symbol tracking + * Improved semantic analysis capabilities + * Better support for nested symbol resolution ## `parse_include` @@ -323,16 +599,45 @@ The function can parse all of them, however the AST node compresses them all int This is needed as a operator defintion is not easily resolvable early on, as such this function handles resolving a operator after its been made ceratin that the type of declaration or definition is indeed for a operator signature. -By the point this function is called the following are known : export module flag, attributes, specifiers, return type +By the point this function is called the following are known : export module flag, attributes, specifiers, and return type -1. If there is any qualifiers for the operator, parse them -2. Consume operator keyword -3. Determine the operator type (This will be offloaded to the lexer moreso than how it is now) & consume -4. `parse_params` -5. If there is no parameters this is operator is a member of pointer if its symbols is a *. -6. Parse postfix specifiers -7. If there is a opening curly brace, `parse function_body` -8. Otherwise: consume end statement, check for inline comment. +1. Initialize operator context + 1. Push scope + 2. Parse qualified namespace identifier + 3. Consume `operator` keyword +2. Operator identification + 1. 
Validate operator token presence + 2. Set scope name from operator token + 3. Map operator token to internal operator enum: + * Arithmetic: `+, -, *, /, %` + * Assignment: `+=, -=, *=, /=, %=, =` + * Bitwise: `&, |, ^, ~, >>` + * Logical: `&&, ||, !, ==` + * Comparison: `<, >, <=, >=` + * Member access: `->, ->*` + * Special: `(), [], new, delete` + 4. Handle array variants for new/delete +3. Parameter and specifier processing + 1. Parse parameter list + 2. Handle multiply/member-pointer ambiguity + 3. Collect trailing specifiers + 4. Merge with existing specifiers +4. Function body handling + 1. Parse implementation if present + 2. Otherwise consume statement terminator + 3. Capture inline comments +5. Result construction + 1. Create operator node with: + * Operator type + * Namespace qualification + * Parameters + * Return type + * Implementation body + * Specifiers + * Attributes + * Module flags + 2. Attach inline comments +6. Pop scope ## `parse_operator_function_or_variable` diff --git a/docs/Parsing.md b/docs/Parsing.md index c29fc17..535c3b9 100644 --- a/docs/Parsing.md +++ b/docs/Parsing.md @@ -6,9 +6,9 @@ # Parsing -The library features a naive single-pass parser tailored for only what the library needs to construct the supported syntax of C++ into its AST for *"front-end"* meta-programming purposes. +The library features a naive single-pass parser, tailored for only what the library needs; for construction of C++ code into gencpp's AST for *"front-end"* meta-programming purposes. -This parser does not, and should not do the compiler's job. By only supporting this minimal set of features, the parser is kept (so far) around ~7000 loc. I hope to keep it under 10k loc worst case. +This parser does not, and should not do the compiler's job. By only supporting this minimal set of features, the parser is kept (so far) around ~7000 loc. I hope to keep it under 10-15k loc worst case. You can think of this parser as *frontend parser* vs a *semantic parser*. 
Its intuitively similar to WYSIWYG. What you ***precerive*** as the syntax from the user-side before the compiler gets a hold of it, is what you get. @@ -17,6 +17,7 @@ User exposed interface: ```cpp CodeClass parse_class ( Str class_def ); CodeConstructor parse_constructor ( Str constructor_def ); +CodeDefine parse_define ( Str define_def ); CodeDestructor parse_destructor ( Str destructor_def ); CodeEnum parse_enum ( Str enum_def ); CodeBody parse_export_body ( Str export_def ); @@ -53,7 +54,7 @@ The keywords supported for the preprocessor are: * endif * pragma -Each directive `#` line is considered one preproecessor unit, and will be treated as one Preprocessor AST. +Each directive `#` line is considered one preprocessor unit, and will be treated as one Preprocessor AST node. If a directive is used with an unsupported keyword its will be processed as an untyped AST. The preprocessor lines are stored as members of their associated scope they are parsed within. ( Global, Namespace, Class/Struct ) @@ -62,29 +63,89 @@ The preprocessor lines are stored as members of their associated scope they are Any preprocessor definition abuse that changes the syntax of the core language is unsupported and will fail to parse if not kept within an execution scope (function body, or expression assignment). Exceptions: -* function signatures are allowed for a preprocessed macro: `neverinline MACRO() { ... }` +* variable definitions are allowed for a preprocessed macro: `extern MACRO();` +* function definitions are allowed for a preprocessed macro: `neverinline MACRO() { ... }` * Disable with: `#define GEN_PARSER_DISABLE_MACRO_FUNCTION_SIGNATURES` * typedefs allow for a preprocessed macro: `typedef MACRO();` * Disable with: `#define GEN_PARSER_DISABLE_MACRO_TYPEDEF` * Macros can behave as typenames -* There is some macro support in paramters for functions or templates *(Specifically added to support parsing Unreal Engine source)*. 
+* There is some macro support in parameters for functions or templates *(Specifically added to support parsing Unreal Engine source)*. *(Exceptions are added on an on-demand basis)* *(See functions `parse_operator_function_or_variable` and `parse_typedef` )* Adding your own exceptions is possible by simply modifying the parser to allow for the syntax you need. -*Note: You could interpret this strictness as a feature. This would allow the user to see if their codebase or a third-party's codebase some some egregious preprocessor abuse.* +*Note: You could interpret this strictness as a feature. This would allow the user to see if their codebase or a third-party's codebase contains some egregious preprocessor abuse.* -If a macro is not defined withint e scope of parsing a set of files, it can be defined beforehand by: +Macros used within a file should be registered by the user before parsing. This can be done two ways: -* Appending the [`PreprocessorDefines`](https://github.com/Ed94/gencpp/blob/a18b5b97aa5cfd20242065cbf53462a623cd18fa/base/components/header_end.hpp#L137) array. - * For functional macros a "(" just needs to be added after the name like: `(` so that it will tokenize its arguments as part of the token during lexing. -* Defining a CodeDefine using `def_define`. The definition will be processed by the interface for user into `PreprocessorDefines`. - * This can be prevented by setting the optional prameter `dont_append_preprocess_defines`. +1. The register macro interface within [interface.hpp](../base/components/interface.hpp). +2. Using `def_define` to create a CodeDefine and making sure to not set `opts.dont_register_to_preprocess_macros` to `true`. -The lexing and parsing takes shortcuts from whats expected in the standard. 
+## Registering macros +While the registration of macros in the meta-program's side for parsing can be considered tedious, it's necessary for the parser to accurately resolve the macros' intent in one pass and it provides, in a sense, hygiene in verifying that they are used as intended. + +The following can be used to register a macro: + +```c +GEN_API void register_macro( Macro macro ); +GEN_API void register_macros( s32 num, ... ); +GEN_API void register_macros_arr( s32 num, Macro* macros ); +``` + +The Macro typename is defined with the following in [parser_types.hpp](../base/components/parser_types.hpp): + +```c +struct Macro +{ + StrCached Name; + MacroType Type; + MacroFlags Flags; +}; +``` + +The macro can be designated one of the following types: + +* `MT_Expression`: Intended to resolve to an expression expansion. +* `MT_Statement`: Intended to resolve to a statement expansion. +* `MT_Typename`: Intended to resolve to a typename. + +Additionally the following flags may be set: + +* `MF_Functional`: The macro is intended to be passed arguments, or at least have the calling `()` as part of its usage. +* `MF_Expects_Body`: The parser should expect a braced-body `{ ... }` after the macro signature ` ` +* `MF_Allow_As_Identifier`: Will allow the macro to be an acceptable token/s when a `Tok_Identifier` is expected. +* `MF_Allow_As_Attribute`: Will allow the macro to be an acceptable token/s when an attribute token/s is expected. +* `MF_Allow_As_Definition`: Will allow the macro to be an acceptable token/s when the parser expects a declaration or definition to resolve after attributes or specifiers have been identified beforehand. + * This flag requires that the macro is of type `MT_Statement` to make any sense of usage. 
+ +If a macro is not registered, the following warning will be issued if `GEN_BUILD_DEBUG=1` during lexing within [lexer.cpp](../base/components/lexer.cpp) - `lex_preprocessor_define`: + +```c +log_fmt("Warning: '%S' was not registered before the lexer processed its #define directive, it will be registered as a expression macro\n" + , name.Text +); +``` + +Further within the same scope, the lexer will issue a warning if it detects a macro was not flagged as functional but has an open parenthesis `(` token right after its name with no whitespace: + +```c +log_fmt("Warning: %S registered macro is not flagged as functional yet the definition detects opening parenthesis '(' for arguments\n" + , name.Text +); +``` + +Macros are tracked using a `MacroTable Macros;` defined as a member of the library's `Context`. + +```c +typedef HashTable(Macro) MacroTable; +``` + +## Notes + +* Empty lines used throughout the file are preserved for formatting purposes during ast serialization (they have a dedicated Token: `Tok_NewLine`). * Numeric literals are not checked for validity. * The parse API treats any execution scope definitions with no validation and are turned into untyped Code ASTs. (There is a [todo](https://github.com/Ed94/gencpp/issues/49) to add support) * *This includes the assignment of variables.* @@ -95,4 +156,4 @@ The lexing and parsing takes shortcuts from whats expected in the standard. * Parsing attributes can be extended to support user defined macros by defining `GEN_DEFINE_ATTRIBUTE_TOKENS` (see `gen.hpp` for the formatting) * This is useful for example: parsing Unreal `Module_API` macros. -Empty lines used throughout the file are preserved for formatting purposes during ast serialization. +**The lexer & parser do not gracefully attempt to continue when they come across incorrect code, and don't properly track errors into a listing (yet).**