From c81f4b34ee93a49b5b77e8a1327cdbdc602bb711 Mon Sep 17 00:00:00 2001 From: Ed_ Date: Wed, 23 Aug 2023 02:17:47 -0400 Subject: [PATCH] Cleanup and doc updates --- docs/ASTs.md | 684 +++++++++++++++++++++++ docs/Parsing.md | 16 +- project/components/ast.cpp | 59 +- project/components/ast_types.hpp | 13 +- project/components/interface.parsing.cpp | 32 +- 5 files changed, 733 insertions(+), 71 deletions(-) create mode 100644 docs/ASTs.md diff --git a/docs/ASTs.md b/docs/ASTs.md new file mode 100644 index 0000000..f44f258 --- /dev/null +++ b/docs/ASTs.md @@ -0,0 +1,684 @@ +# ASTs Documentation + +While the Readme for docs covers the data layout per AST, this will focus on the AST types available, and their nuances. + +## Body + +These are containers representing a scope body of a definition that can be of the following `ECode` type: + +* Class_Body +* Enum_Body +* Export_Body +* Extern_Linkage_Body +* Function_Body +* Global_Body +* Namespace_Body +* Struct_Body +* Union_Body + +Fields: + +```cpp +Code Front; +Code Back; +Code Parent; +StringCached Name; +CodeT Type; +s32 NumEntries; +``` + +The `Front` member represents the start of the linked list and `Back` the end. +NumEntries is the number of entries in the body. + +Parent should have a compatible ECode type for the type of definition used. + +Serialization: + +Will output only the entries, the braces are handled by the parent. + +```cpp +... + +``` + +## Attributes + +Represent standard or vendor specific C/C++ attributes. + +Fields: + +```cpp +StringCached Content; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +``` + +Serialization: + +```cpp + +``` + +While the parser supports the `__declspec` and `__attribute__` syntax, the upfront constructor ( def_attributes ) must have the user specify the entire attribute, including the `[[]]`, `__declspec` or `__attribute__` parts. + +## Comment + +Stores a comment. 
+ +Fields: + +```cpp +StringCached Content; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +``` + +Serialization: + +```cpp + +``` + +The parser will preserve comments found if residing within a body or in accepted inline-to-definition locations. +Otherwise they will be skipped by the TokArray::__eat and TokArray::current( skip formatting enabled ) functions. + +The upfront constructor: `def_comment` expects to receive a comment without the `//` or `/* */` parts. It will add them during construction. + +## Class & Struct + +Fields: + +```cpp +CodeComment InlineCmt; // Only supported by forward declarations +CodeAttributes Attributes; +CodeType ParentType; +CodeBody Body; +CodeType Last; // Used to store references to interfaces +CodeType Next; // Used to store references to interfaces +Code Parent; +StringCached Name; +CodeT Type; +ModuleFlag ModuleFlags; +AccessSpec ParentAccess; +``` + +Serialization: + +```cpp +// Class_Fwd + ; + +// Class + : , public , ... +{ + +}; +``` + +You'll notice that only one parent type is supported only with parent access. This library only supports single inheritance, the rest must be done through interfaces. 
+ +## Constructor + +Fields: + +```cpp +CodeComment InlineCmt; // Only supported by forward declarations +Code InitializerList; +CodeParam Params; +Code Body; +Code Prev; +Code Next; +Code Parent; +CodeT Type; +``` + +Serialization: + +```cpp +// Constructor_Fwd + Name>( ); + +// Constructor + Name>( ): +{ + +} +``` + +## Define + +Represents a preprocessor define. + +Fields: + +```cpp +StringCached Content; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +``` + +Serialization: + +```cpp +#define +``` + +## Destructor + +Fields: + +```cpp +CodeComment InlineCmt; +CodeSpecifiers Specs; +Code Body; +Code Prev; +Code Next; +Code Parent; +CodeT Type; +``` + +Serialization: + +```cpp +// Destructor_Fwd + ~Name>( ) ; + +// Destructor + ~Name>( ) +{ + +} +``` + +## Enum + +Fields: + +```cpp +CodeComment InlineCmt; +CodeAttributes Attributes; +CodeType UnderlyingType; +CodeBody Body; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +ModuleFlag ModuleFlags; +``` + +Serialization: + +```cpp +// Enum_Fwd + enum class : ; + +// Enum + : +{ + +}; +``` + +## Execution + +Just represents an execution body. Equivalent to an untyped body. +Will be obsolete when function body parsing is implemented. + +Fields: + +```cpp +StringCached Content; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +``` + +Serialization: + +```cpp + +``` + +## External Linkage + +Fields: + +```cpp +CodeBody Body; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +``` + +Serialization: + +```cpp +extern "" +{ + +} +``` + +## Include + +Fields: + +```cpp +StringCached Content; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +``` + +Serialization: + +```cpp +#include +``` + +## Friend + +This library (until it becomes necessary because some third-party library requires otherwise) does not support friend declarations with in-statement function definitions. 
+ +Fields: + +```cpp +CodeComment InlineCmt; +Code Declaration; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +``` + +Serialization: + +```cpp +friend ; +``` + +## Function + +Fields: + +```cpp +CodeComment InlineCmt; +CodeAttributes Attributes; +CodeSpecifiers Specs; +CodeType ReturnType; +CodeParam Params; +CodeBody Body; +Code Prev; +Code Parent; +Code Next; +StringCached Name; +CodeT Type; +ModuleFlag ModuleFlags; +``` + +Serialization: + +```cpp +// Function_Fwd + ( ) ; + +// Function + ( ) +{ + +} +``` + +## Module + +Fields: + +```cpp +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +ModuleFlag ModuleFlags; +``` + +Serialization: + +```cpp + module ; +``` + +## Namespace + +Fields: + +```cpp +CodeBody Body; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +ModuleFlag ModuleFlags; +``` + +Serialization: + +```cpp + namespace +{ + +} +``` + +## Operator Overload + +Fields: + +```cpp +CodeComment InlineCmt; +CodeAttributes Attributes; +CodeSpecifiers Specs; +CodeType ReturnType; +CodeParam Params; +CodeBody Body; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +ModuleFlag ModuleFlags; +OperatorT Op; +``` + +Serialization: + +```cpp +// Operator_Fwd + operator ( ) ; + +// Operator + operator ( ) +{ + +} +``` + +## Operator Cast Overload ( User-Defined Type Conversion ) + +Fields: + +```cpp +CodeComment InlineCmt; +CodeSpecifiers Specs; +CodeType ValueType; +CodeBody Body; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +``` + +Serialization: + +```cpp +// Operator_Cast_Fwd + operator () ; + +// Operator_Cast + operator () +{ + +} +``` + +## Parameters + +Fields: + +```cpp +CodeType ValueType; +Code Value; +CodeParam Last; +CodeParam Next; +Code Parent; +StringCached Name; +CodeT Type; +s32 NumEntries; +``` + +Serialization: + +```cpp + , ... 
+``` + +## Pragma + +Fields: + +```cpp +StringCached Content; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +``` + +Serialization: + +```cpp +#pragma +``` + +## Preprocessor Conditional + +Fields: + +```cpp +StringCached Content; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +``` + +Serialization: + +```cpp +# +``` + +## Specifiers + +Fields: + +```cpp +SpecifierT ArrSpecs[ AST::ArrSpecs_Cap ]; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +s32 NumEntries; +``` + +Serialization: + +```cpp +, ... +``` + +## Template + +Fields: + +```cpp +CodeParam Params; +Code Declaration; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +ModuleFlag ModuleFlags; +``` + +Serialization: + +```cpp + +template< > + +``` + +## Typename + +Typenames represent the type "symbol". + +Fields: + +```cpp +CodeAttributes Attributes; +CodeSpecifiers Specs; +Code ArrExpr; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +``` + +Serialization: + +```cpp + +``` + +## Typedef + +Behave as usual except function or macro typedefs. +Those don't use the underlying type field as everything was serialized under the Name field. 
+ +Fields: + +```cpp +CodeComment InlineCmt; +Code UnderlyingType; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +ModuleFlag ModuleFlags; +b32 IsFunction; +``` + +Serialization: + +```cpp +// Regular + typedef ; + +// Functions + typedef ; +``` + +## Union + +Fields: + +```cpp +CodeAttributes Attributes; +CodeBody Body; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +ModuleFlag ModuleFlags; +``` + +Serialization: + +```cpp + union +{ + +} +``` + +## Using + +Fields: + +```cpp +CodeComment InlineCmt; +CodeAttributes Attributes; +CodeType UnderlyingType; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +ModuleFlag ModuleFlags; +``` + +Serialization: + +```cpp +// Regular + using = ; + +// Namespace + using namespace ; +``` + +## Variable + +Fields: + +```cpp +CodeComment InlineCmt; +CodeAttributes Attributes; +CodeSpecifiers Specs; +CodeType ValueType; +Code BitfieldSize; +Code Value; +Code Prev; +Code Next; +Code Parent; +StringCached Name; +CodeT Type; +ModuleFlag ModuleFlags; +``` + +Serialization: + +```cpp +// Regular + = ; + +// Bitfield + : = ; +``` diff --git a/docs/Parsing.md b/docs/Parsing.md index 9a538bd..12f189b 100644 --- a/docs/Parsing.md +++ b/docs/Parsing.md @@ -1,7 +1,9 @@ # Parsing The library features a naive parser tailored for only what the library needs to construct the supported syntax of C++ into its AST. -This parser does not, and should not do the compiler's job. By only supporting this minimal set of features, the parser is kept (so far) under 5000 loc. +This parser does not, and should not do the compiler's job. By only supporting this minimal set of features, the parser is kept (so far) around 5000 loc. + +You can think of this parser as a frontend parser vs a semantic parser. It's intuitively similar to WYSIWYG. What you perceive as the syntax from the user-side before the compiler gets a hold of it, is what you get. 
The parsing implementation supports the following for the user: @@ -27,10 +29,12 @@ CodeUsing parse_using ( StrC using_def ); CodeVar parse_variable ( StrC var_def ); ``` +To parse file buffers, use the `parse_global_body` function. + ***Parsing will aggregate any tokens within a function body or expression statement to an untyped Code AST.*** Everything is done in one pass for both the preprocessor directives and the rest of the language. -The parser performs no macro expansion as the scope of gencpp feature-set is to only support the preprocessor for the goal of having rudimentary awareness of preprocessor ***conditionals***, ***defines***, and ***includes***, and ***pragmas***. +The parser performs no macro expansion as the scope of gencpp feature-set is to only support the preprocessor for the goal of having rudimentary awareness of preprocessor ***conditionals***, ***defines***, ***includes***, and ***pragmas***. The keywords supported for the preprocessor are: @@ -51,10 +55,17 @@ Any preprocessor definition abuse that changes the syntax of the core language i Exceptions: * function signatures are allowed for a preprocessed macro: `neverinline MACRO() { ... }` + * Disable with: `#define GEN_PARSER_DISABLE_MACRO_FUNCTION_SIGNATURES` * typedefs allow for a preprocessed macro: `typedef MACRO();` + * Disable with: `#define GEN_PARSER_DISABLE_MACRO_TYPEDEF 1` *(See functions `parse_operator_function_or_variable` and `parse_typedef` )* +Adding your own exceptions is possible by simply modifying the parser to allow for the syntax you need. + +*Note: You could interpret this strictness as a feature. This would allow the user to see if their codebase or a third-party's codebase contains some egregious preprocessor abuse.* + + The lexing and parsing takes shortcuts from whats expected in the standard. * Numeric literals are not checked for validity. @@ -69,3 +80,4 @@ The lexing and parsing takes shortcuts from whats expected in the standard. 
* Parsing attributes can be extended to support user defined macros by defining `GEN_DEFINE_ATTRIBUTE_TOKENS` (see `gen.hpp` for the formatting) Empty lines used throughout the file are preserved for formatting purposes during ast serialization. + diff --git a/project/components/ast.cpp b/project/components/ast.cpp index 9836fd3..86055e4 100644 --- a/project/components/ast.cpp +++ b/project/components/ast.cpp @@ -38,49 +38,8 @@ String AST::to_string() case Untyped: case Execution: - result.append( Content ); - break; - case Comment: - { - // TODO : Move this formmating process to def_comment, - // Were going to preserve as much of the original formatting as possible - // so that the parsed comments don't have any artifacts. - // Just doing what untyped and execution do -#if 0 - if ( Prev && Prev->Type != Comment && Prev->Type != NewLine ) - result.append( "\n" ); - - static char line[ MaxCommentLineLength ]; - - char const* end = & scast(String, Content).back(); - char* scanner = Content.Data; - s32 curr = 0; - do - { - char const* next = scanner; - s32 length = 0; - while ( next != end && scanner[ length ] != '\n' ) - { - next = scanner + length; - length++; - } - length++; - - str_copy( line, scanner, length ); - result.append_fmt( "//%.*s", length, line ); - mem_set( line, 0, MaxCommentLineLength ); - - scanner += length; - } - while ( scanner <= end ); - - if ( result.back() != '\n' ) - result.append( "\n" ); -#else result.append( Content ); -#endif - } break; case Access_Private: @@ -148,7 +107,7 @@ String AST::to_string() result.append_fmt( "class %S %S", Attributes->to_string(), Name ); else result.append_fmt( "class %S", Name ); - + // Check if it can have an end-statement if ( Parent == nullptr || ( Parent->Type != ECode::Typedef && Parent->Type != ECode::Variable ) ) { @@ -226,7 +185,7 @@ String AST::to_string() } else result.append_fmt( "~%S();", Parent->Name ); - + if ( InlineCmt ) result.append_fmt( " %S", InlineCmt->Content ); else @@ -364,7 +323,7 @@ 
String AST::to_string() { result.append( ";" ); } - + if ( InlineCmt ) result.append_fmt(" %S", InlineCmt->Content ); else @@ -539,7 +498,7 @@ String AST::to_string() } } } - + if ( InlineCmt ) result.append_fmt( "; %S", InlineCmt->Content ); else @@ -552,7 +511,7 @@ String AST::to_string() if ( Specs ) { // TODO : Add support for specifies before the operator keyword - + if ( Name && Name.length() ) result.append_fmt( "%Soperator %S()", Name, ValueType->to_string() ); else @@ -582,7 +541,7 @@ String AST::to_string() if ( Specs ) { // TODO : Add support for specifies before the operator keyword - + result.append_fmt( "operator %S()", ValueType->to_string() ); for ( SpecifierT spec : Specs->cast() ) @@ -795,7 +754,7 @@ String AST::to_string() { result.append( ";" ); } - + if ( InlineCmt ) result.append_fmt(" %S", InlineCmt->Content); else @@ -872,7 +831,7 @@ String AST::to_string() } else result.append_fmt( "using %S;", Name ); - + if ( InlineCmt ) result.append_fmt(" %S\n", InlineCmt->Content ); else @@ -927,7 +886,7 @@ String AST::to_string() else result.append_fmt( "%S %S;", UnderlyingType->to_string(), Name ); - + if ( InlineCmt ) result.append_fmt(" %S", InlineCmt->Content); else diff --git a/project/components/ast_types.hpp b/project/components/ast_types.hpp index 860feab..15710e2 100644 --- a/project/components/ast_types.hpp +++ b/project/components/ast_types.hpp @@ -57,7 +57,7 @@ struct AST_Class char _PAD_[ sizeof(SpecifierT) * AST::ArrSpecs_Cap ]; struct { - CodeComment InlineCmt; + CodeComment InlineCmt; // Only supported by forward declarations CodeAttributes Attributes; char _PAD_SPECS_ [ sizeof(AST*) ]; CodeType ParentType; @@ -81,11 +81,12 @@ struct AST_Constructor char _PAD_[ sizeof(SpecifierT) * AST::ArrSpecs_Cap ]; struct { - CodeComment InlineCmt; - char _PAD_PROPERTIES_ [ sizeof(AST*) * 3 ]; - Code InitializerList; - CodeParam Params; - Code Body; + CodeComment InlineCmt; // Only supported by forward declarations + char _PAD_PROPERTIES_ [ 
sizeof(AST*) * 1 ]; + CodeSpecifiers Specs; + Code InitializerList; + CodeParam Params; + Code Body; }; }; Code Prev; diff --git a/project/components/interface.parsing.cpp b/project/components/interface.parsing.cpp index 3f2d357..caf14dd 100644 --- a/project/components/interface.parsing.cpp +++ b/project/components/interface.parsing.cpp @@ -1184,7 +1184,7 @@ CodeComment parse_comment() using namespace Parser; StackNode scope { nullptr, currtok_noskip, NullToken, txt( __func__ ) }; Context.push( & scope ); - + CodeComment result = (CodeComment) make_code(); result->Type = ECode::Comment; @@ -1812,7 +1812,7 @@ CodeFn parse_function_after_name( { Token stmt_end = currtok; eat( TokType::Statement_End ); - + if ( currtok_noskip.Type && TokType::Comment && currtok_noskip.Line == stmt_end.Line ) inline_cmt = parse_comment(); } @@ -1855,7 +1855,7 @@ CodeFn parse_function_after_name( if ( params ) result->Params = params; - + if ( inline_cmt ) result->InlineCmt = inline_cmt; @@ -2142,7 +2142,7 @@ CodeOperator parse_operator_after_ret_type( { Token stmt_end = currtok; eat( TokType::Statement_End ); - + if ( currtok_noskip.Type == TokType::Comment && currtok_noskip.Line == stmt_end.Line ) inline_cmt = parse_comment(); } @@ -2152,7 +2152,7 @@ CodeOperator parse_operator_after_ret_type( if ( inline_cmt ) result->InlineCmt = inline_cmt; - + Context.pop(); return result; } @@ -2319,7 +2319,7 @@ Code parse_simple_preprocess( Parser::TokType which ) { Token stmt_end = currtok; eat( TokType::Statement_End ); - + if ( currtok_noskip.Type == TokType::Comment && currtok_noskip.Line == stmt_end.Line ) eat( TokType::Comment ); } @@ -2335,7 +2335,7 @@ Code parse_simple_preprocess( Parser::TokType which ) { Token stmt_end = currtok; eat( TokType::Statement_End ); - + if ( currtok_noskip.Type == TokType::Comment && currtok_noskip.Line == stmt_end.Line ) eat( TokType::Comment ); } @@ -2361,6 +2361,7 @@ Code parse_operator_function_or_variable( bool expects_function, CodeAttributes Code 
result = CodeInvalid; +#ifndef GEN_PARSER_DISABLE_MACRO_FUNCTION_SIGNATURES if ( currtok.Type == TokType::Preprocess_Macro ) { // Were dealing with a macro after attributes/specifiers. @@ -2368,6 +2369,7 @@ Code parse_operator_function_or_variable( bool expects_function, CodeAttributes Context.pop(); return result; } +#endif CodeType type = parse_type(); @@ -3594,12 +3596,12 @@ CodeEnum parse_enum( bool inplace_def ) } CodeComment inline_cmt = NoCode; - + if ( ! inplace_def ) { Token stmt_end = currtok; eat( TokType::Statement_End ); - + if ( currtok_noskip.Type == TokType::Comment && currtok_noskip.Line == stmt_end.Line ) inline_cmt = parse_comment(); } @@ -3626,7 +3628,7 @@ CodeEnum parse_enum( bool inplace_def ) if ( type ) result->UnderlyingType = type; - + if ( inline_cmt ) result->InlineCmt = inline_cmt; @@ -4518,7 +4520,11 @@ CodeTypedef parse_typedef() constexpr bool from_typedef = true; +#if GEN_PARSER_DISABLE_MACRO_TYPEDEF + if ( false ) +#else if ( check( TokType::Preprocess_Macro )) +#endif { type = t_empty; name = currtok; @@ -4631,10 +4637,10 @@ CodeTypedef parse_typedef() } array_expr = parse_array_decl(); - + Token stmt_end = currtok; eat( TokType::Statement_End ); - + CodeComment inline_cmt = NoCode; if ( currtok_noskip.Type == TokType::Comment && currtok_noskip.Line == stmt_end.Line ) inline_cmt = parse_comment(); @@ -4661,7 +4667,7 @@ CodeTypedef parse_typedef() if ( type->Type == Typename && array_expr && array_expr->Type != Invalid ) type.cast()->ArrExpr = array_expr; - + if ( inline_cmt ) result->InlineCmt = inline_cmt;