Cleanup and doc updates

This commit is contained in:
Edward R. Gonzalez 2023-08-23 02:17:47 -04:00
parent c97762ac16
commit c81f4b34ee
5 changed files with 733 additions and 71 deletions

docs/ Normal file
View File

@ -0,0 +1,684 @@
# ASTs Documentation
While the Readme for docs covers the data layout per AST, this will focus on the AST types available, and their nuances.
## Body
These are containers representing a scope body of a definition that can be of the following `ECode` type:
* Class_Body
* Enum_Body
* Export_Body
* Extern_Linkage_Body
* Function_Body
* Global_Body
* Namespace_Body
* Struct_Body
* Union_Body
Code Front;
Code Back;
Code Parent;
StringCached Name;
CodeT Type;
s32 NumEntries;
The `Front` member represents the start of the linked list and `Back` the end.
NumEntries is the number of entries in the body.
Parent should have a compatible ECode type for the type of definition used.
Will output only the entries, the braces are handled by the parent.
## Attributes
Represent standard or vendor specific C/C++ attributes.
StringCached Content;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
While the parser supports the `__declspec` and `__attribute__` syntax, the upfront constructor ( def_attributes ) must have the user specify the entire attribute, including the `[[]]`, `__declspec` or `__attribute__` parts.
## Comment
Stores a comment.
StringCached Content;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
The parser will preserve comments found if residing within a body or in accepted inline-to-definition locations.
Otherwise they will be skipped by the TokArray::__eat and TokArray::current( skip formatting enabled ) functions.
The upfront constructor: `def_comment` expects to receive a comment without the `//` or `/* */` parts. It will add them during construction.
## Class & Struct
CodeComment InlineCmt; // Only supported by forward declarations
CodeAttributes Attributes;
CodeType ParentType;
CodeBody Body;
CodeType Last; // Used to store references to interfaces
CodeType Next; // Used to store references to interfaces
Code Parent;
StringCached Name;
CodeT Type;
ModuleFlag ModuleFlags;
AccessSpec ParentAccess;
// Class_Fwd
<ModuleFlags> <class/struct> <Name>; <InlineCmt>
// Class
<ModuleFlags> <class/struct> <Attributes> <Name> : <ParentAccess> <ParentType>, public <Next>, ...<Last>
You'll notice that only one parent type is supported, and only that parent type has a configurable access specifier (`ParentAccess`). This library only supports single inheritance; the rest must be done through interfaces.
## Constructor
CodeComment InlineCmt; // Only supported by forward declarations
Code InitializerList;
CodeParam Params;
Code Body;
Code Prev;
Code Next;
Code Parent;
CodeT Type;
// Constructor_Fwd
<Specs> <Parent->Name>( <Params> ); <InlineCmt>
// Constructor
<Specs> <Parent->Name>( <Params> ): <InitializerList>
## Define
Represents a preprocessor define
StringCached Content;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
#define <Name> <Content>
## Destructor
CodeComment InlineCmt;
CodeSpecifiers Specs;
Code Body;
Code Prev;
Code Next;
Code Parent;
CodeT Type;
// Destructor_Fwd
<Specs> ~<Parent->Name>() <Specs>; <InlineCmt>
// Destructor
<Specs> ~<Parent->Name>() <Specs>
## Enum
CodeComment InlineCmt;
CodeAttributes Attributes;
CodeType UnderlyingType;
CodeBody Body;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
ModuleFlag ModuleFlags;
// Enum_Fwd
<ModuleFlags> enum class <Name> : <UnderlyingType>; <InlineCmt>
// Enum
<ModuleFlags> <enum or enum class> <Name> : <UnderlyingType>
## Execution
Just represents an execution body. Equivalent to an untyped body.
Will be obsolete when function body parsing is implemented.
StringCached Content;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
## External Linkage
CodeBody Body;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
extern "<Name>"
## Include
StringCached Content;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
#include <Content>
## Friend
This library (until it becomes necessary because some third-party library requires otherwise) does not support friend declarations with in-statement function definitions.
CodeComment InlineCmt;
Code Declaration;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
friend <Declaration>; <InlineCmt>
## Function
CodeComment InlineCmt;
CodeAttributes Attributes;
CodeSpecifiers Specs;
CodeType ReturnType;
CodeParam Params;
CodeBody Body;
Code Prev;
Code Parent;
Code Next;
StringCached Name;
CodeT Type;
ModuleFlag ModuleFlags;
// Function_Fwd
<ModuleFlags> <Attributes> <Specs> <ReturnType> <Name>( <Params> ) <Specs>; <InlineCmt>
// Function
<ModuleFlags> <Attributes> <Specs> <ReturnType> <Name>( <Params> ) <Specs>
## Module
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
ModuleFlag ModuleFlags;
<ModuleFlags> module <Name>;
## Namespace
CodeBody Body;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
ModuleFlag ModuleFlags;
<ModuleFlags> namespace <Name>
## Operator Overload
CodeComment InlineCmt;
CodeAttributes Attributes;
CodeSpecifiers Specs;
CodeType ReturnType;
CodeParam Params;
CodeBody Body;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
ModuleFlag ModuleFlags;
OperatorT Op;
// Operator_Fwd
<ModuleFlags> <Attributes> <Specs> <ReturnType> operator <Op>( <Params> ) <Specs>; <InlineCmt>
// Operator
<ModuleFlags> <Attributes> <Specs> <ReturnType> <Name>operator <Op>( <Params> ) <Specs>
## Operator Cast Overload ( User-Defined Type Conversion )
CodeComment InlineCmt;
CodeSpecifiers Specs;
CodeType ValueType;
CodeBody Body;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
// Operator_Cast_Fwd
<Specs> operator <ValueType>() <Specs>; <InlineCmt>
// Operator_Cast
<Specs> <Name>operator <ValueType>() <Specs>
## Parameters
CodeType ValueType;
Code Value;
CodeParam Last;
CodeParam Next;
Code Parent;
StringCached Name;
CodeT Type;
s32 NumEntries;
<ValueType> <Name>, <Next>... <Last>
## Pragma
StringCached Content;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
#pragma <Content>
## Preprocessor Conditional
StringCached Content;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
#<based off Type> <Content>
## Specifiers
SpecifierT ArrSpecs[ AST::ArrSpecs_Cap ];
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
s32 NumEntries;
<Spec>, ...
## Template
CodeParam Params;
Code Declaration;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
ModuleFlag ModuleFlags;
template< <Params> >
## Typename
Typenames represent the type "symbol".
CodeAttributes Attributes;
CodeSpecifiers Specs;
Code ArrExpr;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
<Attributes> <Name> <Specs>
## Typedef
Behaves as usual, except for function or macro typedefs.
Those don't use the underlying type field as everything was serialized under the Name field.
CodeComment InlineCmt;
Code UnderlyingType;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
ModuleFlag ModuleFlags;
b32 IsFunction;
// Regular
<ModuleFlags> typedef <UnderlyingType> <Name>; <InlineCmt>
// Functions
<ModuleFlags> typedef <Name>; <InlineCmt>
## Union
CodeAttributes Attributes;
CodeBody Body;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
ModuleFlag ModuleFlags;
<ModuleFlags> union <Attributes> <Name>
## Using
CodeComment InlineCmt;
CodeAttributes Attributes;
CodeType UnderlyingType;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
ModuleFlag ModuleFlags;
// Regular
<ModuleFlags> using <Attributes> <Name> = <UnderlyingType>; <InlineCmt>
// Namespace
<ModuleFlags> using namespace <Name>; <InlineCmt>
## Variable
CodeComment InlineCmt;
CodeAttributes Attributes;
CodeSpecifiers Specs;
CodeType ValueType;
Code BitfieldSize;
Code Value;
Code Prev;
Code Next;
Code Parent;
StringCached Name;
CodeT Type;
ModuleFlag ModuleFlags;
// Regular
<ModuleFlags> <Attributes> <Specs> <ValueType> <Name> = <Value>; <InlineCmt>
// Bitfield
<ModuleFlags> <Attributes> <Specs> <ValueType> <Name> : <BitfieldSize> = <Value>; <InlineCmt>

View File

@ -1,7 +1,9 @@
# Parsing
The library features a naive parser tailored for only what the library needs to construct the supported syntax of C++ into its AST.
This parser does not, and should not do the compiler's job. By only supporting this minimal set of features, the parser is kept (so far) under 5000 loc.
This parser does not, and should not do the compiler's job. By only supporting this minimal set of features, the parser is kept (so far) around 5000 loc.
You can think of this parser as a frontend parser vs a semantic parser. It's intuitively similar to WYSIWYG. What you perceive as the syntax from the user-side before the compiler gets a hold of it, is what you get.
The parsing implementation supports the following for the user:
@ -27,10 +29,12 @@ CodeUsing parse_using ( StrC using_def );
CodeVar parse_variable ( StrC var_def );
To parse file buffers, use the `parse_global_body` function.
***Parsing will aggregate any tokens within a function body or expression statement to an untyped Code AST.***
Everything is done in one pass for both the preprocessor directives and the rest of the language.
The parser performs no macro expansion as the scope of gencpp feature-set is to only support the preprocessor for the goal of having rudimentary awareness of preprocessor ***conditionals***, ***defines***, and ***includes***, and ***pragmas***.
The parser performs no macro expansion as the scope of gencpp feature-set is to only support the preprocessor for the goal of having rudimentary awareness of preprocessor ***conditionals***, ***defines***, ***includes***, and ***pragmas***.
The keywords supported for the preprocessor are:
@ -51,10 +55,17 @@ Any preprocessor definition abuse that changes the syntax of the core language i
* function signatures are allowed for a preprocessed macro: `neverinline MACRO() { ... }`
* typedefs allow for a preprocessed macro: `typedef MACRO();`
* Disable with: `#define GEN_PARSER_DISABLE_MACRO_TYPEDEF`
*(See functions `parse_operator_function_or_variable` and `parse_typedef` )*
Adding your own exceptions is possible by simply modifying the parser to allow for the syntax you need.
*Note: You could interpret this strictness as a feature. This would allow the user to see if their codebase or a third-party's codebase has some egregious preprocessor abuse.*
The lexing and parsing takes shortcuts from what's expected in the standard.
* Numeric literals are not checked for validity.
@ -69,3 +80,4 @@ The lexing and parsing takes shortcuts from whats expected in the standard.
* Parsing attributes can be extended to support user defined macros by defining `GEN_DEFINE_ATTRIBUTE_TOKENS` (see `gen.hpp` for the formatting)
Empty lines used throughout the file are preserved for formatting purposes during ast serialization.

View File

@ -38,49 +38,8 @@ String AST::to_string()
case Untyped:
case Execution:
result.append( Content );
case Comment:
// TODO : Move this formmating process to def_comment,
// Were going to preserve as much of the original formatting as possible
// so that the parsed comments don't have any artifacts.
// Just doing what untyped and execution do
#if 0
if ( Prev && Prev->Type != Comment && Prev->Type != NewLine )
result.append( "\n" );
static char line[ MaxCommentLineLength ];
char const* end = & scast(String, Content).back();
char* scanner = Content.Data;
s32 curr = 0;
char const* next = scanner;
s32 length = 0;
while ( next != end && scanner[ length ] != '\n' )
next = scanner + length;
str_copy( line, scanner, length );
result.append_fmt( "//%.*s", length, line );
mem_set( line, 0, MaxCommentLineLength );
scanner += length;
while ( scanner <= end );
if ( result.back() != '\n' )
result.append( "\n" );
result.append( Content );
case Access_Private:
@ -148,7 +107,7 @@ String AST::to_string()
result.append_fmt( "class %S %S", Attributes->to_string(), Name );
else result.append_fmt( "class %S", Name );
// Check if it can have an end-statement
if ( Parent == nullptr || ( Parent->Type != ECode::Typedef && Parent->Type != ECode::Variable ) )
@ -226,7 +185,7 @@ String AST::to_string()
result.append_fmt( "~%S();", Parent->Name );
if ( InlineCmt )
result.append_fmt( " %S", InlineCmt->Content );
@ -364,7 +323,7 @@ String AST::to_string()
result.append( ";" );
if ( InlineCmt )
result.append_fmt(" %S", InlineCmt->Content );
@ -539,7 +498,7 @@ String AST::to_string()
if ( InlineCmt )
result.append_fmt( "; %S", InlineCmt->Content );
@ -552,7 +511,7 @@ String AST::to_string()
if ( Specs )
// TODO : Add support for specifiers before the operator keyword
if ( Name && Name.length() )
result.append_fmt( "%Soperator %S()", Name, ValueType->to_string() );
@ -582,7 +541,7 @@ String AST::to_string()
if ( Specs )
// TODO : Add support for specifiers before the operator keyword
result.append_fmt( "operator %S()", ValueType->to_string() );
for ( SpecifierT spec : Specs->cast<CodeSpecifiers>() )
@ -795,7 +754,7 @@ String AST::to_string()
result.append( ";" );
if ( InlineCmt )
result.append_fmt(" %S", InlineCmt->Content);
@ -872,7 +831,7 @@ String AST::to_string()
result.append_fmt( "using %S;", Name );
if ( InlineCmt )
result.append_fmt(" %S\n", InlineCmt->Content );
@ -927,7 +886,7 @@ String AST::to_string()
result.append_fmt( "%S %S;", UnderlyingType->to_string(), Name );
if ( InlineCmt )
result.append_fmt(" %S", InlineCmt->Content);

View File

@ -57,7 +57,7 @@ struct AST_Class
char _PAD_[ sizeof(SpecifierT) * AST::ArrSpecs_Cap ];
CodeComment InlineCmt;
CodeComment InlineCmt; // Only supported by forward declarations
CodeAttributes Attributes;
char _PAD_SPECS_ [ sizeof(AST*) ];
CodeType ParentType;
@ -81,11 +81,12 @@ struct AST_Constructor
char _PAD_[ sizeof(SpecifierT) * AST::ArrSpecs_Cap ];
CodeComment InlineCmt;
char _PAD_PROPERTIES_ [ sizeof(AST*) * 3 ];
Code InitializerList;
CodeParam Params;
Code Body;
CodeComment InlineCmt; // Only supported by forward declarations
char _PAD_PROPERTIES_ [ sizeof(AST*) * 1 ];
CodeSpecifiers Specs;
Code InitializerList;
CodeParam Params;
Code Body;
Code Prev;

View File

@ -1184,7 +1184,7 @@ CodeComment parse_comment()
using namespace Parser;
StackNode scope { nullptr, currtok_noskip, NullToken, txt( __func__ ) };
Context.push( & scope );
result = (CodeComment) make_code();
result->Type = ECode::Comment;
@ -1812,7 +1812,7 @@ CodeFn parse_function_after_name(
Token stmt_end = currtok;
eat( TokType::Statement_End );
if ( currtok_noskip.Type && TokType::Comment && currtok_noskip.Line == stmt_end.Line )
inline_cmt = parse_comment();
@ -1855,7 +1855,7 @@ CodeFn parse_function_after_name(
if ( params )
result->Params = params;
if ( inline_cmt )
result->InlineCmt = inline_cmt;
@ -2142,7 +2142,7 @@ CodeOperator parse_operator_after_ret_type(
Token stmt_end = currtok;
eat( TokType::Statement_End );
if ( currtok_noskip.Type == TokType::Comment && currtok_noskip.Line == stmt_end.Line )
inline_cmt = parse_comment();
@ -2152,7 +2152,7 @@ CodeOperator parse_operator_after_ret_type(
if ( inline_cmt )
result->InlineCmt = inline_cmt;
return result;
@ -2319,7 +2319,7 @@ Code parse_simple_preprocess( Parser::TokType which )
Token stmt_end = currtok;
eat( TokType::Statement_End );
if ( currtok_noskip.Type == TokType::Comment && currtok_noskip.Line == stmt_end.Line )
eat( TokType::Comment );
@ -2335,7 +2335,7 @@ Code parse_simple_preprocess( Parser::TokType which )
Token stmt_end = currtok;
eat( TokType::Statement_End );
if ( currtok_noskip.Type == TokType::Comment && currtok_noskip.Line == stmt_end.Line )
eat( TokType::Comment );
@ -2361,6 +2361,7 @@ Code parse_operator_function_or_variable( bool expects_function, CodeAttributes
Code result = CodeInvalid;
if ( currtok.Type == TokType::Preprocess_Macro )
// We're dealing with a macro after attributes/specifiers.
@ -2368,6 +2369,7 @@ Code parse_operator_function_or_variable( bool expects_function, CodeAttributes
return result;
CodeType type = parse_type();
@ -3594,12 +3596,12 @@ CodeEnum parse_enum( bool inplace_def )
CodeComment inline_cmt = NoCode;
if ( ! inplace_def )
Token stmt_end = currtok;
eat( TokType::Statement_End );
if ( currtok_noskip.Type == TokType::Comment && currtok_noskip.Line == stmt_end.Line )
inline_cmt = parse_comment();
@ -3626,7 +3628,7 @@ CodeEnum parse_enum( bool inplace_def )
if ( type )
result->UnderlyingType = type;
if ( inline_cmt )
result->InlineCmt = inline_cmt;
@ -4518,7 +4520,11 @@ CodeTypedef parse_typedef()
constexpr bool from_typedef = true;
if ( false )
if ( check( TokType::Preprocess_Macro ))
type = t_empty;
name = currtok;
@ -4631,10 +4637,10 @@ CodeTypedef parse_typedef()
array_expr = parse_array_decl();
Token stmt_end = currtok;
eat( TokType::Statement_End );
CodeComment inline_cmt = NoCode;
if ( currtok_noskip.Type == TokType::Comment && currtok_noskip.Line == stmt_end.Line )
inline_cmt = parse_comment();
@ -4661,7 +4667,7 @@ CodeTypedef parse_typedef()
if ( type->Type == Typename && array_expr && array_expr->Type != Invalid )
type.cast<CodeType>()->ArrExpr = array_expr;
if ( inline_cmt )
result->InlineCmt = inline_cmt;