From f73f643aedb810b5493dfccd51a660c7c6a33c3d Mon Sep 17 00:00:00 2001 From: ryanfleury Date: Tue, 29 Jun 2021 20:48:37 -0600 Subject: [PATCH] kill old parser --- source/md_impl.c | 944 +------------------------------------------ tests/sanity_tests.c | 6 +- 2 files changed, 14 insertions(+), 936 deletions(-) diff --git a/source/md_impl.c b/source/md_impl.c index f62312b..344a8e0 100644 --- a/source/md_impl.c +++ b/source/md_impl.c @@ -1913,6 +1913,17 @@ MD_ParseOneNode(MD_String8 string, MD_u64 offset) off = colon_check_off; //- rjf: prohibit tags here + // TODO(allen): This poking in an error "from afar" thing seems + // like a bad sign to me. First it took a bit of digging for me to + // understand how this code actually detects the errors it says it + // does. Second it's kind of unclear that this should be illegal. + // I mean we can do these: + // `label: @tag child` + // `label: child @tag {children}` + // `label: @tag child` + // I do get *why* this is an odd thing to allow, but it's weird either way. + // Third, looks like this also is throwing out an error in the totally legal case: + // `label:{@tag {bar}}` for(MD_u64 tag_check_off = off; tag_check_off < string.size;) { MD_Token token = MD_TokenFromString(MD_StringSkip(string, tag_check_off)); @@ -2024,939 +2035,6 @@ MD_ParseWholeString(MD_String8 filename, MD_String8 contents) return result; } -#if 0 -MD_FUNCTION_IMPL void -MD_PushNodeError(MD_ParseCtx *ctx, MD_Node *node, MD_MessageKind kind, MD_String8 str){ - // TODO(allen): pass over this... the catastrophic error logic is a bit hard - // for me to follow. - - // NOTE(mal): Sort errors. Traverse the whole list assuming it will be short. - // The alternative is to drop a prev pointer into MD_Error and search backwards - MD_Error *prev_error = 0; - for(MD_Error *e = ctx->first_error; e; e = e->next) - { - if(e->node->at < node->at) - { - prev_error = e; - } - else - { - break; - } - } - - // NOTE(mal): Ignore errors after first catastrophic error - if(ctx->error_level < MD_MessageKind_CatastrophicError || !prev_error || prev_error->next) - { - // TODO(allen): put memory on ctx? put memory on persistent arena? - // alloc and fill error - MD_Error *error = MD_PushArray(MD_Error, 1); - error->node = node; - error->kind = kind; - error->string = str; - - // insert error - if(prev_error) - { - error->next = prev_error->next; - prev_error->next = error; - } - else - { - error->next = ctx->first_error; - ctx->first_error = error; - } - if(ctx->last_error == prev_error) - { - ctx->last_error = error; - } - - // set error level - if(kind > ctx->error_level) - { - ctx->error_level = kind; - } - } -} -#endif - -#if 0 -MD_FUNCTION void -MD_PushNodeErrorF(MD_ParseCtx *ctx, MD_Node *node, MD_MessageKind kind, char *fmt, ...){ - // TODO(allen): use memory from ctx? use persistent memory? - va_list args; - va_start(args, fmt); - MD_PushNodeError(ctx, node, kind, MD_PushStringFV(fmt, args)); - va_end(args); -} -#endif - -#if 0 -MD_FUNCTION void -MD_PushTokenError(MD_ParseCtx *ctx, MD_Token token, MD_MessageKind kind, MD_String8 str){ - MD_Node *stub_file = MD_MakeNode(MD_NodeKind_ErrorMarker, ctx->file_contents, ctx->file_contents, ctx->file_contents.str); - MD_Node *stub = MD_MakeNode(MD_NodeKind_ErrorMarker, token.string, token.outer_string, token.outer_string.str); - MD_PushNodeError(ctx, stub, kind, str); - MD_PushChild(stub_file, stub); -} -#endif - -#if 0 -MD_FUNCTION void -MD_PushTokenErrorF(MD_ParseCtx *ctx, MD_Token token, MD_MessageKind kind, char *fmt, ...){ - // TODO(allen): use memory from ctx? use persistent memory? - va_list args; - va_start(args, fmt); - MD_PushTokenError(ctx, token, kind, MD_PushStringFV(fmt, args)); - va_end(args); -} -#endif - -#if 0 -MD_FUNCTION_IMPL MD_ParseCtx -MD_Parse_InitializeCtx(MD_String8 filename, MD_String8 contents) -{ - MD_ParseCtx ctx = MD_ZERO_STRUCT; - ctx.at = contents.str; - ctx.file_contents = contents; - ctx.filename = filename; - return ctx; -} -#endif - -#if 0 -MD_FUNCTION_IMPL void -MD_Parse_Bump(MD_ParseCtx *ctx, MD_Token token) -{ - ctx->at = token.outer_string.str + token.outer_string.size; -} -#endif - -#if 0 -MD_FUNCTION_IMPL void -MD_Parse_BumpNext(MD_ParseCtx *ctx) -{ - MD_Parse_Bump(ctx, MD_Parse_LexNext(ctx)); -} -#endif - -#if 0 -MD_FUNCTION_IMPL MD_Token -MD_Parse_LexNext(MD_ParseCtx *ctx) -{ - MD_Token token; - MD_MemoryZero(&token, sizeof(token)); - - MD_u8 *one_past_last = ctx->file_contents.str + ctx->file_contents.size; - MD_u8 *first = ctx->at; - if (first < one_past_last) - { - MD_u8 *at = first; - MD_u32 skip_n = 0; - MD_u32 chop_n = 0; - -#define MD_TokenizerScan(cond) for (; at < one_past_last && (cond); at += 1) - - switch (*at) - { - // NOTE(allen): Whitespace parsing - case '\n': - { - token.kind = MD_TokenKind_Newline; - at += 1; - }break; - - case ' ': case '\r': case '\t': case '\f': case '\v': - { - token.kind = MD_TokenKind_Whitespace; - at += 1; - MD_TokenizerScan(*at == ' ' || *at == '\r' || *at == '\t' || *at == '\f' || *at == '\v'); - }break; - - // NOTE(allen): Comment parsing - case '/': - { - if (at + 1 < one_past_last) - { - if (at[1] == '/') - { - - // TODO(allen): This seems like an odd choice to me. What about two spaces!? - // What about an extra /? I'm wondering if there are other places where we make - // this kind of judgement call a lot, or is this the only one? Maybe the user - // should just always skip-chop whitespace if they want to clean this kind of - // thing up? They're going to have to if they ever use two spaces anyways, right? - - // NOTE(rjf): Trim off the first //, and a space after it if one is there. - if(at+2 < one_past_last && - at[2] == ' ') - { - skip_n = 3; - } - else - { - skip_n = 2; - } - - at += skip_n; - token.kind = MD_TokenKind_Comment; - MD_TokenizerScan(*at != '\n' && *at != '\r'); - } - else if (at[1] == '*') - { - // TODO(allen): proposal: - // 1. only set `kind = Comment` in the `counter == 0` case - // 2. otherwise set `kind = RunOnComment` (or something) - // maybe also emit an error *from here* or signal to a system that - // runs later that there are token errors to emit. - // Or: keep the `kind = Comment` but in the `counter != 0` case - // set some kind of error/unclosed/run-on flag on the token. - - at += 2; - token.kind = MD_TokenKind_Comment; - skip_n = 2; - int counter = 1; - for (;at < one_past_last && counter > 0; at += 1) - { - if (at + 1 < one_past_last) - { - if (at[0] == '*' && at[1] == '/') - { - at += 1; - counter -= 1; - } - else if (at[0] == '/' && at[1] == '*') - { - at += 1; - counter += 1; - } - } - } - if(counter == 0) - { - chop_n = 2; - } - } - } - if (token.kind == MD_TokenKind_Nil) goto symbol_lex; - }break; - - // NOTE(allen): Strings - case '"': - case '\'': - case '`': - { - // TODO(allen): proposal: - // go see the proposal in the block comment lexer, same idea here? - - // determine delimiter setup - MD_u8 d = *at; - MD_b32 is_triplet = (at + 2 < one_past_last && at[1] == d && at[2] == d); - - // lex triple-delimiter string - if (is_triplet) - { - skip_n = 3; - at += 3; - MD_u32 consecutive_d = 0; - for (;;) - { - // fail condition - if (at >= one_past_last){ - break; - } - - if(at[0] == d) - { - consecutive_d += 1; - at += 1; - // close condition - if (consecutive_d == 3){ - chop_n = 3; - break; - } - } - else - { - consecutive_d = 0; - - // escaping rule - if(at[0] == '\\') - { - at += 1; - if(at < one_past_last && (at[0] == d || at[0] == '\\')) - { - at += 1; - } - } - else{ - at += 1; - } - } - } - } - - // lex single-delimiter string - if (!is_triplet) - { - skip_n = 1; - at += 1; - for (;at < one_past_last;) - { - // close condition - if (*at == d){ - at += 1; - chop_n = 1; - break; - } - - // fail condition - if (*at == '\n'){ - break; - } - - // escaping rule - if (at[0] == '\\'){ - at += 1; - if (at < one_past_last && (at[0] == d || at[0] == '\\')){ - at += 1; - } - } - else{ - at += 1; - } - } - } - - // set token kind - if(is_triplet) - { - switch(d) - { - case '\'': token.kind = MD_TokenKind_StringLiteralSingleQuoteTriplet; break; - case '"': token.kind = MD_TokenKind_StringLiteralDoubleQuoteTriplet; break; - case '`': token.kind = MD_TokenKind_StringLiteralTickTriplet; break; - default: break; - } - } - else - { - switch(d) - { - case '\'': token.kind = MD_TokenKind_StringLiteralSingleQuote; break; - case '"': token.kind = MD_TokenKind_StringLiteralDoubleQuote; break; - case '`': token.kind = MD_TokenKind_StringLiteralTick; break; - default: break; - } - } - - }break; - - // NOTE(allen): Identifiers, Numbers, Operators - default: - { - if (MD_CharIsAlpha(*at) || *at == '_') - { - token.kind = MD_TokenKind_Identifier; - at += 1; - MD_TokenizerScan(MD_CharIsAlpha(*at) || MD_CharIsDigit(*at) || *at == '_'); - } - - else if (MD_CharIsDigit(*at) || - (at + 1 < one_past_last && at[0] == '-' && MD_CharIsDigit(at[1]))) - { - token.kind = MD_TokenKind_NumericLiteral; - at += 1; - MD_TokenizerScan(MD_CharIsAlpha(*at) || MD_CharIsDigit(*at) || *at == '.'); - } - - else if (MD_CharIsSymbol(*at)) - { - symbol_lex: - token.kind = MD_TokenKind_Symbol; - at += 1; - } - - else - { - token.kind = MD_TokenKind_BadCharacter; - at += 1; - } - }break; - } - - token.outer_string = MD_S8Range(first, at); - token.string = MD_StringSubstring(token.outer_string, skip_n, token.outer_string.size - chop_n); - - ctx->at = at; - } - - return token; -} -#endif - -#if 0 -MD_FUNCTION_IMPL MD_Token -MD_Parse_PeekSkipSome(MD_ParseCtx *ctx, MD_TokenGroups skip_groups) -{ - MD_ParseCtx ctx_restore = *ctx; - - MD_b32 skip_comment = (skip_groups & MD_TokenGroup_Comment); - MD_b32 skip_whitespace = (skip_groups & MD_TokenGroup_Whitespace); - MD_b32 skip_regular = (skip_groups & MD_TokenGroup_Regular); - - MD_Token result; - MD_MemoryZero(&result, sizeof(result)); - - loop: - { - result = MD_Parse_LexNext(ctx); - if ((skip_comment && MD_TokenKindIsComment(result.kind)) || - (skip_whitespace && MD_TokenKindIsWhitespace(result.kind)) || - (skip_regular && MD_TokenKindIsRegular(result.kind))){ - MD_Parse_Bump(ctx, result); - goto loop; - } - } - - { - // TODO(allen): I'm not a fan of what this implies. - *ctx = ctx_restore; - } - - return result; -} -#endif - -#if 0 -MD_FUNCTION_IMPL MD_b32 -MD_Parse_Require(MD_ParseCtx *ctx, MD_String8 string, MD_TokenKind kind) -{ - int result = 0; - - MD_Token token_any = MD_Parse_PeekSkipSome(ctx, 0); - MD_Token token_regular; - if(MD_StringMatch(token_any.string, string, 0) && token_any.kind == kind) - { - result = 1; - MD_Parse_Bump(ctx, token_any); - goto end; - } - - token_regular = MD_Parse_PeekSkipSome(ctx, MD_TokenGroup_Comment|MD_TokenGroup_Whitespace); - if(MD_StringMatch(token_regular.string, string, 0) && token_regular.kind == kind) - { - result = 1; - MD_Parse_Bump(ctx, token_regular); - goto end; - } - - end:; - return result; -} -#endif - -#if 0 -MD_FUNCTION_IMPL MD_b32 -MD_Parse_RequireKind(MD_ParseCtx *ctx, MD_TokenKind kind, MD_Token *out_token) -{ - int result = 0; - - MD_TokenGroups skip_groups = MD_TokenGroup_Comment|MD_TokenGroup_Whitespace; - if (MD_TokenKindIsWhitespace(kind)) - { - skip_groups &= ~MD_TokenGroup_Whitespace; - } - if (MD_TokenKindIsComment(kind)) - { - skip_groups &= ~MD_TokenGroup_Comment; - } - - MD_Token token = MD_Parse_PeekSkipSome(ctx, skip_groups); - if(token.kind == kind) - { - result = 1; - MD_Parse_Bump(ctx, token); - if(out_token) - { - *out_token = token; - } - } - return result; -} -#endif - -#if 0 -MD_FUNCTION_IMPL void -MD_Parse_Set(MD_ParseCtx *ctx, MD_Node *parent, MD_ParseSetFlags flags) -{ - MD_Token initial_token = MD_Parse_PeekSkipSome(ctx, MD_TokenGroup_Comment|MD_TokenGroup_Whitespace); - - // check for set opener - MD_u8 set_opener = 0; - if((flags & MD_ParseSetFlag_Brace) && - MD_Parse_Require(ctx, MD_S8Lit("{"), MD_TokenKind_Symbol)) - { - set_opener = '{'; - } - else if((flags & MD_ParseSetFlag_Paren) && - MD_Parse_Require(ctx, MD_S8Lit("("), MD_TokenKind_Symbol)) - { - set_opener = '('; - } - else if((flags & MD_ParseSetFlag_Bracket) && - MD_Parse_Require(ctx, MD_S8Lit("["), MD_TokenKind_Symbol)) - { - set_opener = '['; - } - - // attach left-symbol flag to parent - switch (set_opener){ - case '{': - { - parent->flags |= MD_NodeFlag_BraceLeft; - }break; - case '(': - { - parent->flags |= MD_NodeFlag_ParenLeft; - }break; - case '[': - { - parent->flags |= MD_NodeFlag_BracketLeft; - }break; - } - - // determine set close rule - MD_b32 close_with_brace = 0; - MD_b32 close_with_paren = 0; - MD_b32 close_with_separator = 0; - switch (set_opener){ - default: - { - close_with_separator = (!!(flags & MD_ParseSetFlag_Implicit)); - }break; - case '{': - { - close_with_brace = 1; - }break; - case '(': - case '[': - { - close_with_paren = 1; - }break; - } - - // NOTE(rjf): Parse children. - if((set_opener != 0) || close_with_separator) - { - MD_NodeFlags next_child_flags = 0; - for(;;) - { - if(close_with_brace) - { - if(MD_Parse_Require(ctx, MD_S8Lit("}"), MD_TokenKind_Symbol)) - { - parent->flags |= MD_NodeFlag_BraceRight; - goto end_parse; - } - } - else if(close_with_paren) - { - if((flags & MD_ParseSetFlag_Paren) && - MD_Parse_Require(ctx, MD_S8Lit(")"), MD_TokenKind_Symbol)) - { - parent->flags |= MD_NodeFlag_ParenRight; - goto end_parse; - } - else if((flags & MD_ParseSetFlag_Bracket) && - MD_Parse_Require(ctx, MD_S8Lit("]"), MD_TokenKind_Symbol)) - { - parent->flags |= MD_NodeFlag_BracketRight; - goto end_parse; - } - } - else - { - MD_Token peek = MD_Parse_PeekSkipSome(ctx, MD_TokenGroup_Whitespace | MD_TokenGroup_Comment); - if(peek.kind == MD_TokenKind_Symbol && - (MD_StringMatch(peek.string, MD_S8Lit("}"), 0) || - MD_StringMatch(peek.string, MD_S8Lit(")"), 0) || - MD_StringMatch(peek.string, MD_S8Lit("]"), 0))) - { - goto end_parse; - } - } - - // NOTE(allen): parse the next node - MD_ParseResult parse = MD_ParseOneNodeFromCtx(ctx); - MD_Node *child = parse.node; - if(MD_NodeIsNil(child)) - { - if(set_opener != 0) - { - MD_PushTokenErrorF(ctx, initial_token, MD_MessageKind_CatastrophicError, - "Unbalanced \"%c\"", set_opener); - } - goto end_parse; - } - - // connect node into graph - MD_PushChild(parent, child); - - // check trailing symbol - MD_u32 symbol_flags = 0; - if (!close_with_separator){ - if(MD_Parse_Require(ctx, MD_S8Lit(","), MD_TokenKind_Symbol)) - { - symbol_flags = MD_NodeFlag_BeforeComma; - } - else if(MD_Parse_Require(ctx, MD_S8Lit(";"), MD_TokenKind_Symbol)) - { - symbol_flags = MD_NodeFlag_BeforeSemicolon; - } - } - - // fill flags from surrounding context - child->flags |= next_child_flags|symbol_flags; - - // setup next_child_flags - next_child_flags = MD_NodeFlag_AfterFromBefore(symbol_flags); - - // separator close condition - if(close_with_separator) - { - MD_Token next_token = MD_Parse_PeekSkipSome(ctx, 0); - if(next_token.kind == MD_TokenKind_Newline || - (next_token.kind == MD_TokenKind_Symbol && - (MD_StringMatch(next_token.string, MD_S8Lit(","), 0) || - MD_StringMatch(next_token.string, MD_S8Lit(";"), 0) || - MD_StringMatch(next_token.string, MD_S8Lit("}"), 0) || - MD_StringMatch(next_token.string, MD_S8Lit("]"), 0) || - MD_StringMatch(next_token.string, MD_S8Lit(")"), 0)))) - { - goto end_parse; - } - } - - // TODO(allen): I find it kind of concerning that ParseWholeString and - // ParseOneNode are both doing this. I did some refactors in the - // ParseWholeString to break it down into flatter blocks, not realizing - // very similar logic happens here too. - // I also see that these really are slightly different, but it seems - // like it should be possible to express the whole-string case as a - // special case of this and avoid the duplication. - } - } - - end_parse:; -} -#endif - -#if 0 -MD_FUNCTION_IMPL MD_ParseResult -MD_ParseOneNodeFromCtx(MD_ParseCtx *ctx) -{ - MD_u8 *at_first = ctx->at; - - MD_ParseResult result = MD_ZERO_STRUCT; - result.node = MD_NilNode(); - - MD_Error *ctx_last_error = ctx->last_error; - - MD_Token token; - MD_MemoryZero(&token, sizeof(token)); - - MD_Node *first_tag = 0; - MD_Node *last_tag = 0; - _MD_ParseTagList(ctx, &first_tag, &last_tag); - - // NOTE(rjf): Parse the comment preceding this node. - MD_String8 comment_before = MD_ZERO_STRUCT; - { - MD_Token comment_token = MD_ZERO_STRUCT; - for(;;) - { - MD_Token token = MD_Parse_PeekSkipSome(ctx, 0); - if(token.kind == MD_TokenKind_Comment) - { - comment_token = token; - MD_Parse_Bump(ctx, token); - } - else if(token.kind == MD_TokenKind_Newline) - { - MD_Parse_Bump(ctx, token); - if(MD_Parse_RequireKind(ctx, MD_TokenKind_Comment, &comment_token)) - { - // NOTE(mal): If more than one comment, use the last comment - } - else if(MD_Parse_RequireKind(ctx, MD_TokenKind_Newline, 0)) - { - MD_MemoryZero(&comment_token, sizeof(comment_token)); - } - } - else if(MD_TokenKindIsWhitespace(token.kind)) - { - MD_Parse_Bump(ctx, token); - } - else - { - break; - } - } - comment_before = comment_token.string; - // TODO(allen): I find this odd. Wouldn't it have been easier to generate this - // durring or right after the lexing phase? - if(!_MD_CommentIsSyntacticallyCorrect(comment_token)) - { - MD_String8 capped = MD_StringPrefix(comment_token.outer_string, MD_UNTERMINATED_TOKEN_LEN_CAP); - MD_PushTokenErrorF(ctx, comment_token, MD_MessageKind_CatastrophicError, - "Unterminated comment \"%.*s\"", MD_StringExpand(capped)); - } - } - - MD_TokenGroups skip_groups = MD_TokenGroup_Whitespace|MD_TokenGroup_Comment; - MD_Token next_token = MD_Parse_PeekSkipSome(ctx, skip_groups); - - retry: - - // NOTE(rjf): Unnamed Sets - if(next_token.kind == MD_TokenKind_Symbol && - (MD_StringMatch(next_token.string, MD_S8Lit("("), 0) || - MD_StringMatch(next_token.string, MD_S8Lit("{"), 0) || - MD_StringMatch(next_token.string, MD_S8Lit("["), 0))) - { - result.node = MD_MakeNode(MD_NodeKind_Label, MD_S8Lit(""), MD_S8Lit(""), next_token.outer_string.str); - - MD_Parse_Set(ctx, result.node, - MD_ParseSetFlag_Paren | - MD_ParseSetFlag_Brace | - MD_ParseSetFlag_Bracket); - goto end_parse; - } - - // NOTE(rjf): Labels - else if(next_token.kind == MD_TokenKind_Identifier || - next_token.kind == MD_TokenKind_NumericLiteral || - next_token.kind == MD_TokenKind_StringLiteralTick || - next_token.kind == MD_TokenKind_StringLiteralSingleQuote || - next_token.kind == MD_TokenKind_StringLiteralDoubleQuote || - next_token.kind == MD_TokenKind_StringLiteralTickTriplet || - next_token.kind == MD_TokenKind_StringLiteralSingleQuoteTriplet || - next_token.kind == MD_TokenKind_StringLiteralDoubleQuoteTriplet || - next_token.kind == MD_TokenKind_Symbol ) - { - MD_Parse_Bump(ctx, next_token); - result.node = MD_MakeNode(MD_NodeKind_Label, next_token.string, next_token.outer_string, next_token.outer_string.str); - result.node->flags |= MD_NodeFlagsFromTokenKind(next_token.kind); - - // TODO(rjf): Before we were just able to check one kind. I think preserving - // which kind of string literal was used is very important, for the same reason - // that preserving which symbols were used to delimit a set is important. - // But, having to manage this "group of kinds" is a little bit annoying. - // Maybe we should define the set of legal kinds for certain syntactic - // contexts somewhere unified, so that the parser is never duplicating this? - // - // It's also possible that it never matters and we only ever use this group - // in one place, but I just got that "we're duplicating stuff" allergy that - // I usually get. - // - // If that turned out to be a good idea, maybe we could do something like - // MD_TokenKindIsLegalLabelHead, MD_TokenKindNeedsBalancing?? I don't know. - if(next_token.kind == MD_TokenKind_StringLiteralTick || - next_token.kind == MD_TokenKind_StringLiteralSingleQuote || - next_token.kind == MD_TokenKind_StringLiteralDoubleQuote || - next_token.kind == MD_TokenKind_StringLiteralTickTriplet || - next_token.kind == MD_TokenKind_StringLiteralSingleQuoteTriplet || - next_token.kind == MD_TokenKind_StringLiteralDoubleQuoteTriplet) - { - if(!_MD_TokenBoundariesAreBalanced(next_token)) - { - MD_String8 capped = MD_StringPrefix(next_token.outer_string, MD_UNTERMINATED_TOKEN_LEN_CAP); - MD_PushNodeErrorF(ctx, result.node, MD_MessageKind_CatastrophicError, - "Unterminated text literal \"%.*s\"", MD_StringExpand(capped)); - } - } - else if(next_token.kind == MD_TokenKind_Symbol && next_token.string.size == 1 && MD_CharIsReservedSymbol(next_token.string.str[0])) - { - MD_u8 c = next_token.string.str[0]; - if(c == '}' || c == ']' || c == ')') - { - MD_PushTokenErrorF(ctx, next_token, MD_MessageKind_CatastrophicError, "Unbalanced \"%c\"", c); - } - else - { - MD_PushTokenErrorF(ctx, next_token, MD_MessageKind_Error, "Unexpected reserved symbol \"%c\"", - c); - } - } - - // NOTE(rjf): Children - if(MD_Parse_Require(ctx, MD_S8Lit(":"), MD_TokenKind_Symbol)) - { - MD_Parse_Set(ctx, result.node, - MD_ParseSetFlag_Paren | - MD_ParseSetFlag_Brace | - MD_ParseSetFlag_Bracket | - MD_ParseSetFlag_Implicit); - - // TODO(allen): This poking in an error "from afar" thing seems - // like a bad sign to me. First it took a bit of digging for me to - // understand how this code actually detects the errors it says it - // does. Second it's kind of unclear that this should be illegal. - // I mean we can do these: - // `label: @tag child` - // `label: child @tag {children}` - // `label: @tag child` - // I do get *why* this is an odd thing to allow, but it's weird either way. - // Third, looks like this also is throwing out an error in the totally legal case: - // `label:{@tag {bar}}` - - // NOTE(mal): Generate error for tags in positions such as "label:@tag {children}" - MD_Node *fc = result.node->first_child; - if(fc == result.node->last_child && !MD_NodeIsNil(fc->first_tag) && // NOTE(mal): One child. Tagged. - fc->kind == MD_NodeKind_Label && fc->whole_string.size == 0) // NOTE(mal): Unlabeled set - { - for(MD_EachNode(tag, fc->first_tag)) - { - MD_PushNodeErrorF(ctx, tag, MD_MessageKind_Error, - "Invalid position for tag \"%.*s\"", MD_StringExpand(tag->string)); - } - } - } - goto end_parse; - } - - else if(MD_Parse_RequireKind(ctx, MD_TokenKind_BadCharacter, &token)) - { - MD_String8List bytes = {0}; - for(int i_byte = 0; i_byte < token.outer_string.size; ++i_byte) - { - // TODO(allen): tighten up with good integer <-> string helpers - MD_PushStringToList(&bytes, MD_PushStringF("0x%02X", token.outer_string.str[i_byte])); - } - MD_String8 byte_string = MD_JoinStringList(bytes, MD_S8Lit(" ")); - MD_PushTokenErrorF(ctx, token, MD_MessageKind_Error, - "Non-ASCII character \"%.*s\"", MD_StringExpand(byte_string)); - goto retry; - } - - end_parse:; - - // NOTE(rjf): Parse comments after nodes. - MD_String8 comment_after = MD_ZERO_STRUCT; - { - MD_Token comment_token = MD_ZERO_STRUCT; - for(;;) - { - MD_Token token = MD_Parse_PeekSkipSome(ctx, 0); - if(token.kind == MD_TokenKind_Comment) - { - comment_token = token; - MD_Parse_Bump(ctx, token); - break; - } - else if(token.kind == MD_TokenKind_Newline) - { - break; - } - else if(MD_TokenKindIsWhitespace(token.kind)) - { - MD_Parse_Bump(ctx, token); - } - else - { - break; - } - } - comment_after = comment_token.string; - // TODO(allen): I find this odd. Wouldn't it have been easier to generate this - // durring or right after the lexing phase? - if(!_MD_CommentIsSyntacticallyCorrect(comment_token)) - { - MD_String8 capped = MD_StringPrefix(comment_token.outer_string, MD_UNTERMINATED_TOKEN_LEN_CAP); - MD_PushTokenErrorF(ctx, comment_token, MD_MessageKind_CatastrophicError, - "Unterminated comment \"%.*s\"", MD_StringExpand(capped)); - } - } - - result.bytes_parsed = (MD_u64)(ctx->at - at_first); - result.first_error = ctx_last_error ? ctx_last_error->next : 0; - if(!MD_NodeIsNil(result.node)) - { - result.node->first_tag = first_tag; - result.node->last_tag = last_tag; - for(MD_Node *tag = first_tag; !MD_NodeIsNil(tag); tag = tag->next) - { - tag->parent = result.node; - } - result.node->comment_before = comment_before; - result.node->comment_after = comment_after; - } - return result; -} -#endif - -#if 0 -MD_FUNCTION_IMPL MD_ParseResult -MD_ParseOneNode(MD_String8 filename, MD_String8 contents) -{ - MD_ParseCtx ctx = MD_Parse_InitializeCtx(filename, contents); - return MD_ParseOneNodeFromCtx(&ctx); -} -#endif - -#if 0 -MD_FUNCTION_IMPL MD_ParseResult -MD_ParseWholeString(MD_String8 filename, MD_String8 contents) -{ - MD_ParseResult result = MD_ZERO_STRUCT; - MD_String8 root_string = filename; - MD_Node *root = MD_MakeNode(MD_NodeKind_File, root_string, root_string, contents.str); - if(contents.size > 0) - { - // NOTE(allen): setup parse context - MD_ParseCtx ctx = MD_Parse_InitializeCtx(filename, contents); - - // NOTE(allen): parse loop - MD_NodeFlags next_child_flags = 0; - for(;;) - { - // NOTE(allen): parse the next node - MD_ParseResult parse = MD_ParseOneNodeFromCtx(&ctx); - MD_Node *child = parse.node; - if(MD_NodeIsNil(child)) - { - break; - } - - // connect node into graph - MD_PushChild(root, child); - - // check trailing symbol - MD_u32 symbol_flags = 0; - if(MD_Parse_Require(&ctx, MD_S8Lit(","), MD_TokenKind_Symbol)) - { - symbol_flags = MD_NodeFlag_BeforeComma; - } - else if(MD_Parse_Require(&ctx, MD_S8Lit(";"), MD_TokenKind_Symbol)) - { - symbol_flags = MD_NodeFlag_BeforeSemicolon; - } - - // fill flags from surrounding context - child->flags |= next_child_flags|symbol_flags; - - // setup next_child_flags - next_child_flags = MD_NodeFlag_AfterFromBefore(symbol_flags); - } - result.bytes_parsed = (MD_u64)(ctx.at - contents.str); - result.first_error = ctx.first_error; - } - result.node = root; - return result; -} -#endif - MD_FUNCTION_IMPL MD_ParseResult MD_ParseWholeFile(MD_String8 filename) { diff --git a/tests/sanity_tests.c b/tests/sanity_tests.c index ba7ca0b..afc1011 100644 --- a/tests/sanity_tests.c +++ b/tests/sanity_tests.c @@ -787,7 +787,7 @@ int main(void) MD_String8 *string = test_strings; for (int i = 0; i < MD_ArrayCount(test_strings); i += 1, string += 1){ MD_ParseResult result = MD_ParseWholeString(file_name, *string); - TestResult((result.first_error == 0) && + TestResult((result.errors.first == 0) && (result.node->first_child == result.node->last_child) && (result.node->first_child->flags & MD_NodeFlag_Numeric)); } @@ -820,7 +820,7 @@ int main(void) MD_String8 *string = test_strings; for (int i = 0; i < MD_ArrayCount(test_strings); i += 1, string += 1){ MD_ParseResult result = MD_ParseWholeString(file_name, *string); - TestResult((result.first_error == 0) && + TestResult((result.errors.first == 0) && (result.node->first_child == result.node->last_child) && (result.node->first_child->flags & MD_NodeFlag_Numeric)); } @@ -844,7 +844,7 @@ int main(void) MD_String8 *string = test_strings; for (int i = 0; i < MD_ArrayCount(test_strings); i += 1, string += 1){ MD_ParseResult result = MD_ParseWholeString(file_name, *string); - TestResult((result.first_error == 0) && + TestResult((result.errors.first == 0) && (result.node->first_child != result.node->last_child)); } }