diff --git a/src/text_cache/text_cache.c b/src/text_cache/text_cache.c index 0a0363a7..894670c1 100644 --- a/src/text_cache/text_cache.c +++ b/src/text_cache/text_cache.c @@ -26,9 +26,27 @@ txt_lang_kind_from_extension(String8 extension) { kind = TXT_LangKind_CPlusPlus; } + else if(str8_match(extension, str8_lit("odin"), StringMatchFlag_CaseInsensitive)) + { + kind = TXT_LangKind_Odin; + } return kind; } +internal TXT_LangLexFunctionType * +txt_lex_function_from_lang_kind(TXT_LangKind kind) +{ + TXT_LangLexFunctionType *fn = 0; + switch(kind) + { + default:{}break; + case TXT_LangKind_C: {fn = txt_token_array_from_string__c_cpp;}break; + case TXT_LangKind_CPlusPlus: {fn = txt_token_array_from_string__c_cpp;}break; + case TXT_LangKind_Odin: {fn = txt_token_array_from_string__odin;}break; + } + return fn; +} + //////////////////////////////// //~ rjf: Token Type Functions @@ -378,6 +396,234 @@ txt_token_array_from_string__c_cpp(Arena *arena, U64 *bytes_processed_counter, S return result; } +internal TXT_TokenArray +txt_token_array_from_string__odin(Arena *arena, U64 *bytes_processed_counter, String8 string) +{ + Temp scratch = scratch_begin(&arena, 1); + + //- rjf: generate token list + TXT_TokenChunkList tokens = {0}; + { + B32 comment_is_single_line = 0; + B32 string_is_char = 0; + TXT_TokenKind active_token_kind = TXT_TokenKind_Null; + U64 active_token_start_idx = 0; + B32 escaped = 0; + B32 next_escaped = 0; + U64 byte_process_start_idx = 0; + for(U64 idx = 0; idx <= string.size;) + { + U8 byte = (idx+0 < string.size) ? (string.str[idx+0]) : 0; + U8 next_byte = (idx+1 < string.size) ? (string.str[idx+1]) : 0; + + // rjf: update counter + if(bytes_processed_counter != 0 && ((idx-byte_process_start_idx) >= 1000 || idx == string.size)) + { + ins_atomic_u64_add_eval(bytes_processed_counter, (idx-byte_process_start_idx)); + byte_process_start_idx = idx; + } + + // rjf: escaping + if(escaped && (byte != '\r' && byte != '\n')) + { + next_escaped = 0; + } + else if(!escaped && byte == '\\') + { + next_escaped = 1; + } + + // rjf: take starter, determine active token kind + if(active_token_kind == TXT_TokenKind_Null) + { + // rjf: use next bytes to start a new token + if(0){} + else if(char_is_space(byte)) { active_token_kind = TXT_TokenKind_Whitespace; } + else if(byte == '_' || + byte == '$' || + char_is_alpha(byte)) { active_token_kind = TXT_TokenKind_Identifier; } + else if(char_is_digit(byte, 10) || + (byte == '.' && + char_is_digit(next_byte, 10))) { active_token_kind = TXT_TokenKind_Numeric; } + else if(byte == '"') { active_token_kind = TXT_TokenKind_String; string_is_char = 0; } + else if(byte == '\'') { active_token_kind = TXT_TokenKind_String; string_is_char = 1; } + else if(byte == '/' && next_byte == '/') { active_token_kind = TXT_TokenKind_Comment; comment_is_single_line = 1; } + else if(byte == '/' && next_byte == '*') { active_token_kind = TXT_TokenKind_Comment; comment_is_single_line = 0; } + else if(byte == '~' || byte == '!' || + byte == '%' || byte == '^' || + byte == '&' || byte == '*' || + byte == '(' || byte == ')' || + byte == '-' || byte == '=' || + byte == '+' || byte == '[' || + byte == ']' || byte == '{' || + byte == '}' || byte == ':' || + byte == ';' || byte == ',' || + byte == '.' || byte == '<' || + byte == '>' || byte == '/' || + byte == '?' || byte == '|') { active_token_kind = TXT_TokenKind_Symbol; } + else if(byte == '#') { active_token_kind = TXT_TokenKind_Meta; } + + // rjf: start new token + if(active_token_kind != TXT_TokenKind_Null) + { + active_token_start_idx = idx; + } + + // rjf: invalid token kind -> emit error + else + { + TXT_Token token = {TXT_TokenKind_Error, r1u64(idx, idx+1)}; + txt_token_chunk_list_push(scratch.arena, &tokens, 4096, &token); + } + } + + // rjf: look for ender + U64 ender_pad = 0; + B32 ender_found = 0; + if(active_token_kind != TXT_TokenKind_Null && idx>active_token_start_idx) + { + if(idx == string.size) + { + ender_pad = 0; + ender_found = 1; + } + else switch(active_token_kind) + { + default:break; + case TXT_TokenKind_Whitespace: + { + ender_found = !char_is_space(byte); + }break; + case TXT_TokenKind_Identifier: + { + ender_found = (!char_is_alpha(byte) && !char_is_digit(byte, 10) && byte != '_' && byte != '$'); + }break; + case TXT_TokenKind_Numeric: + { + ender_found = (!char_is_alpha(byte) && !char_is_digit(byte, 10) && byte != '_' && byte != '.' && byte != '\''); + }break; + case TXT_TokenKind_String: + { + ender_found = (!escaped && ((!string_is_char && byte == '"') || (string_is_char && byte == '\''))); + ender_pad += 1; + }break; + case TXT_TokenKind_Symbol: + { + ender_found = (byte != '~' && byte != '!' && + byte != '%' && byte != '^' && + byte != '&' && byte != '*' && + byte != '(' && byte != ')' && + byte != '-' && byte != '=' && + byte != '+' && byte != '[' && + byte != ']' && byte != '{' && + byte != '}' && byte != ':' && + byte != ';' && byte != ',' && + byte != '.' && byte != '<' && + byte != '>' && byte != '/' && + byte != '?' && byte != '|'); + }break; + case TXT_TokenKind_Comment: + { + if(comment_is_single_line) + { + ender_found = (!escaped && (byte == '\r' || byte == '\n')); + } + else + { + ender_found = (active_token_start_idx+1 < idx && byte == '*' && next_byte == '/'); + ender_pad += 2; + } + }break; + case TXT_TokenKind_Meta: + { + ender_found = (!char_is_alpha(byte) && !char_is_digit(byte, 10) && byte != '_' && byte != '$'); + }break; + } + } + + // rjf: next byte is ender => emit token + if(ender_found) + { + TXT_Token token = {active_token_kind, r1u64(active_token_start_idx, idx+ender_pad)}; + active_token_kind = TXT_TokenKind_Null; + + // rjf: identifier -> keyword in special cases + if(token.kind == TXT_TokenKind_Identifier) + { + read_only local_persist String8 odin_keywords[] = + { + str8_lit_comp("asm"), + str8_lit_comp("auto_cast"), + str8_lit_comp("bit_set"), + str8_lit_comp("break"), + str8_lit_comp("case"), + str8_lit_comp("cast"), + str8_lit_comp("context"), + str8_lit_comp("continue"), + str8_lit_comp("defer"), + str8_lit_comp("distinct"), + str8_lit_comp("do"), + str8_lit_comp("dynamic"), + str8_lit_comp("else"), + str8_lit_comp("enum"), + str8_lit_comp("fallthrough"), + str8_lit_comp("for"), + str8_lit_comp("foreign"), + str8_lit_comp("if"), + str8_lit_comp("in"), + str8_lit_comp("map"), + str8_lit_comp("matrix"), + str8_lit_comp("not_in"), + str8_lit_comp("or_break"), + str8_lit_comp("or_continue"), + str8_lit_comp("or_else"), + str8_lit_comp("or_return"), + str8_lit_comp("package"), + str8_lit_comp("proc"), + str8_lit_comp("return"), + str8_lit_comp("struct"), + str8_lit_comp("switch"), + str8_lit_comp("transmute"), + str8_lit_comp("typeid"), + str8_lit_comp("union"), + str8_lit_comp("using"), + str8_lit_comp("when"), + str8_lit_comp("where"), + str8_lit_comp("import"), + }; + String8 token_string = str8_substr(string, r1u64(active_token_start_idx, idx+ender_pad)); + for(U64 keyword_idx = 0; keyword_idx < ArrayCount(odin_keywords); keyword_idx += 1) + { + if(str8_match(odin_keywords[keyword_idx], token_string, 0)) + { + token.kind = TXT_TokenKind_Keyword; + break; + } + } + } + + // rjf: push + txt_token_chunk_list_push(scratch.arena, &tokens, 4096, &token); + + // rjf: increment by ender padding + idx += ender_pad; + } + + // rjf: advance by 1 byte if we haven't found an ender + if(!ender_found) + { + idx += 1; + } + escaped = next_escaped; + } + } + + //- rjf: token list -> token array + TXT_TokenArray result = txt_token_array_from_chunk_list(arena, &tokens); + scratch_end(scratch); + return result; +} + //////////////////////////////// //~ rjf: Main Layer Initialization @@ -784,16 +1030,7 @@ txt_parse_thread__entry_point(void *p) } //- rjf: lang -> lex function - TXT_LangLexFunctionType *lex_function = 0; - switch(lang) - { - default:{}break; - case TXT_LangKind_C: - case TXT_LangKind_CPlusPlus: - { - lex_function = txt_token_array_from_string__c_cpp; - }break; - } + TXT_LangLexFunctionType *lex_function = txt_lex_function_from_lang_kind(lang); //- rjf: lex function * data -> tokens TXT_TokenArray tokens = {0}; diff --git a/src/text_cache/text_cache.h b/src/text_cache/text_cache.h index 3bced5bb..74437762 100644 --- a/src/text_cache/text_cache.h +++ b/src/text_cache/text_cache.h @@ -32,15 +32,6 @@ typedef enum TXT_TokenKind } TXT_TokenKind; -typedef enum TXT_LangKind -{ - TXT_LangKind_Null, - TXT_LangKind_C, - TXT_LangKind_CPlusPlus, - TXT_LangKind_COUNT -} -TXT_LangKind; - typedef struct TXT_Token TXT_Token; struct TXT_Token { @@ -105,6 +96,19 @@ struct TXT_TextInfo TXT_TokenArray tokens; }; +//////////////////////////////// +//~ rjf: Language Kind Types + +typedef enum TXT_LangKind +{ + TXT_LangKind_Null, + TXT_LangKind_C, + TXT_LangKind_CPlusPlus, + TXT_LangKind_Odin, + TXT_LangKind_COUNT +} +TXT_LangKind; + typedef TXT_TokenArray TXT_LangLexFunctionType(Arena *arena, U64 *bytes_processed_counter, String8 string); //////////////////////////////// @@ -234,6 +238,7 @@ global TXT_Shared *txt_shared = 0; //~ rjf: Basic Helpers internal TXT_LangKind txt_lang_kind_from_extension(String8 extension); +internal TXT_LangLexFunctionType *txt_lex_function_from_lang_kind(TXT_LangKind kind); //////////////////////////////// //~ rjf: Token Type Functions @@ -247,6 +252,7 @@ internal TXT_TokenArray txt_token_array_from_list(Arena *arena, TXT_TokenList *l //~ rjf: Lexing Functions internal TXT_TokenArray txt_token_array_from_string__c_cpp(Arena *arena, U64 *bytes_processed_counter, String8 string); +internal TXT_TokenArray txt_token_array_from_string__odin(Arena *arena, U64 *bytes_processed_counter, String8 string); //////////////////////////////// //~ rjf: Main Layer Initialization diff --git a/src/txti/txti.c b/src/txti/txti.c index d58e0fc4..9fc4cf06 100644 --- a/src/txti/txti.c +++ b/src/txti/txti.c @@ -47,31 +47,6 @@ txti_hash_from_string(String8 string) return result; } -internal TXTI_LangKind -txti_lang_kind_from_extension(String8 extension) -{ - TXTI_LangKind kind = TXTI_LangKind_Null; - if(str8_match(extension, str8_lit("c"), 0) || - str8_match(extension, str8_lit("h"), 0)) - { - kind = TXTI_LangKind_C; - } - else if(str8_match(extension, str8_lit("cpp"), StringMatchFlag_CaseInsensitive) || - str8_match(extension, str8_lit("cxx"), StringMatchFlag_CaseInsensitive) || - str8_match(extension, str8_lit("cc"), StringMatchFlag_CaseInsensitive) || - str8_match(extension, str8_lit("c++"), StringMatchFlag_CaseInsensitive) || - str8_match(extension, str8_lit("C"), 0) || - str8_match(extension, str8_lit("hpp"), StringMatchFlag_CaseInsensitive) || - str8_match(extension, str8_lit("hxx"), StringMatchFlag_CaseInsensitive) || - str8_match(extension, str8_lit("hh"), StringMatchFlag_CaseInsensitive) || - str8_match(extension, str8_lit("h++"), StringMatchFlag_CaseInsensitive) || - str8_match(extension, str8_lit("H"), 0)) - { - kind = TXTI_LangKind_CPlusPlus; - } - return kind; -} - //////////////////////////////// //~ rjf: Message Type Functions @@ -628,14 +603,14 @@ txti_mut_thread_entry_point(void *p) //- rjf: load file if we need it B32 load_valid = 0; String8 file_contents = {0}; - TXTI_LangKind lang_kind = TXTI_LangKind_Null; + TXT_LangKind lang_kind = TXT_LangKind_Null; U64 timestamp = 0; if(msg->kind == TXTI_MsgKind_Reload) ProfScope("reload file") { FileProperties pre_load_props = os_properties_from_file_path(msg->string); OS_Handle file = os_file_open(OS_AccessFlag_Read|OS_AccessFlag_ShareRead|OS_AccessFlag_ShareWrite, msg->string); file_contents = os_string_from_file_range(scratch.arena, file, r1u64(0, pre_load_props.size)); - lang_kind = txti_lang_kind_from_extension(str8_skip_last_dot(msg->string)); + lang_kind = txt_lang_kind_from_extension(str8_skip_last_dot(msg->string)); os_file_close(file); FileProperties post_load_props = os_properties_from_file_path(msg->string); load_valid = (post_load_props.modified == pre_load_props.modified); @@ -646,16 +621,7 @@ txti_mut_thread_entry_point(void *p) } //- rjf: nonzero lang kind -> unpack lang info - TXTI_LangLexFunctionType *lex_function = 0; - switch(lang_kind) - { - default:{}break; - case TXTI_LangKind_C: - case TXTI_LangKind_CPlusPlus: - { - lex_function = txt_token_array_from_string__c_cpp; - }break; - } + TXT_LangLexFunctionType *lex_function = txt_lex_function_from_lang_kind(lang_kind); //- rjf: detect line end kind TXT_LineEndKind line_end_kind = TXT_LineEndKind_Null; @@ -739,7 +705,7 @@ txti_mut_thread_entry_point(void *p) { entity->line_end_kind = line_end_kind; } - if(lang_kind != TXTI_LangKind_Null) + if(lang_kind != TXT_LangKind_Null) { entity->lang_kind = lang_kind; } diff --git a/src/txti/txti.h b/src/txti/txti.h index 18df67e3..48b1844b 100644 --- a/src/txti/txti.h +++ b/src/txti/txti.h @@ -56,20 +56,6 @@ struct TXTI_Handle U64 u64[2]; }; -//////////////////////////////// -//~ rjf: Language Kinds - -typedef enum TXTI_LangKind -{ - TXTI_LangKind_Null, - TXTI_LangKind_C, - TXTI_LangKind_CPlusPlus, - TXTI_LangKind_COUNT -} -TXTI_LangKind; - -typedef TXT_TokenArray TXTI_LangLexFunctionType(Arena *arena, U64 *bytes_processed_counter, String8 string); - //////////////////////////////// //~ rjf: Buffer Entity Types @@ -106,7 +92,7 @@ struct TXTI_Entity // rjf: metadata TXT_LineEndKind line_end_kind; - TXTI_LangKind lang_kind; + TXT_LangKind lang_kind; U64 bytes_processed; U64 bytes_to_process; U64 working_count; @@ -157,7 +143,7 @@ struct TXTI_BufferInfo String8 path; U64 timestamp; TXT_LineEndKind line_end_kind; - TXTI_LangKind lang_kind; + TXT_LangKind lang_kind; U64 total_line_count; U64 last_line_size; U64 max_line_size; @@ -258,7 +244,6 @@ internal void txti_init(void); //~ rjf: Basic Helpers internal U64 txti_hash_from_string(String8 string); -internal TXTI_LangKind txti_lang_kind_from_extension(String8 extension); //////////////////////////////// //~ rjf: Message Type Functions