From bcbb4142cf48eef511399d60e333058c34e94a21 Mon Sep 17 00:00:00 2001
From: Ryan Fleury
Date: Thu, 28 Mar 2024 12:01:14 -0700
Subject: [PATCH] plug in x64 intel-syntax disassembly 'language' into text
 analysis cache layer, to use that layer to naturally do parser/highlighter
 work for disassembly

---
 src/df/gfx/df_view_rule_hooks.c |   2 +-
 src/text_cache/text_cache.c     | 214 +++++++++++++++++++++++++++++++-
 src/text_cache/text_cache.h     |   2 +
 3 files changed, 214 insertions(+), 4 deletions(-)

diff --git a/src/df/gfx/df_view_rule_hooks.c b/src/df/gfx/df_view_rule_hooks.c
index 079517bd..f97a1ec7 100644
--- a/src/df/gfx/df_view_rule_hooks.c
+++ b/src/df/gfx/df_view_rule_hooks.c
@@ -829,7 +829,7 @@ DF_GFX_VIEW_RULE_BLOCK_UI_FUNCTION_DEF(disasm)
     {
       U128 dasm_text_hash = hs_hash_from_key(dasm_info.text_key, rewind_idx);
       dasm_text_data = hs_data_from_hash(hs_scope, dasm_text_hash);
-      dasm_text_info = txt_text_info_from_hash_lang(txt_scope, dasm_text_hash, TXT_LangKind_Null);
+      dasm_text_info = txt_text_info_from_hash_lang(txt_scope, dasm_text_hash, TXT_LangKind_DisasmX64Intel);
       if(dasm_text_info.lines_count != 0)
       {
         break;
diff --git a/src/text_cache/text_cache.c b/src/text_cache/text_cache.c
index cdfd0784..cca92ef5 100644
--- a/src/text_cache/text_cache.c
+++ b/src/text_cache/text_cache.c
@@ -40,9 +40,10 @@ txt_lex_function_from_lang_kind(TXT_LangKind kind)
   switch(kind)
   {
     default:{}break;
-    case TXT_LangKind_C:         {fn = txt_token_array_from_string__c_cpp;}break;
-    case TXT_LangKind_CPlusPlus: {fn = txt_token_array_from_string__c_cpp;}break;
-    case TXT_LangKind_Odin:      {fn = txt_token_array_from_string__odin;}break;
+    case TXT_LangKind_C:             {fn = txt_token_array_from_string__c_cpp;}break;
+    case TXT_LangKind_CPlusPlus:     {fn = txt_token_array_from_string__c_cpp;}break;
+    case TXT_LangKind_Odin:          {fn = txt_token_array_from_string__odin;}break;
+    case TXT_LangKind_DisasmX64Intel:{fn = txt_token_array_from_string__disasm_x64_intel;}break;
   }
   return fn;
 }
@@ -624,6 +625,213 @@ txt_token_array_from_string__odin(Arena *arena, U64 *bytes_processed_counter, St
   return result;
 }
 
+// Lexes x64 Intel-syntax disassembly text into a flat token array.
+//
+// The lexer is a byte-at-a-time state machine: `active_token_kind` is the kind
+// of the token currently being scanned (Null between tokens), and each loop
+// iteration either opens a token, extends it, or ends it (`ender_found`). The
+// loop runs one step past the last byte (off == string.size, where `byte` reads
+// as 0) so that a token still open at end-of-input is flushed.
+//
+// arena:                   passed to the token chunk list & token array builders.
+// bytes_processed_counter: not referenced by this lexer.
+// string:                  text to lex; token ranges are byte offsets into it,
+//                          clamped so that they never end past string.size.
+internal TXT_TokenArray
+txt_token_array_from_string__disasm_x64_intel(Arena *arena, U64 *bytes_processed_counter, String8 string)
+{
+  // NOTE: nothing in this function allocates from `scratch`
+  Temp scratch = scratch_begin(&arena, 1);
+
+  //- rjf: parse tokens
+  TXT_TokenChunkList tokens = {0};
+  {
+    TXT_TokenKind active_token_kind = TXT_TokenKind_Null;
+    U64 active_token_start_off = 0;
+    U64 off = 0;
+    B32 escaped = 0;        // previous byte of the active string was an unescaped backslash
+    B32 string_is_char = 0; // active string was opened by ' (and closes on ') rather than "
+    for(U64 advance = 0; off <= string.size; off += advance)
+    {
+      U8 byte      = (off+0 < string.size) ? string.str[off+0] : 0;
+      U8 next_byte = (off+1 < string.size) ? string.str[off+1] : 0;
+      B32 ender_found = 0;
+      advance = (active_token_kind != TXT_TokenKind_Null ? 1 : 0);
+      if(off == string.size && active_token_kind != TXT_TokenKind_Null)
+      {
+        ender_found = 1;
+        advance = 1;
+      }
+      switch(active_token_kind)
+      {
+        // between tokens: classify the byte & open a token of the matching kind
+        default:
+        case TXT_TokenKind_Null:
+        {
+          if(byte == ' ' || byte == '\t' || byte == '\v' || byte == '\f' || byte == '\r' || byte == '\n')
+          {
+            active_token_start_off = off;
+            active_token_kind = TXT_TokenKind_Whitespace;
+            advance = 1;
+          }
+          else if(('a' <= byte && byte <= 'z') || ('A' <= byte && byte <= 'Z') || byte == '_')
+          {
+            active_token_start_off = off;
+            active_token_kind = TXT_TokenKind_Identifier;
+            advance = 1;
+          }
+          else if(byte == '\'')
+          {
+            active_token_start_off = off;
+            active_token_kind = TXT_TokenKind_String;
+            advance = 1;
+            string_is_char = 1;
+          }
+          else if(byte == '"')
+          {
+            active_token_start_off = off;
+            active_token_kind = TXT_TokenKind_String;
+            advance = 1;
+            string_is_char = 0;
+          }
+          else if(('0' <= byte && byte <= '9') || (byte == '.' && '0' <= next_byte && next_byte <= '9'))
+          {
+            active_token_start_off = off;
+            active_token_kind = TXT_TokenKind_Numeric;
+            advance = 1;
+          }
+          else if(byte == '~' || byte == '!' || byte == '%' || byte == '^' ||
+                  byte == '&' || byte == '*' || byte == '(' || byte == ')' ||
+                  byte == '-' || byte == '=' || byte == '+' || byte == '[' ||
+                  byte == ']' || byte == '{' || byte == '}' || byte == ';' ||
+                  byte == ':' || byte == '?' || byte == '/' || byte == '<' ||
+                  byte == '>' || byte == ',' || byte == '.')
+          {
+            active_token_start_off = off;
+            active_token_kind = TXT_TokenKind_Symbol;
+            advance = 1;
+          }
+          else
+          {
+            active_token_start_off = off;
+            active_token_kind = TXT_TokenKind_Error;
+            advance = 1;
+          }
+        }break;
+        // whitespace: extends over space/tab/\v/\f only, so a \r or \n always begins a new token
+        case TXT_TokenKind_Whitespace:
+        if(byte != ' ' && byte != '\t' && byte != '\v' && byte != '\f')
+        {
+          ender_found = 1;
+          advance = 0;
+        }break;
+        // identifier: extends over letters, digits, and underscores
+        case TXT_TokenKind_Identifier:
+        if((byte < 'a' || 'z' < byte) && (byte < 'A' || 'Z' < byte) && (byte < '0' || '9' < byte) && byte != '_')
+        {
+          ender_found = 1;
+          advance = 0;
+        }break;
+        // string: extends through the matching unescaped quote, which is included in the token
+        case TXT_TokenKind_String:
+        {
+          U8 ender_byte = string_is_char ? '\'' : '"';
+          if(!escaped && byte == ender_byte)
+          {
+            ender_found = 1;
+            advance = 1;
+          }
+          else if(escaped)
+          {
+            escaped = 0;
+            advance = 1;
+          }
+          else if(byte == '\\')
+          {
+            escaped = 1;
+            advance = 1;
+          }
+          else
+          {
+            U8 byte_class = utf8_class[byte>>3];
+            if(byte_class > 1)
+            {
+              advance = (U64)byte_class;
+              // advancing by the byte's class must not carry `off` past string.size
+              // (input cut off mid-sequence); otherwise the loop exits before the
+              // end-of-input flush and the open string token is silently dropped
+              if(off < string.size && advance > string.size - off)
+              {
+                advance = string.size - off;
+              }
+            }
+          }
+        }break;
+        // numeric: extends over digits, letters, and '.' (so hex digits & suffixes stay attached)
+        case TXT_TokenKind_Numeric:
+        if((byte < 'a' || 'z' < byte) && (byte < 'A' || 'Z' < byte) && (byte < '0' || '9' < byte) && byte != '.')
+        {
+          ender_found = 1;
+          advance = 0;
+        }break;
+        case TXT_TokenKind_Symbol:
+        if(1)
+        {
+          // NOTE(rjf): avoiding maximum munch rule for now
+          ender_found = 1;
+          advance = 0;
+        }
+        else if(byte != '~' && byte != '!' && byte != '#' && byte != '%' &&
+                byte != '^' && byte != '&' && byte != '*' && byte != '(' &&
+                byte != ')' && byte != '-' && byte != '=' && byte != '+' &&
+                byte != '[' && byte != ']' && byte != '{' && byte != '}' &&
+                byte != ';' && byte != ':' && byte != '?' && byte != '/' &&
+                byte != '<' && byte != '>' && byte != ',' && byte != '.')
+        {
+          ender_found = 1;
+          advance = 0;
+        }break;
+        case TXT_TokenKind_Error:
+        {
+          ender_found = 1;
+          advance = 0;
+        }break;
+      }
+      if(ender_found != 0)
+      {
+        // clamp: a string token left open at end-of-input is flushed with
+        // off == string.size and advance == 1, which would otherwise produce a
+        // range ending one byte past the end of `string`
+        U64 token_end_off = off+advance;
+        if(token_end_off > string.size)
+        {
+          token_end_off = string.size;
+        }
+        TXT_Token token = {active_token_kind, r1u64(active_token_start_off, token_end_off)};
+        if(active_token_kind == TXT_TokenKind_Identifier)
+        {
+          // NOTE: the instruction-name lookup is stubbed out; `identifier_is_instruction`
+          // is always 0, so identifiers are never promoted to keywords yet
+          String8 token_string = str8_substr(string, token.range);
+          B32 identifier_is_instruction = 0;
+          if(identifier_is_instruction)
+          {
+            token.kind = TXT_TokenKind_Keyword;
+          }
+        }
+        txt_token_chunk_list_push(arena, &tokens, 1024, &token);
+        active_token_kind = TXT_TokenKind_Null;
+        active_token_start_off = token.range.max;
+      }
+    }
+  }
+
+  //- rjf: token list -> token array
+  TXT_TokenArray result = txt_token_array_from_chunk_list(arena, &tokens);
+  scratch_end(scratch);
+  return result;
+}
+
 ////////////////////////////////
 //~ rjf: Main Layer Initialization
 
diff --git a/src/text_cache/text_cache.h b/src/text_cache/text_cache.h
index 20b5ec15..537c3659 100644
--- a/src/text_cache/text_cache.h
+++ b/src/text_cache/text_cache.h
@@ -113,6 +113,7 @@ typedef enum TXT_LangKind
   TXT_LangKind_C,
   TXT_LangKind_CPlusPlus,
   TXT_LangKind_Odin,
+  TXT_LangKind_DisasmX64Intel,
   TXT_LangKind_COUNT
 }
 TXT_LangKind;
@@ -248,6 +249,7 @@ internal TXT_TokenArray txt_token_array_from_list(Arena *arena, TXT_TokenList *l
 
 internal TXT_TokenArray txt_token_array_from_string__c_cpp(Arena *arena, U64 *bytes_processed_counter, String8 string);
 internal TXT_TokenArray txt_token_array_from_string__odin(Arena *arena, U64 *bytes_processed_counter, String8 string);
+internal TXT_TokenArray txt_token_array_from_string__disasm_x64_intel(Arena *arena, U64 *bytes_processed_counter, String8 string);
 
 ////////////////////////////////
 //~ rjf: Main Layer Initialization