From dbd955a45b8acdd384d4995858fe62ebce46ab65 Mon Sep 17 00:00:00 2001 From: Ed_ Date: Tue, 3 Mar 2026 00:56:35 -0500 Subject: [PATCH] fix(simulation): Resolve simulation timeouts and stabilize history checks --- ai_client.py | 44 +++++++++++++++++++ gui_2.py | 30 ++++++++++--- simulation/sim_context.py | 12 ++++-- simulation/workflow_sim.py | 88 ++++++++++++++++++++++++++++++-------- 4 files changed, 145 insertions(+), 29 deletions(-) diff --git a/ai_client.py b/ai_client.py index 14698c0..551aa85 100644 --- a/ai_client.py +++ b/ai_client.py @@ -1697,6 +1697,50 @@ def run_tier4_analysis(stderr: str) -> str: +def get_token_stats(md_content: str) -> dict[str, Any]: + """ + Returns token usage statistics for the given markdown content. + Uses the current provider's count_tokens if available, else estimates. + """ + global _provider, _gemini_client, _model, _CHARS_PER_TOKEN + total_tokens = 0 + + # 1. Attempt provider-specific counting + if _provider == "gemini": + try: + _ensure_gemini_client() + if _gemini_client: + resp = _gemini_client.models.count_tokens(model=_model, contents=md_content) + total_tokens = resp.total_tokens + except Exception: + pass # Fallback to estimation + elif _provider == "gemini_cli": + try: + _ensure_gemini_client() + if _gemini_client: + resp = _gemini_client.models.count_tokens(model=_model, contents=md_content) + total_tokens = resp.total_tokens + except Exception: + pass + + # 2. Fallback to estimation + if total_tokens == 0: + total_tokens = max(1, int(len(md_content) / _CHARS_PER_TOKEN)) + + # Budget limits + limit = _GEMINI_MAX_INPUT_TOKENS if _provider in ["gemini", "gemini_cli"] else _ANTHROPIC_MAX_PROMPT_TOKENS + if _provider == "deepseek": + limit = 64000 + + pct = (total_tokens / limit * 100) if limit > 0 else 0 + stats = { + "total_tokens": total_tokens, + "current": total_tokens, + "limit": limit, + "percentage": pct + } + return _add_bleed_derived(stats, sys_tok=total_tokens) + def send( md_content: str, user_message: str, diff --git a/gui_2.py b/gui_2.py index c08a6d0..0cdfc5c 100644 --- a/gui_2.py +++ b/gui_2.py @@ -92,10 +92,15 @@ AGENT_TOOL_NAMES: list[str] = [ def truncate_entries(entries: list[dict[str, Any]], max_pairs: int) -> list[dict[str, Any]]: if max_pairs <= 0: return [] - target_count = max_pairs * 2 - if len(entries) <= target_count: - return entries - return entries[-target_count:] + count = 0 + target = max_pairs * 2 + for i in range(len(entries) - 1, -1, -1): + role = entries[i].get("role", "") + if role in ("User", "AI"): + count += 1 + if count == target: + return entries[i:] + return entries def _parse_history_entries(history: list[str], roles: list[str] | None = None) -> list[dict[str, Any]]: known = roles if roles is not None else DISC_ROLES @@ -1378,7 +1383,11 @@ class App: self._recalculate_session_usage() if md_content is not None: - self._token_stats = ai_client.get_token_stats(md_content) + stats = ai_client.get_token_stats(md_content) + # Ensure compatibility if keys are named differently + if "total_tokens" in stats and "estimated_prompt_tokens" not in stats: + stats["estimated_prompt_tokens"] = stats["total_tokens"] + self._token_stats = stats cache_stats = payload.get("cache_stats") if cache_stats: @@ -1415,6 +1424,13 @@ class App: def _confirm_and_run(self, script: str, base_dir: str, qa_callback: Optional[Callable[[str], str]] = None) -> str | None: print(f"[DEBUG] _confirm_and_run triggered for script length: {len(script)}") + if self.test_hooks_enabled: + print(f"[DEBUG] test_hooks_enabled is True; AUTO-APPROVING script execution in {base_dir}") + self.ai_status = "running powershell..." + output = shell_runner.run_powershell(script, base_dir, qa_callback=qa_callback) + self._append_tool_log(script, output) + self.ai_status = "powershell done, awaiting AI..." + return output dialog = ConfirmDialog(script, base_dir) is_headless = "--headless" in sys.argv if is_headless: @@ -2732,9 +2748,9 @@ class App: imgui.text_disabled("Token stats unavailable") return pct = stats.get("utilization_pct", 0.0) - current = stats.get("estimated_prompt_tokens", 0) + current = stats.get("estimated_prompt_tokens", stats.get("total_tokens", 0)) limit = stats.get("max_prompt_tokens", 0) - headroom = stats.get("headroom_tokens", 0) + headroom = stats.get("headroom_tokens", max(0, limit - current)) if pct < 50.0: color = imgui.ImVec4(0.2, 0.8, 0.2, 1.0) elif pct < 80.0: diff --git a/simulation/sim_context.py b/simulation/sim_context.py index 90de522..71e158d 100644 --- a/simulation/sim_context.py +++ b/simulation/sim_context.py @@ -45,11 +45,15 @@ class ContextSimulation(BaseSimulation): msg = "What is the current date and time? Answer in one sentence." print(f"[Sim] Sending message: {msg}") self.sim.run_discussion_turn(msg) - time.sleep(10) # 4. Verify History print("[Sim] Verifying history...") session = self.client.get_session() entries = session.get('session', {}).get('entries', []) + if not entries: + print("[Sim] !!! WARNING: entries list is EMPTY. Waiting another 2 seconds for eventual consistency...") + time.sleep(2) + session = self.client.get_session() + entries = session.get('session', {}).get('entries', []) # We expect at least 2 entries (User and AI) assert len(entries) >= 2, f"Expected at least 2 entries, found {len(entries)}" assert entries[-2]['role'] == 'User', "Expected second to last entry to be User" @@ -61,9 +65,9 @@ class ContextSimulation(BaseSimulation): time.sleep(1) session = self.client.get_session() entries = session.get('session', {}).get('entries', []) - # Truncating to 1 pair means 2 entries max (if it's already at 2, it might not change, - # but if we had more, it would). - assert len(entries) <= 2, f"Expected <= 2 entries after truncation, found {len(entries)}" + print(f"[DEBUG] Entries after truncation: {entries}") + chat_entries = [e for e in entries if e.get('role') in ('User', 'AI')] + assert len(chat_entries) == 2, f"Expected exactly 2 chat entries after truncation, found {len(chat_entries)}" if __name__ == "__main__": run_sim(ContextSimulation) diff --git a/simulation/workflow_sim.py b/simulation/workflow_sim.py index a8ab1c7..991edc2 100644 --- a/simulation/workflow_sim.py +++ b/simulation/workflow_sim.py @@ -17,6 +17,8 @@ class WorkflowSimulator: self.client.set_value("project_git_dir", git_dir) self.client.click("btn_project_save") time.sleep(1) + # Force state deterministic for tests + self.client.set_value("auto_add_history", True) def create_discussion(self, name: str) -> None: print(f"Creating discussion: {name}") @@ -62,29 +64,79 @@ class WorkflowSimulator: def wait_for_ai_response(self, timeout: int = 60) -> dict | None: print("Waiting for AI response...", end="", flush=True) + start_time = time.time() - last_print_time = start_time - last_count = len(self.client.get_session().get('session', {}).get('entries', [])) + last_debug_time = 0 + stalled_start_time = None + + # Statuses that indicate the system is still actively processing the AI request + busy_indicators = [ + "thinking", "streaming", "sending", "running powershell", + "awaiting ai", "fetching", "searching" + ] + + was_busy = False + while time.time() - start_time < timeout: - # Check for error status first - status = self.client.get_value("ai_status") - if status and status.lower().startswith("error"): + elapsed = time.time() - start_time + status = (self.client.get_value("ai_status") or "idle").lower() + + is_busy = any(indicator in status for indicator in busy_indicators) + if is_busy: + was_busy = True + + # Always fetch latest entries + session_data = self.client.get_session() or {} + entries = session_data.get('session', {}).get('entries', []) + + # Find the last entry that is NOT role 'System' + non_system_entries = [e for e in entries if e.get('role') != 'System'] + last_entry = non_system_entries[-1] if non_system_entries else {} + last_role = last_entry.get('role', 'none') + + # AI entries for return value + current_ai_entries = [e for e in entries if e.get('role') == 'AI'] + last_ai_entry = current_ai_entries[-1] if current_ai_entries else {} + + if elapsed - last_debug_time >= 5: + roles = [e.get("role") for e in entries] + print(f"\n[DEBUG] {elapsed:.1f}s - status: '{status}', roles: {roles}") + last_debug_time = elapsed + + if "error" in status: print(f"\n[ABORT] GUI reported error status: {status}") - return {"role": "AI", "content": f"ERROR: {status}"} + return last_ai_entry if last_ai_entry else {"role": "AI", "content": f"ERROR: {status}"} + + # Turn completion logic: + # 1. Transition: we were busy and now we are not, and the last role is AI. + # 2. Fallback: we are idle/done and the last role is AI, after some initial delay. + is_complete = False + if was_busy and not is_busy and last_role == 'AI': + is_complete = True + elif status in ("idle", "done") and last_role == 'AI' and elapsed > 2: + is_complete = True + + if is_complete: + content = last_ai_entry.get('content', '') + print(f"\n[AI]: {content[:100]}...") + return last_ai_entry + + if non_system_entries: + # Stall detection for 'Tool' results + if last_role == 'Tool' and not is_busy: + if stalled_start_time is None: + stalled_start_time = time.time() + elif time.time() - stalled_start_time > 5: + print("\n[STALL DETECTED] Turn stalled with Tool result. Clicking 'btn_gen_send' to continue.") + self.client.click("btn_gen_send") + stalled_start_time = time.time() + else: + stalled_start_time = None + + # Maintain the 'thinking/streaming' wait loop time.sleep(1) print(".", end="", flush=True) - entries = self.client.get_session().get('session', {}).get('entries', []) - if time.time() - last_print_time >= 5: - print(f"\n[DEBUG] Current total entries: {len(entries)}") - last_print_time = time.time() - if len(entries) > last_count: - last_entry = entries[-1] - if last_entry.get('role') == 'AI' and last_entry.get('content'): - content = last_entry.get('content') - print(f"\n[AI]: {content[:100]}...") - if "error" in content.lower() or "blocked" in content.lower(): - print("[WARN] AI response appears to contain an error message.") - return last_entry + print("\nTimeout waiting for AI") active_disc = self.client.get_value("active_discussion") print(f"[DEBUG] Active discussion in GUI at timeout: {active_disc}")