diff --git a/ai_client.py b/ai_client.py
index 277e6f2..4ca350d 100644
--- a/ai_client.py
+++ b/ai_client.py
@@ -32,7 +32,7 @@ MAX_TOOL_ROUNDS = 10
 
 # Maximum characters per text chunk sent to Anthropic.
 # Kept well under the ~200k token API limit.
-_ANTHROPIC_CHUNK_SIZE = 180_000
+_ANTHROPIC_CHUNK_SIZE = 120_000
 
 _SYSTEM_PROMPT = (
     "You are a helpful coding assistant with access to a PowerShell tool and MCP tools (file access: read_file, list_directory, search_files, get_file_summary, web access: web_search, fetch_url). "
@@ -538,6 +538,139 @@ def _send_gemini(md_content: str, user_message: str, base_dir: str, file_items:
         raise _classify_gemini_error(exc) from exc
 
 
+
+# ------------------------------------------------------------------ anthropic history management
+
+# Rough chars-per-token ratio. The Anthropic tokeniser averages ~3.5-4 chars/token.
+# We use 3.5 to be conservative (overestimating the token count is the safe direction).
+_CHARS_PER_TOKEN = 3.5
+
+# Maximum token budget for the entire prompt (system + tools + messages).
+# Anthropic's limit is 200k. We leave headroom for the response + tool schemas.
+_ANTHROPIC_MAX_PROMPT_TOKENS = 180_000
+
+# Marker prefix used to identify stale file-refresh injections in history.
+_FILE_REFRESH_MARKER = "[FILES UPDATED"
+
+
+def _estimate_message_tokens(msg: dict) -> int:
+    """Rough token estimate for a single Anthropic message dict."""
+    total_chars = 0
+    content = msg.get("content", "")
+    if isinstance(content, str):
+        total_chars += len(content)
+    elif isinstance(content, list):
+        for block in content:
+            if isinstance(block, dict):
+                text = block.get("text", "") or block.get("content", "")
+                if isinstance(text, str):
+                    total_chars += len(text)
+                # tool_use input is JSON-encoded on the wire, so count it that way
+                inp = block.get("input")
+                if isinstance(inp, dict):
+                    import json as _json
+                    total_chars += len(_json.dumps(inp, ensure_ascii=False))
+            elif isinstance(block, str):
+                total_chars += len(block)
+    return max(1, int(total_chars / _CHARS_PER_TOKEN))
+
+
+def _estimate_prompt_tokens(system_blocks: list[dict], history: list[dict]) -> int:
+    """Estimate total prompt tokens: system + tools + all history messages."""
+    total = 0
+    # System blocks
+    for block in system_blocks:
+        text = block.get("text", "")
+        total += max(1, int(len(text) / _CHARS_PER_TOKEN))
+    # Tool definitions (rough fixed estimate; they're ~2k tokens for our set)
+    total += 2500
+    # History messages
+    for msg in history:
+        total += _estimate_message_tokens(msg)
+    return total
+
+
+def _strip_stale_file_refreshes(history: list[dict]):
+    """
+    Remove [FILES UPDATED ...] text blocks from all history turns EXCEPT
+    the very last user message. These are stale snapshots from previous
+    tool rounds that bloat the context without providing value.
+    """
+    if len(history) < 2:
+        return
+    # Find the index of the last user message; its file refresh is kept intact
+    last_user_idx = -1
+    for i in range(len(history) - 1, -1, -1):
+        if history[i].get("role") == "user":
+            last_user_idx = i
+            break
+    for i, msg in enumerate(history):
+        if msg.get("role") != "user" or i == last_user_idx:
+            continue
+        content = msg.get("content")
+        if not isinstance(content, list):
+            continue
+        cleaned = []
+        for block in content:
+            if isinstance(block, dict) and block.get("type") == "text":
+                text = block.get("text", "")
+                if text.startswith(_FILE_REFRESH_MARKER):
+                    continue  # drop this stale file-refresh block
+            cleaned.append(block)
+        if len(cleaned) < len(content):
+            msg["content"] = cleaned or [{"type": "text", "text": "[stale file refresh removed]"}]  # the API rejects empty content
+
+
+def _trim_anthropic_history(system_blocks: list[dict], history: list[dict]):
+    """
+    Trim the Anthropic history to fit within the token budget.
+    Strategy:
+      1. Strip stale file-refresh injections from old turns.
+      2. If still over budget, drop oldest turn pairs (user + assistant).
+    Returns the number of messages dropped.
+    """
+    # Phase 1: strip stale file refreshes
+    _strip_stale_file_refreshes(history)
+
+    est = _estimate_prompt_tokens(system_blocks, history)
+    if est <= _ANTHROPIC_MAX_PROMPT_TOKENS:
+        return 0
+
+    # Phase 2: drop oldest turn pairs until within budget
+    dropped = 0
+    while len(history) > 2 and est > _ANTHROPIC_MAX_PROMPT_TOKENS:
+        # Drop from the front in pairs (user + assistant) to preserve role alternation.
+        # Normal case: a user turn followed by its assistant reply.
+        if history[0].get("role") == "user" and history[1].get("role") == "assistant":
+            removed_user = history.pop(0)
+            removed_asst = history.pop(0)
+            dropped += 2
+            est -= _estimate_message_tokens(removed_user)
+            est -= _estimate_message_tokens(removed_asst)
+            # If the next message is a user tool_result belonging to the dropped
+            # assistant turn, drop it too so no dangling tool_result is left behind
+            while history and history[0].get("role") == "user":
+                content = history[0].get("content", [])
+                if isinstance(content, list) and content and isinstance(content[0], dict) and content[0].get("type") == "tool_result":
+                    removed_tr = history.pop(0)
+                    dropped += 1
+                    est -= _estimate_message_tokens(removed_tr)
+                    # And the assistant reply that followed it
+                    if history and history[0].get("role") == "assistant":
+                        removed_a2 = history.pop(0)
+                        dropped += 1
+                        est -= _estimate_message_tokens(removed_a2)
+                else:
+                    break
+        else:
+            # Edge case: history starts with something unexpected. Drop one message.
+            removed = history.pop(0)
+            dropped += 1
+            est -= _estimate_message_tokens(removed)
+
+    return dropped
+
+
 # ------------------------------------------------------------------ anthropic
 
 def _ensure_anthropic_client():
@@ -640,9 +773,20 @@ def _send_anthropic(md_content: str, user_message: str, base_dir: str, file_item
 
     # We allow MAX_TOOL_ROUNDS, plus 1 final loop to get the text synthesis
     for round_idx in range(MAX_TOOL_ROUNDS + 2):
+        # Trim history to fit within the token budget before each API call
+        dropped = _trim_anthropic_history(system_blocks, _anthropic_history)
+        if dropped > 0:
+            est_tokens = _estimate_prompt_tokens(system_blocks, _anthropic_history)
+            _append_comms("OUT", "request", {
+                "message": (
+                    f"[HISTORY TRIMMED: dropped {dropped} old messages to fit token budget. "
+                    f"Estimated {est_tokens} tokens remaining. {len(_anthropic_history)} messages in history.]"
+                ),
+            })
+
         response = _anthropic_client.messages.create(
             model=_model,
-            max_tokens=8096,
+            max_tokens=16384,
             system=system_blocks,
             tools=_build_anthropic_tools(),
             messages=_anthropic_history,
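
A quick sanity check of the char-based estimator in the first hunk. This is a minimal sketch, assuming ai_client imports cleanly outside the app; the message mirrors Anthropic's content-block format, and the tool_use block (its id, name, and input) is fabricated for illustration.

from ai_client import _estimate_message_tokens

msg = {
    "role": "assistant",
    "content": [
        {"type": "text", "text": "x" * 700},
        # Hypothetical tool call; its input dict is JSON-encoded and counted too.
        {"type": "tool_use", "id": "toolu_1", "name": "read_file",
         "input": {"path": "ai_client.py"}},
    ],
}

print(_estimate_message_tokens(msg))  # 206 = int((700 + 24) / 3.5)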
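
The stripping helper keeps only the newest [FILES UPDATED ...] snapshot. A minimal sketch under the same import assumption; the history payloads and dates below are fabricated.

from ai_client import _strip_stale_file_refreshes

history = [
    {"role": "user", "content": [
        {"type": "text", "text": "[FILES UPDATED 2024-01-01]\nfoo.py ..."},
        {"type": "text", "text": "please refactor foo"},
    ]},
    {"role": "assistant", "content": [{"type": "text", "text": "done"}]},
    {"role": "user", "content": [
        {"type": "text", "text": "[FILES UPDATED 2024-01-02]\nfoo.py ..."},
        {"type": "text", "text": "now add tests"},
    ]},
]

_strip_stale_file_refreshes(history)

assert len(history[0]["content"]) == 1                                # old snapshot dropped
assert history[2]["content"][0]["text"].startswith("[FILES UPDATED")  # latest snapshot kept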
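
And the end-to-end trim: once the estimate exceeds _ANTHROPIC_MAX_PROMPT_TOKENS, the oldest user/assistant pair goes first. A sketch under the same assumptions; the oversized payload is synthetic.

import ai_client

history = [
    {"role": "user", "content": "old question " * 50_000},  # ~650k chars, well over the 180k-token budget
    {"role": "assistant", "content": "old answer"},
    {"role": "user", "content": "current question"},
]
system_blocks = [{"type": "text", "text": "You are a helpful coding assistant."}]

dropped = ai_client._trim_anthropic_history(system_blocks, history)
print(dropped)       # 2: the oldest user/assistant pair was removed
print(len(history))  # 1: only the current question remains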
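
If the 3.5 chars/token heuristic ever needs validating, the Anthropic Python SDK also exposes a server-side counting endpoint. A hedged sketch, not something this patch relies on: it assumes a current SDK, an API key in the environment, and the model alias below is illustrative.

import anthropic

client = anthropic.Anthropic()
count = client.messages.count_tokens(
    model="claude-3-5-sonnet-latest",
    system=[{"type": "text", "text": "You are a helpful coding assistant."}],
    messages=[{"role": "user", "content": "hello"}],
)
print(count.input_tokens)  # exact prompt token count, for comparison against the estimate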