fixes for anthropic client? Still sucks (need to do better with token throughput)
ai_client.py (148 changed lines)
@@ -32,7 +32,7 @@ MAX_TOOL_ROUNDS = 10
 
 # Maximum characters per text chunk sent to Anthropic.
 # Kept well under the ~200k token API limit.
-_ANTHROPIC_CHUNK_SIZE = 180_000
+_ANTHROPIC_CHUNK_SIZE = 120_000
 
 _SYSTEM_PROMPT = (
     "You are a helpful coding assistant with access to a PowerShell tool and MCP tools (file access: read_file, list_directory, search_files, get_file_summary, web access: web_search, fetch_url). "
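
Back-of-the-envelope on the chunk-size change, using the ~3.5 chars/token ratio the next hunk introduces (a rough sketch; real tokeniser ratios vary with content):

    # Rough per-chunk cost at the assumed 3.5 chars/token ratio.
    old_chunk_tokens = int(180_000 / 3.5)  # ~51,428 tokens per chunk
    new_chunk_tokens = int(120_000 / 3.5)  # ~34,285 tokens per chunk
    # Smaller chunks leave more of the ~200k-token window for history,
    # tool schemas, and the response.
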
@@ -538,6 +538,139 @@ def _send_gemini(md_content: str, user_message: str, base_dir: str, file_items:
         raise _classify_gemini_error(exc) from exc
 
 
+
+# ------------------------------------------------------------------ anthropic history management
+
+# Rough chars-per-token ratio. Anthropic tokeniser averages ~3.5-4 chars/token.
+# We use 3.5 to be conservative (overestimate token count = safer).
+_CHARS_PER_TOKEN = 3.5
+
+# Maximum token budget for the entire prompt (system + tools + messages).
+# Anthropic's limit is 200k. We leave headroom for the response + tool schemas.
+_ANTHROPIC_MAX_PROMPT_TOKENS = 180_000
+
+# Marker prefix used to identify stale file-refresh injections in history
+_FILE_REFRESH_MARKER = "[FILES UPDATED"
+
+
+def _estimate_message_tokens(msg: dict) -> int:
+    """Rough token estimate for a single Anthropic message dict."""
+    total_chars = 0
+    content = msg.get("content", "")
+    if isinstance(content, str):
+        total_chars += len(content)
+    elif isinstance(content, list):
+        for block in content:
+            if isinstance(block, dict):
+                text = block.get("text", "") or block.get("content", "")
+                if isinstance(text, str):
+                    total_chars += len(text)
+                # tool_use input
+                inp = block.get("input")
+                if isinstance(inp, dict):
+                    import json as _json
+                    total_chars += len(_json.dumps(inp, ensure_ascii=False))
+            elif isinstance(block, str):
+                total_chars += len(block)
+    return max(1, int(total_chars / _CHARS_PER_TOKEN))
+
+
+def _estimate_prompt_tokens(system_blocks: list[dict], history: list[dict]) -> int:
+    """Estimate total prompt tokens: system + tools + all history messages."""
+    total = 0
+    # System blocks
+    for block in system_blocks:
+        text = block.get("text", "")
+        total += max(1, int(len(text) / _CHARS_PER_TOKEN))
+    # Tool definitions (rough fixed estimate — they're ~2k tokens for our set)
+    total += 2500
+    # History messages
+    for msg in history:
+        total += _estimate_message_tokens(msg)
+    return total
+
+
+def _strip_stale_file_refreshes(history: list[dict]):
+    """
+    Remove [FILES UPDATED ...] text blocks from all history turns EXCEPT
+    the very last user message. These are stale snapshots from previous
+    tool rounds that bloat the context without providing value.
+    """
+    if len(history) < 2:
+        return
+    # Find the index of the last user message — we keep its file refresh intact
+    last_user_idx = -1
+    for i in range(len(history) - 1, -1, -1):
+        if history[i].get("role") == "user":
+            last_user_idx = i
+            break
+    for i, msg in enumerate(history):
+        if msg.get("role") != "user" or i == last_user_idx:
+            continue
+        content = msg.get("content")
+        if not isinstance(content, list):
+            continue
+        cleaned = []
+        for block in content:
+            if isinstance(block, dict) and block.get("type") == "text":
+                text = block.get("text", "")
+                if text.startswith(_FILE_REFRESH_MARKER):
+                    continue  # drop this stale file refresh block
+            cleaned.append(block)
+        if len(cleaned) < len(content):
+            msg["content"] = cleaned
+
+
+def _trim_anthropic_history(system_blocks: list[dict], history: list[dict]):
+    """
+    Trim the Anthropic history to fit within the token budget.
+    Strategy:
+      1. Strip stale file-refresh injections from old turns.
+      2. If still over budget, drop oldest turn pairs (user + assistant).
+    Returns the number of messages dropped.
+    """
+    # Phase 1: strip stale file refreshes
+    _strip_stale_file_refreshes(history)
+
+    est = _estimate_prompt_tokens(system_blocks, history)
+    if est <= _ANTHROPIC_MAX_PROMPT_TOKENS:
+        return 0
+
+    # Phase 2: drop oldest turn pairs until within budget
+    dropped = 0
+    while len(history) > 2 and est > _ANTHROPIC_MAX_PROMPT_TOKENS:
+        # Always drop from the front in pairs (user, assistant) to maintain alternation
+        # But be careful: the first message might be user, followed by assistant
+        if history[0].get("role") == "user" and len(history) > 1 and history[1].get("role") == "assistant":
+            removed_user = history.pop(0)
+            removed_asst = history.pop(0)
+            dropped += 2
+            est -= _estimate_message_tokens(removed_user)
+            est -= _estimate_message_tokens(removed_asst)
+            # If the next message is a user tool_result that belonged to the dropped assistant,
+            # we need to drop it too to avoid dangling tool_results
+            while history and history[0].get("role") == "user":
+                content = history[0].get("content", [])
+                if isinstance(content, list) and content and isinstance(content[0], dict) and content[0].get("type") == "tool_result":
+                    removed_tr = history.pop(0)
+                    dropped += 1
+                    est -= _estimate_message_tokens(removed_tr)
+                    # And the assistant reply that followed it
+                    if history and history[0].get("role") == "assistant":
+                        removed_a2 = history.pop(0)
+                        dropped += 1
+                        est -= _estimate_message_tokens(removed_a2)
+                else:
+                    break
+        else:
+            # Edge case: history starts with something unexpected. Drop one message.
+            removed = history.pop(0)
+            dropped += 1
+            est -= _estimate_message_tokens(removed)
+
+    return dropped
+
+
 # ------------------------------------------------------------------ anthropic
 
 def _ensure_anthropic_client():
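
A quick way to sanity-check the new trimming path is to run the two estimators against a synthetic oversized history. This is a sketch, not a test from the repo: it assumes the helpers above are importable from ai_client, and the padded payload is hypothetical.

    from ai_client import (
        _ANTHROPIC_MAX_PROMPT_TOKENS,
        _estimate_prompt_tokens,
        _trim_anthropic_history,
    )

    system_blocks = [{"type": "text", "text": "You are a helpful coding assistant."}]

    # Four oversized turn pairs (~100k estimated tokens per message) plus a
    # short final user message; the total blows the 180k-token budget.
    padding = "x" * 350_000  # hypothetical payload
    history = []
    for i in range(4):
        history.append({"role": "user", "content": f"question {i}: {padding}"})
        history.append({"role": "assistant", "content": f"answer {i}: {padding}"})
    history.append({"role": "user", "content": "final question"})

    before = _estimate_prompt_tokens(system_blocks, history)
    dropped = _trim_anthropic_history(system_blocks, history)
    after = _estimate_prompt_tokens(system_blocks, history)

    assert dropped > 0 and dropped % 2 == 0            # whole pairs, for this synthetic shape
    assert after <= _ANTHROPIC_MAX_PROMPT_TOKENS       # back within budget
    assert history[-1]["content"] == "final question"  # newest turn preserved
    print(f"dropped={dropped}, estimate {before} -> {after} tokens")

With this shape the loop drops all four old pairs and keeps only the final user message, since even one surviving pair would leave the estimate above the budget.
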
@@ -640,9 +773,20 @@ def _send_anthropic(md_content: str, user_message: str, base_dir: str, file_item
 
     # We allow MAX_TOOL_ROUNDS, plus 1 final loop to get the text synthesis
     for round_idx in range(MAX_TOOL_ROUNDS + 2):
+        # Trim history to fit within token budget before each API call
+        dropped = _trim_anthropic_history(system_blocks, _anthropic_history)
+        if dropped > 0:
+            est_tokens = _estimate_prompt_tokens(system_blocks, _anthropic_history)
+            _append_comms("OUT", "request", {
+                "message": (
+                    f"[HISTORY TRIMMED: dropped {dropped} old messages to fit token budget. "
+                    f"Estimated {est_tokens} tokens remaining. {len(_anthropic_history)} messages in history.]"
+                ),
+            })
+
         response = _anthropic_client.messages.create(
             model=_model,
-            max_tokens=8096,
+            max_tokens=16384,
             system=system_blocks,
             tools=_build_anthropic_tools(),
             messages=_anthropic_history,
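
The dangling-tool_result guard inside the drop loop matters because the Messages API rejects a tool_result block whose tool_use_id has no matching tool_use in the preceding assistant turn. A tool round spans three messages, so dropping only the first pair would strand the third. The shapes below are illustrative; the id is made up:

    turn = [
        {"role": "user", "content": "list the repo files"},
        {"role": "assistant", "content": [
            {"type": "tool_use", "id": "toolu_123", "name": "list_directory",
             "input": {"path": "."}},
        ]},
        # If the pair above is trimmed away, this message must go too:
        {"role": "user", "content": [
            {"type": "tool_result", "tool_use_id": "toolu_123",
             "content": "ai_client.py\nREADME.md"},
        ]},
    ]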