fixes for anthropic client? Still sucks (need to do better with token throughput)

2026-02-22 01:28:06 -05:00
parent 7a32a5138f
commit b69338f880


@@ -32,7 +32,7 @@ MAX_TOOL_ROUNDS = 10
 # Maximum characters per text chunk sent to Anthropic.
 # Kept well under the ~200k token API limit.
-_ANTHROPIC_CHUNK_SIZE = 180_000
+_ANTHROPIC_CHUNK_SIZE = 120_000
 _SYSTEM_PROMPT = (
     "You are a helpful coding assistant with access to a PowerShell tool and MCP tools (file access: read_file, list_directory, search_files, get_file_summary, web access: web_search, fetch_url). "
@@ -538,6 +538,139 @@ def _send_gemini(md_content: str, user_message: str, base_dir: str, file_items:
         raise _classify_gemini_error(exc) from exc
+# ------------------------------------------------------------------ anthropic history management
+
+# Rough chars-per-token ratio. The Anthropic tokeniser averages ~3.5-4 chars/token.
+# We use 3.5 to be conservative (overestimating the token count is the safer direction).
+_CHARS_PER_TOKEN = 3.5
+
+# Maximum token budget for the entire prompt (system + tools + messages).
+# Anthropic's limit is 200k. We leave headroom for the response + tool schemas.
+_ANTHROPIC_MAX_PROMPT_TOKENS = 180_000
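+# Worked example of the budget arithmetic: 180_000 tokens * 3.5 chars/token is
+# roughly 630k characters of prompt budget, leaving ~20k of the 200k-token limit
+# as headroom for the model's response and the tool schemas.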
+# Marker prefix used to identify stale file-refresh injections in history.
+_FILE_REFRESH_MARKER = "[FILES UPDATED"
+
+
+def _estimate_message_tokens(msg: dict) -> int:
+    """Rough token estimate for a single Anthropic message dict."""
+    import json as _json
+
+    total_chars = 0
+    content = msg.get("content", "")
+    if isinstance(content, str):
+        total_chars += len(content)
+    elif isinstance(content, list):
+        for block in content:
+            if isinstance(block, dict):
+                text = block.get("text", "") or block.get("content", "")
+                if isinstance(text, str):
+                    total_chars += len(text)
+                # tool_use input counts against the prompt too
+                inp = block.get("input")
+                if isinstance(inp, dict):
+                    total_chars += len(_json.dumps(inp, ensure_ascii=False))
+            elif isinstance(block, str):
+                total_chars += len(block)
+    return max(1, int(total_chars / _CHARS_PER_TOKEN))
+
+
+def _estimate_prompt_tokens(system_blocks: list[dict], history: list[dict]) -> int:
+    """Estimate total prompt tokens: system + tools + all history messages."""
+    total = 0
+    # System blocks
+    for block in system_blocks:
+        text = block.get("text", "")
+        total += max(1, int(len(text) / _CHARS_PER_TOKEN))
+    # Tool definitions (rough fixed estimate; they're ~2k tokens for our set)
+    total += 2500
+    # History messages
+    for msg in history:
+        total += _estimate_message_tokens(msg)
+    return total
+
+
+def _strip_stale_file_refreshes(history: list[dict]):
+    """
+    Remove [FILES UPDATED ...] text blocks from all history turns EXCEPT
+    the very last user message. These are stale snapshots from previous
+    tool rounds that bloat the context without providing value.
+    """
+    if len(history) < 2:
+        return
+    # Find the index of the last user message; its file refresh stays intact.
+    last_user_idx = -1
+    for i in range(len(history) - 1, -1, -1):
+        if history[i].get("role") == "user":
+            last_user_idx = i
+            break
+    for i, msg in enumerate(history):
+        if msg.get("role") != "user" or i == last_user_idx:
+            continue
+        content = msg.get("content")
+        if not isinstance(content, list):
+            continue
+        cleaned = []
+        for block in content:
+            if isinstance(block, dict) and block.get("type") == "text":
+                text = block.get("text", "")
+                if text.startswith(_FILE_REFRESH_MARKER):
+                    continue  # drop this stale file-refresh block
+            cleaned.append(block)
+        if len(cleaned) < len(content):
+            msg["content"] = cleaned
+
+
+def _trim_anthropic_history(system_blocks: list[dict], history: list[dict]):
+    """
+    Trim the Anthropic history to fit within the token budget.
+
+    Strategy:
+      1. Strip stale file-refresh injections from old turns.
+      2. If still over budget, drop the oldest turn pairs (user + assistant).
+
+    Returns the number of messages dropped.
+    """
+    # Phase 1: strip stale file refreshes
+    _strip_stale_file_refreshes(history)
+    est = _estimate_prompt_tokens(system_blocks, history)
+    if est <= _ANTHROPIC_MAX_PROMPT_TOKENS:
+        return 0
+    # Phase 2: drop oldest turn pairs until within budget
+    dropped = 0
+    while len(history) > 2 and est > _ANTHROPIC_MAX_PROMPT_TOKENS:
+        # Drop from the front in (user, assistant) pairs to preserve alternation.
+        if history[0].get("role") == "user" and len(history) > 1 and history[1].get("role") == "assistant":
+            removed_user = history.pop(0)
+            removed_asst = history.pop(0)
+            dropped += 2
+            est -= _estimate_message_tokens(removed_user)
+            est -= _estimate_message_tokens(removed_asst)
+            # If the next message is a user tool_result that belonged to the
+            # dropped assistant turn, drop it too to avoid dangling tool_results,
+            # along with the assistant reply that followed it.
+            while history and history[0].get("role") == "user":
+                content = history[0].get("content", [])
+                if isinstance(content, list) and content and isinstance(content[0], dict) and content[0].get("type") == "tool_result":
+                    removed_tr = history.pop(0)
+                    dropped += 1
+                    est -= _estimate_message_tokens(removed_tr)
+                    if history and history[0].get("role") == "assistant":
+                        removed_a2 = history.pop(0)
+                        dropped += 1
+                        est -= _estimate_message_tokens(removed_a2)
+                else:
+                    break
+        else:
+            # Edge case: history starts with something unexpected. Drop one message.
+            removed = history.pop(0)
+            dropped += 1
+            est -= _estimate_message_tokens(removed)
+    return dropped
 # ------------------------------------------------------------------ anthropic
 def _ensure_anthropic_client():
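
How these pieces compose: before every request, the character-based estimate decides whether stripping stale file refreshes is enough or whether whole turns must go. A minimal, hypothetical sanity check of the helpers added above (message shapes follow the Anthropic Messages API; the history contents are made up for illustration):

    system_blocks = [{"type": "text", "text": "You are a helpful coding assistant."}]
    history = [
        {"role": "user", "content": [
            {"type": "text", "text": "[FILES UPDATED 2026-02-21] src/app.py"},
            {"type": "text", "text": "please refactor app.py"},
        ]},
        {"role": "assistant", "content": [{"type": "text", "text": "Done."}]},
        {"role": "user", "content": [{"type": "text", "text": "now add tests"}]},
    ]
    dropped = _trim_anthropic_history(system_blocks, history)
    # The stale "[FILES UPDATED" block is stripped from the first turn; no turns
    # are dropped because the estimate is far below _ANTHROPIC_MAX_PROMPT_TOKENS.
    assert dropped == 0
    assert len(history[0]["content"]) == 1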
@@ -640,9 +773,20 @@ def _send_anthropic(md_content: str, user_message: str, base_dir: str, file_item
     # We allow MAX_TOOL_ROUNDS, plus 1 final loop to get the text synthesis
     for round_idx in range(MAX_TOOL_ROUNDS + 2):
+        # Trim history to fit within the token budget before each API call.
+        dropped = _trim_anthropic_history(system_blocks, _anthropic_history)
+        if dropped > 0:
+            est_tokens = _estimate_prompt_tokens(system_blocks, _anthropic_history)
+            _append_comms("OUT", "request", {
+                "message": (
+                    f"[HISTORY TRIMMED: dropped {dropped} old messages to fit token budget. "
+                    f"Estimated {est_tokens} tokens remaining. {len(_anthropic_history)} messages in history.]"
+                ),
+            })
         response = _anthropic_client.messages.create(
             model=_model,
-            max_tokens=8096,
+            max_tokens=16384,
             system=system_blocks,
             tools=_build_anthropic_tools(),
             messages=_anthropic_history,
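
If the 3.5 chars/token heuristic proves too coarse for the throughput problem the commit message complains about, recent versions of the anthropic Python SDK expose a server-side token counter that could replace the estimate. A hedged sketch (endpoint and return shape assumed from recent SDK releases; verify against the installed version):

    # Hypothetical exact counterpart to _estimate_prompt_tokens.
    def _count_prompt_tokens(client, model, system_blocks, tools, history):
        resp = client.messages.count_tokens(
            model=model,
            system=system_blocks,
            tools=tools,
            messages=history,
        )
        return resp.input_tokens  # exact count, at the cost of an extra API round-trip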