fixes for anthropic client? Still sucks (need to do better with token throughput)
ai_client.py (148 changed lines)
@@ -32,7 +32,7 @@ MAX_TOOL_ROUNDS = 10
 
 # Maximum characters per text chunk sent to Anthropic.
 # Kept well under the ~200k token API limit.
-_ANTHROPIC_CHUNK_SIZE = 180_000
+_ANTHROPIC_CHUNK_SIZE = 120_000
 
 _SYSTEM_PROMPT = (
     "You are a helpful coding assistant with access to a PowerShell tool and MCP tools (file access: read_file, list_directory, search_files, get_file_summary, web access: web_search, fetch_url). "
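
Back-of-the-envelope on the chunk-size change, using the ~3.5 chars/token ratio the next hunk introduces (a rough sketch; real tokeniser ratios vary with content):

    # Rough per-chunk cost at the assumed 3.5 chars/token ratio.
    old_chunk_tokens = int(180_000 / 3.5)  # ~51,428 tokens per chunk
    new_chunk_tokens = int(120_000 / 3.5)  # ~34,285 tokens per chunk
    # Smaller chunks leave more of the ~200k-token window for history,
    # tool schemas, and the response.
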
@@ -538,6 +538,139 @@ def _send_gemini(md_content: str, user_message: str, base_dir: str, file_items:
         raise _classify_gemini_error(exc) from exc
 
 
+
+# ------------------------------------------------------------------ anthropic history management
+
+# Rough chars-per-token ratio. Anthropic tokeniser averages ~3.5-4 chars/token.
+# We use 3.5 to be conservative (overestimate token count = safer).
+_CHARS_PER_TOKEN = 3.5
+
+# Maximum token budget for the entire prompt (system + tools + messages).
+# Anthropic's limit is 200k. We leave headroom for the response + tool schemas.
+_ANTHROPIC_MAX_PROMPT_TOKENS = 180_000
+
+# Marker prefix used to identify stale file-refresh injections in history
+_FILE_REFRESH_MARKER = "[FILES UPDATED"
+
+
+def _estimate_message_tokens(msg: dict) -> int:
+    """Rough token estimate for a single Anthropic message dict."""
+    total_chars = 0
+    content = msg.get("content", "")
+    if isinstance(content, str):
+        total_chars += len(content)
+    elif isinstance(content, list):
+        for block in content:
+            if isinstance(block, dict):
+                text = block.get("text", "") or block.get("content", "")
+                if isinstance(text, str):
+                    total_chars += len(text)
+                # tool_use input
+                inp = block.get("input")
+                if isinstance(inp, dict):
+                    import json as _json
+                    total_chars += len(_json.dumps(inp, ensure_ascii=False))
+            elif isinstance(block, str):
+                total_chars += len(block)
+    return max(1, int(total_chars / _CHARS_PER_TOKEN))
+
+
+def _estimate_prompt_tokens(system_blocks: list[dict], history: list[dict]) -> int:
+    """Estimate total prompt tokens: system + tools + all history messages."""
+    total = 0
+    # System blocks
+    for block in system_blocks:
+        text = block.get("text", "")
+        total += max(1, int(len(text) / _CHARS_PER_TOKEN))
+    # Tool definitions (rough fixed estimate — they're ~2k tokens for our set)
+    total += 2500
+    # History messages
+    for msg in history:
+        total += _estimate_message_tokens(msg)
+    return total
+
+
+def _strip_stale_file_refreshes(history: list[dict]):
+    """
+    Remove [FILES UPDATED ...] text blocks from all history turns EXCEPT
+    the very last user message. These are stale snapshots from previous
+    tool rounds that bloat the context without providing value.
+    """
+    if len(history) < 2:
+        return
+    # Find the index of the last user message — we keep its file refresh intact
+    last_user_idx = -1
+    for i in range(len(history) - 1, -1, -1):
+        if history[i].get("role") == "user":
+            last_user_idx = i
+            break
+    for i, msg in enumerate(history):
+        if msg.get("role") != "user" or i == last_user_idx:
+            continue
+        content = msg.get("content")
+        if not isinstance(content, list):
+            continue
+        cleaned = []
+        for block in content:
+            if isinstance(block, dict) and block.get("type") == "text":
+                text = block.get("text", "")
+                if text.startswith(_FILE_REFRESH_MARKER):
+                    continue  # drop this stale file refresh block
+            cleaned.append(block)
+        if len(cleaned) < len(content):
+            msg["content"] = cleaned
+
+
+def _trim_anthropic_history(system_blocks: list[dict], history: list[dict]):
+    """
+    Trim the Anthropic history to fit within the token budget.
+    Strategy:
+      1. Strip stale file-refresh injections from old turns.
+      2. If still over budget, drop oldest turn pairs (user + assistant).
+    Returns the number of messages dropped.
+    """
+    # Phase 1: strip stale file refreshes
+    _strip_stale_file_refreshes(history)
+
+    est = _estimate_prompt_tokens(system_blocks, history)
+    if est <= _ANTHROPIC_MAX_PROMPT_TOKENS:
+        return 0
+
+    # Phase 2: drop oldest turn pairs until within budget
+    dropped = 0
+    while len(history) > 2 and est > _ANTHROPIC_MAX_PROMPT_TOKENS:
+        # Always drop from the front in pairs (user, assistant) to maintain alternation
+        # But be careful: the first message might be user, followed by assistant
+        if history[0].get("role") == "user" and len(history) > 1 and history[1].get("role") == "assistant":
+            removed_user = history.pop(0)
+            removed_asst = history.pop(0)
+            dropped += 2
+            est -= _estimate_message_tokens(removed_user)
+            est -= _estimate_message_tokens(removed_asst)
+            # If the next message is a user tool_result that belonged to the dropped assistant,
+            # we need to drop it too to avoid dangling tool_results
+            while history and history[0].get("role") == "user":
+                content = history[0].get("content", [])
+                if isinstance(content, list) and content and isinstance(content[0], dict) and content[0].get("type") == "tool_result":
+                    removed_tr = history.pop(0)
+                    dropped += 1
+                    est -= _estimate_message_tokens(removed_tr)
+                    # And the assistant reply that followed it
+                    if history and history[0].get("role") == "assistant":
+                        removed_a2 = history.pop(0)
+                        dropped += 1
+                        est -= _estimate_message_tokens(removed_a2)
+                else:
+                    break
+        else:
+            # Edge case: history starts with something unexpected. Drop one message.
+            removed = history.pop(0)
+            dropped += 1
+            est -= _estimate_message_tokens(removed)
+
+    return dropped
+
+
 # ------------------------------------------------------------------ anthropic
 
 def _ensure_anthropic_client():
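
A quick way to sanity-check the new trimming path is to run the two estimators against a synthetic oversized history. This is a sketch, not a test from the repo: it assumes the helpers above are importable from ai_client, and the padded payload is hypothetical.

    from ai_client import (
        _ANTHROPIC_MAX_PROMPT_TOKENS,
        _estimate_prompt_tokens,
        _trim_anthropic_history,
    )

    system_blocks = [{"type": "text", "text": "You are a helpful coding assistant."}]

    # Four oversized turn pairs (~100k estimated tokens per message) plus a
    # short final user message; the total blows the 180k-token budget.
    padding = "x" * 350_000  # hypothetical payload
    history = []
    for i in range(4):
        history.append({"role": "user", "content": f"question {i}: {padding}"})
        history.append({"role": "assistant", "content": f"answer {i}: {padding}"})
    history.append({"role": "user", "content": "final question"})

    before = _estimate_prompt_tokens(system_blocks, history)
    dropped = _trim_anthropic_history(system_blocks, history)
    after = _estimate_prompt_tokens(system_blocks, history)

    assert dropped > 0 and dropped % 2 == 0            # whole pairs, for this synthetic shape
    assert after <= _ANTHROPIC_MAX_PROMPT_TOKENS       # back within budget
    assert history[-1]["content"] == "final question"  # newest turn preserved
    print(f"dropped={dropped}, estimate {before} -> {after} tokens")

With this shape the loop drops all four old pairs and keeps only the final user message, since even one surviving pair would leave the estimate above the budget.
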
@@ -640,9 +773,20 @@ def _send_anthropic(md_content: str, user_message: str, base_dir: str, file_item
 
     # We allow MAX_TOOL_ROUNDS, plus 1 final loop to get the text synthesis
     for round_idx in range(MAX_TOOL_ROUNDS + 2):
+        # Trim history to fit within token budget before each API call
+        dropped = _trim_anthropic_history(system_blocks, _anthropic_history)
+        if dropped > 0:
+            est_tokens = _estimate_prompt_tokens(system_blocks, _anthropic_history)
+            _append_comms("OUT", "request", {
+                "message": (
+                    f"[HISTORY TRIMMED: dropped {dropped} old messages to fit token budget. "
+                    f"Estimated {est_tokens} tokens remaining. {len(_anthropic_history)} messages in history.]"
+                ),
+            })
+
         response = _anthropic_client.messages.create(
             model=_model,
-            max_tokens=8096,
+            max_tokens=16384,
             system=system_blocks,
             tools=_build_anthropic_tools(),
             messages=_anthropic_history,
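
The dangling-tool_result guard inside the drop loop matters because the Messages API rejects a tool_result block whose tool_use_id has no matching tool_use in the preceding assistant turn. A tool round spans three messages, so dropping only the first pair would strand the third. The shapes below are illustrative; the id is made up:

    turn = [
        {"role": "user", "content": "list the repo files"},
        {"role": "assistant", "content": [
            {"type": "tool_use", "id": "toolu_123", "name": "list_directory",
             "input": {"path": "."}},
        ]},
        # If the pair above is trimmed away, this message must go too:
        {"role": "user", "content": [
            {"type": "tool_result", "tool_use_id": "toolu_123",
             "content": "ai_client.py\nREADME.md"},
        ]},
    ]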