add history truncation

This commit is contained in:
2026-02-22 10:34:53 -05:00
parent bf2d09f3fd
commit da8df7a393
4 changed files with 52 additions and 9 deletions

View File

@@ -23,10 +23,13 @@ _model: str = "gemini-2.5-flash"
_temperature: float = 0.0
_max_tokens: int = 8192
def set_model_params(temp: float, max_tok: int):
global _temperature, _max_tokens
_history_trunc_limit: int = 8000
def set_model_params(temp: float, max_tok: int, trunc_limit: int = 8000):
    """Update the module-wide generation settings.

    Args:
        temp: Sampling temperature forwarded to the model.
        max_tok: Maximum number of output tokens per response.
        trunc_limit: Character cap applied to stored tool outputs in the
            chat history; a value <= 0 disables truncation entirely.
    """
    global _temperature, _max_tokens, _history_trunc_limit
    # The three settings are independent; assignment order is arbitrary.
    _history_trunc_limit = trunc_limit
    _max_tokens = max_tok
    _temperature = temp
_gemini_client = None
_gemini_chat = None
@@ -201,6 +204,16 @@ def set_provider(provider: str, model: str):
_model = model
def cleanup():
    """Best-effort teardown on application exit.

    Deletes the server-side Gemini cache (if one was created) so an
    orphaned cache does not keep accruing billing charges. Any failure
    during deletion is deliberately ignored — the process is exiting.
    """
    global _gemini_client, _gemini_cache
    if not (_gemini_client and _gemini_cache):
        return
    try:
        _gemini_client.caches.delete(name=_gemini_cache.name)
    except Exception:
        # Best-effort: never let cache cleanup block shutdown.
        pass
def reset_session():
global _gemini_client, _gemini_chat, _gemini_cache
global _anthropic_client, _anthropic_history
@@ -487,6 +500,22 @@ def _send_gemini(md_content: str, user_message: str, base_dir: str, file_items:
_gemini_chat = _gemini_client.chats.create(**kwargs)
_gemini_chat._last_md_hash = current_md_hash
# COMPRESS HISTORY: Truncate massive tool outputs from previous turns to stop token leaks
if _gemini_chat and getattr(_gemini_chat, "history", None):
for msg in _gemini_chat.history:
if msg.role == "user" and hasattr(msg, "parts"):
for p in msg.parts:
if hasattr(p, "function_response") and p.function_response and hasattr(p.function_response, "response"):
r = p.function_response.response
if isinstance(r, dict) and "output" in r:
val = r["output"]
if isinstance(val, str):
if "[SYSTEM: FILES UPDATED]" in val:
val = val.split("[SYSTEM: FILES UPDATED]")[0].strip()
if _history_trunc_limit > 0 and len(val) > _history_trunc_limit:
val = val[:_history_trunc_limit] + "\n\n... [TRUNCATED BY SYSTEM TO SAVE TOKENS. Original output was too large.]"
r["output"] = val
_append_comms("OUT", "request", {"message": f"[ctx {len(md_content)} + msg {len(user_message)}]"})
payload, all_text = user_message, []
@@ -760,6 +789,15 @@ def _send_anthropic(md_content: str, user_message: str, base_dir: str, file_item
user_content = [{"type": "text", "text": user_message}]
# COMPRESS HISTORY: Truncate massive tool outputs from previous turns
for msg in _anthropic_history:
if msg.get("role") == "user" and isinstance(msg.get("content"), list):
for block in msg["content"]:
if isinstance(block, dict) and block.get("type") == "tool_result":
t_content = block.get("content", "")
if _history_trunc_limit > 0 and isinstance(t_content, str) and len(t_content) > _history_trunc_limit:
block["content"] = t_content[:_history_trunc_limit] + "\n\n... [TRUNCATED BY SYSTEM TO SAVE TOKENS. Original output was too large.]"
_strip_cache_controls(_anthropic_history)
_repair_anthropic_history(_anthropic_history)
_anthropic_history.append({"role": "user", "content": user_content})