From f81f1f5eaad498ac07d6246633bfa376b66ef2a9 Mon Sep 17 00:00:00 2001 From: Ed_ Date: Tue, 2 Jun 2026 18:57:56 -0400 Subject: [PATCH] docs(architecture): add MiniMax provider, RAG integration, Tier 4 patch flow, discussion compression, subagent summarization, async tool execution --- docs/guide_architecture.md | 191 ++++++++++++++++++++++++++++++++++--- 1 file changed, 178 insertions(+), 13 deletions(-) diff --git a/docs/guide_architecture.md b/docs/guide_architecture.md index 7133b65b..94659e67 100644 --- a/docs/guide_architecture.md +++ b/docs/guide_architecture.md @@ -395,9 +395,10 @@ def resolve_pending_action(self, action_id: str, approved: bool) -> bool: ### Module-Level State ```python -_provider: str = "gemini" # "gemini" | "anthropic" | "deepseek" | "gemini_cli" +_provider: str = "gemini" # "gemini" | "anthropic" | "deepseek" | "gemini_cli" | "minimax" _model: str = "gemini-2.5-flash-lite" _temperature: float = 0.0 +_top_p: float = 1.0 _max_tokens: int = 8192 _history_trunc_limit: int = 8000 # Char limit for truncating old tool outputs @@ -411,7 +412,9 @@ Per-provider client objects: _gemini_client: genai.Client | None _gemini_chat: Any # Holds history internally _gemini_cache: Any # Server-side CachedContent -_gemini_cache_md_hash: int | None # For cache invalidation +_gemini_cache_md_hash: str | None # Hash for cache invalidation +_gemini_cache_created_at: float | None # Monotonic time of cache creation +_gemini_cached_file_paths: list[str] # File paths included in the active cache _GEMINI_CACHE_TTL: int = 3600 # 1-hour; rebuilt at 90% (3240s) # Anthropic (client-managed history) @@ -420,9 +423,15 @@ _anthropic_history: list[dict] # Mutable [{role, content}, ...] 
_anthropic_history_lock: threading.Lock # DeepSeek (raw HTTP, client-managed history) +_deepseek_client: Any | None _deepseek_history: list[dict] _deepseek_history_lock: threading.Lock +# MiniMax (raw HTTP, client-managed history) +_minimax_client: Any | None +_minimax_history: list[dict] +_minimax_history_lock: threading.Lock + # Gemini CLI (adapter wrapper) _gemini_cli_adapter: GeminiCliAdapter | None ``` @@ -442,27 +451,41 @@ _GEMINI_MAX_INPUT_TOKENS: int = 900_000 # 1M window minus headroom ```python def send(md_content, user_message, base_dir=".", file_items=None, discussion_history="", stream=False, - pre_tool_callback=None, qa_callback=None) -> str: + pre_tool_callback=None, qa_callback=None, + enable_tools=True, stream_callback=None, patch_callback=None, + rag_engine=None) -> str: with _send_lock: if _provider == "gemini": return _send_gemini(...) elif _provider == "gemini_cli": return _send_gemini_cli(...) elif _provider == "anthropic": return _send_anthropic(...) elif _provider == "deepseek": return _send_deepseek(..., stream=stream) + elif _provider == "minimax": return _send_minimax(..., stream=stream) ``` `_send_lock` serializes all API calls — only one provider call can be in-flight at a time. All providers share the same callback signatures. Return type is always `str`. +**Parameter evolution** (newer parameters, may be missing from older docstring mirrors): + +- `enable_tools: bool = True` — Per-call gate for the PowerShell + MCP tool set. Tier 4 and certain planning calls pass `enable_tools=False` to force text-only responses. +- `stream_callback: Optional[Callable[[str], None]]` — Provider-specific streaming sink. The DeepSeek and MiniMax paths invoke this as tokens arrive; other providers deliver the full response after the network round-trip. +- `patch_callback: Optional[Callable[[str, str], Optional[str]]]` — Tier 4 patch generation hook. Receives `(error_text, file_context)` and returns an optional diff. 
See [Tier 4 Patch Generation](#tier-4-patch-generation-flow) below. +- `rag_engine: Optional[Any]` — When provided, the dispatcher injects RAG-retrieved context into `md_content` before the provider call. The RAG engine is owned by the caller (typically `AppController` or `multi_agent_conductor.run_worker_lifecycle`); `ai_client` does not own its lifecycle. See [RAG Integration](#rag-integration) below. + +`_send_lock` serializes all API calls — only one provider call can be in-flight at a time. All providers share the same callback signatures. Return type is always `str`. + ### Provider Comparison -| Aspect | Gemini SDK | Anthropic | DeepSeek | Gemini CLI | -|---|---|---|---|---| -| **Client** | `genai.Client` | `anthropic.Anthropic` | Raw `requests.post` | `GeminiCliAdapter` (subprocess) | -| **History** | SDK-managed (`_gemini_chat._history`) | Client-managed list | Client-managed list | CLI-managed (session ID) | -| **Caching** | Server-side `CachedContent` with TTL | Prompt caching via `cache_control: ephemeral` (4 breakpoints) | None | None | -| **Tool format** | `types.FunctionDeclaration` | JSON Schema dict | Not declared | Same as SDK via adapter | -| **Tool results** | `Part.from_function_response(response={"output": ...})` | `{"type": "tool_result", "tool_use_id": ..., "content": ...}` | `{"role": "tool", "tool_call_id": ..., "content": ...}` | `{"role": "tool", ...}` | -| **History trimming** | In-place at 40% of 900K token estimate | 2-phase: strip stale file refreshes, then drop turn pairs at 180K | None | None | -| **Streaming** | No | No | Yes | No | +| Aspect | Gemini SDK | Anthropic | DeepSeek | Gemini CLI | MiniMax | +|---|---|---|---|---|---| +| **Client** | `genai.Client` | `anthropic.Anthropic` | Raw `requests.post` | `GeminiCliAdapter` (subprocess) | Raw `requests.post` (OpenAI-compatible endpoint) | +| **History** | SDK-managed (`_gemini_chat._history`) | Client-managed list | Client-managed list | CLI-managed (session ID) | Client-managed 
list | +| **Caching** | Server-side `CachedContent` with TTL | Prompt caching via `cache_control: ephemeral` (4 breakpoints) | None | None | None | +| **Tool format** | `types.FunctionDeclaration` | JSON Schema dict | Not declared | Same as SDK via adapter | Not declared | +| **Tool results** | `Part.from_function_response(response={"output": ...})` | `{"type": "tool_result", "tool_use_id": ..., "content": ...}` | `{"role": "tool", "tool_call_id": ..., "content": ...}` | `{"role": "tool", ...}` | `{"role": "tool", "tool_call_id": ..., "content": ...}` | +| **History trimming** | In-place at 40% of 900K token estimate | 2-phase: strip stale file refreshes, then drop turn pairs at 180K | None | None | 2-phase: drop turn pairs at 180K (Anthropic-equivalent) | +| **Streaming** | No | No | Yes | No | Yes | +| **Error classifier** | `_classify_gemini_error` | `_classify_anthropic_error` | `_classify_deepseek_error` | (inherits Gemini) | `_classify_minimax_error` | +| **Repair hook** | (SDK self-heals) | `_repair_anthropic_history` | `_repair_deepseek_history` | (CLI handles) | `_repair_minimax_history` | ### Tool-Call Loop (common pattern across providers) @@ -512,9 +535,151 @@ Before placing breakpoint 4, all existing `cache_control` is stripped from histo System instruction content is hashed. On each call, a 3-way decision: - **Hash changed**: Delete old cache, rebuild with new content. -- **Cache age > 90% of TTL**: Proactive renewal (delete + rebuild). +- **Cache age > 90% of TTL**: Proactive renewal (delete + rebuild). The creation time is stored in `_gemini_cache_created_at` (a `time.monotonic()` reading) for this check. - **No cache exists**: Create new `CachedContent` if token count >= 2048; otherwise inline. +The active cache's file inclusion set is tracked in `_gemini_cached_file_paths: list[str]`. On rebuild, the list is replaced atomically. The GUI uses this list to render the "cached files" indicator in the Cache Panel. 
+ +--- + +## Async Tool Execution + +Independent tool calls within a single round execute concurrently via `asyncio.gather`. This is the major latency win: when the AI emits three `read_file` calls in one turn, they run concurrently rather than sequentially. + +### Entry Point + +```python +async def _execute_tool_calls_concurrently( + calls: list[Any], + base_dir: str, + pre_tool_callback: ..., + qa_callback: ..., + r_idx: int, + provider: str, + patch_callback: ... = None, +) -> list[tuple[str, str, str, str]]: # (tool_name, call_id, output, original_name) + ... +``` + +### Per-Call Worker + +```python +async def _execute_single_tool_call_async( + name: str, args: dict, call_id: str, base_dir: str, + pre_tool_callback, qa_callback, r_idx: int, + tier: str | None = None, + patch_callback = None, +) -> tuple[str, str, str, str]: + ... +``` + +`tier: str | None` is propagated to the comms log and pre-tool callback so audit trails can attribute tool calls to a specific MMA tier (e.g., "Tier 3", "Tier 4"). Thread-local `_local_storage.current_tier` is the source; the parameter is the explicit pass-through. + +### Exception Handling + +If any individual call raises, `asyncio.gather` with `return_exceptions=True` converts the exception to a returned value rather than cancelling siblings. The post-round loop in `_send_*` then formats the error per provider. See [guide_tools.md](guide_tools.md#parallel-tool-execution) for the full implementation pattern and the timing analysis (sequential vs concurrent latency for a typical 3-call round). + +--- + +## RAG Integration + +`ai_client.send()` accepts an optional `rag_engine` parameter. When supplied, the dispatcher augments `md_content` with RAG-retrieved context before the provider call. 
+ +```python +def send(md_content, user_message, base_dir=".", file_items=None, ..., + rag_engine: Optional[Any] = None) -> str: + if rag_engine is not None: + retrieved = rag_engine.query(user_message, top_k=5) + md_content = _inject_rag_context(md_content, retrieved) + ... +``` + +The RAG engine is **not** owned by `ai_client`; the caller (typically `AppController` for the main discussion flow, or `multi_agent_conductor.run_worker_lifecycle` for Tier 3 workers) is responsible for instantiating and configuring it. This keeps `ai_client` decoupled from any specific retrieval backend (ChromaDB local, external MCP RAG server, or none). + +**Lifecycle**: +- The `AppController` constructs a single `RAGEngine` per project load. +- The RAG engine is passed through to `send()` for every AI call. +- If a project disables RAG, `rag_engine=None` is passed and the integration is a no-op. +- See [guide_rag.md](guide_rag.md) (placeholder; written in Task 10) for the vector store, chunking, and indexing pipeline. + +--- + +## Tier 4 Patch Generation Flow + +When a Tier 3 worker's test run fails, the engine can request a Tier 4 patch instead of just an error summary. This is a structured diff, not a free-form suggestion. + +### Entry Point + +```python +def run_tier4_patch_generation(error: str, file_context: str) -> str: + ... +``` + +### Flow + +1. Tier 3 worker fails a test; `stderr` is captured by the test runner. +2. The conductor thread calls `run_tier4_patch_callback(stderr, base_dir)` to get a candidate patch. +3. If a patch is generated, the GUI's patch modal (`src/patch_modal.py`) presents the diff for human review. +4. The user clicks **Apply Patch** to resume the pipeline, or **Reject** to send the worker back for another attempt. +5. The `patch_callback` parameter on `send()` is the Tier 4 hook; it can be `None` for callers that don't support patch generation. + +### Response Handling + +`run_tier4_patch_generation` calls `send()` with `enable_tools=False` to force a text-only response. 
The result is parsed as a unified diff. If parsing fails, the modal shows the raw response and the user can manually copy-edit. + +--- + +## Discussion Compression + +Long discussions accumulate tool outputs and intermediate reasoning that bloat the context. The `run_discussion_compression` function asks the active provider to produce a compressed summary of the discussion so far. + +### Entry Point + +```python +def run_discussion_compression(discussion_text: str) -> str: + ... +``` + +### Flow + +1. Caller (typically the GUI's "Compress Discussion" button or an automatic trigger when history exceeds N tokens) invokes `run_discussion_compression(current_history)`. +2. The function dispatches to the active provider with `enable_tools=False` and a fixed system prompt instructing the model to summarize while preserving key decisions, file paths, and unresolved questions. +3. The returned string replaces the discussion history in subsequent `send()` calls. +4. The original history is archived to the session log (`logs/sessions/<session_id>/comms.log`) for audit. + +### Provider Robustness + +The function tolerates case and whitespace variation in the provider string (`" MiniMax "` is normalized to `"minimax"`). This is important because the active provider may be set via different code paths (TOML, env var, runtime override). + +--- + +## Subagent Summarization + +For very large files, the heuristic `summarise_file` in `src/summarize.py` may be insufficient. The `run_subagent_summarization` function asks the active provider to produce a high-signal summary of a single file using a model call rather than a heuristic. + +### Entry Point + +```python +def run_subagent_summarization(file_path: str, content: str, is_code: bool, outline: str) -> str: + ... 
+``` + +### When Invoked + +- File exceeds the heuristic summary's effective scope (configurable, typically > 5000 lines or > 100KB) +- The aggregation strategy in `aggregate.py` is set to `summarize` (rather than `full` or `skeleton`) +- The Tier 2 ticket generation explicitly requests a sub-agent summary for a high-priority file + +### Flow + +1. Caller builds a structured prompt combining the file path, content, an AST outline (if `is_code=True`), and a "summary" instruction. +2. The function dispatches to the active provider with `enable_tools=False`. +3. The returned string is the file's summary, which replaces the full content in the aggregated context. + +### Cost vs Quality Trade-off + +Sub-agent summarization is more expensive than heuristic summarization (one full provider call per file) but produces higher-quality results for complex files. The caller decides based on the project's token budget and quality requirements. + --- ## Comms Log System