From f81f1f5eaad498ac07d6246633bfa376b66ef2a9 Mon Sep 17 00:00:00 2001
From: Ed_ <edwardgz@gmail.com>
Date: Tue, 2 Jun 2026 18:57:56 -0400
Subject: [PATCH] docs(architecture): add MiniMax provider, RAG integration,
 Tier 4 patch flow, discussion compression, subagent summarization, async tool
 execution

---
 docs/guide_architecture.md | 191 ++++++++++++++++++++++++++++++++++---
 1 file changed, 178 insertions(+), 13 deletions(-)

diff --git a/docs/guide_architecture.md b/docs/guide_architecture.md
index 7133b65b..94659e67 100644
--- a/docs/guide_architecture.md
+++ b/docs/guide_architecture.md
@@ -395,9 +395,10 @@ def resolve_pending_action(self, action_id: str, approved: bool) -> bool:
 ### Module-Level State
 
 ```python
-_provider: str = "gemini"              # "gemini" | "anthropic" | "deepseek" | "gemini_cli"
+_provider: str = "gemini"              # "gemini" | "anthropic" | "deepseek" | "gemini_cli" | "minimax"
 _model: str = "gemini-2.5-flash-lite"
 _temperature: float = 0.0
+_top_p: float = 1.0
 _max_tokens: int = 8192
 _history_trunc_limit: int = 8000       # Char limit for truncating old tool outputs
 
@@ -411,7 +412,9 @@ Per-provider client objects:
 _gemini_client: genai.Client | None
 _gemini_chat: Any                      # Holds history internally
 _gemini_cache: Any                     # Server-side CachedContent
-_gemini_cache_md_hash: int | None      # For cache invalidation
+_gemini_cache_md_hash: str | None      # Hash for cache invalidation
+_gemini_cache_created_at: float | None # Monotonic time of cache creation
+_gemini_cached_file_paths: list[str]   # File paths included in the active cache
 _GEMINI_CACHE_TTL: int = 3600          # 1-hour; rebuilt at 90% (3240s)
 
 # Anthropic (client-managed history)
@@ -420,9 +423,15 @@ _anthropic_history: list[dict]         # Mutable [{role, content}, ...]
 _anthropic_history_lock: threading.Lock
 
 # DeepSeek (raw HTTP, client-managed history)
+_deepseek_client: Any | None
 _deepseek_history: list[dict]
 _deepseek_history_lock: threading.Lock
 
+# MiniMax (raw HTTP, client-managed history)
+_minimax_client: Any | None
+_minimax_history: list[dict]
+_minimax_history_lock: threading.Lock
+
 # Gemini CLI (adapter wrapper)
 _gemini_cli_adapter: GeminiCliAdapter | None
 ```
@@ -442,27 +451,41 @@ _GEMINI_MAX_INPUT_TOKENS: int = 900_000      # 1M window minus headroom
 ```python
 def send(md_content, user_message, base_dir=".", file_items=None,
          discussion_history="", stream=False,
-         pre_tool_callback=None, qa_callback=None) -> str:
+         pre_tool_callback=None, qa_callback=None,
+         enable_tools=True, stream_callback=None, patch_callback=None,
+         rag_engine=None) -> str:
     with _send_lock:
         if _provider == "gemini":      return _send_gemini(...)
         elif _provider == "gemini_cli": return _send_gemini_cli(...)
         elif _provider == "anthropic":  return _send_anthropic(...)
         elif _provider == "deepseek":   return _send_deepseek(..., stream=stream)
+        elif _provider == "minimax":    return _send_minimax(..., stream=stream)
 ```
 
 `_send_lock` serializes all API calls — only one provider call can be in-flight at a time. All providers share the same callback signatures. Return type is always `str`.
 
+**Parameter evolution** (newer parameters, may be missing from older docstring mirrors):
+
+- `enable_tools: bool = True` — Per-call gate for the PowerShell + MCP tool set. Tier 4 and certain planning calls pass `enable_tools=False` to force text-only responses.
+- `stream_callback: Optional[Callable[[str], None]]` — Provider-specific streaming sink. The DeepSeek and MiniMax paths invoke this as tokens arrive; other providers deliver the full response after the network round-trip.
+- `patch_callback: Optional[Callable[[str, str], Optional[str]]]` — Tier 4 patch generation hook. Receives `(error_text, file_context)` and returns an optional diff. See [Tier 4 Patch Generation](#tier-4-patch-generation-flow) below.
+- `rag_engine: Optional[Any]` — When provided, the dispatcher injects RAG-retrieved context into `md_content` before the provider call. The RAG engine is owned by the caller (typically `AppController` or `multi_agent_conductor.run_worker_lifecycle`); `ai_client` does not own its lifecycle. See [RAG Integration](#rag-integration) below.
+
+All four newer parameters are optional and have defaults (`True` for `enable_tools`, `None` for the rest), so callers written against the older signature keep working unchanged.
+
 ### Provider Comparison
 
-| Aspect | Gemini SDK | Anthropic | DeepSeek | Gemini CLI |
-|---|---|---|---|---|
-| **Client** | `genai.Client` | `anthropic.Anthropic` | Raw `requests.post` | `GeminiCliAdapter` (subprocess) |
-| **History** | SDK-managed (`_gemini_chat._history`) | Client-managed list | Client-managed list | CLI-managed (session ID) |
-| **Caching** | Server-side `CachedContent` with TTL | Prompt caching via `cache_control: ephemeral` (4 breakpoints) | None | None |
-| **Tool format** | `types.FunctionDeclaration` | JSON Schema dict | Not declared | Same as SDK via adapter |
-| **Tool results** | `Part.from_function_response(response={"output": ...})` | `{"type": "tool_result", "tool_use_id": ..., "content": ...}` | `{"role": "tool", "tool_call_id": ..., "content": ...}` | `{"role": "tool", ...}` |
-| **History trimming** | In-place at 40% of 900K token estimate | 2-phase: strip stale file refreshes, then drop turn pairs at 180K | None | None |
-| **Streaming** | No | No | Yes | No |
+| Aspect | Gemini SDK | Anthropic | DeepSeek | Gemini CLI | MiniMax |
+|---|---|---|---|---|---|
+| **Client** | `genai.Client` | `anthropic.Anthropic` | Raw `requests.post` | `GeminiCliAdapter` (subprocess) | Raw `requests.post` (OpenAI-compatible endpoint) |
+| **History** | SDK-managed (`_gemini_chat._history`) | Client-managed list | Client-managed list | CLI-managed (session ID) | Client-managed list |
+| **Caching** | Server-side `CachedContent` with TTL | Prompt caching via `cache_control: ephemeral` (4 breakpoints) | None | None | None |
+| **Tool format** | `types.FunctionDeclaration` | JSON Schema dict | Not declared | Same as SDK via adapter | Not declared |
+| **Tool results** | `Part.from_function_response(response={"output": ...})` | `{"type": "tool_result", "tool_use_id": ..., "content": ...}` | `{"role": "tool", "tool_call_id": ..., "content": ...}` | `{"role": "tool", ...}` | `{"role": "tool", "tool_call_id": ..., "content": ...}` |
+| **History trimming** | In-place at 40% of 900K token estimate | 2-phase: strip stale file refreshes, then drop turn pairs at 180K | None | None | 2-phase: strip stale file refreshes, then drop turn pairs at 180K (Anthropic-equivalent) |
+| **Streaming** | No | No | Yes | No | Yes |
+| **Error classifier** | `_classify_gemini_error` | `_classify_anthropic_error` | `_classify_deepseek_error` | (inherits Gemini) | `_classify_minimax_error` |
+| **Repair hook** | (SDK self-heals) | `_repair_anthropic_history` | `_repair_deepseek_history` | (CLI handles) | `_repair_minimax_history` |
 
 ### Tool-Call Loop (common pattern across providers)
 
@@ -512,9 +535,151 @@ Before placing breakpoint 4, all existing `cache_control` is stripped from histo
 System instruction content is hashed. On each call, a 3-way decision:
 
 - **Hash changed**: Delete old cache, rebuild with new content.
-- **Cache age > 90% of TTL**: Proactive renewal (delete + rebuild).
+- **Cache age > 90% of TTL**: Proactive renewal (delete + rebuild). `_gemini_cache_created_at` is tracked via `time.monotonic()` for this check.
 - **No cache exists**: Create new `CachedContent` if token count >= 2048; otherwise inline.
 
+The active cache's file inclusion set is tracked in `_gemini_cached_file_paths: list[str]`. On rebuild, the list is replaced atomically. The GUI uses this list to render the "cached files" indicator in the Cache Panel.
+
+---
+
+## Async Tool Execution
+
+Independent tool calls within a single round execute concurrently via `asyncio.gather`. This is the major latency win: when the AI emits three `read_file` calls in one turn, they run in parallel rather than sequentially.
+
+### Entry Point
+
+```python
+async def _execute_tool_calls_concurrently(
+    calls: list[Any],
+    base_dir: str,
+    pre_tool_callback: ...,
+    qa_callback: ...,
+    r_idx: int,
+    provider: str,
+    patch_callback: ... = None,
+) -> list[tuple[str, str, str, str]]:  # (tool_name, call_id, output, original_name)
+    ...
+```
+
+### Per-Call Worker
+
+```python
+async def _execute_single_tool_call_async(
+    name: str, args: dict, call_id: str, base_dir: str,
+    pre_tool_callback, qa_callback, r_idx: int,
+    tier: str | None = None,
+    patch_callback = None,
+) -> tuple[str, str, str, str]:
+    ...
+```
+
+`tier: str | None` is propagated to the comms log and pre-tool callback so audit trails can attribute tool calls to a specific MMA tier (e.g., "Tier 3", "Tier 4"). Thread-local `_local_storage.current_tier` is the source; the parameter is the explicit pass-through.
+
+### Exception Handling
+
+If any individual call raises, `asyncio.gather` with `return_exceptions=True` returns the exception object in that call's result slot instead of propagating it to the awaiting caller, so the results of sibling calls are still collected. The post-round loop in `_send_*` then formats the error per provider. See [guide_tools.md](guide_tools.md#parallel-tool-execution) for the full implementation pattern and the timing analysis (sequential vs concurrent latency for a typical 3-call round).
+
+---
+
+## RAG Integration
+
+`ai_client.send()` accepts an optional `rag_engine` parameter. When supplied, the dispatcher augments `md_content` with RAG-retrieved context before the provider call.
+
+```python
+def send(md_content, user_message, base_dir=".", file_items=None, ...,
+         rag_engine: Optional[Any] = None) -> str:
+    if rag_engine is not None:
+        retrieved = rag_engine.query(user_message, top_k=5)
+        md_content = _inject_rag_context(md_content, retrieved)
+    ...
+```
+
+The RAG engine is **not** owned by `ai_client`; the caller (typically `AppController` for the main discussion flow, or `multi_agent_conductor.run_worker_lifecycle` for Tier 3 workers) is responsible for instantiating and configuring it. This keeps `ai_client` decoupled from any specific retrieval backend (ChromaDB local, external MCP RAG server, or none).
+
+**Lifecycle**:
+- The `AppController` constructs a single `RAGEngine` per project load.
+- The RAG engine is passed through to `send()` for every AI call.
+- If a project disables RAG, `rag_engine=None` is passed and the integration is a no-op.
+- See [guide_rag.md](guide_rag.md) (placeholder; written in Task 10) for the vector store, chunking, and indexing pipeline.
+
+---
+
+## Tier 4 Patch Generation Flow
+
+When a Tier 3 worker's test run fails, the engine can request a Tier 4 patch instead of just an error summary. This is a structured diff, not a free-form suggestion.
+
+### Entry Point
+
+```python
+def run_tier4_patch_generation(error: str, file_context: str) -> str:
+    ...
+```
+
+### Flow
+
+1. Tier 3 worker fails a test; `stderr` is captured by the test runner.
+2. The conductor thread calls `run_tier4_patch_callback(stderr, base_dir)` to get a candidate patch.
+3. If a patch is generated, the GUI's patch modal (`src/patch_modal.py`) presents the diff for human review.
+4. User clicks Apply Patch to resume the pipeline, or Reject to send the worker back for another attempt.
+5. The `patch_callback` parameter on `send()` is the Tier 4 hook; it can be `None` for callers that don't support patch generation.
+
+### Response Handling
+
+`run_tier4_patch_generation` calls `send()` with `enable_tools=False` to force a text-only response. The result is parsed as a unified diff. If parsing fails, the modal shows the raw response and the user can manually copy-edit.
+
+---
+
+## Discussion Compression
+
+Long discussions accumulate tool outputs and intermediate reasoning that bloat the context. The `run_discussion_compression` function asks the active provider to produce a compressed summary of the discussion so far.
+
+### Entry Point
+
+```python
+def run_discussion_compression(discussion_text: str) -> str:
+    ...
+```
+
+### Flow
+
+1. Caller (typically the GUI's "Compress Discussion" button or an automatic trigger when history exceeds N tokens) invokes `run_discussion_compression(current_history)`.
+2. The function dispatches to the active provider with `enable_tools=False` and a fixed system prompt instructing the model to summarize while preserving key decisions, file paths, and unresolved questions.
+3. The returned string replaces the discussion history in subsequent `send()` calls.
+4. The original history is archived to the session log (`logs/sessions/<id>/comms.log`) for audit.
+
+### Provider Robustness
+
+The function tolerates case- and whitespace-variation in the provider string (`"  MiniMax  "` is normalized to `"minimax"`). This is important because the active provider may be set via different code paths (TOML, env var, runtime override).
+
+---
+
+## Subagent Summarization
+
+For very large files, the heuristic `summarise_file` in `src/summarize.py` may be insufficient. The `run_subagent_summarization` function asks the active provider to produce a high-signal summary of a single file using a model call rather than a heuristic.
+
+### Entry Point
+
+```python
+def run_subagent_summarization(file_path: str, content: str, is_code: bool, outline: str) -> str:
+    ...
+```
+
+### When Invoked
+
+- File exceeds the heuristic summary's effective scope (configurable, typically > 5000 lines or > 100KB)
+- The aggregation strategy in `aggregate.py` is set to `summarize` (rather than `full` or `skeleton`)
+- The Tier 2 ticket generation explicitly requests a sub-agent summary for a high-priority file
+
+### Flow
+
+1. Caller builds a structured prompt combining the file path, content, an AST outline (if `is_code=True`), and a "summary" instruction.
+2. The function dispatches to the active provider with `enable_tools=False`.
+3. The returned string is the file's summary, which replaces the full content in the aggregated context.
+
+### Cost vs Quality Trade-off
+
+Sub-agent summarization is more expensive than heuristic summarization (one full provider call per file) but produces higher-quality results for complex files. The caller decides based on the project's token budget and quality requirements.
+
 ---
 
 ## Comms Log System