From 989b2e6835717c81c99422861b76fc49586dd95a Mon Sep 17 00:00:00 2001 From: Ed_ Date: Wed, 10 Jun 2026 12:45:56 -0400 Subject: [PATCH] conductor(plan): New track for RAG phase 4 sync fix --- .../metadata.json | 41 +++++ .../rag_phase4_sync_fix_20260610/plan.md | 118 +++++++++++++ .../rag_phase4_sync_fix_20260610/spec.md | 158 ++++++++++++++++++ .../rag_phase4_sync_fix_20260610/state.toml | 38 +++++ 4 files changed, 355 insertions(+) create mode 100644 conductor/tracks/rag_phase4_sync_fix_20260610/metadata.json create mode 100644 conductor/tracks/rag_phase4_sync_fix_20260610/plan.md create mode 100644 conductor/tracks/rag_phase4_sync_fix_20260610/spec.md create mode 100644 conductor/tracks/rag_phase4_sync_fix_20260610/state.toml diff --git a/conductor/tracks/rag_phase4_sync_fix_20260610/metadata.json b/conductor/tracks/rag_phase4_sync_fix_20260610/metadata.json new file mode 100644 index 00000000..bb0a9dd8 --- /dev/null +++ b/conductor/tracks/rag_phase4_sync_fix_20260610/metadata.json @@ -0,0 +1,41 @@ +{ + "track_id": "rag_phase4_sync_fix_20260610", + "name": "Fix RAG phase 4 final verify test - sync never reaches 'ready' (2026-06-10)", + "created_at": "2026-06-10", + "status": "spec", + "priority": "A", + "blocked_by": [], + "blocks": [], + "inherits_from": [ + "conductor/tracks/mma_tier_usage_reset_fix_20260610/" + ], + "supersedes": [], + "domain": "RAG (live_gui integration test)", + "scope_summary": "One pre-existing bug in src/rag_engine.py or src/app_controller.py: tests/test_rag_phase4_final_verify.py::test_phase4_final_verify fails because rag_status stays at 'idle' after the test sets rag_enabled/rag_source/rag_emb_provider via the Hook API. The _do_rag_sync worker either never runs, never sets the status, or the status is reset before the test polls. Discovered as the out-of-scope failure that halted the tier-3-live_gui batch during the mma_tier_usage_reset_fix_20260610 verification run on 2026-06-10.", + "estimated_effort": "1-2 hours", + "phases": 1, + "verification_criteria": [ + "tests/test_rag_phase4_final_verify.py::test_phase4_final_verify passes in isolation", + "tests/test_rag_phase4_final_verify.py::test_phase4_final_verify passes in the tier-3-live_gui full batch (or at least gets past it without halting)", + "tests/test_extended_sims.py::test_context_sim_live still passes in batch (regression check)", + "All 4 sim tests in tests/test_extended_sims.py still pass in isolation (regression check)" + ], + "out_of_scope": [ + "Refactoring _do_rag_sync logic", + "Changing the RAG test design", + "Adding new RAG features", + "Updating documentation", + "Follow-up tracks" + ], + "risks": [ + { + "risk": "RAG test requires sentence-transformers, which may not be installed", + "mitigation": "Check installation first; if missing, document the install command and consider marking the test with skipif marker" + }, + { + "risk": "The fix might break other RAG tests that depend on the current behavior", + "mitigation": "Run all RAG tests in the test_rag_*.py files to verify regression" + } + ], + "tier_2_supervision_required_for": [] +} diff --git a/conductor/tracks/rag_phase4_sync_fix_20260610/plan.md b/conductor/tracks/rag_phase4_sync_fix_20260610/plan.md new file mode 100644 index 00000000..34ad3759 --- /dev/null +++ b/conductor/tracks/rag_phase4_sync_fix_20260610/plan.md @@ -0,0 +1,118 @@ +# RAG Phase 4 Sync Fix — Implementation Plan (2026-06-10) + +> **For Tier 3 workers:** Steps use checkbox (`- [ ]`) syntax. Scope is 1-2 line surgical fix. Do not refactor `_do_rag_sync` more than necessary. + +**Goal:** Fix `tests/test_rag_phase4_final_verify.py::test_phase4_final_verify` so `rag_status` reaches `'ready'` after the test configures RAG via the Hook API. + +**Tech Stack:** Python 3.11+, pytest. + +**HARD CONSTRAINTS:** +- **NEVER** use `git checkout -- `, `git restore`, `git reset` (AGENTS.md HARD BAN) +- 1-space indent, CRLF, type hints +- 1 atomic commit +- No "while we're at it" refactors + +--- + +## Phase 1: Diagnose and fix + +### Task 1.1: Diagnose the failure mode + +- [ ] **Step 1.1.1: Read the exact current code** + Use `manual-slop_py_get_skeleton` or `manual-slop_get_file_slice` on `src/app_controller.py:1463-1500` and `src/rag_engine.py:88-180`. + +- [ ] **Step 1.1.2: Add temporary diagnostic logging** + Add 1-line stderr prints in `_do_rag_sync` to see what's happening: + - After `if token != self._rag_sync_token: return`: print f"[RAG_DIAG] stale token {token} != current {self._rag_sync_token}, returning" + - Before `self._set_rag_status("initializing...")`: print f"[RAG_DIAG] running sync for token {token}" + - After setting status to "ready": print f"[RAG_DIAG] set status to 'ready' for token {token}" + - In the except branch: print the exception (the existing code already does this) + + Use `manual-slop_edit_file` to add the diagnostic lines. + +- [ ] **Step 1.1.3: Run the failing test in isolation** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_rag_phase4_final_verify.py::test_phase4_final_verify -v --timeout=120 -s 2>&1 | Tee-Object -FilePath "tests/artifacts/rag_diag_20260610.log" | Select-Object -Last 80 + ``` + Expected: see the diagnostic output in stderr. + +- [ ] **Step 1.1.4: Read the diagnostic log and predict the failure mode** + Open `tests/artifacts/rag_diag_20260610.log` and look for `[RAG_DIAG]` lines. Determine: + - Did the worker for the latest token run? + - Did it set status to "ready" or did it error? + - Was there a race condition where multiple workers ran but the last one never completed? + +### Task 1.2: Apply the fix + +- [ ] **Step 1.2.1: Apply the fix in src/app_controller.py or src/rag_engine.py** + Based on Step 1.1.4's diagnosis, apply a 1-2 line fix. Most likely candidates: + - (a) Force the last worker to actually run by serializing them in the io_pool (not feasible without restructuring) + - (b) Use a `threading.Semaphore(1)` to ensure only ONE RAG sync runs at a time + - (c) Remove the coalescing complexity — each setter just runs sync directly + - (d) Fix the RAGEngine init to handle missing sentence-transformers gracefully (e.g., fall back to a mock provider) + +- [ ] **Step 1.2.2: Remove the diagnostic logging** + After the fix is verified, remove the `[RAG_DIAG]` lines from `src/app_controller.py`. (Diagnostic code does not ship in production per AGENTS.md.) + +- [ ] **Step 1.2.3: Verify syntax** + ```powershell + cd C:\projects\manual_slop; uv run python -c "import ast; ast.parse(open('src/app_controller.py').read()); print('OK')" + ``` + +- [ ] **Step 1.2.4: Verify import** + ```powershell + cd C:\projects\manual_slop; uv run python -c "from src.app_controller import AppController; print('import OK')" + ``` + +### Task 1.3: Verify in isolation + +- [ ] **Step 1.3.1: Run the RAG test in isolation** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_rag_phase4_final_verify.py::test_phase4_final_verify -v --timeout=120 + ``` + Expected: 1/1 pass. + +### Task 1.4: Verify in batch + +- [ ] **Step 1.4.1: Run all 4 sim tests in isolation (regression check)** + ```powershell + cd C:\projects\manual_slop; uv run pytest tests/test_extended_sims.py -v --timeout=300 + ``` + Expected: 4/4 pass. + +- [ ] **Step 1.4.2: Run the full tier-3-live_gui batch (authoritative)** + ```powershell + cd C:\projects\manual_slop; uv run .\scripts\run_tests_batched.py 2>&1 | Tee-Object -FilePath "tests/artifacts/post_rag_fix_batch_20260610.log" | Select-Object -Last 50 + ``` + Expected: tier-1 5/5, tier-2 5/5, tier-3 either completes fully or only halts on a DIFFERENT (unrelated) pre-existing failure. + +### Task 1.5: Checkpoint commit + +- [ ] **Step 1.5.1: Commit the fix** + ```powershell + cd C:\projects\manual_slop; git add src/app_controller.py src/rag_engine.py + git commit -m "fix(rag): [describe the actual fix]" + $h = git log -1 --format='%H' + git notes add -m "..." $h + ``` + +- [ ] **Step 1.5.2: Checkpoint commit with batch log** + ```powershell + cd C:\projects\manual_slop; git add -f tests/artifacts/post_rag_fix_batch_20260610.log + git commit -m "conductor(checkpoint): RAG phase 4 sync fix complete" + $h = git log -1 --format='%H' + git notes add -m "..." $h + ``` + +--- + +## Final Verification + +- [ ] `test_rag_phase4_final_verify.py::test_phase4_final_verify` passes in isolation +- [ ] 4 sim tests in `test_extended_sims.py` pass in isolation (regression) +- [ ] Full tier-3-live_gui batch: at least gets past `test_rag_phase4_final_verify` +- [ ] 1 atomic commit + 1 checkpoint + +## Track Done + +After the fix and verification, the track is DONE. diff --git a/conductor/tracks/rag_phase4_sync_fix_20260610/spec.md b/conductor/tracks/rag_phase4_sync_fix_20260610/spec.md new file mode 100644 index 00000000..09fea985 --- /dev/null +++ b/conductor/tracks/rag_phase4_sync_fix_20260610/spec.md @@ -0,0 +1,158 @@ +# RAG Phase 4 Sync Fix — Specification (2026-06-10) + +## Overview + +This track fixes a pre-existing RAG test failure that halted the `tier-3-live_gui` batch during the `mma_tier_usage_reset_fix_20260610` verification run on 2026-06-10. + +**The bug:** `tests/test_rag_phase4_final_verify.py::test_phase4_final_verify` fails because `rag_status` stays at `'idle'` after the test sets `rag_enabled=True`, `rag_source='chroma'`, `rag_emb_provider='local'` via the Hook API. The test polls for `rag_status == 'ready'` for 50 seconds (100 × 0.5s) and never sees it. + +The test was failing before any changes from the `mma_tier_usage_reset_fix_20260610` track. It is a pre-existing fragility in the RAG sync flow that the previous track's batch run exposed. + +## Reproduction (already verified) + +```powershell +cd C:\projects\manual_slop; uv run pytest tests/test_rag_phase4_final_verify.py::test_phase4_final_verify -v --timeout=120 +``` + +**Result:** 1 failed in 57.39s — `AssertionError: RAG sync failed. Status: idle` + +## Suspected root cause + +Looking at `src/app_controller.py:1463-1500`: + +```python +def _sync_rag_engine(self) -> None: + with self._rag_sync_lock: + self._rag_sync_token += 1 + self._rag_sync_dirty = True + token = self._rag_sync_token + self.submit_io(lambda: self._do_rag_sync(token)) + +def _do_rag_sync(self, token: int) -> None: + while True: + with self._rag_sync_lock: + if token != self._rag_sync_token: + return # ← BUG: returns silently + self._rag_sync_dirty = False + self._set_rag_status("initializing...") # ← only sets after the check + ... +``` + +The coalescing logic is the prime suspect: if 5 setters are called in quick succession (`rag_collection_name`, `files`, `rag_enabled`, `rag_source`, `rag_emb_provider`), each increments the token and submits a worker. The 5 workers all run concurrently. The first worker checks `if token != self._rag_sync_token` — the token from the first call is now stale (token 1 vs current 5), so it returns without setting status. The second worker (token 2) also returns. The third worker (token 3) also returns. Only the LAST worker (token 5) actually proceeds and sets status. + +But the io_pool has limited concurrency (4 workers in startup_speedup_20260606, plus more in `_io_pool` for general use). With 5 setters fired in quick succession from the API, 5 workers are submitted. They all race. The LAST one to acquire `_rag_sync_lock` wins. + +This SHOULD work — only the worker with the latest token should set the status. But there's a subtle race: if worker for token 5 acquires the lock first, sees its own token, and proceeds. But what if all 5 workers start before any of them acquires the lock? Then the order of acquisition is non-deterministic. + +Looking more carefully: the first worker (token 1) runs, acquires lock, sees token=1 but current=5, returns. Now `self._rag_sync_dirty` is whatever it was BEFORE the first worker (let's say False, because no one has set it True yet — wait, but token 1's setter set `self._rag_sync_dirty = True` BEFORE submitting). + +Actually, let me re-read: +```python +def _sync_rag_engine(self) -> None: + with self._rag_sync_lock: + self._rag_sync_token += 1 + self._rag_sync_dirty = True + token = self._rag_sync_token + self.submit_io(lambda: self._do_rag_sync(token)) +``` + +So each setter: +1. Acquires lock +2. Increments token +3. Sets dirty=True +4. Releases lock +5. Captures `token` (the new value) +6. Submits worker with the captured `token` + +So worker 1 captures token=1, worker 5 captures token=5. All 5 workers are submitted. + +In `_do_rag_sync`: +```python +while True: + with self._rag_sync_lock: + if token != self._rag_sync_token: + return # stale, return + self._rag_sync_dirty = False + self._set_rag_status("initializing...") + # ... do work ... + with self._rag_sync_lock: + if not self._rag_sync_dirty: + return # no more setters, done + token = self._rag_sync_token + self._rag_sync_dirty = False +``` + +So worker 1 acquires lock, sees token (1) != self._rag_sync_token (5), returns immediately. Worker 2 same. Worker 3 same. Worker 4 same. Worker 5 acquires lock, sees token (5) == self._rag_sync_token (5), proceeds. Sets status to "initializing...". Does work. Then checks dirty; if no more setters, returns. Sets status to "ready". + +This SHOULD work. So why doesn't it? + +Possibility 1: The io_pool doesn't process the 5th worker. Maybe the io_pool is full with other work (the test sets a lot of other things, all going through submit_io). + +Possibility 2: The worker for token 5 crashes before setting status. The except branch sets status to "error: ...", not "ready". But the test shows "idle", not "error: ...". + +Possibility 3: The status is reset by something else. Looking at `_handle_reset_session`: +```python +self.rag_status = 'idle' +``` +But the test doesn't call reset. + +Possibility 4: The test is checking the wrong state. The Hook API's `get_value` might be returning a cached value. + +Let me look at how `get_value` works in the API hooks. + +## Diagnostic plan + +1. Add a print or log line in `_do_rag_sync` to see if it's being called and with what token +2. Add a print after `_set_rag_status` to see what status is being set +3. Run the test and observe +4. Once we know the actual failure mode, fix it + +## Goals + +1. The RAG phase 4 test passes in isolation +2. The RAG phase 4 test passes in the full tier-3-live_gui batch (or at least doesn't halt it) +3. No regression in the 4 sim tests in tests/test_extended_sims.py +4. No regression in other RAG tests in tests/test_rag_*.py + +## Non-Goals + +- Refactoring `_do_rag_sync` (just fix the bug) +- Changing the RAG test design +- Adding new RAG features +- Updating documentation +- Filing follow-up tracks + +## Functional Requirements + +### FR1. RAG sync reaches 'ready' after configuration + +**Where:** `src/app_controller.py` (or `src/rag_engine.py` if the issue is in RAGEngine init) + +**What:** After the test sets `rag_enabled=True`, `rag_source='chroma'`, `rag_emb_provider='local'`, the `_do_rag_sync` worker must complete and set `rag_status='ready'` (or 'error: ...' with a clear message if it can't). + +**Why:** The RAG test polls for 'ready' and fails if it doesn't see it within 50s. + +**Acceptance:** +- `test_rag_phase4_final_verify.py::test_phase4_final_verify` passes +- 4 sim tests in `test_extended_sims.py` still pass + +## Non-Functional Requirements + +- NFR1: 1-2 line fix, surgical +- NFR2: No new dependencies +- NFR3: 1 atomic commit + +## Architecture Reference + +- `src/app_controller.py:1463-1500`: `_sync_rag_engine` + `_do_rag_sync` (the coalescing logic) +- `src/app_controller.py:1848-1852`: rag_config initialization in project load +- `src/rag_engine.py:22-53`: lazy imports (`_get_sentence_transformers`, etc.) +- `src/rag_engine.py:88-108`: RAGEngine `__init__` + `_init_embedding_provider` +- `tests/test_rag_phase4_final_verify.py`: the failing test + +## Out of Scope + +- Refactoring `_do_rag_sync` to a state machine +- Adding observability/metrics to the RAG sync +- Speeding up RAG startup +- Adding new RAG embedding providers diff --git a/conductor/tracks/rag_phase4_sync_fix_20260610/state.toml b/conductor/tracks/rag_phase4_sync_fix_20260610/state.toml new file mode 100644 index 00000000..bca34f66 --- /dev/null +++ b/conductor/tracks/rag_phase4_sync_fix_20260610/state.toml @@ -0,0 +1,38 @@ +# Track state for rag_phase4_sync_fix_20260610 +# Updated by executing agent as tasks complete + +[meta] +track_id = "rag_phase4_sync_fix_20260610" +name = "Fix RAG phase 4 final verify test - sync never reaches 'ready' (2026-06-10)" +status = "in_progress" +current_phase = 1 +last_updated = "2026-06-10" + +[blocked_by] +# No blockers. + +[blocks] +# This track blocks nothing. + +[phases] +phase_1 = { status = "in_progress", checkpointsha = "", name = "Diagnose why _do_rag_sync doesn't set status, apply fix, verify in batch" } + +[tasks] +t1_1 = { status = "in_progress", commit_sha = "", description = "Diagnose: why does rag_status stay at 'idle' after setters call _sync_rag_engine?" } +t1_2 = { status = "pending", commit_sha = "", description = "Apply fix in src/rag_engine.py or src/app_controller.py" } +t1_3 = { status = "pending", commit_sha = "", description = "Verify test_rag_phase4_final_verify.py::test_phase4_final_verify passes in isolation" } +t1_4 = { status = "pending", commit_sha = "", description = "Verify test_rag_phase4_final_verify.py::test_phase4_final_verify passes in tier-3-live_gui full batch" } +t1_5 = { status = "pending", commit_sha = "", description = "Run regression: test_extended_sims.py 4 sim tests + other RAG tests" } +t1_6 = { status = "pending", commit_sha = "", description = "Checkpoint commit" } + +[verification] +diagnosis_complete = false +fix_applied = false +isolated_test_passes = false +batch_test_passes = false +regression_clean = false + +[baseline_capture] +# Captured from the 2026-06-10 isolation run +isolated_status = "FAIL: tests/test_rag_phase4_final_verify.py::test_phase4_final_verify - AssertionError: RAG sync failed. Status: idle" +batch_status = "FAIL: tier-3-live_gui halted at this test (after test_extended_sims.py::test_context_sim_live PASSED 87.10s and test_tools_sim_live PASSED 58.50s)"