conductor(track): metadata.json for doeh_test_thinking_cleanup_20260615

2026-06-15 12:21:16 -04:00
parent 304f469663
commit 88bf04eb3d
1 changed files with 326 additions and 0 deletions
@@ -0,0 +1,326 @@
+{
+  "track_id": "doeh_test_thinking_cleanup_20260615",
+  "name": "Data-Oriented Error Handling Test & Thinking-Parser Cleanup",
+  "initialized": "2026-06-15",
+  "owner": "tier2-tech-lead",
+  "priority": "high",
+  "status": "active",
+  "type": "bugfix + test_cleanup + refactor + documentation",
+  "scope": {
+    "new_files": [
+      "tests/test_gemini_thinking_format.py"
+    ],
+    "modified_files": [
+      "src/app_controller.py",
+      "src/ai_client.py",
+      "src/thinking_parser.py",
+      "tests/test_llama_provider.py",
+      "tests/test_llama_ollama_native.py",
+      "tests/test_grok_provider.py",
+      "tests/test_ai_client_tool_loop_builder.py",
+      "tests/test_headless_service.py",
+      "tests/test_thinking_trace.py",
+      "conductor/tracks/ai_loop_regressions_20260614/state.toml",
+      "conductor/tracks.md",
+      "docs/guide_ai_client.md"
+    ]
+  },
+  "blocked_by": [],
+  "blocks": [],
+  "estimated_phases": 5,
+  "spec": "spec.md",
+  "plan": "plan.md",
+
+  "regressions_and_deferred_items": [
+    {
+      "id": "G1_api_generate_name_error",
+      "severity": "CRITICAL",
+      "category": "production_regression",
+      "introduced_by": "ai_loop_regressions_20260614 commit 2b7b571a (FR2 fix)",
+      "file_line": "src/app_controller.py:265-295",
+      "symptom": "/api/v1/generate returns HTTP 500 with NameError: name 'context_to_send' is not defined",
+      "fix_phase": 1,
+      "fix_size_lines": 3,
+      "fix": "Restore the 3 removed lines (see fix_size_lines): `with controller._disc_entries_lock:`, `has_ai_response = ...`, and `context_to_send = stable_md if not has_ai_response else ''`"
+    },
+    {
+      "id": "G2_grok_uses_xai_endpoint",
+      "severity": "high",
+      "category": "test_mock_bug",
+      "introduced_by": "data_oriented_error_handling_20260606 commit 64b787b8 (ProviderError removal + _send_* rename)",
+      "file_line": "tests/test_grok_provider.py:13",
+      "fix_phase": 2,
+      "fix": "Change `assert result == 'hi from grok'` to `assert result.ok and result.data == 'hi from grok'`"
+    },
+    {
+      "id": "G3_grok_web_search",
+      "severity": "high",
+      "category": "test_mock_bug",
+      "introduced_by": "data_oriented_error_handling_20260606 (tool loop refactor)",
+      "file_line": "tests/test_grok_provider.py:30",
+      "symptom": "captured_kwargs has 12 entries instead of 1 (tool loop calls multiple times)",
+      "fix_phase": 2,
+      "fix": "Replace `assert len(captured_kwargs) == 1` and the `captured_kwargs[0][...]` lookups with an any() check across all entries in captured_kwargs"
+    },
+    {
+      "id": "G4_grok_x_search",
+      "severity": "high",
+      "category": "test_mock_bug",
+      "introduced_by": "data_oriented_error_handling_20260606 (tool loop refactor)",
+      "file_line": "tests/test_grok_provider.py:46",
+      "fix_phase": 2,
+      "fix": "Same as G3 — change captured_kwargs[0] to any() across all kwargs"
+    },
+    {
+      "id": "G5_llama_openrouter",
+      "severity": "high",
+      "category": "test_mock_bug",
+      "introduced_by": "data_oriented_error_handling_20260606 (Result API)",
+      "file_line": "tests/test_llama_provider.py:24",
+      "fix_phase": 2,
+      "fix": "Change `assert result == 'hi from openrouter'` to `assert result.ok and result.data == 'hi from openrouter'`"
+    },
+    {
+      "id": "G6_llama_custom_url",
+      "severity": "high",
+      "category": "test_mock_bug",
+      "introduced_by": "data_oriented_error_handling_20260606 (Result API)",
+      "file_line": "tests/test_llama_provider.py:43",
+      "fix_phase": 2,
+      "fix": "Same as G5"
+    },
+    {
+      "id": "G7_llama_ollama_backend",
+      "severity": "high",
+      "category": "test_mock_bug",
+      "introduced_by": "data_oriented_error_handling_20260606 (Result API)",
+      "file_line": "tests/test_llama_provider.py:62",
+      "fix_phase": 2,
+      "fix": "Change `assert 'hi from ollama' in result` to `assert result.ok and 'hi from ollama' in result.data`"
+    },
+    {
+      "id": "G8_llama_native_calls_ollama_chat",
+      "severity": "high",
+      "category": "test_mock_bug",
+      "introduced_by": "data_oriented_error_handling_20260606 (Result API)",
+      "file_line": "tests/test_llama_ollama_native.py:70",
+      "fix_phase": 2,
+      "fix": "Same as G7"
+    },
+    {
+      "id": "G9_llama_native_preserves_thinking",
+      "severity": "high",
+      "category": "test_mock_bug",
+      "introduced_by": "data_oriented_error_handling_20260606 (Result API)",
+      "file_line": "tests/test_llama_ollama_native.py:88",
+      "fix_phase": 2,
+      "fix": "Same as G7"
+    },
+    {
+      "id": "G10_llama_routes_to_native",
+      "severity": "high",
+      "category": "test_mock_bug",
+      "introduced_by": "data_oriented_error_handling_20260606 (Result API)",
+      "file_line": "tests/test_llama_ollama_native.py:107",
+      "fix_phase": 2,
+      "fix": "Same as G7"
+    },
+    {
+      "id": "G11_llama_keeps_openai_path",
+      "severity": "high",
+      "category": "test_mock_bug",
+      "introduced_by": "data_oriented_error_handling_20260606 (Result API)",
+      "file_line": "tests/test_llama_ollama_native.py:122",
+      "fix_phase": 2,
+      "fix": "Same as G7"
+    },
+    {
+      "id": "G12_ai_client_tool_loop_builder",
+      "severity": "high",
+      "category": "test_mock_shape_bug",
+      "introduced_by": "data_oriented_error_handling_20260606 commit 3aa7bdca (NormalizedResponse return shape)",
+      "file_line": "tests/test_ai_client_tool_loop_builder.py:33",
+      "symptom": "_default_send does `if not res.ok:` expecting Result[NormalizedResponse]; mock returns raw NormalizedResponse",
+      "fix_phase": 2,
+      "fix": "Wrap the mock return in Result(data=...) — Result(data=tool_response), Result(data=final)"
+    },
+    {
+      "id": "G13_headless_service_test_generate",
+      "severity": "high",
+      "category": "test_mock_bug",
+      "introduced_by": "data_oriented_error_handling_20260606 (Result API)",
+      "file_line": "tests/test_headless_service.py:57",
+      "symptom": "Mocks ai_client.send (deprecated); production now uses send_result. Test returns 500 due to G1 NameError + mock mismatch.",
+      "fix_phase": 2,
+      "fix": "Change `patch('src.ai_client.send', return_value='AI Response')` to `patch('src.ai_client.send_result', return_value=Result(data='AI Response'))`; update assertion to use .data"
+    },
+    {
+      "id": "G14_gemini_thinking_format",
+      "severity": "medium",
+      "category": "deferred_bug",
+      "introduced_by": "pre-existing limitation (not from data_oriented_error_handling refactor)",
+      "file_line": "src/ai_client.py:_send_gemini (lines 1538-1781), _send_gemini_cli (lines 1783-1897)",
+      "symptom": "User complained that thinking monologues don't render for Gemini requests",
+      "fix_phase": 3,
+      "fix": "Empirical investigation: run a Gemini request that produces thinking, inspect resp.text, decide between (a) normalization pass in _send_gemini* or (b) extend parse_thinking_trace"
+    },
+    {
+      "id": "G15_think_half_width_marker",
+      "severity": "low",
+      "category": "deferred_bug",
+      "introduced_by": "pre-existing limitation (not from data_oriented_error_handling refactor)",
+      "file_line": "src/thinking_parser.py:9",
+      "symptom": "User screenshot 1 showed the short <think>...</think> tag form ('half-width' marker); the current regex only matches the longer <thinking> tag ('full-width' marker)",
+      "fix_phase": 4,
+      "fix": "Extend the tag_pattern regex at line 9 to also match <think>...</think>"
+    },
+    {
+      "id": "G16_state_toml_duplicates",
+      "severity": "low",
+      "category": "housekeeping",
+      "introduced_by": "ai_loop_regressions_20260614 commit 01075222",
+      "file_line": "conductor/tracks/ai_loop_regressions_20260614/state.toml lines 23-26 and 46-58",
+      "symptom": "Python's tomllib.load() raises TOMLDecodeError: Cannot overwrite a value",
+      "fix_phase": 5,
+      "fix": "Delete the duplicate pending entries; keep only the completed entries with commit SHAs"
+    },
+    {
+      "id": "G17_tracks_md_row_24",
+      "severity": "low",
+      "category": "housekeeping",
+      "introduced_by": "ai_loop_regressions_20260614 (track shipped but tracks.md not updated)",
+      "file_line": "conductor/tracks.md:41",
+      "symptom": "Track row still says 'spec ✓, plan ✓, ready to start' though the track shipped on 2026-06-15",
+      "fix_phase": 5,
+      "fix": "Update status column or move to Recently Completed section"
+    }
+  ],
+
+  "deferred_to_followup_tracks": [
+    {
+      "id": "public_api_migration_20260606",
+      "title": "Public API Result Migration",
+      "description": "Removes the deprecated ai_client.send() and migrates the remaining 5 production call sites + ~50 test call sites to send_result(). The current cleanup track (doeh_test_thinking_cleanup_20260615) fixes 11 of the 63 tests; the other ~50 are deferred to this follow-up track.",
+      "blocks_field_in_tracks_md": true,
+      "track_status": "planned; not yet specced"
+    },
+    {
+      "id": "live_gui_mock_injection_20260615",
+      "title": "Live GUI Mock Injection Infrastructure",
+      "description": "Infrastructure for mock injection into the live_gui subprocess. Unblocks proper end-to-end live_gui + AI client tests (the ai_loop_regressions_20260614 smoke tests only verify Hook API substrate reachability).",
+      "blocks_field_in_tracks_md": false,
+      "track_status": "recommended; not yet specced"
+    },
+    {
+      "id": "test_rag_phase4_final_verify_fix",
+      "title": "test_rag_phase4_final_verify RAG flakiness fix",
+      "description": "Pre-existing RAG subsystem issue ('NoneType' object has no attribute 'get'). The error is in RAG config lookup code, not AI client code. A partial fix was attempted in commit 16412ad5 (RAG Phase 4 dim-mismatch recovery). Recommended as a separate RAG track.",
+      "blocks_field_in_tracks_md": false,
+      "track_status": "pre-existing; not caused by either data_oriented_error_handling or ai_loop_regressions tracks"
+    },
+    {
+      "id": "ui_polish_five_issues_20260302",
+      "title": "UI Polish Five Issues",
+      "description": "The 2 unrelated test failures (test_discussion_truncate_layout, test_log_management_refresh) are Phase 2 and Phase 3 of the UI Polish track. That track has its own spec and plan.",
+      "blocks_field_in_tracks_md": true,
+      "track_status": "ready to start; spec/plan in place; not caused by data_oriented_error_handling refactor"
+    }
+  ],
+
+  "verification_criteria": {
+    "g1_api_generate_returns_200": "uv run pytest tests/test_headless_service.py::TestHeadlessAPI::test_generate_endpoint returns 200 (proves G1 fix)",
+    "g2_g12_test_mock_fixes_pass": "Full batched test suite has 11 fewer failures than the pre-track baseline (G2-G12)",
+    "g12_tool_loop_builder_passes": "uv run pytest tests/test_ai_client_tool_loop_builder.py::test_run_with_tool_loop_calls_request_builder_each_round passes",
+    "g13_headless_service_test_passes": "uv run pytest tests/test_headless_service.py::TestHeadlessAPI::test_generate_endpoint returns 200 (after G1 + G13 fixes)",
+    "g14_gemini_thinking_format_investigated": "Phase 3 produces an empirical finding (either normalization pass in _send_gemini* or parser extension) + live_gui or unit test demonstrates the fix",
+    "g15_half_width_marker_supported": "tests/test_thinking_trace.py has 1+ new test for <think>...</think> marker; all existing tests still pass",
+    "g16_state_toml_parseable": "python -c 'import tomllib; tomllib.load(open(\"conductor/tracks/ai_loop_regressions_20260614/state.toml\",\"rb\"))' succeeds",
+    "g17_tracks_md_row_24_updated": "Row 24 in conductor/tracks.md reflects the track's completion (status column or section move)",
+    "full_suite_green": "uv run pytest tests/ shows no new failures beyond the deferred test_rag_phase4_final_verify and the 2 UI Polish tests",
+    "docs_updated": "docs/guide_ai_client.md 'See Also' section has 2 new cross-references: (1) this cleanup track; (2) public_api_migration_20260606"
+  },
+
+  "fr_to_phase_mapping": {
+    "FR1_fix_api_generate_name_error": {
+      "phase": 1,
+      "fix_files": ["src/app_controller.py:265-295"],
+      "test_files": ["tests/test_headless_service.py::TestHeadlessAPI::test_generate_endpoint"],
+      "min_test_count": 1
+    },
+    "FR2_FR3_test_mock_fixes": {
+      "phase": 2,
+      "fix_files": [
+        "tests/test_llama_provider.py",
+        "tests/test_llama_ollama_native.py",
+        "tests/test_grok_provider.py",
+        "tests/test_ai_client_tool_loop_builder.py",
+        "tests/test_headless_service.py"
+      ],
+      "min_test_count": 11
+    },
+    "FR4_gemini_thinking_format": {
+      "phase": 3,
+      "fix_files": ["src/ai_client.py:_send_gemini", "src/ai_client.py:_send_gemini_cli", "src/thinking_parser.py:9"],
+      "test_files": ["tests/test_gemini_thinking_format.py (new)"],
+      "min_test_count": 1
+    },
+    "FR5_think_half_width_marker": {
+      "phase": 4,
+      "fix_files": ["src/thinking_parser.py:9"],
+      "test_files": ["tests/test_thinking_trace.py"],
+      "min_test_count": 1
+    },
+    "FR6_state_toml_cleanup": {
+      "phase": 5,
+      "fix_files": ["conductor/tracks/ai_loop_regressions_20260614/state.toml"],
+      "min_test_count": 0
+    },
+    "FR7_tracks_md_update": {
+      "phase": 5,
+      "fix_files": ["conductor/tracks.md"],
+      "min_test_count": 0
+    },
+    "FR8_regression_sweep_and_docs": {
+      "phase": 5,
+      "fix_files": ["docs/guide_ai_client.md"],
+      "min_test_count": 0
+    }
+  },
+
+  "estimated_effort": {
+    "phase_1": "10 min — 1 critical regression fix + 1 test verification",
+    "phase_2": "1.5 hours — 11 mechanical test mock fixes across 5 files",
+    "phase_3": "2-4 hours — empirical Gemini investigation + fix (uncertain duration depending on finding)",
+    "phase_4": "30 min — 1 regex extension + 1+ new test",
+    "phase_5": "1 hour — 4 housekeeping tasks (state.toml, tracks.md, sweep, docs)",
+    "total": "5-8 hours of Tier 2 work (0.5-1 day)"
+  },
+
+  "risk_register": {
+    "R1_api_generate_fix_breaks_fr2_fr3": {
+      "likelihood": "low",
+      "impact": "high",
+      "mitigation": "Fix only ADDS lines; doesn't modify existing logic. Function semantics match pre-ai_loop_regressions_20260614 state."
+    },
+    "R2_test_mock_fixes_introduce_subtle_failures": {
+      "likelihood": "low",
+      "impact": "low",
+      "mitigation": "Pattern is mechanical (assert result.ok then assert result.data); failure messages are clear if a test has a real bug"
+    },
+    "R3_gemini_investigation_needs_real_credentials": {
+      "likelihood": "medium",
+      "impact": "medium",
+      "mitigation": "Use a mock client that returns a realistic Gemini response with thinking content if real credentials unavailable; document the format assumption"
+    },
+    "R4_think_regex_greedy": {
+      "likelihood": "low",
+      "impact": "low",
+      "mitigation": "Use re.DOTALL + non-greedy .*? (consistent with existing pattern); existing 5+ tests catch regressions"
+    },
+    "R5_state_toml_cleanup_deletes_wrong_lines": {
+      "likelihood": "very_low",
+      "impact": "high",
+      "mitigation": "Only delete the duplicate 'pending' entries; the 'completed' entries with commit SHAs must be preserved. Fix is mechanical and verifiable by re-running tomllib.load()"
+    }
+  }
+}