From 88bf04eb3d340dd438211c480aad0795f49df5b7 Mon Sep 17 00:00:00 2001 From: Ed_ Date: Mon, 15 Jun 2026 12:21:16 -0400 Subject: [PATCH] conductor(track): metadata.json for doeh_test_thinking_cleanup_20260615 --- .../metadata.json | 326 ++++++++++++++++++ 1 file changed, 326 insertions(+) create mode 100644 conductor/tracks/doeh_test_thinking_cleanup_20260615/metadata.json diff --git a/conductor/tracks/doeh_test_thinking_cleanup_20260615/metadata.json b/conductor/tracks/doeh_test_thinking_cleanup_20260615/metadata.json new file mode 100644 index 00000000..6e86d75c --- /dev/null +++ b/conductor/tracks/doeh_test_thinking_cleanup_20260615/metadata.json @@ -0,0 +1,326 @@ +{ + "track_id": "doeh_test_thinking_cleanup_20260615", + "name": "Data-Oriented Error Handling Test & Thinking-Parser Cleanup", + "initialized": "2026-06-15", + "owner": "tier2-tech-lead", + "priority": "high", + "status": "active", + "type": "bugfix + test_cleanup + refactor + documentation", + "scope": { + "new_files": [ + "tests/test_gemini_thinking_format.py" + ], + "modified_files": [ + "src/app_controller.py", + "src/ai_client.py", + "src/thinking_parser.py", + "tests/test_llama_provider.py", + "tests/test_llama_ollama_native.py", + "tests/test_grok_provider.py", + "tests/test_ai_client_tool_loop_builder.py", + "tests/test_headless_service.py", + "tests/test_thinking_trace.py", + "conductor/tracks/ai_loop_regressions_20260614/state.toml", + "conductor/tracks.md", + "docs/guide_ai_client.md" + ] + }, + "blocked_by": [], + "blocks": [], + "estimated_phases": 5, + "spec": "spec.md", + "plan": "plan.md", + + "regressions_and_deferred_items": [ + { + "id": "G1_api_generate_name_error", + "severity": "CRITICAL", + "category": "production_regression", + "introduced_by": "ai_loop_regressions_20260614 commit 2b7b571a (FR2 fix)", + "file_line": "src/app_controller.py:265-295", + "symptom": "/api/v1/generate returns HTTP 500 with NameError: name 'context_to_send' is not defined", + "fix_phase": 1, + 
"fix_size_lines": 3, + "fix": "Add back the 2 lines that were removed: with controller._disc_entries_lock: has_ai_response = ... and context_to_send = stable_md if not has_ai_response else ''" + }, + { + "id": "G2_grok_uses_xai_endpoint", + "severity": "high", + "category": "test_mock_bug", + "introduced_by": "data_oriented_error_handling_20260606 commit 64b787b8 (ProviderError removal + _send_* rename)", + "file_line": "tests/test_grok_provider.py:13", + "fix_phase": 2, + "fix": "Change `assert result == 'hi from grok'` to `assert result.ok and result.data == 'hi from grok'`" + }, + { + "id": "G3_grok_web_search", + "severity": "high", + "category": "test_mock_bug", + "introduced_by": "data_oriented_error_handling_20260606 (tool loop refactor)", + "file_line": "tests/test_grok_provider.py:30", + "symptom": "captured_kwargs has 12 entries instead of 1 (tool loop calls multiple times)", + "fix_phase": 2, + "fix": "Change `assert len(captured_kwargs) == 1` and `captured_kwargs[0][...]` to check across all kwargs with any()" + }, + { + "id": "G4_grok_x_search", + "severity": "high", + "category": "test_mock_bug", + "introduced_by": "data_oriented_error_handling_20260606 (tool loop refactor)", + "file_line": "tests/test_grok_provider.py:46", + "fix_phase": 2, + "fix": "Same as G3 — change captured_kwargs[0] to any() across all kwargs" + }, + { + "id": "G5_llama_openrouter", + "severity": "high", + "category": "test_mock_bug", + "introduced_by": "data_oriented_error_handling_20260606 (Result API)", + "file_line": "tests/test_llama_provider.py:24", + "fix_phase": 2, + "fix": "Change `assert result == 'hi from openrouter'` to `assert result.ok and result.data == 'hi from openrouter'`" + }, + { + "id": "G6_llama_custom_url", + "severity": "high", + "category": "test_mock_bug", + "introduced_by": "data_oriented_error_handling_20260606 (Result API)", + "file_line": "tests/test_llama_provider.py:43", + "fix_phase": 2, + "fix": "Same as G5" + }, + { + "id": 
"G7_llama_ollama_backend", + "severity": "high", + "category": "test_mock_bug", + "introduced_by": "data_oriented_error_handling_20260606 (Result API)", + "file_line": "tests/test_llama_provider.py:62", + "fix_phase": 2, + "fix": "Change `assert 'hi from ollama' in result` to `assert result.ok and 'hi from ollama' in result.data`" + }, + { + "id": "G8_llama_native_calls_ollama_chat", + "severity": "high", + "category": "test_mock_bug", + "introduced_by": "data_oriented_error_handling_20260606 (Result API)", + "file_line": "tests/test_llama_ollama_native.py:70", + "fix_phase": 2, + "fix": "Same as G7" + }, + { + "id": "G9_llama_native_preserves_thinking", + "severity": "high", + "category": "test_mock_bug", + "introduced_by": "data_oriented_error_handling_20260606 (Result API)", + "file_line": "tests/test_llama_ollama_native.py:88", + "fix_phase": 2, + "fix": "Same as G7" + }, + { + "id": "G10_llama_routes_to_native", + "severity": "high", + "category": "test_mock_bug", + "introduced_by": "data_oriented_error_handling_20260606 (Result API)", + "file_line": "tests/test_llama_ollama_native.py:107", + "fix_phase": 2, + "fix": "Same as G7" + }, + { + "id": "G11_llama_keeps_openai_path", + "severity": "high", + "category": "test_mock_bug", + "introduced_by": "data_oriented_error_handling_20260606 (Result API)", + "file_line": "tests/test_llama_ollama_native.py:122", + "fix_phase": 2, + "fix": "Same as G7" + }, + { + "id": "G12_ai_client_tool_loop_builder", + "severity": "high", + "category": "test_mock_shape_bug", + "introduced_by": "data_oriented_error_handling_20260606 commit 3aa7bdca (NormalizedResponse return shape)", + "file_line": "tests/test_ai_client_tool_loop_builder.py:33", + "symptom": "_default_send does `if not res.ok:` expecting Result[NormalizedResponse]; mock returns raw NormalizedResponse", + "fix_phase": 2, + "fix": "Wrap the mock return in Result(data=...) 
— Result(data=tool_response), Result(data=final)" + }, + { + "id": "G13_headless_service_test_generate", + "severity": "high", + "category": "test_mock_bug", + "introduced_by": "data_oriented_error_handling_20260606 (Result API)", + "file_line": "tests/test_headless_service.py:57", + "symptom": "Mocks ai_client.send (deprecated); production now uses send_result. Test returns 500 due to G1 NameError + mock mismatch.", + "fix_phase": 2, + "fix": "Change `patch('src.ai_client.send', return_value='AI Response')` to `patch('src.ai_client.send_result', return_value=Result(data='AI Response'))`; update assertion to use .data" + }, + { + "id": "G14_gemini_thinking_format", + "severity": "medium", + "category": "deferred_bug", + "introduced_by": "pre-existing limitation (not from data_oriented_error_handling refactor)", + "file_line": "src/ai_client.py:_send_gemini (lines 1538-1781), _send_gemini_cli (lines 1783-1897)", + "symptom": "User complained that thinking monologues don't render for Gemini requests", + "fix_phase": 3, + "fix": "Empirical investigation: run a Gemini request that produces thinking, inspect resp.text, decide between (a) normalization pass in _send_gemini* or (b) extend parse_thinking_trace" + }, + { + "id": "G15_think_half_width_marker", + "severity": "low", + "category": "deferred_bug", + "introduced_by": "pre-existing limitation (not from data_oriented_error_handling refactor)", + "file_line": "src/thinking_parser.py:9", + "symptom": "User screenshot 1 showed <think>...</think> format (half-width); current regex requires the full-width marker form", + "fix_phase": 4, + "fix": "Extend the tag_pattern regex at line 9 to also match <think>...</think>" 
+ }, + { + "id": "G16_state_toml_duplicates", + "severity": "low", + "category": "housekeeping", + "introduced_by": "ai_loop_regressions_20260614 commit 01075222", + "file_line": "conductor/tracks/ai_loop_regressions_20260614/state.toml lines 23-26 and 46-58", + "symptom": "Python's tomllib.load() raises TOMLDecodeError: Cannot overwrite a value", + "fix_phase": 5, + "fix": "Delete the duplicate pending entries; keep only the completed entries with commit SHAs" + }, + { + "id": "G17_tracks_md_row_24", + "severity": "low", + "category": "housekeeping", + "introduced_by": "ai_loop_regressions_20260614 (track shipped but tracks.md not updated)", + "file_line": "conductor/tracks.md:41", + "symptom": "Track row still says 'spec ✓, plan ✓, ready to start' though the track shipped on 2026-06-15", + "fix_phase": 5, + "fix": "Update status column or move to Recently Completed section" + } + ], + + "deferred_to_followup_tracks": [ + { + "id": "public_api_migration_20260606", + "title": "Public API Result Migration", + "description": "Removes the deprecated ai_client.send() and migrates the remaining 5 production call sites + ~50 test call sites to send_result(). This track handles 11 of the 63 tests; the other ~50 are deferred.", + "blocks_field_in_tracks_md": true, + "track_status": "planned; not yet specced" + }, + { + "id": "live_gui_mock_injection_20260615", + "title": "Live GUI Mock Injection Infrastructure", + "description": "Infrastructure for mock injection into the live_gui subprocess. Unblocks proper end-to-end live_gui + AI client tests (the ai_loop_regressions_20260614 smoke tests only verify Hook API substrate reachability).", + "blocks_field_in_tracks_md": false, + "track_status": "recommended; not yet specced" + }, + { + "id": "test_rag_phase4_final_verify_fix", + "title": "test_rag_phase4_final_verify RAG flakiness fix", + "description": "Pre-existing RAG subsystem issue ('NoneType' object has no attribute 'get'). 
The error is in RAG config lookup code, not AI client code. A partial fix was attempted in commit 16412ad5 (RAG Phase 4 dim-mismatch recovery). Recommended as a separate RAG track.", + "blocks_field_in_tracks_md": false, + "track_status": "pre-existing; not caused by either data_oriented_error_handling or ai_loop_regressions tracks" + }, + { + "id": "ui_polish_five_issues_20260302", + "title": "UI Polish Five Issues", + "description": "The 2 unrelated test failures (test_discussion_truncate_layout, test_log_management_refresh) are Phase 2 and Phase 3 of the UI Polish track. That track has its own spec and plan.", + "blocks_field_in_tracks_md": true, + "track_status": "ready to start; spec/plan in place; not caused by data_oriented_error_handling refactor" + } + ], + + "verification_criteria": { + "g1_api_generate_returns_200": "uv run pytest tests/test_headless_service.py::TestHeadlessAPI::test_generate_endpoint returns 200 (proves G1 fix)", + "g2_g12_test_mock_fixes_pass": "Full batched test suite has 11 fewer failures than the pre-track baseline (G2-G12)", + "g13_tool_loop_builder_passes": "uv run pytest tests/test_ai_client_tool_loop_builder.py::test_run_with_tool_loop_calls_request_builder_each_round passes", + "g14_headless_service_test_passes": "uv run pytest tests/test_headless_service.py::TestHeadlessAPI::test_generate_endpoint returns 200 (after G1 + G13 fixes)", + "g15_gemini_thinking_format_investigated": "Phase 3 produces an empirical finding (either normalization pass in _send_gemini* or parser extension) + live_gui or unit test demonstrates the fix", + "g16_half_width_marker_supported": "tests/test_thinking_trace.py has 1+ new test for <think>...</think> 
marker; all existing tests still pass", + "g17_state_toml_parseable": "python -c 'import tomllib; tomllib.load(open(\"conductor/tracks/ai_loop_regressions_20260614/state.toml\",\"rb\"))' succeeds", + "g18_tracks_md_row_24_updated": "Row 24 in conductor/tracks.md reflects the track's completion (status column or section move)", + "full_suite_green": "uv run pytest tests/ shows no new failures beyond the deferred test_rag_phase4_final_verify and the 2 UI Polish tests", + "docs_updated": "docs/guide_ai_client.md 'See Also' section has 2 new cross-references: (1) this cleanup track; (2) public_api_migration_20260606" + }, + + "fr_to_phase_mapping": { + "FR1_fix_api_generate_name_error": { + "phase": 1, + "fix_files": ["src/app_controller.py:265-295"], + "test_files": ["tests/test_headless_service.py::TestHeadlessAPI::test_generate_endpoint"], + "min_test_count": 1 + }, + "FR2_FR3_test_mock_fixes": { + "phase": 2, + "fix_files": [ + "tests/test_llama_provider.py", + "tests/test_llama_ollama_native.py", + "tests/test_grok_provider.py", + "tests/test_ai_client_tool_loop_builder.py", + "tests/test_headless_service.py" + ], + "min_test_count": 11 + }, + "FR4_gemini_thinking_format": { + "phase": 3, + "fix_files": ["src/ai_client.py:_send_gemini", "src/ai_client.py:_send_gemini_cli", "src/thinking_parser.py:9"], + "test_files": ["tests/test_gemini_thinking_format.py (new)"], + "min_test_count": 1 + }, + "FR5_think_half_width_marker": { + "phase": 4, + "fix_files": ["src/thinking_parser.py:9"], + "test_files": ["tests/test_thinking_trace.py"], + "min_test_count": 1 + }, + "FR6_state_toml_cleanup": { + "phase": 5, + "fix_files": ["conductor/tracks/ai_loop_regressions_20260614/state.toml"], + "min_test_count": 0 + }, + "FR7_tracks_md_update": { + "phase": 5, + "fix_files": ["conductor/tracks.md"], + "min_test_count": 0 + }, + "FR8_regression_sweep_and_docs": { + "phase": 5, + "fix_files": ["docs/guide_ai_client.md"], + "min_test_count": 0 + } + }, + + "estimated_effort": { + 
"phase_1": "10 min — 1 critical regression fix + 1 test verification", + "phase_2": "1.5 hours — 11 mechanical test mock fixes across 5 files", + "phase_3": "2-4 hours — empirical Gemini investigation + fix (uncertain duration depending on finding)", + "phase_4": "30 min — 1 regex extension + 1+ new test", + "phase_5": "1 hour — 4 housekeeping tasks (state.toml, tracks.md, sweep, docs)", + "total": "5-8 hours of Tier 2 work (0.5-1 day)" + }, + + "risk_register": { + "R1_api_generate_fix_breaks_fr2_fr3": { + "likelihood": "low", + "impact": "high", + "mitigation": "Fix only ADDS lines; doesn't modify existing logic. Function semantics match pre-ai_loop_regressions_20260614 state." + }, + "R2_test_mock_fixes_introduce_subtle_failures": { + "likelihood": "low", + "impact": "low", + "mitigation": "Pattern is mechanical (assert result.ok then assert result.data); failure messages are clear if a test has a real bug" + }, + "R3_gemini_investigation_needs_real_credentials": { + "likelihood": "medium", + "impact": "medium", + "mitigation": "Use a mock client that returns a realistic Gemini response with thinking content if real credentials unavailable; document the format assumption" + }, + "R4_think_regex_greedy": { + "likelihood": "low", + "impact": "low", + "mitigation": "Use re.DOTALL + non-greedy .*? (consistent with existing pattern); existing 5+ tests catch regressions" + }, + "R5_state_toml_cleanup_deletes_wrong_lines": { + "likelihood": "very_low", + "impact": "high", + "mitigation": "Only delete the duplicate 'pending' entries; the 'completed' entries with commit SHAs must be preserved. Fix is mechanical and verifiable by re-running tomllib.load()" + } + } +}