Private
Public Access
0
0

conductor(track): metadata.json for doeh_test_thinking_cleanup_20260615

This commit is contained in:
2026-06-15 12:21:16 -04:00
parent 304f469663
commit 88bf04eb3d
@@ -0,0 +1,326 @@
{
"track_id": "doeh_test_thinking_cleanup_20260615",
"name": "Data-Oriented Error Handling Test & Thinking-Parser Cleanup",
"initialized": "2026-06-15",
"owner": "tier2-tech-lead",
"priority": "high",
"status": "active",
"type": "bugfix + test_cleanup + refactor + documentation",
"scope": {
"new_files": [
"tests/test_gemini_thinking_format.py"
],
"modified_files": [
"src/app_controller.py",
"src/ai_client.py",
"src/thinking_parser.py",
"tests/test_llama_provider.py",
"tests/test_llama_ollama_native.py",
"tests/test_grok_provider.py",
"tests/test_ai_client_tool_loop_builder.py",
"tests/test_headless_service.py",
"tests/test_thinking_trace.py",
"conductor/tracks/ai_loop_regressions_20260614/state.toml",
"conductor/tracks.md",
"docs/guide_ai_client.md"
]
},
"blocked_by": [],
"blocks": [],
"estimated_phases": 5,
"spec": "spec.md",
"plan": "plan.md",
"regressions_and_deferred_items": [
{
"id": "G1_api_generate_name_error",
"severity": "CRITICAL",
"category": "production_regression",
"introduced_by": "ai_loop_regressions_20260614 commit 2b7b571a (FR2 fix)",
"file_line": "src/app_controller.py:265-295",
"symptom": "/api/v1/generate returns HTTP 500 with NameError: name 'context_to_send' is not defined",
"fix_phase": 1,
"fix_size_lines": 3,
"fix": "Restore the 3 lines that were removed: `with controller._disc_entries_lock:`, `has_ai_response = ...`, and `context_to_send = stable_md if not has_ai_response else ''`"
},
{
"id": "G2_grok_uses_xai_endpoint",
"severity": "high",
"category": "test_mock_bug",
"introduced_by": "data_oriented_error_handling_20260606 commit 64b787b8 (ProviderError removal + _send_* rename)",
"file_line": "tests/test_grok_provider.py:13",
"fix_phase": 2,
"fix": "Change `assert result == 'hi from grok'` to `assert result.ok and result.data == 'hi from grok'`"
},
{
"id": "G3_grok_web_search",
"severity": "high",
"category": "test_mock_bug",
"introduced_by": "data_oriented_error_handling_20260606 (tool loop refactor)",
"file_line": "tests/test_grok_provider.py:30",
"symptom": "captured_kwargs has 12 entries instead of 1 (tool loop calls multiple times)",
"fix_phase": 2,
"fix": "Change `assert len(captured_kwargs) == 1` and `captured_kwargs[0][...]` to check across all kwargs with any()"
},
{
"id": "G4_grok_x_search",
"severity": "high",
"category": "test_mock_bug",
"introduced_by": "data_oriented_error_handling_20260606 (tool loop refactor)",
"file_line": "tests/test_grok_provider.py:46",
"fix_phase": 2,
"fix": "Same as G3 — change captured_kwargs[0] to any() across all kwargs"
},
{
"id": "G5_llama_openrouter",
"severity": "high",
"category": "test_mock_bug",
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
"file_line": "tests/test_llama_provider.py:24",
"fix_phase": 2,
"fix": "Change `assert result == 'hi from openrouter'` to `assert result.ok and result.data == 'hi from openrouter'`"
},
{
"id": "G6_llama_custom_url",
"severity": "high",
"category": "test_mock_bug",
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
"file_line": "tests/test_llama_provider.py:43",
"fix_phase": 2,
"fix": "Same as G5"
},
{
"id": "G7_llama_ollama_backend",
"severity": "high",
"category": "test_mock_bug",
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
"file_line": "tests/test_llama_provider.py:62",
"fix_phase": 2,
"fix": "Change `assert 'hi from ollama' in result` to `assert result.ok and 'hi from ollama' in result.data`"
},
{
"id": "G8_llama_native_calls_ollama_chat",
"severity": "high",
"category": "test_mock_bug",
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
"file_line": "tests/test_llama_ollama_native.py:70",
"fix_phase": 2,
"fix": "Same as G7"
},
{
"id": "G9_llama_native_preserves_thinking",
"severity": "high",
"category": "test_mock_bug",
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
"file_line": "tests/test_llama_ollama_native.py:88",
"fix_phase": 2,
"fix": "Same as G7"
},
{
"id": "G10_llama_routes_to_native",
"severity": "high",
"category": "test_mock_bug",
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
"file_line": "tests/test_llama_ollama_native.py:107",
"fix_phase": 2,
"fix": "Same as G7"
},
{
"id": "G11_llama_keeps_openai_path",
"severity": "high",
"category": "test_mock_bug",
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
"file_line": "tests/test_llama_ollama_native.py:122",
"fix_phase": 2,
"fix": "Same as G7"
},
{
"id": "G12_ai_client_tool_loop_builder",
"severity": "high",
"category": "test_mock_shape_bug",
"introduced_by": "data_oriented_error_handling_20260606 commit 3aa7bdca (NormalizedResponse return shape)",
"file_line": "tests/test_ai_client_tool_loop_builder.py:33",
"symptom": "_default_send does `if not res.ok:` expecting Result[NormalizedResponse]; mock returns raw NormalizedResponse",
"fix_phase": 2,
"fix": "Wrap the mock return in Result(data=...) — Result(data=tool_response), Result(data=final)"
},
{
"id": "G13_headless_service_test_generate",
"severity": "high",
"category": "test_mock_bug",
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
"file_line": "tests/test_headless_service.py:57",
"symptom": "Mocks ai_client.send (deprecated); production now uses send_result. Test returns 500 due to G1 NameError + mock mismatch.",
"fix_phase": 2,
"fix": "Change `patch('src.ai_client.send', return_value='AI Response')` to `patch('src.ai_client.send_result', return_value=Result(data='AI Response'))`; update assertion to use .data"
},
{
"id": "G14_gemini_thinking_format",
"severity": "medium",
"category": "deferred_bug",
"introduced_by": "pre-existing limitation (not from data_oriented_error_handling refactor)",
"file_line": "src/ai_client.py:_send_gemini (lines 1538-1781), _send_gemini_cli (lines 1783-1897)",
"symptom": "User complained that thinking monologues don't render for Gemini requests",
"fix_phase": 3,
"fix": "Empirical investigation: run a Gemini request that produces thinking, inspect resp.text, decide between (a) normalization pass in _send_gemini* or (b) extend parse_thinking_trace"
},
{
"id": "G15_think_half_width_marker",
"severity": "low",
"category": "deferred_bug",
"introduced_by": "pre-existing limitation (not from data_oriented_error_handling refactor)",
"file_line": "src/thinking_parser.py:9",
"symptom": "User screenshot 1 showed <think>...</think> format (half-width); current regex requires <thinking> (full-width)",
"fix_phase": 4,
"fix": "Extend the tag_pattern regex at line 9 to also match <think>...</think>"
},
{
"id": "G16_state_toml_duplicates",
"severity": "low",
"category": "housekeeping",
"introduced_by": "ai_loop_regressions_20260614 commit 01075222",
"file_line": "conductor/tracks/ai_loop_regressions_20260614/state.toml lines 23-26 and 46-58",
"symptom": "Python's tomllib.load() raises TOMLDecodeError: Cannot overwrite a value",
"fix_phase": 5,
"fix": "Delete the duplicate pending entries; keep only the completed entries with commit SHAs"
},
{
"id": "G17_tracks_md_row_24",
"severity": "low",
"category": "housekeeping",
"introduced_by": "ai_loop_regressions_20260614 (track shipped but tracks.md not updated)",
"file_line": "conductor/tracks.md:41",
"symptom": "Track row still says 'spec ✓, plan ✓, ready to start' though the track shipped on 2026-06-15",
"fix_phase": 5,
"fix": "Update status column or move to Recently Completed section"
}
],
"deferred_to_followup_tracks": [
{
"id": "public_api_migration_20260606",
"title": "Public API Result Migration",
"description": "Removes the deprecated ai_client.send() and migrates the remaining 5 production call sites + ~50 test call sites to send_result(). This track handles 11 of the 63 tests; the other ~50 are deferred.",
"blocks_field_in_tracks_md": true,
"track_status": "planned; not yet specced"
},
{
"id": "live_gui_mock_injection_20260615",
"title": "Live GUI Mock Injection Infrastructure",
"description": "Infrastructure for mock injection into the live_gui subprocess. Unblocks proper end-to-end live_gui + AI client tests (the ai_loop_regressions_20260614 smoke tests only verify Hook API substrate reachability).",
"blocks_field_in_tracks_md": false,
"track_status": "recommended; not yet specced"
},
{
"id": "test_rag_phase4_final_verify_fix",
"title": "test_rag_phase4_final_verify RAG flakiness fix",
"description": "Pre-existing RAG subsystem issue ('NoneType' object has no attribute 'get'). The error is in RAG config lookup code, not AI client code. A partial fix was attempted in commit 16412ad5 (RAG Phase 4 dim-mismatch recovery). Recommended as a separate RAG track.",
"blocks_field_in_tracks_md": false,
"track_status": "pre-existing; not caused by either data_oriented_error_handling or ai_loop_regressions tracks"
},
{
"id": "ui_polish_five_issues_20260302",
"title": "UI Polish Five Issues",
"description": "The 2 unrelated test failures (test_discussion_truncate_layout, test_log_management_refresh) are Phase 2 and Phase 3 of the UI Polish track. That track has its own spec and plan.",
"blocks_field_in_tracks_md": true,
"track_status": "ready to start; spec/plan in place; not caused by data_oriented_error_handling refactor"
}
],
"verification_criteria": {
"g1_api_generate_returns_200": "uv run pytest tests/test_headless_service.py::TestHeadlessAPI::test_generate_endpoint returns 200 (proves G1 fix)",
"g2_g12_test_mock_fixes_pass": "Full batched test suite has 11 fewer failures than the pre-track baseline (G2-G12)",
"g12_tool_loop_builder_passes": "uv run pytest tests/test_ai_client_tool_loop_builder.py::test_run_with_tool_loop_calls_request_builder_each_round passes",
"g13_headless_service_test_passes": "uv run pytest tests/test_headless_service.py::TestHeadlessAPI::test_generate_endpoint returns 200 (after G1 + G13 fixes)",
"g14_gemini_thinking_format_investigated": "Phase 3 produces an empirical finding (either a normalization pass in _send_gemini* or a parser extension), and a live_gui or unit test demonstrates the fix",
"g15_half_width_marker_supported": "tests/test_thinking_trace.py has 1+ new test for <think>...</think> marker; all existing tests still pass",
"g16_state_toml_parseable": "python -c 'import tomllib; tomllib.load(open(\"conductor/tracks/ai_loop_regressions_20260614/state.toml\",\"rb\"))' succeeds",
"g17_tracks_md_row_24_updated": "Row 24 in conductor/tracks.md reflects the track's completion (status column or section move)",
"full_suite_green": "uv run pytest tests/ shows no new failures beyond the deferred test_rag_phase4_final_verify and the 2 UI Polish tests",
"docs_updated": "docs/guide_ai_client.md 'See Also' section has 2 new cross-references: (1) this cleanup track; (2) public_api_migration_20260606"
},
"fr_to_phase_mapping": {
"FR1_fix_api_generate_name_error": {
"phase": 1,
"fix_files": ["src/app_controller.py:265-295"],
"test_files": ["tests/test_headless_service.py::TestHeadlessAPI::test_generate_endpoint"],
"min_test_count": 1
},
"FR2_FR3_test_mock_fixes": {
"phase": 2,
"fix_files": [
"tests/test_llama_provider.py",
"tests/test_llama_ollama_native.py",
"tests/test_grok_provider.py",
"tests/test_ai_client_tool_loop_builder.py",
"tests/test_headless_service.py"
],
"min_test_count": 11
},
"FR4_gemini_thinking_format": {
"phase": 3,
"fix_files": ["src/ai_client.py:_send_gemini", "src/ai_client.py:_send_gemini_cli", "src/thinking_parser.py:9"],
"test_files": ["tests/test_gemini_thinking_format.py (new)"],
"min_test_count": 1
},
"FR5_think_half_width_marker": {
"phase": 4,
"fix_files": ["src/thinking_parser.py:9"],
"test_files": ["tests/test_thinking_trace.py"],
"min_test_count": 1
},
"FR6_state_toml_cleanup": {
"phase": 5,
"fix_files": ["conductor/tracks/ai_loop_regressions_20260614/state.toml"],
"min_test_count": 0
},
"FR7_tracks_md_update": {
"phase": 5,
"fix_files": ["conductor/tracks.md"],
"min_test_count": 0
},
"FR8_regression_sweep_and_docs": {
"phase": 5,
"fix_files": ["docs/guide_ai_client.md"],
"min_test_count": 0
}
},
"estimated_effort": {
"phase_1": "10 min — 1 critical regression fix + 1 test verification",
"phase_2": "1.5 hours — 11 mechanical test mock fixes across 5 files",
"phase_3": "2-4 hours — empirical Gemini investigation + fix (uncertain duration depending on finding)",
"phase_4": "30 min — 1 regex extension + 1+ new test",
"phase_5": "1 hour — 4 housekeeping tasks (state.toml, tracks.md, sweep, docs)",
"total": "5-8 hours of Tier 2 work (0.5-1 day)"
},
"risk_register": {
"R1_api_generate_fix_breaks_fr2_fr3": {
"likelihood": "low",
"impact": "high",
"mitigation": "Fix only ADDS lines; doesn't modify existing logic. Function semantics match pre-ai_loop_regressions_20260614 state."
},
"R2_test_mock_fixes_introduce_subtle_failures": {
"likelihood": "low",
"impact": "low",
"mitigation": "Pattern is mechanical (assert result.ok then assert result.data); failure messages are clear if a test has a real bug"
},
"R3_gemini_investigation_needs_real_credentials": {
"likelihood": "medium",
"impact": "medium",
"mitigation": "Use a mock client that returns a realistic Gemini response with thinking content if real credentials unavailable; document the format assumption"
},
"R4_think_regex_greedy": {
"likelihood": "low",
"impact": "low",
"mitigation": "Use re.DOTALL + non-greedy .*? (consistent with existing pattern); existing 5+ tests catch regressions"
},
"R5_state_toml_cleanup_deletes_wrong_lines": {
"likelihood": "very_low",
"impact": "high",
"mitigation": "Only delete the duplicate 'pending' entries; the 'completed' entries with commit SHAs must be preserved. Fix is mechanical and verifiable by re-running tomllib.load()"
}
}
}