conductor(track): metadata.json for doeh_test_thinking_cleanup_20260615
This commit is contained in:
@@ -0,0 +1,326 @@
|
||||
{
|
||||
"track_id": "doeh_test_thinking_cleanup_20260615",
|
||||
"name": "Data-Oriented Error Handling Test & Thinking-Parser Cleanup",
|
||||
"initialized": "2026-06-15",
|
||||
"owner": "tier2-tech-lead",
|
||||
"priority": "high",
|
||||
"status": "active",
|
||||
"type": "bugfix + test_cleanup + refactor + documentation",
|
||||
"scope": {
|
||||
"new_files": [
|
||||
"tests/test_gemini_thinking_format.py"
|
||||
],
|
||||
"modified_files": [
|
||||
"src/app_controller.py",
|
||||
"src/ai_client.py",
|
||||
"src/thinking_parser.py",
|
||||
"tests/test_llama_provider.py",
|
||||
"tests/test_llama_ollama_native.py",
|
||||
"tests/test_grok_provider.py",
|
||||
"tests/test_ai_client_tool_loop_builder.py",
|
||||
"tests/test_headless_service.py",
|
||||
"tests/test_thinking_trace.py",
|
||||
"conductor/tracks/ai_loop_regressions_20260614/state.toml",
|
||||
"conductor/tracks.md",
|
||||
"docs/guide_ai_client.md"
|
||||
]
|
||||
},
|
||||
"blocked_by": [],
|
||||
"blocks": [],
|
||||
"estimated_phases": 5,
|
||||
"spec": "spec.md",
|
||||
"plan": "plan.md",
|
||||
|
||||
"regressions_and_deferred_items": [
|
||||
{
|
||||
"id": "G1_api_generate_name_error",
|
||||
"severity": "CRITICAL",
|
||||
"category": "production_regression",
|
||||
"introduced_by": "ai_loop_regressions_20260614 commit 2b7b571a (FR2 fix)",
|
||||
"file_line": "src/app_controller.py:265-295",
|
||||
"symptom": "/api/v1/generate returns HTTP 500 with NameError: name 'context_to_send' is not defined",
|
||||
"fix_phase": 1,
|
||||
"fix_size_lines": 3,
|
||||
"fix": "Add back the 2 lines that were removed: with controller._disc_entries_lock: has_ai_response = ... and context_to_send = stable_md if not has_ai_response else ''"
|
||||
},
|
||||
{
|
||||
"id": "G2_grok_uses_xai_endpoint",
|
||||
"severity": "high",
|
||||
"category": "test_mock_bug",
|
||||
"introduced_by": "data_oriented_error_handling_20260606 commit 64b787b8 (ProviderError removal + _send_* rename)",
|
||||
"file_line": "tests/test_grok_provider.py:13",
|
||||
"fix_phase": 2,
|
||||
"fix": "Change `assert result == 'hi from grok'` to `assert result.ok and result.data == 'hi from grok'`"
|
||||
},
|
||||
{
|
||||
"id": "G3_grok_web_search",
|
||||
"severity": "high",
|
||||
"category": "test_mock_bug",
|
||||
"introduced_by": "data_oriented_error_handling_20260606 (tool loop refactor)",
|
||||
"file_line": "tests/test_grok_provider.py:30",
|
||||
"symptom": "captured_kwargs has 12 entries instead of 1 (tool loop calls multiple times)",
|
||||
"fix_phase": 2,
|
||||
"fix": "Change `assert len(captured_kwargs) == 1` and `captured_kwargs[0][...]` to check across all kwargs with any()"
|
||||
},
|
||||
{
|
||||
"id": "G4_grok_x_search",
|
||||
"severity": "high",
|
||||
"category": "test_mock_bug",
|
||||
"introduced_by": "data_oriented_error_handling_20260606 (tool loop refactor)",
|
||||
"file_line": "tests/test_grok_provider.py:46",
|
||||
"fix_phase": 2,
|
||||
"fix": "Same as G3 — change captured_kwargs[0] to any() across all kwargs"
|
||||
},
|
||||
{
|
||||
"id": "G5_llama_openrouter",
|
||||
"severity": "high",
|
||||
"category": "test_mock_bug",
|
||||
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
|
||||
"file_line": "tests/test_llama_provider.py:24",
|
||||
"fix_phase": 2,
|
||||
"fix": "Change `assert result == 'hi from openrouter'` to `assert result.ok and result.data == 'hi from openrouter'`"
|
||||
},
|
||||
{
|
||||
"id": "G6_llama_custom_url",
|
||||
"severity": "high",
|
||||
"category": "test_mock_bug",
|
||||
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
|
||||
"file_line": "tests/test_llama_provider.py:43",
|
||||
"fix_phase": 2,
|
||||
"fix": "Same as G5"
|
||||
},
|
||||
{
|
||||
"id": "G7_llama_ollama_backend",
|
||||
"severity": "high",
|
||||
"category": "test_mock_bug",
|
||||
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
|
||||
"file_line": "tests/test_llama_provider.py:62",
|
||||
"fix_phase": 2,
|
||||
"fix": "Change `assert 'hi from ollama' in result` to `assert result.ok and 'hi from ollama' in result.data`"
|
||||
},
|
||||
{
|
||||
"id": "G8_llama_native_calls_ollama_chat",
|
||||
"severity": "high",
|
||||
"category": "test_mock_bug",
|
||||
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
|
||||
"file_line": "tests/test_llama_ollama_native.py:70",
|
||||
"fix_phase": 2,
|
||||
"fix": "Same as G7"
|
||||
},
|
||||
{
|
||||
"id": "G9_llama_native_preserves_thinking",
|
||||
"severity": "high",
|
||||
"category": "test_mock_bug",
|
||||
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
|
||||
"file_line": "tests/test_llama_ollama_native.py:88",
|
||||
"fix_phase": 2,
|
||||
"fix": "Same as G7"
|
||||
},
|
||||
{
|
||||
"id": "G10_llama_routes_to_native",
|
||||
"severity": "high",
|
||||
"category": "test_mock_bug",
|
||||
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
|
||||
"file_line": "tests/test_llama_ollama_native.py:107",
|
||||
"fix_phase": 2,
|
||||
"fix": "Same as G7"
|
||||
},
|
||||
{
|
||||
"id": "G11_llama_keeps_openai_path",
|
||||
"severity": "high",
|
||||
"category": "test_mock_bug",
|
||||
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
|
||||
"file_line": "tests/test_llama_ollama_native.py:122",
|
||||
"fix_phase": 2,
|
||||
"fix": "Same as G7"
|
||||
},
|
||||
{
|
||||
"id": "G12_ai_client_tool_loop_builder",
|
||||
"severity": "high",
|
||||
"category": "test_mock_shape_bug",
|
||||
"introduced_by": "data_oriented_error_handling_20260606 commit 3aa7bdca (NormalizedResponse return shape)",
|
||||
"file_line": "tests/test_ai_client_tool_loop_builder.py:33",
|
||||
"symptom": "_default_send does `if not res.ok:` expecting Result[NormalizedResponse]; mock returns raw NormalizedResponse",
|
||||
"fix_phase": 2,
|
||||
"fix": "Wrap the mock return in Result(data=...) — Result(data=tool_response), Result(data=final)"
|
||||
},
|
||||
{
|
||||
"id": "G13_headless_service_test_generate",
|
||||
"severity": "high",
|
||||
"category": "test_mock_bug",
|
||||
"introduced_by": "data_oriented_error_handling_20260606 (Result API)",
|
||||
"file_line": "tests/test_headless_service.py:57",
|
||||
"symptom": "Mocks ai_client.send (deprecated); production now uses send_result. Test returns 500 due to G1 NameError + mock mismatch.",
|
||||
"fix_phase": 2,
|
||||
"fix": "Change `patch('src.ai_client.send', return_value='AI Response')` to `patch('src.ai_client.send_result', return_value=Result(data='AI Response'))`; update assertion to use .data"
|
||||
},
|
||||
{
|
||||
"id": "G14_gemini_thinking_format",
|
||||
"severity": "medium",
|
||||
"category": "deferred_bug",
|
||||
"introduced_by": "pre-existing limitation (not from data_oriented_error_handling refactor)",
|
||||
"file_line": "src/ai_client.py:_send_gemini (lines 1538-1781), _send_gemini_cli (lines 1783-1897)",
|
||||
"symptom": "User complained that thinking monologues don't render for Gemini requests",
|
||||
"fix_phase": 3,
|
||||
"fix": "Empirical investigation: run a Gemini request that produces thinking, inspect resp.text, decide between (a) normalization pass in _send_gemini* or (b) extend parse_thinking_trace"
|
||||
},
|
||||
{
|
||||
"id": "G15_think_half_width_marker",
|
||||
"severity": "low",
|
||||
"category": "deferred_bug",
|
||||
"introduced_by": "pre-existing limitation (not from data_oriented_error_handling refactor)",
|
||||
"file_line": "src/thinking_parser.py:9",
|
||||
"symptom": "User screenshot 1 showed <think>...</think> format (half-width); current regex requires <thinking> (full-width)",
|
||||
"fix_phase": 4,
|
||||
"fix": "Extend the tag_pattern regex at line 9 to also match <think>...</think>"
|
||||
},
|
||||
{
|
||||
"id": "G16_state_toml_duplicates",
|
||||
"severity": "low",
|
||||
"category": "housekeeping",
|
||||
"introduced_by": "ai_loop_regressions_20260614 commit 01075222",
|
||||
"file_line": "conductor/tracks/ai_loop_regressions_20260614/state.toml lines 23-26 and 46-58",
|
||||
"symptom": "Python's tomllib.load() raises TOMLDecodeError: Cannot overwrite a value",
|
||||
"fix_phase": 5,
|
||||
"fix": "Delete the duplicate pending entries; keep only the completed entries with commit SHAs"
|
||||
},
|
||||
{
|
||||
"id": "G17_tracks_md_row_24",
|
||||
"severity": "low",
|
||||
"category": "housekeeping",
|
||||
"introduced_by": "ai_loop_regressions_20260614 (track shipped but tracks.md not updated)",
|
||||
"file_line": "conductor/tracks.md:41",
|
||||
"symptom": "Track row still says 'spec ✓, plan ✓, ready to start' though the track shipped on 2026-06-15",
|
||||
"fix_phase": 5,
|
||||
"fix": "Update status column or move to Recently Completed section"
|
||||
}
|
||||
],
|
||||
|
||||
"deferred_to_followup_tracks": [
|
||||
{
|
||||
"id": "public_api_migration_20260606",
|
||||
"title": "Public API Result Migration",
|
||||
"description": "Removes the deprecated ai_client.send() and migrates the remaining 5 production call sites + ~50 test call sites to send_result(). This track handles 11 of the 63 tests; the other ~50 are deferred.",
|
||||
"blocks_field_in_tracks_md": true,
|
||||
"track_status": "planned; not yet specced"
|
||||
},
|
||||
{
|
||||
"id": "live_gui_mock_injection_20260615",
|
||||
"title": "Live GUI Mock Injection Infrastructure",
|
||||
"description": "Infrastructure for mock injection into the live_gui subprocess. Unblocks proper end-to-end live_gui + AI client tests (the ai_loop_regressions_20260614 smoke tests only verify Hook API substrate reachability).",
|
||||
"blocks_field_in_tracks_md": false,
|
||||
"track_status": "recommended; not yet specced"
|
||||
},
|
||||
{
|
||||
"id": "test_rag_phase4_final_verify_fix",
|
||||
"title": "test_rag_phase4_final_verify RAG flakiness fix",
|
||||
"description": "Pre-existing RAG subsystem issue ('NoneType' object has no attribute 'get'). The error is in RAG config lookup code, not AI client code. A partial fix was attempted in commit 16412ad5 (RAG Phase 4 dim-mismatch recovery). Recommended as a separate RAG track.",
|
||||
"blocks_field_in_tracks_md": false,
|
||||
"track_status": "pre-existing; not caused by either data_oriented_error_handling or ai_loop_regressions tracks"
|
||||
},
|
||||
{
|
||||
"id": "ui_polish_five_issues_20260302",
|
||||
"title": "UI Polish Five Issues",
|
||||
"description": "The 2 unrelated test failures (test_discussion_truncate_layout, test_log_management_refresh) are Phase 2 and Phase 3 of the UI Polish track. That track has its own spec and plan.",
|
||||
"blocks_field_in_tracks_md": true,
|
||||
"track_status": "ready to start; spec/plan in place; not caused by data_oriented_error_handling refactor"
|
||||
}
|
||||
],
|
||||
|
||||
"verification_criteria": {
|
||||
"g1_api_generate_returns_200": "uv run pytest tests/test_headless_service.py::TestHeadlessAPI::test_generate_endpoint returns 200 (proves G1 fix)",
|
||||
"g2_g12_test_mock_fixes_pass": "Full batched test suite has 11 fewer failures than the pre-track baseline (G2-G12)",
|
||||
"g12_tool_loop_builder_passes": "uv run pytest tests/test_ai_client_tool_loop_builder.py::test_run_with_tool_loop_calls_request_builder_each_round passes",
|
||||
"g13_headless_service_test_passes": "uv run pytest tests/test_headless_service.py::TestHeadlessAPI::test_generate_endpoint returns 200 (after G1 + G13 fixes)",
|
||||
"g14_gemini_thinking_format_investigated": "Phase 3 produces an empirical finding (either normalization pass in _send_gemini* or parser extension) + live_gui or unit test demonstrates the fix",
|
||||
"g15_half_width_marker_supported": "tests/test_thinking_trace.py has 1+ new test for <think>...</think> marker; all existing tests still pass",
|
||||
"g16_state_toml_parseable": "python -c 'import tomllib; tomllib.load(open(\"conductor/tracks/ai_loop_regressions_20260614/state.toml\",\"rb\"))' succeeds",
|
||||
"g17_tracks_md_row_24_updated": "Row 24 in conductor/tracks.md reflects the track's completion (status column or section move)",
|
||||
"full_suite_green": "uv run pytest tests/ shows no new failures beyond the deferred test_rag_phase4_final_verify and the 2 UI Polish tests",
|
||||
"docs_updated": "docs/guide_ai_client.md 'See Also' section has 2 new cross-references: (1) this cleanup track; (2) public_api_migration_20260606"
|
||||
},
|
||||
|
||||
"fr_to_phase_mapping": {
|
||||
"FR1_fix_api_generate_name_error": {
|
||||
"phase": 1,
|
||||
"fix_files": ["src/app_controller.py:265-295"],
|
||||
"test_files": ["tests/test_headless_service.py::TestHeadlessAPI::test_generate_endpoint"],
|
||||
"min_test_count": 1
|
||||
},
|
||||
"FR2_FR3_test_mock_fixes": {
|
||||
"phase": 2,
|
||||
"fix_files": [
|
||||
"tests/test_llama_provider.py",
|
||||
"tests/test_llama_ollama_native.py",
|
||||
"tests/test_grok_provider.py",
|
||||
"tests/test_ai_client_tool_loop_builder.py",
|
||||
"tests/test_headless_service.py"
|
||||
],
|
||||
"min_test_count": 11
|
||||
},
|
||||
"FR4_gemini_thinking_format": {
|
||||
"phase": 3,
|
||||
"fix_files": ["src/ai_client.py:_send_gemini", "src/ai_client.py:_send_gemini_cli", "src/thinking_parser.py:9"],
|
||||
"test_files": ["tests/test_gemini_thinking_format.py (new)"],
|
||||
"min_test_count": 1
|
||||
},
|
||||
"FR5_think_half_width_marker": {
|
||||
"phase": 4,
|
||||
"fix_files": ["src/thinking_parser.py:9"],
|
||||
"test_files": ["tests/test_thinking_trace.py"],
|
||||
"min_test_count": 1
|
||||
},
|
||||
"FR6_state_toml_cleanup": {
|
||||
"phase": 5,
|
||||
"fix_files": ["conductor/tracks/ai_loop_regressions_20260614/state.toml"],
|
||||
"min_test_count": 0
|
||||
},
|
||||
"FR7_tracks_md_update": {
|
||||
"phase": 5,
|
||||
"fix_files": ["conductor/tracks.md"],
|
||||
"min_test_count": 0
|
||||
},
|
||||
"FR8_regression_sweep_and_docs": {
|
||||
"phase": 5,
|
||||
"fix_files": ["docs/guide_ai_client.md"],
|
||||
"min_test_count": 0
|
||||
}
|
||||
},
|
||||
|
||||
"estimated_effort": {
|
||||
"phase_1": "10 min — 1 critical regression fix + 1 test verification",
|
||||
"phase_2": "1.5 hours — 12 mechanical test mock fixes (G2-G13) across 5 files",
|
||||
"phase_3": "2-4 hours — empirical Gemini investigation + fix (uncertain duration depending on finding)",
|
||||
"phase_4": "30 min — 1 regex extension + 1+ new test",
|
||||
"phase_5": "1 hour — 4 housekeeping tasks (state.toml, tracks.md, sweep, docs)",
|
||||
"total": "5-8 hours of Tier 2 work (0.5-1 day)"
|
||||
},
|
||||
|
||||
"risk_register": {
|
||||
"R1_api_generate_fix_breaks_fr2_fr3": {
|
||||
"likelihood": "low",
|
||||
"impact": "high",
|
||||
"mitigation": "Fix only ADDS lines; doesn't modify existing logic. Function semantics match pre-ai_loop_regressions_20260614 state."
|
||||
},
|
||||
"R2_test_mock_fixes_introduce_subtle_failures": {
|
||||
"likelihood": "low",
|
||||
"impact": "low",
|
||||
"mitigation": "Pattern is mechanical (assert result.ok then assert result.data); failure messages are clear if a test has a real bug"
|
||||
},
|
||||
"R3_gemini_investigation_needs_real_credentials": {
|
||||
"likelihood": "medium",
|
||||
"impact": "medium",
|
||||
"mitigation": "Use a mock client that returns a realistic Gemini response with thinking content if real credentials unavailable; document the format assumption"
|
||||
},
|
||||
"R4_think_regex_greedy": {
|
||||
"likelihood": "low",
|
||||
"impact": "low",
|
||||
"mitigation": "Use re.DOTALL + non-greedy .*? (consistent with existing pattern); existing 5+ tests catch regressions"
|
||||
},
|
||||
"R5_state_toml_cleanup_deletes_wrong_lines": {
|
||||
"likelihood": "very_low",
|
||||
"impact": "high",
|
||||
"mitigation": "Only delete the duplicate 'pending' entries; the 'completed' entries with commit SHAs must be preserved. Fix is mechanical and verifiable by re-running tomllib.load()"
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user