fix(io_pool): increase worker count from 4 to 8 to prevent test hangs

Root cause: test_full_live_workflow in batch context (with prior sims running AI discussion turns) would queue its _do_project_switch behind the auto-pruner's scan of tests/logs/ (154MB, 6519 files). The 4-worker pool was saturated, so the switch would never run within 30s. Fix: bump IO_POOL_MAX_WORKERS from 4 to 8. This gives the pool enough capacity to run: 2 pruners + the project switch + 5 spare. Also: add /api/io_pool_status endpoint + get_io_pool_status + wait_io_pool_idle helpers (kept in api_hooks.py and api_hook_client.py for the test_api_hook_client_io_pool.py tests, even though the test itself no longer uses them - they remain useful for future tests that want to assert pool state directly). Also: add wait_for_warmup at the start of test_full_live_workflow to ensure SDK modules are loaded before AI ops. Test verification: - test_full_live_workflow in isolation: 11.83s PASS - test_full_live_workflow in batch (with 4 prior sims): 83.46s PASS - 30/30 related unit tests PASS
2026-06-08 17:49:34 -04:00
parent 9afc93bce2
commit 4a33848620
7 changed files with 168 additions and 10 deletions
@@ -404,6 +404,35 @@ class ApiHookClient:
   last_status["timeout"] = True
   return last_status

+ def get_io_pool_status(self) -> dict[str, Any]:
+  """
+  Returns the controller's io_pool status: {idle, inflight}.
+  - idle: True if no jobs are currently in-flight (running or queued)
+  - inflight: integer count of jobs submitted via submit_io
+  Used by tests to wait for the pool to drain before submitting work
+  when sharing a live_gui session with prior tests.
+  [C: tests/test_api_hook_client.py:test_get_io_pool_status_*]
+  """
+  result = self._make_request('GET', '/api/io_pool_status')
+  if not result or not isinstance(result, dict):
+   return {"idle": True, "inflight": 0}
+  return result
+
+ def wait_io_pool_idle(self, timeout: float = 60.0, poll_interval: float = 0.2) -> bool:
+  """
+  Blocks until the controller's io_pool reports idle=True or timeout.
+  Returns True on idle, False on timeout. Use this to ensure prior
+  tests' background work has completed before submitting new work.
+  [C: tests/test_live_workflow.py:test_full_live_workflow]
+  """
+  start = time.time()
+  while time.time() - start < timeout:
+   status = self.get_io_pool_status()
+   if status.get("idle"):
+    return True
+   time.sleep(poll_interval)
+  return False
+
 def post_project(self, project_data: dict) -> dict[str, Any]:
   return last_status

@@ -134,6 +134,17 @@ class HookHandler(BaseHTTPRequestHandler):
      "error": getattr(controller, "_project_switch_error", None),
     }
    self.wfile.write(json.dumps(payload).encode("utf-8"))
+   elif self.path == "/api/io_pool_status":
+    self.send_response(200)
+    self.send_header("Content-Type", "application/json")
+    self.end_headers()
+    controller = _get_app_attr(app, "controller", None)
+    if controller is None:
+     payload = {"idle": True, "inflight": 0}
+    else:
+     inflight = getattr(controller, "_io_pool_inflight", 0)
+     payload = {"idle": inflight == 0, "inflight": inflight}
+    self.wfile.write(json.dumps(payload).encode("utf-8"))
   elif self.path == "/api/session":
    self.send_response(200)
    self.send_header("Content-Type", "application/json")
@@ -2263,13 +2263,51 @@ class AppController:
  at 4 workers (see src/io_pool.py) so the job may queue briefly if
  the pool is saturated.

+  The number of in-flight (running or queued) jobs is tracked via
+  self._io_pool_inflight, allowing tests to wait for the pool to drain
+  (see io_pool_idle() / wait_io_pool_idle()). This is needed because
+  the session-scoped live_gui fixture shares the controller across
+  tests; prior tests' io_pool workers must drain before subsequent
+  tests' submitted work can run.
+
  Domain-specific threads (HookServer, WebSocketServer, MMA WorkerPool,
  asyncio loop) are NOT submitted here - they have their own lifecycle
  management.
  [SDM: src/app_controller.py:submit_io]
  """
-  import concurrent.futures
-  return self._io_pool.submit(fn, *args, **kwargs)
+  if not hasattr(self, "_io_pool_inflight_lock"):
+   self._io_pool_inflight_lock = threading.Lock()
+  with self._io_pool_inflight_lock:
+   self._io_pool_inflight = getattr(self, "_io_pool_inflight", 0) + 1
+  future = self._io_pool.submit(fn, *args, **kwargs)
+  future.add_done_callback(lambda _f: self._io_pool_inflight_done())
+  return future
+
+ def _io_pool_inflight_done(self) -> None:
+  """Decrement the in-flight io_pool counter. Called by future callback."""
+  with self._io_pool_inflight_lock:
+   if getattr(self, "_io_pool_inflight", 0) > 0:
+    self._io_pool_inflight -= 1
+
+ def io_pool_idle(self) -> bool:
+  """True if no io_pool jobs are currently in-flight (running or queued).
+
+  Useful for tests that share a live_gui session with prior tests:
+  if the io_pool is still processing jobs from a prior test, submitting
+  a new project switch would queue behind them and the switch would
+  not complete promptly.
+  [C: tests/test_live_workflow.py:test_full_live_workflow]
+  """
+  return getattr(self, "_io_pool_inflight", 0) == 0
+
+ def wait_io_pool_idle(self, timeout: float = 60.0, poll_interval: float = 0.1) -> bool:
+  """Blocks until io_pool_idle() is True or timeout. Returns True on idle."""
+  start = time.time()
+  while time.time() - start < timeout:
+   if self.io_pool_idle():
+    return True
+   time.sleep(poll_interval)
+  return False

 def shutdown(self) -> None:
  """
@@ -17,7 +17,7 @@ the broken finalization chain. See commit log for details.
 from concurrent.futures import ThreadPoolExecutor


-IO_POOL_MAX_WORKERS: int = 4
+IO_POOL_MAX_WORKERS: int = 8
 IO_POOL_THREAD_NAME_PREFIX: str = "controller-io"