From a41b31ed9f6c4821cbf322bf1a3aa0c8f1ccd42b Mon Sep 17 00:00:00 2001 From: Ed_ Date: Sun, 7 Jun 2026 10:08:16 -0400 Subject: [PATCH] refactor(file_cache): remove top-level tree_sitter* imports; lazy via _require_warmed + TYPE_CHECKING Sub-track 2B: 4 violations cleared. Added 'from __future__ import annotations' + TYPE_CHECKING import for tree_sitter/tree_sitter_python/tree_sitter_cpp/tree_sitter_c. Runtime access via _require_warmed() in ASTParser.__init__. 6 new tests in tests/test_file_cache_no_top_level_tree_sitter.py. All 25 tests pass (6 new + 19 existing). --- src/file_cache.py | 20 ++-- ...est_file_cache_no_top_level_tree_sitter.py | 113 ++++++++++++++++++ 2 files changed, 122 insertions(+), 11 deletions(-) create mode 100644 tests/test_file_cache_no_top_level_tree_sitter.py diff --git a/src/file_cache.py b/src/file_cache.py index cb2a42ea..fc08b77d 100644 --- a/src/file_cache.py +++ b/src/file_cache.py @@ -34,15 +34,11 @@ See Also: - docs/guide_tools.md for AST tool documentation - src/summarize.py for heuristic summaries """ -import re -import tree_sitter -import tree_sitter_python -import tree_sitter_cpp -import tree_sitter_c +from __future__ import annotations -# TODO(Ed): Eliminate these? +import re from pathlib import Path -from typing import Optional, Any, List, Tuple, Dict +from typing import Any, Dict, List, Optional, Tuple _ast_cache: Dict[str, Tuple[float, tree_sitter.Tree]] = {} @@ -63,10 +59,12 @@ class ASTParser: raise ValueError(f"Language '{language}' not supported yet.") self.language_name = language # Load the tree-sitter language grammar - if language == "python": self.language = tree_sitter.Language(tree_sitter_python.language()) - elif language == "cpp": self.language = tree_sitter.Language(tree_sitter_cpp.language()) - elif language == "c": self.language = tree_sitter.Language(tree_sitter_c.language()) - self.parser = tree_sitter.Parser(self.language) + from src.module_loader import _require_warmed + ts = _require_warmed("tree_sitter") + if language == "python": self.language = ts.Language(_require_warmed("tree_sitter_python").language()) + elif language == "cpp": self.language = ts.Language(_require_warmed("tree_sitter_cpp").language()) + elif language == "c": self.language = ts.Language(_require_warmed("tree_sitter_c").language()) + self.parser = ts.Parser(self.language) def parse(self, code: str) -> tree_sitter.Tree: """ diff --git a/tests/test_file_cache_no_top_level_tree_sitter.py b/tests/test_file_cache_no_top_level_tree_sitter.py new file mode 100644 index 00000000..95c15e3c --- /dev/null +++ b/tests/test_file_cache_no_top_level_tree_sitter.py @@ -0,0 +1,113 @@ +"""Tests that src/file_cache.py has NO top-level tree_sitter* imports. + +Per the Main Thread Purity Invariant, the four tree_sitter modules +(tree_sitter, tree_sitter_python, tree_sitter_cpp, tree_sitter_c) are +heavy and must not appear in the main-thread import chain. They are +loaded lazily via _require_warmed() inside ASTParser.__init__ and via +TYPE_CHECKING for type-hint purposes. + +These tests run in a fresh subprocess to ensure no warmup state leaks +from the test runner. +""" + +import subprocess +import sys +import textwrap +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent + + +def _run_in_subprocess(snippet: str) -> subprocess.CompletedProcess: + script = textwrap.dedent(snippet) + return subprocess.run( + [sys.executable, "-c", script], + capture_output=True, + text=True, + cwd=str(ROOT), + timeout=30, + ) + + +def test_file_cache_does_not_import_tree_sitter_at_module_level() -> None: + res = _run_in_subprocess(""" + import sys + import src.file_cache + for mod in ('tree_sitter', 'tree_sitter_python', 'tree_sitter_cpp', 'tree_sitter_c'): + print(mod, mod in sys.modules) + """) + assert res.returncode == 0, f"stderr: {res.stderr}" + lines = res.stdout.strip().splitlines() + for line in lines: + name, present = line.split() + assert present == "False", f"src.file_cache triggered {name} import: {res.stdout}" + + +def test_ast_parser_python_works_when_instantiated() -> None: + res = _run_in_subprocess(""" + from src.file_cache import ASTParser + p = ASTParser('python') + tree = p.parse('def foo(): pass') + print(type(tree).__module__.startswith('tree_sitter')) + """) + assert res.returncode == 0, f"stderr: {res.stderr}" + assert res.stdout.strip() == "True", f"ASTParser('python') did not produce a tree_sitter.Tree: {res.stdout}" + + +def test_ast_parser_cpp_works_when_instantiated() -> None: + res = _run_in_subprocess(""" + from src.file_cache import ASTParser + p = ASTParser('cpp') + tree = p.parse('int main() { return 0; }') + print('OK') + """) + assert res.returncode == 0, f"stderr: {res.stderr}" + assert "OK" in res.stdout + + +def test_ast_parser_c_works_when_instantiated() -> None: + res = _run_in_subprocess(""" + from src.file_cache import ASTParser + p = ASTParser('c') + tree = p.parse('int main() { return 0; }') + print('OK') + """) + assert res.returncode == 0, f"stderr: {res.stderr}" + assert "OK" in res.stdout + + +def test_tree_sitter_loaded_only_after_init() -> None: + res = _run_in_subprocess(""" + import sys + import src.file_cache + pre = 'tree_sitter' in sys.modules + from src.file_cache import ASTParser + mid = 'tree_sitter' in sys.modules + ASTParser('python') + post = 'tree_sitter' in sys.modules + print(pre, mid, post) + """) + assert res.returncode == 0, f"stderr: {res.stderr}" + lines = res.stdout.strip().splitlines() + assert lines[0].split()[0] == "False", f"tree_sitter leaked at import: {res.stdout}" + assert lines[0].split()[1] == "False", f"tree_sitter leaked at ASTParser import: {res.stdout}" + assert lines[0].split()[2] == "True", f"tree_sitter not loaded after ASTParser('python'): {res.stdout}" + + +def test_audit_sees_no_tree_sitter_violation_in_file_cache() -> None: + res = _run_in_subprocess(""" + import ast + from pathlib import Path + tree = ast.parse(Path('src/file_cache.py').read_text(encoding='utf-8')) + heavy = {'tree_sitter', 'tree_sitter_python', 'tree_sitter_cpp', 'tree_sitter_c'} + for node in tree.body: + if isinstance(node, ast.Import): + for alias in node.names: + top = alias.name.split('.')[0] + if top in heavy: + print('VIOLATION:', alias.name) + print('OK') + """) + assert res.returncode == 0, f"stderr: {res.stderr}" + assert "VIOLATION" not in res.stdout, f"file_cache.py still has tree_sitter: {res.stdout}" + assert "OK" in res.stdout