# summarize.py
"""
Local symbolic summariser — no AI calls, no network.

Note(Gemini): Local heuristic summariser. Doesn't use any AI or network.
Uses Python's AST to reliably pull out classes, methods, and functions.
Regex is used for TOML and Markdown.

The rationale here is simple: giving the AI the *structure* of a codebase is
90% as good as giving it the full source, but costs 1% of the tokens. If it
needs the full source of a file after reading the summary, it can just call
read_file.

For each file, extracts structural information:
  .py   : imports, classes (with methods), top-level functions, global constants
  .toml : table headers ([table] / [[array-of-tables]]) + top-level keys
  .md   : headings (h1-h3), ignoring fenced code blocks
  other : line count + first 8 lines as preview

Returns a compact markdown string per file, suitable for use as a low-token
context block that replaces full file contents in the initial send.
"""

import ast
import re
from pathlib import Path

# ------------------------------------------------------------------ patterns / limits

# A table header must be the only thing on its line (a trailing comment is
# allowed). Anchoring at end-of-line keeps rows of a multi-line array such as
# "  [1, 2]," from being reported as tables.
_TOML_TABLE_RE = re.compile(r"^\s*\[{1,2}([^\[\]]+)\]{1,2}\s*(?:#.*)?$")
# Bare TOML keys may contain letters, digits, "_" and "-"; dotted keys are
# bare keys joined by ".".
_TOML_KEY_RE = re.compile(r"^([A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)*)\s*=")

_MD_HEADING_RE = re.compile(r"^(#{1,3})\s+(.+)")
_MD_FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})")

_PREVIEW_LINES = 8
# Cap on characters shown per preview line, so a minified one-line file cannot
# blow up what is meant to be a low-token summary.
_PREVIEW_MAX_LINE_CHARS = 200


# ------------------------------------------------------------------ per-type extractors

def _summarise_python(path: Path, content: str) -> str:
    """
    Summarise Python source via the AST.

    Reports, in this order: the sorted set of root package names imported
    anywhere in the file, ALL_CAPS names assigned at module level, each
    top-level class with its directly defined methods, and top-level
    functions. If the source cannot be parsed, the summary holds the line
    count followed by the parse error.
    """
    line_count = len(content.splitlines())
    parts = [f"**Python** — {line_count} lines"]

    try:
        # A leading BOM would otherwise make ast.parse fail.
        tree = ast.parse(content.lstrip(chr(0xFEFF)), filename=str(path))
    except (SyntaxError, ValueError) as e:
        # ValueError: source containing null bytes on Python < 3.12.
        parts.append(f"_Parse error: {e}_")
        return "\n".join(parts)

    # Imports (ast.walk also sees imports nested inside functions/classes).
    imports = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            imports.update(alias.name.split(".")[0] for alias in node.names)
        elif isinstance(node, ast.ImportFrom) and node.module:
            # "from . import x" has no module name and is skipped.
            imports.add(node.module.split(".")[0])
    if imports:
        parts.append(f"imports: {', '.join(sorted(imports))}")

    top_level = list(ast.iter_child_nodes(tree))

    # Top-level constants (ALL_CAPS assignments, plain or annotated).
    constants = []
    for node in top_level:
        if isinstance(node, ast.Assign):
            constants.extend(
                t.id
                for t in node.targets
                if isinstance(t, ast.Name) and t.id.isupper()
            )
        elif isinstance(node, ast.AnnAssign):
            target = node.target
            if isinstance(target, ast.Name) and target.id.isupper():
                constants.append(target.id)
    if constants:
        parts.append(f"constants: {', '.join(constants)}")

    # Classes + their directly defined methods.
    fn_types = (ast.FunctionDef, ast.AsyncFunctionDef)
    for node in top_level:
        if not isinstance(node, ast.ClassDef):
            continue
        methods = [
            n.name for n in ast.iter_child_nodes(node) if isinstance(n, fn_types)
        ]
        if methods:
            parts.append(f"class {node.name}: {', '.join(methods)}")
        else:
            parts.append(f"class {node.name}")

    # Top-level functions.
    top_fns = [node.name for node in top_level if isinstance(node, fn_types)]
    if top_fns:
        parts.append(f"functions: {', '.join(top_fns)}")

    return "\n".join(parts)


def _summarise_toml(path: Path, content: str) -> str:
    """
    Summarise a TOML file with line-based regexes (no TOML parser).

    Reports every `[table]` / `[[array-of-tables]]` header in file order
    (repeated array-of-tables headers appear once per occurrence) and the
    keys assigned before the first table header.

    This is a heuristic: a nested-array row that sits alone on its line
    without a trailing comma (e.g. "  [3, 4]") is still indistinguishable
    from a table header.
    """
    lines = content.splitlines()
    parts = [f"**TOML** — {len(lines)} lines"]

    tables = []
    top_keys = []
    in_table = False
    for line in lines:
        header = _TOML_TABLE_RE.match(line)
        if header:
            tables.append(header.group(1).strip())
            in_table = True
            continue
        if in_table:
            # Keys after the first header belong to a table, not the top level.
            continue
        key = _TOML_KEY_RE.match(line)
        if key:
            top_keys.append(key.group(1))

    if tables:
        parts.append(f"tables: {', '.join(tables)}")
    if top_keys:
        parts.append(f"top-level keys: {', '.join(top_keys)}")

    return "\n".join(parts)


def _summarise_markdown(path: Path, content: str) -> str:
    """
    Summarise a Markdown file as an indented outline of its h1-h3 headings.

    Lines inside fenced code blocks (``` or ~~~) are ignored, so that
    "# comment" lines in code samples are not reported as headings.
    """
    lines = content.splitlines()
    parts = [f"**Markdown** — {len(lines)} lines"]

    headings = []
    open_fence = ""  # fence run that opened the current code block; "" outside
    for line in lines:
        fence = _MD_FENCE_RE.match(line)
        if open_fence:
            # A closing fence uses the same character, is at least as long as
            # the opening fence, and carries no trailing text.
            if (
                fence
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and not line[fence.end():].strip()
            ):
                open_fence = ""
            continue
        if fence:
            open_fence = fence.group(1)
            continue

        m = _MD_HEADING_RE.match(line)
        if m:
            level = len(m.group(1))
            text = m.group(2).strip()
            indent = " " * (level - 1)
            headings.append(f"{indent}{text}")

    if headings:
        parts.append("headings:\n" + "\n".join(f" {h}" for h in headings))

    return "\n".join(parts)


def _summarise_generic(path: Path, content: str) -> str:
    """
    Fallback summary: line count plus a fenced preview of the first lines.

    The label is the upper-cased file extension ("TEXT" when there is none).
    Preview lines longer than `_PREVIEW_MAX_LINE_CHARS` are truncated with an
    ellipsis, and the wrapper fence is made longer than any backtick run in
    the preview so the content cannot close it early.
    """
    lines = content.splitlines()
    suffix = path.suffix.lstrip(".").upper() or "TEXT"
    parts = [f"**{suffix}** — {len(lines)} lines"]

    preview = [
        line
        if len(line) <= _PREVIEW_MAX_LINE_CHARS
        else line[:_PREVIEW_MAX_LINE_CHARS] + "…"
        for line in lines[:_PREVIEW_LINES]
    ]
    if preview:
        body = "\n".join(preview)
        longest_run = max((len(run) for run in re.findall(r"`+", body)), default=0)
        fence = "`" * max(3, longest_run + 1)
        parts.append(f"preview:\n{fence}\n{body}\n{fence}")

    return "\n".join(parts)


# ------------------------------------------------------------------ dispatch

# Extension -> summariser. Extensions not listed here also fall back to
# _summarise_generic (see summarise_file).
_SUMMARISERS = {
    ".py": _summarise_python,
    ".toml": _summarise_toml,
    ".md": _summarise_markdown,
    ".ini": _summarise_generic,
    ".txt": _summarise_generic,
    ".ps1": _summarise_generic,
}


def summarise_file(path: Path, content: str) -> str:
    """
    Return a compact markdown summary string for a single file.

    `content` is the already-read file text (or an error string). The
    summariser is chosen by the lower-cased extension of `path`. Any
    exception raised while summarising is reported in the returned string
    instead of propagating.
    """
    suffix = path.suffix.lower() if hasattr(path, "suffix") else ""
    fn = _SUMMARISERS.get(suffix, _summarise_generic)
    try:
        return fn(path, content)
    except Exception as e:
        # Deliberate best-effort boundary: one bad file must not abort the
        # whole summary run.
        return f"_Summariser error: {e}_"


def summarise_items(file_items: list[dict]) -> list[dict]:
    """
    Given a list of file_item dicts (as returned by aggregate.build_file_items),
    return a parallel list of dicts with an added `summary` key.

    Items flagged with a truthy `error`, or lacking a `path`, get a fixed
    error summary. The input dicts are not modified.
    """
    result = []
    for item in file_items:
        path = item.get("path")
        # Treat an explicit None the same as missing content.
        content = item.get("content") or ""
        if item.get("error", False) or path is None:
            summary = "_Error reading file_"
        else:
            p = path if isinstance(path, Path) else Path(path)
            summary = summarise_file(p, content)
        result.append({**item, "summary": summary})
    return result


def build_summary_markdown(file_items: list[dict]) -> str:
    """
    Build a compact markdown string of file summaries, suitable for the
    initial block instead of full file contents.

    Each file gets a "### `path`" heading (falling back to the item's `entry`
    value, then "unknown") followed by its summary; sections are separated by
    horizontal rules.
    """
    sections = []
    for item in summarise_items(file_items):
        path = item.get("path") or item.get("entry", "unknown")
        summary = item.get("summary", "")
        sections.append(f"### `{path}`\n\n{summary}")
    return "\n\n---\n\n".join(sections)