Private
Public Access
0
0
Files
manual_slop/src/summarize.py
T

220 lines
7.1 KiB
Python

# summarize.py
"""
Local symbolic summariser — no AI calls, no network.
For each file, extracts structural information:
.py : imports, classes (with methods), top-level functions, global constants
.toml : top-level table keys + array lengths
.md : headings (h1-h3)
other : line count + first 8 lines as preview
Returns a compact markdown string per file, suitable for use as a low-token
context block that replaces full file contents in the initial <context> send.
"""
import ast
import re
from pathlib import Path
from typing import Callable, Any
from src import ai_client
from src.summary_cache import SummaryCache, get_file_hash
# Module-level cache used by summarise_file; summaries are looked up and stored
# by path string together with a hash of the file content.
_summary_cache = SummaryCache()
# ------------------------------------------------------------------ per-type extractors
def _summarise_python(path: Path, content: str) -> str:
lines = content.splitlines()
line_count = len(lines)
parts = [f"**Python** — {line_count} lines"]
try:
tree = ast.parse(content.lstrip(chr(0xFEFF)), filename=str(path))
except SyntaxError as e:
parts.append(f"_Parse error: {e}_")
return "\n".join(parts)
imports = []
for node in ast.walk(tree):
if isinstance(node, ast.Import):
for alias in node.names:
imports.append(alias.name.split(".")[0])
elif isinstance(node, ast.ImportFrom):
if node.module:
imports.append(node.module.split(".")[0])
if imports:
unique_imports = sorted(set(imports))
parts.append(f"imports: {', '.join(unique_imports)}")
constants = []
for node in ast.iter_child_nodes(tree):
if isinstance(node, ast.Assign):
for t in node.targets:
if isinstance(t, ast.Name) and t.id.isupper():
constants.append(t.id)
elif isinstance(node, (ast.AnnAssign,)):
if isinstance(node.target, ast.Name) and node.target.id.isupper():
constants.append(node.target.id)
if constants:
parts.append(f"constants: {', '.join(constants)}")
for node in ast.iter_child_nodes(tree):
if isinstance(node, ast.ClassDef):
methods = [
n.name for n in ast.iter_child_nodes(node)
if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))
]
if methods: parts.append(f"class {node.name}: {', '.join(methods)}")
else: parts.append(f"class {node.name}")
top_fns = [
node.name for node in ast.iter_child_nodes(tree)
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
]
if top_fns: parts.append(f"functions: {', '.join(top_fns)}")
return "\n".join(parts)
def _summarise_toml(path: Path, content: str) -> str:
lines = content.splitlines()
line_count = len(lines)
parts = [f"**TOML** — {line_count} lines"]
table_pat = re.compile(r"^\s*\[{1,2}([^\[\]]+)\]{1,2}")
tables = []
for line in lines:
m = table_pat.match(line)
if m:
tables.append(m.group(1).strip())
if tables:
parts.append(f"tables: {', '.join(tables)}")
kv_pat = re.compile(r"^([a-zA-Z_][a-zA-Z0-9_]*)\s*=")
in_table = False
top_keys = []
for line in lines:
if table_pat.match(line):
in_table = True
continue
if not in_table:
m = kv_pat.match(line)
if m:
top_keys.append(m.group(1))
if top_keys:
parts.append(f"top-level keys: {', '.join(top_keys)}")
return "\n".join(parts)
def _summarise_markdown(path: Path, content: str) -> str:
lines = content.splitlines()
line_count = len(lines)
parts = [f"**Markdown** — {line_count} lines"]
headings = []
for line in lines:
m = re.match(r"^(#{1,3})\s+(.+)", line)
if m:
level = len(m.group(1))
text = m.group(2).strip()
indent = " " * (level - 1)
headings.append(f"{indent}{text}")
if headings:
parts.append("headings:\n" + "\n".join(f" {h}" for h in headings))
return "\n".join(parts)
def _summarise_generic(path: Path, content: str) -> str:
lines = content.splitlines()
line_count = len(lines)
suffix = path.suffix.lstrip(".").upper() or "TEXT"
parts = [f"**{suffix}** — {line_count} lines"]
# Heuristic for C-style languages
important_lines = []
for line in lines[:200]:
trimmed = line.strip()
if not trimmed or trimmed.startswith("//") or trimmed.startswith("/*") or trimmed.startswith("*"):
continue
if re.match(r'^\s*(class|struct|namespace|enum|template|void|int|float|double|char|bool|virtual|static|inline|extern|#define|#include)\b', line):
important_lines.append(trimmed)
if len(important_lines) >= 15:
break
if important_lines:
parts.append("Key elements / Outline:\n- " + "\n- ".join(important_lines))
else:
preview = [l for l in lines[:10] if l.strip()]
if preview:
parts.append("preview:\n```\n" + "\n".join(preview) + "\n```")
return "\n".join(parts)
# Dispatch table from lower-case file extension to its summariser. Extensions
# without a dedicated extractor are routed to the generic summariser.
_SUMMARISERS: dict[str, Callable[[Path, str], str]] = {
    ".py": _summarise_python,
    ".toml": _summarise_toml,
    ".md": _summarise_markdown,
    **dict.fromkeys(
        (".ini", ".txt", ".c", ".h", ".cpp", ".hpp", ".ps1"),
        _summarise_generic,
    ),
}
def summarise_file(path: Path, content: str) -> str:
    """
    Produce the compact markdown summary for one file.
    `content` is the text that was already read for the file (or an error string).
    A summary cached for the same path and content hash is returned as-is.
    Otherwise the extension-specific heuristic outline is built and handed,
    with the first 10000 characters of `content`, to the subagent summariser;
    when that yields a usable result it is placed above the outline, else the
    outline alone is used. The result is cached before being returned.
    Failures outside the subagent call yield a `_Summariser error: ..._` string.
    [C: tests/test_subagent_summarization.py:test_summarise_file_integration]
    """
    digest = get_file_hash(content)
    cache_key = str(path)
    hit = _summary_cache.get_summary(cache_key, digest)
    if hit:
        return hit

    ext = path.suffix.lower() if hasattr(path, "suffix") else ""
    extractor = _SUMMARISERS.get(ext, _summarise_generic)
    try:
        outline = extractor(path, content)
        # Smart AI Summarization
        treat_as_code = ext in [".py", ".ps1", ".js", ".ts", ".cpp", ".c", ".h", ".cs", ".go", ".rs", ".lua"]
        try:
            ai_text = ai_client.run_subagent_summarization(
                file_path=cache_key,
                content=content[:10000],
                is_code=treat_as_code,
                outline=outline,
            )
            usable = ai_text and not ai_text.startswith("ERROR:")
            result = f"{ai_text}\n\n**Outline:**\n{outline}" if usable else outline
        except Exception:
            # Any failure in the subagent call falls back to the outline.
            result = outline
        _summary_cache.set_summary(cache_key, digest, result)
        return result
    except Exception as exc:
        return f"_Summariser error: {exc}_"
def summarise_items(file_items: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """
    Attach a `summary` to every file_item dict (as returned by
    aggregate.build_file_items) and return the results as a new parallel list.
    Items flagged with `error`, or lacking a `path`, get a fixed error summary
    instead of being summarised.
    """
    def _summary_for(entry: dict[str, Any]) -> str:
        raw_path = entry.get("path")
        if entry.get("error", False) or raw_path is None:
            return "_Error reading file_"
        resolved = raw_path if isinstance(raw_path, Path) else Path(raw_path)
        return summarise_file(resolved, entry.get("content", ""))

    # Each output dict is a copy of the input item plus the new key.
    return [{**entry, "summary": _summary_for(entry)} for entry in file_items]
def build_summary_markdown(file_items: list[dict[str, Any]]) -> str:
    """
    Render the summaries of all file items as one compact markdown string,
    meant for the initial <context> block in place of full file contents.
    Each file gets a `###` heading with its path (falling back to its `entry`
    value, then "unknown"); sections are separated by horizontal rules.
    """
    def _section(entry: dict[str, Any]) -> str:
        label = entry.get("path") or entry.get("entry", "unknown")
        return f"### `{label}`\n\n{entry.get('summary', '')}"

    return "\n\n---\n\n".join(map(_section, summarise_items(file_items)))