feat(scripts): add audit_weak_types.py for AI-readability analysis
AST-based static analyzer that identifies type signatures that reduce code clarity and AI-readability. Targets: - Dict[str, Any] / dict[str, Any] (302 findings) - list[dict[...]] (115 findings) - Optional[dict[...]] / Optional[tuple[...]] (11 findings) - Tuple[...]/tuple[...] as anonymous structs (4 findings) - Return tuples and assign tuples (4 findings) The script also counts POSITIVE patterns (TypeAlias, NamedTuple, @dataclass, pydantic.BaseModel) that already exist in the codebase. Current count: 0. The codebase has zero strong type aliases. Usage: python scripts/audit_weak_types.py [--json] [--top N] [--verbose] Exits 0 (informational); exits 1 only on usage error. Initial run on src/ found 430 weak sites across 29 files. The 4 most common unique type strings (list[dict[str, Any]], dict[str, Any], Dict[str, Any], List[Dict[str, Any]]) account for 86% of findings. A focused track adding 4-6 type aliases would eliminate the vast majority of the noise. Output modes: - human-readable (default): top N files with category breakdowns - JSON (--json): machine-readable for tooling - verbose (--verbose): every finding inline Exit codes: - 0: audit ran successfully (regardless of findings) - 1: usage error (bad args, source dir not found)
This commit is contained in:
@@ -0,0 +1,281 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Audit src/ for weak or anonymous type annotations.
|
||||
|
||||
Identifies type signatures that reduce code clarity and AI-readability.
|
||||
The target patterns are the ones an LLM-driven workflow stumbles on most:
|
||||
|
||||
- Dict[str, Any] / dict[str, Any] - opaque dict, no schema hint
|
||||
- Dict[str, V] for primitive V - vague; "what's in the dict?"
|
||||
- List[Dict[str, Any]] / list[dict[str, Any]] - list of opaque dicts
|
||||
- Tuple[A, B, ...] / tuple[A, B, ...] - anonymous struct
|
||||
- Optional[Tuple[...]] / Optional[Dict[...]] - "missing or anonymous"
|
||||
- Functions returning tuples via commas - (x, y) without a name
|
||||
|
||||
The script also detects a few POSITIVE patterns: type aliases,
|
||||
NamedTuples, dataclasses, and pydantic models that already exist
|
||||
in the codebase. (The current codebase has few of these; that's part
|
||||
of the problem the audit measures.)
|
||||
|
||||
The output is a report that the user (or a follow-up track) can use
|
||||
to decide whether a type-strengthening refactor is worth it.
|
||||
|
||||
Usage:
|
||||
python scripts/audit_weak_types.py # human-readable report
|
||||
python scripts/audit_weak_types.py --json # JSON output for tooling
|
||||
python scripts/audit_weak_types.py --src src # override the source dir
|
||||
python scripts/audit_weak_types.py --top 20 # show top N files
|
||||
python scripts/audit_weak_types.py --verbose # show every finding inline
|
||||
|
||||
Exit codes:
|
||||
0 - audit ran (regardless of findings; the audit is informational)
|
||||
1 - usage error (bad args, source dir not found, etc.)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import ast
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
WEAK_PATTERNS: list[tuple[str, str]] = [
|
||||
(r"Dict\[str,\s*Any\]", "dict_str_any"),
|
||||
(r"dict\[str,\s*Any\]", "dict_str_any"),
|
||||
(r"List\[Dict\[", "list_of_dict"),
|
||||
(r"list\[dict\[", "list_of_dict"),
|
||||
(r"Optional\[List\[Dict\[", "optional_list_of_dict"),
|
||||
(r"Optional\[list\[dict\[", "optional_list_of_dict"),
|
||||
(r"Optional\[Dict\[", "optional_dict"),
|
||||
(r"Optional\[dict\[", "optional_dict"),
|
||||
(r":\s*Dict\[str,\s*Any\]", "param_dict_str_any"),
|
||||
(r":\s*dict\[str,\s*Any\]", "param_dict_str_any"),
|
||||
(r"->\s*Tuple\[[^\]]+\]\s*$", "return_tuple"),
|
||||
(r"->\s*tuple\[[^\]]+\]\s*$", "return_tuple"),
|
||||
(r"Optional\[Tuple\[", "optional_tuple"),
|
||||
(r"Optional\[tuple\[", "optional_tuple"),
|
||||
]
|
||||
|
||||
POSITIVE_PATTERNS: list[tuple[str, str]] = [
|
||||
(r"TypeAlias\s*=", "type_alias_def"),
|
||||
(r"NamedTuple", "named_tuple"),
|
||||
(r"@\s*dataclass", "dataclass_decoration"),
|
||||
(r"pydantic\.BaseModel", "pydantic_model"),
|
||||
]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Finding:
|
||||
filename: str
|
||||
line: int
|
||||
context: str
|
||||
type_str: str
|
||||
category: str
|
||||
severity: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class FileReport:
|
||||
filename: str
|
||||
weak: list[Finding] = field(default_factory=list)
|
||||
positive: list[tuple[int, str, str]] = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def weak_count(self) -> int:
|
||||
return len(self.weak)
|
||||
|
||||
@property
|
||||
def positive_count(self) -> int:
|
||||
return len(self.positive)
|
||||
|
||||
|
||||
class WeakTypeVisitor(ast.NodeVisitor):
|
||||
def __init__(self, filename: str, source: str) -> None:
|
||||
self.filename = filename
|
||||
self.source = source
|
||||
self.report = FileReport(filename=filename)
|
||||
self._func_stack: list[ast.FunctionDef] = []
|
||||
|
||||
def _check_type(self, type_node: ast.AST | None, line: int, context: str) -> None:
|
||||
if type_node is None:
|
||||
return
|
||||
type_str = ast.unparse(type_node).replace("\n", " ").strip()
|
||||
for pattern, category in WEAK_PATTERNS:
|
||||
if re.search(pattern, type_str):
|
||||
severity = "high" if "Any" in type_str or "list_of_dict" in category else "medium"
|
||||
self.report.weak.append(Finding(
|
||||
filename=self.filename,
|
||||
line=line,
|
||||
context=context,
|
||||
type_str=type_str,
|
||||
category=category,
|
||||
severity=severity,
|
||||
))
|
||||
for pattern, category in POSITIVE_PATTERNS:
|
||||
if re.search(pattern, type_str):
|
||||
self.report.positive.append((line, type_str, category))
|
||||
return
|
||||
|
||||
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
|
||||
self._func_stack.append(node)
|
||||
try:
|
||||
for arg in node.args.args + node.args.kwonlyargs:
|
||||
self._check_type(arg.annotation, arg.lineno, f"{node.name}({arg.arg})")
|
||||
if node.args.vararg and node.args.vararg.annotation:
|
||||
self._check_type(node.args.vararg.annotation, node.args.vararg.lineno, f"{node.name}(*{node.args.vararg.arg})")
|
||||
if node.args.kwarg and node.args.kwarg.annotation:
|
||||
self._check_type(node.args.kwarg.annotation, node.args.kwarg.lineno, f"{node.name}(**{node.args.kwarg.arg})")
|
||||
self._check_type(node.returns, node.returns.lineno if node.returns else node.lineno, f"{node.name} -> ...")
|
||||
for stmt in node.body:
|
||||
self.visit(stmt)
|
||||
finally:
|
||||
self._func_stack.pop()
|
||||
|
||||
def visit_AnnAssign(self, node: ast.AnnAssign) -> None:
|
||||
target = ast.unparse(node.target)
|
||||
self._check_type(node.annotation, node.lineno, f"{target}: ...")
|
||||
self.generic_visit(node)
|
||||
|
||||
def visit_Return(self, node: ast.Return) -> None:
|
||||
if node.value is None:
|
||||
self.generic_visit(node)
|
||||
return
|
||||
if isinstance(node.value, ast.Tuple) and len(node.value.elts) > 1:
|
||||
type_str = ast.unparse(node.value)
|
||||
for pattern, category in WEAK_PATTERNS:
|
||||
if re.search(pattern, type_str):
|
||||
self.report.weak.append(Finding(
|
||||
filename=self.filename,
|
||||
line=node.lineno,
|
||||
context=f"return in {self._func_stack[-1].name if self._func_stack else '<module>'}",
|
||||
type_str=type_str,
|
||||
category="return_tuple_literal",
|
||||
severity="medium",
|
||||
))
|
||||
break
|
||||
self.generic_visit(node)
|
||||
|
||||
def visit_Assign(self, node: ast.Assign) -> None:
|
||||
if isinstance(node.value, ast.Tuple) and len(node.value.elts) > 1:
|
||||
type_str = ast.unparse(node.value)
|
||||
for pattern, category in WEAK_PATTERNS:
|
||||
if re.search(pattern, type_str):
|
||||
self.report.weak.append(Finding(
|
||||
filename=self.filename,
|
||||
line=node.lineno,
|
||||
context=f"assign in {self._func_stack[-1].name if self._func_stack else '<module>'}",
|
||||
type_str=type_str,
|
||||
category="assign_tuple_literal",
|
||||
severity="low",
|
||||
))
|
||||
break
|
||||
self.generic_visit(node)
|
||||
|
||||
|
||||
def audit_file(filepath: Path) -> FileReport:
|
||||
try:
|
||||
source = filepath.read_text(encoding="utf-8")
|
||||
except (OSError, UnicodeDecodeError) as e:
|
||||
print(f"WARN: could not read {filepath}: {e}", file=sys.stderr)
|
||||
return FileReport(filename=str(filepath))
|
||||
try:
|
||||
tree = ast.parse(source, filename=str(filepath))
|
||||
except SyntaxError as e:
|
||||
print(f"WARN: syntax error in {filepath}: {e}", file=sys.stderr)
|
||||
return FileReport(filename=str(filepath))
|
||||
visitor = WeakTypeVisitor(str(filepath), source)
|
||||
visitor.visit(tree)
|
||||
return visitor.report
|
||||
|
||||
|
||||
def find_python_files(root: Path) -> list[Path]:
|
||||
if not root.exists():
|
||||
raise FileNotFoundError(f"Source directory not found: {root}")
|
||||
return sorted(p for p in root.rglob("*.py") if "artifacts" not in p.parts and "__pycache__" not in p.parts)
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
|
||||
parser.add_argument("--src", default="src", help="Source directory to audit (default: src)")
|
||||
parser.add_argument("--json", action="store_true", help="Output JSON instead of human-readable report")
|
||||
parser.add_argument("--top", type=int, default=10, help="Show top N files by weak count (default: 10)")
|
||||
parser.add_argument("--verbose", action="store_true", help="Show every finding inline (default: top N per file)")
|
||||
args = parser.parse_args()
|
||||
|
||||
src = Path(args.src)
|
||||
try:
|
||||
files = find_python_files(src)
|
||||
except FileNotFoundError as e:
|
||||
print(f"ERROR: {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
reports: list[FileReport] = [audit_file(f) for f in files]
|
||||
reports = [r for r in reports if r.weak_count > 0 or r.positive_count > 0]
|
||||
|
||||
if args.json:
|
||||
output = {
|
||||
"src_dir": str(src),
|
||||
"files_scanned": len(files),
|
||||
"files_with_findings": len(reports),
|
||||
"total_weak": sum(r.weak_count for r in reports),
|
||||
"total_positive": sum(r.positive_count for r in reports),
|
||||
"by_category": dict(Counter(f.category for r in reports for f in r.weak).most_common()),
|
||||
"by_severity": dict(Counter(f.severity for r in reports for f in r.weak).most_common()),
|
||||
"by_file": [
|
||||
{
|
||||
"filename": r.filename,
|
||||
"weak_count": r.weak_count,
|
||||
"positive_count": r.positive_count,
|
||||
"findings": [
|
||||
{
|
||||
"line": f.line,
|
||||
"context": f.context,
|
||||
"type_str": f.type_str,
|
||||
"category": f.category,
|
||||
"severity": f.severity,
|
||||
}
|
||||
for f in r.weak
|
||||
],
|
||||
}
|
||||
for r in sorted(reports, key=lambda r: -r.weak_count)
|
||||
],
|
||||
}
|
||||
print(json.dumps(output, indent=2))
|
||||
return 0
|
||||
|
||||
print(f"=== Weak Type Audit: {src} ===\n")
|
||||
print(f"Files scanned: {len(files)}")
|
||||
print(f"Files with findings: {len(reports)}")
|
||||
print(f"Total weak findings: {sum(r.weak_count for r in reports)}")
|
||||
print(f"Total positive patterns (already in use): {sum(r.positive_count for r in reports)}\n")
|
||||
|
||||
cat_counts = Counter(f.category for r in reports for f in r.weak)
|
||||
sev_counts = Counter(f.severity for r in reports for f in r.weak)
|
||||
print("By category:")
|
||||
for cat, n in cat_counts.most_common():
|
||||
print(f" {cat:30s} {n:4d}")
|
||||
print("\nBy severity:")
|
||||
for sev, n in sev_counts.most_common():
|
||||
print(f" {sev:30s} {n:4d}")
|
||||
|
||||
print(f"\n--- Top {args.top} files by weak count ---")
|
||||
top = sorted(reports, key=lambda r: -r.weak_count)[:args.top]
|
||||
for r in top:
|
||||
pct = (r.weak_count / max(sum(rr.weak_count for rr in reports), 1)) * 100
|
||||
print(f"\n{r.filename} ({r.weak_count} findings, {pct:.1f}% of total, {r.positive_count} positive)")
|
||||
if args.verbose:
|
||||
for f in r.weak:
|
||||
print(f" L{f.line:4d} [{f.severity:6s}] {f.category:25s} {f.context}")
|
||||
print(f" {f.type_str[:120]}")
|
||||
else:
|
||||
by_cat = Counter(f.category for f in r.weak)
|
||||
for cat, n in by_cat.most_common():
|
||||
print(f" {cat:30s} {n}")
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user