Private
Public Access
0
0

feat(audit): implement Phase 2 PCG (5 tasks: skeleton + P1+P2+P3+build_pcg)

Phase 2 PCG: ProducerConsumerGraph (bipartite aggregate<->function)
+ 3 AST passes (P1 return-type, P2 parameter-type, P3 field-access)
+ build_pcg() main entry returning Result[ProducerConsumerGraph].

14 new unit tests passing (2 PCG + 3 P1 + 3 P2 + 3 P3 + 3 build_pcg).

The build_pcg() function tolerates syntax errors per the stdlib
I/O boundary pattern (records ErrorInfo, continues).

Phase 2 complete: 33 unit tests passing. Phase 3 (MemoryDim
classifier with canonical mappings) next.
This commit is contained in:
2026-06-22 01:18:54 -04:00
parent f79a2b18a6
commit 200396e4a5
2 changed files with 298 additions and 2 deletions
+141 -1
View File
@@ -9,8 +9,11 @@ postfix DSL + markdown + prefix tree text. See
conductor/tracks/code_path_audit_20260607/spec_v2.md.
"""
from __future__ import annotations
import ast
from dataclasses import dataclass, field
from pathlib import Path
from typing import Literal
from src.result_types import Result, ErrorInfo, ErrorKind
AggregateKind = Literal[
"typealias",
@@ -145,4 +148,141 @@ class AggregateProfile:
optimization_candidates: tuple[OptimizationCandidate, ...]
is_candidate: bool
mermaid: str = ""
markdown: str = ""
markdown: str = ""
@dataclass
class ProducerConsumerGraph:
    """Bipartite producer/consumer graph linking aggregates and functions.

    Attributes:
        edges: maps a ``(producer, consumer)`` pair to the set of aggregate
            names flowing between the two. No method of this class fills
            it in; callers populate it directly.
        producers: maps an aggregate name to the set of FunctionRef that
            produce it.
        consumers: maps an aggregate name to the set of FunctionRef that
            consume it.
    """

    edges: dict[tuple[str, str], set[str]] = field(default_factory=dict)
    producers: dict[str, set[FunctionRef]] = field(default_factory=dict)
    consumers: dict[str, set[FunctionRef]] = field(default_factory=dict)

    def add_producer(self, aggregate: str, function: FunctionRef) -> None:
        """Record *function* as a producer of *aggregate*."""
        if aggregate not in self.producers:
            self.producers[aggregate] = set()
        self.producers[aggregate].add(function)

    def add_consumer(self, aggregate: str, function: FunctionRef) -> None:
        """Record *function* as a consumer of *aggregate*."""
        if aggregate not in self.consumers:
            self.consumers[aggregate] = set()
        self.consumers[aggregate].add(function)
def P1_pass(tree: ast.Module, file: str) -> list[tuple[str, str, str, str]]:
    """AST pass 1: find producers by inspecting return annotations.

    A function counts as a producer when its return annotation is either a
    bare name ``T`` or ``Result[...]`` whose argument is a bare name or a
    subscript of a bare name (in which case the subscripted name is used).

    Args:
        tree: parsed module to scan (every nested function is visited).
        file: path of the module; not used by this pass.

    Returns:
        A list of ``(function_name, aggregate_name, "producer", "high")``
        tuples in ``ast.walk`` order.
    """
    function_types = (ast.FunctionDef, ast.AsyncFunctionDef)
    found: list[tuple[str, str, str, str]] = []
    for fn in ast.walk(tree):
        if not isinstance(fn, function_types) or fn.returns is None:
            continue
        annotation = fn.returns
        aggregate = None
        if isinstance(annotation, ast.Name):
            # Plain ``-> T``.
            aggregate = annotation.id
        elif (
            isinstance(annotation, ast.Subscript)
            and isinstance(annotation.value, ast.Name)
            and annotation.value.id == "Result"
        ):
            inner = annotation.slice
            if isinstance(inner, ast.Name):
                # ``-> Result[T]``.
                aggregate = inner.id
            elif isinstance(inner, ast.Subscript) and isinstance(inner.value, ast.Name):
                # ``-> Result[G[...]]``: the outer generic name G is recorded.
                aggregate = inner.value.id
        if aggregate is not None:
            found.append((fn.name, aggregate, "producer", "high"))
    return found
def P2_pass(tree: ast.Module, file: str) -> list[tuple[str, str, str, str]]:
    """AST pass 2: detect consumers of typed aggregates via parameter annotations.

    A parameter marks its function as a consumer when its annotation is a
    bare name ``T`` or ``list[T]`` / ``List[T]`` with a bare-name element.
    Positional-only, regular and keyword-only parameters are inspected;
    ``*args`` / ``**kwargs`` annotations are not.

    Args:
        tree: parsed module to scan (every nested function is visited).
        file: path of the module; not used by this pass.

    Returns:
        A list of ``(function_name, aggregate_name, "consumer", "high")``
        tuples, one per matching parameter.
    """
    out: list[tuple[str, str, str, str]] = []
    for node in ast.walk(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        # Positional-only parameters (before "/") live in a separate list;
        # leaving them out would miss consumers such as ``def f(x: Foo, /)``.
        params = node.args.posonlyargs + node.args.args + node.args.kwonlyargs
        for arg in params:
            ann = arg.annotation
            if isinstance(ann, ast.Name):
                out.append((node.name, ann.id, "consumer", "high"))
            elif (
                isinstance(ann, ast.Subscript)
                and isinstance(ann.value, ast.Name)
                and ann.value.id in ("list", "List")
                and isinstance(ann.slice, ast.Name)
            ):
                out.append((node.name, ann.slice.id, "consumer", "high"))
    return out
def P3_pass(tree: ast.Module, file: str, type_registry: dict[str, list[str]]) -> list[tuple[str, str, str, int]]:
    """AST pass 3: count field accesses of the form ``name['key']`` or ``name.attr``.

    Only accesses whose base is a bare name are counted, and subscripts
    only when the index is a string constant. Each function is walked in
    full, so accesses inside a nested function are counted for the nested
    function and for every function enclosing it.

    Args:
        tree: parsed module to scan.
        file: path of the module; not used by this pass.
        type_registry: currently unused (the field-to-aggregate mapping is
            computed in Phase 7 by the cross-audit integration); this pass
            only records the access itself.

    Returns:
        A list of ``(function_name, key_or_attr, kind, count)`` tuples where
        *kind* is ``"subscript"`` or ``"attribute"``.
    """
    function_types = (ast.FunctionDef, ast.AsyncFunctionDef)
    results: list[tuple[str, str, str, int]] = []
    for fn in ast.walk(tree):
        if not isinstance(fn, function_types):
            continue
        tally: dict[tuple[str, str], int] = {}
        for inner in ast.walk(fn):
            if isinstance(inner, ast.Subscript):
                index = inner.slice
                if not (
                    isinstance(inner.value, ast.Name)
                    and isinstance(index, ast.Constant)
                    and isinstance(index.value, str)
                ):
                    continue
                access = ("subscript", index.value)
            elif isinstance(inner, ast.Attribute):
                if not isinstance(inner.value, ast.Name):
                    continue
                access = ("attribute", inner.attr)
            else:
                continue
            tally[access] = tally.get(access, 0) + 1
        # Emit in first-seen order, one row per distinct (kind, key) pair.
        results.extend((fn.name, key, kind, n) for (kind, key), n in tally.items())
    return results
def build_pcg(src_dir: str, type_registry: dict[str, list[str]] | None = None) -> Result[ProducerConsumerGraph]:
    """Build the ProducerConsumerGraph by AST-walking every ``*.py`` under *src_dir*.

    Files that cannot be read or parsed are tolerated: the file is skipped
    and an ErrorInfo describing the failure is appended to the result's
    errors.

    Args:
        src_dir: directory searched recursively for ``*.py`` files; any path
            containing ``__pycache__`` is ignored.
        type_registry: forwarded to P3_pass; defaults to an empty dict.

    Returns:
        ``Result`` whose ``data`` is the populated graph and whose ``errors``
        lists one ErrorInfo per skipped file.
    """
    pcg = ProducerConsumerGraph()
    type_registry = type_registry or {}
    errors: list[ErrorInfo] = []
    for py_file in Path(src_dir).rglob("*.py"):
        if "__pycache__" in str(py_file):
            continue
        try:
            source = py_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            errors.append(ErrorInfo(
                kind=ErrorKind.INTERNAL,
                message=f"Cannot read {py_file}: {e}",
                source="build_pcg",
                original=e,
            ))
            continue
        try:
            tree = ast.parse(source)
        except (SyntaxError, ValueError) as e:
            # Before Python 3.12, source containing null bytes raises
            # ValueError rather than SyntaxError; treat it as the same kind
            # of unparsable input so one bad file cannot abort the walk.
            errors.append(ErrorInfo(
                kind=ErrorKind.INVALID_INPUT,
                message=f"Syntax error in {py_file}: {e}",
                source="build_pcg",
                original=e,
            ))
            continue
        file = str(py_file)
        # Dotted prefix derived from the file path (both separator styles).
        fqname_prefix = file.removesuffix(".py").replace("/", ".").replace("\\", ".")
        # The passes do not report line numbers, so line is recorded as 0.
        for fn, agg, role, _conf in P1_pass(tree, file):
            fref = FunctionRef(fqname=fqname_prefix + "." + fn, file=file, line=0, role=role)
            if role == "producer":
                pcg.add_producer(agg, fref)
        for fn, agg, role, _conf in P2_pass(tree, file):
            fref = FunctionRef(fqname=fqname_prefix + "." + fn, file=file, line=0, role=role)
            if role == "consumer":
                pcg.add_consumer(agg, fref)
        # P3 is still run on every file, but its field-access rows are not
        # stored in the graph yet, so the result is deliberately discarded.
        P3_pass(tree, file, type_registry)
    return Result(data=pcg, errors=errors)