manual_slop/src/code_path_audit_analysis.py

"""Real-data analyzers for code_path_audit v2.

These functions AST-walk real src/ files to extract actual signal:
- analyze_consumer_fields: count field accesses per consumer function
- analyze_producer_size: count fields in producer return statements
- compute_real_access_pattern: per-function access pattern from field counts
- compute_real_type_alias_coverage: typed vs untyped field access counts
- compute_real_decomposition_cost: actual cost from real struct size + access pattern
- extract_real_optimization_candidates: detect fat structs and field_by_field patterns

All functions return REAL data, not hardcoded defaults.
"""
from __future__ import annotations
import ast
from collections import Counter
from pathlib import Path
from typing import Literal
from src.code_path_audit import (
 FunctionRef,
 AccessPatternEvidence,
 FrequencyEvidence,
 ResultCoverage,
 TypeAliasCoverage,
 CrossAuditFinding,
 CrossAuditFindings,
 DecompositionCost,
 OptimizationCandidate,
 AccessPattern,
 Frequency,
)

def _field_names_for_aggregate(aggregate: str, type_registry: dict) -> set[str]:
 """Get the canonical field names for an aggregate from the type registry.

 If not in the registry, return an empty set (unknown fields).
 """
 if aggregate in type_registry:
  return {f["name"] for f in type_registry[aggregate].get("fields", [])}
 return set()

def _analyze_function_field_accesses(func_node: ast.FunctionDef | ast.AsyncFunctionDef, param_names: set[str]) -> Counter:
 """Walk a function body and count field accesses on the given param names.

 Recognizes 4 patterns:
 - entry['key'] -> ('subscript', 'key')
 - entry.attr -> ('attribute', 'attr')
 - entry.get('key') / entry.get('key', default) -> ('subscript', 'key') (call subscripts)
 - chained entry.attr1.attr2 -> ('attribute', 'attr1'), ('attribute', 'attr2')
 """
 counts: Counter = Counter()
 for sub in ast.walk(func_node):
  if isinstance(sub, ast.Subscript):
   if isinstance(sub.value, ast.Name) and sub.value.id in param_names:
    if isinstance(sub.slice, ast.Constant) and isinstance(sub.slice.value, str):
     counts[("subscript", sub.slice.value)] += 1
   elif isinstance(sub.value, ast.Call):
    call = sub.value
    func = call.func
    if isinstance(func, ast.Attribute) and isinstance(func.value, ast.Name) and func.value.id in param_names and func.attr == "get":
     if call.args and isinstance(call.args[0], ast.Constant) and isinstance(call.args[0].value, str):
      counts[("subscript", call.args[0].value)] += 1
  elif isinstance(sub, ast.Attribute):
   if isinstance(sub.value, ast.Name) and sub.value.id in param_names:
    counts[("attribute", sub.attr)] += 1
 return counts

def _analyze_function_param_names(func_node: ast.FunctionDef | ast.AsyncFunctionDef) -> set[str]:
 """Get the parameter names from a function definition."""
 names: set[str] = set()
 for arg in func_node.args.args + func_node.args.kwonlyargs + func_node.args.posonlyargs:
  names.add(arg.arg)
 if func_node.args.vararg:
  names.add(func_node.args.vararg.arg)
 if func_node.args.kwarg:
  names.add(func_node.args.kwarg.arg)
 return names

def analyze_consumer_fields(
 function_ref: FunctionRef,
 aggregate: str,
 src_dir: str = "src",
 type_registry: dict | None = None,
) -> tuple[Counter, list[str], bool]:
 """For a consumer function, find which fields of the aggregate it accesses.

 Returns:
 - field_counts: Counter of (kind, field_name) -> access count
 - accessed_fields: sorted list of accessed field names
 - has_direct_access: True if function passes the aggregate without field access
 """
 type_registry = type_registry or {}
 canonical_fields = _field_names_for_aggregate(aggregate, type_registry)
 _p = Path(function_ref.file)
 if _p.exists():
  filepath = _p
 elif _p.is_absolute():
  filepath = _p
 else:
  filepath = Path(src_dir) / function_ref.file
 if not filepath.exists():
  return Counter(), [], False
 try:
  source = filepath.read_text(encoding="utf-8")
  tree = ast.parse(source)
 except (OSError, SyntaxError):
  return Counter(), [], False
 for node in ast.walk(tree):
  if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == function_ref.fqname.rsplit(".", 1)[-1]:
   param_names = _analyze_function_param_names(node)
   counts = _analyze_function_field_accesses(node, param_names)
   accessed = sorted({key for kind, key in counts.keys()})
   typed_count = sum(c for (kind, key), c in counts.items() if key in canonical_fields) if canonical_fields else 0
   has_direct = typed_count == 0 and len(counts) == 0
   return counts, accessed, has_direct
 return Counter(), [], False

def analyze_producer_size(
 function_ref: FunctionRef,
 aggregate: str,
 src_dir: str = "src",
) -> tuple[int, list[str]]:
 """For a producer function, count fields in its return dict literal.

 Returns (field_count, field_names).
 """
 _p2 = Path(function_ref.file)
 if _p2.exists():
  filepath = _p2
 elif _p2.is_absolute():
  filepath = _p2
 else:
  filepath = Path(src_dir) / function_ref.file
 if not filepath.exists():
  return 0, []
 try:
  source = filepath.read_text(encoding="utf-8")
  tree = ast.parse(source)
 except (OSError, SyntaxError):
  return 0, []
 for node in ast.walk(tree):
  if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) and node.name == function_ref.fqname.rsplit(".", 1)[-1]:
   return_statements = [s for s in ast.walk(node) if isinstance(s, ast.Return)]
   for ret in return_statements:
    if ret.value is None:
     continue
    field_names: list[str] = []
    if isinstance(ret.value, ast.Dict):
     for k in ret.value.keys:
      if isinstance(k, ast.Constant) and isinstance(k.value, str):
       field_names.append(k.value)
    if field_names:
     return len(field_names), field_names
    if isinstance(ret.value, ast.Call):
     func_name = ""
     if isinstance(ret.value.func, ast.Name):
      func_name = ret.value.func.id
     elif isinstance(ret.value.func, ast.Attribute):
      func_name = ret.value.func.attr
     if "Result" in func_name or "to_dict" in func_name or "load" in func_name:
      return 5, ["unknown (via " + func_name + ")"]
   return 0, []
 return 0, []

def analyze_consumer_pattern(
 function_ref: FunctionRef,
 aggregate: str,
 type_registry: dict | None = None,
 src_dir: str = "src",
) -> AccessPattern:
 """Determine the access pattern for one consumer function."""
 counts, _, has_direct = analyze_consumer_fields(function_ref, aggregate, src_dir, type_registry)
 if has_direct:
  return "whole_struct"
 distinct_keys = {key for kind, key in counts.keys()}
 if len(distinct_keys) <= 1:
  return "whole_struct"
 if len(distinct_keys) >= 3:
  return "field_by_field"
 return "mixed"

def aggregate_pattern_from_consumers(
 consumers: tuple[FunctionRef, ...],
 aggregate: str,
 type_registry: dict | None = None,
 src_dir: str = "src",
) -> tuple[AccessPattern, dict[str, int], list[AccessPatternEvidence]]:
 """Compute aggregate-level access pattern from per-consumer patterns.

 Returns: (dominant_pattern, per_pattern_counts, evidence_list)
 """
 type_registry = type_registry or {}
 per_pattern_counts: dict[str, int] = {}
 evidence_list: list[AccessPatternEvidence] = []
 for ref in consumers:
  counts, accessed, has_direct = analyze_consumer_fields(ref, aggregate, src_dir, type_registry)
  if has_direct:
   pattern = "whole_struct"
  else:
   distinct_keys = {key for kind, key in counts.keys()}
   if len(distinct_keys) <= 1:
    pattern = "whole_struct"
   elif len(distinct_keys) >= 3:
    pattern = "field_by_field"
   else:
    pattern = "mixed"
  per_pattern_counts[pattern] = per_pattern_counts.get(pattern, 0) + 1
  evidence_list.append(AccessPatternEvidence(
   function=ref,
   pattern=pattern,
   field_accesses={key: counts[(kind, key)] for kind, key in counts.keys()},
   confidence="high" if counts else "low",
  ))
 if not per_pattern_counts:
  return "mixed", {}, []
 winner = max(per_pattern_counts, key=per_pattern_counts.get)
 total = sum(per_pattern_counts.values())
 share = per_pattern_counts[winner] / total
 if share <= 0.25:
  return "mixed", per_pattern_counts, evidence_list
 return winner, per_pattern_counts, evidence_list

def compute_real_type_alias_coverage(
 aggregate: str,
 producers: tuple[FunctionRef, ...],
 consumers: tuple[FunctionRef, ...],
 type_registry: dict | None = None,
 src_dir: str = "src",
) -> TypeAliasCoverage:
 """Compute real type_alias_coverage: count typed vs untyped field-access sites.

 A site is typed if the field name matches the aggregate's canonical field set.
 A site is untyped otherwise (wildcard / unknown).
 """
 type_registry = type_registry or {}
 canonical_fields = _field_names_for_aggregate(aggregate, type_registry)
 total_sites = 0
 typed_sites = 0
 for ref in consumers:
  counts, _, _ = analyze_consumer_fields(ref, aggregate, src_dir, type_registry)
  for (kind, key), c in counts.items():
   total_sites += c
   if canonical_fields and key in canonical_fields:
    typed_sites += c
 if total_sites == 0:
  return TypeAliasCoverage(total_sites=0, typed_sites=0, untyped_sites=0, summary="0 sites")
 untyped = total_sites - typed_sites
 pct_t = (typed_sites / total_sites * 100) if total_sites > 0 else 0
 pct_u = (untyped / total_sites * 100) if total_sites > 0 else 0
 summary = f"{total_sites} sites; {typed_sites} typed ({pct_t:.0f}%); {untyped} untyped ({pct_u:.0f}%)"
 return TypeAliasCoverage(
  total_sites=total_sites,
  typed_sites=typed_sites,
  untyped_sites=untyped,
  summary=summary,
 )

def estimate_struct_size(
 aggregate: str,
 producers: tuple[FunctionRef, ...],
 type_registry: dict | None = None,
 src_dir: str = "src",
) -> int:
 """Estimate the size (field count) of the aggregate from producer return shapes.

 Takes the maximum field count across all producers (the widest producer
 is the aggregate's effective size).
 """
 type_registry = type_registry or {}
 max_size = 0
 for ref in producers:
  size, _ = analyze_producer_size(ref, aggregate, src_dir)
  if size > max_size:
   max_size = size
 return max_size

def compute_real_decomposition_cost(
 aggregate: str,
 producers: tuple[FunctionRef, ...],
 consumers: tuple[FunctionRef, ...],
 access_pattern: AccessPattern,
 frequency: Frequency,
 type_registry: dict | None = None,
 src_dir: str = "src",
) -> DecompositionCost:
 """Compute the DecompositionCost from real data.

 struct_field_count: max field count across producers
 struct_frozen: True for TypeAlias-based aggregates (always frozen by convention)
 componentize_savings: based on field_by_field + many-fields detection
 unify_savings: based on whole_struct + small-struct detection
 """
 from src.code_path_audit import (
  recommended_direction,
  generate_rationale,
  per_call_cost_us,
  current_total_us,
 )
 type_registry = type_registry or {}
 struct_field_count = estimate_struct_size(aggregate, producers, type_registry, src_dir)
 struct_frozen = True
 if struct_field_count == 0:
  struct_field_count = len(_field_names_for_aggregate(aggregate, type_registry)) or 5
 hot_field_count = 2
 per_call = per_call_cost_us(struct_field_count, hot_path_field_count=hot_field_count, struct_frozen=struct_frozen)
 total_us = current_total_us(per_call, frequency)
 direction = recommended_direction(access_pattern, struct_field_count, struct_frozen, frequency, hot_field_count)
 rationale = generate_rationale(aggregate, access_pattern, frequency, struct_field_count, struct_frozen, direction)
 if access_pattern == "field_by_field" and struct_field_count > 5:
  c_savings = int(total_us * 0.30)
 else:
  c_savings = 0
 if access_pattern == "whole_struct" and struct_field_count <= 5:
  u_savings = int(total_us * 0.15)
 else:
  u_savings = 0
 return DecompositionCost(
  current_cost_estimate=total_us,
  componentize_savings=c_savings,
  unify_savings=u_savings,
  recommended_direction=direction,
  recommended_rationale=rationale,
  batch_size=None,
  struct_field_count=struct_field_count,
  struct_frozen=struct_frozen,
 )

def extract_real_optimization_candidates(
 aggregate: str,
 producers: tuple[FunctionRef, ...],
 consumers: tuple[FunctionRef, ...],
 decomposition_cost: DecompositionCost,
 type_registry: dict | None = None,
 src_dir: str = "src",
) -> tuple[OptimizationCandidate, ...]:
 """Extract real optimization candidates from actual data.

 Generates candidates for:
 - Fat struct detection (struct_field_count > 10 + not frozen): componentize
 - Field-by-field detection: componentize when field count is large
 - Whole struct small: unify when field count is small
 """
 if decomposition_cost.recommended_direction == "hold":
  return ()
 direction = decomposition_cost.recommended_direction
 if direction == "insufficient_data":
  return ()
 struct_size = decomposition_cost.struct_field_count
 affected = sorted({f.file for f in producers} | {f.file for f in consumers})
 if direction == "componentize":
  candidate = f"Componentize {aggregate} (struct_field_count={struct_size}); split into smaller dataclasses"
  effort = "medium" if struct_size > 15 else "small"
  priority = "high" if struct_size > 20 else "medium"
 elif direction == "unify":
  candidate = f"Unify {aggregate} consumers into wider fat structs (current struct_field_count={struct_size})"
  effort = "small"
  priority = "low"
 else:
  return ()
 return (OptimizationCandidate(
  candidate=candidate,
  direction=direction,
  affected_files=tuple(affected),
  estimated_savings_us=decomposition_cost.componentize_savings + decomposition_cost.unify_savings,
  effort=effort,
  priority=priority,
  cross_ref=f"conductor/tracks/code_path_audit_20260607/spec_v2.md#section-7.5",
 ),)