Private
Public Access
0
0

feat(audit): implement Phase 7 cross-audit integration + Phase 8.1 DSL arity

Phase 7: read_input_json (stdlib I/O boundary), INPUT_JSON_CONTRACTS
(6 input sources), find_enclosing_function (3-tier mapping tier 1),
compute_result_coverage (cross-check of doeh), compute_type_alias_coverage
(cross-check of dss), aggregate_cross_audit_findings (per-aggregate
bucketing), run_all_cross_audit_reads (convenience).

Phase 8 Task 8.1: DSL_WORD_ARITY_V2 (14 new tagged words).

15 new unit tests passing. 111 total tests passing.

Phase 8 Tasks 8.2-8.5 (4 renderers + parser) next.
This commit is contained in:
2026-06-22 01:49:14 -04:00
parent ae5dcb775e
commit e59334a303
6 changed files with 840 additions and 1 deletions
@@ -0,0 +1,8 @@
# Append the contents of an "additions" file to code_path_audit.py,
# separated from the existing content by a blank line.
#
# Usage: python <this script> <additions_file>
import sys

# Exit with a usage message rather than an IndexError traceback when the
# additions-file argument is missing.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} <additions_file>")

additions_file = sys.argv[1]
with open(additions_file, 'r', encoding='utf-8') as f:
    code = f.read()
with open(r'C:\projects\manual_slop_tier2\src\code_path_audit.py', 'a', encoding='utf-8') as out:
    # Two newlines keep the appended code apart from the current file tail.
    out.write('\n\n')
    out.write(code)
# len(code) would count characters; encode so the figure matches the "bytes"
# label. (Platform newline translation on write can still add bytes.)
print('Appended', len(code.encode('utf-8')), 'bytes')
@@ -0,0 +1,9 @@
# Append the contents of an "additions" file to code_path_audit.py.
#
# Usage: python <this script> <additions_file>
import sys

# Phase 5 + Phase 6 additions (read from sys.argv file)
# Exit with a usage message rather than an IndexError traceback when the
# additions-file argument is missing.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} <additions_file>")

additions_file = sys.argv[1]
with open(additions_file, 'r', encoding='utf-8') as f:
    code = f.read()
with open(r'C:\projects\manual_slop_tier2\src\code_path_audit.py', 'a', encoding='utf-8') as out:
    # No separator is written here: the additions are appended verbatim.
    out.write(code)
# len(code) would count characters; encode so the figure matches the "bytes"
# label. (Platform newline translation on write can still add bytes.)
print('Appended', len(code.encode('utf-8')), 'bytes')
@@ -0,0 +1,190 @@
# Entry-point name sets consulted by detect_frequency_from_entry_point.
# Each set maps to one frequency label there; names are bare function names
# (no module or class prefix).

# -> "init"
INIT_CALLERS = frozenset((
    "__init__",
    "warmup",
))

# -> "hot"
HOT_CALLERS = frozenset((
    "render_main_toolbar",
    "render_menu_bar",
    "render_frame",
    "update",
))

# -> "per_turn"
PER_TURN_CALLERS = frozenset((
    "_send_anthropic_result",
    "_send_deepseek_result",
    "_send_minimax_result",
    "_send_qwen_result",
    "_send_grok_result",
    "_send_llama_result",
    "_send_gemini_result",
    "_send_gemini_cli_result",
    "process_user_request",
    "_handle_generate_send",
))

# -> "cold"
COLD_CALLERS = frozenset((
    "cleanup",
    "reset_session",
    "_classify_anthropic_error",
    "_classify_gemini_error",
))

# -> "per_discussion"
PER_DISCUSSION_CALLERS = frozenset((
    "save_project",
    "load_project",
    "save_snapshot",
    "load_snapshot",
))

# -> "per_request"
PER_REQUEST_CALLERS = frozenset((
    "_api_get_key",
    "_api_status",
    "_api_performance",
    "_api_gui",
    "_api_mma_status",
    "_api_comms",
    "_api_diagnostics",
))
def detect_frequency_from_entry_point(caller: str, caller_class: str) -> Frequency:
    """Classify an entry-point name into a call-frequency label.

    ``caller`` is tested against the module's caller-name sets in a fixed
    order (init, hot, per_turn, cold, per_discussion, per_request); the
    first set that contains it decides the label. A name found in none of
    them yields ``"unknown"``.

    ``caller_class`` is accepted but does not influence the result.
    """
    ordered_buckets = (
        (INIT_CALLERS, "init"),
        (HOT_CALLERS, "hot"),
        (PER_TURN_CALLERS, "per_turn"),
        (COLD_CALLERS, "cold"),
        (PER_DISCUSSION_CALLERS, "per_discussion"),
        (PER_REQUEST_CALLERS, "per_request"),
    )
    for known_names, label in ordered_buckets:
        if caller in known_names:
            return label
    return "unknown"
def load_frequency_overrides(path: str) -> dict[str, Frequency]:
"""Load frequency overrides from a TOML file."""
p = Path(path)
if not p.exists():
return {}
with p.open("rb") as f:
data = tomllib.load(f)
out: dict[str, Frequency] = {}
for key, value in data.get("frequency", {}).items():
if isinstance(value, str):
out[key] = value
return out
def estimate_call_frequency(
    function: FunctionRef,
    callers: list[tuple[FunctionRef, str]],
    overrides: dict[str, Frequency],
) -> Frequency:
    """Estimate how often ``function`` is called.

    Precedence: override > entry-point detector > unknown.

    An entry in ``overrides`` keyed by ``function.fqname`` always wins.
    Otherwise only the first ``(caller_ref, caller_class)`` pair in
    ``callers`` is consulted: the last dotted component of the caller's
    fqname is handed to ``detect_frequency_from_entry_point``. With no
    override and no callers the result is ``"unknown"``.
    """
    target_name = function.fqname
    if target_name in overrides:
        return overrides[target_name]
    if not callers:
        return "unknown"
    lead_ref, lead_class = callers[0]
    # Entry-point sets hold bare names, so strip any module/class prefix.
    short_name = lead_ref.fqname.rsplit(".", 1)[-1]
    return detect_frequency_from_entry_point(short_name, lead_class)
# Cost-model constants.
# NOTE(review): microsecond units are inferred from the names (``_US`` /
# ``MICROSECOND``) — confirm against the spec.

# Not referenced in the visible part of this module.
MICROSECOND_BUDGET_PER_LLM_TURN: int = 50_000

# Charged once per hot-path field (minimum one) by per_call_cost_us.
BRANCH_DISPATCH_OVERHEAD_US: int = 100

# Charged once per struct field by per_call_cost_us.
ALLOCATION_OVERHEAD_US: int = 50

# The three constants below are not referenced in the visible part of this
# module.
DEAD_FIELD_COST_PER_FIELD_US: int = 10
COMPONENTIZATION_INDIRECTION_US: int = 200
UNIFICATION_INDIRECTION_US: int = 300
def per_call_cost_us(struct_field_count: int, hot_path_field_count: int, struct_frozen: bool) -> int:
    """Return the modelled cost of one call, in microseconds.

    The cost is the sum of three terms:
      * ``ALLOCATION_OVERHEAD_US`` for every struct field,
      * ``BRANCH_DISPATCH_OVERHEAD_US`` for every hot-path field, charged
        for at least one field even when ``hot_path_field_count`` is 0,
      * a flat 20 when the struct is frozen.
    """
    allocation_us = struct_field_count * ALLOCATION_OVERHEAD_US
    dispatch_us = max(hot_path_field_count, 1) * BRANCH_DISPATCH_OVERHEAD_US
    frozen_us = 20 if struct_frozen else 0
    return allocation_us + dispatch_us + frozen_us
# Scale factor applied to a per-call cost by current_total_us, keyed by
# frequency label. "unknown" contributes nothing to the total.
FREQUENCY_MULTIPLIER: dict[Frequency, float] = {
    "hot": 60.0,
    # The three per-unit frequencies are taken at face value.
    "per_turn": 1.0,
    "per_request": 1.0,
    "per_discussion": 1.0,
    # Rare paths are heavily discounted.
    "cold": 0.01,
    "init": 0.001,
    "unknown": 0.0,
}
def current_total_us(per_call_cost: int, frequency: Frequency) -> int:
    """Return the current total microsecond cost (per unit of frequency).

    Multiplies ``per_call_cost`` by the ``FREQUENCY_MULTIPLIER`` entry for
    ``frequency`` and truncates the product to an int.

    Raises:
        KeyError: if ``frequency`` has no entry in ``FREQUENCY_MULTIPLIER``.
    """
    multiplier = FREQUENCY_MULTIPLIER[frequency]
    scaled_cost = per_call_cost * multiplier
    return int(scaled_cost)
def componentize_factor(
access_pattern: AccessPattern,
struct_field_count: int,
struct_frozen: bool,
hot_field_count: int = 0,
) -> float:
"""Determine the componentize factor per spec section 7.5."""
if access_pattern == "field_by_field" and struct_field_count > 10 and not struct_frozen:
return 0.30
if access_pattern == "hot_cold_split" and hot_field_count <= 2 and struct_field_count > 5:
return 0.40
if access_pattern in ("whole_struct", "bulk_batched"):
return -0.20
if access_pattern == "mixed":
return 0.0
return -0.10
def unify_factor(access_pattern: AccessPattern, struct_field_count: int, struct_frozen: bool) -> float:
    """Return the unify factor per spec section 7.5.

    Frozen structs earn a positive factor when small enough:
    bulk_batched with at most 3 fields -> 0.25, whole_struct with at most
    5 fields -> 0.15. Otherwise the factor depends on the pattern alone:
    field_by_field -> -0.30, hot_cold_split -> -0.10, mixed -> 0.0, and
    every other case -> 0.05.
    """
    # Both positive rules require a frozen struct.
    if struct_frozen:
        if access_pattern == "bulk_batched" and struct_field_count <= 3:
            return 0.25
        if access_pattern == "whole_struct" and struct_field_count <= 5:
            return 0.15
    pattern_only_factor = {
        "field_by_field": -0.30,
        "hot_cold_split": -0.10,
        "mixed": 0.0,
    }
    return pattern_only_factor.get(access_pattern, 0.05)
def recommended_direction(
    access_pattern: AccessPattern,
    struct_field_count: int,
    struct_frozen: bool,
    frequency: Frequency,
    hot_field_count: int = 0,
) -> RecommendedDirection:
    """Return the recommended decomposition direction per spec section 7.5.

    Rules, first match wins:
      * field_by_field with more than 10 fields    -> "componentize"
      * hot_cold_split with at most 2 hot fields   -> "componentize"
      * bulk_batched with at most 3 fields         -> "unify"
      * whole_struct with at most 5 fields         -> "unify"
      * mixed pattern, or frequency "unknown"      -> "insufficient_data"
      * everything else                            -> "hold"

    The pattern rules run before the frequency check, so an "unknown"
    frequency only produces "insufficient_data" when none of them matched.

    ``struct_frozen`` is accepted for signature compatibility and does not
    affect the result: the former frozen/whole_struct branch returned
    "hold", exactly like the fallback, so it has been folded into it.
    """
    if access_pattern == "field_by_field" and struct_field_count > 10:
        return "componentize"
    if access_pattern == "hot_cold_split" and hot_field_count <= 2:
        return "componentize"
    if access_pattern == "bulk_batched" and struct_field_count <= 3:
        return "unify"
    if access_pattern == "whole_struct" and struct_field_count <= 5:
        return "unify"
    if access_pattern == "mixed" or frequency == "unknown":
        return "insufficient_data"
    return "hold"
def generate_rationale(
    aggregate: str,
    access_pattern: AccessPattern,
    frequency: Frequency,
    struct_field_count: int,
    struct_frozen: bool,
    direction: RecommendedDirection,
) -> str:
    """Generate the auto-rationale string per spec section 7.5.

    The result restates the inputs and ends with
    ``"Recommended: <direction> because <justification>."``. The
    justification is chosen by ``direction``; an unrecognised direction
    gets ``"no justification available"``.

    ``recommended_direction`` can return "componentize" for a
    hot_cold_split pattern as well as for field_by_field, so that case has
    its own justification. Without it the sentence would claim "the access
    pattern is field_by_field" right after reporting
    ``access_pattern=hot_cold_split``.
    """
    justifications = {
        "componentize": "the access pattern is field_by_field and the struct has many dead fields",
        "unify": "the access pattern is uniform and the struct is small",
        "hold": "the current shape matches the access pattern",
        "insufficient_data": "runtime profiling is needed to determine the dominant pattern",
    }
    justification = justifications.get(direction, "no justification available")
    if direction == "componentize" and access_pattern == "hot_cold_split":
        justification = "the access pattern is hot_cold_split and only a few fields are on the hot path"
    return (
        f"{aggregate}: access_pattern={access_pattern}, frequency={frequency}, "
        f"struct_field_count={struct_field_count}, struct_frozen={struct_frozen}. "
        f"Recommended: {direction} because {justification}."
    )
def compute_decomposition_cost(
    aggregate: str,
    access_pattern: AccessPattern,
    struct_field_count: int,
    struct_frozen: bool,
    frequency: Frequency,
    hot_field_count: int = 0,
) -> DecompositionCost:
    """Build the DecompositionCost record for one aggregate.

    Combines the module's cost helpers: the per-call cost scaled by the
    frequency multiplier gives the current cost estimate, and the
    componentize / unify factors are applied to that estimate to obtain
    savings. A non-positive factor yields zero savings; negative savings
    are never reported. ``batch_size`` is always set to ``None``.
    """
    cost_per_call = per_call_cost_us(
        struct_field_count,
        hot_path_field_count=hot_field_count,
        struct_frozen=struct_frozen,
    )
    total_cost = current_total_us(cost_per_call, frequency)
    direction = recommended_direction(
        access_pattern, struct_field_count, struct_frozen, frequency, hot_field_count
    )
    componentize_gain = componentize_factor(
        access_pattern, struct_field_count, struct_frozen, hot_field_count
    )
    unify_gain = unify_factor(access_pattern, struct_field_count, struct_frozen)

    # Only a positive factor counts as a saving.
    componentize_savings = 0
    if componentize_gain > 0:
        componentize_savings = int(total_cost * componentize_gain)
    unify_savings = 0
    if unify_gain > 0:
        unify_savings = int(total_cost * unify_gain)

    rationale = generate_rationale(
        aggregate, access_pattern, frequency, struct_field_count, struct_frozen, direction
    )
    return DecompositionCost(
        current_cost_estimate=total_cost,
        componentize_savings=componentize_savings,
        unify_savings=unify_savings,
        recommended_direction=direction,
        recommended_rationale=rationale,
        batch_size=None,
        struct_field_count=struct_field_count,
        struct_frozen=struct_frozen,
    )
@@ -0,0 +1,203 @@
import json
def read_input_json(path: str) -> Result[dict]:
    """Read a JSON file and return Result[dict].

    Per error_handling.md stdlib I/O boundary pattern: no exception from
    reading or parsing escapes; each failure becomes a ``Result`` with
    empty ``data`` and one ``ErrorInfo``:

      * ``OSError`` (missing file, permission denied, ...) -> NOT_FOUND
      * ``UnicodeDecodeError`` (file is not UTF-8)         -> INVALID_INPUT
      * ``json.JSONDecodeError`` (malformed JSON)          -> INVALID_INPUT
      * JSON root is not an object                         -> INVALID_INPUT

    On success the parsed dict is returned in ``Result.data``.
    """
    p = Path(path)
    try:
        # "utf-8-sig" decodes plain UTF-8 unchanged and strips a leading
        # BOM; json.loads would otherwise reject a BOM-prefixed document.
        raw = p.read_text(encoding="utf-8-sig")
    except OSError as e:
        return Result(
            data={},
            errors=[ErrorInfo(
                kind=ErrorKind.NOT_FOUND,
                message=f"Cannot read {path}: {e}",
                source="read_input_json",
                original=e,
            )],
        )
    except UnicodeDecodeError as e:
        # The file exists but its bytes are not valid UTF-8: that is bad
        # content, not a missing file.
        return Result(
            data={},
            errors=[ErrorInfo(
                kind=ErrorKind.INVALID_INPUT,
                message=f"Cannot read {path}: {e}",
                source="read_input_json",
                original=e,
            )],
        )
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        return Result(
            data={},
            errors=[ErrorInfo(
                kind=ErrorKind.INVALID_INPUT,
                message=f"Malformed JSON in {path}: {e}",
                source="read_input_json",
                original=e,
            )],
        )
    # Callers expect a mapping; a top-level list or scalar is rejected.
    if not isinstance(data, dict):
        return Result(
            data={},
            errors=[ErrorInfo(
                kind=ErrorKind.INVALID_INPUT,
                message=f"JSON root in {path} is not a dict",
                source="read_input_json",
            )],
        )
    return Result(data=data)
# Contract for each of the 6 input sources: the command that produces the
# JSON ("producer") and the file name it is stored under ("filename").
# Every filename is "<source>.json"; every producer is
# "scripts/<script>.py --json". Only type_registry uses a script whose
# name differs from the source name.
INPUT_JSON_CONTRACTS: dict[str, dict[str, str]] = {
    source: {
        "producer": f"scripts/{script}.py --json",
        "filename": f"{source}.json",
    }
    for source, script in (
        ("audit_weak_types", "audit_weak_types"),
        ("audit_exception_handling", "audit_exception_handling"),
        ("audit_optional_in_3_files", "audit_optional_in_3_files"),
        ("audit_no_models_config_io", "audit_no_models_config_io"),
        ("audit_main_thread_imports", "audit_main_thread_imports"),
        ("type_registry", "generate_type_registry"),
    )
}
def find_enclosing_function(
    file: str,
    line: int,
    function_refs: list[FunctionRef],
) -> FunctionRef | None:
    """Tier 1 of the 3-tier mapping: find the function ref at (file, line).

    Among the refs whose ``file`` equals ``file`` and whose ``line`` is at
    or before ``line``, returns the one with the greatest ``line``. If
    several share that line, the earliest in ``function_refs`` wins.
    Returns ``None`` when no ref qualifies.

    Only the ref's starting ``line`` is compared; no end-of-function
    bound is checked.
    """
    nearest = None
    for ref in function_refs:
        if ref.file != file or ref.line > line:
            continue
        # Strict ">" keeps the earliest ref when start lines tie.
        if nearest is None or ref.line > nearest.line:
            nearest = ref
    return nearest
def compute_result_coverage(
    producers: list[FunctionRef],
    consumers: list[FunctionRef],
    branches_on_errors: set[str],
) -> ResultCoverage:
    """Compute the per-aggregate result coverage.

    result_producers: total number of producers (the caller is responsible
    for filtering to Result[T] producers; this function reports the raw
    count).

    result_consumers: number of entries in ``consumers`` whose fqname is in
    ``branches_on_errors`` (the caller passes the set from AST analysis).
    Like the producer count this is a raw count over the list, so it uses
    the same basis as ``total_consumers``. Counting distinct fqnames here
    while ``total_consumers`` counted list entries understated the
    percentage whenever a consumer appeared more than once.

    The summary reports both ratios as whole percentages; an empty list
    yields 0%.
    """
    total_producers = len(producers)
    result_producers = total_producers
    total_consumers = len(consumers)
    result_consumers = sum(1 for c in consumers if c.fqname in branches_on_errors)
    # result_producers always equals total_producers, so the producer
    # percentage is 100 whenever there is at least one producer.
    pct_p = 100 if total_producers > 0 else 0
    pct_c = (result_consumers / total_consumers * 100) if total_consumers > 0 else 0
    summary = f"{result_producers}/{total_producers} producers return Result[T] ({pct_p:.0f}%); {result_consumers}/{total_consumers} consumers branch on .errors ({pct_c:.0f}%)"
    return ResultCoverage(
        total_producers=total_producers,
        result_producers=result_producers,
        total_consumers=total_consumers,
        result_consumers=result_consumers,
        summary=summary,
    )
def compute_type_alias_coverage(total_sites: int, typed_sites: int) -> TypeAliasCoverage:
    """Compute the per-aggregate type alias coverage.

    ``untyped_sites`` is ``total_sites - typed_sites``. The summary gives
    the typed and untyped shares as whole percentages of ``total_sites``;
    both are 0 when ``total_sites`` is not positive.
    """
    untyped_sites = total_sites - typed_sites
    if total_sites > 0:
        typed_pct = typed_sites / total_sites * 100
        untyped_pct = untyped_sites / total_sites * 100
    else:
        typed_pct = 0
        untyped_pct = 0
    summary = f"{total_sites} total sites; {typed_sites} typed ({typed_pct:.0f}%); {untyped_sites} untyped ({untyped_pct:.0f}%)"
    return TypeAliasCoverage(
        total_sites=total_sites,
        typed_sites=typed_sites,
        untyped_sites=untyped_sites,
        summary=summary,
    )
def aggregate_cross_audit_findings(
    audit_name: str,
    findings: list[dict],
    example_file: str,
    example_line: int,
) -> CrossAuditFindings:
    """Aggregate audit findings into a per-aggregate CrossAuditFindings.

    The result has five buckets, all empty tuples by default. When
    ``findings`` is non-empty and ``audit_name`` is one of the five known
    audit scripts, that audit's bucket holds a single CrossAuditFinding
    carrying the site count, the example location and a generated note.

    Empty ``findings`` or an unrecognised ``audit_name`` give an all-empty
    result (the empty audit case is represented by 5 empty tuples, not 5
    tuples of zero-count CrossAuditFinding entries).
    """
    bucket_for_audit = {
        "audit_weak_types": "weak_types",
        "audit_exception_handling": "exception_handling",
        "audit_optional_in_3_files": "optional_in_baseline",
        "audit_no_models_config_io": "config_io_ownership",
        "audit_main_thread_imports": "import_graph",
    }
    # Start from "every bucket empty" and fill in at most one of them.
    bucket_values = dict.fromkeys(bucket_for_audit.values(), ())
    if not findings:
        return CrossAuditFindings(**bucket_values)

    count = len(findings)
    entry = CrossAuditFinding(
        audit_script=audit_name,
        site_count=count,
        example_file=example_file,
        example_line=example_line,
        note=f"{count} sites in producer+consumer functions",
    )
    target_bucket = bucket_for_audit.get(audit_name)
    if target_bucket is not None:
        bucket_values[target_bucket] = (entry,)
    return CrossAuditFindings(**bucket_values)
def run_all_cross_audit_reads(audit_inputs_dir: str) -> dict[str, dict]:
    """Read all 6 input JSONs from audit_inputs_dir.

    Returns a dict keyed by audit_name, one entry per contract in
    ``INPUT_JSON_CONTRACTS``. A missing or unreadable/malformed file is
    tolerated and maps to an empty dict.

    If ``audit_inputs_dir`` itself does not exist the returned dict is
    empty: it has no keys at all.
    """
    results: dict[str, dict] = {}
    base_dir = Path(audit_inputs_dir)
    if not base_dir.exists():
        return results
    for audit_name, contract in INPUT_JSON_CONTRACTS.items():
        candidate = base_dir / contract["filename"]
        payload: dict = {}
        if candidate.exists():
            outcome = read_input_json(str(candidate))
            if outcome.ok:
                payload = outcome.data
        results[audit_name] = payload
    return results
# Arity table for the 14 tagged DSL words introduced in v2.
# NOTE(review): presumably each value is the number of operands the word
# takes — confirm against the DSL parser.
DSL_WORD_ARITY_V2: dict[str, int] = dict((
    ("kind", 1),
    ("mem-dim", 1),
    ("fn-ref", 4),
    ("access-pattern", 1),
    ("ap-evidence", 4),
    ("frequency", 1),
    ("freq-evidence", 4),
    ("result-coverage", 5),
    ("type-alias-coverage", 4),
    ("cross-audit-finding", 5),
    ("cross-audit-findings", 5),
    ("decomp-cost", 8),
    ("opt-candidate", 7),
    ("is-candidate", 1),
))