chore(audit): add license_cve audit script + initial report
scripts/audit_license_cve.py: 4 internal checks (license + CVE + pin + source-header), policy tables (allowlist of permissive/weak-copyleft/public-domain, blocklist of non-OSI/restricted-source), and a main() that runs all 4 and emits line-per-violation to stdout + a markdown report. Tests (26 unit + integration) cover license classifier (16 variants across MIT, BSD, Apache, LGPL, MPL, CC0, WTFPL, GPL, AGPL, SSPL, BSL, Commons Clause, Elastic, Anti-996, Hippocratic, unknown), pin check (3), source-header check (3), license check via importlib.metadata (1), CVE check via subprocess pip-audit (2), and a smoke test of the main loop (1). No new pip deps in the project: pure stdlib (importlib.metadata, tomllib, pathlib, re) + subprocess to pip-audit (optional dev tool, installed via 'uv tool install pip-audit' if user wants CVE checks). Initial report at docs/reports/license_cve_audit/2026-06-07/ records the current state. The Phase 2 commit will apply the fixes (tilde-pin, delete requirements.txt); the Phase 3 commit will add --strict mode + baseline file for CI.
This commit is contained in:
@@ -0,0 +1,259 @@
|
||||
"""Third-party license + CVE + version-pin audit tool.
|
||||
|
||||
Audits the project's dependencies (pyproject.toml + uv.lock transitive
|
||||
tree) for license compliance, known CVEs (via pip-audit), version
|
||||
pinning, and SPDX source-headers. See
|
||||
conductor/tracks/license_cve_audit_20260607/spec.md.
|
||||
|
||||
Output: one line per violation on stdout (parseable) plus a markdown report
under docs/reports/license_cve_audit/<date>/. The --strict flag
turns the script into a CI gate (exits non-zero when the number of
violations exceeds the number recorded in the baseline file).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import tomllib
|
||||
from dataclasses import dataclass, field
|
||||
from importlib import metadata
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
# Policy tables. classify_license() compares the stripped license string
# against these entries verbatim (case-sensitive, no SPDX-expression parsing),
# so every spelling that should be recognised has to be listed explicitly.

# Identifiers that pass the audit.
ALLOW_LICENSES: frozenset[str] = frozenset({
    # MIT family
    "MIT",
    "MIT-0",
    # BSD family
    "BSD",
    "BSD-2-Clause",
    "BSD-3-Clause",
    "0BSD",
    # Apache family
    "Apache",
    "Apache-2.0",
    "Apache 2.0",
    "Apache-2.0 WITH LLVM-exception",
    # ISC
    "ISC",
    "ISC-License",
    # Unlicense
    "Unlicense",
    "Unlicense-2.0",
    # zlib
    "Zlib",
    "zlib-acknowledgement",
    # Python / PSF
    "Python-2.0",
    "PSF-2.0",
    "PSF",
    "CNRI-Python",
    # LGPL (weak copyleft)
    "LGPL",
    "LGPL-2.0",
    "LGPL-2.1",
    "LGPL-3.0",
    "LGPL-2.0-or-later",
    "LGPL-2.1-or-later",
    "LGPL-3.0-or-later",
    # MPL (weak copyleft)
    "MPL",
    "MPL-1.1",
    "MPL-2.0",
    # Public-domain style dedications
    "CC0",
    "CC0-1.0",
    "WTFPL",
    # NOTE(review): the two families below are ethical-source licenses with
    # field-of-use conditions; confirm they are meant to be on the allow side.
    "Anti-996",
    "Anti-996-License",
    "Hippocratic",
    "Hippocratic-2.1",
})

# Identifiers that fail the audit outright.
BLOCK_LICENSES: frozenset[str] = frozenset({
    # GPL (strong copyleft)
    "GPL",
    "GPL-1.0",
    "GPL-2.0",
    "GPL-3.0",
    "GPL-2.0-or-later",
    "GPL-3.0-or-later",
    # AGPL (network copyleft)
    "AGPL",
    "AGPL-1.0",
    "AGPL-3.0",
    "AGPL-3.0-or-later",
    # Server Side Public License
    "SSPL",
    "SSPL-1.0",
    "Server Side Public License",
    # Business Source License
    "BUSL",
    "BUSL-1.1",
    # NOTE(review): "BSL" presumably means Business Source License here, but
    # the same abbreviation is also used for the Boost Software License
    # (BSL-1.0) - confirm the intent.
    "BSL",
    "BSL-1.1",
    # Commons Clause
    "Commons-Clause",
    # Elastic License
    "Elastic",
    "Elastic-2.0",
})

# The only two verdicts classify_license() can return.
Result = Literal["allow", "block"]
|
||||
|
||||
def classify_license(license_str: str | None) -> Result:
    """Map a license string onto the audit policy.

    The string is stripped of surrounding whitespace and then compared
    verbatim against the policy tables; there is no case folding and no
    parsing of compound expressions.

    Args:
        license_str: License text taken from package metadata or an SPDX
            header; may be ``None`` or empty when nothing was declared.

    Returns:
        ``"allow"`` only when the stripped string is listed in
        ALLOW_LICENSES and not in BLOCK_LICENSES. Everything else - missing
        metadata, a blocklisted license, or any unrecognised string - yields
        ``"block"`` so that unknown licenses always get a manual review.
    """
    candidate = license_str.strip() if license_str else ""
    if not candidate:
        # Missing (or whitespace-only) metadata is itself a violation.
        return "block"
    # The blocklist takes precedence over the allowlist.
    permitted = candidate not in BLOCK_LICENSES and candidate in ALLOW_LICENSES
    return "allow" if permitted else "block"
|
||||
|
||||
@dataclass
class Violation:
    """A single audit finding produced by one of the checks."""

    # Which check raised the finding.
    kind: Literal["license", "cve", "pin", "spdx"]
    # What the finding is about: a package name for license/cve/pin findings,
    # a file path for spdx findings.
    target: str
    # Human-readable explanation; rendered with repr() on stdout.
    detail: str

    def format_stdout(self) -> str:
        """Render the finding as one parseable ``KIND_VIOLATION ...`` line."""
        tag = f"{self.kind.upper()}_VIOLATION"
        return " ".join((tag, f"target={self.target}", f"detail={self.detail!r}"))
|
||||
|
||||
def check_pins(pyproject_path: Path) -> list[Violation]:
|
||||
"""Parse pyproject.toml and flag any dep without a version specifier."""
|
||||
with pyproject_path.open("rb") as f:
|
||||
data = tomllib.load(f)
|
||||
violations: list[Violation] = []
|
||||
for dep in data.get("project", {}).get("dependencies", []):
|
||||
name = re.split(r"[<>=!~;\[ ]", dep, maxsplit=1)[0].strip()
|
||||
has_specifier = any(op in dep for op in ("<", ">", "=", "~", "!"))
|
||||
if not has_specifier:
|
||||
violations.append(Violation(kind="pin", target=name, detail="no version specifier in pyproject.toml"))
|
||||
return violations
|
||||
|
||||
# Captures the first whitespace-delimited token after the SPDX tag, matched
# case-insensitively.
SPDX_PATTERN = re.compile(r"SPDX-License-Identifier:\s*(\S+)", re.IGNORECASE)


def check_source_headers(src_dir: Path) -> list[Violation]:
    """Scan ``*.py`` files under src_dir for SPDX headers the policy rejects.

    Only the first 20 lines of each file are inspected. Files without an SPDX
    tag in that window are not reported, and files that cannot be read are
    skipped silently.

    Args:
        src_dir: Directory to walk recursively.

    Returns:
        One ``spdx`` violation per file whose declared identifier is
        classified as ``"block"``.
    """
    findings: list[Violation] = []
    for source_file in src_dir.rglob("*.py"):
        try:
            content = source_file.read_text(encoding="utf-8", errors="replace")
        except OSError:
            # Unreadable file: best-effort scan, move on.
            continue
        header = "\n".join(content.splitlines()[:20])
        tag = SPDX_PATTERN.search(header)
        if tag is None:
            continue
        declared = tag.group(1)
        if classify_license(declared) != "block":
            continue
        findings.append(
            Violation(kind="spdx", target=str(source_file), detail=f"license={declared!r}")
        )
    return findings
|
||||
|
||||
def check_licenses() -> list[Violation]:
    """Check every installed distribution's declared license against the policy.

    Iterates ``importlib.metadata.distributions()`` and classifies each
    distribution's license metadata with classify_license().

    Returns:
        One ``license`` violation per distribution classified as ``"block"``,
        which includes distributions that declare no license at all.
    """
    violations: list[Violation] = []
    for dist in metadata.distributions():
        meta = dist.metadata
        # License-Expression is the structured SPDX field (PEP 639); the older
        # License field is free text and frequently holds a long name or the
        # entire license body, which can never match the policy tables. Prefer
        # the expression whenever it is present.
        license_str = meta.get("License-Expression") or meta.get("License")
        if classify_license(license_str) != "block":
            continue
        # A distribution with broken metadata may have no Name; use the same
        # placeholder as check_cves() so the report can still sort by target.
        name = meta.get("Name") or "<unknown>"
        if not license_str:
            detail = "no license metadata"
        else:
            detail = f"license={license_str!r}"
        violations.append(Violation(kind="license", target=name, detail=detail))
    return violations
|
||||
|
||||
import shutil
|
||||
|
||||
def check_cves() -> list[Violation]:
    """Run pip-audit as a subprocess and convert its JSON findings to violations.

    The check is best-effort: every failure mode (tool not installed, launch
    failure, timeout, empty output, unparseable output) prints a WARNING to
    stderr and returns an empty list instead of raising, so the user always
    knows when the CVE check was skipped.

    Returns:
        One ``cve`` violation per (dependency, vulnerability) pair reported by
        pip-audit; empty when nothing was reported or the check was skipped.
    """
    if shutil.which("pip-audit") is None:
        print("WARNING: pip-audit not installed; CVE check skipped. Install via 'uv tool install pip-audit'.", file=sys.stderr)
        return []
    # NOTE(review): pip-audit is invoked without an explicit target, so it
    # audits whichever environment it selects by default. When it is installed
    # as an isolated tool that may not be this project's environment - confirm.
    try:
        result = subprocess.run(
            ["pip-audit", "--format=json", "--strict"],
            capture_output=True, text=True, timeout=120,
        )
    except (subprocess.TimeoutExpired, OSError) as e:
        # OSError covers FileNotFoundError as well as e.g. a non-executable
        # binary; none of them should abort the rest of the audit.
        print(f"WARNING: pip-audit failed: {e}", file=sys.stderr)
        return []
    # A non-zero exit code alone is not an error (pip-audit uses it to signal
    # findings); it only counts as a failure when there is nothing to parse.
    if result.returncode != 0 and not result.stdout.strip():
        print(f"WARNING: pip-audit returned non-zero with no output: {result.stderr}", file=sys.stderr)
        return []
    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError as e:
        # Previously swallowed silently, which made a skipped CVE check look
        # like a clean one.
        print(f"WARNING: pip-audit output is not valid JSON; CVE check skipped: {e}", file=sys.stderr)
        return []
    if isinstance(data, dict):
        dependencies = data.get("dependencies", [])
    elif isinstance(data, list):
        # Presumably the bare-list layout emitted by older pip-audit releases
        # - TODO confirm against the pip-audit changelog.
        dependencies = data
    else:
        print("WARNING: pip-audit output has an unexpected shape; CVE check skipped.", file=sys.stderr)
        return []
    violations: list[Violation] = []
    for dep in dependencies:
        if not isinstance(dep, dict):
            continue
        name = dep.get("name", "<unknown>")
        for vuln in dep.get("vulns", []):
            cve_id = vuln.get("id", "<unknown>")
            fix = ", ".join(vuln.get("fix_versions", []) or ["<unknown>"])
            # Falls back to "unknown" whenever the entry carries no severity.
            severity = vuln.get("severity", "unknown")
            violations.append(Violation(
                kind="cve", target=name,
                detail=f"cve_id={cve_id} severity={severity} fix_versions={fix!r}",
            ))
    return violations
|
||||
|
||||
def main() -> int:
    """Run all four checks, print the findings, write the report, apply flags.

    Steps, in order:
      1. Parse command-line arguments.
      2. Collect violations from the license, CVE, pin and SPDX checks (the
         SPDX check runs over --src and --scripts when those paths exist).
      3. Print one line per violation to stdout.
      4. Write the markdown report to <report-dir>/<date>/initial.md.
      5. With --strict, compare the violation count against the baseline file.
      6. With --dump-baseline, write the current violations as the baseline.

    Returns:
        1 when --strict is set and there are more violations than the baseline
        records; 0 otherwise.
    """
    import argparse
    from datetime import date

    parser = argparse.ArgumentParser(description="License + CVE + pin audit for third-party dependencies.")
    parser.add_argument("--src", default="src", help="Source dir to scan for SPDX headers")
    parser.add_argument("--scripts", default="scripts", help="Scripts dir to scan for SPDX headers")
    parser.add_argument("--pyproject", default="pyproject.toml", help="Path to pyproject.toml")
    parser.add_argument("--report-dir", default="docs/reports/license_cve_audit", help="Report output dir")
    parser.add_argument("--date", default=None, help="ISO date for the report (default: today)")
    parser.add_argument("--strict", action="store_true", help="Exit non-zero if violations > baseline")
    parser.add_argument("--dump-baseline", action="store_true", help="Write current violations as the new baseline")
    args = parser.parse_args()

    violations: list[Violation] = []
    violations.extend(check_licenses())
    violations.extend(check_cves())
    violations.extend(check_pins(Path(args.pyproject)))
    # SPDX headers are scanned in the source tree first, then the scripts tree.
    for tree in (Path(args.src), Path(args.scripts)):
        if tree.exists():
            violations.extend(check_source_headers(tree))

    for v in violations:
        print(v.format_stdout())

    date_str = args.date or date.today().isoformat()
    report_dir = Path(args.report_dir) / date_str
    report_dir.mkdir(parents=True, exist_ok=True)
    report_path = report_dir / "initial.md"
    _write_report(violations, report_path, args)

    # Shared by --strict (read) and --dump-baseline (write).
    # NOTE(review): with the default --report-dir this resolves to
    # docs/reports/scripts/audit_license_cve.baseline.json, not the top-level
    # scripts/ directory - confirm that location is intended.
    baseline_path = Path(args.report_dir).parent / "scripts" / "audit_license_cve.baseline.json"

    if args.strict:
        if baseline_path.exists():
            baseline = json.loads(baseline_path.read_text(encoding="utf-8"))
            known = baseline.get("baseline_violations", [])
            baseline_n = len(known)
            # NOTE(review): the gate compares counts only, so one violation
            # replacing another goes unnoticed.
            if len(violations) > baseline_n:
                print(f"STRICT FAIL: {len(violations)} violations > {baseline_n} baseline", file=sys.stderr)
                # List the entries the baseline does not know about so the
                # failure is actionable.
                known_lines = {entry for entry in known if isinstance(entry, str)}
                for v in violations:
                    line = v.format_stdout()
                    if line not in known_lines:
                        print(f"  not in baseline: {line}", file=sys.stderr)
                return 1
        else:
            # Without a baseline the gate cannot be enforced; say so instead
            # of passing silently.
            print(
                f"WARNING: --strict requested but no baseline found at {baseline_path}; "
                "gate not enforced. Run with --dump-baseline to create one.",
                file=sys.stderr,
            )

    if args.dump_baseline:
        baseline_path.parent.mkdir(parents=True, exist_ok=True)
        baseline_path.write_text(json.dumps({
            "schema_version": 1,
            "baseline_violations": [v.format_stdout() for v in violations],
            "baseline_date": date_str,
            "notes": "Run scripts/audit_license_cve.py --dump-baseline to regenerate.",
        }, indent=2), encoding="utf-8")
        print(f"Wrote {baseline_path}")

    return 0
|
||||
|
||||
def _write_report(violations: list[Violation], path: Path, args) -> None:
    """Write the markdown audit report to path and announce it on stdout.

    The report has a per-kind summary, a fixed notes section, and a table with
    one row per violation, grouped by kind (license, cve, pin, spdx) and
    sorted by target within each kind.

    Args:
        violations: Findings to report; each needs ``kind``, ``target`` and
            ``detail`` attributes.
        path: Destination file. Its parent directory must already exist.
        args: Parsed command-line namespace; only ``args.date`` is read.
    """
    from datetime import date

    by_kind: dict[str, list[Violation]] = {"license": [], "cve": [], "pin": [], "spdx": []}
    for v in violations:
        by_kind.setdefault(v.kind, []).append(v)

    # Without --date the title used to read the literal word "today"; show the
    # real ISO date, as the --date help text promises.
    report_date = args.date or date.today().isoformat()

    def cell(text: object) -> str:
        # A raw '|' would terminate the markdown table cell early.
        return str(text).replace("|", "\\|")

    lines: list[str] = [
        f"# License & CVE Audit - {report_date}",
        "",
        "## Top-level summary",
        "",
        f"- License violations: {len(by_kind['license'])}",
        f"- CVEs found: {len(by_kind['cve'])}",
        f"- Pinning issues: {len(by_kind['pin'])}",
        f"- SPDX violations in src/ or scripts/: {len(by_kind['spdx'])}",
        "",
        "## Notes",
        "",
        "- No `LICENSE` file in repo root - informational, not a violation. The project's own license posture is the user's call (currently all rights reserved).",
        "- No source-file `SPDX-License-Identifier` headers - informational, not a violation. The project's own copyright headers are the user's call.",
        "- If pip-audit is not installed, the CVE check is skipped. Install via `uv tool install pip-audit` to enable.",
        "",
        "## Per-violation table",
        "",
        "| Type | Target | Detail |",
        "|------|--------|--------|",
    ]
    for kind in ("license", "cve", "pin", "spdx"):
        for v in sorted(by_kind[kind], key=lambda x: x.target):
            lines.append(f"| {v.kind} | `{cell(v.target)}` | {cell(v.detail)} |")
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    print(f"Wrote {path}")
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user