docs(reports): TRACK_COMPLETION for video_analysis_campaign_20260621 (Phase 0+1+2 init only)

2026-06-21 15:44:06 -04:00
parent 365fa554d9
commit ebadfda9d6
2 changed files with 460 additions and 0 deletions
@@ -0,0 +1,313 @@
+"""One-time scaffold generator for video_analysis_campaign_20260621 child + synthesis tracks.
+
+The child list is hard-coded in VIDEOS below (the umbrella's README.md is not read at runtime). For each child
+this writes plan.md + metadata.json + state.toml; for the synthesis track it writes metadata.json + state.toml.
+
+Per Tier 2 sandbox convention (conductor/workflow.md "Throw-away scripts"), this lives in
+scripts/tier2/artifacts/video_analysis_campaign_20260621/ and is NOT shipped to production.
+"""
+from __future__ import annotations
+
+import json
+import re
+from pathlib import Path
+
+# Repository root. Per the module docstring this script sits in
+# scripts/tier2/artifacts/video_analysis_campaign_20260621/, so parents[4] walks from that
+# directory (parents[0]) up through artifacts/, tier2/ and scripts/ to the repo root.
+ROOT = Path(__file__).resolve().parents[4]
+# Folder of the umbrella track. Not referenced by any function in this script.
+UMBRELLA = ROOT / "conductor" / "tracks" / "video_analysis_campaign_20260621"
+# Parent folder under which main() writes one sub-folder per child track plus the synthesis track.
+TRACKS_DIR = ROOT / "conductor" / "tracks"
+
+# One dict per child video track. Fields as consumed by the templates in this file:
+#   order               - child number printed in plan.md ("This is child #N")
+#   slug                - embedded in the track id: video_analysis_<slug>_20260621
+#   cluster             - letter key into CLUSTER_LEGEND and CLUSTER_BLOCKED_BY
+#   title               - track name in metadata.json / state.toml and the plan.md goal line
+#   youtube_id / url    - source references copied into plan.md and metadata.json
+#   author              - may be None; plan.md then prints "(unknown)", metadata.json stores null
+#   needs_yt_dlp_verify - True adds the "Step 0" yt-dlp check to plan.md and marks risk R5 as "high"
+VIDEOS = [
+ {"order": 1, "slug": "cs229_building_llms", "cluster": "E", "title": "Stanford CS229 - Building Large Language Models (LLMs)", "youtube_id": "9vM4p9NN0Ts", "author": "Stanford CS229", "url": "https://youtu.be/9vM4p9NN0Ts", "needs_yt_dlp_verify": True},
+ {"order": 2, "slug": "probability_logic", "cluster": "A", "title": "Probability Theory is an Extension of Logic", "youtube_id": "0yF9TvMeAzM", "author": None, "url": "https://youtu.be/0yF9TvMeAzM", "needs_yt_dlp_verify": False},
+ {"order": 3, "slug": "entropy_epiplexity", "cluster": "A", "title": "From Entropy to Epiplexity", "youtube_id": "_U8AwUq_aJQ", "author": "Andrew Wilson and Marc Finzi", "url": "https://youtu.be/_U8AwUq_aJQ", "needs_yt_dlp_verify": False},
+ {"order": 4, "slug": "score_dynamics_giorgini", "cluster": "A", "title": "Learning Dynamics from Statistics: a score-based approach", "youtube_id": "P75iVMmbqQk", "author": "Ludovico Giorgini", "url": "https://youtu.be/P75iVMmbqQk", "needs_yt_dlp_verify": False},
+ {"order": 5, "slug": "platonic_intelligence_kumar", "cluster": "B", "title": "Towards a Platonic Intelligence with Unified Factored Representations", "youtube_id": "1mXUFweWOug", "author": "Akarsh Kumar", "url": "https://youtu.be/1mXUFweWOug", "needs_yt_dlp_verify": False},
+ {"order": 6, "slug": "free_lunches_levin", "cluster": "B", "title": "Free Lunches: Model Systems for Studying the Agential Gifts from the Platonic Space", "youtube_id": "K8BmMU1Tm-I", "author": "Michael Levin", "url": "https://youtu.be/K8BmMU1Tm-I", "needs_yt_dlp_verify": False},
+ {"order": 7, "slug": "generic_systems_fields", "cluster": "C", "title": "Interesting Behavior by Generic Systems", "youtube_id": "QeMajYvhEbI", "author": "Chris Fields", "url": "https://youtu.be/QeMajYvhEbI", "needs_yt_dlp_verify": False},
+ {"order": 8, "slug": "brain_counterintuitive", "cluster": "C", "title": "The Most Counterintuitive Way to Build a Brain", "youtube_id": "cDxtFtoQVNc", "author": None, "url": "https://youtu.be/cDxtFtoQVNc", "needs_yt_dlp_verify": False},
+ {"order": 9, "slug": "neural_dynamics_miller", "cluster": "C", "title": "Cognition Emerges from Neural Dynamics", "youtube_id": "0BS-BzEFTXA", "author": "Earl Miller", "url": "https://youtu.be/0BS-BzEFTXA", "needs_yt_dlp_verify": False},
+ {"order": 10, "slug": "multiscale_hoffman", "cluster": "C", "title": "A Multiscale Logic of Collective Intelligence", "youtube_id": "YnfaT5APPB0", "author": "Donald Hoffman and Chetan Prakash", "url": "https://youtu.be/YnfaT5APPB0", "needs_yt_dlp_verify": False},
+ {"order": 11, "slug": "cs336_architectures", "cluster": "E", "title": "Stanford CS336 Lecture 3: Architectures", "youtube_id": "lVynu4bo1rY", "author": "Stanford CS336 Spring 2026", "url": "https://youtu.be/lVynu4bo1rY", "needs_yt_dlp_verify": True},
+ {"order": 12, "slug": "creikey_dl_cv", "cluster": "D", "title": "Creikey - Deep Learning and Computer Vision for Game Developers (BSC 2025)", "youtube_id": "yxkUvXs-hoQ", "author": "Creikey", "url": "https://youtu.be/yxkUvXs-hoQ", "needs_yt_dlp_verify": False},
+]
+
+# Human-readable label for each cluster letter used in VIDEOS; plan_template prints it in
+# parentheses after the letter on the "**Cluster:**" line of plan.md.
+CLUSTER_LEGEND = {
+ "A": "Math & information-theoretic foundations",
+ "B": "Platonic / geometric AI representations",
+ "C": "Biological / cognitive / generic systems",
+ "D": "Applied / practical",
+ "E": "Stanford course VODs >1hr",
+}
+
+# Extra blocker track ids per cluster letter. metadata_template appends them after the umbrella
+# track in "blocked_by", and state_template lists them under [blocked_by] in state.toml.
+# Cluster E has no extra blockers, so its children are blocked by the umbrella only.
+CLUSTER_BLOCKED_BY = {
+ "A": ["video_analysis_cs229_building_llms_20260621"],
+ "B": ["video_analysis_score_dynamics_giorgini_20260621"],
+ "C": ["video_analysis_free_lunches_levin_20260621"],
+ "D": ["video_analysis_cs336_architectures_20260621"],
+ "E": [],
+}
+
+
+def plan_template(v: dict) -> str:
+ yt_dlp_verify_step = ""
+ if v["needs_yt_dlp_verify"]:
+  yt_dlp_verify_step = (
+   "\n- [ ] **Step 0: yt-dlp access verification (R5).** "
+   "Run `uv run yt-dlp --simulate {url}` to confirm yt-dlp can fetch metadata. "
+   "If it fails (HTTP 401/403), fall back to manual transcript sourcing or escalate per umbrella spec §13 R5.\n".format(url=v["url"])
+  )
+ return f"""# Plan: video_analysis_{v['slug']}_20260621
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox syntax for tracking.
+
+**Goal:** Execute the 5-phase pipeline (Acquire → Keyframes → OCR → Synthesis → Verification) for *{v['title']}* and ship `report.md` (1000-10000 LOC) + `summary.md` (200-400 words).
+
+**Parent:** This is child #{v['order']} of the [video_analysis_campaign_20260621](../../video_analysis_campaign_20260621/) umbrella.
+
+**Source:** {v['url']} (YouTube ID `{v['youtube_id']}`)
+**Cluster:** {v['cluster']} ({CLUSTER_LEGEND.get(v['cluster'], '')})
+**Author:** {v['author'] or '(unknown)'}
+
+---
+
+## Phase 1: Acquire
+
+{yt_dlp_verify_step}- [ ] **Step 1: Run extract_transcript.py**
+ - `uv run python scripts/video_analysis/extract_transcript.py {v['url']} artifacts/transcript.json`
+ - Commit `artifacts/transcript.json` atomically.
+- [ ] **Step 2: Run download_video.py**
+ - `uv run python scripts/video_analysis/download_video.py {v['url']} artifacts/video.mp4`
+ - Commit `artifacts/video.mp4` (gitignored) + `artifacts/video.log` atomically.
+
+## Phase 2: Keyframes
+
+- [ ] **Step 1: Run extract_keyframes.py**
+ - `uv run python scripts/video_analysis/extract_keyframes.py artifacts/video.mp4 artifacts/frames --threshold 0.4`
+ - Commit `artifacts/frames/*.jpg` + `artifacts/extraction_meta.json` atomically.
+- [ ] **Step 2: Manual review** — flag any frames that look wrong.
+
+## Phase 3: OCR
+
+- [ ] **Step 1: Run ocr_frames.py**
+ - `uv run python scripts/video_analysis/ocr_frames.py artifacts/frames artifacts/ocr.md --backend winsdk`
+ - Commit `artifacts/ocr.md` atomically.
+- [ ] **Step 2: Spot-check OCR quality.**
+
+## Phase 4: Synthesis (DELEGATE TO TIER 3 WORKER)
+
+- [ ] **Step 1: Delegate report writing**
+ - Inputs: `artifacts/transcript.json` + `artifacts/ocr.md` + `artifacts/frames/*.jpg`
+ - Output: `report.md` (1000-10000 LOC) + `summary.md` (200-400 words)
+ - 8-section structure per umbrella spec §FR6
+ - Cross-references to other children (forward + backward)
+- [ ] **Step 2: Human review + iterate**
+
+## Phase 5: Verification
+
+- [ ] **Step 1: Idempotency check** — re-run scripts, confirm outputs match modulo timestamps
+- [ ] **Step 2: Audit checklist** — every section of `report.md` populated, no "TBD"
+- [ ] **Step 3: Write end-of-track report** at `docs/reports/TRACK_COMPLETION_video_analysis_{v['slug']}_20260621.md`
+- [ ] **Step 4: Update state.toml** to `status = "completed"`
+
+## Self-review
+
+- [ ] `report.md` is 1000-10000 LOC markdown
+- [ ] `summary.md` is 200-400 words
+- [ ] All 7 deliverable artifacts present
+- [ ] All 8 report sections populated
+- [ ] Per-task commits with git notes
+"""
+
+
+def metadata_template(v: dict) -> str:
+ cluster_blockers = CLUSTER_BLOCKED_BY.get(v["cluster"], [])
+ all_blockers = ["video_analysis_campaign_20260621"] + cluster_blockers
+ return json.dumps({
+  "track_id": f"video_analysis_{v['slug']}_20260621",
+  "name": v["title"],
+  "created": "2026-06-21",
+  "status": "spec_approved",
+  "blocked_by": all_blockers,
+  "blocks": [],
+  "priority": "A",
+  "type": "per-child research track (Pass 1 of 3)",
+  "parent": "video_analysis_campaign_20260621",
+  "domain": "meta-tooling (research artifacts; no manual_slop src/ changes)",
+  "cluster": v["cluster"],
+  "youtube_id": v["youtube_id"],
+  "youtube_url": v["url"],
+  "author": v["author"],
+  "scope": {
+   "new_files": [
+    "artifacts/transcript.json",
+    "artifacts/ocr.md",
+    "artifacts/frames/*.jpg",
+    "artifacts/extraction_meta.json",
+    "artifacts/video.mp4 (gitignored)",
+    "artifacts/video.log",
+    "report.md (1000-10000 LOC target)",
+    "summary.md (200-400 words)",
+   ],
+   "modified_files": [],
+   "deleted_files": [],
+  },
+  "estimated_effort": {
+   "method": "scope (per conductor/workflow.md Tier 1 Track Initialization Rules). NO day estimates.",
+   "phase_1": "1 task: acquire (transcript + download)",
+   "phase_2": "1 task: keyframes extraction",
+   "phase_3": "1 task: OCR",
+   "phase_4": "1 task: synthesis (delegate to Tier 3 worker)",
+   "phase_5": "1 task: verification",
+   "summary": "5 tasks per child. 12 children total = 60 tasks in campaign.",
+  },
+  "verification_criteria": [
+   "All 7 deliverable artifacts present (transcript.json, video.log, frames/, extraction_meta.json, ocr.md, report.md, summary.md)",
+   "report.md is 1000-10000 LOC markdown",
+   "summary.md is 200-400 words",
+   "All 8 report sections populated (TL;DR, Key Concepts, Frame Analysis, Transcript Highlights, Math/Theoretical Content, Connections, Open Questions, References)",
+   "Idempotency check passes",
+   "Per-task commits with git notes",
+  ],
+  "risk_register": [
+   {
+    "id": f"R5-{v['slug']}",
+    "title": "yt-dlp access failure (oEmbed returned 401 for E-cluster videos)",
+    "likelihood": "high" if v["needs_yt_dlp_verify"] else "low",
+    "scope_impact": "Phase 1 Acquire blocked if yt-dlp also fails",
+    "mitigation": "Phase 1 Step 0 verifies yt-dlp access before downloading. Fall back to manual transcript sourcing if yt-dlp fails.",
+   },
+  ],
+  "user_directives": [
+   "1000-10000 LOC markdown per video report (per user 2026-06-21)",
+   "Lossless preservation: transcripts (JSON), frames (raw images), OCR (plain text) must be preserved in machine-readable form",
+   "Cross-references: forward + backward to other children in the campaign",
+  ],
+ }, indent=2) + "\n"
+
+
+def state_template(v: dict) -> str:
+ return f"""# Track state for video_analysis_{v['slug']}_20260621
+# Updated by Tier 2 Tech Lead (during execution)
+
+[meta]
+track_id = "video_analysis_{v['slug']}_20260621"
+name = "{v['title']}"
+status = "active"
+current_phase = 1  # Phase 1 = Acquire (first execution phase)
+last_updated = "2026-06-21"
+
+[blocked_by]
+video_analysis_campaign_20260621 = "shipped"
+""" + (
+  "\n".join(f'{bid} = "shipped"' for bid in CLUSTER_BLOCKED_BY.get(v["cluster"], [])) + "\n" if CLUSTER_BLOCKED_BY.get(v["cluster"]) else ""
+) + f"""
+[blocks]
+# Depends-on: umbrella + cluster-blockers
+
+[phases]
+phase_1 = {{ status = "pending", checkpointsha = "", name = "Acquire (transcript + download)" }}
+phase_2 = {{ status = "pending", checkpointsha = "", name = "Keyframes extraction" }}
+phase_3 = {{ status = "pending", checkpointsha = "", name = "OCR" }}
+phase_4 = {{ status = "pending", checkpointsha = "", name = "Synthesis (Tier 3 worker)" }}
+phase_5 = {{ status = "pending", checkpointsha = "", name = "Verification" }}
+
+[tasks]
+t1_1 = {{ status = "pending", commit_sha = "", description = "Run extract_transcript.py + download_video.py. Commit artifacts atomically." }}
+t2_1 = {{ status = "pending", commit_sha = "", description = "Run extract_keyframes.py with threshold 0.4. Manual review of frames." }}
+t3_1 = {{ status = "pending", commit_sha = "", description = "Run ocr_frames.py. Spot-check OCR." }}
+t4_1 = {{ status = "pending", commit_sha = "", description = "Delegate report.md (1000-10000 LOC) + summary.md (200-400 words) to Tier 3 worker." }}
+t5_1 = {{ status = "pending", commit_sha = "", description = "Idempotency check + audit + end-of-track report." }}
+
+[verification]
+all_artifacts_present = false
+report_loc_target_met = false
+summary_word_count_met = false
+end_of_track_report_committed = false
+"""
+
+
+def synthesis_metadata() -> str:
+ return json.dumps({
+  "track_id": "video_analysis_synthesis_20260621",
+  "name": "Video Analysis Campaign Synthesis (cross-cutting)",
+  "created": "2026-06-21",
+  "status": "spec_approved",
+  "blocked_by": [f"video_analysis_{v['slug']}_20260621" for v in VIDEOS],
+  "blocks": [],
+  "priority": "A",
+  "type": "synthesis (cross-cutting report consuming all 12 children)",
+  "parent": "video_analysis_campaign_20260621",
+  "domain": "meta-tooling (research artifacts; no manual_slop src/ changes)",
+  "scope": {
+   "new_files": [
+    "per_video_summary.md (one paragraph 150-250 words per video)",
+    "report.md (6-section cross-cutting synthesis)",
+   ],
+   "modified_files": [],
+   "deleted_files": [],
+  },
+  "estimated_effort": {
+   "method": "scope (per conductor/workflow.md Tier 1 Track Initialization Rules). NO day estimates.",
+   "summary": "1 task: delegate synthesis to Tier 3 worker. Consumes all 12 children's report.md + summary.md.",
+  },
+  "verification_criteria": [
+   "per_video_summary.md has 12 paragraphs (one per child)",
+   "report.md has 6 sections: Theme Matrix, Cross-Video Concept Map, 5-10 Takeaways, Math Prereq Graph, Open Research Questions, Next-Watch List",
+   "All 12 child tracks shipped (each with their report.md + summary.md)",
+  ],
+  "user_directives": [
+   "1000-5000 LOC synthesis report (less than per-video because heavy lifting is in children)",
+   "Lossless preservation directive applies here too — DO NOT over-summarize; Pass 2 will compress",
+  ],
+ }, indent=2) + "\n"
+
+
+def synthesis_state() -> str:
+ return """# Track state for video_analysis_synthesis_20260621
+
+[meta]
+track_id = "video_analysis_synthesis_20260621"
+name = "Video Analysis Campaign Synthesis"
+status = "active"
+current_phase = 1
+last_updated = "2026-06-21"
+
+[blocked_by]
+""" + "\n".join(f'video_analysis_{v["slug"]}_20260621 = "shipped"' for v in VIDEOS) + """
+
+[blocks]
+
+[phases]
+phase_1 = { status = "pending", checkpointsha = "", name = "Verify all 12 children shipped" }
+phase_2 = { status = "pending", checkpointsha = "", name = "Delegate synthesis to Tier 3 worker" }
+phase_3 = { status = "pending", checkpointsha = "", name = "Human review + iterate" }
+phase_4 = { status = "pending", checkpointsha = "", name = "End-of-track report" }
+
+[tasks]
+t1_1 = { status = "pending", commit_sha = "", description = "Verify all 12 children have report.md + summary.md" }
+t2_1 = { status = "pending", commit_sha = "", description = "Delegate synthesis (per_video_summary.md + report.md) to Tier 3 worker" }
+t3_1 = { status = "pending", commit_sha = "", description = "Human review + iterate" }
+t4_1 = { status = "pending", commit_sha = "", description = "Write end-of-track report" }
+"""
+
+
+def main() -> None:
+ for v in VIDEOS:
+  folder = TRACKS_DIR / f"video_analysis_{v['slug']}_20260621"
+  plan_path = folder / "plan.md"
+  meta_path = folder / "metadata.json"
+  state_path = folder / "state.toml"
+  plan_path.write_text(plan_template(v), encoding="utf-8")
+  meta_path.write_text(metadata_template(v), encoding="utf-8")
+  state_path.write_text(state_template(v), encoding="utf-8")
+  print(f"Wrote: {plan_path}, {meta_path}, {state_path}")
+
+ synth_folder = TRACKS_DIR / "video_analysis_synthesis_20260621"
+ synth_folder.mkdir(parents=True, exist_ok=True)
+ (synth_folder / "metadata.json").write_text(synthesis_metadata(), encoding="utf-8")
+ (synth_folder / "state.toml").write_text(synthesis_state(), encoding="utf-8")
+ print(f"Wrote synthesis: metadata.json + state.toml")
+
+
+if __name__ == "__main__":
+ main()