Private
Public Access
0
0

docs(reports): TRACK_COMPLETION for video_analysis_campaign_20260621 (Phase 0+1+2 init only)

This commit is contained in:
2026-06-21 15:44:06 -04:00
parent 365fa554d9
commit ebadfda9d6
2 changed files with 460 additions and 0 deletions
@@ -0,0 +1,313 @@
"""One-time scaffold generator for video_analysis_campaign_20260621 child + synthesis tracks.
Writes plan.md + metadata.json + state.toml for each child track in the hard-coded VIDEOS list
(the umbrella's README.md is not read), plus metadata.json + state.toml (no plan.md) for the synthesis track.
Per Tier 2 sandbox convention (conductor/workflow.md "Throw-away scripts"), this lives in
scripts/tier2/artifacts/video_analysis_campaign_20260621/ and is NOT shipped to production.
"""
from __future__ import annotations
import json
import re
from pathlib import Path
# Fifth ancestor directory of this script's resolved path.
ROOT = Path(__file__).resolve().parents[4]
# Directory under which every track folder is written.
TRACKS_DIR = ROOT.joinpath("conductor", "tracks")
# Folder of the umbrella campaign track (same location as before: TRACKS_DIR/<umbrella id>).
UMBRELLA = TRACKS_DIR / "video_analysis_campaign_20260621"
# One row per child video, in campaign order:
# (slug, cluster, title, youtube_id, author, needs_yt_dlp_verify)
_VIDEO_ROWS = (
    ("cs229_building_llms", "E", "Stanford CS229 - Building Large Language Models (LLMs)", "9vM4p9NN0Ts", "Stanford CS229", True),
    ("probability_logic", "A", "Probability Theory is an Extension of Logic", "0yF9TvMeAzM", None, False),
    ("entropy_epiplexity", "A", "From Entropy to Epiplexity", "_U8AwUq_aJQ", "Andrew Wilson and Marc Finzi", False),
    ("score_dynamics_giorgini", "A", "Learning Dynamics from Statistics: a score-based approach", "P75iVMmbqQk", "Ludovico Giorgini", False),
    ("platonic_intelligence_kumar", "B", "Towards a Platonic Intelligence with Unified Factored Representations", "1mXUFweWOug", "Akarsh Kumar", False),
    ("free_lunches_levin", "B", "Free Lunches: Model Systems for Studying the Agential Gifts from the Platonic Space", "K8BmMU1Tm-I", "Michael Levin", False),
    ("generic_systems_fields", "C", "Interesting Behavior by Generic Systems", "QeMajYvhEbI", "Chris Fields", False),
    ("brain_counterintuitive", "C", "The Most Counterintuitive Way to Build a Brain", "cDxtFtoQVNc", None, False),
    ("neural_dynamics_miller", "C", "Cognition Emerges from Neural Dynamics", "0BS-BzEFTXA", "Earl Miller", False),
    ("multiscale_hoffman", "C", "A Multiscale Logic of Collective Intelligence", "YnfaT5APPB0", "Donald Hoffman and Chetan Prakash", False),
    ("cs336_architectures", "E", "Stanford CS336 Lecture 3: Architectures", "lVynu4bo1rY", "Stanford CS336 Spring 2026", True),
    ("creikey_dl_cv", "D", "Creikey - Deep Learning and Computer Vision for Game Developers (BSC 2025)", "yxkUvXs-hoQ", "Creikey", False),
)
# Expanded video records consumed by the template functions. "order" is the
# 1-based row position and "url" is the youtu.be short link for the id.
VIDEOS = [
    {
        "order": position,
        "slug": slug,
        "cluster": cluster,
        "title": title,
        "youtube_id": youtube_id,
        "author": author,
        "url": f"https://youtu.be/{youtube_id}",
        "needs_yt_dlp_verify": needs_verify,
    }
    for position, (slug, cluster, title, youtube_id, author, needs_verify) in enumerate(_VIDEO_ROWS, start=1)
]
# Description printed next to a video's cluster letter in its plan.md.
CLUSTER_LEGEND = dict(
    A="Math & information-theoretic foundations",
    B="Platonic / geometric AI representations",
    C="Biological / cognitive / generic systems",
    D="Applied / practical",
    E="Stanford course VODs >1hr",
)
# Slug of the single child track that gates every child of a given cluster.
_CLUSTER_GATE_SLUG = {
    "A": "cs229_building_llms",
    "B": "score_dynamics_giorgini",
    "C": "free_lunches_levin",
    "D": "cs336_architectures",
}
# Cluster letter -> track ids added to the blockers of each child in that cluster
# (on top of the umbrella track, which the templates add themselves).
CLUSTER_BLOCKED_BY = {
    cluster: [f"video_analysis_{gate_slug}_20260621"]
    for cluster, gate_slug in _CLUSTER_GATE_SLUG.items()
}
# Cluster E has no cluster-level blockers.
CLUSTER_BLOCKED_BY["E"] = []
def plan_template(v: dict) -> str:
    """Render the plan.md markdown for one child video track.

    Args:
        v: A video record. The keys read are ``slug``, ``title``, ``order``,
            ``url``, ``youtube_id``, ``cluster``, ``author`` and
            ``needs_yt_dlp_verify``.

    Returns:
        The full plan text, ending with a newline.
    """
    slug = v["slug"]
    link = v["url"]
    cluster_note = CLUSTER_LEGEND.get(v["cluster"], "")
    credited_to = v["author"] or "(unknown)"
    # Flagged videos get an extra "Step 0" inserted ahead of Phase 1 Step 1.
    if v["needs_yt_dlp_verify"]:
        access_check = (
            "\n- [ ] **Step 0: yt-dlp access verification (R5).** "
            f"Run `uv run yt-dlp --simulate {link}` to confirm yt-dlp can fetch metadata. "
            "If it fails (HTTP 401/403), fall back to manual transcript sourcing or escalate per umbrella spec §13 R5.\n"
        )
    else:
        access_check = ""
    return f"""# Plan: video_analysis_{slug}_20260621
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox syntax for tracking.
**Goal:** Execute the 5-phase pipeline (Acquire → Keyframes → OCR → Synthesis → Verification) for *{v['title']}* and ship `report.md` (1000-10000 LOC) + `summary.md` (200-400 words).
**Parent:** This is child #{v['order']} of the [video_analysis_campaign_20260621](../../video_analysis_campaign_20260621/) umbrella.
**Source:** {link} (YouTube ID `{v['youtube_id']}`)
**Cluster:** {v['cluster']} ({cluster_note})
**Author:** {credited_to}
---
## Phase 1: Acquire
{access_check}- [ ] **Step 1: Run extract_transcript.py**
- `uv run python scripts/video_analysis/extract_transcript.py {link} artifacts/transcript.json`
- Commit `artifacts/transcript.json` atomically.
- [ ] **Step 2: Run download_video.py**
- `uv run python scripts/video_analysis/download_video.py {link} artifacts/video.mp4`
- Commit `artifacts/video.mp4` (gitignored) + `artifacts/video.log` atomically.
## Phase 2: Keyframes
- [ ] **Step 1: Run extract_keyframes.py**
- `uv run python scripts/video_analysis/extract_keyframes.py artifacts/video.mp4 artifacts/frames --threshold 0.4`
- Commit `artifacts/frames/*.jpg` + `artifacts/extraction_meta.json` atomically.
- [ ] **Step 2: Manual review** — flag any frames that look wrong.
## Phase 3: OCR
- [ ] **Step 1: Run ocr_frames.py**
- `uv run python scripts/video_analysis/ocr_frames.py artifacts/frames artifacts/ocr.md --backend winsdk`
- Commit `artifacts/ocr.md` atomically.
- [ ] **Step 2: Spot-check OCR quality.**
## Phase 4: Synthesis (DELEGATE TO TIER 3 WORKER)
- [ ] **Step 1: Delegate report writing**
- Inputs: `artifacts/transcript.json` + `artifacts/ocr.md` + `artifacts/frames/*.jpg`
- Output: `report.md` (1000-10000 LOC) + `summary.md` (200-400 words)
- 8-section structure per umbrella spec §FR6
- Cross-references to other children (forward + backward)
- [ ] **Step 2: Human review + iterate**
## Phase 5: Verification
- [ ] **Step 1: Idempotency check** — re-run scripts, confirm outputs match modulo timestamps
- [ ] **Step 2: Audit checklist** — every section of `report.md` populated, no "TBD"
- [ ] **Step 3: Write end-of-track report** at `docs/reports/TRACK_COMPLETION_video_analysis_{slug}_20260621.md`
- [ ] **Step 4: Update state.toml** to `status = "completed"`
## Self-review
- [ ] `report.md` is 1000-10000 LOC markdown
- [ ] `summary.md` is 200-400 words
- [ ] All 7 deliverable artifacts present
- [ ] All 8 report sections populated
- [ ] Per-task commits with git notes
"""
def metadata_template(v: dict) -> str:
    """Render the metadata.json text for one child video track.

    Args:
        v: A video record. The keys read are ``slug``, ``title``, ``cluster``,
            ``youtube_id``, ``url``, ``author`` and ``needs_yt_dlp_verify``.

    Returns:
        A JSON document indented by two spaces and terminated by a newline.
    """
    # Every child is blocked by the umbrella track, then by its cluster's blockers.
    blockers = ["video_analysis_campaign_20260621", *CLUSTER_BLOCKED_BY.get(v["cluster"], [])]
    scope = {
        "new_files": [
            "artifacts/transcript.json",
            "artifacts/ocr.md",
            "artifacts/frames/*.jpg",
            "artifacts/extraction_meta.json",
            "artifacts/video.mp4 (gitignored)",
            "artifacts/video.log",
            "report.md (1000-10000 LOC target)",
            "summary.md (200-400 words)",
        ],
        "modified_files": [],
        "deleted_files": [],
    }
    effort = {
        "method": "scope (per conductor/workflow.md Tier 1 Track Initialization Rules). NO day estimates.",
        "phase_1": "1 task: acquire (transcript + download)",
        "phase_2": "1 task: keyframes extraction",
        "phase_3": "1 task: OCR",
        "phase_4": "1 task: synthesis (delegate to Tier 3 worker)",
        "phase_5": "1 task: verification",
        "summary": "5 tasks per child. 12 children total = 60 tasks in campaign.",
    }
    yt_dlp_risk = {
        "id": f"R5-{v['slug']}",
        "title": "yt-dlp access failure (oEmbed returned 401 for E-cluster videos)",
        # Only videos flagged for yt-dlp verification are rated "high".
        "likelihood": "high" if v["needs_yt_dlp_verify"] else "low",
        "scope_impact": "Phase 1 Acquire blocked if yt-dlp also fails",
        "mitigation": "Phase 1 Step 0 verifies yt-dlp access before downloading. Fall back to manual transcript sourcing if yt-dlp fails.",
    }
    record = {
        "track_id": f"video_analysis_{v['slug']}_20260621",
        "name": v["title"],
        "created": "2026-06-21",
        "status": "spec_approved",
        "blocked_by": blockers,
        "blocks": [],
        "priority": "A",
        "type": "per-child research track (Pass 1 of 3)",
        "parent": "video_analysis_campaign_20260621",
        "domain": "meta-tooling (research artifacts; no manual_slop src/ changes)",
        "cluster": v["cluster"],
        "youtube_id": v["youtube_id"],
        "youtube_url": v["url"],
        "author": v["author"],
        "scope": scope,
        "estimated_effort": effort,
        "verification_criteria": [
            "All 7 deliverable artifacts present (transcript.json, video.log, frames/, extraction_meta.json, ocr.md, report.md, summary.md)",
            "report.md is 1000-10000 LOC markdown",
            "summary.md is 200-400 words",
            "All 8 report sections populated (TL;DR, Key Concepts, Frame Analysis, Transcript Highlights, Math/Theoretical Content, Connections, Open Questions, References)",
            "Idempotency check passes",
            "Per-task commits with git notes",
        ],
        "risk_register": [yt_dlp_risk],
        "user_directives": [
            "1000-10000 LOC markdown per video report (per user 2026-06-21)",
            "Lossless preservation: transcripts (JSON), frames (raw images), OCR (plain text) must be preserved in machine-readable form",
            "Cross-references: forward + backward to other children in the campaign",
        ],
    }
    # ensure_ascii=False keeps any non-ASCII title/author readable instead of
    # emitting \uXXXX escapes; for all-ASCII input the output is unchanged.
    return json.dumps(record, indent=2, ensure_ascii=False) + "\n"
def state_template(v: dict) -> str:
    """Render the state.toml text for one child video track.

    Args:
        v: A video record. The keys read are ``slug``, ``title`` and ``cluster``.

    Returns:
        The TOML text, ending with a newline.
    """
    track_id = f"video_analysis_{v['slug']}_20260621"
    # json.dumps emits a double-quoted string with embedded quotes and
    # backslashes escaped, so a title containing either no longer breaks the
    # TOML. For titles without such characters the output is the same as the
    # plain "..." interpolation used before.
    quoted_title = json.dumps(v["title"], ensure_ascii=False)
    # One `<id> = "shipped"` line per cluster-level blocker; empty when there are none.
    cluster_gates = "".join(
        f'{blocker} = "shipped"\n' for blocker in CLUSTER_BLOCKED_BY.get(v["cluster"], [])
    )
    return f"""# Track state for {track_id}
# Updated by Tier 2 Tech Lead (during execution)
[meta]
track_id = "{track_id}"
name = {quoted_title}
status = "active"
current_phase = 1 # Phase 1 = Acquire (first execution phase)
last_updated = "2026-06-21"
[blocked_by]
video_analysis_campaign_20260621 = "shipped"
{cluster_gates}
[blocks]
# Depends-on: umbrella + cluster-blockers
[phases]
phase_1 = {{ status = "pending", checkpointsha = "", name = "Acquire (transcript + download)" }}
phase_2 = {{ status = "pending", checkpointsha = "", name = "Keyframes extraction" }}
phase_3 = {{ status = "pending", checkpointsha = "", name = "OCR" }}
phase_4 = {{ status = "pending", checkpointsha = "", name = "Synthesis (Tier 3 worker)" }}
phase_5 = {{ status = "pending", checkpointsha = "", name = "Verification" }}
[tasks]
t1_1 = {{ status = "pending", commit_sha = "", description = "Run extract_transcript.py + download_video.py. Commit artifacts atomically." }}
t2_1 = {{ status = "pending", commit_sha = "", description = "Run extract_keyframes.py with threshold 0.4. Manual review of frames." }}
t3_1 = {{ status = "pending", commit_sha = "", description = "Run ocr_frames.py. Spot-check OCR." }}
t4_1 = {{ status = "pending", commit_sha = "", description = "Delegate report.md (1000-10000 LOC) + summary.md (200-400 words) to Tier 3 worker." }}
t5_1 = {{ status = "pending", commit_sha = "", description = "Idempotency check + audit + end-of-track report." }}
[verification]
all_artifacts_present = false
report_loc_target_met = false
summary_word_count_met = false
end_of_track_report_committed = false
"""
def synthesis_metadata() -> str:
    """Render the metadata.json text for the cross-cutting synthesis track.

    The synthesis track is listed as blocked by every child track in VIDEOS.

    Returns:
        A JSON document indented by two spaces and terminated by a newline.
    """
    child_track_ids = [f"video_analysis_{video['slug']}_20260621" for video in VIDEOS]
    record = {
        "track_id": "video_analysis_synthesis_20260621",
        "name": "Video Analysis Campaign Synthesis (cross-cutting)",
        "created": "2026-06-21",
        "status": "spec_approved",
        "blocked_by": child_track_ids,
        "blocks": [],
        "priority": "A",
        "type": "synthesis (cross-cutting report consuming all 12 children)",
        "parent": "video_analysis_campaign_20260621",
        "domain": "meta-tooling (research artifacts; no manual_slop src/ changes)",
        "scope": {
            "new_files": [
                "per_video_summary.md (one paragraph 150-250 words per video)",
                "report.md (6-section cross-cutting synthesis)",
            ],
            "modified_files": [],
            "deleted_files": [],
        },
        "estimated_effort": {
            "method": "scope (per conductor/workflow.md Tier 1 Track Initialization Rules). NO day estimates.",
            "summary": "1 task: delegate synthesis to Tier 3 worker. Consumes all 12 children's report.md + summary.md.",
        },
        "verification_criteria": [
            "per_video_summary.md has 12 paragraphs (one per child)",
            "report.md has 6 sections: Theme Matrix, Cross-Video Concept Map, 5-10 Takeaways, Math Prereq Graph, Open Research Questions, Next-Watch List",
            "All 12 child tracks shipped (each with their report.md + summary.md)",
        ],
        "user_directives": [
            "1000-5000 LOC synthesis report (less than per-video because heavy lifting is in children)",
            "Lossless preservation directive applies here too — DO NOT over-summarize; Pass 2 will compress",
        ],
    }
    # ensure_ascii=False writes the em dash above as a literal character rather
    # than the \u2014 escape that json.dumps produces by default.
    return json.dumps(record, indent=2, ensure_ascii=False) + "\n"
def synthesis_state() -> str:
    """Render the state.toml text for the synthesis track.

    The [blocked_by] table receives one `<track id> = "shipped"` line per
    entry in VIDEOS.

    Returns:
        The TOML text, ending with a newline.
    """
    child_gates = "\n".join(
        f'video_analysis_{video["slug"]}_20260621 = "shipped"' for video in VIDEOS
    )
    return f"""# Track state for video_analysis_synthesis_20260621
[meta]
track_id = "video_analysis_synthesis_20260621"
name = "Video Analysis Campaign Synthesis"
status = "active"
current_phase = 1
last_updated = "2026-06-21"
[blocked_by]
{child_gates}
[blocks]
[phases]
phase_1 = {{ status = "pending", checkpointsha = "", name = "Verify all 12 children shipped" }}
phase_2 = {{ status = "pending", checkpointsha = "", name = "Delegate synthesis to Tier 3 worker" }}
phase_3 = {{ status = "pending", checkpointsha = "", name = "Human review + iterate" }}
phase_4 = {{ status = "pending", checkpointsha = "", name = "End-of-track report" }}
[tasks]
t1_1 = {{ status = "pending", commit_sha = "", description = "Verify all 12 children have report.md + summary.md" }}
t2_1 = {{ status = "pending", commit_sha = "", description = "Delegate synthesis (per_video_summary.md + report.md) to Tier 3 worker" }}
t3_1 = {{ status = "pending", commit_sha = "", description = "Human review + iterate" }}
t4_1 = {{ status = "pending", commit_sha = "", description = "Write end-of-track report" }}
"""
def main() -> None:
    """Write the scaffold files for every child track and the synthesis track.

    For each entry in VIDEOS this writes plan.md, metadata.json and state.toml
    into TRACKS_DIR/video_analysis_<slug>_20260621/. The synthesis track gets
    metadata.json and state.toml only. Existing files are overwritten.
    """
    for v in VIDEOS:
        folder = TRACKS_DIR / f"video_analysis_{v['slug']}_20260621"
        # Create the child folder if needed, as is already done for the
        # synthesis folder; write_text raises FileNotFoundError when the
        # parent directory is missing.
        folder.mkdir(parents=True, exist_ok=True)
        plan_path = folder / "plan.md"
        meta_path = folder / "metadata.json"
        state_path = folder / "state.toml"
        plan_path.write_text(plan_template(v), encoding="utf-8")
        meta_path.write_text(metadata_template(v), encoding="utf-8")
        state_path.write_text(state_template(v), encoding="utf-8")
        print(f"Wrote: {plan_path}, {meta_path}, {state_path}")

    synth_folder = TRACKS_DIR / "video_analysis_synthesis_20260621"
    synth_folder.mkdir(parents=True, exist_ok=True)
    (synth_folder / "metadata.json").write_text(synthesis_metadata(), encoding="utf-8")
    (synth_folder / "state.toml").write_text(synthesis_state(), encoding="utf-8")
    print("Wrote synthesis: metadata.json + state.toml")


if __name__ == "__main__":
    main()