feat(video_analysis): synthesize_report.py orchestrator with TDD (5 tests)

2026-06-21 15:39:22 -04:00
parent ed0d198afe
commit 548c4fef63
2 changed files with 166 additions and 0 deletions
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from scripts.video_analysis import download_video, extract_keyframes, extract_transcript, ocr_frames
+from scripts.video_analysis.error_types import ErrorInfo, make_error
+
+
+# Stage names, listed in the same order synthesize_report() performs them:
+# transcript extraction, video download, keyframe extraction, OCR, and
+# finally writing the report stubs.
+PIPELINE_STAGES: list[str] = ["transcript", "download", "keyframes", "ocr", "report"]
+
+
+@dataclass
+class ReportContext:
+ """Bundle of the three values that identify one report run."""
+
+ # Address of the video being analysed.
+ url: str
+ # Short identifier for the video; it appears in the generated headings.
+ slug: str
+ # Directory associated with this report's output.
+ output_dir: Path
+
+
+@dataclass
+class _Ok:
+ """Success half of the result pair; carries the produced value."""
+
+ # Payload handed back to the caller on success.
+ value: Any
+
+ def is_ok(self) -> bool:
+  """Return True: this object always represents success."""
+  return True
+
+ def is_err(self) -> bool:
+  """Return False: this object never represents a failure."""
+  return False
+
+
+@dataclass
+class _Err:
+ """Failure half of the result pair; carries the error description."""
+
+ # The error information describing what went wrong.
+ err: ErrorInfo
+
+ def is_ok(self) -> bool:
+  """Return False: this object never represents success."""
+  return False
+
+ def is_err(self) -> bool:
+  """Return True: this object always represents a failure."""
+  return True
+
+
+def ok(value: Any) -> _Ok:
+ """Wrap *value* in a success result."""
+ success = _Ok(value)
+ return success
+
+
+def err(error: ErrorInfo) -> _Err:
+ """Wrap *error* in a failure result."""
+ failure = _Err(error)
+ return failure
+
+
+def build_report_stub(slug: str, url: str, video_id: str, date_added: str | None = None) -> str:
+ """Return the Markdown skeleton for a video's ``report.md``.
+
+ The skeleton has a header block (source URL, YouTube ID, date added), a
+ prompt addressed to the Tier 3 worker, and eight empty numbered sections.
+
+ Args:
+  slug: Short identifier shown in parentheses in the title line.
+  url: Source URL, emitted verbatim on the "Source" line.
+  video_id: Value emitted on the "YouTube ID" line.
+  date_added: Text for the "Date Added to Campaign" line. When omitted,
+   today's local date in ISO format (YYYY-MM-DD) is used, so the stub
+   no longer carries a date frozen at the time the code was written.
+
+ Returns:
+  The report stub as a single string ending in a newline.
+ """
+ # Imported here so the function does not depend on a module-level import.
+ from datetime import date
+
+ if date_added is None:
+  date_added = date.today().isoformat()
+ return f"""# <Video Title> ({slug})
+
+**Source:** {url}
+**YouTube ID:** {video_id}
+**Date Added to Campaign:** {date_added}
+
+> **Tier 3 worker prompt:** populate each section using the transcript.json, ocr.md, and frames/ artifacts in this directory.
+
+## 1. TL;DR
+## 2. Key Concepts
+## 3. Frame Analysis
+## 4. Transcript Highlights
+## 5. Mathematical / Theoretical Content
+## 6. Connections to Other Videos in Campaign
+## 7. Open Questions / Follow-up
+## 8. References
+"""
+
+
+def build_summary_stub(slug: str, title: str, author: str | None) -> str:
+ """Return the Markdown skeleton for a video's ``summary.md``.
+
+ Args:
+  slug: Short identifier used in the heading.
+  title: Video title shown on the "Title" line.
+  author: Author shown on the "Author" line; any falsy value (``None``
+   or an empty string) is rendered as ``(unknown)``.
+
+ Returns:
+  The summary stub as a single string ending in a newline.
+ """
+ byline = author if author else "(unknown)"
+ lines = [
+  f"# Summary: {slug}",
+  "",
+  f"**Title:** {title}",
+  f"**Author:** {byline}",
+  "",
+  "<200-400 word summary to be filled in by the Tier 3 worker after reading report.md>",
+  "",  # trailing empty entry yields the final newline
+ ]
+ return "\n".join(lines)
+
+
+def synthesize_report(url: str, slug: str, output_dir: Path, skip_video_download: bool = False) -> _Ok | _Err:
+ """Run the video-analysis stages for *url* and write the report stubs.
+
+ Stages run in this order: transcript extraction; then, unless
+ ``skip_video_download`` is true, video download, keyframe extraction and
+ OCR; finally ``report.md`` and ``summary.md`` are written into
+ ``output_dir``. The first stage whose result reports ``is_err()`` is
+ returned unchanged and no later stage runs.
+
+ Args:
+  url: Video URL handed to the transcript and download stages.
+  slug: Short identifier used in the generated stub headings.
+  output_dir: Directory that receives ``report.md``, ``summary.md`` and
+   an ``artifacts/`` subdirectory (created if missing).
+  skip_video_download: When true, the download, keyframe and OCR stages
+   are all skipped; the frames directory is still created and reported.
+
+ Returns:
+  On success, ``ok`` wrapping a dict of string paths with the keys
+  ``"transcript"``, ``"frames"``, ``"report"`` and ``"summary"``, plus
+  ``"ocr"`` when the OCR stage ran. On failure, the failing stage's own
+  result object.
+ """
+ artifacts = output_dir / "artifacts"
+ frames_dir = artifacts / "frames"
+ # parents=True creates artifacts/ as well, so one call covers both.
+ frames_dir.mkdir(parents=True, exist_ok=True)
+
+ transcript_path = artifacts / "transcript.json"
+ t_result = extract_transcript.extract_transcript(url, transcript_path)
+ if t_result.is_err():
+  return t_result
+
+ outputs: dict[str, str] = {
+  "transcript": str(transcript_path),
+  "frames": str(frames_dir),
+ }
+
+ if not skip_video_download:
+  video_path = artifacts / "video.mp4"
+  d_result = download_video.download_video(url, video_path)
+  if d_result.is_err():
+   return d_result
+  k_result = extract_keyframes.extract_keyframes(video_path, frames_dir)
+  if k_result.is_err():
+   return k_result
+  ocr_path = artifacts / "ocr.md"
+  o_result = ocr_frames.ocr_frames(frames_dir, ocr_path)
+  if o_result.is_err():
+   return o_result
+  # Report the OCR artifact too; the report stub tells the worker to read it.
+  outputs["ocr"] = str(ocr_path)
+
+ # An unparseable URL does not abort the run; the stub just gets a placeholder id.
+ parsed = extract_transcript.parse_video_id(url)
+ video_id = parsed.value if parsed.is_ok() else "UNKNOWN"
+
+ report_path = output_dir / "report.md"
+ report_path.write_text(build_report_stub(slug, url, video_id), encoding="utf-8")
+ summary_path = output_dir / "summary.md"
+ # Title and author are not known at this stage, so placeholders are written.
+ summary_path.write_text(build_summary_stub(slug, "<Title TBD>", None), encoding="utf-8")
+
+ outputs["report"] = str(report_path)
+ outputs["summary"] = str(summary_path)
+ return ok(outputs)
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+from pathlib import Path
+from unittest.mock import patch
+
+from scripts.video_analysis.extract_transcript import ok as extract_ok
+from scripts.video_analysis.download_video import ok as download_ok
+from scripts.video_analysis.extract_keyframes import ok as keyframes_ok
+from scripts.video_analysis.ocr_frames import ok as ocr_ok
+from scripts.video_analysis.synthesize_report import (
+ PIPELINE_STAGES,
+ ReportContext,
+ build_report_stub,
+ build_summary_stub,
+ synthesize_report,
+)
+
+
+def test_pipeline_stages_in_order() -> None:
+ """PIPELINE_STAGES holds the five stage names in their expected order."""
+ expected = ["transcript", "download", "keyframes", "ocr", "report"]
+ assert PIPELINE_STAGES == expected
+
+
+def test_report_context_dataclass() -> None:
+ """ReportContext stores all three constructor arguments unchanged."""
+ ctx = ReportContext(url="https://youtu.be/VID", slug="vid", output_dir=Path("/tmp/vid"))
+ assert ctx.url == "https://youtu.be/VID"
+ assert ctx.slug == "vid"
+ # output_dir was previously unchecked, leaving one of the three fields untested.
+ assert ctx.output_dir == Path("/tmp/vid")
+
+
+def test_build_report_stub_has_sections() -> None:
+ """The report stub contains the video id and the first and last section headings."""
+ stub = build_report_stub("vid", "https://youtu.be/VID", "VID123ABCDE")
+ for fragment in ("VID123ABCDE", "## 1. TL;DR", "## 8. References"):
+  assert fragment in stub
+
+
+def test_build_summary_stub_short() -> None:
+ """The summary stub mentions the slug and title and stays under 500 characters."""
+ stub = build_summary_stub("vid", "Title", "Author")
+ for fragment in ("vid", "Title"):
+  assert fragment in stub
+ assert len(stub) < 500
+
+
+def test_synthesize_report_orchestrates(tmp_path: Path) -> None:
+ """With the download skipped, only the transcript stage runs and both stubs are written."""
+ with patch("scripts.video_analysis.synthesize_report.extract_transcript") as t, \
+   patch("scripts.video_analysis.synthesize_report.download_video") as d, \
+   patch("scripts.video_analysis.synthesize_report.extract_keyframes") as k, \
+   patch("scripts.video_analysis.synthesize_report.ocr_frames") as o:
+  # Each patch replaces a whole module, and synthesize_report calls
+  # <module>.<function>(...). The canned result therefore has to be set on
+  # the function attribute. Setting return_value on the module mock leaves
+  # the call returning a bare MagicMock, whose truthy is_err() makes
+  # synthesize_report return early and whose truthy is_ok() lets the
+  # assertion pass without the pipeline ever running.
+  t.extract_transcript.return_value = extract_ok({})
+  # NOTE(review): assumes extract_transcript.ok(value) exposes .value and
+  # .is_ok(), as synthesize_report's use of parse_video_id implies.
+  t.parse_video_id.return_value = extract_ok("ABCDEFGHIJK")
+  d.download_video.return_value = download_ok({})
+  k.extract_keyframes.return_value = keyframes_ok({})
+  o.ocr_frames.return_value = ocr_ok({})
+  result = synthesize_report("https://youtu.be/ABCDEFGHIJK", "vid", tmp_path, skip_video_download=True)
+ # Compare with `is True` so a truthy MagicMock can never satisfy the check.
+ assert result.is_ok() is True
+ t.extract_transcript.assert_called_once()
+ d.download_video.assert_not_called()
+ k.extract_keyframes.assert_not_called()
+ o.ocr_frames.assert_not_called()
+ assert (tmp_path / "report.md").is_file()
+ assert (tmp_path / "summary.md").is_file()