Private
Public Access
0
0

feat(video_analysis): synthesize_report.py orchestrator with TDD (5 tests)

This commit is contained in:
2026-06-21 15:39:22 -04:00
parent ed0d198afe
commit 548c4fef63
2 changed files with 166 additions and 0 deletions
+113
View File
@@ -0,0 +1,113 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from scripts.video_analysis import download_video, extract_keyframes, extract_transcript, ocr_frames
from scripts.video_analysis.error_types import ErrorInfo, make_error
# Stage names in the order synthesize_report performs them.
PIPELINE_STAGES: list[str] = [
    "transcript",
    "download",
    "keyframes",
    "ocr",
    "report",
]
@dataclass
class ReportContext:
    """Bundle the three values that identify one report run.

    The fields mirror the leading arguments of ``synthesize_report``:
    the source URL, the slug, and the directory that receives output.
    """

    # URL of the source video.
    url: str
    # Slug used to label the video.
    slug: str
    # Directory the report files are written under.
    output_dir: Path
@dataclass
class _Ok:
value: Any
def is_ok(self) -> bool:
return True
def is_err(self) -> bool:
return False
@dataclass
class _Err:
err: ErrorInfo
def is_ok(self) -> bool:
return False
def is_err(self) -> bool:
return True
def ok(value: Any) -> _Ok:
    """Wrap ``value`` in a success result."""
    success = _Ok(value=value)
    return success
def err(error: ErrorInfo) -> _Err:
    """Wrap ``error`` in a failure result."""
    failure = _Err(err=error)
    return failure
def build_report_stub(slug: str, url: str, video_id: str, date_added: str | None = None) -> str:
    """Return the Markdown skeleton for a video's ``report.md``.

    The skeleton holds a header (title placeholder, source URL, YouTube ID,
    date added), a prompt line for the Tier 3 worker, and eight empty
    numbered sections.

    Args:
        slug: Label shown in parentheses after the title placeholder.
        url: Source URL written on the ``**Source:**`` line.
        video_id: Value written on the ``**YouTube ID:**`` line.
        date_added: Text for the ``**Date Added to Campaign:**`` line.
            When omitted, today's local date in ISO ``YYYY-MM-DD`` form is
            used, so the stub is no longer frozen to one calendar day.

    Returns:
        The stub as a single string ending in a newline.
    """
    if date_added is None:
        # Local import keeps this block self-contained; no file-level import needed.
        from datetime import date

        date_added = date.today().isoformat()

    lines = (
        f"# <Video Title> ({slug})",
        f"**Source:** {url}",
        f"**YouTube ID:** {video_id}",
        f"**Date Added to Campaign:** {date_added}",
        "> **Tier 3 worker prompt:** populate each section using the transcript.json, ocr.md, and frames/ artifacts in this directory.",
        "## 1. TL;DR",
        "## 2. Key Concepts",
        "## 3. Frame Analysis",
        "## 4. Transcript Highlights",
        "## 5. Mathematical / Theoretical Content",
        "## 6. Connections to Other Videos in Campaign",
        "## 7. Open Questions / Follow-up",
        "## 8. References",
    )
    return "\n".join(lines) + "\n"
def build_summary_stub(slug: str, title: str, author: str | None) -> str:
    """Return the Markdown skeleton for a video's ``summary.md``.

    Args:
        slug: Label written on the ``# Summary:`` heading.
        title: Text for the ``**Title:**`` line.
        author: Text for the ``**Author:**`` line; any falsy value
            (``None`` or an empty string) is rendered as ``(unknown)``.

    Returns:
        The stub as a single string ending in a newline.
    """
    byline = author if author else "(unknown)"
    rows = (
        f"# Summary: {slug}",
        f"**Title:** {title}",
        f"**Author:** {byline}",
        "<200-400 word summary to be filled in by the Tier 3 worker after reading report.md>",
    )
    return "\n".join(rows) + "\n"
def synthesize_report(url: str, slug: str, output_dir: Path, skip_video_download: bool = False) -> _Ok | _Err:
    """Run the video-analysis stages for ``url`` and write stub report files.

    Stages run in this order: transcript extraction, video download
    (optional), keyframe extraction, OCR, then the ``report.md`` and
    ``summary.md`` stubs. The first stage whose result reports ``is_err()``
    ends the run, and that stage's result object is returned unchanged.

    Args:
        url: Source video URL handed to the transcript and download stages.
        slug: Label interpolated into the report and summary stubs.
        output_dir: Root directory; intermediate files go under
            ``output_dir/artifacts`` and the stubs directly in ``output_dir``.
        skip_video_download: When true, the download stage is not invoked.
            Keyframe extraction still runs against ``artifacts/video.mp4``.
            NOTE(review): presumably the file is expected to exist from an
            earlier run in that case; confirm with the caller.

    Returns:
        On success, an ok result whose value maps ``"transcript"``,
        ``"frames"``, ``"ocr"``, ``"report"`` and ``"summary"`` to path
        strings. On failure, the failing stage's own error result.
    """
    artifacts = output_dir / "artifacts"
    frames_dir = artifacts / "frames"
    transcript_path = artifacts / "transcript.json"
    video_path = artifacts / "video.mp4"
    ocr_path = artifacts / "ocr.md"

    # parents=True creates artifacts/ on the way to artifacts/frames/.
    frames_dir.mkdir(parents=True, exist_ok=True)

    t_result = extract_transcript.extract_transcript(url, transcript_path)
    if t_result.is_err():
        return t_result

    if not skip_video_download:
        d_result = download_video.download_video(url, video_path)
        if d_result.is_err():
            return d_result

    k_result = extract_keyframes.extract_keyframes(video_path, frames_dir)
    if k_result.is_err():
        return k_result

    o_result = ocr_frames.ocr_frames(frames_dir, ocr_path)
    if o_result.is_err():
        return o_result

    # A URL whose ID cannot be parsed still gets a report, labelled UNKNOWN.
    parsed = extract_transcript.parse_video_id(url)
    video_id = parsed.value if parsed.is_ok() else "UNKNOWN"

    report_path = output_dir / "report.md"
    report_path.write_text(build_report_stub(slug, url, video_id), encoding="utf-8")

    summary_path = output_dir / "summary.md"
    summary_path.write_text(build_summary_stub(slug, "<Title TBD>", None), encoding="utf-8")

    return ok({
        "transcript": str(transcript_path),
        "frames": str(frames_dir),
        # ocr.md is written above and named in the report stub's worker
        # prompt, so report it alongside the other artifacts.
        "ocr": str(ocr_path),
        "report": str(report_path),
        "summary": str(summary_path),
    })
@@ -0,0 +1,53 @@
from __future__ import annotations
from pathlib import Path
from unittest.mock import patch
from scripts.video_analysis.extract_transcript import ok as extract_ok
from scripts.video_analysis.download_video import ok as download_ok
from scripts.video_analysis.extract_keyframes import ok as keyframes_ok
from scripts.video_analysis.ocr_frames import ok as ocr_ok
from scripts.video_analysis.synthesize_report import (
PIPELINE_STAGES,
ReportContext,
build_report_stub,
build_summary_stub,
synthesize_report,
)
def test_pipeline_stages_in_order() -> None:
    """PIPELINE_STAGES lists the five stage names in their fixed order."""
    expected = ["transcript", "download", "keyframes", "ocr", "report"]
    assert PIPELINE_STAGES == expected
def test_report_context_dataclass() -> None:
    """ReportContext stores the url and slug it was constructed with."""
    source_url = "https://youtu.be/VID"
    context = ReportContext(url=source_url, slug="vid", output_dir=Path("/tmp/vid"))
    assert context.url == source_url
    assert context.slug == "vid"
def test_build_report_stub_has_sections() -> None:
    """The report stub includes the video ID and its first and last sections."""
    rendered = build_report_stub("vid", "https://youtu.be/VID", "VID123ABCDE")
    for fragment in ("VID123ABCDE", "## 1. TL;DR", "## 8. References"):
        assert fragment in rendered
def test_build_summary_stub_short() -> None:
    """The summary stub mentions slug and title and stays under 500 characters."""
    rendered = build_summary_stub("vid", "Title", "Author")
    for fragment in ("vid", "Title"):
        assert fragment in rendered
    assert len(rendered) < 500
def test_synthesize_report_orchestrates(tmp_path: Path) -> None:
    """synthesize_report runs every non-skipped stage and writes both stubs.

    The orchestrator calls functions on each stage module (for example
    ``extract_transcript.extract_transcript(...)``), so return values must be
    configured on those attributes of the patched module mocks. Setting
    ``return_value`` on the module mock itself leaves each stage returning a
    bare MagicMock whose ``is_err()`` is truthy; the pipeline then bails out
    at the first stage and the final assertion passes vacuously.
    """
    with patch("scripts.video_analysis.synthesize_report.extract_transcript") as t, \
            patch("scripts.video_analysis.synthesize_report.download_video") as d, \
            patch("scripts.video_analysis.synthesize_report.extract_keyframes") as k, \
            patch("scripts.video_analysis.synthesize_report.ocr_frames") as o:
        t.extract_transcript.return_value = extract_ok({})
        t.parse_video_id.return_value = extract_ok("ABCDEFGHIJK")
        d.download_video.return_value = download_ok({})
        k.extract_keyframes.return_value = keyframes_ok({})
        o.ocr_frames.return_value = ocr_ok({})
        result = synthesize_report("https://youtu.be/ABCDEFGHIJK", "vid", tmp_path, skip_video_download=True)

    # Compare against True so a truthy MagicMock cannot satisfy the check.
    assert result.is_ok() is True
    # skip_video_download=True must bypass the download stage only.
    d.download_video.assert_not_called()
    t.extract_transcript.assert_called_once()
    k.extract_keyframes.assert_called_once()
    o.ocr_frames.assert_called_once()
    assert (tmp_path / "report.md").exists()
    assert (tmp_path / "summary.md").exists()