feat(video_analysis): synthesize_report.py orchestrator with TDD (5 tests)
This commit is contained in:
@@ -0,0 +1,113 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from scripts.video_analysis import download_video, extract_keyframes, extract_transcript, ocr_frames
|
||||
from scripts.video_analysis.error_types import ErrorInfo, make_error
|
||||
|
||||
|
||||
# Stage names in pipeline order: transcript, video download, keyframe
# extraction, OCR over the frames, then the report/summary stubs.
PIPELINE_STAGES: list[str] = ["transcript", "download", "keyframes", "ocr", "report"]
|
||||
|
||||
|
||||
@dataclass
class ReportContext:
    """Bundle of the inputs that identify one video-analysis run."""

    # Address of the video being analysed.
    url: str
    # Short identifier used to label the run.
    slug: str
    # Directory associated with the run's output.
    output_dir: Path
|
||||
|
||||
|
||||
@dataclass
class _Ok:
    """Success half of the ok/err result pair; carries the produced value."""

    # Payload produced by the successful operation.
    value: Any

    def is_err(self) -> bool:
        """Return False: a success is never a failure."""
        return False

    def is_ok(self) -> bool:
        """Return True: this result represents a success."""
        return not self.is_err()
|
||||
|
||||
|
||||
@dataclass
class _Err:
    """Failure half of the ok/err result pair; carries the error details."""

    # Description of what went wrong.
    err: ErrorInfo

    def is_ok(self) -> bool:
        """Return False: a failure is never a success."""
        return False

    def is_err(self) -> bool:
        """Return True: this result represents a failure."""
        return not self.is_ok()
|
||||
|
||||
|
||||
def ok(value: Any) -> _Ok:
    """Wrap *value* in a successful result object."""
    success = _Ok(value=value)
    return success
|
||||
|
||||
|
||||
def err(error: ErrorInfo) -> _Err:
    """Wrap *error* in a failed result object."""
    failure = _Err(err=error)
    return failure
|
||||
|
||||
|
||||
def build_report_stub(
    slug: str,
    url: str,
    video_id: str,
    *,
    date_added: str = "2026-06-21",
) -> str:
    """Return the Markdown skeleton of a video report.

    The skeleton has a header (placeholder title, source URL, YouTube ID and
    campaign date), a prompt line for the worker that fills the report in, and
    eight empty numbered sections.

    Args:
        slug: Short identifier shown in parentheses after the title placeholder.
        url: Source URL of the video.
        video_id: YouTube ID printed in the header.
        date_added: Text for the "Date Added to Campaign" line. Defaults to
            the value that used to be hard-coded, so existing callers get the
            same output as before.

    Returns:
        The Markdown text, ending with a newline after the last heading.
    """
    return f"""# <Video Title> ({slug})

**Source:** {url}
**YouTube ID:** {video_id}
**Date Added to Campaign:** {date_added}

> **Tier 3 worker prompt:** populate each section using the transcript.json, ocr.md, and frames/ artifacts in this directory.

## 1. TL;DR
## 2. Key Concepts
## 3. Frame Analysis
## 4. Transcript Highlights
## 5. Mathematical / Theoretical Content
## 6. Connections to Other Videos in Campaign
## 7. Open Questions / Follow-up
## 8. References
"""
|
||||
|
||||
|
||||
def build_summary_stub(slug: str, title: str, author: str | None) -> str:
    """Return the Markdown skeleton of a short video summary.

    Args:
        slug: Short identifier shown in the heading.
        title: Video title printed on the title line.
        author: Video author; a missing or empty value is shown as
            "(unknown)".

    Returns:
        The Markdown text, ending with a newline after the placeholder line.
    """
    author_label = author if author else "(unknown)"
    rows = [
        f"# Summary: {slug}",
        "",
        f"**Title:** {title}",
        f"**Author:** {author_label}",
        "",
        "<200-400 word summary to be filled in by the Tier 3 worker after reading report.md>",
        "",  # trailing empty row yields the final newline
    ]
    return "\n".join(rows)
|
||||
|
||||
|
||||
def synthesize_report(url: str, slug: str, output_dir: Path, skip_video_download: bool = False) -> _Ok | _Err:
    """Run the video-analysis stages in order and write the report stubs.

    Stages run as: transcript extraction, video download (optional), keyframe
    extraction, OCR over the frames, then writing ``report.md`` and
    ``summary.md`` stubs into *output_dir*. Intermediate artifacts go into
    ``output_dir / "artifacts"``. The first stage whose result reports
    ``is_err()`` stops the pipeline and that result is returned unchanged.

    Args:
        url: Video URL passed to the transcript and download stages.
        slug: Short identifier embedded in the generated stubs.
        output_dir: Directory that receives the stubs and ``artifacts/``.
        skip_video_download: When True the download stage is not run.

    Returns:
        On success, an ok-result whose value maps ``"transcript"``,
        ``"frames"``, ``"ocr"``, ``"report"`` and ``"summary"`` to path
        strings. On failure, the failing stage's own result object.
    """
    artifacts = output_dir / "artifacts"
    frames_dir = artifacts / "frames"
    transcript_path = artifacts / "transcript.json"
    video_path = artifacts / "video.mp4"
    ocr_path = artifacts / "ocr.md"
    report_path = output_dir / "report.md"
    summary_path = output_dir / "summary.md"

    artifacts.mkdir(parents=True, exist_ok=True)
    frames_dir.mkdir(parents=True, exist_ok=True)

    t_result = extract_transcript.extract_transcript(url, transcript_path)
    if t_result.is_err():
        return t_result

    if not skip_video_download:
        d_result = download_video.download_video(url, video_path)
        if d_result.is_err():
            return d_result

    # NOTE(review): keyframe extraction still reads video_path when the
    # download was skipped; presumably the caller has already placed
    # video.mp4 in the artifacts directory in that case. Confirm.
    k_result = extract_keyframes.extract_keyframes(video_path, frames_dir)
    if k_result.is_err():
        return k_result

    o_result = ocr_frames.ocr_frames(frames_dir, ocr_path)
    if o_result.is_err():
        return o_result

    # An unparseable URL does not fail the run; the stub gets a placeholder ID.
    parsed = extract_transcript.parse_video_id(url)
    video_id = parsed.value if parsed.is_ok() else "UNKNOWN"

    report_path.write_text(build_report_stub(slug, url, video_id), encoding="utf-8")
    # Title and author are not known at this point, so placeholders are written.
    summary_path.write_text(build_summary_stub(slug, "<Title TBD>", None), encoding="utf-8")

    return ok({
        "transcript": str(transcript_path),
        "frames": str(frames_dir),
        # The OCR output is an artifact the report stub tells the worker to
        # read, so it is reported alongside the other paths.
        "ocr": str(ocr_path),
        "report": str(report_path),
        "summary": str(summary_path),
    })
|
||||
@@ -0,0 +1,53 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from scripts.video_analysis.extract_transcript import ok as extract_ok
|
||||
from scripts.video_analysis.download_video import ok as download_ok
|
||||
from scripts.video_analysis.extract_keyframes import ok as keyframes_ok
|
||||
from scripts.video_analysis.ocr_frames import ok as ocr_ok
|
||||
from scripts.video_analysis.synthesize_report import (
|
||||
PIPELINE_STAGES,
|
||||
ReportContext,
|
||||
build_report_stub,
|
||||
build_summary_stub,
|
||||
synthesize_report,
|
||||
)
|
||||
|
||||
|
||||
def test_pipeline_stages_in_order() -> None:
    """PIPELINE_STAGES lists the five stage names in execution order."""
    expected_order = ["transcript", "download", "keyframes", "ocr", "report"]
    assert PIPELINE_STAGES == expected_order
|
||||
|
||||
|
||||
def test_report_context_dataclass() -> None:
    """ReportContext stores all three constructor arguments unchanged."""
    out_dir = Path("/tmp/vid")
    ctx = ReportContext(url="https://youtu.be/VID", slug="vid", output_dir=out_dir)
    assert ctx.url == "https://youtu.be/VID"
    assert ctx.slug == "vid"
    # output_dir was constructed but never checked before.
    assert ctx.output_dir == out_dir
|
||||
|
||||
|
||||
def test_build_report_stub_has_sections() -> None:
    """The report stub embeds its inputs and all eight headings, in order."""
    stub = build_report_stub("vid", "https://youtu.be/VID", "VID123ABCDE")
    assert "VID123ABCDE" in stub
    assert "https://youtu.be/VID" in stub

    headings = [
        "## 1. TL;DR",
        "## 2. Key Concepts",
        "## 3. Frame Analysis",
        "## 4. Transcript Highlights",
        "## 5. Mathematical / Theoretical Content",
        "## 6. Connections to Other Videos in Campaign",
        "## 7. Open Questions / Follow-up",
        "## 8. References",
    ]
    positions = [stub.find(heading) for heading in headings]
    # Every heading must be present...
    assert -1 not in positions
    # ...and they must appear in numeric order.
    assert positions == sorted(positions)
|
||||
|
||||
|
||||
def test_build_summary_stub_short() -> None:
    """The summary stub mentions the slug and title and stays under 500 chars."""
    rendered = build_summary_stub("vid", "Title", "Author")
    for fragment in ("vid", "Title"):
        assert fragment in rendered
    assert len(rendered) < 500
|
||||
|
||||
|
||||
def test_synthesize_report_orchestrates(tmp_path: Path) -> None:
    """synthesize_report runs every non-skipped stage and writes both stubs.

    The patches replace the stage *modules* inside synthesize_report, so the
    stubbed results must be set on the functions the orchestrator calls on
    those modules (``t.extract_transcript``, ``k.extract_keyframes``, ...).
    Setting ``return_value`` on the module mock itself leaves those functions
    returning bare MagicMocks, whose truthy ``is_err()`` ends the pipeline
    after the first stage while ``result.is_ok()`` still looks truthy.
    """
    with patch("scripts.video_analysis.synthesize_report.extract_transcript") as t, \
            patch("scripts.video_analysis.synthesize_report.download_video") as d, \
            patch("scripts.video_analysis.synthesize_report.extract_keyframes") as k, \
            patch("scripts.video_analysis.synthesize_report.ocr_frames") as o:
        t.extract_transcript.return_value = extract_ok({})
        t.parse_video_id.return_value = extract_ok("ABCDEFGHIJK")
        d.download_video.return_value = download_ok({})
        k.extract_keyframes.return_value = keyframes_ok({})
        o.ocr_frames.return_value = ocr_ok({})
        result = synthesize_report("https://youtu.be/ABCDEFGHIJK", "vid", tmp_path, skip_video_download=True)

    # `is True` rejects a MagicMock, which a plain truthiness check would accept.
    assert result.is_ok() is True

    t.extract_transcript.assert_called_once()
    d.download_video.assert_not_called()  # skip_video_download=True
    k.extract_keyframes.assert_called_once()
    o.ocr_frames.assert_called_once()

    report_path = tmp_path / "report.md"
    assert report_path.is_file()
    assert (tmp_path / "summary.md").is_file()
    assert "ABCDEFGHIJK" in report_path.read_text(encoding="utf-8")
|
||||
Reference in New Issue
Block a user