114 lines
2.9 KiB
Python
114 lines
2.9 KiB
Python
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from scripts.video_analysis import download_video, extract_keyframes, extract_transcript, ocr_frames
|
|
from scripts.video_analysis.error_types import ErrorInfo, make_error
|
|
|
|
|
|
# Stage names, listed in the same order in which synthesize_report performs
# the corresponding steps (transcript first, report stubs last).
PIPELINE_STAGES: list[str] = [
    "transcript",
    "download",
    "keyframes",
    "ocr",
    "report",
]
|
|
|
|
|
|
@dataclass
class ReportContext:
    """Bundle of the values that identify one report: URL, slug, output dir.

    The three fields mirror the leading parameters of ``synthesize_report``.
    NOTE(review): nothing in the visible part of this module constructs or
    consumes this class -- confirm whether callers elsewhere rely on it.
    """

    # Source URL of the video.
    url: str
    # Short identifier for the video; used in the report/summary headings.
    slug: str
    # Directory that receives report.md, summary.md and the artifacts/ tree.
    output_dir: Path
|
|
|
|
|
|
@dataclass
class _Ok:
    """Success half of the module's result pair; carries the payload.

    Shares the ``is_ok`` / ``is_err`` method pair with ``_Err`` so callers can
    branch on a result without an ``isinstance`` check.
    """

    # Payload produced by the successful operation.
    value: Any

    def is_ok(self) -> bool:
        """Return ``True``: this object represents a success."""
        return True

    def is_err(self) -> bool:
        """Return ``False``: this object does not represent a failure."""
        return False
|
|
|
|
|
|
@dataclass
class _Err:
    """Failure half of the module's result pair; carries the error details.

    Shares the ``is_ok`` / ``is_err`` method pair with ``_Ok`` so callers can
    branch on a result without an ``isinstance`` check.
    """

    # Description of what went wrong.
    err: ErrorInfo

    def is_ok(self) -> bool:
        """Return ``False``: this object does not represent a success."""
        return False

    def is_err(self) -> bool:
        """Return ``True``: this object represents a failure."""
        return True
|
|
|
|
|
|
def ok(value: Any) -> _Ok:
    """Wrap *value* in a success result (``_Ok``)."""
    return _Ok(value)
|
|
|
|
|
|
def err(error: ErrorInfo) -> _Err:
    """Wrap *error* in a failure result (``_Err``)."""
    return _Err(error)
|
|
|
|
|
|
def build_report_stub(
    slug: str,
    url: str,
    video_id: str,
    *,
    date_added: str = "2026-06-21",
) -> str:
    """Return the Markdown skeleton for a video's ``report.md``.

    The skeleton has a title line, a metadata block (source URL, YouTube ID,
    date added), an instruction line for the worker that fills the report in,
    and eight empty numbered section headings.

    Args:
        slug: Identifier shown in parentheses after the title placeholder.
        url: Source URL of the video.
        video_id: YouTube video ID to record in the metadata block.
        date_added: Text for the "Date Added to Campaign" line. Defaults to
            the value that used to be hard-coded in the template, so existing
            callers get the same output as before.

    Returns:
        The report stub as a single string ending in a newline.
    """
    # "<Video Title>" is a literal placeholder to be replaced later; it is
    # not an interpolated field.
    return f"""# <Video Title> ({slug})

**Source:** {url}
**YouTube ID:** {video_id}
**Date Added to Campaign:** {date_added}

> **Tier 3 worker prompt:** populate each section using the transcript.json, ocr.md, and frames/ artifacts in this directory.

## 1. TL;DR
## 2. Key Concepts
## 3. Frame Analysis
## 4. Transcript Highlights
## 5. Mathematical / Theoretical Content
## 6. Connections to Other Videos in Campaign
## 7. Open Questions / Follow-up
## 8. References
"""
|
|
|
|
|
|
def build_summary_stub(slug: str, title: str, author: str | None) -> str:
    """Return the Markdown skeleton for a video's ``summary.md``.

    Args:
        slug: Identifier placed in the "Summary:" heading.
        title: Video title for the metadata block.
        author: Video author; ``None`` or an empty string is rendered as
            ``(unknown)``.

    Returns:
        The summary stub as a single string ending in a newline. The last
        content line is a literal placeholder describing the text to be
        written later.
    """
    author_label = author or "(unknown)"
    return f"""# Summary: {slug}

**Title:** {title}
**Author:** {author_label}

<200-400 word summary to be filled in by the Tier 3 worker after reading report.md>
"""
|
|
|
|
|
|
def synthesize_report(url: str, slug: str, output_dir: Path, skip_video_download: bool = False) -> _Ok | _Err:
    """Run the analysis steps for one video and write the report stubs.

    Steps, in order:
      1. Create ``<output_dir>/artifacts`` and ``<output_dir>/artifacts/frames``.
      2. Extract the transcript to ``artifacts/transcript.json``.
      3. Unless ``skip_video_download`` is set: download the video to
         ``artifacts/video.mp4``, extract keyframes into ``artifacts/frames``
         and OCR them into ``artifacts/ocr.md``.
      4. Write ``report.md`` and ``summary.md`` stubs into ``output_dir``.

    The first step whose result reports ``is_err()`` ends the run, and that
    result is returned to the caller unchanged.

    Args:
        url: Source URL of the video.
        slug: Identifier used in the report and summary headings.
        output_dir: Directory receiving the stubs and the ``artifacts`` tree.
        skip_video_download: When true, the video-dependent steps are skipped.

    Returns:
        On success, an ``_Ok`` whose value is a dict with the keys
        ``"transcript"``, ``"frames"``, ``"report"`` and ``"summary"``, each
        mapped to the corresponding path as a string. On failure, the failing
        step's own error result.

    Note:
        ``mkdir`` and ``write_text`` are not wrapped in any handler here, so
        filesystem errors propagate as exceptions rather than as ``_Err``.
    """
    artifacts = output_dir / "artifacts"
    frames_dir = artifacts / "frames"
    artifacts.mkdir(parents=True, exist_ok=True)
    transcript_path = artifacts / "transcript.json"
    frames_dir.mkdir(parents=True, exist_ok=True)

    t_result = extract_transcript.extract_transcript(url, transcript_path)
    if t_result.is_err():
        return t_result

    video_path = artifacts / "video.mp4"
    # NOTE(review): the text this block was restored from had lost all
    # indentation. Keyframe extraction and OCR both depend on the downloaded
    # video, so they are placed inside this guard together with the download.
    # Confirm against the original file that they were not meant to run when
    # skip_video_download is true (e.g. against a previously downloaded video).
    if not skip_video_download:
        d_result = download_video.download_video(url, video_path)
        if d_result.is_err():
            return d_result

        k_result = extract_keyframes.extract_keyframes(video_path, frames_dir)
        if k_result.is_err():
            return k_result

        ocr_path = artifacts / "ocr.md"
        o_result = ocr_frames.ocr_frames(frames_dir, ocr_path)
        if o_result.is_err():
            return o_result

    # A URL whose ID cannot be parsed is not fatal: the stub is still written,
    # with a placeholder ID.
    parsed = extract_transcript.parse_video_id(url)
    video_id = parsed.value if parsed.is_ok() else "UNKNOWN"

    report_path = output_dir / "report.md"
    report_path.write_text(build_report_stub(slug, url, video_id), encoding="utf-8")

    # Title and author are not known at this point, so the summary stub is
    # written with placeholders.
    summary_path = output_dir / "summary.md"
    summary_path.write_text(build_summary_stub(slug, "<Title TBD>", None), encoding="utf-8")

    return ok({
        "transcript": str(transcript_path),
        "frames": str(frames_dir),
        "report": str(report_path),
        "summary": str(summary_path),
    })
|