Private
Public Access
0
0
Files
manual_slop/scripts/video_analysis/synthesize_report.py
T

114 lines
2.9 KiB
Python

from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from scripts.video_analysis import download_video, extract_keyframes, extract_transcript, ocr_frames
from scripts.video_analysis.error_types import ErrorInfo, make_error
# Stage names for the video-analysis pipeline; the order matches the order in
# which synthesize_report() invokes the corresponding steps.
PIPELINE_STAGES: list[str] = [
    "transcript",
    "download",
    "keyframes",
    "ocr",
    "report",
]
@dataclass
class ReportContext:
    """Group a video URL, its slug, and an output directory in one record."""

    # URL of the source video.
    url: str
    # Short identifier for the video.
    slug: str
    # Directory associated with this report.
    output_dir: Path
@dataclass
class _Ok:
    """Success variant of the module's result type; wraps a payload."""

    # Payload carried by the successful result.
    value: Any

    def is_ok(self) -> bool:
        """Return True: this variant always represents success."""
        return True

    def is_err(self) -> bool:
        """Return False: this variant never represents failure."""
        return False
@dataclass
class _Err:
    """Failure variant of the module's result type; wraps an ErrorInfo."""

    # Error description carried by the failed result.
    err: ErrorInfo

    def is_ok(self) -> bool:
        """Return False: this variant never represents success."""
        return False

    def is_err(self) -> bool:
        """Return True: this variant always represents failure."""
        return True
def ok(value: Any) -> _Ok:
    """Wrap ``value`` in a successful ``_Ok`` result."""
    return _Ok(value=value)
def err(error: ErrorInfo) -> _Err:
    """Wrap ``error`` in a failed ``_Err`` result."""
    return _Err(err=error)
def build_report_stub(
    slug: str,
    url: str,
    video_id: str,
    *,
    date_added: str = "2026-06-21",
) -> str:
    """Return the Markdown skeleton written to a video's ``report.md``.

    The text is a fixed template: a title line containing ``slug``, a short
    header (source URL, YouTube ID, date added), a one-line prompt addressed
    to the Tier 3 worker, and eight empty numbered section headings.

    Args:
        slug: Identifier shown in parentheses in the title line.
        url: Value placed after ``**Source:**``.
        video_id: Value placed after ``**YouTube ID:**``.
        date_added: Value placed after ``**Date Added to Campaign:**``.
            Defaults to the date that was previously hard-coded in the
            template, so existing callers get unchanged output.

    Returns:
        The stub as a single string ending in a newline.
    """
    # The template lines stay at column 0 because the indentation inside a
    # triple-quoted string is part of the emitted Markdown.
    return f"""# <Video Title> ({slug})
**Source:** {url}
**YouTube ID:** {video_id}
**Date Added to Campaign:** {date_added}
> **Tier 3 worker prompt:** populate each section using the transcript.json, ocr.md, and frames/ artifacts in this directory.
## 1. TL;DR
## 2. Key Concepts
## 3. Frame Analysis
## 4. Transcript Highlights
## 5. Mathematical / Theoretical Content
## 6. Connections to Other Videos in Campaign
## 7. Open Questions / Follow-up
## 8. References
"""
def build_summary_stub(slug: str, title: str, author: str | None) -> str:
    """Return the Markdown skeleton written to a video's ``summary.md``.

    Args:
        slug: Identifier shown in the ``# Summary:`` heading.
        title: Value placed after ``**Title:**``.
        author: Value placed after ``**Author:**``; any falsy value
            (``None`` or an empty string) is rendered as ``(unknown)``.

    Returns:
        The stub as a single string ending in a newline.
    """
    byline = author if author else "(unknown)"
    stub_lines = [
        f"# Summary: {slug}",
        f"**Title:** {title}",
        f"**Author:** {byline}",
        "<200-400 word summary to be filled in by the Tier 3 worker after reading report.md>",
        "",  # yields the trailing newline after the placeholder line
    ]
    return "\n".join(stub_lines)
# Run the video-analysis steps for one URL and write report/summary stubs.
#
# Steps, in call order: transcript extraction, optional video download,
# keyframe extraction, OCR over the frames directory, then writing report.md
# and summary.md into output_dir. Intermediate files live under
# output_dir/artifacts (transcript.json, video.mp4, ocr.md, frames/).
#
# Parameters:
#   url: passed to extract_transcript, download_video, and parse_video_id,
#        and interpolated into the report stub.
#   slug: interpolated into both the report stub and the summary stub.
#   output_dir: receives report.md, summary.md, and the artifacts/ subtree.
#   skip_video_download: when true, download_video is not called.
#
# Returns: the first step result whose is_err() is true, returned unchanged
# (presumably the same result shape as _Ok/_Err — verify against the helper
# modules); otherwise ok({...}) with string paths under the keys
# "transcript", "frames", "report", and "summary".
#
# NOTE(review): this copy of the file has lost its indentation, so it cannot
# be determined here whether the keyframe and OCR steps sit inside the
# `if not skip_video_download:` branch or run unconditionally — confirm
# against the original before re-indenting.
# NOTE(review): exceptions from mkdir()/write_text() are not converted into an
# err(...) result in this function; they propagate to the caller.
def synthesize_report(url: str, slug: str, output_dir: Path, skip_video_download: bool = False) -> _Ok | _Err:
artifacts = output_dir / "artifacts"
frames_dir = artifacts / "frames"
# Create the artifact directories up front; existing directories are reused.
artifacts.mkdir(parents=True, exist_ok=True)
transcript_path = artifacts / "transcript.json"
frames_dir.mkdir(parents=True, exist_ok=True)
# Transcript step: any error result is returned to the caller as-is.
t_result = extract_transcript.extract_transcript(url, transcript_path)
if t_result.is_err():
return t_result
video_path = artifacts / "video.mp4"
# Download step, skipped entirely when skip_video_download is set.
if not skip_video_download:
d_result = download_video.download_video(url, video_path)
if d_result.is_err():
return d_result
# Keyframe step: reads video_path, writes into frames_dir.
k_result = extract_keyframes.extract_keyframes(video_path, frames_dir)
if k_result.is_err():
return k_result
ocr_path = artifacts / "ocr.md"
# OCR step: reads frames_dir, writes ocr.md.
o_result = ocr_frames.ocr_frames(frames_dir, ocr_path)
if o_result.is_err():
return o_result
# A video id that cannot be parsed does not abort the run; the report stub
# is written with the literal "UNKNOWN" instead.
parsed = extract_transcript.parse_video_id(url)
video_id = parsed.value if parsed.is_ok() else "UNKNOWN"
report_path = output_dir / "report.md"
report_path.write_text(build_report_stub(slug, url, video_id), encoding="utf-8")
summary_path = output_dir / "summary.md"
# The summary stub is written with a placeholder title and no author.
summary_path.write_text(build_summary_stub(slug, "<Title TBD>", None), encoding="utf-8")
# Paths are returned as strings; ocr_path and video_path are not included.
return ok({
"transcript": str(transcript_path),
"frames": str(frames_dir),
"report": str(report_path),
"summary": str(summary_path),
})