Merge branch 'master' of C:\projects\manual_slop into tier2/any_type_componentization_20260621
This commit is contained in:
@@ -0,0 +1,48 @@
|
||||
"""Quick dedup pass for entropy_epiplexity (frames extracted but not deduped)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[4]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from PIL import Image
|
||||
import imagehash
|
||||
|
||||
|
||||
def main() -> int:
    """Deduplicate already-extracted frames by perceptual hash and write metadata.

    Walks ``frame_*.jpg`` in the track's frames directory in sorted order and
    computes a pHash string for each frame. A frame whose hash is closer than 5
    (as measured by ``_hamming``) to any previously kept frame is deleted from
    disk; otherwise it is kept. Finally ``extraction_meta.json`` is written next
    to the frames.

    Returns:
        Process exit code; always 0.
    """
    frames_dir = ROOT / "conductor" / "tracks" / "video_analysis_entropy_epiplexity_20260621" / "artifacts" / "frames"
    frame_files = sorted(frames_dir.glob("frame_*.jpg"))
    print(f"Total frames: {len(frame_files)}")
    saved_hashes: list[str] = []
    kept_files: list[str] = []
    for fp in frame_files:
        # Release the image handle before a possible unlink so no file handle
        # is leaked per frame (an open handle can also block deletion on some
        # platforms).
        with Image.open(fp) as img:
            h = str(imagehash.phash(img))
        if any(_hamming(h, s) < 5 for s in saved_hashes):
            fp.unlink()
            continue
        saved_hashes.append(h)
        kept_files.append(fp.name)
    print(f"Kept: {len(kept_files)}")
    meta = {
        "video": "video.mp4",
        # NOTE(review): recorded value differs from the "< 5" cutoff applied
        # above — confirm which units downstream readers expect.
        "threshold": 0.05,
        "total_extracted": len(frame_files),
        "kept": len(kept_files),
        "files": kept_files,
    }
    (frames_dir / "extraction_meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
    return 0
|
||||
|
||||
|
||||
def _hamming(a: str, b: str) -> int:
    """Count the character positions at which two strings differ.

    Strings of unequal length are treated as maximally distant: the longer
    length is returned instead of a positional count.
    """
    size_a, size_b = len(a), len(b)
    if size_a != size_b:
        return size_a if size_a > size_b else size_b
    mismatches = 0
    for left, right in zip(a, b):
        if left != right:
            mismatches += 1
    return mismatches
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,16 @@
|
||||
from pathlib import Path
|
||||
# Print the OCR chunks that look like presentation slides rather than chat
# overlays. ocr.md is split on "## " headings; the first line of each chunk is
# used as its name.
#
# Decode explicitly as UTF-8 instead of the platform default (the other scripts
# in this commit read and write their artifacts as UTF-8); undecodable bytes
# are replaced so this inspection script never aborts on a stray byte.
content = Path('conductor/tracks/video_analysis_probability_logic_20260621/artifacts/ocr.md').read_text(encoding='utf-8', errors='replace')
chunks = content.split('## ')
fence = chr(96)*3  # literal triple backtick, built so the source itself contains no fence
chat_markers = ('Streamer Mode', 'Yesterday at', '21:43')
slide_keywords = ('Definition', 'Logic', 'Probabil', 'Bayesian', 'Frequentist', 'Boolean', 'Lattice', 'Inference', 'Sum Rule', 'Product Rule')
for chunk in chunks[1:]:
    name = chunk.split('\n')[0].strip()
    body = chunk
    if fence in body:
        # Keep only the text after the first fence (up to the next one, if
        # any); a split on a present fence always yields at least two parts.
        body = body.split(fence)[1]
    is_chat = any(marker in body for marker in chat_markers)
    has_pres = any(keyword in body for keyword in slide_keywords)
    if has_pres and not is_chat:
        print(f'=== {name} ===')
        print(body[:800])
        print()
|
||||
@@ -0,0 +1,111 @@
|
||||
"""Generic Phase 1 Acquire driver for video_analysis_campaign children.
|
||||
|
||||
Reads the child spec from CLI args: slug + URL (+ optional --artifacts-dir).
|
||||
Fetches the transcript directly from yt-dlp VTT subtitles (with retries), then calls download_video.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[4]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from scripts.video_analysis.download_video import download_video
|
||||
from scripts.video_analysis.extract_transcript import _fetch_via_ytdlp as _fetch_raw_transcript
|
||||
|
||||
|
||||
def _parse_vtt_segments(vtt_path: Path) -> list[dict]:
    """Parse a WebVTT subtitle file into transcript segments.

    The file is split into blank-line-separated blocks. A block becomes a
    segment when it contains an ``HH:MM:SS.mmm -->`` timing line followed by at
    least one non-empty text line.

    Args:
        vtt_path: Path to a UTF-8 encoded ``.vtt`` file.

    Returns:
        ``{"start", "duration", "text"}`` dicts in file order. ``start`` is in
        seconds; ``duration`` is ``end - start`` when the end timestamp can be
        parsed and ``0.0`` otherwise; ``text`` is the cue's lines joined by
        single spaces.
    """
    # The end timestamp is optional so that any timing line the start-only
    # pattern used to accept is still accepted.
    timing = re.compile(
        r"(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+-->"
        r"(?:\s*(\d{2}):(\d{2}):(\d{2})\.(\d{3}))?"
    )

    def _seconds(h: str, m: str, s: str, ms: str) -> float:
        return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000.0

    segments: list[dict] = []
    for block in re.split(r"\n\n+", vtt_path.read_text(encoding="utf-8")):
        match = timing.search(block)
        if not match:
            continue  # blocks without a timing line carry no cue
        fields = match.groups()
        start = _seconds(*fields[:4])
        duration = 0.0
        if fields[4] is not None:
            # Rounded to millisecond precision to hide float subtraction noise.
            duration = max(0.0, round(_seconds(*fields[4:]) - start, 3))
        # Only lines after the timing line are cue text; anything before it in
        # the block (for example a cue identifier) is not spoken content. The
        # first split element is the rest of the timing line (cue settings).
        cue_body = block[match.end():].split("\n")[1:]
        lines = [ln.strip() for ln in cue_body if ln.strip() and "-->" not in ln]
        text_content = " ".join(lines)
        if text_content:
            segments.append({"start": start, "duration": duration, "text": text_content})
    return segments
|
||||
|
||||
|
||||
def phase1_acquire(slug: str, url: str, artifacts_dir: Path) -> dict:
    """Fetch the transcript and download the video for one campaign child.

    The transcript fetch is attempted up to three times with exponential
    backoff (1s, 2s). When every attempt fails, a placeholder
    ``transcript.json`` with empty segments and the last error is written and
    processing continues with the download.

    Args:
        slug: Child identifier; only used in progress output.
        url: Video URL; must contain ``youtu.be/<id>`` or ``v=<id>``.
        artifacts_dir: Directory (created if missing) that receives
            ``transcript.json`` and ``video.mp4``.

    Returns:
        ``{"status": "ok", "video_path", "transcript_path"}`` on success, or
        ``{"status": "error", "error"}`` when the video id cannot be parsed or
        the download fails.
    """

    def _utc_stamp() -> str:
        return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    def _dump(payload: dict, target: Path) -> None:
        target.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")

    print(f"Phase 1 Acquire for {slug}: {url}")
    print(f"Artifacts: {artifacts_dir}")
    artifacts_dir.mkdir(parents=True, exist_ok=True)

    id_match = re.search(r"(?:youtu\.be/|v=)([A-Za-z0-9_-]{11})", url)
    if id_match is None:
        return {"status": "error", "error": f"Could not parse video_id from {url}"}
    video_id = id_match.group(1)

    print("Step 1: extract_transcript (yt-dlp VTT directly)")
    transcript_path = artifacts_dir / "transcript.json"
    failure = None
    for try_index in range(3):
        try:
            fetched = _fetch_raw_transcript(video_id, artifacts_dir)
            _dump(
                {
                    "video_id": video_id,
                    "segments": fetched,
                    "plain": "\n".join(seg["text"] for seg in fetched),
                    "fetched_at": _utc_stamp(),
                    "source": "yt-dlp-vtt",
                },
                transcript_path,
            )
            print(f" OK: wrote {transcript_path} ({len(fetched)} segments)")
        except Exception as exc:
            failure = exc
            print(f" attempt {try_index + 1} failed: {type(exc).__name__}: {str(exc)[:200]}")
            if try_index < 2:
                time.sleep(2 ** try_index)
        else:
            # Success clears any error remembered from an earlier attempt.
            failure = None
            break

    if failure is not None:
        print(" yt-dlp VTT fetch failed after 3 attempts. No transcript available.")
        _dump(
            {
                "video_id": video_id,
                "segments": [],
                "plain": "",
                "fetched_at": _utc_stamp(),
                "source": "unavailable",
                "error": str(failure)[:500],
                "note": "Frame OCR will be the primary signal for this video.",
            },
            transcript_path,
        )

    print("Step 2: download_video")
    video_path = artifacts_dir / "video.mp4"
    download = download_video(url, video_path)
    if download.is_err():
        return {"status": "error", "error": f"download_video: {download.err.class_name}: {download.err.detail[:200]}"}
    print(f" OK: wrote {video_path} ({video_path.stat().st_size} bytes)")
    return {"status": "ok", "video_path": str(video_path), "transcript_path": str(transcript_path)}
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: parse ``slug`` and ``url`` and run ``phase1_acquire``.

    Without ``--artifacts-dir`` the artifacts directory defaults to
    ``conductor/tracks/video_analysis_<slug>_20260621/artifacts`` under
    ``ROOT``. The result dict is printed as indented JSON.

    Returns:
        0 when the result status is ``"ok"``, otherwise 1.
    """
    cli = argparse.ArgumentParser()
    for positional in ("slug", "url"):
        cli.add_argument(positional)
    cli.add_argument("--artifacts-dir", required=False)
    opts = cli.parse_args()

    artifacts_dir = (
        Path(opts.artifacts_dir)
        if opts.artifacts_dir
        else ROOT / "conductor" / "tracks" / f"video_analysis_{opts.slug}_20260621" / "artifacts"
    )
    outcome = phase1_acquire(opts.slug, opts.url, artifacts_dir)
    print(json.dumps(outcome, indent=2))
    if outcome["status"] == "ok":
        return 0
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,117 @@
|
||||
"""Phase 1 Acquire driver for video_analysis_cs229_building_llms_20260621.
|
||||
|
||||
Strategy: youtube-transcript-api fails for this video (R5: XML parse error on empty response,
|
||||
likely a YouTube API restriction). Fall back to yt-dlp's own subtitle extraction.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[4]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from scripts.video_analysis.download_video import download_video
|
||||
from scripts.video_analysis.extract_transcript import _fetch_raw_transcript
|
||||
|
||||
URL = "https://youtu.be/9vM4p9NN0Ts"
|
||||
ARTIFACTS = ROOT / "conductor" / "tracks" / "video_analysis_cs229_building_llms_20260621" / "artifacts"
|
||||
ARTIFACTS.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def _parse_vtt_segments(vtt_path: Path) -> list[dict]:
    """Parse a WebVTT subtitle file into transcript segments.

    The file is split into blank-line-separated blocks. A block becomes a
    segment when it contains an ``HH:MM:SS.mmm -->`` timing line followed by at
    least one non-empty text line.

    Args:
        vtt_path: Path to a UTF-8 encoded ``.vtt`` file.

    Returns:
        ``{"start", "duration", "text"}`` dicts in file order. ``start`` is in
        seconds; ``duration`` is ``end - start`` when the end timestamp can be
        parsed and ``0.0`` otherwise; ``text`` is the cue's lines joined by
        single spaces.
    """
    # The end timestamp is optional so that any timing line the start-only
    # pattern used to accept is still accepted.
    timing = re.compile(
        r"(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+-->"
        r"(?:\s*(\d{2}):(\d{2}):(\d{2})\.(\d{3}))?"
    )

    def _seconds(h: str, m: str, s: str, ms: str) -> float:
        return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000.0

    segments: list[dict] = []
    for block in re.split(r"\n\n+", vtt_path.read_text(encoding="utf-8")):
        match = timing.search(block)
        if not match:
            continue  # blocks without a timing line carry no cue
        fields = match.groups()
        start = _seconds(*fields[:4])
        duration = 0.0
        if fields[4] is not None:
            # Rounded to millisecond precision to hide float subtraction noise.
            duration = max(0.0, round(_seconds(*fields[4:]) - start, 3))
        # Only lines after the timing line are cue text; anything before it in
        # the block (for example a cue identifier) is not spoken content. The
        # first split element is the rest of the timing line (cue settings).
        cue_body = block[match.end():].split("\n")[1:]
        lines = [ln.strip() for ln in cue_body if ln.strip() and "-->" not in ln]
        text_content = " ".join(lines)
        if text_content:
            segments.append({"start": start, "duration": duration, "text": text_content})
    return segments
|
||||
|
||||
|
||||
def main() -> int:
    """Acquire the transcript and video for the hard-coded ``URL``.

    Step 1 calls ``_fetch_raw_transcript`` up to three times with exponential
    backoff (1s, 2s). If every attempt fails, Step 1b shells out to ``yt-dlp``
    for auto-generated English VTT subtitles and parses the first ``.vtt`` file
    found; if none is produced, a placeholder ``transcript.json`` with empty
    segments is written. Step 2 downloads the video.

    Returns:
        0 when the video download succeeds, 1 otherwise. Transcript failures do
        not affect the exit code.
    """
    print(f"Phase 1 Acquire for {URL}")
    print(f"Artifacts: {ARTIFACTS}")

    print("Step 1: extract_transcript (try youtube-transcript-api)")
    transcript_path = ARTIFACTS / "transcript.json"
    video_id = "9vM4p9NN0Ts"
    last_exc = None
    # NOTE(review): relies on _fetch_raw_transcript still being exported by
    # scripts.video_analysis.extract_transcript — confirm it exists on this branch.
    for attempt in range(3):
        try:
            segments = _fetch_raw_transcript(video_id)
            data = {
                "video_id": video_id,
                "segments": segments,
                "plain": "\n".join(s["text"] for s in segments),
                "fetched_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                "source": "youtube-transcript-api",
            }
            transcript_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
            print(f" OK: wrote {transcript_path} ({len(segments)} segments)")
            break
        except Exception as e:
            last_exc = e
            print(f" attempt {attempt+1} failed: {type(e).__name__}: {str(e)[:200]}")
            if attempt < 2:
                time.sleep(2 ** attempt)
    else:
        # Reached only when no attempt hit `break`, i.e. all three failed.
        print(" youtube-transcript-api failed after 3 attempts. Falling back to yt-dlp subtitles.")
        print("Step 1b: yt-dlp subtitle fallback")
        completed = subprocess.run(
            ["yt-dlp", "--write-auto-subs", "--sub-langs", "en", "--sub-format", "vtt",
             "--skip-download", "--output", str(ARTIFACTS / video_id), URL],
            capture_output=True, text=True,
        )
        # The subtitle file name is chosen by yt-dlp, so discover it by glob
        # rather than predicting it.
        candidates = list(ARTIFACTS.glob(f"{video_id}*.vtt"))
        if not candidates:
            print(f" yt-dlp subtitle fetch also failed: {completed.stderr[:300]}")
            print(" No transcript available. Continuing with download only.")
            transcript_path.write_text(json.dumps({
                "video_id": video_id,
                "segments": [],
                "plain": "",
                "fetched_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                "source": "unavailable",
                "error": str(last_exc)[:500] if last_exc else None,
                "note": "youtube-transcript-api failed with XML parse error (R5). yt-dlp subtitles also unavailable. Frame OCR will be the primary signal for this video.",
            }, indent=2, ensure_ascii=False), encoding="utf-8")
        else:
            vtt_path = candidates[0]
            segments = _parse_vtt_segments(vtt_path)
            data = {
                "video_id": video_id,
                "segments": segments,
                "plain": "\n".join(s["text"] for s in segments),
                "fetched_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                "source": "yt-dlp-vtt",
            }
            transcript_path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
            print(f" OK: parsed {len(segments)} segments from {vtt_path.name}")

    print("Step 2: download_video")
    video_path = ARTIFACTS / "video.mp4"
    result = download_video(URL, video_path)
    if result.is_err():
        print(f" ERR: {result.err.class_name}: {result.err.detail[:200]}")
        return 1
    print(f" OK: wrote {video_path} ({video_path.stat().st_size} bytes)")
    print(f" log: {result.value['log']}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -0,0 +1,37 @@
|
||||
"""Generic Phase 2 Keyframes driver."""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[4]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from scripts.video_analysis.extract_keyframes import extract_keyframes
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: extract keyframes for one campaign child.

    ``--video`` and ``--output-dir`` default to ``video.mp4`` and ``frames``
    inside ``conductor/tracks/video_analysis_<slug>_20260621/artifacts`` under
    ``ROOT``. The output directory is created if missing.

    Returns:
        0 when ``extract_keyframes`` succeeds, 1 when it reports an error.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("slug")
    cli.add_argument("--video", required=False)
    cli.add_argument("--output-dir", required=False)
    cli.add_argument("--threshold", type=float, default=0.4)
    opts = cli.parse_args()

    default_root = ROOT / "conductor" / "tracks" / f"video_analysis_{opts.slug}_20260621" / "artifacts"
    if opts.video:
        video = Path(opts.video)
    else:
        video = default_root / "video.mp4"
    if opts.output_dir:
        output = Path(opts.output_dir)
    else:
        output = default_root / "frames"

    print(f"Phase 2 Keyframes for {video}")
    output.mkdir(parents=True, exist_ok=True)
    outcome = extract_keyframes(video, output, threshold=opts.threshold)
    if not outcome.is_err():
        print(f" OK: kept {outcome.value['kept']} frames")
        return 0
    print(f" ERR: {outcome.err.class_name}: {outcome.err.detail[:300]}")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,33 @@
|
||||
"""Phase 2 Keyframes driver for video_analysis_cs229_building_llms_20260621.
|
||||
|
||||
Invokes extract_keyframes + manual review note for child #1.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[4]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from scripts.video_analysis.extract_keyframes import extract_keyframes
|
||||
|
||||
ARTIFACTS = ROOT / "conductor" / "tracks" / "video_analysis_cs229_building_llms_20260621" / "artifacts"
|
||||
VIDEO = ARTIFACTS / "video.mp4"
|
||||
FRAMES = ARTIFACTS / "frames"
|
||||
|
||||
|
||||
def main() -> int:
    """Extract keyframes from ``VIDEO`` into ``FRAMES`` with threshold 0.4.

    The frames directory is created if missing. On success the kept and
    extracted counts plus the metadata path are printed.

    Returns:
        0 when ``extract_keyframes`` succeeds, 1 when it reports an error.
    """
    print(f"Phase 2 Keyframes for {VIDEO}")
    FRAMES.mkdir(parents=True, exist_ok=True)
    outcome = extract_keyframes(VIDEO, FRAMES, threshold=0.4)
    if not outcome.is_err():
        summary = outcome.value
        print(f" OK: kept {summary['kept']} frames (from {summary['meta']['total_extracted']} extracted)")
        print(f" meta: {FRAMES / 'extraction_meta.json'}")
        return 0
    print(f" ERR: {outcome.err.class_name}: {outcome.err.detail[:300]}")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,39 @@
|
||||
"""Generic Phase 3 OCR driver."""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[4]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from scripts.video_analysis.ocr_frames import ocr_frames
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: OCR the keyframes of one campaign child.

    ``--frames-dir`` and ``--output`` default to ``frames`` and ``ocr.md``
    inside ``conductor/tracks/video_analysis_<slug>_20260621/artifacts`` under
    ``ROOT``; ``--backend`` defaults to ``"winsdk"``.

    Returns:
        0 when ``ocr_frames`` succeeds, 1 when it reports an error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("slug")
    parser.add_argument("--frames-dir", required=False)
    parser.add_argument("--output", required=False)
    parser.add_argument("--backend", default="winsdk")
    args = parser.parse_args()

    track_dir = ROOT / "conductor" / "tracks" / f"video_analysis_{args.slug}_20260621" / "artifacts"
    frames = Path(args.frames_dir) if args.frames_dir else track_dir / "frames"
    output = Path(args.output) if args.output else track_dir / "ocr.md"

    print(f"Phase 3 OCR for {frames} ({args.backend})")
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments made while OCR is running.
    t0 = time.perf_counter()
    result = ocr_frames(frames, output, backend=args.backend)
    elapsed = time.perf_counter() - t0
    if result.is_err():
        print(f" ERR: {result.err.class_name}: {result.err.detail[:300]}")
        return 1
    print(f" OK: OCR'd {result.value['frames_ocrd']} frames in {elapsed:.1f}s")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -0,0 +1,35 @@
|
||||
"""Phase 3 OCR driver for video_analysis_cs229_building_llms_20260621.
|
||||
|
||||
Invokes ocr_frames with winsdk backend on the extracted keyframes.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
ROOT = Path(__file__).resolve().parents[4]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from scripts.video_analysis.ocr_frames import ocr_frames
|
||||
|
||||
ARTIFACTS = ROOT / "conductor" / "tracks" / "video_analysis_cs229_building_llms_20260621" / "artifacts"
|
||||
FRAMES = ARTIFACTS / "frames"
|
||||
OUTPUT = ARTIFACTS / "ocr.md"
|
||||
|
||||
|
||||
def main() -> int:
    """OCR the frames in ``FRAMES`` into ``OUTPUT`` using the winsdk backend.

    On success the frame count, total time, time per frame, and output size are
    printed.

    Returns:
        0 when ``ocr_frames`` succeeds, 1 when it reports an error.
    """
    print(f"Phase 3 OCR for {FRAMES} (winsdk backend)")
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments made while OCR is running.
    t0 = time.perf_counter()
    result = ocr_frames(FRAMES, OUTPUT, backend="winsdk")
    elapsed = time.perf_counter() - t0
    if result.is_err():
        print(f" ERR: {result.err.class_name}: {result.err.detail[:300]}")
        return 1
    # max(1, ...) guards the per-frame average against a zero frame count.
    print(f" OK: OCR'd {result.value['frames_ocrd']} frames in {elapsed:.1f}s ({elapsed/max(1,result.value['frames_ocrd']):.2f}s/frame)")
    print(f" output: {OUTPUT} ({OUTPUT.stat().st_size} bytes)")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
@@ -9,8 +10,6 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
from scripts.video_analysis.error_types import ErrorInfo, make_error
|
||||
|
||||
|
||||
@@ -69,13 +68,35 @@ def format_transcript_json(video_id: str, segments: list[dict[str, Any]]) -> dic
|
||||
}
|
||||
|
||||
|
||||
def _fetch_raw_transcript(video_id: str) -> list[dict[str, Any]]:
    """Fetch a transcript via ``YouTubeTranscriptApi`` as plain dicts.

    Each fetched snippet is converted to ``{"start", "duration", "text"}`` with
    the numeric fields coerced to ``float`` and the text to ``str``.
    """
    snippets = YouTubeTranscriptApi().fetch(video_id)
    rows: list[dict[str, Any]] = []
    for snippet in snippets:
        rows.append(
            {
                "start": float(snippet.start),
                "duration": float(snippet.duration),
                "text": str(snippet.text),
            }
        )
    return rows
|
||||
def _parse_vtt_segments(vtt_path: Path) -> list[dict[str, Any]]:
    """Parse a WebVTT subtitle file into transcript segments.

    The file is split into blank-line-separated blocks. A block becomes a
    segment when it contains an ``HH:MM:SS.mmm -->`` timing line followed by at
    least one non-empty text line.

    Args:
        vtt_path: Path to a UTF-8 encoded ``.vtt`` file.

    Returns:
        ``{"start", "duration", "text"}`` dicts in file order. ``start`` is in
        seconds; ``duration`` is ``end - start`` when the end timestamp can be
        parsed and ``0.0`` otherwise; ``text`` is the cue's lines joined by
        single spaces.
    """
    # The end timestamp is optional so that any timing line the start-only
    # pattern used to accept is still accepted.
    timing = re.compile(
        r"(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+-->"
        r"(?:\s*(\d{2}):(\d{2}):(\d{2})\.(\d{3}))?"
    )

    def _seconds(h: str, m: str, s: str, ms: str) -> float:
        return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000.0

    segments: list[dict[str, Any]] = []
    for block in re.split(r"\n\n+", vtt_path.read_text(encoding="utf-8")):
        match = timing.search(block)
        if not match:
            continue  # blocks without a timing line carry no cue
        fields = match.groups()
        start = _seconds(*fields[:4])
        duration = 0.0
        if fields[4] is not None:
            # Rounded to millisecond precision to hide float subtraction noise.
            duration = max(0.0, round(_seconds(*fields[4:]) - start, 3))
        # Only lines after the timing line are cue text; anything before it in
        # the block (for example a cue identifier) is not spoken content. The
        # first split element is the rest of the timing line (cue settings).
        cue_body = block[match.end():].split("\n")[1:]
        lines = [ln.strip() for ln in cue_body if ln.strip() and "-->" not in ln]
        text_content = " ".join(lines)
        if text_content:
            segments.append({"start": start, "duration": duration, "text": text_content})
    return segments
|
||||
|
||||
|
||||
def _fetch_via_ytdlp(video_id: str, working_dir: Path) -> list[dict[str, Any]]:
    """Download auto-generated English VTT subtitles with yt-dlp and parse them.

    yt-dlp is run with ``--skip-download`` and an output template of
    ``working_dir / video_id``; the first ``<video_id>*.vtt`` file found in
    ``working_dir`` afterwards is parsed with ``_parse_vtt_segments``.

    Raises:
        RuntimeError: If no matching ``.vtt`` file exists after the run; the
            message carries the first 300 characters of yt-dlp's stderr.
    """
    command = [
        "yt-dlp",
        "--write-auto-subs",
        "--sub-langs", "en",
        "--sub-format", "vtt",
        "--skip-download",
        "--output", str(working_dir / video_id),
        f"https://youtu.be/{video_id}",
    ]
    proc = subprocess.run(command, capture_output=True, text=True)
    # Success is judged by the presence of a subtitle file, not by the exit code.
    vtt_files = list(working_dir.glob(f"{video_id}*.vtt"))
    if vtt_files:
        return _parse_vtt_segments(vtt_files[0])
    raise RuntimeError(f"yt-dlp VTT fetch failed: {proc.stderr[:300]}")
|
||||
|
||||
|
||||
def extract_transcript(url_or_id: str, output: Path, retries: int = 3) -> _Ok | _Err:
|
||||
@@ -83,19 +104,16 @@ def extract_transcript(url_or_id: str, output: Path, retries: int = 3) -> _Ok |
|
||||
if parsed.is_err():
|
||||
return parsed
|
||||
video_id = parsed.value
|
||||
output.parent.mkdir(parents=True, exist_ok=True)
|
||||
last_exc: Exception | None = None
|
||||
segments: list[dict[str, Any]] = []
|
||||
for attempt in range(retries):
|
||||
try:
|
||||
segments = _fetch_raw_transcript(video_id)
|
||||
break
|
||||
segments = _fetch_via_ytdlp(video_id, output.parent)
|
||||
data = format_transcript_json(video_id, segments)
|
||||
output.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
|
||||
return ok(data)
|
||||
except Exception as e:
|
||||
last_exc = e
|
||||
if attempt < retries - 1:
|
||||
time.sleep(2 ** attempt)
|
||||
if not segments:
|
||||
return err(make_error("NetworkError", "fetch", str(last_exc) if last_exc else "no segments"))
|
||||
data = format_transcript_json(video_id, segments)
|
||||
output.parent.mkdir(parents=True, exist_ok=True)
|
||||
output.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
|
||||
return ok(data)
|
||||
return err(make_error("TranscriptFetchError", "fetch", str(last_exc) if last_exc else "no segments"))
|
||||
|
||||
Reference in New Issue
Block a user