Private
Public Access
0
0
Files
manual_slop/tests/test_video_analysis_extract_transcript.py
T
ed 338573b1e8 refactor(video_analysis): extract_transcript.py uses yt-dlp VTT directly (skip youtube-transcript-api which consistently fails for these videos)
youtube-transcript-api v1.2.4 fails with an XML parse error on an empty response for ALL videos in this campaign. yt-dlp's --write-auto-subs reliably returns thousands of segments per video, so yt-dlp is now the primary path.

Tests updated to mock _fetch_via_ytdlp instead of _fetch_raw_transcript. 8/8 tests passing.
2026-06-21 16:33:44 -04:00

78 lines
2.6 KiB
Python

"""Tests for scripts/video_analysis/extract_transcript.py.
Per conductor/code_styleguides/error_handling.md, success returns Result.ok; failure returns Result.err with ErrorInfo.
"""
from __future__ import annotations
import json
from pathlib import Path
from unittest.mock import patch
from scripts.video_analysis.extract_transcript import (
extract_transcript,
format_transcript_json,
parse_video_id,
)
def test_parse_video_id_youtu_be() -> None:
    """A youtu.be short link parses to an ok result holding the id from its path."""
    short_link = "https://youtu.be/9vM4p9NN0Ts"
    parsed = parse_video_id(short_link)
    assert parsed.is_ok()
    assert parsed.value == "9vM4p9NN0Ts"
def test_parse_video_id_full_url() -> None:
    """A full watch URL parses to an ok result holding the value of its ``v`` parameter."""
    watch_url = "https://www.youtube.com/watch?v=0yF9TvMeAzM"
    parsed = parse_video_id(watch_url)
    assert parsed.is_ok()
    assert parsed.value == "0yF9TvMeAzM"
def test_parse_video_id_already_id() -> None:
    """A bare video id is accepted and returned unchanged in an ok result."""
    bare_id = "yxkUvXs-hoQ"
    parsed = parse_video_id(bare_id)
    assert parsed.is_ok()
    assert parsed.value == "yxkUvXs-hoQ"
def test_parse_video_id_invalid() -> None:
    """Input that is neither a recognised URL nor an id yields an err result."""
    rejected = parse_video_id("not-a-url")
    assert rejected.is_err()
def test_extract_transcript_success(tmp_path: Path) -> None:
    """With the fetch helper stubbed, extraction succeeds and writes the transcript JSON.

    The written file is expected to carry the parsed video id, both stubbed
    segments, and a newline-joined plain-text rendering of their text.
    """
    out_file = tmp_path / "transcript.json"
    canned_segments = [
        {"start": 0.0, "duration": 5.0, "text": "Hello world"},
        {"start": 5.0, "duration": 3.0, "text": "Goodbye world"},
    ]
    # Replace the yt-dlp fetch helper so it hands back the canned segments.
    with patch(
        "scripts.video_analysis.extract_transcript._fetch_via_ytdlp",
        return_value=canned_segments,
    ):
        outcome = extract_transcript("https://youtu.be/ABCDEFGHIJK", out_file)
    assert outcome.is_ok()
    written = json.loads(out_file.read_text())
    assert written["video_id"] == "ABCDEFGHIJK"
    assert len(written["segments"]) == 2
    assert written["plain"] == "Hello world\nGoodbye world"
def test_extract_transcript_network_error(tmp_path: Path) -> None:
    """An exception raised by the fetch helper surfaces as an err result."""
    fetch_target = "scripts.video_analysis.extract_transcript._fetch_via_ytdlp"
    # Every call to the stubbed helper raises the same exception.
    with patch(fetch_target, side_effect=Exception("network unreachable")):
        outcome = extract_transcript(
            "https://youtu.be/ABCDEFGHIJK",
            tmp_path / "transcript.json",
        )
    assert outcome.is_err()
def test_extract_transcript_retries_then_fails(tmp_path: Path) -> None:
    """With ``retries=2`` and a helper that always raises, the helper is called twice."""
    fetch_target = "scripts.video_analysis.extract_transcript._fetch_via_ytdlp"
    with patch(fetch_target, side_effect=Exception("transient")) as fetch_stub:
        outcome = extract_transcript(
            "https://youtu.be/ABCDEFGHIJK",
            tmp_path / "transcript.json",
            retries=2,
        )
    assert outcome.is_err()
    # The stub keeps its call record after the patch context exits.
    assert fetch_stub.call_count == 2
def test_format_transcript_json_structure() -> None:
    """The formatted payload exposes video_id, segments, plain and fetched_at keys."""
    only_segment = {"start": 0.0, "duration": 5.0, "text": "Hello"}
    segments = [only_segment]
    payload = format_transcript_json("VID123", segments)
    assert payload["video_id"] == "VID123"
    assert payload["segments"] == segments
    assert payload["plain"] == "Hello"
    # Only the presence of the timestamp is checked, not its value.
    assert "fetched_at" in payload