338573b1e8
youtube-transcript-api v1.2.4 returns an XML parse error on an empty response for ALL videos in this campaign, whereas yt-dlp's --write-auto-subs reliably returns thousands of segments per video. Switched to yt-dlp as the primary path. Tests updated to mock _fetch_via_ytdlp instead of _fetch_raw_transcript. 8/8 tests passing.
78 lines
2.6 KiB
Python
78 lines
2.6 KiB
Python
"""Tests for scripts/video_analysis/extract_transcript.py.
|
|
|
|
Per conductor/code_styleguides/error_handling.md, success returns Result.ok; failure returns Result.err with ErrorInfo.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
from scripts.video_analysis.extract_transcript import (
|
|
extract_transcript,
|
|
format_transcript_json,
|
|
parse_video_id,
|
|
)
|
|
|
|
|
|
def test_parse_video_id_youtu_be() -> None:
    """A short youtu.be link parses successfully to the id in its path."""
    short_link = "https://youtu.be/9vM4p9NN0Ts"
    parsed = parse_video_id(short_link)

    assert parsed.is_ok()
    assert parsed.value == "9vM4p9NN0Ts"
|
|
|
|
|
|
def test_parse_video_id_full_url() -> None:
    """A full watch URL parses successfully to the value of its ``v`` parameter."""
    watch_url = "https://www.youtube.com/watch?v=0yF9TvMeAzM"
    parsed = parse_video_id(watch_url)

    assert parsed.is_ok()
    assert parsed.value == "0yF9TvMeAzM"
|
|
|
|
|
|
def test_parse_video_id_already_id() -> None:
    """A bare video id is accepted and returned unchanged."""
    bare_id = "yxkUvXs-hoQ"
    parsed = parse_video_id(bare_id)

    assert parsed.is_ok()
    assert parsed.value == "yxkUvXs-hoQ"
|
|
|
|
|
|
def test_parse_video_id_invalid() -> None:
    """Input that is neither a recognised URL nor an id yields an error result."""
    parsed = parse_video_id("not-a-url")

    assert parsed.is_err()
|
|
|
|
|
|
def test_extract_transcript_success(tmp_path: Path) -> None:
    """A successful fetch writes the transcript JSON to the requested path.

    The yt-dlp fetch helper is patched to return two canned segments; the
    written file must carry the video id, both segments, and the segment
    texts joined by newlines under ``plain``.
    """
    output_file = tmp_path / "transcript.json"
    stub_segments = [
        {"start": 0.0, "duration": 5.0, "text": "Hello world"},
        {"start": 5.0, "duration": 3.0, "text": "Goodbye world"},
    ]

    # Replace the fetch helper so the test never invokes the real downloader.
    with patch(
        "scripts.video_analysis.extract_transcript._fetch_via_ytdlp",
        return_value=stub_segments,
    ):
        outcome = extract_transcript("https://youtu.be/ABCDEFGHIJK", output_file)

    assert outcome.is_ok()

    payload = json.loads(output_file.read_text())
    assert payload["video_id"] == "ABCDEFGHIJK"
    assert len(payload["segments"]) == 2
    assert payload["plain"] == "Hello world\nGoodbye world"
|
|
|
|
|
|
def test_extract_transcript_network_error(tmp_path: Path) -> None:
    """An exception raised by the fetch helper surfaces as an error result."""
    output_file = tmp_path / "transcript.json"

    # Every call to the patched helper raises, simulating an unreachable network.
    with patch(
        "scripts.video_analysis.extract_transcript._fetch_via_ytdlp",
        side_effect=Exception("network unreachable"),
    ):
        outcome = extract_transcript("https://youtu.be/ABCDEFGHIJK", output_file)

    assert outcome.is_err()
|
|
|
|
|
|
def test_extract_transcript_retries_then_fails(tmp_path: Path) -> None:
    """With ``retries=2`` and a fetch that always raises, the helper is called twice and the result is an error."""
    output_file = tmp_path / "transcript.json"

    with patch(
        "scripts.video_analysis.extract_transcript._fetch_via_ytdlp",
        side_effect=Exception("transient"),
    ) as fetch_stub:
        outcome = extract_transcript(
            "https://youtu.be/ABCDEFGHIJK",
            output_file,
            retries=2,
        )

    assert outcome.is_err()
    # The mock keeps its call record after the patch context exits.
    assert fetch_stub.call_count == 2
|
|
|
|
|
|
def test_format_transcript_json_structure() -> None:
    """The formatted payload exposes ``video_id``, ``segments``, ``plain`` and ``fetched_at``."""
    single_segment = [{"start": 0.0, "duration": 5.0, "text": "Hello"}]

    payload = format_transcript_json("VID123", single_segment)

    assert payload["video_id"] == "VID123"
    assert payload["segments"] == single_segment
    assert payload["plain"] == "Hello"
    assert "fetched_at" in payload
|