Private
Public Access
0
0

Merge branch 'master' of C:\projects\manual_slop into tier2/any_type_componentization_20260621

This commit is contained in:
2026-06-21 17:46:57 -04:00
356 changed files with 72559 additions and 69 deletions
@@ -0,0 +1,48 @@
"""Quick dedup pass for entropy_epiplexity (frames extracted but not deduped)."""
from __future__ import annotations
import json
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parents[4]
sys.path.insert(0, str(ROOT))
from PIL import Image
import imagehash
def main() -> int:
    """Delete near-duplicate frames and write ``extraction_meta.json``.

    Frames named ``frame_*.jpg`` are visited in sorted order. Each frame's
    ``imagehash.phash`` value is compared (via ``_hamming``) against the
    hashes of every frame kept so far; a frame whose distance to any kept
    hash is below 5 is deleted from disk, otherwise it is kept.

    Returns:
        0 after the metadata file has been written, 1 when the frames
        directory does not exist.
    """
    frames_dir = ROOT / "conductor" / "tracks" / "video_analysis_entropy_epiplexity_20260621" / "artifacts" / "frames"
    if not frames_dir.is_dir():
        # Without this guard the failure only surfaces at the final
        # write_text() call, as a FileNotFoundError traceback.
        print(f"Frames directory not found: {frames_dir}")
        return 1

    frame_files = sorted(frames_dir.glob("frame_*.jpg"))
    print(f"Total frames: {len(frame_files)}")

    saved_hashes: list[str] = []
    kept_files: list[str] = []
    for fp in frame_files:
        # Close the image explicitly before a possible unlink(): an open
        # handle blocks file deletion on Windows.
        with Image.open(fp) as img:
            h = str(imagehash.phash(img))
        if any(_hamming(h, s) < 5 for s in saved_hashes):
            fp.unlink()
            continue
        saved_hashes.append(h)
        kept_files.append(fp.name)
    print(f"Kept: {len(kept_files)}")

    meta = {
        "video": "video.mp4",
        # NOTE(review): 0.05 is recorded as-is; the dedup test above uses
        # the fixed `< 5` distance, not this value -- confirm intent.
        "threshold": 0.05,
        "total_extracted": len(frame_files),
        "kept": len(kept_files),
        "files": kept_files,
    }
    (frames_dir / "extraction_meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")
    return 0
def _hamming(a: str, b: str) -> int:
if len(a) != len(b):
return max(len(a), len(b))
return sum(1 for x, y in zip(a, b) if x != y)
if __name__ == "__main__":
sys.exit(main())
@@ -0,0 +1,16 @@
from pathlib import Path
# Print the OCR sections of the probability_logic track that look like
# presentation slides (keyword hit) and not like chat overlays.
# Read as UTF-8 explicitly: without an encoding, read_text() uses the
# platform locale (e.g. cp1252 on Windows) and can fail or mis-decode.
# NOTE(review): assumes ocr.md is written as UTF-8 -- confirm against the writer.
content = Path('conductor/tracks/video_analysis_probability_logic_20260621/artifacts/ocr.md').read_text(encoding='utf-8')

# Loop invariants, hoisted out of the per-section loop.
fence = chr(96)*3  # three backticks, i.e. a Markdown code fence
chat_markers = ('Streamer Mode', 'Yesterday at', '21:43')
presentation_markers = ('Definition', 'Logic', 'Probabil', 'Bayesian', 'Frequentist', 'Boolean', 'Lattice', 'Inference', 'Sum Rule', 'Product Rule')

# Everything before the first '## ' heading is preamble and is skipped.
for chunk in content.split('## ')[1:]:
    name = chunk.split('\n')[0].strip()
    body = chunk
    if fence in body:
        # Keep only what sits between the first pair of fences.
        body = body.split(fence)[1]
    is_chat = any(marker in body for marker in chat_markers)
    has_pres = any(marker in body for marker in presentation_markers)
    if has_pres and not is_chat:
        print(f'=== {name} ===')
        print(body[:800])
        print()
@@ -0,0 +1,111 @@
"""Generic Phase 1 Acquire driver for video_analysis_campaign children.
Reads the child spec from CLI args: slug + URL (+ optional --artifacts-dir override).
Fetches the transcript via yt-dlp VTT subtitles (_fetch_via_ytdlp), then calls download_video.
"""
from __future__ import annotations
import argparse
import json
import re
import subprocess
import sys
import time
from pathlib import Path
ROOT = Path(__file__).resolve().parents[4]
sys.path.insert(0, str(ROOT))
from scripts.video_analysis.download_video import download_video
from scripts.video_analysis.extract_transcript import _fetch_via_ytdlp as _fetch_raw_transcript
def _parse_vtt_segments(vtt_path: Path) -> list[dict]:
text = vtt_path.read_text(encoding="utf-8")
segments: list[dict] = []
pattern = re.compile(r"(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+-->", re.MULTILINE)
blocks = re.split(r"\n\n+", text)
for block in blocks:
match = pattern.search(block)
if not match:
continue
h, m, s, ms = match.groups()
start = int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000.0
lines = [ln.strip() for ln in block.split("\n") if ln.strip() and not pattern.match(ln) and "-->" not in ln]
text_content = " ".join(lines)
if text_content:
segments.append({"start": start, "duration": 0.0, "text": text_content})
return segments
def phase1_acquire(slug: str, url: str, artifacts_dir: Path) -> dict:
    """Fetch the transcript and the video file for one campaign child.

    Steps:
      1. Parse the 11-character video id out of ``url``.
      2. Try up to three times (sleeping 1 s, then 2 s between attempts) to
         fetch segments and write ``transcript.json``. When every attempt
         fails, a placeholder transcript with ``"source": "unavailable"``
         is written instead.
      3. Download the video to ``video.mp4`` via ``download_video``.

    Args:
        slug: Child identifier; only used in the progress output here.
        url: Video URL containing ``youtu.be/<id>`` or ``v=<id>``.
        artifacts_dir: Directory for the outputs; created when missing.

    Returns:
        ``{"status": "ok", "video_path": ..., "transcript_path": ...}`` on
        success, or ``{"status": "error", "error": ...}`` when the id
        cannot be parsed or the download reports an error.
    """
    print(f"Phase 1 Acquire for {slug}: {url}")
    print(f"Artifacts: {artifacts_dir}")
    artifacts_dir.mkdir(parents=True, exist_ok=True)

    id_match = re.search(r"(?:youtu\.be/|v=)([A-Za-z0-9_-]{11})", url)
    if id_match is None:
        return {"status": "error", "error": f"Could not parse video_id from {url}"}
    video_id = id_match.group(1)

    print("Step 1: extract_transcript (yt-dlp VTT directly)")
    transcript_path = artifacts_dir / "transcript.json"

    def now_utc() -> str:
        return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    def save_transcript(payload: dict) -> None:
        transcript_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")

    failure = None
    for attempt in range(3):
        try:
            segments = _fetch_raw_transcript(video_id, artifacts_dir)
            save_transcript({
                "video_id": video_id,
                "segments": segments,
                "plain": "\n".join(seg["text"] for seg in segments),
                "fetched_at": now_utc(),
                "source": "yt-dlp-vtt",
            })
            print(f" OK: wrote {transcript_path} ({len(segments)} segments)")
            break
        except Exception as exc:
            failure = exc
            print(f" attempt {attempt+1} failed: {type(exc).__name__}: {str(exc)[:200]}")
            if attempt < 2:
                time.sleep(2 ** attempt)  # back off before the next try
    else:
        # Reached only when none of the three attempts hit `break`.
        print(" yt-dlp VTT fetch failed after 3 attempts. No transcript available.")
        save_transcript({
            "video_id": video_id,
            "segments": [],
            "plain": "",
            "fetched_at": now_utc(),
            "source": "unavailable",
            "error": str(failure)[:500],
            "note": "Frame OCR will be the primary signal for this video.",
        })

    print("Step 2: download_video")
    video_path = artifacts_dir / "video.mp4"
    download = download_video(url, video_path)
    if download.is_err():
        return {"status": "error", "error": f"download_video: {download.err.class_name}: {download.err.detail[:200]}"}
    print(f" OK: wrote {video_path} ({video_path.stat().st_size} bytes)")
    return {"status": "ok", "video_path": str(video_path), "transcript_path": str(transcript_path)}
def main() -> int:
    """Parse the command line and run Phase 1 for the selected child.

    CLI: positional ``slug`` and ``url``; optional ``--artifacts-dir``.
    Without the option, the artifacts directory defaults to
    ``conductor/tracks/video_analysis_<slug>_20260621/artifacts`` under ROOT.

    Returns:
        0 when ``phase1_acquire`` reports status ``"ok"``, otherwise 1.
    """
    cli = argparse.ArgumentParser()
    for positional in ("slug", "url"):
        cli.add_argument(positional)
    cli.add_argument("--artifacts-dir", required=False)
    opts = cli.parse_args()

    target_dir = (
        Path(opts.artifacts_dir)
        if opts.artifacts_dir
        else ROOT / "conductor" / "tracks" / f"video_analysis_{opts.slug}_20260621" / "artifacts"
    )
    outcome = phase1_acquire(opts.slug, opts.url, target_dir)
    print(json.dumps(outcome, indent=2))
    if outcome["status"] == "ok":
        return 0
    return 1
if __name__ == "__main__":
sys.exit(main())
@@ -0,0 +1,117 @@
"""Phase 1 Acquire driver for video_analysis_cs229_building_llms_20260621.
Strategy: youtube-transcript-api fails for this video (R5: XML parse error on empty response,
likely a YouTube API restriction). Fall back to yt-dlp's own subtitle extraction.
"""
from __future__ import annotations
import json
import re
import subprocess
import sys
import time
from pathlib import Path
ROOT = Path(__file__).resolve().parents[4]
sys.path.insert(0, str(ROOT))
from scripts.video_analysis.download_video import download_video
from scripts.video_analysis.extract_transcript import _fetch_raw_transcript
URL = "https://youtu.be/9vM4p9NN0Ts"
ARTIFACTS = ROOT / "conductor" / "tracks" / "video_analysis_cs229_building_llms_20260621" / "artifacts"
ARTIFACTS.mkdir(parents=True, exist_ok=True)
def _parse_vtt_segments(vtt_path: Path) -> list[dict]:
text = vtt_path.read_text(encoding="utf-8")
segments: list[dict] = []
pattern = re.compile(r"(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+-->", re.MULTILINE)
blocks = re.split(r"\n\n+", text)
for block in blocks:
match = pattern.search(block)
if not match:
continue
h, m, s, ms = match.groups()
start = int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000.0
lines = [ln.strip() for ln in block.split("\n") if ln.strip() and not pattern.match(ln) and "-->" not in ln]
text_content = " ".join(lines)
if text_content:
segments.append({"start": start, "duration": 0.0, "text": text_content})
return segments
def main() -> int:
    """Acquire the transcript and video for the cs229_building_llms track.

    The transcript is first requested through ``_fetch_raw_transcript``
    (three attempts, sleeping 1 s then 2 s in between). When all attempts
    fail, yt-dlp is asked for auto-generated English VTT subtitles and the
    first matching ``.vtt`` file is parsed; if none appears, a placeholder
    transcript with ``"source": "unavailable"`` is written. The video is
    downloaded afterwards in every case.

    Returns:
        0 when the video download succeeds, 1 when it reports an error.
    """
    print(f"Phase 1 Acquire for {URL}")
    print(f"Artifacts: {ARTIFACTS}")
    print("Step 1: extract_transcript (try youtube-transcript-api)")
    transcript_path = ARTIFACTS / "transcript.json"
    video_id = "9vM4p9NN0Ts"

    def now_utc() -> str:
        return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    def save_transcript(payload: dict) -> None:
        transcript_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")

    api_error = None
    transcript_saved = False
    for attempt in range(3):
        try:
            segments = _fetch_raw_transcript(video_id)
            save_transcript({
                "video_id": video_id,
                "segments": segments,
                "plain": "\n".join(seg["text"] for seg in segments),
                "fetched_at": now_utc(),
                "source": "youtube-transcript-api",
            })
            print(f" OK: wrote {transcript_path} ({len(segments)} segments)")
            transcript_saved = True
            break
        except Exception as exc:
            api_error = exc
            print(f" attempt {attempt+1} failed: {type(exc).__name__}: {str(exc)[:200]}")
            if attempt < 2:
                time.sleep(2 ** attempt)  # back off before the next try

    if not transcript_saved:
        print(" youtube-transcript-api failed after 3 attempts. Falling back to yt-dlp subtitles.")
        print("Step 1b: yt-dlp subtitle fallback")
        completed = subprocess.run(
            ["yt-dlp", "--write-auto-subs", "--sub-langs", "en", "--sub-format", "vtt",
             "--skip-download", "--output", str(ARTIFACTS / video_id), URL],
            capture_output=True, text=True,
        )
        # Success is judged by a subtitle file being present, not by the exit code.
        found = list(ARTIFACTS.glob(f"{video_id}*.vtt"))
        if found:
            vtt_path = found[0]
            segments = _parse_vtt_segments(vtt_path)
            save_transcript({
                "video_id": video_id,
                "segments": segments,
                "plain": "\n".join(seg["text"] for seg in segments),
                "fetched_at": now_utc(),
                "source": "yt-dlp-vtt",
            })
            print(f" OK: parsed {len(segments)} segments from {vtt_path.name}")
        else:
            print(f" yt-dlp subtitle fetch also failed: {completed.stderr[:300]}")
            print(" No transcript available. Continuing with download only.")
            save_transcript({
                "video_id": video_id,
                "segments": [],
                "plain": "",
                "fetched_at": now_utc(),
                "source": "unavailable",
                "error": str(api_error)[:500] if api_error else None,
                "note": "youtube-transcript-api failed with XML parse error (R5). yt-dlp subtitles also unavailable. Frame OCR will be the primary signal for this video.",
            })

    print("Step 2: download_video")
    video_path = ARTIFACTS / "video.mp4"
    download = download_video(URL, video_path)
    if download.is_err():
        print(f" ERR: {download.err.class_name}: {download.err.detail[:200]}")
        return 1
    print(f" OK: wrote {video_path} ({video_path.stat().st_size} bytes)")
    print(f" log: {download.value['log']}")
    return 0
if __name__ == "__main__":
sys.exit(main())
@@ -0,0 +1,37 @@
"""Generic Phase 2 Keyframes driver."""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parents[4]
sys.path.insert(0, str(ROOT))
from scripts.video_analysis.extract_keyframes import extract_keyframes
def main() -> int:
    """Run keyframe extraction for the campaign child named on the command line.

    CLI: positional ``slug``; optional ``--video``, ``--output-dir`` and
    ``--threshold`` (float, default 0.4). Paths that are not given default
    to ``video.mp4`` and ``frames`` inside the slug's track artifacts
    directory under ROOT.

    Returns:
        0 when ``extract_keyframes`` succeeds, 1 when it reports an error.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("slug")
    for path_option in ("--video", "--output-dir"):
        cli.add_argument(path_option, required=False)
    cli.add_argument("--threshold", type=float, default=0.4)
    opts = cli.parse_args()

    artifacts = ROOT / "conductor" / "tracks" / f"video_analysis_{opts.slug}_20260621" / "artifacts"
    video = artifacts / "video.mp4"
    if opts.video:
        video = Path(opts.video)
    frames_out = artifacts / "frames"
    if opts.output_dir:
        frames_out = Path(opts.output_dir)

    print(f"Phase 2 Keyframes for {video}")
    frames_out.mkdir(parents=True, exist_ok=True)
    outcome = extract_keyframes(video, frames_out, threshold=opts.threshold)
    if not outcome.is_err():
        print(f" OK: kept {outcome.value['kept']} frames")
        return 0
    print(f" ERR: {outcome.err.class_name}: {outcome.err.detail[:300]}")
    return 1
if __name__ == "__main__":
sys.exit(main())
@@ -0,0 +1,33 @@
"""Phase 2 Keyframes driver for video_analysis_cs229_building_llms_20260621.
Invokes extract_keyframes + manual review note for child #1.
"""
from __future__ import annotations
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parents[4]
sys.path.insert(0, str(ROOT))
from scripts.video_analysis.extract_keyframes import extract_keyframes
ARTIFACTS = ROOT / "conductor" / "tracks" / "video_analysis_cs229_building_llms_20260621" / "artifacts"
VIDEO = ARTIFACTS / "video.mp4"
FRAMES = ARTIFACTS / "frames"
def main() -> int:
    """Extract keyframes from VIDEO into FRAMES with a fixed 0.4 threshold.

    Returns:
        0 when ``extract_keyframes`` succeeds, 1 when it reports an error.
    """
    print(f"Phase 2 Keyframes for {VIDEO}")
    FRAMES.mkdir(parents=True, exist_ok=True)
    outcome = extract_keyframes(VIDEO, FRAMES, threshold=0.4)
    if not outcome.is_err():
        print(f" OK: kept {outcome.value['kept']} frames (from {outcome.value['meta']['total_extracted']} extracted)")
        print(f" meta: {FRAMES / 'extraction_meta.json'}")
        return 0
    print(f" ERR: {outcome.err.class_name}: {outcome.err.detail[:300]}")
    return 1
if __name__ == "__main__":
sys.exit(main())
@@ -0,0 +1,39 @@
"""Generic Phase 3 OCR driver."""
from __future__ import annotations
import argparse
import sys
import time
from pathlib import Path
ROOT = Path(__file__).resolve().parents[4]
sys.path.insert(0, str(ROOT))
from scripts.video_analysis.ocr_frames import ocr_frames
def main() -> int:
    """Run OCR over the frames of the campaign child named on the command line.

    CLI: positional ``slug``; optional ``--frames-dir``, ``--output`` and
    ``--backend`` (default ``"winsdk"``). Paths that are not given default
    to ``frames`` and ``ocr.md`` inside the slug's track artifacts
    directory under ROOT. The wall-clock duration of the OCR call is
    reported on success.

    Returns:
        0 when ``ocr_frames`` succeeds, 1 when it reports an error.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("slug")
    for path_option in ("--frames-dir", "--output"):
        cli.add_argument(path_option, required=False)
    cli.add_argument("--backend", default="winsdk")
    opts = cli.parse_args()

    artifacts = ROOT / "conductor" / "tracks" / f"video_analysis_{opts.slug}_20260621" / "artifacts"
    frames = artifacts / "frames"
    if opts.frames_dir:
        frames = Path(opts.frames_dir)
    report = artifacts / "ocr.md"
    if opts.output:
        report = Path(opts.output)

    print(f"Phase 3 OCR for {frames} ({opts.backend})")
    started = time.time()
    outcome = ocr_frames(frames, report, backend=opts.backend)
    elapsed = time.time() - started
    if not outcome.is_err():
        print(f" OK: OCR'd {outcome.value['frames_ocrd']} frames in {elapsed:.1f}s")
        return 0
    print(f" ERR: {outcome.err.class_name}: {outcome.err.detail[:300]}")
    return 1
if __name__ == "__main__":
sys.exit(main())
@@ -0,0 +1,35 @@
"""Phase 3 OCR driver for video_analysis_cs229_building_llms_20260621.
Invokes ocr_frames with winsdk backend on the extracted keyframes.
"""
from __future__ import annotations
import sys
import time
from pathlib import Path
ROOT = Path(__file__).resolve().parents[4]
sys.path.insert(0, str(ROOT))
from scripts.video_analysis.ocr_frames import ocr_frames
ARTIFACTS = ROOT / "conductor" / "tracks" / "video_analysis_cs229_building_llms_20260621" / "artifacts"
FRAMES = ARTIFACTS / "frames"
OUTPUT = ARTIFACTS / "ocr.md"
def main() -> int:
    """OCR the frames in FRAMES into OUTPUT using the winsdk backend.

    On success the frame count, total wall-clock time, per-frame time and
    the size of the output file are printed.

    Returns:
        0 when ``ocr_frames`` succeeds, 1 when it reports an error.
    """
    print(f"Phase 3 OCR for {FRAMES} (winsdk backend)")
    started = time.time()
    outcome = ocr_frames(FRAMES, OUTPUT, backend="winsdk")
    elapsed = time.time() - started
    if not outcome.is_err():
        # max(1, ...) keeps the per-frame figure defined when nothing was OCR'd.
        print(f" OK: OCR'd {outcome.value['frames_ocrd']} frames in {elapsed:.1f}s ({elapsed/max(1,outcome.value['frames_ocrd']):.2f}s/frame)")
        print(f" output: {OUTPUT} ({OUTPUT.stat().st_size} bytes)")
        return 0
    print(f" ERR: {outcome.err.class_name}: {outcome.err.detail[:300]}")
    return 1
if __name__ == "__main__":
sys.exit(main())
+36 -18
View File
@@ -2,6 +2,7 @@ from __future__ import annotations
import json
import re
import subprocess
import time
from dataclasses import dataclass
from datetime import datetime, timezone
@@ -9,8 +10,6 @@ from pathlib import Path
from typing import Any
from urllib.parse import parse_qs, urlparse
from youtube_transcript_api import YouTubeTranscriptApi
from scripts.video_analysis.error_types import ErrorInfo, make_error
@@ -69,13 +68,35 @@ def format_transcript_json(video_id: str, segments: list[dict[str, Any]]) -> dic
}
def _fetch_raw_transcript(video_id: str) -> list[dict[str, Any]]:
    """Fetch a transcript through YouTubeTranscriptApi as plain dicts.

    Each fetched item is converted to a dict with float ``start`` and
    ``duration`` and string ``text``.
    """
    rows: list[dict[str, Any]] = []
    for snippet in YouTubeTranscriptApi().fetch(video_id):
        rows.append({
            "start": float(snippet.start),
            "duration": float(snippet.duration),
            "text": str(snippet.text),
        })
    return rows
def _parse_vtt_segments(vtt_path: Path) -> list[dict[str, Any]]:
text = vtt_path.read_text(encoding="utf-8")
segments: list[dict[str, Any]] = []
pattern = re.compile(r"(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+-->", re.MULTILINE)
blocks = re.split(r"\n\n+", text)
for block in blocks:
match = pattern.search(block)
if not match:
continue
h, m, s, ms = match.groups()
start = int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000.0
lines = [ln.strip() for ln in block.split("\n") if ln.strip() and not pattern.match(ln) and "-->" not in ln]
text_content = " ".join(lines)
if text_content:
segments.append({"start": start, "duration": 0.0, "text": text_content})
return segments
def _fetch_via_ytdlp(video_id: str, working_dir: Path) -> list[dict[str, Any]]:
completed = subprocess.run(
["yt-dlp", "--write-auto-subs", "--sub-langs", "en", "--sub-format", "vtt",
"--skip-download", "--output", str(working_dir / video_id),
f"https://youtu.be/{video_id}"],
capture_output=True, text=True,
)
candidates = list(working_dir.glob(f"{video_id}*.vtt"))
if not candidates:
raise RuntimeError(f"yt-dlp VTT fetch failed: {completed.stderr[:300]}")
return _parse_vtt_segments(candidates[0])
def extract_transcript(url_or_id: str, output: Path, retries: int = 3) -> _Ok | _Err:
@@ -83,19 +104,16 @@ def extract_transcript(url_or_id: str, output: Path, retries: int = 3) -> _Ok |
if parsed.is_err():
return parsed
video_id = parsed.value
output.parent.mkdir(parents=True, exist_ok=True)
last_exc: Exception | None = None
segments: list[dict[str, Any]] = []
for attempt in range(retries):
try:
segments = _fetch_raw_transcript(video_id)
break
segments = _fetch_via_ytdlp(video_id, output.parent)
data = format_transcript_json(video_id, segments)
output.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
return ok(data)
except Exception as e:
last_exc = e
if attempt < retries - 1:
time.sleep(2 ** attempt)
if not segments:
return err(make_error("NetworkError", "fetch", str(last_exc) if last_exc else "no segments"))
data = format_transcript_json(video_id, segments)
output.parent.mkdir(parents=True, exist_ok=True)
output.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
return ok(data)
return err(make_error("TranscriptFetchError", "fetch", str(last_exc) if last_exc else "no segments"))