initial commit

2026-02-19 15:37:05 -05:00
commit f4c8181cfc
6 changed files with 489 additions and 0 deletions
--- a/extract_frames.py
+++ b/extract_frames.py
@@ -0,0 +1,112 @@
+import os
+import re
+import sys
+import imagehash
+from PIL import Image
+import cv2
+import yt_dlp
+
+# Maps each transcript file name (looked up inside REFERENCES_DIR) to the URL
+# of the video it was transcribed from.
+VIDEOS = {
+    "Forth Day 2020 - Preview of x64 & ColorForth & SPIR V - Onat.txt": "https://youtu.be/ajZAECYdJvE",
+    "Neokineogfx - 4th And Beyond - Transcript.txt": "https://youtu.be/Awkdt30Ruvk",
+    "Silicon Valley Forth Interest Group - Metaprogramming VAMP in KYRA, a Next-gen Forth-like language --- Onat Türkçüoğlu -- 2025-04-26.txt": "https://youtu.be/J9U_5tjdegY"
+}
+
+# Hard-coded absolute location of the transcript files; snapshots are written
+# to a "snapshots" sub-directory below it.
+REFERENCES_DIR = "C:/projects/forth/bootslop/references"
+SNAPSHOTS_DIR = os.path.join(REFERENCES_DIR, "snapshots")
+
+# NOTE: this runs at import time, so merely importing the module creates the
+# snapshots directory (and any missing parents).
+os.makedirs(SNAPSHOTS_DIR, exist_ok=True)
+
+def parse_timestamps(filepath):
+    """Collect the timestamps listed in a transcript file.
+
+    A line counts as a timestamp only when, after stripping surrounding
+    whitespace, it consists solely of ``MM:SS`` or ``H:MM:SS``.
+
+    Args:
+        filepath: Path of the UTF-8 transcript to scan.
+
+    Returns:
+        The distinct timestamps converted to whole seconds, in ascending
+        order.
+    """
+    stamp_pattern = re.compile(r'^(\d+:)?\d+:\d{2}$')
+    unique_seconds = set()
+    with open(filepath, 'r', encoding='utf-8') as transcript:
+        for raw_line in transcript:
+            candidate = raw_line.strip()
+            if stamp_pattern.match(candidate) is None:
+                continue
+            # Fold the fields left to right in base 60; this covers both the
+            # two-field (MM:SS) and the three-field (H:MM:SS) form.
+            total = 0
+            for field in candidate.split(':'):
+                total = total * 60 + int(field)
+            unique_seconds.add(total)
+    return sorted(unique_seconds)
+
+def download_video(url, output_path):
+    """Download the video at *url* with yt_dlp.
+
+    Args:
+        url: Address of the video to fetch.
+        output_path: Value handed to yt_dlp as its output template
+            (``outtmpl``).
+    """
+    options = dict(
+        # Prefer an MP4 stream, otherwise take whatever yt_dlp ranks best.
+        format='best[ext=mp4]/best',
+        outtmpl=output_path,
+        quiet=True,
+        no_warnings=True,
+    )
+    with yt_dlp.YoutubeDL(options) as downloader:
+        downloader.download([url])
+
+def extract_and_dedup_frames(video_path, timestamps, out_dir, *, hash_threshold=5):
+    """Save one JPEG per timestamp, skipping frames that look like earlier ones.
+
+    For every timestamp the video is positioned at that offset and a single
+    frame is read. The frame's perceptual hash is compared with the hashes of
+    all frames saved so far in this call; it is written to *out_dir* as
+    ``frame_<seconds>s.jpg`` only when no saved frame is closer than
+    *hash_threshold*.
+
+    Args:
+        video_path: Path of the video file to read.
+        timestamps: Iterable of integer offsets in seconds.
+        out_dir: Directory that receives the JPEG files (created if missing).
+        hash_threshold: A frame is treated as a duplicate when its hash
+            distance to any saved frame is strictly below this value.
+
+    Returns:
+        None. Progress and failures are reported on stdout.
+    """
+    os.makedirs(out_dir, exist_ok=True)
+    cap = cv2.VideoCapture(video_path)
+
+    # try/finally guarantees the capture handle is released even when hashing
+    # or saving a frame raises part-way through the loop.
+    try:
+        if not cap.isOpened():
+            print(f"Failed to open video {video_path}")
+            return
+
+        saved_hashes = []
+
+        for t_sec in timestamps:
+            # Set video position
+            cap.set(cv2.CAP_PROP_POS_MSEC, t_sec * 1000)
+            ret, frame = cap.read()
+            if not ret:
+                # Report the miss instead of dropping the timestamp silently.
+                print(f"Could not read a frame at {t_sec}s, skipping")
+                continue
+
+            # Convert CV2 BGR to RGB for Pillow
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            pil_img = Image.fromarray(frame_rgb)
+
+            # Calculate perceptual hash
+            h = imagehash.phash(pil_img)
+
+            # Subtracting two hashes gives their hamming distance; a small
+            # distance means the frames are visually very similar.
+            if any(h - saved_h < hash_threshold for saved_h in saved_hashes):
+                continue
+
+            saved_hashes.append(h)
+            out_filename = os.path.join(out_dir, f"frame_{t_sec:04d}s.jpg")
+            pil_img.save(out_filename, quality=90)
+            print(f"Saved {out_filename}")
+    finally:
+        cap.release()
+
+def main():
+    """Process every transcript/URL pair listed in VIDEOS.
+
+    For each entry whose transcript exists under REFERENCES_DIR: parse its
+    timestamps, download the video into SNAPSHOTS_DIR unless a file with that
+    name is already there, extract de-duplicated frames into a directory named
+    after the transcript, and finally delete the video file.
+    """
+    for transcript_name, source_url in VIDEOS.items():
+        transcript_path = os.path.join(REFERENCES_DIR, transcript_name)
+        if not os.path.exists(transcript_path):
+            print(f"Transcript not found: {transcript_path}")
+            continue
+
+        print(f"Processing {transcript_name}...")
+        offsets = parse_timestamps(transcript_path)
+        print(f"Found {len(offsets)} timestamps.")
+
+        # The transcript name without its extension names both the temporary
+        # video file and the directory that receives the frames.
+        stem, _extension = os.path.splitext(transcript_name)
+        frames_dir = os.path.join(SNAPSHOTS_DIR, stem)
+        local_video = os.path.join(SNAPSHOTS_DIR, f"{stem}.mp4")
+
+        already_downloaded = os.path.exists(local_video)
+        if not already_downloaded:
+            print(f"Downloading video from {source_url}...")
+            download_video(source_url, local_video)
+
+        print("Extracting frames...")
+        extract_and_dedup_frames(local_video, offsets, frames_dir)
+
+        # Clean up video to save space
+        if os.path.exists(local_video):
+            os.remove(local_video)
+
+if __name__ == '__main__':
+    main()