diff --git a/fetch_blog.py b/fetch_blog.py
new file mode 100644
index 0000000..d9a7837
--- /dev/null
+++ b/fetch_blog.py
@@ -0,0 +1,59 @@
+"""Download the posts listed in TimothyLottesBlog.csv as Markdown files.
+
+Every CSV row is read as ``title, url``. A post whose output file already
+exists is skipped, so the script can be re-run to fetch only what is missing.
+"""
+import csv
+import os
+import requests
+from markdownify import markdownify as md
+
+CSV_PATH = "C:/projects/forth/bootslop/references/TimothyLottesBlog.csv"
+OUT_DIR = "C:/projects/forth/bootslop/references/TimothyLottesBlog"
+
+# Seconds to wait on the server. requests applies no timeout by default, so
+# without this a single stalled connection would hang the whole run.
+REQUEST_TIMEOUT = 10
+
+os.makedirs(OUT_DIR, exist_ok=True)
+
+# newline="" is what the csv module expects, so it handles line endings itself.
+with open(CSV_PATH, "r", encoding="utf-8", newline="") as f:
+    reader = csv.reader(f)
+    for row in reader:
+        if len(row) < 2:
+            continue
+        title = row[0].strip()
+        url = row[1].strip()
+
+        # Clean up title for filename
+        stem = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
+        if not url or not stem:
+            # Nothing to fetch, or the title has no filename-safe characters
+            # and every such row would collapse onto the same ".md" file.
+            print(f"Skipping row without usable title/URL: {row}")
+            continue
+        filename = stem.replace(' ', '_') + ".md"
+        out_path = os.path.join(OUT_DIR, filename)
+
+        if os.path.exists(out_path):
+            print(f"Already exists: {filename}")
+            continue
+
+        print(f"Fetching: {title}")
+        try:
+            response = requests.get(url, timeout=REQUEST_TIMEOUT)
+            response.raise_for_status()
+
+            # Convert HTML to markdown
+            markdown_text = md(response.text, heading_style="ATX")
+
+            with open(out_path, "w", encoding="utf-8") as out_f:
+                out_f.write(f"# {title}\n\n")
+                out_f.write(f"**Source:** {url}\n\n")
+                out_f.write(markdown_text)
+
+            print(f"Saved: {filename}")
+        except Exception as e:
+            # Best-effort batch job: report the failure and move on to the next row.
+            print(f"Failed to fetch {title}: {e}")
diff --git a/fetch_notes.py b/fetch_notes.py
new file mode 100644
index 0000000..6a52e75
--- /dev/null
+++ b/fetch_notes.py
@@ -0,0 +1,109 @@
+"""Download the pages listed in FORTH_NOTES.csv as Markdown files.
+
+Rows are read as ``title, url`` after skipping the header row. URLs that
+also appear in TimothyLottesBlog.csv are ignored, as are links that cannot
+be scraped as plain HTML (see is_relevant).
+"""
+import csv
+import os
+from urllib.parse import urlsplit
+
+import requests
+from markdownify import markdownify as md
+
+NOTES_CSV = "C:/projects/forth/bootslop/references/FORTH_NOTES.csv"
+BLOG_CSV = "C:/projects/forth/bootslop/references/TimothyLottesBlog.csv"
+OUT_DIR = "C:/projects/forth/bootslop/references/ForthNotes"
+
+# Sites that are skipped: YouTube (scraping the HTML is not useful) and
+# X/Twitter (requires JS/auth).
+UNSUPPORTED_HOSTS = ("youtube.com", "youtu.be", "x.com", "twitter.com")
+
+# Set a user-agent to avoid simple anti-bot blocks
+HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
+
+os.makedirs(OUT_DIR, exist_ok=True)
+
+# Load URLs we already processed in TimothyLottesBlog.csv
+ignored_urls = set()
+if os.path.exists(BLOG_CSV):
+    with open(BLOG_CSV, "r", encoding="utf-8", newline="") as f:
+        for row in csv.reader(f):
+            if len(row) >= 2:
+                ignored_urls.add(row[1].strip())
+
+
+def is_relevant(title, url):
+    """Return True if *url* looks like an HTML page worth scraping.
+
+    *title* is currently unused.
+    """
+    try:
+        parts = urlsplit(url)
+        if not parts.netloc:
+            # Scheme-less entry such as "youtube.com/watch": re-parse so the
+            # host lands in netloc instead of path.
+            parts = urlsplit("//" + url)
+    except ValueError:
+        # Malformed URL: let the fetch step report the precise error.
+        return True
+    host = parts.hostname or ""
+    # Compare whole host names (or subdomains). A substring test for "x.com"
+    # would also reject unrelated sites such as dropbox.com or netflix.com.
+    if any(host == h or host.endswith("." + h) for h in UNSUPPORTED_HOSTS):
+        return False
+    # Ignore PDFs (requires special handling). Test the path only, and
+    # case-insensitively, so "paper.PDF" and "paper.pdf?dl=1" are caught too.
+    if parts.path.lower().endswith(".pdf"):
+        return False
+    return True
+
+
+with open(NOTES_CSV, "r", encoding="utf-8", newline="") as f:
+    reader = csv.reader(f)
+    next(reader, None)  # Skip header
+    for row in reader:
+        if len(row) < 2:
+            continue
+
+        title = row[0].strip()
+        url = row[1].strip()
+
+        if url in ignored_urls:
+            continue
+
+        if not is_relevant(title, url):
+            print(f"Skipping unsupported/media link: {title} ({url})")
+            continue
+
+        # Clean up title for filename
+        stem = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
+        if not url or not stem:
+            # Nothing to fetch, or the title has no filename-safe characters
+            # and every such row would collapse onto the same ".md" file.
+            print(f"Skipping row without usable title/URL: {row}")
+            continue
+        filename = stem.replace(' ', '_') + ".md"
+        out_path = os.path.join(OUT_DIR, filename)
+
+        if os.path.exists(out_path):
+            print(f"Already exists: {filename}")
+            continue
+
+        print(f"Fetching: {title} from {url}")
+        try:
+            response = requests.get(url, headers=HEADERS, timeout=10)
+            response.raise_for_status()
+
+            # Convert HTML to markdown
+            markdown_text = md(response.text, heading_style="ATX")
+
+            with open(out_path, "w", encoding="utf-8") as out_f:
+                out_f.write(f"# {title}\n\n")
+                out_f.write(f"**Source:** {url}\n\n")
+                out_f.write(markdown_text)
+
+            print(f"Saved: {filename}")
+        except Exception as e:
+            # Best-effort batch job: report the failure and move on to the next row.
+            print(f"Failed to fetch {title}: {e}")
diff --git a/image.png b/image.png
new file mode 100644
index 0000000..2646efc
Binary files /dev/null and b/image.png differ
diff --git a/ocr_interaction.py b/ocr_interaction.py
new file mode 100644
index 0000000..0e45ae9
--- /dev/null
+++ 
b/ocr_interaction.py
@@ -0,0 +1,58 @@
+"""OCR one screenshot with the Windows OCR engine and save the text as Markdown."""
+import asyncio
+import os
+import sys
+from winsdk.windows.storage import StorageFile
+from winsdk.windows.graphics.imaging import BitmapDecoder
+from winsdk.windows.media.ocr import OcrEngine
+from winsdk.windows.globalization import Language
+
+DEFAULT_IMAGE = r"C:\projects\forth\bootslop\references\X.com - Onat & Lottes Interaction 1.png"
+
+
+async def ocr_single_image(img_path):
+    """Recognize the text in *img_path* and write it next to the image.
+
+    The recognized lines are saved, one per line under a heading, to
+    ``img_path + ".ocr.md"``. When the image is missing or the en-US OCR
+    engine cannot be created, a message is printed and nothing is written.
+    """
+    # isfile rather than exists: a directory would pass exists() but is not
+    # an image that can be decoded.
+    if not os.path.isfile(img_path):
+        print(f"File not found: {img_path}")
+        return
+
+    # Create the engine before opening the file so there is nothing to clean
+    # up when OCR is unavailable.
+    engine = OcrEngine.try_create_from_language(Language("en-US"))
+    if not engine:
+        print("Failed to create OCR engine")
+        return
+
+    file = await StorageFile.get_file_from_path_async(os.path.abspath(img_path))
+    stream = await file.open_read_async()
+    try:
+        decoder = await BitmapDecoder.create_async(stream)
+        bitmap = await decoder.get_software_bitmap_async()
+        result = await engine.recognize_async(bitmap)
+    finally:
+        # Release the file handle even when decoding or recognition fails.
+        stream.close()
+
+    output = [f"# OCR Thread: {os.path.basename(img_path)}\n"]
+    if result and result.lines:
+        output.extend(line.text for line in result.lines)
+
+    out_path = img_path + ".ocr.md"
+    # Using join with a literal newline to avoid potential issues in some environments
+    content = "\n".join(output)
+    with open(out_path, "w", encoding="utf-8") as f:
+        f.write(content)
+    print(f"OCR result saved to {out_path}")
+
+
+if __name__ == "__main__":
+    # Optional first argument overrides the default screenshot path.
+    img = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_IMAGE
+    asyncio.run(ocr_single_image(img))
diff --git a/process_visuals.py b/process_visuals.py index 760d236..705da6e 100644 --- a/process_visuals.py +++ b/process_visuals.py @@ -161,7 +161,7 @@ async def process_video(video_path, timestamps, video_name): code_crop = frame[min_y:max_y, min_x:max_x] code_path = os.path.join(out_dir, f"code_{t_sec:04d}s.jpg") - cv2.imwrite(code_path, code_crop) + Image.fromarray(cv2.cvtColor(code_crop, cv2.COLOR_BGR2RGB)).save(code_path) markdown_lines.append(f"\n*Saved code image: {code_path}*") # Find non-text visual content (e.g. 
diagrams) outside the text area @@ -183,7 +183,7 @@ async def process_video(video_path, timestamps, video_name): if bw > 100 and bh > 100: # large enough visual_crop = frame[max(0, by-pad):min(frame.shape[0], by+bh+pad), max(0, bx-pad):min(frame.shape[1], bx+bw+pad)] visual_path = os.path.join(out_dir, f"visual_{t_sec:04d}s.jpg") - cv2.imwrite(visual_path, visual_crop) + Image.fromarray(cv2.cvtColor(visual_crop, cv2.COLOR_BGR2RGB)).save(visual_path) markdown_lines.append(f"\n*Saved non-text visual: {visual_path}*") markdown_lines.append("\n---\n")