"""Scrape the pages listed in FORTH_NOTES.csv and save each one as Markdown.

Rows whose URL already appears in TimothyLottesBlog.csv are skipped, as are
links that cannot usefully be scraped as HTML (video sites, PDFs, Twitter/X).
Every remaining page is fetched, converted to Markdown and written to
``OUT_DIR`` under a filename derived from the row's title.  Pages whose output
file already exists are not fetched again, so the script can be re-run safely.
"""
import csv
import os
from urllib.parse import urlsplit

NOTES_CSV = "C:/projects/forth/bootslop/references/FORTH_NOTES.csv"
BLOG_CSV = "C:/projects/forth/bootslop/references/TimothyLottesBlog.csv"
OUT_DIR = "C:/projects/forth/bootslop/references/ForthNotes"

# Browser-like user-agent to avoid simple anti-bot blocks.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
REQUEST_TIMEOUT = 10  # seconds

# Hosts that are pointless to scrape as HTML:
#   YouTube    - video content, nothing useful in the page markup
#   X/Twitter  - requires JS/auth
_UNSUPPORTED_HOSTS = ("youtube.com", "youtu.be", "x.com", "twitter.com")


def _host_and_path(url: str):
    """Return ``(hostname, path)`` for *url*; hostname is lower-case or ''."""
    try:
        parts = urlsplit(url)
        if not parts.scheme and not parts.netloc:
            # Scheme-less input such as "youtube.com/watch": re-parse it as a
            # network location so the host is still recognised.
            parts = urlsplit("//" + url)
        return (parts.hostname or ""), parts.path
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets); treat it as opaque.
        return "", url


def is_relevant(title, url):
    """Return True if *url* points at a page worth scraping as HTML.

    Args:
        title: Title of the link (currently unused; kept for the interface).
        url: The link target.

    Returns:
        False for YouTube, Twitter/X and PDF links, True otherwise.
    """
    host, path = _host_and_path(url)

    # Compare against the parsed hostname rather than searching the whole URL:
    # a substring test for "x.com" also matches dropbox.com, linux.com, or any
    # URL that merely mentions a blocked domain in its path or query string.
    if any(host == blocked or host.endswith("." + blocked)
           for blocked in _UNSUPPORTED_HOSTS):
        return False

    # Ignore PDFs (requires special handling).  Checking the path makes this
    # independent of letter case and of any trailing query string.
    if path.lower().endswith(".pdf"):
        return False

    return True


def title_to_filename(title: str) -> str:
    """Turn a link title into a safe ``.md`` filename.

    Only alphanumerics, spaces, '-' and '_' are kept; spaces then become
    underscores.  A title with no usable characters yields ``untitled.md``
    instead of a bare ``.md``.
    """
    kept = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
    stem = kept.replace(' ', '_') or "untitled"
    return stem + ".md"


def load_ignored_urls(csv_path: str = BLOG_CSV) -> set:
    """Return the URLs (second column) of *csv_path*, or an empty set.

    These are the URLs already processed via TimothyLottesBlog.csv.  A missing
    file is not an error: nothing is ignored in that case.
    """
    if not os.path.exists(csv_path):
        return set()
    # newline="" lets the csv module handle embedded newlines itself.
    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        return {row[1].strip() for row in csv.reader(f) if len(row) >= 2}


def main() -> None:
    """Fetch every relevant, not-yet-saved link in NOTES_CSV into OUT_DIR."""
    # Third-party dependencies are imported here so that a missing package
    # fails immediately at start-up, while the pure helpers above remain
    # importable without them.
    import requests
    from markdownify import markdownify as md

    os.makedirs(OUT_DIR, exist_ok=True)
    ignored_urls = load_ignored_urls()
    headers = {'User-Agent': USER_AGENT}

    with open(NOTES_CSV, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header
        for row in reader:
            if len(row) < 2:
                continue

            title = row[0].strip()
            url = row[1].strip()

            if url in ignored_urls:
                continue

            if not is_relevant(title, url):
                print(f"Skipping unsupported/media link: {title} ({url})")
                continue

            filename = title_to_filename(title)
            out_path = os.path.join(OUT_DIR, filename)

            if os.path.exists(out_path):
                print(f"Already exists: {filename}")
                continue

            print(f"Fetching: {title} from {url}")
            try:
                response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()

                # Without an explicit charset, requests decodes text/* as
                # ISO-8859-1, which garbles UTF-8 pages; fall back to the
                # encoding detected from the body instead.
                if "charset" not in response.headers.get("Content-Type", "").lower():
                    response.encoding = response.apparent_encoding

                # Convert HTML to markdown
                markdown_text = md(response.text, heading_style="ATX")

                with open(out_path, "w", encoding="utf-8") as out_f:
                    out_f.write(f"# {title}\n\n")
                    out_f.write(f"**Source:** {url}\n\n")
                    out_f.write(markdown_text)
                print(f"Saved: {filename}")
            except Exception as e:
                # Deliberately broad: this is the per-row boundary of a
                # best-effort batch job, so one bad page (network error,
                # unparsable HTML, unwritable file) must not abort the rest.
                print(f"Failed to fetch {title}: {e}")


if __name__ == "__main__":
    main()