curation & gather
fetch_notes.py (new file, +76 lines)
@@ -0,0 +1,76 @@
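"""Fetch each reference listed in FORTH_NOTES.csv, convert the page to
Markdown, and save it under ForthNotes/. URLs already captured in
TimothyLottesBlog.csv are skipped, as are link types that can't be
scraped as plain HTML (YouTube, PDFs, Twitter/X)."""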
import csv
import os
import requests
from markdownify import markdownify as md

NOTES_CSV = "C:/projects/forth/bootslop/references/FORTH_NOTES.csv"
BLOG_CSV = "C:/projects/forth/bootslop/references/TimothyLottesBlog.csv"
OUT_DIR = "C:/projects/forth/bootslop/references/ForthNotes"

os.makedirs(OUT_DIR, exist_ok=True)

# Load URLs we already processed in TimothyLottesBlog.csv
ignored_urls = set()
if os.path.exists(BLOG_CSV):
    with open(BLOG_CSV, "r", encoding="utf-8") as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) >= 2:
                ignored_urls.add(row[1].strip())


def is_relevant(title, url):
    # Ignore YouTube links; scraping their HTML yields no useful text
    if "youtube.com" in url or "youtu.be" in url:
        return False
    # Ignore PDFs (they need special handling)
    if url.endswith(".pdf"):
        return False
    # Ignore Twitter/X (pages require JS/auth)
    if "x.com" in url or "twitter.com" in url:
        return False
    return True


with open(NOTES_CSV, "r", encoding="utf-8") as f:
    reader = csv.reader(f)
    next(reader, None)  # Skip the header row
    for row in reader:
        if len(row) < 2:
            continue

        title = row[0].strip()
        url = row[1].strip()

        if url in ignored_urls:
            continue

        if not is_relevant(title, url):
            print(f"Skipping unsupported/media link: {title} ({url})")
            continue

        # Sanitize the title into a filesystem-safe filename
        filename = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
        filename = filename.replace(' ', '_') + ".md"
        out_path = os.path.join(OUT_DIR, filename)
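
        # Distinct titles can sanitize to the same filename; the
        # existing-file check below keeps whichever was written first.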
        if os.path.exists(out_path):
            print(f"Already exists: {filename}")
            continue

        print(f"Fetching: {title} from {url}")
        try:
            # Send a browser-like User-Agent to avoid simple anti-bot blocks
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            # Convert the fetched HTML to Markdown
            markdown_text = md(response.text, heading_style="ATX")
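            # heading_style="ATX" emits '#'-prefixed headings instead of
            # markdownify's default underlined (Setext) style.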

            with open(out_path, "w", encoding="utf-8") as out_f:
                out_f.write(f"# {title}\n\n")
                out_f.write(f"**Source:** {url}\n\n")
                out_f.write(markdown_text)

            print(f"Saved: {filename}")
        except Exception as e:
            print(f"Failed to fetch {title}: {e}")