curation & gather
fetch_notes.py (new file, +76 lines)
@@ -0,0 +1,76 @@
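"""Fetch each reference listed in FORTH_NOTES.csv, convert the page to
Markdown, and save it under ForthNotes/. URLs already captured in
TimothyLottesBlog.csv are skipped, as are link types that can't be
scraped as plain HTML (YouTube, PDFs, Twitter/X)."""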
import csv
import os
import requests
from markdownify import markdownify as md

NOTES_CSV = "C:/projects/forth/bootslop/references/FORTH_NOTES.csv"
BLOG_CSV = "C:/projects/forth/bootslop/references/TimothyLottesBlog.csv"
OUT_DIR = "C:/projects/forth/bootslop/references/ForthNotes"

os.makedirs(OUT_DIR, exist_ok=True)

# Load URLs we already processed in TimothyLottesBlog.csv
ignored_urls = set()
if os.path.exists(BLOG_CSV):
    with open(BLOG_CSV, "r", encoding="utf-8") as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) >= 2:
                ignored_urls.add(row[1].strip())


def is_relevant(title, url):
    # Ignore YouTube links; scraping their HTML yields no useful text
    if "youtube.com" in url or "youtu.be" in url:
        return False
    # Ignore PDFs (they need special handling)
    if url.endswith(".pdf"):
        return False
    # Ignore Twitter/X (pages require JS/auth)
    if "x.com" in url or "twitter.com" in url:
        return False
    return True


with open(NOTES_CSV, "r", encoding="utf-8") as f:
    reader = csv.reader(f)
    next(reader, None)  # Skip the header row
    for row in reader:
        if len(row) < 2:
            continue

        title = row[0].strip()
        url = row[1].strip()

        if url in ignored_urls:
            continue

        if not is_relevant(title, url):
            print(f"Skipping unsupported/media link: {title} ({url})")
            continue

        # Sanitize the title into a filesystem-safe filename
        filename = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
        filename = filename.replace(' ', '_') + ".md"
        out_path = os.path.join(OUT_DIR, filename)
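
        # Distinct titles can sanitize to the same filename; the
        # existing-file check below keeps whichever was written first.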
        if os.path.exists(out_path):
            print(f"Already exists: {filename}")
            continue

        print(f"Fetching: {title} from {url}")
        try:
            # Send a browser-like User-Agent to avoid simple anti-bot blocks
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            # Convert the fetched HTML to Markdown
            markdown_text = md(response.text, heading_style="ATX")
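            # heading_style="ATX" emits '#'-prefixed headings instead of
            # markdownify's default underlined (Setext) style.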

            with open(out_path, "w", encoding="utf-8") as out_f:
                out_f.write(f"# {title}\n\n")
                out_f.write(f"**Source:** {url}\n\n")
                out_f.write(markdown_text)

            print(f"Saved: {filename}")
        except Exception as e:
            print(f"Failed to fetch {title}: {e}")