curation & gather
fetch_blog.py (new file, 43 lines)
@@ -0,0 +1,43 @@
import csv
import os

import requests
from markdownify import markdownify as md

CSV_PATH = "C:/projects/forth/bootslop/references/TimothyLottesBlog.csv"
OUT_DIR = "C:/projects/forth/bootslop/references/TimothyLottesBlog"

os.makedirs(OUT_DIR, exist_ok=True)

with open(CSV_PATH, "r", encoding="utf-8") as f:
    reader = csv.reader(f)
    for row in reader:
        if len(row) < 2:
            continue
        title = row[0].strip()
        url = row[1].strip()

        # Clean up title for filename
        filename = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
        filename = filename.replace(' ', '_') + ".md"
        out_path = os.path.join(OUT_DIR, filename)

        if os.path.exists(out_path):
            print(f"Already exists: {filename}")
            continue

        print(f"Fetching: {title}")
        try:
            response = requests.get(url)
            response.raise_for_status()

            # Convert HTML to markdown
            markdown_text = md(response.text, heading_style="ATX")

            with open(out_path, "w", encoding="utf-8") as out_f:
                out_f.write(f"# {title}\n\n")
                out_f.write(f"**Source:** {url}\n\n")
                out_f.write(markdown_text)

            print(f"Saved: {filename}")
        except Exception as e:
            print(f"Failed to fetch {title}: {e}")
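Note on fetch_blog.py: it converts the entire fetched page, navigation and sidebar markup included, into markdown. A minimal sketch of narrowing the conversion to the post body first, assuming the pages wrap each post in an <article> tag (an assumption; the blog's actual markup was not checked):

from bs4 import BeautifulSoup
from markdownify import markdownify as md

def page_to_markdown(html):
    # Prefer the post body; fall back to the whole page so behaviour
    # matches the current script when no <article> tag is present.
    soup = BeautifulSoup(html, "html.parser")
    article = soup.find("article") or soup
    return md(str(article), heading_style="ATX")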
fetch_notes.py (new file, 76 lines)
@@ -0,0 +1,76 @@
import csv
import os

import requests
from markdownify import markdownify as md

NOTES_CSV = "C:/projects/forth/bootslop/references/FORTH_NOTES.csv"
BLOG_CSV = "C:/projects/forth/bootslop/references/TimothyLottesBlog.csv"
OUT_DIR = "C:/projects/forth/bootslop/references/ForthNotes"

os.makedirs(OUT_DIR, exist_ok=True)

# Load URLs already processed via TimothyLottesBlog.csv
ignored_urls = set()
if os.path.exists(BLOG_CSV):
    with open(BLOG_CSV, "r", encoding="utf-8") as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) >= 2:
                ignored_urls.add(row[1].strip())


def is_relevant(title, url):
    # Skip YouTube; scraping HTML from video pages is not useful
    if "youtube.com" in url or "youtu.be" in url:
        return False
    # Skip PDFs (they require special handling)
    if url.endswith(".pdf"):
        return False
    # Skip Twitter/X (requires JS and authentication)
    if "x.com" in url or "twitter.com" in url:
        return False
    return True


with open(NOTES_CSV, "r", encoding="utf-8") as f:
    reader = csv.reader(f)
    next(reader, None)  # Skip header
    for row in reader:
        if len(row) < 2:
            continue

        title = row[0].strip()
        url = row[1].strip()

        if url in ignored_urls:
            continue

        if not is_relevant(title, url):
            print(f"Skipping unsupported/media link: {title} ({url})")
            continue

        # Clean up title for filename
        filename = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
        filename = filename.replace(' ', '_') + ".md"
        out_path = os.path.join(OUT_DIR, filename)

        if os.path.exists(out_path):
            print(f"Already exists: {filename}")
            continue

        print(f"Fetching: {title} from {url}")
        try:
            # Set a User-Agent to avoid simple anti-bot blocks
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            # Convert HTML to markdown
            markdown_text = md(response.text, heading_style="ATX")

            with open(out_path, "w", encoding="utf-8") as out_f:
                out_f.write(f"# {title}\n\n")
                out_f.write(f"**Source:** {url}\n\n")
                out_f.write(markdown_text)

            print(f"Saved: {filename}")
        except Exception as e:
            print(f"Failed to fetch {title}: {e}")
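Note on fetch_notes.py: PDF links are skipped because they need special handling. A sketch of what that handling could later look like, using pypdf (not a dependency of this commit; the helper name save_pdf_as_text is hypothetical):

import io
import requests
from pypdf import PdfReader

def save_pdf_as_text(url, out_path):
    # Download the PDF into memory and extract plain text page by page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    reader = PdfReader(io.BytesIO(response.content))
    text = "\n\n".join(page.extract_text() or "" for page in reader.pages)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(text)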
ocr_interaction.py (new file, 39 lines)
@@ -0,0 +1,39 @@
import asyncio
import os

from winsdk.windows.storage import StorageFile
from winsdk.windows.graphics.imaging import BitmapDecoder
from winsdk.windows.media.ocr import OcrEngine
from winsdk.windows.globalization import Language


async def ocr_single_image(img_path):
    if not os.path.exists(img_path):
        print(f"File not found: {img_path}")
        return

    file = await StorageFile.get_file_from_path_async(os.path.abspath(img_path))
    stream = await file.open_read_async()
    decoder = await BitmapDecoder.create_async(stream)
    bitmap = await decoder.get_software_bitmap_async()

    engine = OcrEngine.try_create_from_language(Language("en-US"))
    if not engine:
        print("Failed to create OCR engine")
        return

    result = await engine.recognize_async(bitmap)

    output = [f"# OCR Thread: {os.path.basename(img_path)}\n"]
    if result and result.lines:
        for line in result.lines:
            output.append(line.text)

    out_path = img_path + ".ocr.md"
    # Using join with a literal newline to avoid potential issues in some environments
    content = "\n".join(output)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"OCR result saved to {out_path}")


if __name__ == "__main__":
    img = r"C:\projects\forth\bootslop\references\X.com - Onat & Lottes Interaction 1.png"
    asyncio.run(ocr_single_image(img))
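Note on ocr_interaction.py: it depends on the winsdk package (pip install winsdk) and the built-in Windows 10+ OCR engine; try_create_from_language returns None unless the en-US language pack is installed. The script handles one hard-coded image; a sketch of batching the same coroutine over a folder (the glob pattern is an assumption):

import asyncio
import glob

async def ocr_all(folder):
    # Reuse ocr_single_image from this file for every PNG in the folder.
    for img in glob.glob(folder + "/*.png"):
        await ocr_single_image(img)

# asyncio.run(ocr_all(r"C:\projects\forth\bootslop\references"))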
@@ -161,7 +161,7 @@ async def process_video(video_path, timestamps, video_name):
 
         code_crop = frame[min_y:max_y, min_x:max_x]
         code_path = os.path.join(out_dir, f"code_{t_sec:04d}s.jpg")
-        cv2.imwrite(code_path, code_crop)
+        Image.fromarray(cv2.cvtColor(code_crop, cv2.COLOR_BGR2RGB)).save(code_path)
         markdown_lines.append(f"\n*Saved code image: {code_path}*")
 
     # Find non-text visual content (e.g. diagrams) outside the text area
@@ -183,7 +183,7 @@ async def process_video(video_path, timestamps, video_name):
     if bw > 100 and bh > 100:  # large enough
         visual_crop = frame[max(0, by-pad):min(frame.shape[0], by+bh+pad), max(0, bx-pad):min(frame.shape[1], bx+bw+pad)]
         visual_path = os.path.join(out_dir, f"visual_{t_sec:04d}s.jpg")
-        cv2.imwrite(visual_path, visual_crop)
+        Image.fromarray(cv2.cvtColor(visual_crop, cv2.COLOR_BGR2RGB)).save(visual_path)
         markdown_lines.append(f"\n*Saved non-text visual: {visual_path}*")
 
     markdown_lines.append("\n---\n")
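Note on the two hunks above: the only change swaps cv2.imwrite for a PIL save. OpenCV frames are BGR, so the cv2.COLOR_BGR2RGB conversion is required before handing the array to Image.fromarray; without it the saved JPEGs would have red and blue channels swapped. A plausible motivation for the switch (an assumption; the commit does not say) is that cv2.imwrite fails silently on Windows paths containing non-ASCII characters, which PIL handles. The replacement, written as a small helper with the hypothetical name save_bgr_frame:

import cv2
from PIL import Image

def save_bgr_frame(path, frame_bgr):
    # OpenCV arrays are BGR; PIL expects RGB, so convert before saving.
    Image.fromarray(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)).save(path)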