curation & gather

2026-02-19 16:37:12 -05:00
parent 6d3a2dd7d9
commit d147dccbb4
6 changed files with 126 additions and 1 deletions
--- a/estimate_context.py
+++ b/estimate_context.py
@@ -0,0 +1,39 @@
+import os
+
+# Root directory that estimate_tokens() walks. It is a hard-coded absolute
+# path with a drive letter, so edit it before running on another machine.
+REFERENCES_DIR = "C:/projects/forth/bootslop/references"
+
+def estimate_tokens(references_dir=None):
+    """Print and return a rough token estimate for a tree of reference files.
+
+    Recursively walks the directory and applies two heuristics: text files
+    (.txt, .md, .csv) cost one token per 4 decoded characters, and images
+    (.jpg, .jpeg, .png) cost a flat 258 tokens each. Files with any other
+    extension are ignored. Extension matching is case-insensitive.
+
+    Args:
+        references_dir: Directory to scan. Defaults to the module-level
+            REFERENCES_DIR when None.
+
+    Returns:
+        The estimated total token count (text plus images) as an int.
+
+    Directories or files that cannot be read are reported on stdout and
+    skipped, so a missing directory yields an estimate of 0 plus an error
+    line instead of a silent 0.
+    """
+    if references_dir is None:
+        # Resolved at call time so later changes to REFERENCES_DIR are honored.
+        references_dir = REFERENCES_DIR
+
+    text_extensions = {".txt", ".md", ".csv"}
+    image_extensions = {".jpg", ".jpeg", ".png"}
+
+    total_text_chars = 0
+    total_images = 0
+
+    def report_walk_error(err):
+        # os.walk ignores scandir failures (missing or unreadable directories)
+        # unless an onerror callback is supplied.
+        print(f"Error reading {err.filename}: {err}")
+
+    for root, _, files in os.walk(references_dir, onerror=report_walk_error):
+        for filename in files:
+            ext = os.path.splitext(filename)[1].lower()
+            filepath = os.path.join(root, filename)
+
+            if ext in text_extensions:
+                try:
+                    # errors="ignore" drops undecodable bytes, so decoding
+                    # cannot fail; the expected failures are I/O errors.
+                    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
+                        total_text_chars += len(f.read())
+                except OSError as e:
+                    print(f"Error reading {filepath}: {e}")
+            elif ext in image_extensions:
+                total_images += 1
+
+    # Approximate 1 token = 4 characters for English text
+    estimated_text_tokens = total_text_chars // 4
+    # Approximate Gemini 1.5 image token cost (typically 258 tokens per image)
+    estimated_image_tokens = total_images * 258
+
+    total_tokens = estimated_text_tokens + estimated_image_tokens
+
+    print(f"Total Text Files Scanned: {total_text_chars} characters (~{estimated_text_tokens} tokens)")
+    print(f"Total Images Scanned: {total_images} images (~{estimated_image_tokens} tokens)")
+    print("---")
+    print(f"Estimated Total Context Cost: ~{total_tokens} tokens")
+
+    return total_tokens
+
+# Run the estimate only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    estimate_tokens()