From 321810438bad5da19a60b4c5dc72c237eea4c535 Mon Sep 17 00:00:00 2001 From: Ed_ Date: Mon, 4 May 2026 04:44:11 -0400 Subject: [PATCH] feat(aggregation): Implement hash-based summary cache --- src/summarize.py | 12 +++++++- src/summary_cache.py | 55 +++++++++++++++++++++++++++++++++++++ tests/test_summary_cache.py | 46 +++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 src/summary_cache.py create mode 100644 tests/test_summary_cache.py diff --git a/src/summarize.py b/src/summarize.py index c81357f..058d4fd 100644 --- a/src/summarize.py +++ b/src/summarize.py @@ -27,6 +27,9 @@ import ast import re from pathlib import Path from typing import Callable, Any +from src.summary_cache import SummaryCache, get_file_hash + +_summary_cache = SummaryCache() # ------------------------------------------------------------------ per-type extractors @@ -153,10 +156,17 @@ def summarise_file(path: Path, content: str) -> str: Return a compact markdown summary string for a single file. `content` is the already-read file text (or an error string). """ + content_hash = get_file_hash(content) + cached = _summary_cache.get_summary(str(path), content_hash) + if cached: + return cached + suffix = path.suffix.lower() if hasattr(path, "suffix") else "" fn = _SUMMARISERS.get(suffix, _summarise_generic) try: - return fn(path, content) + summary = fn(path, content) + _summary_cache.set_summary(str(path), content_hash, summary) + return summary except Exception as e: return f"_Summariser error: {e}_" diff --git a/src/summary_cache.py b/src/summary_cache.py new file mode 100644 index 0000000..548e410 --- /dev/null +++ b/src/summary_cache.py @@ -0,0 +1,55 @@ +import hashlib +import json +from pathlib import Path +from typing import Optional, Dict + +def get_file_hash(content: str) -> str: + """Returns SHA256 hash of the content.""" + return hashlib.sha256(content.encode("utf-8")).hexdigest() + +class SummaryCache: + """ + A hash-based cache for file summaries to avoid redundant processing. + Invalidates when content hash changes. + """ + def __init__(self, cache_file: Optional[str] = None): + if cache_file: + self.cache_file = Path(cache_file) + else: + # Default relative to current working directory + self.cache_file = Path(".slop_cache/summary_cache.json") + self.cache: Dict[str, Dict[str, str]] = {} + self.load() + + def load(self) -> None: + """Loads cache from disk.""" + if self.cache_file.exists(): + try: + with open(self.cache_file, "r", encoding="utf-8") as f: + self.cache = json.load(f) + except Exception: + self.cache = {} + + def save(self) -> None: + """Saves cache to disk.""" + try: + self.cache_file.parent.mkdir(parents=True, exist_ok=True) + with open(self.cache_file, "w", encoding="utf-8") as f: + json.dump(self.cache, f, indent=1) + except Exception: + pass + + def get_summary(self, file_path: str, content_hash: str) -> Optional[str]: + """Returns cached summary if hash matches, otherwise None.""" + entry = self.cache.get(file_path) + if entry and entry.get("hash") == content_hash: + return entry.get("summary") + return None + + def set_summary(self, file_path: str, content_hash: str, summary: str) -> None: + """Stores summary in cache and saves to disk.""" + self.cache[file_path] = { + "hash": content_hash, + "summary": summary + } + self.save() diff --git a/tests/test_summary_cache.py b/tests/test_summary_cache.py new file mode 100644 index 0000000..2cda9f4 --- /dev/null +++ b/tests/test_summary_cache.py @@ -0,0 +1,46 @@ +import os +import shutil +from pathlib import Path +from src.summary_cache import SummaryCache, get_file_hash + +def test_get_file_hash(): + content = "hello world" + # sha256 of "hello world" + expected = "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9" + assert get_file_hash(content) == expected + +def test_summary_cache(): + cache_dir = Path(".test_cache") + if cache_dir.exists(): + shutil.rmtree(cache_dir) + cache_file = cache_dir / "cache.json" + + cache = SummaryCache(str(cache_file)) + + file_path = "test.py" + content = "print('hello')" + content_hash = get_file_hash(content) + summary = "**Python** - 1 lines" + + # Test empty cache + assert cache.get_summary(file_path, content_hash) is None + + # Test set and get + cache.set_summary(file_path, content_hash, summary) + assert cache.get_summary(file_path, content_hash) == summary + + # Test cache invalidation + assert cache.get_summary(file_path, "different_hash") is None + + # Test persistence + cache2 = SummaryCache(str(cache_file)) + assert cache2.get_summary(file_path, content_hash) == summary + + # Cleanup + if cache_dir.exists(): + shutil.rmtree(cache_dir) + +if __name__ == "__main__": + test_get_file_hash() + test_summary_cache() + print("Tests passed!")