feat(aggregation): Implement hash-based summary cache

This commit is contained in:
2026-05-04 04:44:11 -04:00
parent 04c710d60c
commit 321810438b
3 changed files with 112 additions and 1 deletions
+11 -1
View File
@@ -27,6 +27,9 @@ import ast
import re import re
from pathlib import Path from pathlib import Path
from typing import Callable, Any from typing import Callable, Any
from src.summary_cache import SummaryCache, get_file_hash
_summary_cache = SummaryCache()
# ------------------------------------------------------------------ per-type extractors # ------------------------------------------------------------------ per-type extractors
@@ -153,10 +156,17 @@ def summarise_file(path: Path, content: str) -> str:
Return a compact markdown summary string for a single file. Return a compact markdown summary string for a single file.
`content` is the already-read file text (or an error string). `content` is the already-read file text (or an error string).
""" """
content_hash = get_file_hash(content)
cached = _summary_cache.get_summary(str(path), content_hash)
if cached:
return cached
suffix = path.suffix.lower() if hasattr(path, "suffix") else "" suffix = path.suffix.lower() if hasattr(path, "suffix") else ""
fn = _SUMMARISERS.get(suffix, _summarise_generic) fn = _SUMMARISERS.get(suffix, _summarise_generic)
try: try:
return fn(path, content) summary = fn(path, content)
_summary_cache.set_summary(str(path), content_hash, summary)
return summary
except Exception as e: except Exception as e:
return f"_Summariser error: {e}_" return f"_Summariser error: {e}_"
+55
View File
@@ -0,0 +1,55 @@
import hashlib
import json
from pathlib import Path
from typing import Optional, Dict
def get_file_hash(content: str) -> str:
    """Return the SHA-256 hex digest of *content* encoded as UTF-8.

    Lone surrogate code points (which can appear in text decoded with
    ``errors="surrogateescape"``) are encoded with the ``surrogatepass``
    handler instead of raising ``UnicodeEncodeError``, so every ``str`` can
    be hashed. Digests of text without lone surrogates are unaffected.
    """
    encoded = content.encode("utf-8", errors="surrogatepass")
    return hashlib.sha256(encoded).hexdigest()
class SummaryCache:
    """
    Hash-keyed, JSON-backed cache of file summaries.

    Each entry maps a file path string to ``{"hash": ..., "summary": ...}``.
    A lookup only hits when the stored hash equals the hash passed in, so an
    entry stops matching as soon as the content hash changes.

    Persistence is best-effort: a missing, unreadable, or malformed cache
    file never raises, and failures to write the cache are ignored.
    """

    def __init__(self, cache_file: Optional[str] = None):
        """
        Create the cache and load any existing entries from disk.

        Args:
            cache_file: Path of the JSON cache file. When omitted (or empty),
                ``.slop_cache/summary_cache.json`` relative to the current
                working directory is used.
        """
        if cache_file:
            self.cache_file = Path(cache_file)
        else:
            # Default relative to current working directory
            self.cache_file = Path(".slop_cache/summary_cache.json")
        self.cache: Dict[str, Dict[str, str]] = {}
        self.load()

    def load(self) -> None:
        """
        Load the cache from disk.

        A missing cache file leaves the in-memory cache untouched. A file
        that cannot be read or parsed resets the cache to empty. Entries
        that do not have the expected ``{"hash": str, "summary": str}``
        shape are dropped so later lookups cannot fail on them.
        """
        try:
            with open(self.cache_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (FileNotFoundError, NotADirectoryError):
            # Nothing persisted yet; keep whatever is already in memory.
            return
        except (OSError, ValueError, RecursionError):
            # Unreadable, undecodable, or invalid JSON: start from scratch.
            self.cache = {}
            return

        if not isinstance(data, dict):
            # Valid JSON but not a mapping (e.g. a list): treat as corrupt.
            self.cache = {}
            return

        self.cache = {
            path: entry
            for path, entry in data.items()
            if isinstance(entry, dict)
            and isinstance(entry.get("hash"), str)
            and isinstance(entry.get("summary"), str)
        }

    def save(self) -> None:
        """
        Write the cache to disk, creating the parent directory if needed.

        The JSON is written to a sibling ``<name>.tmp`` file and then moved
        over the real cache file, so an interrupted write cannot leave a
        truncated cache behind. Failures are swallowed on purpose: the
        cache is only an optimisation.
        """
        tmp_file = None
        try:
            self.cache_file.parent.mkdir(parents=True, exist_ok=True)
            tmp_file = self.cache_file.with_name(self.cache_file.name + ".tmp")
            with open(tmp_file, "w", encoding="utf-8") as f:
                json.dump(self.cache, f, indent=1)
            tmp_file.replace(self.cache_file)
        except (OSError, TypeError, ValueError):
            # Best-effort persistence; remove a half-written temp file.
            if tmp_file is not None:
                try:
                    tmp_file.unlink()
                except OSError:
                    pass

    def get_summary(self, file_path: str, content_hash: str) -> Optional[str]:
        """
        Return the cached summary for *file_path* if its hash matches.

        Args:
            file_path: Key the summary was stored under.
            content_hash: Hash of the current content.

        Returns:
            The stored summary when an entry exists and its ``"hash"``
            equals *content_hash*; otherwise ``None``.
        """
        entry = self.cache.get(file_path)
        if isinstance(entry, dict) and entry.get("hash") == content_hash:
            return entry.get("summary")
        return None

    def set_summary(self, file_path: str, content_hash: str, summary: str) -> None:
        """
        Store *summary* for *file_path* under *content_hash* and save to disk.

        Any previous entry for *file_path* is replaced.
        """
        self.cache[file_path] = {
            "hash": content_hash,
            "summary": summary,
        }
        self.save()
+46
View File
@@ -0,0 +1,46 @@
import os
import shutil
from pathlib import Path
from src.summary_cache import SummaryCache, get_file_hash
def test_get_file_hash():
    """get_file_hash returns the known SHA-256 digest of a fixed string."""
    # Reference SHA-256 hex digest of the text "hello world".
    known_digest = "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
    assert get_file_hash("hello world") == known_digest
def test_summary_cache():
    """Exercise SummaryCache miss, hit, hash invalidation, and persistence."""
    # Imported locally so this test does not depend on the file's import block.
    import tempfile

    # A fresh temporary directory is removed even when an assertion fails,
    # and the test never deletes a pre-existing directory in the CWD.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # The parent directory does not exist yet, so saving must create it.
        cache_file = Path(tmp_dir) / ".test_cache" / "cache.json"
        cache = SummaryCache(str(cache_file))

        file_path = "test.py"
        content = "print('hello')"
        content_hash = get_file_hash(content)
        summary = "**Python** - 1 lines"

        # Test empty cache
        assert cache.get_summary(file_path, content_hash) is None

        # Test set and get
        cache.set_summary(file_path, content_hash, summary)
        assert cache.get_summary(file_path, content_hash) == summary

        # Test cache invalidation
        assert cache.get_summary(file_path, "different_hash") is None

        # Test persistence: a second instance reads the same file from disk
        cache2 = SummaryCache(str(cache_file))
        assert cache2.get_summary(file_path, content_hash) == summary
if __name__ == "__main__":
    # Allow running this file directly, without a test runner.
    for check in (test_get_file_hash, test_summary_cache):
        check()
    print("Tests passed!")