feat(rag): implement incremental and parallel indexing performance optimizations
This commit is contained in:
+29
-1
@@ -149,6 +149,20 @@ class RAGEngine:
|
||||
if not os.path.exists(full_path):
|
||||
return
|
||||
|
||||
try:
|
||||
mtime = os.path.getmtime(full_path)
|
||||
except Exception:
|
||||
return
|
||||
|
||||
# Incremental check: see if we already have this file with the same mtime
|
||||
try:
|
||||
res = self.collection.get(where={"path": file_path}, limit=1, include=["metadatas"])
|
||||
if res and res["metadatas"] and res["metadatas"][0]:
|
||||
if res["metadatas"][0].get("mtime") == mtime:
|
||||
return
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
with open(full_path, "r", encoding="utf-8", errors="ignore") as f:
|
||||
content = f.read()
|
||||
@@ -167,7 +181,7 @@ class RAGEngine:
|
||||
return
|
||||
|
||||
ids = [f"{file_path}_{i}" for i in range(len(chunks))]
|
||||
metadatas = [{"path": file_path, "chunk": i} for i in range(len(chunks))]
|
||||
metadatas = [{"path": file_path, "chunk": i, "mtime": mtime} for i in range(len(chunks))]
|
||||
self.add_documents(ids, chunks, metadatas)
|
||||
|
||||
def _search_mcp(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
|
||||
@@ -216,3 +230,17 @@ class RAGEngine:
|
||||
if not self.config.enabled or self.collection == "mock":
|
||||
return
|
||||
self.collection.delete(ids=ids)
|
||||
|
||||
def get_all_indexed_paths(self) -> List[str]:
    """Return the deduplicated list of file paths currently indexed.

    Scans every stored metadata record in the vector collection and
    collects its "path" field.

    Returns:
        Unique path strings; an empty list when RAG is disabled, the
        collection is the "mock" placeholder, or nothing is indexed.
    """
    if not self.config.enabled or self.collection == "mock":
        return []
    res = self.collection.get(include=["metadatas"])
    # Robustness: the store may return no result, a missing/empty
    # "metadatas" key, or None entries inside the list — none of
    # those should crash the scan.
    metadatas = (res or {}).get("metadatas") or []
    return list({m.get("path") for m in metadatas if m and m.get("path")})
|
||||
|
||||
def delete_documents_by_path(self, file_paths: List[str]):
    """Delete all stored chunks whose metadata path matches any entry
    of *file_paths*.

    No-op when RAG is disabled or the collection is the "mock"
    placeholder.
    """
    active = self.config.enabled and self.collection != "mock"
    if not active:
        return
    # One where-filtered delete per path keeps the store API simple.
    for target in file_paths:
        self.collection.delete(where={"path": target})
|
||||
|
||||
Reference in New Issue
Block a user