feat(rag): Implement indexing and retrieval logic with AppController integration
This commit is contained in:
@@ -2245,9 +2245,20 @@ def send(
|
||||
enable_tools: bool = True,
|
||||
stream_callback: Optional[Callable[[str], None]] = None,
|
||||
patch_callback: Optional[Callable[[str, str], Optional[str]]] = None,
|
||||
rag_engine: Optional[Any] = None,
|
||||
) -> str:
|
||||
monitor = performance_monitor.get_monitor()
|
||||
if monitor.enabled: monitor.start_component("ai_client.send")
|
||||
|
||||
if rag_engine and getattr(rag_engine.config, "enabled", False):
|
||||
chunks = rag_engine.search(user_message)
|
||||
if chunks:
|
||||
context_block = "## Retrieved Context\n\n"
|
||||
for i, chunk in enumerate(chunks):
|
||||
path = chunk.get("metadata", {}).get("path", "unknown")
|
||||
context_block += f"### Chunk {i+1} (Source: {path})\n{chunk.get('document', '')}\n\n"
|
||||
user_message = context_block + user_message
|
||||
|
||||
_append_comms("OUT", "request", {"message": user_message, "system": _get_combined_system_prompt(_active_tool_preset, _active_bias_profile)})
|
||||
with _send_lock:
|
||||
if _provider == "gemini":
|
||||
|
||||
+49
-3
@@ -35,6 +35,7 @@ from src import orchestrator_pm
|
||||
from src import conductor_tech_lead
|
||||
from src import multi_agent_conductor
|
||||
from src import tool_presets
|
||||
from src import rag_engine
|
||||
from src import theme_2 as theme
|
||||
|
||||
def hide_tk_root() -> Tk:
|
||||
@@ -202,6 +203,8 @@ class AppController:
|
||||
self._pending_ask_dialog: bool = False
|
||||
self.mcp_config: models.MCPConfiguration = models.MCPConfiguration()
|
||||
self.rag_config: Optional[models.RAGConfig] = None
|
||||
self.rag_engine: Optional[rag_engine.RAGEngine] = None
|
||||
self.rag_status: str = 'idle'
|
||||
# AI settings state
|
||||
self._current_provider: str = "gemini"
|
||||
self._current_model: str = "gemini-2.5-flash-lite"
|
||||
@@ -353,6 +356,7 @@ class AppController:
|
||||
'show_confirm_modal': 'show_confirm_modal',
|
||||
'mma_epic_input': 'ui_epic_input',
|
||||
'mma_status': 'mma_status',
|
||||
'rag_status': 'rag_status',
|
||||
'mma_active_tier': 'active_tier',
|
||||
'ui_new_track_name': 'ui_new_track_name',
|
||||
'ui_new_track_desc': 'ui_new_track_desc',
|
||||
@@ -560,6 +564,32 @@ class AppController:
|
||||
"payload": status
|
||||
})
|
||||
|
||||
def _set_rag_status(self, status: str) -> None:
|
||||
"""Thread-safe update of rag_status via the GUI task queue."""
|
||||
with self._pending_gui_tasks_lock:
|
||||
self._pending_gui_tasks.append({
|
||||
"action": "set_value",
|
||||
"item": "rag_status",
|
||||
"value": status
|
||||
})
|
||||
|
||||
def _rebuild_rag_index(self) -> None:
|
||||
"""Background thread that re-indexes all files in the current project."""
|
||||
if not self.rag_config or not self.rag_config.enabled or not self.rag_engine:
|
||||
return
|
||||
|
||||
def _run():
|
||||
try:
|
||||
self._set_rag_status("indexing...")
|
||||
for f in self.files:
|
||||
path = f.path if hasattr(f, "path") else str(f)
|
||||
self.rag_engine.index_file(path)
|
||||
self._set_rag_status("ready")
|
||||
except Exception as e:
|
||||
self._set_rag_status(f"error: {e}")
|
||||
|
||||
threading.Thread(target=_run, daemon=True).start()
|
||||
|
||||
def _trigger_gui_refresh(self):
|
||||
with self._pending_gui_tasks_lock:
|
||||
self._pending_gui_tasks.append({'action': 'set_comms_dirty'})
|
||||
@@ -955,6 +985,8 @@ class AppController:
|
||||
else:
|
||||
self.rag_config = models.RAGConfig()
|
||||
|
||||
self.rag_engine = rag_engine.RAGEngine(self.rag_config, self.active_project_root)
|
||||
|
||||
from src.personas import PersonaManager
|
||||
self.persona_manager = PersonaManager(Path(self.active_project_path).parent if self.active_project_path else None)
|
||||
self.personas = self.persona_manager.load_all()
|
||||
@@ -1448,7 +1480,8 @@ class AppController:
|
||||
stream_callback=lambda text: self._on_ai_stream(text),
|
||||
pre_tool_callback=self._confirm_and_run,
|
||||
qa_callback=ai_client.run_tier4_analysis,
|
||||
patch_callback=ai_client.run_tier4_patch_callback
|
||||
patch_callback=ai_client.run_tier4_patch_callback,
|
||||
rag_engine=self.rag_engine
|
||||
)
|
||||
self.event_queue.put("response", {"text": resp, "status": "done", "role": "AI"})
|
||||
except ai_client.ProviderError as e:
|
||||
@@ -1867,7 +1900,7 @@ class AppController:
|
||||
"ts": project_manager.now_ts()
|
||||
})
|
||||
try:
|
||||
resp = ai_client.send(stable_md, user_msg, base_dir, self.last_file_items, disc_text)
|
||||
resp = ai_client.send(stable_md, user_msg, base_dir, self.last_file_items, disc_text, rag_engine=self.rag_engine)
|
||||
if req.auto_add_history:
|
||||
with self._pending_history_adds_lock:
|
||||
self._pending_history_adds.append({
|
||||
@@ -2024,7 +2057,17 @@ class AppController:
|
||||
self._set_status(f"switched to: {Path(path).stem}")
|
||||
|
||||
def _refresh_from_project(self) -> None:
|
||||
self.files = list(self.project.get("files", {}).get("paths", []))
|
||||
# Deserialize FileItems in files.paths
|
||||
raw_paths = self.project.get("files", {}).get("paths", [])
|
||||
self.files = []
|
||||
for p in raw_paths:
|
||||
if isinstance(p, models.FileItem):
|
||||
self.files.append(p)
|
||||
elif isinstance(p, dict):
|
||||
self.files.append(models.FileItem.from_dict(p))
|
||||
else:
|
||||
self.files.append(models.FileItem(path=str(p)))
|
||||
|
||||
self.screenshots = list(self.project.get("screenshots", {}).get("paths", []))
|
||||
disc_sec = self.project.get("discussion", {})
|
||||
self.disc_roles = list(disc_sec.get("roles", ["User", "AI", "Vendor API", "System"]))
|
||||
@@ -2090,6 +2133,9 @@ class AppController:
|
||||
self.tool_presets = self.tool_preset_manager.load_all_presets()
|
||||
self.bias_profiles = self.tool_preset_manager.load_all_bias_profiles()
|
||||
|
||||
if self.rag_config and self.rag_config.enabled:
|
||||
self._rebuild_rag_index()
|
||||
|
||||
def _apply_preset(self, name: str, scope: str) -> None:
|
||||
print(f"[DEBUG] _apply_preset: name={name}, scope={scope}")
|
||||
if name == "None":
|
||||
|
||||
@@ -91,6 +91,72 @@ class RAGEngine:
|
||||
metadatas=metadatas
|
||||
)
|
||||
|
||||
def _chunk_text(self, content: str) -> List[str]:
|
||||
"""Character-based chunking with overlap."""
|
||||
chunks = []
|
||||
if not content:
|
||||
return chunks
|
||||
chunk_size = self.config.chunk_size
|
||||
overlap = self.config.chunk_overlap
|
||||
start = 0
|
||||
while start < len(content):
|
||||
end = start + chunk_size
|
||||
chunks.append(content[start:end])
|
||||
if end >= len(content):
|
||||
break
|
||||
start += (chunk_size - overlap)
|
||||
return chunks
|
||||
|
||||
def _chunk_code(self, content: str, file_path: str) -> List[str]:
|
||||
"""AST-aware chunking for Python code."""
|
||||
try:
|
||||
from src.file_cache import ASTParser
|
||||
parser = ASTParser("python")
|
||||
tree = parser.parse(content)
|
||||
chunks = []
|
||||
|
||||
# Capture classes and top-level functions
|
||||
for node in tree.root_node.children:
|
||||
if node.type in ("function_definition", "class_definition"):
|
||||
chunks.append(content[node.start_byte:node.end_byte])
|
||||
|
||||
# Fallback if no structural chunks found or if file is small
|
||||
if not chunks or len(content) < self.config.chunk_size:
|
||||
return self._chunk_text(content)
|
||||
return chunks
|
||||
except Exception:
|
||||
return self._chunk_text(content)
|
||||
|
||||
def index_file(self, file_path: str):
    """Read *file_path* (relative to base_dir), chunk it, and store the
    chunks in the vector collection.

    Silently returns when the engine is disabled, the collection is the
    "mock" sentinel, the file is missing, or it cannot be read — indexing
    is best-effort and must never crash the caller.
    """
    if not self.config.enabled or self.collection == "mock":
        return

    abs_path = os.path.join(self.base_dir, file_path)
    if not os.path.exists(abs_path):
        return

    try:
        with open(abs_path, "r", encoding="utf-8", errors="ignore") as fh:
            text = fh.read()
    except Exception:
        # Unreadable file: skip it rather than abort the whole index pass.
        return

    # Drop any stale chunks left from a previous indexing of this file.
    self.collection.delete(where={"path": file_path})

    is_python = file_path.lower().endswith(".py")
    chunks = self._chunk_code(text, file_path) if is_python else self._chunk_text(text)
    if not chunks:
        return

    doc_ids = [f"{file_path}_{idx}" for idx in range(len(chunks))]
    metadatas = [{"path": file_path, "chunk": idx} for idx in range(len(chunks))]
    self.add_documents(doc_ids, chunks, metadatas)
|
||||
|
||||
def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
|
||||
if not self.config.enabled or self.collection == "mock":
|
||||
return []
|
||||
|
||||
Reference in New Issue
Block a user