feat(rag): Implement indexing and retrieval logic with AppController integration
This commit is contained in:
@@ -91,6 +91,72 @@ class RAGEngine:
|
||||
metadatas=metadatas
|
||||
)
|
||||
|
||||
def _chunk_text(self, content: str) -> List[str]:
    """Split ``content`` into fixed-size character windows that overlap.

    The window length is ``self.config.chunk_size`` and consecutive windows
    share ``self.config.chunk_overlap`` characters.

    Args:
        content: Text to split.

    Returns:
        The chunks in document order. Empty content yields an empty list;
        content shorter than one window yields a single chunk.
    """
    if not content:
        return []

    chunk_size = self.config.chunk_size
    if chunk_size <= 0:
        # A non-positive window cannot advance through the text; treat it as
        # "chunking disabled" and keep the content whole rather than hanging.
        return [content]

    # Clamp the overlap so each window starts at least one character after
    # the previous one. Without this, overlap >= chunk_size gives a zero or
    # negative step and the loop never terminates.
    overlap = min(max(self.config.chunk_overlap, 0), chunk_size - 1)
    step = chunk_size - overlap

    chunks: List[str] = []
    total = len(content)
    for start in range(0, total, step):
        end = start + chunk_size
        chunks.append(content[start:end])
        if end >= total:
            # This window already reaches the end of the text; any further
            # window would only repeat the tail.
            break
    return chunks
|
||||
|
||||
def _chunk_code(self, content: str, file_path: str) -> List[str]:
    """Split Python source into one chunk per top-level definition.

    Each top-level function or class (including decorated ones) becomes a
    chunk. The method falls back to ``self._chunk_text`` when the file is
    shorter than ``self.config.chunk_size``, when no definitions are found,
    or when parsing fails for any reason.

    Args:
        content: Source text of the file.
        file_path: Path of the file being chunked. Currently unused; kept
            so the signature stays stable for callers.

    Returns:
        A list of source snippets, or the character-based chunks produced
        by ``_chunk_text`` in the fallback cases.
    """
    # Small files are always indexed as plain text, so skip the parse.
    if len(content) < self.config.chunk_size:
        return self._chunk_text(content)

    # In the tree-sitter Python grammar a decorated def/class is wrapped in
    # a ``decorated_definition`` node; include it so decorated definitions
    # are not dropped from the index.
    structural_types = (
        "function_definition",
        "class_definition",
        "decorated_definition",
    )

    try:
        from src.file_cache import ASTParser

        tree = ASTParser("python").parse(content)

        # start_byte/end_byte are byte offsets, so slice the encoded source
        # rather than the str; str indices drift after any non-ASCII char.
        # NOTE(review): assumes ASTParser parses the UTF-8 encoding of
        # ``content`` - confirm against src.file_cache.
        source = content.encode("utf-8")
        chunks = [
            source[node.start_byte:node.end_byte].decode("utf-8", errors="ignore")
            for node in tree.root_node.children
            if node.type in structural_types
        ]
    except Exception:
        # Deliberate best-effort: a missing parser or a parse/encoding
        # failure must not prevent the file from being indexed.
        return self._chunk_text(content)

    # No structural chunks found: fall back to character-based chunking.
    return chunks or self._chunk_text(content)
|
||||
|
||||
def index_file(self, file_path: str) -> None:
    """Read, chunk, and index a file into the vector store.

    Existing entries whose ``path`` metadata equals ``file_path`` are
    deleted before the new chunks are added. ``.py`` files go through
    ``_chunk_code``; everything else goes through ``_chunk_text``.

    The call is a silent no-op when indexing is disabled, when the
    collection is the ``"mock"`` placeholder, or when the file cannot be
    read (missing, a directory, permission denied, invalid path).

    Args:
        file_path: Path of the file relative to ``self.base_dir``. It is
            also used as the ``path`` metadata value and as the prefix of
            every chunk id.
    """
    if not self.config.enabled or self.collection == "mock":
        return

    full_path = os.path.join(self.base_dir, file_path)
    try:
        # errors="ignore" drops undecodable bytes so files that are not
        # valid UTF-8 can still be indexed.
        with open(full_path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
    except (OSError, ValueError):
        # OSError covers missing files, directories and permission errors;
        # ValueError covers paths the OS rejects (e.g. embedded NUL).
        return

    # Remove old entries for this file so re-indexing does not leave chunks
    # from a previous version behind.
    self.collection.delete(where={"path": file_path})

    if file_path.lower().endswith(".py"):
        chunks = self._chunk_code(content, file_path)
    else:
        chunks = self._chunk_text(content)

    if not chunks:
        return

    indices = range(len(chunks))
    ids = [f"{file_path}_{i}" for i in indices]
    metadatas = [{"path": file_path, "chunk": i} for i in indices]
    self.add_documents(ids, chunks, metadatas)
|
||||
|
||||
def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
|
||||
if not self.config.enabled or self.collection == "mock":
|
||||
return []
|
||||
|
||||
Reference in New Issue
Block a user