6399dcc4ed
Phase 5: rag_engine.search() return type (FR4 row 7)

Before: def search(...) -> List[Dict[str, Any]] at src/rag_engine.py:367
After:  def search(...) -> List["RAGChunk"]
Delta:  -1 wrong type annotation (List[Dict] -> List[RAGChunk])

The RAGChunk dataclass is extended with an `id: str = ""` field to preserve the chroma wire-format identifier. The search() function now constructs RAGChunk instances directly from chromadb query results, normalizing the wire format (metadata.path -> RAGChunk.path; distance -> 1.0 - score) at the boundary.

Consumer updates:
- src/ai_client.py:3259-3266: chunk["metadata"]["path"] -> chunk.path; chunk["document"] -> chunk.document (direct attribute access)
- src/app_controller.py:3506: docstring updated from Result[List[Dict]] to Result[List[RAGChunk]] (no code change; pass-through)

Test updates:
- tests/test_rag_engine.py:61: results[0]["id"] -> results[0].id (now uses dataclass attribute access)

Verification:
- audit_weak_types --strict: OK (107 <= 112 baseline)
- py_check_syntax: OK on rag_engine.py, ai_client.py, test_rag_engine.py
- 21 RAG tests pass (test_rag_engine, test_rag_chunk, test_rag_engine_ready_status_bug, test_rag_integration, test_context_composition_decoupled, test_tiered_aggregation)
151 lines
5.8 KiB
Python
151 lines
5.8 KiB
Python
import pytest
|
|
import os
|
|
from unittest.mock import MagicMock, patch
|
|
from src import models
|
|
from src import rag_engine
|
|
from src.rag_engine import RAGEngine, BaseEmbeddingProvider, LocalEmbeddingProvider, GeminiEmbeddingProvider
|
|
|
|
class MockEmbeddingProvider(BaseEmbeddingProvider):
    """Embedding provider stub that emits a constant 384-value vector per text."""

    def embed(self, texts):
        """Return one fresh ``[0.1] * 384`` list for every item in *texts*."""
        vectors = []
        for _text in texts:
            vectors.append([0.1] * 384)
        return vectors
|
|
|
|
@pytest.fixture
def mock_rag_config():
    """Provide an enabled RAGConfig whose vector store uses the 'mock' provider."""
    return models.RAGConfig(
        enabled=True,
        vector_store=models.VectorStoreConfig(provider='mock', collection_name='test'),
        embedding_provider='gemini',
    )
|
|
|
|
def test_rag_engine_init_mock(mock_rag_config):
    """An engine built from the mock config keeps it enabled and exposes "mock" as its collection."""
    rag = RAGEngine(mock_rag_config)
    enabled_flag = rag.config.enabled
    assert enabled_flag is True
    assert rag.collection == "mock"
|
|
|
|
def test_local_embedding_provider_missing_dependency_has_install_hint():
    """Constructing LocalEmbeddingProvider without sentence_transformers must raise an
    ImportError whose message matches the ``manual_slop[local-rag]`` pattern."""
    # Clear the module-level attribute, then make `import sentence_transformers` fail.
    cleared_cache = patch.object(rag_engine, "_SENTENCE_TRANSFORMERS", None)
    blocked_import = patch.dict("sys.modules", {"sentence_transformers": None})
    with cleared_cache, blocked_import:
        with pytest.raises(ImportError, match=r"manual_slop\[local-rag\]"):
            LocalEmbeddingProvider()
|
|
|
|
@patch('src.rag_engine.LocalEmbeddingProvider.embed')
@patch('src.rag_engine._get_chromadb')
def test_rag_engine_chroma(mock_get_chroma, mock_embed):
    """Exercise add / search / delete on a chroma-backed engine with chromadb fully mocked."""
    # Wire the fake chromadb: module -> PersistentClient -> collection.
    collection = MagicMock()
    client = MagicMock()
    client.get_or_create_collection.return_value = collection
    chroma_module = MagicMock()
    chroma_module.PersistentClient.return_value = client
    settings = MagicMock()
    mock_get_chroma.return_value = (chroma_module, settings)
    mock_embed.return_value = [[0.1, 0.2, 0.3]]

    config = models.RAGConfig(
        enabled=True,
        vector_store=models.VectorStoreConfig(provider='chroma', collection_name='test'),
        embedding_provider='local',
    )

    # Single-hit response in the nested-list layout used for chroma query results.
    single_hit = {
        "ids": [["doc1"]],
        "documents": [["hello world"]],
        "metadatas": [[{}]],
        "distances": [[0.0]]
    }

    with patch('src.rag_engine._get_sentence_transformers') as mock_st:
        mock_st.return_value = MagicMock()
        engine = RAGEngine(config)
        assert engine.collection == collection

        engine.add_documents(["doc1"], ["hello world"])
        collection.upsert.assert_called_once()

        # Install the query response only after indexing, as the original test does.
        collection.query.return_value = single_hit

        hits = engine.search("hello", top_k=1)
        assert len(hits) == 1
        first_hit = hits[0]
        assert first_hit.id == "doc1"

        engine.delete_documents(["doc1"])
        collection.delete.assert_called_once_with(ids=["doc1"])
|
|
|
|
@patch('src.rag_engine.LocalEmbeddingProvider.embed')
@patch('src.rag_engine._get_chromadb')
def test_rag_collection_dim_mismatch_recreates_collection(mock_get_chroma, mock_embed):
    """
    Guard against the RAG dimension-mismatch flake seen in
    test_rag_phase4_stress (live_gui_test_hardening_v2 followup).

    Setup: the on-disk ChromaDB collection holds vectors written by an
    earlier embedding provider (e.g. Gemini, 3072-dim) while the active
    config selects a different one (e.g. local SentenceTransformers,
    384-dim). If the dimension is not checked, upsert silently corrupts
    the collection and a later search() fails with
    "Collection expecting embedding with dimension of 3072, got 384".

    Expectation: RAGEngine.__init__ notices the mismatch, removes the
    stale collection through client.delete_collection, and recreates it
    empty so that later indexing uses the correct dimension.
    """
    # Existing collection content: one stored vector of length 3072.
    stale_collection = MagicMock()
    stale_collection.get.return_value = {
        "embeddings": [[0.1] * 3072],
        "metadatas": [{}],
        "ids": ["stale_doc_1"],
    }
    stale_collection.name = "test"

    client = MagicMock()
    client.get_or_create_collection.return_value = stale_collection
    chroma_module = MagicMock()
    chroma_module.PersistentClient.return_value = client
    settings = MagicMock()
    mock_get_chroma.return_value = (chroma_module, settings)

    # The current provider yields vectors of length 384.
    mock_embed.return_value = [[0.1] * 384]

    config = models.RAGConfig(
        enabled=True,
        vector_store=models.VectorStoreConfig(provider='chroma', collection_name='test'),
        embedding_provider='local',
    )

    with patch('src.rag_engine._get_sentence_transformers') as mock_st:
        mock_st.return_value = MagicMock()
        engine = RAGEngine(config)
        assert engine.collection == stale_collection
        # Two get_or_create_collection calls are expected: the first from
        # _init_vector_store_result, the second from
        # _validate_collection_dim_result, which on a dim mismatch calls
        # client.delete_collection(name) and then recreates the collection
        # with the correct dim.
        create_calls = client.get_or_create_collection.call_count
        assert create_calls == 2
        client.delete_collection.assert_called_once_with("test")
|
|
|
|
@patch('src.rag_engine.LocalEmbeddingProvider.embed')
@patch('src.rag_engine._get_chromadb')
def test_rag_collection_dim_match_preserves_collection(mock_get_chroma, mock_embed):
    """
    Counterpart to the dim-mismatch test: if the stored vectors already
    have the same dimension as the current provider's output, the engine
    must NOT delete the collection, since that would throw away indexed
    data.
    """
    # Existing collection content: one stored vector of length 384.
    live_collection = MagicMock()
    live_collection.get.return_value = {
        "embeddings": [[0.1] * 384],
        "metadatas": [{"path": "file_25.txt"}],
        "ids": ["doc_25_0"],
    }
    live_collection.name = "test"

    client = MagicMock()
    client.get_or_create_collection.return_value = live_collection
    chroma_module = MagicMock()
    chroma_module.PersistentClient.return_value = client
    settings = MagicMock()
    mock_get_chroma.return_value = (chroma_module, settings)

    # The current provider also yields vectors of length 384.
    mock_embed.return_value = [[0.1] * 384]

    config = models.RAGConfig(
        enabled=True,
        vector_store=models.VectorStoreConfig(provider='chroma', collection_name='test'),
        embedding_provider='local',
    )

    with patch('src.rag_engine._get_sentence_transformers') as mock_st:
        mock_st.return_value = MagicMock()
        engine = RAGEngine(config)
        assert engine.collection == live_collection
        client.delete_collection.assert_not_called()
        create_calls = client.get_or_create_collection.call_count
        assert create_calls == 1

        engine.delete_documents(["doc1"])
        live_collection.delete.assert_called_once_with(ids=["doc1"])
|