diff --git a/docker-compose-library.yaml b/docker-compose-library.yaml index 65c7e50ac..52482b18b 100644 --- a/docker-compose-library.yaml +++ b/docker-compose-library.yaml @@ -71,7 +71,7 @@ services: - WATSONX_API_KEY=${WATSONX_API_KEY:-} # Enable debug logging if needed - LLAMA_STACK_LOGGING=${LLAMA_STACK_LOGGING:-} - # FAISS test + # FAISS test and inline RAG config - FAISS_VECTOR_STORE_ID=${FAISS_VECTOR_STORE_ID:-} healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8080/liveness"] diff --git a/docker-compose.yaml b/docker-compose.yaml index aed2bc0a1..1de76cdb3 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -89,6 +89,8 @@ services: - TENANT_ID=${TENANT_ID:-} - CLIENT_ID=${CLIENT_ID:-} - CLIENT_SECRET=${CLIENT_SECRET:-} + # FAISS vector store ID (used by inline RAG config) + - FAISS_VECTOR_STORE_ID=${FAISS_VECTOR_STORE_ID:-} depends_on: llama-stack: condition: service_healthy diff --git a/src/llama_stack_configuration.py b/src/llama_stack_configuration.py index e81ac0c3d..30a1b0796 100644 --- a/src/llama_stack_configuration.py +++ b/src/llama_stack_configuration.py @@ -184,7 +184,11 @@ def construct_vector_stores_section( output = ls_config["registered_resources"]["vector_stores"].copy() # append new vector_stores entries, skipping duplicates - existing_store_ids = {vs.get("vector_store_id") for vs in output} + # Resolve ${env.VAR} patterns so comparisons work when existing entries + # use environment variable references and new entries have resolved values. + existing_store_ids = { + replace_env_vars(vs.get("vector_store_id", "")) for vs in output + } added = 0 for brag in byok_rag: if not brag.get("rag_id"): diff --git a/test.containerfile b/test.containerfile index ecfc54313..884fd8525 100644 --- a/test.containerfile +++ b/test.containerfile @@ -20,7 +20,8 @@ COPY src ./src RUN uv sync --locked --no-install-project --group llslibdev # Add virtual environment to PATH for llama command -ENV PATH="/opt/app-root/.venv/bin:$PATH" +ENV PATH="/opt/app-root/.venv/bin:$PATH" \ + PYTHONPATH="/opt/app-root/src" # Set HOME directory so llama-stack uses /opt/app-root/src/.llama ENV HOME="/opt/app-root/src" diff --git a/tests/e2e-prow/rhoai/configs/lightspeed-stack-inline-rag.yaml b/tests/e2e-prow/rhoai/configs/lightspeed-stack-inline-rag.yaml new file mode 100644 index 000000000..bdbf2b73e --- /dev/null +++ b/tests/e2e-prow/rhoai/configs/lightspeed-stack-inline-rag.yaml @@ -0,0 +1,38 @@ +name: Lightspeed Core Service (LCS) +service: + host: 0.0.0.0 + port: 8080 + auth_enabled: false + workers: 1 + color_log: true + access_log: true +llama_stack: + use_as_library_client: false + url: http://${env.E2E_LLAMA_HOSTNAME}:8321 + api_key: xyzzy +user_data_collection: + feedback_enabled: true + feedback_storage: "/tmp/data/feedback" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" + +conversation_cache: + type: "sqlite" + sqlite: + db_path: "/tmp/data/conversation-cache.db" + +authentication: + module: "noop" + +byok_rag: + - rag_id: e2e-test-docs + rag_type: inline::faiss + embedding_model: sentence-transformers/all-mpnet-base-v2 + embedding_dimension: 768 + vector_db_id: ${env.FAISS_VECTOR_STORE_ID} + db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db} + score_multiplier: 1.0 + +rag: + inline: + - e2e-test-docs diff --git a/tests/e2e/configs/run-ci.yaml b/tests/e2e/configs/run-ci.yaml index cc16560f6..5c9dd8cb1 100644 --- a/tests/e2e/configs/run-ci.yaml +++ b/tests/e2e/configs/run-ci.yaml @@ -58,13 +58,7 @@ providers: - config: {} # Enable MCP (Model Context Protocol) support provider_id: model-context-protocol provider_type: remote::model-context-protocol - vector_io: - - config: # Define the storage backend for RAG - persistence: - namespace: vector_io::faiss - backend: kv_rag - provider_id: faiss - provider_type: inline::faiss + vector_io: [] agents: - config: persistence: @@ -111,9 +105,6 @@ storage: kv_default: type: kv_sqlite db_path: ${env.KV_STORE_PATH:=~/.llama/storage/kv_store.db} - kv_rag: # Define the storage backend type for RAG - type: kv_sqlite - db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db} sql_default: type: sql_sqlite db_path: ${env.SQL_STORE_PATH:=~/.llama/storage/sql_store.db} @@ -144,11 +135,7 @@ registered_resources: - shield_id: llama-guard provider_id: llama-guard provider_shield_id: openai/gpt-4o-mini - vector_stores: - - embedding_dimension: 768 - embedding_model: sentence-transformers/all-mpnet-base-v2 - provider_id: faiss - vector_store_id: ${env.FAISS_VECTOR_STORE_ID} + vector_stores: [] datasets: [] scoring_fns: [] benchmarks: [] diff --git a/tests/e2e/configuration/library-mode/lightspeed-stack-auth-noop-token.yaml b/tests/e2e/configuration/library-mode/lightspeed-stack-auth-noop-token.yaml index 0391b8ac6..2c55ae440 100644 --- a/tests/e2e/configuration/library-mode/lightspeed-stack-auth-noop-token.yaml +++ b/tests/e2e/configuration/library-mode/lightspeed-stack-auth-noop-token.yaml @@ -26,3 +26,16 @@ authentication: inference: default_provider: openai default_model: gpt-4o-mini + +byok_rag: + - rag_id: e2e-test-docs + rag_type: inline::faiss + embedding_model: sentence-transformers/all-mpnet-base-v2 + embedding_dimension: 768 + vector_db_id: ${env.FAISS_VECTOR_STORE_ID} + db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db} + score_multiplier: 1.0 + +rag: + tool: + - e2e-test-docs diff --git a/tests/e2e/configuration/library-mode/lightspeed-stack-inline-rag.yaml b/tests/e2e/configuration/library-mode/lightspeed-stack-inline-rag.yaml new file mode 100644 index 000000000..ffd744da6 --- /dev/null +++ b/tests/e2e/configuration/library-mode/lightspeed-stack-inline-rag.yaml @@ -0,0 +1,40 @@ +name: Lightspeed Core Service (LCS) +service: + host: 0.0.0.0 + port: 8080 + auth_enabled: false + workers: 1 + color_log: true + access_log: true +llama_stack: + use_as_library_client: true + library_client_config_path: run.yaml +user_data_collection: + feedback_enabled: true + feedback_storage: "/tmp/data/feedback" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" + +conversation_cache: + type: "sqlite" + sqlite: + db_path: "/tmp/data/conversation-cache.db" + +authentication: + module: "noop" +inference: + default_provider: openai + default_model: gpt-4o-mini + +byok_rag: + - rag_id: e2e-test-docs + rag_type: inline::faiss + embedding_model: sentence-transformers/all-mpnet-base-v2 + embedding_dimension: 768 + vector_db_id: ${env.FAISS_VECTOR_STORE_ID} + db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db} + score_multiplier: 1.0 + +rag: + inline: + - e2e-test-docs diff --git a/tests/e2e/configuration/library-mode/lightspeed-stack.yaml b/tests/e2e/configuration/library-mode/lightspeed-stack.yaml index bc5694578..b0d8b7290 100644 --- a/tests/e2e/configuration/library-mode/lightspeed-stack.yaml +++ b/tests/e2e/configuration/library-mode/lightspeed-stack.yaml @@ -20,6 +20,19 @@ authentication: inference: default_provider: openai default_model: gpt-4o-mini +byok_rag: + - rag_id: e2e-test-docs + rag_type: inline::faiss + embedding_model: sentence-transformers/all-mpnet-base-v2 + embedding_dimension: 768 + vector_db_id: ${env.FAISS_VECTOR_STORE_ID} + db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db} + score_multiplier: 1.0 + +rag: + tool: + - e2e-test-docs + mcp_servers: # Mock server with client-provided auth - should appear in mcp-auth/client-options response - name: "github-api" diff --git a/tests/e2e/configuration/server-mode/lightspeed-stack-auth-noop-token.yaml b/tests/e2e/configuration/server-mode/lightspeed-stack-auth-noop-token.yaml index 642624020..a8bd9f207 100644 --- a/tests/e2e/configuration/server-mode/lightspeed-stack-auth-noop-token.yaml +++ b/tests/e2e/configuration/server-mode/lightspeed-stack-auth-noop-token.yaml @@ -32,3 +32,16 @@ authentication: inference: default_provider: openai default_model: gpt-4o-mini + +byok_rag: + - rag_id: e2e-test-docs + rag_type: inline::faiss + embedding_model: sentence-transformers/all-mpnet-base-v2 + embedding_dimension: 768 + vector_db_id: ${env.FAISS_VECTOR_STORE_ID} + db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db} + score_multiplier: 1.0 + +rag: + tool: + - e2e-test-docs diff --git a/tests/e2e/configuration/server-mode/lightspeed-stack-inline-rag.yaml b/tests/e2e/configuration/server-mode/lightspeed-stack-inline-rag.yaml new file mode 100644 index 000000000..f5d09b2d8 --- /dev/null +++ b/tests/e2e/configuration/server-mode/lightspeed-stack-inline-rag.yaml @@ -0,0 +1,41 @@ +name: Lightspeed Core Service (LCS) +service: + host: 0.0.0.0 + port: 8080 + auth_enabled: false + workers: 1 + color_log: true + access_log: true +llama_stack: + use_as_library_client: false + url: http://llama-stack:8321 + api_key: xyzzy +user_data_collection: + feedback_enabled: true + feedback_storage: "/tmp/data/feedback" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" + +conversation_cache: + type: "sqlite" + sqlite: + db_path: "/tmp/data/conversation-cache.db" + +authentication: + module: "noop" +inference: + default_provider: openai + default_model: gpt-4o-mini + +byok_rag: + - rag_id: e2e-test-docs + rag_type: inline::faiss + embedding_model: sentence-transformers/all-mpnet-base-v2 + embedding_dimension: 768 + vector_db_id: ${env.FAISS_VECTOR_STORE_ID} + db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db} + score_multiplier: 1.0 + +rag: + inline: + - e2e-test-docs diff --git a/tests/e2e/configuration/server-mode/lightspeed-stack.yaml b/tests/e2e/configuration/server-mode/lightspeed-stack.yaml index 026c551de..1e43005fd 100644 --- a/tests/e2e/configuration/server-mode/lightspeed-stack.yaml +++ b/tests/e2e/configuration/server-mode/lightspeed-stack.yaml @@ -21,6 +21,19 @@ authentication: inference: default_provider: openai default_model: gpt-4o-mini +byok_rag: + - rag_id: e2e-test-docs + rag_type: inline::faiss + embedding_model: sentence-transformers/all-mpnet-base-v2 + embedding_dimension: 768 + vector_db_id: ${env.FAISS_VECTOR_STORE_ID} + db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db} + score_multiplier: 1.0 + +rag: + tool: + - e2e-test-docs + mcp_servers: # Mock server with client-provided auth - should appear in mcp-auth/client-options response - name: "github-api" diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index 14204d45b..792c9fa0e 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -76,6 +76,10 @@ "tests/e2e/configuration/{mode_dir}/lightspeed-stack-mcp-oauth-auth.yaml", "tests/e2e-prow/rhoai/configs/lightspeed-stack-mcp-oauth-auth.yaml", ), + "inline-rag": ( + "tests/e2e/configuration/{mode_dir}/lightspeed-stack-inline-rag.yaml", + "tests/e2e-prow/rhoai/configs/lightspeed-stack-inline-rag.yaml", + ), "mcp-auth": ( "tests/e2e/configuration/{mode_dir}/lightspeed-stack-mcp-auth.yaml", "tests/e2e-prow/rhoai/configs/lightspeed-stack-mcp-auth.yaml", diff --git a/tests/e2e/features/faiss.feature b/tests/e2e/features/faiss.feature index dd676c5af..c38dfc9e8 100644 --- a/tests/e2e/features/faiss.feature +++ b/tests/e2e/features/faiss.feature @@ -14,7 +14,7 @@ Feature: FAISS support tests """ { "rags": [ - "{VECTOR_STORE_ID}" + "e2e-test-docs" ] } """ diff --git a/tests/e2e/features/inline_rag.feature b/tests/e2e/features/inline_rag.feature new file mode 100644 index 000000000..358d248ff --- /dev/null +++ b/tests/e2e/features/inline_rag.feature @@ -0,0 +1,72 @@ +Feature: Inline RAG (BYOK) support tests + + Background: + Given The service is started locally + And The system is in default state + And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva + And REST API service prefix is /v1 + And The service uses the lightspeed-stack-inline-rag.yaml configuration + And The service is restarted + + Scenario: Check if inline RAG source is registered + When I access REST API endpoint rags using HTTP GET method + Then The status code of the response is 200 + And the body of the response has the following structure + """ + { + "rags": [ + "e2e-test-docs" + ] + } + """ + + Scenario: Query with inline RAG returns relevant content + When I use "query" to ask question with authorization header + """ + {"query": "What is the title of the article from Paul?", "system_prompt": "You are an assistant. Write only lowercase letters"} + """ + Then The status code of the response is 200 + And The response should contain following fragments + | Fragments in LLM response | + | great work | + And The response should contain non-empty rag_chunks + + Scenario: Inline RAG query includes referenced documents + When I use "query" to ask question with authorization header + """ + {"query": "What does Paul Graham say about great work?"} + """ + Then The status code of the response is 200 + And The response should contain non-empty referenced_documents + + Scenario: Streaming query with inline RAG returns relevant content + When I use "streaming_query" to ask question with authorization header + """ + {"query": "What is the title of the article from Paul?", "system_prompt": "You are an assistant. Write only lowercase letters"} + """ + Then The status code of the response is 200 + And I wait for the response to be completed + And The streamed response should contain following fragments + | Fragments in LLM response | + | great work | + + Scenario: Responses API with inline RAG returns relevant content + When I use "responses" to ask question with authorization header + """ + {"input": "What is the title of the article from Paul?", "model": "{PROVIDER}/{MODEL}", "stream": false, "instructions": "You are an assistant. Write only lowercase letters"} + """ + Then The status code of the response is 200 + And The response should contain following fragments + | Fragments in LLM response | + | great work | + + Scenario: Streaming Responses API with inline RAG returns relevant content + When I use "responses" to ask question with authorization header + """ + {"input": "What is the title of the article from Paul?", "model": "{PROVIDER}/{MODEL}", "stream": true, "instructions": "You are an assistant. Write only lowercase letters"} + """ + Then The status code of the response is 200 + And I wait for the response to be completed + And The streamed response should contain following fragments + | Fragments in LLM response | + | great work | diff --git a/tests/e2e/features/steps/common.py b/tests/e2e/features/steps/common.py index b20f292e6..aca407661 100644 --- a/tests/e2e/features/steps/common.py +++ b/tests/e2e/features/steps/common.py @@ -5,6 +5,14 @@ from behave import given # pyright: ignore[reportAttributeAccessIssue] from behave.runner import Context +from tests.e2e.utils.utils import ( + create_config_backup, + is_prow_environment, + restart_container, + switch_config, + wait_for_container_health, +) + @given("The service is started locally") def service_is_started_locally(context: Context) -> None: @@ -23,6 +31,43 @@ def service_is_started_locally(context: Context) -> None: context.port_llama = os.getenv("E2E_LLAMA_PORT", "8321") +@given("The service uses the {config_name} configuration") # type: ignore +def configure_service(context: Context, config_name: str) -> None: + """Switch to the given configuration if not already active. + + On first call creates a backup of the current config, switches to the + named config, and restarts the container. Subsequent calls within + the same feature are no-ops (detected by backup file existence in Docker + or backup key presence in Prow). + + Parameters: + context (Context): Behave context. + config_name (str): Config filename (e.g. lightspeed-stack-inline-rag.yaml). + """ + if not is_prow_environment() and os.path.exists("lightspeed-stack.yaml.backup"): + return + + mode_dir = "library-mode" if context.is_library_mode else "server-mode" + if is_prow_environment(): + config_path = f"tests/e2e-prow/rhoai/configs/{config_name}" + else: + config_path = f"tests/e2e/configuration/{mode_dir}/{config_name}" + create_config_backup("lightspeed-stack.yaml") + switch_config(config_path) + + +@given("The service is restarted") +def restart_service(context: Context) -> None: + """Restart the lightspeed-stack container and wait for it to be healthy. + + Parameters: + context (Context): Behave context. + """ + restart_container("lightspeed-stack") + # Library mode needs extra time to load embedding models after restart + wait_for_container_health("lightspeed-stack", max_attempts=12) + + @given("The system is in default state") def system_in_default_state(context: Context) -> None: """Check the default system state. diff --git a/tests/e2e/features/steps/llm_query_response.py b/tests/e2e/features/steps/llm_query_response.py index 4d5dcc487..2410ddbe3 100644 --- a/tests/e2e/features/steps/llm_query_response.py +++ b/tests/e2e/features/steps/llm_query_response.py @@ -136,6 +136,30 @@ def check_llm_response_not_truncated(context: Context) -> None: assert response_json["truncated"] is False +@then("The response should contain non-empty rag_chunks") +def check_rag_chunks_present(context: Context) -> None: + """Check that the response contains non-empty rag_chunks from inline RAG.""" + assert context.response is not None + response_json = context.response.json() + assert "rag_chunks" in response_json, "rag_chunks field missing from response" + assert ( + len(response_json["rag_chunks"]) > 0 + ), "rag_chunks is empty — inline RAG did not inject any chunks" + + +@then("The response should contain non-empty referenced_documents") +def check_referenced_documents_present(context: Context) -> None: + """Check that the response contains non-empty referenced_documents.""" + assert context.response is not None + response_json = context.response.json() + assert ( + "referenced_documents" in response_json + ), "referenced_documents field missing from response" + assert ( + len(response_json["referenced_documents"]) > 0 + ), "referenced_documents is empty — no documents were referenced" + + @then("The response should contain following fragments") def check_fragments_in_response(context: Context) -> None: """Check that all specified fragments are present in the LLM response. @@ -149,7 +173,21 @@ def check_fragments_in_response(context: Context) -> None: """ assert context.response is not None response_json = context.response.json() - response = response_json["response"] + + # Support both query endpoint format (response field) and responses API format (output array) + if "response" in response_json: + response = response_json["response"] + else: + # Responses API format: extract text from output messages + response = " ".join( + part.get("text", "") + for item in response_json.get("output", []) + if item.get("type") == "message" + for part in ( + item.get("content") if isinstance(item.get("content"), list) else [] + ) + if part.get("type") == "output_text" + ) assert context.table is not None, "Fragments are not specified in table" diff --git a/tests/e2e/test_list.txt b/tests/e2e/test_list.txt index 988232bfa..faabf3747 100644 --- a/tests/e2e/test_list.txt +++ b/tests/e2e/test_list.txt @@ -1,4 +1,5 @@ features/faiss.feature +features/inline_rag.feature features/smoketests.feature features/authorized_noop.feature features/authorized_noop_token.feature diff --git a/tests/unit/test_llama_stack_configuration.py b/tests/unit/test_llama_stack_configuration.py index 56dc11a17..14fd1c999 100644 --- a/tests/unit/test_llama_stack_configuration.py +++ b/tests/unit/test_llama_stack_configuration.py @@ -101,6 +101,34 @@ def test_construct_vector_stores_section_skips_duplicate_from_existing() -> None assert output[0]["provider_id"] == "original_provider" +def test_construct_vector_stores_section_skips_duplicate_env_var( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Test skips BYOK entry when existing store uses an env var that resolves to the same ID.""" + monkeypatch.setenv("FAISS_VECTOR_STORE_ID", "vs_abc123") + ls_config = { + "registered_resources": { + "vector_stores": [ + { + "vector_store_id": "${env.FAISS_VECTOR_STORE_ID}", + "provider_id": "faiss", + }, + ] + } + } + byok_rag = [ + { + "rag_id": "rag1", + "vector_db_id": "vs_abc123", + "embedding_model": "test-model", + "embedding_dimension": 768, + }, + ] + output = construct_vector_stores_section(ls_config, byok_rag) + assert len(output) == 1 + assert output[0]["provider_id"] == "faiss" + + def test_construct_vector_stores_section_skips_duplicate_within_byok() -> None: """Test skips duplicate vector_db_id entries within the BYOK RAG list.""" ls_config: dict[str, Any] = {}