2 changes: 1 addition & 1 deletion docker-compose-library.yaml
@@ -71,7 +71,7 @@ services:
- WATSONX_API_KEY=${WATSONX_API_KEY:-}
# Enable debug logging if needed
- LLAMA_STACK_LOGGING=${LLAMA_STACK_LOGGING:-}
# FAISS test
# FAISS test and inline RAG config
- FAISS_VECTOR_STORE_ID=${FAISS_VECTOR_STORE_ID:-}
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/liveness"]
2 changes: 2 additions & 0 deletions docker-compose.yaml
@@ -89,6 +89,8 @@ services:
- TENANT_ID=${TENANT_ID:-}
- CLIENT_ID=${CLIENT_ID:-}
- CLIENT_SECRET=${CLIENT_SECRET:-}
# FAISS vector store ID (used by inline RAG config)
- FAISS_VECTOR_STORE_ID=${FAISS_VECTOR_STORE_ID:-}
depends_on:
llama-stack:
condition: service_healthy
6 changes: 5 additions & 1 deletion src/llama_stack_configuration.py
@@ -184,7 +184,11 @@ def construct_vector_stores_section(
output = ls_config["registered_resources"]["vector_stores"].copy()

# append new vector_stores entries, skipping duplicates
existing_store_ids = {vs.get("vector_store_id") for vs in output}
# Resolve ${env.VAR} patterns so comparisons work when existing entries
# use environment variable references and new entries have resolved values.
existing_store_ids = {
replace_env_vars(vs.get("vector_store_id", "")) for vs in output
}
added = 0
for brag in byok_rag:
if not brag.get("rag_id"):
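For context, the comparison in this hunk only works because both sides are normalized. A minimal sketch of that normalization, assuming `replace_env_vars` resolves `${env.VAR}` and `${env.VAR:=default}` references against the process environment (the real helper lives in src/llama_stack_configuration.py and may differ; this reimplementation is illustrative only):

import os
import re

# Matches ${env.VAR} and ${env.VAR:=default} (assumed syntax, per the configs below).
_ENV_REF = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*)(?::=([^}]*))?\}")

def replace_env_vars(value: str) -> str:
    """Illustrative stand-in for the project's helper of the same name."""
    def _sub(m: re.Match) -> str:
        name, default = m.group(1), m.group(2)
        return os.environ.get(name, default if default is not None else "")
    return _ENV_REF.sub(_sub, value)

# With resolution applied, an existing entry stored as an env reference
# deduplicates against a new entry whose ID has already been resolved.
existing = [{"vector_store_id": "${env.FAISS_VECTOR_STORE_ID}"}]
existing_store_ids = {replace_env_vars(vs.get("vector_store_id", "")) for vs in existing}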
3 changes: 2 additions & 1 deletion test.containerfile
@@ -20,7 +20,8 @@ COPY src ./src
RUN uv sync --locked --no-install-project --group llslibdev

# Add virtual environment to PATH for llama command
ENV PATH="/opt/app-root/.venv/bin:$PATH"
ENV PATH="/opt/app-root/.venv/bin:$PATH" \
PYTHONPATH="/opt/app-root/src"

# Set HOME directory so llama-stack uses /opt/app-root/src/.llama
ENV HOME="/opt/app-root/src"
38 changes: 38 additions & 0 deletions tests/e2e-prow/rhoai/configs/lightspeed-stack-inline-rag.yaml
@@ -0,0 +1,38 @@
name: Lightspeed Core Service (LCS)
service:
host: 0.0.0.0
port: 8080
auth_enabled: false
workers: 1
color_log: true
access_log: true
llama_stack:
use_as_library_client: false
url: http://${env.E2E_LLAMA_HOSTNAME}:8321
api_key: xyzzy
user_data_collection:
feedback_enabled: true
feedback_storage: "/tmp/data/feedback"
transcripts_enabled: true
transcripts_storage: "/tmp/data/transcripts"

conversation_cache:
type: "sqlite"
sqlite:
db_path: "/tmp/data/conversation-cache.db"

authentication:
module: "noop"

byok_rag:
- rag_id: e2e-test-docs
rag_type: inline::faiss
embedding_model: sentence-transformers/all-mpnet-base-v2
embedding_dimension: 768
vector_db_id: ${env.FAISS_VECTOR_STORE_ID}
db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
score_multiplier: 1.0

rag:
inline:
- e2e-test-docs
17 changes: 2 additions & 15 deletions tests/e2e/configs/run-ci.yaml
@@ -58,13 +58,7 @@ providers:
- config: {} # Enable MCP (Model Context Protocol) support
provider_id: model-context-protocol
provider_type: remote::model-context-protocol
vector_io:
- config: # Define the storage backend for RAG
persistence:
namespace: vector_io::faiss
backend: kv_rag
provider_id: faiss
provider_type: inline::faiss
vector_io: []
agents:
- config:
persistence:
@@ -111,9 +105,6 @@ storage:
kv_default:
type: kv_sqlite
db_path: ${env.KV_STORE_PATH:=~/.llama/storage/kv_store.db}
kv_rag: # Define the storage backend type for RAG
type: kv_sqlite
db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
sql_default:
type: sql_sqlite
db_path: ${env.SQL_STORE_PATH:=~/.llama/storage/sql_store.db}
@@ -144,11 +135,7 @@ registered_resources:
- shield_id: llama-guard
provider_id: llama-guard
provider_shield_id: openai/gpt-4o-mini
vector_stores:
- embedding_dimension: 768
embedding_model: sentence-transformers/all-mpnet-base-v2
provider_id: faiss
vector_store_id: ${env.FAISS_VECTOR_STORE_ID}
vector_stores: []
datasets: []
scoring_fns: []
benchmarks: []
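The static vector_io provider and registered vector_stores entry removed above are now expected to be generated from the byok_rag sections of the lightspeed-stack configs. A rough sketch of that mapping, inferred from the fields of the deleted block rather than from construct_vector_stores_section itself, so the exact field handling is an assumption:

# Hypothetical expansion of one byok_rag entry into a registered vector store;
# the field mapping mirrors the static entry deleted from run-ci.yaml above.
brag = {
    "rag_id": "e2e-test-docs",
    "rag_type": "inline::faiss",
    "embedding_model": "sentence-transformers/all-mpnet-base-v2",
    "embedding_dimension": 768,
    "vector_db_id": "${env.FAISS_VECTOR_STORE_ID}",
}
vector_store_entry = {
    "embedding_dimension": brag["embedding_dimension"],
    "embedding_model": brag["embedding_model"],
    "provider_id": brag["rag_type"].split("::")[-1],  # "inline::faiss" -> "faiss"
    "vector_store_id": brag["vector_db_id"],
}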
@@ -26,3 +26,16 @@ authentication:
inference:
default_provider: openai
default_model: gpt-4o-mini

byok_rag:
- rag_id: e2e-test-docs
rag_type: inline::faiss
embedding_model: sentence-transformers/all-mpnet-base-v2
embedding_dimension: 768
vector_db_id: ${env.FAISS_VECTOR_STORE_ID}
db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
score_multiplier: 1.0

rag:
tool:
- e2e-test-docs
@@ -0,0 +1,40 @@
name: Lightspeed Core Service (LCS)
service:
host: 0.0.0.0
port: 8080
auth_enabled: false
workers: 1
color_log: true
access_log: true
llama_stack:
use_as_library_client: true
library_client_config_path: run.yaml
user_data_collection:
feedback_enabled: true
feedback_storage: "/tmp/data/feedback"
transcripts_enabled: true
transcripts_storage: "/tmp/data/transcripts"

conversation_cache:
type: "sqlite"
sqlite:
db_path: "/tmp/data/conversation-cache.db"

authentication:
module: "noop"
inference:
default_provider: openai
default_model: gpt-4o-mini

byok_rag:
- rag_id: e2e-test-docs
rag_type: inline::faiss
embedding_model: sentence-transformers/all-mpnet-base-v2
embedding_dimension: 768
vector_db_id: ${env.FAISS_VECTOR_STORE_ID}
db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
score_multiplier: 1.0

rag:
inline:
- e2e-test-docs
13 changes: 13 additions & 0 deletions tests/e2e/configuration/library-mode/lightspeed-stack.yaml
@@ -20,6 +20,19 @@ authentication:
inference:
default_provider: openai
default_model: gpt-4o-mini
byok_rag:
- rag_id: e2e-test-docs
rag_type: inline::faiss
embedding_model: sentence-transformers/all-mpnet-base-v2
embedding_dimension: 768
vector_db_id: ${env.FAISS_VECTOR_STORE_ID}
db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
score_multiplier: 1.0

rag:
tool:
- e2e-test-docs

mcp_servers:
# Mock server with client-provided auth - should appear in mcp-auth/client-options response
- name: "github-api"
@@ -32,3 +32,16 @@ authentication:
inference:
default_provider: openai
default_model: gpt-4o-mini

byok_rag:
- rag_id: e2e-test-docs
rag_type: inline::faiss
embedding_model: sentence-transformers/all-mpnet-base-v2
embedding_dimension: 768
vector_db_id: ${env.FAISS_VECTOR_STORE_ID}
db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
score_multiplier: 1.0

rag:
tool:
- e2e-test-docs
@@ -0,0 +1,41 @@
name: Lightspeed Core Service (LCS)
Contributor: create a copy of these new configs also in e2e-prow

Contributor Author: addressed in new commit

service:
host: 0.0.0.0
port: 8080
auth_enabled: false
workers: 1
color_log: true
access_log: true
llama_stack:
use_as_library_client: false
url: http://llama-stack:8321
api_key: xyzzy
user_data_collection:
feedback_enabled: true
feedback_storage: "/tmp/data/feedback"
transcripts_enabled: true
transcripts_storage: "/tmp/data/transcripts"

conversation_cache:
type: "sqlite"
sqlite:
db_path: "/tmp/data/conversation-cache.db"

authentication:
module: "noop"
inference:
default_provider: openai
default_model: gpt-4o-mini

byok_rag:
- rag_id: e2e-test-docs
rag_type: inline::faiss
embedding_model: sentence-transformers/all-mpnet-base-v2
embedding_dimension: 768
vector_db_id: ${env.FAISS_VECTOR_STORE_ID}
db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
score_multiplier: 1.0

rag:
inline:
- e2e-test-docs
13 changes: 13 additions & 0 deletions tests/e2e/configuration/server-mode/lightspeed-stack.yaml
@@ -21,6 +21,19 @@ authentication:
inference:
default_provider: openai
default_model: gpt-4o-mini
byok_rag:
- rag_id: e2e-test-docs
rag_type: inline::faiss
embedding_model: sentence-transformers/all-mpnet-base-v2
embedding_dimension: 768
vector_db_id: ${env.FAISS_VECTOR_STORE_ID}
db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
score_multiplier: 1.0

rag:
tool:
- e2e-test-docs

mcp_servers:
# Mock server with client-provided auth - should appear in mcp-auth/client-options response
- name: "github-api"
4 changes: 4 additions & 0 deletions tests/e2e/features/environment.py
@@ -76,6 +76,10 @@
"tests/e2e/configuration/{mode_dir}/lightspeed-stack-mcp-oauth-auth.yaml",
"tests/e2e-prow/rhoai/configs/lightspeed-stack-mcp-oauth-auth.yaml",
),
"inline-rag": (
"tests/e2e/configuration/{mode_dir}/lightspeed-stack-inline-rag.yaml",
"tests/e2e-prow/rhoai/configs/lightspeed-stack-inline-rag.yaml",
),
"mcp-auth": (
"tests/e2e/configuration/{mode_dir}/lightspeed-stack-mcp-auth.yaml",
"tests/e2e-prow/rhoai/configs/lightspeed-stack-mcp-auth.yaml",
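Usage sketch for the mapping added above: each key resolves to a (local, prow) config pair, with {mode_dir} filled in per test mode. The dict name CONFIG_FILES below is a stand-in, since the enclosing variable's real name is not visible in this hunk:

# Hypothetical lookup against the mapping added in environment.py.
CONFIG_FILES = {
    "inline-rag": (
        "tests/e2e/configuration/{mode_dir}/lightspeed-stack-inline-rag.yaml",
        "tests/e2e-prow/rhoai/configs/lightspeed-stack-inline-rag.yaml",
    ),
}
local_template, prow_path = CONFIG_FILES["inline-rag"]
print(local_template.format(mode_dir="server-mode"))
# -> tests/e2e/configuration/server-mode/lightspeed-stack-inline-rag.yaml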
2 changes: 1 addition & 1 deletion tests/e2e/features/faiss.feature
@@ -14,7 +14,7 @@ Feature: FAISS support tests
"""
{
"rags": [
"{VECTOR_STORE_ID}"
"e2e-test-docs"
]
}
"""
72 changes: 72 additions & 0 deletions tests/e2e/features/inline_rag.feature
@@ -0,0 +1,72 @@
Feature: Inline RAG (BYOK) support tests

Background:
Given The service is started locally
And The system is in default state
And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
And REST API service prefix is /v1
And The service uses the lightspeed-stack-inline-rag.yaml configuration
And The service is restarted

Scenario: Check if inline RAG source is registered
When I access REST API endpoint rags using HTTP GET method
Then The status code of the response is 200
And the body of the response has the following structure
"""
{
"rags": [
"e2e-test-docs"
]
}
"""

Scenario: Query with inline RAG returns relevant content
When I use "query" to ask question with authorization header
"""
{"query": "What is the title of the article from Paul?", "system_prompt": "You are an assistant. Write only lowercase letters"}
"""
Then The status code of the response is 200
And The response should contain following fragments
| Fragments in LLM response |
| great work |
And The response should contain non-empty rag_chunks

Scenario: Inline RAG query includes referenced documents
When I use "query" to ask question with authorization header
"""
{"query": "What does Paul Graham say about great work?"}
"""
Then The status code of the response is 200
Contributor: why did you skip the verification of the response content in this case?

Contributor Author: I thought separate tests should test different things, but I can update the test to get more consistency in the results.

And The response should contain non-empty referenced_documents

Scenario: Streaming query with inline RAG returns relevant content
When I use "streaming_query" to ask question with authorization header
"""
{"query": "What is the title of the article from Paul?", "system_prompt": "You are an assistant. Write only lowercase letters"}
"""
Then The status code of the response is 200
And I wait for the response to be completed
And The streamed response should contain following fragments
| Fragments in LLM response |
| great work |

Scenario: Responses API with inline RAG returns relevant content
When I use "responses" to ask question with authorization header
Contributor: add here also a test where stream is set to true to cover all existing cases

Contributor Author: addressed in new commit, TY

"""
{"input": "What is the title of the article from Paul?", "model": "{PROVIDER}/{MODEL}", "stream": false, "instructions": "You are an assistant. Write only lowercase letters"}
"""
Then The status code of the response is 200
And The response should contain following fragments
| Fragments in LLM response |
| great work |

Scenario: Streaming Responses API with inline RAG returns relevant content
When I use "responses" to ask question with authorization header
"""
{"input": "What is the title of the article from Paul?", "model": "{PROVIDER}/{MODEL}", "stream": true, "instructions": "You are an assistant. Write only lowercase letters"}
"""
Then The status code of the response is 200
And I wait for the response to be completed
And The streamed response should contain following fragments
| Fragments in LLM response |
| great work |
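And for the streaming scenarios, a rough client-side equivalent, assuming the streamed body can simply be concatenated and searched for the expected fragment (the exact wire format of the stream is not shown in this diff):

import requests

payload = {
    "query": "What is the title of the article from Paul?",
    "system_prompt": "You are an assistant. Write only lowercase letters",
}
headers = {"Authorization": "Bearer <test-jwt>"}  # token elided
with requests.post(
    "http://localhost:8080/v1/streaming_query",
    json=payload, headers=headers, stream=True, timeout=60,
) as resp:
    assert resp.status_code == 200
    body = "".join(chunk for chunk in resp.iter_content(decode_unicode=True) if chunk)
assert "great work" in body  # fragment asserted by the scenarios above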