diff --git a/docker-compose-library.yaml b/docker-compose-library.yaml
index 65c7e50ac..52482b18b 100644
--- a/docker-compose-library.yaml
+++ b/docker-compose-library.yaml
@@ -71,7 +71,7 @@ services:
       - WATSONX_API_KEY=${WATSONX_API_KEY:-}
       # Enable debug logging if needed
       - LLAMA_STACK_LOGGING=${LLAMA_STACK_LOGGING:-}
-      # FAISS test
+      # FAISS test and inline RAG config
       - FAISS_VECTOR_STORE_ID=${FAISS_VECTOR_STORE_ID:-}
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:8080/liveness"]
diff --git a/docker-compose.yaml b/docker-compose.yaml
index aed2bc0a1..1de76cdb3 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -89,6 +89,8 @@ services:
       - TENANT_ID=${TENANT_ID:-}
       - CLIENT_ID=${CLIENT_ID:-}
       - CLIENT_SECRET=${CLIENT_SECRET:-}
+      # FAISS vector store ID (used by inline RAG config)
+      - FAISS_VECTOR_STORE_ID=${FAISS_VECTOR_STORE_ID:-}
     depends_on:
         llama-stack:
           condition: service_healthy
diff --git a/src/llama_stack_configuration.py b/src/llama_stack_configuration.py
index e81ac0c3d..30a1b0796 100644
--- a/src/llama_stack_configuration.py
+++ b/src/llama_stack_configuration.py
@@ -184,7 +184,11 @@ def construct_vector_stores_section(
             output = ls_config["registered_resources"]["vector_stores"].copy()
 
     # append new vector_stores entries, skipping duplicates
-    existing_store_ids = {vs.get("vector_store_id") for vs in output}
+    # Resolve ${env.VAR} patterns so comparisons work when existing entries
+    # use environment variable references and new entries have resolved values.
+    existing_store_ids = {
+        replace_env_vars(vs.get("vector_store_id", "")) for vs in output
+    }
     added = 0
     for brag in byok_rag:
         if not brag.get("rag_id"):
diff --git a/test.containerfile b/test.containerfile
index ecfc54313..884fd8525 100644
--- a/test.containerfile
+++ b/test.containerfile
@@ -20,7 +20,8 @@ COPY src ./src
 RUN uv sync --locked --no-install-project --group llslibdev
 
 # Add virtual environment to PATH for llama command
-ENV PATH="/opt/app-root/.venv/bin:$PATH"
+ENV PATH="/opt/app-root/.venv/bin:$PATH" \
+    PYTHONPATH="/opt/app-root/src"
 
 # Set HOME directory so llama-stack uses /opt/app-root/src/.llama
 ENV HOME="/opt/app-root/src"
diff --git a/tests/e2e-prow/rhoai/configs/lightspeed-stack-inline-rag.yaml b/tests/e2e-prow/rhoai/configs/lightspeed-stack-inline-rag.yaml
new file mode 100644
index 000000000..bdbf2b73e
--- /dev/null
+++ b/tests/e2e-prow/rhoai/configs/lightspeed-stack-inline-rag.yaml
@@ -0,0 +1,38 @@
+name: Lightspeed Core Service (LCS)
+service:
+  host: 0.0.0.0
+  port: 8080
+  auth_enabled: false
+  workers: 1
+  color_log: true
+  access_log: true
+llama_stack:
+  use_as_library_client: false
+  url: http://${env.E2E_LLAMA_HOSTNAME}:8321
+  api_key: xyzzy
+user_data_collection:
+  feedback_enabled: true
+  feedback_storage: "/tmp/data/feedback"
+  transcripts_enabled: true
+  transcripts_storage: "/tmp/data/transcripts"
+
+conversation_cache:
+  type: "sqlite"
+  sqlite:
+    db_path: "/tmp/data/conversation-cache.db"
+
+authentication:
+  module: "noop"
+
+byok_rag:
+  - rag_id: e2e-test-docs
+    rag_type: inline::faiss
+    embedding_model: sentence-transformers/all-mpnet-base-v2
+    embedding_dimension: 768
+    vector_db_id: ${env.FAISS_VECTOR_STORE_ID}
+    db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
+    score_multiplier: 1.0
+
+rag:
+  inline:
+    - e2e-test-docs
diff --git a/tests/e2e/configs/run-ci.yaml b/tests/e2e/configs/run-ci.yaml
index cc16560f6..5c9dd8cb1 100644
--- a/tests/e2e/configs/run-ci.yaml
+++ b/tests/e2e/configs/run-ci.yaml
@@ -58,13 +58,7 @@ providers:
   - config: {} # Enable MCP (Model Context Protocol) support
     provider_id: model-context-protocol
     provider_type: remote::model-context-protocol
-  vector_io:
-  - config: # Define the storage backend for RAG
-      persistence:
-        namespace: vector_io::faiss
-        backend: kv_rag
-    provider_id: faiss
-    provider_type: inline::faiss
+  vector_io: []
   agents:
   - config:
       persistence:
@@ -111,9 +105,6 @@ storage:
     kv_default:
       type: kv_sqlite
       db_path: ${env.KV_STORE_PATH:=~/.llama/storage/kv_store.db}
-    kv_rag: # Define the storage backend type for RAG
-      type: kv_sqlite
-      db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
     sql_default:
       type: sql_sqlite
       db_path: ${env.SQL_STORE_PATH:=~/.llama/storage/sql_store.db}
@@ -144,11 +135,7 @@ registered_resources:
   - shield_id: llama-guard
     provider_id: llama-guard
     provider_shield_id: openai/gpt-4o-mini
-  vector_stores: 
-  - embedding_dimension: 768
-    embedding_model: sentence-transformers/all-mpnet-base-v2
-    provider_id: faiss
-    vector_store_id: ${env.FAISS_VECTOR_STORE_ID}
+  vector_stores: []
   datasets: []
   scoring_fns: []
   benchmarks: []
diff --git a/tests/e2e/configuration/library-mode/lightspeed-stack-auth-noop-token.yaml b/tests/e2e/configuration/library-mode/lightspeed-stack-auth-noop-token.yaml
index 0391b8ac6..2c55ae440 100644
--- a/tests/e2e/configuration/library-mode/lightspeed-stack-auth-noop-token.yaml
+++ b/tests/e2e/configuration/library-mode/lightspeed-stack-auth-noop-token.yaml
@@ -26,3 +26,16 @@ authentication:
 inference:
   default_provider: openai
   default_model: gpt-4o-mini
+
+byok_rag:
+  - rag_id: e2e-test-docs
+    rag_type: inline::faiss
+    embedding_model: sentence-transformers/all-mpnet-base-v2
+    embedding_dimension: 768
+    vector_db_id: ${env.FAISS_VECTOR_STORE_ID}
+    db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
+    score_multiplier: 1.0
+
+rag:
+  tool:
+    - e2e-test-docs
diff --git a/tests/e2e/configuration/library-mode/lightspeed-stack-inline-rag.yaml b/tests/e2e/configuration/library-mode/lightspeed-stack-inline-rag.yaml
new file mode 100644
index 000000000..ffd744da6
--- /dev/null
+++ b/tests/e2e/configuration/library-mode/lightspeed-stack-inline-rag.yaml
@@ -0,0 +1,40 @@
+name: Lightspeed Core Service (LCS)
+service:
+  host: 0.0.0.0
+  port: 8080
+  auth_enabled: false
+  workers: 1
+  color_log: true
+  access_log: true
+llama_stack:
+  use_as_library_client: true
+  library_client_config_path: run.yaml
+user_data_collection:
+  feedback_enabled: true
+  feedback_storage: "/tmp/data/feedback"
+  transcripts_enabled: true
+  transcripts_storage: "/tmp/data/transcripts"
+
+conversation_cache:
+  type: "sqlite"
+  sqlite:
+    db_path: "/tmp/data/conversation-cache.db"
+
+authentication:
+  module: "noop"
+inference:
+  default_provider: openai
+  default_model: gpt-4o-mini
+
+byok_rag:
+  - rag_id: e2e-test-docs
+    rag_type: inline::faiss
+    embedding_model: sentence-transformers/all-mpnet-base-v2
+    embedding_dimension: 768
+    vector_db_id: ${env.FAISS_VECTOR_STORE_ID}
+    db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
+    score_multiplier: 1.0
+
+rag:
+  inline:
+    - e2e-test-docs
diff --git a/tests/e2e/configuration/library-mode/lightspeed-stack.yaml b/tests/e2e/configuration/library-mode/lightspeed-stack.yaml
index bc5694578..b0d8b7290 100644
--- a/tests/e2e/configuration/library-mode/lightspeed-stack.yaml
+++ b/tests/e2e/configuration/library-mode/lightspeed-stack.yaml
@@ -20,6 +20,19 @@ authentication:
 inference:
   default_provider: openai
   default_model: gpt-4o-mini
+byok_rag:
+  - rag_id: e2e-test-docs
+    rag_type: inline::faiss
+    embedding_model: sentence-transformers/all-mpnet-base-v2
+    embedding_dimension: 768
+    vector_db_id: ${env.FAISS_VECTOR_STORE_ID}
+    db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
+    score_multiplier: 1.0
+
+rag:
+  tool:
+    - e2e-test-docs
+
 mcp_servers:
   # Mock server with client-provided auth - should appear in mcp-auth/client-options response
   - name: "github-api"
diff --git a/tests/e2e/configuration/server-mode/lightspeed-stack-auth-noop-token.yaml b/tests/e2e/configuration/server-mode/lightspeed-stack-auth-noop-token.yaml
index 642624020..a8bd9f207 100644
--- a/tests/e2e/configuration/server-mode/lightspeed-stack-auth-noop-token.yaml
+++ b/tests/e2e/configuration/server-mode/lightspeed-stack-auth-noop-token.yaml
@@ -32,3 +32,16 @@ authentication:
 inference:
   default_provider: openai
   default_model: gpt-4o-mini
+
+byok_rag:
+  - rag_id: e2e-test-docs
+    rag_type: inline::faiss
+    embedding_model: sentence-transformers/all-mpnet-base-v2
+    embedding_dimension: 768
+    vector_db_id: ${env.FAISS_VECTOR_STORE_ID}
+    db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
+    score_multiplier: 1.0
+
+rag:
+  tool:
+    - e2e-test-docs
diff --git a/tests/e2e/configuration/server-mode/lightspeed-stack-inline-rag.yaml b/tests/e2e/configuration/server-mode/lightspeed-stack-inline-rag.yaml
new file mode 100644
index 000000000..f5d09b2d8
--- /dev/null
+++ b/tests/e2e/configuration/server-mode/lightspeed-stack-inline-rag.yaml
@@ -0,0 +1,41 @@
+name: Lightspeed Core Service (LCS)
+service:
+  host: 0.0.0.0
+  port: 8080
+  auth_enabled: false
+  workers: 1
+  color_log: true
+  access_log: true
+llama_stack:
+  use_as_library_client: false
+  url: http://llama-stack:8321
+  api_key: xyzzy
+user_data_collection:
+  feedback_enabled: true
+  feedback_storage: "/tmp/data/feedback"
+  transcripts_enabled: true
+  transcripts_storage: "/tmp/data/transcripts"
+
+conversation_cache:
+  type: "sqlite"
+  sqlite:
+    db_path: "/tmp/data/conversation-cache.db"
+
+authentication:
+  module: "noop"
+inference:
+  default_provider: openai
+  default_model: gpt-4o-mini
+
+byok_rag:
+  - rag_id: e2e-test-docs
+    rag_type: inline::faiss
+    embedding_model: sentence-transformers/all-mpnet-base-v2
+    embedding_dimension: 768
+    vector_db_id: ${env.FAISS_VECTOR_STORE_ID}
+    db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
+    score_multiplier: 1.0
+
+rag:
+  inline:
+    - e2e-test-docs
diff --git a/tests/e2e/configuration/server-mode/lightspeed-stack.yaml b/tests/e2e/configuration/server-mode/lightspeed-stack.yaml
index 026c551de..1e43005fd 100644
--- a/tests/e2e/configuration/server-mode/lightspeed-stack.yaml
+++ b/tests/e2e/configuration/server-mode/lightspeed-stack.yaml
@@ -21,6 +21,19 @@ authentication:
 inference:
   default_provider: openai
   default_model: gpt-4o-mini
+byok_rag:
+  - rag_id: e2e-test-docs
+    rag_type: inline::faiss
+    embedding_model: sentence-transformers/all-mpnet-base-v2
+    embedding_dimension: 768
+    vector_db_id: ${env.FAISS_VECTOR_STORE_ID}
+    db_path: ${env.KV_RAG_PATH:=~/.llama/storage/rag/kv_store.db}
+    score_multiplier: 1.0
+
+rag:
+  tool:
+    - e2e-test-docs
+
 mcp_servers:
   # Mock server with client-provided auth - should appear in mcp-auth/client-options response
   - name: "github-api"
diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py
index 14204d45b..792c9fa0e 100644
--- a/tests/e2e/features/environment.py
+++ b/tests/e2e/features/environment.py
@@ -76,6 +76,10 @@
         "tests/e2e/configuration/{mode_dir}/lightspeed-stack-mcp-oauth-auth.yaml",
         "tests/e2e-prow/rhoai/configs/lightspeed-stack-mcp-oauth-auth.yaml",
     ),
+    "inline-rag": (
+        "tests/e2e/configuration/{mode_dir}/lightspeed-stack-inline-rag.yaml",
+        "tests/e2e-prow/rhoai/configs/lightspeed-stack-inline-rag.yaml",
+    ),
     "mcp-auth": (
         "tests/e2e/configuration/{mode_dir}/lightspeed-stack-mcp-auth.yaml",
         "tests/e2e-prow/rhoai/configs/lightspeed-stack-mcp-auth.yaml",
diff --git a/tests/e2e/features/faiss.feature b/tests/e2e/features/faiss.feature
index dd676c5af..c38dfc9e8 100644
--- a/tests/e2e/features/faiss.feature
+++ b/tests/e2e/features/faiss.feature
@@ -14,7 +14,7 @@ Feature: FAISS support tests
     """
     {
       "rags": [
-        "{VECTOR_STORE_ID}"
+        "e2e-test-docs"
       ]
     }
     """
diff --git a/tests/e2e/features/inline_rag.feature b/tests/e2e/features/inline_rag.feature
new file mode 100644
index 000000000..358d248ff
--- /dev/null
+++ b/tests/e2e/features/inline_rag.feature
@@ -0,0 +1,72 @@
+Feature: Inline RAG (BYOK) support tests
+
+  Background:
+    Given The service is started locally
+      And The system is in default state
+      And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva
+      And REST API service prefix is /v1
+      And The service uses the lightspeed-stack-inline-rag.yaml configuration
+      And The service is restarted
+
+  Scenario: Check if inline RAG source is registered
+    When I access REST API endpoint rags using HTTP GET method
+    Then The status code of the response is 200
+     And the body of the response has the following structure
+    """
+    {
+      "rags": [
+        "e2e-test-docs"
+      ]
+    }
+    """
+
+  Scenario: Query with inline RAG returns relevant content
+    When I use "query" to ask question with authorization header
+    """
+    {"query": "What is the title of the article from Paul?", "system_prompt": "You are an assistant. Write only lowercase letters"}
+    """
+    Then The status code of the response is 200
+     And The response should contain following fragments
+         | Fragments in LLM response |
+         | great work                |
+     And The response should contain non-empty rag_chunks
+
+  Scenario: Inline RAG query includes referenced documents
+    When I use "query" to ask question with authorization header
+    """
+    {"query": "What does Paul Graham say about great work?"}
+    """
+    Then The status code of the response is 200
+     And The response should contain non-empty referenced_documents
+
+  Scenario: Streaming query with inline RAG returns relevant content
+    When I use "streaming_query" to ask question with authorization header
+    """
+    {"query": "What is the title of the article from Paul?", "system_prompt": "You are an assistant. Write only lowercase letters"}
+    """
+    Then The status code of the response is 200
+     And I wait for the response to be completed
+     And The streamed response should contain following fragments
+         | Fragments in LLM response |
+         | great work                |
+
+  Scenario: Responses API with inline RAG returns relevant content
+    When I use "responses" to ask question with authorization header
+    """
+    {"input": "What is the title of the article from Paul?", "model": "{PROVIDER}/{MODEL}", "stream": false, "instructions": "You are an assistant. Write only lowercase letters"}
+    """
+    Then The status code of the response is 200
+     And The response should contain following fragments
+         | Fragments in LLM response |
+         | great work                |
+
+  Scenario: Streaming Responses API with inline RAG returns relevant content
+    When I use "responses" to ask question with authorization header
+    """
+    {"input": "What is the title of the article from Paul?", "model": "{PROVIDER}/{MODEL}", "stream": true, "instructions": "You are an assistant. Write only lowercase letters"}
+    """
+    Then The status code of the response is 200
+     And I wait for the response to be completed
+     And The streamed response should contain following fragments
+         | Fragments in LLM response |
+         | great work                |
diff --git a/tests/e2e/features/steps/common.py b/tests/e2e/features/steps/common.py
index b20f292e6..aca407661 100644
--- a/tests/e2e/features/steps/common.py
+++ b/tests/e2e/features/steps/common.py
@@ -5,6 +5,14 @@
 from behave import given  # pyright: ignore[reportAttributeAccessIssue]
 from behave.runner import Context
 
+from tests.e2e.utils.utils import (
+    create_config_backup,
+    is_prow_environment,
+    restart_container,
+    switch_config,
+    wait_for_container_health,
+)
+
 
 @given("The service is started locally")
 def service_is_started_locally(context: Context) -> None:
@@ -23,6 +31,43 @@ def service_is_started_locally(context: Context) -> None:
     context.port_llama = os.getenv("E2E_LLAMA_PORT", "8321")
 
 
+@given("The service uses the {config_name} configuration")  # type: ignore
+def configure_service(context: Context, config_name: str) -> None:
+    """Switch to the given configuration if not already active.
+
+    On first call creates a backup of the current config and switches to the
+    named config; it does not restart the container (the separate "The service
+    is restarted" step does).  Outside Prow, later calls are no-ops once
+    lightspeed-stack.yaml.backup exists; in Prow the helpers are always called.
+
+    Parameters:
+        context (Context): Behave context.
+        config_name (str): Config filename (e.g. lightspeed-stack-inline-rag.yaml).
+    """
+    if not is_prow_environment() and os.path.exists("lightspeed-stack.yaml.backup"):
+        return
+
+    mode_dir = "library-mode" if context.is_library_mode else "server-mode"
+    if is_prow_environment():
+        config_path = f"tests/e2e-prow/rhoai/configs/{config_name}"
+    else:
+        config_path = f"tests/e2e/configuration/{mode_dir}/{config_name}"
+    create_config_backup("lightspeed-stack.yaml")
+    switch_config(config_path)
+
+
+@given("The service is restarted")
+def restart_service(context: Context) -> None:
+    """Restart the lightspeed-stack container and wait for it to be healthy.
+
+    Parameters:
+        context (Context): Behave context.
+    """
+    restart_container("lightspeed-stack")
+    # Library mode needs extra time to load embedding models after restart
+    wait_for_container_health("lightspeed-stack", max_attempts=12)
+
+
 @given("The system is in default state")
 def system_in_default_state(context: Context) -> None:
     """Check the default system state.
diff --git a/tests/e2e/features/steps/llm_query_response.py b/tests/e2e/features/steps/llm_query_response.py
index 4d5dcc487..2410ddbe3 100644
--- a/tests/e2e/features/steps/llm_query_response.py
+++ b/tests/e2e/features/steps/llm_query_response.py
@@ -136,6 +136,30 @@ def check_llm_response_not_truncated(context: Context) -> None:
     assert response_json["truncated"] is False
 
 
+@then("The response should contain non-empty rag_chunks")
+def check_rag_chunks_present(context: Context) -> None:
+    """Check that the response contains non-empty rag_chunks from inline RAG."""
+    assert context.response is not None
+    response_json = context.response.json()
+    assert "rag_chunks" in response_json, "rag_chunks field missing from response"
+    assert (
+        len(response_json["rag_chunks"]) > 0
+    ), "rag_chunks is empty — inline RAG did not inject any chunks"
+
+
+@then("The response should contain non-empty referenced_documents")
+def check_referenced_documents_present(context: Context) -> None:
+    """Check that the response contains non-empty referenced_documents."""
+    assert context.response is not None
+    response_json = context.response.json()
+    assert (
+        "referenced_documents" in response_json
+    ), "referenced_documents field missing from response"
+    assert (
+        len(response_json["referenced_documents"]) > 0
+    ), "referenced_documents is empty — no documents were referenced"
+
+
 @then("The response should contain following fragments")
 def check_fragments_in_response(context: Context) -> None:
     """Check that all specified fragments are present in the LLM response.
@@ -149,7 +173,21 @@ def check_fragments_in_response(context: Context) -> None:
     """
     assert context.response is not None
     response_json = context.response.json()
-    response = response_json["response"]
+
+    # Support both query endpoint format (response field) and responses API format (output array)
+    if "response" in response_json:
+        response = response_json["response"]
+    else:
+        # Responses API format: extract text from output messages
+        response = " ".join(
+            part.get("text", "")
+            for item in response_json.get("output", [])
+            if item.get("type") == "message"
+            for part in (
+                item.get("content") if isinstance(item.get("content"), list) else []
+            )
+            if part.get("type") == "output_text"
+        )
 
     assert context.table is not None, "Fragments are not specified in table"
 
diff --git a/tests/e2e/test_list.txt b/tests/e2e/test_list.txt
index 988232bfa..faabf3747 100644
--- a/tests/e2e/test_list.txt
+++ b/tests/e2e/test_list.txt
@@ -1,4 +1,5 @@
 features/faiss.feature
+features/inline_rag.feature
 features/smoketests.feature
 features/authorized_noop.feature
 features/authorized_noop_token.feature
diff --git a/tests/unit/test_llama_stack_configuration.py b/tests/unit/test_llama_stack_configuration.py
index 56dc11a17..14fd1c999 100644
--- a/tests/unit/test_llama_stack_configuration.py
+++ b/tests/unit/test_llama_stack_configuration.py
@@ -101,6 +101,34 @@ def test_construct_vector_stores_section_skips_duplicate_from_existing() -> None
     assert output[0]["provider_id"] == "original_provider"
 
 
+def test_construct_vector_stores_section_skips_duplicate_env_var(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Test skips BYOK entry when existing store uses an env var that resolves to the same ID."""
+    monkeypatch.setenv("FAISS_VECTOR_STORE_ID", "vs_abc123")
+    ls_config = {
+        "registered_resources": {
+            "vector_stores": [
+                {
+                    "vector_store_id": "${env.FAISS_VECTOR_STORE_ID}",
+                    "provider_id": "faiss",
+                },
+            ]
+        }
+    }
+    byok_rag = [
+        {
+            "rag_id": "rag1",
+            "vector_db_id": "vs_abc123",
+            "embedding_model": "test-model",
+            "embedding_dimension": 768,
+        },
+    ]
+    output = construct_vector_stores_section(ls_config, byok_rag)
+    assert len(output) == 1
+    assert output[0]["provider_id"] == "faiss"
+
+
 def test_construct_vector_stores_section_skips_duplicate_within_byok() -> None:
     """Test skips duplicate vector_db_id entries within the BYOK RAG list."""
     ls_config: dict[str, Any] = {}