fix(chunkers): strip capturing groups and validate strictBoundaries scope

waleedlatif1 · waleedlatif1 · commit daaadb0f917f · 2026-04-30T17:47:03.000-07:00
- Convert unnamed capturing groups `(...)` to non-capturing `(?:...)` in the regex chunker so `split()` doesn't surface delimiter text as spurious chunks
- Reject strictBoundaries in chunkingConfigSchema when strategy is not regex
diff --git a/apps/sim/lib/api/contracts/knowledge/base.ts b/apps/sim/lib/api/contracts/knowledge/base.ts
@@ -45,6 +45,9 @@ export const chunkingConfigSchema = z
       message: 'Regex pattern is required when using the regex chunking strategy',
     }
   )
+  .refine((data) => data.strategy === 'regex' || data.strategyOptions?.strictBoundaries !== true, {
+    message: 'strictBoundaries is only valid for the regex chunking strategy',
+  })
 
 export const createKnowledgeBaseBodySchema = z.object({
   name: z.string().min(1, 'Name is required'),
diff --git a/apps/sim/lib/chunkers/regex-chunker.test.ts b/apps/sim/lib/chunkers/regex-chunker.test.ts
@@ -184,6 +184,41 @@ describe('RegexChunker', () => {
     })
   })
 
+  describe('capturing groups', () => {
+    it.concurrent(
+      'should not include delimiter text as a chunk when pattern has capturing groups',
+      async () => {
+        const chunker = new RegexChunker({
+          pattern: '(---)', // capturing group wrapped around the delimiter
+          chunkSize: 1024,
+          strictBoundaries: true,
+        })
+        const text = 'Section one content.---Section two content.---Section three content.'
+        const chunks = await chunker.chunk(text)
+
+        expect(chunks).toHaveLength(3)
+        expect(chunks[0].text).toBe('Section one content.')
+        expect(chunks[1].text).toBe('Section two content.')
+        expect(chunks[2].text).toBe('Section three content.')
+        for (const chunk of chunks) {
+          expect(chunk.text).not.toBe('---') // delimiter must never appear as its own chunk
+        }
+      }
+    )
+
+    it.concurrent('should leave non-capturing groups and lookarounds intact', async () => {
+      const chunker = new RegexChunker({
+        pattern: '(?=\\n\\s*\\{\\s*"id"\\s*:)', // lookahead opens with (? and must not be rewritten
+        chunkSize: 1024,
+        strictBoundaries: true,
+      })
+      const text = '{"id": 1, "v": "a"}\n{"id": 2, "v": "b"}\n{"id": 3, "v": "c"}'
+      const chunks = await chunker.chunk(text)
+
+      expect(chunks).toHaveLength(3) // one chunk per JSON object in the input
+    })
+  })
+
   describe('strictBoundaries mode', () => {
     it.concurrent(
       'should produce one chunk per match without merging small adjacent segments',
diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts
@@ -15,6 +15,33 @@ const logger = createLogger('RegexChunker')
 
 const MAX_PATTERN_LENGTH = 500
 
+/**
+ * Rewrites every unescaped `(` outside a `[...]` class into `(?:`, because
+ * `String.prototype.split()` interleaves captured text into its result, which
+ * would surface delimiters as spurious chunks. Any `(?...)` form is copied as-is;
+ * NOTE(review): that includes named groups `(?<n>...)`, which still capture.
+ */
+function toNonCapturing(pattern: string): string {
+  let result = ''
+  let inClass = false
+  for (let i = 0; i < pattern.length; i++) {
+    const c = pattern[i]
+    if (c === '\\' && i + 1 < pattern.length) { // escape pair: copy both chars untouched
+      result += c + pattern[i + 1]
+      i++
+      continue
+    }
+    if (c === '[') inClass = true // a paren inside a character class is a literal
+    else if (c === ']') inClass = false
+    if (!inClass && c === '(' && pattern[i + 1] !== '?') { // bare group opener only
+      result += '(?:'
+      continue
+    }
+    result += c
+  }
+  return result
+}
+
 export class RegexChunker {
   private readonly chunkSize: number
   private readonly chunkOverlap: number
@@ -39,7 +66,7 @@ export class RegexChunker {
     }
 
     try {
-      const regex = new RegExp(pattern, 'g')
+      const regex = new RegExp(toNonCapturing(pattern), 'g')
 
       const testStrings = [
         'a'.repeat(10000),

Original file line number	Diff line number	Diff line change
`@@ -45,6 +45,9 @@ export const chunkingConfigSchema = z`
`45`	`45`	`message: 'Regex pattern is required when using the regex chunking strategy',`
`46`	`46`	`}`
`47`	`47`	`)`
	`48`	`+ .refine((data) => data.strategy === 'regex' \|\| data.strategyOptions?.strictBoundaries !== true, {`
	`49`	`+ message: 'strictBoundaries is only valid for the regex chunking strategy',`
	`50`	`+ })`
`48`	`51`
`49`	`52`	`export const createKnowledgeBaseBodySchema = z.object({`
`50`	`53`	`name: z.string().min(1, 'Name is required'),`