Skip to content

Commit daaadb0

Browse files
committed
fix(chunkers): strip capturing groups and validate strictBoundaries scope
- Convert capturing groups to non-capturing in the regex chunker so that split() doesn't surface delimiter text as spurious chunks
- Reject strictBoundaries in chunkingConfigSchema when the strategy is not regex
1 parent 19e04fb commit daaadb0

3 files changed

Lines changed: 66 additions & 1 deletion

File tree

apps/sim/lib/api/contracts/knowledge/base.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ export const chunkingConfigSchema = z
4545
message: 'Regex pattern is required when using the regex chunking strategy',
4646
}
4747
)
48+
.refine((data) => data.strategy === 'regex' || data.strategyOptions?.strictBoundaries !== true, {
49+
message: 'strictBoundaries is only valid for the regex chunking strategy',
50+
})
4851

4952
export const createKnowledgeBaseBodySchema = z.object({
5053
name: z.string().min(1, 'Name is required'),

apps/sim/lib/chunkers/regex-chunker.test.ts

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,41 @@ describe('RegexChunker', () => {
184184
})
185185
})
186186

187+
describe('capturing groups', () => {
  // A capturing delimiter must not come back from the chunker as its own chunk.
  it.concurrent(
    'should not include delimiter text as a chunk when pattern has capturing groups',
    async () => {
      const input = 'Section one content.---Section two content.---Section three content.'
      const splitter = new RegexChunker({
        pattern: '(---)',
        chunkSize: 1024,
        strictBoundaries: true,
      })

      const pieces = await splitter.chunk(input)

      expect(pieces).toHaveLength(3)
      expect(pieces[0].text).toBe('Section one content.')
      expect(pieces[1].text).toBe('Section two content.')
      expect(pieces[2].text).toBe('Section three content.')
      for (const piece of pieces) {
        expect(piece.text).not.toBe('---')
      }
    }
  )

  // A lookahead pattern starts with `(?`, so it has to reach the RegExp unmodified.
  it.concurrent('should leave non-capturing groups and lookarounds intact', async () => {
    const input = '{"id": 1, "v": "a"}\n{"id": 2, "v": "b"}\n{"id": 3, "v": "c"}'
    const splitter = new RegexChunker({
      pattern: '(?=\\n\\s*\\{\\s*"id"\\s*:)',
      chunkSize: 1024,
      strictBoundaries: true,
    })

    const pieces = await splitter.chunk(input)

    expect(pieces).toHaveLength(3)
  })
})
221+
187222
describe('strictBoundaries mode', () => {
188223
it.concurrent(
189224
'should produce one chunk per match without merging small adjacent segments',

apps/sim/lib/chunkers/regex-chunker.ts

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,33 @@ const logger = createLogger('RegexChunker')
1515

1616
const MAX_PATTERN_LENGTH = 500
1717

18+
/**
 * Rewrites capturing groups as non-capturing groups `(?:...)` so the pattern
 * can be used with `String.prototype.split()`, which interleaves every captured
 * group into its result array and would surface delimiter text as spurious chunks.
 *
 * Both forms of capturing group are converted:
 * - plain groups `(...)`
 * - named groups `(?<name>...)`
 *
 * Left untouched: escaped parentheses, parentheses inside a character class,
 * lookaheads `(?=` / `(?!`, lookbehinds `(?<=` / `(?<!`, and groups that are
 * already non-capturing.
 *
 * If the pattern contains a backreference (`\1`-`\9` or `\k<name>`) outside a
 * character class, it is returned unchanged: removing the referenced group
 * would silently change what the pattern matches.
 *
 * @param pattern - Regular expression source text (no delimiters or flags).
 * @returns The pattern with capturing groups converted, or the original
 *   pattern when it relies on backreferences.
 */
function toNonCapturing(pattern: string): string {
  let result = ''
  let inClass = false
  for (let i = 0; i < pattern.length; i++) {
    const c = pattern[i]
    if (c === '\\' && i + 1 < pattern.length) {
      const escaped = pattern.charAt(i + 1)
      const isNumericBackref = escaped >= '1' && escaped <= '9'
      const isNamedBackref = escaped === 'k' && pattern.charAt(i + 2) === '<'
      // Backreferences need their capturing groups; keep the pattern as written.
      if (!inClass && (isNumericBackref || isNamedBackref)) {
        return pattern
      }
      // Copy the escape pair verbatim so an escaped `(`, `[` or `]` is never interpreted.
      result += c + escaped
      i++
      continue
    }
    if (c === '[') inClass = true
    else if (c === ']') inClass = false
    if (!inClass && c === '(') {
      if (pattern.charAt(i + 1) !== '?') {
        result += '(?:'
        continue
      }
      // `(?<` opens either a lookbehind (`(?<=`, `(?<!`) or a named capturing group.
      const afterAngle = pattern.charAt(i + 3)
      if (pattern.charAt(i + 2) === '<' && afterAngle !== '=' && afterAngle !== '!') {
        const nameEnd = pattern.indexOf('>', i + 3)
        // Require a non-empty name so a malformed `(?<>` is still rejected by RegExp.
        if (nameEnd > i + 3) {
          result += '(?:'
          i = nameEnd
          continue
        }
      }
    }
    result += c
  }
  return result
}
44+
1845
export class RegexChunker {
1946
private readonly chunkSize: number
2047
private readonly chunkOverlap: number
@@ -39,7 +66,7 @@ export class RegexChunker {
3966
}
4067

4168
try {
42-
const regex = new RegExp(pattern, 'g')
69+
const regex = new RegExp(toNonCapturing(pattern), 'g')
4370

4471
const testStrings = [
4572
'a'.repeat(10000),

0 commit comments

Comments
 (0)