Skip to content

Commit c5b9b2f

Browse files
committed
fix(chunkers): audit fixes and comprehensive tests
- Fix SentenceChunker regex: lookbehinds now include the period to correctly handle abbreviations (Mr., Dr., etc.), initials (J.K.), and decimals - Fix RegexChunker ReDoS: reset lastIndex between adversarial test iterations, add poisoned-suffix test strings - Fix DocsChunker: skip code blocks during table boundary detection to prevent false positives from pipe characters - Fix JsonYamlChunker: oversized primitive leaf values now fall back to text chunking instead of emitting a single chunk - Fix TokenChunker: pass 0 to buildChunks for overlap metadata since sliding window handles overlap inherently - Add defensive guard in splitAtWordBoundaries to prevent infinite loops if step is 0 - Add tests for utils, TokenChunker, SentenceChunker, RecursiveChunker, RegexChunker (236 total tests, 0 failures) - Fix existing test expectations for updated footer format and isStructuredData behavior
1 parent fc006ee commit c5b9b2f

13 files changed

Lines changed: 1274 additions & 12 deletions

apps/sim/lib/chunkers/docs-chunker.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,11 +293,19 @@ export class DocsChunker {
293293
const lines = content.split('\n')
294294

295295
let inTable = false
296+
let inCodeBlock = false
296297
let tableStart = -1
297298

298299
for (let i = 0; i < lines.length; i++) {
299300
const line = lines[i].trim()
300301

302+
if (line.startsWith('```')) {
303+
inCodeBlock = !inCodeBlock
304+
continue
305+
}
306+
307+
if (inCodeBlock) continue
308+
301309
if (line.includes('|') && line.split('|').length >= 3 && !inTable) {
302310
const nextLine = lines[i + 1]?.trim()
303311
if (nextLine?.includes('|') && nextLine.includes('-')) {

apps/sim/lib/chunkers/json-yaml-chunker.test.ts

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,9 @@ describe('JsonYamlChunker', () => {
3030
expect(JsonYamlChunker.isStructuredData('key: value\nother: data')).toBe(true)
3131
})
3232

33-
it('should return true for YAML-like plain text', () => {
34-
// Note: js-yaml is permissive and parses plain text as valid YAML (scalar value)
35-
// This is expected behavior of the YAML parser
36-
expect(JsonYamlChunker.isStructuredData('Hello, this is plain text.')).toBe(true)
33+
it('should return false for plain text parsed as YAML scalar', () => {
34+
// js-yaml parses plain text as a scalar value, not an object/array
35+
expect(JsonYamlChunker.isStructuredData('Hello, this is plain text.')).toBe(false)
3736
})
3837

3938
it('should return false for invalid JSON/YAML with unbalanced braces', () => {

apps/sim/lib/chunkers/json-yaml-chunker.ts

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,15 +78,23 @@ export class JsonYamlChunker {
7878
}
7979

8080
const content = JSON.stringify(data, null, 2)
81+
const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : ''
82+
const contentTokens = estimateTokens(content)
83+
84+
if (contentTokens > this.chunkSize) {
85+
return this.chunkAsText(contextHeader + content)
86+
}
87+
8188
if (content.length < this.minCharactersPerChunk) {
8289
return []
8390
}
8491

92+
const text = contextHeader + content
8593
return [
8694
{
87-
text: content,
88-
tokenCount: estimateTokens(content),
89-
metadata: { startIndex: 0, endIndex: content.length },
95+
text,
96+
tokenCount: estimateTokens(text),
97+
metadata: { startIndex: 0, endIndex: text.length },
9098
},
9199
]
92100
}
Lines changed: 282 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,282 @@
1+
/**
2+
* @vitest-environment node
3+
*/
4+
5+
import { loggerMock } from '@sim/testing'
6+
import { describe, expect, it, vi } from 'vitest'
7+
import { RecursiveChunker } from './recursive-chunker'
8+
9+
vi.mock('@sim/logger', () => loggerMock)
10+
11+
describe('RecursiveChunker', () => {
12+
describe('empty and whitespace input', () => {
13+
it.concurrent('should return empty array for empty string', async () => {
14+
const chunker = new RecursiveChunker({ chunkSize: 100 })
15+
const chunks = await chunker.chunk('')
16+
expect(chunks).toEqual([])
17+
})
18+
19+
it.concurrent('should return empty array for whitespace-only input', async () => {
20+
const chunker = new RecursiveChunker({ chunkSize: 100 })
21+
const chunks = await chunker.chunk(' \n\n\t ')
22+
expect(chunks).toEqual([])
23+
})
24+
})
25+
26+
describe('small content', () => {
27+
it.concurrent('should return single chunk when content fits in one chunk', async () => {
28+
const chunker = new RecursiveChunker({ chunkSize: 100 })
29+
const text = 'This is a short text.'
30+
const chunks = await chunker.chunk(text)
31+
32+
expect(chunks).toHaveLength(1)
33+
expect(chunks[0].text).toBe(text)
34+
})
35+
})
36+
37+
describe('paragraph splitting', () => {
38+
it.concurrent('should split at paragraph boundaries first', async () => {
39+
const chunker = new RecursiveChunker({ chunkSize: 20 })
40+
const text =
41+
'First paragraph with enough content to matter.\n\nSecond paragraph with enough content to matter.\n\nThird paragraph with enough content here.'
42+
const chunks = await chunker.chunk(text)
43+
44+
expect(chunks.length).toBeGreaterThan(1)
45+
})
46+
})
47+
48+
describe('line splitting fallback', () => {
49+
it.concurrent('should split at newlines when paragraphs are too large', async () => {
50+
const chunker = new RecursiveChunker({ chunkSize: 15 })
51+
// Single paragraph (no \n\n) but has \n line breaks
52+
const text =
53+
'Line one with content here.\nLine two with content here.\nLine three with content here.\nLine four with content here.'
54+
const chunks = await chunker.chunk(text)
55+
56+
expect(chunks.length).toBeGreaterThan(1)
57+
})
58+
})
59+
60+
describe('sentence splitting fallback', () => {
61+
it.concurrent('should split at sentence boundaries when lines are too large', async () => {
62+
const chunker = new RecursiveChunker({ chunkSize: 10 })
63+
// Single line, no \n, but has ". " sentence boundaries
64+
const text =
65+
'First sentence here. Second sentence here. Third sentence here. Fourth sentence here.'
66+
const chunks = await chunker.chunk(text)
67+
68+
expect(chunks.length).toBeGreaterThan(1)
69+
})
70+
})
71+
72+
describe('word splitting fallback', () => {
73+
it.concurrent('should split at spaces when sentences are too large', async () => {
74+
const chunker = new RecursiveChunker({ chunkSize: 5 })
75+
// No paragraph, line, or sentence breaks - only spaces
76+
const text = 'word1 word2 word3 word4 word5 word6 word7 word8 word9 word10'
77+
const chunks = await chunker.chunk(text)
78+
79+
expect(chunks.length).toBeGreaterThan(1)
80+
})
81+
})
82+
83+
describe('keep_separator behavior', () => {
84+
it.concurrent('should prepend separator to subsequent chunks', async () => {
85+
const chunker = new RecursiveChunker({ chunkSize: 15 })
86+
const text =
87+
'First paragraph content here.\n\nSecond paragraph content here.\n\nThird paragraph content here.'
88+
const chunks = await chunker.chunk(text)
89+
90+
if (chunks.length > 1) {
91+
// The separator (\n\n) is prepended to parts after index 0, so subsequent
92+
// chunks should start with the separator used for splitting
93+
expect(chunks[1].text.startsWith('\n\n') || chunks[1].text.length > 0).toBe(true)
94+
}
95+
})
96+
})
97+
98+
describe('custom separators', () => {
99+
it.concurrent('should use custom separators instead of default recipe', async () => {
100+
const chunker = new RecursiveChunker({
101+
chunkSize: 15,
102+
separators: ['---', '\n'],
103+
})
104+
const text =
105+
'Section one content here with words.---Section two content here with words.---Section three content here.'
106+
const chunks = await chunker.chunk(text)
107+
108+
expect(chunks.length).toBeGreaterThan(1)
109+
})
110+
})
111+
112+
describe('recipe: plain', () => {
113+
it.concurrent('should use plain recipe by default', async () => {
114+
const chunker = new RecursiveChunker({ chunkSize: 20 })
115+
const text =
116+
'First paragraph with enough words to exceed the chunk size limit.\n\nSecond paragraph with enough words to exceed the chunk size limit.\n\nThird paragraph with enough words to exceed the chunk size limit.'
117+
const chunks = await chunker.chunk(text)
118+
119+
expect(chunks.length).toBeGreaterThan(1)
120+
})
121+
})
122+
123+
describe('recipe: markdown', () => {
124+
it.concurrent('should split at heading boundaries for markdown content', async () => {
125+
const chunker = new RecursiveChunker({ chunkSize: 20, recipe: 'markdown' })
126+
const text =
127+
'\n# Title\n\nParagraph content under the title goes here.\n\n## Subtitle\n\nMore text content under the subtitle goes here.'
128+
const chunks = await chunker.chunk(text)
129+
130+
expect(chunks.length).toBeGreaterThan(1)
131+
})
132+
133+
it.concurrent('should handle markdown horizontal rules', async () => {
134+
const chunker = new RecursiveChunker({ chunkSize: 20, recipe: 'markdown' })
135+
const text =
136+
'Section one content here.\n---\nSection two content here.\n---\nSection three content here.'
137+
const chunks = await chunker.chunk(text)
138+
139+
expect(chunks.length).toBeGreaterThan(0)
140+
})
141+
})
142+
143+
describe('recipe: code', () => {
144+
it.concurrent('should split on function and class boundaries', async () => {
145+
const chunker = new RecursiveChunker({ chunkSize: 20, recipe: 'code' })
146+
const text = [
147+
'const x = 1;',
148+
'function hello() {',
149+
' return "hello";',
150+
'}',
151+
'function world() {',
152+
' return "world";',
153+
'}',
154+
'class MyClass {',
155+
' constructor() {}',
156+
' method() { return true; }',
157+
'}',
158+
].join('\n')
159+
const chunks = await chunker.chunk(text)
160+
161+
expect(chunks.length).toBeGreaterThan(1)
162+
})
163+
})
164+
165+
describe('chunk size respected', () => {
166+
it.concurrent('should not exceed chunk size in tokens', async () => {
167+
const chunkSize = 30
168+
const chunker = new RecursiveChunker({ chunkSize })
169+
const text = 'This is a test sentence with content. '.repeat(30)
170+
const chunks = await chunker.chunk(text)
171+
172+
for (const chunk of chunks) {
173+
// Allow small tolerance for word boundary alignment
174+
expect(chunk.tokenCount).toBeLessThanOrEqual(chunkSize + 5)
175+
}
176+
})
177+
})
178+
179+
describe('overlap', () => {
180+
it.concurrent('should share text between consecutive chunks when overlap is set', async () => {
181+
const chunker = new RecursiveChunker({ chunkSize: 20, chunkOverlap: 5 })
182+
const text =
183+
'First paragraph with some content here.\n\nSecond paragraph with different content here.\n\nThird paragraph with more content here.'
184+
const chunks = await chunker.chunk(text)
185+
186+
if (chunks.length > 1) {
187+
// With overlap, second chunk should contain some text from the end of the first
188+
expect(chunks[1].text.length).toBeGreaterThan(0)
189+
}
190+
})
191+
192+
it.concurrent('should not add overlap when overlap is 0', async () => {
193+
const chunker = new RecursiveChunker({ chunkSize: 20, chunkOverlap: 0 })
194+
const text =
195+
'First sentence content here. Second sentence content here. Third sentence content here.'
196+
const chunks = await chunker.chunk(text)
197+
198+
if (chunks.length > 1) {
199+
const firstChunkEnd = chunks[0].text.slice(-10)
200+
expect(chunks[1].text.startsWith(firstChunkEnd)).toBe(false)
201+
}
202+
})
203+
})
204+
205+
describe('chunk metadata', () => {
206+
it.concurrent('should include text, tokenCount, and metadata fields', async () => {
207+
const chunker = new RecursiveChunker({ chunkSize: 100 })
208+
const text = 'This is test content for metadata.'
209+
const chunks = await chunker.chunk(text)
210+
211+
expect(chunks).toHaveLength(1)
212+
expect(chunks[0].text).toBe(text)
213+
expect(chunks[0].tokenCount).toBe(Math.ceil(text.length / 4))
214+
expect(chunks[0].metadata.startIndex).toBeDefined()
215+
expect(chunks[0].metadata.endIndex).toBeDefined()
216+
})
217+
218+
it.concurrent('should have startIndex of 0 for the first chunk', async () => {
219+
const chunker = new RecursiveChunker({ chunkSize: 100 })
220+
const text = 'Some content here.'
221+
const chunks = await chunker.chunk(text)
222+
223+
expect(chunks[0].metadata.startIndex).toBe(0)
224+
})
225+
226+
it.concurrent('should have non-negative indices for all chunks', async () => {
227+
const chunker = new RecursiveChunker({ chunkSize: 20, chunkOverlap: 5 })
228+
const text = 'First part. Second part. Third part. Fourth part. Fifth part.'
229+
const chunks = await chunker.chunk(text)
230+
231+
for (const chunk of chunks) {
232+
expect(chunk.metadata.startIndex).toBeGreaterThanOrEqual(0)
233+
expect(chunk.metadata.endIndex).toBeGreaterThanOrEqual(chunk.metadata.startIndex)
234+
}
235+
})
236+
237+
it.concurrent('should have endIndex greater than startIndex for non-empty chunks', async () => {
238+
const chunker = new RecursiveChunker({ chunkSize: 20 })
239+
const text = 'Multiple sentences here. Another one here. And another. And more content.'
240+
const chunks = await chunker.chunk(text)
241+
242+
for (const chunk of chunks) {
243+
expect(chunk.metadata.endIndex).toBeGreaterThan(chunk.metadata.startIndex)
244+
}
245+
})
246+
})
247+
248+
describe('edge cases', () => {
249+
it.concurrent('should handle very long text', async () => {
250+
const chunker = new RecursiveChunker({ chunkSize: 100 })
251+
const text = 'This is a sentence. '.repeat(1000)
252+
const chunks = await chunker.chunk(text)
253+
254+
expect(chunks.length).toBeGreaterThan(1)
255+
})
256+
257+
it.concurrent('should handle text with no natural separators', async () => {
258+
const chunker = new RecursiveChunker({ chunkSize: 5 })
259+
const text = 'abcdefghijklmnopqrstuvwxyz'.repeat(5)
260+
const chunks = await chunker.chunk(text)
261+
262+
expect(chunks.length).toBeGreaterThan(1)
263+
})
264+
265+
it.concurrent('should handle unicode text', async () => {
266+
const chunker = new RecursiveChunker({ chunkSize: 100 })
267+
const text = '这是中文测试。日本語テスト。한국어 테스트.'
268+
const chunks = await chunker.chunk(text)
269+
270+
expect(chunks.length).toBeGreaterThan(0)
271+
expect(chunks[0].text).toContain('中文')
272+
})
273+
274+
it.concurrent('should use default chunkSize of 1024 tokens', async () => {
275+
const chunker = new RecursiveChunker({})
276+
const text = 'Word '.repeat(400)
277+
const chunks = await chunker.chunk(text)
278+
279+
expect(chunks).toHaveLength(1)
280+
})
281+
})
282+
})

0 commit comments

Comments
 (0)