/**
 * @vitest-environment node
 *
 * Unit tests for RecursiveChunker.
 *
 * Covers: empty/whitespace input, the recursive separator fallback chain
 * (paragraph -> line -> sentence -> word -> raw characters), custom
 * separators, the built-in recipes (plain, markdown, code), chunk-size and
 * overlap handling, per-chunk metadata (text, tokenCount, start/end
 * indices), and assorted edge cases (long text, unicode, defaults).
 */

import { loggerMock } from '@sim/testing'
import { describe, expect, it, vi } from 'vitest'
import { RecursiveChunker } from './recursive-chunker'

vi.mock('@sim/logger', () => loggerMock)

describe('RecursiveChunker', () => {
  describe('empty and whitespace input', () => {
    it.concurrent('should return empty array for empty string', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 100 })
      const chunks = await chunker.chunk('')
      expect(chunks).toEqual([])
    })

    it.concurrent('should return empty array for whitespace-only input', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 100 })
      const chunks = await chunker.chunk('   \n\n\t  ')
      expect(chunks).toEqual([])
    })
  })

  describe('small content', () => {
    it.concurrent('should return single chunk when content fits in one chunk', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 100 })
      const text = 'This is a short text.'
      const chunks = await chunker.chunk(text)

      expect(chunks).toHaveLength(1)
      expect(chunks[0].text).toBe(text)
    })
  })

  describe('paragraph splitting', () => {
    it.concurrent('should split at paragraph boundaries first', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 20 })
      const text =
        'First paragraph with enough content to matter.\n\nSecond paragraph with enough content to matter.\n\nThird paragraph with enough content here.'
      const chunks = await chunker.chunk(text)

      expect(chunks.length).toBeGreaterThan(1)
    })
  })

  describe('line splitting fallback', () => {
    it.concurrent('should split at newlines when paragraphs are too large', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 15 })
      // Single paragraph (no \n\n) but has \n line breaks
      const text =
        'Line one with content here.\nLine two with content here.\nLine three with content here.\nLine four with content here.'
      const chunks = await chunker.chunk(text)

      expect(chunks.length).toBeGreaterThan(1)
    })
  })

  describe('sentence splitting fallback', () => {
    it.concurrent('should split at sentence boundaries when lines are too large', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 10 })
      // Single line, no \n, but has ". " sentence boundaries
      const text =
        'First sentence here. Second sentence here. Third sentence here. Fourth sentence here.'
      const chunks = await chunker.chunk(text)

      expect(chunks.length).toBeGreaterThan(1)
    })
  })

  describe('word splitting fallback', () => {
    it.concurrent('should split at spaces when sentences are too large', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 5 })
      // No paragraph, line, or sentence breaks - only spaces
      const text = 'word1 word2 word3 word4 word5 word6 word7 word8 word9 word10'
      const chunks = await chunker.chunk(text)

      expect(chunks.length).toBeGreaterThan(1)
    })
  })

  describe('keep_separator behavior', () => {
    it.concurrent('should prepend separator to subsequent chunks', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 15 })
      const text =
        'First paragraph content here.\n\nSecond paragraph content here.\n\nThird paragraph content here.'
      const chunks = await chunker.chunk(text)

      if (chunks.length > 1) {
        // The separator (\n\n) is prepended to parts after index 0.
        // NOTE(review): the previous assertion `startsWith('\n\n') || length > 0`
        // reduced to a plain non-empty check (an empty string never starts
        // with '\n\n'), so the separator clause was dead code. Keep the
        // equivalent meaningful assertion; pin the exact '\n\n' prefix here
        // once the trimming behavior of chunk text is confirmed.
        expect(chunks[1].text.length).toBeGreaterThan(0)
      }
    })
  })

  describe('custom separators', () => {
    it.concurrent('should use custom separators instead of default recipe', async () => {
      const chunker = new RecursiveChunker({
        chunkSize: 15,
        separators: ['---', '\n'],
      })
      const text =
        'Section one content here with words.---Section two content here with words.---Section three content here.'
      const chunks = await chunker.chunk(text)

      expect(chunks.length).toBeGreaterThan(1)
    })
  })

  describe('recipe: plain', () => {
    it.concurrent('should use plain recipe by default', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 20 })
      const text =
        'First paragraph with enough words to exceed the chunk size limit.\n\nSecond paragraph with enough words to exceed the chunk size limit.\n\nThird paragraph with enough words to exceed the chunk size limit.'
      const chunks = await chunker.chunk(text)

      expect(chunks.length).toBeGreaterThan(1)
    })
  })

  describe('recipe: markdown', () => {
    it.concurrent('should split at heading boundaries for markdown content', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 20, recipe: 'markdown' })
      const text =
        '\n# Title\n\nParagraph content under the title goes here.\n\n## Subtitle\n\nMore text content under the subtitle goes here.'
      const chunks = await chunker.chunk(text)

      expect(chunks.length).toBeGreaterThan(1)
    })

    it.concurrent('should handle markdown horizontal rules', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 20, recipe: 'markdown' })
      const text =
        'Section one content here.\n---\nSection two content here.\n---\nSection three content here.'
      const chunks = await chunker.chunk(text)

      expect(chunks.length).toBeGreaterThan(0)
    })
  })

  describe('recipe: code', () => {
    it.concurrent('should split on function and class boundaries', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 20, recipe: 'code' })
      const text = [
        'const x = 1;',
        'function hello() {',
        '  return "hello";',
        '}',
        'function world() {',
        '  return "world";',
        '}',
        'class MyClass {',
        '  constructor() {}',
        '  method() { return true; }',
        '}',
      ].join('\n')
      const chunks = await chunker.chunk(text)

      expect(chunks.length).toBeGreaterThan(1)
    })
  })

  describe('chunk size respected', () => {
    it.concurrent('should not exceed chunk size in tokens', async () => {
      const chunkSize = 30
      const chunker = new RecursiveChunker({ chunkSize })
      const text = 'This is a test sentence with content. '.repeat(30)
      const chunks = await chunker.chunk(text)

      for (const chunk of chunks) {
        // Allow small tolerance for word boundary alignment
        expect(chunk.tokenCount).toBeLessThanOrEqual(chunkSize + 5)
      }
    })
  })

  describe('overlap', () => {
    it.concurrent('should share text between consecutive chunks when overlap is set', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 20, chunkOverlap: 5 })
      const text =
        'First paragraph with some content here.\n\nSecond paragraph with different content here.\n\nThird paragraph with more content here.'
      const chunks = await chunker.chunk(text)

      if (chunks.length > 1) {
        // With overlap, second chunk should contain some text from the end of the first
        expect(chunks[1].text.length).toBeGreaterThan(0)
      }
    })

    it.concurrent('should not add overlap when overlap is 0', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 20, chunkOverlap: 0 })
      const text =
        'First sentence content here. Second sentence content here. Third sentence content here.'
      const chunks = await chunker.chunk(text)

      if (chunks.length > 1) {
        const firstChunkEnd = chunks[0].text.slice(-10)
        expect(chunks[1].text.startsWith(firstChunkEnd)).toBe(false)
      }
    })
  })

  describe('chunk metadata', () => {
    it.concurrent('should include text, tokenCount, and metadata fields', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 100 })
      const text = 'This is test content for metadata.'
      const chunks = await chunker.chunk(text)

      expect(chunks).toHaveLength(1)
      expect(chunks[0].text).toBe(text)
      // Token estimate is chars/4, rounded up
      expect(chunks[0].tokenCount).toBe(Math.ceil(text.length / 4))
      expect(chunks[0].metadata.startIndex).toBeDefined()
      expect(chunks[0].metadata.endIndex).toBeDefined()
    })

    it.concurrent('should have startIndex of 0 for the first chunk', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 100 })
      const text = 'Some content here.'
      const chunks = await chunker.chunk(text)

      expect(chunks[0].metadata.startIndex).toBe(0)
    })

    it.concurrent('should have non-negative indices for all chunks', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 20, chunkOverlap: 5 })
      const text = 'First part. Second part. Third part. Fourth part. Fifth part.'
      const chunks = await chunker.chunk(text)

      for (const chunk of chunks) {
        expect(chunk.metadata.startIndex).toBeGreaterThanOrEqual(0)
        expect(chunk.metadata.endIndex).toBeGreaterThanOrEqual(chunk.metadata.startIndex)
      }
    })

    it.concurrent('should have endIndex greater than startIndex for non-empty chunks', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 20 })
      const text = 'Multiple sentences here. Another one here. And another. And more content.'
      const chunks = await chunker.chunk(text)

      for (const chunk of chunks) {
        expect(chunk.metadata.endIndex).toBeGreaterThan(chunk.metadata.startIndex)
      }
    })
  })

  describe('edge cases', () => {
    it.concurrent('should handle very long text', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 100 })
      const text = 'This is a sentence. '.repeat(1000)
      const chunks = await chunker.chunk(text)

      expect(chunks.length).toBeGreaterThan(1)
    })

    it.concurrent('should handle text with no natural separators', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 5 })
      const text = 'abcdefghijklmnopqrstuvwxyz'.repeat(5)
      const chunks = await chunker.chunk(text)

      expect(chunks.length).toBeGreaterThan(1)
    })

    it.concurrent('should handle unicode text', async () => {
      const chunker = new RecursiveChunker({ chunkSize: 100 })
      const text = '这是中文测试。日本語テスト。한국어 테스트.'
      const chunks = await chunker.chunk(text)

      expect(chunks.length).toBeGreaterThan(0)
      expect(chunks[0].text).toContain('中文')
    })

    it.concurrent('should use default chunkSize of 1024 tokens', async () => {
      const chunker = new RecursiveChunker({})
      // 400 * 5 chars = 2000 chars ~= 500 tokens, under the 1024 default
      const text = 'Word '.repeat(400)
      const chunks = await chunker.chunk(text)

      expect(chunks).toHaveLength(1)
    })
  })
})