Skip to content

Commit cb814ff

Browse files
committed
chore(chunkers): remove unnecessary comments and dead code
Strip 445 lines of redundant TSDoc, math calculation comments, implementation rationale notes, and assertion-restating comments across all chunker source and test files.
1 parent c5b9b2f commit cb814ff

23 files changed

Lines changed: 20 additions & 425 deletions

apps/sim/app/api/knowledge/route.ts

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,6 @@ import { captureServerEvent } from '@/lib/posthog/server'
1515

1616
const logger = createLogger('KnowledgeBaseAPI')
1717

18-
/**
19-
* Schema for creating a knowledge base
20-
*
21-
* Chunking config units:
22-
* - maxSize: tokens (1 token ≈ 4 characters)
23-
* - minSize: characters
24-
* - overlap: tokens (1 token ≈ 4 characters)
25-
*/
2618
const CreateKnowledgeBaseSchema = z.object({
2719
name: z.string().min(1, 'Name is required'),
2820
description: z.string().optional(),
@@ -31,25 +23,17 @@ const CreateKnowledgeBaseSchema = z.object({
3123
embeddingDimension: z.literal(1536).default(1536),
3224
chunkingConfig: z
3325
.object({
34-
/** Maximum chunk size in tokens (1 token ≈ 4 characters) */
3526
maxSize: z.number().min(100).max(4000).default(1024),
36-
/** Minimum chunk size in characters */
3727
minSize: z.number().min(1).max(2000).default(100),
38-
/** Overlap between chunks in tokens (1 token ≈ 4 characters) */
3928
overlap: z.number().min(0).max(500).default(200),
40-
/** Chunking strategy */
4129
strategy: z
4230
.enum(['auto', 'text', 'regex', 'recursive', 'sentence', 'token'])
4331
.default('auto')
4432
.optional(),
45-
/** Strategy-specific options */
4633
strategyOptions: z
4734
.object({
48-
/** Regex pattern for 'regex' strategy (max 500 chars) */
4935
pattern: z.string().max(500).optional(),
50-
/** Custom separator hierarchy for 'recursive' strategy */
5136
separators: z.array(z.string()).optional(),
52-
/** Pre-built separator recipe for 'recursive' strategy */
5337
recipe: z.enum(['plain', 'markdown', 'code']).optional(),
5438
})
5539
.optional(),

apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -60,26 +60,20 @@ const FormSchema = z
6060
.max(100, 'Name must be less than 100 characters')
6161
.refine((value) => value.trim().length > 0, 'Name cannot be empty'),
6262
description: z.string().max(500, 'Description must be less than 500 characters').optional(),
63-
/** Minimum chunk size in characters */
6463
minChunkSize: z
6564
.number()
6665
.min(1, 'Min chunk size must be at least 1 character')
6766
.max(2000, 'Min chunk size must be less than 2000 characters'),
68-
/** Maximum chunk size in tokens (1 token ≈ 4 characters) */
6967
maxChunkSize: z
7068
.number()
7169
.min(100, 'Max chunk size must be at least 100 tokens')
7270
.max(4000, 'Max chunk size must be less than 4000 tokens'),
73-
/** Overlap between chunks in tokens */
7471
overlapSize: z
7572
.number()
7673
.min(0, 'Overlap must be non-negative')
7774
.max(500, 'Overlap must be less than 500 tokens'),
78-
/** Chunking strategy */
7975
strategy: z.enum(['auto', 'text', 'regex', 'recursive', 'sentence', 'token']).default('auto'),
80-
/** Regex pattern (required when strategy is 'regex') */
8176
regexPattern: z.string().optional(),
82-
/** Custom separators for recursive strategy (comma-separated) */
8377
customSeparators: z.string().optional(),
8478
})
8579
.refine(
@@ -376,7 +370,6 @@ export const CreateBaseModal = memo(function CreateBaseModal({
376370
<div className='space-y-3'>
377371
<div className='flex flex-col gap-2'>
378372
<Label htmlFor='kb-name'>Name</Label>
379-
{/* Hidden decoy fields to prevent browser autofill */}
380373
<input
381374
type='text'
382375
name='fakeusernameremembered'

apps/sim/hooks/queries/kb/knowledge.ts

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -339,10 +339,7 @@ export interface DocumentChunkSearchParams {
339339
search: string
340340
}
341341

342-
/**
343-
* Fetches all chunks matching a search query by paginating through results.
344-
* This is used for search functionality where we need all matching chunks.
345-
*/
342+
/** Paginates through all matching chunks rather than returning a single page. */
346343
export async function fetchAllDocumentChunks(
347344
{ knowledgeBaseId, documentId, search }: DocumentChunkSearchParams,
348345
signal?: AbortSignal
@@ -377,10 +374,6 @@ export const serializeSearchParams = (params: DocumentChunkSearchParams) =>
377374
search: params.search,
378375
})
379376

380-
/**
381-
* Hook to search for chunks in a document.
382-
* Fetches all matching chunks and returns them for client-side pagination.
383-
*/
384377
export function useDocumentChunkSearchQuery(
385378
params: DocumentChunkSearchParams,
386379
options?: {

apps/sim/lib/chunkers/docs-chunker.ts

Lines changed: 3 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,6 @@ interface Frontmatter {
2121

2222
const logger = createLogger('DocsChunker')
2323

24-
/**
25-
* Docs-specific chunker that processes .mdx files and tracks header context
26-
*/
2724
export class DocsChunker {
2825
private readonly textChunker: TextChunker
2926
private readonly baseUrl: string
@@ -39,9 +36,6 @@ export class DocsChunker {
3936
this.baseUrl = options.baseUrl ?? 'https://docs.sim.ai'
4037
}
4138

42-
/**
43-
* Process all .mdx files in the docs directory
44-
*/
4539
async chunkAllDocs(docsPath: string): Promise<DocChunk[]> {
4640
const allChunks: DocChunk[] = []
4741

@@ -67,9 +61,6 @@ export class DocsChunker {
6761
}
6862
}
6963

70-
/**
71-
* Process a single .mdx file
72-
*/
7364
async chunkMdxFile(filePath: string, basePath: string): Promise<DocChunk[]> {
7465
const content = await fs.readFile(filePath, 'utf-8')
7566
const relativePath = path.relative(basePath, filePath)
@@ -120,9 +111,6 @@ export class DocsChunker {
120111
return chunks
121112
}
122113

123-
/**
124-
* Find all .mdx files recursively
125-
*/
126114
private async findMdxFiles(dirPath: string): Promise<string[]> {
127115
const files: string[] = []
128116

@@ -142,9 +130,6 @@ export class DocsChunker {
142130
return files
143131
}
144132

145-
/**
146-
* Extract headers and their positions from markdown content
147-
*/
148133
private extractHeaders(content: string): HeaderInfo[] {
149134
const headers: HeaderInfo[] = []
150135
const headerRegex = /^(#{1,6})\s+(.+)$/gm
@@ -166,9 +151,6 @@ export class DocsChunker {
166151
return headers
167152
}
168153

169-
/**
170-
* Generate URL-safe anchor from header text
171-
*/
172154
private generateAnchor(headerText: string): string {
173155
return headerText
174156
.toLowerCase()
@@ -178,10 +160,7 @@ export class DocsChunker {
178160
.replace(/^-|-$/g, '')
179161
}
180162

181-
/**
182-
* Generate document URL from relative path
183-
* Handles index.mdx files specially - they are served at the parent directory path
184-
*/
163+
/** index.mdx files are served at the parent directory path */
185164
private generateDocumentUrl(relativePath: string): string {
186165
let urlPath = relativePath.replace(/\.mdx$/, '').replace(/\\/g, '/')
187166

@@ -194,9 +173,6 @@ export class DocsChunker {
194173
return `${this.baseUrl}/${urlPath}`
195174
}
196175

197-
/**
198-
* Find the most relevant header for a given position
199-
*/
200176
private findRelevantHeader(headers: HeaderInfo[], position: number): HeaderInfo | null {
201177
if (headers.length === 0) return null
202178

@@ -213,11 +189,7 @@ export class DocsChunker {
213189
return relevantHeader
214190
}
215191

216-
/**
217-
* Split content into chunks using the existing TextChunker with table awareness.
218-
* Returns both the chunks and the cleaned content so header extraction
219-
* operates on the same text that was chunked (aligned positions).
220-
*/
192+
/** Returns both chunks and cleaned content so header extraction uses aligned positions. */
221193
private async splitContent(
222194
content: string
223195
): Promise<{ chunks: string[]; cleanedContent: string }> {
@@ -238,9 +210,6 @@ export class DocsChunker {
238210
return { chunks: finalChunks, cleanedContent }
239211
}
240212

241-
/**
242-
* Clean content by removing MDX-specific elements and excessive whitespace
243-
*/
244213
private cleanContent(content: string): string {
245214
return content
246215
.replace(/\r\n/g, '\n')
@@ -255,9 +224,6 @@ export class DocsChunker {
255224
.trim()
256225
}
257226

258-
/**
259-
* Parse frontmatter from MDX content
260-
*/
261227
private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
262228
const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
263229
const match = content.match(frontmatterRegex)
@@ -285,9 +251,7 @@ export class DocsChunker {
285251
return { data, content: markdownContent }
286252
}
287253

288-
/**
289-
* Detect table boundaries in markdown content to avoid splitting them
290-
*/
254+
/** Detects table boundaries to avoid splitting tables across chunks. */
291255
private detectTableBoundaries(content: string): { start: number; end: number }[] {
292256
const tables: { start: number; end: number }[] = []
293257
const lines = content.split('\n')
@@ -331,16 +295,10 @@ export class DocsChunker {
331295
return tables
332296
}
333297

334-
/**
335-
* Get character position from line number
336-
*/
337298
private getCharacterPosition(lines: string[], lineIndex: number): number {
338299
return lines.slice(0, lineIndex).reduce((acc, line) => acc + line.length + 1, 0)
339300
}
340301

341-
/**
342-
* Merge chunks that would split tables
343-
*/
344302
private mergeTableChunks(
345303
chunks: string[],
346304
tableBoundaries: { start: number; end: number }[],
@@ -393,9 +351,6 @@ export class DocsChunker {
393351
return mergedChunks.filter((chunk) => chunk.length > 50)
394352
}
395353

396-
/**
397-
* Enforce token size limit on chunks, using the configured chunkSize
398-
*/
399354
private enforceSizeLimit(chunks: string[]): string[] {
400355
const finalChunks: string[] = []
401356

apps/sim/lib/chunkers/json-yaml-chunker.test.ts

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,10 @@ describe('JsonYamlChunker', () => {
3131
})
3232

3333
it('should return false for plain text parsed as YAML scalar', () => {
34-
// js-yaml parses plain text as a scalar value, not an object/array
3534
expect(JsonYamlChunker.isStructuredData('Hello, this is plain text.')).toBe(false)
3635
})
3736

3837
it('should return false for invalid JSON/YAML with unbalanced braces', () => {
39-
// Only truly malformed content that fails YAML parsing returns false
4038
expect(JsonYamlChunker.isStructuredData('{invalid: json: content: {{')).toBe(false)
4139
})
4240

@@ -60,7 +58,6 @@ describe('JsonYamlChunker', () => {
6058
const json = '{}'
6159
const chunks = await chunker.chunk(json)
6260

63-
// Empty object is valid JSON, should return at least metadata
6461
expect(chunks.length).toBeGreaterThanOrEqual(0)
6562
})
6663

@@ -203,7 +200,6 @@ server:
203200
const json = '[]'
204201
const chunks = await chunker.chunk(json)
205202

206-
// Empty array should not produce chunks with meaningful content
207203
expect(chunks.length).toBeGreaterThanOrEqual(0)
208204
})
209205

@@ -271,7 +267,6 @@ server:
271267

272268
it.concurrent('should fall back to text chunking for invalid JSON', async () => {
273269
const chunker = new JsonYamlChunker({ chunkSize: 100, minCharactersPerChunk: 10 })
274-
// Create content that fails YAML parsing and is long enough to produce chunks
275270
const invalidJson = `{this is not valid json: content: {{${' more content here '.repeat(10)}`
276271
const chunks = await chunker.chunk(invalidJson)
277272

@@ -376,9 +371,7 @@ server:
376371
const json = JSON.stringify({ a: 1, b: 2, c: 3 })
377372
const chunks = await chunker.chunk(json)
378373

379-
// Should produce chunks that are valid
380374
expect(chunks.length).toBeGreaterThan(0)
381-
// The entire small object fits in one chunk
382375
expect(chunks[0].text.length).toBeGreaterThan(0)
383376
})
384377
})

apps/sim/lib/chunkers/json-yaml-chunker.ts

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,6 @@ type JsonArray = JsonValue[]
1212

1313
const MAX_DEPTH = 5
1414

15-
/**
16-
* Structure-aware chunker for JSON and YAML content
17-
* Recursively decomposes objects and arrays while preserving structure
18-
*/
1915
export class JsonYamlChunker {
2016
private chunkSize: number
2117
private minCharactersPerChunk: number
@@ -25,9 +21,6 @@ export class JsonYamlChunker {
2521
this.minCharactersPerChunk = options.minCharactersPerChunk ?? 100
2622
}
2723

28-
/**
29-
* Check if content is structured JSON/YAML data (object or array, not a primitive)
30-
*/
3124
static isStructuredData(content: string): boolean {
3225
try {
3326
const parsed = JSON.parse(content)
@@ -42,9 +35,6 @@ export class JsonYamlChunker {
4235
}
4336
}
4437

45-
/**
46-
* Chunk JSON/YAML content intelligently based on structure
47-
*/
4838
async chunk(content: string): Promise<Chunk[]> {
4939
try {
5040
let data: JsonValue
@@ -65,9 +55,6 @@ export class JsonYamlChunker {
6555
}
6656
}
6757

68-
/**
69-
* Chunk structured data based on its structure
70-
*/
7158
private chunkStructuredData(data: JsonValue, path: string[], depth: number): Chunk[] {
7259
if (Array.isArray(data)) {
7360
return this.chunkArray(data, path, depth)
@@ -99,9 +86,6 @@ export class JsonYamlChunker {
9986
]
10087
}
10188

102-
/**
103-
* Chunk an array by batching items until the token budget is reached
104-
*/
10589
private chunkArray(arr: JsonArray, path: string[], depth: number): Chunk[] {
10690
const chunks: Chunk[] = []
10791
let currentBatch: JsonValue[] = []
@@ -158,9 +142,6 @@ export class JsonYamlChunker {
158142
return chunks
159143
}
160144

161-
/**
162-
* Chunk an object by grouping key-value pairs until the token budget is reached
163-
*/
164145
private chunkObject(obj: JsonObject, path: string[], depth: number): Chunk[] {
165146
const chunks: Chunk[] = []
166147
const entries = Object.entries(obj)
@@ -239,9 +220,6 @@ export class JsonYamlChunker {
239220
return chunks
240221
}
241222

242-
/**
243-
* Build a chunk from a batch of array items
244-
*/
245223
private buildBatchChunk(
246224
contextHeader: string,
247225
batch: JsonValue[],
@@ -256,9 +234,6 @@ export class JsonYamlChunker {
256234
}
257235
}
258236

259-
/**
260-
* Fall back to text chunking if JSON parsing fails
261-
*/
262237
private chunkAsText(content: string): Chunk[] {
263238
const chunks: Chunk[] = []
264239
const lines = content.split('\n')
@@ -296,9 +271,6 @@ export class JsonYamlChunker {
296271
return chunks
297272
}
298273

299-
/**
300-
* Static method for chunking JSON/YAML data with default options
301-
*/
302274
static async chunkJsonYaml(content: string, options: ChunkerOptions = {}): Promise<Chunk[]> {
303275
const chunker = new JsonYamlChunker(options)
304276
return chunker.chunk(content)

0 commit comments

Comments (0)