@@ -21,9 +21,6 @@ interface Frontmatter {
2121
2222const logger = createLogger ( 'DocsChunker' )
2323
24- /**
25- * Docs-specific chunker that processes .mdx files and tracks header context
26- */
2724export class DocsChunker {
2825 private readonly textChunker : TextChunker
2926 private readonly baseUrl : string
@@ -39,9 +36,6 @@ export class DocsChunker {
3936 this . baseUrl = options . baseUrl ?? 'https://docs.sim.ai'
4037 }
4138
42- /**
43- * Process all .mdx files in the docs directory
44- */
4539 async chunkAllDocs ( docsPath : string ) : Promise < DocChunk [ ] > {
4640 const allChunks : DocChunk [ ] = [ ]
4741
@@ -67,9 +61,6 @@ export class DocsChunker {
6761 }
6862 }
6963
70- /**
71- * Process a single .mdx file
72- */
7364 async chunkMdxFile ( filePath : string , basePath : string ) : Promise < DocChunk [ ] > {
7465 const content = await fs . readFile ( filePath , 'utf-8' )
7566 const relativePath = path . relative ( basePath , filePath )
@@ -120,9 +111,6 @@ export class DocsChunker {
120111 return chunks
121112 }
122113
123- /**
124- * Find all .mdx files recursively
125- */
126114 private async findMdxFiles ( dirPath : string ) : Promise < string [ ] > {
127115 const files : string [ ] = [ ]
128116
@@ -142,9 +130,6 @@ export class DocsChunker {
142130 return files
143131 }
144132
145- /**
146- * Extract headers and their positions from markdown content
147- */
148133 private extractHeaders ( content : string ) : HeaderInfo [ ] {
149134 const headers : HeaderInfo [ ] = [ ]
150135 const headerRegex = / ^ ( # { 1 , 6 } ) \s + ( .+ ) $ / gm
@@ -166,9 +151,6 @@ export class DocsChunker {
166151 return headers
167152 }
168153
169- /**
170- * Generate URL-safe anchor from header text
171- */
172154 private generateAnchor ( headerText : string ) : string {
173155 return headerText
174156 . toLowerCase ( )
@@ -178,10 +160,7 @@ export class DocsChunker {
178160 . replace ( / ^ - | - $ / g, '' )
179161 }
180162
181- /**
182- * Generate document URL from relative path
183- * Handles index.mdx files specially - they are served at the parent directory path
184- */
163+ /** index.mdx files are served at the parent directory path */
185164 private generateDocumentUrl ( relativePath : string ) : string {
186165 let urlPath = relativePath . replace ( / \. m d x $ / , '' ) . replace ( / \\ / g, '/' )
187166
@@ -194,9 +173,6 @@ export class DocsChunker {
194173 return `${ this . baseUrl } /${ urlPath } `
195174 }
196175
197- /**
198- * Find the most relevant header for a given position
199- */
200176 private findRelevantHeader ( headers : HeaderInfo [ ] , position : number ) : HeaderInfo | null {
201177 if ( headers . length === 0 ) return null
202178
@@ -213,11 +189,7 @@ export class DocsChunker {
213189 return relevantHeader
214190 }
215191
216- /**
217- * Split content into chunks using the existing TextChunker with table awareness.
218- * Returns both the chunks and the cleaned content so header extraction
219- * operates on the same text that was chunked (aligned positions).
220- */
192+ /** Returns both chunks and cleaned content so header extraction uses aligned positions. */
221193 private async splitContent (
222194 content : string
223195 ) : Promise < { chunks : string [ ] ; cleanedContent : string } > {
@@ -238,9 +210,6 @@ export class DocsChunker {
238210 return { chunks : finalChunks , cleanedContent }
239211 }
240212
241- /**
242- * Clean content by removing MDX-specific elements and excessive whitespace
243- */
244213 private cleanContent ( content : string ) : string {
245214 return content
246215 . replace ( / \r \n / g, '\n' )
@@ -255,9 +224,6 @@ export class DocsChunker {
255224 . trim ( )
256225 }
257226
258- /**
259- * Parse frontmatter from MDX content
260- */
261227 private parseFrontmatter ( content : string ) : { data : Frontmatter ; content : string } {
262228 const frontmatterRegex = / ^ - - - \r ? \n ( [ \s \S ] * ?) \r ? \n - - - \r ? \n ( [ \s \S ] * ) $ /
263229 const match = content . match ( frontmatterRegex )
@@ -285,9 +251,7 @@ export class DocsChunker {
285251 return { data, content : markdownContent }
286252 }
287253
288- /**
289- * Detect table boundaries in markdown content to avoid splitting them
290- */
254+ /** Detects table boundaries to avoid splitting tables across chunks. */
291255 private detectTableBoundaries ( content : string ) : { start : number ; end : number } [ ] {
292256 const tables : { start : number ; end : number } [ ] = [ ]
293257 const lines = content . split ( '\n' )
@@ -331,16 +295,10 @@ export class DocsChunker {
331295 return tables
332296 }
333297
334- /**
335- * Get character position from line number
336- */
/**
 * Sums `line.length + 1` over the first `lineIndex` entries of `lines`, i.e. the
 * offset at which entry `lineIndex` would start if the entries were joined with a
 * single-character separator.
 *
 * NOTE(review): the `+ 1` presumably accounts for the '\n' removed when the content
 * was split into lines — confirm against the caller. Lengths are JavaScript string
 * lengths (UTF-16 code units).
 *
 * @param lines - Line strings to measure.
 * @param lineIndex - Number of leading entries to include; 0 yields 0.
 * @returns The accumulated offset.
 */
337298 private getCharacterPosition ( lines : string [ ] , lineIndex : number ) : number {
338299 return lines . slice ( 0 , lineIndex ) . reduce ( ( acc , line ) => acc + line . length + 1 , 0 )
339300 }
340301
341- /**
342- * Merge chunks that would split tables
343- */
344302 private mergeTableChunks (
345303 chunks : string [ ] ,
346304 tableBoundaries : { start : number ; end : number } [ ] ,
@@ -393,9 +351,6 @@ export class DocsChunker {
393351 return mergedChunks . filter ( ( chunk ) => chunk . length > 50 )
394352 }
395353
396- /**
397- * Enforce token size limit on chunks, using the configured chunkSize
398- */
399354 private enforceSizeLimit ( chunks : string [ ] ) : string [ ] {
400355 const finalChunks : string [ ] = [ ]
401356
0 commit comments