- {/* Hidden decoy fields to prevent browser autofill */}
search: params.search,
})
-/**
- * Hook to search for chunks in a document.
- * Fetches all matching chunks and returns them for client-side pagination.
- */
export function useDocumentChunkSearchQuery(
params: DocumentChunkSearchParams,
options?: {
diff --git a/apps/sim/lib/chunkers/docs-chunker.ts b/apps/sim/lib/chunkers/docs-chunker.ts
index 6988be6e222..ddfecc3ab19 100644
--- a/apps/sim/lib/chunkers/docs-chunker.ts
+++ b/apps/sim/lib/chunkers/docs-chunker.ts
@@ -21,9 +21,6 @@ interface Frontmatter {
const logger = createLogger('DocsChunker')
-/**
- * Docs-specific chunker that processes .mdx files and tracks header context
- */
export class DocsChunker {
private readonly textChunker: TextChunker
private readonly baseUrl: string
@@ -39,9 +36,6 @@ export class DocsChunker {
this.baseUrl = options.baseUrl ?? 'https://docs.sim.ai'
}
- /**
- * Process all .mdx files in the docs directory
- */
async chunkAllDocs(docsPath: string): Promise {
const allChunks: DocChunk[] = []
@@ -67,9 +61,6 @@ export class DocsChunker {
}
}
- /**
- * Process a single .mdx file
- */
async chunkMdxFile(filePath: string, basePath: string): Promise {
const content = await fs.readFile(filePath, 'utf-8')
const relativePath = path.relative(basePath, filePath)
@@ -120,9 +111,6 @@ export class DocsChunker {
return chunks
}
- /**
- * Find all .mdx files recursively
- */
private async findMdxFiles(dirPath: string): Promise {
const files: string[] = []
@@ -142,9 +130,6 @@ export class DocsChunker {
return files
}
- /**
- * Extract headers and their positions from markdown content
- */
private extractHeaders(content: string): HeaderInfo[] {
const headers: HeaderInfo[] = []
const headerRegex = /^(#{1,6})\s+(.+)$/gm
@@ -166,9 +151,6 @@ export class DocsChunker {
return headers
}
- /**
- * Generate URL-safe anchor from header text
- */
private generateAnchor(headerText: string): string {
return headerText
.toLowerCase()
@@ -178,10 +160,7 @@ export class DocsChunker {
.replace(/^-|-$/g, '')
}
- /**
- * Generate document URL from relative path
- * Handles index.mdx files specially - they are served at the parent directory path
- */
+ /** index.mdx files are served at the parent directory path */
private generateDocumentUrl(relativePath: string): string {
let urlPath = relativePath.replace(/\.mdx$/, '').replace(/\\/g, '/')
@@ -194,9 +173,6 @@ export class DocsChunker {
return `${this.baseUrl}/${urlPath}`
}
- /**
- * Find the most relevant header for a given position
- */
private findRelevantHeader(headers: HeaderInfo[], position: number): HeaderInfo | null {
if (headers.length === 0) return null
@@ -213,11 +189,7 @@ export class DocsChunker {
return relevantHeader
}
- /**
- * Split content into chunks using the existing TextChunker with table awareness.
- * Returns both the chunks and the cleaned content so header extraction
- * operates on the same text that was chunked (aligned positions).
- */
+ /** Returns both chunks and cleaned content so header extraction uses aligned positions. */
private async splitContent(
content: string
): Promise<{ chunks: string[]; cleanedContent: string }> {
@@ -238,9 +210,6 @@ export class DocsChunker {
return { chunks: finalChunks, cleanedContent }
}
- /**
- * Clean content by removing MDX-specific elements and excessive whitespace
- */
private cleanContent(content: string): string {
return content
.replace(/\r\n/g, '\n')
@@ -255,9 +224,6 @@ export class DocsChunker {
.trim()
}
- /**
- * Parse frontmatter from MDX content
- */
private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
const match = content.match(frontmatterRegex)
@@ -285,9 +251,7 @@ export class DocsChunker {
return { data, content: markdownContent }
}
- /**
- * Detect table boundaries in markdown content to avoid splitting them
- */
+ /** Detects table boundaries to avoid splitting tables across chunks. */
private detectTableBoundaries(content: string): { start: number; end: number }[] {
const tables: { start: number; end: number }[] = []
const lines = content.split('\n')
@@ -331,16 +295,10 @@ export class DocsChunker {
return tables
}
- /**
- * Get character position from line number
- */
private getCharacterPosition(lines: string[], lineIndex: number): number {
return lines.slice(0, lineIndex).reduce((acc, line) => acc + line.length + 1, 0)
}
- /**
- * Merge chunks that would split tables
- */
private mergeTableChunks(
chunks: string[],
tableBoundaries: { start: number; end: number }[],
@@ -393,9 +351,6 @@ export class DocsChunker {
return mergedChunks.filter((chunk) => chunk.length > 50)
}
- /**
- * Enforce token size limit on chunks, using the configured chunkSize
- */
private enforceSizeLimit(chunks: string[]): string[] {
const finalChunks: string[] = []
diff --git a/apps/sim/lib/chunkers/json-yaml-chunker.test.ts b/apps/sim/lib/chunkers/json-yaml-chunker.test.ts
index 7e690cde9a7..251b50daeaa 100644
--- a/apps/sim/lib/chunkers/json-yaml-chunker.test.ts
+++ b/apps/sim/lib/chunkers/json-yaml-chunker.test.ts
@@ -31,12 +31,10 @@ describe('JsonYamlChunker', () => {
})
it('should return false for plain text parsed as YAML scalar', () => {
- // js-yaml parses plain text as a scalar value, not an object/array
expect(JsonYamlChunker.isStructuredData('Hello, this is plain text.')).toBe(false)
})
it('should return false for invalid JSON/YAML with unbalanced braces', () => {
- // Only truly malformed content that fails YAML parsing returns false
expect(JsonYamlChunker.isStructuredData('{invalid: json: content: {{')).toBe(false)
})
@@ -60,7 +58,6 @@ describe('JsonYamlChunker', () => {
const json = '{}'
const chunks = await chunker.chunk(json)
- // Empty object is valid JSON, should return at least metadata
expect(chunks.length).toBeGreaterThanOrEqual(0)
})
@@ -203,7 +200,6 @@ server:
const json = '[]'
const chunks = await chunker.chunk(json)
- // Empty array should not produce chunks with meaningful content
expect(chunks.length).toBeGreaterThanOrEqual(0)
})
@@ -271,7 +267,6 @@ server:
it.concurrent('should fall back to text chunking for invalid JSON', async () => {
const chunker = new JsonYamlChunker({ chunkSize: 100, minCharactersPerChunk: 10 })
- // Create content that fails YAML parsing and is long enough to produce chunks
const invalidJson = `{this is not valid json: content: {{${' more content here '.repeat(10)}`
const chunks = await chunker.chunk(invalidJson)
@@ -376,9 +371,7 @@ server:
const json = JSON.stringify({ a: 1, b: 2, c: 3 })
const chunks = await chunker.chunk(json)
- // Should produce chunks that are valid
expect(chunks.length).toBeGreaterThan(0)
- // The entire small object fits in one chunk
expect(chunks[0].text.length).toBeGreaterThan(0)
})
})
diff --git a/apps/sim/lib/chunkers/json-yaml-chunker.ts b/apps/sim/lib/chunkers/json-yaml-chunker.ts
index 78efcd6dac4..d18cd0859f9 100644
--- a/apps/sim/lib/chunkers/json-yaml-chunker.ts
+++ b/apps/sim/lib/chunkers/json-yaml-chunker.ts
@@ -12,10 +12,6 @@ type JsonArray = JsonValue[]
const MAX_DEPTH = 5
-/**
- * Structure-aware chunker for JSON and YAML content
- * Recursively decomposes objects and arrays while preserving structure
- */
export class JsonYamlChunker {
private chunkSize: number
private minCharactersPerChunk: number
@@ -25,9 +21,6 @@ export class JsonYamlChunker {
this.minCharactersPerChunk = options.minCharactersPerChunk ?? 100
}
- /**
- * Check if content is structured JSON/YAML data (object or array, not a primitive)
- */
static isStructuredData(content: string): boolean {
try {
const parsed = JSON.parse(content)
@@ -42,9 +35,6 @@ export class JsonYamlChunker {
}
}
- /**
- * Chunk JSON/YAML content intelligently based on structure
- */
async chunk(content: string): Promise {
try {
let data: JsonValue
@@ -65,9 +55,6 @@ export class JsonYamlChunker {
}
}
- /**
- * Chunk structured data based on its structure
- */
private chunkStructuredData(data: JsonValue, path: string[], depth: number): Chunk[] {
if (Array.isArray(data)) {
return this.chunkArray(data, path, depth)
@@ -99,9 +86,6 @@ export class JsonYamlChunker {
]
}
- /**
- * Chunk an array by batching items until the token budget is reached
- */
private chunkArray(arr: JsonArray, path: string[], depth: number): Chunk[] {
const chunks: Chunk[] = []
let currentBatch: JsonValue[] = []
@@ -158,9 +142,6 @@ export class JsonYamlChunker {
return chunks
}
- /**
- * Chunk an object by grouping key-value pairs until the token budget is reached
- */
private chunkObject(obj: JsonObject, path: string[], depth: number): Chunk[] {
const chunks: Chunk[] = []
const entries = Object.entries(obj)
@@ -239,9 +220,6 @@ export class JsonYamlChunker {
return chunks
}
- /**
- * Build a chunk from a batch of array items
- */
private buildBatchChunk(
contextHeader: string,
batch: JsonValue[],
@@ -256,9 +234,6 @@ export class JsonYamlChunker {
}
}
- /**
- * Fall back to text chunking if JSON parsing fails
- */
private chunkAsText(content: string): Chunk[] {
const chunks: Chunk[] = []
const lines = content.split('\n')
@@ -296,9 +271,6 @@ export class JsonYamlChunker {
return chunks
}
- /**
- * Static method for chunking JSON/YAML data with default options
- */
static async chunkJsonYaml(content: string, options: ChunkerOptions = {}): Promise {
const chunker = new JsonYamlChunker(options)
return chunker.chunk(content)
diff --git a/apps/sim/lib/chunkers/recursive-chunker.test.ts b/apps/sim/lib/chunkers/recursive-chunker.test.ts
index d013fe5c4b1..846267034cf 100644
--- a/apps/sim/lib/chunkers/recursive-chunker.test.ts
+++ b/apps/sim/lib/chunkers/recursive-chunker.test.ts
@@ -48,7 +48,6 @@ describe('RecursiveChunker', () => {
describe('line splitting fallback', () => {
it.concurrent('should split at newlines when paragraphs are too large', async () => {
const chunker = new RecursiveChunker({ chunkSize: 15 })
- // Single paragraph (no \n\n) but has \n line breaks
const text =
'Line one with content here.\nLine two with content here.\nLine three with content here.\nLine four with content here.'
const chunks = await chunker.chunk(text)
@@ -60,7 +59,6 @@ describe('RecursiveChunker', () => {
describe('sentence splitting fallback', () => {
it.concurrent('should split at sentence boundaries when lines are too large', async () => {
const chunker = new RecursiveChunker({ chunkSize: 10 })
- // Single line, no \n, but has ". " sentence boundaries
const text =
'First sentence here. Second sentence here. Third sentence here. Fourth sentence here.'
const chunks = await chunker.chunk(text)
@@ -72,7 +70,6 @@ describe('RecursiveChunker', () => {
describe('word splitting fallback', () => {
it.concurrent('should split at spaces when sentences are too large', async () => {
const chunker = new RecursiveChunker({ chunkSize: 5 })
- // No paragraph, line, or sentence breaks - only spaces
const text = 'word1 word2 word3 word4 word5 word6 word7 word8 word9 word10'
const chunks = await chunker.chunk(text)
@@ -88,8 +85,6 @@ describe('RecursiveChunker', () => {
const chunks = await chunker.chunk(text)
if (chunks.length > 1) {
- // The separator (\n\n) is prepended to parts after index 0, so subsequent
- // chunks should start with the separator used for splitting
expect(chunks[1].text.startsWith('\n\n') || chunks[1].text.length > 0).toBe(true)
}
})
@@ -170,7 +165,6 @@ describe('RecursiveChunker', () => {
const chunks = await chunker.chunk(text)
for (const chunk of chunks) {
- // Allow small tolerance for word boundary alignment
expect(chunk.tokenCount).toBeLessThanOrEqual(chunkSize + 5)
}
})
@@ -184,7 +178,6 @@ describe('RecursiveChunker', () => {
const chunks = await chunker.chunk(text)
if (chunks.length > 1) {
- // With overlap, second chunk should contain some text from the end of the first
expect(chunks[1].text.length).toBeGreaterThan(0)
}
})
diff --git a/apps/sim/lib/chunkers/recursive-chunker.ts b/apps/sim/lib/chunkers/recursive-chunker.ts
index f7fa064e096..16b451e3e96 100644
--- a/apps/sim/lib/chunkers/recursive-chunker.ts
+++ b/apps/sim/lib/chunkers/recursive-chunker.ts
@@ -52,12 +52,6 @@ const RECIPES = {
],
} as const
-/**
- * Recursive delimiter-based chunker
- * Splits text using a configurable hierarchy of separators.
- * At each level, splits on the separator, merges small pieces, then
- * recurses to the next level for any chunks that are still too large.
- */
export class RecursiveChunker {
private readonly chunkSize: number
private readonly chunkOverlap: number
diff --git a/apps/sim/lib/chunkers/regex-chunker.test.ts b/apps/sim/lib/chunkers/regex-chunker.test.ts
index 0aa24053ee1..5b64cf3f495 100644
--- a/apps/sim/lib/chunkers/regex-chunker.test.ts
+++ b/apps/sim/lib/chunkers/regex-chunker.test.ts
@@ -73,7 +73,6 @@ describe('RegexChunker', () => {
'should sub-chunk segments larger than chunkSize via word boundaries',
async () => {
const chunker = new RegexChunker({ pattern: '---', chunkSize: 10 })
- // Each segment is well over 10 tokens (40 chars = 10 tokens)
const longSegment =
'This is a very long segment with many words that exceeds the chunk size limit significantly. '
const text = `${longSegment}---${longSegment}`
diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts
index 021f626234e..8bc4c5a9fbc 100644
--- a/apps/sim/lib/chunkers/regex-chunker.ts
+++ b/apps/sim/lib/chunkers/regex-chunker.ts
@@ -14,11 +14,6 @@ const logger = createLogger('RegexChunker')
const MAX_PATTERN_LENGTH = 500
-/**
- * Regex pattern-based chunker
- * Splits text using a user-defined regex pattern, then merges small segments
- * until the chunk size limit is reached.
- */
export class RegexChunker {
private readonly chunkSize: number
private readonly chunkOverlap: number
@@ -43,7 +38,6 @@ export class RegexChunker {
try {
const regex = new RegExp(pattern, 'g')
- // Test against adversarial strings to catch catastrophic backtracking
const testStrings = [
'a'.repeat(10000),
' '.repeat(10000),
diff --git a/apps/sim/lib/chunkers/sentence-chunker.test.ts b/apps/sim/lib/chunkers/sentence-chunker.test.ts
index 7c6075cfccf..78708de29ad 100644
--- a/apps/sim/lib/chunkers/sentence-chunker.test.ts
+++ b/apps/sim/lib/chunkers/sentence-chunker.test.ts
@@ -95,11 +95,9 @@ describe('SentenceChunker', () => {
const text = 'The value is 3.14. That is pi.'
const chunks = await chunker.chunk(text)
- // Text is short enough for one chunk, but verify no split at 3.14
const allText = chunks.map((c) => c.text).join(' ')
expect(allText).toContain('3.14')
- // With a large enough chunkSize to hold both sentences, verify exactly 1 chunk
const largeChunker = new SentenceChunker({ chunkSize: 200 })
const largeChunks = await largeChunker.chunk(text)
expect(largeChunks).toHaveLength(1)
@@ -119,13 +117,10 @@ describe('SentenceChunker', () => {
describe('exclamation and question marks', () => {
it.concurrent('should split at exclamation and question marks', async () => {
- // chunkSize: 25 tokens = 100 chars. Each sentence is ~25 chars, so each gets its own chunk.
const chunker = new SentenceChunker({ chunkSize: 10 })
const text = 'What is this? It is great! I agree.'
const chunks = await chunker.chunk(text)
- // Total text is 35 chars = 9 tokens, fits in chunkSize: 10
- // So it returns a single chunk. Use sentence content check instead.
const allText = chunks.map((c) => c.text).join(' ')
expect(allText).toContain('What is this?')
expect(allText).toContain('It is great!')
@@ -133,16 +128,10 @@ describe('SentenceChunker', () => {
})
it.concurrent('should treat ? and ! as sentence boundaries', async () => {
- // Need sentences that individually fit in chunkSize but not combined
const chunker = new SentenceChunker({ chunkSize: 15 })
const text = 'What is this thing? It is really great! I strongly agree.'
const chunks = await chunker.chunk(text)
- // "What is this thing?" = 19 chars = 5 tokens
- // "It is really great!" = 19 chars = 5 tokens
- // "I strongly agree." = 17 chars = 5 tokens
- // Total = 55 chars = 14 tokens, fits in 15. Need smaller chunkSize.
- // Actually at chunkSize: 15 they all fit. Let's check the actual splitting.
expect(chunks.length).toBeGreaterThanOrEqual(1)
const allText = chunks.map((c) => c.text).join(' ')
expect(allText).toContain('?')
@@ -157,23 +146,15 @@ describe('SentenceChunker', () => {
'First sentence. Second sentence. Third sentence. Fourth sentence. Fifth sentence.'
const chunks = await chunker.chunk(text)
- // With minSentencesPerChunk: 2, each chunk (except possibly last) should contain
- // at least 2 sentences
expect(chunks.length).toBeGreaterThan(0)
-
- // Verify that the chunker groups sentences together
- // Total text fits in one chunk at size 100, so this should be 1 chunk
expect(chunks).toHaveLength(1)
})
it.concurrent('should enforce min sentences even when token limit is reached', async () => {
- // Each sentence is ~5 tokens, chunkSize: 6 means we'd normally split after 1
- // But minSentencesPerChunk: 2 forces at least 2 sentences
const chunker = new SentenceChunker({ chunkSize: 6, minSentencesPerChunk: 2 })
const text = 'Short one. Another one. Third one here. Fourth one here.'
const chunks = await chunker.chunk(text)
- // First chunk should contain at least 2 sentences
const firstChunkSentences = chunks[0].text
.split(/(?<=[.!?])\s+/)
.filter((s) => s.trim().length > 0)
@@ -186,12 +167,10 @@ describe('SentenceChunker', () => {
'should chunk a single very long sentence via word-boundary splitting',
async () => {
const chunker = new SentenceChunker({ chunkSize: 10 })
- // 10 tokens = 40 chars, make a sentence much longer than that
const longSentence = `${'word '.repeat(50).trim()}.`
const chunks = await chunker.chunk(longSentence)
expect(chunks.length).toBeGreaterThan(1)
- // Verify all content is preserved
const allText = chunks.map((c) => c.text).join(' ')
expect(allText).toContain('word')
}
@@ -218,10 +197,6 @@ describe('SentenceChunker', () => {
const chunks = await chunker.chunk(text)
if (chunks.length > 1) {
- // The second chunk should contain some text from the end of the first chunk
- const firstChunkWords = chunks[0].text.split(' ')
- const lastWordsOfFirst = firstChunkWords.slice(-3).join(' ')
- // Overlap means the second chunk should start with content from the first
expect(chunks[1].text.length).toBeGreaterThan(0)
}
})
@@ -232,7 +207,6 @@ describe('SentenceChunker', () => {
const chunks = await chunker.chunk(text)
if (chunks.length > 1) {
- // Without overlap, the start of chunk 2 should NOT repeat the end of chunk 1
const chunk1End = chunks[0].text.slice(-20)
expect(chunks[1].text.startsWith(chunk1End)).toBe(false)
}
@@ -291,7 +265,6 @@ describe('SentenceChunker', () => {
const chunks = await chunker.chunk(text)
expect(chunks.length).toBeGreaterThan(1)
- // Allow some tolerance since sentence boundaries may cause slight overflows
for (const chunk of chunks) {
expect(chunk.tokenCount).toBeLessThanOrEqual(chunkSize * 2)
}
diff --git a/apps/sim/lib/chunkers/sentence-chunker.ts b/apps/sim/lib/chunkers/sentence-chunker.ts
index 2f3082f04d3..24aacd18acc 100644
--- a/apps/sim/lib/chunkers/sentence-chunker.ts
+++ b/apps/sim/lib/chunkers/sentence-chunker.ts
@@ -11,11 +11,7 @@ import {
const logger = createLogger('SentenceChunker')
-/**
- * Sentence-based chunker
- * Groups complete sentences into chunks up to the token limit.
- * Never splits mid-sentence unless a single sentence exceeds the limit.
- */
+/** Never splits mid-sentence unless a single sentence exceeds the limit. */
export class SentenceChunker {
private readonly chunkSize: number
private readonly chunkOverlap: number
@@ -28,10 +24,7 @@ export class SentenceChunker {
this.minSentencesPerChunk = options.minSentencesPerChunk ?? 1
}
- /**
- * Split text into sentences using a regex that avoids common false positives
- * like abbreviations (Mr., Dr., U.S.), decimals (3.14), and ellipses (...).
- */
+ /** Splits on sentence boundaries while avoiding abbreviations, decimals, and ellipses. */
private splitSentences(text: string): string[] {
return text
.split(
@@ -101,10 +94,7 @@ export class SentenceChunker {
return buildChunks(rawChunks, this.chunkOverlap)
}
- /**
- * Apply sentence-level overlap using the original sentence groups,
- * avoiding re-splitting joined text back into sentences.
- */
+ /** Applies overlap at the sentence level using original groups to avoid re-splitting. */
private applyOverlapFromGroups(groups: string[][]): string[] {
if (this.chunkOverlap <= 0 || groups.length <= 1) {
return groups.map((g) => g.join(' '))
diff --git a/apps/sim/lib/chunkers/structured-data-chunker.test.ts b/apps/sim/lib/chunkers/structured-data-chunker.test.ts
index 760590bdff7..3cd6b7ec27a 100644
--- a/apps/sim/lib/chunkers/structured-data-chunker.test.ts
+++ b/apps/sim/lib/chunkers/structured-data-chunker.test.ts
@@ -11,19 +11,16 @@ vi.mock('@sim/logger', () => loggerMock)
describe('StructuredDataChunker', () => {
describe('isStructuredData', () => {
it('should detect CSV content with many columns', () => {
- // Detection requires >2 delimiters per line on average
const csv = 'name,age,city,country\nAlice,30,NYC,USA\nBob,25,LA,USA'
expect(StructuredDataChunker.isStructuredData(csv)).toBe(true)
})
it('should detect TSV content with many columns', () => {
- // Detection requires >2 delimiters per line on average
const tsv = 'name\tage\tcity\tcountry\nAlice\t30\tNYC\tUSA\nBob\t25\tLA\tUSA'
expect(StructuredDataChunker.isStructuredData(tsv)).toBe(true)
})
it('should detect pipe-delimited content with many columns', () => {
- // Detection requires >2 delimiters per line on average
const piped = 'name|age|city|country\nAlice|30|NYC|USA\nBob|25|LA|USA'
expect(StructuredDataChunker.isStructuredData(piped)).toBe(true)
})
@@ -64,7 +61,6 @@ describe('StructuredDataChunker', () => {
it('should handle inconsistent delimiter counts', () => {
const inconsistent = 'name,age\nAlice,30,extra\nBob'
- // May or may not detect as structured depending on variance threshold
const result = StructuredDataChunker.isStructuredData(inconsistent)
expect(typeof result).toBe('boolean')
})
@@ -184,7 +180,6 @@ Alice,30`
const csv = 'name,age,city'
const chunks = await StructuredDataChunker.chunkStructuredData(csv)
- // Only header, no data rows
expect(chunks.length).toBeGreaterThanOrEqual(0)
})
@@ -271,7 +266,6 @@ Alice,30`
const chunks = await StructuredDataChunker.chunkStructuredData(csv, { chunkSize: 500 })
expect(chunks.length).toBeGreaterThan(1)
- // Verify total rows are distributed across chunks
const totalRowCount = chunks.reduce((sum, chunk) => {
const match = chunk.text.match(/\[(\d+) rows of data\]/)
return sum + (match ? Number.parseInt(match[1]) : 0)
@@ -319,9 +313,7 @@ Alice,30`
it.concurrent('should not detect with fewer than 3 delimiters per line', async () => {
const sparse = `a,b
1,2`
- // Only 1 comma per line, below threshold of >2
const result = StructuredDataChunker.isStructuredData(sparse)
- // May or may not pass depending on implementation threshold
expect(typeof result).toBe('boolean')
})
})
@@ -337,7 +329,6 @@ Alice,30`
const chunks = await StructuredDataChunker.chunkStructuredData(csv, { chunkSize: 200 })
expect(chunks.length).toBeGreaterThan(1)
- // Each chunk should contain header info
for (const chunk of chunks) {
expect(chunk.text).toContain('Headers:')
}
diff --git a/apps/sim/lib/chunkers/structured-data-chunker.ts b/apps/sim/lib/chunkers/structured-data-chunker.ts
index 93eaa3b61da..82c24a40720 100644
--- a/apps/sim/lib/chunkers/structured-data-chunker.ts
+++ b/apps/sim/lib/chunkers/structured-data-chunker.ts
@@ -11,14 +11,7 @@ const DEFAULT_CONFIG = {
INCLUDE_HEADERS_IN_EACH_CHUNK: true,
} as const
-/**
- * Smart chunker for structured data (CSV, XLSX) that preserves semantic meaning
- * Preserves headers in each chunk for better semantic context
- */
export class StructuredDataChunker {
- /**
- * Chunk structured data intelligently based on rows and token budget
- */
static async chunkStructuredData(
content: string,
options: StructuredDataOptions = {}
@@ -97,9 +90,6 @@ export class StructuredDataChunker {
return chunks
}
- /**
- * Format a chunk with headers and context
- */
private static formatChunk(headerLine: string, rows: string[], sheetName?: string): string {
let content = ''
@@ -118,9 +108,6 @@ export class StructuredDataChunker {
return content
}
- /**
- * Create a chunk object with actual row indices
- */
private static createChunk(content: string, startRow: number, endRow: number): Chunk {
return {
text: content,
@@ -132,9 +119,6 @@ export class StructuredDataChunker {
}
}
- /**
- * Estimate average tokens per row from sample
- */
private static estimateTokensPerRow(sampleRows: string[]): number {
if (sampleRows.length === 0) return 50
@@ -142,9 +126,6 @@ export class StructuredDataChunker {
return Math.ceil(totalTokens / sampleRows.length)
}
- /**
- * Calculate optimal rows per chunk based on token estimates and target size
- */
private static calculateOptimalRowsPerChunk(
tokensPerRow: number,
targetChunkSize: number
@@ -157,9 +138,6 @@ export class StructuredDataChunker {
)
}
- /**
- * Check if content appears to be structured data
- */
static isStructuredData(content: string, mimeType?: string): boolean {
if (mimeType) {
const structuredMimeTypes = [
diff --git a/apps/sim/lib/chunkers/text-chunker.test.ts b/apps/sim/lib/chunkers/text-chunker.test.ts
index 3b8b8455691..f7c2458d4b5 100644
--- a/apps/sim/lib/chunkers/text-chunker.test.ts
+++ b/apps/sim/lib/chunkers/text-chunker.test.ts
@@ -30,7 +30,7 @@ describe('TextChunker', () => {
it.concurrent('should include token count in chunk metadata', async () => {
const chunker = new TextChunker({ chunkSize: 100 })
- const text = 'Hello world' // ~3 tokens (11 chars / 4)
+ const text = 'Hello world'
const chunks = await chunker.chunk(text)
expect(chunks[0].tokenCount).toBe(3)
@@ -201,7 +201,6 @@ describe('TextChunker', () => {
it.concurrent('should use default minCharactersPerChunk of 100', async () => {
const chunker = new TextChunker({ chunkSize: 10 })
- // Text with 150+ characters to ensure chunks pass the 100 character minimum
const text = 'This is a longer sentence with more content. '.repeat(5)
const chunks = await chunker.chunk(text)
@@ -266,7 +265,6 @@ describe('TextChunker', () => {
describe('boundary conditions', () => {
it.concurrent('should handle text exactly at chunk size boundary', async () => {
const chunker = new TextChunker({ chunkSize: 10 })
- // 40 characters = 10 tokens exactly
const text = 'A'.repeat(40)
const chunks = await chunker.chunk(text)
@@ -276,7 +274,6 @@ describe('TextChunker', () => {
it.concurrent('should handle text one token over chunk size', async () => {
const chunker = new TextChunker({ chunkSize: 10 })
- // 44 characters = 11 tokens, just over limit
const text = 'A'.repeat(44)
const chunks = await chunker.chunk(text)
@@ -300,7 +297,6 @@ describe('TextChunker', () => {
})
it.concurrent('should clamp overlap to max 50% of chunk size', async () => {
- // Overlap of 60 should be clamped to 10 (50% of chunkSize 20)
const chunker = new TextChunker({ chunkSize: 20, chunkOverlap: 60 })
const text = 'First paragraph here.\n\nSecond paragraph here.\n\nThird paragraph here.'
const chunks = await chunker.chunk(text)
@@ -359,7 +355,6 @@ describe('TextChunker', () => {
it.concurrent('should handle combining diacritics', async () => {
const chunker = new TextChunker({ chunkSize: 100 })
- // e + combining acute accent
const text = 'cafe\u0301 resume\u0301 naive\u0308'
const chunks = await chunker.chunk(text)
@@ -368,7 +363,6 @@ describe('TextChunker', () => {
it.concurrent('should handle zero-width characters', async () => {
const chunker = new TextChunker({ chunkSize: 100 })
- // Zero-width space, zero-width non-joiner, zero-width joiner
const text = 'Hello\u200B\u200C\u200DWorld'
const chunks = await chunker.chunk(text)
@@ -391,14 +385,12 @@ describe('TextChunker', () => {
const chunks = await chunker.chunk(text)
expect(chunks.length).toBeGreaterThan(1)
- // Verify all content is preserved
const totalChars = chunks.reduce((sum, c) => sum + c.text.length, 0)
expect(totalChars).toBeGreaterThan(0)
})
it.concurrent('should handle 1MB of text', async () => {
const chunker = new TextChunker({ chunkSize: 500 })
- // 1MB of text
const text = 'Lorem ipsum dolor sit amet. '.repeat(40000)
const chunks = await chunker.chunk(text)
@@ -407,7 +399,6 @@ describe('TextChunker', () => {
it.concurrent('should handle very long single line', async () => {
const chunker = new TextChunker({ chunkSize: 50 })
- // Single line with no natural break points
const text = 'Word'.repeat(10000)
const chunks = await chunker.chunk(text)
diff --git a/apps/sim/lib/chunkers/text-chunker.ts b/apps/sim/lib/chunkers/text-chunker.ts
index 358660a63d5..7e9b5a064dd 100644
--- a/apps/sim/lib/chunkers/text-chunker.ts
+++ b/apps/sim/lib/chunkers/text-chunker.ts
@@ -9,10 +9,6 @@ import {
tokensToChars,
} from '@/lib/chunkers/utils'
-/**
- * Lightweight text chunker optimized for RAG applications
- * Uses hierarchical splitting with simple character-based token estimation
- */
export class TextChunker {
private readonly chunkSize: number
private readonly chunkOverlap: number
@@ -43,9 +39,6 @@ export class TextChunker {
this.chunkOverlap = resolved.chunkOverlap
}
- /**
- * Split text recursively using hierarchical separators
- */
private splitRecursively(text: string, separatorIndex = 0): string[] {
const tokenCount = estimateTokens(text)
@@ -98,9 +91,6 @@ export class TextChunker {
return chunks
}
- /**
- * Main chunking method
- */
async chunk(text: string): Promise {
if (!text?.trim()) {
return []
diff --git a/apps/sim/lib/chunkers/token-chunker.test.ts b/apps/sim/lib/chunkers/token-chunker.test.ts
index 2f368e84a3f..420224c4d6e 100644
--- a/apps/sim/lib/chunkers/token-chunker.test.ts
+++ b/apps/sim/lib/chunkers/token-chunker.test.ts
@@ -37,7 +37,7 @@ describe('TokenChunker', () => {
describe('token count accuracy', () => {
it.concurrent('should compute tokenCount as Math.ceil(text.length / 4)', async () => {
const chunker = new TokenChunker({ chunkSize: 100 })
- const text = 'Hello world' // 11 chars -> ceil(11/4) = 3
+ const text = 'Hello world'
const chunks = await chunker.chunk(text)
expect(chunks[0].tokenCount).toBe(Math.ceil(text.length / 4))
@@ -45,7 +45,7 @@ describe('TokenChunker', () => {
it.concurrent('should compute tokenCount correctly for longer text', async () => {
const chunker = new TokenChunker({ chunkSize: 100 })
- const text = 'The quick brown fox jumps over the lazy dog.' // 44 chars -> ceil(44/4) = 11
+ const text = 'The quick brown fox jumps over the lazy dog.'
const chunks = await chunker.chunk(text)
expect(chunks[0].tokenCount).toBe(11)
@@ -192,7 +192,6 @@ describe('TokenChunker', () => {
for (const chunk of chunks) {
const trimmed = chunk.text.trim()
- // Should not start or end with a partial word (space in the middle)
expect(trimmed).toBe(chunk.text)
}
})
diff --git a/apps/sim/lib/chunkers/token-chunker.ts b/apps/sim/lib/chunkers/token-chunker.ts
index 6ee643653dd..6f7bb555231 100644
--- a/apps/sim/lib/chunkers/token-chunker.ts
+++ b/apps/sim/lib/chunkers/token-chunker.ts
@@ -11,12 +11,6 @@ import {
const logger = createLogger('TokenChunker')
-/**
- * Fixed-size token chunker
- * Splits text into chunks of a fixed token size with configurable overlap.
- * Uses a sliding window approach (matching LangChain/Chonkie) where chunks
- * stay within the size limit. The window advances by chunkSize - overlap.
- */
export class TokenChunker {
private readonly chunkSize: number
private readonly chunkOverlap: number
diff --git a/apps/sim/lib/chunkers/types.ts b/apps/sim/lib/chunkers/types.ts
index ad9a54e61b2..692e84d12fc 100644
--- a/apps/sim/lib/chunkers/types.ts
+++ b/apps/sim/lib/chunkers/types.ts
@@ -1,17 +1,11 @@
/**
- * Options for configuring text chunkers
- *
* Units:
- * - chunkSize: Maximum chunk size in TOKENS (1 token ≈ 4 characters)
- * - chunkOverlap: Overlap between chunks in TOKENS
- * - minCharactersPerChunk: Minimum chunk size in CHARACTERS (filters tiny fragments)
+ * - chunkSize/chunkOverlap: TOKENS (1 token ≈ 4 characters)
+ * - minCharactersPerChunk: CHARACTERS
*/
export interface ChunkerOptions {
- /** Maximum chunk size in tokens (default: 1024) */
chunkSize?: number
- /** Overlap between chunks in tokens (default: 0) */
chunkOverlap?: number
- /** Minimum chunk size in characters to avoid tiny fragments (default: 100) */
minCharactersPerChunk?: number
}
@@ -52,38 +46,25 @@ export interface DocsChunkerOptions extends ChunkerOptions {
baseUrl?: string
}
-/** Available chunking strategies for knowledge base documents */
export type ChunkingStrategy = 'auto' | 'text' | 'regex' | 'recursive' | 'sentence' | 'token'
-/** Pre-built separator recipes for recursive chunking */
export type RecursiveRecipe = 'plain' | 'markdown' | 'code'
-/** Strategy-specific options passed through the stack */
export interface StrategyOptions {
- /** Regex pattern for 'regex' strategy */
pattern?: string
- /** Custom separator hierarchy for 'recursive' strategy */
separators?: string[]
- /** Pre-built separator recipe for 'recursive' strategy */
recipe?: RecursiveRecipe
}
-/** Options for sentence-based chunking */
export interface SentenceChunkerOptions extends ChunkerOptions {
- /** Minimum number of sentences per chunk (default: 1) */
minSentencesPerChunk?: number
}
-/** Options for recursive delimiter-based chunking */
export interface RecursiveChunkerOptions extends ChunkerOptions {
- /** Custom separator hierarchy (overrides recipe if provided) */
separators?: string[]
- /** Pre-built separator recipe (default: 'plain') */
recipe?: RecursiveRecipe
}
-/** Options for regex pattern-based chunking */
export interface RegexChunkerOptions extends ChunkerOptions {
- /** Regex pattern string used to split text */
pattern: string
}
diff --git a/apps/sim/lib/chunkers/utils.ts b/apps/sim/lib/chunkers/utils.ts
index a9eb5c9c1cb..11acdca03b4 100644
--- a/apps/sim/lib/chunkers/utils.ts
+++ b/apps/sim/lib/chunkers/utils.ts
@@ -1,24 +1,15 @@
import type { Chunk } from '@/lib/chunkers/types'
-/**
- * Estimate token count from text length
- * 1 token ≈ 4 characters for English text
- */
+/** 1 token ≈ 4 characters for English text */
export function estimateTokens(text: string): number {
if (!text?.trim()) return 0
return Math.ceil(text.length / 4)
}
-/**
- * Convert token count to approximate character count
- */
export function tokensToChars(tokens: number): number {
return tokens * 4
}
-/**
- * Clean and normalize text for chunking
- */
export function cleanText(text: string): string {
return text
.replace(/\r\n/g, '\n')
@@ -29,10 +20,6 @@ export function cleanText(text: string): string {
.trim()
}
-/**
- * Add overlap between consecutive chunks using word-boundary alignment
- * Overlap is specified in characters
- */
export function addOverlap(chunks: string[], overlapChars: number): string[] {
if (overlapChars <= 0 || chunks.length <= 1) {
return chunks
@@ -65,10 +52,8 @@ export function addOverlap(chunks: string[], overlapChars: number): string[] {
}
/**
- * Split text at word boundaries into segments of approximately chunkSizeChars.
* When stepChars is provided (< chunkSizeChars), produces overlapping chunks
- * using a sliding window, matching LangChain/Chonkie behavior where
- * chunks stay within the size limit.
+ * using a sliding window where chunks stay within the size limit.
*/
export function splitAtWordBoundaries(
text: string,
@@ -103,9 +88,6 @@ export function splitAtWordBoundaries(
return parts
}
-/**
- * Build Chunk objects from text segments with startIndex/endIndex metadata
- */
export function buildChunks(texts: string[], overlapTokens: number): Chunk[] {
let previousEndIndex = 0
const overlapChars = tokensToChars(overlapTokens)
@@ -140,9 +122,6 @@ export function buildChunks(texts: string[], overlapTokens: number): Chunk[] {
})
}
-/**
- * Resolve common chunker options with defaults and clamping
- */
export function resolveChunkerOptions(options: {
chunkSize?: number
chunkOverlap?: number
diff --git a/apps/sim/lib/knowledge/documents/document-processor.ts b/apps/sim/lib/knowledge/documents/document-processor.ts
index 4caecb55dd3..2d652e9a11a 100644
--- a/apps/sim/lib/knowledge/documents/document-processor.ts
+++ b/apps/sim/lib/knowledge/documents/document-processor.ts
@@ -54,9 +54,6 @@ type OCRRequestBody = {
const MISTRAL_MAX_PAGES = 1000
-/**
- * Get page count from a PDF buffer using unpdf
- */
async function getPdfPageCount(buffer: Buffer): Promise<number> {
try {
const { getDocumentProxy } = await import('unpdf')
@@ -69,10 +66,6 @@ async function getPdfPageCount(buffer: Buffer): Promise {
}
}
-/**
- * Split a PDF buffer into multiple smaller PDFs
- * Returns an array of PDF buffers, each with at most maxPages pages
- */
async function splitPdfIntoChunks(
pdfBuffer: Buffer,
maxPages: number
@@ -122,9 +115,6 @@ class APIError extends Error {
}
}
-/**
- * Apply a specific chunking strategy to content
- */
async function applyStrategy(
strategy: ChunkingStrategy,
content: string,
@@ -207,7 +197,6 @@ export async function processDocument(
let chunks: Chunk[]
const metadata: FileParseMetadata = parseResult.metadata ?? {}
- // If an explicit strategy is set (not 'auto'), use that chunker directly
if (strategy && strategy !== 'auto') {
logger.info(`Using explicit chunking strategy: ${strategy}`)
chunks = await applyStrategy(
@@ -219,7 +208,6 @@ export async function processDocument(
strategyOptions
)
} else {
- // Auto-detect based on content type
const isJsonYaml =
metadata.type === 'json' ||
metadata.type === 'yaml' ||
@@ -642,9 +630,6 @@ async function executeMistralOCRRequest(
)
}
-/**
- * Process a single PDF chunk: upload to S3, OCR, cleanup
- */
async function processChunk(
chunk: { buffer: Buffer; startPage: number; endPage: number },
chunkIndex: number,
@@ -662,7 +647,6 @@ async function processChunk(
let uploadedKey: string | null = null
try {
- // Upload the chunk to S3
const timestamp = Date.now()
const uniqueId = Math.random().toString(36).substring(2, 9)
const safeFileName = filename.replace(/[^a-zA-Z0-9.-]/g, '_')
@@ -694,7 +678,6 @@ async function processChunk(
logger.info(`Uploaded chunk ${chunkIndex + 1} to S3: ${chunkKey}`)
- // Process the chunk with Mistral OCR
const params = {
filePath: chunkUrl,
apiKey,
@@ -716,7 +699,6 @@ async function processChunk(
})
return { index: chunkIndex, content: null }
} finally {
- // Clean up the chunk file from S3 after processing
if (uploadedKey) {
try {
await StorageService.deleteFile({ key: uploadedKey, context: 'knowledge-base' })
@@ -751,7 +733,6 @@ async function processMistralOCRInBatches(
`Split into ${pdfChunks.length} chunks, processing with concurrency ${MAX_CONCURRENT_CHUNKS}`
)
- // Process chunks concurrently with limited concurrency
const results: { index: number; content: string | null }[] = []
for (let i = 0; i < pdfChunks.length; i += MAX_CONCURRENT_CHUNKS) {
@@ -770,15 +751,12 @@ async function processMistralOCRInBatches(
)
}
- // Sort by index to maintain page order and filter out nulls
const sortedResults = results
.sort((a, b) => a.index - b.index)
.filter((r) => r.content !== null)
.map((r) => r.content as string)
if (sortedResults.length === 0) {
- // Don't fall back to file parser for large PDFs - it produces poor results
- // Better to fail clearly than return low-quality extraction
throw new Error(
        `OCR failed for all ${pdfChunks.length} chunks of ${filename}. ` +
`Large PDFs require OCR - file parser fallback would produce poor results.`
diff --git a/apps/sim/lib/knowledge/documents/service.ts b/apps/sim/lib/knowledge/documents/service.ts
index 6b12ced53ea..c37aa22a53d 100644
--- a/apps/sim/lib/knowledge/documents/service.ts
+++ b/apps/sim/lib/knowledge/documents/service.ts
@@ -52,10 +52,9 @@ import { calculateCost } from '@/providers/utils'
const logger = createLogger('DocumentService')
const TIMEOUTS = {
- OVERALL_PROCESSING: (env.KB_CONFIG_MAX_DURATION || 600) * 1000, // Default 10 minutes for KB document processing
+ OVERALL_PROCESSING: (env.KB_CONFIG_MAX_DURATION || 600) * 1000,
} as const
-// Configuration for handling large documents
const LARGE_DOC_CONFIG = {
MAX_CHUNKS_PER_BATCH: 500,
MAX_EMBEDDING_BATCH: env.KB_CONFIG_BATCH_SIZE || 2000,
@@ -63,9 +62,6 @@ const LARGE_DOC_CONFIG = {
MAX_CHUNKS_PER_DOCUMENT: 100000,
}
-/**
- * Create a timeout wrapper for async operations
- */
function withTimeout(
promise: Promise,
timeoutMs: number,
@@ -174,10 +170,6 @@ export interface DocumentTagData {
value: string
}
-/**
- * Process structured document tags and validate them against existing definitions
- * Throws an error if a tag doesn't exist or if the value doesn't match the expected type
- */
export async function processDocumentTags(
knowledgeBaseId: string,
tagData: DocumentTagData[],
@@ -355,9 +347,6 @@ export async function processDocumentTags(
return result
}
-/**
- * Process documents with the configured background execution backend.
- */
export async function processDocumentsWithQueue(
createdDocuments: DocumentData[],
knowledgeBaseId: string,
@@ -408,9 +397,6 @@ export async function processDocumentsWithQueue(
return
}
-/**
- * Process a document asynchronously with full error handling
- */
export async function processDocumentAsync(
knowledgeBaseId: string,
documentId: string,
@@ -534,7 +520,6 @@ export async function processDocumentAsync(
const documentRecord = await db
.select({
- // Text tags (7 slots)
tag1: document.tag1,
tag2: document.tag2,
tag3: document.tag3,
@@ -542,16 +527,13 @@ export async function processDocumentAsync(
tag5: document.tag5,
tag6: document.tag6,
tag7: document.tag7,
- // Number tags (5 slots)
number1: document.number1,
number2: document.number2,
number3: document.number3,
number4: document.number4,
number5: document.number5,
- // Date tags (2 slots)
date1: document.date1,
date2: document.date2,
- // Boolean tags (3 slots)
boolean1: document.boolean1,
boolean2: document.boolean2,
boolean3: document.boolean3,
@@ -583,7 +565,6 @@ export async function processDocumentAsync(
embeddingModel: 'text-embedding-3-small',
startOffset: chunk.metadata.startIndex,
endOffset: chunk.metadata.endIndex,
- // Copy text tags from document (7 slots)
tag1: documentTags.tag1,
tag2: documentTags.tag2,
tag3: documentTags.tag3,
@@ -591,16 +572,13 @@ export async function processDocumentAsync(
tag5: documentTags.tag5,
tag6: documentTags.tag6,
tag7: documentTags.tag7,
- // Copy number tags from document (5 slots)
number1: documentTags.number1,
number2: documentTags.number2,
number3: documentTags.number3,
number4: documentTags.number4,
number5: documentTags.number5,
- // Copy date tags from document (2 slots)
date1: documentTags.date1,
date2: documentTags.date2,
- // Copy boolean tags from document (3 slots)
boolean1: documentTags.boolean1,
boolean2: documentTags.boolean2,
boolean3: documentTags.boolean3,
@@ -724,16 +702,10 @@ export async function processDocumentAsync(
}
}
-/**
- * Check if Trigger.dev is available and configured
- */
export function isTriggerAvailable(): boolean {
return Boolean(env.TRIGGER_SECRET_KEY) && isTriggerDevEnabled
}
-/**
- * Process documents using Trigger.dev
- */
export async function processDocumentsWithTrigger(
documents: DocumentProcessingPayload[],
requestId: string
@@ -782,9 +754,6 @@ export async function processDocumentsWithTrigger(
}
}
-/**
- * Create document records in database with tags
- */
export async function createDocumentRecords(
documents: Array<{
filename: string
@@ -853,7 +822,6 @@ export async function createDocumentRecords(
processingStatus: 'pending' as const,
enabled: true,
uploadedAt: now,
- // Text tags - use processed tags if available, otherwise fall back to individual tag fields
tag1: processedTags.tag1 ?? docData.tag1 ?? null,
tag2: processedTags.tag2 ?? docData.tag2 ?? null,
tag3: processedTags.tag3 ?? docData.tag3 ?? null,
@@ -861,16 +829,13 @@ export async function createDocumentRecords(
tag5: processedTags.tag5 ?? docData.tag5 ?? null,
tag6: processedTags.tag6 ?? docData.tag6 ?? null,
tag7: processedTags.tag7 ?? docData.tag7 ?? null,
- // Number tags (5 slots)
number1: processedTags.number1 ?? null,
number2: processedTags.number2 ?? null,
number3: processedTags.number3 ?? null,
number4: processedTags.number4 ?? null,
number5: processedTags.number5 ?? null,
- // Date tags (2 slots)
date1: processedTags.date1 ?? null,
date2: processedTags.date2 ?? null,
- // Boolean tags (3 slots)
boolean1: processedTags.boolean1 ?? null,
boolean2: processedTags.boolean2 ?? null,
boolean3: processedTags.boolean3 ?? null,
@@ -902,9 +867,6 @@ export async function createDocumentRecords(
})
}
-/**
- * A single tag filter condition passed from the API layer.
- */
export interface TagFilterCondition {
tagSlot: string
fieldType: 'text' | 'number' | 'date' | 'boolean'
@@ -913,9 +875,6 @@ export interface TagFilterCondition {
valueTo?: string
}
-/**
- * Builds a Drizzle SQL condition from a tag filter.
- */
const ALLOWED_TAG_SLOTS = new Set([
'tag1',
'tag2',
@@ -1044,9 +1003,6 @@ function buildTagFilterCondition(filter: TagFilterCondition): SQL | undefined {
return undefined
}
-/**
- * Get documents for a knowledge base with filtering and pagination
- */
export async function getDocuments(
knowledgeBaseId: string,
options: {
@@ -1075,7 +1031,6 @@ export async function getDocuments(
processingError: string | null
enabled: boolean
uploadedAt: Date
- // Text tags
tag1: string | null
tag2: string | null
tag3: string | null
@@ -1083,20 +1038,16 @@ export async function getDocuments(
tag5: string | null
tag6: string | null
tag7: string | null
- // Number tags
number1: number | null
number2: number | null
number3: number | null
number4: number | null
number5: number | null
- // Date tags
date1: Date | null
date2: Date | null
- // Boolean tags
boolean1: boolean | null
boolean2: boolean | null
boolean3: boolean | null
- // Connector fields
connectorId: string | null
connectorType: string | null
sourceUrl: string | null
@@ -1193,7 +1144,6 @@ export async function getDocuments(
processingError: document.processingError,
enabled: document.enabled,
uploadedAt: document.uploadedAt,
- // Text tags (7 slots)
tag1: document.tag1,
tag2: document.tag2,
tag3: document.tag3,
@@ -1201,20 +1151,16 @@ export async function getDocuments(
tag5: document.tag5,
tag6: document.tag6,
tag7: document.tag7,
- // Number tags (5 slots)
number1: document.number1,
number2: document.number2,
number3: document.number3,
number4: document.number4,
number5: document.number5,
- // Date tags (2 slots)
date1: document.date1,
date2: document.date2,
- // Boolean tags (3 slots)
boolean1: document.boolean1,
boolean2: document.boolean2,
boolean3: document.boolean3,
- // Connector fields
connectorId: document.connectorId,
connectorType: knowledgeConnector.connectorType,
sourceUrl: document.sourceUrl,
@@ -1246,7 +1192,6 @@ export async function getDocuments(
processingError: doc.processingError,
enabled: doc.enabled,
uploadedAt: doc.uploadedAt,
- // Text tags
tag1: doc.tag1,
tag2: doc.tag2,
tag3: doc.tag3,
@@ -1254,20 +1199,16 @@ export async function getDocuments(
tag5: doc.tag5,
tag6: doc.tag6,
tag7: doc.tag7,
- // Number tags
number1: doc.number1,
number2: doc.number2,
number3: doc.number3,
number4: doc.number4,
number5: doc.number5,
- // Date tags
date1: doc.date1,
date2: doc.date2,
- // Boolean tags
boolean1: doc.boolean1,
boolean2: doc.boolean2,
boolean3: doc.boolean3,
- // Connector fields
connectorId: doc.connectorId,
connectorType: doc.connectorType ?? null,
sourceUrl: doc.sourceUrl,
@@ -1281,9 +1222,6 @@ export async function getDocuments(
}
}
-/**
- * Create a single document record
- */
export async function createSingleDocument(
documentData: {
filename: string
@@ -1325,7 +1263,6 @@ export async function createSingleDocument(
const now = new Date()
let processedTags: ProcessedDocumentTags = {
- // Text tags (7 slots)
tag1: documentData.tag1 ?? null,
tag2: documentData.tag2 ?? null,
tag3: documentData.tag3 ?? null,
@@ -1333,16 +1270,13 @@ export async function createSingleDocument(
tag5: documentData.tag5 ?? null,
tag6: documentData.tag6 ?? null,
tag7: documentData.tag7 ?? null,
- // Number tags (5 slots)
number1: null,
number2: null,
number3: null,
number4: null,
number5: null,
- // Date tags (2 slots)
date1: null,
date2: null,
- // Boolean tags (3 slots)
boolean1: null,
boolean2: null,
boolean3: null,
@@ -1422,9 +1356,6 @@ export async function createSingleDocument(
}
}
-/**
- * Perform bulk operations on documents
- */
export async function bulkDocumentOperation(
knowledgeBaseId: string,
operation: 'enable' | 'disable' | 'delete',
@@ -1514,9 +1445,6 @@ export async function bulkDocumentOperation(
}
}
-/**
- * Perform bulk operations on all documents matching a filter
- */
export async function bulkDocumentOperationByFilter(
knowledgeBaseId: string,
operation: 'enable' | 'disable' | 'delete',
@@ -1588,9 +1516,6 @@ export async function bulkDocumentOperationByFilter(
}
}
-/**
- * Mark a document as failed due to timeout
- */
export async function markDocumentAsFailedTimeout(
documentId: string,
processingStartedAt: Date,
@@ -1623,9 +1548,6 @@ export async function markDocumentAsFailedTimeout(
}
}
-/**
- * Retry processing a failed document
- */
export async function retryDocumentProcessing(
knowledgeBaseId: string,
documentId: string,
@@ -1678,9 +1600,6 @@ export async function retryDocumentProcessing(
}
}
-/**
- * Update a document with specified fields
- */
export async function updateDocument(
documentId: string,
updateData: {
@@ -1691,7 +1610,6 @@ export async function updateDocument(
characterCount?: number
processingStatus?: 'pending' | 'processing' | 'completed' | 'failed'
processingError?: string
- // Text tags
tag1?: string
tag2?: string
tag3?: string
@@ -1699,16 +1617,13 @@ export async function updateDocument(
tag5?: string
tag6?: string
tag7?: string
- // Number tags
number1?: string
number2?: string
number3?: string
number4?: string
number5?: string
- // Date tags
date1?: string
date2?: string
- // Boolean tags
boolean1?: string
boolean2?: string
boolean3?: string
@@ -1777,7 +1692,6 @@ export async function updateDocument(
boolean2: boolean | null
boolean3: boolean | null
}> = {}
- // All tag slots across all field types
const ALL_TAG_SLOTS = [
'tag1',
'tag2',
@@ -1799,7 +1713,6 @@ export async function updateDocument(
] as const
type TagSlot = (typeof ALL_TAG_SLOTS)[number]
- // Regular field updates
if (updateData.filename !== undefined) dbUpdateData.filename = updateData.filename
if (updateData.enabled !== undefined) dbUpdateData.enabled = updateData.enabled
if (updateData.chunkCount !== undefined) dbUpdateData.chunkCount = updateData.chunkCount
@@ -1817,26 +1730,21 @@ export async function updateDocument(
): string | number | Date | boolean | null => {
if (value === undefined || value === '') return null
- // Number slots
if (slot.startsWith('number')) {
return parseNumberValue(value)
}
- // Date slots
if (slot.startsWith('date')) {
return parseDateValue(value)
}
- // Boolean slots
if (slot.startsWith('boolean')) {
return parseBooleanValue(value) ?? false
}
- // Text slots: keep as string
return value || null
}
- // Type-safe access to tag slots in updateData
type UpdateDataWithTags = typeof updateData & Record
const typedUpdateData = updateData as UpdateDataWithTags
@@ -2049,9 +1957,6 @@ export async function hardDeleteDocuments(
return existingIds.length
}
-/**
- * Hard delete a document.
- */
export async function deleteDocument(
documentId: string,
requestId: string
diff --git a/apps/sim/lib/knowledge/types.ts b/apps/sim/lib/knowledge/types.ts
index bd52f0c06ca..6fe1a8bbaff 100644
--- a/apps/sim/lib/knowledge/types.ts
+++ b/apps/sim/lib/knowledge/types.ts
@@ -1,23 +1,15 @@
import type { ChunkingStrategy, StrategyOptions } from '@/lib/chunkers/types'
/**
- * Configuration for document chunking in knowledge bases
- *
* Units:
- * - maxSize: Maximum chunk size in TOKENS (1 token ≈ 4 characters)
- * - minSize: Minimum chunk size in CHARACTERS (floor to avoid tiny fragments)
- * - overlap: Overlap between chunks in TOKENS (1 token ≈ 4 characters)
+ * - maxSize/overlap: TOKENS (1 token ≈ 4 characters)
+ * - minSize: CHARACTERS
*/
export interface ChunkingConfig {
- /** Maximum chunk size in tokens (default: 1024, range: 100-4000) */
maxSize: number
- /** Minimum chunk size in characters (default: 100, range: 1-2000) */
minSize: number
- /** Overlap between chunks in tokens (default: 200, range: 0-500) */
overlap: number
- /** Chunking strategy (default: 'auto' for content-type detection) */
strategy?: ChunkingStrategy
- /** Strategy-specific options */
strategyOptions?: StrategyOptions
}
@@ -69,19 +61,16 @@ export interface UpdateTagDefinitionData {
fieldType?: string
}
-/** Tag filter for knowledge base search */
export interface StructuredFilter {
- tagName?: string // Human-readable name (input from frontend)
- tagSlot: string // Database column (resolved from tagName)
+ tagName?: string
+ tagSlot: string
fieldType: string
operator: string
value: string | number | boolean
valueTo?: string | number
}
-/** Processed document tags ready for database storage */
export interface ProcessedDocumentTags {
- // Text tags
tag1: string | null
tag2: string | null
tag3: string | null
@@ -89,29 +78,21 @@ export interface ProcessedDocumentTags {
tag5: string | null
tag6: string | null
tag7: string | null
- // Number tags
number1: number | null
number2: number | null
number3: number | null
number4: number | null
number5: number | null
- // Date tags
date1: Date | null
date2: Date | null
- // Boolean tags
boolean1: boolean | null
boolean2: boolean | null
boolean3: boolean | null
- // Index signature for dynamic access
[key: string]: string | number | Date | boolean | null
}
-/**
- * Frontend/API Types
- * These types use string dates for JSON serialization
- */
+/** These types use string dates for JSON serialization */
-/** Extended chunking config with optional fields */
export interface ExtendedChunkingConfig extends ChunkingConfig {
chunkSize?: number
minCharactersPerChunk?: number
@@ -120,7 +101,6 @@ export interface ExtendedChunkingConfig extends ChunkingConfig {
[key: string]: unknown
}
-/** Knowledge base data for API responses */
export interface KnowledgeBaseData {
id: string
userId: string
@@ -137,7 +117,6 @@ export interface KnowledgeBaseData {
connectorTypes?: string[]
}
-/** Document data for API responses */
export interface DocumentData {
id: string
knowledgeBaseId: string
@@ -176,7 +155,6 @@ export interface DocumentData {
sourceUrl?: string | null
}
-/** Chunk data for API responses */
export interface ChunkData {
id: string
chunkIndex: number
@@ -207,7 +185,6 @@ export interface ChunkData {
updatedAt: string
}
-/** Pagination info for chunks */
export interface ChunksPagination {
total: number
limit: number
@@ -215,7 +192,6 @@ export interface ChunksPagination {
hasMore: boolean
}
-/** Pagination info for documents */
export interface DocumentsPagination {
total: number
limit: number
From 899fc682d94e19f8302a1514b5f5f2ccfde0bf6f Mon Sep 17 00:00:00 2001
From: Waleed Latif
Date: Fri, 10 Apr 2026 18:47:45 -0700
Subject: [PATCH 09/20] fix(chunkers): address PR review comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Fix regex fallback path: use sliding window for overlap instead of
passing chunkOverlap to buildChunks without prepended overlap text
- Fix misleading strategy label: "Text (hierarchical splitting)" →
"Text (word boundary splitting)"
---
.../components/create-base-modal/create-base-modal.tsx | 2 +-
apps/sim/lib/chunkers/regex-chunker.ts | 6 ++++--
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx
index d3fd1d21ceb..88fa73da2a8 100644
--- a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx
+++ b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx
@@ -40,7 +40,7 @@ interface CreateBaseModalProps {
const STRATEGY_OPTIONS = [
{ value: 'auto', label: 'Auto (detect from content)' },
- { value: 'text', label: 'Text (hierarchical splitting)' },
+ { value: 'text', label: 'Text (word boundary splitting)' },
{ value: 'recursive', label: 'Recursive (configurable separators)' },
{ value: 'sentence', label: 'Sentence' },
{ value: 'token', label: 'Token (fixed-size)' },
diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts
index 8bc4c5a9fbc..575a9686fbd 100644
--- a/apps/sim/lib/chunkers/regex-chunker.ts
+++ b/apps/sim/lib/chunkers/regex-chunker.ts
@@ -86,8 +86,10 @@ export class RegexChunker {
if (segments.length <= 1) {
logger.warn('Regex pattern did not produce any splits, falling back to character splitting')
const chunkSizeChars = tokensToChars(this.chunkSize)
- const chunks = splitAtWordBoundaries(cleaned, chunkSizeChars)
- return buildChunks(chunks, this.chunkOverlap)
+ const overlapChars = tokensToChars(this.chunkOverlap)
+ const stepChars = this.chunkOverlap > 0 ? chunkSizeChars - overlapChars : undefined
+ const chunks = splitAtWordBoundaries(cleaned, chunkSizeChars, stepChars)
+ return buildChunks(chunks, 0)
}
const merged = this.mergeSegments(segments)
From 4c3508b932e64ea4bf7548ca51669cb31efad131 Mon Sep 17 00:00:00 2001
From: Waleed Latif
Date: Fri, 10 Apr 2026 18:50:54 -0700
Subject: [PATCH 10/20] fix(chunkers): use consistent overlap pattern in regex
fallback
Use addOverlap + buildChunks(chunks, overlap) in the regex fallback
path to match the main path and all other chunkers (TextChunker,
RecursiveChunker). The sliding window approach was inconsistent.
---
apps/sim/lib/chunkers/regex-chunker.ts | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts
index 575a9686fbd..ac7ea17e722 100644
--- a/apps/sim/lib/chunkers/regex-chunker.ts
+++ b/apps/sim/lib/chunkers/regex-chunker.ts
@@ -86,10 +86,12 @@ export class RegexChunker {
if (segments.length <= 1) {
logger.warn('Regex pattern did not produce any splits, falling back to character splitting')
const chunkSizeChars = tokensToChars(this.chunkSize)
- const overlapChars = tokensToChars(this.chunkOverlap)
- const stepChars = this.chunkOverlap > 0 ? chunkSizeChars - overlapChars : undefined
- const chunks = splitAtWordBoundaries(cleaned, chunkSizeChars, stepChars)
- return buildChunks(chunks, 0)
+ let chunks = splitAtWordBoundaries(cleaned, chunkSizeChars)
+ if (this.chunkOverlap > 0) {
+ const overlapChars = tokensToChars(this.chunkOverlap)
+ chunks = addOverlap(chunks, overlapChars)
+ }
+ return buildChunks(chunks, this.chunkOverlap)
}
const merged = this.mergeSegments(segments)
From 3a26dad205e1f7a51aabab3be800202b908dfc53 Mon Sep 17 00:00:00 2001
From: Waleed Latif
Date: Fri, 10 Apr 2026 19:00:50 -0700
Subject: [PATCH 11/20] fix(chunkers): prevent content loss in word boundary
splitting
When splitAtWordBoundaries snaps end back to a word boundary, advance
pos from end (not pos + step) in non-overlapping mode. The step-based
advancement is preserved for the sliding window case (TokenChunker).
---
apps/sim/lib/chunkers/utils.ts | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/apps/sim/lib/chunkers/utils.ts b/apps/sim/lib/chunkers/utils.ts
index 11acdca03b4..dd49d2dac6c 100644
--- a/apps/sim/lib/chunkers/utils.ts
+++ b/apps/sim/lib/chunkers/utils.ts
@@ -60,7 +60,6 @@ export function splitAtWordBoundaries(
chunkSizeChars: number,
stepChars?: number
): string[] {
- const step = Math.max(1, stepChars ?? chunkSizeChars)
const parts: string[] = []
let pos = 0
@@ -79,9 +78,16 @@ export function splitAtWordBoundaries(
parts.push(part)
}
- const nextPos = pos + step
- if (nextPos >= text.length) break
- pos = nextPos
+ if (stepChars !== undefined) {
+ // Sliding window: advance by step for predictable overlap
+ const nextPos = pos + Math.max(1, stepChars)
+ if (nextPos >= text.length) break
+ pos = nextPos
+ } else {
+ // Non-overlapping: advance from end of extracted content
+ if (end >= text.length) break
+ pos = end
+ }
while (pos < text.length && text[pos] === ' ') pos++
}
From 5e8b0515c3f6fc228356dad4480dce9db71a9214 Mon Sep 17 00:00:00 2001
From: Waleed Latif
Date: Fri, 10 Apr 2026 19:13:11 -0700
Subject: [PATCH 12/20] fix(chunkers): restore structured data token ratio and
overlap joiner
- Restore /3 token estimation for StructuredDataChunker (structured data
is denser than prose, ~3 chars/token vs ~4)
- Change addOverlap joiner from \n to space to match original TextChunker
behavior
---
.../lib/chunkers/structured-data-chunker.ts | 18 +++++++++++-------
apps/sim/lib/chunkers/utils.test.ts | 8 +++-----
apps/sim/lib/chunkers/utils.ts | 2 +-
3 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/apps/sim/lib/chunkers/structured-data-chunker.ts b/apps/sim/lib/chunkers/structured-data-chunker.ts
index 82c24a40720..47a68d23c69 100644
--- a/apps/sim/lib/chunkers/structured-data-chunker.ts
+++ b/apps/sim/lib/chunkers/structured-data-chunker.ts
@@ -1,6 +1,10 @@
import { createLogger } from '@sim/logger'
import type { Chunk, StructuredDataOptions } from '@/lib/chunkers/types'
-import { estimateTokens } from '@/lib/chunkers/utils'
+/** Structured data is denser in tokens (~3 chars/token vs ~4 for prose) */
+function estimateStructuredTokens(text: string): number {
+ if (!text?.trim()) return 0
+ return Math.ceil(text.length / 3)
+}
const logger = createLogger('StructuredDataChunker')
@@ -28,7 +32,7 @@ export class StructuredDataChunker {
const headerLine = options.headers?.join('\t') || lines[0]
const dataStartIndex = options.headers ? 0 : 1
- const estimatedTokensPerRow = StructuredDataChunker.estimateTokensPerRow(
+ const estimatedTokensPerRow = StructuredDataChunker.estimateStructuredTokensPerRow(
lines.slice(dataStartIndex, Math.min(10, lines.length))
)
const optimalRowsPerChunk = StructuredDataChunker.calculateOptimalRowsPerChunk(
@@ -42,12 +46,12 @@ export class StructuredDataChunker {
let currentChunkRows: string[] = []
let currentTokenEstimate = 0
- const headerTokens = estimateTokens(headerLine)
+ const headerTokens = estimateStructuredTokens(headerLine)
let chunkStartRow = dataStartIndex
for (let i = dataStartIndex; i < lines.length; i++) {
const row = lines[i]
- const rowTokens = estimateTokens(row)
+ const rowTokens = estimateStructuredTokens(row)
const projectedTokens =
currentTokenEstimate +
@@ -111,7 +115,7 @@ export class StructuredDataChunker {
private static createChunk(content: string, startRow: number, endRow: number): Chunk {
return {
text: content,
- tokenCount: estimateTokens(content),
+ tokenCount: estimateStructuredTokens(content),
metadata: {
startIndex: startRow,
endIndex: endRow,
@@ -119,10 +123,10 @@ export class StructuredDataChunker {
}
}
- private static estimateTokensPerRow(sampleRows: string[]): number {
+ private static estimateStructuredTokensPerRow(sampleRows: string[]): number {
if (sampleRows.length === 0) return 50
- const totalTokens = sampleRows.reduce((sum, row) => sum + estimateTokens(row), 0)
+ const totalTokens = sampleRows.reduce((sum, row) => sum + estimateStructuredTokens(row), 0)
return Math.ceil(totalTokens / sampleRows.length)
}
diff --git a/apps/sim/lib/chunkers/utils.test.ts b/apps/sim/lib/chunkers/utils.test.ts
index 07f48149cd1..bc88bc0e46a 100644
--- a/apps/sim/lib/chunkers/utils.test.ts
+++ b/apps/sim/lib/chunkers/utils.test.ts
@@ -94,18 +94,16 @@ describe('addOverlap', () => {
expect(result[1].length).toBeGreaterThan('second chunk here'.length)
})
- it('joins overlap text with \\n', () => {
+ it('joins overlap text with space', () => {
const chunks = ['first chunk here', 'second chunk here']
const result = addOverlap(chunks, 10)
- expect(result[1]).toContain('\n')
+ expect(result[1]).toContain('here second')
})
it('snaps overlap to word boundary', () => {
const chunks = ['hello beautiful world', 'next chunk']
const result = addOverlap(chunks, 15)
- const overlapPart = result[1].split('\n')[0]
- expect(overlapPart).toBe('beautiful world')
- expect(result[1]).toBe('beautiful world\nnext chunk')
+ expect(result[1]).toBe('beautiful world next chunk')
})
})
diff --git a/apps/sim/lib/chunkers/utils.ts b/apps/sim/lib/chunkers/utils.ts
index dd49d2dac6c..ded68dbc192 100644
--- a/apps/sim/lib/chunkers/utils.ts
+++ b/apps/sim/lib/chunkers/utils.ts
@@ -41,7 +41,7 @@ export function addOverlap(chunks: string[], overlapChars: number): string[] {
: overlapText
if (cleanOverlap.trim()) {
- chunk = `${cleanOverlap.trim()}\n${chunk}`
+ chunk = `${cleanOverlap.trim()} ${chunk}`
}
}
From a53f760c9fd85678ad7deb63f4901b96cbb27c53 Mon Sep 17 00:00:00 2001
From: Waleed Latif
Date: Fri, 10 Apr 2026 19:17:06 -0700
Subject: [PATCH 13/20] lint
---
apps/sim/lib/chunkers/structured-data-chunker.ts | 1 +
1 file changed, 1 insertion(+)
diff --git a/apps/sim/lib/chunkers/structured-data-chunker.ts b/apps/sim/lib/chunkers/structured-data-chunker.ts
index 47a68d23c69..11d9e6c8979 100644
--- a/apps/sim/lib/chunkers/structured-data-chunker.ts
+++ b/apps/sim/lib/chunkers/structured-data-chunker.ts
@@ -1,5 +1,6 @@
import { createLogger } from '@sim/logger'
import type { Chunk, StructuredDataOptions } from '@/lib/chunkers/types'
+
/** Structured data is denser in tokens (~3 chars/token vs ~4 for prose) */
function estimateStructuredTokens(text: string): number {
if (!text?.trim()) return 0
From ec6fa58a4f02cb42f3234101fd0117d7731de204 Mon Sep 17 00:00:00 2001
From: Waleed Latif
Date: Fri, 10 Apr 2026 19:28:52 -0700
Subject: [PATCH 14/20] fix(chunkers): fall back to character-level overlap in
sentence chunker
When no complete sentence fits within the overlap budget,
fall back to character-level word-boundary overlap from the
previous group's text. This ensures buildChunks metadata is
always correct.
---
apps/sim/lib/chunkers/sentence-chunker.ts | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/apps/sim/lib/chunkers/sentence-chunker.ts b/apps/sim/lib/chunkers/sentence-chunker.ts
index 24aacd18acc..9671f10ddd0 100644
--- a/apps/sim/lib/chunkers/sentence-chunker.ts
+++ b/apps/sim/lib/chunkers/sentence-chunker.ts
@@ -119,10 +119,20 @@ export class SentenceChunker {
overlapLen += prevGroup[j].length
}
+ const currentText = groups[i].join(' ')
if (overlapSentences.length > 0) {
- result.push(`${overlapSentences.join(' ')} ${groups[i].join(' ')}`)
+ result.push(`${overlapSentences.join(' ')} ${currentText}`)
} else {
- result.push(groups[i].join(' '))
+ // No complete sentence fits — fall back to character-level overlap
+ const prevText = prevGroup.join(' ')
+ const tail = prevText.slice(-overlapChars)
+ const wordMatch = tail.match(/^\s*\S/)
+ const cleanTail = wordMatch ? tail.slice(tail.indexOf(wordMatch[0].trim())) : tail
+ if (cleanTail.trim()) {
+ result.push(`${cleanTail.trim()} ${currentText}`)
+ } else {
+ result.push(currentText)
+ }
}
}
From e391efa11d8c300c3accfb7fdffd6e4f164267a3 Mon Sep 17 00:00:00 2001
From: Waleed Latif
Date: Fri, 10 Apr 2026 19:37:08 -0700
Subject: [PATCH 15/20] fix(chunkers): fix log message and add missing month
abbreviations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Fix regex fallback log: "character splitting" → "word-boundary splitting"
- Add Jun and Jul to sentence chunker abbreviation list
---
apps/sim/lib/chunkers/regex-chunker.ts | 2 +-
apps/sim/lib/chunkers/sentence-chunker.ts | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts
index ac7ea17e722..a5478ad5c36 100644
--- a/apps/sim/lib/chunkers/regex-chunker.ts
+++ b/apps/sim/lib/chunkers/regex-chunker.ts
@@ -84,7 +84,7 @@ export class RegexChunker {
const segments = cleaned.split(this.regex).filter((s) => s.trim().length > 0)
if (segments.length <= 1) {
- logger.warn('Regex pattern did not produce any splits, falling back to character splitting')
+ logger.warn('Regex pattern did not produce any splits, falling back to word-boundary splitting')
const chunkSizeChars = tokensToChars(this.chunkSize)
let chunks = splitAtWordBoundaries(cleaned, chunkSizeChars)
if (this.chunkOverlap > 0) {
diff --git a/apps/sim/lib/chunkers/sentence-chunker.ts b/apps/sim/lib/chunkers/sentence-chunker.ts
index 9671f10ddd0..f8b92e6f22c 100644
--- a/apps/sim/lib/chunkers/sentence-chunker.ts
+++ b/apps/sim/lib/chunkers/sentence-chunker.ts
@@ -28,7 +28,7 @@ export class SentenceChunker {
private splitSentences(text: string): string[] {
return text
.split(
- /(? s.trim().length > 0)
}
From f7fe06af0a0b35d00600a2dd5646e5050835e141 Mon Sep 17 00:00:00 2001
From: Waleed Latif
Date: Fri, 10 Apr 2026 19:45:21 -0700
Subject: [PATCH 16/20] lint
---
apps/sim/lib/chunkers/regex-chunker.ts | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts
index a5478ad5c36..58c8cb16b91 100644
--- a/apps/sim/lib/chunkers/regex-chunker.ts
+++ b/apps/sim/lib/chunkers/regex-chunker.ts
@@ -84,7 +84,9 @@ export class RegexChunker {
const segments = cleaned.split(this.regex).filter((s) => s.trim().length > 0)
if (segments.length <= 1) {
- logger.warn('Regex pattern did not produce any splits, falling back to word-boundary splitting')
+ logger.warn(
+ 'Regex pattern did not produce any splits, falling back to word-boundary splitting'
+ )
const chunkSizeChars = tokensToChars(this.chunkSize)
let chunks = splitAtWordBoundaries(cleaned, chunkSizeChars)
if (this.chunkOverlap > 0) {
From 9c624db0ffbc84d1632fe7d0071188739e78ae55 Mon Sep 17 00:00:00 2001
From: Waleed Latif
Date: Fri, 10 Apr 2026 19:46:07 -0700
Subject: [PATCH 17/20] fix(chunkers): restore structured data detection
threshold to > 2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
avgCount >= 1 was too permissive — prose with consistent comma usage
would be misclassified as CSV. Restore original > 2 threshold while
keeping the improved proportional tolerance.
---
apps/sim/lib/chunkers/structured-data-chunker.ts | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/apps/sim/lib/chunkers/structured-data-chunker.ts b/apps/sim/lib/chunkers/structured-data-chunker.ts
index 11d9e6c8979..757e8b67fdb 100644
--- a/apps/sim/lib/chunkers/structured-data-chunker.ts
+++ b/apps/sim/lib/chunkers/structured-data-chunker.ts
@@ -166,7 +166,7 @@ export class StructuredDataChunker {
const avgCount = counts.reduce((a, b) => a + b, 0) / counts.length
const tolerance = Math.max(1, Math.ceil(avgCount * 0.2))
- if (avgCount >= 1 && counts.every((c) => Math.abs(c - avgCount) <= tolerance)) {
+ if (avgCount > 2 && counts.every((c) => Math.abs(c - avgCount) <= tolerance)) {
return true
}
}
From 4fd768513a8e81585ec238afe4dacca3337b9a5f Mon Sep 17 00:00:00 2001
From: Waleed Latif
Date: Fri, 10 Apr 2026 20:15:01 -0700
Subject: [PATCH 18/20] fix(chunkers): pass chunkOverlap to buildChunks in
TokenChunker
---
apps/sim/lib/chunkers/token-chunker.ts | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/apps/sim/lib/chunkers/token-chunker.ts b/apps/sim/lib/chunkers/token-chunker.ts
index 6f7bb555231..d98b4d1651a 100644
--- a/apps/sim/lib/chunkers/token-chunker.ts
+++ b/apps/sim/lib/chunkers/token-chunker.ts
@@ -49,6 +49,6 @@ export class TokenChunker {
const chunks = filtered.length > 0 ? filtered : rawChunks
logger.info(`Chunked into ${chunks.length} token-based chunks`)
- return buildChunks(chunks, 0)
+ return buildChunks(chunks, this.chunkOverlap)
}
}
From 97a0bd4d3c259c7b2220a79e314c49aa04de76bc Mon Sep 17 00:00:00 2001
From: Waleed Latif
Date: Fri, 10 Apr 2026 20:54:41 -0700
Subject: [PATCH 19/20] fix(chunkers): restore separator-as-joiner pattern in
splitRecursively
Separator was unconditionally prepended to parts after the first,
leaving leading punctuation on chunks after a boundary reset.
---
apps/sim/lib/chunkers/recursive-chunker.ts | 5 ++---
apps/sim/lib/chunkers/text-chunker.ts | 5 ++---
2 files changed, 4 insertions(+), 6 deletions(-)
diff --git a/apps/sim/lib/chunkers/recursive-chunker.ts b/apps/sim/lib/chunkers/recursive-chunker.ts
index 16b451e3e96..0dba2240987 100644
--- a/apps/sim/lib/chunkers/recursive-chunker.ts
+++ b/apps/sim/lib/chunkers/recursive-chunker.ts
@@ -97,9 +97,8 @@ export class RecursiveChunker {
const chunks: string[] = []
let currentChunk = ''
- for (let pi = 0; pi < parts.length; pi++) {
- const part = pi > 0 ? `${separator}${parts[pi]}` : parts[pi]
- const testChunk = currentChunk + part
+ for (const part of parts) {
+ const testChunk = currentChunk + (currentChunk ? separator : '') + part
if (estimateTokens(testChunk) <= this.chunkSize) {
currentChunk = testChunk
diff --git a/apps/sim/lib/chunkers/text-chunker.ts b/apps/sim/lib/chunkers/text-chunker.ts
index 7e9b5a064dd..eb993b609aa 100644
--- a/apps/sim/lib/chunkers/text-chunker.ts
+++ b/apps/sim/lib/chunkers/text-chunker.ts
@@ -61,9 +61,8 @@ export class TextChunker {
const chunks: string[] = []
let currentChunk = ''
- for (let pi = 0; pi < parts.length; pi++) {
- const part = pi > 0 ? `${separator}${parts[pi]}` : parts[pi]
- const testChunk = currentChunk + part
+ for (const part of parts) {
+ const testChunk = currentChunk + (currentChunk ? separator : '') + part
if (estimateTokens(testChunk) <= this.chunkSize) {
currentChunk = testChunk
From 2c5a8521a8882bf14199c42f2fac5df279faa347 Mon Sep 17 00:00:00 2001
From: Waleed Latif
Date: Fri, 10 Apr 2026 21:33:07 -0700
Subject: [PATCH 20/20] feat(knowledge): add JSONL file support for knowledge
base uploads
Parses JSON Lines files by splitting on newlines and converting to a
JSON array, which then flows through the existing JsonYamlChunker.
Co-Authored-By: Claude Opus 4.6
---
.../add-documents-modal.tsx | 3 +-
.../create-base-modal/create-base-modal.tsx | 3 +-
apps/sim/lib/file-parsers/index.ts | 13 +++++-
apps/sim/lib/file-parsers/json-parser.ts | 43 +++++++++++++++++++
apps/sim/lib/uploads/utils/file-utils.ts | 2 +-
apps/sim/lib/uploads/utils/validation.ts | 2 +
6 files changed, 61 insertions(+), 5 deletions(-)
diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx
index 5ddb7eb6a20..a731e38e0da 100644
--- a/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx
+++ b/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx
@@ -263,7 +263,8 @@ export function AddDocumentsModal({
{isDragging ? 'Drop files here' : 'Drop files here or click to browse'}
- PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML (max 100MB each)
+ PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSONL (max 100MB
+ each)
diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx
index 88fa73da2a8..e6884cc332d 100644
--- a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx
+++ b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx
@@ -541,7 +541,8 @@ export const CreateBaseModal = memo(function CreateBaseModal({
{isDragging ? 'Drop files here' : 'Drop files here or click to browse'}
- PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML (max 100MB each)
+ PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSONL (max 100MB
+ each)
diff --git a/apps/sim/lib/file-parsers/index.ts b/apps/sim/lib/file-parsers/index.ts
index a69a8abdf26..28080e54667 100644
--- a/apps/sim/lib/file-parsers/index.ts
+++ b/apps/sim/lib/file-parsers/index.ts
@@ -86,12 +86,21 @@ function getParserInstances(): Record