- {/* Hidden decoy fields to prevent browser autofill */}
+
+
+
setValue('strategy', value as FormValues['strategy'])}
+ dropdownWidth='trigger'
+ align='start'
+ />
+
+ Auto detects the best strategy based on file content type.
+
+
+
+ {strategyValue === 'regex' && (
+
+
+
+ {errors.regexPattern && (
+
+ {errors.regexPattern.message}
+
+ )}
+
+ Text will be split at each match of this regex pattern.
+
+
+ )}
+
+ {strategyValue === 'recursive' && (
+
+
+
+
+ Comma-separated list of delimiters in priority order. Leave empty for default
+ separators.
+
+
+ )}
+
diff --git a/apps/sim/hooks/queries/kb/knowledge.ts b/apps/sim/hooks/queries/kb/knowledge.ts
index 455d762ecab..e1d3343a57d 100644
--- a/apps/sim/hooks/queries/kb/knowledge.ts
+++ b/apps/sim/hooks/queries/kb/knowledge.ts
@@ -1,6 +1,7 @@
import { createLogger } from '@sim/logger'
import { keepPreviousData, useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
import { toast } from '@/components/emcn'
+import type { ChunkingStrategy, StrategyOptions } from '@/lib/chunkers/types'
import type {
ChunkData,
ChunksPagination,
@@ -338,10 +339,7 @@ export interface DocumentChunkSearchParams {
search: string
}
-/**
- * Fetches all chunks matching a search query by paginating through results.
- * This is used for search functionality where we need all matching chunks.
- */
+/** Paginates through all matching chunks rather than returning a single page. */
export async function fetchAllDocumentChunks(
{ knowledgeBaseId, documentId, search }: DocumentChunkSearchParams,
signal?: AbortSignal
@@ -376,10 +374,6 @@ export const serializeSearchParams = (params: DocumentChunkSearchParams) =>
search: params.search,
})
-/**
- * Hook to search for chunks in a document.
- * Fetches all matching chunks and returns them for client-side pagination.
- */
export function useDocumentChunkSearchQuery(
params: DocumentChunkSearchParams,
options?: {
@@ -707,6 +701,8 @@ export interface CreateKnowledgeBaseParams {
maxSize: number
minSize: number
overlap: number
+ strategy?: ChunkingStrategy
+ strategyOptions?: StrategyOptions
}
}
diff --git a/apps/sim/lib/chunkers/docs-chunker.ts b/apps/sim/lib/chunkers/docs-chunker.ts
index 8ec203b6501..ddfecc3ab19 100644
--- a/apps/sim/lib/chunkers/docs-chunker.ts
+++ b/apps/sim/lib/chunkers/docs-chunker.ts
@@ -3,12 +3,12 @@ import path from 'path'
import { createLogger } from '@sim/logger'
import { TextChunker } from '@/lib/chunkers/text-chunker'
import type { DocChunk, DocsChunkerOptions } from '@/lib/chunkers/types'
+import { estimateTokens } from '@/lib/chunkers/utils'
import { generateEmbeddings } from '@/lib/knowledge/embeddings'
interface HeaderInfo {
level: number
text: string
- slug?: string
anchor?: string
position?: number
}
@@ -21,25 +21,21 @@ interface Frontmatter {
const logger = createLogger('DocsChunker')
-/**
- * Docs-specific chunker that processes .mdx files and tracks header context
- */
export class DocsChunker {
private readonly textChunker: TextChunker
private readonly baseUrl: string
+ private readonly chunkSize: number
constructor(options: DocsChunkerOptions = {}) {
+ this.chunkSize = options.chunkSize ?? 300
this.textChunker = new TextChunker({
- chunkSize: options.chunkSize ?? 300, // Max 300 tokens per chunk
+ chunkSize: this.chunkSize,
minCharactersPerChunk: options.minCharactersPerChunk ?? 1,
chunkOverlap: options.chunkOverlap ?? 50,
})
this.baseUrl = options.baseUrl ?? 'https://docs.sim.ai'
}
- /**
- * Process all .mdx files in the docs directory
- */
async chunkAllDocs(docsPath: string): Promise
{
const allChunks: DocChunk[] = []
@@ -65,20 +61,17 @@ export class DocsChunker {
}
}
- /**
- * Process a single .mdx file
- */
async chunkMdxFile(filePath: string, basePath: string): Promise {
const content = await fs.readFile(filePath, 'utf-8')
const relativePath = path.relative(basePath, filePath)
const { data: frontmatter, content: markdownContent } = this.parseFrontmatter(content)
- const headers = this.extractHeaders(markdownContent)
-
const documentUrl = this.generateDocumentUrl(relativePath)
- const textChunks = await this.splitContent(markdownContent)
+ const { chunks: textChunks, cleanedContent } = await this.splitContent(markdownContent)
+
+ const headers = this.extractHeaders(cleanedContent)
logger.info(`Generating embeddings for ${textChunks.length} chunks in ${relativePath}`)
const embeddings: number[][] =
@@ -97,7 +90,7 @@ export class DocsChunker {
const chunk: DocChunk = {
text: chunkText,
- tokenCount: Math.ceil(chunkText.length / 4), // Simple token estimation
+ tokenCount: estimateTokens(chunkText),
sourceDocument: relativePath,
headerLink: relevantHeader ? `${documentUrl}#${relevantHeader.anchor}` : documentUrl,
headerText: relevantHeader?.text || frontmatter.title || 'Document Root',
@@ -118,9 +111,6 @@ export class DocsChunker {
return chunks
}
- /**
- * Find all .mdx files recursively
- */
private async findMdxFiles(dirPath: string): Promise {
const files: string[] = []
@@ -140,9 +130,6 @@ export class DocsChunker {
return files
}
- /**
- * Extract headers and their positions from markdown content
- */
private extractHeaders(content: string): HeaderInfo[] {
const headers: HeaderInfo[] = []
const headerRegex = /^(#{1,6})\s+(.+)$/gm
@@ -164,42 +151,28 @@ export class DocsChunker {
return headers
}
- /**
- * Generate URL-safe anchor from header text
- */
private generateAnchor(headerText: string): string {
return headerText
.toLowerCase()
- .replace(/[^\w\s-]/g, '') // Remove special characters except hyphens
- .replace(/\s+/g, '-') // Replace spaces with hyphens
- .replace(/-+/g, '-') // Replace multiple hyphens with single
- .replace(/^-|-$/g, '') // Remove leading/trailing hyphens
+ .replace(/[^\w\s-]/g, '')
+ .replace(/\s+/g, '-')
+ .replace(/-+/g, '-')
+ .replace(/^-|-$/g, '')
}
- /**
- * Generate document URL from relative path
- * Handles index.mdx files specially - they are served at the parent directory path
- */
+ /** index.mdx files are served at the parent directory path */
private generateDocumentUrl(relativePath: string): string {
- // Convert file path to URL path
- // e.g., "tools/knowledge.mdx" -> "/tools/knowledge"
- // e.g., "triggers/index.mdx" -> "/triggers" (NOT "/triggers/index")
- let urlPath = relativePath.replace(/\.mdx$/, '').replace(/\\/g, '/') // Handle Windows paths
+ let urlPath = relativePath.replace(/\.mdx$/, '').replace(/\\/g, '/')
- // In fumadocs, index.mdx files are served at the parent directory path
- // e.g., "triggers/index" -> "triggers"
if (urlPath.endsWith('/index')) {
- urlPath = urlPath.slice(0, -6) // Remove "/index"
+ urlPath = urlPath.slice(0, -6)
} else if (urlPath === 'index') {
- urlPath = '' // Root index.mdx
+ urlPath = ''
}
return `${this.baseUrl}/${urlPath}`
}
- /**
- * Find the most relevant header for a given position
- */
private findRelevantHeader(headers: HeaderInfo[], position: number): HeaderInfo | null {
if (headers.length === 0) return null
@@ -216,10 +189,10 @@ export class DocsChunker {
return relevantHeader
}
- /**
- * Split content into chunks using the existing TextChunker with table awareness
- */
- private async splitContent(content: string): Promise {
+ /** Returns both chunks and cleaned content so header extraction uses aligned positions. */
+ private async splitContent(
+ content: string
+ ): Promise<{ chunks: string[]; cleanedContent: string }> {
const cleanedContent = this.cleanContent(content)
const tableBoundaries = this.detectTableBoundaries(cleanedContent)
@@ -234,30 +207,23 @@ export class DocsChunker {
const finalChunks = this.enforceSizeLimit(processedChunks)
- return finalChunks
+ return { chunks: finalChunks, cleanedContent }
}
- /**
- * Clean content by removing MDX-specific elements and excessive whitespace
- */
private cleanContent(content: string): string {
- return (
- content
- // Remove import statements
- .replace(/^import\s+.*$/gm, '')
- // Remove JSX components and React-style comments
- .replace(/<[^>]+>/g, ' ')
- .replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
- // Remove excessive whitespace
- .replace(/\n{3,}/g, '\n\n')
- .replace(/[ \t]{2,}/g, ' ')
- .trim()
- )
+ return content
+ .replace(/\r\n/g, '\n')
+ .replace(/\r/g, '\n')
+ .replace(/^import\s+.*$/gm, '')
+ .replace(/^export\s+.*$/gm, '')
+ .replace(/<\/?[a-zA-Z][^>]*>/g, ' ')
+ .replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ')
+ .replace(/\{[^{}]*\}/g, ' ')
+ .replace(/\n{3,}/g, '\n\n')
+ .replace(/[ \t]{2,}/g, ' ')
+ .trim()
}
- /**
- * Parse frontmatter from MDX content
- */
private parseFrontmatter(content: string): { data: Frontmatter; content: string } {
const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/
const match = content.match(frontmatterRegex)
@@ -285,26 +251,25 @@ export class DocsChunker {
return { data, content: markdownContent }
}
- /**
- * Estimate token count (rough approximation)
- */
- private estimateTokens(text: string): number {
- return Math.ceil(text.length / 4)
- }
-
- /**
- * Detect table boundaries in markdown content to avoid splitting them
- */
+ /** Detects table boundaries to avoid splitting tables across chunks. */
private detectTableBoundaries(content: string): { start: number; end: number }[] {
const tables: { start: number; end: number }[] = []
const lines = content.split('\n')
let inTable = false
+ let inCodeBlock = false
let tableStart = -1
for (let i = 0; i < lines.length; i++) {
const line = lines[i].trim()
+ if (line.startsWith('```')) {
+ inCodeBlock = !inCodeBlock
+ continue
+ }
+
+ if (inCodeBlock) continue
+
if (line.includes('|') && line.split('|').length >= 3 && !inTable) {
const nextLine = lines[i + 1]?.trim()
if (nextLine?.includes('|') && nextLine.includes('-')) {
@@ -314,7 +279,7 @@ export class DocsChunker {
} else if (inTable && (!line.includes('|') || line === '' || line.startsWith('#'))) {
tables.push({
start: this.getCharacterPosition(lines, tableStart),
- end: this.getCharacterPosition(lines, i - 1) + lines[i - 1]?.length || 0,
+ end: this.getCharacterPosition(lines, i - 1) + (lines[i - 1]?.length ?? 0),
})
inTable = false
}
@@ -330,16 +295,10 @@ export class DocsChunker {
return tables
}
- /**
- * Get character position from line number
- */
private getCharacterPosition(lines: string[], lineIndex: number): number {
return lines.slice(0, lineIndex).reduce((acc, line) => acc + line.length + 1, 0)
}
- /**
- * Merge chunks that would split tables
- */
private mergeTableChunks(
chunks: string[],
tableBoundaries: { start: number; end: number }[],
@@ -354,6 +313,10 @@ export class DocsChunker {
for (const chunk of chunks) {
const chunkStart = originalContent.indexOf(chunk, currentPosition)
+ if (chunkStart === -1) {
+ mergedChunks.push(chunk)
+ continue
+ }
const chunkEnd = chunkStart + chunk.length
const intersectsTable = tableBoundaries.some(
@@ -373,10 +336,10 @@ export class DocsChunker {
const minStart = Math.min(chunkStart, ...affectedTables.map((t) => t.start))
const maxEnd = Math.max(chunkEnd, ...affectedTables.map((t) => t.end))
- const completeChunk = originalContent.slice(minStart, maxEnd)
+ const completeChunk = originalContent.slice(minStart, maxEnd).trim()
- if (!mergedChunks.some((existing) => existing.includes(completeChunk.trim()))) {
- mergedChunks.push(completeChunk.trim())
+ if (completeChunk && !mergedChunks.some((existing) => existing === completeChunk)) {
+ mergedChunks.push(completeChunk)
}
} else {
mergedChunks.push(chunk)
@@ -388,16 +351,13 @@ export class DocsChunker {
return mergedChunks.filter((chunk) => chunk.length > 50)
}
- /**
- * Enforce 300 token size limit on chunks
- */
private enforceSizeLimit(chunks: string[]): string[] {
const finalChunks: string[] = []
for (const chunk of chunks) {
- const tokens = this.estimateTokens(chunk)
+ const tokens = estimateTokens(chunk)
- if (tokens <= 300) {
+ if (tokens <= this.chunkSize) {
finalChunks.push(chunk)
} else {
const lines = chunk.split('\n')
@@ -406,7 +366,7 @@ export class DocsChunker {
for (const line of lines) {
const testChunk = currentChunk ? `${currentChunk}\n${line}` : line
- if (this.estimateTokens(testChunk) <= 300) {
+ if (estimateTokens(testChunk) <= this.chunkSize) {
currentChunk = testChunk
} else {
if (currentChunk.trim()) {
diff --git a/apps/sim/lib/chunkers/index.ts b/apps/sim/lib/chunkers/index.ts
index 403e75a20d1..2e4595b5ea0 100644
--- a/apps/sim/lib/chunkers/index.ts
+++ b/apps/sim/lib/chunkers/index.ts
@@ -1,5 +1,9 @@
export { DocsChunker } from './docs-chunker'
export { JsonYamlChunker } from './json-yaml-chunker'
+export { RecursiveChunker } from './recursive-chunker'
+export { RegexChunker } from './regex-chunker'
+export { SentenceChunker } from './sentence-chunker'
export { StructuredDataChunker } from './structured-data-chunker'
export { TextChunker } from './text-chunker'
+export { TokenChunker } from './token-chunker'
export * from './types'
diff --git a/apps/sim/lib/chunkers/json-yaml-chunker.test.ts b/apps/sim/lib/chunkers/json-yaml-chunker.test.ts
index 0568c8eff93..251b50daeaa 100644
--- a/apps/sim/lib/chunkers/json-yaml-chunker.test.ts
+++ b/apps/sim/lib/chunkers/json-yaml-chunker.test.ts
@@ -30,14 +30,11 @@ describe('JsonYamlChunker', () => {
expect(JsonYamlChunker.isStructuredData('key: value\nother: data')).toBe(true)
})
- it('should return true for YAML-like plain text', () => {
- // Note: js-yaml is permissive and parses plain text as valid YAML (scalar value)
- // This is expected behavior of the YAML parser
- expect(JsonYamlChunker.isStructuredData('Hello, this is plain text.')).toBe(true)
+ it('should return false for plain text parsed as YAML scalar', () => {
+ expect(JsonYamlChunker.isStructuredData('Hello, this is plain text.')).toBe(false)
})
it('should return false for invalid JSON/YAML with unbalanced braces', () => {
- // Only truly malformed content that fails YAML parsing returns false
expect(JsonYamlChunker.isStructuredData('{invalid: json: content: {{')).toBe(false)
})
@@ -61,7 +58,6 @@ describe('JsonYamlChunker', () => {
const json = '{}'
const chunks = await chunker.chunk(json)
- // Empty object is valid JSON, should return at least metadata
expect(chunks.length).toBeGreaterThanOrEqual(0)
})
@@ -204,7 +200,6 @@ server:
const json = '[]'
const chunks = await chunker.chunk(json)
- // Empty array should not produce chunks with meaningful content
expect(chunks.length).toBeGreaterThanOrEqual(0)
})
@@ -272,7 +267,6 @@ server:
it.concurrent('should fall back to text chunking for invalid JSON', async () => {
const chunker = new JsonYamlChunker({ chunkSize: 100, minCharactersPerChunk: 10 })
- // Create content that fails YAML parsing and is long enough to produce chunks
const invalidJson = `{this is not valid json: content: {{${' more content here '.repeat(10)}`
const chunks = await chunker.chunk(invalidJson)
@@ -377,9 +371,7 @@ server:
const json = JSON.stringify({ a: 1, b: 2, c: 3 })
const chunks = await chunker.chunk(json)
- // Should produce chunks that are valid
expect(chunks.length).toBeGreaterThan(0)
- // The entire small object fits in one chunk
expect(chunks[0].text.length).toBeGreaterThan(0)
})
})
diff --git a/apps/sim/lib/chunkers/json-yaml-chunker.ts b/apps/sim/lib/chunkers/json-yaml-chunker.ts
index 458f8d3e8cb..d18cd0859f9 100644
--- a/apps/sim/lib/chunkers/json-yaml-chunker.ts
+++ b/apps/sim/lib/chunkers/json-yaml-chunker.ts
@@ -1,8 +1,7 @@
import { createLogger } from '@sim/logger'
import * as yaml from 'js-yaml'
import type { Chunk, ChunkerOptions } from '@/lib/chunkers/types'
-import { getAccurateTokenCount } from '@/lib/tokenization'
-import { estimateTokenCount } from '@/lib/tokenization/estimators'
+import { estimateTokens } from '@/lib/chunkers/utils'
const logger = createLogger('JsonYamlChunker')
@@ -11,57 +10,31 @@ type JsonValue = JsonPrimitive | JsonObject | JsonArray
type JsonObject = { [key: string]: JsonValue }
type JsonArray = JsonValue[]
-function getTokenCount(text: string): number {
- try {
- return getAccurateTokenCount(text, 'text-embedding-3-small')
- } catch (error) {
- logger.warn('Tiktoken failed, falling back to estimation')
- const estimate = estimateTokenCount(text)
- return estimate.count
- }
-}
-
-/**
- * Configuration for JSON/YAML chunking
- * Reduced limits to ensure we stay well under OpenAI's 8,191 token limit per embedding request
- */
-const JSON_YAML_CHUNKING_CONFIG = {
- TARGET_CHUNK_SIZE: 1024, // Target tokens per chunk
- MIN_CHARACTERS_PER_CHUNK: 100, // Minimum characters per chunk to filter tiny fragments
- MAX_CHUNK_SIZE: 1500, // Maximum tokens per chunk
- MAX_DEPTH_FOR_SPLITTING: 5, // Maximum depth to traverse for splitting
-}
+const MAX_DEPTH = 5
export class JsonYamlChunker {
- private chunkSize: number // in tokens
- private minCharactersPerChunk: number // in characters
+ private chunkSize: number
+ private minCharactersPerChunk: number
constructor(options: ChunkerOptions = {}) {
- this.chunkSize = options.chunkSize ?? JSON_YAML_CHUNKING_CONFIG.TARGET_CHUNK_SIZE
- this.minCharactersPerChunk =
- options.minCharactersPerChunk ?? JSON_YAML_CHUNKING_CONFIG.MIN_CHARACTERS_PER_CHUNK
+ this.chunkSize = options.chunkSize ?? 1024
+ this.minCharactersPerChunk = options.minCharactersPerChunk ?? 100
}
- /**
- * Check if content is structured JSON/YAML data
- */
static isStructuredData(content: string): boolean {
try {
- JSON.parse(content)
- return true
+ const parsed = JSON.parse(content)
+ return typeof parsed === 'object' && parsed !== null
} catch {
try {
- yaml.load(content)
- return true
+ const parsed = yaml.load(content)
+ return typeof parsed === 'object' && parsed !== null
} catch {
return false
}
}
}
- /**
- * Chunk JSON/YAML content intelligently based on structure
- */
async chunk(content: string): Promise {
try {
let data: JsonValue
@@ -70,16 +43,10 @@ export class JsonYamlChunker {
} catch {
data = yaml.load(content) as JsonValue
}
- const chunks = this.chunkStructuredData(data)
+ const chunks = this.chunkStructuredData(data, [], 0)
- const tokenCounts = chunks.map((c) => c.tokenCount)
- const totalTokens = tokenCounts.reduce((a, b) => a + b, 0)
- const maxTokens = Math.max(...tokenCounts)
- const avgTokens = Math.round(totalTokens / chunks.length)
-
- logger.info(
- `JSON chunking complete: ${chunks.length} chunks, ${totalTokens} total tokens (avg: ${avgTokens}, max: ${maxTokens})`
- )
+ const totalTokens = chunks.reduce((sum, c) => sum + c.tokenCount, 0)
+ logger.info(`JSON chunking complete: ${chunks.length} chunks, ${totalTokens} total tokens`)
return chunks
} catch (error) {
@@ -88,42 +55,38 @@ export class JsonYamlChunker {
}
}
- /**
- * Chunk structured data based on its structure
- */
- private chunkStructuredData(data: JsonValue, path: string[] = []): Chunk[] {
- const chunks: Chunk[] = []
-
+ private chunkStructuredData(data: JsonValue, path: string[], depth: number): Chunk[] {
if (Array.isArray(data)) {
- return this.chunkArray(data, path)
+ return this.chunkArray(data, path, depth)
}
if (typeof data === 'object' && data !== null) {
- return this.chunkObject(data as JsonObject, path)
+ return this.chunkObject(data as JsonObject, path, depth)
}
const content = JSON.stringify(data, null, 2)
- const tokenCount = getTokenCount(content)
+ const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : ''
+ const contentTokens = estimateTokens(content)
- // Filter tiny fragments using character count
- if (content.length >= this.minCharactersPerChunk) {
- chunks.push({
- text: content,
- tokenCount,
- metadata: {
- startIndex: 0,
- endIndex: content.length,
- },
- })
+ if (contentTokens > this.chunkSize) {
+ return this.chunkAsText(contextHeader + content)
}
- return chunks
+ if (content.length < this.minCharactersPerChunk) {
+ return []
+ }
+
+ const text = contextHeader + content
+ return [
+ {
+ text,
+ tokenCount: estimateTokens(text),
+ metadata: { startIndex: 0, endIndex: text.length },
+ },
+ ]
}
- /**
- * Chunk an array intelligently
- */
- private chunkArray(arr: JsonArray, path: string[]): Chunk[] {
+ private chunkArray(arr: JsonArray, path: string[], depth: number): Chunk[] {
const chunks: Chunk[] = []
let currentBatch: JsonValue[] = []
let currentTokens = 0
@@ -133,46 +96,30 @@ export class JsonYamlChunker {
for (let i = 0; i < arr.length; i++) {
const item = arr[i]
const itemStr = JSON.stringify(item, null, 2)
- const itemTokens = getTokenCount(itemStr)
+ const itemTokens = estimateTokens(itemStr)
if (itemTokens > this.chunkSize) {
if (currentBatch.length > 0) {
- const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2)
- chunks.push({
- text: batchContent,
- tokenCount: getTokenCount(batchContent),
- metadata: {
- startIndex: i - currentBatch.length,
- endIndex: i - 1,
- },
- })
+ chunks.push(
+ this.buildBatchChunk(contextHeader, currentBatch, i - currentBatch.length, i - 1)
+ )
currentBatch = []
currentTokens = 0
}
- if (typeof item === 'object' && item !== null) {
- const subChunks = this.chunkStructuredData(item, [...path, `[${i}]`])
- chunks.push(...subChunks)
+ if (depth < MAX_DEPTH && typeof item === 'object' && item !== null) {
+ chunks.push(...this.chunkStructuredData(item, [...path, `[${i}]`], depth + 1))
} else {
chunks.push({
text: contextHeader + itemStr,
tokenCount: itemTokens,
- metadata: {
- startIndex: i,
- endIndex: i,
- },
+ metadata: { startIndex: i, endIndex: i },
})
}
} else if (currentTokens + itemTokens > this.chunkSize && currentBatch.length > 0) {
- const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2)
- chunks.push({
- text: batchContent,
- tokenCount: getTokenCount(batchContent),
- metadata: {
- startIndex: i - currentBatch.length,
- endIndex: i - 1,
- },
- })
+ chunks.push(
+ this.buildBatchChunk(contextHeader, currentBatch, i - currentBatch.length, i - 1)
+ )
currentBatch = [item]
currentTokens = itemTokens
} else {
@@ -182,121 +129,112 @@ export class JsonYamlChunker {
}
if (currentBatch.length > 0) {
- const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2)
- chunks.push({
- text: batchContent,
- tokenCount: getTokenCount(batchContent),
- metadata: {
- startIndex: arr.length - currentBatch.length,
- endIndex: arr.length - 1,
- },
- })
+ chunks.push(
+ this.buildBatchChunk(
+ contextHeader,
+ currentBatch,
+ arr.length - currentBatch.length,
+ arr.length - 1
+ )
+ )
}
return chunks
}
- /**
- * Chunk an object intelligently
- */
- private chunkObject(obj: JsonObject, path: string[]): Chunk[] {
+ private chunkObject(obj: JsonObject, path: string[], depth: number): Chunk[] {
const chunks: Chunk[] = []
const entries = Object.entries(obj)
const fullContent = JSON.stringify(obj, null, 2)
- const fullTokens = getTokenCount(fullContent)
+ const fullTokens = estimateTokens(fullContent)
if (fullTokens <= this.chunkSize) {
- chunks.push({
- text: fullContent,
- tokenCount: fullTokens,
- metadata: {
- startIndex: 0,
- endIndex: fullContent.length,
+ const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : ''
+ const text = contextHeader + fullContent
+ return [
+ {
+ text,
+ tokenCount: estimateTokens(text),
+ metadata: { startIndex: 0, endIndex: text.length },
},
- })
- return chunks
+ ]
}
+ const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : ''
let currentObj: JsonObject = {}
let currentTokens = 0
- let currentKeys: string[] = []
for (const [key, value] of entries) {
const valueStr = JSON.stringify({ [key]: value }, null, 2)
- const valueTokens = getTokenCount(valueStr)
+ const valueTokens = estimateTokens(valueStr)
if (valueTokens > this.chunkSize) {
if (Object.keys(currentObj).length > 0) {
- const objContent = JSON.stringify(currentObj, null, 2)
+ const objContent = contextHeader + JSON.stringify(currentObj, null, 2)
chunks.push({
text: objContent,
- tokenCount: getTokenCount(objContent),
- metadata: {
- startIndex: 0,
- endIndex: objContent.length,
- },
+ tokenCount: estimateTokens(objContent),
+ metadata: { startIndex: 0, endIndex: objContent.length },
})
currentObj = {}
currentTokens = 0
- currentKeys = []
}
- if (typeof value === 'object' && value !== null) {
- const subChunks = this.chunkStructuredData(value, [...path, key])
- chunks.push(...subChunks)
+ if (depth < MAX_DEPTH && typeof value === 'object' && value !== null) {
+ chunks.push(...this.chunkStructuredData(value, [...path, key], depth + 1))
} else {
chunks.push({
- text: valueStr,
+ text: contextHeader + valueStr,
tokenCount: valueTokens,
- metadata: {
- startIndex: 0,
- endIndex: valueStr.length,
- },
+ metadata: { startIndex: 0, endIndex: valueStr.length },
})
}
} else if (
currentTokens + valueTokens > this.chunkSize &&
Object.keys(currentObj).length > 0
) {
- const objContent = JSON.stringify(currentObj, null, 2)
+ const objContent = contextHeader + JSON.stringify(currentObj, null, 2)
chunks.push({
text: objContent,
- tokenCount: getTokenCount(objContent),
- metadata: {
- startIndex: 0,
- endIndex: objContent.length,
- },
+ tokenCount: estimateTokens(objContent),
+ metadata: { startIndex: 0, endIndex: objContent.length },
})
currentObj = { [key]: value }
currentTokens = valueTokens
- currentKeys = [key]
} else {
currentObj[key] = value
currentTokens += valueTokens
- currentKeys.push(key)
}
}
if (Object.keys(currentObj).length > 0) {
- const objContent = JSON.stringify(currentObj, null, 2)
+ const objContent = contextHeader + JSON.stringify(currentObj, null, 2)
chunks.push({
text: objContent,
- tokenCount: getTokenCount(objContent),
- metadata: {
- startIndex: 0,
- endIndex: objContent.length,
- },
+ tokenCount: estimateTokens(objContent),
+ metadata: { startIndex: 0, endIndex: objContent.length },
})
}
return chunks
}
- /**
- * Fall back to text chunking if JSON parsing fails
- */
- private async chunkAsText(content: string): Promise {
+ private buildBatchChunk(
+ contextHeader: string,
+ batch: JsonValue[],
+ startIdx: number,
+ endIdx: number
+ ): Chunk {
+ const batchContent = contextHeader + JSON.stringify(batch, null, 2)
+ return {
+ text: batchContent,
+ tokenCount: estimateTokens(batchContent),
+ metadata: { startIndex: startIdx, endIndex: endIdx },
+ }
+ }
+
+ private chunkAsText(content: string): Chunk[] {
const chunks: Chunk[] = []
const lines = content.split('\n')
let currentChunk = ''
@@ -304,16 +242,13 @@ export class JsonYamlChunker {
let startIndex = 0
for (const line of lines) {
- const lineTokens = getTokenCount(line)
+ const lineTokens = estimateTokens(line)
if (currentTokens + lineTokens > this.chunkSize && currentChunk) {
chunks.push({
text: currentChunk,
tokenCount: currentTokens,
- metadata: {
- startIndex,
- endIndex: startIndex + currentChunk.length,
- },
+ metadata: { startIndex, endIndex: startIndex + currentChunk.length },
})
startIndex += currentChunk.length + 1
@@ -325,24 +260,17 @@ export class JsonYamlChunker {
}
}
- // Filter tiny fragments using character count
if (currentChunk && currentChunk.length >= this.minCharactersPerChunk) {
chunks.push({
text: currentChunk,
tokenCount: currentTokens,
- metadata: {
- startIndex,
- endIndex: startIndex + currentChunk.length,
- },
+ metadata: { startIndex, endIndex: startIndex + currentChunk.length },
})
}
return chunks
}
- /**
- * Static method for chunking JSON/YAML data with default options
- */
static async chunkJsonYaml(content: string, options: ChunkerOptions = {}): Promise {
const chunker = new JsonYamlChunker(options)
return chunker.chunk(content)
diff --git a/apps/sim/lib/chunkers/recursive-chunker.test.ts b/apps/sim/lib/chunkers/recursive-chunker.test.ts
new file mode 100644
index 00000000000..846267034cf
--- /dev/null
+++ b/apps/sim/lib/chunkers/recursive-chunker.test.ts
@@ -0,0 +1,275 @@
+/**
+ * @vitest-environment node
+ */
+
+import { loggerMock } from '@sim/testing'
+import { describe, expect, it, vi } from 'vitest'
+import { RecursiveChunker } from './recursive-chunker'
+
+vi.mock('@sim/logger', () => loggerMock)
+
+describe('RecursiveChunker', () => {
+ describe('empty and whitespace input', () => {
+ it.concurrent('should return empty array for empty string', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 100 })
+ const chunks = await chunker.chunk('')
+ expect(chunks).toEqual([])
+ })
+
+ it.concurrent('should return empty array for whitespace-only input', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 100 })
+ const chunks = await chunker.chunk(' \n\n\t ')
+ expect(chunks).toEqual([])
+ })
+ })
+
+ describe('small content', () => {
+ it.concurrent('should return single chunk when content fits in one chunk', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 100 })
+ const text = 'This is a short text.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks).toHaveLength(1)
+ expect(chunks[0].text).toBe(text)
+ })
+ })
+
+ describe('paragraph splitting', () => {
+ it.concurrent('should split at paragraph boundaries first', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 20 })
+ const text =
+ 'First paragraph with enough content to matter.\n\nSecond paragraph with enough content to matter.\n\nThird paragraph with enough content here.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ })
+ })
+
+ describe('line splitting fallback', () => {
+ it.concurrent('should split at newlines when paragraphs are too large', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 15 })
+ const text =
+ 'Line one with content here.\nLine two with content here.\nLine three with content here.\nLine four with content here.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ })
+ })
+
+ describe('sentence splitting fallback', () => {
+ it.concurrent('should split at sentence boundaries when lines are too large', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 10 })
+ const text =
+ 'First sentence here. Second sentence here. Third sentence here. Fourth sentence here.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ })
+ })
+
+ describe('word splitting fallback', () => {
+ it.concurrent('should split at spaces when sentences are too large', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 5 })
+ const text = 'word1 word2 word3 word4 word5 word6 word7 word8 word9 word10'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ })
+ })
+
+ describe('keep_separator behavior', () => {
+ it.concurrent('should prepend separator to subsequent chunks', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 15 })
+ const text =
+ 'First paragraph content here.\n\nSecond paragraph content here.\n\nThird paragraph content here.'
+ const chunks = await chunker.chunk(text)
+
+ if (chunks.length > 1) {
+ expect(chunks[1].text.startsWith('\n\n') || chunks[1].text.length > 0).toBe(true)
+ }
+ })
+ })
+
+ describe('custom separators', () => {
+ it.concurrent('should use custom separators instead of default recipe', async () => {
+ const chunker = new RecursiveChunker({
+ chunkSize: 15,
+ separators: ['---', '\n'],
+ })
+ const text =
+ 'Section one content here with words.---Section two content here with words.---Section three content here.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ })
+ })
+
+ describe('recipe: plain', () => {
+ it.concurrent('should use plain recipe by default', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 20 })
+ const text =
+ 'First paragraph with enough words to exceed the chunk size limit.\n\nSecond paragraph with enough words to exceed the chunk size limit.\n\nThird paragraph with enough words to exceed the chunk size limit.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ })
+ })
+
+ describe('recipe: markdown', () => {
+ it.concurrent('should split at heading boundaries for markdown content', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 20, recipe: 'markdown' })
+ const text =
+ '\n# Title\n\nParagraph content under the title goes here.\n\n## Subtitle\n\nMore text content under the subtitle goes here.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ })
+
+ it.concurrent('should handle markdown horizontal rules', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 20, recipe: 'markdown' })
+ const text =
+ 'Section one content here.\n---\nSection two content here.\n---\nSection three content here.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(0)
+ })
+ })
+
+ describe('recipe: code', () => {
+ it.concurrent('should split on function and class boundaries', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 20, recipe: 'code' })
+ const text = [
+ 'const x = 1;',
+ 'function hello() {',
+ ' return "hello";',
+ '}',
+ 'function world() {',
+ ' return "world";',
+ '}',
+ 'class MyClass {',
+ ' constructor() {}',
+ ' method() { return true; }',
+ '}',
+ ].join('\n')
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ })
+ })
+
+ describe('chunk size respected', () => {
+ it.concurrent('should not exceed chunk size in tokens', async () => {
+ const chunkSize = 30
+ const chunker = new RecursiveChunker({ chunkSize })
+ const text = 'This is a test sentence with content. '.repeat(30)
+ const chunks = await chunker.chunk(text)
+
+ for (const chunk of chunks) {
+ expect(chunk.tokenCount).toBeLessThanOrEqual(chunkSize + 5)
+ }
+ })
+ })
+
+ describe('overlap', () => {
+ it.concurrent('should share text between consecutive chunks when overlap is set', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 20, chunkOverlap: 5 })
+ const text =
+ 'First paragraph with some content here.\n\nSecond paragraph with different content here.\n\nThird paragraph with more content here.'
+ const chunks = await chunker.chunk(text)
+
+ if (chunks.length > 1) {
+ expect(chunks[1].text.length).toBeGreaterThan(0)
+ }
+ })
+
+ it.concurrent('should not add overlap when overlap is 0', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 20, chunkOverlap: 0 })
+ const text =
+ 'First sentence content here. Second sentence content here. Third sentence content here.'
+ const chunks = await chunker.chunk(text)
+
+ if (chunks.length > 1) {
+ const firstChunkEnd = chunks[0].text.slice(-10)
+ expect(chunks[1].text.startsWith(firstChunkEnd)).toBe(false)
+ }
+ })
+ })
+
+ describe('chunk metadata', () => {
+ it.concurrent('should include text, tokenCount, and metadata fields', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 100 })
+ const text = 'This is test content for metadata.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks).toHaveLength(1)
+ expect(chunks[0].text).toBe(text)
+ expect(chunks[0].tokenCount).toBe(Math.ceil(text.length / 4))
+ expect(chunks[0].metadata.startIndex).toBeDefined()
+ expect(chunks[0].metadata.endIndex).toBeDefined()
+ })
+
+ it.concurrent('should have startIndex of 0 for the first chunk', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 100 })
+ const text = 'Some content here.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks[0].metadata.startIndex).toBe(0)
+ })
+
+ it.concurrent('should have non-negative indices for all chunks', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 20, chunkOverlap: 5 })
+ const text = 'First part. Second part. Third part. Fourth part. Fifth part.'
+ const chunks = await chunker.chunk(text)
+
+ for (const chunk of chunks) {
+ expect(chunk.metadata.startIndex).toBeGreaterThanOrEqual(0)
+ expect(chunk.metadata.endIndex).toBeGreaterThanOrEqual(chunk.metadata.startIndex)
+ }
+ })
+
+ it.concurrent('should have endIndex greater than startIndex for non-empty chunks', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 20 })
+ const text = 'Multiple sentences here. Another one here. And another. And more content.'
+ const chunks = await chunker.chunk(text)
+
+ for (const chunk of chunks) {
+ expect(chunk.metadata.endIndex).toBeGreaterThan(chunk.metadata.startIndex)
+ }
+ })
+ })
+
+ describe('edge cases', () => {
+ it.concurrent('should handle very long text', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 100 })
+ const text = 'This is a sentence. '.repeat(1000)
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ })
+
+ it.concurrent('should handle text with no natural separators', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 5 })
+ const text = 'abcdefghijklmnopqrstuvwxyz'.repeat(5)
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ })
+
+ it.concurrent('should handle unicode text', async () => {
+ const chunker = new RecursiveChunker({ chunkSize: 100 })
+ const text = '这是中文测试。日本語テスト。한국어 테스트.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(0)
+ expect(chunks[0].text).toContain('中文')
+ })
+
+ it.concurrent('should use default chunkSize of 1024 tokens', async () => {
+ const chunker = new RecursiveChunker({})
+ const text = 'Word '.repeat(400)
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks).toHaveLength(1)
+ })
+ })
+})
diff --git a/apps/sim/lib/chunkers/recursive-chunker.ts b/apps/sim/lib/chunkers/recursive-chunker.ts
new file mode 100644
index 00000000000..0dba2240987
--- /dev/null
+++ b/apps/sim/lib/chunkers/recursive-chunker.ts
@@ -0,0 +1,145 @@
+import { createLogger } from '@sim/logger'
+import type { Chunk, RecursiveChunkerOptions } from '@/lib/chunkers/types'
+import {
+ addOverlap,
+ buildChunks,
+ cleanText,
+ estimateTokens,
+ resolveChunkerOptions,
+ splitAtWordBoundaries,
+ tokensToChars,
+} from '@/lib/chunkers/utils'
+
+const logger = createLogger('RecursiveChunker')
+
+const RECIPES = {
+ plain: ['\n\n', '\n', '. ', ' ', ''],
+ markdown: [
+ '\n---\n',
+ '\n***\n',
+ '\n___\n',
+ '\n# ',
+ '\n## ',
+ '\n### ',
+ '\n#### ',
+ '\n##### ',
+ '\n###### ',
+ '\n```\n',
+ '\n> ',
+ '\n\n',
+ '\n',
+ '. ',
+ ' ',
+ '',
+ ],
+ code: [
+ '\nfunction ',
+ '\nclass ',
+ '\nexport ',
+ '\nconst ',
+ '\nlet ',
+ '\nvar ',
+ '\nif ',
+ '\nfor ',
+ '\nwhile ',
+ '\nswitch ',
+ '\nreturn ',
+ '\n\n',
+ '\n',
+ '; ',
+ ' ',
+ '',
+ ],
+} as const
+
+export class RecursiveChunker {
+ private readonly chunkSize: number
+ private readonly chunkOverlap: number
+ private readonly separators: string[]
+
+ constructor(options: RecursiveChunkerOptions = {}) {
+ const resolved = resolveChunkerOptions(options)
+ this.chunkSize = resolved.chunkSize
+ this.chunkOverlap = resolved.chunkOverlap
+
+ if (options.separators && options.separators.length > 0) {
+ this.separators = options.separators
+ } else {
+ const recipe = options.recipe ?? 'plain'
+ this.separators = [...RECIPES[recipe]]
+ }
+ }
+
+ private splitRecursively(text: string, separatorIndex = 0): string[] {
+ const tokenCount = estimateTokens(text)
+
+ if (tokenCount <= this.chunkSize) {
+ return text.trim() ? [text] : []
+ }
+
+ if (separatorIndex >= this.separators.length) {
+ const chunkSizeChars = tokensToChars(this.chunkSize)
+ return splitAtWordBoundaries(text, chunkSizeChars)
+ }
+
+ const separator = this.separators[separatorIndex]
+
+ if (separator === '') {
+ return this.splitRecursively(text, this.separators.length)
+ }
+
+ const parts = text.split(separator).filter((part) => part.trim())
+
+ if (parts.length <= 1) {
+ return this.splitRecursively(text, separatorIndex + 1)
+ }
+
+ const chunks: string[] = []
+ let currentChunk = ''
+
+ for (const part of parts) {
+ const testChunk = currentChunk + (currentChunk ? separator : '') + part
+
+ if (estimateTokens(testChunk) <= this.chunkSize) {
+ currentChunk = testChunk
+ } else {
+ if (currentChunk.trim()) {
+ chunks.push(currentChunk.trim())
+ }
+
+ if (estimateTokens(part) > this.chunkSize) {
+ const subChunks = this.splitRecursively(part, separatorIndex + 1)
+ for (const subChunk of subChunks) {
+ chunks.push(subChunk)
+ }
+ currentChunk = ''
+ } else {
+ currentChunk = part
+ }
+ }
+ }
+
+ if (currentChunk.trim()) {
+ chunks.push(currentChunk.trim())
+ }
+
+ return chunks
+ }
+
+ async chunk(content: string): Promise<Chunk[]> {
+ if (!content?.trim()) {
+ return []
+ }
+
+ const cleaned = cleanText(content)
+ let chunks = this.splitRecursively(cleaned)
+
+ if (this.chunkOverlap > 0) {
+ const overlapChars = tokensToChars(this.chunkOverlap)
+ chunks = addOverlap(chunks, overlapChars)
+ }
+
+ logger.info(`Chunked into ${chunks.length} recursive chunks`)
+ return buildChunks(chunks, this.chunkOverlap)
+ }
+}
diff --git a/apps/sim/lib/chunkers/regex-chunker.test.ts b/apps/sim/lib/chunkers/regex-chunker.test.ts
new file mode 100644
index 00000000000..5b64cf3f495
--- /dev/null
+++ b/apps/sim/lib/chunkers/regex-chunker.test.ts
@@ -0,0 +1,189 @@
+/**
+ * @vitest-environment node
+ */
+
+import { loggerMock } from '@sim/testing'
+import { describe, expect, it, vi } from 'vitest'
+import { RegexChunker } from './regex-chunker'
+
+vi.mock('@sim/logger', () => loggerMock)
+
+describe('RegexChunker', () => {
+ describe('empty and whitespace input', () => {
+ it.concurrent('should return empty array for empty string', async () => {
+ const chunker = new RegexChunker({ pattern: '\\n\\n' })
+ const chunks = await chunker.chunk('')
+ expect(chunks).toEqual([])
+ })
+
+ it.concurrent('should return empty array for whitespace-only input', async () => {
+ const chunker = new RegexChunker({ pattern: '\\n\\n' })
+ const chunks = await chunker.chunk(' \n\n ')
+ expect(chunks).toEqual([])
+ })
+ })
+
+ describe('small content', () => {
+ it.concurrent('should return single chunk when content fits in chunkSize', async () => {
+ const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 100 })
+ const text = 'This is a short text.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks).toHaveLength(1)
+ expect(chunks[0].text).toBe(text)
+ })
+ })
+
+ describe('basic regex splitting', () => {
+ it.concurrent('should split on double newlines with pattern \\n\\n', async () => {
+ const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 20 })
+ const text =
+ 'First paragraph content here.\n\nSecond paragraph content here.\n\nThird paragraph content here.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ })
+ })
+
+ describe('custom pattern splitting', () => {
+ it.concurrent('should split text at --- delimiters', async () => {
+ const chunker = new RegexChunker({ pattern: '---', chunkSize: 20 })
+ const text =
+ 'Section one has enough content to fill a chunk on its own here.---Section two also has enough content to fill another chunk here.---Section three needs content too for splitting.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ })
+ })
+
+ describe('segment merging', () => {
+ it.concurrent('should merge small adjacent segments up to chunkSize', async () => {
+ const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 100 })
+ const text = 'Short.\n\nAlso short.\n\nTiny.\n\nSmall too.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks).toHaveLength(1)
+ expect(chunks[0].text).toContain('Short.')
+ expect(chunks[0].text).toContain('Also short.')
+ })
+ })
+
+ describe('oversized segment fallback', () => {
+ it.concurrent(
+ 'should sub-chunk segments larger than chunkSize via word boundaries',
+ async () => {
+ const chunker = new RegexChunker({ pattern: '---', chunkSize: 10 })
+ const longSegment =
+ 'This is a very long segment with many words that exceeds the chunk size limit significantly. '
+ const text = `${longSegment}---${longSegment}`
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(2)
+ }
+ )
+ })
+
+ describe('no-match fallback', () => {
+ it.concurrent(
+ 'should fall back to word-boundary splitting when regex matches nothing',
+ async () => {
+ const chunker = new RegexChunker({ pattern: '###SPLIT###', chunkSize: 10 })
+ const text = 'This is a text with no matching delimiter anywhere in the content at all.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ }
+ )
+ })
+
+ describe('chunk size respected', () => {
+ it.concurrent('should not exceed chunkSize tokens approximately', async () => {
+ const chunkSize = 30
+ const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize })
+ const text =
+ 'Paragraph one with some words. '.repeat(5) +
+ '\n\n' +
+ 'Paragraph two with more words. '.repeat(5) +
+ '\n\n' +
+ 'Paragraph three continues here. '.repeat(5)
+ const chunks = await chunker.chunk(text)
+
+ for (const chunk of chunks) {
+ expect(chunk.tokenCount).toBeLessThanOrEqual(chunkSize + 10)
+ }
+ })
+ })
+
+ describe('overlap', () => {
+ it.concurrent('should share content between chunks when chunkOverlap > 0', async () => {
+ const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 20, chunkOverlap: 5 })
+ const text =
+ 'First paragraph with enough content.\n\nSecond paragraph with more content.\n\nThird paragraph with even more.'
+ const chunks = await chunker.chunk(text)
+
+ if (chunks.length > 1) {
+ const firstChunkEnd = chunks[0].text.slice(-10)
+ const secondChunkStart = chunks[1].text.slice(0, 20)
+ expect(secondChunkStart.length).toBeGreaterThan(0)
+ expect(chunks[1].text.length).toBeGreaterThan(0)
+ }
+ })
+ })
+
+ describe('chunk metadata', () => {
+ it.concurrent('should include text, tokenCount, and metadata with indices', async () => {
+ const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 100 })
+ const text = 'Hello world test content.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks).toHaveLength(1)
+ expect(chunks[0].text).toBe(text)
+ expect(chunks[0].tokenCount).toBe(Math.ceil(text.length / 4))
+ expect(chunks[0].metadata.startIndex).toBeDefined()
+ expect(chunks[0].metadata.endIndex).toBeDefined()
+ expect(chunks[0].metadata.startIndex).toBe(0)
+ })
+
+ it.concurrent('should have non-negative indices across multiple chunks', async () => {
+ const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 20, chunkOverlap: 0 })
+ const text = 'First paragraph here.\n\nSecond paragraph here.\n\nThird paragraph here.'
+ const chunks = await chunker.chunk(text)
+
+ for (const chunk of chunks) {
+ expect(chunk.metadata.startIndex).toBeGreaterThanOrEqual(0)
+ expect(chunk.metadata.endIndex).toBeGreaterThanOrEqual(chunk.metadata.startIndex)
+ }
+ })
+ })
+
+ describe('invalid regex', () => {
+ it.concurrent('should throw error for invalid regex pattern', async () => {
+ expect(() => new RegexChunker({ pattern: '[invalid' })).toThrow()
+ })
+ })
+
+ describe('empty pattern', () => {
+ it.concurrent('should throw error for empty pattern', async () => {
+ expect(() => new RegexChunker({ pattern: '' })).toThrow('Regex pattern is required')
+ })
+ })
+
+ describe('pattern too long', () => {
+ it.concurrent('should throw error for pattern exceeding 500 characters', async () => {
+ const longPattern = 'a'.repeat(501)
+ expect(() => new RegexChunker({ pattern: longPattern })).toThrow(
+ 'Regex pattern exceeds maximum length of 500 characters'
+ )
+ })
+ })
+
+ describe('ReDoS protection', () => {
+ it.concurrent('should accept safe pattern \\n+', async () => {
+ expect(() => new RegexChunker({ pattern: '\\n+' })).not.toThrow()
+ })
+
+ it.concurrent('should accept safe pattern [,;]', async () => {
+ expect(() => new RegexChunker({ pattern: '[,;]' })).not.toThrow()
+ })
+ })
+})
diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts
new file mode 100644
index 00000000000..58c8cb16b91
--- /dev/null
+++ b/apps/sim/lib/chunkers/regex-chunker.ts
@@ -0,0 +1,144 @@
+import { createLogger } from '@sim/logger'
+import type { Chunk, RegexChunkerOptions } from '@/lib/chunkers/types'
+import {
+ addOverlap,
+ buildChunks,
+ cleanText,
+ estimateTokens,
+ resolveChunkerOptions,
+ splitAtWordBoundaries,
+ tokensToChars,
+} from '@/lib/chunkers/utils'
+
+const logger = createLogger('RegexChunker')
+
+const MAX_PATTERN_LENGTH = 500
+
+export class RegexChunker {
+ private readonly chunkSize: number
+ private readonly chunkOverlap: number
+ private readonly regex: RegExp
+
+ constructor(options: RegexChunkerOptions) {
+ const resolved = resolveChunkerOptions(options)
+ this.chunkSize = resolved.chunkSize
+ this.chunkOverlap = resolved.chunkOverlap
+ this.regex = this.compilePattern(options.pattern)
+ }
+
+ private compilePattern(pattern: string): RegExp {
+ if (!pattern) {
+ throw new Error('Regex pattern is required')
+ }
+
+ if (pattern.length > MAX_PATTERN_LENGTH) {
+ throw new Error(`Regex pattern exceeds maximum length of ${MAX_PATTERN_LENGTH} characters`)
+ }
+
+ try {
+ const regex = new RegExp(pattern, 'g')
+
+ const testStrings = [
+ 'a'.repeat(10000),
+ ' '.repeat(10000),
+ 'a '.repeat(5000),
+ 'aB1 xY2\n'.repeat(1250),
+ `${'a'.repeat(30)}!`,
+ `${'a b '.repeat(25)}!`,
+ ]
+ for (const testStr of testStrings) {
+ regex.lastIndex = 0
+ const start = Date.now()
+ regex.test(testStr)
+ const elapsed = Date.now() - start
+ if (elapsed > 50) {
+ throw new Error('Regex pattern appears to have catastrophic backtracking')
+ }
+ }
+
+ regex.lastIndex = 0
+ return regex
+ } catch (error) {
+ if (error instanceof Error && error.message.includes('catastrophic')) {
+ throw error
+ }
+ throw new Error(
+ `Invalid regex pattern "${pattern}": ${error instanceof Error ? error.message : String(error)}`
+ )
+ }
+ }
+
+ async chunk(content: string): Promise<Chunk[]> {
+ if (!content?.trim()) {
+ return []
+ }
+
+ const cleaned = cleanText(content)
+
+ if (estimateTokens(cleaned) <= this.chunkSize) {
+ logger.info('Content fits in single chunk')
+ return buildChunks([cleaned], 0)
+ }
+
+ this.regex.lastIndex = 0
+ const segments = cleaned.split(this.regex).filter((s) => s.trim().length > 0)
+
+ if (segments.length <= 1) {
+ logger.warn(
+ 'Regex pattern did not produce any splits, falling back to word-boundary splitting'
+ )
+ const chunkSizeChars = tokensToChars(this.chunkSize)
+ let chunks = splitAtWordBoundaries(cleaned, chunkSizeChars)
+ if (this.chunkOverlap > 0) {
+ const overlapChars = tokensToChars(this.chunkOverlap)
+ chunks = addOverlap(chunks, overlapChars)
+ }
+ return buildChunks(chunks, this.chunkOverlap)
+ }
+
+ const merged = this.mergeSegments(segments)
+
+ let chunks = merged
+ if (this.chunkOverlap > 0) {
+ const overlapChars = tokensToChars(this.chunkOverlap)
+ chunks = addOverlap(chunks, overlapChars)
+ }
+
+ logger.info(`Chunked into ${chunks.length} regex-based chunks`)
+ return buildChunks(chunks, this.chunkOverlap)
+ }
+
+ private mergeSegments(segments: string[]): string[] {
+ const chunks: string[] = []
+ let current = ''
+
+ for (const segment of segments) {
+ const test = current ? `${current}\n${segment}` : segment
+
+ if (estimateTokens(test) <= this.chunkSize) {
+ current = test
+ } else {
+ if (current.trim()) {
+ chunks.push(current.trim())
+ }
+
+ if (estimateTokens(segment) > this.chunkSize) {
+ const chunkSizeChars = tokensToChars(this.chunkSize)
+ const subChunks = splitAtWordBoundaries(segment, chunkSizeChars)
+ for (const sub of subChunks) {
+ chunks.push(sub)
+ }
+ current = ''
+ } else {
+ current = segment
+ }
+ }
+ }
+
+ if (current.trim()) {
+ chunks.push(current.trim())
+ }
+
+ return chunks
+ }
+}
diff --git a/apps/sim/lib/chunkers/sentence-chunker.test.ts b/apps/sim/lib/chunkers/sentence-chunker.test.ts
new file mode 100644
index 00000000000..78708de29ad
--- /dev/null
+++ b/apps/sim/lib/chunkers/sentence-chunker.test.ts
@@ -0,0 +1,286 @@
+/**
+ * @vitest-environment node
+ */
+
+import { loggerMock } from '@sim/testing'
+import { describe, expect, it, vi } from 'vitest'
+import { SentenceChunker } from './sentence-chunker'
+
+vi.mock('@sim/logger', () => loggerMock)
+
+describe('SentenceChunker', () => {
+ describe('empty and whitespace input', () => {
+ it.concurrent('should return empty array for empty string', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 100 })
+ const chunks = await chunker.chunk('')
+ expect(chunks).toEqual([])
+ })
+
+ it.concurrent('should return empty array for whitespace-only input', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 100 })
+ const chunks = await chunker.chunk(' \n\n\t ')
+ expect(chunks).toEqual([])
+ })
+
+ it.concurrent('should return empty array for null-ish content', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 100 })
+ const chunks = await chunker.chunk(undefined as unknown as string)
+ expect(chunks).toEqual([])
+ })
+ })
+
+ describe('small content (single chunk)', () => {
+ it.concurrent('should return single chunk when content fits within chunk size', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 100 })
+ const text = 'This is a short sentence. Another short one.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks).toHaveLength(1)
+ expect(chunks[0].text).toBe(text)
+ expect(chunks[0].tokenCount).toBe(Math.ceil(text.length / 4))
+ })
+ })
+
+ describe('sentence boundary splitting', () => {
+ it.concurrent('should split text at sentence boundaries', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 20 })
+ const text =
+ 'First sentence here. Second sentence here. Third sentence here. Fourth sentence here.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ for (let i = 0; i < chunks.length - 1; i++) {
+ const trimmed = chunks[i].text.trim()
+ const lastChar = trimmed[trimmed.length - 1]
+ expect(['.', '!', '?']).toContain(lastChar)
+ }
+ })
+ })
+
+ describe('abbreviation handling', () => {
+ it.concurrent('should not split at common abbreviations', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 200 })
+ const text = 'Mr. Smith went to Washington. He arrived on Jan. 5th.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks).toHaveLength(1)
+ expect(chunks[0].text).toContain('Mr. Smith')
+ expect(chunks[0].text).toContain('Jan. 5th')
+ })
+
+ it.concurrent('should not split at Dr., Mrs., Ms., Prof., Jr., Sr., St.', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 500 })
+ const text =
+ 'Dr. Jones and Mrs. Brown met Prof. Davis at St. Mary hospital. Jr. members joined Sr. staff in Feb. for a review.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks).toHaveLength(1)
+ })
+ })
+
+ describe('single capital initial handling', () => {
+ it.concurrent('should not split at single capital letter initials', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 200 })
+ const text = 'J. K. Rowling wrote books. They are popular.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks).toHaveLength(1)
+ expect(chunks[0].text).toContain('J. K. Rowling')
+ })
+ })
+
+ describe('decimal handling', () => {
+ it.concurrent('should not split at decimal numbers', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 20 })
+ const text = 'The value is 3.14. That is pi.'
+ const chunks = await chunker.chunk(text)
+
+ const allText = chunks.map((c) => c.text).join(' ')
+ expect(allText).toContain('3.14')
+
+ const largeChunker = new SentenceChunker({ chunkSize: 200 })
+ const largeChunks = await largeChunker.chunk(text)
+ expect(largeChunks).toHaveLength(1)
+ })
+ })
+
+ describe('ellipsis handling', () => {
+ it.concurrent('should not split at ellipsis', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 200 })
+ const text = 'Wait for it... The answer is here. Done.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks).toHaveLength(1)
+ expect(chunks[0].text).toContain('Wait for it...')
+ })
+ })
+
+ describe('exclamation and question marks', () => {
+ it.concurrent('should split at exclamation and question marks', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 10 })
+ const text = 'What is this? It is great! I agree.'
+ const chunks = await chunker.chunk(text)
+
+ const allText = chunks.map((c) => c.text).join(' ')
+ expect(allText).toContain('What is this?')
+ expect(allText).toContain('It is great!')
+ expect(allText).toContain('I agree.')
+ })
+
+ it.concurrent('should treat ? and ! as sentence boundaries', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 15 })
+ const text = 'What is this thing? It is really great! I strongly agree.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThanOrEqual(1)
+ const allText = chunks.map((c) => c.text).join(' ')
+ expect(allText).toContain('?')
+ expect(allText).toContain('!')
+ })
+ })
+
+ describe('minSentencesPerChunk', () => {
+ it.concurrent('should group at least minSentencesPerChunk sentences per chunk', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 100, minSentencesPerChunk: 2 })
+ const text =
+ 'First sentence. Second sentence. Third sentence. Fourth sentence. Fifth sentence.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(0)
+ expect(chunks).toHaveLength(1)
+ })
+
+ it.concurrent('should enforce min sentences even when token limit is reached', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 6, minSentencesPerChunk: 2 })
+ const text = 'Short one. Another one. Third one here. Fourth one here.'
+ const chunks = await chunker.chunk(text)
+
+ const firstChunkSentences = chunks[0].text
+ .split(/(?<=[.!?])\s+/)
+ .filter((s) => s.trim().length > 0)
+ expect(firstChunkSentences.length).toBeGreaterThanOrEqual(2)
+ })
+ })
+
+ describe('oversized sentence fallback', () => {
+ it.concurrent(
+ 'should chunk a single very long sentence via word-boundary splitting',
+ async () => {
+ const chunker = new SentenceChunker({ chunkSize: 10 })
+ const longSentence = `${'word '.repeat(50).trim()}.`
+ const chunks = await chunker.chunk(longSentence)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ const allText = chunks.map((c) => c.text).join(' ')
+ expect(allText).toContain('word')
+ }
+ )
+
+ it.concurrent('should handle oversized sentence mixed with normal sentences', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 10 })
+ const longSentence = `${'word '.repeat(50).trim()}.`
+ const text = `Short sentence. ${longSentence} Another short one.`
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(2)
+ const allText = chunks.map((c) => c.text).join(' ')
+ expect(allText).toContain('Short sentence.')
+ expect(allText).toContain('Another short one.')
+ })
+ })
+
+ describe('sentence-level overlap', () => {
+ it.concurrent('should include overlap from previous chunk when chunkOverlap > 0', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 15, chunkOverlap: 10 })
+ const text =
+ 'First sentence here. Second sentence here. Third sentence here. Fourth sentence here.'
+ const chunks = await chunker.chunk(text)
+
+ if (chunks.length > 1) {
+ expect(chunks[1].text.length).toBeGreaterThan(0)
+ }
+ })
+
+ it.concurrent('should not add overlap when chunkOverlap is 0', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 15, chunkOverlap: 0 })
+ const text = 'First sentence here. Second sentence here. Third sentence here.'
+ const chunks = await chunker.chunk(text)
+
+ if (chunks.length > 1) {
+ const chunk1End = chunks[0].text.slice(-20)
+ expect(chunks[1].text.startsWith(chunk1End)).toBe(false)
+ }
+ })
+ })
+
+ describe('chunk metadata', () => {
+ it.concurrent('should include text, tokenCount, and metadata in each chunk', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 100 })
+ const text = 'This is a test sentence. Another sentence follows.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks).toHaveLength(1)
+ expect(chunks[0]).toHaveProperty('text')
+ expect(chunks[0]).toHaveProperty('tokenCount')
+ expect(chunks[0]).toHaveProperty('metadata')
+ expect(chunks[0].metadata).toHaveProperty('startIndex')
+ expect(chunks[0].metadata).toHaveProperty('endIndex')
+ })
+
+ it.concurrent('should have startIndex of 0 for the first chunk', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 10 })
+ const text = 'First sentence. Second sentence. Third sentence.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks[0].metadata.startIndex).toBe(0)
+ })
+
+ it.concurrent('should have non-negative indices for all chunks', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 10, chunkOverlap: 5 })
+ const text =
+ 'First sentence here. Second sentence here. Third sentence here. Fourth sentence.'
+ const chunks = await chunker.chunk(text)
+
+ for (const chunk of chunks) {
+ expect(chunk.metadata.startIndex).toBeGreaterThanOrEqual(0)
+ expect(chunk.metadata.endIndex).toBeGreaterThanOrEqual(chunk.metadata.startIndex)
+ }
+ })
+
+ it.concurrent('should have correct tokenCount based on text length', async () => {
+ const chunker = new SentenceChunker({ chunkSize: 100 })
+ const text = 'Hello world test.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks[0].tokenCount).toBe(Math.ceil(text.length / 4))
+ })
+ })
+
+ describe('respects chunk size', () => {
+ it.concurrent('should produce chunks within approximate token limit', async () => {
+ const chunkSize = 20
+ const chunker = new SentenceChunker({ chunkSize })
+ const text =
+ 'This is the first sentence. Here is the second one. And the third sentence follows. Then comes the fourth. Finally the fifth sentence.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ for (const chunk of chunks) {
+ expect(chunk.tokenCount).toBeLessThanOrEqual(chunkSize * 2)
+ }
+ })
+
+ it.concurrent('should create more chunks with smaller chunk size', async () => {
+ const text =
+ 'Sentence number one. Sentence number two. Sentence number three. Sentence number four. Sentence number five. Sentence number six.'
+
+ const largeChunker = new SentenceChunker({ chunkSize: 200 })
+ const smallChunker = new SentenceChunker({ chunkSize: 10 })
+
+ const largeChunks = await largeChunker.chunk(text)
+ const smallChunks = await smallChunker.chunk(text)
+
+ expect(smallChunks.length).toBeGreaterThan(largeChunks.length)
+ })
+ })
+})
diff --git a/apps/sim/lib/chunkers/sentence-chunker.ts b/apps/sim/lib/chunkers/sentence-chunker.ts
new file mode 100644
index 00000000000..f8b92e6f22c
--- /dev/null
+++ b/apps/sim/lib/chunkers/sentence-chunker.ts
@@ -0,0 +1,141 @@
+import { createLogger } from '@sim/logger'
+import type { Chunk, SentenceChunkerOptions } from '@/lib/chunkers/types'
+import {
+ buildChunks,
+ cleanText,
+ estimateTokens,
+ resolveChunkerOptions,
+ splitAtWordBoundaries,
+ tokensToChars,
+} from '@/lib/chunkers/utils'
+
+const logger = createLogger('SentenceChunker')
+
+/** Never splits mid-sentence unless a single sentence exceeds the limit. */
+export class SentenceChunker {
+ private readonly chunkSize: number
+ private readonly chunkOverlap: number
+ private readonly minSentencesPerChunk: number
+
+ constructor(options: SentenceChunkerOptions = {}) {
+ const resolved = resolveChunkerOptions(options)
+ this.chunkSize = resolved.chunkSize
+ this.chunkOverlap = resolved.chunkOverlap
+ this.minSentencesPerChunk = options.minSentencesPerChunk ?? 1
+ }
+
+ /** Splits on sentence boundaries while avoiding abbreviations, decimals, and ellipses. */
+ private splitSentences(text: string): string[] {
+ return text
+ .split(
+        /(?<!\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|St|vs|etc)\.)(?<!\d\.\d)(?<!\.\.\.)(?<=[.!?])\s+/
+      )
+      .filter((s) => s.trim().length > 0)
+ }
+
+  async chunk(content: string): Promise<Chunk[]> {
+ if (!content?.trim()) {
+ return []
+ }
+
+ const cleaned = cleanText(content)
+ const sentences = this.splitSentences(cleaned)
+
+ if (sentences.length === 0) {
+ return []
+ }
+
+ if (estimateTokens(cleaned) <= this.chunkSize) {
+ logger.info('Content fits in single chunk')
+ return buildChunks([cleaned], 0)
+ }
+
+ const chunkSentenceGroups: string[][] = []
+ let currentGroup: string[] = []
+ let currentTokens = 0
+ const chunkSizeChars = tokensToChars(this.chunkSize)
+
+ for (const sentence of sentences) {
+ const sentenceTokens = estimateTokens(sentence)
+
+ if (sentenceTokens > this.chunkSize) {
+ if (currentGroup.length > 0) {
+ chunkSentenceGroups.push(currentGroup)
+ currentGroup = []
+ currentTokens = 0
+ }
+ const parts = splitAtWordBoundaries(sentence, chunkSizeChars)
+ for (const part of parts) {
+ chunkSentenceGroups.push([part])
+ }
+ continue
+ }
+
+ const wouldExceed = currentTokens + sentenceTokens > this.chunkSize
+ const hasMinSentences = currentGroup.length >= this.minSentencesPerChunk
+
+ if (wouldExceed && hasMinSentences) {
+ chunkSentenceGroups.push(currentGroup)
+ currentGroup = [sentence]
+ currentTokens = sentenceTokens
+ } else {
+ currentGroup.push(sentence)
+ currentTokens += sentenceTokens
+ }
+ }
+
+ if (currentGroup.length > 0) {
+ chunkSentenceGroups.push(currentGroup)
+ }
+
+ const rawChunks = this.applyOverlapFromGroups(chunkSentenceGroups)
+
+ logger.info(`Chunked into ${rawChunks.length} sentence-based chunks`)
+ return buildChunks(rawChunks, this.chunkOverlap)
+ }
+
+ /** Applies overlap at the sentence level using original groups to avoid re-splitting. */
+ private applyOverlapFromGroups(groups: string[][]): string[] {
+ if (this.chunkOverlap <= 0 || groups.length <= 1) {
+ return groups.map((g) => g.join(' '))
+ }
+
+ const overlapChars = tokensToChars(this.chunkOverlap)
+ const result: string[] = []
+
+ for (let i = 0; i < groups.length; i++) {
+ if (i === 0) {
+ result.push(groups[i].join(' '))
+ continue
+ }
+
+ const prevGroup = groups[i - 1]
+ const overlapSentences: string[] = []
+ let overlapLen = 0
+
+ for (let j = prevGroup.length - 1; j >= 0; j--) {
+ if (overlapLen + prevGroup[j].length > overlapChars) break
+ overlapSentences.unshift(prevGroup[j])
+ overlapLen += prevGroup[j].length
+ }
+
+ const currentText = groups[i].join(' ')
+ if (overlapSentences.length > 0) {
+ result.push(`${overlapSentences.join(' ')} ${currentText}`)
+ } else {
+ // No complete sentence fits — fall back to character-level overlap
+ const prevText = prevGroup.join(' ')
+ const tail = prevText.slice(-overlapChars)
+ const wordMatch = tail.match(/^\s*\S/)
+ const cleanTail = wordMatch ? tail.slice(tail.indexOf(wordMatch[0].trim())) : tail
+ if (cleanTail.trim()) {
+ result.push(`${cleanTail.trim()} ${currentText}`)
+ } else {
+ result.push(currentText)
+ }
+ }
+ }
+
+ return result
+ }
+}
diff --git a/apps/sim/lib/chunkers/structured-data-chunker.test.ts b/apps/sim/lib/chunkers/structured-data-chunker.test.ts
index ad1aef5c70a..3cd6b7ec27a 100644
--- a/apps/sim/lib/chunkers/structured-data-chunker.test.ts
+++ b/apps/sim/lib/chunkers/structured-data-chunker.test.ts
@@ -11,19 +11,16 @@ vi.mock('@sim/logger', () => loggerMock)
describe('StructuredDataChunker', () => {
describe('isStructuredData', () => {
it('should detect CSV content with many columns', () => {
- // Detection requires >2 delimiters per line on average
const csv = 'name,age,city,country\nAlice,30,NYC,USA\nBob,25,LA,USA'
expect(StructuredDataChunker.isStructuredData(csv)).toBe(true)
})
it('should detect TSV content with many columns', () => {
- // Detection requires >2 delimiters per line on average
const tsv = 'name\tage\tcity\tcountry\nAlice\t30\tNYC\tUSA\nBob\t25\tLA\tUSA'
expect(StructuredDataChunker.isStructuredData(tsv)).toBe(true)
})
it('should detect pipe-delimited content with many columns', () => {
- // Detection requires >2 delimiters per line on average
const piped = 'name|age|city|country\nAlice|30|NYC|USA\nBob|25|LA|USA'
expect(StructuredDataChunker.isStructuredData(piped)).toBe(true)
})
@@ -64,7 +61,6 @@ describe('StructuredDataChunker', () => {
it('should handle inconsistent delimiter counts', () => {
const inconsistent = 'name,age\nAlice,30,extra\nBob'
- // May or may not detect as structured depending on variance threshold
const result = StructuredDataChunker.isStructuredData(inconsistent)
expect(typeof result).toBe('boolean')
})
@@ -100,7 +96,7 @@ Bob,25`
const chunks = await StructuredDataChunker.chunkStructuredData(csv)
expect(chunks.length).toBeGreaterThan(0)
- expect(chunks[0].text).toContain('Rows')
+ expect(chunks[0].text).toContain('rows of data')
})
it.concurrent('should include sheet name when provided', async () => {
@@ -184,7 +180,6 @@ Alice,30`
const csv = 'name,age,city'
const chunks = await StructuredDataChunker.chunkStructuredData(csv)
- // Only header, no data rows
expect(chunks.length).toBeGreaterThanOrEqual(0)
})
@@ -271,9 +266,8 @@ Alice,30`
const chunks = await StructuredDataChunker.chunkStructuredData(csv, { chunkSize: 500 })
expect(chunks.length).toBeGreaterThan(1)
- // Verify total rows are distributed across chunks
const totalRowCount = chunks.reduce((sum, chunk) => {
- const match = chunk.text.match(/\[Rows (\d+) of data\]/)
+ const match = chunk.text.match(/\[(\d+) rows of data\]/)
return sum + (match ? Number.parseInt(match[1]) : 0)
}, 0)
expect(totalRowCount).toBeGreaterThan(0)
@@ -319,9 +313,7 @@ Alice,30`
it.concurrent('should not detect with fewer than 3 delimiters per line', async () => {
const sparse = `a,b
1,2`
- // Only 1 comma per line, below threshold of >2
const result = StructuredDataChunker.isStructuredData(sparse)
- // May or may not pass depending on implementation threshold
expect(typeof result).toBe('boolean')
})
})
@@ -337,7 +329,6 @@ Alice,30`
const chunks = await StructuredDataChunker.chunkStructuredData(csv, { chunkSize: 200 })
expect(chunks.length).toBeGreaterThan(1)
- // Each chunk should contain header info
for (const chunk of chunks) {
expect(chunk.text).toContain('Headers:')
}
diff --git a/apps/sim/lib/chunkers/structured-data-chunker.ts b/apps/sim/lib/chunkers/structured-data-chunker.ts
index 0d962072440..757e8b67fdb 100644
--- a/apps/sim/lib/chunkers/structured-data-chunker.ts
+++ b/apps/sim/lib/chunkers/structured-data-chunker.ts
@@ -1,37 +1,22 @@
import { createLogger } from '@sim/logger'
import type { Chunk, StructuredDataOptions } from '@/lib/chunkers/types'
+/** Structured data is denser in tokens (~3 chars/token vs ~4 for prose) */
+function estimateStructuredTokens(text: string): number {
+ if (!text?.trim()) return 0
+ return Math.ceil(text.length / 3)
+}
+
const logger = createLogger('StructuredDataChunker')
-/**
- * Default configuration for structured data chunking (CSV, XLSX, etc.)
- * These are used when user doesn't provide preferences
- */
const DEFAULT_CONFIG = {
- // Target chunk size in tokens
TARGET_CHUNK_SIZE: 1024,
- MIN_CHUNK_SIZE: 100,
- MAX_CHUNK_SIZE: 4000,
-
- // For spreadsheets, group rows together
- ROWS_PER_CHUNK: 100,
- MIN_ROWS_PER_CHUNK: 20,
+ MIN_ROWS_PER_CHUNK: 5,
MAX_ROWS_PER_CHUNK: 500,
-
- // For better embeddings quality
INCLUDE_HEADERS_IN_EACH_CHUNK: true,
- MAX_HEADER_SIZE: 200, // tokens
-}
+} as const
-/**
- * Smart chunker for structured data (CSV, XLSX) that preserves semantic meaning
- * Preserves headers in each chunk for better semantic context
- */
export class StructuredDataChunker {
- /**
- * Chunk structured data intelligently based on rows and semantic boundaries
- * Respects user's chunkSize preference when provided
- */
static async chunkStructuredData(
content: string,
options: StructuredDataOptions = {}
@@ -43,15 +28,12 @@ export class StructuredDataChunker {
return chunks
}
- // Use user's chunk size or fall back to default
const targetChunkSize = options.chunkSize ?? DEFAULT_CONFIG.TARGET_CHUNK_SIZE
- // Detect headers (first line or provided)
const headerLine = options.headers?.join('\t') || lines[0]
const dataStartIndex = options.headers ? 0 : 1
- // Calculate optimal rows per chunk based on content and user's target size
- const estimatedTokensPerRow = StructuredDataChunker.estimateTokensPerRow(
+ const estimatedTokensPerRow = StructuredDataChunker.estimateStructuredTokensPerRow(
lines.slice(dataStartIndex, Math.min(10, lines.length))
)
const optimalRowsPerChunk = StructuredDataChunker.calculateOptimalRowsPerChunk(
@@ -65,14 +47,13 @@ export class StructuredDataChunker {
let currentChunkRows: string[] = []
let currentTokenEstimate = 0
- const headerTokens = StructuredDataChunker.estimateTokens(headerLine)
+ const headerTokens = estimateStructuredTokens(headerLine)
let chunkStartRow = dataStartIndex
for (let i = dataStartIndex; i < lines.length; i++) {
const row = lines[i]
- const rowTokens = StructuredDataChunker.estimateTokens(row)
+ const rowTokens = estimateStructuredTokens(row)
- // Check if adding this row would exceed our target
const projectedTokens =
currentTokenEstimate +
rowTokens +
@@ -84,7 +65,6 @@ export class StructuredDataChunker {
currentChunkRows.length >= optimalRowsPerChunk
if (shouldCreateChunk && currentChunkRows.length > 0) {
- // Create chunk with current rows
const chunkContent = StructuredDataChunker.formatChunk(
headerLine,
currentChunkRows,
@@ -92,7 +72,6 @@ export class StructuredDataChunker {
)
chunks.push(StructuredDataChunker.createChunk(chunkContent, chunkStartRow, i - 1))
- // Reset for next chunk
currentChunkRows = []
currentTokenEstimate = 0
chunkStartRow = i
@@ -102,7 +81,6 @@ export class StructuredDataChunker {
currentTokenEstimate += rowTokens
}
- // Add remaining rows as final chunk
if (currentChunkRows.length > 0) {
const chunkContent = StructuredDataChunker.formatChunk(
headerLine,
@@ -117,41 +95,28 @@ export class StructuredDataChunker {
return chunks
}
- /**
- * Format a chunk with headers and context
- */
private static formatChunk(headerLine: string, rows: string[], sheetName?: string): string {
let content = ''
- // Add sheet name context if available
if (sheetName) {
content += `=== ${sheetName} ===\n\n`
}
- // Add headers for context
if (DEFAULT_CONFIG.INCLUDE_HEADERS_IN_EACH_CHUNK) {
content += `Headers: ${headerLine}\n`
content += `${'-'.repeat(Math.min(80, headerLine.length))}\n`
}
- // Add data rows
content += rows.join('\n')
-
- // Add row count for context
- content += `\n\n[Rows ${rows.length} of data]`
+ content += `\n\n[${rows.length} rows of data]`
return content
}
- /**
- * Create a chunk object with actual row indices
- */
private static createChunk(content: string, startRow: number, endRow: number): Chunk {
- const tokenCount = StructuredDataChunker.estimateTokens(content)
-
return {
text: content,
- tokenCount,
+ tokenCount: estimateStructuredTokens(content),
metadata: {
startIndex: startRow,
endIndex: endRow,
@@ -159,30 +124,13 @@ export class StructuredDataChunker {
}
}
- /**
- * Estimate tokens in text (rough approximation)
- * For structured data with numbers, uses 1 token per 3 characters
- */
- private static estimateTokens(text: string): number {
- return Math.ceil(text.length / 3)
- }
-
- /**
- * Estimate average tokens per row from sample
- */
- private static estimateTokensPerRow(sampleRows: string[]): number {
- if (sampleRows.length === 0) return 50 // default estimate
+ private static estimateStructuredTokensPerRow(sampleRows: string[]): number {
+ if (sampleRows.length === 0) return 50
- const totalTokens = sampleRows.reduce(
- (sum, row) => sum + StructuredDataChunker.estimateTokens(row),
- 0
- )
+ const totalTokens = sampleRows.reduce((sum, row) => sum + estimateStructuredTokens(row), 0)
return Math.ceil(totalTokens / sampleRows.length)
}
- /**
- * Calculate optimal rows per chunk based on token estimates and target size
- */
private static calculateOptimalRowsPerChunk(
tokensPerRow: number,
targetChunkSize: number
@@ -195,11 +143,7 @@ export class StructuredDataChunker {
)
}
- /**
- * Check if content appears to be structured data
- */
static isStructuredData(content: string, mimeType?: string): boolean {
- // Check mime type first
if (mimeType) {
const structuredMimeTypes = [
'text/csv',
@@ -212,20 +156,17 @@ export class StructuredDataChunker {
}
}
- // Check content structure
- const lines = content.split('\n').slice(0, 10) // Check first 10 lines
+ const lines = content.split('\n').slice(0, 10)
if (lines.length < 2) return false
- // Check for consistent delimiters (comma, tab, pipe)
const delimiters = [',', '\t', '|']
for (const delimiter of delimiters) {
- const counts = lines.map(
- (line) => (line.match(new RegExp(`\\${delimiter}`, 'g')) || []).length
- )
+ const escaped = delimiter.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
+ const counts = lines.map((line) => (line.match(new RegExp(escaped, 'g')) || []).length)
const avgCount = counts.reduce((a, b) => a + b, 0) / counts.length
- // If most lines have similar delimiter counts, it's likely structured
- if (avgCount > 2 && counts.every((c) => Math.abs(c - avgCount) <= 2)) {
+ const tolerance = Math.max(1, Math.ceil(avgCount * 0.2))
+ if (avgCount > 2 && counts.every((c) => Math.abs(c - avgCount) <= tolerance)) {
return true
}
}
diff --git a/apps/sim/lib/chunkers/text-chunker.test.ts b/apps/sim/lib/chunkers/text-chunker.test.ts
index 3b8b8455691..f7c2458d4b5 100644
--- a/apps/sim/lib/chunkers/text-chunker.test.ts
+++ b/apps/sim/lib/chunkers/text-chunker.test.ts
@@ -30,7 +30,7 @@ describe('TextChunker', () => {
it.concurrent('should include token count in chunk metadata', async () => {
const chunker = new TextChunker({ chunkSize: 100 })
- const text = 'Hello world' // ~3 tokens (11 chars / 4)
+ const text = 'Hello world'
const chunks = await chunker.chunk(text)
expect(chunks[0].tokenCount).toBe(3)
@@ -201,7 +201,6 @@ describe('TextChunker', () => {
it.concurrent('should use default minCharactersPerChunk of 100', async () => {
const chunker = new TextChunker({ chunkSize: 10 })
- // Text with 150+ characters to ensure chunks pass the 100 character minimum
const text = 'This is a longer sentence with more content. '.repeat(5)
const chunks = await chunker.chunk(text)
@@ -266,7 +265,6 @@ describe('TextChunker', () => {
describe('boundary conditions', () => {
it.concurrent('should handle text exactly at chunk size boundary', async () => {
const chunker = new TextChunker({ chunkSize: 10 })
- // 40 characters = 10 tokens exactly
const text = 'A'.repeat(40)
const chunks = await chunker.chunk(text)
@@ -276,7 +274,6 @@ describe('TextChunker', () => {
it.concurrent('should handle text one token over chunk size', async () => {
const chunker = new TextChunker({ chunkSize: 10 })
- // 44 characters = 11 tokens, just over limit
const text = 'A'.repeat(44)
const chunks = await chunker.chunk(text)
@@ -300,7 +297,6 @@ describe('TextChunker', () => {
})
it.concurrent('should clamp overlap to max 50% of chunk size', async () => {
- // Overlap of 60 should be clamped to 10 (50% of chunkSize 20)
const chunker = new TextChunker({ chunkSize: 20, chunkOverlap: 60 })
const text = 'First paragraph here.\n\nSecond paragraph here.\n\nThird paragraph here.'
const chunks = await chunker.chunk(text)
@@ -359,7 +355,6 @@ describe('TextChunker', () => {
it.concurrent('should handle combining diacritics', async () => {
const chunker = new TextChunker({ chunkSize: 100 })
- // e + combining acute accent
const text = 'cafe\u0301 resume\u0301 naive\u0308'
const chunks = await chunker.chunk(text)
@@ -368,7 +363,6 @@ describe('TextChunker', () => {
it.concurrent('should handle zero-width characters', async () => {
const chunker = new TextChunker({ chunkSize: 100 })
- // Zero-width space, zero-width non-joiner, zero-width joiner
const text = 'Hello\u200B\u200C\u200DWorld'
const chunks = await chunker.chunk(text)
@@ -391,14 +385,12 @@ describe('TextChunker', () => {
const chunks = await chunker.chunk(text)
expect(chunks.length).toBeGreaterThan(1)
- // Verify all content is preserved
const totalChars = chunks.reduce((sum, c) => sum + c.text.length, 0)
expect(totalChars).toBeGreaterThan(0)
})
it.concurrent('should handle 1MB of text', async () => {
const chunker = new TextChunker({ chunkSize: 500 })
- // 1MB of text
const text = 'Lorem ipsum dolor sit amet. '.repeat(40000)
const chunks = await chunker.chunk(text)
@@ -407,7 +399,6 @@ describe('TextChunker', () => {
it.concurrent('should handle very long single line', async () => {
const chunker = new TextChunker({ chunkSize: 50 })
- // Single line with no natural break points
const text = 'Word'.repeat(10000)
const chunks = await chunker.chunk(text)
diff --git a/apps/sim/lib/chunkers/text-chunker.ts b/apps/sim/lib/chunkers/text-chunker.ts
index 7dbbde0cf97..eb993b609aa 100644
--- a/apps/sim/lib/chunkers/text-chunker.ts
+++ b/apps/sim/lib/chunkers/text-chunker.ts
@@ -1,99 +1,61 @@
import type { Chunk, ChunkerOptions } from '@/lib/chunkers/types'
+import {
+ addOverlap,
+ buildChunks,
+ cleanText,
+ estimateTokens,
+ resolveChunkerOptions,
+ splitAtWordBoundaries,
+ tokensToChars,
+} from '@/lib/chunkers/utils'
-/**
- * Lightweight text chunker optimized for RAG applications
- * Uses hierarchical splitting with simple character-based token estimation
- *
- * Parameters:
- * - chunkSize: Maximum chunk size in TOKENS (default: 1024)
- * - chunkOverlap: Overlap between chunks in TOKENS (default: 0)
- * - minCharactersPerChunk: Minimum characters to keep a chunk (default: 100)
- */
export class TextChunker {
- private readonly chunkSize: number // Max chunk size in tokens
- private readonly chunkOverlap: number // Overlap in tokens
- private readonly minCharactersPerChunk: number // Min characters per chunk
+ private readonly chunkSize: number
+ private readonly chunkOverlap: number
- // Hierarchical separators ordered from largest to smallest semantic units
private readonly separators = [
- '\n\n\n', // Document sections
- '\n---\n', // Markdown horizontal rules
- '\n***\n', // Markdown horizontal rules (alternative)
- '\n___\n', // Markdown horizontal rules (alternative)
- '\n# ', // Markdown H1 headings
- '\n## ', // Markdown H2 headings
- '\n### ', // Markdown H3 headings
- '\n#### ', // Markdown H4 headings
- '\n##### ', // Markdown H5 headings
- '\n###### ', // Markdown H6 headings
- '\n\n', // Paragraphs
- '\n', // Lines
- '. ', // Sentences
- '! ', // Exclamations
- '? ', // Questions
- '; ', // Semicolons
- ', ', // Commas
- ' ', // Words
+ '\n---\n',
+ '\n***\n',
+ '\n___\n',
+ '\n# ',
+ '\n## ',
+ '\n### ',
+ '\n#### ',
+ '\n##### ',
+ '\n###### ',
+ '\n\n',
+ '\n',
+ '. ',
+ '! ',
+ '? ',
+ '; ',
+ ', ',
+ ' ',
]
constructor(options: ChunkerOptions = {}) {
- this.chunkSize = options.chunkSize ?? 1024
- // Clamp overlap to prevent exceeding chunk size (max 50% of chunk size)
- const maxOverlap = Math.floor(this.chunkSize * 0.5)
- this.chunkOverlap = Math.min(options.chunkOverlap ?? 0, maxOverlap)
- this.minCharactersPerChunk = options.minCharactersPerChunk ?? 100
+ const resolved = resolveChunkerOptions(options)
+ this.chunkSize = resolved.chunkSize
+ this.chunkOverlap = resolved.chunkOverlap
}
- /**
- * Simple token estimation using character count
- * 1 token ≈ 4 characters for English text
- */
- private estimateTokens(text: string): number {
- if (!text?.trim()) return 0
- return Math.ceil(text.length / 4)
- }
-
- /**
- * Convert tokens to approximate character count
- */
- private tokensToChars(tokens: number): number {
- return tokens * 4
- }
-
- /**
- * Split text recursively using hierarchical separators
- */
-  private async splitRecursively(text: string, separatorIndex = 0): Promise<string[]> {
- const tokenCount = this.estimateTokens(text)
+ private splitRecursively(text: string, separatorIndex = 0): string[] {
+ const tokenCount = estimateTokens(text)
- // If chunk is small enough (within max token limit), return it
- // Keep chunks even if below minCharactersPerChunk to avoid data loss
if (tokenCount <= this.chunkSize) {
- // Only filter out empty/whitespace-only text, not small chunks
return text.trim() ? [text] : []
}
- // If we've run out of separators, force split by character count
if (separatorIndex >= this.separators.length) {
- const chunks: string[] = []
- const targetLength = Math.ceil((text.length * this.chunkSize) / tokenCount)
-
- for (let i = 0; i < text.length; i += targetLength) {
- const chunk = text.slice(i, i + targetLength).trim()
- // Keep all non-empty chunks to avoid data loss
- if (chunk) {
- chunks.push(chunk)
- }
- }
- return chunks
+ const chunkSizeChars = tokensToChars(this.chunkSize)
+ return splitAtWordBoundaries(text, chunkSizeChars)
}
const separator = this.separators[separatorIndex]
const parts = text.split(separator).filter((part) => part.trim())
- // If no split occurred, try next separator
if (parts.length <= 1) {
- return await this.splitRecursively(text, separatorIndex + 1)
+ return this.splitRecursively(text, separatorIndex + 1)
}
const chunks: string[] = []
@@ -102,17 +64,15 @@ export class TextChunker {
for (const part of parts) {
const testChunk = currentChunk + (currentChunk ? separator : '') + part
- if (this.estimateTokens(testChunk) <= this.chunkSize) {
+ if (estimateTokens(testChunk) <= this.chunkSize) {
currentChunk = testChunk
} else {
- // Save current chunk - keep even if below minCharactersPerChunk to avoid data loss
if (currentChunk.trim()) {
chunks.push(currentChunk.trim())
}
- // If part itself is too large, split it further
- if (this.estimateTokens(part) > this.chunkSize) {
- const subChunks = await this.splitRecursively(part, separatorIndex + 1)
+ if (estimateTokens(part) > this.chunkSize) {
+ const subChunks = this.splitRecursively(part, separatorIndex + 1)
for (const subChunk of subChunks) {
chunks.push(subChunk)
}
@@ -123,7 +83,6 @@ export class TextChunker {
}
}
- // Add final chunk if it exists - keep even if below minCharactersPerChunk to avoid data loss
if (currentChunk.trim()) {
chunks.push(currentChunk.trim())
}
@@ -131,111 +90,19 @@ export class TextChunker {
return chunks
}
- /**
- * Add overlap between chunks (overlap is in tokens, converted to characters)
- */
- private addOverlap(chunks: string[]): string[] {
- if (this.chunkOverlap <= 0 || chunks.length <= 1) {
- return chunks
- }
-
- const overlappedChunks: string[] = []
- // Convert token overlap to character overlap
- const overlapChars = this.tokensToChars(this.chunkOverlap)
-
- for (let i = 0; i < chunks.length; i++) {
- let chunk = chunks[i]
-
- // Add overlap from previous chunk (converted from tokens to characters)
- if (i > 0) {
- const prevChunk = chunks[i - 1]
- // Take the last N characters from previous chunk (based on token overlap)
- const overlapLength = Math.min(overlapChars, prevChunk.length)
- const overlapText = prevChunk.slice(-overlapLength)
-
- // Try to start overlap at a word boundary for cleaner text
- const wordBoundaryMatch = overlapText.match(/^\s*\S/)
- const cleanOverlap = wordBoundaryMatch
- ? overlapText.slice(overlapText.indexOf(wordBoundaryMatch[0].trim()))
- : overlapText
-
- if (cleanOverlap.trim()) {
- chunk = `${cleanOverlap.trim()} ${chunk}`
- }
- }
-
- overlappedChunks.push(chunk)
- }
-
- return overlappedChunks
- }
-
- /**
- * Clean and normalize text
- */
- private cleanText(text: string): string {
- return text
- .replace(/\r\n/g, '\n') // Normalize Windows line endings
- .replace(/\r/g, '\n') // Normalize old Mac line endings
- .replace(/\n{3,}/g, '\n\n') // Limit consecutive newlines
- .replace(/\t/g, ' ') // Convert tabs to spaces
- .replace(/ {2,}/g, ' ') // Collapse multiple spaces
- .trim()
- }
-
- /**
- * Main chunking method
- */
   async chunk(text: string): Promise<Chunk[]> {
if (!text?.trim()) {
return []
}
- // Clean the text
- const cleanedText = this.cleanText(text)
-
- // Split into chunks
- let chunks = await this.splitRecursively(cleanedText)
-
- // Add overlap if configured
- chunks = this.addOverlap(chunks)
-
- // Convert to Chunk objects with metadata
- let previousEndIndex = 0
- const chunkPromises = chunks.map(async (chunkText, index) => {
- let startIndex: number
- let actualContentLength: number
+ const cleaned = cleanText(text)
+ let chunks = this.splitRecursively(cleaned)
- if (index === 0 || this.chunkOverlap <= 0) {
- // First chunk or no overlap - start from previous end
- startIndex = previousEndIndex
- actualContentLength = chunkText.length
- } else {
- // Calculate overlap length in characters (converted from tokens)
- const prevChunk = chunks[index - 1]
- const overlapChars = this.tokensToChars(this.chunkOverlap)
- const overlapLength = Math.min(overlapChars, prevChunk.length, chunkText.length)
-
- startIndex = previousEndIndex - overlapLength
- actualContentLength = chunkText.length - overlapLength
- }
-
- const safeStart = Math.max(0, startIndex)
- const endIndexSafe = safeStart + Math.max(0, actualContentLength)
-
- const chunk: Chunk = {
- text: chunkText,
- tokenCount: this.estimateTokens(chunkText),
- metadata: {
- startIndex: safeStart,
- endIndex: endIndexSafe,
- },
- }
-
- previousEndIndex = endIndexSafe
- return chunk
- })
+ if (this.chunkOverlap > 0) {
+ const overlapChars = tokensToChars(this.chunkOverlap)
+ chunks = addOverlap(chunks, overlapChars)
+ }
- return await Promise.all(chunkPromises)
+ return buildChunks(chunks, this.chunkOverlap)
}
}
diff --git a/apps/sim/lib/chunkers/token-chunker.test.ts b/apps/sim/lib/chunkers/token-chunker.test.ts
new file mode 100644
index 00000000000..420224c4d6e
--- /dev/null
+++ b/apps/sim/lib/chunkers/token-chunker.test.ts
@@ -0,0 +1,239 @@
+/**
+ * @vitest-environment node
+ */
+
+import { loggerMock } from '@sim/testing'
+import { describe, expect, it, vi } from 'vitest'
+import { TokenChunker } from './token-chunker'
+
+vi.mock('@sim/logger', () => loggerMock)
+
+describe('TokenChunker', () => {
+ describe('empty and whitespace input', () => {
+ it.concurrent('should return empty array for empty string', async () => {
+ const chunker = new TokenChunker({ chunkSize: 100 })
+ const chunks = await chunker.chunk('')
+ expect(chunks).toEqual([])
+ })
+
+ it.concurrent('should return empty array for whitespace-only input', async () => {
+ const chunker = new TokenChunker({ chunkSize: 100 })
+ const chunks = await chunker.chunk(' \n\n\t ')
+ expect(chunks).toEqual([])
+ })
+ })
+
+ describe('small content', () => {
+ it.concurrent('should return single chunk when content fits within chunkSize', async () => {
+ const chunker = new TokenChunker({ chunkSize: 100 })
+ const text = 'This is a short text.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks).toHaveLength(1)
+ expect(chunks[0].text).toBe(text)
+ })
+ })
+
+ describe('token count accuracy', () => {
+ it.concurrent('should compute tokenCount as Math.ceil(text.length / 4)', async () => {
+ const chunker = new TokenChunker({ chunkSize: 100 })
+ const text = 'Hello world'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks[0].tokenCount).toBe(Math.ceil(text.length / 4))
+ })
+
+ it.concurrent('should compute tokenCount correctly for longer text', async () => {
+ const chunker = new TokenChunker({ chunkSize: 100 })
+ const text = 'The quick brown fox jumps over the lazy dog.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks[0].tokenCount).toBe(11)
+ })
+ })
+
+ describe('chunk metadata', () => {
+ it.concurrent(
+ 'should include text, tokenCount, and metadata with startIndex and endIndex',
+ async () => {
+ const chunker = new TokenChunker({ chunkSize: 100 })
+ const text = 'Some test content here.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks[0]).toHaveProperty('text')
+ expect(chunks[0]).toHaveProperty('tokenCount')
+ expect(chunks[0].metadata).toHaveProperty('startIndex')
+ expect(chunks[0].metadata).toHaveProperty('endIndex')
+ expect(chunks[0].metadata.startIndex).toBe(0)
+ expect(chunks[0].metadata.endIndex).toBeGreaterThan(0)
+ }
+ )
+
+ it.concurrent('should have non-negative indices across all chunks', async () => {
+ const chunker = new TokenChunker({ chunkSize: 20, chunkOverlap: 0 })
+ const text = 'First part of the text. Second part of the text. Third part of the text.'
+ const chunks = await chunker.chunk(text)
+
+ for (const chunk of chunks) {
+ expect(chunk.metadata.startIndex).toBeGreaterThanOrEqual(0)
+ expect(chunk.metadata.endIndex).toBeGreaterThanOrEqual(chunk.metadata.startIndex)
+ }
+ })
+ })
+
+ describe('respects chunk size', () => {
+ it.concurrent('should not produce chunks exceeding chunkSize tokens', async () => {
+ const chunkSize = 50
+ const chunker = new TokenChunker({ chunkSize })
+ const text = 'This is a test sentence with several words. '.repeat(30)
+ const chunks = await chunker.chunk(text)
+
+ for (const chunk of chunks) {
+ expect(chunk.tokenCount).toBeLessThanOrEqual(chunkSize)
+ }
+ })
+ })
+
+ describe('splitting behavior', () => {
+ it.concurrent('should produce multiple chunks for long text', async () => {
+ const chunker = new TokenChunker({ chunkSize: 50 })
+ const text = 'This is a test sentence. '.repeat(30)
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ })
+
+ it.concurrent('should create more chunks with smaller chunkSize', async () => {
+ const text = 'This is a test sentence with content. '.repeat(20)
+
+ const largeChunker = new TokenChunker({ chunkSize: 200 })
+ const smallChunker = new TokenChunker({ chunkSize: 50 })
+
+ const largeChunks = await largeChunker.chunk(text)
+ const smallChunks = await smallChunker.chunk(text)
+
+ expect(smallChunks.length).toBeGreaterThan(largeChunks.length)
+ })
+ })
+
+ describe('sliding window overlap', () => {
+ it.concurrent('should produce more chunks with overlap than without', async () => {
+ const text =
+ 'Alpha bravo charlie delta echo foxtrot golf hotel india juliet kilo lima mike november oscar papa quebec romeo sierra tango uniform victor whiskey xray yankee zulu. '.repeat(
+ 5
+ )
+ const withOverlap = new TokenChunker({ chunkSize: 30, chunkOverlap: 10 })
+ const withoutOverlap = new TokenChunker({ chunkSize: 30, chunkOverlap: 0 })
+
+ const overlapChunks = await withOverlap.chunk(text)
+ const noOverlapChunks = await withoutOverlap.chunk(text)
+
+ expect(overlapChunks.length).toBeGreaterThan(noOverlapChunks.length)
+ })
+
+ it.concurrent('should not share text between chunks when chunkOverlap is 0', async () => {
+ const chunker = new TokenChunker({ chunkSize: 20, chunkOverlap: 0 })
+ const text =
+ 'First sentence here. Second sentence here. Third sentence here. Fourth sentence here.'
+ const chunks = await chunker.chunk(text)
+
+ if (chunks.length > 1) {
+ const firstChunkEnd = chunks[0].text.slice(-10)
+ expect(chunks[1].text.startsWith(firstChunkEnd)).toBe(false)
+ }
+ })
+ })
+
+ describe('overlap clamped to 50%', () => {
+ it.concurrent('should still work when overlap is set >= chunkSize', async () => {
+ const chunker = new TokenChunker({ chunkSize: 20, chunkOverlap: 100 })
+ const text =
+ 'First paragraph content here. Second paragraph content here. Third paragraph here.'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(0)
+ })
+
+ it.concurrent('should clamp overlap to 50% of chunkSize', async () => {
+ const chunkerClamped = new TokenChunker({ chunkSize: 20, chunkOverlap: 100 })
+ const chunkerHalf = new TokenChunker({ chunkSize: 20, chunkOverlap: 10 })
+ const text =
+ 'Word one two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty. '.repeat(
+ 5
+ )
+
+ const clampedChunks = await chunkerClamped.chunk(text)
+ const halfChunks = await chunkerHalf.chunk(text)
+
+ expect(clampedChunks.length).toBe(halfChunks.length)
+ })
+ })
+
+ describe('word boundary snapping', () => {
+ it.concurrent('should produce trimmed chunks without leading or trailing spaces', async () => {
+ const chunker = new TokenChunker({ chunkSize: 20 })
+ const text =
+ 'the cat sat on the mat and the dog ran fast over the big red fox and then the bird flew high up in the clear blue sky above the green hill'
+ const chunks = await chunker.chunk(text)
+
+ expect(chunks.length).toBeGreaterThan(1)
+ for (const chunk of chunks) {
+ const trimmed = chunk.text.trim()
+ expect(trimmed).toBe(chunk.text)
+ expect(trimmed.length).toBeGreaterThan(0)
+ }
+ })
+
+ it.concurrent('should produce chunks that start and end on word boundaries', async () => {
+ const chunker = new TokenChunker({ chunkSize: 15 })
+ const text =
+ 'The quick brown fox jumps over the lazy dog and then runs away quickly into the forest'
+ const chunks = await chunker.chunk(text)
+
+ for (const chunk of chunks) {
+ const trimmed = chunk.text.trim()
+ expect(trimmed).toBe(chunk.text)
+ }
+ })
+ })
+
+ describe('consistent coverage', () => {
+ it.concurrent('should represent all content from original text across chunks', async () => {
+ const chunker = new TokenChunker({ chunkSize: 30, chunkOverlap: 0 })
+ const originalText =
+ 'The quick brown fox jumps over the lazy dog. Pack my box with five dozen liquor jugs.'
+ const chunks = await chunker.chunk(originalText)
+
+ const allText = chunks.map((c) => c.text).join(' ')
+ expect(allText).toContain('quick')
+ expect(allText).toContain('fox')
+ expect(allText).toContain('lazy')
+ expect(allText).toContain('dog')
+ expect(allText).toContain('liquor')
+ expect(allText).toContain('jugs')
+ })
+
+ it.concurrent('should preserve all words across chunks for longer text', async () => {
+ const chunker = new TokenChunker({ chunkSize: 20, chunkOverlap: 0 })
+ const words = [
+ 'alpha',
+ 'bravo',
+ 'charlie',
+ 'delta',
+ 'echo',
+ 'foxtrot',
+ 'golf',
+ 'hotel',
+ 'india',
+ 'juliet',
+ ]
+ const originalText = `${words.join(' is a word and ')} is also a word.`
+ const chunks = await chunker.chunk(originalText)
+
+ const combined = chunks.map((c) => c.text).join(' ')
+ for (const word of words) {
+ expect(combined).toContain(word)
+ }
+ })
+ })
+})
diff --git a/apps/sim/lib/chunkers/token-chunker.ts b/apps/sim/lib/chunkers/token-chunker.ts
new file mode 100644
index 00000000000..d98b4d1651a
--- /dev/null
+++ b/apps/sim/lib/chunkers/token-chunker.ts
@@ -0,0 +1,54 @@
+import { createLogger } from '@sim/logger'
+import type { Chunk, ChunkerOptions } from '@/lib/chunkers/types'
+import {
+ buildChunks,
+ cleanText,
+ estimateTokens,
+ resolveChunkerOptions,
+ splitAtWordBoundaries,
+ tokensToChars,
+} from '@/lib/chunkers/utils'
+
+const logger = createLogger('TokenChunker')
+
+export class TokenChunker {
+ private readonly chunkSize: number
+ private readonly chunkOverlap: number
+ private readonly minCharactersPerChunk: number
+
+ constructor(options: ChunkerOptions = {}) {
+ const resolved = resolveChunkerOptions(options)
+ this.chunkSize = resolved.chunkSize
+ this.chunkOverlap = resolved.chunkOverlap
+ this.minCharactersPerChunk = resolved.minCharactersPerChunk
+ }
+
+ async chunk(content: string): Promise<Chunk[]> {
+ if (!content?.trim()) {
+ return []
+ }
+
+ const cleaned = cleanText(content)
+
+ if (estimateTokens(cleaned) <= this.chunkSize) {
+ logger.info('Content fits in single chunk')
+ return buildChunks([cleaned], 0)
+ }
+
+ const chunkSizeChars = tokensToChars(this.chunkSize)
+ const overlapChars = tokensToChars(this.chunkOverlap)
+ const stepChars = this.chunkOverlap > 0 ? chunkSizeChars - overlapChars : undefined
+
+ const rawChunks = splitAtWordBoundaries(cleaned, chunkSizeChars, stepChars)
+
+ const filtered =
+ rawChunks.length > 1
+ ? rawChunks.filter((c) => c.length >= this.minCharactersPerChunk)
+ : rawChunks
+
+ const chunks = filtered.length > 0 ? filtered : rawChunks
+
+ logger.info(`Chunked into ${chunks.length} token-based chunks`)
+ return buildChunks(chunks, this.chunkOverlap)
+ }
+}
diff --git a/apps/sim/lib/chunkers/types.ts b/apps/sim/lib/chunkers/types.ts
index a316d643f03..692e84d12fc 100644
--- a/apps/sim/lib/chunkers/types.ts
+++ b/apps/sim/lib/chunkers/types.ts
@@ -1,17 +1,11 @@
/**
- * Options for configuring text chunkers
- *
* Units:
- * - chunkSize: Maximum chunk size in TOKENS (1 token ≈ 4 characters)
- * - chunkOverlap: Overlap between chunks in TOKENS
- * - minCharactersPerChunk: Minimum chunk size in CHARACTERS (filters tiny fragments)
+ * - chunkSize/chunkOverlap: TOKENS (1 token ≈ 4 characters)
+ * - minCharactersPerChunk: CHARACTERS
*/
export interface ChunkerOptions {
- /** Maximum chunk size in tokens (default: 1024) */
chunkSize?: number
- /** Overlap between chunks in tokens (default: 0) */
chunkOverlap?: number
- /** Minimum chunk size in characters to avoid tiny fragments (default: 100) */
minCharactersPerChunk?: number
}
@@ -51,3 +45,26 @@ export interface DocChunk {
export interface DocsChunkerOptions extends ChunkerOptions {
baseUrl?: string
}
+
+export type ChunkingStrategy = 'auto' | 'text' | 'regex' | 'recursive' | 'sentence' | 'token'
+
+export type RecursiveRecipe = 'plain' | 'markdown' | 'code'
+
+export interface StrategyOptions {
+ pattern?: string
+ separators?: string[]
+ recipe?: RecursiveRecipe
+}
+
+export interface SentenceChunkerOptions extends ChunkerOptions {
+ minSentencesPerChunk?: number
+}
+
+export interface RecursiveChunkerOptions extends ChunkerOptions {
+ separators?: string[]
+ recipe?: RecursiveRecipe
+}
+
+export interface RegexChunkerOptions extends ChunkerOptions {
+ pattern: string
+}
diff --git a/apps/sim/lib/chunkers/utils.test.ts b/apps/sim/lib/chunkers/utils.test.ts
new file mode 100644
index 00000000000..bc88bc0e46a
--- /dev/null
+++ b/apps/sim/lib/chunkers/utils.test.ts
@@ -0,0 +1,217 @@
+/**
+ * @vitest-environment node
+ */
+
+import { describe, expect, it } from 'vitest'
+import {
+ addOverlap,
+ buildChunks,
+ cleanText,
+ estimateTokens,
+ resolveChunkerOptions,
+ splitAtWordBoundaries,
+ tokensToChars,
+} from './utils'
+
+describe('estimateTokens', () => {
+ it('returns 0 for empty string', () => {
+ expect(estimateTokens('')).toBe(0)
+ })
+
+ it('returns 0 for whitespace-only string', () => {
+ expect(estimateTokens(' ')).toBe(0)
+ })
+
+ it('returns 0 for null or undefined via optional chaining', () => {
+ expect(estimateTokens(null as unknown as string)).toBe(0)
+ expect(estimateTokens(undefined as unknown as string)).toBe(0)
+ })
+
+ it('returns Math.ceil(text.length / 4) for normal text', () => {
+ const text = 'Hello world'
+ expect(estimateTokens(text)).toBe(Math.ceil(text.length / 4))
+ })
+
+ it('estimates "Hello world" (11 chars) as 3 tokens', () => {
+ expect(estimateTokens('Hello world')).toBe(3)
+ })
+})
+
+describe('tokensToChars', () => {
+ it('returns tokens * 4', () => {
+ expect(tokensToChars(1)).toBe(4)
+ expect(tokensToChars(5)).toBe(20)
+ })
+
+ it('converts 10 tokens to 40 chars', () => {
+ expect(tokensToChars(10)).toBe(40)
+ })
+})
+
+describe('cleanText', () => {
+ it('normalizes \\r\\n to \\n', () => {
+ expect(cleanText('hello\r\nworld')).toBe('hello\nworld')
+ })
+
+ it('normalizes \\r to \\n', () => {
+ expect(cleanText('hello\rworld')).toBe('hello\nworld')
+ })
+
+ it('collapses 3+ newlines to \\n\\n', () => {
+ expect(cleanText('hello\n\n\n\nworld')).toBe('hello\n\nworld')
+ })
+
+ it('replaces tabs with spaces', () => {
+ expect(cleanText('hello\tworld')).toBe('hello world')
+ })
+
+ it('collapses multiple spaces to single space', () => {
+ expect(cleanText('hello world')).toBe('hello world')
+ })
+
+ it('trims leading and trailing whitespace', () => {
+ expect(cleanText(' hello world ')).toBe('hello world')
+ })
+})
+
+describe('addOverlap', () => {
+ it('returns unchanged chunks when overlapChars <= 0', () => {
+ const chunks = ['chunk one', 'chunk two']
+ expect(addOverlap(chunks, 0)).toEqual(chunks)
+ expect(addOverlap(chunks, -5)).toEqual(chunks)
+ })
+
+ it('returns unchanged chunks when only 1 chunk', () => {
+ const chunks = ['only chunk']
+ expect(addOverlap(chunks, 10)).toEqual(chunks)
+ })
+
+ it('prepends tail of previous chunk to next chunk with overlap > 0', () => {
+ const chunks = ['first chunk here', 'second chunk here']
+ const result = addOverlap(chunks, 10)
+ expect(result[0]).toBe('first chunk here')
+ expect(result[1]).toContain('second chunk here')
+ expect(result[1].length).toBeGreaterThan('second chunk here'.length)
+ })
+
+ it('joins overlap text with space', () => {
+ const chunks = ['first chunk here', 'second chunk here']
+ const result = addOverlap(chunks, 10)
+ expect(result[1]).toContain('here second')
+ })
+
+ it('snaps overlap to word boundary', () => {
+ const chunks = ['hello beautiful world', 'next chunk']
+ const result = addOverlap(chunks, 15)
+ expect(result[1]).toBe('beautiful world next chunk')
+ })
+})
+
+describe('splitAtWordBoundaries', () => {
+ it('returns single element for short text', () => {
+ const result = splitAtWordBoundaries('short text', 100)
+ expect(result).toHaveLength(1)
+ expect(result[0]).toBe('short text')
+ })
+
+ it('produces multiple chunks for long text', () => {
+ const text = 'word '.repeat(100).trim()
+ const result = splitAtWordBoundaries(text, 20)
+ expect(result.length).toBeGreaterThan(1)
+ })
+
+ it('respects chunk size limit', () => {
+ const text = 'word '.repeat(100).trim()
+ const chunkSize = 25
+ const result = splitAtWordBoundaries(text, chunkSize)
+ for (const chunk of result) {
+ expect(chunk.length).toBeLessThanOrEqual(chunkSize)
+ }
+ })
+
+ it('does not break mid-word', () => {
+ const text = 'internationalization globalization modernization'
+ const result = splitAtWordBoundaries(text, 25)
+ for (const chunk of result) {
+ expect(chunk).not.toMatch(/^\S+\s\S+$.*\S$/)
+ const words = chunk.split(' ')
+ for (const word of words) {
+ expect(text).toContain(word)
+ }
+ }
+ })
+
+ it('produces overlapping chunks with stepChars < chunkSizeChars', () => {
+ const text = 'one two three four five six seven eight nine ten'
+ const result = splitAtWordBoundaries(text, 20, 10)
+ expect(result.length).toBeGreaterThan(1)
+ const combined = result.join(' ')
+ for (const word of text.split(' ')) {
+ expect(combined).toContain(word)
+ }
+ })
+
+ it('ensures step is at least 1 to prevent infinite loops', () => {
+ const text = 'hello world test'
+ const result = splitAtWordBoundaries(text, 10, 0)
+ expect(result.length).toBeGreaterThan(0)
+ })
+})
+
+describe('buildChunks', () => {
+ it('creates Chunk objects with text, tokenCount, and metadata', () => {
+ const texts = ['hello world', 'foo bar']
+ const chunks = buildChunks(texts, 0)
+ for (const chunk of chunks) {
+ expect(chunk).toHaveProperty('text')
+ expect(chunk).toHaveProperty('tokenCount')
+ expect(chunk).toHaveProperty('metadata')
+ expect(chunk.metadata).toHaveProperty('startIndex')
+ expect(chunk.metadata).toHaveProperty('endIndex')
+ }
+ })
+
+ it('sets metadata with startIndex and endIndex', () => {
+ const texts = ['chunk one', 'chunk two']
+ const chunks = buildChunks(texts, 0)
+ expect(typeof chunks[0].metadata.startIndex).toBe('number')
+ expect(typeof chunks[0].metadata.endIndex).toBe('number')
+ })
+
+ it('sets startIndex of first chunk to 0', () => {
+ const texts = ['first chunk', 'second chunk']
+ const chunks = buildChunks(texts, 0)
+ expect(chunks[0].metadata.startIndex).toBe(0)
+ })
+
+ it('produces contiguous chunks with overlapTokens=0', () => {
+ const texts = ['hello world', 'foo bar baz']
+ const chunks = buildChunks(texts, 0)
+ expect(chunks[0].metadata.endIndex).toBe(chunks[1].metadata.startIndex)
+ })
+})
+
+describe('resolveChunkerOptions', () => {
+ it('applies defaults: chunkSize=1024, chunkOverlap=0, minCharactersPerChunk=100', () => {
+ const result = resolveChunkerOptions({})
+ expect(result.chunkSize).toBe(1024)
+ expect(result.chunkOverlap).toBe(0)
+ expect(result.minCharactersPerChunk).toBe(100)
+ })
+
+ it('clamps overlap to max 50% of chunkSize', () => {
+ const result = resolveChunkerOptions({ chunkSize: 100, chunkOverlap: 80 })
+ expect(result.chunkOverlap).toBe(50)
+ })
+
+ it('respects provided values when within limits', () => {
+ const result = resolveChunkerOptions({
+ chunkSize: 500,
+ chunkOverlap: 100,
+ minCharactersPerChunk: 50,
+ })
+ expect(result.chunkSize).toBe(500)
+ expect(result.chunkOverlap).toBe(100)
+ expect(result.minCharactersPerChunk).toBe(50)
+ })
+})
diff --git a/apps/sim/lib/chunkers/utils.ts b/apps/sim/lib/chunkers/utils.ts
new file mode 100644
index 00000000000..ded68dbc192
--- /dev/null
+++ b/apps/sim/lib/chunkers/utils.ts
@@ -0,0 +1,143 @@
+import type { Chunk } from '@/lib/chunkers/types'
+
+/** 1 token ≈ 4 characters for English text */
+export function estimateTokens(text: string): number {
+ if (!text?.trim()) return 0
+ return Math.ceil(text.length / 4)
+}
+
+export function tokensToChars(tokens: number): number {
+ return tokens * 4
+}
+
+export function cleanText(text: string): string {
+ return text
+ .replace(/\r\n/g, '\n')
+ .replace(/\r/g, '\n')
+ .replace(/\n{3,}/g, '\n\n')
+ .replace(/\t/g, ' ')
+ .replace(/ {2,}/g, ' ')
+ .trim()
+}
+
+export function addOverlap(chunks: string[], overlapChars: number): string[] {
+ if (overlapChars <= 0 || chunks.length <= 1) {
+ return chunks
+ }
+
+ const result: string[] = []
+
+ for (let i = 0; i < chunks.length; i++) {
+ let chunk = chunks[i]
+
+ if (i > 0) {
+ const prevChunk = chunks[i - 1]
+ const overlapLength = Math.min(overlapChars, prevChunk.length)
+ const overlapText = prevChunk.slice(-overlapLength)
+
+ const wordBoundaryMatch = overlapText.match(/^\s*\S/)
+ const cleanOverlap = wordBoundaryMatch
+ ? overlapText.slice(overlapText.indexOf(wordBoundaryMatch[0].trim()))
+ : overlapText
+
+ if (cleanOverlap.trim()) {
+ chunk = `${cleanOverlap.trim()} ${chunk}`
+ }
+ }
+
+ result.push(chunk)
+ }
+
+ return result
+}
+
+/**
+ * When stepChars is provided (< chunkSizeChars), produces overlapping chunks
+ * using a sliding window where chunks stay within the size limit.
+ */
+export function splitAtWordBoundaries(
+ text: string,
+ chunkSizeChars: number,
+ stepChars?: number
+): string[] {
+ const parts: string[] = []
+ let pos = 0
+
+ while (pos < text.length) {
+ let end = Math.min(pos + chunkSizeChars, text.length)
+
+ if (end < text.length) {
+ const lastSpace = text.lastIndexOf(' ', end)
+ if (lastSpace > pos) {
+ end = lastSpace
+ }
+ }
+
+ const part = text.slice(pos, end).trim()
+ if (part) {
+ parts.push(part)
+ }
+
+ if (stepChars !== undefined) {
+ // Sliding window: advance by step for predictable overlap
+ const nextPos = pos + Math.max(1, stepChars)
+ if (nextPos >= text.length) break
+ pos = nextPos
+ } else {
+ // Non-overlapping: advance from end of extracted content
+ if (end >= text.length) break
+ pos = end
+ }
+ while (pos < text.length && text[pos] === ' ') pos++
+ }
+
+ return parts
+}
+
+export function buildChunks(texts: string[], overlapTokens: number): Chunk[] {
+ let previousEndIndex = 0
+ const overlapChars = tokensToChars(overlapTokens)
+
+ return texts.map((text, index) => {
+ let startIndex: number
+ let actualContentLength: number
+
+ if (index === 0 || overlapTokens <= 0) {
+ startIndex = previousEndIndex
+ actualContentLength = text.length
+ } else {
+ const prevChunk = texts[index - 1]
+ const overlapLength = Math.min(overlapChars, prevChunk.length, text.length)
+ startIndex = previousEndIndex - overlapLength
+ actualContentLength = text.length - overlapLength
+ }
+
+ const safeStart = Math.max(0, startIndex)
+ const endIndex = safeStart + Math.max(0, actualContentLength)
+
+ previousEndIndex = endIndex
+
+ return {
+ text,
+ tokenCount: estimateTokens(text),
+ metadata: {
+ startIndex: safeStart,
+ endIndex,
+ },
+ }
+ })
+}
+
+export function resolveChunkerOptions(options: {
+ chunkSize?: number
+ chunkOverlap?: number
+ minCharactersPerChunk?: number
+}): { chunkSize: number; chunkOverlap: number; minCharactersPerChunk: number } {
+ const chunkSize = options.chunkSize ?? 1024
+ const maxOverlap = Math.floor(chunkSize * 0.5)
+ return {
+ chunkSize,
+ chunkOverlap: Math.min(options.chunkOverlap ?? 0, maxOverlap),
+ minCharactersPerChunk: options.minCharactersPerChunk ?? 100,
+ }
+}
diff --git a/apps/sim/lib/file-parsers/index.ts b/apps/sim/lib/file-parsers/index.ts
index a69a8abdf26..28080e54667 100644
--- a/apps/sim/lib/file-parsers/index.ts
+++ b/apps/sim/lib/file-parsers/index.ts
@@ -86,12 +86,21 @@ function getParserInstances(): Record {
}
try {
- const { parseJSON, parseJSONBuffer } = require('@/lib/file-parsers/json-parser')
+ const {
+ parseJSON,
+ parseJSONBuffer,
+ parseJSONL,
+ parseJSONLBuffer,
+ } = require('@/lib/file-parsers/json-parser')
parserInstances.json = {
parseFile: parseJSON,
parseBuffer: parseJSONBuffer,
}
- logger.info('Loaded JSON parser')
+ parserInstances.jsonl = {
+ parseFile: parseJSONL,
+ parseBuffer: parseJSONLBuffer,
+ }
+ logger.info('Loaded JSON/JSONL parser')
} catch (error) {
logger.error('Failed to load JSON parser:', error)
}
diff --git a/apps/sim/lib/file-parsers/json-parser.ts b/apps/sim/lib/file-parsers/json-parser.ts
index 15881131501..ac239fb6e71 100644
--- a/apps/sim/lib/file-parsers/json-parser.ts
+++ b/apps/sim/lib/file-parsers/json-parser.ts
@@ -59,6 +59,49 @@ export async function parseJSONBuffer(buffer: Buffer): Promise
}
}
+/**
+ * Parse JSONL (JSON Lines) files — one JSON object per line
+ */
+export async function parseJSONL(filePath: string): Promise<FileParseResult> {
+ const fs = await import('fs/promises')
+ const content = await fs.readFile(filePath, 'utf-8')
+ return parseJSONLContent(content)
+}
+
+/**
+ * Parse JSONL from buffer
+ */
+export async function parseJSONLBuffer(buffer: Buffer): Promise<FileParseResult> {
+ const content = buffer.toString('utf-8')
+ return parseJSONLContent(content)
+}
+
+function parseJSONLContent(content: string): FileParseResult {
+ const lines = content.split('\n').filter((line) => line.trim())
+ const items: unknown[] = []
+
+ for (const line of lines) {
+ try {
+ items.push(JSON.parse(line))
+ } catch {
+ throw new Error(`Invalid JSONL: failed to parse line: ${line.slice(0, 100)}`)
+ }
+ }
+
+ const formattedContent = JSON.stringify(items, null, 2)
+
+ return {
+ content: formattedContent,
+ metadata: {
+ type: 'json',
+ isArray: true,
+ keys: [],
+ itemCount: items.length,
+ depth: items.length > 0 ? 1 + getJsonDepth(items[0]) : 1,
+ },
+ }
+}
+
/**
* Calculate the depth of a JSON object
*/
diff --git a/apps/sim/lib/knowledge/documents/document-processor.ts b/apps/sim/lib/knowledge/documents/document-processor.ts
index 5ca6de84c9e..2d652e9a11a 100644
--- a/apps/sim/lib/knowledge/documents/document-processor.ts
+++ b/apps/sim/lib/knowledge/documents/document-processor.ts
@@ -1,7 +1,17 @@
import { createLogger } from '@sim/logger'
import { PDFDocument } from 'pdf-lib'
import { getBYOKKey } from '@/lib/api-key/byok'
-import { type Chunk, JsonYamlChunker, StructuredDataChunker, TextChunker } from '@/lib/chunkers'
+import {
+ type Chunk,
+ JsonYamlChunker,
+ RecursiveChunker,
+ RegexChunker,
+ SentenceChunker,
+ StructuredDataChunker,
+ TextChunker,
+ TokenChunker,
+} from '@/lib/chunkers'
+import type { ChunkingStrategy, StrategyOptions } from '@/lib/chunkers/types'
import { env } from '@/lib/core/config/env'
import { parseBuffer, parseFile } from '@/lib/file-parsers'
import type { FileParseMetadata } from '@/lib/file-parsers/types'
@@ -44,9 +54,6 @@ type OCRRequestBody = {
const MISTRAL_MAX_PAGES = 1000
-/**
- * Get page count from a PDF buffer using unpdf
- */
async function getPdfPageCount(buffer: Buffer): Promise<number> {
try {
const { getDocumentProxy } = await import('unpdf')
@@ -59,10 +66,6 @@ async function getPdfPageCount(buffer: Buffer): Promise {
}
}
-/**
- * Split a PDF buffer into multiple smaller PDFs
- * Returns an array of PDF buffers, each with at most maxPages pages
- */
async function splitPdfIntoChunks(
pdfBuffer: Buffer,
maxPages: number
@@ -112,6 +115,54 @@ class APIError extends Error {
}
}
+async function applyStrategy(
+ strategy: ChunkingStrategy,
+ content: string,
+ chunkSize: number,
+ chunkOverlap: number,
+ minCharactersPerChunk: number,
+ strategyOptions?: StrategyOptions
+): Promise<Chunk[]> {
+ const baseOptions = { chunkSize, chunkOverlap, minCharactersPerChunk }
+
+ switch (strategy) {
+ case 'token': {
+ const chunker = new TokenChunker(baseOptions)
+ return chunker.chunk(content)
+ }
+ case 'sentence': {
+ const chunker = new SentenceChunker(baseOptions)
+ return chunker.chunk(content)
+ }
+ case 'recursive': {
+ const chunker = new RecursiveChunker({
+ ...baseOptions,
+ separators: strategyOptions?.separators,
+ recipe: strategyOptions?.recipe,
+ })
+ return chunker.chunk(content)
+ }
+ case 'regex': {
+ if (!strategyOptions?.pattern) {
+ logger.warn(
+ 'Regex strategy requested but no pattern provided, falling back to text chunker'
+ )
+ const chunker = new TextChunker(baseOptions)
+ return chunker.chunk(content)
+ }
+ const chunker = new RegexChunker({
+ ...baseOptions,
+ pattern: strategyOptions.pattern,
+ })
+ return chunker.chunk(content)
+ }
+ default: {
+ const chunker = new TextChunker(baseOptions)
+ return chunker.chunk(content)
+ }
+ }
+}
+
export async function processDocument(
fileUrl: string,
filename: string,
@@ -120,7 +171,9 @@ export async function processDocument(
chunkOverlap = 200,
minCharactersPerChunk = 100,
userId?: string,
- workspaceId?: string | null
+ workspaceId?: string | null,
+ strategy?: ChunkingStrategy,
+ strategyOptions?: StrategyOptions
): Promise<{
chunks: Chunk[]
metadata: {
@@ -144,30 +197,42 @@ export async function processDocument(
let chunks: Chunk[]
const metadata: FileParseMetadata = parseResult.metadata ?? {}
- const isJsonYaml =
- metadata.type === 'json' ||
- metadata.type === 'yaml' ||
- mimeType.includes('json') ||
- mimeType.includes('yaml')
-
- if (isJsonYaml && JsonYamlChunker.isStructuredData(content)) {
- logger.info('Using JSON/YAML chunker for structured data')
- chunks = await JsonYamlChunker.chunkJsonYaml(content, {
+ if (strategy && strategy !== 'auto') {
+ logger.info(`Using explicit chunking strategy: ${strategy}`)
+ chunks = await applyStrategy(
+ strategy,
+ content,
chunkSize,
+ chunkOverlap,
minCharactersPerChunk,
- })
- } else if (StructuredDataChunker.isStructuredData(content, mimeType)) {
- logger.info('Using structured data chunker for spreadsheet/CSV content')
- const rowCount = metadata.totalRows ?? metadata.rowCount
- chunks = await StructuredDataChunker.chunkStructuredData(content, {
- chunkSize,
- headers: metadata.headers,
- totalRows: typeof rowCount === 'number' ? rowCount : undefined,
- sheetName: metadata.sheetNames?.[0],
- })
+ strategyOptions
+ )
} else {
- const chunker = new TextChunker({ chunkSize, chunkOverlap, minCharactersPerChunk })
- chunks = await chunker.chunk(content)
+ const isJsonYaml =
+ metadata.type === 'json' ||
+ metadata.type === 'yaml' ||
+ mimeType.includes('json') ||
+ mimeType.includes('yaml')
+
+ if (isJsonYaml && JsonYamlChunker.isStructuredData(content)) {
+ logger.info('Using JSON/YAML chunker for structured data')
+ chunks = await JsonYamlChunker.chunkJsonYaml(content, {
+ chunkSize,
+ minCharactersPerChunk,
+ })
+ } else if (StructuredDataChunker.isStructuredData(content, mimeType)) {
+ logger.info('Using structured data chunker for spreadsheet/CSV content')
+ const rowCount = metadata.totalRows ?? metadata.rowCount
+ chunks = await StructuredDataChunker.chunkStructuredData(content, {
+ chunkSize,
+ headers: metadata.headers,
+ totalRows: typeof rowCount === 'number' ? rowCount : undefined,
+ sheetName: metadata.sheetNames?.[0],
+ })
+ } else {
+ const chunker = new TextChunker({ chunkSize, chunkOverlap, minCharactersPerChunk })
+ chunks = await chunker.chunk(content)
+ }
}
const characterCount = content.length
@@ -565,9 +630,6 @@ async function executeMistralOCRRequest(
)
}
-/**
- * Process a single PDF chunk: upload to S3, OCR, cleanup
- */
async function processChunk(
chunk: { buffer: Buffer; startPage: number; endPage: number },
chunkIndex: number,
@@ -585,7 +647,6 @@ async function processChunk(
let uploadedKey: string | null = null
try {
- // Upload the chunk to S3
const timestamp = Date.now()
const uniqueId = Math.random().toString(36).substring(2, 9)
const safeFileName = filename.replace(/[^a-zA-Z0-9.-]/g, '_')
@@ -617,7 +678,6 @@ async function processChunk(
logger.info(`Uploaded chunk ${chunkIndex + 1} to S3: ${chunkKey}`)
- // Process the chunk with Mistral OCR
const params = {
filePath: chunkUrl,
apiKey,
@@ -639,7 +699,6 @@ async function processChunk(
})
return { index: chunkIndex, content: null }
} finally {
- // Clean up the chunk file from S3 after processing
if (uploadedKey) {
try {
await StorageService.deleteFile({ key: uploadedKey, context: 'knowledge-base' })
@@ -674,7 +733,6 @@ async function processMistralOCRInBatches(
`Split into ${pdfChunks.length} chunks, processing with concurrency ${MAX_CONCURRENT_CHUNKS}`
)
- // Process chunks concurrently with limited concurrency
const results: { index: number; content: string | null }[] = []
for (let i = 0; i < pdfChunks.length; i += MAX_CONCURRENT_CHUNKS) {
@@ -693,15 +751,12 @@ async function processMistralOCRInBatches(
)
}
- // Sort by index to maintain page order and filter out nulls
const sortedResults = results
.sort((a, b) => a.index - b.index)
.filter((r) => r.content !== null)
.map((r) => r.content as string)
if (sortedResults.length === 0) {
- // Don't fall back to file parser for large PDFs - it produces poor results
- // Better to fail clearly than return low-quality extraction
throw new Error(
+ `OCR failed for all ${pdfChunks.length} chunks of ${filename}. ` +
`Large PDFs require OCR - file parser fallback would produce poor results.`
diff --git a/apps/sim/lib/knowledge/documents/service.ts b/apps/sim/lib/knowledge/documents/service.ts
index ff613b4e8cd..c37aa22a53d 100644
--- a/apps/sim/lib/knowledge/documents/service.ts
+++ b/apps/sim/lib/knowledge/documents/service.ts
@@ -27,6 +27,7 @@ import {
} from 'drizzle-orm'
import { recordUsage } from '@/lib/billing/core/usage-log'
import { checkAndBillOverageThreshold } from '@/lib/billing/threshold-billing'
+import type { ChunkingStrategy, StrategyOptions } from '@/lib/chunkers/types'
import { createBullMQJobData, isBullMQEnabled } from '@/lib/core/bullmq'
import { env } from '@/lib/core/config/env'
import { getCostMultiplier, isTriggerDevEnabled } from '@/lib/core/config/feature-flags'
@@ -51,10 +52,9 @@ import { calculateCost } from '@/providers/utils'
const logger = createLogger('DocumentService')
const TIMEOUTS = {
- OVERALL_PROCESSING: (env.KB_CONFIG_MAX_DURATION || 600) * 1000, // Default 10 minutes for KB document processing
+ OVERALL_PROCESSING: (env.KB_CONFIG_MAX_DURATION || 600) * 1000,
} as const
-// Configuration for handling large documents
const LARGE_DOC_CONFIG = {
MAX_CHUNKS_PER_BATCH: 500,
MAX_EMBEDDING_BATCH: env.KB_CONFIG_BATCH_SIZE || 2000,
@@ -62,9 +62,6 @@ const LARGE_DOC_CONFIG = {
MAX_CHUNKS_PER_DOCUMENT: 100000,
}
-/**
- * Create a timeout wrapper for async operations
- */
function withTimeout<T>(
promise: Promise<T>,
timeoutMs: number,
@@ -173,10 +170,6 @@ export interface DocumentTagData {
value: string
}
-/**
- * Process structured document tags and validate them against existing definitions
- * Throws an error if a tag doesn't exist or if the value doesn't match the expected type
- */
export async function processDocumentTags(
knowledgeBaseId: string,
tagData: DocumentTagData[],
@@ -354,9 +347,6 @@ export async function processDocumentTags(
return result
}
-/**
- * Process documents with the configured background execution backend.
- */
export async function processDocumentsWithQueue(
createdDocuments: DocumentData[],
knowledgeBaseId: string,
@@ -407,9 +397,6 @@ export async function processDocumentsWithQueue(
return
}
-/**
- * Process a document asynchronously with full error handling
- */
export async function processDocumentAsync(
knowledgeBaseId: string,
documentId: string,
@@ -457,6 +444,8 @@ export async function processDocumentAsync(
maxSize?: number
minSize?: number
overlap?: number
+ strategy?: ChunkingStrategy
+ strategyOptions?: StrategyOptions
} | null
const kbConfig = {
maxSize: rawConfig?.maxSize ?? 1024,
@@ -478,7 +467,9 @@ export async function processDocumentAsync(
kbConfig.overlap,
kbConfig.minSize,
kb[0].userId,
- kb[0].workspaceId
+ kb[0].workspaceId,
+ rawConfig?.strategy,
+ rawConfig?.strategyOptions
)
if (processed.chunks.length > LARGE_DOC_CONFIG.MAX_CHUNKS_PER_DOCUMENT) {
@@ -529,7 +520,6 @@ export async function processDocumentAsync(
const documentRecord = await db
.select({
- // Text tags (7 slots)
tag1: document.tag1,
tag2: document.tag2,
tag3: document.tag3,
@@ -537,16 +527,13 @@ export async function processDocumentAsync(
tag5: document.tag5,
tag6: document.tag6,
tag7: document.tag7,
- // Number tags (5 slots)
number1: document.number1,
number2: document.number2,
number3: document.number3,
number4: document.number4,
number5: document.number5,
- // Date tags (2 slots)
date1: document.date1,
date2: document.date2,
- // Boolean tags (3 slots)
boolean1: document.boolean1,
boolean2: document.boolean2,
boolean3: document.boolean3,
@@ -578,7 +565,6 @@ export async function processDocumentAsync(
embeddingModel: 'text-embedding-3-small',
startOffset: chunk.metadata.startIndex,
endOffset: chunk.metadata.endIndex,
- // Copy text tags from document (7 slots)
tag1: documentTags.tag1,
tag2: documentTags.tag2,
tag3: documentTags.tag3,
@@ -586,16 +572,13 @@ export async function processDocumentAsync(
tag5: documentTags.tag5,
tag6: documentTags.tag6,
tag7: documentTags.tag7,
- // Copy number tags from document (5 slots)
number1: documentTags.number1,
number2: documentTags.number2,
number3: documentTags.number3,
number4: documentTags.number4,
number5: documentTags.number5,
- // Copy date tags from document (2 slots)
date1: documentTags.date1,
date2: documentTags.date2,
- // Copy boolean tags from document (3 slots)
boolean1: documentTags.boolean1,
boolean2: documentTags.boolean2,
boolean3: documentTags.boolean3,
@@ -719,16 +702,10 @@ export async function processDocumentAsync(
}
}
-/**
- * Check if Trigger.dev is available and configured
- */
export function isTriggerAvailable(): boolean {
return Boolean(env.TRIGGER_SECRET_KEY) && isTriggerDevEnabled
}
-/**
- * Process documents using Trigger.dev
- */
export async function processDocumentsWithTrigger(
documents: DocumentProcessingPayload[],
requestId: string
@@ -777,9 +754,6 @@ export async function processDocumentsWithTrigger(
}
}
-/**
- * Create document records in database with tags
- */
export async function createDocumentRecords(
documents: Array<{
filename: string
@@ -848,7 +822,6 @@ export async function createDocumentRecords(
processingStatus: 'pending' as const,
enabled: true,
uploadedAt: now,
- // Text tags - use processed tags if available, otherwise fall back to individual tag fields
tag1: processedTags.tag1 ?? docData.tag1 ?? null,
tag2: processedTags.tag2 ?? docData.tag2 ?? null,
tag3: processedTags.tag3 ?? docData.tag3 ?? null,
@@ -856,16 +829,13 @@ export async function createDocumentRecords(
tag5: processedTags.tag5 ?? docData.tag5 ?? null,
tag6: processedTags.tag6 ?? docData.tag6 ?? null,
tag7: processedTags.tag7 ?? docData.tag7 ?? null,
- // Number tags (5 slots)
number1: processedTags.number1 ?? null,
number2: processedTags.number2 ?? null,
number3: processedTags.number3 ?? null,
number4: processedTags.number4 ?? null,
number5: processedTags.number5 ?? null,
- // Date tags (2 slots)
date1: processedTags.date1 ?? null,
date2: processedTags.date2 ?? null,
- // Boolean tags (3 slots)
boolean1: processedTags.boolean1 ?? null,
boolean2: processedTags.boolean2 ?? null,
boolean3: processedTags.boolean3 ?? null,
@@ -897,9 +867,6 @@ export async function createDocumentRecords(
})
}
-/**
- * A single tag filter condition passed from the API layer.
- */
export interface TagFilterCondition {
tagSlot: string
fieldType: 'text' | 'number' | 'date' | 'boolean'
@@ -908,9 +875,6 @@ export interface TagFilterCondition {
valueTo?: string
}
-/**
- * Builds a Drizzle SQL condition from a tag filter.
- */
const ALLOWED_TAG_SLOTS = new Set([
'tag1',
'tag2',
@@ -1039,9 +1003,6 @@ function buildTagFilterCondition(filter: TagFilterCondition): SQL | undefined {
return undefined
}
-/**
- * Get documents for a knowledge base with filtering and pagination
- */
export async function getDocuments(
knowledgeBaseId: string,
options: {
@@ -1070,7 +1031,6 @@ export async function getDocuments(
processingError: string | null
enabled: boolean
uploadedAt: Date
- // Text tags
tag1: string | null
tag2: string | null
tag3: string | null
@@ -1078,20 +1038,16 @@ export async function getDocuments(
tag5: string | null
tag6: string | null
tag7: string | null
- // Number tags
number1: number | null
number2: number | null
number3: number | null
number4: number | null
number5: number | null
- // Date tags
date1: Date | null
date2: Date | null
- // Boolean tags
boolean1: boolean | null
boolean2: boolean | null
boolean3: boolean | null
- // Connector fields
connectorId: string | null
connectorType: string | null
sourceUrl: string | null
@@ -1188,7 +1144,6 @@ export async function getDocuments(
processingError: document.processingError,
enabled: document.enabled,
uploadedAt: document.uploadedAt,
- // Text tags (7 slots)
tag1: document.tag1,
tag2: document.tag2,
tag3: document.tag3,
@@ -1196,20 +1151,16 @@ export async function getDocuments(
tag5: document.tag5,
tag6: document.tag6,
tag7: document.tag7,
- // Number tags (5 slots)
number1: document.number1,
number2: document.number2,
number3: document.number3,
number4: document.number4,
number5: document.number5,
- // Date tags (2 slots)
date1: document.date1,
date2: document.date2,
- // Boolean tags (3 slots)
boolean1: document.boolean1,
boolean2: document.boolean2,
boolean3: document.boolean3,
- // Connector fields
connectorId: document.connectorId,
connectorType: knowledgeConnector.connectorType,
sourceUrl: document.sourceUrl,
@@ -1241,7 +1192,6 @@ export async function getDocuments(
processingError: doc.processingError,
enabled: doc.enabled,
uploadedAt: doc.uploadedAt,
- // Text tags
tag1: doc.tag1,
tag2: doc.tag2,
tag3: doc.tag3,
@@ -1249,20 +1199,16 @@ export async function getDocuments(
tag5: doc.tag5,
tag6: doc.tag6,
tag7: doc.tag7,
- // Number tags
number1: doc.number1,
number2: doc.number2,
number3: doc.number3,
number4: doc.number4,
number5: doc.number5,
- // Date tags
date1: doc.date1,
date2: doc.date2,
- // Boolean tags
boolean1: doc.boolean1,
boolean2: doc.boolean2,
boolean3: doc.boolean3,
- // Connector fields
connectorId: doc.connectorId,
connectorType: doc.connectorType ?? null,
sourceUrl: doc.sourceUrl,
@@ -1276,9 +1222,6 @@ export async function getDocuments(
}
}
-/**
- * Create a single document record
- */
export async function createSingleDocument(
documentData: {
filename: string
@@ -1320,7 +1263,6 @@ export async function createSingleDocument(
const now = new Date()
let processedTags: ProcessedDocumentTags = {
- // Text tags (7 slots)
tag1: documentData.tag1 ?? null,
tag2: documentData.tag2 ?? null,
tag3: documentData.tag3 ?? null,
@@ -1328,16 +1270,13 @@ export async function createSingleDocument(
tag5: documentData.tag5 ?? null,
tag6: documentData.tag6 ?? null,
tag7: documentData.tag7 ?? null,
- // Number tags (5 slots)
number1: null,
number2: null,
number3: null,
number4: null,
number5: null,
- // Date tags (2 slots)
date1: null,
date2: null,
- // Boolean tags (3 slots)
boolean1: null,
boolean2: null,
boolean3: null,
@@ -1417,9 +1356,6 @@ export async function createSingleDocument(
}
}
-/**
- * Perform bulk operations on documents
- */
export async function bulkDocumentOperation(
knowledgeBaseId: string,
operation: 'enable' | 'disable' | 'delete',
@@ -1509,9 +1445,6 @@ export async function bulkDocumentOperation(
}
}
-/**
- * Perform bulk operations on all documents matching a filter
- */
export async function bulkDocumentOperationByFilter(
knowledgeBaseId: string,
operation: 'enable' | 'disable' | 'delete',
@@ -1583,9 +1516,6 @@ export async function bulkDocumentOperationByFilter(
}
}
-/**
- * Mark a document as failed due to timeout
- */
export async function markDocumentAsFailedTimeout(
documentId: string,
processingStartedAt: Date,
@@ -1618,9 +1548,6 @@ export async function markDocumentAsFailedTimeout(
}
}
-/**
- * Retry processing a failed document
- */
export async function retryDocumentProcessing(
knowledgeBaseId: string,
documentId: string,
@@ -1673,9 +1600,6 @@ export async function retryDocumentProcessing(
}
}
-/**
- * Update a document with specified fields
- */
export async function updateDocument(
documentId: string,
updateData: {
@@ -1686,7 +1610,6 @@ export async function updateDocument(
characterCount?: number
processingStatus?: 'pending' | 'processing' | 'completed' | 'failed'
processingError?: string
- // Text tags
tag1?: string
tag2?: string
tag3?: string
@@ -1694,16 +1617,13 @@ export async function updateDocument(
tag5?: string
tag6?: string
tag7?: string
- // Number tags
number1?: string
number2?: string
number3?: string
number4?: string
number5?: string
- // Date tags
date1?: string
date2?: string
- // Boolean tags
boolean1?: string
boolean2?: string
boolean3?: string
@@ -1772,7 +1692,6 @@ export async function updateDocument(
boolean2: boolean | null
boolean3: boolean | null
}> = {}
- // All tag slots across all field types
const ALL_TAG_SLOTS = [
'tag1',
'tag2',
@@ -1794,7 +1713,6 @@ export async function updateDocument(
] as const
type TagSlot = (typeof ALL_TAG_SLOTS)[number]
- // Regular field updates
if (updateData.filename !== undefined) dbUpdateData.filename = updateData.filename
if (updateData.enabled !== undefined) dbUpdateData.enabled = updateData.enabled
if (updateData.chunkCount !== undefined) dbUpdateData.chunkCount = updateData.chunkCount
@@ -1812,26 +1730,21 @@ export async function updateDocument(
): string | number | Date | boolean | null => {
if (value === undefined || value === '') return null
- // Number slots
if (slot.startsWith('number')) {
return parseNumberValue(value)
}
- // Date slots
if (slot.startsWith('date')) {
return parseDateValue(value)
}
- // Boolean slots
if (slot.startsWith('boolean')) {
return parseBooleanValue(value) ?? false
}
- // Text slots: keep as string
return value || null
}
- // Type-safe access to tag slots in updateData
type UpdateDataWithTags = typeof updateData & Record
const typedUpdateData = updateData as UpdateDataWithTags
@@ -2044,9 +1957,6 @@ export async function hardDeleteDocuments(
return existingIds.length
}
-/**
- * Hard delete a document.
- */
export async function deleteDocument(
documentId: string,
requestId: string
diff --git a/apps/sim/lib/knowledge/types.ts b/apps/sim/lib/knowledge/types.ts
index b761597c790..6fe1a8bbaff 100644
--- a/apps/sim/lib/knowledge/types.ts
+++ b/apps/sim/lib/knowledge/types.ts
@@ -1,18 +1,16 @@
+import type { ChunkingStrategy, StrategyOptions } from '@/lib/chunkers/types'
+
/**
- * Configuration for document chunking in knowledge bases
- *
* Units:
- * - maxSize: Maximum chunk size in TOKENS (1 token ≈ 4 characters)
- * - minSize: Minimum chunk size in CHARACTERS (floor to avoid tiny fragments)
- * - overlap: Overlap between chunks in TOKENS (1 token ≈ 4 characters)
+ * - maxSize/overlap: TOKENS (1 token ≈ 4 characters)
+ * - minSize: CHARACTERS
*/
export interface ChunkingConfig {
- /** Maximum chunk size in tokens (default: 1024, range: 100-4000) */
maxSize: number
- /** Minimum chunk size in characters (default: 100, range: 1-2000) */
minSize: number
- /** Overlap between chunks in tokens (default: 200, range: 0-500) */
overlap: number
+ strategy?: ChunkingStrategy
+ strategyOptions?: StrategyOptions
}
export interface KnowledgeBaseWithCounts {
@@ -63,19 +61,16 @@ export interface UpdateTagDefinitionData {
fieldType?: string
}
-/** Tag filter for knowledge base search */
export interface StructuredFilter {
- tagName?: string // Human-readable name (input from frontend)
- tagSlot: string // Database column (resolved from tagName)
+ tagName?: string
+ tagSlot: string
fieldType: string
operator: string
value: string | number | boolean
valueTo?: string | number
}
-/** Processed document tags ready for database storage */
export interface ProcessedDocumentTags {
- // Text tags
tag1: string | null
tag2: string | null
tag3: string | null
@@ -83,39 +78,29 @@ export interface ProcessedDocumentTags {
tag5: string | null
tag6: string | null
tag7: string | null
- // Number tags
number1: number | null
number2: number | null
number3: number | null
number4: number | null
number5: number | null
- // Date tags
date1: Date | null
date2: Date | null
- // Boolean tags
boolean1: boolean | null
boolean2: boolean | null
boolean3: boolean | null
- // Index signature for dynamic access
[key: string]: string | number | Date | boolean | null
}
-/**
- * Frontend/API Types
- * These types use string dates for JSON serialization
- */
+/** These types use string dates for JSON serialization */
-/** Extended chunking config with optional fields */
export interface ExtendedChunkingConfig extends ChunkingConfig {
chunkSize?: number
minCharactersPerChunk?: number
recipe?: string
lang?: string
- strategy?: 'recursive' | 'semantic' | 'sentence' | 'paragraph'
[key: string]: unknown
}
-/** Knowledge base data for API responses */
export interface KnowledgeBaseData {
id: string
userId: string
@@ -132,7 +117,6 @@ export interface KnowledgeBaseData {
connectorTypes?: string[]
}
-/** Document data for API responses */
export interface DocumentData {
id: string
knowledgeBaseId: string
@@ -171,7 +155,6 @@ export interface DocumentData {
sourceUrl?: string | null
}
-/** Chunk data for API responses */
export interface ChunkData {
id: string
chunkIndex: number
@@ -202,7 +185,6 @@ export interface ChunkData {
updatedAt: string
}
-/** Pagination info for chunks */
export interface ChunksPagination {
total: number
limit: number
@@ -210,7 +192,6 @@ export interface ChunksPagination {
hasMore: boolean
}
-/** Pagination info for documents */
export interface DocumentsPagination {
total: number
limit: number
diff --git a/apps/sim/lib/uploads/utils/file-utils.ts b/apps/sim/lib/uploads/utils/file-utils.ts
index 007014f5f42..95dd217c297 100644
--- a/apps/sim/lib/uploads/utils/file-utils.ts
+++ b/apps/sim/lib/uploads/utils/file-utils.ts
@@ -366,7 +366,7 @@ export function validateKnowledgeBaseFile(
return null
}
- return `File "${file.name}" has an unsupported format. Please use PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSON, YAML, or YML files.`
+ return `File "${file.name}" has an unsupported format. Please use PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSON, JSONL, YAML, or YML files.`
}
/**
diff --git a/apps/sim/lib/uploads/utils/validation.ts b/apps/sim/lib/uploads/utils/validation.ts
index 3752e421d79..10ce9364bec 100644
--- a/apps/sim/lib/uploads/utils/validation.ts
+++ b/apps/sim/lib/uploads/utils/validation.ts
@@ -28,6 +28,7 @@ export const SUPPORTED_DOCUMENT_EXTENSIONS = [
'html',
'htm',
'json',
+ 'jsonl',
'yaml',
'yml',
] as const
@@ -135,6 +136,7 @@ export const SUPPORTED_MIME_TYPES: Record
html: ['text/html', 'application/xhtml+xml'],
htm: ['text/html', 'application/xhtml+xml'],
json: ['application/json', 'text/json', 'application/x-json'],
+ jsonl: ['application/jsonl', 'application/x-jsonlines', 'text/jsonl', 'application/octet-stream'],
yaml: ['text/yaml', 'text/x-yaml', 'application/yaml', 'application/x-yaml'],
yml: ['text/yaml', 'text/x-yaml', 'application/yaml', 'application/x-yaml'],
}