// Package chunker 提供文本智能分片服务,将长文本切分为适合向量化的片段 package chunker import ( "strings" "unicode/utf8" ) // Options 分片配置 type Options struct { ChunkSize int // 每块最大字符数,默认 500 Overlap int // 块间重叠字符数,默认 50 Separators []string // 分隔符列表(按优先级) } // DefaultOptions 默认分片配置 func DefaultOptions() Options { return Options{ ChunkSize: 500, Overlap: 50, Separators: []string{ "\n\n", "\n", "。", ".", "!", "!", "?", "?", ";", ";", " ", }, } } // ChunkText 智能分片:优先按段落/句子边界切分,避免在句子中间断开 func ChunkText(text string, opts Options) []string { text = strings.TrimSpace(text) if text == "" { return nil } if opts.ChunkSize <= 0 { opts.ChunkSize = 500 } if opts.Overlap < 0 { opts.Overlap = 0 } if opts.Separators == nil { opts.Separators = DefaultOptions().Separators } runeLen := utf8.RuneCountInString(text) if runeLen <= opts.ChunkSize { return []string{text} } // 递归分片 chunks := recursiveSplit(text, opts.Separators, opts.ChunkSize) // 添加重叠 if opts.Overlap > 0 && len(chunks) > 1 { chunks = addOverlap(chunks, opts.Overlap) } // 过滤空片段 var result []string for _, c := range chunks { c = strings.TrimSpace(c) if c != "" { result = append(result, c) } } return result } // recursiveSplit 递归分片 func recursiveSplit(text string, separators []string, chunkSize int) []string { if utf8.RuneCountInString(text) <= chunkSize { return []string{text} } for _, sep := range separators { parts := strings.Split(text, sep) if len(parts) <= 1 { continue } var result []string current := "" for _, part := range parts { candidate := current if candidate != "" { candidate += sep } candidate += part if utf8.RuneCountInString(candidate) <= chunkSize { current = candidate } else { if current != "" { result = append(result, current) } if utf8.RuneCountInString(part) > chunkSize { // 继续用更细的分隔符拆分 nextSeps := separators[1:] if len(nextSeps) == 0 { nextSeps = nil } result = append(result, recursiveSplit(part, nextSeps, chunkSize)...) current = "" } else { current = part } } } if current != "" { result = append(result, current) } return result } // 无分隔符可用,按字符硬切 runes := []rune(text) var result []string for i := 0; i < len(runes); i += chunkSize { end := i + chunkSize if end > len(runes) { end = len(runes) } result = append(result, string(runes[i:end])) } return result } // addOverlap 给分片添加重叠区域 func addOverlap(chunks []string, overlap int) []string { if len(chunks) <= 1 || overlap <= 0 { return chunks } result := []string{chunks[0]} for i := 1; i < len(chunks); i++ { prevRunes := []rune(chunks[i-1]) overlapStart := len(prevRunes) - overlap if overlapStart < 0 { overlapStart = 0 } prevTail := string(prevRunes[overlapStart:]) result = append(result, prevTail+chunks[i]) } return result }