Files
2026-06-15 23:48:37 +08:00

145 lines
3.1 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Package chunker 提供文本智能分片服务,将长文本切分为适合向量化的片段
package chunker
import (
"strings"
"unicode/utf8"
)
// Options configures how ChunkText splits text into chunks.
type Options struct {
ChunkSize int // maximum number of characters (runes) per chunk; DefaultOptions uses 500, and ChunkText substitutes 500 when the value is <= 0
Overlap int // number of characters repeated between consecutive chunks; DefaultOptions uses 50, and ChunkText treats negative values as 0
Separators []string // separators to split on, in priority order; ChunkText falls back to the DefaultOptions list when this is nil
}
// DefaultOptions returns the default chunking configuration: 500-character
// chunks, a 50-character overlap, and separators ordered from coarsest to
// finest (paragraph break, line break, sentence punctuation, space).
//
// The full-width punctuation marks are written as \u escapes so they cannot
// be dropped or confused with their ASCII look-alikes by tooling. No entry
// may be the empty string: strings.Split with an empty separator splits
// after every UTF-8 sequence, which turns that entry into a per-character
// cut and keeps every later separator from ever being tried.
func DefaultOptions() Options {
	return Options{
		ChunkSize: 500,
		Overlap:   50,
		Separators: []string{
			"\n\n", "\n",
			"\u3002", ".", // ideographic full stop, ASCII period
			"\uff01", "!", // full-width and ASCII exclamation mark
			"\uff1f", "?", // full-width and ASCII question mark
			"\uff1b", ";", // full-width and ASCII semicolon
			" ",
		},
	}
}
// ChunkText splits text into chunks, preferring paragraph and sentence
// boundaries over cutting in the middle of a sentence.
//
// The input is trimmed first; blank input yields nil. A ChunkSize <= 0 is
// replaced by 500, a negative Overlap disables overlapping, and nil
// Separators are replaced by the DefaultOptions list. Text that already fits
// in one chunk is returned as a single-element slice.
//
// Overlap is applied after splitting by prepending the tail of the previous
// chunk, so a returned chunk may be longer than ChunkSize by up to Overlap
// characters. Every returned chunk is trimmed and non-empty.
func ChunkText(text string, opts Options) []string {
	trimmed := strings.TrimSpace(text)
	if len(trimmed) == 0 {
		return nil
	}

	// Resolve effective settings without touching the caller's struct fields.
	size := opts.ChunkSize
	if size <= 0 {
		size = 500
	}
	seps := opts.Separators
	if seps == nil {
		seps = DefaultOptions().Separators
	}

	// Short input needs no splitting at all.
	if utf8.RuneCountInString(trimmed) <= size {
		return []string{trimmed}
	}

	pieces := recursiveSplit(trimmed, seps, size)
	if overlap := opts.Overlap; overlap > 0 && len(pieces) > 1 {
		pieces = addOverlap(pieces, overlap)
	}

	// Drop pieces that are empty once surrounding whitespace is removed.
	var kept []string
	for _, piece := range pieces {
		if s := strings.TrimSpace(piece); s != "" {
			kept = append(kept, s)
		}
	}
	return kept
}
// recursiveSplit breaks text into pieces of at most chunkSize characters
// (runes), trying the separators in priority order.
//
// The first non-empty separator that occurs in text is used: the parts
// between its occurrences are greedily packed back together (re-joined with
// that separator) while the result still fits in chunkSize. A single part
// that is itself too long is split again using only the separators that come
// after the current one. When no separator applies, the text is cut every
// chunkSize runes.
//
// The separator that falls on a boundary between two emitted pieces is not
// included in either piece.
//
// Empty separators are skipped: strings.Split with "" splits after every
// UTF-8 sequence, which would act as a hard cut and keep all lower-priority
// separators from being tried. A non-positive chunkSize cannot be satisfied,
// so text is returned unsplit rather than looping forever or slicing out of
// range.
func recursiveSplit(text string, separators []string, chunkSize int) []string {
	if chunkSize <= 0 || utf8.RuneCountInString(text) <= chunkSize {
		return []string{text}
	}

	for i, sep := range separators {
		if sep == "" {
			continue
		}
		parts := strings.Split(text, sep)
		if len(parts) <= 1 {
			continue
		}

		// Separators up to and including sep cannot occur inside any part,
		// so oversized parts only need the finer ones that follow.
		finer := separators[i+1:]

		var result []string
		current := ""
		for _, part := range parts {
			candidate := part
			if current != "" {
				candidate = current + sep + part
			}
			if utf8.RuneCountInString(candidate) <= chunkSize {
				current = candidate
				continue
			}

			// candidate overflows: flush what has been packed so far.
			if current != "" {
				result = append(result, current)
			}
			if utf8.RuneCountInString(part) > chunkSize {
				result = append(result, recursiveSplit(part, finer, chunkSize)...)
				current = ""
			} else {
				current = part
			}
		}
		if current != "" {
			result = append(result, current)
		}
		return result
	}

	// No usable separator: hard cut on rune boundaries.
	runes := []rune(text)
	result := make([]string, 0, (len(runes)+chunkSize-1)/chunkSize)
	for start := 0; start < len(runes); start += chunkSize {
		end := start + chunkSize
		if end > len(runes) {
			end = len(runes)
		}
		result = append(result, string(runes[start:end]))
	}
	return result
}
// addOverlap prefixes every chunk after the first with the last overlap
// characters (runes) of the chunk that precedes it in the input. The tail is
// always taken from the original previous chunk, not from its already
// prefixed form, and the whole previous chunk is used when it is shorter
// than overlap. The input slice is returned unchanged when there are fewer
// than two chunks or overlap is not positive.
func addOverlap(chunks []string, overlap int) []string {
	if overlap <= 0 || len(chunks) < 2 {
		return chunks
	}

	out := make([]string, len(chunks))
	out[0] = chunks[0]
	for i, chunk := range chunks[1:] {
		// i indexes chunks[1:], so chunks[i] is the element just before chunk.
		prev := []rune(chunks[i])
		from := len(prev) - overlap
		if from < 0 {
			from = 0
		}
		out[i+1] = string(prev[from:]) + chunk
	}
	return out
}