// Package chunker provides smart text chunking: it splits long text into
// pieces that are suitable for vectorization (embedding).
package chunker
|
||
|
||
import (
|
||
"strings"
|
||
"unicode/utf8"
|
||
)
|
||
|
||
// Options configures how ChunkText splits text.
type Options struct {
	// ChunkSize is the maximum number of characters (runes) in a chunk
	// produced by splitting; the default is 500. ChunkText prepends the
	// overlap after splitting, so this bounds the chunk before the overlap
	// is added.
	ChunkSize int
	// Overlap is the number of characters (runes) repeated from the end of
	// the previous chunk at the start of the next one; the default is 50.
	Overlap int
	// Separators lists the strings to split on, in priority order.
	Separators []string
}
|
||
|
||
// DefaultOptions 默认分片配置
|
||
func DefaultOptions() Options {
|
||
return Options{
|
||
ChunkSize: 500,
|
||
Overlap: 50,
|
||
Separators: []string{
|
||
"\n\n", "\n", "。", ".", "!", "!", "?", "?", ";", ";", " ",
|
||
},
|
||
}
|
||
}
|
||
|
||
// ChunkText 智能分片:优先按段落/句子边界切分,避免在句子中间断开
|
||
func ChunkText(text string, opts Options) []string {
|
||
text = strings.TrimSpace(text)
|
||
if text == "" {
|
||
return nil
|
||
}
|
||
|
||
if opts.ChunkSize <= 0 {
|
||
opts.ChunkSize = 500
|
||
}
|
||
if opts.Overlap < 0 {
|
||
opts.Overlap = 0
|
||
}
|
||
if opts.Separators == nil {
|
||
opts.Separators = DefaultOptions().Separators
|
||
}
|
||
|
||
runeLen := utf8.RuneCountInString(text)
|
||
if runeLen <= opts.ChunkSize {
|
||
return []string{text}
|
||
}
|
||
|
||
// 递归分片
|
||
chunks := recursiveSplit(text, opts.Separators, opts.ChunkSize)
|
||
|
||
// 添加重叠
|
||
if opts.Overlap > 0 && len(chunks) > 1 {
|
||
chunks = addOverlap(chunks, opts.Overlap)
|
||
}
|
||
|
||
// 过滤空片段
|
||
var result []string
|
||
for _, c := range chunks {
|
||
c = strings.TrimSpace(c)
|
||
if c != "" {
|
||
result = append(result, c)
|
||
}
|
||
}
|
||
return result
|
||
}
|
||
|
||
// recursiveSplit splits text into chunks of at most chunkSize runes.
//
// The separators are tried in order. The first one that occurs in text is
// used to cut it into parts, and consecutive parts are greedily merged back
// together (joined by the separator) while the result still fits. A single
// part that is still too long is split again using only the separators that
// come after the current one. When no separator occurs in text, it is cut
// into fixed-size pieces of chunkSize runes.
//
// At a chunk boundary the separator is handled as follows:
//   - a whitespace-only (or empty) separator is dropped;
//   - any other separator, such as sentence punctuation, is kept at the end
//     of the chunk it terminates, unless that would push the chunk past
//     chunkSize, in which case it is dropped.
//
// Separators that precede the first non-empty part are not preserved.
// A non-positive chunkSize is treated as "no limit" and returns text whole.
func recursiveSplit(text string, separators []string, chunkSize int) []string {
	// The chunkSize guard also protects the fixed-size loop below, which
	// would otherwise never advance (0) or slice backwards (negative).
	if chunkSize <= 0 || utf8.RuneCountInString(text) <= chunkSize {
		return []string{text}
	}

	for i, sep := range separators {
		parts := strings.Split(text, sep)
		if len(parts) <= 1 {
			// sep does not occur in text; try the next, finer separator.
			continue
		}

		// keepLen is the rune length of sep when it carries content worth
		// keeping (for example punctuation), and 0 for whitespace-only or
		// empty separators.
		keepLen := 0
		if strings.TrimSpace(sep) != "" {
			keepLen = utf8.RuneCountInString(sep)
		}
		// closeChunk re-attaches the boundary separator to a finished chunk
		// when that keeps the chunk within chunkSize.
		closeChunk := func(chunk string) string {
			if keepLen > 0 && utf8.RuneCountInString(chunk)+keepLen <= chunkSize {
				return chunk + sep
			}
			return chunk
		}
		// Parts contain neither sep nor any earlier separator (those did not
		// occur in text at all), so only the later separators can still match.
		finer := separators[i+1:]

		var result []string
		current := ""
		for j, part := range parts {
			candidate := part
			if current != "" {
				candidate = current + sep + part
			}
			if utf8.RuneCountInString(candidate) <= chunkSize {
				current = candidate
				continue
			}

			// part does not fit behind current: finish the current chunk.
			if current != "" {
				result = append(result, closeChunk(current))
				current = ""
			}
			if utf8.RuneCountInString(part) <= chunkSize {
				current = part
				continue
			}

			// part alone is too long: split it with the finer separators.
			sub := recursiveSplit(part, finer, chunkSize)
			if j < len(parts)-1 && len(sub) > 0 {
				// A separator followed this part in the original text.
				sub[len(sub)-1] = closeChunk(sub[len(sub)-1])
			}
			result = append(result, sub...)
		}
		if current != "" {
			result = append(result, current)
		}
		return result
	}

	// No separator applies: cut into fixed-size pieces on rune boundaries.
	runes := []rune(text)
	result := make([]string, 0, (len(runes)+chunkSize-1)/chunkSize)
	for start := 0; start < len(runes); start += chunkSize {
		end := start + chunkSize
		if end > len(runes) {
			end = len(runes)
		}
		result = append(result, string(runes[start:end]))
	}
	return result
}
|
||
|
||
// addOverlap returns a copy of chunks in which every chunk after the first
// is prefixed with the last overlap runes of the chunk that precedes it in
// the input (not of the already-prefixed result). A preceding chunk shorter
// than overlap is prepended whole.
//
// chunks is returned unchanged when it has at most one element or when
// overlap is not positive.
func addOverlap(chunks []string, overlap int) []string {
	if len(chunks) <= 1 || overlap <= 0 {
		return chunks
	}

	result := make([]string, 0, len(chunks))
	result = append(result, chunks[0])
	for i := 1; i < len(chunks); i++ {
		prev := chunks[i-1]
		// Walk backwards over at most overlap runes to find the byte offset
		// of the tail, instead of converting the whole chunk to []rune.
		start := len(prev)
		for n := 0; n < overlap && start > 0; n++ {
			_, size := utf8.DecodeLastRuneInString(prev[:start])
			start -= size
		}
		result = append(result, prev[start:]+chunks[i])
	}
	return result
}
|