Initial commit: GovAI 政务AI平台
This commit is contained in:
@@ -0,0 +1,144 @@
|
||||
// Package chunker 提供文本智能分片服务,将长文本切分为适合向量化的片段
|
||||
package chunker
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// Options configures how ChunkText splits text into chunks.
type Options struct {
	// ChunkSize is the maximum number of runes in a chunk before any overlap
	// is prepended. ChunkText substitutes 500 when the value is <= 0.
	ChunkSize int

	// Overlap is the number of trailing runes of the previous chunk that is
	// prepended to each following chunk. DefaultOptions uses 50; a zero value
	// means no overlap, and ChunkText treats negative values as 0.
	Overlap int

	// Separators lists the split points to try, in priority order. ChunkText
	// falls back to the DefaultOptions list only when this field is nil.
	Separators []string
}
|
||||
|
||||
// DefaultOptions 默认分片配置
|
||||
func DefaultOptions() Options {
|
||||
return Options{
|
||||
ChunkSize: 500,
|
||||
Overlap: 50,
|
||||
Separators: []string{
|
||||
"\n\n", "\n", "。", ".", "!", "!", "?", "?", ";", ";", " ",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// ChunkText 智能分片:优先按段落/句子边界切分,避免在句子中间断开
|
||||
func ChunkText(text string, opts Options) []string {
|
||||
text = strings.TrimSpace(text)
|
||||
if text == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
if opts.ChunkSize <= 0 {
|
||||
opts.ChunkSize = 500
|
||||
}
|
||||
if opts.Overlap < 0 {
|
||||
opts.Overlap = 0
|
||||
}
|
||||
if opts.Separators == nil {
|
||||
opts.Separators = DefaultOptions().Separators
|
||||
}
|
||||
|
||||
runeLen := utf8.RuneCountInString(text)
|
||||
if runeLen <= opts.ChunkSize {
|
||||
return []string{text}
|
||||
}
|
||||
|
||||
// 递归分片
|
||||
chunks := recursiveSplit(text, opts.Separators, opts.ChunkSize)
|
||||
|
||||
// 添加重叠
|
||||
if opts.Overlap > 0 && len(chunks) > 1 {
|
||||
chunks = addOverlap(chunks, opts.Overlap)
|
||||
}
|
||||
|
||||
// 过滤空片段
|
||||
var result []string
|
||||
for _, c := range chunks {
|
||||
c = strings.TrimSpace(c)
|
||||
if c != "" {
|
||||
result = append(result, c)
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// recursiveSplit breaks text into pieces of at most chunkSize runes.
//
// Separators are tried in order. The first one that occurs in text is used to
// split it, and neighbouring parts are greedily merged back together (with
// the separator re-inserted between them) while the merged piece still fits.
// The separator is not kept at the boundary between two pieces. A single part
// that is still too long is split again using only the separators that follow
// the current one. When no separator occurs in text, it is cut every
// chunkSize runes.
//
// Text that already fits is returned as a single piece. A chunkSize <= 0
// cannot be honored, so text is returned unsplit in that case as well.
func recursiveSplit(text string, separators []string, chunkSize int) []string {
	// The chunkSize guard also protects the hard-cut loop below, which would
	// never advance with a step of 0 and would slice out of range with a
	// negative step.
	if chunkSize <= 0 || utf8.RuneCountInString(text) <= chunkSize {
		return []string{text}
	}

	for i, sep := range separators {
		parts := strings.Split(text, sep)
		if len(parts) <= 1 {
			continue // separator does not occur; try the next one
		}

		// Separators up to and including sep cannot occur inside a part, so
		// only the remaining ones are worth trying on an oversized part.
		finer := separators[i+1:]
		sepLen := utf8.RuneCountInString(sep)

		var (
			pieces []string
			cur    strings.Builder // piece currently being assembled
			curLen int             // rune count of cur
		)
		for _, part := range parts {
			partLen := utf8.RuneCountInString(part)

			// Rune count of the piece if part were merged into cur.
			need := partLen
			if cur.Len() > 0 {
				need += curLen + sepLen
			}
			if need <= chunkSize {
				if cur.Len() > 0 {
					cur.WriteString(sep)
				}
				cur.WriteString(part)
				curLen = need
				continue
			}

			// part does not fit: emit what has been assembled so far.
			if cur.Len() > 0 {
				pieces = append(pieces, cur.String())
				cur.Reset()
			}
			curLen = 0

			if partLen > chunkSize {
				pieces = append(pieces, recursiveSplit(part, finer, chunkSize)...)
				continue
			}
			cur.WriteString(part)
			curLen = partLen
		}
		if cur.Len() > 0 {
			pieces = append(pieces, cur.String())
		}
		return pieces
	}

	// No separator occurs in text: hard cut on rune boundaries.
	runes := []rune(text)
	pieces := make([]string, 0, (len(runes)+chunkSize-1)/chunkSize)
	for start := 0; start < len(runes); start += chunkSize {
		end := start + chunkSize
		if end > len(runes) {
			end = len(runes)
		}
		pieces = append(pieces, string(runes[start:end]))
	}
	return pieces
}
|
||||
|
||||
// addOverlap prepends to every chunk except the first the last overlap runes
// of the chunk that precedes it in the input, so overlaps never accumulate
// from one chunk to the next. When the preceding chunk has fewer than overlap
// runes, all of it is prepended. The input slice is returned unchanged when
// it has at most one chunk or overlap is not positive.
func addOverlap(chunks []string, overlap int) []string {
	if len(chunks) <= 1 || overlap <= 0 {
		return chunks
	}

	out := make([]string, 0, len(chunks))
	out = append(out, chunks[0])
	for i := 1; i < len(chunks); i++ {
		out = append(out, tailRunes(chunks[i-1], overlap)+chunks[i])
	}
	return out
}

// tailRunes returns the last n runes of s, or all of s when it has fewer. It
// scans backwards from the end, so only the tail is decoded and the returned
// bytes are exactly those of s.
func tailRunes(s string, n int) string {
	start := len(s)
	for ; n > 0 && start > 0; n-- {
		_, size := utf8.DecodeLastRuneInString(s[:start])
		start -= size
	}
	return s[start:]
}
|
||||
Reference in New Issue
Block a user