Initial commit: GovAI 政务AI平台

This commit is contained in:
freedakgmail
2026-06-15 23:48:37 +08:00
commit 0f490f72a9
245 changed files with 51669 additions and 0 deletions
+144
View File
@@ -0,0 +1,144 @@
// Package chunker 提供文本智能分片服务,将长文本切分为适合向量化的片段
package chunker
import (
"strings"
"unicode/utf8"
)
// Options configures how ChunkText splits text into chunks.
type Options struct {
	// ChunkSize is the maximum number of characters per chunk (default 500).
	ChunkSize int
	// Overlap is the number of characters shared between adjacent chunks
	// (default 50).
	Overlap int
	// Separators lists the split delimiters in priority order.
	Separators []string
}
// DefaultOptions returns the default chunking configuration: a chunk size of
// 500 characters, an overlap of 50 characters, and separators ordered from
// coarsest to finest (paragraph break, line break, sentence and clause
// punctuation in both full-width and ASCII forms, then a single space).
func DefaultOptions() Options {
	return Options{
		ChunkSize: 500,
		Overlap:   50,
		// Every entry must be non-empty: splitting on "" breaks the text into
		// single characters, which would stop any separator listed after it
		// from ever being tried. The full-width marks are spelled as escapes
		// so they cannot be silently lost by encoding-unaware tooling.
		Separators: []string{
			"\n\n", "\n",
			"。", ".",
			"\uff01", "!", // full-width and ASCII exclamation mark
			"\uff1f", "?", // full-width and ASCII question mark
			"\uff1b", ";", // full-width and ASCII semicolon
			" ",
		},
	}
}
// ChunkText splits text into chunks, preferring paragraph and sentence
// boundaries over cuts in the middle of a sentence.
//
// The input is trimmed first; blank input yields nil. Unset or invalid options
// are replaced with defaults: a non-positive ChunkSize becomes 500, a negative
// Overlap becomes 0, and nil Separators fall back to DefaultOptions. Text that
// already fits in one chunk is returned as a single-element slice.
//
// Overlap is prepended after splitting, so a returned chunk may be longer than
// ChunkSize by up to Overlap characters. Every returned chunk is trimmed and
// blank chunks are dropped.
func ChunkText(text string, opts Options) []string {
	trimmed := strings.TrimSpace(text)
	if len(trimmed) == 0 {
		return nil
	}

	// Work on normalised local copies of the caller's settings.
	size, overlap, seps := opts.ChunkSize, opts.Overlap, opts.Separators
	if size <= 0 {
		size = 500
	}
	if overlap < 0 {
		overlap = 0
	}
	if seps == nil {
		seps = DefaultOptions().Separators
	}

	// Nothing to split when the whole text fits in a single chunk.
	if utf8.RuneCountInString(trimmed) <= size {
		return []string{trimmed}
	}

	pieces := recursiveSplit(trimmed, seps, size)
	if overlap > 0 && len(pieces) > 1 {
		pieces = addOverlap(pieces, overlap)
	}

	// Trim each piece and keep only the non-blank ones.
	var kept []string
	for i := range pieces {
		if s := strings.TrimSpace(pieces[i]); s != "" {
			kept = append(kept, s)
		}
	}
	return kept
}
// recursiveSplit breaks text into pieces of at most chunkSize runes.
//
// Separators are tried in order. The first one that occurs in text is used to
// split it, and neighbouring parts are merged back together (re-joined with
// that separator) for as long as the merged piece still fits. A part that is
// too long on its own is split again using only the separators listed after
// the current one. When no separator applies, the text is cut every chunkSize
// runes.
//
// Empty separators are skipped: splitting on "" yields single characters,
// which is equivalent to the final hard cut but would prevent any separator
// listed after it from being tried.
//
// The separator that falls between two emitted pieces is not kept in either
// of them. A non-positive chunkSize cannot be satisfied, so text is returned
// as a single piece in that case.
func recursiveSplit(text string, separators []string, chunkSize int) []string {
	// The chunkSize guard also protects the hard-cut loop below, which would
	// never advance with a step of zero.
	if chunkSize <= 0 || utf8.RuneCountInString(text) <= chunkSize {
		return []string{text}
	}

	for i, sep := range separators {
		if sep == "" || !strings.Contains(text, sep) {
			continue
		}

		// Separators up to and including sep cannot occur inside a part, so
		// oversized parts only need the remaining, finer ones.
		finer := separators[i+1:]

		var result []string
		current := ""
		for _, part := range strings.Split(text, sep) {
			candidate := part
			if current != "" {
				candidate = current + sep + part
			}
			if utf8.RuneCountInString(candidate) <= chunkSize {
				current = candidate
				continue
			}

			// The part does not fit next to what was accumulated: flush it.
			if current != "" {
				result = append(result, current)
			}
			if utf8.RuneCountInString(part) > chunkSize {
				result = append(result, recursiveSplit(part, finer, chunkSize)...)
				current = ""
				continue
			}
			current = part
		}
		if current != "" {
			result = append(result, current)
		}
		return result
	}

	// No usable separator: cut on rune boundaries every chunkSize runes.
	runes := []rune(text)
	result := make([]string, 0, (len(runes)+chunkSize-1)/chunkSize)
	for start := 0; start < len(runes); start += chunkSize {
		end := start + chunkSize
		if end > len(runes) {
			end = len(runes)
		}
		result = append(result, string(runes[start:end]))
	}
	return result
}
// addOverlap prefixes every chunk after the first with the last overlap runes
// of the chunk that precedes it in the input (the whole preceding chunk when
// it is shorter than that). The first chunk is copied unchanged. With fewer
// than two chunks or a non-positive overlap the input slice is returned as is.
func addOverlap(chunks []string, overlap int) []string {
	if overlap <= 0 || len(chunks) < 2 {
		return chunks
	}

	out := make([]string, len(chunks))
	out[0] = chunks[0]
	for idx, chunk := range chunks[1:] {
		// The range starts at the second chunk, so chunks[idx] is the
		// predecessor of chunk in the original slice.
		prev := []rune(chunks[idx])
		from := len(prev) - overlap
		if from < 0 {
			from = 0
		}
		out[idx+1] = string(prev[from:]) + chunk
	}
	return out
}