51 lines
2.0 KiB
SQL
51 lines
2.0 KiB
SQL
-- 对现有公安局知识库文档执行分片(纯文本分片,不含embedding)
|
|
-- 使用 regexp_split_to_table 按段落分割,再合并为 ~500 字的 chunk
|
|
|
|
DO $$
|
|
DECLARE
|
|
doc RECORD;
|
|
para RECORD;
|
|
current_chunk TEXT := '';
|
|
chunk_idx INTEGER := 0;
|
|
chunk_size INTEGER := 500;
|
|
BEGIN
|
|
FOR doc IN
|
|
SELECT kd.id AS doc_id, kd.kb_id, kd.content, kd.name
|
|
FROM knowledge_documents kd
|
|
JOIN knowledge_bases kb ON kd.kb_id = kb.id
|
|
WHERE kb.org_id = 'a0000000-0000-0000-0000-000000000002'
|
|
AND kd.content IS NOT NULL AND kd.content != ''
|
|
LOOP
|
|
DELETE FROM knowledge_chunks WHERE doc_id = doc.doc_id;
|
|
current_chunk := '';
|
|
chunk_idx := 0;
|
|
|
|
-- 按段落(双换行或单换行)拆分
|
|
FOR para IN
|
|
SELECT unnest(regexp_split_to_array(doc.content, E'\n')) AS line
|
|
LOOP
|
|
IF char_length(current_chunk) + char_length(para.line) + 1 > chunk_size AND current_chunk != '' THEN
|
|
INSERT INTO knowledge_chunks (kb_id, doc_id, chunk_index, content, char_count)
|
|
VALUES (doc.kb_id, doc.doc_id, chunk_idx, trim(current_chunk), char_length(trim(current_chunk)));
|
|
chunk_idx := chunk_idx + 1;
|
|
current_chunk := '';
|
|
END IF;
|
|
IF current_chunk = '' THEN
|
|
current_chunk := para.line;
|
|
ELSE
|
|
current_chunk := current_chunk || E'\n' || para.line;
|
|
END IF;
|
|
END LOOP;
|
|
|
|
-- 最后剩余的内容
|
|
IF char_length(trim(current_chunk)) > 0 THEN
|
|
INSERT INTO knowledge_chunks (kb_id, doc_id, chunk_index, content, char_count)
|
|
VALUES (doc.kb_id, doc.doc_id, chunk_idx, trim(current_chunk), char_length(trim(current_chunk)));
|
|
chunk_idx := chunk_idx + 1;
|
|
END IF;
|
|
|
|
UPDATE knowledge_documents SET chunk_count = chunk_idx WHERE id = doc.doc_id;
|
|
RAISE NOTICE '文档 [%] 已分片: % chunks', doc.name, chunk_idx;
|
|
END LOOP;
|
|
END $$;
|