Files
GovAI/server/migrations/seed_gongan_chunks.sql
2026-06-15 23:48:37 +08:00

51 lines
2.0 KiB
SQL

-- 对现有公安局知识库文档执行分片(纯文本分片,不含embedding)
-- 使用 regexp_split_to_table 按段落分割,再合并为 ~500 字的 chunk
DO $$
DECLARE
doc RECORD;
para RECORD;
current_chunk TEXT := '';
chunk_idx INTEGER := 0;
chunk_size INTEGER := 500;
BEGIN
FOR doc IN
SELECT kd.id AS doc_id, kd.kb_id, kd.content, kd.name
FROM knowledge_documents kd
JOIN knowledge_bases kb ON kd.kb_id = kb.id
WHERE kb.org_id = 'a0000000-0000-0000-0000-000000000002'
AND kd.content IS NOT NULL AND kd.content != ''
LOOP
DELETE FROM knowledge_chunks WHERE doc_id = doc.doc_id;
current_chunk := '';
chunk_idx := 0;
-- 按段落(双换行或单换行)拆分
FOR para IN
SELECT unnest(regexp_split_to_array(doc.content, E'\n')) AS line
LOOP
IF char_length(current_chunk) + char_length(para.line) + 1 > chunk_size AND current_chunk != '' THEN
INSERT INTO knowledge_chunks (kb_id, doc_id, chunk_index, content, char_count)
VALUES (doc.kb_id, doc.doc_id, chunk_idx, trim(current_chunk), char_length(trim(current_chunk)));
chunk_idx := chunk_idx + 1;
current_chunk := '';
END IF;
IF current_chunk = '' THEN
current_chunk := para.line;
ELSE
current_chunk := current_chunk || E'\n' || para.line;
END IF;
END LOOP;
-- 最后剩余的内容
IF char_length(trim(current_chunk)) > 0 THEN
INSERT INTO knowledge_chunks (kb_id, doc_id, chunk_index, content, char_count)
VALUES (doc.kb_id, doc.doc_id, chunk_idx, trim(current_chunk), char_length(trim(current_chunk)));
chunk_idx := chunk_idx + 1;
END IF;
UPDATE knowledge_documents SET chunk_count = chunk_idx WHERE id = doc.doc_id;
RAISE NOTICE '文档 [%] 已分片: % chunks', doc.name, chunk_idx;
END LOOP;
END $$;