-- 对现有公安局知识库文档执行分片(纯文本分片,不含embedding) -- 使用 regexp_split_to_table 按段落分割,再合并为 ~500 字的 chunk DO $$ DECLARE doc RECORD; para RECORD; current_chunk TEXT := ''; chunk_idx INTEGER := 0; chunk_size INTEGER := 500; BEGIN FOR doc IN SELECT kd.id AS doc_id, kd.kb_id, kd.content, kd.name FROM knowledge_documents kd JOIN knowledge_bases kb ON kd.kb_id = kb.id WHERE kb.org_id = 'a0000000-0000-0000-0000-000000000002' AND kd.content IS NOT NULL AND kd.content != '' LOOP DELETE FROM knowledge_chunks WHERE doc_id = doc.doc_id; current_chunk := ''; chunk_idx := 0; -- 按段落(双换行或单换行)拆分 FOR para IN SELECT unnest(regexp_split_to_array(doc.content, E'\n')) AS line LOOP IF char_length(current_chunk) + char_length(para.line) + 1 > chunk_size AND current_chunk != '' THEN INSERT INTO knowledge_chunks (kb_id, doc_id, chunk_index, content, char_count) VALUES (doc.kb_id, doc.doc_id, chunk_idx, trim(current_chunk), char_length(trim(current_chunk))); chunk_idx := chunk_idx + 1; current_chunk := ''; END IF; IF current_chunk = '' THEN current_chunk := para.line; ELSE current_chunk := current_chunk || E'\n' || para.line; END IF; END LOOP; -- 最后剩余的内容 IF char_length(trim(current_chunk)) > 0 THEN INSERT INTO knowledge_chunks (kb_id, doc_id, chunk_index, content, char_count) VALUES (doc.kb_id, doc.doc_id, chunk_idx, trim(current_chunk), char_length(trim(current_chunk))); chunk_idx := chunk_idx + 1; END IF; UPDATE knowledge_documents SET chunk_count = chunk_idx WHERE id = doc.doc_id; RAISE NOTICE '文档 [%] 已分片: % chunks', doc.name, chunk_idx; END LOOP; END $$;