feat(tts): 新增通用流式 TTS 引擎并接入 AI 对话

- 新增 @wenwumap/tts 独立包:边流式边合成、按句排队顺序播放、
  专业 TTS 失败自动降级浏览器朗读,含 README 使用说明
- AI 后端新增 /ai/tts 接口,改用 DashScope CosyVoice(cosyvoice-v3-flash)
  输出 mp3,串行+退避重试规避 429 限流
- web 对话面板接入 SpeechQueue,按角色配音色,加语音开关与朗读按钮
- admin 支持 /admin/ 基路径部署
- 地图页移除大面积 backdrop-blur,降低 GPU 占用
This commit is contained in:
selfrelease
2026-06-14 23:13:26 +08:00
parent 3a55cd1978
commit 4a9397bccc
17 changed files with 955 additions and 26 deletions
+1 -1
View File
@@ -9,7 +9,7 @@ if (!root) throw new Error("找不到 #root 挂载点");
ReactDOM.createRoot(root).render(
<React.StrictMode>
<BrowserRouter>
<BrowserRouter basename={import.meta.env.BASE_URL}>
<App />
</BrowserRouter>
</React.StrictMode>
+1
View File
@@ -3,6 +3,7 @@ import react from "@vitejs/plugin-react";
import path from "path";
export default defineConfig({
base: process.env.ADMIN_BASE ?? "/",
plugins: [react()],
resolve: {
alias: {
+15
View File
@@ -3,6 +3,7 @@ import { ApiOperation, ApiTags } from "@nestjs/swagger";
import { Response } from "express";
import { AiService } from "./ai.service";
import { ChatDto } from "./dto/chat.dto";
import { TtsDto } from "./dto/tts.dto";
import { RateLimitGuard } from "../common/rate-limit.guard";
@ApiTags("ai")
@@ -39,4 +40,18 @@ export class AiController {
const suggestions = await this.ai.getSuggestions(dto);
return { suggestions };
}
/**
 * `tts` route: synthesizes `dto.text` with the optional `dto.voice` and writes
 * the raw audio bytes to the response. The response is marked `no-store`.
 * Any failure is answered with HTTP 502 and a JSON `{ message }` body.
 */
@Post("tts")
@ApiOperation({ summary: "通义千问 TTS:将文本合成语音(返回音频)" })
async tts(@Body() dto: TtsDto, @Res() res: Response): Promise<void> {
  try {
    const audio = await this.ai.synthesizeSpeech(dto.text, dto.voice);
    res.setHeader("Content-Type", audio.contentType);
    res.setHeader("Cache-Control", "no-store");
    res.send(audio.buffer);
  } catch (err) {
    // Non-Error throwables get a generic message.
    let message = "TTS 服务异常";
    if (err instanceof Error) message = err.message;
    res.status(502);
    res.json({ message });
  }
}
}
+120
View File
@@ -50,6 +50,28 @@ export type ChatPersona = "artifact" | "guide" | "scholar" | "migration" | "repa
export class AiService {
private readonly logger = new Logger(AiService.name);
// Process-wide throttle for upstream DashScope TTS calls: caps concurrency and
// request pacing so the account-level QPS limit (HTTP 429 Throttling) is not hit.
/** Number of TTS slots currently held. */
private static ttsActive = 0;
/** Resolvers of callers parked in acquireTtsSlot(), woken in FIFO order. */
private static readonly ttsWaiters: (() => void)[] = [];
/**
 * Maximum concurrent upstream TTS requests (env AI_TTS_CONCURRENCY, default 1).
 * Clamped to an integer >= 1: a value of 0, a negative number or a non-numeric
 * string would otherwise make `ttsActive < TTS_MAX` permanently false, so every
 * caller would wait for a release that can never happen.
 */
private static readonly TTS_MAX = Math.max(
  1,
  Math.trunc(Number(process.env["AI_TTS_CONCURRENCY"] ?? 1)) || 1
);
/** Timestamp (ms since epoch) at which the most recent upstream request was started. */
private static lastTtsAt = 0;
/** Minimum gap in ms between upstream requests (env AI_TTS_MIN_GAP_MS, default 300). */
private static readonly TTS_MIN_GAP = Number(process.env["AI_TTS_MIN_GAP_MS"] ?? 300);
/**
 * Takes one TTS slot. Resolves immediately while fewer than TTS_MAX slots are
 * held; otherwise parks the caller until releaseTtsSlot() wakes it.
 */
private static async acquireTtsSlot(): Promise<void> {
  const hasFreeSlot = AiService.ttsActive < AiService.TTS_MAX;
  if (!hasFreeSlot) {
    // Being woken means the releaser handed its slot over directly,
    // so ttsActive is intentionally left unchanged on this path.
    await new Promise<void>((wake) => {
      AiService.ttsWaiters.push(wake);
    });
    return;
  }
  AiService.ttsActive += 1;
}
/**
 * Gives a TTS slot back. If a caller is waiting, the slot is handed to the
 * oldest waiter; otherwise the held-slot counter is decremented.
 */
private static releaseTtsSlot(): void {
  const wake = AiService.ttsWaiters.shift();
  if (wake === undefined) {
    AiService.ttsActive -= 1;
    return;
  }
  wake();
}
constructor(
private readonly config: ConfigService,
private readonly db: DatabaseService
@@ -251,6 +273,104 @@ ${common}`;
return parseSuggestions(content);
}
/**
 * Synthesizes speech for `text` through the DashScope TTS endpoint.
 *
 * The upstream call returns a temporary audio URL; this method downloads the
 * audio bytes server-side and returns them, so the browser never talks to the
 * storage host directly (avoids cross-origin and URL-expiry problems).
 *
 * @param text  Text to synthesize. It is trimmed and truncated to 800 characters.
 * @param voice Optional voice ID; falls back to config `AI_TTS_VOICE`, then
 *              "longxiaochun_v3".
 * @returns The audio bytes and the Content-Type to serve them with.
 * @throws Error when `AI_API_KEY` is missing, the text is empty after trimming,
 *         the synthesis request fails, or the audio download fails.
 */
async synthesizeSpeech(
  text: string,
  voice?: string
): Promise<{ buffer: Buffer; contentType: string }> {
  const apiKey = this.config.get<string>("AI_API_KEY");
  if (!apiKey) throw new Error("AI 服务未配置:请在 .env 中设置 AI_API_KEY");
  const model = this.config.get<string>("AI_TTS_MODEL") ?? "cosyvoice-v3-flash";
  const v = voice || this.config.get<string>("AI_TTS_VOICE") || "longxiaochun_v3";
  const clean = text.trim().slice(0, 800);
  if (!clean) throw new Error("待合成文本为空");
  const genUrl =
    this.config.get<string>("AI_TTS_URL") ??
    "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/SpeechSynthesizer";
  const audioUrl = await this.requestTtsUrl(genUrl, apiKey, model, v, clean);
  const audioResp = await fetch(audioUrl, { signal: AbortSignal.timeout(20_000) });
  if (!audioResp.ok) throw new Error(`音频下载失败 ${audioResp.status}`);
  // The synthesis request asks for mp3, so a missing or generic (non-audio)
  // upstream type is reported as audio/mpeg rather than the previous audio/wav.
  const upstreamType = audioResp.headers.get("content-type") ?? "";
  const contentType = /^audio\//i.test(upstreamType) ? upstreamType : "audio/mpeg";
  const buffer = Buffer.from(await audioResp.arrayBuffer());
  return { buffer, contentType };
}
/**
 * Asks the DashScope TTS endpoint to synthesize `text` and returns the URL of
 * the generated audio.
 *
 * Each attempt holds one slot of the process-wide TTS semaphore and keeps a
 * minimum gap to the previous upstream request. HTTP 429, HTTP 5xx, a 2xx
 * response without an audio URL, and network/timeout errors are retried with
 * exponential backoff (500ms * 2^attempt, capped at 5s, plus up to 300ms of
 * jitter) for at most 5 attempts.
 *
 * Any other non-2xx response means the request itself was rejected; it fails
 * immediately and is never retried.
 *
 * @param genUrl Synthesis endpoint URL.
 * @param apiKey Bearer token sent in the Authorization header.
 * @param model  Upstream model name.
 * @param voice  Upstream voice ID.
 * @param text   Text to synthesize.
 * @returns URL of the synthesized audio.
 * @throws Error immediately on a non-retryable upstream rejection, or after
 *         all attempts are exhausted.
 */
private async requestTtsUrl(
  genUrl: string,
  apiKey: string,
  model: string,
  voice: string,
  text: string
): Promise<string> {
  const maxAttempts = 5;
  let lastErr = "unknown";
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    // Non-retryable rejection. It is recorded here and thrown only after the
    // try/catch/finally, so the catch below (meant for network/timeout errors)
    // cannot swallow it and turn it into a retry.
    let fatal: Error | undefined = undefined;
    await AiService.acquireTtsSlot();
    try {
      // Pacing: keep a minimum gap to the previous request to flatten QPS peaks.
      const since = Date.now() - AiService.lastTtsAt;
      if (since < AiService.TTS_MIN_GAP) {
        await new Promise((r) => setTimeout(r, AiService.TTS_MIN_GAP - since));
      }
      AiService.lastTtsAt = Date.now();
      const resp = await fetch(genUrl, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          Authorization: `Bearer ${apiKey}`,
        },
        body: JSON.stringify({
          model,
          input: { text, voice, format: "mp3", sample_rate: 22050 },
        }),
        signal: AbortSignal.timeout(25_000),
      });
      if (resp.status === 429 || resp.status >= 500) {
        // Throttled or upstream failure: back off and retry.
        lastErr = `${resp.status}`;
        // Discard the unread body so the connection is not held until GC.
        await resp.body?.cancel().catch(() => undefined);
      } else if (!resp.ok) {
        const t = await resp.text().catch(() => "");
        fatal = new Error(`TTS 接口返回 ${resp.status}: ${t.slice(0, 200)}`);
      } else {
        const json = (await resp.json()) as {
          output?: { audio?: { url?: string } };
        };
        const url = json?.output?.audio?.url;
        if (url) return url;
        lastErr = "no-url";
      }
    } catch (err) {
      // Network and timeout errors are retried as well.
      lastErr = err instanceof Error ? err.message : "network";
    } finally {
      // The slot is released before the backoff sleep so other callers can proceed.
      AiService.releaseTtsSlot();
    }
    if (fatal) throw fatal;
    if (attempt < maxAttempts - 1) {
      const delay = Math.min(500 * 2 ** attempt, 5000) + Math.floor(Math.random() * 300);
      await new Promise((r) => setTimeout(r, delay));
    }
  }
  this.logger.error(`TTS 重试仍失败:${lastErr}`);
  throw new Error(`TTS 限流,请稍后再试(${lastErr})`);
}
private async chatComplete(
messages: { role: string; content: string }[],
temperature = 0.7
+14
View File
@@ -0,0 +1,14 @@
import { IsOptional, IsString, MaxLength } from "class-validator";
/** Request body of the `tts` route. */
export class TtsDto {
/**
 * Plain text to synthesize (Markdown markup is expected to be removed already).
 * Validation accepts up to 2000 characters; note that
 * AiService.synthesizeSpeech only synthesizes the first 800 characters of the
 * trimmed text.
 */
@IsString()
@MaxLength(2000)
text!: string;
/**
 * Voice ID (optional). When omitted, AiService.synthesizeSpeech falls back to
 * the `AI_TTS_VOICE` config value and then to "longxiaochun_v3".
 */
@IsOptional()
@IsString()
@MaxLength(40)
voice?: string;
}
+2 -1
View File
@@ -14,7 +14,8 @@ import { Request } from "express";
@Injectable()
export class RateLimitGuard implements CanActivate {
private static readonly WINDOW_MS = 60_000;
private static readonly MAX_REQUESTS = 20;
// 单条回答会按句拆分为多次 TTS 调用,故放宽阈值(仍可防刷)。
private static readonly MAX_REQUESTS = 120;
private static readonly buckets = new Map<string, number[]>();
canActivate(context: ExecutionContext): boolean {
+1 -1
View File
@@ -1,7 +1,7 @@
/** @type {import('next').NextConfig} */
const nextConfig = {
reactStrictMode: true,
transpilePackages: ["@wenwumap/shared"],
transpilePackages: ["@wenwumap/shared", "@wenwumap/tts"],
env: {
NEXT_PUBLIC_API_URL: process.env.NEXT_PUBLIC_API_URL || "http://localhost:3002",
NEXT_PUBLIC_MAP_STYLE: process.env.NEXT_PUBLIC_MAP_STYLE || "",
+1
View File
@@ -17,6 +17,7 @@
"@radix-ui/react-tooltip": "^1.1.2",
"@vis.gl/react-google-maps": "^1.4.0",
"@wenwumap/shared": "workspace:*",
"@wenwumap/tts": "workspace:*",
"clsx": "^2.1.1",
"lucide-react": "^0.414.0",
"maplibre-gl": "^4.5.0",
+7 -7
View File
@@ -407,7 +407,7 @@ export default function MapPage() {
<div className="relative h-screen w-full overflow-hidden bg-[#090806] text-[#f6eddc]">
<div className="pointer-events-none absolute inset-0 z-0 bg-[radial-gradient(circle_at_top_left,rgba(190,137,55,0.18),transparent_32%),radial-gradient(circle_at_bottom_right,rgba(50,101,91,0.18),transparent_36%)]" />
<header className="absolute left-0 right-0 top-0 z-50 flex h-16 items-center border-b border-[#d6aa5b]/15 bg-[#080705]/92 px-6 shadow-[0_18px_40px_rgba(0,0,0,0.45)] backdrop-blur-xl">
<header className="absolute left-0 right-0 top-0 z-50 flex h-16 items-center border-b border-[#d6aa5b]/15 bg-[#080705] px-6 shadow-[0_18px_40px_rgba(0,0,0,0.45)]">
<div className="flex items-center gap-4">
<div className="flex h-9 w-9 items-center justify-center rounded-full border border-[#d6aa5b]/35 bg-[#d6aa5b]/10 text-sm text-[#f2cf83]">
@@ -514,7 +514,7 @@ export default function MapPage() {
</div>
)}
{activeRoute && activeRoute.stops?.length > 0 && (
<div className="absolute bottom-4 left-1/2 z-20 w-[min(720px,90%)] -translate-x-1/2 rounded-2xl border border-[#d6aa5b]/25 bg-[#0c0a06]/94 px-5 py-3.5 shadow-[0_12px_44px_rgba(0,0,0,0.55)] backdrop-blur-xl">
<div className="absolute bottom-4 left-1/2 z-20 w-[min(720px,90%)] -translate-x-1/2 rounded-2xl border border-[#d6aa5b]/25 bg-[#0c0a06] px-5 py-3.5 shadow-[0_12px_44px_rgba(0,0,0,0.55)]">
<div className="mb-2.5 flex items-center justify-between">
<span
className="flex items-center gap-2 font-serif text-sm font-semibold"
@@ -627,7 +627,7 @@ export default function MapPage() {
<button
onClick={() => setLeftCollapsed((v) => !v)}
title={leftCollapsed ? "展开左栏" : "收起左栏"}
className="absolute top-1/2 z-50 flex h-14 w-5 -translate-y-1/2 items-center justify-center rounded-r-md border border-l-0 border-[#d6aa5b]/20 bg-[#0c0a06]/90 text-sm text-[#d6aa5b] backdrop-blur transition-all duration-300 hover:bg-[#d6aa5b]/15 hover:text-[#f2cf83]"
className="absolute top-1/2 z-50 flex h-14 w-5 -translate-y-1/2 items-center justify-center rounded-r-md border border-l-0 border-[#d6aa5b]/20 bg-[#0c0a06] text-sm text-[#d6aa5b] transition-all duration-300 hover:bg-[#d6aa5b]/15 hover:text-[#f2cf83]"
style={{ left: leftW }}
>
{leftCollapsed ? <ChevronRight size={14} /> : <ChevronLeft size={14} />}
@@ -637,13 +637,13 @@ export default function MapPage() {
<button
onClick={() => setRightCollapsed((v) => !v)}
title={rightCollapsed ? "展开右栏" : "收起右栏"}
className="absolute top-1/2 z-50 flex h-14 w-5 -translate-y-1/2 items-center justify-center rounded-l-md border border-r-0 border-[#d6aa5b]/20 bg-[#0c0a06]/90 text-sm text-[#d6aa5b] backdrop-blur transition-all duration-300 hover:bg-[#d6aa5b]/15 hover:text-[#f2cf83]"
className="absolute top-1/2 z-50 flex h-14 w-5 -translate-y-1/2 items-center justify-center rounded-l-md border border-r-0 border-[#d6aa5b]/20 bg-[#0c0a06] text-sm text-[#d6aa5b] transition-all duration-300 hover:bg-[#d6aa5b]/15 hover:text-[#f2cf83]"
style={{ right: effRightW }}
>
{rightCollapsed ? <ChevronLeft size={14} /> : <ChevronRight size={14} />}
</button>
<aside className="absolute bottom-11 left-0 top-16 z-40 flex w-72 flex-col border-r border-[#d6aa5b]/15 bg-[#080705]/94 shadow-[18px_0_48px_rgba(0,0,0,0.38)] backdrop-blur-xl transition-transform duration-300" style={{ transform: leftCollapsed ? "translateX(-100%)" : "none" }}>
<aside className="absolute bottom-11 left-0 top-16 z-40 flex w-72 flex-col border-r border-[#d6aa5b]/15 bg-[#080705] shadow-[18px_0_48px_rgba(0,0,0,0.38)] transition-transform duration-300" style={{ transform: leftCollapsed ? "translateX(-100%)" : "none" }}>
<div className="shrink-0 border-b border-[#d6aa5b]/12 p-5">
<div className="text-xs uppercase tracking-[0.3em] text-[#8f8066]">Explore</div>
<h1 className="mt-2 font-serif text-xl font-semibold text-[#f6eddc]"></h1>
@@ -834,7 +834,7 @@ export default function MapPage() {
</div>
</aside>
<aside className="absolute bottom-11 right-0 top-16 z-40 border-l border-[#d6aa5b]/15 bg-[#080705]/94 shadow-[-18px_0_48px_rgba(0,0,0,0.38)] backdrop-blur-xl transition-transform duration-300" style={{ width: rightWidth, transform: rightCollapsed ? "translateX(100%)" : "none" }}>
<aside className="absolute bottom-11 right-0 top-16 z-40 border-l border-[#d6aa5b]/15 bg-[#080705] shadow-[-18px_0_48px_rgba(0,0,0,0.38)] transition-transform duration-300" style={{ width: rightWidth, transform: rightCollapsed ? "translateX(100%)" : "none" }}>
{/* 拖拽手柄:调整右栏宽度 */}
<div
onMouseDown={startResize}
@@ -1111,7 +1111,7 @@ export default function MapPage() {
)}
</aside>
<footer className="absolute bottom-0 left-0 right-0 z-50 flex h-11 items-center border-t border-[#d6aa5b]/12 bg-[#080705]/95 px-6 text-xs text-[#8f8066] backdrop-blur-xl">
<footer className="absolute bottom-0 left-0 right-0 z-50 flex h-11 items-center border-t border-[#d6aa5b]/12 bg-[#080705] px-6 text-xs text-[#8f8066]">
<span className="text-[#c8b88e]"> {groupedPoints.length} {zoom <= 7 ? "个城市" : "个机构"}{institutionCount} · {visiblePoints.length} </span>
{filterCity && <span className="ml-4">{filterCity}</span>}
{filterCategory && <span className="ml-4">{CATEGORY_LABELS[filterCategory]}</span>}
+146 -15
View File
@@ -3,7 +3,8 @@
import { useCallback, useEffect, useRef, useState } from "react";
import ReactMarkdown from "react-markdown";
import remarkGfm from "remark-gfm";
import { Sparkles, ChevronRight } from "lucide-react";
import { Sparkles, ChevronRight, Volume2, VolumeX, Square } from "lucide-react";
import { SpeechQueue } from "@wenwumap/tts";
const API_URL = process.env.NEXT_PUBLIC_API_URL ?? "http://localhost:3002";
@@ -40,6 +41,16 @@ interface ArtifactChatProps {
fill?: boolean;
}
// CosyVoice voice ID used for each chat persona. The trailing notes are the
// author's description of how each voice sounds.
const PERSONA_VOICE: Record<Persona, string> = {
artifact: "longxiaochun_v3", // warm, neutral
guide: "longxiaochun_v3", // friendly narration
scholar: "longsanshu_v3", // calm, scholarly
migration: "longze_v3", // deep, narrative
repatriation: "longanzhi_v3", // tender
youth: "longwan_v3", // lively
};
export default function ArtifactChat({ artifactId, artifactName, onConversationChange, fill }: ArtifactChatProps) {
const [persona, setPersona] = useState<Persona>("artifact");
const [messages, setMessages] = useState<ChatMessage[]>([]);
@@ -52,6 +63,59 @@ export default function ArtifactChat({ artifactId, artifactName, onConversationC
const scrollRef = useRef<HTMLDivElement | null>(null);
const storageKey = `wenwu_chat_${artifactId}_${persona}`;
// ===== TTS(通义千问语音合成 · 边流式边合成的播放队列)=====
const [ttsOn, setTtsOn] = useState(true);
const [speakingIdx, setSpeakingIdx] = useState<number | null>(null);
const queueRef = useRef<SpeechQueue | null>(null);
// 读取本地保存的“语音开关”偏好
useEffect(() => {
try {
const saved = localStorage.getItem("wenwu_tts_on");
if (saved !== null) setTtsOn(saved === "1");
} catch {
/* ignore */
}
}, []);
// 懒初始化通用 TTS 播放引擎(@wenwumap/tts
const getQueue = useCallback(() => {
if (!queueRef.current) {
queueRef.current = new SpeechQueue({
endpoint: `${API_URL}/api/v1/ai/tts`,
voice: PERSONA_VOICE.artifact,
lang: "zh-CN",
onSpeakingChange: (tag) => setSpeakingIdx(typeof tag === "number" ? tag : null),
});
}
return queueRef.current;
}, []);
const ttsUnlock = useCallback(() => getQueue().unlock(), [getQueue]);
const ttsStop = useCallback(() => queueRef.current?.stop(), []);
const ttsBegin = useCallback(
(idx: number) => {
const q = getQueue();
q.setVoice(PERSONA_VOICE[persona] ?? PERSONA_VOICE.artifact);
q.begin(idx);
},
[getQueue, persona]
);
const ttsFeed = useCallback((delta: string) => queueRef.current?.feed(delta), []);
const ttsFlush = useCallback(() => queueRef.current?.flush(), []);
const speakWhole = useCallback(
(text: string, idx: number) => {
const q = getQueue();
q.unlock();
q.setVoice(PERSONA_VOICE[persona] ?? PERSONA_VOICE.artifact);
q.speakWhole(text, idx);
},
[getQueue, persona]
);
// 卸载时释放
useEffect(() => () => queueRef.current?.destroy(), []);
const persist = useCallback(
(msgs: ChatMessage[]) => {
try {
@@ -67,6 +131,7 @@ export default function ArtifactChat({ artifactId, artifactName, onConversationC
// 切换文物或角色时:从本地存储恢复该组合的历史对话
useEffect(() => {
abortRef.current?.abort();
ttsStop();
setError(null);
setStreaming(false);
setSuggestions([]);
@@ -128,6 +193,14 @@ export default function ArtifactChat({ artifactId, artifactName, onConversationC
const content = text.trim();
if (!content || streaming) return;
const speakThis = ttsOn;
const assistantIdx = messages.length + 1; // 本轮 assistant 消息在列表中的下标
// 在用户手势内解锁自动播放,并开启朗读会话
if (speakThis) {
ttsUnlock();
ttsBegin(assistantIdx);
}
const history: ChatMessage[] = [...messages, { role: "user", content }];
setMessages([...history, { role: "assistant", content: "" }]);
setInput("");
@@ -176,6 +249,7 @@ export default function ArtifactChat({ artifactId, artifactName, onConversationC
setError(json.error);
} else if (json.t) {
assistantContent += json.t;
if (speakThis) ttsFeed(json.t);
setMessages((prev) => {
const next = [...prev];
const last = next[next.length - 1];
@@ -199,6 +273,7 @@ export default function ArtifactChat({ artifactId, artifactName, onConversationC
} finally {
setStreaming(false);
abortRef.current = null;
if (speakThis) ttsFlush();
}
// 每轮回答后生成 4 个下一步追问
@@ -211,7 +286,19 @@ export default function ArtifactChat({ artifactId, artifactName, onConversationC
void fetchSuggestions(finalHistory);
}
},
[artifactId, persona, messages, streaming, fetchSuggestions, persist]
[
artifactId,
persona,
messages,
streaming,
fetchSuggestions,
persist,
ttsOn,
ttsUnlock,
ttsBegin,
ttsFeed,
ttsFlush,
]
);
return (
@@ -227,20 +314,45 @@ export default function ArtifactChat({ artifactId, artifactName, onConversationC
</span>
<span className="text-[11px] uppercase tracking-[0.24em] text-[#a99566]"></span>
</div>
{messages.length > 0 && (
<div className="flex items-center gap-3">
<button
onClick={() => {
abortRef.current?.abort();
setMessages([]);
setError(null);
setSuggestions([]);
persist([]);
setTtsOn((prev) => {
const next = !prev;
try {
localStorage.setItem("wenwu_tts_on", next ? "1" : "0");
} catch {
/* ignore */
}
if (next) ttsUnlock();
else ttsStop();
return next;
});
}}
className="text-[11px] text-[#8f8066] transition hover:text-[#f2cf83]"
title={ttsOn ? "语音朗读:开(点击关闭)" : "语音朗读:关(点击开启)"}
className={`flex items-center gap-1 text-[11px] transition ${
ttsOn ? "text-[#f2cf83]" : "text-[#8f8066] hover:text-[#f2cf83]"
}`}
>
{ttsOn ? <Volume2 size={14} /> : <VolumeX size={14} />}
<span></span>
</button>
)}
{messages.length > 0 && (
<button
onClick={() => {
abortRef.current?.abort();
ttsStop();
setMessages([]);
setError(null);
setSuggestions([]);
persist([]);
}}
className="text-[11px] text-[#8f8066] transition hover:text-[#f2cf83]"
>
</button>
)}
</div>
</div>
{/* 角色设置 */}
@@ -306,9 +418,25 @@ export default function ArtifactChat({ artifactId, artifactName, onConversationC
<div key={i} className="flex justify-start">
<div className="max-w-[92%] rounded-2xl rounded-bl-sm border border-[#d6aa5b]/14 bg-[#11100d]/80 px-3 py-2 text-[#ecdfc6]">
{m.content ? (
<div className="md-chat">
<ReactMarkdown remarkPlugins={[remarkGfm]}>{m.content}</ReactMarkdown>
</div>
<>
<div className="md-chat">
<ReactMarkdown remarkPlugins={[remarkGfm]}>{m.content}</ReactMarkdown>
</div>
{!(streaming && i === messages.length - 1) && (
<button
onClick={() => {
ttsUnlock();
if (speakingIdx === i) ttsStop();
else speakWhole(m.content, i);
}}
title={speakingIdx === i ? "停止朗读" : "朗读这段"}
className="mt-1.5 inline-flex items-center gap-1 text-[10px] text-[#8f8066] transition hover:text-[#f2cf83]"
>
{speakingIdx === i ? <Square size={11} /> : <Volume2 size={12} />}
<span>{speakingIdx === i ? "朗读中 · 停止" : "朗读"}</span>
</button>
)}
</>
) : (
<span className="inline-flex gap-1 py-1 align-middle">
<span className="h-1.5 w-1.5 animate-bounce rounded-full bg-[#d6aa5b] [animation-delay:-0.2s]" />
@@ -378,7 +506,10 @@ export default function ArtifactChat({ artifactId, artifactName, onConversationC
{streaming ? (
<button
type="button"
onClick={() => abortRef.current?.abort()}
onClick={() => {
abortRef.current?.abort();
ttsStop();
}}
className="flex-shrink-0 rounded-full border border-[#d6aa5b]/30 px-3 py-2 text-xs text-[#f2cf83] transition hover:bg-[#d6aa5b]/12"
>
+2 -1
View File
@@ -16,7 +16,8 @@
"plugins": [{ "name": "next" }],
"paths": {
"@/*": ["./src/*"],
"@wenwumap/shared": ["../../packages/shared/src/index.ts"]
"@wenwumap/shared": ["../../packages/shared/src/index.ts"],
"@wenwumap/tts": ["../../packages/tts/src/index.ts"]
}
},
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
+258
View File
@@ -0,0 +1,258 @@
# @wenwumap/tts
通用、与框架无关的**流式语音合成(TTS)播放引擎**。
把「AI 回答 → 语音播报」这件事沉淀成一个独立模块:边流式边合成、按句排队、顺序无缝播放,专业 TTS 失败时自动降级到浏览器朗读,绝不静默。
- ✅ 零业务耦合,零运行时依赖(仅用浏览器 `fetch` / `Audio` / `speechSynthesis`)
- ✅ 不绑定 React / Vue,纯 TypeScript 类,任何前端都能用
- ✅ 边流式边合成:第一句话出现后约 1 秒即可起声
- ✅ 按入队顺序播放,合成乱序完成也不会乱
- ✅ 双轨兜底:后端 TTS 限流/失败 → 自动浏览器朗读
- ✅ 处理浏览器自动播放策略(`unlock()`)、blob 资源回收、会话防串音
---
## 1. 它依赖什么?
模块本身**不发起任何特定厂商的请求**。它只要求你提供一个**后端 TTS 接口**,满足下面的契约:
### 接口契约
```
POST <endpoint>
Content-Type: application/json
请求体: { "text": "要合成的纯文本", "voice": "音色ID(可选)" }
成功(2xx): 返回音频二进制(mp3 / wav 等,Content-Type 为 audio/*)
失败(非2xx): 引擎自动对这一段降级为浏览器朗读
```
> 引擎用 `fetch(endpoint).blob()` 拿音频并用 `URL.createObjectURL` 播放,所以**同源**或正确的 CORS 即可。
### 后端实现参考(任选一种 TTS 服务)
以阿里云百炼 CosyVoice 为例(Node/NestJS 伪代码):
```ts
// POST /api/tts -> 返回 audio/mpeg
app.post("/api/tts", async (req, res) => {
const { text, voice } = req.body;
const r = await fetch(
"https://dashscope.aliyuncs.com/api/v1/services/audio/tts/SpeechSynthesizer",
{
method: "POST",
headers: { Authorization: `Bearer ${process.env.DASHSCOPE_API_KEY}`, "Content-Type": "application/json" },
body: JSON.stringify({
model: "cosyvoice-v3-flash",
input: { text: text.slice(0, 800), voice: voice || "longxiaochun_v3", format: "mp3", sample_rate: 22050 },
}),
}
);
const { output } = await r.json();
const audio = await fetch(output.audio.url); // 拿到临时音频 URL 再回源,避免跨域/过期
res.setHeader("Content-Type", "audio/mpeg");
res.send(Buffer.from(await audio.arrayBuffer()));
});
```
> 提示:CosyVoice/qwen-tts 有**并发限流(429)**。建议后端把对上游的并发限制为 1,并对 429 做退避重试——引擎层即便偶发失败也会自动降级浏览器朗读,不会静默。
---
## 2. 安装 / 引入
### 方式 A:同一个 monorepo(pnpm workspace,推荐)
1. 把 `packages/tts` 整个目录拷到目标仓库的 `packages/` 下(或保留在本仓库共用)。
2. 在要使用的 app 里加依赖:
```jsonc
// apps/your-app/package.json
{ "dependencies": { "@wenwumap/tts": "workspace:*" } }
```
3. 因为包是「源码直出」(`main` 指向 `src/index.ts`),构建工具需要转译它:
- **Next.js**:`next.config.js`
```js
module.exports = { transpilePackages: ["@wenwumap/tts"] };
```
- **tsconfig 路径别名**(编辑器类型解析):
```jsonc
{ "compilerOptions": { "paths": { "@wenwumap/tts": ["../../packages/tts/src/index.ts"] } } }
```
- **Vite**:无需特殊配置;如遇到未转译可在 `optimizeDeps`/`build` 里包含它。
4. `pnpm install` 链接工作区依赖。
### 方式 B:独立项目,直接拷贝源码
包没有任何依赖,直接把 `src/` 三个文件拷进你的项目即可:
```
src/text.ts # stripMarkdown / splitSpeakable
src/speech-queue.ts # SpeechQueue
src/index.ts # 导出
```
然后 `import { SpeechQueue } from "./tts"`。
### 方式 C:改名复用
它叫 `@wenwumap/tts` 只是包名,与业务无关。换个项目可以把包名改成 `@yourorg/tts`,逻辑完全通用。
---
## 3. 快速上手
```ts
import { SpeechQueue } from "@wenwumap/tts";
const tts = new SpeechQueue({
endpoint: "/api/tts", // 你的后端 TTS 接口
voice: "longxiaochun_v3", // 默认音色(可选)
lang: "zh-CN", // 浏览器朗读兜底语言(可选)
});
// 必须在“用户点击”里调一次,解锁浏览器自动播放
button.addEventListener("click", () => {
tts.unlock();
tts.speakWhole("你好,我是这件文物,已经三千岁啦。");
});
```
---
## 4. 两种典型用法
### 4.1 一次性朗读整段(已有完整文本)
```ts
tts.unlock(); // 用户手势内
tts.speakWhole(fullText); // 内部自动按句切分、排队、顺序播放
```
### 4.2 边流式边合成(配合 LLM 流式输出,首声最快)
```ts
tts.unlock(); // 用户点击“发送”时
tts.begin(); // 开启一轮朗读会话
for await (const delta of llmStream) { // 大模型逐 token 输出
appendToUI(delta);
tts.feed(delta); // 凑齐整句即开始合成、播放
}
tts.flush(); // 流结束,朗读剩余尾句
```
---
## 5. React 用法
直接用 `useRef` 持有实例即可(引擎自带状态,组件只需同步 UI):
```tsx
import { useCallback, useEffect, useRef, useState } from "react";
import { SpeechQueue } from "@wenwumap/tts";
function useTts(endpoint: string) {
const ref = useRef<SpeechQueue | null>(null);
const [speakingTag, setSpeakingTag] = useState<unknown>(null);
const get = useCallback(() => {
if (!ref.current) {
ref.current = new SpeechQueue({
endpoint,
onSpeakingChange: (tag) => setSpeakingTag(tag), // tag=null 表示停止
});
}
return ref.current;
}, [endpoint]);
useEffect(() => () => ref.current?.destroy(), []);
return { get, speakingTag };
}
// 组件内:
const { get, speakingTag } = useTts("/api/tts");
// 发送提问(用户手势)
function onSend(idx: number) {
const q = get();
q.unlock();
q.setVoice("longsanshu_v3");
q.begin(idx); // 用 idx 作为 tag,speakingTag === idx 即“这条在朗读”
}
// 流式: q.feed(delta);结束: get().flush()
// 重播某条: get().speakWhole(text, idx)
// 停止: get().stop()
```
> `onSpeakingChange(tag)`:开始播放某会话时回调你传入的 `tag``begin(tag)` / `speakWhole(text, tag)`),停止或播完回调 `null`。用它驱动「朗读中」高亮。
---
## 6. API 参考
### `new SpeechQueue(options)`
| 选项 | 类型 | 默认 | 说明 |
|---|---|---|---|
| `endpoint` | `string` | (必填) | 后端 TTS 接口,`POST {text, voice}` → 音频二进制 |
| `voice` | `string` | `""` | 默认音色 ID(由你的后端/TTS 服务定义) |
| `lang` | `string` | `"zh-CN"` | 浏览器朗读兜底语言 |
| `minSentenceLen` | `number` | `14` | 成句最小长度(去标记后字符数),越小起声越早、请求越碎 |
| `maxInFlight` | `number` | `3` | 最大并发合成请求数 |
| `fetchImpl` | `typeof fetch` | 全局 `fetch` | 自定义 fetch(如带鉴权头) |
| `onSpeakingChange` | `(tag) => void` | — | 播放开始回调 tag,停止/结束回调 `null` |
| `onError` | `(err) => void` | — | 合成/播放出错(非致命,会自动降级或跳过) |
### 方法
| 方法 | 说明 |
|---|---|
| `unlock()` | **必须在用户手势内同步调用一次**,解锁浏览器自动播放权限 |
| `setVoice(v)` | 设置后续合成使用的音色 |
| `begin(tag?)` | 开启一轮新的流式朗读会话(作废上一轮) |
| `feed(delta)` | 喂入增量文本,凑齐整句即合成、播放 |
| `flush()` | 流式结束,朗读尾部剩余文本 |
| `speakWhole(text, tag?)` | 一次性朗读整段(内部 = begin + 切分入队 + flush) |
| `stop()` | 停止播放、清空队列、取消浏览器朗读 |
| `destroy()` | 释放资源(组件卸载时调用) |
### 工具函数
```ts
import { stripMarkdown, splitSpeakable } from "@wenwumap/tts";
stripMarkdown("**加粗** `代码`"); // -> "加粗 代码"
// 第二个参数 minLen 默认 14:不足 14 个字符的句子会并入后续句子,所以示例里显式传 1
splitSpeakable("第一句。第二句还没完", 1); // -> { chunks: ["第一句。"], rest: "第二句还没完" }
```
---
## 7. 重要注意事项
- **自动播放**:浏览器要求音频播放源于用户手势。务必在点击事件里**同步**调用一次 `unlock()`(它会播放一段静音占位以获授权)。否则首次自动朗读可能被拦截,此时降级到浏览器朗读或等用户手动点击。
- **音色 ID 由谁定义**`voice` 只是透传给你的后端;具体支持哪些音色取决于你接的 TTS 服务(如 CosyVoice 的 `longxiaochun_v3`、`longsanshu_v3` 等)。
- **限流**:高并发下后端 TTS 可能 429。建议后端串行 + 退避;引擎本身已对失败段自动降级浏览器朗读,体验不中断。
- **仅浏览器环境**:引擎使用 `Audio`/`speechSynthesis`,请在客户端(如 Next.js 的 `"use client"` 组件)中使用。
---
## 8. 工作原理(简述)
```
feed(delta) ──► 累积文本,按句号/问号/换行切句
│ (凑够 minSentenceLen)
入队 + 受限并发合成 (maxInFlight)
│ 失败→标记“浏览器朗读”
按入队顺序逐段播放(audio 或 speechSynthesis)
onSpeakingChange(tag / null) 通知 UI
```
+14
View File
@@ -0,0 +1,14 @@
{
"name": "@wenwumap/tts",
"version": "0.1.0",
"private": true,
"description": "通用、与框架无关的流式语音合成(TTS)播放引擎:边流式边合成、按句排队、浏览器朗读兜底。",
"main": "./src/index.ts",
"types": "./src/index.ts",
"scripts": {
"type-check": "tsc --noEmit"
},
"devDependencies": {
"typescript": "^5.5.3"
}
}
+3
View File
@@ -0,0 +1,3 @@
export { SpeechQueue } from "./speech-queue";
export type { SpeechQueueOptions } from "./speech-queue";
export { stripMarkdown, splitSpeakable } from "./text";
+313
View File
@@ -0,0 +1,313 @@
import { splitSpeakable, stripMarkdown } from "./text";
/** Construction options for SpeechQueue. */
export interface SpeechQueueOptions {
/** Backend TTS endpoint: POST { text, voice } -> audio bytes (mp3, wav, ...). */
endpoint: string;
/** Default voice ID (can be overridden at any time with setVoice). */
voice?: string;
/** Language used for the browser speech fallback; defaults to "zh-CN". */
lang?: string;
/** Minimum length of a speakable chunk, measured after markup removal; defaults to 14. */
minSentenceLen?: number;
/** Maximum number of synthesis requests in flight at once; defaults to 3. */
maxInFlight?: number;
/** Custom fetch implementation (defaults to the global fetch). */
fetchImpl?: typeof fetch;
/** Playback state change: called with the session tag when playback starts, and with null when it stops or ends. */
onSpeakingChange?: (tag: unknown | null) => void;
/** Called when synthesis or playback fails (non-fatal; the engine degrades or skips). */
onError?: (err: unknown) => void;
}
type Slot = string | null | "error" | "speech";
// State of one queued chunk:
// null = synthesis in progress; any other string = ready object URL;
// "error" = skip this chunk; "speech" = read it with the browser speech fallback.
/**
 * Generic streaming TTS playback engine (framework-agnostic, browser APIs only).
 *
 * Design:
 * - Synthesize while streaming: feed() accumulates incremental text and sends
 *   each completed sentence for synthesis, which keeps first-audio latency low.
 * - Ordered playback: synthesis may run concurrently and finish out of order,
 *   but chunks are always played in the order they were queued.
 * - Fallback: when the backend TTS request fails, or the synthesized audio
 *   cannot be played, that chunk is read with the browser's Web Speech API.
 * - Autoplay permission: unlock() should be called once inside a user gesture.
 *
 * Sessions: begin(), speakWhole(), stop() and destroy() each start a new
 * internal session. Every asynchronous callback (fetch completion, audio
 * events, utterance events) remembers the session it belongs to and does
 * nothing once that session is over, so a stale callback can never advance or
 * corrupt a newer session.
 */
export class SpeechQueue {
  private readonly endpoint: string;
  private readonly lang: string;
  private readonly minLen: number;
  private readonly maxInFlight: number;
  private readonly fetchImpl: typeof fetch;
  private readonly onSpeakingChange?: (tag: unknown | null) => void;
  private readonly onError?: (err: unknown) => void;

  private voice: string;
  private audio: HTMLAudioElement | null = null;
  private unlocked = false;
  /** True while the audio element is playing a synthesized chunk. */
  private audioBusy = false;
  /** False between stop()/destroy() and the next begin(); late feed()/flush() calls are ignored. */
  private accepting = true;
  /** Aborts the synthesis requests of the current session. */
  private aborter: AbortController | null = null;

  private texts: string[] = [];
  private slots: Slot[] = [];
  private playIdx = 0;
  private nextFetch = 0;
  private inFlight = 0;
  private playing = false;
  private streamDone = false;
  private pending = "";
  private tag: unknown = null;
  private session = 0;

  private static readonly SILENT_WAV =
    "data:audio/wav;base64,UklGRjIAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAAAAAA==";

  constructor(opts: SpeechQueueOptions) {
    this.endpoint = opts.endpoint;
    this.voice = opts.voice ?? "";
    this.lang = opts.lang ?? "zh-CN";
    this.minLen = opts.minSentenceLen ?? 14;
    this.maxInFlight = opts.maxInFlight ?? 3;
    this.fetchImpl = opts.fetchImpl ?? globalThis.fetch?.bind(globalThis);
    this.onSpeakingChange = opts.onSpeakingChange;
    this.onError = opts.onError;
  }

  /** Sets the voice used for subsequent synthesis requests. An empty value is ignored. */
  setVoice(v: string): void {
    if (v) this.voice = v;
  }

  /** Lazily creates the single audio element used for both unlocking and playback. */
  private getAudio(): HTMLAudioElement {
    if (!this.audio) {
      this.audio = new Audio();
      this.audio.preload = "auto";
    }
    return this.audio;
  }

  /**
   * Call synchronously inside a user gesture (e.g. a click) to obtain autoplay
   * permission by playing a muted silent clip on the shared audio element.
   * Does nothing once unlocked, outside a browser, or while a synthesized chunk
   * is playing (touching the element then would cut that chunk off).
   */
  unlock(): void {
    if (this.unlocked || this.audioBusy || typeof window === "undefined") return;
    const el = this.getAudio();
    try {
      el.src = SpeechQueue.SILENT_WAV;
      el.muted = true;
      const p = el.play();
      if (p && typeof p.then === "function") {
        p.then(() => {
          this.unlocked = true;
          // A real chunk may have taken over the element in the meantime;
          // in that case leave its playback alone.
          if (this.audioBusy) return;
          el.pause();
          el.currentTime = 0;
          el.muted = false;
        }).catch(() => {});
      } else {
        this.unlocked = true;
      }
    } catch {
      /* ignore */
    }
  }

  /**
   * Starts a new speaking session and invalidates the previous one.
   * @param tag Identifies what is being read (e.g. a message index); it is
   *            passed to onSpeakingChange when playback starts.
   */
  begin(tag: unknown = null): void {
    this.resetSession();
    this.accepting = true;
    this.streamDone = false;
    this.tag = tag;
  }

  /**
   * Feeds incremental text; every completed sentence is queued for synthesis.
   * Ignored after stop()/destroy() until begin() is called again, so a stream
   * that keeps delivering text cannot restart speech the user has stopped.
   */
  feed(delta: string): void {
    if (!this.accepting) return;
    this.pending += delta;
    const { chunks, rest } = splitSpeakable(this.pending, this.minLen);
    this.pending = rest;
    for (const c of chunks) this.enqueue(c);
  }

  /**
   * Marks the stream as finished: queues the remaining text as the last chunk
   * and tries to play. Ignored after stop()/destroy() until the next begin().
   */
  flush(): void {
    if (!this.accepting) return;
    const rest = this.pending.trim();
    this.pending = "";
    if (rest) this.enqueue(rest);
    this.streamDone = true;
    this.pumpPlay();
  }

  /** Reads a complete text in one go (e.g. replaying a message). Starts a new session. */
  speakWhole(text: string, tag: unknown = null): void {
    this.begin(tag);
    const { chunks, rest } = splitSpeakable(text, this.minLen);
    for (const c of chunks) this.enqueue(c);
    if (rest.trim()) this.enqueue(rest);
    this.streamDone = true;
    this.pumpPlay();
  }

  /** Stops playback, clears the queue, cancels in-flight synthesis and reports `null` to onSpeakingChange. */
  stop(): void {
    this.resetSession();
    this.accepting = false;
    this.streamDone = true;
    this.tag = null;
    this.onSpeakingChange?.(null);
  }

  /** Releases resources (call on component unmount). No callback is invoked. */
  destroy(): void {
    this.resetSession();
    this.accepting = false;
    this.streamDone = true;
    this.tag = null;
    this.audio = null;
  }

  // ===== Internals =====

  /**
   * Ends the current session: bumps the session id, aborts in-flight synthesis
   * requests, revokes object URLs, clears the queue and silences both outputs.
   */
  private resetSession(): void {
    this.session += 1;
    this.aborter?.abort();
    this.aborter = null;
    this.revokeAll();
    this.texts = [];
    this.slots = [];
    this.playIdx = 0;
    this.nextFetch = 0;
    this.inFlight = 0;
    this.playing = false;
    this.audioBusy = false;
    this.pending = "";
    if (typeof window !== "undefined") window.speechSynthesis?.cancel();
    if (this.audio) {
      this.audio.onended = null;
      this.audio.onerror = null;
      this.audio.pause();
    }
  }

  /** Revokes every object URL still held by the queue. */
  private revokeAll(): void {
    for (const u of this.slots) {
      if (typeof u === "string" && u.startsWith("blob:")) URL.revokeObjectURL(u);
    }
  }

  /** Appends a chunk (slot = null, i.e. "synthesizing") and starts fetching if capacity allows. */
  private enqueue(text: string): void {
    this.texts.push(text);
    this.slots.push(null);
    this.pumpFetch();
  }

  /** Starts synthesis requests in queue order, up to maxInFlight at a time. */
  private pumpFetch(): void {
    while (this.inFlight < this.maxInFlight && this.nextFetch < this.texts.length) {
      const i = this.nextFetch++;
      this.inFlight += 1;
      void this.fetchChunk(i, this.session);
    }
  }

  /**
   * Synthesizes chunk `i` and stores the outcome in its slot: an object URL on
   * success, "speech" when the backend fails, "error" when nothing speakable
   * remains after markup removal. Results of an ended session are dropped.
   */
  private async fetchChunk(i: number, session: number): Promise<void> {
    const clean = stripMarkdown(this.texts[i] ?? "").slice(0, 600);
    let outcome: Slot = "error";
    if (clean) {
      const aborter = this.aborter ?? (this.aborter = new AbortController());
      try {
        const res = await this.fetchImpl(this.endpoint, {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify({ text: clean, voice: this.voice }),
          signal: aborter.signal,
        });
        if (session !== this.session) return;
        if (res.ok) {
          const blob = await res.blob();
          if (session !== this.session) return;
          outcome = URL.createObjectURL(blob);
        } else {
          outcome = "speech"; // backend TTS failed -> browser speech fallback
        }
      } catch (err) {
        // Includes the AbortError raised when the session was reset.
        if (session !== this.session) return;
        outcome = "speech";
        this.onError?.(err);
      }
    }
    // Reached on every path of the live session, including the empty-chunk
    // one, so the in-flight counter is always released and the pumps keep going.
    this.slots[i] = outcome;
    this.inFlight = Math.max(0, this.inFlight - 1);
    this.pumpFetch();
    this.pumpPlay();
  }

  /** Plays the next chunk if nothing is playing and that chunk is ready. */
  private pumpPlay(): void {
    if (this.playing) return;
    const i = this.playIdx;
    if (i >= this.texts.length) {
      if (this.streamDone) {
        this.playing = false;
        this.onSpeakingChange?.(null);
      }
      return;
    }
    const slot = this.slots[i];
    if (slot === null || slot === undefined) return; // still synthesizing; resumed by fetchChunk
    if (slot === "error") {
      this.playIdx = i + 1;
      this.pumpPlay();
      return;
    }
    if (slot === "speech") {
      this.playViaBrowser(i);
      return;
    }
    this.playViaAudio(i, slot);
  }

  /**
   * Plays a synthesized chunk through the audio element. If the element cannot
   * play it (autoplay blocked, decode error), the chunk is handed to the
   * browser speech fallback instead of stalling the queue.
   */
  private playViaAudio(i: number, url: string): void {
    const session = this.session;
    const el = this.getAudio();
    let settled = false;
    // next = "error": chunk consumed, advance; next = "speech": retry via browser speech.
    const finish = (next: "error" | "speech"): void => {
      if (settled || session !== this.session) return;
      settled = true;
      el.onended = null;
      el.onerror = null;
      this.audioBusy = false;
      this.playing = false;
      if (url.startsWith("blob:")) URL.revokeObjectURL(url);
      this.slots[i] = next;
      if (next === "error") this.playIdx = i + 1;
      this.pumpPlay();
    };
    el.onended = () => finish("error");
    el.onerror = () => finish("speech");
    el.src = url;
    el.muted = false;
    this.audioBusy = true;
    this.playing = true;
    this.onSpeakingChange?.(this.tag);
    el.play().catch((err: unknown) => {
      // pause() during a session reset rejects this promise too; ignore that case.
      if (session !== this.session) return;
      this.onError?.(err);
      finish("speech");
    });
  }

  /** Reads chunk `i` with the Web Speech API; skips it when that is unavailable. */
  private playViaBrowser(i: number): void {
    const text = stripMarkdown(this.texts[i] ?? "");
    const synth = typeof window !== "undefined" ? window.speechSynthesis : undefined;
    if (!text || !synth) {
      this.playIdx = i + 1;
      this.pumpPlay();
      return;
    }
    const session = this.session;
    let settled = false;
    this.playing = true;
    this.onSpeakingChange?.(this.tag);
    const done = (): void => {
      // speechSynthesis.cancel() during a reset still fires end/error on the
      // old utterance; the session check keeps it from touching the new queue.
      if (settled || session !== this.session) return;
      settled = true;
      this.playing = false;
      this.slots[i] = "error";
      this.playIdx = i + 1;
      this.pumpPlay();
    };
    try {
      const u = new SpeechSynthesisUtterance(text);
      u.lang = this.lang;
      u.rate = 1;
      u.onend = done;
      u.onerror = done;
      synth.speak(u);
    } catch {
      done();
    }
  }
}
+47
View File
@@ -0,0 +1,47 @@
/**
* 文本处理工具:用于把 Markdown 回答转成适合朗读的纯文本,
* 以及在流式输出时按句切分,便于「边流式边合成」。
*/
/**
 * Removes Markdown markup and returns plain text suitable for reading aloud.
 *
 * The rules run in order: fenced code blocks are dropped, inline code keeps its
 * content, images are dropped, links keep their label, heading / blockquote /
 * list markers are removed, emphasis and strikethrough keep their content, and
 * finally every run of two or more whitespace characters collapses to a single
 * space before the result is trimmed.
 */
export function stripMarkdown(md: string): string {
  const rules: ReadonlyArray<readonly [RegExp, string]> = [
    [/```[\s\S]*?```/g, ""],
    [/`([^`]+)`/g, "$1"],
    [/!\[[^\]]*\]\([^)]*\)/g, ""],
    [/\[([^\]]+)\]\([^)]*\)/g, "$1"],
    [/^#{1,6}\s+/gm, ""],
    [/^\s*>\s?/gm, ""],
    [/^\s*[-*+]\s+/gm, ""],
    [/\*\*([^*]+)\*\*/g, "$1"],
    [/\*([^*]+)\*/g, "$1"],
    [/_{1,2}([^_]+)_{1,2}/g, "$1"],
    [/~~([^~]+)~~/g, "$1"],
    [/\s{2,}/g, " "],
  ];
  let plain = md;
  for (const [pattern, replacement] of rules) {
    plain = plain.replace(pattern, replacement);
  }
  return plain.trim();
}
/**
 * Splits text at sentence terminators for streaming synthesis.
 *
 * Sentences are taken from the front of `text` one at a time and accumulated;
 * as soon as the accumulated text, after Markdown removal, is at least `minLen`
 * characters long it is emitted as one chunk.
 *
 * @param text   Text to split.
 * @param minLen Minimum chunk length, measured after markup removal (default 14).
 * @returns `chunks`: speakable chunks in order; `rest`: the trailing text that
 *          is not emitted yet (too-short sentences plus any unterminated tail).
 */
export function splitSpeakable(
  text: string,
  minLen = 14
): { chunks: string[]; rest: string } {
  const sentenceHead = /^[\s\S]*?[。!?!?\n;…]+/;
  const chunks: string[] = [];
  let remaining = text;
  let carry = "";
  for (
    let hit = sentenceHead.exec(remaining);
    hit !== null;
    hit = sentenceHead.exec(remaining)
  ) {
    const sentence = hit[0];
    carry += sentence;
    remaining = remaining.slice(sentence.length);
    // Too short so far: keep accumulating into the same chunk.
    if (stripMarkdown(carry).length < minLen) continue;
    chunks.push(carry);
    carry = "";
  }
  return { chunks, rest: carry + remaining };
}
+10
View File
@@ -0,0 +1,10 @@
{
"extends": "../../tsconfig.base.json",
"compilerOptions": {
"rootDir": "./src",
"outDir": "./dist",
"lib": ["dom", "dom.iterable", "ES2022"],
"types": []
},
"include": ["src"]
}