feat(tts): 新增通用流式 TTS 引擎并接入 AI 对话

- 新增 @wenwumap/tts 独立包:边流式边合成、按句排队顺序播放、
  专业 TTS 失败自动降级浏览器朗读,含 README 使用说明
- AI 后端新增 /ai/tts 接口,改用 DashScope CosyVoice(cosyvoice-v3-flash)
  输出 mp3,串行+退避重试规避 429 限流
- web 对话面板接入 SpeechQueue,按角色配音色,加语音开关与朗读按钮
- admin 支持 /admin/ 基路径部署
- 地图页移除大面积 backdrop-blur,降低 GPU 占用
This commit is contained in:
selfrelease
2026-06-14 23:13:26 +08:00
parent 3a55cd1978
commit 4a9397bccc
17 changed files with 955 additions and 26 deletions
+3
View File
@@ -0,0 +1,3 @@
export { SpeechQueue } from "./speech-queue";
export type { SpeechQueueOptions } from "./speech-queue";
export { stripMarkdown, splitSpeakable } from "./text";
+313
View File
@@ -0,0 +1,313 @@
import { splitSpeakable, stripMarkdown } from "./text";
/** Construction options for SpeechQueue. */
export interface SpeechQueueOptions {
/** Backend TTS endpoint: POST { text, voice } -> audio binary (mp3/wav, etc.). */
endpoint: string;
/** Default voice (can be overridden at any time with setVoice). */
voice?: string;
/** Language used by the browser speech-synthesis fallback; defaults to zh-CN. */
lang?: string;
/** Minimum sentence length (measured after markup removal); defaults to 14. */
minSentenceLen?: number;
/** Maximum number of in-flight synthesis requests; defaults to 3. */
maxInFlight?: number;
/** Custom fetch implementation (defaults to the global fetch). */
fetchImpl?: typeof fetch;
/** Playback state change: called with the session tag when a session starts playing, and with null on stop/end. */
onSpeakingChange?: (tag: unknown | null) => void;
/** Synthesis or playback error (non-fatal; the queue degrades or skips automatically). */
onError?: (err: unknown) => void;
}
type Slot = string | null | "error" | "speech";
// null = synthesis in progress; string = ready object URL; "error" = skip this chunk; "speech" = browser speech-synthesis fallback
/**
 * Framework-agnostic streaming TTS playback engine (depends only on browser APIs).
 *
 * Design notes:
 * - Synthesize while streaming: feed() accepts incremental text and every
 *   completed sentence is sent for synthesis immediately, which lowers the
 *   time to first audio.
 * - Ordered playback: synthesis requests may complete concurrently and out of
 *   order, but chunks are always played in the order they were enqueued.
 * - Two-tier fallback: when the TTS endpoint fails (rate limit / exception) or
 *   the returned audio cannot be played, the chunk is read with the browser's
 *   Web Speech API instead of being dropped silently.
 * - Autoplay permission: unlock() must be called once inside a user gesture.
 *
 * Every asynchronous callback captures the session number it was created in
 * and becomes a no-op once begin()/stop()/destroy() has started a newer
 * session, so late events can never corrupt the state of the current queue.
 */
export class SpeechQueue {
  /** Zero-length WAV clip used to unlock audio playback inside a user gesture. */
  private static readonly SILENT_WAV =
    "data:audio/wav;base64,UklGRjIAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAAAAAA==";
  /** Upper bound on the number of characters sent to the endpoint per chunk. */
  private static readonly MAX_CHUNK_CHARS = 600;

  private readonly endpoint: string;
  private readonly lang: string;
  private readonly minLen: number;
  private readonly maxInFlight: number;
  private readonly fetchImpl: typeof fetch;
  private readonly onSpeakingChange?: (tag: unknown | null) => void;
  private readonly onError?: (err: unknown) => void;
  private voice: string;
  private audio: HTMLAudioElement | null = null;
  private unlocked = false;
  /** Raw text of every enqueued chunk, index-aligned with `slots`. */
  private texts: string[] = [];
  /** Synthesis state of every enqueued chunk (see the Slot type). */
  private slots: Slot[] = [];
  /** Index of the next chunk to play. */
  private playIdx = 0;
  /** Index of the next chunk to request from the endpoint. */
  private nextFetch = 0;
  /** Number of synthesis requests currently outstanding. */
  private inFlight = 0;
  private playing = false;
  /** True once no more text will be fed into the current session. */
  private streamDone = false;
  /** Streamed text that does not yet form a complete sentence. */
  private pending = "";
  private tag: unknown = null;
  /** Monotonic session counter; bumping it invalidates all older callbacks. */
  private session = 0;

  constructor(opts: SpeechQueueOptions) {
    this.endpoint = opts.endpoint;
    this.voice = opts.voice ?? "";
    this.lang = opts.lang ?? "zh-CN";
    this.minLen = opts.minSentenceLen ?? 14;
    // A limit below 1 (or NaN) would prevent every request from starting and
    // stall the queue forever, so fall back to a single in-flight request.
    const limit = opts.maxInFlight ?? 3;
    this.maxInFlight = limit >= 1 ? limit : 1;
    this.fetchImpl = opts.fetchImpl ?? globalThis.fetch?.bind(globalThis);
    this.onSpeakingChange = opts.onSpeakingChange;
    this.onError = opts.onError;
  }

  /** Sets the voice used for subsequent synthesis requests; empty values are ignored. */
  setVoice(v: string): void {
    if (v) this.voice = v;
  }

  /** Lazily creates the single audio element shared by all chunks. */
  private getAudio(): HTMLAudioElement {
    if (!this.audio) {
      this.audio = new Audio();
      this.audio.preload = "auto";
    }
    return this.audio;
  }

  /**
   * Unlocks audio autoplay. Must be called synchronously inside a user gesture
   * (e.g. a click handler). Plays a muted, zero-length clip once; does nothing
   * outside a browser, after a successful unlock, or while a chunk is playing.
   */
  unlock(): void {
    if (this.unlocked || typeof window === "undefined") return;
    // Replacing the source now would cut off the chunk that is being played.
    if (this.playing) return;
    const el = this.getAudio();
    try {
      el.src = SpeechQueue.SILENT_WAV;
      el.muted = true;
      const p = el.play();
      if (p && typeof p.then === "function") {
        p.then(() => {
          // Only reset the element if it still holds the silent clip; a real
          // chunk may have started playing before this promise resolved.
          if (el.src === SpeechQueue.SILENT_WAV) {
            el.pause();
            el.currentTime = 0;
            el.muted = false;
          }
          this.unlocked = true;
        }).catch(() => {
          /* best effort: a later gesture may call unlock() again */
        });
      } else {
        this.unlocked = true;
      }
    } catch {
      /* best effort: ignore */
    }
  }

  /**
   * Starts a new speaking session and invalidates the previous one.
   * @param tag Identifies the content being read (e.g. a message index); it is
   *            passed to onSpeakingChange whenever a chunk starts playing.
   */
  begin(tag: unknown = null): void {
    this.reset(tag, false);
  }

  /** Feeds incremental streamed text; every completed sentence is enqueued for synthesis. */
  feed(delta: string): void {
    this.pending += delta;
    const { chunks, rest } = splitSpeakable(this.pending, this.minLen);
    this.pending = rest;
    for (const c of chunks) this.enqueue(c);
  }

  /** Marks the stream as finished: enqueues the remaining text as the last chunk and tries to play. */
  flush(): void {
    const rest = this.pending.trim();
    this.pending = "";
    if (rest) this.enqueue(rest);
    this.streamDone = true;
    this.pumpPlay();
  }

  /** Reads a complete text in one go (e.g. replaying a message), replacing any current session. */
  speakWhole(text: string, tag: unknown = null): void {
    this.begin(tag);
    const { chunks, rest } = splitSpeakable(text, this.minLen);
    for (const c of chunks) this.enqueue(c);
    if (rest.trim()) this.enqueue(rest);
    this.streamDone = true;
    this.pumpPlay();
  }

  /** Stops playback, clears the queue and reports `null` through onSpeakingChange. */
  stop(): void {
    this.reset(null, true);
    this.onSpeakingChange?.(null);
  }

  /** Releases resources (call on component unmount). No callback is emitted. */
  destroy(): void {
    this.reset(null, true);
  }

  // ===== Internals =====

  /**
   * Invalidates the current session and returns the queue to an empty state:
   * revokes pending object URLs, cancels browser speech and silences the audio
   * element. Shared by begin(), stop() and destroy().
   */
  private reset(tag: unknown, streamDone: boolean): void {
    this.session += 1;
    this.revokeAll();
    this.texts = [];
    this.slots = [];
    this.playIdx = 0;
    this.nextFetch = 0;
    this.inFlight = 0;
    this.playing = false;
    this.streamDone = streamDone;
    this.pending = "";
    this.tag = tag;
    if (typeof window !== "undefined") window.speechSynthesis?.cancel();
    if (this.audio) {
      this.audio.onended = null;
      this.audio.pause();
    }
  }

  /** Revokes every object URL still held by the queue. */
  private revokeAll(): void {
    for (const u of this.slots) {
      if (typeof u === "string" && u.startsWith("blob:")) URL.revokeObjectURL(u);
    }
  }

  /** Appends a chunk in the "synthesizing" state and kicks the fetch pump. */
  private enqueue(text: string): void {
    this.texts.push(text);
    this.slots.push(null);
    this.pumpFetch();
  }

  /** Starts synthesis requests, in order, until the in-flight limit is reached. */
  private pumpFetch(): void {
    while (this.inFlight < this.maxInFlight && this.nextFetch < this.texts.length) {
      const i = this.nextFetch++;
      this.inFlight += 1;
      void this.fetchChunk(i, this.session);
    }
  }

  /**
   * Resolves chunk `i` to its final slot value: an object URL on success,
   * "speech" when the endpoint fails, or "error" when the chunk contains
   * nothing speakable. Results that arrive after the session changed are
   * discarded. The in-flight slot is released on every path of the current
   * session, including the empty-chunk path.
   */
  private async fetchChunk(i: number, session: number): Promise<void> {
    const clean = stripMarkdown(this.texts[i] ?? "").slice(0, SpeechQueue.MAX_CHUNK_CHARS);
    let slot: Slot = "error";
    if (clean) {
      try {
        const res = await this.fetchImpl(this.endpoint, {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify({ text: clean, voice: this.voice }),
        });
        if (session !== this.session) return;
        if (!res.ok) {
          slot = "speech"; // endpoint failed -> browser speech fallback
        } else {
          const blob = await res.blob();
          if (session !== this.session) return;
          slot = URL.createObjectURL(blob);
        }
      } catch (err) {
        if (session !== this.session) return;
        slot = "speech";
        this.onError?.(err);
      }
    }
    if (session !== this.session) return;
    this.slots[i] = slot;
    this.inFlight = Math.max(0, this.inFlight - 1);
    this.pumpFetch();
    this.pumpPlay();
  }

  /**
   * Plays the next ready chunk, skipping chunks marked "error". Returns while
   * the next chunk is still being synthesized; reports `null` through
   * onSpeakingChange once the queue is drained and the stream has finished.
   */
  private pumpPlay(): void {
    if (this.playing) return;
    while (this.playIdx < this.texts.length) {
      const i = this.playIdx;
      const slot = this.slots[i];
      if (slot === null || slot === undefined) return; // still synthesizing
      if (slot === "error") {
        this.playIdx = i + 1;
        continue;
      }
      if (slot === "speech") this.playViaBrowser(i);
      else this.playViaAudio(i, slot);
      return;
    }
    if (this.streamDone) this.onSpeakingChange?.(null);
  }

  /**
   * Plays chunk `i` from its object URL. When playback ends the chunk is
   * consumed and the queue advances; when play() is rejected the error is
   * reported and the chunk is retried with browser speech synthesis.
   */
  private playViaAudio(i: number, url: string): void {
    const session = this.session;
    const el = this.getAudio();
    let settled = false;
    const finish = (next: Slot): void => {
      if (settled || session !== this.session) return;
      settled = true;
      el.onended = null;
      if (url.startsWith("blob:")) URL.revokeObjectURL(url);
      this.playing = false;
      this.slots[i] = next;
      if (next === "error") this.playIdx = i + 1;
      this.pumpPlay();
    };
    el.src = url;
    el.muted = false;
    this.playing = true;
    this.onSpeakingChange?.(this.tag);
    el.onended = () => finish("error"); // "error" marks the chunk as consumed
    el.play().catch((err: unknown) => {
      // pause() from begin()/stop() rejects a pending play() promise; such a
      // rejection belongs to a superseded session and must be ignored.
      if (settled || session !== this.session) return;
      this.onError?.(err);
      finish("speech");
    });
  }

  /**
   * Reads chunk `i` with the browser's speech synthesis. The chunk is skipped
   * when speech synthesis is unavailable or the text is empty after markup
   * removal.
   */
  private playViaBrowser(i: number): void {
    const text = stripMarkdown(this.texts[i] ?? "");
    const synth = typeof window !== "undefined" ? window.speechSynthesis : undefined;
    if (!text || !synth) {
      this.playIdx = i + 1;
      this.pumpPlay();
      return;
    }
    const session = this.session;
    let settled = false;
    const done = (): void => {
      // speechSynthesis.cancel() in reset() makes the old utterance fire an
      // event later; without the session check it would overwrite the state
      // of the new session. `settled` keeps end+error from advancing twice.
      if (settled || session !== this.session) return;
      settled = true;
      this.playing = false;
      this.slots[i] = "error";
      this.playIdx = i + 1;
      this.pumpPlay();
    };
    this.playing = true;
    this.onSpeakingChange?.(this.tag);
    try {
      const u = new SpeechSynthesisUtterance(text);
      u.lang = this.lang;
      u.rate = 1;
      u.onend = done;
      u.onerror = done;
      synth.speak(u);
    } catch {
      done();
    }
  }
}
+47
View File
@@ -0,0 +1,47 @@
/**
* 文本处理工具:用于把 Markdown 回答转成适合朗读的纯文本,
* 以及在流式输出时按句切分,便于「边流式边合成」。
*/
/**
 * Converts Markdown into plain text suitable for being read aloud.
 *
 * The rewrite rules run strictly in the listed order (fenced code first,
 * whitespace collapsing last) and the result is trimmed.
 */
export function stripMarkdown(md: string): string {
  const rules: ReadonlyArray<readonly [RegExp, string]> = [
    [/```[\s\S]*?```/g, ""], // fenced code blocks are dropped entirely
    [/`([^`]+)`/g, "$1"], // inline code keeps its content
    [/!\[[^\]]*\]\([^)]*\)/g, ""], // images are dropped
    [/\[([^\]]+)\]\([^)]*\)/g, "$1"], // links keep their label
    [/^#{1,6}\s+/gm, ""], // heading markers
    [/^\s*>\s?/gm, ""], // blockquote markers
    [/^\s*[-*+]\s+/gm, ""], // bullet list markers
    [/\*\*([^*]+)\*\*/g, "$1"], // bold
    [/\*([^*]+)\*/g, "$1"], // italic
    [/_{1,2}([^_]+)_{1,2}/g, "$1"], // underscore emphasis
    [/~~([^~]+)~~/g, "$1"], // strikethrough
    [/\s{2,}/g, " "], // collapse whitespace runs
  ];
  let plain = md;
  for (const [pattern, replacement] of rules) {
    plain = plain.replace(pattern, replacement);
  }
  return plain.trim();
}
/**
 * Splits text into speakable sentence chunks.
 *
 * A chunk ends at a run of sentence terminators (。!?!?;… or a newline)
 * and is emitted once its length after Markdown removal reaches `minLen`;
 * shorter sentences are merged with the following ones. A chunk is never
 * closed while it contains an unbalanced ``` fence: stripMarkdown can only
 * remove a fenced code block when both delimiters are in the same chunk, so
 * splitting inside the block would cause the code lines to be read aloud.
 *
 * @param text   Text accumulated so far (may end in the middle of a sentence).
 * @param minLen Minimum chunk length measured after Markdown removal.
 * @returns `chunks`: complete chunks in order; `rest`: the trailing text that
 *          does not form a complete chunk yet (including any unclosed fenced
 *          block), to be prepended to the next call's input.
 */
export function splitSpeakable(
  text: string,
  minLen = 14
): { chunks: string[]; rest: string } {
  const sentence = /^[\s\S]*?[。!?!?\n;…]+/;
  const chunks: string[] = [];
  let rest = text;
  let buf = "";
  let m: RegExpExecArray | null;
  while ((m = sentence.exec(rest))) {
    buf += m[0];
    rest = rest.slice(m[0].length);
    // An odd number of fence markers means a code block is still open.
    const fenceCount = buf.match(/```/g)?.length ?? 0;
    if (fenceCount % 2 === 1) continue;
    if (stripMarkdown(buf).length >= minLen) {
      chunks.push(buf);
      buf = "";
    }
  }
  rest = buf + rest;
  return { chunks, rest };
}