feat(speaking): 添加语音输入功能

2026-03-23 14:15:51 +08:00 · 2026-03-23 14:15:51 +08:00 · 124edfa678
parent ea570f894e
commit 124edfa678
7 changed files with 1293 additions and 1 deletions
--- a/.codebuddy/plans/speaking-voice-input_bb1371d9.md
+++ b/.codebuddy/plans/speaking-voice-input_bb1371d9.md
@ -0,0 +1,141 @@
+---
+name: speaking-voice-input
+overview: 在 Speaking.vue 中增加语音输入功能，调用豆包流式语音识别 API 实现语音转文字
+todos:
+  - id: update-config
+    content: 在 config/index.js 中添加 DOUBAO_ASR_WS_PATH 配置
+    status: completed
+  - id: add-recording-state
+    content: 在 Speaking.vue 中添加录音相关状态和变量
+    status: completed
+    dependencies:
+      - update-config
+  - id: implement-recording
+    content: 实现录音功能（MediaRecorder 音频采集和转换）
+    status: completed
+    dependencies:
+      - add-recording-state
+  - id: implement-websocket
+    content: 实现 WebSocket 流式识别（连接管理、消息收发）
+    status: completed
+    dependencies:
+      - implement-recording
+  - id: add-voice-input-ui
+    content: 添加麦克风按钮 UI 和录音状态动画
+    status: completed
+    dependencies:
+      - implement-websocket
+  - id: cleanup-resources
+    content: 添加组件卸载时的资源清理逻辑
+    status: completed
+    dependencies:
+      - add-voice-input-ui
+---
+
+## 产品需求
+
+在英语口语对话页面（Speaking.vue）中增加语音输入功能，让用户可以通过麦克风进行语音输入，系统调用豆包大模型流式语音识别 API 将语音转换为文字。
+
+## 核心功能
+
+- **麦克风按钮**：在输入框旁边添加麦克风图标按钮，支持点击开始/停止录音
+- **实时录音**：使用浏览器 MediaRecorder API 采集麦克风音频（16kHz, 16bit, 单声道, PCM 格式）
+- **流式识别**：通过 WebSocket 发送音频流到豆包 ASR API，实时接收识别结果
+- **结果回填**：将识别文字自动填入输入框，用户可编辑后发送
+- **状态反馈**：录音状态（空闲/录音中）的视觉反馈，包括按钮样式变化和波形动画
+
+## 用户交互流程
+
+1. 用户点击麦克风按钮，开始录音
+2. 页面显示录音中状态，按钮变为红色闪烁
+3. 用户说话，音频数据通过 WebSocket 实时发送到服务器
+4. 服务器返回识别结果，显示在输入框中
+5. 用户点击停止按钮或再次点击麦克风停止录音
+6. 用户确认输入内容后点击发送按钮
+
+## 技术栈
+
+- 前端框架：Vue 3 + Composition API
+- 样式：Scss（现有项目使用）
+- 音频采集：浏览器 MediaRecorder API
+- 实时通信：WebSocket
+- 认证方式：WebSocket 建连请求头 X-Api-App-Key + X-Api-Access-Key + X-Api-Resource-Id（注意：ASR 使用 X-Api-App-Key，而非 X-Api-App-Id）
+
+## 实现方案
+
+### 1. 配置更新 (config/index.js)
+
+添加流式 ASR WebSocket 地址配置：
+
+```javascript
+// 豆包流式语音识别 ASR
+export const DOUBAO_ASR_WS_PATH = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel";
+```
+
+### 2. Speaking.vue 功能实现
+
+#### 录音控制
+
+- 使用 `navigator.mediaDevices.getUserMedia` 获取麦克风权限
+- 使用 `MediaRecorder` API 录制音频（mimeType: 'audio/webm;codecs=opus' 或 'audio/pcm'）
+- 由于浏览器不直接支持录制 PCM，需要录制后转换或使用 AudioContext 重采样
+
+#### WebSocket 通信
+
+- 建立 WebSocket 连接，使用认证头
+- 发送初始化消息（包含音频参数配置）
+- 分帧发送音频数据（建议每 100~200ms 一帧，双向流式模式下 200ms 最优）
+- 接收并处理识别结果消息
+
+#### 音频处理
+
+由于浏览器 MediaRecorder 不直接支持录制 PCM，需要：
+
+- 方案一：录制 webm/opus 格式，发送时解码为 PCM
+- 方案二：使用 AudioContext + ScriptProcessor/AudioWorklet 实时采集 PCM
+- 推荐方案一，实现更简单，兼容性更好
+
+#### UI 组件
+
+- 麦克风按钮（SVG 图标）
+- 录音状态动画（波形或脉冲效果）
+- 录音时长显示（可选）
+
+## 关键代码结构
+
+### 录音状态管理
+
+```javascript
+const isRecording = ref(false);
+const recordingDuration = ref(0);
+let mediaRecorder = null;
+let audioContext = null;
+let websocket = null;
+```
+
+### WebSocket 消息处理
+
+```javascript
+// 初始化消息格式
+{ 
+  "app": { "appid": DOUBAO_APP_ID, "token": DOUBAO_ACCESS_TOKEN },
+  "user": { "uid": "speaking_asr_" + Date.now() },
+  "audio": { "format": "pcm", "rate": 16000, "bits": 16, "channel": 1 }
+}
+```
+
+## 性能考虑
+
+- 音频帧发送间隔：100~200ms（豆包 ASR 文档建议值；16kHz/16bit/单声道下 200ms 约 6400 字节）
+- WebSocket 心跳：防止连接断开
+- 错误重连：录音中断时自动尝试重连
+- 内存管理：及时释放音频资源
+
+## 兼容性
+
+- 使用 `MediaRecorder.isTypeSupported` 检测浏览器支持
+- 降级处理：浏览器不支持时隐藏麦克风按钮
+
+# Agent Extensions
+
+此任务不需要使用任何 Agent Extensions
--- a/package-lock.json
+++ b/package-lock.json
@ -11,10 +11,13 @@
        "@ffmpeg/ffmpeg": "^0.12.15",
        "@ffmpeg/util": "^0.12.2",
        "axios": "^1.13.6",
+        "marked": "^17.0.5",
+        "pako": "^2.1.0",
        "vue": "^3.5.30",
        "vue-router": "^5.0.3"
      },
      "devDependencies": {
+        "@types/pako": "^2.0.4",
        "@vitejs/plugin-vue": "^6.0.5",
        "vite": "^8.0.0"
      }
@ -500,6 +503,13 @@
        "tslib": "^2.4.0"
      }
    },
+    "node_modules/@types/pako": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmmirror.com/@types/pako/-/pako-2.0.4.tgz",
+      "integrity": "sha512-VWDCbrLeVXJM9fihYodcLiIv0ku+AlOa/TQ1SvYOaBuyrSKgEcro95LJyIsJ4vSo6BXIxOKxiJAat04CmST9Fw==",
+      "dev": true,
+      "license": "MIT"
+    },
    "node_modules/@vitejs/plugin-vue": {
      "version": "6.0.5",
      "resolved": "https://registry.npmmirror.com/@vitejs/plugin-vue/-/plugin-vue-6.0.5.tgz",
@ -1392,6 +1402,18 @@
        "url": "https://github.com/sponsors/sxzz"
      }
    },
+    "node_modules/marked": {
+      "version": "17.0.5",
+      "resolved": "https://registry.npmmirror.com/marked/-/marked-17.0.5.tgz",
+      "integrity": "sha512-6hLvc0/JEbRjRgzI6wnT2P1XuM1/RrrDEX0kPt0N7jGm1133g6X7DlxFasUIx+72aKAr904GTxhSLDrd5DIlZg==",
+      "license": "MIT",
+      "bin": {
+        "marked": "bin/marked.js"
+      },
+      "engines": {
+        "node": ">= 20"
+      }
+    },
    "node_modules/math-intrinsics": {
      "version": "1.1.0",
      "resolved": "https://registry.npmmirror.com/math-intrinsics/-/math-intrinsics-1.1.0.tgz",
@ -1475,6 +1497,12 @@
        "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1"
      }
    },
+    "node_modules/pako": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmmirror.com/pako/-/pako-2.1.0.tgz",
+      "integrity": "sha512-w+eufiZ1WuJYgPXbV/PO3NCMEc3xqylkKHzp8bxp1uW4qaSNQUkwmLLEc3kKsfz8lpV1F8Ht3U1Cm+9Srog2ug==",
+      "license": "(MIT AND Zlib)"
+    },
    "node_modules/pathe": {
      "version": "2.0.3",
      "resolved": "https://registry.npmmirror.com/pathe/-/pathe-2.0.3.tgz",
--- a/package.json
+++ b/package.json
@ -12,10 +12,13 @@
    "@ffmpeg/ffmpeg": "^0.12.15",
    "@ffmpeg/util": "^0.12.2",
    "axios": "^1.13.6",
+    "marked": "^17.0.5",
+    "pako": "^2.1.0",
    "vue": "^3.5.30",
    "vue-router": "^5.0.3"
  },
  "devDependencies": {
+    "@types/pako": "^2.0.4",
    "@vitejs/plugin-vue": "^6.0.5",
    "vite": "^8.0.0"
  }
--- a/src/config/index.js
+++ b/src/config/index.js
@ -27,6 +27,11 @@ export const DOUBAO_TTS_API_PATH = "/tts-api/api/v3/tts/unidirectional";
 export const DOUBAO_AUDIO_FORMAT = "mp3";
 export const DOUBAO_SAMPLE_RATE = 24000;

+// ── Doubao streaming speech recognition (ASR) for voice input ──
+export const DOUBAO_ASR_WS_PATH = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel";
+// Resource ID for the hourly (duration-billed) plan; the concurrency-billed alternative is volc.bigasr.sauc.concurrent
+export const DOUBAO_ASR_RESOURCE_ID = "volc.bigasr.sauc.duration";
+
 // ── 火山引擎 Ark 大模型（口语对话）──
 export const ARK_API_KEY = "";
 export const ARK_MODEL = "doubao-pro-4k";
--- a/src/md/doubao_audio.md
+++ b/src/md/doubao_audio.md
@ -0,0 +1,749 @@
+<span id="1d388eb1"></span>
+# 简介
+本文档介绍如何通过WebSocket协议实时访问大模型流式语音识别服务 (ASR)，主要包含鉴权相关、协议详情、常见问题和使用Demo四部分。
+双向流式模式使用的接口地址是 wss://[openspeech.bytedance.com/api/v3/sauc/bigmodel](http://openspeech.bytedance.com/api/v3/sauc/bigmodel)
+流式输入模式使用的接口地址是 wss://[openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream](http://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream)
+
+1. 两者都是每输入一个包返回一个包，双向流式模式会尽快返回识别到的字符，速度较快。
+2. 流式输入模式会在输入音频大于15s或发送最后一包（负包）后返回识别到的结果，准确率更高。
+3. 无论是哪种模式，单包音频大小建议在100~200ms左右，发包间隔建议100～200ms，不能过大或者过小，否则均会影响性能。（注：针对双向流式模式，单包为200ms大小时性能最优，建议双向流式模式选取200ms大小的分包）
+4. 流式输入模式在平均音频时长5s时，可以做到300~400ms以内返回。
+
+
+---
+
+
+双向流式模式（优化版本）接口地址：wss://[openspeech.bytedance.com/api/v3/sauc/bigmodel_async](http://openspeech.bytedance.com/api/v3/sauc/bigmodel_async)
+
+1. 该模式下，不再是每一包输入对应一包返回，只有当结果有变化时才会返回新的数据包（性能优化 rtf 和首字、尾字时延均有一定程度提升）
+2. 双向流式版本，更推荐使用双向流式模式（优化版本），性能相对更优。
+
+<span id="25d1d6d6"></span>
+# 鉴权
+在 websocket 建连的 HTTP 请求头（Header 中）添加以下信息
+
+| | | | \
+|Key |说明 |Value 示例 |
+|---|---|---|
+| | | | \
+| | | |
+| | | | \
+|X-Api-App-Key |使用火山引擎控制台获取的APP ID，可参考 [控制台使用FAQ-Q1](/docs/6561/196768#q1%EF%BC%9A%E5%93%AA%E9%87%8C%E5%8F%AF%E4%BB%A5%E8%8E%B7%E5%8F%96%E5%88%B0%E4%BB%A5%E4%B8%8B%E5%8F%82%E6%95%B0appid%EF%BC%8Ccluster%EF%BC%8Ctoken%EF%BC%8Cauthorization-type%EF%BC%8Csecret-key-%EF%BC%9F) |123456789 |
+| | | | \
+|X-Api-Access-Key |使用火山引擎控制台获取的Access Token，可参考 [控制台使用FAQ-Q1](/docs/6561/196768#q1%EF%BC%9A%E5%93%AA%E9%87%8C%E5%8F%AF%E4%BB%A5%E8%8E%B7%E5%8F%96%E5%88%B0%E4%BB%A5%E4%B8%8B%E5%8F%82%E6%95%B0appid%EF%BC%8Ccluster%EF%BC%8Ctoken%EF%BC%8Cauthorization-type%EF%BC%8Csecret-key-%EF%BC%9F) |your-access-key |
+| | | | \
+|X-Api-Resource-Id |表示调用服务的资源信息 ID |豆包流式语音识别模型1.0 |\
+| | | |\
+| | |* 小时版：volc.bigasr.sauc.duration |\
+| | |* 并发版：volc.bigasr.sauc.concurrent |\
+| | | |\
+| | |豆包流式语音识别模型2.0 |\
+| | | |\
+| | |* 小时版：volc.seedasr.sauc.duration |\
+| | |* 并发版：volc.seedasr.sauc.concurrent |
+|^^|^^|^^| \
+| | | |
+|^^|^^|^^| \
+| | | |
+| | | | \
+|X-Api-Connect-Id |用于追踪当前连接的标志 ID，推荐设置UUID等 |67ee89ba-7050-4c04-a3d7-ac61a63499b3 |
+
+websocket 握手成功后，会返回这些 Response header。强烈建议记录X-Tt-Logid（logid）作为排错线索。
+
+| | | | \
+|Key |说明 |Value 示例 |
+|---|---|---|
+| | | | \
+|X-Api-Connect-Id |用于追踪当前调用信息的标志 ID，推荐用UUID等 |67ee89ba-7050-4c04-a3d7-ac61a63499b3 |
+| | | | \
+|X-Tt-Logid |服务端返回的 logid，建议用户获取和打印方便定位问题 |202407261553070FACFE6D19421815D605 |
+
+```HTTP
+// 建连 HTTP 请求头示例
+GET /api/v3/sauc/bigmodel
+Host: openspeech.bytedance.com
+X-Api-App-Key: 123456789
+X-Api-Access-Key: your-access-key
+X-Api-Resource-Id: volc.bigasr.sauc.duration
+X-Api-Connect-Id: 随机生成的UUID
+
+## 返回 Header
+X-Tt-Logid: 202407261553070FACFE6D19421815D605
+```
+
+<span id="ca5745cc"></span>
+# 协议详情
+<span id="3672cb1f"></span>
+## 交互流程
+
+![Image](https://p9-arcosite.byteimg.com/tos-cn-i-goo7wpa0wc/6d72ca511e454d418f01c735c233bd5d~tplv-goo7wpa0wc-image.image =757x)
+<span id="db13e485"></span>
+## WebSocket 二进制协议
+WebSocket 使用二进制协议传输数据。协议的组成由至少 4 个字节的可变 header、payload size 和 payload 三部分组成，其中 header 描述消息类型、序列化方式以及压缩格式等信息，payload size 是 payload 的长度，payload 是具体负载内容，依据消息类型不同 payload 内容不同。
+需注意：协议中整数类型的字段都使用**大端**表示。
+<span id="df933e14"></span>
+### header 数据格式
+
+| | | | | | | | | | \
+|**Byte \ Bit** |**7** |**6** |**5** |**4** |**3** |**2** |**1** |**0** |
+|---|---|---|---|---|---|---|---|---|
+| | |||| |||| \
+|**0** |Protocol version | | | |Header size | | | |
+| | |||| |||| \
+|**1** |Message type | | | |Message type specific flags | | | |
+| | |||| |||| \
+|**2** |Message serialization method | | | |Message compression | | | |
+| | |||||||| \
+|**3** |Reserved | | | | | | | |
+| | |||||||| \
+|**4** |[Optional header extensions] | | | | | | | |
+| | |||||||| \
+|**5** |[Payload, depending on the Message Type] | | | | | | | |
+| | |||||||| \
+|**6** |... | | | | | | | |
+
+<span id="996c63e9"></span>
+### header 字段描述
+
+| | | | \
+|字段 (size in bits) |说明 |值 |
+|---|---|---|
+| | | | \
+|Protocol version (4) |将来可能会决定使用不同的协议版本，因此此字段是为了使客户端和服务器在版本上达成共识。 |0b0001 - version 1 (目前只有该版本) |
+| | | | \
+|Header (4) |Header 大小。实际 header 大小（以字节为单位）是 header size value x 4 。 |0b0001 - header size = 4 (1 x 4) |
+| | | | \
+|Message type (4) |消息类型。 |0b0001 - 端上发送包含请求参数的 full client request |\
+| | |0b0010 - 端上发送包含音频数据的 audio only request |\
+| | |0b1001 - 服务端下发包含识别结果的 full server response |\
+| | |0b1111 - 服务端处理错误时下发的消息类型（如无效的消息格式，不支持的序列化方法等） |
+| | | | \
+|Message type specific flags (4) |Message type 的补充信息。 |0b0000 - header后4个字节不为sequence number |\
+| | |0b0001 - header后4个字节为sequence number且为正 |\
+| | |0b0010 - header后4个字节不为sequence number，仅指示此为最后一包（负包） |\
+| | |0b0011 - header后4个字节为sequence number且需要为负数（最后一包/负包） |
+| | | | \
+|Message serialization method (4) |full client request 的 payload 序列化方法； |\
+| |服务器将使用与客户端相同的序列化方法。 |0b0000 - 无序列化 |\
+| | |0b0001 - JSON 格式 |
+| | | | \
+|Message Compression (4) |定义 payload 的压缩方法； |\
+| |服务端将使用客户端的压缩方法。 |   0b0000 - no compression |\
+| | |   0b0001 - Gzip 压缩     |
+| | | | \
+|Reserved (8) |保留以供将来使用，还用作填充（使整个标头总计4个字节）。 | |
+
+<span id="231d2daf"></span>
+## 请求流程
+<span id="921764de"></span>
+### 建立连接
+根据 WebSocket 协议本身的机制，client 会发送 HTTP GET 请求和 server 建立连接做协议升级。
+需要在其中根据身份认证协议加入鉴权签名头。设置方法请参考鉴权。
+<span id="f8167db8"></span>
+### 发送 full client request
+WebSocket 建立连接后，发送的第一个请求是 full client request。格式是：
+
+| | | | | \
+|**31 ... 24** |**23 ... 16** |**15 ... 8** |**7 ... 0** |
+|---|---|---|---|
+| |||| \
+|Header | | | |
+| |||| \
+|Payload size (4B, unsigned int32) | | | |
+| |||| \
+|Payload | | | |
+
+Header： 前文描述的 4 字节头。
+Payload size： 是按 Header 中指定压缩方式压缩 payload 后的长度，使用**大端**表示。
+Payload： 包含音频的元数据以及 server 所需的相关参数，一般是 JSON 格式。具体的参数字段见下表：
+
+| | | | | | | \
+|字段 |说明 |层级 |格式 |是否必填 |备注 |
+|---|---|---|---|---|---|
+| | | | | | | \
+|user |用户相关配置 |1 |dict | |提供后可供服务端过滤日志 |
+| | | | | | | \
+|uid |用户标识 |2 |string | |建议采用 IMEI 或 MAC。 |
+| | | | | | | \
+|did |设备名称 |2 |string | | |
+| | | | | | | \
+|platform |操作系统及API版本号 |2 |string | |iOS/Android/Linux |
+| | | | | | | \
+|sdk_version |sdk版本 |2 |string | | |
+| | | | | | | \
+|app_version |app 版本 |2 |string | | |
+| | | | | | | \
+|audio |音频相关配置 |1 |dict |✓ | |
+| | | | | | | \
+|language |指定可识别的语言 |2 |string | |**注意：仅流式输入模式(bigmodel_nostream)支持此参数，二遍不支持** |\
+| | | | | |当该键为空时，该模型支持**中英文、上海话、闽南语，四川、陕西、粤语**识别。当将其设置为下方特定键时，它可以识别指定语言。 |\
+| | | | | |```Python |\
+| | | | | |中文普通话 zh-CN |\
+| | | | | |英语：en-US |\
+| | | | | |日语：ja-JP |\
+| | | | | |印尼语：id-ID |\
+| | | | | |西班牙语：es-MX |\
+| | | | | |葡萄牙语：pt-BR |\
+| | | | | |德语：de-DE |\
+| | | | | |法语：fr-FR |\
+| | | | | |韩语：ko-KR |\
+| | | | | |菲律宾语：fil-PH |\
+| | | | | |马来语：ms-MY |\
+| | | | | |泰语：th-TH |\
+| | | | | |阿拉伯语 ar-SA |\
+| | | | | |意大利语 it-IT |\
+| | | | | |孟加拉语 bn-BD |\
+| | | | | |希腊语 el-GR |\
+| | | | | |荷兰语 nl-NL |\
+| | | | | |俄语 ru-RU |\
+| | | | | |土耳其语 tr-TR |\
+| | | | | |越南语 vi-VN |\
+| | | | | |波兰语 pl-PL |\
+| | | | | |罗马尼亚语 ro-RO   |\
+| | | | | |尼泊尔语 ne-NP |\
+| | | | | |乌克兰语 uk-UA |\
+| | | | | |粤语 yue-CN |\
+| | | | | |``` |\
+| | | | | | |\
+| | | | | |例如，如果输入音频是德语，则此参数传入de-DE |
+| | | | | | | \
+|format |音频容器格式 |2 |string |✓ |pcm / wav / ogg / mp3      |\
+| | | | | |注意：pcm和wav内部音频流必须是pcm_s16le |
+| | | | | | | \
+|codec |音频编码格式 |2 |string | |raw / opus，默认为 raw(表示pcm)     |\
+| | | | | |注意: 当format为ogg的时候，codec必须是opus， |\
+| | | | | |         当format为mp3的时候，codec不生效，传默认值raw即可 |
+| | | | | | | \
+|rate |音频采样率 |2 |int | |默认为 16000，目前只支持16000 |
+| | | | | | | \
+|bits |音频采样点位数 |2 |int | |默认为 16，暂只支持16bits |
+| | | | | | | \
+|channel |音频声道数 |2 |int | |1(mono) / 2(stereo)，默认为1。 |
+| | | | | | | \
+|request |请求相关配置 |1 |dict |✓ | |
+| | | | | | | \
+|model_name |模型名称 |2 |string |✓ |目前只有bigmodel |
+| | | | | | | \
+|enable_nonstream |开启二遍识别 |2 |bool | |开启流式+非流式**二遍识别模式**：在一个接口里实现即双向流式实时返回逐字文本+流式输入模式（nostream）重新识别该分句音频片段提升准确率，既可以满足客户实时上屏需求（快），又可以在最终结果中保证识别准确率（准）。 |\
+| | | | | |目前二遍识别仅在**双向流式优化版**上支持，不支持旧版链路。 |\
+| | | | | |开启二遍识别后，会默认开启VAD分句（默认800ms判停，数值可通过end_window_size参数配置），VAD分句判停时，会使用非流式模型（nostream接口）重新识别该分句音频。且只有在非流式（nostream接口）输出的识别结果中会输出"definite": true 分句标识。 |
+| | | | | | | \
+|enable_itn |启用itn |2 |bool | |默认为true。 |\
+| | | | | |文本规范化 (ITN) 是自动语音识别 (ASR) 后处理管道的一部分。 ITN 的任务是将 ASR 模型的原始语音输出转换为书面形式，以提高文本的可读性。 |\
+| | | | | |例如，“一九七零年”->“1970年”和“一百二十三美元”->“$123”。 |
+| | | | | | | \
+|enable_speaker_info |启用说话人聚类分离 |2 |bool | |默认不开启。不指定 language 字段，或者 language 指定为 "zh-CN"（此时采用默认的中英文模型）时，可采用该能力 |\
+| | | | | |需同时配置ssd_version = "200"使用（建议使用ASR2.0时开启，ASR1.0不推荐） |
+| | | | | | | \
+|ssd_version |ssd版本号 |2 |string | |ssd_version = "200"时为启动大模型SSD能力（建议使用ASR2.0时开启，ASR1.0不推荐） |
+| | | | | | | \
+|enable_punc |启用标点 |2 |bool | |默认为true。 |
+| | | | | | | \
+|enable_ddc |启用顺滑 |2 |bool | |默认为false。 |\
+| | | | | |**语义顺滑**是一种技术，旨在提高自动语音识别（ASR）结果的文本可读性和流畅性。这项技术通过删除或修改ASR结果中的不流畅部分，如停顿词、语气词、语义重复词等，使得文本更加易于阅读和理解。 |
+| | | | | | | \
+|output_zh_variant |识别结果输出为繁体中文 |2 |string | | `traditional` ：简体 → 繁体（大陆） |\
+| | | | | | `tw` ：简体 → 台湾正体 |\
+| | | | | | `hk` ：简体 → 香港繁体 |\
+| | | | | |示例： |\
+| | | | | |```Plain Text |\
+| | | | | |"request": { |\
+| | | | | |    "output_zh_variant": "traditional",   // one of traditional/tw/hk |\
+| | | | | |}, |\
+| | | | | |``` |\
+| | | | | | |
+| | | | | | | \
+|show_utterances |输出语音停顿、分句、分词信息 |2 |bool | | |
+| | | | | | | \
+|show_speech_rate（仅nostream接口和双向流式优化版支持） |分句信息携带语速 |2 |bool | |如果设为"True"，则会在分句additions信息中使用speech_rate标记，单位为 token/s。默认 "False"。 |\
+| | | | | |**双向流式优化版**启用此功能会默认开启VAD分句（默认800ms判停，数值可通过end_window_size参数配置。识别结果中"definite": true的分句的additions信息中携带标记信息） |
+| | | | | | | \
+|show_volume（仅nostream接口和双向流式优化版支持） |分句信息携带音量 |2 |bool | |如果设为"True"，则会在分句additions信息中使用volume标记，单位为 分贝。默认 "False"。 |\
+| | | | | |**双向流式优化版**启用此功能会默认开启VAD分句（默认800ms判停，数值可通过end_window_size参数配置。识别结果中"definite": true的分句的additions信息中携带标记信息） |
+| | | | | | | \
+|enable_lid（仅nostream接口和双向流式优化版支持） |启用语种检测 |2 |bool | |**目前能识别语种，且能出识别结果的语言：中英文、上海话、闽南语，四川、陕西、粤语** |\
+| | | | | |如果设为"True"，则会在additions信息中使用lid_lang标记, 返回对应的语种标签。默认 "False" |\
+| | | | | |支持的标签包括： |\
+| | | | | | |\
+| | | | | |*  singing_en：英文唱歌 |\
+| | | | | |*  singing_mand：普通话唱歌 |\
+| | | | | |*  singing_dia_cant：粤语唱歌 |\
+| | | | | |*  speech_en：英文说话 |\
+| | | | | |*  speech_mand：普通话说话 |\
+| | | | | |*  speech_dia_nan：闽南语 |\
+| | | | | |* speech_dia_wuu：吴语（含上海话） |\
+| | | | | |*  speech_dia_cant：粤语说话 |\
+| | | | | |*  speech_dia_xina：西南官话（含四川话） |\
+| | | | | |*  speech_dia_zgyu：中原官话（含陕西话） |\
+| | | | | |*  other_langs：其它语种（其它语种人声） |\
+| | | | | |*  others：检测不出（非语义人声和非人声） |\
+| | | | | |   空时代表无法判断（例如传入音频过短等） |\
+| | | | | | |\
+| | | | | |**实际不支持识别的语种（无识别结果），但该参数可检测并输出对应lang_code。对应的标签如下：** |\
+| | | | | | |\
+| | | | | |* singing_hi：印度语唱歌 |\
+| | | | | |* singing_ja：日语唱歌 |\
+| | | | | |* singing_ko：韩语唱歌 |\
+| | | | | |* singing_th：泰语唱歌 |\
+| | | | | |* speech_hi：印地语说话 |\
+| | | | | |* speech_ja：日语说话 |\
+| | | | | |* speech_ko：韩语说话 |\
+| | | | | |* speech_th：泰语说话 |\
+| | | | | |* speech_kk：哈萨克语说话 |\
+| | | | | |* speech_bo：藏语说话 |\
+| | | | | |* speech_ug：维语 |\
+| | | | | |* speech_mn：蒙古语 |\
+| | | | | |* speech_dia_ql：琼雷话 |\
+| | | | | |* speech_dia_hsn：湘语 |\
+| | | | | |* speech_dia_jin：晋语 |\
+| | | | | |* speech_dia_hak：客家话 |\
+| | | | | |* speech_dia_chao：潮汕话 |\
+| | | | | |* speech_dia_juai：江淮官话 |\
+| | | | | |* speech_dia_lany：兰银官话 |\
+| | | | | |* speech_dia_dbiu：东北官话 |\
+| | | | | |* speech_dia_jliu：胶辽官话 |\
+| | | | | |* speech_dia_jlua：冀鲁官话 |\
+| | | | | |* speech_dia_cdo：闽东话 |\
+| | | | | |* speech_dia_gan：赣语 |\
+| | | | | |* speech_dia_mnp：闽北语 |\
+| | | | | |* speech_dia_czh：徽语 |\
+| | | | | | |\
+| | | | | |**双向流式优化版**启用此功能会默认开启VAD分句（默认800ms判停，数值可通过end_window_size参数配置。识别结果中"definite": true的分句的additions信息中携带标记信息） |
+| | | | | | | \
+|enable_emotion_detection（仅nostream接口和双向流式优化版支持） |启用情绪检测 |2 |bool | |如果设为"True"，则会在分句additions信息中使用emotion标记, 返回对应的情绪标签。默认 "False" |\
+| | | | | |支持的情绪标签包括： |\
+| | | | | | |\
+| | | | | |*  "angry"：表示情绪为生气 |\
+| | | | | |*  "happy"：表示情绪为开心 |\
+| | | | | |*  "neutral"：表示情绪为平静或中性 |\
+| | | | | |*  "sad"：表示情绪为悲伤 |\
+| | | | | |*  "surprise"：表示情绪为惊讶 |\
+| | | | | | |\
+| | | | | |**双向流式优化版**启用此功能会默认开启VAD分句（默认800ms判停，数值可通过end_window_size参数配置。识别结果中"definite": true的分句的additions信息中携带标记信息） |
+| | | | | | | \
+|enable_gender_detection（仅nostream接口和双向流式优化版支持） |启用性别检测 |2 |bool | |如果设为"True"，则会在分句additions信息中使用gender标记, 返回对应的性别标签（male/female）。默认 "False"。 |\
+| | | | | |**双向流式优化版**启用此功能会默认开启VAD分句（默认800ms判停，数值可通过end_window_size参数配置。识别结果中"definite": true的分句的additions信息中携带标记信息） |
+| | | | | | | \
+|result_type |结果返回方式 |2 |string | |默认为"full",全量返回。 |\
+| | | | | |设置为"single"则为增量结果返回，即不返回之前分句的结果。 |
+| | | | | | | \
+|enable_accelerate_text |是否启动首字返回加速 |2 |bool | |如果设为"True"，则会尽量加速首字返回，但会降低首字准确率。 |\
+| | | | | |默认 "False" |
+| | | | | | | \
+|accelerate_score |首字返回加速率 |2 |int | |配合enable_accelerate_text参数使用，默认为0，表示不加速，取值范围[0-20]，值越大，首字出字越快 |
+| | | | | | | \
+|vad_segment_duration |语义切句的最大静音阈值 |2 |int | |单位ms，默认为3000。当静音时间超过该值时，会将文本分为两个句子。不决定判停，所以不会修改definite出现的位置。在end_window_size配置后，该参数失效。    |
+| | | | | | | \
+|end_window_size |强制判停时间 |2 |int | |单位ms，默认为800，最小200。静音时长超过该值，会直接判停，输出definite。配置该值，不使用语义分句，根据静音时长来分句。用于实时性要求较高场景，可以提前获得definite句子 |
+| | | | | | | \
+|force_to_speech_time |强制语音时间 |2 |int | |单位ms，最小1。音频时长超过该值之后，才会尝试判停并返回definite=true，需配合end_window_size参数使用。对小于该数值的音频不做判停处理。 |\
+| | | | | |推荐设置1000，可能会影响识别准确率。 |
+| | | | | | | \
+|sensitive_words_filter |敏感词过滤 |2 |string | |敏感词过滤功能,支持开启或关闭,支持自定义敏感词。该参数可实现：不处理(默认,即展示原文)、过滤、替换为*。 |\
+| | | | | |示例： |\
+| | | | | |system_reserved_filter  //是否使用系统敏感词，会替换成*(默认系统敏感词主要包含一些限制级词汇） |\
+| | | | | |filter_with_empty   // 想要替换成空的敏感词  |\
+| | | | | |filter_with_signed  // 想要替换成 * 的敏感词 |\
+| | | | | |```Python |\
+| | | | | |"sensitive_words_filter":"{\"system_reserved_filter\":true,\"filter_with_empty\":[\"敏感词\"],\"filter_with_signed\":[\"敏感词\"]}", |\
+| | | | | |``` |\
+| | | | | | |
+| | | | | | | \
+|enable_poi_fc（nostream接口&双向流式优化版-开启二遍支持） |开启 POI function call |2 |bool | |对于语音识别困难的词语，能调用专业的地图领域推荐词服务辅助识别 |\
+| | | | | |示例： |\
+| | | | | |```Python |\
+| | | | | |"request": { |\
+| | | | | |    "enable_poi_fc": true, |\
+| | | | | |    "corpus": { |\
+| | | | | |        "context": "{\"loc_info\":{\"city_name\":\"北京市\"}}" |\
+| | | | | |    } |\
+| | | | | |} |\
+| | | | | |``` |\
+| | | | | | |\
+| | | | | |其中loc_info字段可选，传入该字段结果相对更精准，city_name单位为地级市。 |
+| | | | | | | \
+|enable_music_fc（nostream接口&双向流式优化版-开启二遍支持） |开启音乐 function call |2 |bool | |对于语音识别困难的词语，能调用专业的音乐领域推荐词服务辅助识别 |\
+| | | | | |示例： |\
+| | | | | |```Python |\
+| | | | | |"request": { |\
+| | | | | |    "enable_music_fc": true |\
+| | | | | |} |\
+| | | | | |``` |\
+| | | | | | |
+| | | | | | | \
+|corpus |语料/干预词等 |2 |dict | | |
+| | | | | | | \
+|boosting_table_name |自学习平台上设置的热词词表名称 |3 |string | |热词表功能和设置方法可以参考[文档](https://www.volcengine.com/docs/6561/155739) |
+| | | | | | | \
+|boosting_table_id |自学习平台上设置的热词词表id |3 |string | |热词表功能和设置方法可以参考[文档](https://www.volcengine.com/docs/6561/155739) |
+| | | | | | | \
+|correct_table_name |自学习平台上设置的替换词词表名称 |3 |string | |替换词功能和设置方法可以参考[文档](https://www.volcengine.com/docs/6561/1206007) |
+| | | | | | | \
+|correct_table_id |自学习平台上设置的替换词词表id |3 |string | |替换词功能和设置方法可以参考[文档](https://www.volcengine.com/docs/6561/1206007) |
+| | | | | | | \
+|context |热词或者上下文 |3 |string | |1. 热词直传（优先级高于传热词表），双向流式支持100tokens，流式输入nostream支持5000个词 |\
+| | | | | | |\
+| | | | | |"context":"{\"hotwords\":[{\"word\":\"热词1号\"}, {\"word\":\"热词2号\"}]}" |\
+| | | | | | |\
+| | | | | | |\
+| | | | | |2. 上下文，限制800 tokens及20轮（含）内，超出会按照时间顺序从新到旧截断，优先保留更新的对话 |\
+| | | | | | |\
+| | | | | |  context_data字段按照从新到旧的顺序排列，传入需要序列化为jsonstring（转义引号） |\
+| | | | | |**豆包流式语音识别模型2.0，支持将上下文理解的范围从纯文本扩展到视觉层面，** |\
+| | | | | |**通过理解图像内容，帮助模型更精准地完成语音转录。通过image_url传入图片，** |\
+| | | | | |**图片限制传入1张，大小：500k以内（格式：jpeg、jpg、png ）** |\
+| | | | | |```SQL |\
+| | | | | |上下文:可以加入对话历史、聊天所在bot信息、个性化信息、业务场景信息等,如: |\
+| | | | | |a.对话历史:把最近几轮的对话历史传进来 |\
+| | | | | |b.聊天所在bot信息:如"我在和林黛玉聊天","我在使用A助手和手机对话" |\
+| | | | | |c.个性化信息:"我当前在北京市海淀区","我有四川口音","我喜欢音乐" |\
+| | | | | |d.业务场景信息:"当前是中国平安的营销人员针对外部客户采访的录音,可能涉及..." |\
+| | | | | |{ |\
+| | | | | |    \"context_type\": \"dialog_ctx\", |\
+| | | | | |    \"context_data\":[ |\
+| | | | | |        {\"text\": \"text1\"}, |\
+| | | | | |        {\"image_url\": \"image_url\"}, |\
+| | | | | |        {\"text\": \"text2\"}, |\
+| | | | | |        {\"text\": \"text3\"}, |\
+| | | | | |        {\"text\": \"text4\"}, |\
+| | | | | |        ... |\
+| | | | | |    ] |\
+| | | | | |} |\
+| | | | | |``` |\
+| | | | | | |
+
+参数示例：
+```JSON
+{
+    "user": {
+        "uid": "388808088185088"
+    },
+    "audio": {
+        "format": "wav",
+        "rate": 16000,
+        "bits": 16,
+        "channel": 1,
+        "language": "zh-CN"
+    },
+    "request": {
+        "model_name": "bigmodel",
+        "enable_itn": false,
+        "enable_ddc": false,
+        "enable_punc": false,
+        "corpus": {
+            "boosting_table_id": "通过自学习平台配置热词的词表id",
+            "context": "{\"context_type\": \"dialog_ctx\", \"context_data\": [{\"text\": \"text1\"}, {\"text\": \"text2\"}, {\"text\": \"text3\"}, {\"text\": \"text4\"}]}"
+        }
+    }
+}
+```
+
+<span id="eaf63ef1"></span>
+### 发送 audio only request
+Client 发送 full client request 后，再发送包含音频数据的 audio-only client request。音频应采用 full client request 中指定的格式（音频格式、编解码器、采样率、声道）。格式如下：
+
+| | | | | \
+|**31 ... 24** |**23 ... 16** |**15 ... 8** |**7 ... 0** |
+|---|---|---|---|
+| |||| \
+|Header | | | |
+| |||| \
+|Payload size (4B, unsigned int32) | | | |
+| |||| \
+|Payload | | | |
+
+Payload 是使用指定压缩方法，压缩音频数据后的内容。可以多次发送 audio only request 请求，例如在流式语音识别中如果每次发送 100ms 的音频数据，那么 audio only request 中的 Payload 就是 100ms 的音频数据。
+<span id="096d0921"></span>
+### full server response
+Client 发送的 full client request 和 audio only request，服务端都会返回 full server response。格式如下：
+
+| | | | | \
+|**31 ... 24** |**23 ... 16** |**15 ... 8** |**7 ... 0** |
+|---|---|---|---|
+| |||| \
+|Header | | | |
+| |||| \
+|Sequence | | | |
+| |||| \
+|Payload size (4B, unsigned int32) | | | |
+| |||| \
+|Payload | | | |
+
+Payload 内容是包含识别结果的 JSON 格式，字段说明如下：
+
+| | | | | | | \
+|字段 |说明 |层级 |格式 |是否必填 |备注 |
+|---|---|---|---|---|---|
+| | | | | | | \
+|result |识别结果 |1 |list | |仅当识别成功时填写 |
+| | | | | | | \
+|text |整个音频的识别结果文本 |2 |string | |仅当识别成功时填写。 |
+| | | | | | | \
+|utterances |识别结果语音分句信息 |2 |list | |仅当识别成功且开启show_utterances时填写。 |
+| | | | | | | \
+|text |utterance级的文本内容 |3 |string | |仅当识别成功且开启show_utterances时填写。 |
+| | | | | | | \
+|start_time |起始时间（毫秒） |3 |int | |仅当识别成功且开启show_utterances时填写。 |
+| | | | | | | \
+|end_time |结束时间（毫秒） |3 |int | |仅当识别成功且开启show_utterances时填写。 |
+| | | | | | | \
+|definite |是否是一个确定分句 |3 |bool | |仅当识别成功且开启show_utterances时填写。 |
+
+```JSON
+{
+  "result": {
+      "text": "这是字节跳动， 今日头条母公司。",
+      "utterances": [
+        {
+          "definite": true,
+          "end_time": 1705,
+          "start_time": 0,
+          "text": "这是字节跳动，",
+          "words": [
+            {
+              "blank_duration": 0,
+              "end_time": 860,
+              "start_time": 740,
+              "text": "这"
+            },
+            {
+              "blank_duration": 0,
+              "end_time": 1020,
+              "start_time": 860,
+              "text": "是"
+            },
+            {
+              "blank_duration": 0,
+              "end_time": 1200,
+              "start_time": 1020,
+              "text": "字"
+            },
+            {
+              "blank_duration": 0,
+              "end_time": 1400,
+              "start_time": 1200,
+              "text": "节"
+            },
+            {
+              "blank_duration": 0,
+              "end_time": 1560,
+              "start_time": 1400,
+              "text": "跳"
+            },
+            {
+              "blank_duration": 0,
+              "end_time": 1640,
+              "start_time": 1560,
+              "text": "动"
+            }
+          ]
+        },
+        {
+          "definite": true,
+          "end_time": 3696,
+          "start_time": 2110,
+          "text": "今日头条母公司。",
+          "words": [
+            {
+              "blank_duration": 0,
+              "end_time": 3070,
+              "start_time": 2910,
+              "text": "今"
+            },
+            {
+              "blank_duration": 0,
+              "end_time": 3230,
+              "start_time": 3070,
+              "text": "日"
+            },
+            {
+              "blank_duration": 0,
+              "end_time": 3390,
+              "start_time": 3230,
+              "text": "头"
+            },
+            {
+              "blank_duration": 0,
+              "end_time": 3550,
+              "start_time": 3390,
+              "text": "条"
+            },
+            {
+              "blank_duration": 0,
+              "end_time": 3670,
+              "start_time": 3550,
+              "text": "母"
+            },
+            {
+              "blank_duration": 0,
+              "end_time": 3696,
+              "start_time": 3670,
+              "text": "公"
+            },
+            {
+              "blank_duration": 0,
+              "end_time": 3696,
+              "start_time": 3696,
+              "text": "司"
+            }
+          ]
+        }
+      ]
+   },
+  "audio_info": {
+    "duration": 3696
+  }
+}
+```
+
+<span id="8aa108f1"></span>
+### Error message from server
+当 server 发现无法解决的二进制/传输协议问题时，将发送 Error message from server 消息（例如，client 以 server 不支持的序列化格式发送消息）。格式如下：
+
+| | | | | \
+|**31 ... 24** |**23 ... 16** |**15 ... 8** |**7 ... 0** |
+|---|---|---|---|
+| |||| \
+|Header | | | |
+| |||| \
+|Error message code (4B, unsigned int32) | | | |
+| |||| \
+|Error message size (4B, unsigned int32) | | | |
+| |||| \
+|Error message (UTF8 string) | | | |
+
+Header： 前文描述的 4 字节头。
+Error message code： 错误码，使用**大端**表示。
+Error message size： 错误信息长度，使用**大端**表示。
+Error message： 错误信息。
+<span id="4665ea66"></span>
+### 示例
+<span id="87bf74a6"></span>
+#### 示例：客户发送 3 个请求
+下面的 message flow 会发送多次消息，每个消息都带有版本、header 大小、保留数据。由于每次消息中这些字段值相同，所以有些消息中这些字段省略了。
+Message flow:
+client 发送 "Full client request"
+
+   version: `b0001` (4 bits)
+   header size: `b0001` (4 bits)
+   message type: `b0001` (Full client request) (4bits)
+   message type specific flags: `b0000` (use_specific_pos_sequence) (4bits)
+   message serialization method: `b0001` (JSON) (4 bits)
+   message compression: `b0001` (Gzip) (4bits)
+   reserved data: `0x00` (1 byte)
+   payload size = Gzip 压缩后的长度
+   payload: json 格式的请求字段经过 Gzip 压缩后的数据
+
+server 响应 "Full server response"
+
+   version: `b0001`
+   header size: `b0001`
+   message type: `b1001` (Full server response)
+   message type specific flags: `b0001` (none)
+   message serialization method: `b0001` (JSON 和请求相同)
+   message compression: `b0001` (Gzip 和请求相同)
+   reserved data: `0x00`
+   sequence: 0x00 0x00 0x00 0x01 (4 byte) sequence=1
+   payload size = Gzip 压缩后数据的长度
+   payload: Gzip 压缩后的响应数据
+
+client 发送包含第一包音频数据的 "Audio only client request"
+
+   version: `b0001`
+   header size: `b0001`
+   message type: `b0010` (audio only client request)
+   message type specific flags: `b0000` (用户设置正数 sequence number)
+   message serialization method: `b0000` (none - raw bytes)
+   message compression: `b0001` (Gzip)
+   reserved data: `0x00`
+   payload size = Gzip 压缩后的音频长度
+   payload: 音频数据经过 Gzip 压缩后的数据
+
+server 响应 "Full server response"
+
+   message type: `0b1001` - Full server response
+   message specific flags: `0b0001` (none)
+   message serialization: `0b0001` (JSON, 和请求相同)
+   message compression `0b0001` (Gzip, 和请求相同)
+   reserved data: `0x00`
+   sequence data: 0x00 0x00 0x00 0x02 (4 byte) sequence=2
+   payload size = Gzip 压缩后数据的长度
+   payload: Gzip 压缩后的响应数据
+
+client 发送包含最后一包音频数据（通过 message type specific flags) 的 "Audio-only client request"，
+
+   message type: `b0010` (audio only client request)
+   message type specific flags: **`b0010`** (最后一包音频请求)
+   message serialization method: `b0000` (none - raw bytes)
+   message compression: `b0001` (Gzip)
+   reserved data: `0x00`
+   payload size = Gzip 压缩后的音频长度
+   payload: Gzip 压缩后的音频数据
+
+server 响应 "Full server response" - 最终回应及处理结果
+
+   message type: `b1001` (Full server response)
+   message type specific flags: `b0011` (最后一包音频结果)
+   message serialization method: `b0001` (JSON)
+   message compression: `b0001` (Gzip)
+   reserved data: `0x00`
+   sequence data: `0x00 0x00 0x00 0x03` (4byte) sequence=3
+   payload size = Gzip 压缩后的 JSON 长度
+   payload: Gzip 压缩后的 JSON 数据
+
+如处理过程中出现错误信息，可能有以下错误帧的返回
+
+   message type: `b1111` (error response)
+   message type specific flags: `b0000` (none)
+   message serialization method: `b0001` (JSON)
+   message compression: `b0000` (none)
+   reserved data: `0x00`
+   Error code data: `0x2A 0x0D 0x0A 0xff` (4byte) 错误码
+   payload size = 错误信息对象的 JSON 长度
+   payload: 错误信息对象的 JSON 数据
+
+<span id="989f9570"></span>
+## 错误码
+
+| | | | \
+|错误码 |含义 |说明 |
+|---|---|---|
+| | | | \
+|20000000 |成功 | |
+| | | | \
+|45000001 |请求参数无效 |请求参数缺失必需字段 / 字段值无效 / 重复请求。 |
+| | | | \
+|45000002 |空音频 | |
+| | | | \
+|45000081 |等包超时 | |
+| | | | \
+|45000151 |音频格式不正确 | |
+| | | | \
+|550xxxxx |服务内部处理错误 | |
+| | | | \
+|55000031 |服务器繁忙 |服务过载，无法处理当前请求。 |
+
+<span id="4468a455"></span>
+# Demo
+Python：
+<Attachment link="https://p9-arcosite.byteimg.com/tos-cn-i-goo7wpa0wc/9a5371db0dbb4fc389115e8808a5ac73~tplv-goo7wpa0wc-image.image" name="sauc_python.zip" ></Attachment>
+Go：
+<Attachment link="https://p9-arcosite.byteimg.com/tos-cn-i-goo7wpa0wc/11e65137790c4ecb8651e01221adc8e9~tplv-goo7wpa0wc-image.image" name="sauc_go.zip" ></Attachment>
+Java：
+<Attachment link="https://p9-arcosite.byteimg.com/tos-cn-i-goo7wpa0wc/9bf64204b30b4ba8be3099c5c5193bdc~tplv-goo7wpa0wc-image.image" name="sauc.zip" ></Attachment>
+
+
+
+
+
--- a/src/views/Speaking.vue
+++ b/src/views/Speaking.vue
@ -2,6 +2,7 @@
 import { ref, computed, nextTick, onUnmounted, onMounted } from "vue";
 import { useRouter } from "vue-router";
 import { DOUBAO_APP_ID, DOUBAO_ACCESS_TOKEN, DOUBAO_RESOURCE_ID, DOUBAO_TTS_API_PATH, DOUBAO_AUDIO_FORMAT, DOUBAO_SAMPLE_RATE, ARK_API_KEY, ARK_MODEL, ARK_API_PATH, ARK_MAX_TOKENS, ARK_HISTORY_LIMIT } from "@/config/index.js";
+import pako from "pako";

 const router = useRouter();

@ -111,6 +112,15 @@ let currentAudioInstance = null;
 const blobUrls = [];
 let lastGreetedScene = null;

+// ── Voice input (ASR) state ──
+// True while microphone audio is being captured and streamed.
+const isRecording = ref(false);
+// Mic button status: 'connecting' | 'recording' | 'error' | '' (idle).
+const asrStatus = ref("");
+// Resources owned by the current ASR session; each is null when no session holds it.
+let asrWs = null;
+let asrAudioContext = null;
+let asrScriptProcessor = null;
+let asrMediaStream = null;
+// Interim (not yet final) recognition text. The binding itself is never
+// reassigned — only its .value changes — so it is declared const.
+const asrInterimText = ref("");
+
+
 // 兼容的 UUID 生成函数
 const generateUUID = () => {
  return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, (c) => {
@ -380,9 +390,258 @@ const handleKeydown = (e) => {
  }
 };

+// ── 语音输入（ASR）──
+
+/**
+ * Convert Float32 PCM samples into 16-bit signed PCM, returned as raw bytes.
+ *
+ * Each sample is clamped to [-1, 1] and then scaled asymmetrically
+ * (negative side by 0x8000, positive side by 0x7fff) so that both ends of
+ * the Int16 range are reachable without overflow.
+ *
+ * @param float32Array Samples to convert.
+ * @returns Uint8Array view over the Int16 data (2 bytes per sample, in the
+ *          byte order of the underlying Int16Array).
+ */
+const float32ToInt16Bytes = (float32Array) => {
+  const pcm16 = Int16Array.from(float32Array, (sample) => {
+    const clamped = Math.min(1, Math.max(-1, sample));
+    return clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
+  });
+  return new Uint8Array(pcm16.buffer);
+};
+
+/**
+ * Assemble one client frame: 4-byte header, 4-byte big-endian payload
+ * length, then the payload bytes.
+ *
+ * @param messageType   0x01 = Full Client Request, 0x02 = Audio Only Request.
+ * @param flags         0x00 = regular packet, 0x02 = last packet.
+ * @param serialization 0x01 = JSON, 0x00 = raw bytes.
+ * @param compression   0x01 = Gzip, 0x00 = uncompressed.
+ * @param payload       Payload bytes (already serialized/compressed by the caller).
+ * @returns Uint8Array holding the complete frame.
+ */
+const buildFrame = (messageType, flags, serialization, compression, payload) => {
+  const HEADER_BYTES = 4;
+  const SIZE_BYTES = 4;
+  const frame = new Uint8Array(HEADER_BYTES + SIZE_BYTES + payload.length);
+  const view = new DataView(frame.buffer);
+
+  view.setUint8(0, (0x01 << 4) | 0x01);                // version=1, header size=1 (×4 = 4 bytes)
+  view.setUint8(1, (messageType << 4) | flags);        // message type | type-specific flags
+  view.setUint8(2, (serialization << 4) | compression); // serialization | compression
+  view.setUint8(3, 0x00);                              // reserved
+  view.setUint32(HEADER_BYTES, payload.length, false); // payload length, big-endian
+
+  frame.set(payload, HEADER_BYTES + SIZE_BYTES);
+  return frame;
+};
+
+/**
+ * Parse one binary frame received from the ASR server.
+ *
+ * Layout handled:
+ *   Full Server Response (0x09): Header + [Sequence(4)] + PayloadSize(4) + Payload
+ *   Error Response       (0x0f): Header + [Sequence(4)] + ErrorCode(4) + ErrorMsgSize(4) + ErrorMsg
+ *
+ * The header length is taken from the low nibble of byte 0 (in 4-byte units)
+ * instead of being assumed to be 4 bytes, and the sequence field is read only
+ * when bit 0 of the type-specific flags is set.
+ * NOTE(review): the flag convention (bit 0 = sequence present) mirrors the
+ * vendor demo parsers — confirm against the protocol documentation.
+ *
+ * @param buffer ArrayBuffer containing exactly one frame.
+ * @returns For error frames `{ msgType, flags, error: true, code, message }`;
+ *          otherwise `{ msgType, flags, sequence, data }` where `sequence` is
+ *          undefined when the frame carries none and `data` is the parsed JSON.
+ * @throws Error on an invalid header size; RangeError/SyntaxError on
+ *         truncated frames or malformed JSON.
+ */
+const parseServerFrame = (buffer) => {
+  const view = new DataView(buffer);
+  const headerBytes = (view.getUint8(0) & 0x0f) * 4;
+  if (headerBytes < 4) {
+    throw new Error("Invalid ASR frame header size");
+  }
+  const byte1 = view.getUint8(1);
+  const byte2 = view.getUint8(2);
+  const msgType = (byte1 >> 4) & 0x0f;   // 0x09 = Full Server Response, 0x0f = Error
+  const flags = byte1 & 0x0f;
+  const compression = byte2 & 0x0f;
+  const decoder = new TextDecoder();
+
+  let offset = headerBytes;
+
+  // Optional sequence number: signed, negative marks the last packet.
+  let sequence;
+  if (flags & 0x01) {
+    sequence = view.getInt32(offset, false);
+    offset += 4;
+  }
+
+  // Error Response: code + message length + message text.
+  if (msgType === 0x0f) {
+    const errCode = view.getUint32(offset, false);
+    const errMsgSize = view.getUint32(offset + 4, false);
+    const errMsg = decoder.decode(new Uint8Array(buffer, offset + 8, errMsgSize));
+    return { msgType, flags, error: true, code: errCode, message: errMsg };
+  }
+
+  // Full Server Response: payload length + (optionally gzipped) JSON payload.
+  const payloadSize = view.getUint32(offset, false);
+  const payloadBytes = new Uint8Array(buffer, offset + 4, payloadSize);
+  const jsonStr = decoder.decode(compression === 0x01 ? pako.ungzip(payloadBytes) : payloadBytes);
+
+  return { msgType, flags, sequence, data: JSON.parse(jsonStr) };
+};
+
+/**
+ * Stop audio capture and wind down the ASR WebSocket session.
+ *
+ * @param autoSend When true and the socket is open, the socket is left open
+ *                 after the end frame so the server can deliver its final
+ *                 result; the message is sent once the socket closes. When
+ *                 false the socket is closed immediately.
+ */
+const stopRecording = (autoSend = false) => {
+  // Upper bound on how long to wait for the server to close after the end frame.
+  const ASR_FINAL_RESULT_TIMEOUT_MS = 5000;
+
+  isRecording.value = false;
+  asrStatus.value = "";
+  asrInterimText.value = ""; // drop the interim hint; the final result comes from the server
+
+  if (asrScriptProcessor) {
+    asrScriptProcessor.disconnect();
+    asrScriptProcessor = null;
+  }
+  if (asrAudioContext) {
+    asrAudioContext.close().catch(() => {});
+    asrAudioContext = null;
+  }
+  if (asrMediaStream) {
+    asrMediaStream.getTracks().forEach((t) => t.stop());
+    asrMediaStream = null;
+  }
+
+  const ws = asrWs;
+  if (!ws) {
+    if (autoSend && inputText.value.trim()) sendMessage();
+    return;
+  }
+
+  if (ws.readyState === WebSocket.OPEN) {
+    try {
+      // End frame: empty gzipped audio packet flagged as the last packet.
+      const emptyGzip = pako.gzip(new Uint8Array(0));
+      ws.send(buildFrame(0x02, 0x02, 0x00, 0x01, emptyGzip));
+    } catch (e) {
+      // Best effort only, but do not hide the failure.
+      console.warn("ASR end frame send failed:", e);
+    }
+
+    if (autoSend) {
+      // Normally the server closes the connection after its final result.
+      // If it never does, close from this side so the pending message is
+      // still sent and the socket is not leaked.
+      const forceCloseTimer = setTimeout(() => ws.close(), ASR_FINAL_RESULT_TIMEOUT_MS);
+      ws.onclose = () => {
+        clearTimeout(forceCloseTimer);
+        // Only clear the shared reference if it still points at this socket;
+        // a newer session may have replaced it in the meantime.
+        if (asrWs === ws) asrWs = null;
+        if (inputText.value.trim()) sendMessage();
+      };
+      return; // deferred close
+    }
+  }
+
+  // Immediate shutdown: detach the close handler first so a previously
+  // scheduled auto-send cannot fire (e.g. after the component unmounted).
+  ws.onclose = null;
+  ws.close();
+  asrWs = null;
+};
+
+/**
+ * Mic button handler: toggles voice input.
+ *
+ * While recording, a call stops the session and auto-sends the recognized
+ * text. Otherwise it opens the ASR WebSocket through the dev proxy, sends the
+ * init frame, then captures microphone audio and streams it as gzipped
+ * 16-bit PCM frames. Final results are appended to `inputText`; interim
+ * results are exposed through `asrInterimText`.
+ *
+ * Failures are logged, the session is torn down and `asrStatus` shows
+ * 'error' for 3 seconds; nothing is thrown to the caller.
+ */
+const startRecording = async () => {
+  if (isRecording.value) {
+    stopRecording(true); // stop and auto-send
+    return;
+  }
+
+  // Ignore clicks while a session is still being set up; a second call here
+  // would open another WebSocket/microphone and leak the first one.
+  if (asrStatus.value === "connecting") return;
+
+  // Browser capability check
+  if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
+    alert("您的浏览器不支持麦克风录音功能");
+    return;
+  }
+
+  asrStatus.value = "connecting";
+  asrInterimText.value = "";
+
+  try {
+    // 1. Connect through the Vite proxy, which injects the auth headers
+    //    (browser WebSockets cannot set custom headers). Match the page's
+    //    scheme: ws:// from an https:// page is blocked as mixed content.
+    const wsScheme = location.protocol === "https:" ? "wss" : "ws";
+    const wsUrl = `${wsScheme}://${location.host}/asr-ws/api/v3/sauc/bigmodel`;
+    const ws = new WebSocket(wsUrl);
+    ws.binaryType = "arraybuffer";
+    asrWs = ws;
+
+    await new Promise((resolve, reject) => {
+      const connectTimer = setTimeout(() => reject(new Error("WebSocket 连接超时")), 8000);
+      ws.onopen = () => {
+        clearTimeout(connectTimer);
+        resolve();
+      };
+      ws.onerror = () => {
+        clearTimeout(connectTimer);
+        reject(new Error("WebSocket 连接失败"));
+      };
+    });
+
+    // The session was cancelled (stopRecording) while connecting.
+    if (asrWs !== ws) {
+      ws.close();
+      return;
+    }
+
+    // 2. Send the init frame (Full Client Request, JSON + Gzip)
+    const initPayload = {
+      user: { uid: "speaking_asr_" + Date.now() },
+      audio: {
+        format: "pcm",
+        rate: 16000,
+        bits: 16,
+        channel: 1,
+      },
+      request: {
+        model_name: "bigmodel",
+        enable_punc: true,
+        enable_itn: true,
+      },
+    };
+    const compressedInit = pako.gzip(JSON.stringify(initPayload));
+    // Full Client Request(0x01), flags=0x00, JSON(0x01), Gzip(0x01)
+    ws.send(buildFrame(0x01, 0x00, 0x01, 0x01, compressedInit));
+
+    // 3. Handle recognition results (binary responses)
+    ws.onmessage = (event) => {
+      try {
+        const parsed = parseServerFrame(event.data);
+
+        // Protocol-level error frame
+        if (parsed.error) {
+          console.error("ASR server error:", parsed.code, parsed.message);
+          return;
+        }
+
+        const { data } = parsed;
+        // Business status code (20000000 = success)
+        if (data?.code && data.code !== 20000000) {
+          console.error("ASR error code:", data.code, data.message);
+          return;
+        }
+
+        const result = data?.result;
+        if (!result) return;
+
+        const text = result.text || "";
+        if (!text) return;
+
+        // utterances[].definite marks a finalized sentence
+        const hasFinal = result.utterances?.some((u) => u.definite === true);
+        if (hasFinal) {
+          // Final: clear the interim hint and append the text to the input box
+          asrInterimText.value = "";
+          inputText.value = (inputText.value + " " + text).trim();
+        } else {
+          // Interim: update the hint only, leave the input box untouched
+          asrInterimText.value = text;
+        }
+      } catch (e) {
+        console.error("ASR parse error:", e);
+      }
+    };
+
+    ws.onerror = () => {
+      console.error("ASR WebSocket error");
+      stopRecording();
+      asrStatus.value = "error";
+      setTimeout(() => { asrStatus.value = ""; }, 3000);
+    };
+
+    ws.onclose = () => {
+      // Unexpected disconnect while recording; a normal stop has already cleaned up
+      if (isRecording.value) stopRecording();
+    };
+
+    // 4. Acquire the microphone and capture PCM
+    const stream = await navigator.mediaDevices.getUserMedia({
+      audio: { sampleRate: 16000, channelCount: 1, echoCancellation: true, noiseSuppression: true },
+    });
+
+    // The permission prompt can stay open for a long time. If the session was
+    // cancelled or the socket dropped meanwhile, release the microphone
+    // instead of starting a capture that nothing will ever stop.
+    if (asrWs !== ws || ws.readyState !== WebSocket.OPEN) {
+      stream.getTracks().forEach((t) => t.stop());
+      if (asrWs !== ws) return; // cancelled elsewhere; state was already reset
+      throw new Error("WebSocket 连接已断开");
+    }
+    asrMediaStream = stream;
+
+    const AudioContextClass = window.AudioContext || window["webkitAudioContext"];
+    const audioCtx = new AudioContextClass({ sampleRate: 16000 });
+    asrAudioContext = audioCtx;
+
+    const source = audioCtx.createMediaStreamSource(stream);
+    // 2048 frames per callback ≈ 128 ms at 16 kHz (power of two)
+    const processor = audioCtx.createScriptProcessor(2048, 1, 1);
+    asrScriptProcessor = processor;
+
+    processor.onaudioprocess = (e) => {
+      if (!isRecording.value || !asrWs || asrWs.readyState !== WebSocket.OPEN) return;
+      const pcmBytes = float32ToInt16Bytes(e.inputBuffer.getChannelData(0));
+      // Audio Only Request(0x02), flags=0x00, Raw(0x00), Gzip(0x01)
+      const compressed = pako.gzip(pcmBytes);
+      asrWs.send(buildFrame(0x02, 0x00, 0x00, 0x01, compressed));
+    };
+
+    source.connect(processor);
+    processor.connect(audioCtx.destination);
+
+    isRecording.value = true;
+    asrStatus.value = "recording";
+  } catch (err) {
+    console.error("ASR start error:", err);
+    stopRecording();
+    asrStatus.value = "error";
+    setTimeout(() => { asrStatus.value = ""; }, 3000);
+    if (err.name === "NotAllowedError") {
+      alert("麦克风权限被拒绝，请在浏览器设置中允许访问麦克风");
+    }
+  }
+};
+
 onUnmounted(() => {
+  // Pause the current audio instance (if any) and revoke every tracked blob URL.
   if (currentAudioInstance) { currentAudioInstance.pause(); currentAudioInstance = null; }
   blobUrls.forEach((url) => URL.revokeObjectURL(url));
+  // Tear down voice input: stops capture and closes the ASR WebSocket.
+  stopRecording();
 });
 </script>

@ -468,11 +727,33 @@ onUnmounted(() => {
      <textarea
        v-model="inputText"
        class="input-box"
-        placeholder="输入英语内容，按 Enter 发送，Shift+Enter 换行..."
+        :placeholder="asrInterimText ? asrInterimText : '输入英语内容，按 Enter 发送，Shift+Enter 换行...'"
        :disabled="isSending"
        @keydown="handleKeydown"
        rows="1"
      ></textarea>
+      <!-- 麦克风按钮 -->
+      <button
+        class="mic-btn"
+        :class="{ recording: isRecording, error: asrStatus === 'error', connecting: asrStatus === 'connecting' }"
+        @click="startRecording"
+        :title="isRecording ? '停止录音' : '语音输入'"
+        :disabled="isSending"
+      >
+        <!-- 连接中：旋转动画 -->
+        <div v-if="asrStatus === 'connecting'" class="mic-spinner"></div>
+        <!-- 录音中：麦克风 + 脉冲 -->
+        <template v-else-if="isRecording">
+          <div class="mic-pulse"></div>
+          <svg xmlns="http://www.w3.org/2000/svg" fill="currentColor" viewBox="0 0 24 24">
+            <path d="M12 1a4 4 0 0 1 4 4v6a4 4 0 0 1-8 0V5a4 4 0 0 1 4-4zm0 2a2 2 0 0 0-2 2v6a2 2 0 0 0 4 0V5a2 2 0 0 0-2-2zm-1 14.93V20H9v2h6v-2h-2v-2.07A7.001 7.001 0 0 0 19 11h-2a5 5 0 0 1-10 0H5a7.001 7.001 0 0 0 6 6.93z"/>
+          </svg>
+        </template>
+        <!-- 默认：麦克风图标 -->
+        <svg v-else xmlns="http://www.w3.org/2000/svg" fill="currentColor" viewBox="0 0 24 24">
+          <path d="M12 1a4 4 0 0 1 4 4v6a4 4 0 0 1-8 0V5a4 4 0 0 1 4-4zm0 2a2 2 0 0 0-2 2v6a2 2 0 0 0 4 0V5a2 2 0 0 0-2-2zm-1 14.93V20H9v2h6v-2h-2v-2.07A7.001 7.001 0 0 0 19 11h-2a5 5 0 0 1-10 0H5a7.001 7.001 0 0 0 6 6.93z"/>
+        </svg>
+      </button>
      <button class="send-btn" :disabled="!inputText.trim() || isSending" @click="sendMessage">
        <svg v-if="!isSending" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="2" stroke="currentColor">
          <path stroke-linecap="round" stroke-linejoin="round" d="M6 12 3.269 3.125A59.769 59.769 0 0 1 21.485 12 59.768 59.768 0 0 1 3.27 20.875L5.999 12Zm0 0h7.5" />
@ -787,6 +1068,71 @@ onUnmounted(() => {
 }
@keyframes spin { to { transform: rotate(360deg); } }

+/* Mic button: round 46px toggle; modifier classes below restyle it per ASR state */
+.mic-btn {
+  position: relative;
+  width: 46px; height: 46px;
+  border-radius: 50%;
+  background: rgba(255,255,255,0.07);
+  border: 1px solid var(--card-border);
+  color: var(--text-secondary);
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  cursor: pointer;
+  transition: all 0.2s;
+  flex-shrink: 0;
+  /* clips the scaled .mic-pulse layer to the button's circle */
+  overflow: hidden;
+}
+/* z-index keeps the icon above the absolutely positioned .mic-pulse layer */
+.mic-btn svg { width: 20px; height: 20px; position: relative; z-index: 1; }
+.mic-btn:hover:not(:disabled) { background: rgba(255,255,255,0.12); color: var(--text-primary); border-color: rgba(255,255,255,0.2); }
+.mic-btn:disabled { opacity: 0.4; cursor: not-allowed; }
+
+/* Recording state: red tint */
+.mic-btn.recording {
+  background: rgba(239,68,68,0.15);
+  border-color: rgba(239,68,68,0.5);
+  color: #ef4444;
+}
+.mic-btn.recording:hover:not(:disabled) { background: rgba(239,68,68,0.25); }
+
+/* Connecting state: amber tint */
+.mic-btn.connecting {
+  background: rgba(251,191,36,0.1);
+  border-color: rgba(251,191,36,0.4);
+  color: #fbbf24;
+}
+
+/* Error state: red tint */
+.mic-btn.error {
+  background: rgba(239,68,68,0.1);
+  border-color: rgba(239,68,68,0.4);
+  color: #ef4444;
+}
+
+/* Pulse animation: layer that fills the button and repeatedly grows while fading out */
+.mic-pulse {
+  position: absolute;
+  inset: 0;
+  border-radius: 50%;
+  background: rgba(239,68,68,0.2);
+  animation: micPulse 1.2s ease-out infinite;
+}
+@keyframes micPulse {
+  0% { transform: scale(0.85); opacity: 0.8; }
+  70% { transform: scale(1.15); opacity: 0; }
+  100% { transform: scale(0.85); opacity: 0; }
+}
+
+/* Connecting spinner: ring with one highlighted edge, driven by the `spin` animation */
+.mic-spinner {
+  width: 18px; height: 18px;
+  border: 2px solid rgba(251,191,36,0.3);
+  border-top-color: #fbbf24;
+  border-radius: 50%;
+  animation: spin 0.8s linear infinite;
+}
+
@media (max-width: 600px) {
  .page-container { padding: 1rem 1rem 0; }
  .bubble-wrap { max-width: 85%; }
--- a/vite.config.js
+++ b/vite.config.js
@ -2,6 +2,11 @@ import { defineConfig } from 'vite'
 import vue from '@vitejs/plugin-vue'
 import { fileURLToPath, URL } from 'node:url'

+// Doubao ASR credentials that the dev-server proxy injects as request headers
+// (keep in sync with src/config/index.js).
+// Each value can be overridden with an environment variable of the same name;
+// the literals are only fallbacks so existing setups keep working.
+// FIXME: a hard-coded access token is committed here — rotate it and supply it
+// via the environment instead of source control.
+const DOUBAO_APP_ID = process.env.DOUBAO_APP_ID ?? '2542859186'
+const DOUBAO_ACCESS_TOKEN = process.env.DOUBAO_ACCESS_TOKEN ?? 'a4h5fT3cVlBi82u93iEQlqT3c4MP6_8V'
+const DOUBAO_ASR_RESOURCE_ID = process.env.DOUBAO_ASR_RESOURCE_ID ?? 'volc.bigasr.sauc.duration'
+
 // https://vite.dev/config/
 export default defineConfig({
  plugins: [vue()],
@ -31,6 +36,21 @@ export default defineConfig({
        target: 'https://dashscope.aliyuncs.com',
        changeOrigin: true,
        rewrite: (path) => path.replace(/^\/dashscope-api/, '')
+      },
+      // ASR WebSocket 代理：在代理层注入鉴权 Header（浏览器 WS 不支持自定义 Header）
+      '/asr-ws': {
+        target: 'wss://openspeech.bytedance.com',
+        changeOrigin: true,
+        ws: true,
+        rewrite: (path) => path.replace(/^\/asr-ws/, ''),
+        configure: (proxy) => {
+          proxy.on('proxyReqWs', (proxyReq) => {
+            proxyReq.setHeader('X-Api-App-Key', DOUBAO_APP_ID)
+            proxyReq.setHeader('X-Api-Access-Key', DOUBAO_ACCESS_TOKEN)
+            proxyReq.setHeader('X-Api-Resource-Id', DOUBAO_ASR_RESOURCE_ID)
+            proxyReq.setHeader('X-Api-Connect-Id', crypto.randomUUID())
+          })
+        }
      }
    }
  }