1. Technology Selection and Preparation
1.1 Browser Native API Analysis
Modern browsers expose two core interfaces, the MediaRecorder API and the Web Speech API:
- MediaRecorder API: captures live audio streams; supported in Chrome, Firefox, Edge, and other mainstream browsers
- Web Speech API: provides speech recognition (SpeechRecognition) and speech synthesis (SpeechSynthesis)

Caveats:
- Safari's support for some of these APIs is incomplete (e.g. Opus encoding in MediaRecorder)
- Mobile browsers enforce stricter permission handling
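Because support varies this much, it is worth probing both APIs at runtime before wiring up any UI. A minimal feature-detection sketch (the codec string is only an example):

```js
// Detect MediaRecorder support and a preferred container/codec.
function detectCapabilities() {
  const hasMediaRecorder = typeof MediaRecorder !== 'undefined';
  const supportsOpus =
    hasMediaRecorder && MediaRecorder.isTypeSupported('audio/webm;codecs=opus');
  // Web Speech API: prefixed in Chromium-based browsers, absent in Firefox.
  const SpeechRecognitionCtor =
    window.SpeechRecognition || window.webkitSpeechRecognition;
  return {
    hasMediaRecorder,
    supportsOpus,
    hasSpeechRecognition: Boolean(SpeechRecognitionCtor)
  };
}
```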
1.2 Third-Party Service Comparison
| Option | Strengths | Limitations |
|---|---|---|
| Browser native APIs | Zero dependencies; data never leaves the client | Limited features; poor mobile compatibility |
| WebSocket SDK | High concurrency, low latency | Requires backend support |
| Commercial ASR service | High accuracy; multilingual | Rate limits; may incur cost |
Recommended hybrid approach:
- Use the Web Speech API for basic functionality
- Integrate a commercial ASR service such as Alibaba Cloud or Tencent Cloud for high-accuracy needs (one way to switch between the two is sketched after this list)
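A small strategy helper can encode that choice at runtime; the `preferCloud` flag and the returned strategy names are illustrative, not part of any library:

```js
// Decide which recognition strategy to use (illustrative helper).
function pickRecognitionStrategy({ preferCloud = false } = {}) {
  const hasNativeASR =
    'SpeechRecognition' in window || 'webkitSpeechRecognition' in window;
  // Native SpeechRecognition transcribes the live microphone stream;
  // the cloud path records a blob first and uploads it (see section 2.2.2).
  if (preferCloud || !hasNativeASR) return 'cloud';
  return 'native';
}
```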
2. Core Feature Implementation
2.1 Recorder Component
2.1.1 Permission Request and Device Detection
```js
async function checkAudioPermission() {
  try {
    // Request the microphone once, then release it immediately:
    // the goal here is only to trigger the permission prompt.
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    stream.getTracks().forEach(track => track.stop());
    return { status: 'granted', message: 'Microphone access granted' };
  } catch (err) {
    return {
      status: 'denied',
      message: `Permission error: ${err.message}`,
      code: err.name === 'NotAllowedError' ? 403 : 500
    };
  }
}
```
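A typical call site, run before showing any recording UI:

```js
// e.g. inside an async init function
const permission = await checkAudioPermission();
if (permission.status !== 'granted') {
  console.warn(permission.message); // surface this in the UI instead
}
```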
2.1.2 Recording State Management
Reactive control is implemented with Vue 3's Composition API:
```js
import { ref, onUnmounted } from 'vue';

export function useAudioRecorder() {
  const isRecording = ref(false);
  const mediaRecorder = ref(null);
  const audioChunks = ref([]);

  const startRecording = async () => {
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      mediaRecorder.value = new MediaRecorder(stream, {
        mimeType: 'audio/webm',
        audioBitsPerSecond: 128000
      });
      mediaRecorder.value.ondataavailable = (event) => {
        audioChunks.value.push(event.data);
      };
      mediaRecorder.value.start(100); // collect a chunk every 100 ms
      isRecording.value = true;
    } catch (error) {
      console.error('Failed to start recording:', error);
    }
  };

  const stopRecording = () => {
    return new Promise((resolve) => {
      if (!mediaRecorder.value) return resolve(null);
      mediaRecorder.value.onstop = () => {
        const audioBlob = new Blob(audioChunks.value, { type: 'audio/webm' });
        // Release the microphone so the browser's recording indicator turns off.
        mediaRecorder.value.stream.getTracks().forEach(track => track.stop());
        resolve(audioBlob);
        audioChunks.value = [];
      };
      mediaRecorder.value.stop();
      isRecording.value = false;
    });
  };

  onUnmounted(() => {
    if (mediaRecorder.value?.state === 'recording') {
      mediaRecorder.value.stop();
    }
  });

  return { isRecording, startRecording, stopRecording };
}
```
2.2 Speech-to-Text
2.2.1 Browser-Native Approach
```js
import { ref } from 'vue';

export function useSpeechRecognition() {
  const recognition = ref(null);
  const isListening = ref(false);
  const transcript = ref('');

  const initRecognition = () => {
    // Prefixed in Chromium-based browsers; unavailable in Firefox.
    const SpeechRecognitionCtor =
      window.SpeechRecognition || window.webkitSpeechRecognition;
    if (!SpeechRecognitionCtor) {
      console.error('SpeechRecognition is not supported in this browser');
      return;
    }
    recognition.value = new SpeechRecognitionCtor();
    recognition.value.continuous = true;
    recognition.value.interimResults = true;
    recognition.value.lang = 'zh-CN';
    recognition.value.onresult = (event) => {
      let interimTranscript = '';
      for (let i = event.resultIndex; i < event.results.length; i++) {
        const transcriptPiece = event.results[i][0].transcript;
        if (event.results[i].isFinal) {
          transcript.value += transcriptPiece;
        } else {
          interimTranscript += transcriptPiece;
        }
      }
      // Interim results could be surfaced to the UI here.
    };
    recognition.value.onerror = (event) => {
      console.error('Recognition error:', event.error);
    };
  };

  const startListening = () => {
    if (!recognition.value) initRecognition();
    if (!recognition.value) return; // unsupported browser
    recognition.value.start();
    isListening.value = true;
  };

  const stopListening = () => {
    if (recognition.value) {
      recognition.value.stop();
      isListening.value = false;
    }
  };

  return { isListening, transcript, startListening, stopListening };
}
```
2.2.2 Cloud ASR Integration (WebSocket Example)
```js
async function connectToASRService(audioBlob) {
  const socket = new WebSocket('wss://asr.example.com/api');
  const audioUrl = URL.createObjectURL(audioBlob);

  return new Promise((resolve, reject) => {
    socket.onopen = async () => {
      try {
        // Decode the recording so we can stream raw PCM samples.
        const audioContext = new AudioContext();
        const buffer = await fetch(audioUrl).then(r => r.arrayBuffer());
        const audioBuffer = await audioContext.decodeAudioData(buffer);
        URL.revokeObjectURL(audioUrl);

        // Send metadata first; the declared sample rate must match the
        // actual data (resample to 16 kHz first if the service requires it).
        socket.send(JSON.stringify({
          type: 'metadata',
          format: 'pcm',
          sampleRate: audioBuffer.sampleRate
        }));

        // Stream the first channel in ~0.5 s chunks.
        const channelData = audioBuffer.getChannelData(0);
        const chunkSize = Math.floor(audioBuffer.sampleRate * 0.5);
        for (let i = 0; i < channelData.length; i += chunkSize) {
          socket.send(channelData.slice(i, i + chunkSize));
        }
        socket.send(JSON.stringify({ type: 'end' }));
      } catch (err) {
        reject(err);
      }
    };

    socket.onmessage = (event) => {
      const data = JSON.parse(event.data);
      if (data.type === 'partial') {
        // Interim hypotheses could be rendered here.
      } else if (data.type === 'final') {
        resolve(data.text);
      }
    };

    socket.onerror = () => {
      reject(new Error('ASR connection error'));
    };
  });
}
```
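Wiring this to the recorder from section 2.1.2 might look like the following (inside an async function, error handling omitted):

```js
const audioBlob = await stopRecording();
const text = await connectToASRService(audioBlob);
console.log('Final transcript:', text);
```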
3. Performance Optimization and Error Handling
3.1 Audio Processing Optimization
- Sample-rate conversion: use an OfflineAudioContext to downsample recordings before upload (a usage example follows this list):

```js
// Render the buffer through an OfflineAudioContext at the target rate.
function resampleAudio(audioBuffer, targetRate) {
  const offlineCtx = new OfflineAudioContext(
    1, // mono output
    Math.ceil(audioBuffer.length * targetRate / audioBuffer.sampleRate),
    targetRate
  );
  const source = offlineCtx.createBufferSource();
  source.buffer = audioBuffer;
  source.connect(offlineCtx.destination);
  source.start();
  return offlineCtx.startRendering(); // resolves with the resampled AudioBuffer
}
```
- WebAssembly acceleration: offload feature extraction to WASM audio libraries such as librosa.js
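For example, to bring a decoded recording down to the 16 kHz that many ASR services expect (`audioBuffer` here is an AudioBuffer obtained from `decodeAudioData`):

```js
const resampled = await resampleAudio(audioBuffer, 16000);
console.log(resampled.sampleRate); // 16000
```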
3.2 Error Recovery
```js
const retryPolicy = {
  maxRetries: 3,
  timeout: 5000,
  // Retry only transient failures (network errors and timeouts),
  // and give up after the second retry.
  shouldRetry: (error, attempt) =>
    attempt < 2 &&
    (error.name === 'NetworkError' || error.message.includes('timeout'))
};

async function withRetry(fn, policy) {
  let lastError = null;
  for (let i = 0; i < policy.maxRetries; i++) {
    try {
      // Race the operation against a timeout.
      return await Promise.race([
        fn(),
        new Promise((_, reject) =>
          setTimeout(() => reject(new Error('Request timeout')), policy.timeout))
      ]);
    } catch (error) {
      lastError = error;
      if (!policy.shouldRetry(error, i)) break;
      // Linear backoff: 1 s, 2 s, 3 s, ...
      await new Promise(resolve => setTimeout(resolve, 1000 * (i + 1)));
    }
  }
  throw lastError;
}
```
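Used around the ASR call from section 2.2.2:

```js
const text = await withRetry(() => connectToASRService(audioBlob), retryPolicy);
```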
4. Complete Component Example
```vue
<template>
  <div class="voice-assistant">
    <div class="control-panel">
      <button @click="toggleRecording" :disabled="isProcessing">
        {{ isRecording ? 'Stop Recording' : 'Start Recording' }}
      </button>
      <button @click="toggleListening" :disabled="isProcessing">
        {{ isListening ? 'Stop Recognition' : 'Speech Recognition' }}
      </button>
    </div>
    <div class="status-display">
      <div v-if="error" class="error-message">{{ error }}</div>
      <div v-else-if="isProcessing" class="processing-indicator">
        Processing... {{ progress }}%
      </div>
      <div v-else-if="transcript" class="transcript-display">
        Result: {{ transcript }}
      </div>
    </div>
  </div>
</template>

<script setup>
import { ref } from 'vue';
import { useAudioRecorder } from './composables/audioRecorder';
import { useSpeechRecognition } from './composables/speechRecognition';

const { isRecording, startRecording, stopRecording } = useAudioRecorder();
const { isListening, transcript, startListening, stopListening } =
  useSpeechRecognition();

const isProcessing = ref(false);
const error = ref(null);
const progress = ref(0);

const toggleRecording = async () => {
  if (isRecording.value) {
    isProcessing.value = true;
    error.value = null; // clear any stale error before reprocessing
    progress.value = 0;
    try {
      const audioBlob = await stopRecording();
      // Progress updates could be wired in here.
      const result = await processAudio(audioBlob);
      transcript.value = result;
    } catch (err) {
      error.value = `Processing failed: ${err.message}`;
    } finally {
      isProcessing.value = false;
    }
  } else {
    await startRecording();
  }
};

const toggleListening = () => {
  if (isListening.value) {
    stopListening();
  } else {
    startListening();
  }
};

async function processAudio(audioBlob) {
  // In a real project, call the ASR service here.
  return new Promise(resolve => {
    setTimeout(() => resolve('Simulated recognition result'), 1500);
  });
}
</script>

<style scoped>
.voice-assistant {
  max-width: 600px;
  margin: 0 auto;
  padding: 20px;
}
.control-panel {
  display: flex;
  gap: 10px;
  margin-bottom: 20px;
}
.status-display {
  min-height: 100px;
  padding: 15px;
  border: 1px solid #eee;
  border-radius: 5px;
}
</style>
```
5. Deployment Considerations
- HTTPS requirement: all media APIs must run in a secure context (getUserMedia rejects on insecure origins; http://localhost is exempt during development)
- Mobile adaptation:
  - iOS grants microphone access only after a user gesture
  - Android Chrome 70+ supports the full feature set
- Performance monitoring (see the sketch below):
  - Use PerformanceObserver to track how long audio processing takes
  - Log the ASR service's response time and recognition accuracy
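A minimal sketch of the timing side, using performance.mark/measure with a PerformanceObserver (the measure name is illustrative):

```js
// Observe "measure" entries and log how long each stage took.
const observer = new PerformanceObserver((list) => {
  for (const entry of list.getEntries()) {
    console.log(`${entry.name}: ${entry.duration.toFixed(1)} ms`);
  }
});
observer.observe({ entryTypes: ['measure'] });

// Around the ASR call (inside an async context):
performance.mark('asr-start');
const text = await connectToASRService(audioBlob);
performance.mark('asr-end');
performance.measure('asr-roundtrip', 'asr-start', 'asr-end');
```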
The approach described here has been validated in real projects and tested under Chrome 115+ and Firefox 114+. Choose the native API or the hybrid scheme based on your actual needs; for enterprise applications, a commercial ASR service is recommended for better recognition accuracy and stability.