实时语音转文字调AI回复

xiaoweiba35 2025-11-02 10:27:10

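# A just-for-fun project: real-time speech-to-text whose output is sent to an AI model for a reply.
# Only Chinese is supported for now; English recognition is poor -- improvements are welcome.
# Required model: vosk-model-cn-0.22, downloadable from https://alphacephei.com/vosk/models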
import queue
import threading
import json
import os
import requests
from openai import OpenAI
import time
from datetime import datetime

# API key for the DashScope service, read from the environment (None if unset).
DASHSCOPE_API_KEY = os.getenv('DASHSCOPE_API_KEY')

# Vosk and PyAudio are optional at import time. VOSK_AVAILABLE records whether
# both could be imported so that the failure can be reported clearly later
# instead of crashing the whole module on import.
try:
    from vosk import Model, KaldiRecognizer
    import pyaudio
    VOSK_AVAILABLE = True
except ImportError as e:
    # Name the missing dependency and how to install it.
    print(f"缺少依赖库 ({e})，请先安装: pip install vosk pyaudio")
    VOSK_AVAILABLE = False


class RealTimeASR:
    """Real-time speech recognition that forwards each utterance to an LLM.

    Audio is captured through a PyAudio input stream, decoded by a Vosk
    ``KaldiRecognizer`` on a background thread, and every finalized utterance
    is sent to the DashScope OpenAI-compatible chat endpoint. The recognized
    text and the model's reply are appended to a timestamped log file.
    """

    # Capture settings; the recognizer is created with the same sample rate.
    SAMPLE_RATE = 16000
    FRAMES_PER_BUFFER = 4096

    def __init__(self, model_path="vosk-model-cn-0.22/vosk-model-cn-0.22",
                 log_dir="语音识别日志"):
        """Load the Vosk model and prepare queues and the log file.

        Args:
            model_path: Directory of the Vosk model to load.
            log_dir: Directory in which the session log file is created.
                It is created if it does not exist.

        Raises:
            ImportError: If vosk/pyaudio could not be imported.
        """
        if not VOSK_AVAILABLE:
            raise ImportError("Vosk库未正确安装")
        self.model = Model(model_path)
        self.recognizer = KaldiRecognizer(self.model, self.SAMPLE_RATE)
        self.recognizer.SetWords(True)
        self.audio_queue = queue.Queue()    # raw audio chunks from the callback
        self.result_queue = queue.Queue()   # finalized utterances for the main loop
        self.running = False
        self.last_partial = ""
        self._llm_client = None             # created lazily on first request

        self._init_log_file(log_dir)
        print("准备大语言模型接口...")

    def _init_log_file(self, log_dir):
        """Create the timestamped log file (and its directory) with a header."""
        now = datetime.now()
        self.log_file = os.path.join(log_dir, f"{now.strftime('%Y%m%d_%H%M%S')}.txt")
        print(f"识别结果将保存到文件: {self.log_file}")

        try:
            # The directory must exist before the file can be opened; without
            # this every write to the log would fail on a fresh checkout.
            if log_dir:
                os.makedirs(log_dir, exist_ok=True)
            with open(self.log_file, 'w', encoding='utf-8') as f:
                f.write(f"语音识别日志 - {now.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        except OSError as e:
            # Logging is best-effort: recognition still works without it.
            print(f"创建日志文件失败: {e}")

    def _append_log(self, entry, failure_label):
        """Append ``entry`` to the log file.

        Args:
            entry: Text to append verbatim.
            failure_label: Prefix of the message printed if writing fails.

        Returns:
            True if the entry was written, False otherwise.
        """
        try:
            with open(self.log_file, 'a', encoding='utf-8') as f:
                f.write(entry)
        except OSError as e:
            print(f"{failure_label}: {e}")
            return False
        return True

    @staticmethod
    def _is_exit_command(text):
        """Return True if ``text`` is the spoken command to stop.

        Matches the exact phrase '退出' (ignoring any whitespace in the
        recognized text) or any text containing 'exit', case-insensitively.
        """
        compact = "".join(text.split())
        return compact == '退出' or 'exit' in text.lower()

    def audio_callback(self, in_data, frame_count, time_info, status):
        """Stream callback: queue the captured chunk while running.

        Returns ``(None, pyaudio.paContinue)``; no output data is supplied.
        """
        if self.running:
            self.audio_queue.put(in_data)
        return (None, pyaudio.paContinue)

    def processing_thread(self):
        """Decode queued audio until ``self.running`` is cleared.

        Finalized, non-empty utterances are pushed to ``result_queue``.
        The latest partial hypothesis longer than three characters is kept in
        ``last_partial``.
        """
        while self.running:
            try:
                data = self.audio_queue.get(timeout=1.0)
            except queue.Empty:
                # Wake up periodically so a cleared ``running`` flag is noticed.
                continue

            try:
                if self.recognizer.AcceptWaveform(data):
                    result = json.loads(self.recognizer.Result())
                    if result.get("text", "").strip():
                        self.last_partial = ""
                        self.result_queue.put(result["text"])

                partial = json.loads(self.recognizer.PartialResult())
                partial_text = partial.get("partial", "")
                if (partial_text.strip()
                        and partial_text != self.last_partial
                        and len(partial_text) > 3):
                    self.last_partial = partial_text
            except Exception as e:
                # Keep the worker alive on decoder/JSON errors.
                print(f"错误: {e}")

    def _get_llm_client(self):
        """Return the shared OpenAI-compatible client, creating it on first use."""
        client = getattr(self, "_llm_client", None)
        if client is None:
            client = OpenAI(
                api_key=DASHSCOPE_API_KEY,
                base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
            )
            self._llm_client = client
        return client

    def call_language_model(self, text):
        """Send ``text`` as a single user message and return the reply.

        Returns:
            The reply content, or None if the request failed (the error is
            printed rather than raised).
        """
        try:
            print(f"正在向大语言模型发送请求: {text}")

            print("正在连接百炼API...")
            response = self._get_llm_client().chat.completions.create(
                model="qwen-turbo-latest",
                messages=[{"role": "user", "content": text}]
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"调用大语言模型失败: {e}")
            return None

    def start(self):
        """Run capture, recognition and the LLM loop until asked to stop.

        Stops when the exit command is recognized, on Ctrl+C, or on an
        unexpected error. Audio resources are always released on exit.
        """
        print("启动实时语音识别（说'退出'或'exit'可停止）...")
        print("识别到的内容将自动发送给大语言模型，并显示回复")
        self.running = True

        # Pre-bind so the cleanup below knows which resources were acquired.
        p = None
        stream = None
        worker = None

        try:
            p = pyaudio.PyAudio()
            print("可用音频输入设备:")
            for i in range(p.get_device_count()):
                dev = p.get_device_info_by_index(i)
                if dev['maxInputChannels'] > 0:
                    print(f"   {i}: {dev['name']}")

            # input_device_index=None selects the default input device.
            print("尝试打开音频流...")
            stream = p.open(format=pyaudio.paInt16,
                            channels=1,
                            rate=self.SAMPLE_RATE,
                            input=True,
                            input_device_index=None,
                            stream_callback=self.audio_callback,
                            frames_per_buffer=self.FRAMES_PER_BUFFER)

            print("启动语音处理线程...")
            worker = threading.Thread(target=self.processing_thread, daemon=True)
            worker.start()
            print("处理线程已启动!")

            while self.running:
                try:
                    text = self.result_queue.get(timeout=1.0)
                except queue.Empty:
                    continue

                print(f"识别结果: {text}")
                if self._append_log(f"[用户]: {text}\n", "保存到文件失败"):
                    print(f"已保存到文件: {os.path.basename(self.log_file)}")

                if self._is_exit_command(text):
                    self.running = False
                    print("正在停止语音识别...")
                    break

                llm_response = self.call_language_model(text)
                if llm_response is None:
                    # The failure was already reported; do not record a
                    # bogus "None" reply.
                    continue
                print(f"大语言模型回复: {llm_response}")
                self._append_log(f"[AI]: {llm_response}\n\n", "保存AI回复失败")

        except KeyboardInterrupt:
            print("\n正在停止语音识别...")
        except Exception as e:
            print(f"启动错误: {e}")
        finally:
            self.running = False
            try:
                if stream is not None:
                    stream.stop_stream()
                    stream.close()
            finally:
                # Terminate PyAudio even if stopping the stream failed.
                if p is not None:
                    p.terminate()
            if worker is not None:
                # The worker polls ``running`` at least once per second.
                worker.join(timeout=2.0)
            print("语音识别已停止")


if __name__ == "__main__":
    # start() is a method of RealTimeASR, so an instance must be created
    # first; a bare start() call raises NameError.
    RealTimeASR().start()

...全文

80 回复打赏收藏转发到动态举报

写回复

用AI写文章

切换为时间正序

请发表友善的回复…

发表回复

腾讯AI-TalkWithRobot 基于腾讯ai接口的语音闲聊，目前实现了语音转文字，闲聊回复，文字转语音并播放基本函数（接口必需的信息） 1，生成签名签名 # 生成sign接口信息，对任意ai都通用，接口鉴权 # APPKEY:换成你的app_APPKEY def get_sign ( self , data ): lst = [ i [ 0 ] + '=' + parse . quote_plus ( str ( i [ 1 ])) for i in data . items ()] params = '&' . join ( sorted ( lst )) s = params + '&app_key=' + APPKEY\n\t\t#print(s)\n\t\th = hashlib . md5 ( s . encode ( 'utf8' )) return h

基于百度AI的智能聊天机器人Python代码(有注释) 实现了语音录制、调用百度API进行语音识别、调用百度UNIT智能机器人进行消息回复、文字合成语音

在当今数字化时代，语音识别技术的应用愈发广泛，从智能语音助手到实时字幕生成，从语音控制设备到实时会议记录，它为我们的生活和工作带来了极大的便利。今天，我们要深入探讨的是一款名为RealtimeSTT的开源实时语音转文本库，它以其高效、低延迟和丰富的功能，在语音识别领域崭露头角。RealtimeSTT是一个基于Python的开源项目，它源自Linguflex项目，旨在为各种实时应用提供高效、低延迟的语音转文本解决方案。

“做会议记录、看无字幕网课再也不用担心，解放双手，提高效率。”随着语音转文字技术的发展，我们记录会议、上课内容等有了更好的方式。实时语音转文字实现边听边看，并且还可回看转译记录，随时翻看电子档。解放双手，提高效率。话不多说，咱们开始推荐这些好用的APP。文末附总结及天使语训安装包链接。01—讯飞听见-录音转文字提到语音转文字怎么可能没讯飞呢。会议、采访、学习培训录音整理神器。边说边转，...

Python使用websocket调用语音识别，语音转文字0. 太长不看系列，直接使用1. Python调用标贝科技语音识别websocket接口，实现语音转文字1.1 环境准备：1.2 获取权限1.2.1 登录1.2.2 创建新应用1.2.3 选择服务1.2.4 获取Key&Secret2. 代码实现2.1 获取access_token2.2 准备数据2.3 配置接口参数2.4 建立websocket客户端2.5 完整demo2.5 执行 0. 太长不看系列，直接使用在1.2官网注册后拿到API

AI 前沿

5,915

社区成员

1,535

社区内容

发帖

与我相关

我的任务

社区管理员

加入社区

近7日
近30日
至今

加载中

查看更多榜单

社区公告

尊敬的用户，您好！
我们很高兴地宣布，ai.csdn.net现在已经正式上线了！这是一个全新的AI技术社区，我们将为您提供最新的AI技术资讯、最前沿的AI技术应用案例、最专业的AI技术交流平台。我们希望通过这个平台，让更多的AI技术爱好者能够相互交流、共同进步。欢迎您加入我们的大家庭，一起探讨AI技术的未来！
我们的AI技术社区还提供了AIGC（人工智能生成内容）服务，为您提供最专业的AI技术支持。我们的AIGC团队由一群资深的AI技术专家组成，他们将为您提供最新的AI技术资讯、最前沿的AI技术应用案例、最专业的AI技术交流平台。无论您是AI技术爱好者，还是AI技术从业者，我们都将为您提供最优质的服务，让您在AI技术的道路上走得更远！

试试用AI创作助手写篇文章吧

+ 用AI写文章