"""Gradio speech-recognition demo for a Whisper model fine-tuned on Shanxi dialect.

Loads the model at import time, wraps it in a transformers ASR pipeline,
and exposes a microphone -> text interface via Gradio.
"""

import logging

import gradio as gr
from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline

# Configure root logging once for the whole app.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hugging Face Hub id of the fine-tuned Whisper checkpoint.
model_id = "ZizaM/whisperfinetune_shanxi"

try:
    logger.info("开始加载模型: %s", model_id)

    # Load processor (tokenizer + feature extractor) and model weights.
    processor = WhisperProcessor.from_pretrained(model_id)
    model = WhisperForConditionalGeneration.from_pretrained(model_id)
    logger.info("模型加载成功")

    # Build the ASR pipeline from the already-loaded components so the
    # checkpoint is not downloaded/loaded a second time.
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
    )
    logger.info("Pipeline 创建成功")

    def transcribe(audio):
        """Transcribe one recording from the Gradio Audio component.

        Args:
            audio: Either a file path (``type="filepath"``), a
                ``(sample_rate, numpy_data)`` tuple (``type="numpy"``),
                or ``None`` when nothing was recorded.

        Returns:
            The recognized text, or a user-facing Chinese error message.
        """
        if audio is None:
            logger.warning("没有接收到音频输入")
            return "请先录制音频"
        try:
            logger.info("开始处理音频...")
            if isinstance(audio, tuple):
                # BUGFIX: a Gradio audio tuple is (sample_rate, numpy_data),
                # not (..., file_path). Feed the pipeline a dict so the
                # sample rate is not silently dropped.
                sample_rate, raw_audio = audio
                audio_path = {"sampling_rate": sample_rate, "raw": raw_audio}
            else:
                audio_path = audio
            logger.info("处理音频文件: %s", audio_path)

            # Force Chinese decoding so the model does not auto-detect language.
            result = pipe(audio_path, generate_kwargs={"language": "zh"})

            if not result or "text" not in result:
                logger.error("模型返回结果格式错误")
                return "识别失败:模型返回结果格式错误"

            transcribed_text = result["text"].strip()
            if not transcribed_text:
                logger.warning("识别结果为空")
                return "未能识别出有效内容,请重试"

            logger.info("识别成功: %s", transcribed_text)
            return transcribed_text
        except Exception as e:
            # logger.exception == logger.error(..., exc_info=True).
            logger.exception("识别过程中出错: %s", e)
            return f"识别失败: {str(e)}"

    # Build the web UI: microphone input -> transcription textbox.
    iface = gr.Interface(
        fn=transcribe,
        inputs=gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="点击麦克风录音",
        ),
        outputs=gr.Textbox(
            label="识别结果",
            placeholder="这里将显示识别结果...",
        ),
        title="语音识别系统",
        description="点击麦克风图标开始录音,再次点击停止录音。等待几秒钟后会显示识别结果。",
        examples=None,
        cache_examples=False,
    )

    iface.launch()
except Exception as e:
    logger.exception("应用启动失败: %s", e)
    # Bare raise preserves the original traceback (``raise e`` resets it).
    raise