File size: 2,788 Bytes
b61a569
e2ca24f
20f31d6
5c6a8d2
20f31d6
 
 
 
 
e2ca24f
 
20f31d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c6a8d2
20f31d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5c6a8d2
20f31d6
848524b
20f31d6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import logging

# Hugging Face Hub identifier of the fine-tuned Whisper checkpoint to serve.
model_id = "ZizaM/whisperfinetune_shanxi"

# Module-level logger; the root logger is configured to emit INFO and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

def _to_pipeline_input(audio):
    """Convert a Gradio audio value into an input the ASR pipeline accepts.

    Args:
        audio: Either a file path (what ``gr.Audio(type="filepath")``
            delivers) or a ``(sample_rate, samples)`` tuple (what
            ``gr.Audio(type="numpy")`` delivers).

    Returns:
        The value unchanged when it is not a tuple; otherwise a dict with
        ``"sampling_rate"`` and ``"raw"`` keys holding mono float32 samples
        scaled to the range [-1, 1].
    """
    if not isinstance(audio, tuple):
        return audio

    # The second tuple element is the waveform, not a file path. It has to
    # travel together with its sample rate so the pipeline can resample it.
    sampling_rate, samples = audio
    if samples.ndim > 1:
        # Down-mix (num_samples, num_channels) to a single channel.
        samples = samples.mean(axis=1)
    samples = samples.astype("float32")
    peak = float(abs(samples).max()) if samples.size else 0.0
    if peak > 0:
        # Integer PCM arrives at full integer scale; normalise to [-1, 1].
        samples = samples / peak
    return {"sampling_rate": sampling_rate, "raw": samples}


def transcribe(audio):
    """Transcribe one recording with the module-level ``pipe``.

    Args:
        audio: Value supplied by the Gradio audio component: ``None`` when
            nothing was recorded, a file path, or a
            ``(sample_rate, samples)`` tuple.

    Returns:
        The recognised text with surrounding whitespace removed, or a
        user-facing message describing why no text could be produced.
        Exceptions raised during recognition are logged and reported in the
        returned string instead of propagating to the UI.
    """
    if audio is None:
        logger.warning("没有接收到音频输入")
        return "请先录制音频"

    try:
        logger.info("开始处理音频...")
        pipeline_input = _to_pipeline_input(audio)
        if isinstance(pipeline_input, dict):
            logger.info("处理音频数据, 采样率: %s", pipeline_input["sampling_rate"])
        else:
            logger.info("处理音频文件: %s", pipeline_input)

        result = pipe(pipeline_input, generate_kwargs={"language": "zh"})

        if not result or "text" not in result:
            logger.error("模型返回结果格式错误")
            return "识别失败:模型返回结果格式错误"

        transcribed_text = result["text"].strip()
        if not transcribed_text:
            logger.warning("识别结果为空")
            return "未能识别出有效内容,请重试"

        logger.info("识别成功: %s", transcribed_text)
        return transcribed_text

    except Exception as e:
        # UI boundary: log the traceback and show the error to the user.
        logger.exception("识别过程中出错: %s", e)
        return f"识别失败: {e}"


try:
    logger.info("开始加载模型: %s", model_id)
    # Load the fine-tuned Whisper weights and the matching processor.
    processor = WhisperProcessor.from_pretrained(model_id)
    model = WhisperForConditionalGeneration.from_pretrained(model_id)
    logger.info("模型加载成功")

    # Wrap model, tokenizer and feature extractor in an ASR pipeline.
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor
    )
    logger.info("Pipeline 创建成功")

    # Build the Gradio interface: microphone recording in, text out.
    iface = gr.Interface(
        fn=transcribe,
        inputs=gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="点击麦克风录音"
        ),
        outputs=gr.Textbox(
            label="识别结果",
            placeholder="这里将显示识别结果..."
        ),
        title="语音识别系统",
        description="点击麦克风图标开始录音,再次点击停止录音。等待几秒钟后会显示识别结果。",
        examples=None,
        cache_examples=False
    )

    iface.launch()

except Exception as e:
    # Top-level boundary: record the failure, then re-raise the same
    # exception so the process still terminates with the original traceback.
    logger.exception("应用启动失败: %s", e)
    raise