DrBardiaGh committed on
Commit
03c17d1
·
verified ·
1 Parent(s): 692a2d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -53
app.py CHANGED
@@ -1,64 +1,109 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
 
 
 
3
 
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
# Shared Inference API client; every chat request in this module goes
# through this single instance, bound to the zephyr-7b-beta model.
client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta")
 
 
8
 
 
 
 
 
 
9
 
10
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat reply for ``message`` given the prior conversation.

    Builds a chat-completion message list from the system prompt, the
    previous (user, assistant) turns in ``history`` and the new user
    ``message``, then streams the completion from the module-level
    ``client``.

    Args:
        message: The new user message.
        history: Previous turns as (user_text, assistant_text) pairs;
            empty/None entries in a pair are skipped.
        system_message: Content of the leading system message.
        max_tokens: Forwarded to ``client.chat_completion``.
        temperature: Forwarded to ``client.chat_completion``.
        top_p: Forwarded to ``client.chat_completion``.

    Yields:
        The reply accumulated so far, once per streamed chunk.
    """
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # The loop variable is deliberately NOT called ``message`` so it does not
    # shadow the user-message parameter.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        # A streamed delta may carry no text (content is None, e.g. on the
        # final chunk); concatenating None to a str would raise TypeError.
        if token:
            response += token
        yield response
 
 
 
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
# Extra controls shown under the chat box; their values are passed to
# `respond` after (message, history), in this order.
additional_controls = [
    gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
    gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
]

demo = gr.ChatInterface(respond, additional_inputs=additional_controls)


if __name__ == "__main__":
    demo.launch()
 
1
  import gradio as gr
2
+ import torch
3
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
4
+ import openai
5
+ import os
6
 
7
+ # --------------- STEP 1: Load the free STT model from Hugging Face ---------------
8
+ # We use a Persian Wav2Vec2 model
9
+ # Example: 'm3hrdadfi/wav2vec2-large-xlsr-persian-v2'
10
# Persian wav2vec2 checkpoint used for speech recognition; the tokenizer and
# the CTC model are both loaded from this same checkpoint name.
stt_model_name = "m3hrdadfi/wav2vec2-large-xlsr-persian-v2"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(stt_model_name)
stt_model = Wav2Vec2ForCTC.from_pretrained(stt_model_name)

# --------------- STEP 2: Configure OpenAI API for GPT-4o mini (fine-tuned) ---------------
# The API key is taken from the OPENAI_API_KEY environment variable; when the
# variable is unset the placeholder string below is used instead.
# Never commit a real key to a public repository.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY_HERE")

# Name of the chat model that performs the text correction.
OPENAI_MODEL_NAME = "gpt-4o-mini"
 
 
 
 
 
 
 
 
21
 
22
+ # --------------- STT Function ---------------
23
def speech_to_text(audio, target_sample_rate=16000):
    """Transcribe Persian speech with the module-level wav2vec2 model.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` tuple as produced by
            a Gradio audio component with ``type="numpy"``. ``samples`` may
            be integer PCM or float, mono or multi-channel (assumed laid out
            as ``(samples, channels)`` -- confirm against the Gradio version
            in use).
        target_sample_rate: Sample rate the audio is converted to before it
            is fed to the model. Defaults to 16000, the rate the wav2vec2
            XLSR checkpoint expects per its model card.

    Returns:
        The lower-cased transcription, or ``""`` when there is no audio.
    """
    if audio is None:
        return ""
    sample_rate, data = audio

    waveform = torch.as_tensor(data)
    if waveform.numel() == 0:
        # Empty recording: nothing to transcribe, and the model would fail
        # on a zero-length input.
        return ""

    # Integer PCM (e.g. int16 from the microphone) is scaled to [-1, 1];
    # float input is only cast to float32.
    if torch.is_floating_point(waveform):
        waveform = waveform.to(torch.float32)
    else:
        full_scale = float(torch.iinfo(waveform.dtype).max)
        waveform = waveform.to(torch.float32) / full_scale

    # Downmix multi-channel audio to mono by averaging the channel axis.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=-1)

    # Resample when the recording rate differs from what the model expects.
    # Linear interpolation is a simple dependency-free resampler; it applies
    # no anti-aliasing filter, so a dedicated resampler would be more
    # accurate if one becomes available.
    if sample_rate != target_sample_rate:
        new_length = max(1, round(waveform.numel() * target_sample_rate / sample_rate))
        waveform = torch.nn.functional.interpolate(
            waveform.reshape(1, 1, -1),
            size=new_length,
            mode="linear",
            align_corners=False,
        ).reshape(-1)

    input_values = tokenizer(
        waveform.numpy(),
        return_tensors="pt",
        sampling_rate=target_sample_rate,
    ).input_values

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = stt_model(input_values).logits

    # Greedy CTC decoding: most likely token per frame, collapsed by decode().
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.decode(predicted_ids[0])

    # No punctuation restoration is done here; the text is only lower-cased.
    return transcription.lower()
39
 
40
+ # --------------- Correction Function with GPT-4o mini ---------------
41
def correct_text_with_gpt(text):
    """Return a formal, corrected version of Persian ``text`` via OpenAI chat.

    Sends ``text`` to the chat model named by ``OPENAI_MODEL_NAME`` with a
    system prompt asking for a formal (medical/legal) rewrite without
    commentary.

    Args:
        text: Raw Persian text, typically the speech-to-text output.

    Returns:
        The corrected text with surrounding whitespace stripped, or ``""``
        when ``text`` is ``None``, empty or whitespace-only (no API call is
        made in that case) or when the model returns no content.
    """
    # Guard against None as well as blank input so no paid request is sent
    # for nothing and ``.strip()`` is never called on None.
    if not text or not text.strip():
        return ""

    system_message = (
        "You are a specialized model that receives Persian text and corrects it with a formal tone, "
        "particularly for medical/legal context. Make the text coherent, unify spacing, and punctuation, "
        "and ensure a formal writing style. Do not add commentary. Just provide the corrected text."
    )

    user_message = f"متن خام: {text}\n\nلطفاً متن فوق را در یک پاراگراف رسمی و اداری اصلاح کن."

    request_kwargs = {
        "model": OPENAI_MODEL_NAME,
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
        "temperature": 0.2,
        "max_tokens": 1024,
    }

    # openai>=1.0 removed ``openai.ChatCompletion``; the replacement is
    # ``openai.chat.completions.create`` (which still honours the
    # module-level ``openai.api_key``). The ``OpenAI`` class only exists in
    # the 1.x line, so it is used to tell the two APIs apart.
    if hasattr(openai, "OpenAI"):
        response = openai.chat.completions.create(**request_kwargs)
    else:
        response = openai.ChatCompletion.create(**request_kwargs)

    corrected = response.choices[0].message.content
    # ``content`` can be None (e.g. an empty completion); normalise to "".
    return (corrected or "").strip()
71
 
72
+ # --------------- Gradio Interface ---------------
73
+ # We want:
74
+ # - A widget to record or upload audio
75
+ # - A box that shows transcribed text
76
+ # - A button "اصلاح متن" that calls GPT-4o mini
77
+ # - A final output box
 
 
78
 
79
def process_audio_and_correct(audio):
    """Run the full pipeline on one recording.

    Transcribes ``audio`` with ``speech_to_text`` and passes the transcript
    to ``correct_text_with_gpt``.

    Returns:
        A ``(raw_transcript, corrected_text)`` tuple.
    """
    transcript = speech_to_text(audio)
    return transcript, correct_text_with_gpt(transcript)
85
 
86
# UI: one audio input, two text outputs (raw transcript and corrected text)
# and a single button that runs transcription followed by correction.
with gr.Blocks() as demo:
    gr.Markdown("# وب‌سایت تبدیل گفتار به متن (فارسی) + اصلاح متن با GPT-4o mini")
    gr.Markdown("در اینجا می‌توانید یک وویس فارسی ضبط یا فایل صوتی آپلود کنید. سپس متن استخراج‌شده و اصلاح‌شده را دریافت کنید.")

    with gr.Row():
        # Gradio 4+ takes a `sources` list (the old singular `source=`
        # keyword is rejected); both recording and upload are enabled to
        # match what the label promises.
        audio_in = gr.Audio(
            sources=["microphone", "upload"],
            type="numpy",
            label="ضبط صدا یا آپلود فایل (رایگان)",
        )

    transcribed_text = gr.Textbox(label="متن خام (خروجی مرحله اول - رایگان)", lines=3)
    corrected_text = gr.Textbox(label="متن اصلاح‌شده (با GPT-4o mini - پولی)", lines=3)

    btn_process = gr.Button("تبدیل و اصلاح")

    # The handler's two return values fill the two text boxes in order.
    btn_process.click(
        fn=process_audio_and_correct,
        inputs=audio_in,
        outputs=[transcribed_text, corrected_text],
    )


# Only start the server when run as a script, so importing this module
# (e.g. for testing) does not launch the app.
if __name__ == "__main__":
    demo.launch()