Spaces: Runtime error
DrBardiaGh committed: Update app.py
app.py
CHANGED
@@ -1,64 +1,109 @@
 import gradio as gr
-
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-
-
-    ):
-        token = message.choices[0].delta.content
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-
-
-if __name__ == "__main__":
-    demo.launch()
+import torch
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
+import openai
+import os
+
+# --------------- STEP 1: Load the free STT model from Hugging Face ---------------
+# We use a Persian Wav2Vec2 model
+# Example: 'm3hrdadfi/wav2vec2-large-xlsr-persian-v2'
+stt_model_name = "m3hrdadfi/wav2vec2-large-xlsr-persian-v2"
+tokenizer = Wav2Vec2Tokenizer.from_pretrained(stt_model_name)
+stt_model = Wav2Vec2ForCTC.from_pretrained(stt_model_name)
+
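A side note on the loading step above: newer transformers releases steer audio input toward Wav2Vec2Processor rather than Wav2Vec2Tokenizer, which is flagged as deprecated for this use. A minimal sketch of the same step with the processor, assuming the checkpoint above ships a processor config:

# Sketch, not part of the commit: load the same checkpoint through
# Wav2Vec2Processor (feature extractor + CTC tokenizer in one object).
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

stt_model_name = "m3hrdadfi/wav2vec2-large-xlsr-persian-v2"
processor = Wav2Vec2Processor.from_pretrained(stt_model_name)
stt_model = Wav2Vec2ForCTC.from_pretrained(stt_model_name)
stt_model.eval()  # inference only

Decoding then goes through processor.batch_decode(predicted_ids) or processor.decode(predicted_ids[0]), mirroring the tokenizer calls used below.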
+# --------------- STEP 2: Configure OpenAI API for GPT-4o mini (fine-tuned) ---------------
+# You need your OpenAI API key. Let's read it from an environment variable for safety.
+openai.api_key = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY_HERE")
+# If you'd rather hardcode it, replace "YOUR_API_KEY_HERE" with your actual key,
+# but do not commit your real key publicly.
+
+OPENAI_MODEL_NAME = "gpt-4o-mini"  # The exact model name you said you'd use (example)
+
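On Spaces the key is normally stored as a repository secret, which the app sees as an environment variable; the fallback placeholder above means a missing secret only surfaces later as a failed API call. A small sketch, assuming the secret is named OPENAI_API_KEY as in the os.getenv call, that fails fast at startup instead:

# Sketch, not part of the commit: require the key up front.
import os
import openai

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; add it as a Space secret.")
openai.api_key = api_key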
+# --------------- STT Function ---------------
+def speech_to_text(audio):
+    """
+    audio is a tuple (sample_rate, numpy_data) from Gradio's microphone or an audio file.
+    Convert Persian speech to text using the wav2vec2 model, fully offline.
+    """
+    if audio is None:
+        return ""
+    sample_rate, data = audio
+    # Convert to a torch tensor of input values
+    input_values = tokenizer(data, return_tensors="pt", sampling_rate=sample_rate).input_values
+    with torch.no_grad():
+        logits = stt_model(input_values).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = tokenizer.decode(predicted_ids[0])
+    # The transcription may be uppercase and missing punctuation; we just lowercase it and return it raw.
+    return transcription.lower()
+
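One caveat about speech_to_text above: with type="numpy", Gradio hands over int16 samples at whatever rate was recorded, while XLSR wav2vec2 checkpoints are trained on 16 kHz float32 mono. A hedged preprocessing sketch; prepare_audio is an illustrative helper, not part of the committed file, and torchaudio is just one way to resample:

# Sketch, not part of the commit: normalize Gradio audio for the model.
import numpy as np
import torch
import torchaudio

def prepare_audio(sample_rate: int, data: np.ndarray) -> torch.Tensor:
    waveform = torch.from_numpy(data.astype(np.float32))
    if waveform.abs().max() > 1.0:   # int16 samples -> [-1.0, 1.0]
        waveform = waveform / 32768.0
    if waveform.ndim > 1:            # stereo -> mono
        waveform = waveform.mean(dim=-1)
    if sample_rate != 16000:         # resample to the model's expected rate
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    return waveform

The resulting waveform can then be passed to the tokenizer (or processor) with sampling_rate=16000 before running the model.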
+# --------------- Correction Function with GPT-4o mini ---------------
+def correct_text_with_gpt(text):
+    """
+    Send the text to OpenAI GPT-4o mini for correction/improvement.
+    The user wants a formal tone consistent with medical/legal usage.
+    The fine-tuned model is presumably specialized for that domain.
+    """
+    if not text.strip():
+        return ""
+
+    system_message = (
+        "You are a specialized model that receives Persian text and corrects it with a formal tone, "
+        "particularly for medical/legal context. Make the text coherent, unify spacing and punctuation, "
+        "and ensure a formal writing style. Do not add commentary. Just provide the corrected text."
+    )
+
+    # i.e. "Raw text: {text} -- Please rewrite the above text as one formal, administrative paragraph."
+    user_message = f"متن خام: {text}\n\nلطفاً متن فوق را در یک پاراگراف رسمی و اداری اصلاح کن."
+
+    # We'll call the ChatCompletion API if GPT-4o mini is chat-based.
+    # If it's a completion-based model, the code differs, but let's assume it's chat-based for simplicity.
+    response = openai.ChatCompletion.create(
+        model=OPENAI_MODEL_NAME,
+        messages=[
+            {"role": "system", "content": system_message},
+            {"role": "user", "content": user_message}
+        ],
+        temperature=0.2,
+        max_tokens=1024
+    )
+    corrected = response.choices[0].message.content
+    return corrected.strip()
+
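A portability note on correct_text_with_gpt: openai.ChatCompletion.create is the pre-1.0 interface of the openai package, and on openai>=1.0 the call fails with an error saying that API was removed, which is one plausible source of a runtime error on a freshly built Space. A sketch of the same request with the 1.x client, assuming the same model name and parameters; the _v1 suffix only keeps it distinct from the committed function:

# Sketch, not part of the commit: the same request with the openai>=1.0 client.
from openai import OpenAI

client = OpenAI()  # picks up OPENAI_API_KEY from the environment

def correct_text_with_gpt_v1(text: str, system_message: str, user_message: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
        temperature=0.2,
        max_tokens=1024,
    )
    return response.choices[0].message.content.strip()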
+# --------------- Gradio Interface ---------------
+# We want:
+# - A widget to record or upload audio
+# - A box that shows the transcribed text
+# - A button "اصلاح متن" ("Correct text") that calls GPT-4o mini
+# - A final output box
+
+def process_audio_and_correct(audio):
+    # Step A: convert speech to text
+    raw_text = speech_to_text(audio)
+    # Step B: correct with GPT
+    final_text = correct_text_with_gpt(raw_text)
+    return raw_text, final_text
+
+with gr.Blocks() as demo:
+    # Title: "Persian speech-to-text website + text correction with GPT-4o mini"
+    gr.Markdown("# وبسایت تبدیل گفتار به متن (فارسی) + اصلاح متن با GPT-4o mini")
+    # "Here you can record a Persian voice message or upload an audio file, then receive the extracted and corrected text."
+    gr.Markdown("در اینجا می‌توانید یک وویس فارسی ضبط یا فایل صوتی آپلود کنید. سپس متن استخراج‌شده و اصلاح‌شده را دریافت کنید.")
+
+    with gr.Row():
+        # Label: "Record audio or upload a file (free)"
+        audio_in = gr.Audio(source="microphone", type="numpy", label="ضبط صدا یا آپلود فایل (رایگان)")
+
+    # Label: "Raw text (stage-one output - free)"
+    transcribed_text = gr.Textbox(label="متن خام (خروجی مرحله اول - رایگان)", lines=3)
+    # Label: "Corrected text (with GPT-4o mini - paid)"
+    corrected_text = gr.Textbox(label="متن اصلاح‌شده (با GPT-4o mini - پولی)", lines=3)
+
+    # Button: "Convert and correct"
+    btn_process = gr.Button("تبدیل و اصلاح")
+
+    def audio_workflow(audio):
+        # step 1: stt
+        raw_text = speech_to_text(audio)
+        transcribed_text.update(raw_text)
+        # step 2: correction
+        corrected = correct_text_with_gpt(raw_text)
+        corrected_text.update(corrected)
+        return raw_text, corrected
+
+    btn_process.click(fn=process_audio_and_correct, inputs=audio_in, outputs=[transcribed_text, corrected_text])
+
+demo.launch()
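Finally, the Space's status banner reports a runtime error, and the most suspicious line in this interface is gr.Audio(source="microphone", ...): Gradio 4.x renamed that keyword to sources (a list), so on a 4.x build the old argument is at best ignored and at worst rejected. The nested audio_workflow helper is also never attached to any event, and calling .update() on components like that does not refresh the UI (handlers update outputs through their return values), so it can simply be dropped; btn_process.click already fills both boxes via process_audio_and_correct. A sketch of the widget and wiring under a Gradio 4.x assumption, reusing process_audio_and_correct from the code above; the English labels are translations of the Persian ones:

# Sketch, not part of the commit: Gradio 4.x style audio input and wiring.
import gradio as gr

with gr.Blocks() as demo_v4:
    audio_in = gr.Audio(sources=["microphone", "upload"], type="numpy",
                        label="Record audio or upload a file (free)")
    transcribed_text = gr.Textbox(label="Raw text (stage one, free)", lines=3)
    corrected_text = gr.Textbox(label="Corrected text (GPT-4o mini, paid)", lines=3)
    btn_process = gr.Button("Convert and correct")

    # process_audio_and_correct is the function defined in the diff above
    btn_process.click(fn=process_audio_and_correct,
                      inputs=audio_in,
                      outputs=[transcribed_text, corrected_text])

demo_v4.launch()

Alternatively, pinning an older gradio (3.x) in requirements.txt should keep the original source= keyword working.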