Spaces:
Running
on
T4
Running
on
T4
gabrielchua
committed on
Commit
·
dc06293
1
Parent(s):
1573ac4
add jina, language support
Browse files
app.py
CHANGED
@@ -13,13 +13,15 @@ from typing import List, Literal, Tuple, Optional
|
|
13 |
# Third-party imports
|
14 |
import gradio as gr
|
15 |
from loguru import logger
|
|
|
16 |
from pydantic import BaseModel
|
17 |
from pypdf import PdfReader
|
18 |
from pydub import AudioSegment
|
19 |
|
20 |
# Local imports
|
21 |
from prompts import SYSTEM_PROMPT
|
22 |
-
from utils import generate_script, generate_audio
|
|
|
23 |
|
24 |
class DialogueItem(BaseModel):
|
25 |
"""A single dialogue item."""
|
@@ -36,24 +38,55 @@ class Dialogue(BaseModel):
|
|
36 |
dialogue: List[DialogueItem]
|
37 |
|
38 |
|
39 |
-
def generate_podcast(
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
if len(text) > 100000:
|
55 |
-
raise gr.Error("The
|
56 |
-
|
57 |
# Modify the system prompt based on the chosen tone and length
|
58 |
modified_system_prompt = SYSTEM_PROMPT
|
59 |
if tone:
|
@@ -64,6 +97,8 @@ def generate_podcast(file: str, tone: Optional[str] = None, length: Optional[str
|
|
64 |
"Medium (3-5 min)": "Aim for a moderate length, about 3-5 minutes.",
|
65 |
}
|
66 |
modified_system_prompt += f"\n\nLENGTH: {length_instructions[length]}"
|
|
|
|
|
67 |
|
68 |
# Call the LLM
|
69 |
llm_output = generate_script(modified_system_prompt, text, Dialogue)
|
@@ -71,7 +106,7 @@ def generate_podcast(file: str, tone: Optional[str] = None, length: Optional[str
|
|
71 |
|
72 |
# Process the dialogue
|
73 |
audio_segments = []
|
74 |
-
transcript = ""
|
75 |
total_characters = 0
|
76 |
|
77 |
for line in llm_output.dialogue:
|
@@ -84,7 +119,7 @@ def generate_podcast(file: str, tone: Optional[str] = None, length: Optional[str
|
|
84 |
total_characters += len(line.text)
|
85 |
|
86 |
# Get audio file path
|
87 |
-
audio_file_path = generate_audio(line.text, line.speaker)
|
88 |
# Read the audio file into an AudioSegment
|
89 |
audio_segment = AudioSegment.from_file(audio_file_path)
|
90 |
audio_segments.append(audio_segment)
|
@@ -115,30 +150,40 @@ def generate_podcast(file: str, tone: Optional[str] = None, length: Optional[str
|
|
115 |
|
116 |
demo = gr.Interface(
|
117 |
title="Open NotebookLM",
|
118 |
-
description="Convert your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS). \n \n Note: Only the text content of the
|
119 |
fn=generate_podcast,
|
120 |
inputs=[
|
121 |
gr.File(
|
122 |
-
label="PDF",
|
123 |
-
file_types=[".pdf"
|
|
|
|
|
|
|
|
|
|
|
124 |
),
|
125 |
gr.Radio(
|
126 |
choices=["Fun", "Formal"],
|
127 |
-
label="
|
128 |
-
value="
|
129 |
),
|
130 |
gr.Radio(
|
131 |
choices=["Short (1-2 min)", "Medium (3-5 min)"],
|
132 |
-
label="
|
133 |
value="Medium (3-5 min)"
|
134 |
),
|
|
|
|
|
|
|
|
|
|
|
135 |
],
|
136 |
outputs=[
|
137 |
gr.Audio(label="Audio", format="mp3"),
|
138 |
gr.Markdown(label="Transcript"),
|
139 |
],
|
140 |
allow_flagging="never",
|
141 |
-
api_name="generate_podcast",
|
142 |
theme=gr.themes.Soft(),
|
143 |
concurrency_limit=3
|
144 |
)
|
|
|
13 |
# Third-party imports
|
14 |
import gradio as gr
|
15 |
from loguru import logger
|
16 |
+
from openai import OpenAI
|
17 |
from pydantic import BaseModel
|
18 |
from pypdf import PdfReader
|
19 |
from pydub import AudioSegment
|
20 |
|
21 |
# Local imports
|
22 |
from prompts import SYSTEM_PROMPT
|
23 |
+
from utils import generate_script, generate_audio, parse_url
|
24 |
+
|
25 |
|
26 |
class DialogueItem(BaseModel):
|
27 |
"""A single dialogue item."""
|
|
|
38 |
dialogue: List[DialogueItem]
|
39 |
|
40 |
|
41 |
+
def generate_podcast(
|
42 |
+
files: List[str],
|
43 |
+
url: Optional[str],
|
44 |
+
tone: Optional[str],
|
45 |
+
length: Optional[str],
|
46 |
+
language: str
|
47 |
+
) -> Tuple[str, str]:
|
48 |
+
"""Generate the audio and transcript from the PDFs and/or URL."""
|
49 |
+
text = ""
|
50 |
+
|
51 |
+
# Change language to the appropriate code
|
52 |
+
language_mapping = {
|
53 |
+
"English": "EN",
|
54 |
+
"Spanish": "ES",
|
55 |
+
"French": "FR",
|
56 |
+
"Chinese": "ZH",
|
57 |
+
"Japanese": "JP",
|
58 |
+
"Korean": "KR",
|
59 |
+
}
|
60 |
+
|
61 |
+
# Check if at least one input is provided
|
62 |
+
if not files and not url:
|
63 |
+
raise gr.Error("Please provide at least one PDF file or a URL.")
|
64 |
+
|
65 |
+
# Process PDFs if any
|
66 |
+
if files:
|
67 |
+
for file in files:
|
68 |
+
if not file.lower().endswith('.pdf'):
|
69 |
+
raise gr.Error(f"File {file} is not a PDF. Please upload only PDF files.")
|
70 |
+
|
71 |
+
try:
|
72 |
+
with Path(file).open("rb") as f:
|
73 |
+
reader = PdfReader(f)
|
74 |
+
text += "\n\n".join([page.extract_text() for page in reader.pages])
|
75 |
+
except Exception as e:
|
76 |
+
raise gr.Error(f"Error reading the PDF file {file}: {str(e)}")
|
77 |
+
|
78 |
+
# Process URL if provided
|
79 |
+
if url:
|
80 |
+
try:
|
81 |
+
url_text = parse_url(url)
|
82 |
+
text += "\n\n" + url_text
|
83 |
+
except ValueError as e:
|
84 |
+
raise gr.Error(str(e))
|
85 |
+
|
86 |
+
# Check total character count
|
87 |
if len(text) > 100000:
|
88 |
+
raise gr.Error("The total content is too long. Please ensure the combined text from PDFs and URL is fewer than ~100,000 characters.")
|
89 |
+
|
90 |
# Modify the system prompt based on the chosen tone and length
|
91 |
modified_system_prompt = SYSTEM_PROMPT
|
92 |
if tone:
|
|
|
97 |
"Medium (3-5 min)": "Aim for a moderate length, about 3-5 minutes.",
|
98 |
}
|
99 |
modified_system_prompt += f"\n\nLENGTH: {length_instructions[length]}"
|
100 |
+
if language:
|
101 |
+
modified_system_prompt += f"\n\nOUTPUT LANGUAGE <IMPORTANT>: The the podcast should be {language}."
|
102 |
|
103 |
# Call the LLM
|
104 |
llm_output = generate_script(modified_system_prompt, text, Dialogue)
|
|
|
106 |
|
107 |
# Process the dialogue
|
108 |
audio_segments = []
|
109 |
+
transcript = ""
|
110 |
total_characters = 0
|
111 |
|
112 |
for line in llm_output.dialogue:
|
|
|
119 |
total_characters += len(line.text)
|
120 |
|
121 |
# Get audio file path
|
122 |
+
audio_file_path = generate_audio(line.text, line.speaker, language_mapping[language])
|
123 |
# Read the audio file into an AudioSegment
|
124 |
audio_segment = AudioSegment.from_file(audio_file_path)
|
125 |
audio_segments.append(audio_segment)
|
|
|
150 |
|
151 |
# Gradio front end for generate_podcast. The five input widgets are listed in
# the same order as the function's positional parameters:
# files, url, tone, length, language.
# NOTE(review): the glyphs right after the numbering in the labels below look
# mis-encoded (probably emoji damaged in transit) — confirm against the repo.
demo = gr.Interface(
    title="Open NotebookLM",
    description="Convert your PDFs into podcasts with open-source AI models (Llama 3.1 405B and MeloTTS). \n \n Note: Only the text content of the PDFs will be processed. Images and tables are not included. The total content should be no more than 100,000 characters due to the context length of Llama 3.1 405B.",
    fn=generate_podcast,
    inputs=[
        # Source material: any number of PDFs and/or a single web page.
        gr.File(label="1. π Upload your PDF(s)", file_types=[".pdf"], file_count="multiple"),
        gr.Textbox(label="2. π Paste a URL (optional)", placeholder="Enter a URL to include its content"),
        # Style controls that are appended to the system prompt.
        gr.Radio(choices=["Fun", "Formal"], label="3. π Choose the tone", value="Fun"),
        gr.Radio(
            choices=["Short (1-2 min)", "Medium (3-5 min)"],
            label="4. β±οΈ Choose the length",
            value="Medium (3-5 min)",
        ),
        # Output language; the choices match the keys of language_mapping
        # inside generate_podcast.
        gr.Dropdown(
            choices=["English", "Spanish", "French", "Chinese", "Japanese", "Korean"],
            value="English",
            label="5. π Choose the language (Highly experimental, English is recommended)",
        ),
    ],
    outputs=[
        gr.Audio(label="Audio", format="mp3"),
        gr.Markdown(label="Transcript"),
    ],
    allow_flagging="never",
    api_name="generate_podcast",
    theme=gr.themes.Soft(),
    concurrency_limit=3,
)
|
utils.py
CHANGED
@@ -8,12 +8,14 @@ Functions:
|
|
8 |
"""
|
9 |
|
10 |
import os
|
|
|
11 |
|
12 |
from gradio_client import Client
|
13 |
from openai import OpenAI
|
14 |
from pydantic import ValidationError
|
15 |
|
16 |
MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
|
|
|
17 |
|
18 |
client = OpenAI(
|
19 |
base_url="https://api.fireworks.ai/inference/v1",
|
@@ -59,15 +61,26 @@ def call_llm(system_prompt: str, text: str, dialogue_format):
|
|
59 |
return response
|
60 |
|
61 |
|
62 |
-
def
|
63 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
if speaker == "Guest":
|
65 |
-
accent = "EN-US"
|
66 |
speed = 0.9
|
67 |
else: # host
|
68 |
-
accent = "EN-Default"
|
69 |
speed = 1
|
|
|
|
|
|
|
|
|
70 |
result = hf_client.predict(
|
71 |
-
text=text, language=
|
72 |
)
|
73 |
return result
|
|
|
8 |
"""
|
9 |
|
10 |
import os
|
11 |
+
import requests
|
12 |
|
13 |
from gradio_client import Client
|
14 |
from openai import OpenAI
|
15 |
from pydantic import ValidationError
|
16 |
|
17 |
MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
|
18 |
+
JINA_URL = "https://r.jina.ai/"
|
19 |
|
20 |
client = OpenAI(
|
21 |
base_url="https://api.fireworks.ai/inference/v1",
|
|
|
61 |
return response
|
62 |
|
63 |
|
64 |
+
def parse_url(url: str) -> str:
    """Fetch the text content of a web page through the reader service at JINA_URL.

    The target address is appended to ``JINA_URL`` and requested with a
    60-second timeout.

    Args:
        url: Address of the page whose content should be retrieved.

    Returns:
        The body of the reader service's response, as text.

    Raises:
        ValueError: If ``url`` is blank, the request fails (connection error,
            timeout, ...), or the service answers with an HTTP error status.
            Callers that surface errors to the user catch ``ValueError``, so
            transport failures are translated into it instead of leaking
            ``requests`` exceptions.
    """
    target = url.strip()
    if not target:
        raise ValueError("Please provide a non-empty URL.")

    full_url = f"{JINA_URL}{target}"
    try:
        response = requests.get(full_url, timeout=60)
        # Without this check an error page (404, 5xx, ...) would be returned
        # as if it were the document's text.
        response.raise_for_status()
    except requests.RequestException as err:
        raise ValueError(f"Unable to retrieve content from URL {target}: {err}") from err
    return response.text
|
69 |
+
|
70 |
+
|
71 |
+
def generate_audio(text: str, speaker: str, language: str) -> str:
    """Synthesize one line of dialogue with the TTS model hosted on HF Spaces.

    The voice and speaking speed depend on the speaker and the language:

    * English ("EN"): the guest uses the "EN-US" voice, the host "EN-Default".
    * Any other language: both speakers use the voice named after the
      language code itself.
    * Speed is 0.9 for the guest, 1 for an English host and 1.1 for a
      non-English host.

    Args:
        text: The text to speak.
        speaker: Dialogue role. "Guest" selects the guest settings; any other
            value is treated as the host.
        language: Language code forwarded to the TTS endpoint (e.g. "EN").

    Returns:
        The result of the ``/synthesize`` endpoint call.
        NOTE(review): callers hand this value to ``AudioSegment.from_file``,
        so it is presumably a path to the generated audio file rather than
        raw bytes — confirm against the gradio_client version in use.
    """
    is_guest = speaker == "Guest"
    is_english = language == "EN"

    # Only English offers distinct voices per speaker; other languages share
    # the single voice identified by the language code.
    if is_english:
        accent = "EN-US" if is_guest else "EN-Default"
    else:
        accent = language

    if is_guest:
        speed = 0.9
    elif is_english:
        speed = 1
    else:
        speed = 1.1

    # Generate audio
    return hf_client.predict(
        text=text, language=language, speaker=accent, speed=speed, api_name="/synthesize"
    )
|