Spaces:
Runtime error
Runtime error
eljanmahammadli
committed on
Commit
·
744d9e3
1
Parent(s):
d904dd4
#feat: added YouTube as RAG input; removed standard humanizer
Browse files
- ai_generate.py +15 -5
- app.py +25 -10
- humanize.py +6 -4
- requirements.txt +3 -1
- youtube.py +67 -0
ai_generate.py
CHANGED
@@ -216,7 +216,7 @@ def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int
|
|
216 |
return llm
|
217 |
|
218 |
|
219 |
-
def create_db_with_langchain(path: list[str], url_content: dict, query: str):
|
220 |
all_docs = []
|
221 |
|
222 |
text_splitter = RecursiveCharacterTextSplitter(
|
@@ -242,6 +242,7 @@ def create_db_with_langchain(path: list[str], url_content: dict, query: str):
|
|
242 |
length_function=len,
|
243 |
add_start_index=False,
|
244 |
)
|
|
|
245 |
if path:
|
246 |
for file in path:
|
247 |
loader = PyMuPDFLoader(file)
|
@@ -249,13 +250,20 @@ def create_db_with_langchain(path: list[str], url_content: dict, query: str):
|
|
249 |
# split it into chunks
|
250 |
docs = text_splitter.split_documents(data)
|
251 |
all_docs.extend(docs)
|
252 |
-
|
253 |
if url_content:
|
254 |
for url, content in url_content.items():
|
255 |
doc = Document(page_content=content, metadata={"source": url})
|
256 |
# split it into chunks
|
257 |
docs = text_splitter.split_documents([doc])
|
258 |
all_docs.extend(docs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
259 |
|
260 |
print(f"### Total number of documents before bm25s: {len(all_docs)}")
|
261 |
|
@@ -298,6 +306,7 @@ def generate_rag(
|
|
298 |
max_length: int = 2048,
|
299 |
api_key: str = "",
|
300 |
sys_message="",
|
|
|
301 |
):
|
302 |
llm = load_llm(model, api_key, temperature, max_length)
|
303 |
if llm is None:
|
@@ -306,7 +315,7 @@ def generate_rag(
|
|
306 |
|
307 |
query = llm_wrapper(input_role, topic, context, model="OpenAI GPT 4o", task_type="rag", temperature=0.7)
|
308 |
print("### Query: ", query)
|
309 |
-
db, bm25_retriever = create_db_with_langchain(path, url_content, query)
|
310 |
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K, "lambda_mult": 0.75})
|
311 |
t0 = time.time()
|
312 |
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, retriever], weights=[0.4, 0.6])
|
@@ -354,10 +363,11 @@ def generate(
|
|
354 |
max_length: int = 2048,
|
355 |
api_key: str = "",
|
356 |
sys_message="",
|
|
|
357 |
):
|
358 |
-
if path or url_content:
|
359 |
return generate_rag(
|
360 |
-
prompt, input_role, topic, context, model, url_content, path, temperature, max_length, api_key, sys_message
|
361 |
)
|
362 |
else:
|
363 |
return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
|
|
|
216 |
return llm
|
217 |
|
218 |
|
219 |
+
def create_db_with_langchain(path: list[str], url_content: dict, yt_content: dict, query: str):
|
220 |
all_docs = []
|
221 |
|
222 |
text_splitter = RecursiveCharacterTextSplitter(
|
|
|
242 |
length_function=len,
|
243 |
add_start_index=False,
|
244 |
)
|
245 |
+
# PDF
|
246 |
if path:
|
247 |
for file in path:
|
248 |
loader = PyMuPDFLoader(file)
|
|
|
250 |
# split it into chunks
|
251 |
docs = text_splitter.split_documents(data)
|
252 |
all_docs.extend(docs)
|
253 |
+
# Internet Search
|
254 |
if url_content:
|
255 |
for url, content in url_content.items():
|
256 |
doc = Document(page_content=content, metadata={"source": url})
|
257 |
# split it into chunks
|
258 |
docs = text_splitter.split_documents([doc])
|
259 |
all_docs.extend(docs)
|
260 |
+
# YouTube Transcriptions
|
261 |
+
if yt_content:
|
262 |
+
for yt_url, content in yt_content.items():
|
263 |
+
doc = Document(page_content=content, metadata={"source": yt_url})
|
264 |
+
# split it into chunks
|
265 |
+
docs = text_splitter.split_documents([doc])
|
266 |
+
all_docs.extend(docs)
|
267 |
|
268 |
print(f"### Total number of documents before bm25s: {len(all_docs)}")
|
269 |
|
|
|
306 |
max_length: int = 2048,
|
307 |
api_key: str = "",
|
308 |
sys_message="",
|
309 |
+
yt_content=None,
|
310 |
):
|
311 |
llm = load_llm(model, api_key, temperature, max_length)
|
312 |
if llm is None:
|
|
|
315 |
|
316 |
query = llm_wrapper(input_role, topic, context, model="OpenAI GPT 4o", task_type="rag", temperature=0.7)
|
317 |
print("### Query: ", query)
|
318 |
+
db, bm25_retriever = create_db_with_langchain(path, url_content, yt_content, query)
|
319 |
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K, "lambda_mult": 0.75})
|
320 |
t0 = time.time()
|
321 |
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, retriever], weights=[0.4, 0.6])
|
|
|
363 |
max_length: int = 2048,
|
364 |
api_key: str = "",
|
365 |
sys_message="",
|
366 |
+
yt_content=None,
|
367 |
):
|
368 |
+
if path or url_content or yt_content:
|
369 |
return generate_rag(
|
370 |
+
prompt, input_role, topic, context, model, url_content, path, temperature, max_length, api_key, sys_message, yt_content
|
371 |
)
|
372 |
else:
|
373 |
return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
|
app.py
CHANGED
@@ -22,9 +22,12 @@ from google.cloud import storage
|
|
22 |
|
23 |
if gr.NO_RELOAD:
|
24 |
from humanize import humanize_text, device
|
|
|
|
|
25 |
from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
|
26 |
from google_search import google_search, months, domain_list, build_date
|
27 |
from ai_generate import generate, citations_to_html, remove_citations, display_cited_text, llm_wrapper
|
|
|
28 |
|
29 |
# nltk.download("punkt_tab")
|
30 |
|
@@ -566,6 +569,7 @@ def generate_article(
|
|
566 |
pdf_file_input: list[str] = None,
|
567 |
generated_article: str = None,
|
568 |
user_comments: str = None,
|
|
|
569 |
) -> str:
|
570 |
settings = {
|
571 |
"role": input_role,
|
@@ -605,6 +609,7 @@ def generate_article(
|
|
605 |
max_length=2048,
|
606 |
api_key=api_key,
|
607 |
sys_message="",
|
|
|
608 |
)
|
609 |
return article, citations
|
610 |
|
@@ -689,13 +694,6 @@ def update_structure(format_choice):
|
|
689 |
return gr.update(value="Introduction, Body, Conclusion", interactive=True)
|
690 |
|
691 |
|
692 |
-
def update_temperature(model_dropdown):
|
693 |
-
if model_dropdown == "Standard Model":
|
694 |
-
return gr.update(value=1.2, interactive=True)
|
695 |
-
elif model_dropdown == "Advanced Model (Beta)":
|
696 |
-
return gr.update(value=1.0, interactive=True)
|
697 |
-
|
698 |
-
|
699 |
# Initialize Google Cloud Storage client
|
700 |
client = storage.Client()
|
701 |
bucket_name = "ai-source-detection"
|
@@ -820,6 +818,7 @@ def generate_and_format(
|
|
820 |
exclude_sites,
|
821 |
pdf_file_input,
|
822 |
history=None,
|
|
|
823 |
ai_model="OpenAI GPT 4o",
|
824 |
api_key=None,
|
825 |
generated_article: str = None,
|
@@ -827,6 +826,7 @@ def generate_and_format(
|
|
827 |
):
|
828 |
url_content = None
|
829 |
if google_search_check:
|
|
|
830 |
date_from = build_date(year_from, month_from, day_from)
|
831 |
date_to = build_date(year_to, month_to, day_to)
|
832 |
sorted_date = f"date:r:{date_from}:{date_to}"
|
@@ -841,6 +841,14 @@ def generate_and_format(
|
|
841 |
final_query += " " + " ".join(exclude_queries)
|
842 |
print(f"Google Search Query: {final_query}")
|
843 |
url_content = google_search(final_query, sorted_date, domains_to_include)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
844 |
# topic_context = topic + ", " + context
|
845 |
article, citations = generate_article(
|
846 |
input_role,
|
@@ -863,6 +871,7 @@ def generate_and_format(
|
|
863 |
pdf_file_input,
|
864 |
generated_article,
|
865 |
user_comments,
|
|
|
866 |
)
|
867 |
# if ends_with_references(article) and url_content is not None:
|
868 |
# for url in url_content.keys():
|
@@ -1103,6 +1112,12 @@ with gr.Blocks(
|
|
1103 |
|
1104 |
gr.Markdown("# Add Optional PDF Files with Information", elem_classes="text-center text-3xl mb-6")
|
1105 |
pdf_file_input = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])
|
|
|
|
|
|
|
|
|
|
|
|
|
1106 |
"""
|
1107 |
# NOTE: HIDE AI MODEL SELECTION
|
1108 |
with gr.Group():
|
@@ -1150,13 +1165,13 @@ with gr.Blocks(
|
|
1150 |
with gr.Accordion("Advanced Humanizer Settings", open=False):
|
1151 |
with gr.Row():
|
1152 |
model_dropdown = gr.Radio(
|
1153 |
-
choices=["
|
1154 |
value="Advanced Model (Beta)",
|
1155 |
label="Humanizer Model Version",
|
1156 |
)
|
1157 |
with gr.Row():
|
1158 |
temperature_slider = gr.Slider(
|
1159 |
-
minimum=0.
|
1160 |
)
|
1161 |
top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")
|
1162 |
with gr.Row():
|
@@ -1213,7 +1228,6 @@ with gr.Blocks(
|
|
1213 |
# Update the default structure based on the selected format
|
1214 |
# e.g. "Plain Text" for certain formats
|
1215 |
input_format.change(fn=update_structure, inputs=input_format, outputs=input_structure)
|
1216 |
-
model_dropdown.change(fn=update_temperature, inputs=model_dropdown, outputs=temperature_slider)
|
1217 |
report_humanized_btn.click(
|
1218 |
save_humanizer_feedback_to_cloud_storage, inputs=[latest_humanizer_data, humanizer_feedback]
|
1219 |
)
|
@@ -1249,6 +1263,7 @@ with gr.Blocks(
|
|
1249 |
exclude_sites,
|
1250 |
pdf_file_input,
|
1251 |
history,
|
|
|
1252 |
],
|
1253 |
outputs=[output_article, history],
|
1254 |
)
|
|
|
22 |
|
23 |
if gr.NO_RELOAD:
|
24 |
from humanize import humanize_text, device
|
25 |
+
# humanize_text = None
|
26 |
+
# device = None
|
27 |
from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
|
28 |
from google_search import google_search, months, domain_list, build_date
|
29 |
from ai_generate import generate, citations_to_html, remove_citations, display_cited_text, llm_wrapper
|
30 |
+
from youtube import transcribe
|
31 |
|
32 |
# nltk.download("punkt_tab")
|
33 |
|
|
|
569 |
pdf_file_input: list[str] = None,
|
570 |
generated_article: str = None,
|
571 |
user_comments: str = None,
|
572 |
+
yt_content: str = None,
|
573 |
) -> str:
|
574 |
settings = {
|
575 |
"role": input_role,
|
|
|
609 |
max_length=2048,
|
610 |
api_key=api_key,
|
611 |
sys_message="",
|
612 |
+
yt_content=yt_content,
|
613 |
)
|
614 |
return article, citations
|
615 |
|
|
|
694 |
return gr.update(value="Introduction, Body, Conclusion", interactive=True)
|
695 |
|
696 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
697 |
# Initialize Google Cloud Storage client
|
698 |
client = storage.Client()
|
699 |
bucket_name = "ai-source-detection"
|
|
|
818 |
exclude_sites,
|
819 |
pdf_file_input,
|
820 |
history=None,
|
821 |
+
yt_url: str = None,
|
822 |
ai_model="OpenAI GPT 4o",
|
823 |
api_key=None,
|
824 |
generated_article: str = None,
|
|
|
826 |
):
|
827 |
url_content = None
|
828 |
if google_search_check:
|
829 |
+
gr.Info("Searching internet for relevant content...")
|
830 |
date_from = build_date(year_from, month_from, day_from)
|
831 |
date_to = build_date(year_to, month_to, day_to)
|
832 |
sorted_date = f"date:r:{date_from}:{date_to}"
|
|
|
841 |
final_query += " " + " ".join(exclude_queries)
|
842 |
print(f"Google Search Query: {final_query}")
|
843 |
url_content = google_search(final_query, sorted_date, domains_to_include)
|
844 |
+
|
845 |
+
yt_content = {}
|
846 |
+
if yt_url:
|
847 |
+
gr.Info("Transcribing YouTube video...")
|
848 |
+
transcribed_text = transcribe(yt_url)
|
849 |
+
gr.Info("Transcription completed. Generating article...")
|
850 |
+
yt_content[yt_url] = transcribed_text
|
851 |
+
|
852 |
# topic_context = topic + ", " + context
|
853 |
article, citations = generate_article(
|
854 |
input_role,
|
|
|
871 |
pdf_file_input,
|
872 |
generated_article,
|
873 |
user_comments,
|
874 |
+
yt_content,
|
875 |
)
|
876 |
# if ends_with_references(article) and url_content is not None:
|
877 |
# for url in url_content.keys():
|
|
|
1112 |
|
1113 |
gr.Markdown("# Add Optional PDF Files with Information", elem_classes="text-center text-3xl mb-6")
|
1114 |
pdf_file_input = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])
|
1115 |
+
gr.Markdown("# Add Youtube Video Link", elem_classes="text-center text-3xl mb-6")
|
1116 |
+
yt_url = gr.Textbox(
|
1117 |
+
label="Youtube Video Link",
|
1118 |
+
placeholder="Enter the link of the video",
|
1119 |
+
elem_classes="input-highlight-pink",
|
1120 |
+
)
|
1121 |
"""
|
1122 |
# NOTE: HIDE AI MODEL SELECTION
|
1123 |
with gr.Group():
|
|
|
1165 |
with gr.Accordion("Advanced Humanizer Settings", open=False):
|
1166 |
with gr.Row():
|
1167 |
model_dropdown = gr.Radio(
|
1168 |
+
choices=["Advanced Model (Beta)"],
|
1169 |
value="Advanced Model (Beta)",
|
1170 |
label="Humanizer Model Version",
|
1171 |
)
|
1172 |
with gr.Row():
|
1173 |
temperature_slider = gr.Slider(
|
1174 |
+
minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature"
|
1175 |
)
|
1176 |
top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")
|
1177 |
with gr.Row():
|
|
|
1228 |
# Update the default structure based on the selected format
|
1229 |
# e.g. "Plain Text" for certain formats
|
1230 |
input_format.change(fn=update_structure, inputs=input_format, outputs=input_structure)
|
|
|
1231 |
report_humanized_btn.click(
|
1232 |
save_humanizer_feedback_to_cloud_storage, inputs=[latest_humanizer_data, humanizer_feedback]
|
1233 |
)
|
|
|
1263 |
exclude_sites,
|
1264 |
pdf_file_input,
|
1265 |
history,
|
1266 |
+
yt_url,
|
1267 |
],
|
1268 |
outputs=[output_article, history],
|
1269 |
)
|
humanize.py
CHANGED
@@ -25,10 +25,12 @@ else:
|
|
25 |
|
26 |
# ----------------------------
|
27 |
# load encoder-decoder (sequence to sequence) language model
|
28 |
-
seq2seq = "polygraf-ai/poly-humanizer-XL-merged-v2"
|
29 |
-
seq2seq_model = T5ForConditionalGeneration.from_pretrained(seq2seq, torch_dtype=torch.bfloat16).to(device)
|
30 |
-
seq2seq_tokenizer = T5Tokenizer.from_pretrained(seq2seq)
|
31 |
-
print(f"Loaded model: {seq2seq}, Num. params: {seq2seq_model.num_parameters()}")
|
|
|
|
|
32 |
# ----------------------------
|
33 |
# load decoder-only (causal) language model
|
34 |
from unsloth import FastLanguageModel
|
|
|
25 |
|
26 |
# ----------------------------
|
27 |
# load encoder-decoder (sequence to sequence) language model
|
28 |
+
# seq2seq = "polygraf-ai/poly-humanizer-XL-merged-v2"
|
29 |
+
# seq2seq_model = T5ForConditionalGeneration.from_pretrained(seq2seq, torch_dtype=torch.bfloat16).to(device)
|
30 |
+
# seq2seq_tokenizer = T5Tokenizer.from_pretrained(seq2seq)
|
31 |
+
# print(f"Loaded model: {seq2seq}, Num. params: {seq2seq_model.num_parameters()}")
|
32 |
+
seq2seq_model = None
|
33 |
+
seq2seq_tokenizer = None
|
34 |
# ----------------------------
|
35 |
# load decoder-only (causal) language model
|
36 |
from unsloth import FastLanguageModel
|
requirements.txt
CHANGED
@@ -26,4 +26,6 @@ langchain-openai
|
|
26 |
vertexai
|
27 |
html2text
|
28 |
bm25s
|
29 |
-
unsloth
|
|
|
|
|
|
26 |
vertexai
|
27 |
html2text
|
28 |
bm25s
|
29 |
+
unsloth
|
30 |
+
trafilatura
|
31 |
+
yt-dlp
|
youtube.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gc
|
2 |
+
import torch
|
3 |
+
import yt_dlp as youtube_dl
|
4 |
+
from transformers import pipeline
|
5 |
+
from transformers.pipelines.audio_utils import ffmpeg_read
|
6 |
+
import tempfile
|
7 |
+
import os
|
8 |
+
from time import monotonic
|
9 |
+
|
10 |
+
# Speech-recognition checkpoint loaded by the pipeline below.
MODEL_NAME = "openai/whisper-large-v3"
# Batch size passed to the pipeline on every transcription call.
BATCH_SIZE = 8
YT_LENGTH_LIMIT_S = 5400  # limit to 1.5 hour YouTube files

# Keep using the second GPU (index 1) when it exists, but fall back to the
# first GPU on single-GPU hosts, where "cuda:1" is not a valid device.
# NOTE(review): index 1 was presumably chosen to leave GPU 0 to other models
# in the app - confirm.
if torch.cuda.is_available():
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
else:
    device = "cpu"

# The model is loaded once, at import time, and shared by every transcribe() call.
# NOTE(review): float16 is also requested when running on CPU - confirm that
# half precision is intended (and supported) on the CPU fallback path.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    torch_dtype=torch.float16,
    chunk_length_s=30,
    device=device,
    generate_kwargs={"language": "english"},
)
|
24 |
+
|
25 |
+
def download_yt_audio(yt_url, filename, time_limit_s=YT_LENGTH_LIMIT_S):
    """Download the best available audio stream of a YouTube video.

    The video's metadata is fetched first so that an over-long video is
    rejected before any media is downloaded.

    Args:
        yt_url: URL of the video to fetch.
        filename: Output path handed to yt-dlp as its ``outtmpl``.
        time_limit_s: Maximum accepted video duration, in seconds.

    Raises:
        ValueError: If the metadata lookup fails, the duration is missing,
            the video is longer than ``time_limit_s``, or the download fails.
    """
    # Metadata-only lookup (download=False); the context manager closes the
    # YoutubeDL instance once the lookup is done.
    with youtube_dl.YoutubeDL() as info_loader:
        try:
            info = info_loader.extract_info(yt_url, download=False)
        except youtube_dl.utils.DownloadError as err:
            raise ValueError(f"Error downloading video: {str(err)}") from err

    # The metadata may lack a usable duration; fail with a clear ValueError
    # rather than a bare KeyError/TypeError in the comparison below.
    file_length = info.get("duration") if info else None
    if file_length is None:
        raise ValueError("Could not determine the length of the video.")

    if file_length > time_limit_s:
        # Report fractional hours: the default limit of 5400 s is 1.5 hours,
        # which integer division would misreport as 1.
        raise ValueError(f"Video is too long. Maximum allowed length is {time_limit_s / 3600:g} hour(s).")

    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}  # Only download the best available audio format

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
            # yt-dlp reports download failures as DownloadError, so catching
            # ExtractorError alone would let them escape unwrapped.
            raise ValueError(f"Error extracting audio: {str(err)}") from err
|
45 |
+
|
46 |
+
def transcribe(yt_url, time_limit_s=YT_LENGTH_LIMIT_S):
    """Download a YouTube video's audio and return its transcription.

    Args:
        yt_url: URL of the video to transcribe.
        time_limit_s: Maximum accepted video duration, in seconds; forwarded
            to ``download_yt_audio``.

    Returns:
        The ``"text"`` field of the speech-recognition pipeline's output.

    Raises:
        ValueError: Propagated from ``download_yt_audio`` when the video
            cannot be fetched or exceeds the duration limit.
    """
    # The download lives in a temporary directory; only the raw bytes are
    # kept once the directory is removed.
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        t0 = monotonic()
        download_yt_audio(yt_url, filepath, time_limit_s)
        t1 = monotonic()
        print(f"Downloaded video in {t1 - t0:.2f} seconds.")

        with open(filepath, "rb") as f:
            raw_audio = f.read()

    sampling_rate = pipe.feature_extractor.sampling_rate
    try:
        # Decode the downloaded container at the sampling rate the pipeline's
        # feature extractor reports.
        samples = ffmpeg_read(raw_audio, sampling_rate)
        inputs = {"array": samples, "sampling_rate": sampling_rate}
        t0 = monotonic()
        text = pipe(inputs, batch_size=BATCH_SIZE)["text"]
        t1 = monotonic()
        print(f"Transcribed video in {t1 - t0:.2f} seconds.")
    finally:
        # Clean up even when decoding or inference fails. Collect garbage
        # first so that memory it frees can be released by empty_cache().
        gc.collect()
        torch.cuda.empty_cache()

    return text
|