eljanmahammadli committed on
Commit
744d9e3
·
1 Parent(s): d904dd4

#feat: added YouTube as RAG input; removed standard humanizer

Browse files
Files changed (5) hide show
  1. ai_generate.py +15 -5
  2. app.py +25 -10
  3. humanize.py +6 -4
  4. requirements.txt +3 -1
  5. youtube.py +67 -0
ai_generate.py CHANGED
@@ -216,7 +216,7 @@ def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int
216
  return llm
217
 
218
 
219
- def create_db_with_langchain(path: list[str], url_content: dict, query: str):
220
  all_docs = []
221
 
222
  text_splitter = RecursiveCharacterTextSplitter(
@@ -242,6 +242,7 @@ def create_db_with_langchain(path: list[str], url_content: dict, query: str):
242
  length_function=len,
243
  add_start_index=False,
244
  )
 
245
  if path:
246
  for file in path:
247
  loader = PyMuPDFLoader(file)
@@ -249,13 +250,20 @@ def create_db_with_langchain(path: list[str], url_content: dict, query: str):
249
  # split it into chunks
250
  docs = text_splitter.split_documents(data)
251
  all_docs.extend(docs)
252
-
253
  if url_content:
254
  for url, content in url_content.items():
255
  doc = Document(page_content=content, metadata={"source": url})
256
  # split it into chunks
257
  docs = text_splitter.split_documents([doc])
258
  all_docs.extend(docs)
 
 
 
 
 
 
 
259
 
260
  print(f"### Total number of documents before bm25s: {len(all_docs)}")
261
 
@@ -298,6 +306,7 @@ def generate_rag(
298
  max_length: int = 2048,
299
  api_key: str = "",
300
  sys_message="",
 
301
  ):
302
  llm = load_llm(model, api_key, temperature, max_length)
303
  if llm is None:
@@ -306,7 +315,7 @@ def generate_rag(
306
 
307
  query = llm_wrapper(input_role, topic, context, model="OpenAI GPT 4o", task_type="rag", temperature=0.7)
308
  print("### Query: ", query)
309
- db, bm25_retriever = create_db_with_langchain(path, url_content, query)
310
  retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K, "lambda_mult": 0.75})
311
  t0 = time.time()
312
  ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, retriever], weights=[0.4, 0.6])
@@ -354,10 +363,11 @@ def generate(
354
  max_length: int = 2048,
355
  api_key: str = "",
356
  sys_message="",
 
357
  ):
358
- if path or url_content:
359
  return generate_rag(
360
- prompt, input_role, topic, context, model, url_content, path, temperature, max_length, api_key, sys_message
361
  )
362
  else:
363
  return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
 
216
  return llm
217
 
218
 
219
+ def create_db_with_langchain(path: list[str], url_content: dict, yt_content: dict, query: str):
220
  all_docs = []
221
 
222
  text_splitter = RecursiveCharacterTextSplitter(
 
242
  length_function=len,
243
  add_start_index=False,
244
  )
245
+ # PDF
246
  if path:
247
  for file in path:
248
  loader = PyMuPDFLoader(file)
 
250
  # split it into chunks
251
  docs = text_splitter.split_documents(data)
252
  all_docs.extend(docs)
253
+ # Internet Search
254
  if url_content:
255
  for url, content in url_content.items():
256
  doc = Document(page_content=content, metadata={"source": url})
257
  # split it into chunks
258
  docs = text_splitter.split_documents([doc])
259
  all_docs.extend(docs)
260
+ # YouTube Transcriptions
261
+ if yt_content:
262
+ for yt_url, content in yt_content.items():
263
+ doc = Document(page_content=content, metadata={"source": yt_url})
264
+ # split it into chunks
265
+ docs = text_splitter.split_documents([doc])
266
+ all_docs.extend(docs)
267
 
268
  print(f"### Total number of documents before bm25s: {len(all_docs)}")
269
 
 
306
  max_length: int = 2048,
307
  api_key: str = "",
308
  sys_message="",
309
+ yt_content=None,
310
  ):
311
  llm = load_llm(model, api_key, temperature, max_length)
312
  if llm is None:
 
315
 
316
  query = llm_wrapper(input_role, topic, context, model="OpenAI GPT 4o", task_type="rag", temperature=0.7)
317
  print("### Query: ", query)
318
+ db, bm25_retriever = create_db_with_langchain(path, url_content, yt_content, query)
319
  retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K, "lambda_mult": 0.75})
320
  t0 = time.time()
321
  ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, retriever], weights=[0.4, 0.6])
 
363
  max_length: int = 2048,
364
  api_key: str = "",
365
  sys_message="",
366
+ yt_content=None,
367
  ):
368
+ if path or url_content or yt_content:
369
  return generate_rag(
370
+ prompt, input_role, topic, context, model, url_content, path, temperature, max_length, api_key, sys_message, yt_content
371
  )
372
  else:
373
  return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
app.py CHANGED
@@ -22,9 +22,12 @@ from google.cloud import storage
22
 
23
  if gr.NO_RELOAD:
24
  from humanize import humanize_text, device
 
 
25
  from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
26
  from google_search import google_search, months, domain_list, build_date
27
  from ai_generate import generate, citations_to_html, remove_citations, display_cited_text, llm_wrapper
 
28
 
29
  # nltk.download("punkt_tab")
30
 
@@ -566,6 +569,7 @@ def generate_article(
566
  pdf_file_input: list[str] = None,
567
  generated_article: str = None,
568
  user_comments: str = None,
 
569
  ) -> str:
570
  settings = {
571
  "role": input_role,
@@ -605,6 +609,7 @@ def generate_article(
605
  max_length=2048,
606
  api_key=api_key,
607
  sys_message="",
 
608
  )
609
  return article, citations
610
 
@@ -689,13 +694,6 @@ def update_structure(format_choice):
689
  return gr.update(value="Introduction, Body, Conclusion", interactive=True)
690
 
691
 
692
- def update_temperature(model_dropdown):
693
- if model_dropdown == "Standard Model":
694
- return gr.update(value=1.2, interactive=True)
695
- elif model_dropdown == "Advanced Model (Beta)":
696
- return gr.update(value=1.0, interactive=True)
697
-
698
-
699
  # Initialize Google Cloud Storage client
700
  client = storage.Client()
701
  bucket_name = "ai-source-detection"
@@ -820,6 +818,7 @@ def generate_and_format(
820
  exclude_sites,
821
  pdf_file_input,
822
  history=None,
 
823
  ai_model="OpenAI GPT 4o",
824
  api_key=None,
825
  generated_article: str = None,
@@ -827,6 +826,7 @@ def generate_and_format(
827
  ):
828
  url_content = None
829
  if google_search_check:
 
830
  date_from = build_date(year_from, month_from, day_from)
831
  date_to = build_date(year_to, month_to, day_to)
832
  sorted_date = f"date:r:{date_from}:{date_to}"
@@ -841,6 +841,14 @@ def generate_and_format(
841
  final_query += " " + " ".join(exclude_queries)
842
  print(f"Google Search Query: {final_query}")
843
  url_content = google_search(final_query, sorted_date, domains_to_include)
 
 
 
 
 
 
 
 
844
  # topic_context = topic + ", " + context
845
  article, citations = generate_article(
846
  input_role,
@@ -863,6 +871,7 @@ def generate_and_format(
863
  pdf_file_input,
864
  generated_article,
865
  user_comments,
 
866
  )
867
  # if ends_with_references(article) and url_content is not None:
868
  # for url in url_content.keys():
@@ -1103,6 +1112,12 @@ with gr.Blocks(
1103
 
1104
  gr.Markdown("# Add Optional PDF Files with Information", elem_classes="text-center text-3xl mb-6")
1105
  pdf_file_input = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])
 
 
 
 
 
 
1106
  """
1107
  # NOTE: HIDE AI MODEL SELECTION
1108
  with gr.Group():
@@ -1150,13 +1165,13 @@ with gr.Blocks(
1150
  with gr.Accordion("Advanced Humanizer Settings", open=False):
1151
  with gr.Row():
1152
  model_dropdown = gr.Radio(
1153
- choices=["Standard Model", "Advanced Model (Beta)"],
1154
  value="Advanced Model (Beta)",
1155
  label="Humanizer Model Version",
1156
  )
1157
  with gr.Row():
1158
  temperature_slider = gr.Slider(
1159
- minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Temperature"
1160
  )
1161
  top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")
1162
  with gr.Row():
@@ -1213,7 +1228,6 @@ with gr.Blocks(
1213
  # Update the default structure based on the selected format
1214
  # e.g. "Plain Text" for certain formats
1215
  input_format.change(fn=update_structure, inputs=input_format, outputs=input_structure)
1216
- model_dropdown.change(fn=update_temperature, inputs=model_dropdown, outputs=temperature_slider)
1217
  report_humanized_btn.click(
1218
  save_humanizer_feedback_to_cloud_storage, inputs=[latest_humanizer_data, humanizer_feedback]
1219
  )
@@ -1249,6 +1263,7 @@ with gr.Blocks(
1249
  exclude_sites,
1250
  pdf_file_input,
1251
  history,
 
1252
  ],
1253
  outputs=[output_article, history],
1254
  )
 
22
 
23
  if gr.NO_RELOAD:
24
  from humanize import humanize_text, device
25
+ # humanize_text = None
26
+ # device = None
27
  from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
28
  from google_search import google_search, months, domain_list, build_date
29
  from ai_generate import generate, citations_to_html, remove_citations, display_cited_text, llm_wrapper
30
+ from youtube import transcribe
31
 
32
  # nltk.download("punkt_tab")
33
 
 
569
  pdf_file_input: list[str] = None,
570
  generated_article: str = None,
571
  user_comments: str = None,
572
+ yt_content: str = None,
573
  ) -> str:
574
  settings = {
575
  "role": input_role,
 
609
  max_length=2048,
610
  api_key=api_key,
611
  sys_message="",
612
+ yt_content=yt_content,
613
  )
614
  return article, citations
615
 
 
694
  return gr.update(value="Introduction, Body, Conclusion", interactive=True)
695
 
696
 
 
 
 
 
 
 
 
697
  # Initialize Google Cloud Storage client
698
  client = storage.Client()
699
  bucket_name = "ai-source-detection"
 
818
  exclude_sites,
819
  pdf_file_input,
820
  history=None,
821
+ yt_url: str = None,
822
  ai_model="OpenAI GPT 4o",
823
  api_key=None,
824
  generated_article: str = None,
 
826
  ):
827
  url_content = None
828
  if google_search_check:
829
+ gr.Info("Searching internet for relevant content...")
830
  date_from = build_date(year_from, month_from, day_from)
831
  date_to = build_date(year_to, month_to, day_to)
832
  sorted_date = f"date:r:{date_from}:{date_to}"
 
841
  final_query += " " + " ".join(exclude_queries)
842
  print(f"Google Search Query: {final_query}")
843
  url_content = google_search(final_query, sorted_date, domains_to_include)
844
+
845
+ yt_content = {}
846
+ if yt_url:
847
+ gr.Info("Transcribing YouTube video...")
848
+ transcribed_text = transcribe(yt_url)
849
+ gr.Info("Transcription completed. Generating article...")
850
+ yt_content[yt_url] = transcribed_text
851
+
852
  # topic_context = topic + ", " + context
853
  article, citations = generate_article(
854
  input_role,
 
871
  pdf_file_input,
872
  generated_article,
873
  user_comments,
874
+ yt_content,
875
  )
876
  # if ends_with_references(article) and url_content is not None:
877
  # for url in url_content.keys():
 
1112
 
1113
  gr.Markdown("# Add Optional PDF Files with Information", elem_classes="text-center text-3xl mb-6")
1114
  pdf_file_input = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])
1115
+ gr.Markdown("# Add Youtube Video Link", elem_classes="text-center text-3xl mb-6")
1116
+ yt_url = gr.Textbox(
1117
+ label="Youtube Video Link",
1118
+ placeholder="Enter the link of the video",
1119
+ elem_classes="input-highlight-pink",
1120
+ )
1121
  """
1122
  # NOTE: HIDE AI MODEL SELECTION
1123
  with gr.Group():
 
1165
  with gr.Accordion("Advanced Humanizer Settings", open=False):
1166
  with gr.Row():
1167
  model_dropdown = gr.Radio(
1168
+ choices=["Advanced Model (Beta)"],
1169
  value="Advanced Model (Beta)",
1170
  label="Humanizer Model Version",
1171
  )
1172
  with gr.Row():
1173
  temperature_slider = gr.Slider(
1174
+ minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature"
1175
  )
1176
  top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")
1177
  with gr.Row():
 
1228
  # Update the default structure based on the selected format
1229
  # e.g. "Plain Text" for certain formats
1230
  input_format.change(fn=update_structure, inputs=input_format, outputs=input_structure)
 
1231
  report_humanized_btn.click(
1232
  save_humanizer_feedback_to_cloud_storage, inputs=[latest_humanizer_data, humanizer_feedback]
1233
  )
 
1263
  exclude_sites,
1264
  pdf_file_input,
1265
  history,
1266
+ yt_url,
1267
  ],
1268
  outputs=[output_article, history],
1269
  )
humanize.py CHANGED
@@ -25,10 +25,12 @@ else:
25
 
26
  # ----------------------------
27
  # load encoder-decoder (sequence to sequence) language model
28
- seq2seq = "polygraf-ai/poly-humanizer-XL-merged-v2"
29
- seq2seq_model = T5ForConditionalGeneration.from_pretrained(seq2seq, torch_dtype=torch.bfloat16).to(device)
30
- seq2seq_tokenizer = T5Tokenizer.from_pretrained(seq2seq)
31
- print(f"Loaded model: {seq2seq}, Num. params: {seq2seq_model.num_parameters()}")
 
 
32
  # ----------------------------
33
  # load decoder-only (causal) language model
34
  from unsloth import FastLanguageModel
 
25
 
26
  # ----------------------------
27
  # load encoder-decoder (sequence to sequence) language model
28
+ # seq2seq = "polygraf-ai/poly-humanizer-XL-merged-v2"
29
+ # seq2seq_model = T5ForConditionalGeneration.from_pretrained(seq2seq, torch_dtype=torch.bfloat16).to(device)
30
+ # seq2seq_tokenizer = T5Tokenizer.from_pretrained(seq2seq)
31
+ # print(f"Loaded model: {seq2seq}, Num. params: {seq2seq_model.num_parameters()}")
32
+ seq2seq_model = None
33
+ seq2seq_tokenizer = None
34
  # ----------------------------
35
  # load decoder-only (causal) language model
36
  from unsloth import FastLanguageModel
requirements.txt CHANGED
@@ -26,4 +26,6 @@ langchain-openai
26
  vertexai
27
  html2text
28
  bm25s
29
- unsloth
 
 
 
26
  vertexai
27
  html2text
28
  bm25s
29
+ unsloth
30
+ trafilatura
31
+ yt-dlp
youtube.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ import torch
3
+ import yt_dlp as youtube_dl
4
+ from transformers import pipeline
5
+ from transformers.pipelines.audio_utils import ffmpeg_read
6
+ import tempfile
7
+ import os
8
+ from time import monotonic
9
+
10
# Whisper checkpoint used for speech recognition.
MODEL_NAME = "openai/whisper-large-v3"
# Number of audio chunks sent through the model per forward pass.
BATCH_SIZE = 8
YT_LENGTH_LIMIT_S = 5400  # limit to 1.5 hour YouTube files

# Keep using the second GPU when the host has one, but fall back to the first
# GPU on single-GPU hosts: "cuda:1" is an invalid device ordinal there and
# would make this module fail at import time. Without CUDA, run on CPU.
if torch.cuda.is_available():
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
else:
    device = "cpu"

# Half precision is only requested on GPU.
# NOTE(review): float16 inference on CPU is slow or unsupported for some ops
# depending on the PyTorch build, so CPU falls back to float32 - confirm this
# matches the intended CPU behaviour.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    torch_dtype=torch.float16 if device != "cpu" else torch.float32,
    chunk_length_s=30,
    device=device,
    generate_kwargs={"language": "english"},
)
24
+
25
def download_yt_audio(yt_url, filename, time_limit_s=YT_LENGTH_LIMIT_S):
    """Download the best available audio stream of a YouTube video.

    The video metadata is fetched first (without downloading) so that videos
    longer than ``time_limit_s`` are rejected before any media is transferred.

    Args:
        yt_url: URL of the video to download.
        filename: Output path template handed to yt-dlp as ``outtmpl``.
        time_limit_s: Maximum accepted video duration, in seconds.

    Raises:
        ValueError: If the metadata cannot be fetched, the duration is
            unknown, the video exceeds ``time_limit_s``, or the download
            itself fails.
    """
    # Use a context manager so the metadata-only YoutubeDL instance is closed.
    with youtube_dl.YoutubeDL() as info_loader:
        try:
            info = info_loader.extract_info(yt_url, download=False)
        except youtube_dl.utils.DownloadError as err:
            raise ValueError(f"Error downloading video: {str(err)}") from err

    # The duration may be absent or None; report that as a ValueError like
    # every other failure instead of leaking a KeyError/TypeError to callers.
    file_length = (info or {}).get("duration")
    if file_length is None:
        raise ValueError("Could not determine the video duration.")

    if file_length > time_limit_s:
        # Format fractional hours: the default 5400 s limit must read
        # "1.5 hour(s)", not "1 hour(s)" as integer division would give.
        raise ValueError(
            f"Video is too long. Maximum allowed length is {time_limit_s / 3600:g} hour(s)."
        )

    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}  # Only download the best available audio format

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
            # yt-dlp surfaces download failures as DownloadError; ExtractorError
            # is still caught so previously handled errors behave the same.
            raise ValueError(f"Error extracting audio: {str(err)}") from err
45
+
46
def transcribe(yt_url, time_limit_s=YT_LENGTH_LIMIT_S):
    """Download a YouTube video's audio and return its transcription.

    The audio is saved into a temporary directory, decoded with ffmpeg at the
    sampling rate expected by the module-level ``pipe``, and transcribed in
    batches of ``BATCH_SIZE``. Download and transcription timings are printed.

    Args:
        yt_url: URL of the video to transcribe.
        time_limit_s: Maximum accepted video duration, in seconds; forwarded
            to ``download_yt_audio``.

    Returns:
        The ``"text"`` entry of the pipeline output.

    Raises:
        ValueError: Propagated from ``download_yt_audio`` when the video
            cannot be downloaded or is too long.
    """
    inputs = None
    try:
        with tempfile.TemporaryDirectory() as tmpdirname:
            filepath = os.path.join(tmpdirname, "video.mp4")
            t0 = monotonic()
            download_yt_audio(yt_url, filepath, time_limit_s)
            t1 = monotonic()
            print(f"Downloaded video in {t1 - t0:.2f} seconds.")

            # Read the whole file before the temporary directory is removed.
            with open(filepath, "rb") as f:
                inputs = f.read()

        sampling_rate = pipe.feature_extractor.sampling_rate
        inputs = ffmpeg_read(inputs, sampling_rate)
        inputs = {"array": inputs, "sampling_rate": sampling_rate}
        t0 = monotonic()
        text = pipe(inputs, batch_size=BATCH_SIZE)["text"]
        t1 = monotonic()
        print(f"Transcribed video in {t1 - t0:.2f} seconds.")

        return text
    finally:
        # Always release memory, including when download or inference fails.
        # Drop the audio buffers and collect garbage first so the freed
        # blocks can then be released from the CUDA caching allocator.
        inputs = None
        gc.collect()
        torch.cuda.empty_cache()