Spaces:

polygraf-ai
/

article_writer

Runtime error

App Files Files Community

eljanmahammadli committed on Sep 24, 2024

Commit

744d9e3

1 Parent(s): d904dd4

#feat: added YouTube as RAG input; removed standard humanizer

Browse files

Files changed (5) hide show

ai_generate.py +15 -5
app.py +25 -10
humanize.py +6 -4
requirements.txt +3 -1
youtube.py +67 -0

ai_generate.py CHANGED Viewed

@@ -216,7 +216,7 @@ def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int
     return llm
-def create_db_with_langchain(path: list[str], url_content: dict, query: str):
     all_docs = []
     text_splitter = RecursiveCharacterTextSplitter(
@@ -242,6 +242,7 @@ def create_db_with_langchain(path: list[str], url_content: dict, query: str):
         length_function=len,
         add_start_index=False,
     )
     if path:
         for file in path:
             loader = PyMuPDFLoader(file)
@@ -249,13 +250,20 @@ def create_db_with_langchain(path: list[str], url_content: dict, query: str):
             # split it into chunks
             docs = text_splitter.split_documents(data)
             all_docs.extend(docs)
     if url_content:
         for url, content in url_content.items():
             doc = Document(page_content=content, metadata={"source": url})
             # split it into chunks
             docs = text_splitter.split_documents([doc])
             all_docs.extend(docs)
     print(f"### Total number of documents before bm25s: {len(all_docs)}")
@@ -298,6 +306,7 @@ def generate_rag(
     max_length: int = 2048,
     api_key: str = "",
     sys_message="",
 ):
     llm = load_llm(model, api_key, temperature, max_length)
     if llm is None:
@@ -306,7 +315,7 @@ def generate_rag(
     query = llm_wrapper(input_role, topic, context, model="OpenAI GPT 4o", task_type="rag", temperature=0.7)
     print("### Query: ", query)
-    db, bm25_retriever = create_db_with_langchain(path, url_content, query)
     retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K, "lambda_mult": 0.75})
     t0 = time.time()
     ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, retriever], weights=[0.4, 0.6])
@@ -354,10 +363,11 @@ def generate(
     max_length: int = 2048,
     api_key: str = "",
     sys_message="",
 ):
-    if path or url_content:
         return generate_rag(
-            prompt, input_role, topic, context, model, url_content, path, temperature, max_length, api_key, sys_message
         )
     else:
         return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)

     return llm
+def create_db_with_langchain(path: list[str], url_content: dict, yt_content: dict, query: str):
     all_docs = []
     text_splitter = RecursiveCharacterTextSplitter(
         length_function=len,
         add_start_index=False,
     )
+    # PDF
     if path:
         for file in path:
             loader = PyMuPDFLoader(file)
             # split it into chunks
             docs = text_splitter.split_documents(data)
             all_docs.extend(docs)
+    # Internet Search
     if url_content:
         for url, content in url_content.items():
             doc = Document(page_content=content, metadata={"source": url})
             # split it into chunks
             docs = text_splitter.split_documents([doc])
             all_docs.extend(docs)
+    # YouTube Transcriptions
+    if yt_content:
+        for yt_url, content in yt_content.items():
+            doc = Document(page_content=content, metadata={"source": yt_url})
+            # split it into chunks
+            docs = text_splitter.split_documents([doc])
+            all_docs.extend(docs)
     print(f"### Total number of documents before bm25s: {len(all_docs)}")
     max_length: int = 2048,
     api_key: str = "",
     sys_message="",
+    yt_content=None,
 ):
     llm = load_llm(model, api_key, temperature, max_length)
     if llm is None:
     query = llm_wrapper(input_role, topic, context, model="OpenAI GPT 4o", task_type="rag", temperature=0.7)
     print("### Query: ", query)
+    db, bm25_retriever = create_db_with_langchain(path, url_content, yt_content, query)
     retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K, "lambda_mult": 0.75})
     t0 = time.time()
     ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, retriever], weights=[0.4, 0.6])
     max_length: int = 2048,
     api_key: str = "",
     sys_message="",
+    yt_content=None,
 ):
+    if path or url_content or yt_content:
         return generate_rag(
+            prompt, input_role, topic, context, model, url_content, path, temperature, max_length, api_key, sys_message, yt_content
         )
     else:
         return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)

app.py CHANGED Viewed

@@ -22,9 +22,12 @@ from google.cloud import storage
 if gr.NO_RELOAD:
     from humanize import humanize_text, device
     from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
     from google_search import google_search, months, domain_list, build_date
     from ai_generate import generate, citations_to_html, remove_citations, display_cited_text, llm_wrapper
     # nltk.download("punkt_tab")
@@ -566,6 +569,7 @@ def generate_article(
     pdf_file_input: list[str] = None,
     generated_article: str = None,
     user_comments: str = None,
 ) -> str:
     settings = {
         "role": input_role,
@@ -605,6 +609,7 @@ def generate_article(
         max_length=2048,
         api_key=api_key,
         sys_message="",
     )
     return article, citations
@@ -689,13 +694,6 @@ def update_structure(format_choice):
         return gr.update(value="Introduction, Body, Conclusion", interactive=True)
-def update_temperature(model_dropdown):
-    if model_dropdown == "Standard Model":
-        return gr.update(value=1.2, interactive=True)
-    elif model_dropdown == "Advanced Model (Beta)":
-        return gr.update(value=1.0, interactive=True)
 # Initialize Google Cloud Storage client
 client = storage.Client()
 bucket_name = "ai-source-detection"
@@ -820,6 +818,7 @@ def generate_and_format(
     exclude_sites,
     pdf_file_input,
     history=None,
     ai_model="OpenAI GPT 4o",
     api_key=None,
     generated_article: str = None,
@@ -827,6 +826,7 @@ def generate_and_format(
 ):
     url_content = None
     if google_search_check:
         date_from = build_date(year_from, month_from, day_from)
         date_to = build_date(year_to, month_to, day_to)
         sorted_date = f"date:r:{date_from}:{date_to}"
@@ -841,6 +841,14 @@ def generate_and_format(
             final_query += " " + " ".join(exclude_queries)
         print(f"Google Search Query: {final_query}")
         url_content = google_search(final_query, sorted_date, domains_to_include)
     # topic_context = topic + ", " + context
     article, citations = generate_article(
         input_role,
@@ -863,6 +871,7 @@ def generate_and_format(
         pdf_file_input,
         generated_article,
         user_comments,
     )
     # if ends_with_references(article) and url_content is not None:
     #     for url in url_content.keys():
@@ -1103,6 +1112,12 @@ with gr.Blocks(
                 gr.Markdown("# Add Optional PDF Files with Information", elem_classes="text-center text-3xl mb-6")
                 pdf_file_input = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])
             """
             # NOTE: HIDE AI MODEL SELECTION
             with gr.Group():
@@ -1150,13 +1165,13 @@ with gr.Blocks(
                 with gr.Accordion("Advanced Humanizer Settings", open=False):
                     with gr.Row():
                         model_dropdown = gr.Radio(
-                            choices=["Standard Model", "Advanced Model (Beta)"],
                             value="Advanced Model (Beta)",
                             label="Humanizer Model Version",
                         )
                     with gr.Row():
                         temperature_slider = gr.Slider(
-                            minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Temperature"
                         )
                         top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")
                     with gr.Row():
@@ -1213,7 +1228,6 @@ with gr.Blocks(
     # Update the default structure based on the selected format
     # e.g. "Plain Text" for certain formats
     input_format.change(fn=update_structure, inputs=input_format, outputs=input_structure)
-    model_dropdown.change(fn=update_temperature, inputs=model_dropdown, outputs=temperature_slider)
     report_humanized_btn.click(
         save_humanizer_feedback_to_cloud_storage, inputs=[latest_humanizer_data, humanizer_feedback]
     )
@@ -1249,6 +1263,7 @@ with gr.Blocks(
             exclude_sites,
             pdf_file_input,
             history,
         ],
         outputs=[output_article, history],
     )

 if gr.NO_RELOAD:
     from humanize import humanize_text, device
+    # humanize_text = None
+    # device = None
     from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
     from google_search import google_search, months, domain_list, build_date
     from ai_generate import generate, citations_to_html, remove_citations, display_cited_text, llm_wrapper
+    from youtube import transcribe
     # nltk.download("punkt_tab")
     pdf_file_input: list[str] = None,
     generated_article: str = None,
     user_comments: str = None,
+    yt_content: str = None,
 ) -> str:
     settings = {
         "role": input_role,
         max_length=2048,
         api_key=api_key,
         sys_message="",
+        yt_content=yt_content,
     )
     return article, citations
         return gr.update(value="Introduction, Body, Conclusion", interactive=True)
 # Initialize Google Cloud Storage client
 client = storage.Client()
 bucket_name = "ai-source-detection"
     exclude_sites,
     pdf_file_input,
     history=None,
+    yt_url: str = None,
     ai_model="OpenAI GPT 4o",
     api_key=None,
     generated_article: str = None,
 ):
     url_content = None
     if google_search_check:
+        gr.Info("Searching internet for relevant content...")
         date_from = build_date(year_from, month_from, day_from)
         date_to = build_date(year_to, month_to, day_to)
         sorted_date = f"date:r:{date_from}:{date_to}"
             final_query += " " + " ".join(exclude_queries)
         print(f"Google Search Query: {final_query}")
         url_content = google_search(final_query, sorted_date, domains_to_include)
+    yt_content = {}
+    if yt_url:
+        gr.Info("Transcribing YouTube video...")
+        transcribed_text = transcribe(yt_url)
+        gr.Info("Transcription completed. Generating article...")
+        yt_content[yt_url] = transcribed_text
     # topic_context = topic + ", " + context
     article, citations = generate_article(
         input_role,
         pdf_file_input,
         generated_article,
         user_comments,
+        yt_content,
     )
     # if ends_with_references(article) and url_content is not None:
     #     for url in url_content.keys():
                 gr.Markdown("# Add Optional PDF Files with Information", elem_classes="text-center text-3xl mb-6")
                 pdf_file_input = gr.File(label="Upload PDF(s)", file_count="multiple", file_types=[".pdf"])
+                gr.Markdown("# Add Youtube Video Link", elem_classes="text-center text-3xl mb-6")
+                yt_url = gr.Textbox(
+                    label="Youtube Video Link",
+                    placeholder="Enter the link of the video",
+                    elem_classes="input-highlight-pink",
+                )
             """
             # NOTE: HIDE AI MODEL SELECTION
             with gr.Group():
                 with gr.Accordion("Advanced Humanizer Settings", open=False):
                     with gr.Row():
                         model_dropdown = gr.Radio(
+                            choices=["Advanced Model (Beta)"],
                             value="Advanced Model (Beta)",
                             label="Humanizer Model Version",
                         )
                     with gr.Row():
                         temperature_slider = gr.Slider(
+                            minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature"
                         )
                         top_k_slider = gr.Slider(minimum=0, maximum=300, step=25, value=40, label="Top k")
                     with gr.Row():
     # Update the default structure based on the selected format
     # e.g. "Plain Text" for certain formats
     input_format.change(fn=update_structure, inputs=input_format, outputs=input_structure)
     report_humanized_btn.click(
         save_humanizer_feedback_to_cloud_storage, inputs=[latest_humanizer_data, humanizer_feedback]
     )
             exclude_sites,
             pdf_file_input,
             history,
+            yt_url,
         ],
         outputs=[output_article, history],
     )

humanize.py CHANGED Viewed

@@ -25,10 +25,12 @@ else:
 # ----------------------------
 # load encoder-decoder (sequence to sequence) language model
-seq2seq = "polygraf-ai/poly-humanizer-XL-merged-v2"
-seq2seq_model = T5ForConditionalGeneration.from_pretrained(seq2seq, torch_dtype=torch.bfloat16).to(device)
-seq2seq_tokenizer = T5Tokenizer.from_pretrained(seq2seq)
-print(f"Loaded model: {seq2seq}, Num. params: {seq2seq_model.num_parameters()}")
 # ----------------------------
 # load decoder-only (causal) language model
 from unsloth import FastLanguageModel

 # ----------------------------
 # load encoder-decoder (sequence to sequence) language model
+# seq2seq = "polygraf-ai/poly-humanizer-XL-merged-v2"
+# seq2seq_model = T5ForConditionalGeneration.from_pretrained(seq2seq, torch_dtype=torch.bfloat16).to(device)
+# seq2seq_tokenizer = T5Tokenizer.from_pretrained(seq2seq)
+# print(f"Loaded model: {seq2seq}, Num. params: {seq2seq_model.num_parameters()}")
+seq2seq_model = None
+seq2seq_tokenizer = None
 # ----------------------------
 # load decoder-only (causal) language model
 from unsloth import FastLanguageModel

requirements.txt CHANGED Viewed

@@ -26,4 +26,6 @@ langchain-openai
 vertexai
 html2text
 bm25s
-unsloth

 vertexai
 html2text
 bm25s
+unsloth
+trafilatura
+yt-dlp

youtube.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import gc
+import torch
+import yt_dlp as youtube_dl
+from transformers import pipeline
+from transformers.pipelines.audio_utils import ffmpeg_read
+import tempfile
+import os
+from time import monotonic
+MODEL_NAME = "openai/whisper-large-v3"
+BATCH_SIZE = 8
+YT_LENGTH_LIMIT_S = 5400  # limit to 1.5 hour YouTube files
+device = 'cuda:1' if torch.cuda.is_available() else "cpu"
+pipe = pipeline(
+    task="automatic-speech-recognition",
+    model=MODEL_NAME,
+    torch_dtype=torch.float16,
+    chunk_length_s=30,
+    device=device,
+    generate_kwargs={"language": "english"}
+)
+def download_yt_audio(yt_url, filename, time_limit_s=YT_LENGTH_LIMIT_S):
+    info_loader = youtube_dl.YoutubeDL()
+    try:
+        info = info_loader.extract_info(yt_url, download=False)
+    except youtube_dl.utils.DownloadError as err:
+        raise ValueError(f"Error downloading video: {str(err)}")
+    file_length = info["duration"]
+    if file_length > time_limit_s:
+        raise ValueError(f"Video is too long. Maximum allowed length is {time_limit_s // 3600} hour(s).")
+    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}  # Only download the best available audio format
+    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+        try:
+            ydl.download([yt_url])
+        except youtube_dl.utils.ExtractorError as err:
+            raise ValueError(f"Error extracting audio: {str(err)}")
+def transcribe(yt_url, time_limit_s=YT_LENGTH_LIMIT_S):
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        filepath = os.path.join(tmpdirname, "video.mp4")
+        t0 = monotonic()
+        download_yt_audio(yt_url, filepath, time_limit_s)
+        t1 = monotonic()
+        print(f"Downloaded video in {t1 - t0:.2f} seconds.")
+        with open(filepath, "rb") as f:
+            inputs = f.read()
+        inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
+        inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+        t0 = monotonic()
+        text = pipe(inputs, batch_size=BATCH_SIZE)["text"]
+        t1 = monotonic()
+        print(f"Transcribed video in {t1 - t0:.2f} seconds.")
+        torch.cuda.empty_cache()
+        gc.collect()
+    return text