eljanmahammadli committed
Commit d904dd4 · 1 Parent(s): 8b9c9ff

#perf: quality improvements to website scrape + PDF detect logic

Files changed (1)
  1. google_search.py +32 -31
google_search.py CHANGED
@@ -3,12 +3,10 @@ import time
 from googleapiclient.discovery import build
 import asyncio
 import httpx
-from bs4 import BeautifulSoup
 from dotenv import load_dotenv
-import html2text
 import requests
-import unicodedata
 import fitz
+from trafilatura import extract
 
 load_dotenv()
 
@@ -18,26 +16,6 @@ CSE_KEY = os.environ.get("GOOGLE_SEARCH_CSE_ID")
 # Number of pages to scrape
 NUM_PAGES = 10
 
-# load html2text and set up configs
-h2t = html2text.HTML2Text()
-h2t.bodywidth = 0 # No wrapping
-h2t.ignore_links = True # Ignore hyperlinks
-h2t.ignore_images = True # Ignore images
-h2t.ignore_emphasis = True # Ignore emphasis
-h2t.ignore_tables = False # Include tables
-h2t.skip_internal_links = True # Skip internal links
-h2t.skip_external_links = True # Skip external links
-h2t.single_line_break = True # Use single line breaks
-h2t.protect_links = True # Protect links from being split
-h2t.default_image_alt = "[image]" # Default alt text for images
-
-
-def clean_html(text):
-    text = h2t.handle(text)
-    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII") # Remove non-ASCII characters
-    return text
-
-
 def build_results_beautifulsoup(url_list):
     print("Starting to scrape URLs...")
     start_time = time.perf_counter()
@@ -59,8 +37,17 @@ def build_results_beautifulsoup(url_list):
 
         if soup:
             print(f"Processing URL: {url}")
-            text = clean_html(soup.text)
-            if len(text) > 500:
+
+            text = extract(
+                soup,
+                include_tables=False,
+                include_comments=False,
+                output_format="txt",
+            )
+            # If text is None or empty, log a warning and skip
+            if text is None:
+                print(f"Warning: Extraction returned None for URL: {url}")
+            elif len(text) > 500:
                 print(f"Adding content from URL: {url}, content length: {len(text)}")
                 result_content[url] = text
                 count += 1
@@ -126,13 +113,18 @@ async def get_url_data(url, client):
 
         if r.status_code == 200:
             content_type = r.headers.get("Content-Type", "").lower()
-            # detect if pdf
+            # Improved PDF detection using Content-Type and file extension
             if "application/pdf" in content_type or url.lower().endswith(".pdf"):
+                print(f"Detected PDF content via Content-Type or file extension at URL: {url}")
                 pdf_content = await extract_pdf_text(r.content)
-                return BeautifulSoup(pdf_content, "html.parser")
+                return pdf_content
             else:
-                return BeautifulSoup(r.content, "html.parser")
-    except Exception:
+                return r.content
+        else:
+            print(f"Non-200 response for URL: {url}, status code: {r.status_code}")
+            return None
+    except Exception as e:
+        print(f"Error fetching URL: {url}, Error: {str(e)}")
         return None
 
 
@@ -142,10 +134,19 @@ async def extract_pdf_text(content):
         text = ""
         for page in doc:
             text += page.get_text()
-        return f"<div>{text}</div>" # Wrap in a div to make it valid HTML
+        html_content = f"""
+        <!DOCTYPE html>
+        <html>
+            <body>
+                <p>{text}</p>
+            </body>
+        </html>
+        """
+        html_bytes = html_content.encode('utf-8')
+        return html_bytes # Return in such a format that is parsable by trafilatura
     except Exception as e:
         print(f"Error extracting PDF text: {str(e)}")
-        return "<div>Error extracting PDF text</div>"
+        return None
 
 
 async def parallel_scrap(urls):
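
For reviewers who want to try the new extraction path in isolation: below is a minimal sketch of exercising trafilatura's extract against a fetched page, mirroring the options used in the diff (include_tables=False, include_comments=False, output_format="txt"). The helper name, the synchronous httpx call, and the min_length parameter are illustrative assumptions, not part of the commit.

from typing import Optional

import httpx
from trafilatura import extract  # same extractor the diff switches to


def scrape_main_text(url: str, min_length: int = 500) -> Optional[str]:
    """Fetch a page and return its main text, or None if extraction fails or is too short.

    Illustrative helper, not part of the commit.
    """
    resp = httpx.get(url, follow_redirects=True, timeout=10)
    if resp.status_code != 200:
        print(f"Non-200 response for URL: {url}, status code: {resp.status_code}")
        return None
    # trafilatura.extract takes raw HTML and returns plain text, or None on failure
    text = extract(
        resp.text,
        include_tables=False,
        include_comments=False,
        output_format="txt",
    )
    if text is None or len(text) <= min_length:
        return None
    return text


if __name__ == "__main__":
    print(scrape_main_text("https://example.com"))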
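
The PDF side of the change detects PDFs via the Content-Type header or a .pdf extension, pulls the text out with PyMuPDF (fitz), and wraps it in minimal HTML bytes so the same trafilatura call can parse it. A rough standalone sketch of that idea follows; the function names are placeholders, and the fitz.open(stream=..., filetype="pdf") call is an assumption about how the unchanged part of extract_pdf_text opens the document.

from typing import Optional

import fitz  # PyMuPDF, already imported by google_search.py


def looks_like_pdf(content_type: str, url: str) -> bool:
    """Mirror the diff's check: Content-Type header or a .pdf file extension."""
    return "application/pdf" in content_type.lower() or url.lower().endswith(".pdf")


def pdf_bytes_to_html(content: bytes) -> Optional[bytes]:
    """Extract text from PDF bytes and wrap it in minimal HTML so trafilatura can consume it."""
    try:
        # Open the in-memory PDF and concatenate the text of all pages
        with fitz.open(stream=content, filetype="pdf") as doc:
            text = "".join(page.get_text() for page in doc)
        html_content = f"<!DOCTYPE html><html><body><p>{text}</p></body></html>"
        return html_content.encode("utf-8")
    except Exception as exc:
        print(f"Error extracting PDF text: {exc}")
        return None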