Spaces:
Running
Running
"""Fetch text from url.""" | |
# pylint: disable=too-many-branches | |
from typing import Optional | |
from urllib.parse import urlparse | |
import html2text | |
import httpx | |
import streamlit as st | |
from logzero import logger | |
from readability import Document | |
# @st.cache | |
def _normalize_url(url: str) -> str:
    """Return *url* stripped, with an ``http://`` scheme added if missing.

    Raises:
        ValueError: if the result has no scheme or no network location.
    """
    url = url.strip()
    # Test for the full scheme prefix (case-insensitively): a bare "http"
    # check mistakes hosts such as "httpbin.org" for URLs that already carry
    # a scheme, and double-prefixes upper-case "HTTP://..." URLs.
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url

    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:
        raise ValueError(f"Invalid url: {url}")
    return url


def url2txt(
    url: str,
    bodywidth: Optional[int] = 5000,
    remove: bool = False,
    show_url: bool = True,
    ignore_links: bool = True,
) -> str:
    """Fetch the main text of a web page.

    Args:
        url: address to fetch; ``http://`` is prepended when no http(s)
            scheme is present
        bodywidth: wrap width passed to html2text.HTML2Text; if None, fall
            back to html2text's own default
        remove: drop all blank lines from the converted text if True
        show_url: prepend the (normalized) url to the output if True
        ignore_links: drop ``[text](url)`` links from the output if True

    Returns:
        For a ``text/plain`` response, the response body unchanged (no url
        or title is prepended). Otherwise the readability summary of the
        page converted to text, preceded by a ``# title`` line and, when
        ``show_url`` is set, by the url.

    Raises:
        ValueError: if the url has no scheme or network location.
        RuntimeError: if the extracted summary is empty.
        Exception: errors raised while fetching, parsing or converting are
            logged and re-raised unchanged.
    """
    url = _normalize_url(url)
    logger.info("url: %s", url)

    try:
        resp = httpx.get(url, timeout=30)
        resp.raise_for_status()
    except Exception as exc:
        logger.error(exc)
        raise

    # A missing header is not an error; treat it as "not plain text".
    content_type = resp.headers.get("content-type", "")
    if "text/plain" in content_type:
        return resp.text

    # Everything else is handled as html.
    try:
        doc = Document(resp.text)
        # Computed once and reused for both the emptiness check and the
        # conversion below.
        summary = doc.summary()
    except Exception as exc:
        logger.error(exc)
        raise

    if not summary.strip():
        raise RuntimeError("No content for some reason...")

    if bodywidth is not None:
        handle = html2text.HTML2Text(bodywidth=bodywidth)
    else:
        handle = html2text.HTML2Text()
    handle.ignore_links = ignore_links

    try:
        res = handle.handle(summary)
    except Exception as exc:
        logger.error(exc)
        raise

    if remove:
        # Keep only lines that contain something other than whitespace.
        res = "\n".join(line for line in res.splitlines() if line.strip())

    if not res.strip():
        logger.warning("Output seems to be empty...")

    body = f"# {doc.title()}\n{res}"
    return f"{url}\n{body}" if show_url else body