mlbee / st_mlbee /url2txt.py
freemt
Bump version to 0.1.0a2
de6562c
"""Fetch text from url."""
# pylint: disable=too-many-branches
from typing import Optional
from urllib.parse import urlparse
import html2text
import httpx
import streamlit as st
from logzero import logger
from readability import Document
# @st.cache
def url2txt(
    url: str,
    bodywidth: Optional[int] = 5000,
    remove: bool = False,
    show_url: bool = True,
    ignore_links: bool = True,
) -> str:
    """Fetch the main text of a web page.

    The page is downloaded with httpx, reduced to its main content with
    readability's ``Document`` and converted to text with html2text.
    A response whose content type contains ``text/plain`` is returned as is.

    Args:
        url: address to fetch; ``http://`` is prepended unless the value
            already starts with ``http://`` or ``https://``
        bodywidth: wrap width handed to ``html2text.HTML2Text``; if set to
            None, fall back to the default bodywidth of html2text.HTML2Text
        remove: remove blank lines if set to True
        show_url: prepend the (normalized) url if set to True
        ignore_links: value assigned to the converter's ``ignore_links``
            option, i.e. drop ``[text](url)`` links if set to True

    Returns:
        For ``text/plain`` responses, the response text. Otherwise a
        ``# <title>`` line followed by the extracted body text, preceded by
        the url on its own line when ``show_url`` is True.

    Raises:
        ValueError: if the url has no scheme or network location.
        RuntimeError: if readability extracts no content from the page.
        Exception: errors raised while fetching (including error status
            codes, via ``raise_for_status``), extracting or converting the
            page are logged and re-raised unchanged.
    """
    url = url.strip()

    # Compare against the full scheme prefix: a bare ``startswith("http")``
    # would mistake hosts such as "httpbin.org" for already-qualified urls.
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url

    # Validate before doing any other work so bad input fails fast.
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:  # no scheme or netloc present
        raise ValueError(f"Invalid url: {url}")

    logger.info("url: %s", url)

    try:
        resp = httpx.get(url, timeout=30)
        resp.raise_for_status()
    except Exception as exc:
        logger.error(exc)
        raise

    # A missing header is not an error; treat it as "not plain text".
    content_type = resp.headers.get("content-type", "")

    # output text if text/plain
    if "text/plain" in content_type:
        return resp.text

    # handle html and the rest
    try:
        doc = Document(resp.text)
        # Extract once and reuse: the result is needed both for the
        # emptiness check and for the text conversion below.
        summary = doc.summary()
    except Exception as exc:
        logger.error(exc)
        raise

    if not summary.strip():
        raise RuntimeError("No content for some reason...")

    if bodywidth is not None:
        handle = html2text.HTML2Text(bodywidth=bodywidth)
    else:
        handle = html2text.HTML2Text()
    handle.ignore_links = ignore_links

    try:
        res = handle.handle(summary)
    except Exception as exc:
        logger.error(exc)
        raise

    # remove blank lines
    if remove:
        res = "\n".join(line for line in res.splitlines() if line.strip())

    if not res.strip():  # warn if empty output
        logger.warning("Output seems to be empty...")

    body = f"# {doc.title()}\n{res}"
    return f"{url}\n{body}" if show_url else body