Spaces:
Running
Running
from .common import process_file | |
from langchain.document_loaders import UnstructuredHTMLLoader | |
import requests | |
import re | |
import unicodedata | |
import tempfile | |
import os | |
import streamlit as st | |
from streamlit.runtime.uploaded_file_manager import UploadedFileRec, UploadedFile | |
def process_html(vector_store, file, stats_db):
    """Ingest an HTML file by delegating to the shared ``process_file`` helper.

    The call is forwarded with ``UnstructuredHTMLLoader`` as the loader class
    and ``".html"`` as the file suffix; ``stats_db`` is passed by keyword.

    Returns:
        Whatever ``process_file`` returns.
    """
    return process_file(
        vector_store,
        file,
        UnstructuredHTMLLoader,
        ".html",
        stats_db=stats_db,
    )
def get_html(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would freeze the
            app on an unresponsive host.

    Returns:
        The response text when the status code is 200, otherwise ``None``.
        ``None`` is also returned when the request itself fails (invalid URL,
        connection error, timeout), so callers have one failure signal.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Transport-level failures are reported the same way as a non-200 reply.
        print(f"Error while fetching {url}: {exc}")
        return None

    if response.status_code != 200:
        return None
    return response.text
def create_html_file(url, content):
    """Write ``content`` to a temporary ``.html`` file and wrap it as an upload.

    The file name is ``slugify(url) + ".html"`` and the file is created in the
    system temporary directory.

    Args:
        url: Source URL; used only to derive the file name.
        content: HTML text to store.

    Returns:
        A ``(uploaded_file, temp_file_path)`` tuple: an ``UploadedFile`` built
        from the bytes written to disk, and the path of the temporary file,
        which the caller is responsible for removing.
    """
    file_name = slugify(url) + ".html"
    temp_file_path = os.path.join(tempfile.gettempdir(), file_name)

    # Explicit UTF-8 so pages with non-ASCII characters do not fail (or get
    # written in a locale-dependent encoding) on platforms whose default
    # encoding is not UTF-8.
    with open(temp_file_path, 'w', encoding='utf-8') as temp_file:
        temp_file.write(content)

    # Read the bytes back through a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(temp_file_path, 'rb') as temp_file:
        data = temp_file.read()

    record = UploadedFileRec(id=None, name=file_name, type='text/html', data=data)
    uploaded_file = UploadedFile(record)
    return uploaded_file, temp_file_path
def delete_tempfile(temp_file_path, url, ret):
    """Remove the temporary file and, when ``ret`` is truthy, report the outcome.

    Args:
        temp_file_path: Path of the file to delete.
        url: URL shown in the status message.
        ret: When truthy, a status line is written to the Streamlit page;
            when falsy, nothing is written to the page.

    An ``OSError`` raised inside the ``try`` block is printed to stdout and
    not re-raised; in that case the error message is shown instead of the
    success message.
    """
    try:
        os.remove(temp_file_path)
        if not ret:
            return
        st.write(f"✅ Content saved... {url} ")
    except OSError as err:
        print(f"Error while deleting the temporary file: {str(err)}")
        if not ret:
            return
        st.write(f"❌ Error while saving content... {url} ")
def slugify(text):
    """Turn ``text`` into a lowercase, hyphen-separated ASCII slug.

    Steps, in order:
      1. NFKD-normalize and drop every character that cannot be encoded as
         ASCII (accents are stripped, other non-ASCII characters vanish).
      2. Remove anything that is not a word character, whitespace or hyphen.
      3. Strip surrounding whitespace and lowercase.
      4. Collapse each run of hyphens and/or whitespace into a single hyphen.

    Underscores are kept because they count as word characters.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8')
    cleaned = re.sub(r'[^\w\s-]', '', ascii_only)
    cleaned = cleaned.strip().lower()
    return re.sub(r'[-\s]+', '-', cleaned)