Spaces:
Runtime error
Runtime error
File size: 1,612 Bytes
b16454e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import os
import re
import pandas as pd
from urllib.parse import urlparse
import logging
# Logger for this module, named after the module so records show their origin.
logger = logging.getLogger(__name__)

# Configure root logging once at import time: INFO threshold and
# "<timestamp> <LEVEL> [<logger name>] <message>" formatted records.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)
class UTILS:
    """Stateless helpers for text clean-up, DataFrame tidying and URL validation."""

    def __init__(self):
        """Create a helper instance; no state is stored."""
        pass

    def split_text(
        self,
        text
    ):
        """Split a comma-separated string into its whitespace-stripped parts.

        Args:
            text: String containing comma-separated values.

        Returns:
            List of the parts, each with leading/trailing whitespace removed.
            Empty parts (e.g. from ``"a,,b"``) are kept as empty strings.
        """
        return [part.strip() for part in text.split(',')]

    def replace_newlines_and_spaces(
        self,
        text
    ):
        """Collapse every run of whitespace (newlines included) into one space.

        Args:
            text: String to normalise.

        Returns:
            The string with each whitespace run replaced by a single space.
            Leading/trailing whitespace is reduced to one space, not removed.
        """
        # r'\s+' already matches "\n", so a single substitution handles both
        # the newline replacement and the space collapsing.
        return re.sub(r'\s+', ' ', text)

    def clean_df(
        self,
        df,
        dropna=True,
        fillna=False
    ):
        """Return a cleaned copy of ``df`` without modifying the caller's frame.

        Args:
            df: pandas DataFrame to clean.
            dropna: When true, drop rows containing missing values.
            fillna: When true, replace missing values with ``''`` first
                (this happens before ``dropna``, so nothing is left to drop).

        Returns:
            A new DataFrame with duplicates removed and a fresh 0..n-1 index.
        """
        # Use the non-inplace forms so the DataFrame passed in is left intact;
        # the cleaned result is only available through the return value.
        if fillna:
            df = df.fillna('')
        if dropna:
            df = df.dropna()
        return df.drop_duplicates().reset_index(drop=True)

    def validate_url_format(
        self,
        urls,
        url_type='urls'
    ):
        """Keep only the entries of ``urls`` that are well-formed URLs.

        A URL is considered valid when it parses and has both a scheme and a
        network location. Entries that are not strings, or that cannot be
        parsed at all, are skipped instead of raising.

        Args:
            urls: Iterable of candidate URL strings.
            url_type: ``'online_pdf'`` additionally requires the URL to point
                at a ``.pdf`` resource; any other value applies no extra rule.

        Returns:
            List of the valid URLs, in their original order.
        """
        valid_urls = []
        for url in urls:
            # Non-string entries (e.g. NaN coming from a DataFrame column)
            # cannot be URLs and would make urlparse fail.
            if not isinstance(url, str):
                continue
            try:
                parsed = urlparse(url)
            except ValueError:
                # urlparse raises on malformed input such as "http://[broken".
                continue
            if not (parsed.scheme and parsed.netloc):
                continue
            if url_type == 'online_pdf':
                # Accept the extension case-insensitively, either at the end
                # of the whole URL or at the end of its path (so links with a
                # query string or fragment after ".pdf" still qualify).
                is_pdf = (
                    url.lower().endswith('.pdf')
                    or parsed.path.lower().endswith('.pdf')
                )
                if not is_pdf:
                    continue
            valid_urls.append(url)
        # Log through the module-named logger so records carry this module's name.
        logging.getLogger(__name__).info('Valid URLs are: %s', valid_urls)
        return valid_urls
|