# KKMS-KSSW-HF / src/utils.py
# Chintan Donda
# Moving kkms_kssw.py to src/
# 04e306a
import os
import re
import pandas as pd
from urllib.parse import urlparse
import logging
# Set up process-wide logging defaults: INFO level, timestamped records that
# include the level name and the emitting logger's name.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)

# Logger named after this module.
logger = logging.getLogger(__name__)
class UTILS:
    """Small collection of helpers for text, DataFrame and URL handling.

    None of the methods read or write instance state.
    """

    def __init__(self):
        """Create the helper object; nothing is stored on the instance."""

    def split_text(
        self,
        text
    ):
        """Split *text* on commas and strip whitespace around each piece.

        Args:
            text: Comma-separated string.

        Returns:
            List of stripped substrings. Empty pieces are kept, so an empty
            input string yields ``['']``.
        """
        return [piece.strip() for piece in text.split(',')]

    def replace_newlines_and_spaces(
        self,
        text
    ):
        """Collapse every run of whitespace in *text* into a single space.

        Newlines, tabs and repeated spaces are all matched by ``\\s+``, so no
        separate newline replacement is needed. Leading/trailing whitespace
        is reduced to one space, not removed.

        Args:
            text: Input string.

        Returns:
            The string with each whitespace run replaced by one space.
        """
        return re.sub(r'\s+', ' ', text)

    def clean_df(
        self,
        df,
        dropna=True,
        fillna=False
    ):
        """Return a cleaned copy of *df* without modifying the input.

        Steps, in order: optionally fill missing values with ``''``,
        optionally drop rows containing missing values, drop duplicate rows,
        and reset the index to ``0..n-1``.

        Args:
            df: DataFrame to clean. It is left untouched.
            dropna: When true, rows containing any missing value are dropped.
            fillna: When true, missing values are replaced with an empty
                string before the ``dropna`` step.

        Returns:
            A new DataFrame with the cleaning steps applied.
        """
        # Use the non-inplace variants so the caller's frame is never mutated.
        if fillna:
            df = df.fillna('')
        if dropna:
            df = df.dropna()
        return df.drop_duplicates().reset_index(drop=True)

    def validate_url_format(
        self,
        urls,
        url_type='urls'
    ):
        """Filter *urls* down to the ones that look well-formed.

        A URL is kept when ``urlparse`` yields both a scheme and a network
        location. When ``url_type`` is ``'online_pdf'`` the URL (or its path
        component, so query strings are ignored) must additionally end with
        ``.pdf``, compared case-insensitively.

        Args:
            urls: Iterable of URL strings.
            url_type: ``'online_pdf'`` enables the ``.pdf`` extension check;
                any other value applies only the scheme/netloc check.

        Returns:
            List of the accepted URLs, in input order.
        """
        valid_urls = []
        for url in urls:
            try:
                result = urlparse(url)
            except ValueError:
                # urlparse raises on malformed network locations (for example
                # unbalanced IPv6 brackets); treat those as invalid URLs.
                continue

            if not (result.scheme and result.netloc):
                continue

            # Online PDF urls should end with .pdf extension
            if url_type == 'online_pdf' and not (
                url.lower().endswith('.pdf')
                or result.path.lower().endswith('.pdf')
            ):
                continue

            valid_urls.append(url)

        # Log through the module-named logger rather than the root logger so
        # records carry this module's name.
        logging.getLogger(__name__).info('Valid URLs are: %s', valid_urls)
        return valid_urls