import re

try:
    from unidecode import unidecode
except ImportError:
    # Only remove_accents() needs this third-party package; keep the
    # regex-based helpers importable when it is not installed.
    unidecode = None

# from transformers import AutoTokenizer
# import yaml
# import fitz
# import requests
# from bs4 import BeautifulSoup
# from collections import defaultdict

# URLs introduced by http://, https:// or www.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')

# Emoji and pictograph code points.  The ranges are listed explicitly: a
# single catch-all range from U+24C2 to U+1F251 would also swallow CJK,
# kana, Hangul and many other letters that live between those code points.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U0001F1E6-\U0001F1FF"  # regional indicator symbols (flags)
    "\U0001F200-\U0001F251"  # Enclosed Ideographic Supplement
    "]+",
    flags=re.UNICODE,
)

_HASHTAG_PATTERN = re.compile(r'#\w+')

# Everything that is NOT a word character, whitespace or one of . , ! ? ' " ( ) ; -
# The hyphen is placed last so it is a literal and not a range operator.
_DISALLOWED_PATTERN = re.compile(r'[^\w\s.,!?\'"();-]')

_SPACE_BEFORE_PUNCT_PATTERN = re.compile(r'\s+([.,!?;])')
_PUNCT_BEFORE_TEXT_PATTERN = re.compile(r'([.,!?;])(\S)')
_WHITESPACE_PATTERN = re.compile(r'\s+')

_NON_ASCII_ALNUM_PATTERN = re.compile(r"[^a-zA-Z0-9 ]+")

# One or more spaces that follow a sentence-ending '.', '!' or '?'.
_SENTENCE_BOUNDARY_PATTERN = re.compile(r'(?<=[.!?]) +')


def remove_accents(input_str: str) -> str:
    """Return *input_str* transliterated by ``unidecode``.

    Raises:
        ImportError: if the ``unidecode`` package is not installed.
    """
    if unidecode is None:
        raise ImportError(
            "remove_accents() requires the 'unidecode' package"
        )
    return unidecode(input_str)


def remove_special_characters(text: str) -> str:
    """Strip URLs, emoji, hashtags and stray symbols from *text*.

    The steps run in this order:

    1. remove URLs (``http://``, ``https://`` or ``www.`` prefixes);
    2. remove emoji / pictograph characters;
    3. remove hashtags (``#`` followed by word characters);
    4. remove every character that is not a word character, whitespace
       or one of ``. , ! ? ' " ( ) ; -``;
    5. drop whitespace in front of ``. , ! ? ;``;
    6. insert a space after ``. , ! ? ;`` when a non-space character
       follows directly (this also applies inside numbers, so ``3.14``
       becomes ``3. 14``);
    7. collapse whitespace runs to a single space and strip the ends.

    Letters and digits of any script are word characters and are kept.
    """
    text = _URL_PATTERN.sub('', text)
    text = _EMOJI_PATTERN.sub('', text)
    text = _HASHTAG_PATTERN.sub('', text)
    text = _DISALLOWED_PATTERN.sub('', text)
    text = _SPACE_BEFORE_PUNCT_PATTERN.sub(r'\1', text)
    text = _PUNCT_BEFORE_TEXT_PATTERN.sub(r'\1 \2', text)
    return _WHITESPACE_PATTERN.sub(' ', text).strip()


def remove_special_characters_2(text: str) -> str:
    """Return *text* with everything except ASCII letters, digits and spaces removed."""
    return _NON_ASCII_ALNUM_PATTERN.sub("", text)


def split_into_sentences(text: str) -> list:
    """Split *text* after each ``.``, ``!`` or ``?`` that is followed by spaces.

    The separating spaces are dropped; the punctuation stays attached to
    the preceding sentence.  Only the space character separates sentences
    (not tabs or newlines).  An empty string yields ``['']``.
    """
    return _SENTENCE_BOUNDARY_PATTERN.split(text)