import re
from unidecode import unidecode
# from transformers import AutoTokenizer
# import yaml
# import fitz
# import requests
# from bs4 import BeautifulSoup
# from collections import defaultdict

def remove_accents(input_str):
    # Transliterate accented and other non-ASCII characters to their closest ASCII equivalents.
    return unidecode(input_str)

def remove_special_characters(text):
    # Strip URLs (http(s):// and www. forms).
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Strip emoji and other pictographic symbols.
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F700-\U0001F77F"  # alchemical symbols
        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251"  # enclosed characters and misc symbols
        "]+", flags=re.UNICODE)
    text = emoji_pattern.sub('', text)
    # Strip hashtags.
    text = re.sub(r'#\w+', '', text)
    # Keep word characters, whitespace, and common punctuation; drop everything else.
    # The hyphen is escaped so ')-;' is not read as a character range (which would
    # also keep characters such as '*', '+', '/' and ':').
    text = re.sub(r'[^\w\s.,!?\'"()\-;]', '', text)
    # Normalize spacing around punctuation: no space before, one space after.
    text = re.sub(r'\s+([.,!?;])', r'\1', text)
    text = re.sub(r'([.,!?;])(\S)', r'\1 \2', text)
    # Collapse runs of whitespace.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def remove_special_characters_2(text):
    # Stricter variant: keep only ASCII letters, digits, and spaces.
    pattern = r"[^a-zA-Z0-9 ]+"
    return re.sub(pattern, "", text)


def split_into_sentences(text):
    # Naive sentence splitter: break on '.', '!' or '?' followed by one or more spaces.
    sentences = re.split(r'(?<=[.!?]) +', text)
    return sentences
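

# Illustrative usage sketch (not part of the original module): chains the helpers
# above on a hypothetical sample string to show the intended cleaning flow.
if __name__ == "__main__":
    sample = "Café visit 😀 see https://example.com #fun , please !"
    cleaned = remove_special_characters(remove_accents(sample))
    print(cleaned)
    print(split_into_sentences("First sentence. Second one! Third?"))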