import re

from unidecode import unidecode
from nltk import sent_tokenize

# from transformers import AutoTokenizer
# import yaml
# import fitz
# import requests
# from bs4 import BeautifulSoup
# from collections import defaultdict


def remove_accents(input_str):
    """Transliterate accented characters to their closest ASCII equivalents."""
    text_no_accents = unidecode(input_str)
    return text_no_accents


def remove_special_characters(text):
    """Strip URLs, emoji, hashtags, and stray symbols, then normalize spacing."""
    # Remove URLs.
    text = re.sub(r"https?://\S+|www\.\S+", "", text)

    # Remove emoji and related pictographic symbols.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub("", text)

    # Remove hashtags.
    text = re.sub(r"#\w+", "", text)

    # Keep word characters, whitespace, and common punctuation
    # (note: ")-;" is a character range, so it also admits * + / : and digits).
    text = re.sub(r'[^\w\s\d.,!?\'"()-;]', "", text)

    # Normalize spacing around punctuation, then collapse runs of whitespace.
    text = re.sub(r"\s+([.,!?;])", r"\1", text)
    text = re.sub(r"([.,!?;])(\S)", r"\1 \2", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


def remove_special_characters_2(text):
    """Keep only ASCII letters, digits, and spaces."""
    pattern = r"[^a-zA-Z0-9 ]+"
    text = re.sub(pattern, "", text)
    return text


def split_into_sentences(text):
    """Naive regex sentence splitter: break after ., !, or ? followed by spaces."""
    sentences = re.split(r"(?<=[.!?]) +", text)
    return sentences


def get_token_length(tokenizer, sentence):
    """Number of tokens the given tokenizer produces for a sentence."""
    return len(tokenizer.tokenize(sentence))


MC_TOKEN_SIZE = 256
BC_TOKEN_SIZE = 333


def split_text_allow_complete_sentences_nltk(text, type_det="bc", tokenizer=None):
    """Split text into chunks of whole sentences, each at most ``max_tokens`` tokens.

    ``type_det`` selects the token budget: "bc" -> BC_TOKEN_SIZE,
    "mc" -> MC_TOKEN_SIZE, "quillbot" -> 256. Chunks shorter than half the
    budget are merged with the following chunk when the result still fits.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    if type_det == "bc":
        max_tokens = BC_TOKEN_SIZE
    elif type_det == "mc":
        max_tokens = MC_TOKEN_SIZE
    elif type_det == "quillbot":
        max_tokens = 256
    else:
        raise ValueError(f"Unknown type_det: {type_det!r}")

    def add_sentence_to_chunk(sentence):
        # Start a new chunk whenever adding this sentence would exceed the budget.
        nonlocal current_chunk, current_length
        sentence_length = get_token_length(tokenizer, sentence)
        if current_length + sentence_length > max_tokens:
            chunks.append((current_chunk, current_length))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length

    for sentence in sentences:
        add_sentence_to_chunk(sentence)
    if current_chunk:
        chunks.append((current_chunk, current_length))

    # Merge undersized chunks (< half the budget) with their successor
    # when the combined length still fits within the budget.
    adjusted_chunks = []
    while chunks:
        chunk = chunks.pop(0)
        if len(chunks) > 0 and chunk[1] < max_tokens / 2:
            next_chunk = chunks.pop(0)
            combined_length = chunk[1] + next_chunk[1]
            if combined_length <= max_tokens:
                adjusted_chunks.append((chunk[0] + next_chunk[0], combined_length))
            else:
                adjusted_chunks.append(chunk)
                chunks.insert(0, next_chunk)
        else:
            adjusted_chunks.append(chunk)

    result_chunks = [" ".join(chunk[0]) for chunk in adjusted_chunks]
    return result_chunks
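

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of sentence-aware chunking with a Hugging Face tokenizer.
# The "bert-base-uncased" checkpoint is an assumption for illustration only;
# any tokenizer exposing .tokenize() works. sent_tokenize needs the NLTK
# "punkt" data ("punkt_tab" on newer NLTK releases) to be available.
if __name__ == "__main__":
    import nltk
    from transformers import AutoTokenizer

    nltk.download("punkt", quiet=True)
    nltk.download("punkt_tab", quiet=True)  # no-op on older NLTK versions
    hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    sample = (
        "Chunking keeps whole sentences together. Each chunk stays under the "
        "token budget for the selected detector type. Short trailing chunks "
        "are merged with the next one when they still fit."
    )
    chunks = split_text_allow_complete_sentences_nltk(
        sample, type_det="bc", tokenizer=hf_tokenizer
    )
    for i, chunk in enumerate(chunks):
        print(i, chunk)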