Spaces:

polygraf-ai
/

article_writer

Runtime error

article_writer / utils.py

Update utils.py

63045b3 verified 6 months ago

1.6 kB

	import re
	from unidecode import unidecode
	# from transformers import AutoTokenizer
	# import yaml
	# import fitz
	# import requests
	# from bs4 import BeautifulSoup
	# from collections import defaultdict

	def remove_accents(input_str):
	    """Return *input_str* after passing it through ``unidecode``.

	    The heavy lifting is delegated entirely to the third-party ``unidecode``
	    function imported at the top of this module; this wrapper adds no
	    processing of its own.
	    """
	    return unidecode(input_str)

	def remove_special_characters(text):
	    """Strip URLs, emoji, hashtags and stray symbols from *text*, then tidy spacing.

	    Steps, in order:
	      1. Delete URLs (``http://``/``https://`` links and ``www.`` links).
	      2. Delete characters falling in the emoji/symbol ranges listed below.
	      3. Delete hashtags (``#`` followed by word characters).
	      4. Delete every character that is not a word character, whitespace,
	         or one of the punctuation marks allowed by the character class.
	      5. Remove whitespace that precedes ``. , ! ? ;``.
	      6. Insert a space after ``. , ! ? ;`` when it is immediately followed
	         by a non-whitespace character.
	      7. Collapse whitespace runs to a single space and strip both ends.

	    Args:
	        text: The string to clean.

	    Returns:
	        The cleaned string.
	    """
	    # The alternation must be a bare `|`. The previous pattern escaped it
	    # (`\|`), which matches a literal pipe character, so ordinary links were
	    # never removed and were later broken up by the punctuation spacing step.
	    text = re.sub(r'https?://\S+|www\.\S+', '', text)
	    emoji_pattern = re.compile(
	        "["
	        u"\U0001F600-\U0001F64F"  # emoticons
	        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
	        u"\U0001F680-\U0001F6FF"  # transport & map symbols
	        u"\U0001F700-\U0001F77F"  # alchemical symbols
	        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
	        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
	        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
	        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
	        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
	        u"\U00002702-\U000027B0"  # Dingbats
	        # NOTE(review): this single range runs from U+24C2 to U+1F251 and
	        # therefore also covers the CJK, kana and Hangul blocks, so text in
	        # those scripts is deleted as well. Left unchanged here; confirm
	        # whether that is intended.
	        u"\U000024C2-\U0001F251"
	        "]+",
	        flags=re.UNICODE,
	    )
	    text = emoji_pattern.sub('', text)
	    # Hashtags are dropped entirely (marker and tag text).
	    text = re.sub(r'#\w+', '', text)
	    # Keep word characters, whitespace and a small set of punctuation.
	    # NOTE(review): `)-;` inside the class is a *range* from ")" to ";", so
	    # "*", "+", "/" and ":" are kept too, in addition to "-" and ";".
	    text = re.sub(r'[^\w\s\d.,!?\'"()-;]', '', text)
	    # "word ," -> "word,"
	    text = re.sub(r'\s+([.,!?;])', r'\1', text)
	    # "word,next" -> "word, next"
	    text = re.sub(r'([.,!?;])(\S)', r'\1 \2', text)
	    # Normalise all whitespace runs to single spaces and trim the ends.
	    text = re.sub(r'\s+', ' ', text).strip()
	    return text

	def remove_special_characters_2(text):
	    """Return *text* keeping only ASCII letters, ASCII digits and spaces.

	    Every run of any other character (punctuation, tabs, newlines,
	    non-ASCII letters, ...) is deleted outright rather than replaced.
	    """
	    disallowed = re.compile(r"[^a-zA-Z0-9 ]+")
	    return disallowed.sub("", text)


	def split_into_sentences(text):
	    """Split *text* into sentences at spaces that follow ``.``, ``!`` or ``?``.

	    The terminating punctuation stays attached to its sentence. Only space
	    characters act as separators; other whitespace (tabs, newlines) does not
	    trigger a split.

	    Args:
	        text: The string to split.

	    Returns:
	        A list of non-empty sentence strings. An empty *text* yields an
	        empty list.
	    """
	    # The lookbehind anchors the split right after a terminator without
	    # consuming it, so only the run of spaces is removed.
	    fragments = re.split(r'(?<=[.!?]) +', text)
	    # re.split returns [''] for empty input and a trailing '' when the text
	    # ends with a terminator followed by spaces; neither is a sentence.
	    return [fragment for fragment in fragments if fragment]