|
import re |
|
from html import unescape |
|
|
|
|
|
def clean_text(t: str) -> str:
    """Strip Markdown from *t* and flatten it to single-spaced text.

    The text is first passed through ``clean_markdown``. Newlines, tabs and
    carriage returns are then turned into spaces, runs of spaces are
    collapsed to one, and a space directly before a comma is dropped.

    Args:
        t: Raw, possibly Markdown-formatted, text.

    Returns:
        The cleaned text. Leading or trailing spaces are not trimmed.
    """
    t = clean_markdown(t)

    # Every line break / tab becomes a plain space.
    t = re.sub(r"[\n\t\r]", " ", t)

    # This is the literal two-character sequence caret + "M", not a control
    # character. Presumably it is left behind by tools that print carriage
    # returns in caret notation -- TODO confirm against real inputs.
    t = t.replace("^M", " ")

    # Collapse runs of spaces *before* fixing " ," so that inputs such as
    # "a  ,b" (or "a\n ,b") end up as "a,b" rather than "a ,b".
    t = re.sub(" +", " ", t)
    return t.replace(" ,", ",")
|
|
|
|
|
def clean_markdown(md_text: str) -> str:
    """Remove common Markdown and HTML markup from *md_text*.

    This is a regex-based best-effort cleaner, not a Markdown parser.
    The steps, in order:

    1. Fenced code blocks and inline code spans are deleted, content included.
    2. Images are deleted; links are replaced by their link text.
    3. Line-anchored markup is handled: leading ``#`` heading markers are
       removed, lines starting with ``>`` are deleted entirely, list markers
       (``-``, ``*``, ``+``, ``1.``) are removed, and horizontal rules are
       deleted.
    4. Bold / italic markers are removed, keeping the emphasised text.
    5. Pipe-delimited spans and HTML tags are deleted.
    6. HTML entities are unescaped.

    Args:
        md_text: Markdown-formatted text.

    Returns:
        The text with the markup above removed. Line breaks are mostly kept,
        although the list and rule patterns can swallow adjacent blank lines.
    """
    # --- code, images, links -------------------------------------------
    md_text = re.sub(r'```.*?```', '', md_text, flags=re.DOTALL)
    md_text = re.sub(r'`[^`]*`', '', md_text)
    md_text = re.sub(r'!\[.*?\]\(.*?\)', '', md_text)
    md_text = re.sub(r'\[([^\]]+)\]\(.*?\)', r'\1', md_text)

    # --- line-anchored markup ------------------------------------------
    # Heading markers only count at the start of a line; an unanchored
    # pattern would also eat the "# " in prose such as "C# is great".
    md_text = re.sub(r'^([ \t]*)#+ ', r'\1', md_text, flags=re.MULTILINE)

    # Blockquote lines are dropped as a whole (the quoted text goes too).
    md_text = re.sub(r'^>.*$', '', md_text, flags=re.MULTILINE)

    # Optional indentation applies to bullets *and* numbered items, so
    # nested "  1. item" entries lose their marker as well.
    md_text = re.sub(
        r'^\s*(?:[-*+]|\d+\.)\s+', '', md_text, flags=re.MULTILINE
    )

    md_text = re.sub(r'^\s*[-*_]{3,}\s*$', '', md_text, flags=re.MULTILINE)

    # --- emphasis ------------------------------------------------------
    # Runs after the list / rule steps so that a "*" bullet or a "***" rule
    # is not mistaken for an emphasis delimiter. Underscore delimiters must
    # not touch a word character on the outside, which keeps identifiers
    # like my_var_name intact.
    md_text = re.sub(r'\*\*(.*?)\*\*', r'\1', md_text)
    md_text = re.sub(r'(?<!\w)__(.*?)__(?!\w)', r'\1', md_text)
    md_text = re.sub(r'\*(.*?)\*', r'\1', md_text)
    md_text = re.sub(r'(?<!\w)_(.*?)_(?!\w)', r'\1', md_text)

    # --- tables, HTML --------------------------------------------------
    # Deletes each span running from one "|" to the next on the same line.
    # NOTE(review): for a row like "| a | b | c |" this removes alternate
    # cells and leaves " b " behind; confirm whether dropping the whole row
    # (or keeping all cell text) is the intended behaviour.
    md_text = re.sub(r'\|.*?\|', '', md_text)

    # Tags are stripped before entities are decoded, so escaped markup such
    # as "&lt;b&gt;" survives as the literal text "<b>".
    md_text = re.sub(r'<.*?>', '', md_text)
    return unescape(md_text)
|
|