import re
from string import punctuation

# Tags whose *content* is not natural language (code snippets etc.); the tag
# pair and everything between them is removed.
_NO_TEXT_TAGS = ("code", "noformat")
_NO_TEXT_TAG_PATTERNS = tuple(
    re.compile(r"\{%s(.*?)\}(.*?)\{%s\}" % (re.escape(tag), re.escape(tag)), re.DOTALL)
    for tag in _NO_TEXT_TAGS
)

# Tags that are removed while the text they wrap is kept.
_ESCAPE_TAGS = ("color", "quote", "anchor", "panel")
_ESCAPE_TAG_PATTERNS = tuple(
    re.compile(r"\{%s(.*?)\}" % re.escape(tag)) for tag in _ESCAPE_TAGS
)

# Literal substrings replaced by a single space.  Note that the first entries
# are the *two-character* sequences backslash + letter (as they appear in
# escaped/serialized text), not real control characters.  Order matters:
# backslash-letter sequences are consumed before the double backslash.
_ESCAPE_STRINGS = (
    "\\r", "\\n", "\\t", "\\f", "\\v", "\"", "\\\\",
    "h1. ", "h2. ", "h3. ", "h4. ", "h5. ", "h6. ",
)

# Prefixes identifying the target of a JIRA link written as [target] or
# [alias|target].
_LINK_STARTERS = ("#", "^", "http://", "https://", "mailto:", "file:", "~")
_LINK_PATTERNS = tuple(
    re.compile(r"\[(.*?\|)?%s(.*?)\]" % re.escape(starter))
    for starter in _LINK_STARTERS
)
_BARE_URL_PATTERN = re.compile(r"\bhttps?://\S+")

# A literal backslash, "x", and two word characters (e.g. the text "\xc3").
_HEX_CODE_PATTERN = re.compile(r"\\x\w\w")

_WHITESPACE_RUN_PATTERN = re.compile(r"\s+")

# Every ASCII punctuation character except the period.
_PUNCTUATION_WITHOUT_PERIOD = punctuation.replace(".", "")


def escape_tags_and_content(text: str) -> str:
    """Remove ``{code}`` / ``{noformat}`` blocks together with their content.

    A block is an opening tag (optionally with parameters, e.g.
    ``{code:java}``), any text including newlines, and the matching closing
    tag.  Such content is not written in natural language, so it is dropped
    entirely.

    :param text: input text containing JIRA markup
    :return: ``text`` with every matching block replaced by an empty string
    """
    for pattern in _NO_TEXT_TAG_PATTERNS:
        text = pattern.sub("", text)
    return text


def escape_tags(text: str) -> str:
    """Remove ``{color}``, ``{quote}``, ``{anchor}`` and ``{panel}`` tags.

    Only the tags themselves (with any parameters up to the closing brace)
    are removed; the text between an opening and closing tag is retained.

    :param text: input text containing JIRA markup
    :return: ``text`` without the listed markup tags
    """
    for pattern in _ESCAPE_TAG_PATTERNS:
        text = pattern.sub("", text)
    return text


def escape_strings(text: str) -> str:
    """Replace escaped control sequences, quotes and heading markers by a space.

    The replaced substrings are the literal two-character sequences
    backslash + ``r``/``n``/``t``/``f``/``v``, the double quote, a double
    backslash, and the JIRA heading markers ``h1. `` to ``h6. ``.  Actual
    control characters (a real newline, tab, ...) are left untouched.

    :param text: input text
    :return: ``text`` with each listed substring replaced by one space
    """
    # Sequential replacement is deliberate: earlier entries are consumed
    # before later ones, which a single alternation regex would not mimic.
    for escape_string in _ESCAPE_STRINGS:
        text = text.replace(escape_string, " ")
    return text


def escape_links(text: str) -> str:
    """Remove JIRA links and bare HTTP(S) URLs.

    Bracketed links of the form ``[target]`` or ``[alias|target]`` are
    removed when the target starts with ``#``, ``^``, ``http://``,
    ``https://``, ``mailto:``, ``file:`` or ``~``.  Afterwards any remaining
    bare ``http://`` / ``https://`` URL (up to the next whitespace) is
    removed as well.

    :param text: input text containing JIRA markup
    :return: ``text`` without the recognized links
    """
    for pattern in _LINK_PATTERNS:
        text = pattern.sub("", text)
    # Run after the bracketed forms so that "[alias|https://...]" is removed
    # as a whole instead of leaving a dangling "[alias|".
    return _BARE_URL_PATTERN.sub("", text)


def escape_hex_character_codes(text: str) -> str:
    """Remove textual hex escapes such as the four characters ``\\xc3``.

    These appear when characters outside the Latin alphabet were converted
    to their hex code representation.

    :param text: input text
    :return: ``text`` with every backslash-x-plus-two-characters code removed
    """
    return _HEX_CODE_PATTERN.sub("", text)


def escape_punctuation_boundaries(text: str) -> str:
    """Strip punctuation from the start and end of every word.

    The text is split on whitespace.  From each word all leading and
    trailing punctuation other than ``.`` is stripped, then any leading
    periods are stripped; a trailing period is kept.  The words are joined
    with single spaces (a word consisting only of punctuation becomes an
    empty string and is still joined).

    :param text: input text
    :return: the cleaned words joined by single spaces
    """
    return " ".join(
        word.strip(_PUNCTUATION_WITHOUT_PERIOD).lstrip(".")
        for word in text.split()
    )


def escape_odd_spaces(text: str) -> str:
    """Collapse whitespace runs into one space and trim both ends.

    :param text: input text
    :return: ``text`` with every run of whitespace replaced by a single
        space and without leading or trailing whitespace
    """
    return _WHITESPACE_RUN_PATTERN.sub(" ", text).strip()