alexkueck committed on
Commit
e455e2a
·
1 Parent(s): 873c46f

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +234 -0
utils.py CHANGED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Type
3
+ import logging
4
+ import json
5
+ import os
6
+ import datetime
7
+ import hashlib
8
+ import csv
9
+ import requests
10
+ import re
11
+ import html
12
+ import markdown2
13
+ import torch
14
+ import sys
15
+ import gc
16
+ from pygments.lexers import guess_lexer, ClassNotFound
17
+
18
+ import gradio as gr
19
+ from pypinyin import lazy_pinyin
20
+ import tiktoken
21
+ import mdtex2html
22
+ from markdown import markdown
23
+ from pygments import highlight
24
+ from pygments.lexers import guess_lexer,get_lexer_by_name
25
+ from pygments.formatters import HtmlFormatter
26
+ from beschreibungen import *
27
+
28
# Root logger setup performed at import time: INFO threshold, and every record
# carries a timestamp, the level name and the emitting file name / line number.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
    level=logging.INFO,
)
32
+
33
+
34
def markdown_to_html_with_syntax_highlight(md_str):
    """Convert Markdown to HTML, highlighting fenced code blocks with Pygments.

    Every fenced block of the form "```lang\\n...\\n```" is replaced by
    ``<pre><code class="lang">...</code></pre>`` containing the Pygments
    output; the resulting text is then passed through ``markdown``.

    Args:
        md_str: Markdown source text.

    Returns:
        The HTML string produced by ``markdown`` after the substitution.
    """
    fence_regex = r"```(\w+)?\n([\s\S]+?)\n```"

    def _render_fence(found):
        source = found.group(2)
        tag = (found.group(1) or "text").strip()
        if tag == "text":
            # No usable language tag: let Pygments guess and keep the
            # guessed lexer's display name as the tag.
            tag = guess_lexer(source).name
        try:
            lexer = get_lexer_by_name(tag, stripall=True)
        except ValueError:
            # Name lookup failed; fall back to the Python lexer.
            lexer = get_lexer_by_name("python", stripall=True)
        rendered = highlight(source, lexer, HtmlFormatter())
        return f'<pre><code class="{tag}">{rendered}</code></pre>'

    substituted = re.sub(fence_regex, _render_fence, md_str, flags=re.MULTILINE)
    return markdown(substituted)
59
+
60
+
61
def normalize_markdown(md_text: str) -> str:
    """Tidy blank lines around Markdown list items.

    * A blank line is inserted before the first item of a list when the
      preceding line is not already blank.
    * A blank line met while inside a list is dropped when it is the last
      line or when the next line is another list item; otherwise it is kept.

    Args:
        md_text: Markdown text, lines separated by "\\n".

    Returns:
        The normalized text, lines joined with "\\n".
    """
    item_re = re.compile(r"^(\d+\.|-|\*|\+)\s")

    def _is_item(raw: str) -> bool:
        # The marker test is done on the stripped line, so indentation is
        # ignored but the marker must still be followed by whitespace.
        return item_re.match(raw.strip()) is not None

    source = md_text.split("\n")
    final_index = len(source) - 1
    output = []
    in_list = False

    for idx, raw in enumerate(source):
        if _is_item(raw):
            if idx > 0 and not in_list and source[idx - 1].strip() != "":
                output.append("")
            in_list = True
            output.append(raw)
            continue

        if in_list and raw.strip() == "":
            # Stay in "list" state; only keep the blank line when something
            # other than a list item follows it.
            if idx < final_index and not _is_item(source[idx + 1]):
                output.append(raw)
            continue

        in_list = False
        output.append(raw)

    return "\n".join(output)
83
+
84
+
85
def convert_mdtext(md_text):
    """Render chat Markdown to HTML and append ``ALREADY_CONVERTED_MARK``.

    The text is split into alternating prose and fenced-code segments
    (an unterminated fence runs to the end of the text). Prose segments are
    normalized with ``normalize_markdown`` and rendered with ``markdown``
    when they contain inline code, otherwise with ``mdtex2html.convert``;
    code segments go through ``markdown_to_html_with_syntax_highlight``.

    Args:
        md_text: Markdown source text.

    Returns:
        The concatenated HTML followed by ``ALREADY_CONVERTED_MARK``.
    """
    fence_re = re.compile(r"```(.*?)(?:```|$)", re.DOTALL)
    inline_re = re.compile(r"`(.*?)`", re.DOTALL)

    fenced_parts = fence_re.findall(md_text)
    # split() interleaves the captured group, so every second element is prose.
    prose_parts = fence_re.split(md_text)[::2]

    pieces = []
    # There is one more prose part than code parts; pad the code list with "".
    for prose, fenced in zip(prose_parts, fenced_parts + [""]):
        if prose.strip():
            prose = normalize_markdown(prose)
            if inline_re.search(prose):
                rendered = markdown(prose, extensions=["tables"])
            else:
                rendered = mdtex2html.convert(prose, extensions=["tables"])
            pieces.append(rendered)
        if fenced.strip():
            refenced = f"\n```{fenced}\n\n```"
            pieces.append(markdown_to_html_with_syntax_highlight(refenced))

    return "".join(pieces) + ALREADY_CONVERTED_MARK
106
+
107
def convert_asis(userinput):
    """Wrap HTML-escaped text in a whitespace-preserving paragraph.

    Args:
        userinput: Raw text to display verbatim.

    Returns:
        A ``<p style="white-space:pre-wrap;">`` element holding the escaped
        text, followed by ``ALREADY_CONVERTED_MARK``.
    """
    escaped = html.escape(userinput)
    paragraph = f'<p style="white-space:pre-wrap;">{escaped}</p>'
    return paragraph + ALREADY_CONVERTED_MARK
109
+
110
def detect_converted_mark(userinput):
    """Return True when ``userinput`` ends with ``ALREADY_CONVERTED_MARK``."""
    return userinput.endswith(ALREADY_CONVERTED_MARK)
115
+
116
+
117
+
118
def detect_language(code):
    """Split the text of a fenced code block into (language, code).

    The first line of the block is treated as the language tag. When the
    text starts with a newline, or is empty / whitespace only, there is no
    tag and the text is returned unchanged with an empty language.

    Args:
        code: Text found between the opening and closing code fences.

    Returns:
        A ``(language, code_without_language)`` tuple; ``language`` is
        lower-cased and is ``""`` when no tag is present.
    """
    if code.startswith("\n"):
        return "", code

    # Drop leading whitespace *before* measuring the first line. The tag was
    # previously taken from the stripped text but sliced off the unstripped
    # text, so leading spaces/tabs shifted the cut and left part of the tag
    # in the returned code.
    trimmed = code.lstrip()
    first_line = trimmed.strip().split("\n", 1)[0]
    if not first_line:
        return "", code

    return first_line.lower(), trimmed[len(first_line):].lstrip()
126
+
127
def convert_to_markdown(text):
    """Prepare plain text so Markdown rendering keeps its visual layout.

    * Every ``$`` is replaced by the entity ``&#36;`` (this happens before
      the line scan, so it also applies inside code fences).
    * Lines inside a fenced code block (delimited by lines starting with
      three backticks) are copied unchanged.
    * For all other lines, leading tabs/spaces become ``&#9;`` / ``&nbsp;``,
      a leading ``#`` is backslash-escaped so it is not read as a heading,
      and the line is terminated with two trailing spaces, which is the
      Markdown hard-line-break marker.

    Args:
        text: Raw text, lines separated by "\\n".

    Returns:
        The converted text; every input line is emitted followed by "\\n".
    """
    text = text.replace("$", "&#36;")

    def replace_leading_tabs_and_spaces(line):
        # Only the indentation prefix is rewritten; the rest of the line is
        # left untouched.
        body = line.lstrip("\t ")
        indent = line[: len(line) - len(body)]
        return indent.replace("\t", "&#9;").replace(" ", "&nbsp;") + body

    chunks = []
    in_code_block = False

    for line in text.split("\n"):
        if line.startswith("```"):
            # A fence line either opens or closes a code block.
            in_code_block = not in_code_block
            chunks.append(f"{line}\n")
        elif in_code_block:
            chunks.append(f"{line}\n")
        else:
            line = replace_leading_tabs_and_spaces(line)
            line = re.sub(r"^(#)", r"\\\1", line)
            # Two trailing spaces are required for a hard line break; a
            # single one is ignored by Markdown renderers.
            chunks.append(f"{line}  \n")

    # Join once instead of repeated string concatenation.
    return "".join(chunks)
160
+
161
def add_language_tag(text):
    """Add a language tag to fenced code blocks that do not have one.

    Blocks that already carry a tag are returned unchanged. For untagged
    blocks the language is guessed with Pygments' ``guess_lexer`` and the
    lower-cased lexer name is inserted after the opening fence; when no
    lexer is found the block is left as it was.

    Args:
        text: Markdown text that may contain fenced code blocks.

    Returns:
        The text with language tags added where one could be guessed.
    """
    # Language tag and code are captured separately and the closing fence is
    # left outside the groups. Previously the closing fence was part of the
    # captured block and a second one was appended, so every rewritten block
    # ended with six backticks, and the fence was fed to the lexer guess.
    fence_re = re.compile(r"```(\w*)\n([^`]+)```")

    def _guess_language(code):
        try:
            return guess_lexer(code).name.lower()
        except ClassNotFound:
            return ""

    def _retag(match):
        language, code = match.group(1), match.group(2)
        if language:
            # Already tagged: keep the block exactly as written.
            return match.group(0)
        return f"```{_guess_language(code)}\n{code}```"

    return fence_re.sub(_retag, text)
184
+
185
def delete_last_conversation(chatbot, history):
    """Remove the most recent entry from both conversation lists, in place.

    Args:
        chatbot: List of displayed chat entries; its last item is popped
            when the list is non-empty.
        history: List of history entries; its last item is popped when the
            list is non-empty.

    Returns:
        ``(chatbot, history, "Delete Done")`` - the same (mutated) list
        objects plus a status message.
    """
    for entries in (chatbot, history):
        if len(entries) > 0:
            entries.pop()

    status = "Delete Done"
    return chatbot, history, status
197
+
198
def reset_state():
    """Return two fresh empty lists and the status message "Reset Done"."""
    first, second = [], []
    return first, second, "Reset Done"
200
+
201
def reset_textbox():
    """Return a Gradio update that sets a value to "" plus an empty string."""
    cleared = gr.update(value="")
    return cleared, ""
203
+
204
def cancel_outputing():
    """Return the status message "Stop Done"."""
    status = "Stop Done"
    return status
206
+
207
+
208
class State:
    """Holder for a boolean ``interrupted`` flag."""

    # Class-level default; interrupt()/recover() set the attribute on the
    # instance they are called on.
    interrupted: bool = False

    def interrupt(self):
        """Set this instance's ``interrupted`` flag to True."""
        self.interrupted = True

    def recover(self):
        """Set this instance's ``interrupted`` flag back to False."""
        self.interrupted = False


# Module-level instance created at import time.
shared_state = State()
217
+
218
+
219
+
220
+
221
+
222
+
223
+
224
+
225
def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
    """Tell whether ``s`` ends with a stop word or with the start of one.

    Args:
        s: Text to inspect.
        stop_words: Stop-word strings.

    Returns:
        True when ``s`` ends with any stop word in full, or with any
        non-empty proper prefix of one; False otherwise.
    """
    return any(
        s.endswith(word)
        or any(s.endswith(word[:cut]) for cut in range(1, len(word)))
        for word in stop_words
    )
233
+
234
+