nhathuy07 committed on
Commit
fb3d342
·
verified ·
1 Parent(s): 8030023

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +19 -11
main.py CHANGED
@@ -36,7 +36,7 @@ from secrets import SystemRandom
36
  from random import randint, sample
37
 
38
  from enum import Enum
39
- from re import sub
40
 
41
  from functools import partial
42
 
@@ -197,18 +197,26 @@ async def __ocr(im, file_id):
197
  return out
198
 
199
  def convert_links_to_text(text):
200
- """Converts all Markdown links in a string to their link text.
201
 
202
- Args:
203
- text: The input string containing Markdown links.
204
-
205
- Returns:
206
- The modified string with links replaced by their text.
207
- """
 
 
 
 
208
 
209
- link_pattern = r"\[(.*?)\]\(.*?\)"
210
- return sub(link_pattern, r"\1", text)
211
 
 
 
 
 
 
212
 
213
  async def __convert2md(inp):
214
  # Use gfm-raw_html to strip styling data from source file
@@ -539,7 +547,7 @@ async def generate_questions(request):
539
 
540
  # __raw_outputs = [await p for p in ptasks]
541
  __raw_outputs = await asyncio.gather(*ptasks)
542
-
543
  for pgph_i, o in enumerate(__raw_outputs):
544
  # print(o)
545
  # print(pgph_i)
 
36
  from random import randint, sample
37
 
38
  from enum import Enum
39
+ from re import sub, findall, escape
40
 
41
  from functools import partial
42
 
 
197
  return out
198
 
199
  def convert_links_to_text(text):
200
+ txt = text
201
 
202
+ # Anything that isn't a square closing bracket
203
+ name_regex = "[^]]+"
204
+ # http:// or https:// followed by anything but a closing paren
205
+ url_regex = "http[s]?://[^)]+"
206
+
207
+ markup_regex = '\[({0})]\(\s*({1})\s*\)'.format(name_regex, url_regex)
208
+
209
+ for match in findall(markup_regex,txt):
210
+ link_str = f"[{match[0]}]({match[1]})"
211
+ txt = txt.replace(link_str, match[0])
212
 
213
+ return txt
 
214
 
215
def remove_wikipedia_footnote_ptrs(text):
    """Strip backslash-escaped numeric footnote markers from ``text``.

    Every occurrence of a literal ``\\[<digits>\\]`` sequence (a backslash,
    an opening bracket, one or more digits, a backslash, a closing bracket)
    is deleted. Unescaped markers such as ``[1]`` are left as they are.

    Args:
        text: The input string to clean.

    Returns:
        The string with all escaped footnote markers removed.
    """
    return sub(r'\\\[\d+\\]', '', text)
220
 
221
  async def __convert2md(inp):
222
  # Use gfm-raw_html to strip styling data from source file
 
547
 
548
  # __raw_outputs = [await p for p in ptasks]
549
  __raw_outputs = await asyncio.gather(*ptasks)
550
+ print(__raw_outputs)
551
  for pgph_i, o in enumerate(__raw_outputs):
552
  # print(o)
553
  # print(pgph_i)