nan committed on
Commit
c3ad4e9
·
1 Parent(s): 27de169

feat: reformat codes using black

Browse files
Files changed (1) hide show
  1. README.md +35 -16
README.md CHANGED
@@ -83,7 +83,6 @@ To use `ReaderLM-v2` locally:
83
 
84
  ```python
85
  from transformers import AutoModelForCausalLM, AutoTokenizer
86
- import re
87
 
88
  device = "cuda" # or "cpu"
89
  tokenizer = AutoTokenizer.from_pretrained("jinaai/ReaderLM-v2")
@@ -93,14 +92,17 @@ To use `ReaderLM-v2` locally:
93
  3. (Optional) Pre-clean your HTML to remove scripts, styles, comments, to reduce the noise and length of the input:
94
 
95
  ```python
 
 
96
  # Patterns
97
- SCRIPT_PATTERN = r'<[ ]*script.*?\/[ ]*script[ ]*>'
98
- STYLE_PATTERN = r'<[ ]*style.*?\/[ ]*style[ ]*>'
99
- META_PATTERN = r'<[ ]*meta.*?>'
100
- COMMENT_PATTERN = r'<[ ]*!--.*?--[ ]*>'
101
- LINK_PATTERN = r'<[ ]*link.*?>'
102
  BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
103
- SVG_PATTERN = r'(<svg[^>]*>)(.*?)(<\/svg>)'
 
104
 
105
  def replace_svg(html: str, new_content: str = "this is a placeholder") -> str:
106
  return re.sub(
@@ -110,15 +112,27 @@ To use `ReaderLM-v2` locally:
110
  flags=re.DOTALL,
111
  )
112
 
 
113
  def replace_base64_images(html: str, new_image_src: str = "#") -> str:
114
  return re.sub(BASE64_IMG_PATTERN, f'<img src="{new_image_src}"/>', html)
115
 
 
116
  def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False):
117
- html = re.sub(SCRIPT_PATTERN, '', html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
118
- html = re.sub(STYLE_PATTERN, '', html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
119
- html = re.sub(META_PATTERN, '', html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
120
- html = re.sub(COMMENT_PATTERN, '', html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
121
- html = re.sub(LINK_PATTERN, '', html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
 
 
 
 
 
 
 
 
 
 
122
 
123
  if clean_svg:
124
  html = replace_svg(html)
@@ -130,7 +144,9 @@ To use `ReaderLM-v2` locally:
130
  4. Create a prompt for the model:
131
 
132
  ```python
133
- def create_prompt(text: str, tokenizer=None, instruction: str = None, schema: str = None) -> str:
 
 
134
  """
135
  Create a prompt for the model with optional instruction and JSON schema.
136
  """
@@ -157,14 +173,15 @@ To use `ReaderLM-v2` locally:
157
  ### HTML to Markdown Example
158
 
159
  ```python
160
- # Example HTML
161
  html = "<html><body><h1>Hello, world!</h1></body></html>"
162
 
163
  html = clean_html(html)
164
 
165
  input_prompt = create_prompt(html)
166
  inputs = tokenizer.encode(input_prompt, return_tensors="pt").to(device)
167
- outputs = model.generate(inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08)
 
 
168
 
169
  print(tokenizer.decode(outputs[0]))
170
  ```
@@ -197,7 +214,9 @@ html = clean_html(html)
197
  input_prompt = create_prompt(html, schema=schema)
198
 
199
  inputs = tokenizer.encode(input_prompt, return_tensors="pt").to(device)
200
- outputs = model.generate(inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08)
 
 
201
 
202
  print(tokenizer.decode(outputs[0]))
203
  ```
 
83
 
84
  ```python
85
  from transformers import AutoModelForCausalLM, AutoTokenizer
 
86
 
87
  device = "cuda" # or "cpu"
88
  tokenizer = AutoTokenizer.from_pretrained("jinaai/ReaderLM-v2")
 
92
  3. (Optional) Pre-clean your HTML to remove scripts, styles, comments, to reduce the noise and length of the input:
93
 
94
  ```python
95
+ import re
96
+
97
  # Patterns
98
+ SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
99
+ STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
100
+ META_PATTERN = r"<[ ]*meta.*?>"
101
+ COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
102
+ LINK_PATTERN = r"<[ ]*link.*?>"
103
  BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
104
+ SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
105
+
106
 
107
  def replace_svg(html: str, new_content: str = "this is a placeholder") -> str:
108
  return re.sub(
 
112
  flags=re.DOTALL,
113
  )
114
 
115
+
116
  def replace_base64_images(html: str, new_image_src: str = "#") -> str:
117
  return re.sub(BASE64_IMG_PATTERN, f'<img src="{new_image_src}"/>', html)
118
 
119
+
120
  def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False):
121
+ html = re.sub(
122
+ SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL
123
+ )
124
+ html = re.sub(
125
+ STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL
126
+ )
127
+ html = re.sub(
128
+ META_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL
129
+ )
130
+ html = re.sub(
131
+ COMMENT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL
132
+ )
133
+ html = re.sub(
134
+ LINK_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL
135
+ )
136
 
137
  if clean_svg:
138
  html = replace_svg(html)
 
144
  4. Create a prompt for the model:
145
 
146
  ```python
147
+ def create_prompt(
148
+ text: str, tokenizer=None, instruction: str = None, schema: str = None
149
+ ) -> str:
150
  """
151
  Create a prompt for the model with optional instruction and JSON schema.
152
  """
 
173
  ### HTML to Markdown Example
174
 
175
  ```python
 
176
  html = "<html><body><h1>Hello, world!</h1></body></html>"
177
 
178
  html = clean_html(html)
179
 
180
  input_prompt = create_prompt(html)
181
  inputs = tokenizer.encode(input_prompt, return_tensors="pt").to(device)
182
+ outputs = model.generate(
183
+ inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
184
+ )
185
 
186
  print(tokenizer.decode(outputs[0]))
187
  ```
 
214
  input_prompt = create_prompt(html, schema=schema)
215
 
216
  inputs = tokenizer.encode(input_prompt, return_tensors="pt").to(device)
217
+ outputs = model.generate(
218
+ inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08
219
+ )
220
 
221
  print(tokenizer.decode(outputs[0]))
222
  ```