srijaydeshpande committed on
Commit
e852fd8
·
verified ·
1 Parent(s): 64364a7
Files changed (1) hide show
  1. app.py +34 -26
app.py CHANGED
@@ -2,14 +2,17 @@ from pdfminer.high_level import extract_pages
2
  from pdfminer.layout import LTTextContainer
3
  from tqdm import tqdm
4
  import re
5
- import io
6
- import zipfile
7
  import gradio as gr
8
  import os
9
  from llama_cpp import Llama
10
- import tempfile
11
  import transformers
12
- import torch
 
 
 
 
 
13
 
14
  def process_document(pdf_path, page_ids=None):
15
 
@@ -66,10 +69,10 @@ def txt_to_html(text):
66
  html_content += "</body></html>"
67
  return html_content
68
 
69
- def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=0.0001, top_probability=0.95):
70
-
71
- prompt = "Task: Please anonymize the following clinical note. Specific Rules: Replace all the following information with the term \"[redacted]\": 1. Redact any strings that is person name 2. Redact any medical staff names 3. Redact any strings that is location or address, such as \"3970 Longview Drive\" 4. Redact any strings that is age of person 5. Redact any dates and IDs 6. Redact clinic and hospital names 7. Redact professions such as \"manager\" 8. Redact any contact information"
72
 
 
 
73
  # output = model.create_chat_completion(
74
  # messages = [
75
  # {"role": "assistant", "content": prompt},
@@ -117,23 +120,26 @@ def mkdir(dir):
117
  if not os.path.exists(dir):
118
  os.makedirs(dir)
119
 
120
- def pdf_to_text(files, prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
121
- zip_buffer = io.BytesIO()
122
- with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zf:
123
- for file in files:
124
- file_name = os.path.basename(file)
125
- file_name_splt = file_name.split('.')
126
- if(len(file_name_splt)>1 and file_name_splt[1]=='pdf'):
127
- page2content = process_document(file, page_ids=[0])
128
- pdftext = page2content[1]
129
- if(pdftext):
130
- anonymized_text = deidentify_doc(pdftext, prompt, maxtokens, temperature, top_probability)
131
- zf.writestr(file_name_splt[0]+'.txt', anonymized_text)
132
- zip_buffer.seek(0)
133
- with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as temp_file:
134
- temp_file.write(zip_buffer.getvalue())
135
- temp_file_path = temp_file.name
136
- return temp_file_path
 
 
 
137
 
138
  # model_id = "D:/llama/meta-llama/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
139
  # model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=32, n_batch=64)
@@ -149,16 +155,18 @@ model = transformers.pipeline(
149
  device="cuda",
150
  )
151
 
 
152
  temp_slider = gr.Slider(minimum=0, maximum=2, value=0.2, label="Temperature Value")
153
  prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
154
  max_tokens = gr.Number(value=600, label="Max Tokens")
155
  input_folder = gr.File(file_count='multiple')
 
156
  output_text = gr.Textbox()
157
  output_path_component = gr.File(label="Select Output Path")
158
  iface = gr.Interface(
159
  fn = pdf_to_text,
160
- inputs = ['files'],
161
- outputs='file',
162
  title='COBIx Endoscopy Report De-Identification',
163
  description="This application assists to remove personal information from the uploaded clinical report",
164
  theme=gr.themes.Soft(),
 
2
  from pdfminer.layout import LTTextContainer
3
  from tqdm import tqdm
4
  import re
 
 
5
  import gradio as gr
6
  import os
7
  from llama_cpp import Llama
8
+ from gpt4all import GPT4All
9
  import transformers
10
+ # from transformers import GemmaTokenizer, AutoModelForCausalLM
11
+ # from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
12
+ import accelerate
13
+ import torch
14
+
15
+ # HF_TOKEN = os.environ.get("HF_TOKEN", None)
16
 
17
  def process_document(pdf_path, page_ids=None):
18
 
 
69
  html_content += "</body></html>"
70
  return html_content
71
 
72
+ def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
 
 
73
 
74
+ # prompt = "Please anonymize the following clinical note. Replace all the following information with the term '[redacted]': Redact any strings that might be a name or initials, patients’ names, doctors’ names, the names Dr., redact any medical staff names, redact any strings that might be a location or address, such as '3970 Longview Drive', redact any strings that look like 'age 37', redact any dates and registration numbers, redact professions such as 'manager', redact any contact information."
75
+
76
  # output = model.create_chat_completion(
77
  # messages = [
78
  # {"role": "assistant", "content": prompt},
 
120
  if not os.path.exists(dir):
121
  os.makedirs(dir)
122
 
123
+ def pdf_to_text(files, output_folder, prompt, maxtokens=600, temperature=1.2, top_probability=0.95):
124
+ output_folder = output_folder.replace('\\','/')
125
+ for file in files:
126
+ file_name = os.path.basename(file)
127
+ file_name_splt = file_name.split('.')
128
+ print('File name is ', file_name)
129
+ print('output folder is ', output_folder)
130
+ if(len(file_name_splt)>1 and file_name_splt[1]=='pdf'):
131
+ page2content = process_document(file, page_ids=[0])
132
+ pdftext = page2content[1]
133
+ if(pdftext):
134
+ anonymized_text = deidentify_doc(pdftext, prompt, maxtokens, temperature, top_probability)
135
+ # html = txt_to_html(display_text)
136
+ # with open('out.html', "w", encoding="utf-8") as file:
137
+ # file.write(html)
138
+ # with open(os.path.join(output_folder, file_name_splt[0]+'.txt'), 'w') as file:
139
+ # # Write some text to the file
140
+ # file.write(anonymized_text)
141
+ display_text = "All selected reports are anonymized and results are saved in " + output_folder
142
+ return anonymized_text
143
 
144
  # model_id = "D:/llama/meta-llama/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
145
  # model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=32, n_batch=64)
 
155
  device="cuda",
156
  )
157
 
158
+ css = ".gradio-container {background: 'logo.png'}"
159
  temp_slider = gr.Slider(minimum=0, maximum=2, value=0.2, label="Temperature Value")
160
  prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
161
  max_tokens = gr.Number(value=600, label="Max Tokens")
162
  input_folder = gr.File(file_count='multiple')
163
+ input_folder_text = gr.Textbox(label='Enter output folder path')
164
  output_text = gr.Textbox()
165
  output_path_component = gr.File(label="Select Output Path")
166
  iface = gr.Interface(
167
  fn = pdf_to_text,
168
+ inputs = ['files', input_folder_text, "textbox", max_tokens, temp_slider, prob_slider],
169
+ outputs=output_text,
170
  title='COBIx Endoscopy Report De-Identification',
171
  description="This application assists to remove personal information from the uploaded clinical report",
172
  theme=gr.themes.Soft(),