srijaydeshpande committed on
Commit
e852fd8
·
verified ·
1 Parent(s): 64364a7
Files changed (1) hide show
  1. app.py +34 -26
app.py CHANGED
@@ -2,14 +2,17 @@ from pdfminer.high_level import extract_pages
2
  from pdfminer.layout import LTTextContainer
3
  from tqdm import tqdm
4
  import re
5
- import io
6
- import zipfile
7
  import gradio as gr
8
  import os
9
  from llama_cpp import Llama
10
- import tempfile
11
  import transformers
12
- import torch
 
 
 
 
 
13
 
14
  def process_document(pdf_path, page_ids=None):
15
 
@@ -66,10 +69,10 @@ def txt_to_html(text):
66
  html_content += "</body></html>"
67
  return html_content
68
 
69
- def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=0.0001, top_probability=0.95):
70
-
71
- prompt = "Task: Please anonymize the following clinical note. Specific Rules: Replace all the following information with the term \"[redacted]\": 1. Redact any strings that is person name 2. Redact any medical staff names 3. Redact any strings that is location or address, such as \"3970 Longview Drive\" 4. Redact any strings that is age of person 5. Redact any dates and IDs 6. Redact clinic and hospital names 7. Redact professions such as \"manager\" 8. Redact any contact information"
72
 
 
 
73
  # output = model.create_chat_completion(
74
  # messages = [
75
  # {"role": "assistant", "content": prompt},
@@ -117,23 +120,26 @@ def mkdir(dir):
117
  if not os.path.exists(dir):
118
  os.makedirs(dir)
119
 
120
- def pdf_to_text(files, prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
121
- zip_buffer = io.BytesIO()
122
- with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zf:
123
- for file in files:
124
- file_name = os.path.basename(file)
125
- file_name_splt = file_name.split('.')
126
- if(len(file_name_splt)>1 and file_name_splt[1]=='pdf'):
127
- page2content = process_document(file, page_ids=[0])
128
- pdftext = page2content[1]
129
- if(pdftext):
130
- anonymized_text = deidentify_doc(pdftext, prompt, maxtokens, temperature, top_probability)
131
- zf.writestr(file_name_splt[0]+'.txt', anonymized_text)
132
- zip_buffer.seek(0)
133
- with tempfile.NamedTemporaryFile(delete=False, suffix='.zip') as temp_file:
134
- temp_file.write(zip_buffer.getvalue())
135
- temp_file_path = temp_file.name
136
- return temp_file_path
 
 
 
137
 
138
  # model_id = "D:/llama/meta-llama/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
139
  # model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=32, n_batch=64)
@@ -149,16 +155,18 @@ model = transformers.pipeline(
149
  device="cuda",
150
  )
151
 
 
152
  temp_slider = gr.Slider(minimum=0, maximum=2, value=0.2, label="Temperature Value")
153
  prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
154
  max_tokens = gr.Number(value=600, label="Max Tokens")
155
  input_folder = gr.File(file_count='multiple')
 
156
  output_text = gr.Textbox()
157
  output_path_component = gr.File(label="Select Output Path")
158
  iface = gr.Interface(
159
  fn = pdf_to_text,
160
- inputs = ['files'],
161
- outputs='file',
162
  title='COBIx Endoscopy Report De-Identification',
163
  description="This application assists to remove personal information from the uploaded clinical report",
164
  theme=gr.themes.Soft(),
 
2
  from pdfminer.layout import LTTextContainer
3
  from tqdm import tqdm
4
  import re
 
 
5
  import gradio as gr
6
  import os
7
  from llama_cpp import Llama
8
+ from gpt4all import GPT4All
9
  import transformers
10
+ # from transformers import GemmaTokenizer, AutoModelForCausalLM
11
+ # from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
12
+ import accelerate
13
+ import torch
14
+
15
+ # HF_TOKEN = os.environ.get("HF_TOKEN", None)
16
 
17
  def process_document(pdf_path, page_ids=None):
18
 
 
69
  html_content += "</body></html>"
70
  return html_content
71
 
72
+ def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
 
 
73
 
74
+ # prompt = "Please anonymize the following clinical note. Replace all the following information with the term '[redacted]': Redact any strings that might be a name or initials, patients’ names, doctors’ names, the names Dr., redact any medical staff names, redact any strings that might be a location or address, such as '3970 Longview Drive', redact any strings that look like 'age 37', redact any dates and registration numbers, redact professions such as 'manager', redact any contact information."
75
+
76
  # output = model.create_chat_completion(
77
  # messages = [
78
  # {"role": "assistant", "content": prompt},
 
120
  if not os.path.exists(dir):
121
  os.makedirs(dir)
122
 
123
+ def pdf_to_text(files, output_folder, prompt, maxtokens=600, temperature=1.2, top_probability=0.95):
124
+ output_folder = output_folder.replace('\\','/')
125
+ for file in files:
126
+ file_name = os.path.basename(file)
127
+ file_name_splt = file_name.split('.')
128
+ print('File name is ', file_name)
129
+ print('output folder is ', output_folder)
130
+ if(len(file_name_splt)>1 and file_name_splt[1]=='pdf'):
131
+ page2content = process_document(file, page_ids=[0])
132
+ pdftext = page2content[1]
133
+ if(pdftext):
134
+ anonymized_text = deidentify_doc(pdftext, prompt, maxtokens, temperature, top_probability)
135
+ # html = txt_to_html(display_text)
136
+ # with open('out.html', "w", encoding="utf-8") as file:
137
+ # file.write(html)
138
+ # with open(os.path.join(output_folder, file_name_splt[0]+'.txt'), 'w') as file:
139
+ # # Write some text to the file
140
+ # file.write(anonymized_text)
141
+ display_text = "All selected reports are anonymized and results are saved in " + output_folder
142
+ return anonymized_text
143
 
144
  # model_id = "D:/llama/meta-llama/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
145
  # model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=32, n_batch=64)
 
155
  device="cuda",
156
  )
157
 
158
+ css = ".gradio-container {background: 'logo.png'}"
159
  temp_slider = gr.Slider(minimum=0, maximum=2, value=0.2, label="Temperature Value")
160
  prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
161
  max_tokens = gr.Number(value=600, label="Max Tokens")
162
  input_folder = gr.File(file_count='multiple')
163
+ input_folder_text = gr.Textbox(label='Enter output folder path')
164
  output_text = gr.Textbox()
165
  output_path_component = gr.File(label="Select Output Path")
166
  iface = gr.Interface(
167
  fn = pdf_to_text,
168
+ inputs = ['files', input_folder_text, "textbox", max_tokens, temp_slider, prob_slider],
169
+ outputs=output_text,
170
  title='COBIx Endoscopy Report De-Identification',
171
  description="This application assists to remove personal information from the uploaded clinical report",
172
  theme=gr.themes.Soft(),