|
"""Streamlit app for Presidio.""" |
|
|
|
import json |
|
from json import JSONEncoder |
|
|
|
import pandas as pd |
|
import streamlit as st |
|
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry |
|
from presidio_anonymizer import AnonymizerEngine |
|
|
|
from transformers_recognizer import TransformersRecognizer |
|
|
|
|
|
import spacy |
|
spacy.cli.download("en_core_web_lg") |
|
|
|
|
|
|
|
@st.cache(allow_output_mutation=True) |
|
def analyzer_engine(): |
|
"""Return AnalyzerEngine.""" |
|
|
|
transformers_recognizer = TransformersRecognizer() |
|
|
|
registry = RecognizerRegistry() |
|
registry.add_recognizer(transformers_recognizer) |
|
registry.load_predefined_recognizers() |
|
|
|
analyzer = AnalyzerEngine(registry=registry) |
|
return analyzer |
|
|
|
|
|
@st.cache(allow_output_mutation=True) |
|
def anonymizer_engine(): |
|
"""Return AnonymizerEngine.""" |
|
return AnonymizerEngine() |
|
|
|
|
|
def get_supported_entities(): |
|
"""Return supported entities from the Analyzer Engine.""" |
|
return analyzer_engine().get_supported_entities() |
|
|
|
|
|
def analyze(**kwargs): |
|
"""Analyze input using Analyzer engine and input arguments (kwargs).""" |
|
if "entities" not in kwargs or "All" in kwargs["entities"]: |
|
kwargs["entities"] = None |
|
return analyzer_engine().analyze(**kwargs) |
|
|
|
|
|
def anonymize(text, analyze_results): |
|
"""Anonymize identified input using Presidio Abonymizer.""" |
|
|
|
res = anonymizer_engine().anonymize(text, analyze_results) |
|
return res.text |
|
|
|
|
|
st.set_page_config(page_title="Presidio demo (English)", layout="wide") |
|
|
|
|
|
st.sidebar.markdown( |
|
""" |
|
Anonymize PII entities with [presidio](https://aka.ms/presidio), spaCy and a [PHI detection Roberta model](https://huggingface.co/obi/deid_roberta_i2b2). |
|
""" |
|
) |
|
|
|
st_entities = st.sidebar.multiselect( |
|
label="Which entities to look for?", |
|
options=get_supported_entities(), |
|
default=list(get_supported_entities()), |
|
) |
|
|
|
st_threhsold = st.sidebar.slider( |
|
label="Acceptance threshold", min_value=0.0, max_value=1.0, value=0.35 |
|
) |
|
|
|
st_return_decision_process = st.sidebar.checkbox("Add analysis explanations in json") |
|
|
|
st.sidebar.info( |
|
"Presidio is an open source framework for PII detection and anonymization. " |
|
"For more info visit [aka.ms/presidio](https://aka.ms/presidio)" |
|
) |
|
|
|
|
|
|
|
analyzer_load_state = st.info("Starting Presidio analyzer...") |
|
engine = analyzer_engine() |
|
analyzer_load_state.empty() |
|
|
|
|
|
|
|
col1, col2 = st.columns(2) |
|
|
|
|
|
col1.subheader("Input string:") |
|
st_text = col1.text_area( |
|
label="Enter text", |
|
value="Type in some text, " |
|
"like a phone number (212-141-4544) " |
|
"or a name (Lebron James).", |
|
height=400, |
|
) |
|
|
|
|
|
col2.subheader("Output:") |
|
|
|
st_analyze_results = analyze( |
|
text=st_text, |
|
entities=st_entities, |
|
language="en", |
|
score_threshold=st_threhsold, |
|
return_decision_process=st_return_decision_process, |
|
) |
|
st_anonymize_results = anonymize(st_text, st_analyze_results) |
|
col2.text_area(label="", value=st_anonymize_results, height=400) |
|
|
|
|
|
|
|
st.subheader("Findings") |
|
if st_analyze_results: |
|
df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results]) |
|
df = df[["entity_type", "start", "end", "score"]].rename( |
|
{ |
|
"entity_type": "Entity type", |
|
"start": "Start", |
|
"end": "End", |
|
"score": "Confidence", |
|
}, |
|
axis=1, |
|
) |
|
|
|
st.dataframe(df, width=1000) |
|
else: |
|
st.text("No findings") |
|
|
|
|
|
|
|
class ToDictListEncoder(JSONEncoder): |
|
"""Encode dict to json.""" |
|
|
|
def default(self, o): |
|
"""Encode to JSON using to_dict.""" |
|
if o: |
|
return o.to_dict() |
|
return [] |
|
|
|
|
|
if st_return_decision_process: |
|
st.json(json.dumps(st_analyze_results, cls=ToDictListEncoder)) |
|
|