presidio / app.py
omri374's picture
Update app.py
89035f8
raw
history blame
3.79 kB
"""Streamlit app for Presidio."""
import json
from json import JSONEncoder
import pandas as pd
import streamlit as st
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine
from transformers_recognizer import TransformersRecognizer
import spacy
spacy.cli.download("en_core_web_lg")
# Helper methods
@st.cache(allow_output_mutation=True)
def analyzer_engine():
"""Return AnalyzerEngine."""
transformers_recognizer = TransformersRecognizer()
registry = RecognizerRegistry()
registry.add_recognizer(transformers_recognizer)
registry.load_predefined_recognizers()
analyzer = AnalyzerEngine(registry=registry)
return analyzer
@st.cache(allow_output_mutation=True)
def anonymizer_engine():
"""Return AnonymizerEngine."""
return AnonymizerEngine()
def get_supported_entities():
"""Return supported entities from the Analyzer Engine."""
return analyzer_engine().get_supported_entities()
def analyze(**kwargs):
"""Analyze input using Analyzer engine and input arguments (kwargs)."""
if "entities" not in kwargs or "All" in kwargs["entities"]:
kwargs["entities"] = None
return analyzer_engine().analyze(**kwargs)
def anonymize(text, analyze_results):
"""Anonymize identified input using Presidio Abonymizer."""
res = anonymizer_engine().anonymize(text, analyze_results)
return res.text
st.set_page_config(page_title="Presidio demo (English)", layout="wide")
# Side bar
st.sidebar.markdown(
"""
Anonymize PII entities with [presidio](https://aka.ms/presidio), spaCy and a [PHI detection Roberta model](https://huggingface.co/obi/deid_roberta_i2b2).
"""
)
st_entities = st.sidebar.multiselect(
label="Which entities to look for?",
options=get_supported_entities(),
default=list(get_supported_entities()),
)
st_threhsold = st.sidebar.slider(
label="Acceptance threshold", min_value=0.0, max_value=1.0, value=0.35
)
st_return_decision_process = st.sidebar.checkbox("Add analysis explanations in json")
st.sidebar.info(
"Presidio is an open source framework for PII detection and anonymization. "
"For more info visit [aka.ms/presidio](https://aka.ms/presidio)"
)
# Main panel
analyzer_load_state = st.info("Starting Presidio analyzer...")
engine = analyzer_engine()
analyzer_load_state.empty()
# Create two columns for before and after
col1, col2 = st.columns(2)
# Before:
col1.subheader("Input string:")
st_text = col1.text_area(
label="Enter text",
value="Type in some text, "
"like a phone number (212-141-4544) "
"or a name (Lebron James).",
height=400,
)
# After
col2.subheader("Output:")
st_analyze_results = analyze(
text=st_text,
entities=st_entities,
language="en",
score_threshold=st_threhsold,
return_decision_process=st_return_decision_process,
)
st_anonymize_results = anonymize(st_text, st_analyze_results)
col2.text_area(label="", value=st_anonymize_results, height=400)
# table result
st.subheader("Findings")
if st_analyze_results:
df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
df = df[["entity_type", "start", "end", "score"]].rename(
{
"entity_type": "Entity type",
"start": "Start",
"end": "End",
"score": "Confidence",
},
axis=1,
)
st.dataframe(df, width=1000)
else:
st.text("No findings")
# json result
class ToDictListEncoder(JSONEncoder):
"""Encode dict to json."""
def default(self, o):
"""Encode to JSON using to_dict."""
if o:
return o.to_dict()
return []
if st_return_decision_process:
st.json(json.dumps(st_analyze_results, cls=ToDictListEncoder))