Spaces:

seanpedrickcase
/

topic_modelling

Running

App Files Files Community

topic_modelling / funcs /anonymiser.py

Sonnyjim

first commit

9dbf344 about 1 year ago

raw

history blame

8.53 kB

	import spacy
	import os

	def is_model_installed(model_name):
	    """Return True when spaCy can load `model_name`, False when it cannot.

	    The check is done by actually loading the model: an ``OSError`` from
	    ``spacy.load`` is treated as "not installed"; any other exception
	    propagates to the caller.
	    """
	    try:
	        spacy.load(model_name)
	    except OSError:
	        return False
	    else:
	        return True

	import subprocess
	import sys

	# Name of the spaCy pipeline that must be available before the rest of the
	# module is imported.
	model_name = "en_core_web_sm"
	if not is_model_installed(model_name):
	    # Run the download with the interpreter executing this module, so the
	    # model lands in the same environment, and fail loudly (check=True) if
	    # the download command exits with an error.
	    subprocess.run(
	        [sys.executable, "-m", "spacy", "download", model_name],
	        check=True,
	    )


	# Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
	#os.system("pip uninstall -y gradio")
	#os.system("pip install gradio==3.50.0")
	#os.system("python -m spacy download en_core_web_lg")

	# The returned pipeline is discarded; the call only makes the import fail
	# early if the model still cannot be loaded.
	spacy.load(model_name)

	import re
	import secrets
	import base64
	import time

	import pandas as pd
	import gradio as gr

	from faker import Faker

	from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
	from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
	from presidio_anonymizer.entities import OperatorConfig



	def anon_consistent_names(df, text_column_index=3):
	    """Replace detected person names in `df` with consistent fake first names.

	    The column at position `text_column_index` is analysed with Presidio and
	    every span tagged ``PERSON`` is collected. Each distinct name is mapped to
	    one fake first name, and that mapping is applied as a whole-word regex
	    replacement across the entire dataframe, so a given real name always
	    becomes the same fake name wherever it appears.

	    Args:
	        df: Dataframe to scrub.
	        text_column_index: Position of the column whose text is scanned for
	            names. Defaults to 3, the position that used to be hard-coded.

	    Returns:
	        A new dataframe with the detected names replaced; `df` itself is not
	        modified.

	    Raises:
	        IndexError: If `df` has no column at `text_column_index`.
	    """
	    # Only the scanned column needs analysing; the replacement below is
	    # still applied to the whole dataframe.
	    scanned = df.iloc[:, [text_column_index]].to_dict(orient="list")

	    analyzer = AnalyzerEngine()
	    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
	    column_result = list(batch_analyzer.analyze_dict(scanned, language="en"))[0]

	    # Read the entity type and offsets straight from the recognizer results
	    # instead of parsing their string form, which failed on rows that had no
	    # detected entities.
	    unique_names = set()
	    for paragraph, findings in zip(column_result.value,
	                                   column_result.recognizer_results):
	        paragraph = str(paragraph)
	        for finding in findings:
	            if finding.entity_type != "PERSON":
	                continue
	            name = paragraph[finding.start:finding.end]
	            # An empty slice would yield a pattern matching every word boundary.
	            if name:
	                unique_names.add(name)

	    if not unique_names:
	        return df.copy()

	    # Same locale string that anonymise_script passes to Faker.
	    fake = Faker("en_UK")

	    # Longest names first (ties alphabetical) so the replacement order is
	    # deterministic and a full name is listed before any shorter name it
	    # contains. Names are escaped because they are used as regex patterns.
	    ordered_names = sorted(unique_names, key=lambda n: (-len(n), n))
	    name_map = {
	        r'\b' + re.escape(name) + r'\b': fake.first_name()
	        for name in ordered_names
	    }

	    return df.replace(name_map, regex=True)

	def detect_file_type(filename):
	    """Detect the file type based on its extension.

	    Args:
	        filename: File name or path to classify.

	    Returns:
	        'csv' for names ending in '.csv', '.csv.gz' or '.zip', 'xlsx' for
	        '.xlsx', and 'parquet' for '.parquet'.

	    Raises:
	        ValueError: If the extension is none of the above.
	    """
	    # str.endswith accepts a tuple, so one call covers all CSV-like suffixes.
	    if filename.endswith(('.csv', '.csv.gz', '.zip')):
	        return 'csv'
	    elif filename.endswith('.xlsx'):
	        return 'xlsx'
	    elif filename.endswith('.parquet'):
	        return 'parquet'
	    else:
	        raise ValueError("Unsupported file type.")

	def read_file(filename):
	    """Load `filename` with the pandas reader matching its detected type.

	    The type comes from detect_file_type, which raises ValueError for an
	    unsupported extension. CSV files are read with ``low_memory=False``.
	    """
	    readers = {
	        'csv': lambda path: pd.read_csv(path, low_memory=False),
	        'xlsx': lambda path: pd.read_excel(path),
	        'parquet': lambda path: pd.read_parquet(path),
	    }

	    reader = readers.get(detect_file_type(filename))
	    if reader is None:
	        return None
	    return reader(filename)

	def anonymise_script(df, chosen_col, anon_strat):
	    """Anonymise one column of `df` with Presidio using the chosen strategy.

	    Args:
	        df: Dataframe holding the text to anonymise.
	        chosen_col: Label of the column to anonymise. Its values are
	            converted to strings before analysis.
	        anon_strat: One of "replace", "redact", "hash", "mask", "encrypt"
	            or "fake_first_name".

	    Returns:
	        A tuple ``(scrubbed_df, out_message)``: a dataframe built from the
	        anonymised values and a status message. For "encrypt" the message
	        also contains the base64-encoded key generated for this call.

	    Raises:
	        ValueError: If `anon_strat` is not one of the supported strategies.
	    """
	    # Generate a 128-bit AES key. Then encode the key using base64 to get a string representation
	    key = secrets.token_bytes(16)  # 128 bits = 16 bytes
	    key_string = base64.b64encode(key).decode('utf-8')

	    # Create faker function (note that it has to receive a value)
	    fake = Faker("en_UK")

	    def fake_first_name(x):
	        return fake.first_name()

	    # One operator configuration per strategy. "encrypt" and
	    # "fake_first_name" only configure the PERSON entity.
	    strategy_configs = {
	        "replace": {"DEFAULT": OperatorConfig("replace")},
	        "redact": {"DEFAULT": OperatorConfig("redact")},
	        "hash": {"DEFAULT": OperatorConfig("hash")},
	        "mask": {"DEFAULT": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 100, "from_end": True})},
	        "encrypt": {"PERSON": OperatorConfig("encrypt", {"key": key_string})},
	        "fake_first_name": {"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})},
	    }

	    # Reject an unknown strategy before doing the expensive analysis.
	    if anon_strat not in strategy_configs:
	        raise ValueError(
	            f"Unsupported anonymisation strategy {anon_strat!r}; "
	            f"expected one of {sorted(strategy_configs)}."
	        )

	    # I think in general people will want to keep date / times
	    keep_date_config = {"DATE_TIME": OperatorConfig("keep")}

	    # Operators must be a single dict keyed by entity type; a set of dicts
	    # cannot even be constructed because dicts are unhashable.
	    combined_config = {**strategy_configs[anon_strat], **keep_date_config}

	    # DataFrame to dict
	    df_dict = pd.DataFrame(data={chosen_col: df[chosen_col].astype(str)}).to_dict(orient="list")

	    analyzer = AnalyzerEngine()
	    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)

	    anonymizer = AnonymizerEngine()
	    batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine=anonymizer)

	    print("Identifying personal data")
	    analyse_tic = time.perf_counter()
	    analyzer_results = list(batch_analyzer.analyze_dict(df_dict, language="en"))
	    analyse_toc = time.perf_counter()
	    analyse_time_out = f"Cleaning the text took {analyse_toc - analyse_tic:0.1f} seconds."
	    print(analyse_time_out)

	    anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config)

	    scrubbed_df = pd.DataFrame(anonymizer_results)

	    # Create reporting message
	    out_message = "Successfully anonymised"

	    if anon_strat == "encrypt":
	        out_message = out_message + ". Your decryption key is " + key_string + "."

	    return scrubbed_df, out_message

	def do_anonymise(in_file, anon_strat, chosen_cols):
	    """Anonymise the chosen columns of the uploaded CSV file(s) and export a CSV.

	    All files are read as comma-delimited CSV and stacked into one dataframe.
	    Each chosen column is anonymised with anonymise_script and written back in
	    place, so the original column order is kept. The result is saved next to
	    the last input file as ``<name>_anon_<anon_strat>.csv``.

	    Args:
	        in_file: Iterable of file objects exposing a ``.name`` path.
	        anon_strat: Strategy name forwarded to anonymise_script.
	        chosen_cols: Column label, or list of column labels, to anonymise.

	    Returns:
	        A tuple ``(out_message, anon_export_file_name)``.

	    Raises:
	        ValueError: If no input file or no column is given.
	    """
	    if not in_file:
	        raise ValueError("No input file was provided to anonymise.")

	    # Accept a single column label as well as a list of labels.
	    if chosen_cols is None:
	        chosen_cols = []
	    elif isinstance(chosen_cols, str):
	        chosen_cols = [chosen_cols]
	    else:
	        chosen_cols = list(chosen_cols)
	    if not chosen_cols:
	        raise ValueError("No columns were chosen to anonymise.")

	    # Load file(s)
	    frames = []
	    last_file_name = None
	    for match_file in in_file:
	        frames.append(pd.read_csv(match_file.name, delimiter=",", low_memory=False))
	        last_file_name = match_file.name

	    # ignore_index gives the stacked rows unique labels; without it rows from
	    # different files share index values.
	    anon_df_out = pd.concat(frames, ignore_index=True)

	    # Anonymise the selected columns one at a time, matching the
	    # (df, chosen_col, anon_strat) signature of anonymise_script.
	    messages = {}
	    for col in chosen_cols:
	        scrubbed_col_df, col_message = anonymise_script(anon_df_out[[col]], col, anon_strat)
	        # Assign by position so index labels cannot misalign the values.
	        anon_df_out[col] = scrubbed_col_df[col].to_numpy()
	        messages[col] = col_message

	    # Identical messages collapse to one; differing ones (for example a
	    # separate decryption key per column) are reported per column.
	    distinct_messages = list(dict.fromkeys(messages.values()))
	    if len(distinct_messages) == 1:
	        out_message = distinct_messages[0]
	    else:
	        out_message = " ".join(f"{col}: {msg}" for col, msg in messages.items())

	    # Export file: strip only a trailing ".csv" from the last input path.
	    out_file_part = re.sub(r'\.csv$', '', last_file_name)

	    anon_export_file_name = out_file_part + "_anon_" + anon_strat + ".csv"

	    anon_df_out.to_csv(anon_export_file_name, index=False)

	    return out_message, anon_export_file_name