F24-Racist-Deeds / modules /last_year /search_keywords.py
jacob-stein's picture
Migrate Flask backend
97208ad
raw
history blame
2.36 kB
import os
import re
from difflib import SequenceMatcher
from bigotry_dict import bigotry_dict
#can also submit this with "qsub manual_keyword_check.sh" because it takes a long time
# Path to save the output
output_file_path = "output.txt"
# Open the output file in append mode
with open(output_file_path, 'w', encoding='utf-8') as output_file:
# Walk through the directory
for root, dirs, files in os.walk(r'../deed_preprocessing/racist'):
for file in files:
if file.endswith('.txt'):
txt_file_path = os.path.join(root, file)
with open(txt_file_path, 'rb') as txt_file:
try:
# Read and decode the text file
text = txt_file.read()
decoded_text = text.decode('utf-8')
words = re.split(r'[\n ]+', decoded_text)
# Look for matches in the text
found = False
for i in range(len(words)):
if not found:
for identifier in bigotry_dict.keys():
if not found:
similarity_ratio = SequenceMatcher(None, words[i], identifier).ratio()
if similarity_ratio >= 0.9:
# Collect the surrounding words
context = words[max(0, i-10):min(len(words),i+10)]
context_str = ' '.join(context)
# Write to the output file
output_file.write(f"Context: {context_str}\n")
output_file.write(f"File: {txt_file_path}\n\n")
print(txt_file_path)
found = True
else:
break
else:
break
except Exception as e:
print(f"Error processing {file}: {str(e)}")
print(f"Results saved to {output_file_path}")