Spaces:
Sleeping
Sleeping
import os | |
import re | |
from difflib import SequenceMatcher | |
from bigotry_dict import bigotry_dict | |
# This can also be submitted with "qsub manual_keyword_check.sh", because it takes a long time to run.
# Path to save the output
output_file_path = "output.txt"

# Directory tree that is walked looking for ``.txt`` files.
_SEARCH_ROOT = r'../deed_preprocessing/racist'
# Minimum SequenceMatcher ratio for a word to count as a keyword hit.
_MATCH_THRESHOLD = 0.9
# Half-width of the context slice taken around a hit (see
# _first_match_context for the exact bounds).
_CONTEXT_WINDOW = 10


def _is_close_match(word, identifier, threshold=_MATCH_THRESHOLD):
    """Return True if ``SequenceMatcher(None, word, identifier).ratio()`` is
    at least *threshold*.

    ``real_quick_ratio()`` and ``quick_ratio()`` are cheap upper bounds on
    ``ratio()``, so testing them first rejects most pairs without running the
    full comparison and never changes the outcome.  The argument order
    (word first, identifier second) is kept because ``ratio()`` is not
    guaranteed to be symmetric.
    """
    matcher = SequenceMatcher(None, word, identifier)
    return (
        matcher.real_quick_ratio() >= threshold
        and matcher.quick_ratio() >= threshold
        and matcher.ratio() >= threshold
    )


def _first_match_context(words, identifiers, threshold=_MATCH_THRESHOLD,
                         window=_CONTEXT_WINDOW):
    """Return the context around the first word that matches any identifier.

    Words are examined in order; the first one that is a close match to any
    entry of *identifiers* ends the search.  The returned string is the
    space-joined slice ``words[i - window : i + window]`` (clamped at the
    start of the list), i.e. up to ``window`` words before the hit, the hit
    itself and up to ``window - 1`` words after it.  Returns ``None`` when no
    word matches.
    """
    for index, word in enumerate(words):
        if any(_is_close_match(word, identifier, threshold)
               for identifier in identifiers):
            start = max(0, index - window)
            # Slicing past the end of the list is safe, so no upper clamp
            # is needed.
            return ' '.join(words[start:index + window])
    return None


# Open the output file in write mode: any previous results are overwritten.
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # Walk through the directory
    for root, dirs, files in os.walk(_SEARCH_ROOT):
        for file in files:
            if not file.endswith('.txt'):
                continue
            txt_file_path = os.path.join(root, file)
            try:
                # Opening inside the try means an unreadable file is reported
                # and skipped instead of aborting the whole scan.
                with open(txt_file_path, 'rb') as txt_file:
                    decoded_text = txt_file.read().decode('utf-8')
                words = re.split(r'[\n ]+', decoded_text)
                # Only the first hit per file is reported.
                context_str = _first_match_context(words, bigotry_dict.keys())
                if context_str is not None:
                    # Write to the output file
                    output_file.write(f"Context: {context_str}\n")
                    output_file.write(f"File: {txt_file_path}\n\n")
                    print(txt_file_path)
            except Exception as e:
                # Best-effort scan: report the problem and move on to the
                # next file.
                print(f"Error processing {file}: {str(e)}")

print(f"Results saved to {output_file_path}")