import os
import re
from difflib import SequenceMatcher
from bigotry_dict import bigotry_dict
# NOTE: this script takes a long time to run; it can also be submitted as a batch job with "qsub manual_keyword_check.sh".

# Path to save the output
output_file_path = "output.txt"

# Directory tree that is searched recursively for .txt files
INPUT_DIR = r'../deed_preprocessing/racist'

# Minimum SequenceMatcher ratio for a word to count as a keyword match
SIMILARITY_THRESHOLD = 0.9

# Number of words taken before the matched word when building the context
CONTEXT_RADIUS = 10

# Words are separated by runs of newlines and/or spaces (compiled once, reused per file)
WORD_SEPARATOR = re.compile(r'[\n ]+')


def find_first_match(words, identifiers, threshold=SIMILARITY_THRESHOLD):
    """Return the index of the first word that fuzzy-matches any identifier.

    Words are tested in order and, for each word, identifiers are tested in
    their iteration order. A word matches when
    ``SequenceMatcher(None, word, identifier).ratio() >= threshold``.

    Args:
        words: Sequence of strings to search.
        identifiers: Iterable of keyword strings (iterating a dict yields its keys).
        threshold: Minimum similarity ratio that counts as a match.

    Returns:
        The index into ``words`` of the first matching word, or ``None`` when
        no word matches.
    """
    # One matcher per identifier: SequenceMatcher caches its analysis of the
    # second sequence, so only the first sequence (the word) is swapped below.
    matchers = [SequenceMatcher(None, "", identifier) for identifier in identifiers]

    for index, word in enumerate(words):
        for matcher in matchers:
            matcher.set_seq1(word)
            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(); failing either one means ratio() cannot reach the
            # threshold, so the expensive computation is skipped without
            # changing the outcome.
            if (matcher.real_quick_ratio() >= threshold
                    and matcher.quick_ratio() >= threshold
                    and matcher.ratio() >= threshold):
                return index
    return None


def context_window(words, index, radius=CONTEXT_RADIUS):
    """Return the words surrounding ``words[index]``.

    The window is ``words[index - radius:index + radius]`` clamped to the
    list bounds, i.e. up to ``radius`` words before the match, the match
    itself, and up to ``radius - 1`` words after it.
    NOTE(review): the window is asymmetric; kept as-is so the output format
    does not change. Confirm whether a symmetric window was intended.
    """
    start = max(0, index - radius)
    stop = min(len(words), index + radius)
    return words[start:stop]


# Open the output file in write mode: results of a previous run are overwritten
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # Walk through the directory
    for root, _dirs, files in os.walk(INPUT_DIR):
        for file in files:
            if not file.endswith('.txt'):
                continue
            txt_file_path = os.path.join(root, file)

            try:
                # Read the raw bytes and decode strictly as UTF-8; opening the
                # file inside the try means an unreadable file is reported and
                # skipped like any other per-file failure.
                with open(txt_file_path, 'rb') as txt_file:
                    decoded_text = txt_file.read().decode('utf-8')
                words = WORD_SEPARATOR.split(decoded_text)

                # Only the first match in each file is reported
                match_index = find_first_match(words, bigotry_dict)
                if match_index is not None:
                    context_str = ' '.join(context_window(words, match_index))

                    # Write to the output file
                    output_file.write(f"Context: {context_str}\n")
                    output_file.write(f"File: {txt_file_path}\n\n")
                    print(txt_file_path)

            except Exception as e:
                # Deliberately broad: this is the per-file boundary of a long
                # batch run, so one bad file is reported and the scan continues.
                print(f"Error processing {file}: {str(e)}")

print(f"Results saved to {output_file_path}")