jacob-stein committed
Commit 97208ad · 1 Parent(s): 8e73170

Migrate Flask backend

Files changed (39)
  1. Dockerfile +11 -5
  2. app.py +114 -5
  3. modules/.DS_Store +0 -0
  4. modules/deed_preprocessing/.DS_Store +0 -0
  5. modules/deed_preprocessing/preprocessor.py +62 -0
  6. modules/deed_preprocessing/spellcheck.py +12 -0
  7. modules/google_cloud_ocr/__init__.py +0 -0
  8. modules/google_cloud_ocr/cloud-creds-template.json +13 -0
  9. modules/google_cloud_ocr/google_cloud_ocr.py +23 -0
  10. modules/google_cloud_ocr/setup.md +59 -0
  11. modules/last_year/OCR.py +11 -0
  12. modules/last_year/__pycache__/racist_chatgpt_analysis.cpython-310.pyc +0 -0
  13. modules/last_year/bigotry_dict.py +102 -0
  14. modules/last_year/env.template +2 -0
  15. modules/last_year/locate.py +30 -0
  16. modules/last_year/manual_keyword_check.sh +3 -0
  17. modules/last_year/pagenum.py +22 -0
  18. modules/last_year/racism_checker_old_pipeline.py +65 -0
  19. modules/last_year/racist_chatgpt_analysis.py +24 -0
  20. modules/last_year/racist_text_query.py +10 -0
  21. modules/last_year/search_keywords.py +51 -0
  22. modules/model_experimentation/bag_of_words_logistic_regression.py +89 -0
  23. modules/model_experimentation/logistic_model.pkl +3 -0
  24. modules/model_experimentation/preprocessed_deeds.pkl +3 -0
  25. modules/model_experimentation/vectorizer.pkl +3 -0
  26. modules/modules.md +19 -0
  27. modules/openai/__pycache__/racist_chatgpt_analysis.cpython-310.pyc +0 -0
  28. modules/openai/batch/batch_instruc.md +57 -0
  29. modules/openai/batch/batch_processing.py +115 -0
  30. modules/openai/batch/cancel_batch.py +18 -0
  31. modules/openai/batch/check_batch_status.py +26 -0
  32. modules/openai/batch/create_batch.py +27 -0
  33. modules/openai/batch/list_batches.py +18 -0
  34. modules/openai/batch/prepare_batch.py +40 -0
  35. modules/openai/batch/retrieve_results.py +23 -0
  36. modules/openai/batch/upload_batch_file.py +23 -0
  37. modules/openai/gpt_extract.py +66 -0
  38. modules/openai/racist_chatgpt_analysis.py +30 -0
  39. requirements.txt +14 -2
Dockerfile CHANGED
@@ -1,16 +1,22 @@
- # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
- # you will also find guides on how best to write your Dockerfile
-
- FROM python:3.9
+ FROM python:3.9-slim

+ # Set up non-root user
  RUN useradd -m -u 1000 user
  USER user
  ENV PATH="/home/user/.local/bin:$PATH"

+ # Set working directory
  WORKDIR /app

+ # Copy and install dependencies
  COPY --chown=user ./requirements.txt requirements.txt
  RUN pip install --no-cache-dir --upgrade -r requirements.txt

+ # Copy all application code
  COPY --chown=user . /app
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+
+ # Expose required port
+ EXPOSE 7860
+
+ # Run the Flask app using Gunicorn
+ CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
app.py CHANGED
@@ -1,7 +1,116 @@
- from fastapi import FastAPI
-
- app = FastAPI()
-
- @app.get("/")
- def greet_json():
-     return {"Hello": "World!"}
+ from flask import Flask, request, jsonify, send_file
+ from flask_cors import CORS
+ import pickle
+ from modules.google_cloud_ocr.google_cloud_ocr import google_cloud_ocr
+ from modules.deed_preprocessing.spellcheck import correct_spelling
+ from modules.deed_preprocessing.preprocessor import preprocess_text
+ from modules.openai.racist_chatgpt_analysis import racist_chatgpt_analysis
+ from modules.model_experimentation.bag_of_words_logistic_regression import predict
+ import pandas as pd
+ import xlsxwriter
+ import re
+
+ app = Flask(__name__)
+ # CORS(app, resources={r"/*": {"origins": "*"}})
+ CORS(app, supports_credentials=True, origins="*")
+
+ with open('modules/model_experimentation/vectorizer.pkl', 'rb') as vec_file:
+     vectorizer = pickle.load(vec_file)
+
+ with open('modules/model_experimentation/logistic_model.pkl', 'rb') as model_file:
+     logistic_model = pickle.load(model_file)
+
+ # Helper to look for the book and page numbers
+ def extract_book_and_page(text):
+     book_numbers = re.findall(r"book\s+(\d+)", text, re.IGNORECASE)
+     page_numbers = re.findall(r"page\s+(\d+)", text, re.IGNORECASE)
+     return book_numbers, page_numbers
+
+ @app.route('/api/upload', methods=['POST'])
+ def upload_file():
+     if 'file' not in request.files:
+         return jsonify({'error': 'No file part in the request'}), 400
+
+     file = request.files['file']
+
+     if file.filename == '':
+         return jsonify({'error': 'No selected file'}), 400
+
+     ocr_engine = request.form.get('ocr_engine', 'google')
+     analysis_method = request.form.get('analysis_method', 'chatgpt')
+
+     try:
+         if ocr_engine == 'google':
+             # Step 1: Get text using Google OCR
+             google_text = google_cloud_ocr(file)
+
+             # Step 2: Pass text through the spell checker
+             spellchecked_text = correct_spelling(google_text)
+
+             # Step 3: Pass text through the preprocessor
+             processed_text = preprocess_text(spellchecked_text)
+
+             # Extract book and page numbers right after spellchecking
+             book_numbers, page_numbers = extract_book_and_page(spellchecked_text)
+
+             # Step 4: Get the names and locations
+             extracted_info = {
+                 "names": processed_text.get("names", []),
+                 "locations": processed_text.get("locations", []),
+                 "book_numbers": book_numbers,
+                 "page_numbers": page_numbers
+             }
+
+             # Step 5: Choose analysis method
+             if analysis_method == 'chatgpt':
+                 analysis_result = racist_chatgpt_analysis(processed_text['original_text'])
+                 return jsonify({
+                     'status': 'success',
+                     'ocr_engine': 'google',
+                     'analysis_method': 'chatgpt',
+                     'original_text': google_text,
+                     'spellchecked_text': spellchecked_text,
+                     'processed_text': processed_text,
+                     'extracted_info': extracted_info,
+                     'result': analysis_result
+                 }), 200
+             elif analysis_method == 'logistic_regression':
+                 lr_result = predict(processed_text, vectorizer, logistic_model)['is_racist']
+                 return jsonify({
+                     'status': 'success',
+                     'ocr_engine': 'google',
+                     'analysis_method': 'logistic_regression',
+                     'original_text': google_text,
+                     'spellchecked_text': spellchecked_text,
+                     'processed_text': processed_text,
+                     'extracted_info': extracted_info,
+                     'result': lr_result
+                 }), 200
+             else:
+                 return jsonify({'error': 'Unsupported analysis method selected'}), 400
+         elif ocr_engine == 'azure':
+             return jsonify({'status': 'success', 'ocr_engine': 'azure', 'text': "fill"}), 200
+         else:
+             return jsonify({'error': 'Unsupported OCR engine selected'}), 400
+     except Exception as e:
+         return jsonify({'error': str(e)}), 500
+
+ @app.route('/api/download_excel', methods=['POST'])
+ def download_excel():
+     try:
+         data = request.get_json()
+         if not data:
+             return jsonify({'error': 'No data provided'}), 400
+
+         df = pd.DataFrame(data)
+         excel_path = 'output.xlsx'
+         with pd.ExcelWriter(excel_path, engine='xlsxwriter') as writer:
+             df.to_excel(writer, index=False, sheet_name='Sheet1')
+
+         return send_file(excel_path, as_attachment=True, download_name='analysis_results.xlsx')
+     except Exception as e:
+         return jsonify({'error': str(e)}), 500
+
+ if __name__ == '__main__':
+     app.run(debug=True, host="0.0.0.0", port=7860)
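For context, a minimal sketch of exercising the two new endpoints from a client, using the `requests` library (not part of this repo). The host and port follow the Dockerfile's `EXPOSE 7860`; the file path and result handling are illustrative:

```python
import requests

BASE_URL = "http://localhost:7860"  # assumes the container is running locally

# POST a deed image to /api/upload; form fields mirror what upload_file() reads
with open("sample_deed.tif", "rb") as f:  # illustrative file name
    resp = requests.post(
        f"{BASE_URL}/api/upload",
        files={"file": f},
        data={"ocr_engine": "google", "analysis_method": "logistic_regression"},
    )
print(resp.status_code, resp.json().get("result"))

# POST rows back to /api/download_excel to receive an .xlsx of the results
rows = [{"file": "sample_deed.tif", "is_racist": resp.json().get("result")}]
excel = requests.post(f"{BASE_URL}/api/download_excel", json=rows)
with open("analysis_results.xlsx", "wb") as out:
    out.write(excel.content)
```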
modules/.DS_Store ADDED
Binary file (6.15 kB)
 
modules/deed_preprocessing/.DS_Store ADDED
Binary file (6.15 kB)
 
modules/deed_preprocessing/preprocessor.py ADDED
@@ -0,0 +1,62 @@
import re
import spacy
from collections import Counter

nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    text = re.sub(r'[\n\r\t]', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    doc = nlp(text)

    result = {
        "original_text": text,
        "sentences": [],
        "pos_groups": {},
        "named_entities": [],
        "dependencies": [],
        "token_offsets": [],
        "word_frequency": {},
        "sentence_lengths": [],
        "pos_counts": {}
    }

    pos_groups = {
        "NOUN": [], "VERB": [], "ADJ": [], "ADV": [], "PROPN": [],
        "DET": [], "AUX": [], "PRON": [], "ADP": [], "NUM": [],
        "PART": [], "PUNCT": [], "INTJ": [], "X": []
    }

    all_tokens = []

    for sent in doc.sents:
        result["sentences"].append(sent.text)
        result["sentence_lengths"].append(len(sent))

        for token in sent:
            pos = token.pos_
            all_tokens.append(token.text)

            if pos in pos_groups:
                pos_groups[pos].append(token.text)

            result["dependencies"].append({
                "token": token.text,
                "dep": token.dep_,
                "head": token.head.text
            })
            result["token_offsets"].append({
                "token": token.text,
                "start": token.idx,
                "end": token.idx + len(token.text)
            })

    result["pos_groups"] = pos_groups
    result["named_entities"] = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
    result["word_frequency"] = dict(Counter(all_tokens))
    result["pos_counts"] = dict(Counter([token.pos_ for token in doc]))

    result["names"] = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    result["locations"] = [ent.text for ent in doc.ents if ent.label_ in {"GPE", "LOC"}]

    return result
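For reference, a minimal usage sketch of the preprocessor (it assumes the `en_core_web_sm` spaCy model is installed, e.g. via `python -m spacy download en_core_web_sm`; the sample text is illustrative):

```python
# Minimal usage sketch; requires the en_core_web_sm spaCy model
from modules.deed_preprocessing.preprocessor import preprocess_text

sample = "This deed is granted by John Smith to Jane Doe in Norfolk County, Massachusetts."
result = preprocess_text(sample)

print(result["names"])       # e.g. ['John Smith', 'Jane Doe'] (exact output depends on the spaCy model)
print(result["locations"])   # e.g. ['Norfolk County', 'Massachusetts']
print(result["pos_counts"])  # counts of part-of-speech tags in the text
```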
modules/deed_preprocessing/spellcheck.py ADDED
@@ -0,0 +1,12 @@
from autocorrect import Speller

# Initialize the Speller instance from autocorrect
spell = Speller(lang='en')

def correct_spelling(text):
    """Correct spelling using Autocorrect."""

    # Correct basic spelling errors using Autocorrect
    corrected_text = spell(text)

    return corrected_text
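A quick sketch of how the spell checker slots into the pipeline (the sample string is illustrative; the exact corrections depend on autocorrect's dictionary):

```python
from modules.deed_preprocessing.spellcheck import correct_spelling

# Illustrative input with OCR-style misspellings
print(correct_spelling("the propertty shall not be resolde"))
# autocorrect fixes common single-word errors word by word
```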
modules/google_cloud_ocr/__init__.py ADDED
File without changes
modules/google_cloud_ocr/cloud-creds-template.json ADDED
@@ -0,0 +1,13 @@
{
  "type": "",
  "project_id": "",
  "private_key_id": "",
  "private_key": "",
  "client_email": "",
  "client_id": "",
  "auth_uri": "",
  "token_uri": "",
  "auth_provider_x509_cert_url": "",
  "client_x509_cert_url": "",
  "universe_domain": ""
}
modules/google_cloud_ocr/google_cloud_ocr.py ADDED
@@ -0,0 +1,23 @@
import os
import io
from dotenv import load_dotenv
from google.cloud import vision

load_dotenv()

google_creds = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')

if google_creds is None:
    raise EnvironmentError("GOOGLE_APPLICATION_CREDENTIALS not set in .env file")

client = vision.ImageAnnotatorClient()

def google_cloud_ocr(tiff_file):
    content = tiff_file.read()
    image = vision.Image(content=content)
    response = client.document_text_detection(image=image)
    text = response.full_text_annotation.text
    if response.error.message:
        raise Exception(f'API Error: {response.error.message}')

    return text
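A minimal sketch of calling this module directly, assuming credentials are configured as described in setup.md; the file path is illustrative. Note the function expects a file-like object, which is also what Flask's `request.files` provides:

```python
from modules.google_cloud_ocr.google_cloud_ocr import google_cloud_ocr

# Importing the module raises if GOOGLE_APPLICATION_CREDENTIALS is not set
with open("sample_deed.tif", "rb") as f:  # illustrative path
    text = google_cloud_ocr(f)

print(text[:500])  # first 500 characters of the recognized text
```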
modules/google_cloud_ocr/setup.md ADDED
@@ -0,0 +1,59 @@
# Google Cloud OCR Setup with Python

---

## Step 1: Create a Google Cloud Project

1. Go to the [Google Cloud Console](https://console.cloud.google.com/).
2. **Create a New Project**:
   - In the top-left corner, click the project dropdown menu and then "New Project."
   - Enter a project name (e.g., "OCR Project") and click "Create."

---

## Step 2: Enable the Cloud Vision API

1. In the **Google Cloud Console**, go to the **Navigation Menu** (three horizontal lines at the top left).
2. Click on **APIs & Services** > **Library**.
3. In the search bar, type **Vision API**.
4. Select **Cloud Vision API** and click **Enable**.

---

## Step 3: Create Service Account Credentials

1. Navigate to **APIs & Services** > **Credentials**.
2. Click on **Create Credentials** > **Service Account**.
3. **Service Account Details**:
   - Give the service account a name (e.g., "vision-api-service-account").
   - Click "Create and Continue."
4. **Grant Permissions**:
   - Choose **Role**: Select "Project" > "Editor" to give your service account sufficient permissions.
   - Click "Continue."
5. **Create JSON Key**:
   - After creating the service account, click on the three dots next to the account.
   - Select "Manage Keys" > "Add Key" > "Create New Key."
   - Choose **JSON** format and download the JSON file. This file contains your credentials.

---

## Step 4: Set Up the `.env` File

1. Create a new file named `.env` in the root directory of your Python project.
2. Add the following line to the `.env` file, replacing the path with the actual path to your downloaded JSON credentials file:

```bash
GOOGLE_APPLICATION_CREDENTIALS=/path-to-your-credentials.json
```

---

## Step 5: Running the script

1. Ensure you have the needed packages:

```bash
pip install google-cloud-vision python-dotenv
```

2. Run the script and see the text files for the outputs.
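To confirm the credentials are picked up before wiring the OCR module into the Flask app, a short check along these lines can help (a sketch, not part of the repository):

```python
import os
from dotenv import load_dotenv
from google.cloud import vision

load_dotenv()
creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
print("Credentials file:", creds)

# Instantiating the client fails early if the JSON key is missing or malformed
client = vision.ImageAnnotatorClient()
print("Vision client created:", client is not None)
```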
modules/last_year/OCR.py ADDED
@@ -0,0 +1,11 @@
from PIL import Image
from io import BytesIO
import pytesseract

def tiff_to_ocr(path):
    img = Image.open(path)
    TempIO = BytesIO()
    img.save(TempIO, format="JPEG")
    img = Image.open(BytesIO(TempIO.getvalue()))

    return pytesseract.image_to_string(img)
modules/last_year/__pycache__/racist_chatgpt_analysis.cpython-310.pyc ADDED
Binary file (1.05 kB)
 
modules/last_year/bigotry_dict.py ADDED
@@ -0,0 +1,102 @@
bigotry_dict = {
    "Irishman": True,
    "irishman": True,
    "Greek": True,
    "greek": True,
    "Portugese": True,
    "portugese": True,
    "Mulatto": True,
    "mulatto": True,
    "Quadroon": True,
    "quadroon": True,
    "Chinaman": True,
    "chinaman": True,
    "Jap": True,
    "jap": True,
    "japs": True,
    "Japs": True,
    "Hebrew": True,
    "hebrew": True,
    "Pole": True,
    "pole": True,
    "French Canadian": True,
    "Canadien": True,
    "Quebecois": True,
    "Quebecker": True,
    "Arab": True,
    "Arabs": True,
    "Turk": True,
    "Turks": True,
    "Frenchman": True,
    "German": True,
    "german": True,
    "Germans": True,
    "germans": True,
    "Spaniard": True,
    "spaniard": True,
    "Spaniards": True,
    "spaniards": True,
    "Slav": True,
    "slav": True,
    "Slavs": True,
    "slavs": True,
    "Russian": True,
    "russian": True,
    "Russians": True,
    "russians": True,
    "Persian": True,
    "persian": True,
    "Persians": True,
    "persians": True,
    "Korean": True,
    "korean": True,
    "Koreans": True,
    "koreans": True,
    "Negro": True,
    "negro": True,
    "Colored": True,
    "colored": True,
    "Polander": True,
    "polander": True,
    "Polish": True,
    "polish": True,
    "Italian": True,
    "italian": True,
    "African": True,
    "african": True,
    "Africans": True,
    "africans": True,
    "Hindu": True,
    "hindu": True,
    "Japanese": True,
    "japanese": True,
    "Chinese": True,
    "chinese": True,
    "Catholic": True,
    "catholic": True,
    "Jew": True,
    "jew": True,
    "Jewish": True,
    "jewish": True,
    "shall not be resold": True,
    "shall not be re-sold": True,
    "shall not be sold": True,
    "white": True,
    "White": True,
    "Whites": True,
    "whites": True,
    "Aryan": True,
    "Aryans": True,
    "aryan": True,
    "aryans": True,
    "Caucasian": True,
    "caucasian": True,
    "Caucasians": True,
    "caucasians": True,
    "race": True,
    "Race": True,
    "races": True,
    "Races": True,
    "Semetic": True,
    "semetic": True,
}
modules/last_year/env.template ADDED
@@ -0,0 +1,2 @@
OPENAI_API_KEY=
OPENAI_ORG_ID=
modules/last_year/locate.py ADDED
@@ -0,0 +1,30 @@
import re

def locate(ocr_text):
    # input: string produced by the ocr
    # output: (1) array of possible page numbers (may include false positives)
    #         (2) array of possible dates
    #         (3) array of possible book numbers
    possible_pages = []
    possible_dates = []
    possible_book = []
    result = ocr_text.split("\n")
    pattern = re.compile(r'Re(?:c|ceived|e|o|a)')
    book_pattern = re.compile(r'B(?:OOK|00K)', re.IGNORECASE)
    for word in result:
        # checks for possible page numbers
        if word.isdigit():
            possible_pages.append(word)
        # checks for rec'd dates
        if re.match(pattern, word):
            # append the entire string for human judgement, as OCR fails to correctly translate years in a few cases
            possible_dates.append(word)
        if re.match(book_pattern, word):
            possible_book.append(word)
    if not possible_pages:
        possible_pages.append("Null")
    if not possible_dates:
        possible_dates.append("Null")
    if not possible_book:
        possible_book.append("Null")
    return possible_pages, possible_dates, possible_book
modules/last_year/manual_keyword_check.sh ADDED
@@ -0,0 +1,3 @@
#!/bin/bash -l

python search_keywords.py
modules/last_year/pagenum.py ADDED
@@ -0,0 +1,22 @@
from PIL import Image

def crop_image(input_path, output_path):
    image = Image.open(input_path)

    width, height = image.size
    left_width = width * 0.1575
    right_start = width * 0.88
    top_height = height * 0.07

    left_crop = image.crop((0, 0, left_width, top_height))

    right_crop = image.crop((right_start, 0, width, top_height))

    result_width = int(left_width + (width - right_start))
    result_image = Image.new('RGB', (result_width, int(top_height)))

    result_image.paste(left_crop, (0, 0))
    result_image.paste(right_crop, (int(left_width), 0))

    result_image.save(output_path, format='TIFF')
    return result_image
modules/last_year/racism_checker_old_pipeline.py ADDED
@@ -0,0 +1,65 @@
from modules.racist_text_query import racist_text_query
from modules.bigotry_dict import bigotry_dict
from modules.OCR import tiff_to_ocr
from modules.racist_chatgpt_analysis import racist_chatgpt_analysis
from modules.locate import locate
from modules.pagenum import crop_image
import os
import pandas as pd

def racism_threshold(file_dir):
    # Create the new folder for cropped images
    cropped_images_dir = os.path.join(file_dir, 'deed page number')
    if not os.path.exists(cropped_images_dir):
        os.makedirs(cropped_images_dir)

    data = []
    for images in os.listdir(file_dir):
        if images.endswith(".tif") or images.endswith(".tiff"):
            image_path = os.path.join(file_dir, images)

            # run ocr on images
            text = tiff_to_ocr(image_path)

            result1 = racist_chatgpt_analysis(text)
            result2 = racist_text_query(text, bigotry_dict)

            a, b, c = locate(text)

            # Define the output path for the cropped image in the new folder
            cropped_image_name = "cropped_" + images
            cropped_image_path = os.path.join(cropped_images_dir, cropped_image_name)

            # Crop the image and save it to the new folder
            crop_image(image_path, cropped_image_path)

            image_path_formatted = cropped_image_path
            # .replace(' ', '%20')
            hyperlink_formula = f'file://{image_path_formatted}'

            # fail safe page number detection
            page = tiff_to_ocr(cropped_image_path)
            fail_safe_page = []
            result = page.split("\n")
            for word in result:
                # checks for possible page numbers
                if word.isdigit():
                    fail_safe_page.append(word)

            if result1 or result2:
                print(images, a, b, c)
                if len(fail_safe_page) != 0:
                    a.append(fail_safe_page)
                data.append([images, a, b[0], c[0], hyperlink_formula])
            else:
                print(images + " : Not Racist")
                # data.append([images, a, b[0], c[0], hyperlink_formula])

    # Include the hyperlink in the DataFrame columns
    df = pd.DataFrame(data, columns=['File Name', 'Probable Page Number', 'Date', 'Book Number', 'Page Link'])
    df.index += 1
    df.to_csv(os.path.join(file_dir, 'Racist Deeds.csv'), index=True)
    df.to_excel(os.path.join(file_dir, 'Racist Deeds.xlsx'), index=True)

racism_threshold('folderpath')
modules/last_year/racist_chatgpt_analysis.py ADDED
@@ -0,0 +1,24 @@
from openai import OpenAI
from dotenv import load_dotenv
import os

load_dotenv()
client = OpenAI(
    organization=os.getenv('OPENAI_ORG_ID'),
    api_key=os.getenv('OPENAI_API_KEY')
)

def racist_chatgpt_analysis(text):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[  # prompt engineering
            {"role": "system", "content": "You are a helpful assistant designed to check if there's any racial content. \
                Please review this document for any racial or discriminatory expressions. \
                If yes, return 'Yes'; if there's none, please return 'No racial content found'."},
            {"role": "user", "content": text}
        ]
    )
    if response.choices[0].message.content == "Yes":
        return True
    else:
        return False
modules/last_year/racist_text_query.py ADDED
@@ -0,0 +1,10 @@
def racist_text_query(text, bigotry_dict):
    words = text.split()
    for word in words:
        if word in bigotry_dict:
            return True
    return False

def read_text(file_path):
    with open(file_path, 'r') as file:
        return file.read()
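A tiny usage sketch of the keyword check, assuming the modules/last_year package is importable from the repository root (the sample strings are illustrative):

```python
from modules.last_year.bigotry_dict import bigotry_dict
from modules.last_year.racist_text_query import racist_text_query

print(racist_text_query("no restrictive language here", bigotry_dict))               # False
print(racist_text_query("may be sold to Caucasians only", bigotry_dict))             # True ("Caucasians" is a dict key)
```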
modules/last_year/search_keywords.py ADDED
@@ -0,0 +1,51 @@
import os
import re
from difflib import SequenceMatcher
from bigotry_dict import bigotry_dict
# can also submit this with "qsub manual_keyword_check.sh" because it takes a long time

# Path to save the output
output_file_path = "output.txt"

# Open the output file in write mode
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # Walk through the directory
    for root, dirs, files in os.walk(r'../deed_preprocessing/racist'):
        for file in files:
            if file.endswith('.txt'):
                txt_file_path = os.path.join(root, file)

                with open(txt_file_path, 'rb') as txt_file:
                    try:
                        # Read and decode the text file
                        text = txt_file.read()
                        decoded_text = text.decode('utf-8')
                        words = re.split(r'[\n ]+', decoded_text)

                        # Look for matches in the text
                        found = False
                        for i in range(len(words)):
                            if not found:
                                for identifier in bigotry_dict.keys():
                                    if not found:
                                        similarity_ratio = SequenceMatcher(None, words[i], identifier).ratio()
                                        if similarity_ratio >= 0.9:
                                            # Collect the surrounding words
                                            context = words[max(0, i-10):min(len(words), i+10)]
                                            context_str = ' '.join(context)

                                            # Write to the output file
                                            output_file.write(f"Context: {context_str}\n")
                                            output_file.write(f"File: {txt_file_path}\n\n")
                                            print(txt_file_path)
                                            found = True
                                    else:
                                        break
                            else:
                                break

                    except Exception as e:
                        print(f"Error processing {file}: {str(e)}")

print(f"Results saved to {output_file_path}")
modules/model_experimentation/bag_of_words_logistic_regression.py ADDED
@@ -0,0 +1,89 @@
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import pickle

def preprocess_bag_of_words(preprocessed_text_list):
    texts = [preprocessed["original_text"] for preprocessed in preprocessed_text_list]

    vectorizer = CountVectorizer()
    bag_of_words = vectorizer.fit_transform(texts)

    bow_df = pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out())

    return bow_df, vectorizer

if __name__ == "__main__":
    preprocessed_data = pd.read_pickle('preprocessed_deeds.pkl')

    texts = preprocessed_data['original_text']
    preprocessed_text_list = texts.apply(lambda x: {"original_text": x}).tolist()

    bow_df, vectorizer = preprocess_bag_of_words(preprocessed_text_list)

    X = bow_df
    y = preprocessed_data['is_racist']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    logistic_model = LogisticRegression(max_iter=1000)
    logistic_model.fit(X_train, y_train)

    # Save the model and vectorizer
    with open('vectorizer.pkl', 'wb') as vec_file:
        pickle.dump(vectorizer, vec_file)
    with open('logistic_model.pkl', 'wb') as model_file:
        pickle.dump(logistic_model, model_file)

    y_pred = logistic_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-racist', 'Racist'], yticklabels=['Non-racist', 'Racist'])
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    y_prob = logistic_model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    feature_importance = pd.Series(logistic_model.coef_[0], index=vectorizer.get_feature_names_out())
    top_features = feature_importance.nlargest(10)

    plt.figure(figsize=(8, 6))
    top_features.plot(kind='barh', color='skyblue')
    plt.title('Top 10 Most Influential Words for Racist Classification')
    plt.xlabel('Coefficient Value')
    plt.ylabel('Word')
    plt.show()

# Function to make predictions based on the trained model
def predict(processed_text, vectorizer, logistic_model):
    bow_text = vectorizer.transform([processed_text["original_text"]])
    prediction = logistic_model.predict(bow_text)
    return {
        'is_racist': bool(prediction[0]),
    }
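For reference, a minimal sketch of reusing the saved artifacts the way app.py does. The pickle paths match the files added in this commit; the sample text is illustrative, and importing the module assumes its plotting dependencies (matplotlib, seaborn, scikit-learn) are installed:

```python
import pickle
from modules.model_experimentation.bag_of_words_logistic_regression import predict

with open('modules/model_experimentation/vectorizer.pkl', 'rb') as vec_file:
    vectorizer = pickle.load(vec_file)
with open('modules/model_experimentation/logistic_model.pkl', 'rb') as model_file:
    logistic_model = pickle.load(model_file)

# predict() only reads the "original_text" field of the preprocessor output
sample = {"original_text": "illustrative deed text"}
print(predict(sample, vectorizer, logistic_model))  # {'is_racist': True} or {'is_racist': False}
```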
modules/model_experimentation/logistic_model.pkl ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bedad659905586b85bd65fd9473b9859d65871a4819df69ccf453a5cfde2229f
size 316116
modules/model_experimentation/preprocessed_deeds.pkl ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8d2d25baf91285511e68a177d9410bfc6ec6b12f54ac9b2d0a49dd4dba7282bf
size 80115994
modules/model_experimentation/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:11d1802aa1d14f29acd8f3914cb6ad066958832b34a532c3fab0bc5f28878c9d
size 223071
modules/modules.md ADDED
@@ -0,0 +1,19 @@
### azure_cloud_ocr

Contains modules related to Microsoft Azure OCR

### data_retrieval

Contains a Google Colab script to fetch TIFFs from Drive

### deed_preprocessing

Includes code to run all TIFFs through OCR and preprocessing. Also contains EDA.

### google_cloud_ocr

Contains modules related to Google Cloud OCR

### last_year

Contains last year's modules, which are not being used.
modules/openai/__pycache__/racist_chatgpt_analysis.cpython-310.pyc ADDED
Binary file (1.23 kB)
 
modules/openai/batch/batch_instruc.md ADDED
@@ -0,0 +1,57 @@
# Calling OpenAI Batch steps

Guide to the OpenAI Batch API --> https://platform.openai.com/docs/guides/batch

---

## Step 1: Prepare Batch File (prepare_batch.py)

1. Prepare a .jsonl file containing your batch requests. Each line represents a single API request.
2. Usage: Run the script to process all files in the specified directory and generate the batch file.

---

## Step 2: Upload Batch File (upload_batch_file.py)

1. Upload the prepared batch file to OpenAI using the Files API.
2. The script uploads batch_input.jsonl and returns the file ID.

---

## Step 3: Create Batch (create_batch.py)

1. Create a batch job using the uploaded file's ID. Add the file ID obtained from running upload_batch_file.py to create_batch.py.
2. Returns a batch ID.

---

## Step 4: Check Batch Status (check_batch_status.py)

1. Use the batch ID from create_batch.py to check the status of the batch job and monitor progress.
   Possible statuses:
   1. validating: Validating the input file.
   2. in_progress: Batch is running.
   3. completed: Batch is finished and results are ready.
   4. failed: Validation failed.
   5. expired: Batch did not complete within the window.
2. Retrieves the output_file_id if completed.

---

## Step 5: Retrieve Results (retrieve_results.py)

1. Download the results using the output_file_id retrieved from the batch status.

---

## Helper scripts

1. Cancel Batch (cancel_batch.py)
   1. Cancel an ongoing batch if required. Changes batch status to cancelling and eventually cancelled.

2. List Batches (list_batches.py)
   1. View all batches created, including their status and metadata.

## Consolidated script that combines all the functionalities (except helpers)

1. batch_processing.py
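For orientation, each line of the .jsonl file that Step 1 produces is a single request object. A sketch of its shape, mirroring what prepare_batch.py writes (the custom_id and prompt text here are illustrative and truncated):

```python
import json

# One request per line in batch_input.jsonl (shape mirrors prepare_batch.py)
entry = {
    "custom_id": "deed_0001.txt",  # illustrative file name used to match results back to inputs
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "gpt-4o-mini",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant designed to check if there's any racial content. ..."},
            {"role": "user", "content": "OCR'd deed text goes here"},
        ],
        "max_tokens": 1000,
    },
}
print(json.dumps(entry))  # written as one line of the .jsonl batch file
```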
modules/openai/batch/batch_processing.py ADDED
@@ -0,0 +1,115 @@
import os
import json
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

class BatchProcessor:
    def __init__(self):
        self.client = OpenAI(
            api_key=os.getenv("OPENAI_API_KEY"),
            organization=os.getenv("OPENAI_ORG_ID")
        )

    def prepare_batch(self, folder_path, output_file):
        """Prepare a batch input file from a folder of text files."""
        with open(output_file, "w") as out_file:
            for filename in os.listdir(folder_path):
                if filename.endswith(".txt"):
                    file_path = os.path.join(folder_path, filename)
                    with open(file_path, "r") as file:
                        text = file.read()

                    batch_entry = {
                        "custom_id": filename,
                        "method": "POST",
                        "url": "/v1/chat/completions",
                        "body": {
                            "model": "gpt-4o-mini",
                            "messages": [
                                {
                                    "role": "system",
                                    "content": (
                                        "You are a helpful assistant designed to check if there's any racial content. "
                                        "Please review this document for any racial or discriminatory expressions. "
                                        "If yes, return 'Yes'; if there's none, please return 'No racial content found'. "
                                        "If there is any doubt or ambiguity, assume the text contains racial content and respond 'Yes'."
                                    )
                                },
                                {"role": "user", "content": text}
                            ],
                            "max_tokens": 1000
                        }
                    }
                    out_file.write(json.dumps(batch_entry) + "\n")
        print(f"Batch file created: {output_file}")

    def upload_batch_file(self, batch_file_path):
        """Upload the prepared batch input file."""
        with open(batch_file_path, "rb") as f:
            batch_input_file = self.client.files.create(
                file=f,
                purpose="batch"
            )
        print(f"Batch input file uploaded. File ID: {batch_input_file.id}")
        return batch_input_file.id

    def create_batch(self, file_id):
        """Create a batch job with the uploaded input file."""
        batch = self.client.batches.create(
            input_file_id=file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={
                "description": "Deed analysis batch"
            }
        )
        print(f"Batch created. Batch ID: {batch.id}")
        return batch.id

    def check_batch_status(self, batch_id):
        """Check the status of a batch job."""
        batch_status = self.client.batches.retrieve(batch_id)
        print(f"Batch Status: {batch_status.status}")
        if batch_status.status == "completed":
            output_file_id = batch_status.output_file_id
            print(f"Output File ID: {output_file_id}")
            return output_file_id
        else:
            return None

    def retrieve_results(self, output_file_id, output_path):
        """Retrieve the results of a completed batch job."""
        file_response = self.client.files.content(output_file_id)
        with open(output_path, "wb") as out_file:
            out_file.write(file_response.read())
        print(f"Batch results downloaded to {output_path}")

if __name__ == "__main__":
    processor = BatchProcessor()

    folder_path = ""
    batch_input_file = "batch_input.jsonl"
    batch_output_file = "batch_output.jsonl"

    # Step 1: Prepare the batch input file
    processor.prepare_batch(folder_path, batch_input_file)

    # Step 2: Upload the batch input file
    file_id = processor.upload_batch_file(batch_input_file)

    # Step 3: Create a batch job
    batch_id = processor.create_batch(file_id)

    # Step 4: Poll for batch status
    import time
    while True:
        output_file_id = processor.check_batch_status(batch_id)
        if output_file_id:
            break
        print("Batch not complete. Retrying in 30 minutes...")
        time.sleep(1800)

    # Step 5: Retrieve the results
    processor.retrieve_results(output_file_id, batch_output_file)
modules/openai/batch/cancel_batch.py ADDED
@@ -0,0 +1,18 @@
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

def cancel_batch(batch_id):
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        organization=os.getenv("OPENAI_ORG_ID")
    )

    client.batches.cancel(batch_id)
    print(f"Batch {batch_id} cancelled.")

if __name__ == "__main__":
    batch_id = ""  # batch id here, obtained from create_batch.py
    cancel_batch(batch_id)
modules/openai/batch/check_batch_status.py ADDED
@@ -0,0 +1,26 @@
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

def check_batch_status(batch_id):
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        organization=os.getenv("OPENAI_ORG_ID")
    )

    batch_status = client.batches.retrieve(batch_id)
    print(f"Batch Status: {batch_status.status}")

    if batch_status.status == "completed":
        output_file_id = batch_status.output_file_id
        print(f"Output File ID: {output_file_id}")
        return output_file_id
    else:
        print(f"Batch Status: {batch_status.status}")
        return None

if __name__ == "__main__":
    batch_id = ""  # batch id here
    check_batch_status(batch_id)
modules/openai/batch/create_batch.py ADDED
@@ -0,0 +1,27 @@
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

def create_batch(file_id):
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        organization=os.getenv("OPENAI_ORG_ID")
    )

    batch = client.batches.create(
        input_file_id=file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": "Deed analysis batch"
        }
    )

    print(f"Batch created. Batch ID: {batch.id}")
    return batch.id

if __name__ == "__main__":
    file_id = ""  # file id here, obtained from running upload_batch_file.py
    create_batch(file_id)
modules/openai/batch/list_batches.py ADDED
@@ -0,0 +1,18 @@
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

def list_batches(limit=10):
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        organization=os.getenv("OPENAI_ORG_ID")
    )

    batches = client.batches.list(limit=limit)
    for batch in batches.data:
        print(f"Batch ID: {batch.id}, Status: {batch.status}")

if __name__ == "__main__":
    list_batches()
modules/openai/batch/prepare_batch.py ADDED
@@ -0,0 +1,40 @@
import os
import json

def prepare_batch(folder_path, output_file):
    with open(output_file, "w") as out_file:
        for filename in os.listdir(folder_path):
            if filename.endswith(".txt"):
                file_path = os.path.join(folder_path, filename)

                # Read the content of the file
                with open(file_path, "r") as file:
                    text = file.read()

                # Create a batch entry
                batch_entry = {
                    "custom_id": filename,
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": "gpt-4o-mini",
                        "messages": [
                            {
                                "role": "system",
                                "content": (
                                    "You are a helpful assistant designed to check if there's any racial content. "
                                    "Please review this document for any racial or discriminatory expressions. "
                                    "If yes, return 'Yes'; if there's none, please return 'No racial content found'. "
                                    "If there is any doubt or ambiguity, assume the text contains racial content and respond 'Yes'."
                                )
                            },
                            {"role": "user", "content": text}
                        ],
                        "max_tokens": 1000
                    }
                }
                out_file.write(json.dumps(batch_entry) + "\n")
    print(f"Batch file created: {output_file}")

if __name__ == "__main__":
    prepare_batch("folder_of_deeds", "batch_input.jsonl")  # add folder of deeds to pass into openai
modules/openai/batch/retrieve_results.py ADDED
@@ -0,0 +1,23 @@
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

def retrieve_results(output_file_id, output_path):
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        organization=os.getenv("OPENAI_ORG_ID")
    )

    file_response = client.files.content(output_file_id)

    # Write the binary content to the output file
    with open(output_path, "wb") as out_file:
        out_file.write(file_response.read())

    print(f"Batch results downloaded to {output_path}")

if __name__ == "__main__":
    output_file_id = ""  # Replace with your actual output file ID
    retrieve_results(output_file_id, "batch_output.jsonl")
modules/openai/batch/upload_batch_file.py ADDED
@@ -0,0 +1,23 @@
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

def upload_batch_file(batch_file_path):
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        organization=os.getenv("OPENAI_ORG_ID")
    )

    with open(batch_file_path, "rb") as f:
        batch_input_file = client.files.create(
            file=f,
            purpose="batch"
        )

    print(f"Batch input file uploaded. File ID: {batch_input_file.id}")
    return batch_input_file.id

if __name__ == "__main__":
    upload_batch_file("batch_input.jsonl")
modules/openai/gpt_extract.py ADDED
@@ -0,0 +1,66 @@
import os
import openai
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

openai.api_key = os.getenv('OPENAI_API_KEY')

folder_path = "racist_deeds_text"

output_csv = "deed_names_locations.csv"

data = []

def extract_names_and_locations(text):
    """
    Extract names and locations from text using OpenAI.
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are an assistant that extracts names and locations from legal text. "
                        "For the given input, identify all names of people (grantors, grantees) and "
                        "locations (addresses, city, county, state). "
                        "Return the names as a comma-separated list and locations as a separate comma-separated list "
                        "strictly in the format:\nNames: [comma-separated names]\nLocations: [comma-separated locations]."
                    )
                },
                {"role": "user", "content": text}
            ]
        )
        output = response.choices[0].message.content.strip()

        names, locations = "", ""

        for line in output.split("\n"):
            if line.startswith("Names:"):
                names = line.replace("Names:", "").strip()
            elif line.startswith("Locations:"):
                locations = line.replace("Locations:", "").strip()

        return names, locations
    except Exception as e:
        print(f"Error extracting names and locations: {e}")
        return "", ""

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)

        with open(file_path, "r") as file:
            text = file.read()

        names, locations = extract_names_and_locations(text)

        data.append({"Filename": filename, "Names": names, "Locations": locations})
        print(f"Processed {filename}")

df = pd.DataFrame(data)
df.to_csv(output_csv, index=False)
print(f"Results saved to {output_csv}")
modules/openai/racist_chatgpt_analysis.py ADDED
@@ -0,0 +1,30 @@
import openai
from dotenv import load_dotenv
import os

load_dotenv()
openai.organization = os.getenv('OPENAI_ORG_ID')
openai.api_key = os.getenv('OPENAI_API_KEY')

def racist_chatgpt_analysis(text):
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant designed to check if there's any racial content. \
                        Please review this document for any racial or discriminatory expressions. \
                        If yes, return 'Yes'; if there's none, please return 'No racial content found'. \
                        If there is any doubt or ambiguity, assume the text contains racial content and respond 'Yes'."
                },
                {"role": "user", "content": text}
            ]
        )
        if response.choices[0].message.content.strip() == "Yes":
            return True
        else:
            return False
    except Exception as e:
        print(f"Error: {e}")
        return False
requirements.txt CHANGED
@@ -1,2 +1,12 @@
- fastapi
- uvicorn[standard]
+ pytesseract
+ tesseract
+ Pillow
+ python-dotenv
+ google-cloud-vision
+ spacy
+ openai==0.28
+ Flask>=2,<3
+ Flask-Cors>=3,<4
+ gunicorn>=20,<21
+ pandas>=1,<2
+ xlsxwriter>=3,<4