jacob-stein committed
Commit 97208ad · 1 Parent(s): 8e73170

Migrate Flask backend

Files changed (39)
  1. Dockerfile +11 -5
  2. app.py +114 -5
  3. modules/.DS_Store +0 -0
  4. modules/deed_preprocessing/.DS_Store +0 -0
  5. modules/deed_preprocessing/preprocessor.py +62 -0
  6. modules/deed_preprocessing/spellcheck.py +12 -0
  7. modules/google_cloud_ocr/__init__.py +0 -0
  8. modules/google_cloud_ocr/cloud-creds-template.json +13 -0
  9. modules/google_cloud_ocr/google_cloud_ocr.py +23 -0
  10. modules/google_cloud_ocr/setup.md +59 -0
  11. modules/last_year/OCR.py +11 -0
  12. modules/last_year/__pycache__/racist_chatgpt_analysis.cpython-310.pyc +0 -0
  13. modules/last_year/bigotry_dict.py +102 -0
  14. modules/last_year/env.template +2 -0
  15. modules/last_year/locate.py +30 -0
  16. modules/last_year/manual_keyword_check.sh +3 -0
  17. modules/last_year/pagenum.py +22 -0
  18. modules/last_year/racism_checker_old_pipeline.py +65 -0
  19. modules/last_year/racist_chatgpt_analysis.py +24 -0
  20. modules/last_year/racist_text_query.py +10 -0
  21. modules/last_year/search_keywords.py +51 -0
  22. modules/model_experimentation/bag_of_words_logistic_regression.py +89 -0
  23. modules/model_experimentation/logistic_model.pkl +3 -0
  24. modules/model_experimentation/preprocessed_deeds.pkl +3 -0
  25. modules/model_experimentation/vectorizer.pkl +3 -0
  26. modules/modules.md +19 -0
  27. modules/openai/__pycache__/racist_chatgpt_analysis.cpython-310.pyc +0 -0
  28. modules/openai/batch/batch_instruc.md +57 -0
  29. modules/openai/batch/batch_processing.py +115 -0
  30. modules/openai/batch/cancel_batch.py +18 -0
  31. modules/openai/batch/check_batch_status.py +26 -0
  32. modules/openai/batch/create_batch.py +27 -0
  33. modules/openai/batch/list_batches.py +18 -0
  34. modules/openai/batch/prepare_batch.py +40 -0
  35. modules/openai/batch/retrieve_results.py +23 -0
  36. modules/openai/batch/upload_batch_file.py +23 -0
  37. modules/openai/gpt_extract.py +66 -0
  38. modules/openai/racist_chatgpt_analysis.py +30 -0
  39. requirements.txt +14 -2
Dockerfile CHANGED
@@ -1,16 +1,22 @@
- # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
- # you will also find guides on how best to write your Dockerfile
-
- FROM python:3.9
+ FROM python:3.9-slim

+ # Set up non-root user
  RUN useradd -m -u 1000 user
  USER user
  ENV PATH="/home/user/.local/bin:$PATH"

+ # Set working directory
  WORKDIR /app

+ # Copy and install dependencies
  COPY --chown=user ./requirements.txt requirements.txt
  RUN pip install --no-cache-dir --upgrade -r requirements.txt

+ # Copy all application code
  COPY --chown=user . /app
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+
+ # Expose required port
+ EXPOSE 7860
+
+ # Run the Flask app using Gunicorn
+ CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
app.py CHANGED
@@ -1,7 +1,116 @@
- from fastapi import FastAPI
-
- app = FastAPI()
-
- @app.get("/")
- def greet_json():
-     return {"Hello": "World!"}
+ from flask import Flask, request, jsonify, send_file
+ from flask_cors import CORS
+ import pickle
+ from modules.google_cloud_ocr.google_cloud_ocr import google_cloud_ocr
+ from modules.deed_preprocessing.spellcheck import correct_spelling
+ from modules.deed_preprocessing.preprocessor import preprocess_text
+ from modules.openai.racist_chatgpt_analysis import racist_chatgpt_analysis
+ from modules.model_experimentation.bag_of_words_logistic_regression import predict
+ import pandas as pd
+ import xlsxwriter
+ import re
+
+ app = Flask(__name__)
+ # CORS(app, resources={r"/*": {"origins": "*"}})
+ CORS(app, supports_credentials=True, origins="*")
+
+ with open('modules/model_experimentation/vectorizer.pkl', 'rb') as vec_file:
+     vectorizer = pickle.load(vec_file)
+
+ with open('modules/model_experimentation/logistic_model.pkl', 'rb') as model_file:
+     logistic_model = pickle.load(model_file)
+
+ # Helper to look for the book and page numbers
+ def extract_book_and_page(text):
+     book_numbers = re.findall(r"book\s+(\d+)", text, re.IGNORECASE)
+     page_numbers = re.findall(r"page\s+(\d+)", text, re.IGNORECASE)
+     return book_numbers, page_numbers
+
+ @app.route('/api/upload', methods=['POST'])
+ def upload_file():
+     if 'file' not in request.files:
+         return jsonify({'error': 'No file part in the request'}), 400
+
+     file = request.files['file']
+
+     if file.filename == '':
+         return jsonify({'error': 'No selected file'}), 400
+
+     ocr_engine = request.form.get('ocr_engine', 'google')
+     analysis_method = request.form.get('analysis_method', 'chatgpt')
+
+     try:
+         if ocr_engine == 'google':
+             # Step 1: Get text using Google OCR
+             google_text = google_cloud_ocr(file)
+
+             # Step 2: Pass text through the spell checker
+             spellchecked_text = correct_spelling(google_text)
+
+             # Step 3: Pass text through the preprocessor
+             processed_text = preprocess_text(spellchecked_text)
+
+             # Extract book and page numbers right after spellchecking
+             book_numbers, page_numbers = extract_book_and_page(spellchecked_text)
+
+             # Step 4: Get the names and locations
+             extracted_info = {
+                 "names": processed_text.get("names", []),
+                 "locations": processed_text.get("locations", []),
+                 "book_numbers": book_numbers,
+                 "page_numbers": page_numbers
+             }
+
+             # Step 5: Choose analysis method
+             if analysis_method == 'chatgpt':
+                 analysis_result = racist_chatgpt_analysis(processed_text['original_text'])
+                 return jsonify({
+                     'status': 'success',
+                     'ocr_engine': 'google',
+                     'analysis_method': 'chatgpt',
+                     'original_text': google_text,
+                     'spellchecked_text': spellchecked_text,
+                     'processed_text': processed_text,
+                     'extracted_info': extracted_info,
+                     'result': analysis_result
+                 }), 200
+             elif analysis_method == 'logistic_regression':
+                 lr_result = predict(processed_text, vectorizer, logistic_model)['is_racist']
+                 return jsonify({
+                     'status': 'success',
+                     'ocr_engine': 'google',
+                     'analysis_method': 'logistic_regression',
+                     'original_text': google_text,
+                     'spellchecked_text': spellchecked_text,
+                     'processed_text': processed_text,
+                     'extracted_info': extracted_info,
+                     'result': lr_result
+                 }), 200
+             else:
+                 return jsonify({'error': 'Unsupported analysis method selected'}), 400
+         elif ocr_engine == 'azure':
+             return jsonify({'status': 'success', 'ocr_engine': 'azure', 'text': "fill"}), 200
+         else:
+             return jsonify({'error': 'Unsupported OCR engine selected'}), 400
+     except Exception as e:
+         return jsonify({'error': str(e)}), 500
+
+ @app.route('/api/download_excel', methods=['POST'])
+ def download_excel():
+     try:
+         data = request.get_json()
+         if not data:
+             return jsonify({'error': 'No data provided'}), 400
+
+         df = pd.DataFrame(data)
+         excel_path = 'output.xlsx'
+         with pd.ExcelWriter(excel_path, engine='xlsxwriter') as writer:
+             df.to_excel(writer, index=False, sheet_name='Sheet1')
+
+         return send_file(excel_path, as_attachment=True, download_name='analysis_results.xlsx')
+     except Exception as e:
+         return jsonify({'error': str(e)}), 500
+
+ if __name__ == '__main__':
+     app.run(debug=True, host="0.0.0.0", port=7860)
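For context, a minimal sketch of exercising the two new endpoints from a client, using the `requests` library (not part of this repo). The host and port follow the Dockerfile's `EXPOSE 7860`; the file path and result handling are illustrative:

```python
import requests

BASE_URL = "http://localhost:7860"  # assumes the container is running locally

# POST a deed image to /api/upload; form fields mirror what upload_file() reads
with open("sample_deed.tif", "rb") as f:  # illustrative file name
    resp = requests.post(
        f"{BASE_URL}/api/upload",
        files={"file": f},
        data={"ocr_engine": "google", "analysis_method": "logistic_regression"},
    )
print(resp.status_code, resp.json().get("result"))

# POST rows back to /api/download_excel to receive an .xlsx of the results
rows = [{"file": "sample_deed.tif", "is_racist": resp.json().get("result")}]
excel = requests.post(f"{BASE_URL}/api/download_excel", json=rows)
with open("analysis_results.xlsx", "wb") as out:
    out.write(excel.content)
```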
modules/.DS_Store ADDED
Binary file (6.15 kB)
 
modules/deed_preprocessing/.DS_Store ADDED
Binary file (6.15 kB)
 
modules/deed_preprocessing/preprocessor.py ADDED
@@ -0,0 +1,62 @@
import re
import spacy
from collections import Counter

nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    text = re.sub(r'[\n\r\t]', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    doc = nlp(text)

    result = {
        "original_text": text,
        "sentences": [],
        "pos_groups": {},
        "named_entities": [],
        "dependencies": [],
        "token_offsets": [],
        "word_frequency": {},
        "sentence_lengths": [],
        "pos_counts": {}
    }

    pos_groups = {
        "NOUN": [], "VERB": [], "ADJ": [], "ADV": [], "PROPN": [],
        "DET": [], "AUX": [], "PRON": [], "ADP": [], "NUM": [],
        "PART": [], "PUNCT": [], "INTJ": [], "X": []
    }

    all_tokens = []

    for sent in doc.sents:
        result["sentences"].append(sent.text)
        result["sentence_lengths"].append(len(sent))

        for token in sent:
            pos = token.pos_
            all_tokens.append(token.text)

            if pos in pos_groups:
                pos_groups[pos].append(token.text)

            result["dependencies"].append({
                "token": token.text,
                "dep": token.dep_,
                "head": token.head.text
            })
            result["token_offsets"].append({
                "token": token.text,
                "start": token.idx,
                "end": token.idx + len(token.text)
            })

    result["pos_groups"] = pos_groups
    result["named_entities"] = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
    result["word_frequency"] = dict(Counter(all_tokens))
    result["pos_counts"] = dict(Counter([token.pos_ for token in doc]))

    result["names"] = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    result["locations"] = [ent.text for ent in doc.ents if ent.label_ in {"GPE", "LOC"}]

    return result
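For reference, a minimal usage sketch of the preprocessor (it assumes the `en_core_web_sm` spaCy model is installed, e.g. via `python -m spacy download en_core_web_sm`; the sample text is illustrative):

```python
# Minimal usage sketch; requires the en_core_web_sm spaCy model
from modules.deed_preprocessing.preprocessor import preprocess_text

sample = "This deed is granted by John Smith to Jane Doe in Norfolk County, Massachusetts."
result = preprocess_text(sample)

print(result["names"])       # e.g. ['John Smith', 'Jane Doe'] (exact output depends on the spaCy model)
print(result["locations"])   # e.g. ['Norfolk County', 'Massachusetts']
print(result["pos_counts"])  # counts of part-of-speech tags in the text
```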
modules/deed_preprocessing/spellcheck.py ADDED
@@ -0,0 +1,12 @@
from autocorrect import Speller

# Initialize the Speller instance from autocorrect
spell = Speller(lang='en')

def correct_spelling(text):
    """Correct spelling using Autocorrect."""

    # Correct basic spelling errors using Autocorrect
    corrected_text = spell(text)

    return corrected_text
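A quick sketch of how the spell checker slots into the pipeline (the sample string is illustrative; the exact corrections depend on autocorrect's dictionary):

```python
from modules.deed_preprocessing.spellcheck import correct_spelling

# Illustrative input with OCR-style misspellings
print(correct_spelling("the propertty shall not be resolde"))
# autocorrect fixes common single-word errors word by word
```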
modules/google_cloud_ocr/__init__.py ADDED
File without changes
modules/google_cloud_ocr/cloud-creds-template.json ADDED
@@ -0,0 +1,13 @@
{
  "type": "",
  "project_id": "",
  "private_key_id": "",
  "private_key": "",
  "client_email": "",
  "client_id": "",
  "auth_uri": "",
  "token_uri": "",
  "auth_provider_x509_cert_url": "",
  "client_x509_cert_url": "",
  "universe_domain": ""
}
modules/google_cloud_ocr/google_cloud_ocr.py ADDED
@@ -0,0 +1,23 @@
import os
import io
from dotenv import load_dotenv
from google.cloud import vision

load_dotenv()

google_creds = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')

if google_creds is None:
    raise EnvironmentError("GOOGLE_APPLICATION_CREDENTIALS not set in .env file")

client = vision.ImageAnnotatorClient()

def google_cloud_ocr(tiff_file):
    content = tiff_file.read()
    image = vision.Image(content=content)
    response = client.document_text_detection(image=image)
    text = response.full_text_annotation.text
    if response.error.message:
        raise Exception(f'API Error: {response.error.message}')

    return text
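A minimal sketch of calling this module directly, assuming credentials are configured as described in setup.md; the file path is illustrative. Note the function expects a file-like object, which is also what Flask's `request.files` provides:

```python
from modules.google_cloud_ocr.google_cloud_ocr import google_cloud_ocr

# Importing the module raises if GOOGLE_APPLICATION_CREDENTIALS is not set
with open("sample_deed.tif", "rb") as f:  # illustrative path
    text = google_cloud_ocr(f)

print(text[:500])  # first 500 characters of the recognized text
```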
modules/google_cloud_ocr/setup.md ADDED
@@ -0,0 +1,59 @@
# Google Cloud OCR Setup with Python

---

## Step 1: Create a Google Cloud Project

1. Go to the [Google Cloud Console](https://console.cloud.google.com/).
2. **Create a New Project**:
   - In the top-left corner, click the project dropdown menu and then "New Project."
   - Enter a project name (e.g., "OCR Project") and click "Create."

---

## Step 2: Enable the Cloud Vision API

1. In the **Google Cloud Console**, go to the **Navigation Menu** (three horizontal lines at the top left).
2. Click on **APIs & Services** > **Library**.
3. In the search bar, type **Vision API**.
4. Select **Cloud Vision API** and click **Enable**.

---

## Step 3: Create Service Account Credentials

1. Navigate to **APIs & Services** > **Credentials**.
2. Click on **Create Credentials** > **Service Account**.
3. **Service Account Details**:
   - Give the service account a name (e.g., "vision-api-service-account").
   - Click "Create and Continue."
4. **Grant Permissions**:
   - Choose **Role**: Select "Project" > "Editor" to give your service account sufficient permissions.
   - Click "Continue."
5. **Create JSON Key**:
   - After creating the service account, click on the three dots next to the account.
   - Select "Manage Keys" > "Add Key" > "Create New Key."
   - Choose **JSON** format and download the JSON file. This file contains your credentials.

---

## Step 4: Set Up the `.env` File

1. Create a new file named `.env` in the root directory of your Python project.
2. Add the following line to the `.env` file, replacing the path with the actual path to your downloaded JSON credentials file:

```bash
GOOGLE_APPLICATION_CREDENTIALS=/path-to-your-credentials.json
```

---

## Step 5: Running the script

1. Ensure you have the needed packages:

```bash
pip install google-cloud-vision python-dotenv
```

2. Run the script and see the text files for the outputs.
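To confirm the credentials are picked up before wiring the OCR module into the Flask app, a short check along these lines can help (a sketch, not part of the repository):

```python
import os
from dotenv import load_dotenv
from google.cloud import vision

load_dotenv()
creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
print("Credentials file:", creds)

# Instantiating the client fails early if the JSON key is missing or malformed
client = vision.ImageAnnotatorClient()
print("Vision client created:", client is not None)
```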
modules/last_year/OCR.py ADDED
@@ -0,0 +1,11 @@
from PIL import Image
from io import BytesIO
import pytesseract

def tiff_to_ocr(path):
    img = Image.open(path)
    TempIO = BytesIO()
    img.save(TempIO, format="JPEG")
    img = Image.open(BytesIO(TempIO.getvalue()))

    return pytesseract.image_to_string(img)
modules/last_year/__pycache__/racist_chatgpt_analysis.cpython-310.pyc ADDED
Binary file (1.05 kB)
 
modules/last_year/bigotry_dict.py ADDED
@@ -0,0 +1,102 @@
bigotry_dict = {
    "Irishman": True,
    "irishman": True,
    "Greek": True,
    "greek": True,
    "Portugese": True,
    "portugese": True,
    "Mulatto": True,
    "mulatto": True,
    "Quadroon": True,
    "quadroon": True,
    "Chinaman": True,
    "chinaman": True,
    "Jap": True,
    "jap": True,
    "japs": True,
    "Japs": True,
    "Hebrew": True,
    "hebrew": True,
    "Pole": True,
    "pole": True,
    "French Canadian": True,
    "Canadien": True,
    "Quebecois": True,
    "Quebecker": True,
    "Arab": True,
    "Arabs": True,
    "Turk": True,
    "Turks": True,
    "Frenchman": True,
    "German": True,
    "german": True,
    "Germans": True,
    "germans": True,
    "Spaniard": True,
    "spaniard": True,
    "Spaniards": True,
    "spaniards": True,
    "Slav": True,
    "slav": True,
    "Slavs": True,
    "slavs": True,
    "Russian": True,
    "russian": True,
    "Russians": True,
    "russians": True,
    "Persian": True,
    "persian": True,
    "Persians": True,
    "persians": True,
    "Korean": True,
    "korean": True,
    "Koreans": True,
    "koreans": True,
    "Negro": True,
    "negro": True,
    "Colored": True,
    "colored": True,
    "Polander": True,
    "polander": True,
    "Polish": True,
    "polish": True,
    "Italian": True,
    "italian": True,
    "African": True,
    "african": True,
    "Africans": True,
    "africans": True,
    "Hindu": True,
    "hindu": True,
    "Japanese": True,
    "japanese": True,
    "Chinese": True,
    "chinese": True,
    "Catholic": True,
    "catholic": True,
    "Jew": True,
    "jew": True,
    "Jewish": True,
    "jewish": True,
    "shall not be resold": True,
    "shall not be re-sold": True,
    "shall not be sold": True,
    "white": True,
    "White": True,
    "Whites": True,
    "whites": True,
    "Aryan": True,
    "Aryans": True,
    "aryan": True,
    "aryans": True,
    "Caucasian": True,
    "caucasian": True,
    "Caucasians": True,
    "caucasians": True,
    "race": True,
    "Race": True,
    "races": True,
    "Races": True,
    "Semetic": True,
    "semetic": True,
}
modules/last_year/env.template ADDED
@@ -0,0 +1,2 @@
OPENAI_API_KEY=
OPENAI_ORG_ID=
modules/last_year/locate.py ADDED
@@ -0,0 +1,30 @@
import re

def locate(ocr_text):
    # input: string produced by the ocr
    # output: (1) array of possible page numbers (may include false positives)
    #         (2) array of possible dates
    #         (3) array of possible book numbers
    possible_pages = []
    possible_dates = []
    possible_book = []
    result = ocr_text.split("\n")
    pattern = re.compile(r'Re(?:c|ceived|e|o|a)')
    book_pattern = re.compile(r'B(?:OOK|00K)', re.IGNORECASE)
    for word in result:
        # checks for possible page numbers
        if word.isdigit():
            possible_pages.append(word)
        # checks for rec'd dates
        if re.match(pattern, word):
            # append the entire string for human judgement, as OCR fails to correctly translate years in a few cases
            possible_dates.append(word)
        if re.match(book_pattern, word):
            possible_book.append(word)
    if not possible_pages:
        possible_pages.append("Null")
    if not possible_dates:
        possible_dates.append("Null")
    if not possible_book:
        possible_book.append("Null")
    return possible_pages, possible_dates, possible_book
modules/last_year/manual_keyword_check.sh ADDED
@@ -0,0 +1,3 @@
#!/bin/bash -l

python search_keywords.py
modules/last_year/pagenum.py ADDED
@@ -0,0 +1,22 @@
from PIL import Image

def crop_image(input_path, output_path):
    image = Image.open(input_path)

    width, height = image.size
    left_width = width * 0.1575
    right_start = width * 0.88
    top_height = height * 0.07

    left_crop = image.crop((0, 0, left_width, top_height))

    right_crop = image.crop((right_start, 0, width, top_height))

    result_width = int(left_width + (width - right_start))
    result_image = Image.new('RGB', (result_width, int(top_height)))

    result_image.paste(left_crop, (0, 0))
    result_image.paste(right_crop, (int(left_width), 0))

    result_image.save(output_path, format='TIFF')
    return result_image
modules/last_year/racism_checker_old_pipeline.py ADDED
@@ -0,0 +1,65 @@
from modules.racist_text_query import racist_text_query
from modules.bigotry_dict import bigotry_dict
from modules.OCR import tiff_to_ocr
from modules.racist_chatgpt_analysis import racist_chatgpt_analysis
from modules.locate import locate
from modules.pagenum import crop_image
import os
import pandas as pd

def racism_threshold(file_dir):
    # Create the new folder for cropped images
    cropped_images_dir = os.path.join(file_dir, 'deed page number')
    if not os.path.exists(cropped_images_dir):
        os.makedirs(cropped_images_dir)

    data = []
    for images in os.listdir(file_dir):
        if images.endswith(".tif") or images.endswith(".tiff"):
            image_path = os.path.join(file_dir, images)

            # run ocr on images
            text = tiff_to_ocr(image_path)

            result1 = racist_chatgpt_analysis(text)
            result2 = racist_text_query(text, bigotry_dict)

            a, b, c = locate(text)

            # Define the output path for the cropped image in the new folder
            cropped_image_name = "cropped_" + images
            cropped_image_path = os.path.join(cropped_images_dir, cropped_image_name)

            # Crop the image and save it to the new folder
            crop_image(image_path, cropped_image_path)

            image_path_formatted = cropped_image_path
            # .replace(' ', '%20')
            hyperlink_formula = f'file://{image_path_formatted}'

            # fail safe page number detection
            page = tiff_to_ocr(cropped_image_path)
            fail_safe_page = []
            result = page.split("\n")
            for word in result:
                # checks for possible page numbers
                if word.isdigit():
                    fail_safe_page.append(word)

            if result1 or result2:
                print(images, a, b, c)
                if len(fail_safe_page) != 0:
                    a.append(fail_safe_page)
                data.append([images, a, b[0], c[0], hyperlink_formula])
            else:
                print(images + " : Not Racist")
                # data.append([images, a, b[0], c[0], hyperlink_formula])

    # Include the hyperlink in the DataFrame columns
    df = pd.DataFrame(data, columns=['File Name', 'Probable Page Number', 'Date', 'Book Number', 'Page Link'])
    df.index += 1
    df.to_csv(os.path.join(file_dir, 'Racist Deeds.csv'), index=True)
    df.to_excel(os.path.join(file_dir, 'Racist Deeds.xlsx'), index=True)

racism_threshold('folderpath')
modules/last_year/racist_chatgpt_analysis.py ADDED
@@ -0,0 +1,24 @@
from openai import OpenAI
from dotenv import load_dotenv
import os

load_dotenv()
client = OpenAI(
    organization=os.getenv('OPENAI_ORG_ID'),
    api_key=os.getenv('OPENAI_API_KEY')
)

def racist_chatgpt_analysis(text):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[  # prompt engineering
            {"role": "system", "content": "You are a helpful assistant designed to check if there's any racial content. \
                Please review this document for any racial or discriminatory expressions. \
                If yes, return 'Yes'; if there's none, please return 'No racial content found'."},
            {"role": "user", "content": text}
        ]
    )
    if response.choices[0].message.content == "Yes":
        return True
    else:
        return False
modules/last_year/racist_text_query.py ADDED
@@ -0,0 +1,10 @@
def racist_text_query(text, bigotry_dict):
    words = text.split()
    for word in words:
        if word in bigotry_dict:
            return True
    return False

def read_text(file_path):
    with open(file_path, 'r') as file:
        return file.read()
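A tiny usage sketch of the keyword check, assuming the modules/last_year package is importable from the repository root (the sample strings are illustrative):

```python
from modules.last_year.bigotry_dict import bigotry_dict
from modules.last_year.racist_text_query import racist_text_query

print(racist_text_query("no restrictive language here", bigotry_dict))               # False
print(racist_text_query("may be sold to Caucasians only", bigotry_dict))             # True ("Caucasians" is a dict key)
```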
modules/last_year/search_keywords.py ADDED
@@ -0,0 +1,51 @@
import os
import re
from difflib import SequenceMatcher
from bigotry_dict import bigotry_dict
# can also submit this with "qsub manual_keyword_check.sh" because it takes a long time

# Path to save the output
output_file_path = "output.txt"

# Open the output file in write mode
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    # Walk through the directory
    for root, dirs, files in os.walk(r'../deed_preprocessing/racist'):
        for file in files:
            if file.endswith('.txt'):
                txt_file_path = os.path.join(root, file)

                with open(txt_file_path, 'rb') as txt_file:
                    try:
                        # Read and decode the text file
                        text = txt_file.read()
                        decoded_text = text.decode('utf-8')
                        words = re.split(r'[\n ]+', decoded_text)

                        # Look for matches in the text
                        found = False
                        for i in range(len(words)):
                            if not found:
                                for identifier in bigotry_dict.keys():
                                    if not found:
                                        similarity_ratio = SequenceMatcher(None, words[i], identifier).ratio()
                                        if similarity_ratio >= 0.9:
                                            # Collect the surrounding words
                                            context = words[max(0, i-10):min(len(words), i+10)]
                                            context_str = ' '.join(context)

                                            # Write to the output file
                                            output_file.write(f"Context: {context_str}\n")
                                            output_file.write(f"File: {txt_file_path}\n\n")
                                            print(txt_file_path)
                                            found = True
                                    else:
                                        break
                            else:
                                break

                    except Exception as e:
                        print(f"Error processing {file}: {str(e)}")

print(f"Results saved to {output_file_path}")
modules/model_experimentation/bag_of_words_logistic_regression.py ADDED
@@ -0,0 +1,89 @@
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import pickle

def preprocess_bag_of_words(preprocessed_text_list):
    texts = [preprocessed["original_text"] for preprocessed in preprocessed_text_list]

    vectorizer = CountVectorizer()
    bag_of_words = vectorizer.fit_transform(texts)

    bow_df = pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out())

    return bow_df, vectorizer

if __name__ == "__main__":
    preprocessed_data = pd.read_pickle('preprocessed_deeds.pkl')

    texts = preprocessed_data['original_text']
    preprocessed_text_list = texts.apply(lambda x: {"original_text": x}).tolist()

    bow_df, vectorizer = preprocess_bag_of_words(preprocessed_text_list)

    X = bow_df
    y = preprocessed_data['is_racist']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    logistic_model = LogisticRegression(max_iter=1000)
    logistic_model.fit(X_train, y_train)

    # Save the model and vectorizer
    with open('vectorizer.pkl', 'wb') as vec_file:
        pickle.dump(vectorizer, vec_file)
    with open('logistic_model.pkl', 'wb') as model_file:
        pickle.dump(logistic_model, model_file)

    y_pred = logistic_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-racist', 'Racist'], yticklabels=['Non-racist', 'Racist'])
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    y_prob = logistic_model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    feature_importance = pd.Series(logistic_model.coef_[0], index=vectorizer.get_feature_names_out())
    top_features = feature_importance.nlargest(10)

    plt.figure(figsize=(8, 6))
    top_features.plot(kind='barh', color='skyblue')
    plt.title('Top 10 Most Influential Words for Racist Classification')
    plt.xlabel('Coefficient Value')
    plt.ylabel('Word')
    plt.show()

# Function to make predictions based on the trained model
def predict(processed_text, vectorizer, logistic_model):
    bow_text = vectorizer.transform([processed_text["original_text"]])
    prediction = logistic_model.predict(bow_text)
    return {
        'is_racist': bool(prediction[0]),
    }
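For reference, a minimal sketch of reusing the saved artifacts the way app.py does. The pickle paths match the files added in this commit; the sample text is illustrative, and importing the module assumes its plotting dependencies (matplotlib, seaborn, scikit-learn) are installed:

```python
import pickle
from modules.model_experimentation.bag_of_words_logistic_regression import predict

with open('modules/model_experimentation/vectorizer.pkl', 'rb') as vec_file:
    vectorizer = pickle.load(vec_file)
with open('modules/model_experimentation/logistic_model.pkl', 'rb') as model_file:
    logistic_model = pickle.load(model_file)

# predict() only reads the "original_text" field of the preprocessor output
sample = {"original_text": "illustrative deed text"}
print(predict(sample, vectorizer, logistic_model))  # {'is_racist': True} or {'is_racist': False}
```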
modules/model_experimentation/logistic_model.pkl ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bedad659905586b85bd65fd9473b9859d65871a4819df69ccf453a5cfde2229f
size 316116
modules/model_experimentation/preprocessed_deeds.pkl ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8d2d25baf91285511e68a177d9410bfc6ec6b12f54ac9b2d0a49dd4dba7282bf
size 80115994
modules/model_experimentation/vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:11d1802aa1d14f29acd8f3914cb6ad066958832b34a532c3fab0bc5f28878c9d
size 223071
modules/modules.md ADDED
@@ -0,0 +1,19 @@
### azure_cloud_ocr

Contains modules related to Microsoft Azure OCR

### data_retrieval

Contains a Google Colab script to fetch TIFFs from Drive

### deed_preprocessing

Includes code to run all TIFFs through OCR and preprocessing. Also contains EDA.

### google_cloud_ocr

Contains modules related to Google Cloud OCR

### last_year

Contains last year's modules, which are not being used.
modules/openai/__pycache__/racist_chatgpt_analysis.cpython-310.pyc ADDED
Binary file (1.23 kB)
 
modules/openai/batch/batch_instruc.md ADDED
@@ -0,0 +1,57 @@
# Calling OpenAI Batch steps

Guide to the OpenAI Batch API --> https://platform.openai.com/docs/guides/batch

---

## Step 1: Prepare Batch File (prepare_batch.py)

1. Prepare a .jsonl file containing your batch requests. Each line represents a single API request.
2. Usage: Run the script to process all files in the specified directory and generate the batch file.

---

## Step 2: Upload Batch File (upload_batch_file.py)

1. Upload the prepared batch file to OpenAI using the Files API.
2. The script uploads batch_input.jsonl and returns the file ID.

---

## Step 3: Create Batch (create_batch.py)

1. Create a batch job using the uploaded file's ID. Add the file ID obtained from running upload_batch_file.py to create_batch.py.
2. Returns a batch ID.

---

## Step 4: Check Batch Status (check_batch_status.py)

1. Use the batch ID from create_batch.py to check the status of the batch job and monitor progress.
   Possible statuses:
   1. validating: Validating the input file.
   2. in_progress: Batch is running.
   3. completed: Batch is finished and results are ready.
   4. failed: Validation failed.
   5. expired: Batch did not complete within the window.
2. Retrieves the output_file_id if completed.

---

## Step 5: Retrieve Results (retrieve_results.py)

1. Download the results using the output_file_id retrieved from the batch status.

---

## Helper scripts

1. Cancel Batch (cancel_batch.py)
   1. Cancel an ongoing batch if required. Changes batch status to cancelling and eventually cancelled.

2. List Batches (list_batches.py)
   1. View all batches created, including their status and metadata.

## Consolidated script that combines all the functionalities (except helpers)

1. batch_processing.py
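For orientation, each line of the .jsonl file that Step 1 produces is a single request object. A sketch of its shape, mirroring what prepare_batch.py writes (the custom_id and prompt text here are illustrative and truncated):

```python
import json

# One request per line in batch_input.jsonl (shape mirrors prepare_batch.py)
entry = {
    "custom_id": "deed_0001.txt",  # illustrative file name used to match results back to inputs
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "gpt-4o-mini",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant designed to check if there's any racial content. ..."},
            {"role": "user", "content": "OCR'd deed text goes here"},
        ],
        "max_tokens": 1000,
    },
}
print(json.dumps(entry))  # written as one line of the .jsonl batch file
```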
modules/openai/batch/batch_processing.py ADDED
@@ -0,0 +1,115 @@
import os
import json
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

class BatchProcessor:
    def __init__(self):
        self.client = OpenAI(
            api_key=os.getenv("OPENAI_API_KEY"),
            organization=os.getenv("OPENAI_ORG_ID")
        )

    def prepare_batch(self, folder_path, output_file):
        """Prepare a batch input file from a folder of text files."""
        with open(output_file, "w") as out_file:
            for filename in os.listdir(folder_path):
                if filename.endswith(".txt"):
                    file_path = os.path.join(folder_path, filename)
                    with open(file_path, "r") as file:
                        text = file.read()

                    batch_entry = {
                        "custom_id": filename,
                        "method": "POST",
                        "url": "/v1/chat/completions",
                        "body": {
                            "model": "gpt-4o-mini",
                            "messages": [
                                {
                                    "role": "system",
                                    "content": (
                                        "You are a helpful assistant designed to check if there's any racial content. "
                                        "Please review this document for any racial or discriminatory expressions. "
                                        "If yes, return 'Yes'; if there's none, please return 'No racial content found'. "
                                        "If there is any doubt or ambiguity, assume the text contains racial content and respond 'Yes'."
                                    )
                                },
                                {"role": "user", "content": text}
                            ],
                            "max_tokens": 1000
                        }
                    }
                    out_file.write(json.dumps(batch_entry) + "\n")
        print(f"Batch file created: {output_file}")

    def upload_batch_file(self, batch_file_path):
        """Upload the prepared batch input file."""
        with open(batch_file_path, "rb") as f:
            batch_input_file = self.client.files.create(
                file=f,
                purpose="batch"
            )
        print(f"Batch input file uploaded. File ID: {batch_input_file.id}")
        return batch_input_file.id

    def create_batch(self, file_id):
        """Create a batch job with the uploaded input file."""
        batch = self.client.batches.create(
            input_file_id=file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={
                "description": "Deed analysis batch"
            }
        )
        print(f"Batch created. Batch ID: {batch.id}")
        return batch.id

    def check_batch_status(self, batch_id):
        """Check the status of a batch job."""
        batch_status = self.client.batches.retrieve(batch_id)
        print(f"Batch Status: {batch_status.status}")
        if batch_status.status == "completed":
            output_file_id = batch_status.output_file_id
            print(f"Output File ID: {output_file_id}")
            return output_file_id
        else:
            return None

    def retrieve_results(self, output_file_id, output_path):
        """Retrieve the results of a completed batch job."""
        file_response = self.client.files.content(output_file_id)
        with open(output_path, "wb") as out_file:
            out_file.write(file_response.read())
        print(f"Batch results downloaded to {output_path}")

if __name__ == "__main__":
    processor = BatchProcessor()

    folder_path = ""
    batch_input_file = "batch_input.jsonl"
    batch_output_file = "batch_output.jsonl"

    # Step 1: Prepare the batch input file
    processor.prepare_batch(folder_path, batch_input_file)

    # Step 2: Upload the batch input file
    file_id = processor.upload_batch_file(batch_input_file)

    # Step 3: Create a batch job
    batch_id = processor.create_batch(file_id)

    # Step 4: Poll for batch status
    import time
    while True:
        output_file_id = processor.check_batch_status(batch_id)
        if output_file_id:
            break
        print("Batch not complete. Retrying in 30 minutes...")
        time.sleep(1800)

    # Step 5: Retrieve the results
    processor.retrieve_results(output_file_id, batch_output_file)
modules/openai/batch/cancel_batch.py ADDED
@@ -0,0 +1,18 @@
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

def cancel_batch(batch_id):
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        organization=os.getenv("OPENAI_ORG_ID")
    )

    client.batches.cancel(batch_id)
    print(f"Batch {batch_id} cancelled.")

if __name__ == "__main__":
    batch_id = ""  # batch id here, obtained from create_batch.py
    cancel_batch(batch_id)
modules/openai/batch/check_batch_status.py ADDED
@@ -0,0 +1,26 @@
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

def check_batch_status(batch_id):
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        organization=os.getenv("OPENAI_ORG_ID")
    )

    batch_status = client.batches.retrieve(batch_id)
    print(f"Batch Status: {batch_status.status}")

    if batch_status.status == "completed":
        output_file_id = batch_status.output_file_id
        print(f"Output File ID: {output_file_id}")
        return output_file_id
    else:
        print(f"Batch Status: {batch_status.status}")
        return None

if __name__ == "__main__":
    batch_id = ""  # batch id here
    check_batch_status(batch_id)
modules/openai/batch/create_batch.py ADDED
@@ -0,0 +1,27 @@
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

def create_batch(file_id):
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        organization=os.getenv("OPENAI_ORG_ID")
    )

    batch = client.batches.create(
        input_file_id=file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
            "description": "Deed analysis batch"
        }
    )

    print(f"Batch created. Batch ID: {batch.id}")
    return batch.id

if __name__ == "__main__":
    file_id = ""  # file id here, obtained from running upload_batch_file.py
    create_batch(file_id)
modules/openai/batch/list_batches.py ADDED
@@ -0,0 +1,18 @@
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

def list_batches(limit=10):
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        organization=os.getenv("OPENAI_ORG_ID")
    )

    batches = client.batches.list(limit=limit)
    for batch in batches.data:
        print(f"Batch ID: {batch.id}, Status: {batch.status}")

if __name__ == "__main__":
    list_batches()
modules/openai/batch/prepare_batch.py ADDED
@@ -0,0 +1,40 @@
import os
import json

def prepare_batch(folder_path, output_file):
    with open(output_file, "w") as out_file:
        for filename in os.listdir(folder_path):
            if filename.endswith(".txt"):
                file_path = os.path.join(folder_path, filename)

                # Read the content of the file
                with open(file_path, "r") as file:
                    text = file.read()

                # Create a batch entry
                batch_entry = {
                    "custom_id": filename,
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": "gpt-4o-mini",
                        "messages": [
                            {
                                "role": "system",
                                "content": (
                                    "You are a helpful assistant designed to check if there's any racial content. "
                                    "Please review this document for any racial or discriminatory expressions. "
                                    "If yes, return 'Yes'; if there's none, please return 'No racial content found'. "
                                    "If there is any doubt or ambiguity, assume the text contains racial content and respond 'Yes'."
                                )
                            },
                            {"role": "user", "content": text}
                        ],
                        "max_tokens": 1000
                    }
                }
                out_file.write(json.dumps(batch_entry) + "\n")
    print(f"Batch file created: {output_file}")

if __name__ == "__main__":
    prepare_batch("folder_of_deeds", "batch_input.jsonl")  # add folder of deeds to pass into openai
modules/openai/batch/retrieve_results.py ADDED
@@ -0,0 +1,23 @@
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

def retrieve_results(output_file_id, output_path):
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        organization=os.getenv("OPENAI_ORG_ID")
    )

    file_response = client.files.content(output_file_id)

    # Write the binary content to the output file
    with open(output_path, "wb") as out_file:
        out_file.write(file_response.read())

    print(f"Batch results downloaded to {output_path}")

if __name__ == "__main__":
    output_file_id = ""  # Replace with your actual output file ID
    retrieve_results(output_file_id, "batch_output.jsonl")
modules/openai/batch/upload_batch_file.py ADDED
@@ -0,0 +1,23 @@
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

def upload_batch_file(batch_file_path):
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        organization=os.getenv("OPENAI_ORG_ID")
    )

    with open(batch_file_path, "rb") as f:
        batch_input_file = client.files.create(
            file=f,
            purpose="batch"
        )

    print(f"Batch input file uploaded. File ID: {batch_input_file.id}")
    return batch_input_file.id

if __name__ == "__main__":
    upload_batch_file("batch_input.jsonl")
modules/openai/gpt_extract.py ADDED
@@ -0,0 +1,66 @@
import os
import openai
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

openai.api_key = os.getenv('OPENAI_API_KEY')

folder_path = "racist_deeds_text"

output_csv = "deed_names_locations.csv"

data = []

def extract_names_and_locations(text):
    """
    Extract names and locations from text using OpenAI.
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are an assistant that extracts names and locations from legal text. "
                        "For the given input, identify all names of people (grantors, grantees) and "
                        "locations (addresses, city, county, state). "
                        "Return the names as a comma-separated list and locations as a separate comma-separated list "
                        "strictly in the format:\nNames: [comma-separated names]\nLocations: [comma-separated locations]."
                    )
                },
                {"role": "user", "content": text}
            ]
        )
        output = response.choices[0].message.content.strip()

        names, locations = "", ""

        for line in output.split("\n"):
            if line.startswith("Names:"):
                names = line.replace("Names:", "").strip()
            elif line.startswith("Locations:"):
                locations = line.replace("Locations:", "").strip()

        return names, locations
    except Exception as e:
        print(f"Error extracting names and locations: {e}")
        return "", ""

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)

        with open(file_path, "r") as file:
            text = file.read()

        names, locations = extract_names_and_locations(text)

        data.append({"Filename": filename, "Names": names, "Locations": locations})
        print(f"Processed {filename}")

df = pd.DataFrame(data)
df.to_csv(output_csv, index=False)
print(f"Results saved to {output_csv}")
modules/openai/racist_chatgpt_analysis.py ADDED
@@ -0,0 +1,30 @@
import openai
from dotenv import load_dotenv
import os

load_dotenv()
openai.organization = os.getenv('OPENAI_ORG_ID')
openai.api_key = os.getenv('OPENAI_API_KEY')

def racist_chatgpt_analysis(text):
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant designed to check if there's any racial content. \
                        Please review this document for any racial or discriminatory expressions. \
                        If yes, return 'Yes'; if there's none, please return 'No racial content found'. \
                        If there is any doubt or ambiguity, assume the text contains racial content and respond 'Yes'."
                },
                {"role": "user", "content": text}
            ]
        )
        if response.choices[0].message.content.strip() == "Yes":
            return True
        else:
            return False
    except Exception as e:
        print(f"Error: {e}")
        return False
requirements.txt CHANGED
@@ -1,2 +1,12 @@
- fastapi
- uvicorn[standard]
+ pytesseract
+ tesseract
+ Pillow
+ python-dotenv
+ google-cloud-vision
+ spacy
+ openai==0.28
+ Flask>=2,<3
+ Flask-Cors>=3,<4
+ gunicorn>=20,<21
+ pandas>=1,<2
+ xlsxwriter>=3,<4