jacob-stein committed
Commit 97208ad · 1 Parent(s): 8e73170
Migrate Flask backend

Files changed:
- Dockerfile +11 -5
- app.py +114 -5
- modules/.DS_Store +0 -0
- modules/deed_preprocessing/.DS_Store +0 -0
- modules/deed_preprocessing/preprocessor.py +62 -0
- modules/deed_preprocessing/spellcheck.py +12 -0
- modules/google_cloud_ocr/__init__.py +0 -0
- modules/google_cloud_ocr/cloud-creds-template.json +13 -0
- modules/google_cloud_ocr/google_cloud_ocr.py +23 -0
- modules/google_cloud_ocr/setup.md +59 -0
- modules/last_year/OCR.py +11 -0
- modules/last_year/__pycache__/racist_chatgpt_analysis.cpython-310.pyc +0 -0
- modules/last_year/bigotry_dict.py +102 -0
- modules/last_year/env.template +2 -0
- modules/last_year/locate.py +30 -0
- modules/last_year/manual_keyword_check.sh +3 -0
- modules/last_year/pagenum.py +22 -0
- modules/last_year/racism_checker_old_pipeline.py +65 -0
- modules/last_year/racist_chatgpt_analysis.py +24 -0
- modules/last_year/racist_text_query.py +10 -0
- modules/last_year/search_keywords.py +51 -0
- modules/model_experimentation/bag_of_words_logistic_regression.py +89 -0
- modules/model_experimentation/logistic_model.pkl +3 -0
- modules/model_experimentation/preprocessed_deeds.pkl +3 -0
- modules/model_experimentation/vectorizer.pkl +3 -0
- modules/modules.md +19 -0
- modules/openai/__pycache__/racist_chatgpt_analysis.cpython-310.pyc +0 -0
- modules/openai/batch/batch_instruc.md +57 -0
- modules/openai/batch/batch_processing.py +115 -0
- modules/openai/batch/cancel_batch.py +18 -0
- modules/openai/batch/check_batch_status.py +26 -0
- modules/openai/batch/create_batch.py +27 -0
- modules/openai/batch/list_batches.py +18 -0
- modules/openai/batch/prepare_batch.py +40 -0
- modules/openai/batch/retrieve_results.py +23 -0
- modules/openai/batch/upload_batch_file.py +23 -0
- modules/openai/gpt_extract.py +66 -0
- modules/openai/racist_chatgpt_analysis.py +30 -0
- requirements.txt +14 -2
Dockerfile
CHANGED
@@ -1,16 +1,22 @@
-
-# you will also find guides on how best to write your Dockerfile
-
-FROM python:3.9
+FROM python:3.9-slim

+# Set up non-root user
 RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"

+# Set working directory
 WORKDIR /app

+# Copy and install dependencies
 COPY --chown=user ./requirements.txt requirements.txt
 RUN pip install --no-cache-dir --upgrade -r requirements.txt

+# Copy all application code
 COPY --chown=user . /app
-
+
+# Expose required port
+EXPOSE 7860
+
+# Run the Flask app using Gunicorn
+CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
app.py
CHANGED
@@ -1,7 +1,116 @@
-from fastapi import FastAPI
-
-app = FastAPI()
-
-@app.get("/")
-def greet_json():
-    return {"Hello": "World!"}
+from flask import Flask, request, jsonify, send_file
+from flask_cors import CORS
+import pickle
+from modules.google_cloud_ocr.google_cloud_ocr import google_cloud_ocr
+from modules.deed_preprocessing.spellcheck import correct_spelling
+from modules.deed_preprocessing.preprocessor import preprocess_text
+from modules.openai.racist_chatgpt_analysis import racist_chatgpt_analysis
+from modules.model_experimentation.bag_of_words_logistic_regression import predict
+import pandas as pd
+import xlsxwriter
+import re
+
+app = Flask(__name__)
+# CORS(app, resources={r"/*": {"origins": "*"}})
+CORS(app, supports_credentials=True, origins="*")
+
+with open('modules/model_experimentation/vectorizer.pkl', 'rb') as vec_file:
+    vectorizer = pickle.load(vec_file)
+
+with open('modules/model_experimentation/logistic_model.pkl', 'rb') as model_file:
+    logistic_model = pickle.load(model_file)
+
+# Helper to look for the book and page numbers
+def extract_book_and_page(text):
+    book_numbers = re.findall(r"book\s+(\d+)", text, re.IGNORECASE)
+    page_numbers = re.findall(r"page\s+(\d+)", text, re.IGNORECASE)
+    return book_numbers, page_numbers
+
+@app.route('/api/upload', methods=['POST'])
+def upload_file():
+    if 'file' not in request.files:
+        return jsonify({'error': 'No file part in the request'}), 400
+
+    file = request.files['file']
+
+    if file.filename == '':
+        return jsonify({'error': 'No selected file'}), 400
+
+    ocr_engine = request.form.get('ocr_engine', 'google')
+    analysis_method = request.form.get('analysis_method', 'chatgpt')
+
+    try:
+        if ocr_engine == 'google':
+            # Step 1: Get text using Google OCR
+            google_text = google_cloud_ocr(file)
+
+            # Step 2: Pass text through the spell checker
+            spellchecked_text = correct_spelling(google_text)
+
+            # Step 3: Pass text through the preprocessor
+            processed_text = preprocess_text(spellchecked_text)
+
+            # Extract book and page numbers right after spellchecking
+            book_numbers, page_numbers = extract_book_and_page(spellchecked_text)
+
+            # Step 4: Get the names and locations
+            extracted_info = {
+                "names": processed_text.get("names", []),
+                "locations": processed_text.get("locations", []),
+                "book_numbers": book_numbers,
+                "page_numbers": page_numbers
+            }
+
+            # Step 5: Choose analysis method
+            if analysis_method == 'chatgpt':
+                analysis_result = racist_chatgpt_analysis(processed_text['original_text'])
+                return jsonify({
+                    'status': 'success',
+                    'ocr_engine': 'google',
+                    'analysis_method': 'chatgpt',
+                    'original_text': google_text,
+                    'spellchecked_text': spellchecked_text,
+                    'processed_text': processed_text,
+                    'extracted_info': extracted_info,
+                    'result': analysis_result
+                }), 200
+            elif analysis_method == 'logistic_regression':
+                lr_result = predict(processed_text, vectorizer, logistic_model)['is_racist']
+                return jsonify({
+                    'status': 'success',
+                    'ocr_engine': 'google',
+                    'analysis_method': 'logistic_regression',
+                    'original_text': google_text,
+                    'spellchecked_text': spellchecked_text,
+                    'processed_text': processed_text,
+                    'extracted_info': extracted_info,
+                    'result': lr_result
+                }), 200
+            else:
+                return jsonify({'error': 'Unsupported analysis method selected'}), 400
+        elif ocr_engine == 'azure':
+            return jsonify({'status': 'success', 'ocr_engine': 'azure', 'text': "fill"}), 200
+        else:
+            return jsonify({'error': 'Unsupported OCR engine selected'}), 400
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+
+@app.route('/api/download_excel', methods=['POST'])
+def download_excel():
+    try:
+        data = request.get_json()
+        if not data:
+            return jsonify({'error': 'No data provided'}), 400
+
+        df = pd.DataFrame(data)
+        excel_path = 'output.xlsx'
+        with pd.ExcelWriter(excel_path, engine='xlsxwriter') as writer:
+            df.to_excel(writer, index=False, sheet_name='Sheet1')
+
+        return send_file(excel_path, as_attachment=True, download_name='analysis_results.xlsx')
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+
+if __name__ == '__main__':
+    app.run(debug=True, host="0.0.0.0", port=7860)
+
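Note: a minimal client-side sketch of the new /api/upload endpoint, assuming the Space is running locally on port 7860; the TIFF path and the `requests` dependency are placeholders for illustration and are not part of this commit.

```python
# Post a scanned deed to the upload endpoint and print the analysis result.
import requests

with open("deed.tiff", "rb") as f:  # placeholder path to a scanned deed
    resp = requests.post(
        "http://localhost:7860/api/upload",
        files={"file": f},
        data={"ocr_engine": "google", "analysis_method": "logistic_regression"},
    )

payload = resp.json()
print(resp.status_code)
print(payload.get("result"))          # True/False from the logistic model branch
print(payload.get("extracted_info"))  # names, locations, book/page numbers
```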
modules/.DS_Store
ADDED
Binary file (6.15 kB)
modules/deed_preprocessing/.DS_Store
ADDED
Binary file (6.15 kB)
modules/deed_preprocessing/preprocessor.py
ADDED
@@ -0,0 +1,62 @@
+import re
+import spacy
+from collections import Counter
+
+nlp = spacy.load('en_core_web_sm')
+
+def preprocess_text(text):
+    text = re.sub(r'[\n\r\t]', ' ', text)
+    text = re.sub(r'[^\x00-\x7F]+', '', text)
+    doc = nlp(text)
+
+    result = {
+        "original_text": text,
+        "sentences": [],
+        "pos_groups": {},
+        "named_entities": [],
+        "dependencies": [],
+        "token_offsets": [],
+        "word_frequency": {},
+        "sentence_lengths": [],
+        "pos_counts": {}
+    }
+
+    pos_groups = {
+        "NOUN": [], "VERB": [], "ADJ": [], "ADV": [], "PROPN": [],
+        "DET": [], "AUX": [], "PRON": [], "ADP": [], "NUM": [],
+        "PART": [], "PUNCT": [], "INTJ": [], "X": []
+    }
+
+    all_tokens = []
+
+    for sent in doc.sents:
+        result["sentences"].append(sent.text)
+        result["sentence_lengths"].append(len(sent))
+
+        for token in sent:
+            pos = token.pos_
+            all_tokens.append(token.text)
+
+            if pos in pos_groups:
+                pos_groups[pos].append(token.text)
+
+            result["dependencies"].append({
+                "token": token.text,
+                "dep": token.dep_,
+                "head": token.head.text
+            })
+            result["token_offsets"].append({
+                "token": token.text,
+                "start": token.idx,
+                "end": token.idx + len(token.text)
+            })
+
+    result["pos_groups"] = pos_groups
+    result["named_entities"] = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
+    result["word_frequency"] = dict(Counter(all_tokens))
+    result["pos_counts"] = dict(Counter([token.pos_ for token in doc]))
+
+    result["names"] = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
+    result["locations"] = [ent.text for ent in doc.ents if ent.label_ in {"GPE", "LOC"}]
+
+    return result
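Note: a small sketch of calling preprocess_text directly, assuming the `en_core_web_sm` model is installed; the sample sentence is invented and the entity output depends on the spaCy model.

```python
# Run the preprocessor on a made-up deed sentence and inspect a few fields.
from modules.deed_preprocessing.preprocessor import preprocess_text

sample = "John Smith conveyed the parcel in Boston to Mary Jones. See Book 123, Page 45."
result = preprocess_text(sample)

print(result["names"])      # e.g. ['John Smith', 'Mary Jones'] (model dependent)
print(result["locations"])  # e.g. ['Boston']
print(result["sentences"])  # spaCy sentence segmentation
```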
modules/deed_preprocessing/spellcheck.py
ADDED
@@ -0,0 +1,12 @@
+from autocorrect import Speller
+
+# Initialize the Speller instance from autocorrect
+spell = Speller(lang='en')
+
+def correct_spelling(text):
+    """Correct spelling using Autocorrect."""
+
+    # Correct basic spelling errors using Autocorrect
+    corrected_text = spell(text)
+
+    return corrected_text
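Note: correct_spelling is a thin wrapper around autocorrect's Speller; a one-line sketch, with the understanding that the exact corrections depend on autocorrect's dictionary.

```python
# Pass OCR output through the spell checker; returns the corrected string.
from modules.deed_preprocessing.spellcheck import correct_spelling

print(correct_spelling("the partiees agre that the premisses"))
```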
modules/google_cloud_ocr/__init__.py
ADDED
File without changes
modules/google_cloud_ocr/cloud-creds-template.json
ADDED
@@ -0,0 +1,13 @@
+{
+    "type": "",
+    "project_id": "",
+    "private_key_id": "",
+    "private_key": "",
+    "client_email": "",
+    "client_id": "",
+    "auth_uri": "",
+    "token_uri": "",
+    "auth_provider_x509_cert_url": "",
+    "client_x509_cert_url": "",
+    "universe_domain": ""
+}
modules/google_cloud_ocr/google_cloud_ocr.py
ADDED
@@ -0,0 +1,23 @@
+import os
+import io
+from dotenv import load_dotenv
+from google.cloud import vision
+
+load_dotenv()
+
+google_creds = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
+
+if google_creds is None:
+    raise EnvironmentError("GOOGLE_APPLICATION_CREDENTIALS not set in .env file")
+
+client = vision.ImageAnnotatorClient()
+
+def google_cloud_ocr(tiff_file):
+    content = tiff_file.read()
+    image = vision.Image(content=content)
+    response = client.document_text_detection(image=image)
+    text = response.full_text_annotation.text
+    if response.error.message:
+        raise Exception(f'API Error: {response.error.message}')
+
+    return text
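Note: a sketch of calling google_cloud_ocr outside of Flask, assuming the credentials described in setup.md are configured; the TIFF path is a placeholder. The function expects any file-like object with .read(), which is also what Flask's request.files provides.

```python
# Run Google Cloud OCR on a local TIFF and print the start of the recognized text.
from modules.google_cloud_ocr.google_cloud_ocr import google_cloud_ocr

with open("deed.tiff", "rb") as f:  # placeholder path
    text = google_cloud_ocr(f)

print(text[:500])
```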
modules/google_cloud_ocr/setup.md
ADDED
@@ -0,0 +1,59 @@
+# Google Cloud OCR Setup with Python
+
+---
+
+## Step 1: Create a Google Cloud Project
+
+1. Go to the [Google Cloud Console](https://console.cloud.google.com/).
+2. **Create a New Project**:
+   - In the top-left corner, click the project dropdown menu and then "New Project."
+   - Enter a project name (e.g., "OCR Project") and click "Create."
+
+---
+
+## Step 2: Enable the Cloud Vision API
+
+1. In the **Google Cloud Console**, go to the **Navigation Menu** (three horizontal lines at the top left).
+2. Click on **APIs & Services** > **Library**.
+3. In the search bar, type **Vision API**.
+4. Select **Cloud Vision API** and click **Enable**.
+
+---
+
+## Step 3: Create Service Account Credentials
+
+1. Navigate to **APIs & Services** > **Credentials**.
+2. Click on **Create Credentials** > **Service Account**.
+3. **Service Account Details**:
+   - Give the service account a name (e.g., "vision-api-service-account").
+   - Click "Create and Continue."
+4. **Grant Permissions**:
+   - Choose **Role**: Select "Project" > "Editor" to give your service account sufficient permissions.
+   - Click "Continue."
+5. **Create JSON Key**:
+   - After creating the service account, click on the three dots next to the account.
+   - Select "Manage Keys" > "Add Key" > "Create New Key."
+   - Choose **JSON** format and download the JSON file. This file contains your credentials.
+
+---
+
+## Step 4: Set Up the `.env` File
+
+1. Create a new file named `.env` in the root directory of your Python project.
+2. Add the following line to the `.env` file, replacing the path with the actual path to your downloaded JSON credentials file:
+
+```bash
+GOOGLE_APPLICATION_CREDENTIALS=/path-to-your-credentials.json
+```
+
+---
+
+## Step 5: Running the Script
+
+1. Ensure you have the needed packages:
+
+```bash
+pip install google-cloud-vision python-dotenv
+```
+
+2. Run the script and check the text files for outputs.
modules/last_year/OCR.py
ADDED
@@ -0,0 +1,11 @@
+from PIL import Image
+from io import BytesIO
+import pytesseract
+
+def tiff_to_ocr(path):
+    img = Image.open(path)
+    TempIO = BytesIO()
+    img.save(TempIO, format="JPEG")
+    img = Image.open(BytesIO(TempIO.getvalue()))
+
+    return pytesseract.image_to_string(img)
modules/last_year/__pycache__/racist_chatgpt_analysis.cpython-310.pyc
ADDED
Binary file (1.05 kB)
modules/last_year/bigotry_dict.py
ADDED
@@ -0,0 +1,102 @@
+bigotry_dict = {
+    "Irishman": True,
+    "irishman": True,
+    "Greek": True,
+    "greek": True,
+    "Portugese": True,
+    "portugese": True,
+    "Mulatto": True,
+    "mutatto": True,
+    "Quadroon": True,
+    "quadroon": True,
+    "Chinaman": True,
+    "chinaman": True,
+    "Jap": True,
+    "jap": True,
+    "japs": True,
+    "Japs": True,
+    "Hebrew": True,
+    "hebrew": True,
+    "Pole": True,
+    "pole": True,
+    "French Canadian": True,
+    "Canadien": True,
+    "Quebecois": True,
+    "Quebecker": True,
+    "Arab": True,
+    "Arabs": True,
+    "Truk": True,
+    "Turks": True,
+    "Frenchman": True,
+    "German": True,
+    "german": True,
+    "Germans": True,
+    "germans": True,
+    "Spaniard": True,
+    "spaniard": True,
+    "Spaniards": True,
+    "spaniards": True,
+    "Slav": True,
+    "slav": True,
+    "Slavs": True,
+    "slavs": True,
+    "Russian": True,
+    "russian": True,
+    "Russians": True,
+    "russians": True,
+    "Persian": True,
+    "persian": True,
+    "Persians": True,
+    "persians": True,
+    "Korean": True,
+    "korean": True,
+    "Koreans": True,
+    "koreans": True,
+    "Negro": True,
+    "negro": True,
+    "Colored": True,
+    "colored": True,
+    "Polander": True,
+    "polander": True,
+    "Polish": True,
+    "polish": True,
+    "Italian": True,
+    "italian": True,
+    "African": True,
+    "african": True,
+    "Africans": True,
+    "africans": True,
+    "Hindu": True,
+    "hindu": True,
+    "Japanese": True,
+    "japanese": True,
+    "Chinese": True,
+    "chinese": True,
+    "Catholic": True,
+    "catholic": True,
+    "Jew": True,
+    "jew": True,
+    "Jewish": True,
+    "jewish": True,
+    "shall not be resold": True,
+    "shall not be re-sold": True,
+    "shall not be sold": True,
+    "white": True,
+    "White": True,
+    "Whites": True,
+    "whites": True,
+    "Aryan": True,
+    "Aryans": True,
+    "aryan": True,
+    "aryans": True,
+    "Caucasian": True,
+    "caucasian": True,
+    "Caucasians": True,
+    "caucasians": True,
+    "race": True,
+    "Race": True,
+    "races": True,
+    "Races": True,
+    "Semetic": True,
+    "semetic": True,
+}
modules/last_year/env.template
ADDED
@@ -0,0 +1,2 @@
+OPENAI_API_KEY=
+OPENAI_ORG_ID=
modules/last_year/locate.py
ADDED
@@ -0,0 +1,30 @@
+import re
+
+def locate(ocr_text):
+    # input: string produced by the ocr
+    # output: (1) array of possible page numbers (may include false positives)
+    #         (2) array of possible dates
+    #         (3) array of possible book numbers
+    possible_pages = []
+    possible_dates = []
+    possible_book = []
+    result = ocr_text.split("\n")
+    pattern = re.compile(r'Re(?:c|ceived|e|o|a)')
+    book_pattern = re.compile(r'B(?:OOK|00K)', re.IGNORECASE)
+    for word in result:
+        # checks for possible page numbers
+        if word.isdigit() == True:
+            possible_pages.append(word)
+        # checks for rec'd dates
+        if re.match(pattern, word):
+            # appending entire string for human judgement as OCR fails to correctly translate years in few cases
+            possible_dates.append(word)
+        if re.match(book_pattern, word):
+            possible_book.append(word)
+    if not possible_pages:
+        possible_pages.append("Null")
+    if not possible_dates:
+        possible_dates.append("Null")
+    if not possible_book:
+        possible_book.append("Null")
+    return possible_pages, possible_dates, possible_book
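Note: a sketch of locate() on an invented OCR string, showing the three lists it returns; the import path is illustrative only.

```python
# locate() scans each OCR line for digit-only lines, Rec'd dates, and BOOK markers.
from modules.last_year.locate import locate

ocr_text = "BOOK 482\nRec'd Jan 3 1948\n217\nKnow all men by these presents"
pages, dates, books = locate(ocr_text)

print(pages)  # digit-only lines, e.g. ['217']
print(dates)  # lines starting with Rec..., kept whole for human review
print(books)  # lines matching BOOK/B00K
```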
modules/last_year/manual_keyword_check.sh
ADDED
@@ -0,0 +1,3 @@
+#!/bin/bash -l
+
+python search_keywords.py
modules/last_year/pagenum.py
ADDED
@@ -0,0 +1,22 @@
+from PIL import Image
+
+def crop_image(input_path, output_path):
+    image = Image.open(input_path)
+
+    width, height = image.size
+    left_width = width * 0.1575
+    right_start = width * 0.88
+    top_height = height * 0.07
+
+    left_crop = image.crop((0, 0, left_width, top_height))
+
+    right_crop = image.crop((right_start, 0, width, top_height))
+
+    result_width = int(left_width + (width - right_start))
+    result_image = Image.new('RGB', (result_width, int(top_height)))
+
+    result_image.paste(left_crop, (0, 0))
+    result_image.paste(right_crop, (int(left_width), 0))
+
+    result_image.save(output_path, format='TIFF')
+    return result_image
modules/last_year/racism_checker_old_pipeline.py
ADDED
@@ -0,0 +1,65 @@
+from modules.racist_text_query import racist_text_query
+from modules.bigotry_dict import bigotry_dict
+from modules.OCR import tiff_to_ocr
+from modules.racist_chatgpt_analysis import racist_chatgpt_analysis
+from modules.locate import locate
+from modules.pagenum import crop_image
+import os
+import pandas as pd
+
+def racism_threshold(file_dir):
+    # Create the new folder for cropped images
+    cropped_images_dir = os.path.join(file_dir, 'deed page number')
+    if not os.path.exists(cropped_images_dir):
+        os.makedirs(cropped_images_dir)
+
+    data = []
+    for images in os.listdir(file_dir):
+        if images.endswith(".tif") or images.endswith(".tiff"):
+            image_path = os.path.join(file_dir, images)
+
+            # run ocr on images
+            text = tiff_to_ocr(image_path)
+
+            result1 = racist_chatgpt_analysis(text)
+            result2 = racist_text_query(text, bigotry_dict)
+
+            a, b, c = locate(text)
+
+            # Define the output path for the cropped image in the new folder
+            cropped_image_name = "cropped_" + images
+            cropped_image_path = os.path.join(cropped_images_dir, cropped_image_name)
+
+            # Crop the image and save it to the new folder
+            crop_image(image_path, cropped_image_path)
+
+            image_path_formatted = cropped_image_path
+            # .replace(' ', '%20')
+            hyperlink_formula = f'file://{image_path_formatted}'
+
+            # fail safe page number detection
+            page = tiff_to_ocr(cropped_image_path)
+            fail_safe_page = []
+            result = page.split("\n")
+            for word in result:
+                # checks for possible page numbers
+                if word.isdigit() == True:
+                    fail_safe_page.append(word)
+
+
+            if result1 or result2:
+                print(images, a, b, c)
+                if len(fail_safe_page) != 0:
+                    a.append(fail_safe_page)
+                data.append([images, a, b[0], c[0], hyperlink_formula])
+            else:
+                print(images + " : Not Racist")
+                # data.append([images, a, b[0], c[0], hyperlink_formula])
+
+    # Include the hyperlink in the DataFrame columns
+    df = pd.DataFrame(data, columns=['File Name', 'Probable Page Number', 'Date', 'Book Number', "Page Link"])
+    df.index += 1
+    df.to_csv(os.path.join(file_dir, 'Racist Deeds.csv'), index=True)
+    df.to_excel(os.path.join(file_dir, 'Racist Deeds.xlsx'), index=True)
+
+racism_threshold('folderpath')
modules/last_year/racist_chatgpt_analysis.py
ADDED
@@ -0,0 +1,24 @@
+from openai import OpenAI
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
+client = OpenAI(
+    organization=os.getenv('OPENAI_ORG_ID'),
+    api_key=os.getenv('OPENAI_API_KEY')
+)
+
+def racist_chatgpt_analysis(text):
+    response = client.chat.completions.create(
+        model="gpt-3.5-turbo-0125",
+        messages=[  # prompt engineering
+            {"role": "system", "content": "You are a helpful assistant designed to check if there's any racial contents. \
+                Please review this document for any racial or discriminatory expressions. \
+                If yes, return 'Yes', if there's none, please return 'No racial content found'."},
+            {"role": "user", "content": text}
+        ]
+    )
+    if response.choices[0].message.content == "Yes":
+        return True
+    else:
+        return False
modules/last_year/racist_text_query.py
ADDED
@@ -0,0 +1,10 @@
+def racist_text_query(text, bigotry_dict):
+    words = text.split()
+    for word in words:
+        if word in bigotry_dict:
+            return True
+    return False
+
+def read_text(file_path):
+    with open(file_path, 'r') as file:
+        return file.read()
modules/last_year/search_keywords.py
ADDED
@@ -0,0 +1,51 @@
+import os
+import re
+from difflib import SequenceMatcher
+from bigotry_dict import bigotry_dict
+# can also submit this with "qsub manual_keyword_check.sh" because it takes a long time
+
+# Path to save the output
+output_file_path = "output.txt"
+
+# Open the output file in append mode
+with open(output_file_path, 'w', encoding='utf-8') as output_file:
+    # Walk through the directory
+    for root, dirs, files in os.walk(r'../deed_preprocessing/racist'):
+        for file in files:
+            if file.endswith('.txt'):
+                txt_file_path = os.path.join(root, file)
+
+                with open(txt_file_path, 'rb') as txt_file:
+                    try:
+                        # Read and decode the text file
+                        text = txt_file.read()
+                        decoded_text = text.decode('utf-8')
+                        words = re.split(r'[\n ]+', decoded_text)
+
+                        # Look for matches in the text
+                        found = False
+                        for i in range(len(words)):
+                            if not found:
+                                for identifier in bigotry_dict.keys():
+                                    if not found:
+                                        similarity_ratio = SequenceMatcher(None, words[i], identifier).ratio()
+                                        if similarity_ratio >= 0.9:
+                                            # Collect the surrounding words
+                                            context = words[max(0, i-10):min(len(words), i+10)]
+                                            context_str = ' '.join(context)
+
+                                            # Write to the output file
+                                            output_file.write(f"Context: {context_str}\n")
+                                            output_file.write(f"File: {txt_file_path}\n\n")
+                                            print(txt_file_path)
+                                            found = True
+                                    else:
+                                        break
+                            else:
+                                break
+
+
+                    except Exception as e:
+                        print(f"Error processing {file}: {str(e)}")
+
+print(f"Results saved to {output_file_path}")
modules/model_experimentation/bag_of_words_logistic_regression.py
ADDED
@@ -0,0 +1,89 @@
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
+import matplotlib.pyplot as plt
+import seaborn as sns
+import warnings
+warnings.filterwarnings("ignore", category=FutureWarning)
+import pickle
+
+def preprocess_bag_of_words(preprocessed_text_list):
+    texts = [preprocessed["original_text"] for preprocessed in preprocessed_text_list]
+
+    vectorizer = CountVectorizer()
+    bag_of_words = vectorizer.fit_transform(texts)
+
+    bow_df = pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out())
+
+    return bow_df, vectorizer
+
+if __name__ == "__main__":
+    preprocessed_data = pd.read_pickle('preprocessed_deeds.pkl')
+
+    texts = preprocessed_data['original_text']
+    preprocessed_text_list = texts.apply(lambda x: {"original_text": x}).tolist()
+
+    bow_df, vectorizer = preprocess_bag_of_words(preprocessed_text_list)
+
+    X = bow_df
+    y = preprocessed_data['is_racist']
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+
+    logistic_model = LogisticRegression(max_iter=1000)
+    logistic_model.fit(X_train, y_train)
+
+    # Save the model and vectorizer
+    with open('vectorizer.pkl', 'wb') as vec_file:
+        pickle.dump(vectorizer, vec_file)
+    with open('logistic_model.pkl', 'wb') as model_file:
+        pickle.dump(logistic_model, model_file)
+
+    y_pred = logistic_model.predict(X_test)
+
+    accuracy = accuracy_score(y_test, y_pred)
+    print(f"Accuracy: {accuracy:.2f}")
+    print("\nClassification Report:")
+    print(classification_report(y_test, y_pred))
+
+    conf_matrix = confusion_matrix(y_test, y_pred)
+    plt.figure(figsize=(6, 4))
+    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-racist', 'Racist'], yticklabels=['Non-racist', 'Racist'])
+    plt.title('Confusion Matrix')
+    plt.xlabel('Predicted')
+    plt.ylabel('Actual')
+    plt.show()
+
+    y_prob = logistic_model.predict_proba(X_test)[:, 1]
+    fpr, tpr, _ = roc_curve(y_test, y_prob)
+    roc_auc = auc(fpr, tpr)
+
+    plt.figure(figsize=(6, 4))
+    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
+    plt.plot([0, 1], [0, 1], 'k--')
+    plt.xlim([0.0, 1.0])
+    plt.ylim([0.0, 1.05])
+    plt.xlabel('False Positive Rate')
+    plt.ylabel('True Positive Rate')
+    plt.title('Receiver Operating Characteristic (ROC) Curve')
+    plt.legend(loc="lower right")
+    plt.show()
+
+    feature_importance = pd.Series(logistic_model.coef_[0], index=vectorizer.get_feature_names_out())
+    top_features = feature_importance.nlargest(10)
+
+    plt.figure(figsize=(8, 6))
+    top_features.plot(kind='barh', color='skyblue')
+    plt.title('Top 10 Most Influential Words for Racist Classification')
+    plt.xlabel('Coefficient Value')
+    plt.ylabel('Word')
+    plt.show()
+
+# Function to make predictions based on the trained model
+def predict(processed_text, vectorizer, logistic_model):
+    bow_text = vectorizer.transform([processed_text["original_text"]])
+    prediction = logistic_model.predict(bow_text)
+    return {
+        'is_racist': bool(prediction[0]),
+    }
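Note: the module-level predict helper at the bottom is what app.py imports; a sketch of using it with the pickled artifacts from this directory, assuming the plotting libraries imported at the top of the file are installed so the import succeeds. The sample text is invented and the printed result depends on the trained model.

```python
# Load the saved vectorizer and model, then score one preprocessed text.
import pickle
from modules.model_experimentation.bag_of_words_logistic_regression import predict

with open('modules/model_experimentation/vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)
with open('modules/model_experimentation/logistic_model.pkl', 'rb') as f:
    logistic_model = pickle.load(f)

# predict() only reads the "original_text" key of the preprocessor output.
sample = {"original_text": "said premises shall not be sold to any person not of the white race"}
print(predict(sample, vectorizer, logistic_model))  # {'is_racist': True/False}
```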
modules/model_experimentation/logistic_model.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bedad659905586b85bd65fd9473b9859d65871a4819df69ccf453a5cfde2229f
+size 316116
modules/model_experimentation/preprocessed_deeds.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d2d25baf91285511e68a177d9410bfc6ec6b12f54ac9b2d0a49dd4dba7282bf
+size 80115994
modules/model_experimentation/vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:11d1802aa1d14f29acd8f3914cb6ad066958832b34a532c3fab0bc5f28878c9d
+size 223071
modules/modules.md
ADDED
@@ -0,0 +1,19 @@
+### azure_cloud_ocr
+
+Contains modules related to Microsoft Azure OCR
+
+### data_retrieval
+
+Contains Google Colab script to fetch TIFFs from Drive
+
+### deed_preprocessing
+
+Includes code to run all TIFFs through OCR and preprocessing. Also contains EDA.
+
+### google_cloud_ocr
+
+Contains modules related to Google Cloud OCR
+
+### last_year
+
+Contains last year's modules, which are not being used.
modules/openai/__pycache__/racist_chatgpt_analysis.cpython-310.pyc
ADDED
Binary file (1.23 kB)
modules/openai/batch/batch_instruc.md
ADDED
@@ -0,0 +1,57 @@
+# Calling OpenAI Batch Steps
+
+Guide to the OpenAI Batch API --> https://platform.openai.com/docs/guides/batch
+
+---
+
+## Step 1: Prepare Batch File (prepare_batch.py)
+
+1. Prepare a .jsonl file containing your batch requests. Each line represents a single API request.
+2. Usage: Run the script to process all files in the specified directory and generate the batch file.
+
+---
+
+## Step 2: Upload Batch File (upload_batch_file.py)
+
+1. Upload the prepared batch file to OpenAI using the Files API.
+2. The script uploads batch_input.jsonl and returns the file ID.
+
+---
+
+## Step 3: Create Batch (create_batch.py)
+
+1. Create a batch job using the uploaded file's ID. Add the file ID obtained from running upload_batch_file.py to create_batch.py.
+2. Returns a batch ID.
+
+---
+
+## Step 4: Check Batch Status (check_batch_status.py)
+
+1. Use the batch ID from create_batch.py to check the status of the batch job and monitor progress.
+   Possible statuses:
+   1. validating: Validating the input file.
+   2. in_progress: Batch is running.
+   3. completed: Batch is finished and results are ready.
+   4. failed: Validation failed.
+   5. expired: Batch did not complete within the window.
+2. Retrieves the output_file_id if completed.
+
+---
+
+## Step 5: Retrieve Results (retrieve_results.py)
+
+1. Download the results using the output_file_id retrieved from the batch status.
+
+---
+
+## Helper Scripts
+
+1. Cancel Batch (cancel_batch.py)
+   1. Cancel an ongoing batch if required. Changes the batch status to cancelling and eventually cancelled.
+
+2. List Batches (list_batches.py)
+   1. View all batches created, including their status and metadata.
+
+## Consolidated script that combines all the functionalities (except helpers)
+
+1. batch_processing.py
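Note: for concreteness, each line that prepare_batch.py writes to batch_input.jsonl is one self-contained request object; a sketch of a single entry, with a placeholder custom_id and placeholder message text.

```python
# Shape of one batch_input.jsonl line, mirroring the entries built in prepare_batch.py.
import json

entry = {
    "custom_id": "deed_0001.txt",  # placeholder file name used as the request identifier
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "gpt-4o-mini",
        "messages": [
            {"role": "system", "content": "...racial-content screening prompt..."},
            {"role": "user", "content": "...OCR'd deed text..."},
        ],
        "max_tokens": 1000,
    },
}
print(json.dumps(entry))  # one JSON object per line in the .jsonl file
```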
modules/openai/batch/batch_processing.py
ADDED
@@ -0,0 +1,115 @@
+import os
+import json
+from dotenv import load_dotenv
+from openai import OpenAI
+
+load_dotenv()
+
+class BatchProcessor:
+    def __init__(self):
+        self.client = OpenAI(
+            api_key=os.getenv("OPENAI_API_KEY"),
+            organization=os.getenv("OPENAI_ORG_ID")
+        )
+
+    def prepare_batch(self, folder_path, output_file):
+        """Prepare a batch input file from a folder of text files."""
+        with open(output_file, "w") as out_file:
+            for filename in os.listdir(folder_path):
+                if filename.endswith(".txt"):
+                    file_path = os.path.join(folder_path, filename)
+                    with open(file_path, "r") as file:
+                        text = file.read()
+
+                    batch_entry = {
+                        "custom_id": filename,
+                        "method": "POST",
+                        "url": "/v1/chat/completions",
+                        "body": {
+                            "model": "gpt-4o-mini",
+                            "messages": [
+                                {
+                                    "role": "system",
+                                    "content": (
+                                        "You are a helpful assistant designed to check if there's any racial content. "
+                                        "Please review this document for any racial or discriminatory expressions. "
+                                        "If yes, return 'Yes'; if there's none, please return 'No racial content found'. "
+                                        "If there is any doubt or ambiguity, assume the text contains racial content and respond 'Yes'."
+                                    )
+                                },
+                                {"role": "user", "content": text}
+                            ],
+                            "max_tokens": 1000
+                        }
+                    }
+                    out_file.write(json.dumps(batch_entry) + "\n")
+        print(f"Batch file created: {output_file}")
+
+    def upload_batch_file(self, batch_file_path):
+        """Upload the prepared batch input file."""
+        with open(batch_file_path, "rb") as f:
+            batch_input_file = self.client.files.create(
+                file=f,
+                purpose="batch"
+            )
+        print(f"Batch input file uploaded. File ID: {batch_input_file.id}")
+        return batch_input_file.id
+
+    def create_batch(self, file_id):
+        """Create a batch job with the uploaded input file."""
+        batch = self.client.batches.create(
+            input_file_id=file_id,
+            endpoint="/v1/chat/completions",
+            completion_window="24h",
+            metadata={
+                "description": "Deed analysis batch"
+            }
+        )
+        print(f"Batch created. Batch ID: {batch.id}")
+        return batch.id
+
+    def check_batch_status(self, batch_id):
+        """Check the status of a batch job."""
+        batch_status = self.client.batches.retrieve(batch_id)
+        print(f"Batch Status: {batch_status.status}")
+        if batch_status.status == "completed":
+            output_file_id = batch_status.output_file_id
+            print(f"Output File ID: {output_file_id}")
+            return output_file_id
+        else:
+            return None
+
+    def retrieve_results(self, output_file_id, output_path):
+        """Retrieve the results of a completed batch job."""
+        file_response = self.client.files.content(output_file_id)
+        with open(output_path, "wb") as out_file:
+            out_file.write(file_response.read())
+        print(f"Batch results downloaded to {output_path}")
+
+if __name__ == "__main__":
+    processor = BatchProcessor()
+
+    folder_path = ""
+    batch_input_file = "batch_input.jsonl"
+    batch_output_file = "batch_output.jsonl"
+
+    # Step 1: Prepare the batch input file
+    processor.prepare_batch(folder_path, batch_input_file)
+
+    # Step 2: Upload the batch input file
+    file_id = processor.upload_batch_file(batch_input_file)
+
+    # Step 3: Create a batch job
+    batch_id = processor.create_batch(file_id)
+
+    # Step 4: Poll for batch status
+    import time
+    while True:
+        output_file_id = processor.check_batch_status(batch_id)
+        if output_file_id:
+            break
+        print("Batch not complete. Retrying in 30 minutes...")
+        time.sleep(1800)
+
+    # Step 5: Retrieve the results
+    processor.retrieve_results(output_file_id, batch_output_file)
modules/openai/batch/cancel_batch.py
ADDED
@@ -0,0 +1,18 @@
+from openai import OpenAI
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+def cancel_batch(batch_id):
+    client = OpenAI(
+        api_key=os.getenv("OPENAI_API_KEY"),
+        organization=os.getenv("OPENAI_ORG_ID")
+    )
+
+    client.batches.cancel(batch_id)
+    print(f"Batch {batch_id} cancelled.")
+
+if __name__ == "__main__":
+    batch_id = ""  # batch id here obtained from create_batch.py
+    cancel_batch(batch_id)
modules/openai/batch/check_batch_status.py
ADDED
@@ -0,0 +1,26 @@
+from openai import OpenAI
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+def check_batch_status(batch_id):
+    client = OpenAI(
+        api_key=os.getenv("OPENAI_API_KEY"),
+        organization=os.getenv("OPENAI_ORG_ID")
+    )
+
+    batch_status = client.batches.retrieve(batch_id)
+    print(f"Batch Status: {batch_status.status}")
+
+    if batch_status.status == "completed":
+        output_file_id = batch_status.output_file_id
+        print(f"Output File ID: {output_file_id}")
+        return output_file_id
+    else:
+        print(f"Batch Status: {batch_status.status}")
+        return None
+
+if __name__ == "__main__":
+    batch_id = ""  # batch id here
+    check_batch_status(batch_id)
modules/openai/batch/create_batch.py
ADDED
@@ -0,0 +1,27 @@
+from openai import OpenAI
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+def create_batch(file_id):
+    client = OpenAI(
+        api_key=os.getenv("OPENAI_API_KEY"),
+        organization=os.getenv("OPENAI_ORG_ID")
+    )
+
+    batch = client.batches.create(
+        input_file_id=file_id,
+        endpoint="/v1/chat/completions",
+        completion_window="24h",
+        metadata={
+            "description": "Deed analysis batch"
+        }
+    )
+
+    print(f"Batch created. Batch ID: {batch.id}")
+    return batch.id
+
+if __name__ == "__main__":
+    file_id = ""  # file id here obtained from running upload_batch_file.py
+    create_batch(file_id)
modules/openai/batch/list_batches.py
ADDED
@@ -0,0 +1,18 @@
+from openai import OpenAI
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+def list_batches(limit=10):
+    client = OpenAI(
+        api_key=os.getenv("OPENAI_API_KEY"),
+        organization=os.getenv("OPENAI_ORG_ID")
+    )
+
+    batches = client.batches.list(limit=limit)
+    for batch in batches.data:
+        print(f"Batch ID: {batch.id}, Status: {batch.status}")
+
+if __name__ == "__main__":
+    list_batches()
modules/openai/batch/prepare_batch.py
ADDED
@@ -0,0 +1,40 @@
+import os
+import json
+
+def prepare_batch(folder_path, output_file):
+    with open(output_file, "w") as out_file:
+        for filename in os.listdir(folder_path):
+            if filename.endswith(".txt"):
+                file_path = os.path.join(folder_path, filename)
+
+                # Read the content of the file
+                with open(file_path, "r") as file:
+                    text = file.read()
+
+                # Create a batch entry
+                batch_entry = {
+                    "custom_id": filename,
+                    "method": "POST",
+                    "url": "/v1/chat/completions",
+                    "body": {
+                        "model": "gpt-4o-mini",
+                        "messages": [
+                            {
+                                "role": "system",
+                                "content": (
+                                    "You are a helpful assistant designed to check if there's any racial content. "
+                                    "Please review this document for any racial or discriminatory expressions. "
+                                    "If yes, return 'Yes'; if there's none, please return 'No racial content found'. "
+                                    "If there is any doubt or ambiguity, assume the text contains racial content and respond 'Yes'."
+                                )
+                            },
+                            {"role": "user", "content": text}
+                        ],
+                        "max_tokens": 1000
+                    }
+                }
+                out_file.write(json.dumps(batch_entry) + "\n")
+    print(f"Batch file created: {output_file}")
+
+if __name__ == "__main__":
+    prepare_batch("folder_of_deeds", "batch_input.jsonl")  # add folder of deeds to pass into openai
modules/openai/batch/retrieve_results.py
ADDED
@@ -0,0 +1,23 @@
+from openai import OpenAI
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+def retrieve_results(output_file_id, output_path):
+    client = OpenAI(
+        api_key=os.getenv("OPENAI_API_KEY"),
+        organization=os.getenv("OPENAI_ORG_ID")
+    )
+
+    file_response = client.files.content(output_file_id)
+
+    # Write the binary content to the output file
+    with open(output_path, "wb") as out_file:
+        out_file.write(file_response.read())
+
+    print(f"Batch results downloaded to {output_path}")
+
+if __name__ == "__main__":
+    output_file_id = ""  # Replace with your actual output file ID
+    retrieve_results(output_file_id, "batch_output.jsonl")
modules/openai/batch/upload_batch_file.py
ADDED
@@ -0,0 +1,23 @@
+from openai import OpenAI
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+def upload_batch_file(batch_file_path):
+    client = OpenAI(
+        api_key=os.getenv("OPENAI_API_KEY"),
+        organization=os.getenv("OPENAI_ORG_ID")
+    )
+
+    with open(batch_file_path, "rb") as f:
+        batch_input_file = client.files.create(
+            file=f,
+            purpose="batch"
+        )
+
+    print(f"Batch input file uploaded. File ID: {batch_input_file.id}")
+    return batch_input_file.id
+
+if __name__ == "__main__":
+    upload_batch_file("batch_input.jsonl")
modules/openai/gpt_extract.py
ADDED
@@ -0,0 +1,66 @@
+import os
+import openai
+import pandas as pd
+from dotenv import load_dotenv
+
+load_dotenv()
+
+openai.api_key = os.getenv('OPENAI_API_KEY')
+
+folder_path = "racist_deeds_text"
+
+output_csv = "deed_names_locations.csv"
+
+data = []
+
+def extract_names_and_locations(text):
+    """
+    Extract names and locations from text using OpenAI.
+    """
+    try:
+        response = openai.ChatCompletion.create(
+            model="gpt-4",
+            messages=[
+                {
+                    "role": "system",
+                    "content": (
+                        "You are an assistant that extracts names and locations from legal text. "
+                        "For the given input, identify all names of people (grantors, grantees) and "
+                        "locations (addresses, city, county, state). "
+                        "Return the names as a comma-separated list and locations as a separate comma-separated list "
+                        "strictly in the format:\nNames: [comma-separated names]\nLocations: [comma-separated locations]."
+                    )
+                },
+                {"role": "user", "content": text}
+            ]
+        )
+        output = response.choices[0].message.content.strip()
+
+        names, locations = "", ""
+
+        for line in output.split("\n"):
+            if line.startswith("Names:"):
+                names = line.replace("Names:", "").strip()
+            elif line.startswith("Locations:"):
+                locations = line.replace("Locations:", "").strip()
+
+        return names, locations
+    except Exception as e:
+        print(f"Error extracting names and locations: {e}")
+        return "", ""
+
+for filename in os.listdir(folder_path):
+    if filename.endswith(".txt"):
+        file_path = os.path.join(folder_path, filename)
+
+        with open(file_path, "r") as file:
+            text = file.read()
+
+        names, locations = extract_names_and_locations(text)
+
+        data.append({"Filename": filename, "Names": names, "Locations": locations})
+        print(f"Processed {filename}")
+
+df = pd.DataFrame(data)
+df.to_csv(output_csv, index=False)
+print(f"Results saved to {output_csv}")
modules/openai/racist_chatgpt_analysis.py
ADDED
@@ -0,0 +1,30 @@
+import openai
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
+openai.organization = os.getenv('OPENAI_ORG_ID')
+openai.api_key = os.getenv('OPENAI_API_KEY')
+
+def racist_chatgpt_analysis(text):
+    try:
+        response = openai.ChatCompletion.create(
+            model="gpt-4o-mini",
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant designed to check if there's any racial contents. \
+                        Please review this document for any racial or discriminatory expressions. \
+                        If yes, return 'Yes', if there's none, please return 'No racial content found'. \
+                        If there is any doubt or ambiguity, assume the text contains racial content and respond 'Yes'."
+                },
+                {"role": "user", "content": text}
+            ]
+        )
+        if response.choices[0].message.content.strip() == "Yes":
+            return True
+        else:
+            return False
+    except Exception as e:
+        print(f"Error: {e}")
+        return False
requirements.txt
CHANGED
@@ -1,2 +1,14 @@
-
-
+pytesseract
+tesseract
+Pillow
+openai
+python-dotenv
+python-dotenv
+google-cloud-vision
+spacy
+openai==0.28
+Flask==2.x.x
+Flask-Cors==3.x.x
+gunicorn==20.x.x
+pandas==1.x.x
+xlsxwriter==3.x.x