Spaces:
Sleeping
Sleeping
WebashalarForML
committed on
Upload 4 files
Browse files- BACKUP.PY +335 -0
- main.py +350 -0
- readme +150 -0
- requirements.txt +71 -0
BACKUP.PY
ADDED
@@ -0,0 +1,335 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
from flask import Flask, request, render_template, redirect, url_for, session, flash, send_from_directory, send_file
|
4 |
+
from werkzeug.utils import secure_filename
|
5 |
+
from utils.file_to_text import extract_text_based_on_format, preprocess_text
|
6 |
+
from utils.anoter_to_json import process_uploaded_json
|
7 |
+
from utils.json_to_spacy import convert_json_to_spacy
|
8 |
+
from utils.model import train_model
|
9 |
+
import zipfile
|
10 |
+
|
11 |
+
app = Flask(__name__)
# Session cookies and flash messages are signed with this key. Prefer a value
# supplied through the SECRET_KEY environment variable; the literal fallback
# only keeps existing setups working and must not be relied on in production.
app.secret_key = os.environ.get('SECRET_KEY', 'your_secret_key')

# Folder paths
app.config['UPLOAD_FOLDER'] = 'uploads'  # uploaded resume files
app.config['JSON_FOLDER'] = 'JSON'       # uploaded annotation JSON files
app.config['DATA_FOLDER'] = 'data'       # resume_text.txt and converted data
app.config['MODELS_FOLDER'] = 'Models'   # searched by /download_model

# Allowed file extensions (compared lower-cased, without the leading dot)
# NOTE(review): 'rsf' looks like a typo for 'rtf' -- confirm against
# extract_text_based_on_format before changing it.
ALLOWED_EXTENSIONS = {'pdf', 'docx', 'rsf', 'odt', 'png', 'jpg', 'jpeg', 'json'}
|
22 |
+
|
23 |
+
# Function to check file extensions
|
24 |
+
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The text after the last dot is lower-cased before the lookup; a name
    without any dot is rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
|
26 |
+
|
27 |
+
@app.route('/')
def index():
    """Render the 'upload.html' template for the site root."""
    page = render_template('upload.html')
    return page
|
30 |
+
|
31 |
+
# API for uploading Resume files
|
32 |
+
@app.route('/upload',methods=['GET', 'POST'])
def upload_file():
    """Accept a resume upload, save it, and show its extracted text.

    Non-JSON files are handed to process_other_files, which renders the
    extracted text. JSON files are only stored in UPLOAD_FOLDER. Every other
    outcome flashes a message and redirects to the index page.
    """
    # Redirecting to request.url (this same route) would send the browser
    # back here with a GET that never carries a file, looping forever.
    home = url_for('index')
    try:
        if 'file' not in request.files:
            flash('No file part', 'error')
            return redirect(home)

        file = request.files['file']
        if file.filename == '':
            flash('No selected file', 'error')
            return redirect(home)

        # Validate the sanitized name: secure_filename can drop characters
        # and leave a name with no usable extension, or no name at all.
        filename = secure_filename(file.filename)
        if not filename or not allowed_file(filename):
            flash('File type not allowed', 'error')
            return redirect(home)

        os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(file_path)

        # Handle text extraction for non-JSON files
        if not filename.lower().endswith('.json'):
            return process_other_files(file_path, filename)

        # A stored JSON file is a success, not a rejected file type.
        flash(f'File {filename} uploaded successfully')
    except Exception as e:
        flash(f"Error: {str(e)}", 'error')

    return redirect(home)
|
58 |
+
|
59 |
+
# Process non-JSON files, extract text and save to 'resume_text.txt'
|
60 |
+
def process_other_files(file_path, filename):
    """Extract text from a saved upload, store it, and render it for editing.

    Args:
        file_path: Path of the saved upload, passed to
            extract_text_based_on_format.
        filename: Name of the upload; stored in the session under
            'uploaded_file' and used in the error message.

    Returns:
        The rendered 'text.html' page with the cleaned text. On any failure,
        a redirect to the referring page (or the index page when the request
        has no Referer header) after flashing the error.
    """
    try:
        extracted_text, _ = extract_text_based_on_format(file_path)
        cleaned_text = preprocess_text(extracted_text)

        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')

        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

        session['uploaded_file'] = filename
        return render_template('text.html', text=cleaned_text)
    except Exception as e:
        flash(f"Error processing file {filename}: {str(e)}", 'error')
        # request.referrer is None without a Referer header, and
        # redirect(None) would turn this handled error into a server error.
        return redirect(request.referrer or url_for('index'))
|
76 |
+
|
77 |
+
# API to handle the text editing and saving
|
78 |
+
@app.route('/edit_text', methods=['POST'])
def edit_text():
    """Save the submitted 'edited_text' form field to resume_text.txt.

    Returns:
        The rendered 'text.html' page showing the saved text. On any failure,
        a redirect to the referring page (or the index page when the request
        has no Referer header) after flashing the error.
    """
    try:
        # Get the edited text from the form
        edited_text = request.form['edited_text']

        # The data folder may not exist yet if nothing has been uploaded
        # since the process started.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(edited_text)

        flash('Text edited successfully', 'success')
        # Pass the edited text back to the template
        return render_template('text.html', text=edited_text)
    except Exception as e:
        flash(f"Error saving edited text: {str(e)}", 'error')
        # request.referrer is None without a Referer header.
        return redirect(request.referrer or url_for('index'))
|
95 |
+
|
96 |
+
# API for downloading the 'resume_text.txt' file
|
97 |
+
@app.route('/download', methods=['GET'])
def download_file():
    """Send 'resume_text.txt' from DATA_FOLDER as an attachment.

    Returns:
        The file response. On any failure, a redirect to the referring page
        (or the index page when the request has no Referer header) after
        flashing the error.
    """
    try:
        return send_from_directory(app.config['DATA_FOLDER'], 'resume_text.txt', as_attachment=True)
    except Exception as e:
        flash(f"Error downloading file: {str(e)}", 'error')
        # request.referrer is None when the URL is opened directly.
        return redirect(request.referrer or url_for('index'))
|
104 |
+
|
105 |
+
@app.route('/save_and_download', methods=['POST'])
def save_and_download():
    """Save the submitted 'edited_text' field and send the file back.

    The text is written to resume_text.txt in DATA_FOLDER and that file is
    returned as an attachment.

    Returns:
        The file response. On any failure, a redirect to the referring page
        (or the index page when the request has no Referer header) after
        flashing the error.
    """
    try:
        # Get the edited text from the form
        edited_text = request.form['edited_text']

        # The data folder may not exist yet if nothing has been uploaded
        # since the process started.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(edited_text)

        # No success flash here: the response is a file download, so the
        # message would only surface on some later, unrelated page.
        return send_from_directory(app.config['DATA_FOLDER'], 'resume_text.txt', as_attachment=True)

    except Exception as e:
        flash(f"Error saving and downloading file: {str(e)}", 'error')
        # request.referrer is None without a Referer header.
        return redirect(request.referrer or url_for('index'))
|
124 |
+
|
125 |
+
|
126 |
+
# API for uploading and processing JSON files
|
127 |
+
@app.route('/upload_json', methods=['POST'])
def upload_json_file():
    """Store an uploaded '.json' file in JSON_FOLDER.

    The sanitized file name is remembered in the session under
    'uploaded_json'. The outcome is reported through a flash message.

    Returns:
        A redirect to the referring page, or to the index page when the
        request has no Referer header.
    """
    # This route only accepts POST, so redirecting to request.url would make
    # the browser issue a GET here and receive 405 Method Not Allowed.
    back = request.referrer or url_for('index')
    try:
        if 'file' not in request.files:
            flash('No file part', 'error')
            return redirect(back)

        file = request.files['file']
        if file.filename == '':
            flash('No selected file', 'error')
            return redirect(back)

        if file and file.filename.lower().endswith('.json'):
            filename = secure_filename(file.filename)
            json_path = os.path.join(app.config['JSON_FOLDER'], filename)
            os.makedirs(app.config['JSON_FOLDER'], exist_ok=True)
            file.save(json_path)
            session['uploaded_json'] = filename
            flash(f'JSON file {filename} uploaded successfully')
        else:
            flash('File type not allowed', 'error')
    except Exception as e:
        flash(f"Error: {str(e)}", 'error')

    return redirect(back)
|
152 |
+
|
153 |
+
# Process uploaded JSON file and save formatted data
|
154 |
+
@app.route('/process_json', methods=['GET'])
def process_json_file():
    """Run process_uploaded_json on the first file found in JSON_FOLDER.

    Only regular files are considered, and they are sorted by name so the
    choice is deterministic when several are present.

    Returns:
        A redirect to the referring page, or to the index page when the
        request has no Referer header. The outcome is reported via flash.
    """
    back = request.referrer or url_for('index')
    try:
        json_folder = app.config['JSON_FOLDER']
        # os.listdir order is arbitrary and may include sub-directories.
        json_files = sorted(
            name for name in os.listdir(json_folder)
            if os.path.isfile(os.path.join(json_folder, name))
        )

        if not json_files:
            flash('No JSON files found in the folder', 'error')
            return redirect(back)

        filename = json_files[0]  # Modify logic if needed to handle multiple files
        json_path = os.path.join(json_folder, filename)

        # Create the data folder first; presumably process_uploaded_json
        # writes its output there -- TODO confirm.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        process_uploaded_json(json_path)

        flash(f'JSON file {filename} processed successfully')
    except Exception as e:
        flash(f"Error processing JSON file: {str(e)}", 'error')

    return redirect(back)
|
180 |
+
|
181 |
+
# API for removing uploaded JSON files
|
182 |
+
@app.route('/remove_json', methods=['POST'])
def remove_all_json_files():
    """Delete every regular file in JSON_FOLDER and clear 'uploaded_json'.

    Returns:
        A redirect to the referring page, or to the index page when the
        request has no Referer header. The outcome is reported via flash.
    """
    try:
        json_folder = app.config['JSON_FOLDER']
        for filename in os.listdir(json_folder):
            file_path = os.path.join(json_folder, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
        session.pop('uploaded_json', None)

        flash('All JSON files removed successfully')
    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    # request.referrer is None without a Referer header; redirect(None) fails.
    return redirect(request.referrer or url_for('index'))
|
197 |
+
|
198 |
+
# API for removing non-JSON files
|
199 |
+
@app.route('/remove', methods=['POST'])
def remove_file():
    """Delete every regular file in UPLOAD_FOLDER, then go back to the index.

    When the folder is missing an error is flashed instead. On success the
    'uploaded_file' session entry is cleared.
    """
    try:
        target_dir = app.config['UPLOAD_FOLDER']

        if not os.path.exists(target_dir):
            flash(f"Upload folder does not exist", 'error')
        else:
            # Only plain files are removed; sub-directories are left alone.
            for entry in os.listdir(target_dir):
                candidate = os.path.join(target_dir, entry)
                if os.path.isfile(candidate):
                    os.remove(candidate)

            # Forget the upload recorded for this session.
            session.pop('uploaded_file', None)
            flash('All files removed successfully')

    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    return redirect(url_for('index'))
|
224 |
+
|
225 |
+
# HTML render routes (modify to fit your structure)
|
226 |
+
@app.route('/ner_preview', methods=['GET'])
def ner_preview():
    """Render the 'anoter.html' template."""
    page = render_template('anoter.html')
    return page
|
229 |
+
|
230 |
+
@app.route('/json', methods=['GET'])
def json_file():
    """Render the 'savejson.html' template."""
    page = render_template('savejson.html')
    return page
|
233 |
+
|
234 |
+
@app.route('/spacy', methods=['GET'])
def spacy_file():
    """Render the 'saveSpacy.html' template."""
    page = render_template('saveSpacy.html')
    return page
|
237 |
+
|
238 |
+
# @app.route('/text', methods=['GET'])
|
239 |
+
# def spacy_file():
|
240 |
+
# return render_template('text.html')
|
241 |
+
|
242 |
+
@app.route('/to_sapcy', methods=['POST'])
def to_sapcy():
    """Convert 'data/Json_Data.json' into 'data/Spacy_data.spacy'.

    The work is delegated to convert_json_to_spacy.

    Returns:
        A redirect to the referring page, or to the index page when the
        request has no Referer header. The outcome is reported via flash.
    """
    try:
        # Path to the JSON file
        json_file_path = 'data/Json_Data.json'
        # Destination of the converted .spacy file
        spacy_file_path = 'data/Spacy_data.spacy'

        # Call the conversion function
        convert_json_to_spacy(json_file_path, spacy_file_path)

        flash('Model training data converted successfully', 'success')
    except Exception as e:
        flash(f"Error during conversion: {str(e)}", 'error')

    # request.referrer is None without a Referer header; redirect(None) fails.
    return redirect(request.referrer or url_for('index'))
|
258 |
+
|
259 |
+
@app.route('/train_model_endpoint', methods=['POST'])
def train_model_endpoint():
    """Train the model with the epochs and version submitted in the form.

    Form fields:
        epochs: Number of training epochs; defaults to 10, must be >= 1.
        model_version: Label appended to the model path; defaults to 'v1'.

    Returns:
        A redirect to the index page. The outcome is reported via flash.
    """
    try:
        # Get the number of epochs and model version from the request
        epochs = int(request.form.get('epochs', 10))  # Default to 10 if not provided
        if epochs < 1:
            raise ValueError('epochs must be a positive integer')

        # The version becomes part of a filesystem path, so strip path
        # separators and '..' segments that could point outside ./Models.
        # Fall back to 'v1' when nothing usable remains.
        version = secure_filename(request.form.get('model_version', 'v1')) or 'v1'

        # Call the training function with user-defined parameters
        model_path = f"./Models/ner_model_{version}"
        train_model(epochs, model_path)

        flash('Model training completed successfully', 'success')
    except Exception as e:
        flash(f"Error during training: {str(e)}", 'error')

    return redirect(url_for('index'))
|
275 |
+
|
276 |
+
# API for removing all files from specific folders
|
277 |
+
@app.route('/remove_files', methods=['POST'])
def remove_files():
    """Delete every regular file in UPLOAD_FOLDER and JSON_FOLDER.

    The 'uploaded_file' and 'uploaded_json' session entries are cleared too.

    Returns:
        A redirect to the index page. The outcome is reported via flash.
    """
    try:
        # Define folders to clear
        folders_to_clear = [app.config['UPLOAD_FOLDER'], app.config['JSON_FOLDER']]

        for folder_path in folders_to_clear:
            # A folder that was never created has nothing to clear; skipping
            # it keeps one missing folder from aborting the remaining ones.
            if not os.path.isdir(folder_path):
                continue
            for filename in os.listdir(folder_path):
                file_path = os.path.join(folder_path, filename)
                if os.path.isfile(file_path):
                    os.remove(file_path)

        # Clear session variables related to the removed folders
        session.pop('uploaded_file', None)
        session.pop('uploaded_json', None)

        flash('All files removed from folder successfully')
    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    return redirect(url_for('index'))
|
299 |
+
|
300 |
+
# API for downloading the latest trained model
|
301 |
+
@app.route('/download_model', methods=['GET'])
def download_latest_model():
    """Zip the most recently modified entry in MODELS_FOLDER and send it.

    Archives left behind by earlier downloads ('*.zip') are ignored when
    choosing the model. A model stored as a directory is archived
    recursively; a single file is archived on its own.

    Returns:
        The zip file as an attachment. On any failure, a redirect to the
        referring page (or the index page when the request has no Referer
        header) after flashing the error.
    """
    back = request.referrer or url_for('index')
    try:
        models_dir = app.config['MODELS_FOLDER']
        # Skip our own earlier output, otherwise a previous
        # '<model>.zip' could be picked and zipped again.
        model_files = [
            name for name in os.listdir(models_dir)
            if not name.lower().endswith('.zip')
        ]

        if not model_files:
            flash('No model files found', 'error')
            return redirect(back)

        # "Latest" means most recently modified; sorting by name would rank
        # e.g. 'v9' above 'v10'.
        latest_model_file = max(
            model_files,
            key=lambda name: os.path.getmtime(os.path.join(models_dir, name)),
        )
        model_path = os.path.join(models_dir, latest_model_file)

        # Create a zip file with the model
        zip_filename = os.path.join(models_dir, f"{latest_model_file}.zip")

        with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
            if os.path.isdir(model_path):
                # ZipFile.write on a directory stores only the directory
                # entry, so walk the tree and add every file explicitly.
                for root, _dirs, files in os.walk(model_path):
                    zipf.write(root, os.path.relpath(root, models_dir))
                    for fname in files:
                        full_path = os.path.join(root, fname)
                        zipf.write(full_path, os.path.relpath(full_path, models_dir))
            else:
                zipf.write(model_path, os.path.basename(model_path))

        # Pass an absolute path so the lookup does not depend on how a
        # relative path would be resolved.
        return send_file(os.path.abspath(zip_filename), as_attachment=True)

    except Exception as e:
        flash(f"Error while downloading the model: {str(e)}", 'error')
        return redirect(back)
|
333 |
+
|
334 |
+
if __name__ == '__main__':
    # The interactive debugger lets anyone who can reach the server execute
    # code, so it is opt-in: set FLASK_DEBUG=1 (or "true") to enable it.
    app.run(debug=os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true'))
|
main.py
ADDED
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
from flask import Flask, request, render_template, redirect, url_for, session, flash, send_from_directory, send_file
|
4 |
+
from werkzeug.utils import secure_filename
|
5 |
+
from utils.file_to_text import extract_text_based_on_format, preprocess_text
|
6 |
+
from utils.anoter_to_json import process_uploaded_json
|
7 |
+
from utils.json_to_spacy import convert_json_to_spacy
|
8 |
+
from utils.model import train_model
|
9 |
+
import zipfile
|
10 |
+
|
11 |
+
app = Flask(__name__)
# Session cookies and flash messages are signed with this key. Prefer a value
# supplied through the SECRET_KEY environment variable; the literal fallback
# only keeps existing setups working and must not be relied on in production.
app.secret_key = os.environ.get('SECRET_KEY', 'your_secret_key')

# Folder paths
app.config['UPLOAD_FOLDER'] = 'uploads'  # uploaded resume files
app.config['JSON_FOLDER'] = 'JSON'       # uploaded annotation JSON files
app.config['DATA_FOLDER'] = 'data'       # resume_text.txt and converted data
app.config['MODELS_FOLDER'] = 'Models'   # searched by /download_model

# Allowed file extensions (compared lower-cased, without the leading dot)
# NOTE(review): 'rsf' looks like a typo for 'rtf' -- confirm against
# extract_text_based_on_format before changing it.
ALLOWED_EXTENSIONS = {'pdf', 'docx', 'rsf', 'odt', 'png', 'jpg', 'jpeg', 'json'}
|
22 |
+
|
23 |
+
# Function to check file extensions
|
24 |
+
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The text after the last dot is lower-cased before the lookup; a name
    without any dot is rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_EXTENSIONS
|
26 |
+
|
27 |
+
# HTML render routes (modify to fit your structure)
|
28 |
+
@app.route('/')
def index():
    """Render the 'upload.html' template for the site root."""
    page = render_template('upload.html')
    return page
|
31 |
+
@app.route('/guide')
def guide():
    """Render the 'guide.html' template."""
    page = render_template('guide.html')
    return page
|
34 |
+
|
35 |
+
@app.route('/ner_preview', methods=['GET'])
def ner_preview():
    """Render the 'anoter.html' template."""
    page = render_template('anoter.html')
    return page
|
38 |
+
|
39 |
+
@app.route('/json', methods=['GET'])
def json_file():
    """Render the 'savejson.html' template."""
    page = render_template('savejson.html')
    return page
|
42 |
+
|
43 |
+
@app.route('/spacy', methods=['GET'])
def spacy_file():
    """Render the 'saveSpacy.html' template."""
    page = render_template('saveSpacy.html')
    return page
|
46 |
+
|
47 |
+
@app.route('/text_preview', methods=['GET'])
def text_preview():
    """Show the contents of resume_text.txt from DATA_FOLDER in 'text.html'.

    Returns:
        The rendered page. When the file is missing or cannot be read, a
        redirect to the index page after flashing an error.
    """
    try:
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        if not os.path.exists(resume_file_path):
            flash('Resume text not found', 'error')
            return redirect(url_for('index'))

        # The file is written as UTF-8 by the upload and edit routes, so read
        # it the same way instead of relying on the platform default encoding.
        with open(resume_file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        return render_template('text.html', text=text)
    except Exception as e:
        flash(f"Error loading text preview: {str(e)}", 'error')
        return redirect(url_for('index'))
|
61 |
+
|
62 |
+
# API for uploading Resume files
|
63 |
+
@app.route('/upload',methods=['GET', 'POST'])
def upload_file():
    """Accept a resume upload, save it, and show its extracted text.

    Non-JSON files are handed to process_other_files, which renders the
    extracted text. JSON files are only stored in UPLOAD_FOLDER. Every other
    outcome flashes a message and redirects to the index page.
    """
    # Redirecting to request.url (this same route) would send the browser
    # back here with a GET that never carries a file, looping forever.
    home = url_for('index')
    try:
        if 'file' not in request.files:
            flash('No file part', 'error')
            return redirect(home)

        file = request.files['file']
        if file.filename == '':
            flash('No selected file', 'error')
            return redirect(home)

        # Validate the sanitized name: secure_filename can drop characters
        # and leave a name with no usable extension, or no name at all.
        filename = secure_filename(file.filename)
        if not filename or not allowed_file(filename):
            flash('File type not allowed', 'error')
            return redirect(home)

        os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(file_path)

        # Handle text extraction for non-JSON files
        if not filename.lower().endswith('.json'):
            return process_other_files(file_path, filename)

        # A stored JSON file is a success, not a rejected file type.
        flash(f'File {filename} uploaded successfully')
    except Exception as e:
        flash(f"Error: {str(e)}", 'error')

    return redirect(home)
|
89 |
+
|
90 |
+
# Process non-JSON files, extract text and save to 'resume_text.txt'
|
91 |
+
def process_other_files(file_path, filename):
    """Extract text from a saved upload, store it, and render it for editing.

    Args:
        file_path: Path of the saved upload, passed to
            extract_text_based_on_format.
        filename: Name of the upload; stored in the session under
            'uploaded_file' and used in the error message.

    Returns:
        The rendered 'text.html' page with the cleaned text. On any failure,
        a redirect to the referring page (or the index page when the request
        has no Referer header) after flashing the error.
    """
    try:
        extracted_text, _ = extract_text_based_on_format(file_path)
        cleaned_text = preprocess_text(extracted_text)

        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')

        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

        session['uploaded_file'] = filename
        return render_template('text.html', text=cleaned_text)
    except Exception as e:
        flash(f"Error processing file {filename}: {str(e)}", 'error')
        # request.referrer is None without a Referer header, and
        # redirect(None) would turn this handled error into a server error.
        return redirect(request.referrer or url_for('index'))
|
107 |
+
|
108 |
+
# API to handle the text editing and saving
|
109 |
+
@app.route('/edit_text', methods=['POST'])
def edit_text():
    """Save the submitted 'edited_text' form field to resume_text.txt.

    Returns:
        The rendered 'text.html' page showing the saved text. On any failure,
        a redirect to the referring page (or the index page when the request
        has no Referer header) after flashing the error.
    """
    try:
        # Get the edited text from the form
        edited_text = request.form['edited_text']

        # The data folder may not exist yet if nothing has been uploaded
        # since the process started.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(edited_text)

        flash('Text edited successfully', 'success')
        # Pass the edited text back to the template
        return render_template('text.html', text=edited_text)
    except Exception as e:
        flash(f"Error saving edited text: {str(e)}", 'error')
        # request.referrer is None without a Referer header.
        return redirect(request.referrer or url_for('index'))
|
126 |
+
|
127 |
+
# API for downloading the 'resume_text.txt' file
|
128 |
+
@app.route('/download', methods=['GET'])
def download_file():
    """Send 'resume_text.txt' from DATA_FOLDER as an attachment.

    Returns:
        The file response. On any failure, a redirect to the referring page
        (or the index page when the request has no Referer header) after
        flashing the error.
    """
    try:
        return send_from_directory(app.config['DATA_FOLDER'], 'resume_text.txt', as_attachment=True)
    except Exception as e:
        flash(f"Error downloading file: {str(e)}", 'error')
        # request.referrer is None when the URL is opened directly.
        return redirect(request.referrer or url_for('index'))
|
135 |
+
|
136 |
+
@app.route('/save_and_download', methods=['POST'])
def save_and_download():
    """Save the submitted 'edited_text' field and send the file back.

    The text is written to resume_text.txt in DATA_FOLDER and that file is
    returned as an attachment.

    Returns:
        The file response. On any failure, a redirect to the referring page
        (or the index page when the request has no Referer header) after
        flashing the error.
    """
    try:
        # Get the edited text from the form
        edited_text = request.form['edited_text']

        # The data folder may not exist yet if nothing has been uploaded
        # since the process started.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(edited_text)

        # No success flash here: the response is a file download, so the
        # message would only surface on some later, unrelated page.
        return send_from_directory(app.config['DATA_FOLDER'], 'resume_text.txt', as_attachment=True)

    except Exception as e:
        flash(f"Error saving and downloading file: {str(e)}", 'error')
        # request.referrer is None without a Referer header.
        return redirect(request.referrer or url_for('index'))
|
155 |
+
|
156 |
+
|
157 |
+
# API for uploading and processing JSON files
|
158 |
+
@app.route('/upload_json', methods=['POST'])
def upload_json_file():
    """Store an uploaded '.json' file in JSON_FOLDER.

    The sanitized file name is remembered in the session under
    'uploaded_json'. The outcome is reported through a flash message.

    Returns:
        A redirect to the referring page, or to the index page when the
        request has no Referer header.
    """
    # This route only accepts POST, so redirecting to request.url would make
    # the browser issue a GET here and receive 405 Method Not Allowed.
    back = request.referrer or url_for('index')
    try:
        if 'file' not in request.files:
            flash('No file part', 'error')
            return redirect(back)

        file = request.files['file']
        if file.filename == '':
            flash('No selected file', 'error')
            return redirect(back)

        if file and file.filename.lower().endswith('.json'):
            filename = secure_filename(file.filename)
            json_path = os.path.join(app.config['JSON_FOLDER'], filename)
            os.makedirs(app.config['JSON_FOLDER'], exist_ok=True)
            file.save(json_path)
            session['uploaded_json'] = filename
            flash(f'JSON file {filename} uploaded successfully')
        else:
            flash('File type not allowed', 'error')
    except Exception as e:
        flash(f"Error: {str(e)}", 'error')

    return redirect(back)
|
183 |
+
|
184 |
+
# Process uploaded JSON file and save formatted data
|
185 |
+
@app.route('/process_json', methods=['GET'])
def process_json_file():
    """Run process_uploaded_json on the first file found in JSON_FOLDER.

    Only regular files are considered, and they are sorted by name so the
    choice is deterministic when several are present.

    Returns:
        A redirect to the referring page, or to the index page when the
        request has no Referer header. The outcome is reported via flash.
    """
    back = request.referrer or url_for('index')
    try:
        json_folder = app.config['JSON_FOLDER']
        # os.listdir order is arbitrary and may include sub-directories.
        json_files = sorted(
            name for name in os.listdir(json_folder)
            if os.path.isfile(os.path.join(json_folder, name))
        )

        if not json_files:
            flash('No JSON files found in the folder', 'error')
            return redirect(back)

        filename = json_files[0]  # Modify logic if needed to handle multiple files
        json_path = os.path.join(json_folder, filename)

        # Create the data folder first; presumably process_uploaded_json
        # writes its output there -- TODO confirm.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        process_uploaded_json(json_path)

        flash(f'JSON file {filename} processed successfully')
    except Exception as e:
        flash(f"Error processing JSON file: {str(e)}", 'error')

    return redirect(back)
|
211 |
+
|
212 |
+
# API for removing uploaded JSON files
|
213 |
+
@app.route('/remove_json', methods=['POST'])
def remove_all_json_files():
    """Delete every regular file in JSON_FOLDER and clear 'uploaded_json'.

    Returns:
        A redirect to the referring page, or to the index page when the
        request has no Referer header. The outcome is reported via flash.
    """
    try:
        json_folder = app.config['JSON_FOLDER']
        for filename in os.listdir(json_folder):
            file_path = os.path.join(json_folder, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
        session.pop('uploaded_json', None)

        flash('All JSON files removed successfully')
    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    # request.referrer is None without a Referer header; redirect(None) fails.
    return redirect(request.referrer or url_for('index'))
|
228 |
+
|
229 |
+
# API for removing non-JSON files
|
230 |
+
@app.route('/remove', methods=['POST'])
def remove_file():
    """Delete every regular file in UPLOAD_FOLDER, then go back to the index.

    When the folder is missing an error is flashed instead. On success the
    'uploaded_file' session entry is cleared.
    """
    try:
        target_dir = app.config['UPLOAD_FOLDER']

        if not os.path.exists(target_dir):
            flash(f"Upload folder does not exist", 'error')
        else:
            # Only plain files are removed; sub-directories are left alone.
            for entry in os.listdir(target_dir):
                candidate = os.path.join(target_dir, entry)
                if os.path.isfile(candidate):
                    os.remove(candidate)

            # Forget the upload recorded for this session.
            session.pop('uploaded_file', None)
            flash('All files removed successfully')

    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    return redirect(url_for('index'))
|
255 |
+
|
256 |
+
|
257 |
+
@app.route('/to_sapcy', methods=['POST'])
def to_sapcy():
    """Convert 'data/Json_Data.json' into 'data/Spacy_data.spacy'.

    The work is delegated to convert_json_to_spacy.

    Returns:
        A redirect to the referring page, or to the index page when the
        request has no Referer header. The outcome is reported via flash.
    """
    try:
        # Path to the JSON file
        json_file_path = 'data/Json_Data.json'
        # Destination of the converted .spacy file
        spacy_file_path = 'data/Spacy_data.spacy'

        # Call the conversion function
        convert_json_to_spacy(json_file_path, spacy_file_path)

        flash('Model training data converted successfully', 'success')
    except Exception as e:
        flash(f"Error during conversion: {str(e)}", 'error')

    # request.referrer is None without a Referer header; redirect(None) fails.
    return redirect(request.referrer or url_for('index'))
|
273 |
+
|
274 |
+
@app.route('/train_model_endpoint', methods=['POST'])
def train_model_endpoint():
    """Train the model with the epochs and version submitted in the form.

    Form fields:
        epochs: Number of training epochs; defaults to 10, must be >= 1.
        model_version: Label appended to the model path; defaults to 'v1'.

    Returns:
        A redirect to the index page. The outcome is reported via flash.
    """
    try:
        # Get the number of epochs and model version from the request
        epochs = int(request.form.get('epochs', 10))  # Default to 10 if not provided
        if epochs < 1:
            raise ValueError('epochs must be a positive integer')

        # The version becomes part of a filesystem path, so strip path
        # separators and '..' segments that could point outside ./Models.
        # Fall back to 'v1' when nothing usable remains.
        version = secure_filename(request.form.get('model_version', 'v1')) or 'v1'

        # Call the training function with user-defined parameters
        model_path = f"./Models/ner_model_{version}"
        train_model(epochs, model_path)

        flash('Model training completed successfully', 'success')
    except Exception as e:
        flash(f"Error during training: {str(e)}", 'error')

    return redirect(url_for('index'))
|
290 |
+
|
291 |
+
# API for removing all files from specific folders
|
292 |
+
@app.route('/remove_files', methods=['POST'])
def remove_files():
    """Delete every regular file in the upload, JSON and models folders.

    Only plain files directly inside each folder are removed;
    sub-directories are left in place. The 'uploaded_file' and
    'uploaded_json' session entries are cleared too.

    Returns:
        A redirect to the index page. The outcome is reported via flash.
    """
    try:
        # Define folders to clear
        folders_to_clear = [
            app.config['UPLOAD_FOLDER'],
            app.config['JSON_FOLDER'],
            app.config['MODELS_FOLDER'],
        ]

        for folder_path in folders_to_clear:
            # A folder that was never created has nothing to clear; skipping
            # it keeps one missing folder from aborting the remaining ones.
            if not os.path.isdir(folder_path):
                continue
            for filename in os.listdir(folder_path):
                file_path = os.path.join(folder_path, filename)
                if os.path.isfile(file_path):
                    os.remove(file_path)

        # Clear session variables related to the removed folders
        session.pop('uploaded_file', None)
        session.pop('uploaded_json', None)

        flash('All files removed from folder successfully')
    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    return redirect(url_for('index'))
|
314 |
+
|
315 |
+
# API for downloading the latest trained model
|
316 |
+
@app.route('/download_model', methods=['GET'])
def download_latest_model():
    """Zip the most recently modified entry in MODELS_FOLDER and send it.

    Archives left behind by earlier downloads ('*.zip') are ignored when
    choosing the model. A model stored as a directory is archived
    recursively; a single file is archived on its own.

    Returns:
        The zip file as an attachment. On any failure, a redirect to the
        referring page (or the index page when the request has no Referer
        header) after flashing the error.
    """
    back = request.referrer or url_for('index')
    try:
        models_dir = app.config['MODELS_FOLDER']
        # Skip our own earlier output, otherwise a previous
        # '<model>.zip' could be picked and zipped again.
        model_files = [
            name for name in os.listdir(models_dir)
            if not name.lower().endswith('.zip')
        ]

        if not model_files:
            flash('No model files found', 'error')
            return redirect(back)

        # "Latest" means most recently modified; sorting by name would rank
        # e.g. 'v9' above 'v10'.
        latest_model_file = max(
            model_files,
            key=lambda name: os.path.getmtime(os.path.join(models_dir, name)),
        )
        model_path = os.path.join(models_dir, latest_model_file)

        # Create a zip file with the model
        zip_filename = os.path.join(models_dir, f"{latest_model_file}.zip")

        with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
            if os.path.isdir(model_path):
                # ZipFile.write on a directory stores only the directory
                # entry, so walk the tree and add every file explicitly.
                for root, _dirs, files in os.walk(model_path):
                    zipf.write(root, os.path.relpath(root, models_dir))
                    for fname in files:
                        full_path = os.path.join(root, fname)
                        zipf.write(full_path, os.path.relpath(full_path, models_dir))
            else:
                zipf.write(model_path, os.path.basename(model_path))

        # Pass an absolute path so the lookup does not depend on how a
        # relative path would be resolved.
        return send_file(os.path.abspath(zip_filename), as_attachment=True)

    except Exception as e:
        flash(f"Error while downloading the model: {str(e)}", 'error')
        return redirect(back)
|
348 |
+
|
349 |
+
if __name__ == '__main__':
    # The interactive debugger lets anyone who can reach the server execute
    # code, so it is opt-in: set FLASK_DEBUG=1 (or "true") to enable it.
    app.run(debug=os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true'))
|
readme
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
|
2 |
+
\\----------- **Resume Parser** ----------\\
|
3 |
+
\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
|
4 |
+
|
5 |
+
# Overview:
|
6 |
+
This project is a comprehensive Resume Parsing tool built using Python,
|
7 |
+
integrating the Mistral-Nemo-Instruct-2407 model for primary parsing.
|
8 |
+
If Mistral fails or encounters issues,
|
9 |
+
the system falls back to a custom-trained spaCy model to ensure continued functionality.
|
10 |
+
The tool is wrapped with a Flask API and has a user interface built using HTML and CSS.
|
11 |
+
|
12 |
+
|
13 |
+
# Installation Guide:
|
14 |
+
|
15 |
+
1. Create and Activate a Virtual Environment
|
16 |
+
python -m venv venv
|
17 |
+
source venv/bin/activate # For Linux/Mac
|
18 |
+
# or
|
19 |
+
venv\Scripts\activate # For Windows
|
20 |
+
|
21 |
+
# NOTE: If the virtual environment (venv) is already created, you can skip the creation step and just activate.
|
22 |
+
- For Linux/Mac:
|
23 |
+
source venv/bin/activate
|
24 |
+
- For Windows:
|
25 |
+
venv\Scripts\activate
|
26 |
+
|
27 |
+
2. Install Required Libraries
|
28 |
+
pip install -r requirements.txt
|
29 |
+
|
30 |
+
# Ensure the following dependencies are included:
|
31 |
+
- Flask
|
32 |
+
- spaCy
|
33 |
+
- huggingface_hub
|
34 |
+
- PyMuPDF
|
35 |
+
- python-docx
|
36 |
+
- Tesseract-OCR (for image-based parsing)
|
37 |
+
|
38 |
+
# NOTE: If any model or library is not installed, you can install it using:
|
39 |
+
pip install <model_name>
|
40 |
+
_Replace <model_name> with the specific model or library you need to install_
|
41 |
+
|
42 |
+
3. Set up Hugging Face Token
|
43 |
+
- Add your Hugging Face token to the .env file as:
|
44 |
+
HF_TOKEN=<your_huggingface_token>
|
45 |
+
|
46 |
+
|
47 |
+
# File Structure Overview:
|
48 |
+
Mistral_With_Spacy/
│
├── Spacy_Models/
│   └── ner_model_05_3      # Pretrained spaCy model directory for resume parsing
│
├── templates/
│   ├── index.html          # UI for file upload
│   └── result.html         # Display parsed results in structured JSON
│
├── uploads/                # Directory for uploaded resume files
│
├── utils/
│   ├── mistral.py          # Code for calling Mistral API and handling responses
│   ├── spacy.py            # spaCy fallback model for parsing resumes
│   ├── error.py            # Error handling utilities
│   └── fileTotext.py       # Functions to extract text from different file formats (PDF, DOCX, etc.)
│
├── venv/                   # Virtual environment
│
├── .env                    # Environment variables file (contains Hugging Face token)
│
├── main.py                 # Flask app handling API routes for uploading and processing resumes
│
└── requirements.txt        # Dependencies required for the project
|
72 |
+
|
73 |
+
|
74 |
+
# Program Overview:
|
75 |
+
|
76 |
+
# Mistral Integration (utils/mistral.py)
|
77 |
+
- Mistral API Calls: Uses Hugging Face's Mistral-Nemo-Instruct-2407 model to parse resumes.
|
78 |
+
- Personal and Professional Extraction: Two functions extract personal and professional information in structured JSON format.
|
79 |
+
- Fallback Mechanism: If Mistral fails, spaCy's NER model is used as a fallback.
|
80 |
+
|
81 |
+
# SpaCy Integration (utils/spacy.py)
|
82 |
+
- Custom Trained Model: Uses a spaCy model (ner_model_05_3) trained specifically for resume parsing.
|
83 |
+
- Named Entity Recognition: Extracts key information like Name, Email, Contact, Location, Skills, Experience, etc., from resumes.
|
84 |
+
- Validation: Includes validation for extracted emails and contacts.
|
85 |
+
|
86 |
+
# File Conversion (utils/fileTotext.py)
|
87 |
+
- Text Extraction: Handles different resume formats (PDF, DOCX, ODT, RTF, and images like PNG, JPG, JPEG) and extracts text for further processing.
|
88 |
+
- PDF Files: Uses PyMuPDF to extract text and, if necessary, Tesseract-OCR for image-based PDF content.
|
89 |
+
- DOCX Files: Uses `python-docx` to extract structured text from Word documents.
|
90 |
+
- ODT Files: Uses `odfpy` to extract text from ODT (OpenDocument) files.
|
91 |
+
- RTF Files: Reads plain text from RTF files.
|
92 |
+
- Images (PNG, JPG, JPEG): Uses Tesseract-OCR to extract text from image-based resumes.
|
93 |
+
Note: For Tesseract-OCR, install it locally by following the [installation guide](https://github.com/UB-Mannheim/tesseract/wiki).
|
94 |
+
- Hyperlink Extraction: Extracts hyperlinks from PDF files, capturing any embedded URLs during the parsing process.
|
95 |
+
|
96 |
+
|
97 |
+
# Error Handling (utils/error.py)
|
98 |
+
- Manages API response errors, file format issues, and ensures smooth fallbacks without crashing the app.
|
99 |
+
|
100 |
+
# Flask API (main.py)
|
101 |
+
Endpoints:
|
102 |
+
- /upload for uploading resumes.
|
103 |
+
- Displays parsed results in JSON format on the results page.
|
104 |
+
- UI: Simple interface for uploading resumes and viewing the parsing results.
|
105 |
+
|
106 |
+
|
107 |
+
# Tree map of your program:
|
108 |
+
|
109 |
+
main.py
├── Handles API side
├── File upload/remove
├── Process resumes
└── Show result
utils
├── fileTotext.py
│   └── Converts files to text
│       ├── PDF
│       ├── DOCX
│       ├── RTF
│       ├── ODT
│       ├── PNG
│       ├── JPG
│       └── JPEG
├── mistral.py
│   ├── Mistral API Calls
│   │   └── Uses Mistral-Nemo-Instruct-2407 model
│   ├── Personal and Professional Extraction
│   │   ├── Extracts personal information
│   │   └── Extracts professional information
│   └── Fallback Mechanism
│       └── Uses spaCy NER model if Mistral fails
└── spacy.py
    ├── Custom Trained Model
    │   └── Uses spaCy model (ner_model_05_3)
    ├── Named Entity Recognition
    │   └── Extracts key information (Name, Email, Contact, etc.)
    └── Validation
        └── Validates emails and contacts
|
139 |
+
|
140 |
+
|
141 |
+
# References:
|
142 |
+
|
143 |
+
- [Flask Documentation](https://flask.palletsprojects.com/)
|
144 |
+
- [spaCy Documentation](https://spacy.io/usage)
|
145 |
+
- [Mistral Documentation](https://docs.mistral.ai/)
|
146 |
+
- [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/index)
|
147 |
+
- [PyMuPDF (MuPDF) Documentation](https://pymupdf.readthedocs.io/en/latest/)
|
148 |
+
- [python-docx Documentation](https://python-docx.readthedocs.io/en/latest/)
|
149 |
+
- [Tesseract OCR Documentation](https://github.com/UB-Mannheim/tesseract/wiki)
|
150 |
+
- [Virtual Environments in Python](https://docs.python.org/3/tutorial/venv.html)
|
requirements.txt
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
amqp==5.2.0
|
2 |
+
annotated-types==0.7.0
|
3 |
+
billiard==4.2.0
|
4 |
+
blinker==1.8.2
|
5 |
+
blis==0.7.11
|
6 |
+
catalogue==2.0.10
|
7 |
+
celery==5.4.0
|
8 |
+
certifi==2024.8.30
|
9 |
+
charset-normalizer==3.3.2
|
10 |
+
click==8.1.7
|
11 |
+
click-didyoumean==0.3.1
|
12 |
+
click-plugins==1.1.1
|
13 |
+
click-repl==0.3.0
|
14 |
+
cloudpathlib==0.19.0
|
15 |
+
colorama==0.4.6
|
16 |
+
confection==0.1.5
|
17 |
+
cymem==2.0.8
|
18 |
+
defusedxml==0.7.1
|
19 |
+
Flask==3.0.3
|
20 |
+
Flask-SQLAlchemy==3.1.1
|
21 |
+
greenlet==3.1.0
|
22 |
+
idna==3.10
|
23 |
+
itsdangerous==2.2.0
|
24 |
+
Jinja2==3.1.4
|
25 |
+
kombu==5.4.1
|
26 |
+
langcodes==3.4.0
|
27 |
+
language_data==1.2.0
|
28 |
+
lxml==5.3.0
|
29 |
+
marisa-trie==1.2.0
|
30 |
+
markdown-it-py==3.0.0
|
31 |
+
MarkupSafe==2.1.5
|
32 |
+
mdurl==0.1.2
|
33 |
+
murmurhash==1.0.10
|
34 |
+
numpy==1.26.4
|
35 |
+
odfpy==1.4.1
|
36 |
+
packaging==24.1
|
37 |
+
pdf2image==1.17.0
|
38 |
+
pillow==10.4.0
|
39 |
+
preshed==3.0.9
|
40 |
+
prompt_toolkit==3.0.47
|
41 |
+
pydantic==2.9.1
|
42 |
+
pydantic_core==2.23.3
|
43 |
+
Pygments==2.18.0
|
44 |
+
PyMuPDF==1.24.10
|
45 |
+
PyMuPDFb==1.24.10
|
46 |
+
pytesseract==0.3.13
|
47 |
+
python-dateutil==2.9.0.post0
|
48 |
+
python-docx==1.1.2
|
49 |
+
requests==2.32.3
|
50 |
+
rich==13.8.1
|
51 |
+
setuptools==75.0.0
|
52 |
+
shellingham==1.5.4
|
53 |
+
six==1.16.0
|
54 |
+
smart-open==7.0.4
|
55 |
+
spacy==3.7.6
|
56 |
+
spacy-legacy==3.0.12
|
57 |
+
spacy-loggers==1.0.5
|
58 |
+
SQLAlchemy==2.0.34
|
59 |
+
srsly==2.4.8
|
60 |
+
thinc==8.2.5
|
61 |
+
tqdm==4.66.5
|
62 |
+
typer==0.12.5
|
63 |
+
typing_extensions==4.12.2
|
64 |
+
tzdata==2024.1
|
65 |
+
urllib3==2.2.3
|
66 |
+
vine==5.1.0
|
67 |
+
wasabi==1.1.3
|
68 |
+
wcwidth==0.2.13
|
69 |
+
weasel==0.4.1
|
70 |
+
Werkzeug==3.0.4
|
71 |
+
wrapt==1.16.0
|