WebashalarForML committed on
Commit
057a19f
·
verified ·
1 Parent(s): dc3c0b4

Upload 4 files

Browse files
Files changed (4) hide show
  1. BACKUP.PY +335 -0
  2. main.py +350 -0
  3. readme +150 -0
  4. requirements.txt +71 -0
BACKUP.PY ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from flask import Flask, request, render_template, redirect, url_for, session, flash, send_from_directory, send_file
4
+ from werkzeug.utils import secure_filename
5
+ from utils.file_to_text import extract_text_based_on_format, preprocess_text
6
+ from utils.anoter_to_json import process_uploaded_json
7
+ from utils.json_to_spacy import convert_json_to_spacy
8
+ from utils.model import train_model
9
+ import zipfile
10
+
11
app = Flask(__name__)
# Key used to sign the session cookie. It is read from the SECRET_KEY
# environment variable so a real secret never has to be committed; the
# placeholder stays as a development-only fallback.
app.secret_key = os.environ.get('SECRET_KEY', 'your_secret_key')

# Folder paths
app.config['UPLOAD_FOLDER'] = 'uploads'  # uploaded resume files
app.config['JSON_FOLDER'] = 'JSON'       # uploaded annotation JSON files
app.config['DATA_FOLDER'] = 'data'       # extracted text and converted training data
app.config['MODELS_FOLDER'] = 'Models'   # trained models offered for download
19
+
20
# Extensions (lower-case, without the dot) accepted by the upload form
ALLOWED_EXTENSIONS = {'pdf', 'docx', 'rsf', 'odt', 'png', 'jpg', 'jpeg', 'json'}


def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    Only the text after the last dot is considered and the comparison is
    case-insensitive. Names without a dot are rejected.
    """
    if '.' not in filename:
        return False
    _, extension = filename.rsplit('.', 1)
    return extension.lower() in ALLOWED_EXTENSIONS
26
+
27
@app.route('/')
def index():
    """Serve the landing page, rendered from ``upload.html``."""
    landing_page = render_template('upload.html')
    return landing_page
30
+
31
# API for uploading resume files
@app.route('/upload', methods=['GET', 'POST'])
def upload_file():
    """Store an uploaded resume and show its extracted text for editing.

    Reads the multipart form field ``file``. A file whose extension passes
    ``allowed_file`` is saved under UPLOAD_FOLDER; anything that is not a
    ``.json`` file is then handed to ``process_other_files`` and that
    response is returned. Every other outcome flashes a message and
    redirects to the index page.
    """
    # Redirect target for every non-rendering outcome. Redirecting to
    # request.url (this same route) made the follow-up GET, which never
    # carries a file part, hit the same redirect again and loop.
    landing = url_for('index')
    try:
        if 'file' not in request.files:
            flash('No file part', 'error')
            return redirect(landing)

        file = request.files['file']
        if file.filename == '':
            flash('No selected file', 'error')
            return redirect(landing)

        # Check the sanitised name as well: secure_filename can strip a name
        # down to something without a usable extension, or to ''.
        filename = secure_filename(file.filename)
        if not (allowed_file(file.filename) and allowed_file(filename)):
            flash('File type not allowed', 'error')
            return redirect(landing)

        # The upload folder may not exist yet on a fresh checkout.
        os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(file_path)

        # Handle text extraction for non-JSON files
        if not filename.lower().endswith('.json'):
            return process_other_files(file_path, filename)

        # A .json upload is allowed and has been saved, so report success
        # instead of falling through to the "not allowed" error.
        flash(f'File {filename} uploaded successfully')
    except Exception as e:
        flash(f"Error: {str(e)}", 'error')

    return redirect(landing)
58
+
59
# Process non-JSON files, extract text and save to 'resume_text.txt'
def process_other_files(file_path, filename):
    """Extract text from a saved upload, persist it and render the editor.

    Args:
        file_path: Path of the file already saved on disk.
        filename: Name of the upload; stored in the session under
            ``uploaded_file`` and quoted in the error message.

    Returns:
        The rendered ``text.html`` page on success, otherwise a redirect
        issued after flashing the error.
    """
    try:
        # The second element of the returned pair is not used here.
        extracted_text, _ = extract_text_based_on_format(file_path)
        cleaned_text = preprocess_text(extracted_text)

        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')

        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

        session['uploaded_file'] = filename
        return render_template('text.html', text=cleaned_text)
    except Exception as e:
        flash(f"Error processing file {filename}: {str(e)}", 'error')
        # request.referrer is None when the client sends no Referer header;
        # fall back to the index page so the redirect always has a target.
        return redirect(request.referrer or url_for('index'))
76
+
77
# API to handle the text editing and saving
@app.route('/edit_text', methods=['POST'])
def edit_text():
    """Overwrite ``resume_text.txt`` with the submitted text and re-render the editor.

    Reads the form field ``edited_text``. On failure the error is flashed
    and the client is redirected back.
    """
    try:
        # Get the edited text from the form
        edited_text = request.form['edited_text']

        # Save the edited text back to 'resume_text.txt'; create the data
        # folder first so saving also works before any upload has created it.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(edited_text)

        flash('Text edited successfully', 'success')
        # Pass the edited text back to the template
        return render_template('text.html', text=edited_text)
    except Exception as e:
        flash(f"Error saving edited text: {str(e)}", 'error')
        # request.referrer is None without a Referer header; use the index page then.
        return redirect(request.referrer or url_for('index'))
95
+
96
# API for downloading the 'resume_text.txt' file
@app.route('/download', methods=['GET'])
def download_file():
    """Send ``resume_text.txt`` from DATA_FOLDER as an attachment.

    Any failure is flashed and answered with a redirect.
    """
    try:
        # NOTE(review): Flask resolves a relative directory against the
        # application root, while the writers of this file use open() relative
        # to the working directory - confirm both point to the same place.
        return send_from_directory(app.config['DATA_FOLDER'], 'resume_text.txt', as_attachment=True)
    except Exception as e:
        flash(f"Error downloading file: {str(e)}", 'error')
        # request.referrer is None without a Referer header; use the index page then.
        return redirect(request.referrer or url_for('index'))
104
+
105
@app.route('/save_and_download', methods=['POST'])
def save_and_download():
    """Save the submitted text to ``resume_text.txt`` and send it as a download.

    Reads the form field ``edited_text``. On failure the error is flashed
    and the client is redirected back.
    """
    try:
        # Get the edited text from the form
        edited_text = request.form['edited_text']

        # Save the edited text back to 'resume_text.txt'; create the data
        # folder first so saving also works before any upload has created it.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(edited_text)

        flash('Text edited successfully', 'success')

        # Now send the file as a download
        return send_from_directory(app.config['DATA_FOLDER'], 'resume_text.txt', as_attachment=True)

    except Exception as e:
        flash(f"Error saving and downloading file: {str(e)}", 'error')
        # request.referrer is None without a Referer header; use the index page then.
        return redirect(request.referrer or url_for('index'))
124
+
125
+
126
# API for uploading and processing JSON files
@app.route('/upload_json', methods=['POST'])
def upload_json_file():
    """Save an uploaded ``.json`` file into JSON_FOLDER.

    Reads the multipart form field ``file``. The sanitised file name is
    remembered in the session under ``uploaded_json``. Every outcome flashes
    a message and redirects back to the referring page.
    """
    # This route accepts POST only, so redirecting to request.url would send
    # the browser's follow-up GET to a 405. request.referrer is None without
    # a Referer header, hence the index fallback.
    back = request.referrer or url_for('index')
    try:
        if 'file' not in request.files:
            flash('No file part', 'error')
            return redirect(back)

        file = request.files['file']
        if file.filename == '':
            flash('No selected file', 'error')
            return redirect(back)

        if file and file.filename.lower().endswith('.json'):
            filename = secure_filename(file.filename)
            json_path = os.path.join(app.config['JSON_FOLDER'], filename)
            os.makedirs(app.config['JSON_FOLDER'], exist_ok=True)
            file.save(json_path)
            session['uploaded_json'] = filename
            flash(f'JSON file {filename} uploaded successfully')
        else:
            flash('File type not allowed', 'error')
    except Exception as e:
        flash(f"Error: {str(e)}", 'error')

    return redirect(back)
152
+
153
# Process uploaded JSON file and save formatted data
@app.route('/process_json', methods=['GET'])
def process_json_file():
    """Run ``process_uploaded_json`` on one file from JSON_FOLDER.

    The first ``.json`` file in name order is processed. Every outcome
    flashes a message and redirects back to the referring page.
    """
    # request.referrer is None without a Referer header; use the index page then.
    back = request.referrer or url_for('index')
    try:
        json_folder = app.config['JSON_FOLDER']
        # A folder that was never created simply holds no files.
        entries = os.listdir(json_folder) if os.path.isdir(json_folder) else []
        # Keep regular *.json files only, and sort them: os.listdir returns
        # entries in arbitrary order, so [0] would otherwise be unpredictable.
        json_files = sorted(
            name for name in entries
            if name.lower().endswith('.json')
            and os.path.isfile(os.path.join(json_folder, name))
        )

        if not json_files:
            flash('No JSON files found in the folder', 'error')
            return redirect(back)

        filename = json_files[0]  # Modify logic if needed to handle multiple files
        json_path = os.path.join(json_folder, filename)

        # Create the data folder before processing, in case the processing
        # step writes its output there (assumption - confirm in utils).
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        process_uploaded_json(json_path)

        flash(f'JSON file {filename} processed successfully')
    except Exception as e:
        flash(f"Error processing JSON file: {str(e)}", 'error')

    return redirect(back)
180
+
181
# API for removing uploaded JSON files
@app.route('/remove_json', methods=['POST'])
def remove_all_json_files():
    """Delete every regular file in JSON_FOLDER and forget the session entry.

    Sub-directories are left alone. The outcome is flashed and the client is
    redirected back to the referring page.
    """
    try:
        json_folder = app.config['JSON_FOLDER']
        # A folder that does not exist yet has nothing to remove; treating it
        # as empty avoids reporting an error for an already-clean state.
        names = os.listdir(json_folder) if os.path.isdir(json_folder) else []
        for filename in names:
            file_path = os.path.join(json_folder, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
        session.pop('uploaded_json', None)

        flash('All JSON files removed successfully')
    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    # request.referrer is None without a Referer header; use the index page then.
    return redirect(request.referrer or url_for('index'))
197
+
198
# API for removing non-JSON files
@app.route('/remove', methods=['POST'])
def remove_file():
    """Empty UPLOAD_FOLDER of regular files, then return to the index page.

    Flashes an error when the folder is missing or a removal fails, and a
    confirmation (after dropping ``uploaded_file`` from the session)
    otherwise.
    """
    try:
        target_dir = app.config['UPLOAD_FOLDER']

        if not os.path.exists(target_dir):
            flash("Upload folder does not exist", 'error')
        else:
            # Walk the folder entries and delete the plain files only
            candidates = (os.path.join(target_dir, entry) for entry in os.listdir(target_dir))
            for candidate in candidates:
                if os.path.isfile(candidate):
                    os.remove(candidate)

            # The session must no longer point at a deleted upload
            session.pop('uploaded_file', None)
            flash('All files removed successfully')

    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    return redirect(url_for('index'))
224
+
225
# HTML render routes (modify to fit your structure)
@app.route('/ner_preview', methods=['GET'])
def ner_preview():
    """Serve the page rendered from ``anoter.html``."""
    page = render_template('anoter.html')
    return page
229
+
230
@app.route('/json', methods=['GET'])
def json_file():
    """Serve the page rendered from ``savejson.html``."""
    page = render_template('savejson.html')
    return page
233
+
234
@app.route('/spacy', methods=['GET'])
def spacy_file():
    """Serve the page rendered from ``saveSpacy.html``."""
    page = render_template('saveSpacy.html')
    return page
237
+
238
+ # @app.route('/text', methods=['GET'])
239
+ # def spacy_file():
240
+ # return render_template('text.html')
241
+
242
@app.route('/to_sapcy', methods=['POST'])
def to_sapcy():
    """Convert ``Json_Data.json`` in DATA_FOLDER into ``Spacy_data.spacy``.

    The outcome is flashed and the client is redirected back to the
    referring page.
    """
    try:
        # Build both paths from the configured data folder instead of a
        # hard-coded 'data/' prefix, so changing DATA_FOLDER keeps this route
        # in step with the rest of the app.
        data_folder = app.config['DATA_FOLDER']
        # Path to the JSON file
        json_file_path = os.path.join(data_folder, 'Json_Data.json')
        # Destination .spacy file
        spacy_file_path = os.path.join(data_folder, 'Spacy_data.spacy')

        # Call the conversion function
        convert_json_to_spacy(json_file_path, spacy_file_path)

        flash('Model training data converted successfully', 'success')
    except Exception as e:
        flash(f"Error during conversion: {str(e)}", 'error')

    # request.referrer is None without a Referer header; use the index page then.
    return redirect(request.referrer or url_for('index'))
258
+
259
@app.route('/train_model_endpoint', methods=['POST'])
def train_model_endpoint():
    """Train a model with the submitted settings and return to the index page.

    Form fields:
        epochs: Positive integer, defaults to 10.
        model_version: Label appended to the model directory name,
            defaults to ``v1``.

    The model is written to ``<MODELS_FOLDER>/ner_model_<version>``. The
    outcome is flashed.
    """
    try:
        # Get the number of epochs and model version from the request
        epochs = int(request.form.get('epochs', 10))  # Default to 10 if not provided
        if epochs < 1:
            raise ValueError('epochs must be a positive integer')

        # The version comes straight from the client and ends up in a file
        # system path; secure_filename strips separators and '..' so it
        # cannot point outside the models folder.
        version = secure_filename(request.form.get('model_version', 'v1')) or 'v1'

        # Call the training function with user-defined parameters
        os.makedirs(app.config['MODELS_FOLDER'], exist_ok=True)
        model_path = os.path.join(app.config['MODELS_FOLDER'], f"ner_model_{version}")
        train_model(epochs, model_path)

        flash('Model training completed successfully', 'success')
    except Exception as e:
        flash(f"Error during training: {str(e)}", 'error')

    return redirect(url_for('index'))
275
+
276
# API for removing all files from specific folders
@app.route('/remove_files', methods=['POST'])
def remove_files():
    """Delete the regular files in the upload and JSON folders.

    Sub-directories are left alone. Both upload-related session keys are
    dropped, the outcome is flashed and the client returns to the index page.
    """
    try:
        # Define folders to clear
        folders_to_clear = [app.config['UPLOAD_FOLDER'], app.config['JSON_FOLDER']]

        for folder_path in folders_to_clear:
            # A folder that was never created has nothing to remove; skipping
            # it keeps one missing folder from aborting the remaining ones.
            if not os.path.isdir(folder_path):
                continue
            # Remove all files from the specified folder
            for filename in os.listdir(folder_path):
                file_path = os.path.join(folder_path, filename)
                if os.path.isfile(file_path):
                    os.remove(file_path)

        # Clear session variables related to the removed folders
        session.pop('uploaded_file', None)
        session.pop('uploaded_json', None)

        flash('All files removed from folder successfully')
    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    return redirect(url_for('index'))
299
+
300
# API for downloading the latest trained model
@app.route('/download_model', methods=['GET'])
def download_latest_model():
    """Zip the most recently modified entry of MODELS_FOLDER and send it.

    The entry may be a single file or a model directory; a directory is
    archived recursively. On failure the error is flashed and the client is
    redirected back.
    """
    import io  # local import: only this view needs an in-memory buffer

    # request.referrer is None without a Referer header; use the index page then.
    back = request.referrer or url_for('index')
    try:
        models_dir = app.config['MODELS_FOLDER']
        # Ignore *.zip entries: archives left in this folder by earlier
        # downloads must not be picked up as the "latest model".
        candidates = [
            name for name in (os.listdir(models_dir) if os.path.isdir(models_dir) else [])
            if not name.lower().endswith('.zip')
        ]

        if not candidates:
            flash('No model files found', 'error')
            return redirect(back)

        # "Latest" means most recently modified; sorting by name would rank
        # 'v9' above 'v10'.
        latest_model_file = max(
            candidates,
            key=lambda name: os.path.getmtime(os.path.join(models_dir, name)),
        )

        # Full path to the latest model
        model_path = os.path.join(models_dir, latest_model_file)

        # Build the archive in memory so nothing is written into the models
        # folder. NOTE(review): assumes models are small enough to buffer.
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
            if os.path.isdir(model_path):
                # ZipFile.write on a directory stores only the directory
                # entry, so add the contained files one by one.
                for root, _dirs, files in os.walk(model_path):
                    for file_name in files:
                        full_path = os.path.join(root, file_name)
                        zipf.write(full_path, os.path.relpath(full_path, models_dir))
            else:
                zipf.write(model_path, os.path.basename(model_path))
        buffer.seek(0)

        # Send the zip file as a download
        return send_file(
            buffer,
            mimetype='application/zip',
            as_attachment=True,
            download_name=f"{latest_model_file}.zip",
        )

    except Exception as e:
        flash(f"Error while downloading the model: {str(e)}", 'error')
        return redirect(back)
333
+
334
if __name__ == '__main__':
    # Debug mode enables the interactive Werkzeug debugger, which must not be
    # reachable by untrusted clients. It stays on by default for local
    # development and can be switched off with FLASK_DEBUG=0.
    app.run(debug=os.environ.get('FLASK_DEBUG', '1').lower() not in ('0', 'false', 'no'))
main.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from flask import Flask, request, render_template, redirect, url_for, session, flash, send_from_directory, send_file
4
+ from werkzeug.utils import secure_filename
5
+ from utils.file_to_text import extract_text_based_on_format, preprocess_text
6
+ from utils.anoter_to_json import process_uploaded_json
7
+ from utils.json_to_spacy import convert_json_to_spacy
8
+ from utils.model import train_model
9
+ import zipfile
10
+
11
app = Flask(__name__)
# Key used to sign the session cookie. It is read from the SECRET_KEY
# environment variable so a real secret never has to be committed; the
# placeholder stays as a development-only fallback.
app.secret_key = os.environ.get('SECRET_KEY', 'your_secret_key')

# Folder paths
app.config['UPLOAD_FOLDER'] = 'uploads'  # uploaded resume files
app.config['JSON_FOLDER'] = 'JSON'       # uploaded annotation JSON files
app.config['DATA_FOLDER'] = 'data'       # extracted text and converted training data
app.config['MODELS_FOLDER'] = 'Models'   # trained models offered for download
19
+
20
# Extensions (lower-case, without the dot) accepted by the upload form
ALLOWED_EXTENSIONS = {'pdf', 'docx', 'rsf', 'odt', 'png', 'jpg', 'jpeg', 'json'}


def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    Only the text after the last dot is considered and the comparison is
    case-insensitive. Names without a dot are rejected.
    """
    if '.' not in filename:
        return False
    _, extension = filename.rsplit('.', 1)
    return extension.lower() in ALLOWED_EXTENSIONS
26
+
27
# HTML render routes (modify to fit your structure)
@app.route('/')
def index():
    """Serve the landing page, rendered from ``upload.html``."""
    landing_page = render_template('upload.html')
    return landing_page
31
@app.route('/guide')
def guide():
    """Serve the page rendered from ``guide.html``."""
    page = render_template('guide.html')
    return page
34
+
35
@app.route('/ner_preview', methods=['GET'])
def ner_preview():
    """Serve the page rendered from ``anoter.html``."""
    page = render_template('anoter.html')
    return page
38
+
39
@app.route('/json', methods=['GET'])
def json_file():
    """Serve the page rendered from ``savejson.html``."""
    page = render_template('savejson.html')
    return page
42
+
43
@app.route('/spacy', methods=['GET'])
def spacy_file():
    """Serve the page rendered from ``saveSpacy.html``."""
    page = render_template('saveSpacy.html')
    return page
46
+
47
@app.route('/text_preview', methods=['GET'])
def text_preview():
    """Show the current contents of ``resume_text.txt`` in the text editor page.

    Redirects to the index page with a flashed error when the file is
    missing or cannot be read.
    """
    try:
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        if not os.path.exists(resume_file_path):
            flash('Resume text not found', 'error')
            return redirect(url_for('index'))

        # Every writer of this file uses UTF-8; reading with the platform
        # default encoding would garble or reject non-ASCII text on systems
        # whose default is not UTF-8.
        with open(resume_file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        return render_template('text.html', text=text)
    except Exception as e:
        flash(f"Error loading text preview: {str(e)}", 'error')
        return redirect(url_for('index'))
61
+
62
# API for uploading resume files
@app.route('/upload', methods=['GET', 'POST'])
def upload_file():
    """Store an uploaded resume and show its extracted text for editing.

    Reads the multipart form field ``file``. A file whose extension passes
    ``allowed_file`` is saved under UPLOAD_FOLDER; anything that is not a
    ``.json`` file is then handed to ``process_other_files`` and that
    response is returned. Every other outcome flashes a message and
    redirects to the index page.
    """
    # Redirect target for every non-rendering outcome. Redirecting to
    # request.url (this same route) made the follow-up GET, which never
    # carries a file part, hit the same redirect again and loop.
    landing = url_for('index')
    try:
        if 'file' not in request.files:
            flash('No file part', 'error')
            return redirect(landing)

        file = request.files['file']
        if file.filename == '':
            flash('No selected file', 'error')
            return redirect(landing)

        # Check the sanitised name as well: secure_filename can strip a name
        # down to something without a usable extension, or to ''.
        filename = secure_filename(file.filename)
        if not (allowed_file(file.filename) and allowed_file(filename)):
            flash('File type not allowed', 'error')
            return redirect(landing)

        # The upload folder may not exist yet on a fresh checkout.
        os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(file_path)

        # Handle text extraction for non-JSON files
        if not filename.lower().endswith('.json'):
            return process_other_files(file_path, filename)

        # A .json upload is allowed and has been saved, so report success
        # instead of falling through to the "not allowed" error.
        flash(f'File {filename} uploaded successfully')
    except Exception as e:
        flash(f"Error: {str(e)}", 'error')

    return redirect(landing)
89
+
90
# Process non-JSON files, extract text and save to 'resume_text.txt'
def process_other_files(file_path, filename):
    """Extract text from a saved upload, persist it and render the editor.

    Args:
        file_path: Path of the file already saved on disk.
        filename: Name of the upload; stored in the session under
            ``uploaded_file`` and quoted in the error message.

    Returns:
        The rendered ``text.html`` page on success, otherwise a redirect
        issued after flashing the error.
    """
    try:
        # The second element of the returned pair is not used here.
        extracted_text, _ = extract_text_based_on_format(file_path)
        cleaned_text = preprocess_text(extracted_text)

        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')

        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

        session['uploaded_file'] = filename
        return render_template('text.html', text=cleaned_text)
    except Exception as e:
        flash(f"Error processing file {filename}: {str(e)}", 'error')
        # request.referrer is None when the client sends no Referer header;
        # fall back to the index page so the redirect always has a target.
        return redirect(request.referrer or url_for('index'))
107
+
108
# API to handle the text editing and saving
@app.route('/edit_text', methods=['POST'])
def edit_text():
    """Overwrite ``resume_text.txt`` with the submitted text and re-render the editor.

    Reads the form field ``edited_text``. On failure the error is flashed
    and the client is redirected back.
    """
    try:
        # Get the edited text from the form
        edited_text = request.form['edited_text']

        # Save the edited text back to 'resume_text.txt'; create the data
        # folder first so saving also works before any upload has created it.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(edited_text)

        flash('Text edited successfully', 'success')
        # Pass the edited text back to the template
        return render_template('text.html', text=edited_text)
    except Exception as e:
        flash(f"Error saving edited text: {str(e)}", 'error')
        # request.referrer is None without a Referer header; use the index page then.
        return redirect(request.referrer or url_for('index'))
126
+
127
# API for downloading the 'resume_text.txt' file
@app.route('/download', methods=['GET'])
def download_file():
    """Send ``resume_text.txt`` from DATA_FOLDER as an attachment.

    Any failure is flashed and answered with a redirect.
    """
    try:
        # NOTE(review): Flask resolves a relative directory against the
        # application root, while the writers of this file use open() relative
        # to the working directory - confirm both point to the same place.
        return send_from_directory(app.config['DATA_FOLDER'], 'resume_text.txt', as_attachment=True)
    except Exception as e:
        flash(f"Error downloading file: {str(e)}", 'error')
        # request.referrer is None without a Referer header; use the index page then.
        return redirect(request.referrer or url_for('index'))
135
+
136
@app.route('/save_and_download', methods=['POST'])
def save_and_download():
    """Save the submitted text to ``resume_text.txt`` and send it as a download.

    Reads the form field ``edited_text``. On failure the error is flashed
    and the client is redirected back.
    """
    try:
        # Get the edited text from the form
        edited_text = request.form['edited_text']

        # Save the edited text back to 'resume_text.txt'; create the data
        # folder first so saving also works before any upload has created it.
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        resume_file_path = os.path.join(app.config['DATA_FOLDER'], 'resume_text.txt')
        with open(resume_file_path, 'w', encoding='utf-8') as f:
            f.write(edited_text)

        # No success flash on this path: the response is a file download, so a
        # flashed message would presumably only surface on a later page.

        # Now send the file as a download
        return send_from_directory(app.config['DATA_FOLDER'], 'resume_text.txt', as_attachment=True)

    except Exception as e:
        flash(f"Error saving and downloading file: {str(e)}", 'error')
        # request.referrer is None without a Referer header; use the index page then.
        return redirect(request.referrer or url_for('index'))
155
+
156
+
157
# API for uploading and processing JSON files
@app.route('/upload_json', methods=['POST'])
def upload_json_file():
    """Save an uploaded ``.json`` file into JSON_FOLDER.

    Reads the multipart form field ``file``. The sanitised file name is
    remembered in the session under ``uploaded_json``. Every outcome flashes
    a message and redirects back to the referring page.
    """
    # This route accepts POST only, so redirecting to request.url would send
    # the browser's follow-up GET to a 405. request.referrer is None without
    # a Referer header, hence the index fallback.
    back = request.referrer or url_for('index')
    try:
        if 'file' not in request.files:
            flash('No file part', 'error')
            return redirect(back)

        file = request.files['file']
        if file.filename == '':
            flash('No selected file', 'error')
            return redirect(back)

        if file and file.filename.lower().endswith('.json'):
            filename = secure_filename(file.filename)
            json_path = os.path.join(app.config['JSON_FOLDER'], filename)
            os.makedirs(app.config['JSON_FOLDER'], exist_ok=True)
            file.save(json_path)
            session['uploaded_json'] = filename
            flash(f'JSON file {filename} uploaded successfully')
        else:
            flash('File type not allowed', 'error')
    except Exception as e:
        flash(f"Error: {str(e)}", 'error')

    return redirect(back)
183
+
184
# Process uploaded JSON file and save formatted data
@app.route('/process_json', methods=['GET'])
def process_json_file():
    """Run ``process_uploaded_json`` on one file from JSON_FOLDER.

    The first ``.json`` file in name order is processed. Every outcome
    flashes a message and redirects back to the referring page.
    """
    # request.referrer is None without a Referer header; use the index page then.
    back = request.referrer or url_for('index')
    try:
        json_folder = app.config['JSON_FOLDER']
        # A folder that was never created simply holds no files.
        entries = os.listdir(json_folder) if os.path.isdir(json_folder) else []
        # Keep regular *.json files only, and sort them: os.listdir returns
        # entries in arbitrary order, so [0] would otherwise be unpredictable.
        json_files = sorted(
            name for name in entries
            if name.lower().endswith('.json')
            and os.path.isfile(os.path.join(json_folder, name))
        )

        if not json_files:
            flash('No JSON files found in the folder', 'error')
            return redirect(back)

        filename = json_files[0]  # Modify logic if needed to handle multiple files
        json_path = os.path.join(json_folder, filename)

        # Create the data folder before processing, in case the processing
        # step writes its output there (assumption - confirm in utils).
        os.makedirs(app.config['DATA_FOLDER'], exist_ok=True)
        process_uploaded_json(json_path)

        flash(f'JSON file {filename} processed successfully')
    except Exception as e:
        flash(f"Error processing JSON file: {str(e)}", 'error')

    return redirect(back)
211
+
212
# API for removing uploaded JSON files
@app.route('/remove_json', methods=['POST'])
def remove_all_json_files():
    """Delete every regular file in JSON_FOLDER and forget the session entry.

    Sub-directories are left alone. The outcome is flashed and the client is
    redirected back to the referring page.
    """
    try:
        json_folder = app.config['JSON_FOLDER']
        # A folder that does not exist yet has nothing to remove; treating it
        # as empty avoids reporting an error for an already-clean state.
        names = os.listdir(json_folder) if os.path.isdir(json_folder) else []
        for filename in names:
            file_path = os.path.join(json_folder, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
        session.pop('uploaded_json', None)

        flash('All JSON files removed successfully')
    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    # request.referrer is None without a Referer header; use the index page then.
    return redirect(request.referrer or url_for('index'))
228
+
229
# API for removing non-JSON files
@app.route('/remove', methods=['POST'])
def remove_file():
    """Empty UPLOAD_FOLDER of regular files, then return to the index page.

    Flashes an error when the folder is missing or a removal fails, and a
    confirmation (after dropping ``uploaded_file`` from the session)
    otherwise.
    """
    try:
        target_dir = app.config['UPLOAD_FOLDER']

        if not os.path.exists(target_dir):
            flash("Upload folder does not exist", 'error')
        else:
            # Walk the folder entries and delete the plain files only
            candidates = (os.path.join(target_dir, entry) for entry in os.listdir(target_dir))
            for candidate in candidates:
                if os.path.isfile(candidate):
                    os.remove(candidate)

            # The session must no longer point at a deleted upload
            session.pop('uploaded_file', None)
            flash('All files removed successfully')

    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    return redirect(url_for('index'))
255
+
256
+
257
@app.route('/to_sapcy', methods=['POST'])
def to_sapcy():
    """Convert ``Json_Data.json`` in DATA_FOLDER into ``Spacy_data.spacy``.

    The outcome is flashed and the client is redirected back to the
    referring page.
    """
    try:
        # Build both paths from the configured data folder instead of a
        # hard-coded 'data/' prefix, so changing DATA_FOLDER keeps this route
        # in step with the rest of the app.
        data_folder = app.config['DATA_FOLDER']
        # Path to the JSON file
        json_file_path = os.path.join(data_folder, 'Json_Data.json')
        # Destination .spacy file
        spacy_file_path = os.path.join(data_folder, 'Spacy_data.spacy')

        # Call the conversion function
        convert_json_to_spacy(json_file_path, spacy_file_path)

        flash('Model training data converted successfully', 'success')
    except Exception as e:
        flash(f"Error during conversion: {str(e)}", 'error')

    # request.referrer is None without a Referer header; use the index page then.
    return redirect(request.referrer or url_for('index'))
273
+
274
@app.route('/train_model_endpoint', methods=['POST'])
def train_model_endpoint():
    """Train a model with the submitted settings and return to the index page.

    Form fields:
        epochs: Positive integer, defaults to 10.
        model_version: Label appended to the model directory name,
            defaults to ``v1``.

    The model is written to ``<MODELS_FOLDER>/ner_model_<version>``. The
    outcome is flashed.
    """
    try:
        # Get the number of epochs and model version from the request
        epochs = int(request.form.get('epochs', 10))  # Default to 10 if not provided
        if epochs < 1:
            raise ValueError('epochs must be a positive integer')

        # The version comes straight from the client and ends up in a file
        # system path; secure_filename strips separators and '..' so it
        # cannot point outside the models folder.
        version = secure_filename(request.form.get('model_version', 'v1')) or 'v1'

        # Call the training function with user-defined parameters
        os.makedirs(app.config['MODELS_FOLDER'], exist_ok=True)
        model_path = os.path.join(app.config['MODELS_FOLDER'], f"ner_model_{version}")
        train_model(epochs, model_path)

        flash('Model training completed successfully', 'success')
    except Exception as e:
        flash(f"Error during training: {str(e)}", 'error')

    return redirect(url_for('index'))
290
+
291
# API for removing all files from specific folders
@app.route('/remove_files', methods=['POST'])
def remove_files():
    """Delete the regular files in the upload, JSON and models folders.

    Sub-directories are left alone. Both upload-related session keys are
    dropped, the outcome is flashed and the client returns to the index page.
    """
    try:
        # Define folders to clear
        folders_to_clear = [
            app.config['UPLOAD_FOLDER'],
            app.config['JSON_FOLDER'],
            app.config['MODELS_FOLDER'],
        ]

        for folder_path in folders_to_clear:
            # A folder that was never created has nothing to remove; skipping
            # it keeps one missing folder from aborting the remaining ones.
            if not os.path.isdir(folder_path):
                continue
            # Remove all files from the specified folder
            for filename in os.listdir(folder_path):
                file_path = os.path.join(folder_path, filename)
                if os.path.isfile(file_path):
                    os.remove(file_path)

        # Clear session variables related to the removed folders
        session.pop('uploaded_file', None)
        session.pop('uploaded_json', None)

        flash('All files removed from folder successfully')
    except Exception as e:
        flash(f"Error removing files: {str(e)}", 'error')

    return redirect(url_for('index'))
314
+
315
# API for downloading the latest trained model
@app.route('/download_model', methods=['GET'])
def download_latest_model():
    """Zip the most recently modified entry of MODELS_FOLDER and send it.

    The entry may be a single file or a model directory; a directory is
    archived recursively. On failure the error is flashed and the client is
    redirected back.
    """
    import io  # local import: only this view needs an in-memory buffer

    # request.referrer is None without a Referer header; use the index page then.
    back = request.referrer or url_for('index')
    try:
        models_dir = app.config['MODELS_FOLDER']
        # Ignore *.zip entries: archives left in this folder by earlier
        # downloads must not be picked up as the "latest model".
        candidates = [
            name for name in (os.listdir(models_dir) if os.path.isdir(models_dir) else [])
            if not name.lower().endswith('.zip')
        ]

        if not candidates:
            flash('No model files found', 'error')
            return redirect(back)

        # "Latest" means most recently modified; sorting by name would rank
        # 'v9' above 'v10'.
        latest_model_file = max(
            candidates,
            key=lambda name: os.path.getmtime(os.path.join(models_dir, name)),
        )

        # Full path to the latest model
        model_path = os.path.join(models_dir, latest_model_file)

        # Build the archive in memory so nothing is written into the models
        # folder. NOTE(review): assumes models are small enough to buffer.
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
            if os.path.isdir(model_path):
                # ZipFile.write on a directory stores only the directory
                # entry, so add the contained files one by one.
                for root, _dirs, files in os.walk(model_path):
                    for file_name in files:
                        full_path = os.path.join(root, file_name)
                        zipf.write(full_path, os.path.relpath(full_path, models_dir))
            else:
                zipf.write(model_path, os.path.basename(model_path))
        buffer.seek(0)

        # Send the zip file as a download
        return send_file(
            buffer,
            mimetype='application/zip',
            as_attachment=True,
            download_name=f"{latest_model_file}.zip",
        )

    except Exception as e:
        flash(f"Error while downloading the model: {str(e)}", 'error')
        return redirect(back)
348
+
349
if __name__ == '__main__':
    # Debug mode enables the interactive Werkzeug debugger, which must not be
    # reachable by untrusted clients. It stays on by default for local
    # development and can be switched off with FLASK_DEBUG=0.
    app.run(debug=os.environ.get('FLASK_DEBUG', '1').lower() not in ('0', 'false', 'no'))
readme ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
2
+ \\----------- **Resume Parser** ----------\\
3
+ \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
4
+
5
+ # Overview:
6
+ This project is a comprehensive Resume Parsing tool built using Python,
7
+ integrating the Mistral-Nemo-Instruct-2407 model for primary parsing.
8
+ If Mistral fails or encounters issues,
9
+ the system falls back to a custom-trained spaCy model to ensure continued functionality.
10
+ The tool is wrapped with a Flask API and has a user interface built using HTML and CSS.
11
+
12
+
13
+ # Installation Guide:
14
+
15
+ 1. Create and Activate a Virtual Environment
16
+ python -m venv venv
17
+ source venv/bin/activate # For Linux/Mac
18
+ # or
19
+ venv\Scripts\activate # For Windows
20
+
21
+ # NOTE: If the virtual environment (venv) is already created, you can skip the creation step and just activate.
22
+ - For Linux/Mac:
23
+ source venv/bin/activate
24
+ - For Windows:
25
+ venv\Scripts\activate
26
+
27
+ 2. Install Required Libraries
28
+ pip install -r requirements.txt
29
+
30
+ # Ensure the following dependencies are included:
31
+ - Flask
32
+ - spaCy
33
+ - huggingface_hub
34
+ - PyMuPDF
35
+ - python-docx
36
+ - Tesseract-OCR (for image-based parsing)
37
+
38
+ # NOTE: If any model or library is not installed, you can install it using:
39
+ pip install <model_name>
40
+ _Replace <model_name> with the specific model or library you need to install_
41
+
42
+ 3. Set up Hugging Face Token
43
+ - Add your Hugging Face token to the .env file as:
44
+ HF_TOKEN=<your_huggingface_token>
45
+
46
+
47
+ # File Structure Overview:
48
+ Mistral_With_Spacy/
49
+ │
50
+ ├── Spacy_Models/
51
+ │   └── ner_model_05_3 # Pretrained spaCy model directory for resume parsing
52
+ │
53
+ ├── templates/
54
+ │   ├── index.html # UI for file upload
55
+ │   └── result.html # Display parsed results in structured JSON
56
+ │
57
+ ├── uploads/ # Directory for uploaded resume files
58
+ │
59
+ ├── utils/
60
+ │   ├── mistral.py # Code for calling Mistral API and handling responses
61
+ │   ├── spacy.py # spaCy fallback model for parsing resumes
62
+ │   ├── error.py # Error handling utilities
63
+ │   └── fileTotext.py # Functions to extract text from different file formats (PDF, DOCX, etc.)
64
+ │
65
+ ├── venv/ # Virtual environment
66
+ │
67
+ ├── .env # Environment variables file (contains Hugging Face token)
68
+ │
69
+ ├── main.py # Flask app handling API routes for uploading and processing resumes
70
+ │
71
+ └── requirements.txt # Dependencies required for the project
72
+
73
+
74
+ # Program Overview:
75
+
76
+ # Mistral Integration (utils/mistral.py)
77
+ - Mistral API Calls: Uses Hugging Face's Mistral-Nemo-Instruct-2407 model to parse resumes.
78
+ - Personal and Professional Extraction: Two functions extract personal and professional information in structured JSON format.
79
+ - Fallback Mechanism: If Mistral fails, spaCy's NER model is used as a fallback.
80
+
81
+ # SpaCy Integration (utils/spacy.py)
82
+ - Custom Trained Model: Uses a spaCy model (ner_model_05_3) trained specifically for resume parsing.
83
+ - Named Entity Recognition: Extracts key information like Name, Email, Contact, Location, Skills, Experience, etc., from resumes.
84
+ - Validation: Includes validation for extracted emails and contacts.
85
+
86
+ # File Conversion (utils/fileTotext.py)
87
+ - Text Extraction: Handles different resume formats (PDF, DOCX, ODT, RSF, and images like PNG, JPG, JPEG) and extracts text for further processing.
88
+ - PDF Files: Uses PyMuPDF to extract text and, if necessary, Tesseract-OCR for image-based PDF content.
89
+ - DOCX Files: Uses `python-docx` to extract structured text from Word documents.
90
+ - ODT Files: Uses `odfpy` to extract text from ODT (OpenDocument) files.
91
+ - RSF Files: Reads plain text from RSF files.
92
+ - Images (PNG, JPG, JPEG): Uses Tesseract-OCR to extract text from image-based resumes.
93
+ Note: For Tesseract-OCR, install it locally by following the [installation guide](https://github.com/UB-Mannheim/tesseract/wiki).
94
+ - Hyperlink Extraction: Extracts hyperlinks from PDF files, capturing any embedded URLs during the parsing process.
95
+
96
+
97
+ # Error Handling (utils/error.py)
98
+ - Manages API response errors, file format issues, and ensures smooth fallbacks without crashing the app.
99
+
100
+ # Flask API (main.py)
101
+ Endpoints:
102
+ - /upload for uploading resumes.
103
+ - Displays parsed results in JSON format on the results page.
104
+ - UI: Simple interface for uploading resumes and viewing the parsing results.
105
+
106
+
107
+ # Tree map of your program:
108
+
109
+ main.py
110
+ ├── Handles API side
111
+ ├── File upload/remove
112
+ ├── Process resumes
113
+ └── Show result
114
+ utils
115
+ ├── fileTotext.py
116
+ │   └── Converts files to text
117
+ │       ├── PDF
118
+ │       ├── DOCX
119
+ │       ├── RTF
120
+ │       ├── ODT
121
+ │       ├── PNG
122
+ │       ├── JPG
123
+ │       └── JPEG
124
+ ├── mistral.py
125
+ │   ├── Mistral API Calls
126
+ │   │   └── Uses Mistral-Nemo-Instruct-2407 model
127
+ │   ├── Personal and Professional Extraction
128
+ │   │   ├── Extracts personal information
129
+ │   │   └── Extracts professional information
130
+ │   └── Fallback Mechanism
131
+ │       └── Uses spaCy NER model if Mistral fails
132
+ └── spacy.py
133
+     ├── Custom Trained Model
134
+     │   └── Uses spaCy model (ner_model_05_3)
135
+     ├── Named Entity Recognition
136
+     │   └── Extracts key information (Name, Email, Contact, etc.)
137
+     └── Validation
138
+         └── Validates emails and contacts
139
+
140
+
141
+ # References:
142
+
143
+ - [Flask Documentation](https://flask.palletsprojects.com/)
144
+ - [spaCy Documentation](https://spacy.io/usage)
145
+ - [Mistral Documentation](https://docs.mistral.ai/)
146
+ - [Hugging Face Hub API](https://huggingface.co/docs/huggingface_hub/index)
147
+ - [PyMuPDF (MuPDF) Documentation](https://pymupdf.readthedocs.io/en/latest/)
148
+ - [python-docx Documentation](https://python-docx.readthedocs.io/en/latest/)
149
+ - [Tesseract OCR Documentation](https://github.com/UB-Mannheim/tesseract/wiki)
150
+ - [Virtual Environments in Python](https://docs.python.org/3/tutorial/venv.html)
requirements.txt ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ amqp==5.2.0
2
+ annotated-types==0.7.0
3
+ billiard==4.2.0
4
+ blinker==1.8.2
5
+ blis==0.7.11
6
+ catalogue==2.0.10
7
+ celery==5.4.0
8
+ certifi==2024.8.30
9
+ charset-normalizer==3.3.2
10
+ click==8.1.7
11
+ click-didyoumean==0.3.1
12
+ click-plugins==1.1.1
13
+ click-repl==0.3.0
14
+ cloudpathlib==0.19.0
15
+ colorama==0.4.6
16
+ confection==0.1.5
17
+ cymem==2.0.8
18
+ defusedxml==0.7.1
19
+ Flask==3.0.3
20
+ Flask-SQLAlchemy==3.1.1
21
+ greenlet==3.1.0
22
+ idna==3.10
23
+ itsdangerous==2.2.0
24
+ Jinja2==3.1.4
25
+ kombu==5.4.1
26
+ langcodes==3.4.0
27
+ language_data==1.2.0
28
+ lxml==5.3.0
29
+ marisa-trie==1.2.0
30
+ markdown-it-py==3.0.0
31
+ MarkupSafe==2.1.5
32
+ mdurl==0.1.2
33
+ murmurhash==1.0.10
34
+ numpy==1.26.4
35
+ odfpy==1.4.1
36
+ packaging==24.1
37
+ pdf2image==1.17.0
38
+ pillow==10.4.0
39
+ preshed==3.0.9
40
+ prompt_toolkit==3.0.47
41
+ pydantic==2.9.1
42
+ pydantic_core==2.23.3
43
+ Pygments==2.18.0
44
+ PyMuPDF==1.24.10
45
+ PyMuPDFb==1.24.10
46
+ pytesseract==0.3.13
47
+ python-dateutil==2.9.0.post0
48
+ python-docx==1.1.2
49
+ requests==2.32.3
50
+ rich==13.8.1
51
+ setuptools==75.0.0
52
+ shellingham==1.5.4
53
+ six==1.16.0
54
+ smart-open==7.0.4
55
+ spacy==3.7.6
56
+ spacy-legacy==3.0.12
57
+ spacy-loggers==1.0.5
58
+ SQLAlchemy==2.0.34
59
+ srsly==2.4.8
60
+ thinc==8.2.5
61
+ tqdm==4.66.5
62
+ typer==0.12.5
63
+ typing_extensions==4.12.2
64
+ tzdata==2024.1
65
+ urllib3==2.2.3
66
+ vine==5.1.0
67
+ wasabi==1.1.3
68
+ wcwidth==0.2.13
69
+ weasel==0.4.1
70
+ Werkzeug==3.0.4
71
+ wrapt==1.16.0