WebashalarForML committed (verified)
Commit 2f2758d · 1 Parent(s): 882d345

Upload 4 files

utils/anoter_to_json.py ADDED
@@ -0,0 +1,87 @@
# import json

# def convert_to_spacy_format(json_data):
#     spacy_data = []

#     # Iterate over the annotations in the input JSON
#     for annotation in json_data['annotations']:
#         text = annotation[0]  # The text is the first element in each annotation
#         entities = annotation[1]['entities']  # The entities are in the second element under 'entities'

#         spacy_entities = []
#         for entity in entities:
#             start, end, label = entity
#             spacy_entities.append((start, end, label))

#         # Append the converted data in the desired format (like B.json)
#         spacy_data.append([text, {'entities': spacy_entities}])

#     return spacy_data

# # Load your JSON data from 'A.json'
# json_file_path = './JSON/Row_Json_Data.json'

# with open(json_file_path, 'r', encoding='utf-8') as file:
#     json_data = json.load(file)

# # Convert the JSON data to the desired format
# spacy_formatted_data = convert_to_spacy_format(json_data)

# # Save the converted data to 'B.json'
# output_file_path = './data/Json_Data.json'

# with open(output_file_path, 'w', encoding='utf-8') as outfile:
#     json.dump(spacy_formatted_data, outfile, ensure_ascii=False, indent=4)

# print(f'Successfully converted and saved the data to {output_file_path}')

import json
import os

def convert_to_spacy_format(json_data):
    spacy_data = []

    # Iterate over the annotations in the input JSON
    for annotation in json_data['annotations']:
        text = annotation[0]  # The text is the first element in each annotation
        entities = annotation[1]['entities']  # The entities are in the second element under 'entities'

        spacy_entities = []
        for entity in entities:
            start, end, label = entity
            spacy_entities.append((start, end, label))

        # Append the converted data in the desired format (like B.json)
        spacy_data.append([text, {'entities': spacy_entities}])

    return spacy_data

def process_uploaded_json(file_path):
    # Load the JSON data from the uploaded file
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # Convert the JSON data to the desired format
    spacy_formatted_data = convert_to_spacy_format(json_data)

    # Define the path to the output file
    output_file_path = './data/Json_Data.json'

    # Make sure the output directory exists before reading or writing
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    # Check if the file already exists
    if os.path.exists(output_file_path):
        # If the file exists, load its content
        with open(output_file_path, 'r', encoding='utf-8') as outfile:
            existing_data = json.load(outfile)
    else:
        # If the file doesn't exist, start with an empty list
        existing_data = []

    # Append the new data to the existing data
    existing_data.extend(spacy_formatted_data)

    # Write the updated data back to the file
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(existing_data, outfile, ensure_ascii=False, indent=4)

    print(f'Successfully appended the new data to {output_file_path}')
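For context, a minimal usage sketch of process_uploaded_json (the input path is hypothetical; the output file ./data/Json_Data.json is the one hard-coded above):

# Hypothetical example: convert one uploaded annotation file and append
# the result to ./data/Json_Data.json.
from utils.anoter_to_json import process_uploaded_json

process_uploaded_json('uploads/annotations.json')  # assumed upload location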
utils/file_to_text.py ADDED
@@ -0,0 +1,132 @@
import os
import re
import fitz  # PyMuPDF
import logging
from PIL import Image
from pdf2image import convert_from_path
import platform
import pytesseract
import docx
from odf.opendocument import load as load_odt
from odf.text import P

# Path to the Tesseract executable (on Windows it must point to tesseract.exe)
if platform.system() == "Windows":
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
else:
    # For Hugging Face Spaces or other Linux environments
    pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Set up logging
# logging.basicConfig(
#     level=logging.DEBUG,
#     format='%(asctime)s - %(levelname)s - %(message)s',
#     handlers=[logging.StreamHandler()]
# )

# # Path to Tesseract executable
# tesseract_path = os.getenv('TESSERACT_CMD', '/usr/bin/tesseract')
# pytesseract.pytesseract.tesseract_cmd = tesseract_path

# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(file_path):
    text = ""
    hyperlinks = []
    try:
        doc = fitz.open(file_path)
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            page_text = page.get_text("text")

            if not page_text.strip():
                # No extractable text on this page: fall back to OCR.
                # first_page/last_page limit the render to the current page
                # so the whole document is not OCR'd repeatedly.
                images = convert_from_path(file_path, dpi=300,
                                           first_page=page_num + 1, last_page=page_num + 1)
                for image in images:
                    text += pytesseract.image_to_string(image)
            else:
                text += page_text

            # Collect hyperlinks found on the page
            links = page.get_links()
            for link in links:
                if link.get("uri"):
                    hyperlinks.append(link["uri"])
    except Exception as e:
        logging.error(f"Error extracting text or hyperlinks from PDF: {e}")
        return "", []

    return text, list(set(hyperlinks))

# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    try:
        doc = docx.Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs])
        return text
    except Exception as e:
        logging.error(f"Error extracting text from DOCX: {e}")
        return ""

# Function to extract text from RSF (assuming a text-based format)
def extract_text_from_rsf(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except Exception as e:
        logging.error(f"Error extracting text from RSF: {e}")
        return ""

# Function to extract text from ODT
def extract_text_from_odt(file_path):
    try:
        odt_doc = load_odt(file_path)
        text_elements = odt_doc.getElementsByType(P)
        text = "\n".join([te.firstChild.data for te in text_elements if te.firstChild])
        return text
    except Exception as e:
        logging.error(f"Error extracting text from ODT: {e}")
        return ""

# Function to extract text from images using Tesseract
def extract_text_from_image(file_path):
    try:
        img = Image.open(file_path)
        text = pytesseract.image_to_string(img)
        return text
    except Exception as e:
        logging.error(f"Error extracting text from image: {e}")
        return ""

# Function to clean and preprocess the extracted text
def preprocess_text(text):
    # Collapse all whitespace (including newlines) into single spaces
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n', ' ', text)
    # Pad phone-number-like patterns with spaces so they remain separate tokens
    text = re.sub(r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)', r' \1 ', text)
    return text.strip()

# Function to automatically detect the file format and extract text
def extract_text_based_on_format(file_path):
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.pdf':
        text, hyperlinks = extract_text_from_pdf(file_path)
    elif file_ext == '.docx':
        text = extract_text_from_docx(file_path)
        hyperlinks = []
    elif file_ext == '.rsf':
        text = extract_text_from_rsf(file_path)
        hyperlinks = []
    elif file_ext == '.odt':
        text = extract_text_from_odt(file_path)
        hyperlinks = []
    elif file_ext in ['.png', '.jpg', '.jpeg']:
        text = extract_text_from_image(file_path)
        hyperlinks = []
    else:
        raise ValueError("Unsupported file format")

    return text, hyperlinks


def clean_text_to_single_line(text):
    # Replace newline characters with a space and remove extra spaces
    cleaned_text = ' '.join(text.split())
    return cleaned_text
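For context, a minimal usage sketch of the format dispatcher above (the resume path is hypothetical):

# Hypothetical example: extract and normalise text from an uploaded resume file.
from utils.file_to_text import extract_text_based_on_format, preprocess_text

raw_text, hyperlinks = extract_text_based_on_format('uploads/resume.pdf')  # assumed path
clean_text = preprocess_text(raw_text)
print(clean_text[:200], hyperlinks)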
utils/json_to_spacy.py ADDED
@@ -0,0 +1,67 @@
import json
import spacy
from spacy.tokens import DocBin

def read_in_chunks(file_path, chunk_size=1024):
    with open(file_path, 'r', encoding='utf-8') as file:
        while True:
            data = file.read(chunk_size)
            if not data:
                break
            yield data

def convert_json_to_spacy(json_file_path, spacy_file_path):
    # Read the file in chunks and combine the chunks
    file_content = ""
    for chunk in read_in_chunks(json_file_path):
        file_content += chunk

    # Parse the JSON data
    data = json.loads(file_content)

    # Prepare the data for spaCy
    spacy_format = []

    for item in data:
        text = item[0]  # The first element in the list is the text
        entities = item[1]['entities']  # The second element contains the dictionary with 'entities'
        spacy_entities = [(start, end, label) for start, end, label in entities]
        spacy_format.append({"text": text, "entities": spacy_entities})

    # Create a blank English model
    nlp = spacy.blank("en")

    # Initialize a DocBin object
    doc_bin = DocBin()

    # Convert the data to spaCy Doc objects and add them to the DocBin
    for entry in spacy_format:
        doc = nlp.make_doc(entry["text"])
        # Convert entities
        entities = []
        seen_positions = set()  # Track accepted spans to avoid overlaps
        for start, end, label in entry["entities"]:
            # Ensure the span is within the document's length
            if start < 0 or end > len(doc.text) or start >= end:
                print(f"Invalid span: start={start}, end={end}, label={label}")
                continue

            # Check for overlaps; earlier entities take priority
            if not any(start < e_end and end > e_start for e_start, e_end, _ in seen_positions):
                span = doc.char_span(start, end, label=label)
                if span is not None:
                    entities.append(span)
                    seen_positions.add((start, end, label))
            else:
                print(f"Overlapping span: start={start}, end={end}, label={label}")

        # Set entities
        doc.ents = entities

        # Add to DocBin
        doc_bin.add(doc)

    # Save the DocBin to a .spacy file
    doc_bin.to_disk(spacy_file_path)

    print(f"Data has been successfully saved to {spacy_file_path}!")
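For context, a minimal usage sketch chaining this converter with the annotation output above (both paths are the defaults used elsewhere in this commit):

# Convert the accumulated annotations into spaCy's binary training format.
from utils.json_to_spacy import convert_json_to_spacy

convert_json_to_spacy('./data/Json_Data.json', './data/Spacy_data.spacy')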
utils/model.py ADDED
@@ -0,0 +1,89 @@
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.tokens import DocBin
import random

# Load the training data from the .spacy file
def load_data_from_spacy_file(file_path):
    # Initialize a blank English model to ensure compatibility
    nlp = spacy.blank("en")

    # Load the DocBin object and get the documents
    try:
        doc_bin = DocBin().from_disk(file_path)
        docs = list(doc_bin.get_docs(nlp.vocab))
        return docs
    except Exception as e:
        print(f"Error loading data from .spacy file: {e}")
        return []


# Train model function
def train_model(epochs, model_path):
    # Initialize a blank English model
    nlp = spacy.blank("en")

    # Create an NER component and add it to the pipeline
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")
    else:
        # Reuse the existing component so `ner` is always defined
        ner = nlp.get_pipe("ner")

    nlp.add_pipe("sentencizer")

    # Define all possible entity labels
    labels = [
        "PERSON", "CONTACT", "EMAIL", "ABOUT", "EXPERIENCE", "YEARS_EXPERIENCE",
        "UNIVERSITY", "SOFT_SKILL", "INSTITUTE", "LAST_QUALIFICATION_YEAR", "JOB_TITLE",
        "COMPANY", "COURSE", "DOB", "HOBBIES", "LINK", "SCHOOL", "QUALIFICATION",
        "LANGUAGE", "LOCATION", "PROJECTS", "SKILL", "CERTIFICATE"
    ]

    # Add the labels to the NER component
    for label in labels:
        ner.add_label(label)

    # Load the training data
    train_data = load_data_from_spacy_file("./data/Spacy_data.spacy")

    # Start the training
    optimizer = nlp.begin_training()

    epoch_losses = []
    best_loss = float('inf')

    # Training loop
    for epoch in range(epochs):
        losses = {}
        random.shuffle(train_data)  # Shuffle data for better training

        # Create minibatches
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

        for batch in batches:
            texts, annotations = zip(*[
                (doc.text, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]})
                for doc in batch
            ])

            # Convert to Example objects
            examples = [Example.from_dict(nlp.make_doc(text), annotation)
                        for text, annotation in zip(texts, annotations)]

            # Update the model
            nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

        current_loss = losses.get("ner", float('inf'))
        epoch_losses.append(current_loss)

        print(f"Losses at epoch {epoch + 1}: {losses}")

        # Stop training if the loss is zero
        if current_loss == 0:
            break

        # Save the best model
        if current_loss < best_loss:
            best_loss = current_loss
            nlp.to_disk(model_path)

    # Save the final model
    nlp.to_disk(model_path)

    return epoch_losses
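For context, a minimal training sketch (the epoch count and model directory are illustrative values, not part of this commit):

# Hypothetical example: train the resume NER model and inspect per-epoch losses.
from utils.model import train_model

losses = train_model(epochs=20, model_path='./Models/ner_model')  # assumed values
print(losses)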