Spaces: WebashalarForML
Upload 4 files
WebashalarForML committed on

- utils/anoter_to_json.py +87 -0
- utils/file_to_text.py +132 -0
- utils/json_to_spacy.py +67 -0
- utils/model.py +89 -0
utils/anoter_to_json.py
ADDED
@@ -0,0 +1,87 @@
# import json

# def convert_to_spacy_format(json_data):
#     spacy_data = []

#     # Iterate over the annotations in the input JSON
#     for annotation in json_data['annotations']:
#         text = annotation[0]  # The text is the first element in each annotation
#         entities = annotation[1]['entities']  # The entities are in the second element under 'entities'

#         spacy_entities = []
#         for entity in entities:
#             start, end, label = entity
#             spacy_entities.append((start, end, label))

#         # Append the converted data in the desired format (like B.json)
#         spacy_data.append([text, {'entities': spacy_entities}])

#     return spacy_data

# # Load your JSON data from 'A.json'
# json_file_path = './JSON/Row_Json_Data.json'

# with open(json_file_path, 'r', encoding='utf-8') as file:
#     json_data = json.load(file)

# # Convert the JSON data to the desired format
# spacy_formatted_data = convert_to_spacy_format(json_data)

# # Save the converted data to 'B.json'
# output_file_path = './data/Json_Data.json'

# with open(output_file_path, 'w', encoding='utf-8') as outfile:
#     json.dump(spacy_formatted_data, outfile, ensure_ascii=False, indent=4)

# print(f'Successfully converted and saved the data to {output_file_path}')

import json
import os

def convert_to_spacy_format(json_data):
    spacy_data = []

    # Iterate over the annotations in the input JSON
    for annotation in json_data['annotations']:
        text = annotation[0]  # The text is the first element in each annotation
        entities = annotation[1]['entities']  # The entities are in the second element under 'entities'

        spacy_entities = []
        for entity in entities:
            start, end, label = entity
            spacy_entities.append((start, end, label))

        # Append the converted data in the desired format (like B.json)
        spacy_data.append([text, {'entities': spacy_entities}])

    return spacy_data

def process_uploaded_json(file_path):
    # Load your JSON data from the uploaded file
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # Convert the JSON data to the desired format
    spacy_formatted_data = convert_to_spacy_format(json_data)

    # Define the path to the output file
    output_file_path = './data/Json_Data.json'

    # Check if the file already exists
    if os.path.exists(output_file_path):
        # If the file exists, load its content
        with open(output_file_path, 'r', encoding='utf-8') as outfile:
            existing_data = json.load(outfile)
    else:
        # If the file doesn't exist, start with an empty list
        existing_data = []

    # Append the new data to the existing data
    existing_data.extend(spacy_formatted_data)

    # Write the updated data back to the file
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(existing_data, outfile, ensure_ascii=False, indent=4)

    print(f'Successfully appended the new data to {output_file_path}')
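For context, a minimal usage sketch of this helper, assuming it is imported from utils.anoter_to_json and that an annotation-tool export with the shape {"annotations": [[text, {"entities": [[start, end, label], ...]}], ...]} has been uploaded; the upload path and call site below are illustrative, not part of this commit:

    # Hypothetical caller (e.g. an upload handler); the path is an assumption.
    from utils.anoter_to_json import process_uploaded_json

    uploaded_path = "uploads/annotations.json"   # assumed location of the uploaded export
    process_uploaded_json(uploaded_path)         # appends converted records to ./data/Json_Data.json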
utils/file_to_text.py
ADDED
@@ -0,0 +1,132 @@
import os
import re
import fitz
import logging
from PIL import Image
from pdf2image import convert_from_path
import platform
import pytesseract
import docx
from odf.opendocument import load as load_odt
from odf.text import P

# Path to tesseract executable (ensure it points to tesseract.exe)
if platform.system() == "Windows":
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
else:
    # For Hugging Face Spaces or other Linux environments
    pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Set up logging
# logging.basicConfig(
#     level=logging.DEBUG,
#     format='%(asctime)s - %(levelname)s - %(message)s',
#     handlers=[logging.StreamHandler()]
# )

# # Path to Tesseract executable
# tesseract_path = os.getenv('TESSERACT_CMD', '/usr/bin/tesseract')
# pytesseract.pytesseract.tesseract_cmd = tesseract_path

# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(file_path):
    text = ""
    hyperlinks = []
    try:
        doc = fitz.open(file_path)
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            page_text = page.get_text("text")

            # Fall back to OCR when the page has no extractable text
            if not page_text.strip():
                images = convert_from_path(file_path, dpi=300)
                for image in images:
                    text += pytesseract.image_to_string(image)
            else:
                text += page_text

            links = page.get_links()
            for link in links:
                if link.get("uri"):
                    hyperlinks.append(link["uri"])
    except Exception as e:
        logging.error(f"Error extracting text or hyperlinks from PDF: {e}")
        return "", []

    return text, list(set(hyperlinks))

# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    try:
        doc = docx.Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs])
        return text
    except Exception as e:
        logging.error(f"Error extracting text from DOCX: {e}")
        return ""

# Function to extract text from RSF (assuming text-based format)
def extract_text_from_rsf(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except Exception as e:
        logging.error(f"Error extracting text from RSF: {e}")
        return ""

# Function to extract text from ODT
def extract_text_from_odt(file_path):
    try:
        odt_doc = load_odt(file_path)
        text_elements = odt_doc.getElementsByType(P)
        text = "\n".join([te.firstChild.data for te in text_elements if te.firstChild])
        return text
    except Exception as e:
        logging.error(f"Error extracting text from ODT: {e}")
        return ""

# Function to extract text from images using Tesseract
def extract_text_from_image(file_path):
    try:
        img = Image.open(file_path)
        text = pytesseract.image_to_string(img)
        return text
    except Exception as e:
        logging.error(f"Error extracting text from image: {e}")
        return ""

# Function to clean and preprocess the extracted text
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)', r' \1 ', text)
    return text.strip()

# Function to automatically detect file format and extract text
def extract_text_based_on_format(file_path):
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.pdf':
        text, hyperlinks = extract_text_from_pdf(file_path)
    elif file_ext == '.docx':
        text = extract_text_from_docx(file_path)
        hyperlinks = []
    elif file_ext == '.rsf':
        text = extract_text_from_rsf(file_path)
        hyperlinks = []
    elif file_ext == '.odt':
        text = extract_text_from_odt(file_path)
        hyperlinks = []
    elif file_ext in ['.png', '.jpg', '.jpeg']:
        text = extract_text_from_image(file_path)
        hyperlinks = []
    else:
        raise ValueError("Unsupported file format")

    return text, hyperlinks


def clean_text_to_single_line(text):
    # Replace newline characters with a space and remove extra spaces
    cleaned_text = ' '.join(text.split())
    return cleaned_text
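A short sketch of how these extractors might be driven end to end; the sample file path is an assumption, and only extract_text_based_on_format and preprocess_text come from the module above:

    # Illustrative only: pull raw text and hyperlinks from a document,
    # then normalise whitespace before handing the text to the NER model.
    from utils.file_to_text import extract_text_based_on_format, preprocess_text

    resume_path = "samples/resume.pdf"          # assumed input file
    raw_text, hyperlinks = extract_text_based_on_format(resume_path)
    clean_text = preprocess_text(raw_text)

    print(clean_text[:200])                     # preview the cleaned text
    print("Links found:", hyperlinks)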
utils/json_to_spacy.py
ADDED
@@ -0,0 +1,67 @@
import json
import spacy
from spacy.tokens import DocBin

def read_in_chunks(file_path, chunk_size=1024):
    with open(file_path, 'r', encoding='utf-8') as file:
        while True:
            data = file.read(chunk_size)
            if not data:
                break
            yield data

def convert_json_to_spacy(json_file_path, spacy_file_path):
    # Read the file in chunks and combine the chunks
    file_content = ""
    for chunk in read_in_chunks(json_file_path):
        file_content += chunk

    # Parse the JSON data
    data = json.loads(file_content)

    # Prepare the data for spaCy
    spacy_format = []

    for item in data:
        text = item[0]  # The first element in the list is the text
        entities = item[1]['entities']  # The second element contains the dictionary with 'entities'
        spacy_entities = [(start, end, label) for start, end, label in entities]
        spacy_format.append({"text": text, "entities": spacy_entities})

    # Create a blank English model
    nlp = spacy.blank("en")

    # Initialize a DocBin object
    doc_bin = DocBin()

    # Convert the data to spaCy Doc objects and add to DocBin
    for entry in spacy_format:
        doc = nlp.make_doc(entry["text"])
        # Convert entities
        entities = []
        seen_positions = set()  # To track positions and avoid overlap
        for start, end, label in entry["entities"]:
            # Ensure span is within the document's length
            if start < 0 or end > len(doc.text) or start >= end:
                print(f"Invalid span: start={start}, end={end}, label={label}")
                continue

            # Check for overlaps and prioritize entities
            if not any(start < e_end and end > e_start for e_start, e_end, _ in seen_positions):
                span = doc.char_span(start, end, label=label)
                if span is not None:
                    entities.append(span)
                    seen_positions.add((start, end, label))
            else:
                print(f"Overlapping span: start={start}, end={end}, label={label}")

        # Set entities
        doc.ents = entities

        # Add to DocBin
        doc_bin.add(doc)

    # Save the DocBin to a .spacy file
    doc_bin.to_disk(spacy_file_path)

    print(f"Data has been successfully saved to {spacy_file_path}!")
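A minimal sketch of the conversion step, assuming the annotations appended by utils/anoter_to_json.py live at ./data/Json_Data.json and the training code expects ./data/Spacy_data.spacy (these paths match the defaults used elsewhere in this commit):

    # Illustrative call: turn the accumulated JSON annotations into a DocBin
    # that spaCy's training loop can read.
    from utils.json_to_spacy import convert_json_to_spacy

    convert_json_to_spacy("./data/Json_Data.json", "./data/Spacy_data.spacy")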
utils/model.py
ADDED
@@ -0,0 +1,89 @@
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.tokens import DocBin
import random

# Load the training data from the .spacy file
def load_data_from_spacy_file(file_path):
    # Initialize a blank English model to ensure compatibility
    nlp = spacy.blank("en")

    # Load the DocBin object and get documents
    try:
        doc_bin = DocBin().from_disk(file_path)
        docs = list(doc_bin.get_docs(nlp.vocab))
        return docs
    except Exception as e:
        print(f"Error loading data from .spacy file: {e}")
        return []


# Train model function
def train_model(epochs, model_path):
    # Initialize a blank English model
    nlp = spacy.blank("en")

    # Create an NER component and add it to the pipeline
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")

    nlp.add_pipe("sentencizer")

    # Define all possible entity labels
    labels = [
        "PERSON", "CONTACT", "EMAIL", "ABOUT", "EXPERIENCE", "YEARS_EXPERIENCE",
        "UNIVERSITY", "SOFT_SKILL", "INSTITUTE", "LAST_QUALIFICATION_YEAR", "JOB_TITLE",
        "COMPANY", "COURSE", "DOB", "HOBBIES", "LINK", "SCHOOL", "QUALIFICATION",
        "LANGUAGE", "LOCATION", "PROJECTS", "SKILL", "CERTIFICATE"
    ]

    # Add labels to the NER component
    for label in labels:
        ner.add_label(label)

    # Load the training data
    train_data = load_data_from_spacy_file("./data/Spacy_data.spacy")

    # Start the training
    optimizer = nlp.begin_training()

    epoch_losses = []
    best_loss = float('inf')

    # Training loop
    for epoch in range(epochs):
        losses = {}
        random.shuffle(train_data)  # Shuffle data for better training

        # Create minibatches
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

        for batch in batches:
            texts, annotations = zip(*[(doc.text, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]}) for doc in batch])

            # Convert to Example objects
            examples = [Example.from_dict(nlp.make_doc(text), annotation) for text, annotation in zip(texts, annotations)]

            # Update the model
            nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

        current_loss = losses.get("ner", float('inf'))
        epoch_losses.append(current_loss)

        print(f"Losses at epoch {epoch + 1}: {losses}")

        # Stop training if the loss is zero
        if current_loss == 0:
            break

        # Save the best model
        if current_loss < best_loss:
            best_loss = current_loss
            nlp.to_disk(model_path)

    # Save the final model
    nlp.to_disk(model_path)

    return epoch_losses
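A sketch of kicking off training and reloading the saved pipeline; the epoch count, output directory, and sample sentence below are illustrative, while train_model itself reads ./data/Spacy_data.spacy as hard-coded above:

    # Illustrative trainer invocation (not part of this commit).
    import spacy
    from utils.model import train_model

    losses = train_model(epochs=50, model_path="./Models/ner_model")  # assumed output dir
    print("Per-epoch NER losses:", losses)

    # Reload the trained pipeline for inference on new text
    nlp = spacy.load("./Models/ner_model")
    doc = nlp("John Doe - Python developer with 5 years of experience at Acme Corp.")
    print([(ent.text, ent.label_) for ent in doc.ents])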