Spaces: WebashalarForML
Upload 4 files
WebashalarForML committed on

- utils/anoter_to_json.py +87 -0
- utils/file_to_text.py +132 -0
- utils/json_to_spacy.py +67 -0
- utils/model.py +89 -0
utils/anoter_to_json.py
ADDED
@@ -0,0 +1,87 @@
# import json

# def convert_to_spacy_format(json_data):
#     spacy_data = []

#     # Iterate over the annotations in the input JSON
#     for annotation in json_data['annotations']:
#         text = annotation[0]  # The text is the first element in each annotation
#         entities = annotation[1]['entities']  # The entities are in the second element under 'entities'

#         spacy_entities = []
#         for entity in entities:
#             start, end, label = entity
#             spacy_entities.append((start, end, label))

#         # Append the converted data in the desired format (like B.json)
#         spacy_data.append([text, {'entities': spacy_entities}])

#     return spacy_data

# # Load your JSON data from 'A.json'
# json_file_path = './JSON/Row_Json_Data.json'

# with open(json_file_path, 'r', encoding='utf-8') as file:
#     json_data = json.load(file)

# # Convert the JSON data to the desired format
# spacy_formatted_data = convert_to_spacy_format(json_data)

# # Save the converted data to 'B.json'
# output_file_path = './data/Json_Data.json'

# with open(output_file_path, 'w', encoding='utf-8') as outfile:
#     json.dump(spacy_formatted_data, outfile, ensure_ascii=False, indent=4)

# print(f'Successfully converted and saved the data to {output_file_path}')

import json
import os

def convert_to_spacy_format(json_data):
    spacy_data = []

    # Iterate over the annotations in the input JSON
    for annotation in json_data['annotations']:
        text = annotation[0]  # The text is the first element in each annotation
        entities = annotation[1]['entities']  # The entities are in the second element under 'entities'

        spacy_entities = []
        for entity in entities:
            start, end, label = entity
            spacy_entities.append((start, end, label))

        # Append the converted data in the desired format (like B.json)
        spacy_data.append([text, {'entities': spacy_entities}])

    return spacy_data

def process_uploaded_json(file_path):
    # Load your JSON data from the uploaded file
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # Convert the JSON data to the desired format
    spacy_formatted_data = convert_to_spacy_format(json_data)

    # Define the path to the output file
    output_file_path = './data/Json_Data.json'

    # Check if the file already exists
    if os.path.exists(output_file_path):
        # If the file exists, load its content
        with open(output_file_path, 'r', encoding='utf-8') as outfile:
            existing_data = json.load(outfile)
    else:
        # If the file doesn't exist, start with an empty list
        existing_data = []

    # Append the new data to the existing data
    existing_data.extend(spacy_formatted_data)

    # Write the updated data back to the file
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(existing_data, outfile, ensure_ascii=False, indent=4)

    print(f'Successfully appended the new data to {output_file_path}')
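For context, a minimal usage sketch of this helper, assuming it is imported from utils.anoter_to_json and that an annotation-tool export with the shape {"annotations": [[text, {"entities": [[start, end, label], ...]}], ...]} has been uploaded; the upload path and call site below are illustrative, not part of this commit:

    # Hypothetical caller (e.g. an upload handler); the path is an assumption.
    from utils.anoter_to_json import process_uploaded_json

    uploaded_path = "uploads/annotations.json"   # assumed location of the uploaded export
    process_uploaded_json(uploaded_path)         # appends converted records to ./data/Json_Data.json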
utils/file_to_text.py
ADDED
@@ -0,0 +1,132 @@
import os
import re
import fitz
import logging
from PIL import Image
from pdf2image import convert_from_path
import platform
import pytesseract
import docx
from odf.opendocument import load as load_odt
from odf.text import P

# Path to tesseract executable (ensure it points to tesseract.exe)
if platform.system() == "Windows":
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
else:
    # For Hugging Face Spaces or other Linux environments
    pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Set up logging
# logging.basicConfig(
#     level=logging.DEBUG,
#     format='%(asctime)s - %(levelname)s - %(message)s',
#     handlers=[logging.StreamHandler()]
# )

# # Path to Tesseract executable
# tesseract_path = os.getenv('TESSERACT_CMD', '/usr/bin/tesseract')
# pytesseract.pytesseract.tesseract_cmd = tesseract_path

# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(file_path):
    text = ""
    hyperlinks = []
    try:
        doc = fitz.open(file_path)
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            page_text = page.get_text("text")

            # Fall back to OCR when the page has no extractable text
            if not page_text.strip():
                images = convert_from_path(file_path, dpi=300)
                for image in images:
                    text += pytesseract.image_to_string(image)
            else:
                text += page_text

            links = page.get_links()
            for link in links:
                if link.get("uri"):
                    hyperlinks.append(link["uri"])
    except Exception as e:
        logging.error(f"Error extracting text or hyperlinks from PDF: {e}")
        return "", []

    return text, list(set(hyperlinks))

# Function to extract text from DOCX
def extract_text_from_docx(file_path):
    try:
        doc = docx.Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs])
        return text
    except Exception as e:
        logging.error(f"Error extracting text from DOCX: {e}")
        return ""

# Function to extract text from RSF (assuming text-based format)
def extract_text_from_rsf(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except Exception as e:
        logging.error(f"Error extracting text from RSF: {e}")
        return ""

# Function to extract text from ODT
def extract_text_from_odt(file_path):
    try:
        odt_doc = load_odt(file_path)
        text_elements = odt_doc.getElementsByType(P)
        text = "\n".join([te.firstChild.data for te in text_elements if te.firstChild])
        return text
    except Exception as e:
        logging.error(f"Error extracting text from ODT: {e}")
        return ""

# Function to extract text from images using Tesseract
def extract_text_from_image(file_path):
    try:
        img = Image.open(file_path)
        text = pytesseract.image_to_string(img)
        return text
    except Exception as e:
        logging.error(f"Error extracting text from image: {e}")
        return ""

# Function to clean and preprocess the extracted text
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'(\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b)', r' \1 ', text)
    return text.strip()

# Function to automatically detect file format and extract text
def extract_text_based_on_format(file_path):
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.pdf':
        text, hyperlinks = extract_text_from_pdf(file_path)
    elif file_ext == '.docx':
        text = extract_text_from_docx(file_path)
        hyperlinks = []
    elif file_ext == '.rsf':
        text = extract_text_from_rsf(file_path)
        hyperlinks = []
    elif file_ext == '.odt':
        text = extract_text_from_odt(file_path)
        hyperlinks = []
    elif file_ext in ['.png', '.jpg', '.jpeg']:
        text = extract_text_from_image(file_path)
        hyperlinks = []
    else:
        raise ValueError("Unsupported file format")

    return text, hyperlinks


def clean_text_to_single_line(text):
    # Replace newline characters with a space and remove extra spaces
    cleaned_text = ' '.join(text.split())
    return cleaned_text
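A short sketch of how these extractors might be driven end to end; the sample file path is an assumption, and only extract_text_based_on_format and preprocess_text come from the module above:

    # Illustrative only: pull raw text and hyperlinks from a document,
    # then normalise whitespace before handing the text to the NER model.
    from utils.file_to_text import extract_text_based_on_format, preprocess_text

    resume_path = "samples/resume.pdf"          # assumed input file
    raw_text, hyperlinks = extract_text_based_on_format(resume_path)
    clean_text = preprocess_text(raw_text)

    print(clean_text[:200])                     # preview the cleaned text
    print("Links found:", hyperlinks)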
utils/json_to_spacy.py
ADDED
@@ -0,0 +1,67 @@
import json
import spacy
from spacy.tokens import DocBin

def read_in_chunks(file_path, chunk_size=1024):
    with open(file_path, 'r', encoding='utf-8') as file:
        while True:
            data = file.read(chunk_size)
            if not data:
                break
            yield data

def convert_json_to_spacy(json_file_path, spacy_file_path):
    # Read the file in chunks and combine the chunks
    file_content = ""
    for chunk in read_in_chunks(json_file_path):
        file_content += chunk

    # Parse the JSON data
    data = json.loads(file_content)

    # Prepare the data for spaCy
    spacy_format = []

    for item in data:
        text = item[0]  # The first element in the list is the text
        entities = item[1]['entities']  # The second element contains the dictionary with 'entities'
        spacy_entities = [(start, end, label) for start, end, label in entities]
        spacy_format.append({"text": text, "entities": spacy_entities})

    # Create a blank English model
    nlp = spacy.blank("en")

    # Initialize a DocBin object
    doc_bin = DocBin()

    # Convert the data to spaCy Doc objects and add to DocBin
    for entry in spacy_format:
        doc = nlp.make_doc(entry["text"])
        # Convert entities
        entities = []
        seen_positions = set()  # To track positions and avoid overlap
        for start, end, label in entry["entities"]:
            # Ensure span is within the document's length
            if start < 0 or end > len(doc.text) or start >= end:
                print(f"Invalid span: start={start}, end={end}, label={label}")
                continue

            # Check for overlaps and prioritize entities
            if not any(start < e_end and end > e_start for e_start, e_end, _ in seen_positions):
                span = doc.char_span(start, end, label=label)
                if span is not None:
                    entities.append(span)
                    seen_positions.add((start, end, label))
            else:
                print(f"Overlapping span: start={start}, end={end}, label={label}")

        # Set entities
        doc.ents = entities

        # Add to DocBin
        doc_bin.add(doc)

    # Save the DocBin to a .spacy file
    doc_bin.to_disk(spacy_file_path)

    print(f"Data has been successfully saved to {spacy_file_path}!")
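A minimal sketch of the conversion step, assuming the annotations appended by utils/anoter_to_json.py live at ./data/Json_Data.json and the training code expects ./data/Spacy_data.spacy (these paths match the defaults used elsewhere in this commit):

    # Illustrative call: turn the accumulated JSON annotations into a DocBin
    # that spaCy's training loop can read.
    from utils.json_to_spacy import convert_json_to_spacy

    convert_json_to_spacy("./data/Json_Data.json", "./data/Spacy_data.spacy")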
utils/model.py
ADDED
@@ -0,0 +1,89 @@
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.tokens import DocBin
import random

# Load the training data from the .spacy file
def load_data_from_spacy_file(file_path):
    # Initialize a blank English model to ensure compatibility
    nlp = spacy.blank("en")

    # Load the DocBin object and get documents
    try:
        doc_bin = DocBin().from_disk(file_path)
        docs = list(doc_bin.get_docs(nlp.vocab))
        return docs
    except Exception as e:
        print(f"Error loading data from .spacy file: {e}")
        return []


# Train model function
def train_model(epochs, model_path):
    # Initialize a blank English model
    nlp = spacy.blank("en")

    # Create an NER component and add it to the pipeline
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner")

    nlp.add_pipe("sentencizer")

    # Define all possible entity labels
    labels = [
        "PERSON", "CONTACT", "EMAIL", "ABOUT", "EXPERIENCE", "YEARS_EXPERIENCE",
        "UNIVERSITY", "SOFT_SKILL", "INSTITUTE", "LAST_QUALIFICATION_YEAR", "JOB_TITLE",
        "COMPANY", "COURSE", "DOB", "HOBBIES", "LINK", "SCHOOL", "QUALIFICATION",
        "LANGUAGE", "LOCATION", "PROJECTS", "SKILL", "CERTIFICATE"
    ]

    # Add labels to the NER component
    for label in labels:
        ner.add_label(label)

    # Load the training data
    train_data = load_data_from_spacy_file("./data/Spacy_data.spacy")

    # Start the training
    optimizer = nlp.begin_training()

    epoch_losses = []
    best_loss = float('inf')

    # Training loop
    for epoch in range(epochs):
        losses = {}
        random.shuffle(train_data)  # Shuffle data for better training

        # Create minibatches
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

        for batch in batches:
            texts, annotations = zip(*[(doc.text, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]}) for doc in batch])

            # Convert to Example objects
            examples = [Example.from_dict(nlp.make_doc(text), annotation) for text, annotation in zip(texts, annotations)]

            # Update the model
            nlp.update(examples, sgd=optimizer, drop=0.35, losses=losses)

        current_loss = losses.get("ner", float('inf'))
        epoch_losses.append(current_loss)

        print(f"Losses at epoch {epoch + 1}: {losses}")

        # Stop training if the loss is zero
        if current_loss == 0:
            break

        # Save the best model
        if current_loss < best_loss:
            best_loss = current_loss
            nlp.to_disk(model_path)

    # Save the final model
    nlp.to_disk(model_path)

    return epoch_losses
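A sketch of kicking off training and reloading the saved pipeline; the epoch count, output directory, and sample sentence below are illustrative, while train_model itself reads ./data/Spacy_data.spacy as hard-coded above:

    # Illustrative trainer invocation (not part of this commit).
    import spacy
    from utils.model import train_model

    losses = train_model(epochs=50, model_path="./Models/ner_model")  # assumed output dir
    print("Per-epoch NER losses:", losses)

    # Reload the trained pipeline for inference on new text
    nlp = spacy.load("./Models/ner_model")
    doc = nlp("John Doe - Python developer with 5 years of experience at Acme Corp.")
    print([(ent.text, ent.label_) for ent in doc.ents])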