Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- resume_parser.py +98 -64
- segmenter.py +34 -27
resume_parser.py
CHANGED
@@ -1,10 +1,13 @@
|
|
1 |
from itertools import chain
|
2 |
from models import Models
|
3 |
-
#from models.prototype.models import Models
|
4 |
#from output_model import OutputModel, WorkExperience
|
5 |
from segmenter import ResumeSegmenter
|
6 |
from flashtext import KeywordProcessor
|
7 |
from collections import defaultdict
|
|
|
|
|
|
|
|
|
8 |
class ResumeParser():
|
9 |
def __init__(self) -> None:
|
10 |
self.resumeSegmenter = ResumeSegmenter()
|
@@ -28,99 +31,130 @@ class ResumeParser():
|
|
28 |
dates_indexes = [self.get_date_index(resume_lines[start_index:end_index], work_date) for work_date in work_dates]
|
29 |
dates_indexes = list(chain.from_iterable(dates_indexes))
|
30 |
dates_indexes = [i + start_index for i in dates_indexes]
|
31 |
-
#this list should be unique and ordered
|
32 |
dates_indexes = sorted([start_index+1] + dates_indexes + [end_index])
|
33 |
dates_indexes = set(dates_indexes)
|
34 |
-
dates_indexes = list(dates_indexes)
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
index = dates_indexes[i]
|
39 |
-
next_index = dates_indexes[i+1]
|
40 |
-
section = resume_lines[index:next_index]
|
41 |
if len(section) == 0:
|
42 |
-
|
43 |
-
|
44 |
-
|
|
|
45 |
|
46 |
def extract_section_text(self, resume_lines, section_header = "work_and_employment"):
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
48 |
start_index = sections[section_header][0]
|
49 |
end_index = sections[section_header][1]
|
50 |
#on the bases dates would be unique
|
51 |
return start_index, end_index
|
52 |
|
53 |
#more of a utils function
|
54 |
-
def sort_tokens_table(tokens_data):
|
55 |
table = {}
|
56 |
for key, tokens in tokens_data:
|
57 |
for token in tokens:
|
58 |
table[token] = key
|
59 |
return table
|
60 |
|
61 |
-
def format_output(self, keywords,
|
62 |
-
if isWorkExp:
|
63 |
-
headlines = [text[0] for text in work_section_list]
|
64 |
-
else:
|
65 |
-
headlines = work_section_list
|
66 |
-
table = self.sort_tokens_table(keywords)
|
67 |
-
tokens_processor = KeywordProcessor()
|
68 |
-
list_keywords = list(chain.from_iterable([tokens[1] for tokens in keywords]))
|
69 |
-
tokens_processor.add_keywords_from_list(list_keywords)
|
70 |
data = []
|
71 |
-
for
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
79 |
return data
|
|
|
80 |
|
81 |
-
def parse_work_history(self, resume_lines):
|
82 |
-
|
83 |
-
|
|
|
|
|
|
|
84 |
single_work_experiences = self.split_work_exp(resume_lines, start_index, end_index, work_dates)
|
85 |
-
|
86 |
-
|
87 |
-
keywords = [("date", work_dates), ("title", job_positions), ("company", companies)]
|
88 |
return self.format_output(keywords, single_work_experiences)
|
89 |
|
90 |
-
def parse_education(self, resume_lines):
|
91 |
-
start_index, end_index =
|
92 |
-
|
93 |
|
94 |
-
|
95 |
-
|
96 |
-
|
|
|
|
|
|
|
|
|
|
|
97 |
output = [res for res in output if res]
|
98 |
|
99 |
return output
|
100 |
|
101 |
-
def parse_basic_info(self,resume_lines):
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
|
|
|
|
|
|
107 |
|
108 |
output = {}
|
109 |
-
|
110 |
-
|
111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
return output
|
113 |
|
114 |
def parse(self, resume_lines):
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from itertools import chain
|
2 |
from models import Models
|
|
|
3 |
#from output_model import OutputModel, WorkExperience
|
4 |
from segmenter import ResumeSegmenter
|
5 |
from flashtext import KeywordProcessor
|
6 |
from collections import defaultdict
|
7 |
+
import re
|
8 |
+
import wordninja
|
9 |
+
from utils import percentage_difference
|
10 |
+
from nltk import word_tokenize
|
11 |
class ResumeParser():
|
12 |
def __init__(self) -> None:
|
13 |
self.resumeSegmenter = ResumeSegmenter()
|
|
|
31 |
dates_indexes = [self.get_date_index(resume_lines[start_index:end_index], work_date) for work_date in work_dates]
|
32 |
dates_indexes = list(chain.from_iterable(dates_indexes))
|
33 |
dates_indexes = [i + start_index for i in dates_indexes]
|
|
|
34 |
dates_indexes = sorted([start_index+1] + dates_indexes + [end_index])
|
35 |
dates_indexes = set(dates_indexes)
|
36 |
+
dates_indexes = sorted(list(dates_indexes))
|
37 |
+
individual_sections = []
|
38 |
+
for i, index in enumerate(dates_indexes):
|
39 |
+
section = resume_lines[index:dates_indexes[min(i+1, len(dates_indexes)-1)]]
|
|
|
|
|
|
|
40 |
if len(section) == 0:
|
41 |
+
continue
|
42 |
+
individual_sections.append(section)
|
43 |
+
|
44 |
+
return individual_sections
|
45 |
|
46 |
def extract_section_text(self, resume_lines, section_header = "work_and_employment"):
    """Locate the line span of one resume section.

    Runs the segmenter over ``resume_lines`` and looks ``section_header`` up
    in the section mapping it returns.

    Args:
        resume_lines: The resume as a list of text lines.
        section_header: Key of the section to look up.

    Returns:
        ``(start_index, end_index)`` taken from the first two items of the
        section's entry, or ``None`` when the segmenter reports no sections
        or the requested section is not among them.
    """
    _, sections = self.resumeSegmenter.get_parsed_sections(resume_lines)
    # The segmenter signals "nothing found" with None. No diagnostic output
    # is produced here: callers only need the indices, not a dump on stdout.
    if sections is None or section_header not in sections:
        return None
    bounds = sections[section_header]
    return bounds[0], bounds[1]
|
57 |
|
58 |
#more of a utils function
|
59 |
+
def sort_tokens_table(self, tokens_data):
    """Invert ``(key, tokens)`` pairs into a ``token -> key`` lookup.

    Args:
        tokens_data: Iterable of ``(key, tokens)`` pairs, where ``tokens`` is
            an iterable of hashable tokens.

    Returns:
        A dict mapping every token to the key of the pair it appeared in.
        When a token appears under several keys, the last pair wins.
    """
    return {
        token: label
        for label, group in tokens_data
        for token in group
    }
|
65 |
|
66 |
+
def format_output(self, keywords, headlines, isWorkExp=True):
    """Build one result dict per section from the entities found in it.

    Args:
        keywords: Sequence of ``(label, candidates)`` entries; ``candidates``
            is an iterable of strings to search for in each section.
        headlines: Iterable of sections, each a list of text lines.
        isWorkExp: When true the section's lines are joined with newlines,
            otherwise with single spaces.

    Returns:
        A list with one dict per section. Each dict has a ``'description'``
        entry plus, for every label with at least one candidate occurring in
        the section text, a de-duplicated list of the matching candidates in
        the order they were supplied.
    """
    data = []
    for section in headlines:
        separator = '\n' if isWorkExp else ' '
        paragraph = separator.join(section)
        # Re-segmented copy of the text produced by wordninja.split.
        recovered_headlines = ' '.join(wordninja.split(paragraph))

        extracted_data = {'description': paragraph}
        # Prefer the re-segmented text when its token count diverges strongly
        # from the raw text (threshold 50 on percentage_difference, which is
        # defined in utils -- presumably a percentage; confirm there).
        if percentage_difference(len(word_tokenize(paragraph)), len(word_tokenize(recovered_headlines))) > 50:
            extracted_data['description'] = recovered_headlines

        for attr in keywords:
            label, candidates = attr[0], attr[1]
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the output no longer depends on string hash randomisation.
            matches = list(dict.fromkeys(
                s for s in candidates
                if s in paragraph or s in recovered_headlines
            ))
            if matches:
                extracted_data[label] = matches
        data.append(extracted_data)
    return data
|
82 |
+
|
83 |
|
84 |
+
def parse_work_history(self, resume_lines, sections):
    """Extract structured work-experience entries from the resume.

    Args:
        resume_lines: The resume as a list of text lines.
        sections: Mapping of section name to a ``(start, end)`` pair; the
            ``'work_and_employment'`` entry is used.

    Returns:
        The list produced by ``format_output`` for the individual work
        experiences, labelled with date, title, company and location.
    """
    first, last = sections['work_and_employment']
    section_text = ' '.join(resume_lines[first:last])
    # Second view of the same text, re-segmented by wordninja.split.
    resegmented_text = ' '.join(wordninja.split(section_text))

    ner_result = self.models.get_ner(section_text, resegmented_text)
    work_dates, companies, locations = ner_result

    # Dates are handed to split_work_exp to cut the section into experiences.
    experiences = self.split_work_exp(resume_lines, first, last, work_dates)

    custom_entities = self.models.get_custom_ner(' '.join(resume_lines[first:last]))
    keywords = [
        ("date", work_dates),
        ("title", custom_entities['job title']),
        ("company", companies),
        ("location", locations),
    ]
    return self.format_output(keywords, experiences)
|
95 |
|
96 |
+
def parse_education(self, resume_lines, sections):
    """Extract structured education entries from the resume.

    Args:
        resume_lines: The resume as a list of text lines.
        sections: Mapping of section name to a ``(start, end)`` pair; the
            ``"education_and_training"`` entry is used.

    Returns:
        The non-empty dicts produced by ``format_output`` for the individual
        education entries, labelled with date, major, degree, university and
        location.
    """
    first, last = sections["education_and_training"]
    section_text = ' '.join(resume_lines[first:last])
    resegmented_text = ' '.join(wordninja.split(section_text))

    dates, universities, locations = self.models.get_ner(section_text, resegmented_text)
    entries = self.split_work_exp(resume_lines, first, last, dates)

    # The custom NER pass deliberately starts one line after the section
    # start, exactly as the original slice did.
    custom_entities = self.models.get_custom_ner(' '.join(resume_lines[first + 1:last]))
    degrees = custom_entities['degree']
    majors = custom_entities['major']

    keywords = [
        ("date", dates),
        ("major", majors),
        ("degree", degrees),
        ("university", universities),
        ("location", locations),
    ]
    formatted = self.format_output(keywords, entries, False)
    # Drop entries that came back empty.
    return list(filter(None, formatted))
|
111 |
|
112 |
+
def parse_basic_info(self,resume_lines, sections):
    """Extract the candidate's name, e-mail address and phone number.

    Each item is searched for in the ``"basics_info"`` section first and, if
    nothing is found there, in the whole resume.

    Args:
        resume_lines: The resume as a list of text lines.
        sections: Mapping of section name to a ``(start, end)`` pair; the
            ``"basics_info"`` entry is used.

    Returns:
        A dict with ``'email'`` and ``'phone'`` (empty string when absent)
        and, when a suitable person entity exists, ``'name'``.
    """
    start_index, end_index = sections["basics_info"]
    text = ' '.join(resume_lines[start_index:end_index])
    full_text = ' '.join(resume_lines)
    phone_pattern = r'(?:(?:\+?\d{1,2}[-.\s]?)?(?:\(\d{1,4}\)|\d{1,4})[-.\s]?)?(?:\d{1,5}[-.\s]?){1,4}\d{1,6}'
    # The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a
    # literal '|' as part of the address.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

    entities = self.models.ner(text)
    if len(entities) == 0:
        entities = self.models.ner(full_text)

    output = {}
    best_score = 0
    for entity in entities:
        # Keep the highest-scoring PER entity whose text contains a space;
        # single-word hits are ignored.
        if entity['entity_group'] == 'PER' and entity['score'] > best_score and ' ' in entity['word']:
            output['name'] = entity['word']
            best_score = entity['score']

    # re.findall returns a list, so emptiness (not equality with '') decides
    # whether to fall back to searching the whole resume.
    emails = re.findall(email_pattern, text) or re.findall(email_pattern, full_text)
    phones = re.findall(phone_pattern, text) or re.findall(phone_pattern, full_text)
    output['email'] = emails[0] if emails else ''
    output['phone'] = phones[0] if phones else ''
    return output
|
138 |
|
139 |
def parse(self, resume_lines):
    """Parse a resume into basic info, education, work history and free text.

    The segmenter's per-resume state is reset first, then the resume is
    segmented and each recognised section is handed to its parser.

    Args:
        resume_lines: The resume as a list of text lines.

    Returns:
        ``{}`` when segmentation yields ``None``. Otherwise a dict with
        ``"basic_info"``, ``"education"`` and ``"work_experience"`` (each
        ``{}`` when its section is missing) plus one entry per remaining
        section holding that section's text after wordninja re-segmentation.
    """
    segmenter = self.resumeSegmenter
    # Reset the segmenter's accumulated state before handling this resume.
    segmenter.resume_segments = {
        name: []
        for name in (
            'objective',
            'work_and_employment',
            'education_and_training',
            'skills',
            'accomplishments',
            'misc',
        )
    }
    segmenter.resume_indices = []

    sections = segmenter.segment(resume_lines)
    if sections is None:
        return {}

    if 'work_and_employment' in sections:
        jobs = self.parse_work_history(resume_lines, sections)
    else:
        jobs = {}
    if 'education_and_training' in sections:
        education = self.parse_education(resume_lines, sections)
    else:
        education = {}
    if 'basics_info' in sections:
        basic_info = self.parse_basic_info(resume_lines, sections)
    else:
        basic_info = {}

    result = {"basic_info":basic_info, "education":education, "work_experience":jobs}

    structured = ('work_and_employment', 'education_and_training', 'basics_info')
    for name in sections.keys():
        if name in structured:
            continue
        # Every other section is kept as plain re-segmented text.
        begin, end = sections[name][0], sections[name][1]
        raw_text = '\n'.join(resume_lines[begin:end])
        result[name] = ' '.join(wordninja.split(raw_text))
    return result
|
segmenter.py
CHANGED
@@ -1,8 +1,14 @@
|
|
1 |
from flashtext import KeywordProcessor
|
2 |
import json
|
|
|
|
|
|
|
|
|
|
|
3 |
class ResumeSegmenter():
|
4 |
|
5 |
def __init__(self):
|
|
|
6 |
self.resume_segments = {
|
7 |
'objective': [],
|
8 |
'work_and_employment': [],
|
@@ -12,41 +18,37 @@ class ResumeSegmenter():
|
|
12 |
'misc': []
|
13 |
}
|
14 |
self.resume_indices = []
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
-
def get_average_line_len(self, lines):
|
17 |
-
sum = 0
|
18 |
-
for line in lines:
|
19 |
-
sum+=len(line)
|
20 |
-
return sum / len(lines)
|
21 |
|
22 |
-
def get_average_words_per_line(self, lines):
|
23 |
-
sum = 0
|
24 |
-
for line in lines:
|
25 |
-
#other stopwords too?
|
26 |
-
sum+= len(line.split(' '))
|
27 |
-
return sum/ len(lines)
|
28 |
|
29 |
def find_segment_indices(self, text_list):
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
keyword_processor = KeywordProcessor()
|
35 |
-
keyword_processor.add_keywords_from_dict(keyword_dict=section_headers)
|
36 |
-
average_words_per_line = self.get_average_words_per_line(text_list)
|
37 |
-
|
38 |
for i, line in enumerate(text_list):
|
|
|
39 |
if line[0].islower() or line[-1] == '.':
|
40 |
continue
|
41 |
-
kys = keyword_processor.extract_keywords(line)
|
|
|
|
|
|
|
|
|
42 |
if len(kys) > 0:
|
43 |
-
|
44 |
-
if len(line.split(" ")) > average_words_per_line * 0.75:
|
45 |
continue
|
46 |
-
|
47 |
self.resume_indices.append(i)
|
48 |
self.resume_segments[kys[0]].append(i)
|
49 |
-
|
|
|
50 |
def slice_segments(self, lines):
|
51 |
sections = {}
|
52 |
if len(self.resume_indices) == 0:
|
@@ -73,12 +75,14 @@ class ResumeSegmenter():
|
|
73 |
start = max(s[0], interval[0])
|
74 |
end = min(s[1], interval[1])
|
75 |
return [start, end], section
|
|
|
76 |
def segment(self, resume_lines):
|
77 |
self.find_segment_indices(resume_lines)
|
78 |
sections = self.slice_segments(resume_lines)
|
79 |
-
|
|
|
80 |
sections_list = [(k, v) for k,v in sections.items() if len(v) > 0 ]
|
81 |
-
intersection_intervals = []
|
82 |
|
83 |
for i, s in enumerate(sections_list[:-1]):
|
84 |
result = self.get_interval_intersection(sections_list[i+1:], s[1])
|
@@ -90,14 +94,17 @@ class ResumeSegmenter():
|
|
90 |
intersection_intervals.append((a,b,s[0]))
|
91 |
|
92 |
if len(intersection_intervals) > 0:
|
93 |
-
print("there are intersections", intersection_intervals)
|
94 |
#needs last method of cleaning overlapping intervals with zero shot
|
95 |
#classifier + substract intervals
|
96 |
return sections
|
97 |
|
98 |
def get_parsed_sections(self, resume_lines):
|
|
|
99 |
text_segments = {}
|
100 |
sections = self.segment(resume_lines)
|
|
|
|
|
101 |
for header_title, section in sections.items():
|
102 |
lines = resume_lines[section[0]:section[1]]
|
103 |
text_segments[header_title] = lines
|
|
|
1 |
from flashtext import KeywordProcessor
|
2 |
import json
|
3 |
+
import nltk
|
4 |
+
from nltk.tokenize import word_tokenize,LineTokenizer
|
5 |
+
from utils import get_average_words_per_line, get_average_line_len
|
6 |
+
import wordninja
|
7 |
+
nltk.download('punkt')
|
8 |
class ResumeSegmenter():
|
9 |
|
10 |
def __init__(self):
|
11 |
+
#has to be reinitialized for each resume !!! could just check the initialization in get_parsed_sections
|
12 |
self.resume_segments = {
|
13 |
'objective': [],
|
14 |
'work_and_employment': [],
|
|
|
18 |
'misc': []
|
19 |
}
|
20 |
self.resume_indices = []
|
21 |
+
with open(r"models/prototype/sections.json") as f:
|
22 |
+
data = json.load(f)
|
23 |
+
self.section_headers = data["section_headers"]
|
24 |
+
f.close()
|
25 |
+
self.keyword_processor = KeywordProcessor()
|
26 |
+
self.keyword_processor.add_keywords_from_dict(keyword_dict=self.section_headers)
|
27 |
|
|
|
|
|
|
|
|
|
|
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
def find_segment_indices(self, text_list):
    """Record which lines of the resume look like section headings.

    For every candidate line the matching keyword (section name) is looked
    up with ``self.keyword_processor``; the line index is appended to
    ``self.resume_indices`` and to ``self.resume_segments[<section name>]``.

    Note that ``text_list`` is modified in place: when the tokenised form of
    a line yields keywords, the line is replaced by its tokens joined with
    single spaces.

    Args:
        text_list: The resume as a list of text lines.

    Returns:
        None. Results are accumulated on ``self``.
    """
    # Averages over the whole resume (helpers come from utils); used below
    # to reject lines that are too long to be a heading.
    average_words_per_line = get_average_words_per_line(text_list)
    average_sentence_length = get_average_line_len(text_list)

    for i, line in enumerate(text_list):
        # An empty line has no first/last character to inspect and cannot
        # be a heading; skipping it avoids an IndexError below.
        if not line:
            continue
        # Lines starting in lower case or ending with a full stop are
        # treated as body text.
        if line[0].islower() or line[-1] == '.':
            continue

        kys = self.keyword_processor.extract_keywords(line)
        normalized = ' '.join(word_tokenize(line))
        normalized_kys = self.keyword_processor.extract_keywords(normalized)
        if normalized_kys != []:
            # Keep the normalised spelling so later stages see the same
            # text the keyword match was made on.
            text_list[i] = line = normalized
            kys = normalized_kys

        if len(kys) == 0:
            continue
        # A keyword inside a line that is long by both measures is assumed
        # to be running text rather than a heading.
        if len(word_tokenize(line)) > average_words_per_line * 0.75 and len(line) > average_sentence_length:
            continue

        self.resume_indices.append(i)
        self.resume_segments[kys[0]].append(i)
|
50 |
+
|
51 |
+
|
52 |
def slice_segments(self, lines):
|
53 |
sections = {}
|
54 |
if len(self.resume_indices) == 0:
|
|
|
75 |
start = max(s[0], interval[0])
|
76 |
end = min(s[1], interval[1])
|
77 |
return [start, end], section
|
78 |
+
|
79 |
def segment(self, resume_lines):
|
80 |
self.find_segment_indices(resume_lines)
|
81 |
sections = self.slice_segments(resume_lines)
|
82 |
+
if sections is None:
|
83 |
+
return None
|
84 |
sections_list = [(k, v) for k,v in sections.items() if len(v) > 0 ]
|
85 |
+
"""intersection_intervals = []
|
86 |
|
87 |
for i, s in enumerate(sections_list[:-1]):
|
88 |
result = self.get_interval_intersection(sections_list[i+1:], s[1])
|
|
|
94 |
intersection_intervals.append((a,b,s[0]))
|
95 |
|
96 |
if len(intersection_intervals) > 0:
|
97 |
+
print("there are intersections", intersection_intervals)"""
|
98 |
#needs last method of cleaning overlapping intervals with zero shot
|
99 |
#classifier + subtract intervals
|
100 |
return sections
|
101 |
|
102 |
def get_parsed_sections(self, resume_lines):
|
103 |
+
|
104 |
text_segments = {}
|
105 |
sections = self.segment(resume_lines)
|
106 |
+
if sections is None:
|
107 |
+
return None, None
|
108 |
for header_title, section in sections.items():
|
109 |
lines = resume_lines[section[0]:section[1]]
|
110 |
text_segments[header_title] = lines
|