Jonah Ramponi committed on
Commit
22be37d
·
1 Parent(s): 1c878ea

commiting day1

Browse files
Files changed (8) hide show
  1. .gitignore +128 -0
  2. app.py +126 -0
  3. backend.py +41 -0
  4. requirements.txt +4 -0
  5. utils/__init__.py +0 -0
  6. utils/gpt.py +40 -0
  7. utils/process_doc.py +40 -0
  8. utils/prompts.py +124 -0
.gitignore ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Project Specific
2
+ sample_data/
3
+
4
+ # Editors
5
+ .vscode/
6
+ .idea/
7
+
8
+ # Vagrant
9
+ .vagrant/
10
+
11
+ # Mac/OSX
12
+ .DS_Store
13
+
14
+ # Windows
15
+ Thumbs.db
16
+
17
+ # Source for the following rules: https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
18
+ # Byte-compiled / optimized / DLL files
19
+ __pycache__/
20
+ *.py[cod]
21
+ *$py.class
22
+
23
+ # C extensions
24
+ *.so
25
+
26
+ # Distribution / packaging
27
+ .Python
28
+ build/
29
+ develop-eggs/
30
+ dist/
31
+ downloads/
32
+ eggs/
33
+ .eggs/
34
+ lib/
35
+ lib64/
36
+ parts/
37
+ sdist/
38
+ var/
39
+ wheels/
40
+ *.egg-info/
41
+ .installed.cfg
42
+ *.egg
43
+ MANIFEST
44
+
45
+ # PyInstaller
46
+ # Usually these files are written by a python script from a template
47
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
48
+ *.manifest
49
+ *.spec
50
+
51
+ # Installer logs
52
+ pip-log.txt
53
+ pip-delete-this-directory.txt
54
+
55
+ # Unit test / coverage reports
56
+ htmlcov/
57
+ .tox/
58
+ .nox/
59
+ .coverage
60
+ .coverage.*
61
+ .cache
62
+ nosetests.xml
63
+ coverage.xml
64
+ *.cover
65
+ .hypothesis/
66
+ .pytest_cache/
67
+
68
+ # Translations
69
+ *.mo
70
+ *.pot
71
+
72
+ # Django stuff:
73
+ *.log
74
+ local_settings.py
75
+ db.sqlite3
76
+
77
+ # Flask stuff:
78
+ instance/
79
+ .webassets-cache
80
+
81
+ # Scrapy stuff:
82
+ .scrapy
83
+
84
+ # Sphinx documentation
85
+ docs/_build/
86
+
87
+ # PyBuilder
88
+ target/
89
+
90
+ # Jupyter Notebook
91
+ .ipynb_checkpoints
92
+
93
+ # IPython
94
+ profile_default/
95
+ ipython_config.py
96
+
97
+ # pyenv
98
+ .python-version
99
+
100
+ # celery beat schedule file
101
+ celerybeat-schedule
102
+
103
+ # SageMath parsed files
104
+ *.sage.py
105
+
106
+ # Environments
107
+ .env
108
+ .venv
109
+ env/
110
+ venv/
111
+ ENV/
112
+ env.bak/
113
+ venv.bak/
114
+
115
+ # Spyder project settings
116
+ .spyderproject
117
+ .spyproject
118
+
119
+ # Rope project settings
120
+ .ropeproject
121
+
122
+ # mkdocs documentation
123
+ /site
124
+
125
+ # mypy
126
+ .mypy_cache/
127
+ .dmypy.json
128
+ dmypy.json
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ For HF, the interface should be called app.py
3
+ """
4
+
5
+ import json
6
+ import concurrent.futures
7
+
8
+ import streamlit as st
9
+
10
+ from utils.process_doc import parse_docx, parse_pdf
11
+ from backend import process_cv, process_job_posting
12
+ from utils.gpt import test_api_key
13
+
14
# Help / placeholder copy for the two input widgets, kept together so the
# widget declarations below stay short.
CV_UPLOAD_HELP = "Upload your CV in .docx or .pdf form. This CV will be parsed, and used to analyse against the given job post."
JOB_POSTING_PLACEHOLDER = "Copy and Paste a job post you are interested in. Make sure to include the full post! More information is better."
JOB_POSTING_HELP = "In this box, please dump text content for a job description you are interested in. This could easily be setup to work directly with a webpage (we'd simply need to scrape said page) however I do not want to do that on HF spaces."

st.set_page_config(layout="wide")

# The Cohere key is typed into the sidebar by the user.
with st.sidebar:
    COHERE_API_KEY = st.text_input(
        "Cohere API Key Entry",
        value="",
        placeholder="Enter your Free Tier Cohere API Key",
    )

# One dict in session state carries everything that must survive a rerun;
# "formatted" flips to True once both documents have been processed.
if "state" not in st.session_state:
    st.session_state["state"] = {"formatted": False}

STATE = st.session_state["state"]


cv_upload_box = st.file_uploader(
    "CV Upload Box",
    help=CV_UPLOAD_HELP,
    type=["docx", "pdf"],
    accept_multiple_files=False,
)
job_posting_upload_box = st.text_area(
    "Job Description Upload Box",
    placeholder=JOB_POSTING_PLACEHOLDER,
    help=JOB_POSTING_HELP,
)
40
+
41
def _parse_model_json(raw_text, label, fallback):
    """Parse a JSON object out of a model reply, returning *fallback* on failure.

    Two candidates are tried in order:

    1. The span from the first ``{`` to the last ``}``. This keeps nested
       closing braces intact and ignores surrounding prose or code fences.
    2. The reply with any outer braces stripped and one pair re-added, which
       covers replies that omit the outer braces.

    Args:
        raw_text: Text returned by the model.
        label: Human-readable name used in the error message ("CV", ...).
        fallback: Dict returned when no candidate parses to a JSON object.

    Returns:
        The parsed dict, or *fallback*.
    """
    text = raw_text.strip()
    candidates = []

    first_brace = text.find("{")
    last_brace = text.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        candidates.append(text[first_brace : last_brace + 1])

    candidates.append("{" + text.lstrip("{").rstrip("}") + "}")

    last_error = None
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError as e:
            last_error = e
            continue
        if isinstance(parsed, dict):
            return parsed
        last_error = "top-level JSON value is not an object"

    print(
        f"Error parsing JSON Output for {label}: {last_error}. Response content: {raw_text}"
    )
    return fallback


if cv_upload_box and job_posting_upload_box != "":

    process_files = st.button("Process Files", type="primary")

    if process_files:
        if test_api_key(COHERE_API_KEY):

            # Process our two uploaded files into state variables
            STATE["job_posting"] = job_posting_upload_box

            # Lower-case the extension so "CV.DOCX" is not routed to the PDF parser.
            cv_filetype = cv_upload_box.name.rsplit(".", 1)[-1].lower()
            cv_file_contents = cv_upload_box.getvalue()

            STATE["cv"] = (
                parse_docx(cv_file_contents)
                if cv_filetype == "docx"
                else parse_pdf(cv_file_contents)
            )

            # Now, use Cohere to get structured output for both cv and job_posting

            # Making these calls in parallel
            with concurrent.futures.ThreadPoolExecutor() as executor:

                future1 = executor.submit(process_cv, STATE["cv"], COHERE_API_KEY)
                future2 = executor.submit(
                    process_job_posting, STATE["job_posting"], COHERE_API_KEY
                )

                cv_json_text = future1.result()
                job_posting_json_text = future2.result()

            # The fallbacks carry the keys the download section reads.
            STATE["cv_json"] = _parse_model_json(
                cv_json_text, "CV", {"name": "Failed"}
            )
            STATE["job_posting_json"] = _parse_model_json(
                job_posting_json_text, "Job Posting", {"companyName": "Failed"}
            )

            STATE["formatted"] = True
        else:
            st.error(
                "You entered an invalid Cohere API Key. Please enter a valid API key in the sidebar."
            )
103
+
104
def _filename_part(record, key):
    """Return a file-name-safe version of ``record[key]``.

    The value comes from model output, so the key may be missing or the value
    may not be a string. Missing or empty values become ``"unknown"``, and any
    character other than a letter, digit, ``-`` or ``_`` is replaced by ``_``.
    """
    value = record.get(key) if isinstance(record, dict) else None
    text = str(value) if value else "unknown"
    return "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in text)


# Now, we can work with this !
if STATE["formatted"]:
    lcol, rcol = st.columns((0.5, 0.5))
    with lcol:
        st.download_button(
            label="Download Job Posting JSON",
            data=json.dumps(STATE["job_posting_json"], indent=4),
            file_name=f"job_posting_formatted_{_filename_part(STATE['job_posting_json'], 'companyName')}.json",
            mime="application/json",
            use_container_width=True,
        )
    with rcol:
        st.download_button(
            label="Download CV JSON",
            data=json.dumps(STATE["cv_json"], indent=4),
            file_name=f"cv_formatted_{_filename_part(STATE['cv_json'], 'name')}.json",
            mime="application/json",
            use_container_width=True,
        )

    # The tab containers are created here; nothing is rendered into them yet.
    cv_critique, practice_interview, general_cv_critique = st.tabs(
        ["Role Specific CV Critique", "Practice Interview", "General CV Critique"]
    )
backend.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.prompts import (
2
+ cv_extract_prompt,
3
+ cv_format,
4
+ job_posting_extract_prompt,
5
+ job_posting_format,
6
+ )
7
+
8
+ from utils.gpt import gpt_response
9
+
10
+
11
def process_cv(cv_contents: str, API_KEY: str) -> str:
    """Ask Cohere to extract structured information from a CV.

    The ``<input-cv>`` placeholder in ``cv_extract_prompt`` is replaced with
    *cv_contents* and the filled prompt is sent through ``gpt_response``.

    Args:
        cv_contents: Plain text of the CV.
        API_KEY: Cohere API key forwarded to ``gpt_response``.

    Returns:
        The text returned by ``gpt_response``.
    """
    filled_prompt = cv_extract_prompt.replace("<input-cv>", cv_contents)
    return gpt_response(prompt=filled_prompt, api_key=API_KEY)
22
+
23
+
24
def process_job_posting(job_post_contents: str, API_KEY: str) -> str:
    """Ask Cohere to extract structured information from a job posting.

    The ``<job-posting>`` placeholder in ``job_posting_extract_prompt`` is
    replaced with *job_post_contents* and the filled prompt is sent through
    ``gpt_response``.

    Args:
        job_post_contents: Plain text of the job posting.
        API_KEY: Cohere API key forwarded to ``gpt_response``.

    Returns:
        The text returned by ``gpt_response``.
    """
    filled_prompt = job_posting_extract_prompt.replace(
        "<job-posting>", job_post_contents
    )
    return gpt_response(prompt=filled_prompt, api_key=API_KEY)
35
+
36
+
37
if __name__ == "__main__":
    # Manual smoke test: run the job-posting extraction on a local sample file.
    # process_job_posting requires an API key, so it is read from the
    # COHERE_API_KEY environment variable.
    import os

    api_key = os.environ.get("COHERE_API_KEY", "")
    if not api_key:
        raise SystemExit("Set the COHERE_API_KEY environment variable first.")

    with open("sample_data/meta_job.txt", "r", encoding="utf-8") as file:
        post_contents = file.read()

    output = process_job_posting(post_contents, api_key)
    print(output)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit==1.37.0
2
+ cohere==5.5.7
3
+ pymupdf==1.24.9
4
+ python-docx==1.1.2
utils/__init__.py ADDED
File without changes
utils/gpt.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cohere
2
+
3
+
4
def test_api_key(api_key: str) -> bool:
    """Check whether *api_key* can complete a minimal Cohere request.

    Args:
        api_key: Candidate Cohere API key.

    Returns:
        True if a 3-token generation request succeeds, False if the key is
        empty or the request raises.
    """
    # An empty or blank key cannot authenticate; skip the network round trip.
    if not api_key or not api_key.strip():
        return False

    try:
        co = cohere.Client(
            api_key=api_key,
        )
        # try to just generate 3 tokens
        co.generate(prompt="sample prompt", max_tokens=3)
    except Exception:
        # Any failure means the key is unusable for this app. Catching
        # Exception rather than a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        return False

    return True
15
+
16
+
17
def gpt_stream_response(prompt: str, api_key: str):
    """Stream a Cohere chat reply, yielding each generated text fragment.

    Only events whose ``event_type`` is ``"text-generation"`` are yielded;
    every other stream event is skipped.
    """
    client = cohere.Client(api_key=api_key)

    yield from (
        chunk.text
        for chunk in client.chat_stream(message=prompt)
        if chunk.event_type == "text-generation"
    )
29
+
30
+
31
def gpt_response(prompt: str, api_key: str) -> str:
    """Send *prompt* to Cohere's chat endpoint and return the reply text."""
    client = cohere.Client(api_key=api_key)
    reply = client.chat(message=prompt)
    return reply.text
utils/process_doc.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Script for processing an input CV document
3
+ """
4
+
5
+ import io
6
+
7
+ import fitz
8
+ from docx import Document
9
+
10
+
11
def parse_pdf(pdf_file) -> str:
    """Extract the plain text of every page of an in-memory PDF.

    Args:
        pdf_file: Raw PDF bytes, such as the value returned by
            ``getvalue()`` on a Streamlit uploaded file.

    Returns:
        The text of each page, in page order, separated by a blank line.
    """
    # The context manager closes the document even if extraction raises.
    # Keyword arguments make it explicit that the bytes are the stream and
    # "pdf" is the file type.
    with fitz.open(stream=pdf_file, filetype="pdf") as pdf_document:
        return "\n\n".join(page.get_text("text") for page in pdf_document)
27
+
28
+
29
def parse_docx(docx_file):
    """Return the paragraph text of an in-memory .docx file.

    *docx_file* holds the raw bytes of the document. Only the entries of
    ``document.paragraphs`` are read; they are joined with newlines.
    """
    buffer = io.BytesIO(docx_file)
    paragraphs = Document(buffer).paragraphs
    return "\n".join(para.text for para in paragraphs)
utils/prompts.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # NOTE: The _format variables may be useful for testing the beta of Cohere's structured output endpoints.
2
+
3
# JSON Schema describing the structured CV output.
# Sequence-valued fields use "array", the JSON Schema name for a list
# ("list" is not a JSON Schema type).
cv_format = {
    "type": "object",
    "required": [
        "name",
        "roles",
        "projects",
        "languages",
        "packages",
        "tools",
        "qualifications",
        "responsibilities",
    ],
    "properties": {
        "name": {"type": "string"},
        "roles": {"type": "array"},
        "projects": {"type": "array"},
        "languages": {"type": "array"},
        "packages": {"type": "array"},
        "tools": {"type": "array"},
        "qualifications": {"type": "array"},
        "responsibilities": {"type": "array"},
    },
}
26
+
27
+
28
# Prompt template for CV extraction; callers replace "<input-cv>" with the CV text.
# The example object has no trailing comma before the closing brace, so a model
# that copies its shape produces output that strict json.loads accepts.
cv_extract_prompt = """
You are an expert at extracting information from CVs

**Goal**
For a given CV, you must extract structured information and present it to the user in json form

**Input**
<input-cv>

**Output Format**
You will respond with a json object, in the form given.
You will ensure that you are concise.

{
    "name": ,
    "roles": [],
    "projects": [],
    "languages": [],
    "packages": [],
    "tools": [],
    "qualifications": [],
    "responsibilities": []
}

**Guidance**

- languages: programming languages mentioned in the CV. Each should be tagged with a number between 1 and 5. 5 suggests they must be fully fluent, 1 suggests they may have some experience. If the CV does not indicate the level of required experience, approximate it, and if no information at all is given put 3
- packages: specific packages mentioned in the CV. Each package should be tagged with a number between 1 and 5, 5 suggesting fully fluent and 1 suggesting just tried it once. Use your best judgement to gauge the individual's level of understanding.
- tools: a list of other tools the individual has experience with
- qualifications: of form [{"type": , "grade": ,"location": }] where type is the qualification type identified. Available Education Levels are: bsc,msc,phd. grade should be the grade achieved (number between 0 and 100. Make relevant conversions, if no number is given, assume 60). location is the location of where the education was taken.
- responsibilities: an extensive list of the responsibilities demonstrated in the CV.

You will now respond clearly, only responding with the desired json output.
"""
62
+
63
# JSON Schema describing the structured job-posting output.
# Every key in "required" has a matching entry in "properties"; the company
# field is "companyName", the key the app reads and the prompt asks for.
# Sequence-valued fields use the JSON Schema type "array".
job_posting_format = {
    "type": "object",
    "required": [
        "companyName",
        "roleShortDesc",
        "roleLongDesc",
        "requiredExperience",
        "languages",
        "packages",
        "tools",
        "qualifications",
        "responsibilities",
    ],
    "properties": {
        "companyName": {"type": "string"},
        "roleShortDesc": {"type": "string"},
        "roleLongDesc": {"type": "string"},
        "requiredExperience": {"type": "array"},
        "languages": {"type": "array"},
        "packages": {"type": "array"},
        "tools": {"type": "array"},
        "qualifications": {"type": "array"},
        "responsibilities": {"type": "array"},
    },
}
88
+
89
# Prompt template for job-posting extraction; callers replace "<job-posting>"
# with the posting text. The example object has no trailing comma before the
# closing brace, so a model that copies its shape produces output that strict
# json.loads accepts.
job_posting_extract_prompt = """
You are an expert at extracting information from job postings

**Goal**
For a given job posting, you must extract structured information and present it to the user in json form

**Input**
<job-posting>

**Output Format**
You will respond with a json object, in the form given.
You will ensure that you are concise.

{
    "companyName": ,
    "roleShortDesc": ,
    "roleLongDesc": ,
    "requiredExperience": [],
    "languages": [],
    "packages": [],
    "tools": [],
    "qualifications": [],
    "responsibilities": []
}

**Guidance**
- roleShortDesc should be one sentence only
- requiredExperience: a standardized list of items, each item should be at most one short sentence
- languages: programming languages mentioned in the post. Each should be tagged with a number between 1 and 5. 5 suggests they must be fully fluent, 1 suggests they may have some experience. If the job does not indicate the level of required experience, put 3
- packages: specific packages mentioned in the post. Each package should be tagged with a number between 1 and 5, 5 suggesting fully fluent and 1 suggesting just tried it once. If no level is given, assume level 3
- tools: a list of other tools that would be useful to know
- qualifications: of form [{"type": , "strictness": }] where type is the qualification type identified, and strictness is your approximation of how strict the job posting suggests the requirement is. 1 being not strict at all, 5 being absolute necessity. Available Education Levels are: bsc,msc,phd.
- responsibilities: an extensive list of the responsibilities given in the advert.

You will now respond clearly, only responding with the desired json output.
"""