anisrashidov committed
Commit a27e4e5 · verified · 1 Parent(s): ba8e246

Upload 3 files

Files changed (3)
  1. app.py +301 -59
  2. crawler.py +98 -0
  3. requirements.txt +19 -1
app.py CHANGED
@@ -1,64 +1,306 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
  )

  if __name__ == "__main__":
-     demo.launch()
+ # from fastapi import FastAPI
+ # from fastapi.middleware.cors import CORSMiddleware
+ from openai import OpenAI
+ from google import genai
+ from crawler import extract_data
+ import time
+ import os
+ from dotenv import load_dotenv
  import gradio as gr
+ # import multiprocessing
+ from together import Together
+
+ load_dotenv("../.env")
+ print("Environment variables:", os.environ)
+
+
+ together_client = Together(
+     api_key=os.getenv("TOGETHER_API_KEY"),
  )

+ gemini_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
+ genai_model = "gemini-2.0-flash-exp"
+
+ perplexity_client = OpenAI(api_key=os.getenv("PERPLEXITY_API_KEY"), base_url="https://api.perplexity.ai")
+ gpt_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+
+ def get_answers( query: str ):
+     context = extract_data(query, 1)
+     return context
+
+ # with torch.no_grad():
+ #     model = AutoModel.from_pretrained('BM-K/KoSimCSE-roberta')
+ #     tokenizer = AutoTokenizer.from_pretrained('BM-K/KoSimCSE-roberta', TOKENIZERS_PARALLELISM=True)
+
+ # def cal_score(input_data):
+ #     # Initialize model and tokenizer inside the function
+ #     with torch.no_grad():
+ #         inputs = tokenizer(input_data, padding=True, truncation=True, return_tensors="pt")
+ #         outputs = model.get_input_embeddings(inputs["input_ids"])
+
+ #     a, b = outputs[0], outputs[1]  # Adjust based on your model's output structure
+
+ #     # Normalize the tensors
+ #     a_norm = a / a.norm(dim=1)[:, None]
+ #     b_norm = b / b.norm(dim=1)[:, None]
+
+ #     print(a.shape, b.shape)
+
+ #     # Return the similarity score
+ #     # return torch.mm(a_norm, b_norm.transpose(0, 1)) * 100
+ #     a_norm = a_norm.reshape(1, -1)
+ #     b_norm = b_norm.reshape(1, -1)
+ #     similarity_score = cosine_similarity(a_norm, b_norm)
+
+ #     # Return the similarity score (assuming you want the average of the similarities across the tokens)
+ #     return similarity_score  # Scalar value
+
+
+ # def get_match_scores( message: str, query: str, answers: list[dict[str, object]] ):
+ #     start = time.time()
+ #     max_processes = 4
+ #     with multiprocessing.Pool(processes=max_processes) as pool:
+ #         scores = pool.map(cal_score, [[answer['questionDetails'], message] for answer in answers])
+ #     print(f"Time taken to compare: {time.time() - start} seconds")
+ #     print("Scores: ", scores)
+ #     return scores
+
+ def get_naver_answers( message: str ):
+     print(">>> Starting naver extraction...")
+     print("Question: ", message)
+     naver_start_time = time.time()
+     response = gemini_client.models.generate_content(
+         model = genai_model,
+         contents=f"{message}\n 위의 내용을 짧은 제목으로 요약합니다. 제목만 보여주세요. 대답하지 마세요",
+     )
+     query = response.text
+     print( "Query: ", query)
+
+     context = get_answers( query )
+
+     sorted_answers = ['. '.join(answer['answers']) for answer in context]
+     naver_end_time = time.time()
+     print(f"Time taken to extract from Naver: { naver_end_time - naver_start_time } seconds")
+     document = '\n'.join(sorted_answers)
+     return document, naver_end_time - naver_start_time
+
+ def get_qwen_big_answer( message: str ):
+     print(">>> Starting Qwen 72B extraction...")
+     qwen_start_time = time.time()
+     response = together_client.chat.completions.create(
+         model="Qwen/Qwen2.5-72B-Instruct-Turbo",
+         messages=[
+             {"role": "system", "content": "You are a helpful question-answer, CONCISE conversation assistant that answers in Korean."},
+             {"role": "user", "content": message}
+         ]
+     )
+
+     qwen_end_time = time.time()
+     print(f"Time taken to extract from Qwen: { qwen_end_time - qwen_start_time } seconds")
+     return response.choices[0].message.content, qwen_end_time - qwen_start_time
+
+ def get_qwen_small_answer( message: str ):
+     print(">>> Starting Qwen 7B extraction...")
+     qwen_start_time = time.time()
+     response = together_client.chat.completions.create(
+         model="Qwen/Qwen2.5-7B-Instruct-Turbo",
+         messages=[
+             {"role": "system", "content": "You are a helpful question-answer, CONCISE conversation assistant that answers in Korean."},
+             {"role": "user", "content": message}
+         ]
+         #TODO: Change the messages option
+     )
+     qwen_end_time = time.time()
+     print(f"Time taken to extract from Qwen: { qwen_end_time - qwen_start_time } seconds")
+     return response.choices[0].message.content, qwen_end_time - qwen_start_time
+
+ def get_llama_small_answer( message: str ):
+     print(">>> Starting Llama 3.1 8B extraction...")
+     llama_start_time = time.time()
+     response = together_client.chat.completions.create(
+         model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+         messages=[
+             {"role": "system", "content": "You are an artificial intelligence assistant and you need to engage in a helpful, CONCISE, polite question-answer conversation with a user."},
+             {
+                 "role": "user",
+                 "content": message
+             }
+         ]
+     )
+     llama_end_time = time.time()
+     print(f"Time taken to extract from Llama: { llama_end_time - llama_start_time } seconds")
+     return response.choices[0].message.content, llama_end_time - llama_start_time
+
+ def get_llama_big_answer( message: str ):
+     print(">>> Starting Llama 3.1 70B extraction...")
+     llama_start_time = time.time()
+     response = together_client.chat.completions.create(
+         model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+         messages=[
+             {"role": "system", "content": "You are an artificial intelligence assistant and you need to engage in a helpful, CONCISE, polite question-answer conversation with a user."},
+             {
+                 "role": "user",
+                 "content": message
+             }
+         ]
+     )
+     llama_end_time = time.time()
+     print(f"Time taken to extract from Llama: { llama_end_time - llama_start_time } seconds")
+     return response.choices[0].message.content, llama_end_time - llama_start_time
+
+
+ def get_gemini_answer( message: str ):
+     print(">>> Starting gemini extraction...")
+     gemini_start_time = time.time()
+     response = gemini_client.models.generate_content(
+         model = genai_model,
+         contents=message,
+     )
+     gemini_end_time = time.time()
+     print(f"Time taken to extract from Gemini: { gemini_end_time - gemini_start_time } seconds")
+     return response.text, gemini_end_time - gemini_start_time  # .text, for consistency: the other helpers return strings
+
+ # def get_perplexity_answer( message: str ):
+ #     print(">>> Starting perplexity extraction...")
+ #     perplexity_start_time = time.time()
+ #     messages = [
+ #         {
+ #             "role": "system",
+ #             "content": (
+ #                 "You are an artificial intelligence assistant and you need to "
+ #                 "engage in a helpful, CONCISE, polite question-answer conversation with a user."
+ #             ),
+ #         },
+ #         {
+ #             "role": "user",
+ #             "content": (
+ #                 message
+ #             ),
+ #         },
+ #     ]
+ #     response = perplexity_client.chat.completions.create(
+ #         model="llama-3.1-sonar-small-128k-online",
+ #         messages=messages
+ #     )
+ #     perplexity_end_time = time.time()
+ #     print(f"Time taken to extract from Perplexity: { perplexity_end_time - perplexity_start_time } seconds")
+ #     return response.choices[0].message.content, perplexity_end_time - perplexity_start_time
+
+ def get_gpt_answer( message: str ):
+     print(">>> Starting GPT extraction...")
+     gpt_start_time = time.time()
+     completion = gpt_client.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {"role": "system", "content": "You are a helpful assistant that gives short answers and nothing extra."},
+             {
+                 "role": "user",
+                 "content": message
+             }
+         ]
+     )
+     gpt_end_time = time.time()
+     print(f"Time taken to extract from GPT: { gpt_end_time - gpt_start_time } seconds")
+     return completion.choices[0].message.content, gpt_end_time - gpt_start_time
+
+ def compare_answers(message: str):
+     methods = [
+         ("Qwen Big (72B)", get_qwen_big_answer),
+         ("Qwen Small (7B)", get_qwen_small_answer),
+         ("Llama Small (8B)", get_llama_small_answer),
+         ("Llama Big (70B)", get_llama_big_answer),
+         ("Gemini-2.0-Flash", get_gemini_answer),
+         # ("Perplexity", get_perplexity_answer),
+         ("GPT (4o-mini)", get_gpt_answer)
+     ]
+
+     results = []
+
+     naver_docs, naver_time_taken = get_naver_answers( message )
+     content = f'아래 문서를 바탕으로 질문에 답하세요. 답변은 한국어로만 해주세요 \n 질문 {message}\n'
+     content += naver_docs
+     print("Starting the comparison between summarizers...")
+     for method_name, method in methods:
+         answer, time_taken = method(content)
+         results.append({
+             "Method": f"Naver + ({method_name})",
+             "Question": message,
+             "Answer": answer,
+             "Time Taken": naver_time_taken + time_taken
+         })
+
+     print("Starting the comparison between extractors/summarizers...")
+     for method_name, method in methods:
+         additional_docs, time_taken = method(message)
+         results.append({
+             "Method": method_name,
+             "Question": message,
+             "Answer": additional_docs,
+             "Time Taken": time_taken
+         })
+         content += f'\n{additional_docs}'
+         time_taken += naver_time_taken
+         for summarizer_name, summarizer in methods:
+             answer, answer_time = summarizer(content)
+             results.append({
+                 "Method": f"Naver + {method_name} + ({summarizer_name})",
+                 "Question": message,
+                 "Answer": answer,
+                 "Time Taken": time_taken + answer_time
+             })
+     return results
+
+ def chatFunction( message, history ):
+     content = f'아래 문서를 바탕으로 질문에 답하세요. 답변에서 질문을 따라 출력 하지 마세요. 답변은 한국어로만 해주세요. 찾은 Naver 문서와 다른 문서에서 답변이 없는 내용은 절대 출력하지 마세요 \n 질문: {message}\n 문서: '
+     naver_docs, naver_time_taken = get_naver_answers( message )
+
+     start_time = time.time()
+     content += "\n Naver 문서: " + naver_docs
+
+     completion = gpt_client.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {"role": "system", "content": "You are a helpful assistant that answers only in korean."},
+             {
+                 "role": "user",
+                 "content": message
+             }
+         ]
+     )
+     gpt_resp = completion.choices[0].message.content
+     content += "\n 다른 문서: " + gpt_resp
+
+     # content += "\n" + gpt_resp
+
+     answer, _ = get_qwen_small_answer(content)
+
+     print("-"*70)
+     print("Question: ", message)
+     print("Answer: ", answer)
+     time_taken = time.time() - start_time
+     print("Time taken to summarize: ", time_taken)
+     return answer
+

  if __name__ == "__main__":
+     # multiprocessing.set_start_method("fork", force=True)
+     # if multiprocessing.get_start_method(allow_none=True) is None:
+     #     multiprocessing.set_start_method("fork")
+     demo = gr.ChatInterface(fn=chatFunction, type="messages")
+     demo.launch(share=True)
+     # with open("test_questions.txt", "r") as f:
+     #     if os.path.exists("comparison_results.csv"):
+     #         if input("Do you want to delete the former results? (y/n): ") == "y":
+     #             os.remove("comparison_results.csv")
+     #     questions = f.readlines()
+     #     print(questions)
+     #     for idx, question in enumerate(questions):
+     #         print(" -> Starting the question number: ", idx)
+     #         results = compare_answers(question)
+     #         df = pd.DataFrame(results)
+     #         df.to_csv("comparison_results.csv", mode='a', index=False)
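A note on configuration: app.py pulls every credential through os.getenv after load_dotenv("../.env"), so it expects a .env file one directory above the app. A minimal sketch, with placeholder values and only the variable names actually read in this commit:

    TOGETHER_API_KEY=<your-together-key>
    GEMINI_API_KEY=<your-gemini-key>
    PERPLEXITY_API_KEY=<your-perplexity-key>
    OPENAI_API_KEY=<your-openai-key>

Be aware that print("Environment variables:", os.environ) will echo these secrets into the Space logs.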
crawler.py ADDED
@@ -0,0 +1,98 @@
+ from bs4 import BeautifulSoup
+ import re
+ import requests as r
+ from html2text import html2text
+ import tqdm
+
+ def process_url(url):
+     """Process a single URL to fetch answers."""
+     try:
+         response = r.get(url)
+         soup = BeautifulSoup(response.text, "html.parser")
+         # answers = []
+         # for idx in range(1, 100):
+         #     answer = soup.find('div', {'id': f'answer_{idx}'})
+         #     if answer:
+         #         answers.append(answer)
+         #     else:
+         #         break
+         answers = soup.find_all('div', {'id': re.compile(r'answer_\d+')})
+         answers = [html2text(str(answer.find('div', {'class': "answerDetail"}).prettify()))
+                    for answer in answers if answer.find('div', {'class': "answerDetail"})]
+         title = soup.find('div', {'class': 'endTitleSection'}).text.strip()
+         questionDetails = soup.find('div', {'class': 'questionDetail'}).text.strip()
+         # print("Question: ", questionDetails, '\n')
+         title = title.replace("질문", '').strip()
+         print("Answers extracted from: \n", url)
+         print(len(answers))
+         print('-'*60)
+         return {
+             "title": title,
+             "questionDetails": questionDetails,
+             "url": url,
+             "answers": answers
+         }
+     except Exception as e:
+         print(f"Error processing URL {url}: {e}")
+         with open('error_urls.txt', 'a') as f:  # append, so earlier failed URLs are not overwritten
+             f.write(url + '\n')
39
+ return {"title": '', "questionDetails": '', "url": url, "answers": ''}
40
+
41
+ def get_answers(results_a_elements, query):
42
+ """Fetch answers for all the extracted result links."""
43
+ if not results_a_elements:
44
+ print("No results found.")
45
+ return []
46
+
47
+ print("Result links extracted: ", len(results_a_elements))
48
+
49
+ # Limit the number of parallel processes for better resource management
50
+ # max_processes = 4
51
+
52
+ # with multiprocessing.Pool(processes=max_processes) as pool:
53
+ # results = pool.map(process_url, results_a_elements)
54
+
55
+ results = []
56
+ for url in tqdm.tqdm(results_a_elements):
57
+ results.append(process_url(url))
58
+ return results
59
+
60
+ def get_search_results(query, num_pages):
61
+ """Fetch search results for the given query from Naver 지식in."""
62
+ results = []
63
+ for page in range(1, num_pages + 1):
64
+ url = f"https://kin.naver.com/search/list.naver?query={query}&page={page}"
65
+ print("Starting the scraping process for:\n", url)
66
+
67
+ try:
68
+ response = r.get(url)
69
+ soup = BeautifulSoup(response.text, "html.parser")
70
+ results_a_elements = soup.find("ul", {"class": "basic1"}).find_all("a", {"class": "_searchListTitleAnchor"})
71
+ results_a_elements = [a.get('href') for a in results_a_elements if a.get("href")]
72
+ results += results_a_elements
73
+ except Exception as e:
74
+ print(f"Error while fetching search results: {e}")
75
+ return results
76
+
77
+ def extract_data(query, num_pages=150) -> list[dict[str, object]]:
78
+ results_a_elements = get_search_results(query, num_pages)
79
+ answers = get_answers(results_a_elements, query)
80
+ print("Total answers collected:", len(answers))
81
+ return answers
82
+
83
+ # if __name__ == "__main__":
84
+ # start = time.time()
85
+ # query = "장래희망, 인공지능 개발자/연구원, 파이썬, 중학생 수준, 파이썬 설치, 도서 추천"
86
+ # answers = process_query(query)
87
+ # print("Total answers collected:", len(answers))
88
+ # print("Time taken: ", time.time() - start)
89
+ # # print(answers)
90
+
91
+
92
+
93
+
94
+ # AJAX URL:
95
+ # https://kin.naver.com/ajax/detail/answerList.naver?
96
+ # dirId=401030201&docId=292159869
97
+ # &answerSortType=DEFAULT&answerViewType=DETAIL
98
+ # &answerNo=&page=2&count=5&_=1736131792605
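A minimal usage sketch for the new crawler module (the query string is hypothetical; extract_data performs live requests against kin.naver.com):

    from crawler import extract_data

    # Each result is a dict with keys "title", "questionDetails", "url", "answers".
    results = extract_data("파이썬 설치", num_pages=1)
    for item in results:
        print(item["url"], "-", len(item["answers"]), "answers")

app.py calls extract_data(query, 1), so only the first search-results page is crawled there; the num_pages=150 default applies only when the module is used directly.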
requirements.txt CHANGED
@@ -1 +1,19 @@
- huggingface_hub==0.25.2
+ beautifulsoup4
+ # selenium
+ # webdriver-manager
+ # fastapi[standard]
+ # uvicorn[standard]
+ html2text
+ transformers
+ openai
+ google-genai
+ # transformers[torch]
+ # torch
+ # torchvision
+ # torchaudio
+ gradio
+ # scikit-learn
+ together
+ python-dotenv
+ openpyxl
+ tonic-validate
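To reproduce the environment locally, the standard invocation applies (the list[dict[str, object]] annotation in crawler.py assumes Python 3.9+):

    pip install -r requirements.txt

Note that crawler.py also imports requests and tqdm, which are not listed here; at present they arrive only as transitive dependencies (for example via gradio and huggingface_hub), so pinning them explicitly would make this file self-contained.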