from bs4 import BeautifulSoup
import re
import requests as r
from html2text import html2text
import tqdm


def process_url(url):
    """Process a single URL to fetch answers."""
    try:
        response = r.get(url)
        soup = BeautifulSoup(response.text, "html.parser")

        # Earlier approach: probe answer_1, answer_2, ... until one is missing.
        # answers = []
        # for idx in range(1, 100):
        #     answer = soup.find('div', {'id': f'answer_{idx}'})
        #     if answer:
        #         answers.append(answer)
        #     else:
        #         break

        # Match every answer container in one pass.
        answers = soup.find_all('div', {'id': re.compile(r'answer_\d+')})
        # Convert each answer body to plain text / Markdown.
        answers = [
            html2text(answer.find('div', {'class': "answerDetail"}).prettify())
            for answer in answers
            if answer.find('div', {'class': "answerDetail"})
        ]

        title = soup.find('div', {'class': 'endTitleSection'}).text.strip()
        questionDetails = soup.find('div', {'class': 'questionDetail'}).text.strip()
        # print("Question: ", questionDetails, '\n')

        # Drop the leading "질문" ("question") label from the title.
        title = title.replace("질문", '').strip()

        print("Answers extracted from: \n", url)
        print(len(answers))
        print('-' * 60)

        return {
            "title": title,
            "questionDetails": questionDetails,
            "url": url,
            "answers": answers,
        }
    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        # Append rather than overwrite, so earlier failed URLs are kept.
        with open('error_urls.txt', 'a') as f:
            f.write(url + '\n')
        return {"title": '', "questionDetails": '', "url": url, "answers": []}


def get_answers(results_a_elements, query):
    """Fetch answers for all the extracted result links."""
    if not results_a_elements:
        print("No results found.")
        return []

    print("Result links extracted: ", len(results_a_elements))

    # Parallel alternative: limit the number of processes for better resource management.
    # max_processes = 4
    # with multiprocessing.Pool(processes=max_processes) as pool:
    #     results = pool.map(process_url, results_a_elements)

    results = []
    for url in tqdm.tqdm(results_a_elements):
        results.append(process_url(url))
    return results


def get_search_results(query, num_pages):
    """Fetch search result links for the given query from Naver 지식iN."""
    results = []
    for page in range(1, num_pages + 1):
        url = f"https://kin.naver.com/search/list.naver?query={query}&page={page}"
        print("Starting the scraping process for:\n", url)
        try:
            response = r.get(url)
            soup = BeautifulSoup(response.text, "html.parser")
            results_a_elements = soup.find("ul", {"class": "basic1"}).find_all(
                "a", {"class": "_searchListTitleAnchor"}
            )
            results += [a.get('href') for a in results_a_elements if a.get("href")]
        except Exception as e:
            print(f"Error while fetching search results: {e}")
    return results


def extract_data(query, num_pages=150) -> list[dict[str, object]]:
    results_a_elements = get_search_results(query, num_pages)
    answers = get_answers(results_a_elements, query)
    print("Total answers collected:", len(answers))
    return answers


# if __name__ == "__main__":
#     start = time.time()
#     query = "장래희망, 인공지능 개발자/연구원, 파이썬, 중학생 수준, 파이썬 설치, 도서 추천"
#     answers = extract_data(query)
#     print("Total answers collected:", len(answers))
#     print("Time taken: ", time.time() - start)
#     # print(answers)

# AJAX URL for paginated answer lists:
# https://kin.naver.com/ajax/detail/answerList.naver?
#     dirId=401030201&docId=292159869
#     &answerSortType=DEFAULT&answerViewType=DETAIL
#     &answerNo=&page=2&count=5&_=1736131792605
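
# Minimal sketch (an assumption, not part of the scraper above) of how the
# answerList.naver AJAX endpoint noted in the comment could be called directly
# with requests to page through answers. The dirId/docId values are taken from
# the example URL; the response payload format is not documented here, so this
# only prints the status code and a short preview of the body.
def fetch_answer_page(dir_id, doc_id, page=1, count=5):
    """Hypothetical helper: request one page of answers from the AJAX endpoint."""
    ajax_url = "https://kin.naver.com/ajax/detail/answerList.naver"
    params = {
        "dirId": dir_id,
        "docId": doc_id,
        "answerSortType": "DEFAULT",
        "answerViewType": "DETAIL",
        "answerNo": "",
        "page": page,
        "count": count,
    }
    resp = r.get(ajax_url, params=params)
    print("Status:", resp.status_code)
    print(resp.text[:200])  # Preview only; parse once the payload shape is known.
    return resp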
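
# Usage sketch: the query string and output path below are illustrative
# assumptions. extract_data() returns a list of dicts with "title",
# "questionDetails", "url", and "answers" keys, which serializes directly to JSON.
if __name__ == "__main__":
    import json
    import time

    start = time.time()
    results = extract_data("파이썬 설치", num_pages=2)
    with open("kin_answers.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("Time taken:", time.time() - start)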