Young Ho Shin committed
Commit · bfd4c2e
1 Parent(s): f326d7f

Add app.py and requirements.txt

- app.py +119 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,119 @@
import json
import numpy as np
import os

# Silence fork/parallelism warnings from the Hugging Face tokenizers library
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load all test data into a list of dictionaries
#summary_data_path = 'sci-news-sum-kr-50/data/'
#summary_objects = []
#for root, dirs, files in os.walk(summary_data_path):
#    files.sort()  # Sort file names
#    for ifile, file_name in enumerate(files):
#        with open(os.path.join(root, file_name)) as f:
#            s = json.load(f)
#            s['index'] = file_name.replace('.json', '')  # index = 'XY' for file 'XY.json'
#            s['sentences'] = [sen + '.' for sen in s['sentences']]  # Add punctuation to all sentences
#            s['body'] = ' '.join(s['sentences'])  # body is all sentences concatenated with spaces in between
#            summary_objects.append(s)
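
# NOTE: the commented-out block above loads the sci-news-sum-kr-50 dataset,
# presumably kept for offline evaluation during development; the running app
# never uses it.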

# Load spaCy to split text into sentences
import spacy

# Cache the language model; only sentence segmentation is needed, so disable
# every other pipeline component and enable the lightweight 'senter' instead
nlp = spacy.load("ko_core_news_sm")
nlp.select_pipes(disable=
    ['tok2vec', 'tagger', 'morphologizer', 'parser', 'lemmatizer', 'attribute_ruler', 'ner']
)
nlp.enable_pipe('senter')

def text_to_sentences(nlp, text):
    """Split Korean text into sentences."""
    doc = nlp(text)
    sentences = [sen for sen in doc.sents]
    return sentences
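
# Usage sketch (hypothetical input; requires ko_core_news_sm to be installed):
#   text_to_sentences(nlp, "드론이 등산로를 찾는다. 기술이 빠르게 발전한다.")
# returns one spaCy Span per sentence; the f-strings below render each Span
# as its text.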

from transformers import AutoConfig, AutoTokenizer, AutoModel
from summarizer import Summarizer

model_path = 'skt/kobert-base-v1'

# Load model, model config and tokenizer via Transformers
custom_config = AutoConfig.from_pretrained(model_path)
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained(model_path, do_lower_case=False)
custom_model = AutoModel.from_pretrained(model_path, config=custom_config)
model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
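
# bert-extractive-summarizer embeds each sentence using the model's hidden
# states (which is why output_hidden_states=True above), clusters the
# sentence embeddings, and returns the sentences closest to the cluster
# centroids.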

def create_summary(nlp, model, text):
    """Create a summary from the text of an article using the given model."""
    k = model.calculate_optimal_k(text, k_max=10)
    return text_to_sentences(nlp, model(text, num_sentences=k))
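
# calculate_optimal_k() comes from bert-extractive-summarizer: it evaluates
# cluster counts up to k_max and picks an elbow point, so the summary length
# adapts to each article instead of being fixed.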

from urllib.request import urlopen
from bs4 import BeautifulSoup

def extract_naver_news(url):
    """Get title, subtitle, and article body from a Naver News page."""
    html = urlopen(url).read()
    soup = BeautifulSoup(html, features="html.parser")

    title = soup.find(class_="media_end_head_headline").get_text()

    area = soup.find(id="dic_area")

    # A leading <strong> tag, if present, holds the subtitle
    subtitle_tag = area.find('strong')
    if subtitle_tag:
        subtitle = subtitle_tag.get_text('\n')
    else:
        subtitle = ''

    # Drop image captions from the article body
    for tag in area.find_all(class_="img_desc"):
        tag.extract()

    # Keep only strings that end with a period and join them with spaces
    article = ' '.join([text for text in area.stripped_strings if text[-1] == '.'])
    result = {
        'title': title,
        'subtitle': subtitle,
        'article': article,
    }
    return result
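
# This scraper depends on Naver's current markup (headline class
# "media_end_head_headline", body container id "dic_area"); if Naver renames
# either, find() returns None and the calls above raise AttributeError.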

import gradio as gr

def interface_handler(custom_text, naver_url, choice):
    # `choice` is the index of the selected radio option:
    # 1 = summarize a Naver News article, anything else = summarize the custom text
    if choice == 1:
        content = extract_naver_news(naver_url)
        summary_sentences = create_summary(nlp, model, content['article'])
        output_text = ""
        # output_text += f'Title:\n{content["title"]}\n'
        # output_text += f'Subtitle:\n{content["subtitle"]}\n'
        # output_text += '\nSummary:\n'
        for sen in summary_sentences:
            output_text += f'\n{sen}'
        return output_text
    else:
        output_text = ""
        summary_sentences = create_summary(nlp, model, custom_text)
        for sen in summary_sentences:
            output_text += f'\n{sen}'
        return output_text

default_url = "https://n.news.naver.com/article/015/0004692703?sid=102"
default_text = """
'Convolutional neural network' learning technology uses cameras to find specific objects in images.
Drones that pick out mountain trails in a forest could even take on mountain-rescue work.
Last month the US Department of Defense unveiled a drone (unmanned aerial vehicle) that uses artificial intelligence (AI) to identify and attack enemies without human help. On its camera feed the drone can tell a person armed with a gun from an unarmed one. It can even find a person wearing an enemy uniform and track the car he gets into. It foreshadows drones that carry out battlefield missions like special-forces soldiers, with no remote pilot.
What lets this drone recognize and track a target in camera footage without human eyes is 'machine learning'. Machine learning, a branch of AI, means giving a computer the ability to learn on its own the way humans do. It works by imitating the 'neural network', the information-processing structure of primates, humans included. Most AI unveiled so far, including Google's 'AlphaGo', which beat Lee Sedol 9-dan at Go, runs machine-learning algorithms built on deep neural networks.
The technique for finding a particular object in an image is modeled not on the human brain but on a cat's. In the 'convolutional neural network' structure found in a cat's visual cortex, the responses sent by the visual cells are divided into several layers, which are simplified step by step over three stages to grasp an object's color and shape. David Hubel and Torsten Wiesel, who first studied this, received the 1981 Nobel Prize in Physiology or Medicine. AI scientists took this idea and designed algorithms that identify objects in images: first extract the large features, then progressively discover smaller, more complex ones. Suppose there is a car in a photo. The algorithm first checks the object's overall outline and compares it with previously entered photo data to narrow the candidates. It then picks up finer features such as the tires or the maker's emblem and concludes that 'the object in the photo is a car'. A Google vision researcher explained that 'compared with other machine-learning architectures, convolutional neural networks perform well on images and speech', and that 'using them, a computer can work out what an object it has never seen before is'.
The aerial-photography drones around us use a simpler but similar technology. The 'Phantom 4' from China's DJI, the world's top drone maker, carries two camera sensors like a pair of human eyes, letting it identify a moving object and follow it at a fixed distance: the so-called 'ActiveTrack' feature. With ActiveTrack on, the drone recognizes the outline of whatever object or person the user designates in units of pixels (the small square dots that make up an image), and the aircraft moves so as to keep those pixels at a constant size. Say a nearby person is designated and measures 100x100 pixels. If the person walks away and shrinks to 80x80 pixels, the drone flies forward to restore the original 100x100.
Scientists are applying machine learning built on convolutional neural networks to let drones search in place of humans. A research team at the University of Zurich in Switzerland is studying drones that find missing persons in the Alps. The team's AI drone uses camera images to tell dense forest from hiking trails and feeds the result to the flight controller to set its direction of travel. In the team's first experiment, completed this year, the drone found hiking trails better than humans did. The researchers trained the AI's deep neural network for three days on about 20,000 photos of Alpine trails, then had the drone fly trails it had never seen. Human eyes identified the new trails 82% of the time; the AI drone succeeded 85% of the time. The Zurich team said AI drones could soon be deployed in the field and take over the mountain-rescue job of finding missing persons.
Neural-network learning can be put to many other uses. The head of DJI Korea said that for drones carrying AI, 'the range of uses is endless: spotting defects in industrial facilities such as transmission towers and power lines, detecting forest fires, and detecting obstacles or military targets'.
"""

demo = gr.Interface(
    fn=interface_handler,
    inputs=[
        gr.inputs.Textbox(lines=5, placeholder=None, default=default_text, label="Custom text", optional=False),
        gr.inputs.Textbox(lines=1, placeholder=None, default=default_url, label="Naver News article URL", optional=False),
        # type="index" hands interface_handler the selected option's position (0 or 1)
        gr.inputs.Radio(["Summarize custom text", "Summarize Naver News article"], type="index", default=None, label=None, optional=False)
    ],
    outputs=[
        gr.outputs.Textbox(label="Summary"),
    ],
)

if __name__ == "__main__":
    demo.launch(debug=True)
requirements.txt
ADDED
@@ -0,0 +1,3 @@
sentence-transformers==2.2.0
bert-extractive-summarizer==0.10.1
spacy==3.3.0