Spaces:
Sleeping
Sleeping
File size: 4,184 Bytes
85b718b 2c04949 85b718b c0ea65f 85b718b c0ea65f 85b718b c0ea65f 85b718b c0ea65f 85b718b 320ee31 85b718b c0ea65f 85b718b c0ea65f 85b718b c0ea65f 85b718b c0ea65f 85b718b c0ea65f 85b718b c0ea65f 85b718b c0ea65f 85b718b c0ea65f 85b718b c0ea65f 85b718b c0ea65f 85b718b c0ea65f 85b718b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
import gradio as gr # import Gradio library for creating web-based user interfaces
from transformers import pipeline # import pipeline to use pre-trained models
import torch # import PyTorch library, which is commonly used for Deep Learning tasks
from bs4 import BeautifulSoup # import BeautifulSoup for parsing HTML & XML documnts
import requests # To make HTTP requests to retrieve web content.
def summarize_article(url, min_len, max_len):
    """Fetch the article at *url*, split it into chunks, and summarize each chunk.

    Parameters
    ----------
    url : str
        Web address of the article to summarize.
    min_len, max_len : int
        Minimum / maximum summary length (tokens) forwarded to the model.
        Gradio sliders may deliver floats, so both are coerced to int.

    Returns
    -------
    str
        The concatenated chunk summaries, or an ``"Error: ..."`` message if
        the request, parsing, or summarization fails (Gradio displays the
        returned string, so errors are reported as text rather than raised).
    """
    # Build the summarization pipeline (heavyweight: downloads/loads BART).
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    try:
        # Fetch the page; fail fast instead of hanging, and surface HTTP
        # error statuses (404/500) instead of summarizing an error page.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Parse the HTML and collect headline/sub-headline/paragraph text.
        soup = BeautifulSoup(r.text, 'html.parser')
        results = soup.find_all(['h1', 'h2', 'p'])
        article = ' '.join(result.text for result in results)
        if not article.strip():
            return "Error: no readable text found at that URL."
        # The model has a limited input window, so summarize in word chunks.
        chunks = _split_into_chunks(article, max_chunk=500)
        # One batched call summarizes every chunk.
        res = summarizer(chunks, max_length=int(max_len),
                         min_length=int(min_len), do_sample=False)
        return ' '.join(summ['summary_text'] for summ in res)
    # Broad on purpose: any failure (network, parse, model) becomes a
    # user-visible message in the output textbox.
    except Exception as e:
        return f"Error: {str(e)}"


def _split_into_chunks(article, max_chunk=500):
    """Split *article* into sentence-aligned chunks of at most ~max_chunk words.

    Sentence boundaries ('.', '?', '!') are kept intact; a sentence that would
    push the current chunk past *max_chunk* words starts a new chunk.
    Returns a list of chunk strings.
    """
    # Tag sentence endings with a marker, then split on it.
    for end in ('.', '?', '!'):
        article = article.replace(end, end + '<eos>')
    sentences = article.split('<eos>')

    chunks = []        # finished chunks, each a list of words
    current = []       # words of the chunk being built
    for sentence in sentences:
        words = sentence.split(' ')
        if current and len(current) + len(words) > max_chunk:
            chunks.append(current)
            current = words
        else:
            current.extend(words)
    if current:
        chunks.append(current)
    return [' '.join(c) for c in chunks]
# Create Gradio Interface
# Build the Gradio UI: a URL textbox plus two length sliders in,
# a summary textbox out. The slider values map to the min_len / max_len
# parameters of summarize_article.
interface = gr.Interface(
    fn=summarize_article,
    inputs=[
        gr.Textbox(label="Enter the article URL"),
        gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length"),
        gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length"),
    ],
    outputs=gr.Textbox(label="Summary"),
)

# Start the web server. (The original line ended with a stray "|" character,
# which is a syntax error; it has been removed.)
interface.launch()