import gradio as gr  # Gradio library for building web-based user interfaces
from transformers import pipeline  # pipeline API for running pre-trained models
import torch  # PyTorch, the deep-learning backend used by the model
from bs4 import BeautifulSoup  # BeautifulSoup for parsing HTML and XML documents
import requests  # for making HTTP requests to retrieve web content
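# Note: this script assumes gradio, transformers, torch, beautifulsoup4, and
# requests are installed (e.g., listed in the Space's requirements.txt).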
def summarize_article(url, min_len, max_len):
    # Create the summarization pipeline (the model is cached after the first download)
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    try:
        # Send an HTTP GET request to the user-supplied URL and retrieve the web page content
        r = requests.get(url)
        # Create a BeautifulSoup object to parse the HTML and extract the page's text content
        soup = BeautifulSoup(r.text, 'html.parser')
        # Find all <h1>, <h2> (heading) and <p> (paragraph) elements in the HTML content
        results = soup.find_all(['h1', 'h2', 'p'])
        # Extract the text content from each element and store it in a list called text
        text = [result.text for result in results]
        # Join all the extracted text into a single string representing the entire article
        ARTICLE = ' '.join(text)
        # Mark sentence-ending punctuation with a special token (<eos>) so the article
        # can later be split into smaller chunks for summarization
        ARTICLE = ARTICLE.replace('.', '.<eos>')
        ARTICLE = ARTICLE.replace('?', '?<eos>')
        ARTICLE = ARTICLE.replace('!', '!<eos>')
        # Split the article into sentences on the <eos> token
        sentences = ARTICLE.split('<eos>')
        # Maximum length (in words) for each chunk of text passed to the summarizer
        max_chunk = 500
        # Index of the chunk currently being filled
        current_chunk = 0
        # List that will hold the individual chunks of text (each stored as a list of words)
        chunks = []
        # Iterate through each sentence in the sentences list, packing sentences into chunks
        '''If the length of the current chunk (in words) plus the length of the current
        sentence (split on spaces) is at most max_chunk, the sentence is added to the
        current chunk. Otherwise, the current_chunk index is incremented and a new chunk
        is started with the current sentence as its first sentence and appended to chunks.
        '''
        for sentence in sentences:
            if len(chunks) == current_chunk + 1:
                if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                    chunks[current_chunk].extend(sentence.split(' '))
                else:
                    current_chunk += 1
                    chunks.append(sentence.split(' '))
            else:
                chunks.append(sentence.split(' '))
        ''' After processing all sentences, join each chunk back into a single string
        (rather than a list of words) so it can be fed to the summarizer.
        '''
        for chunk_id in range(len(chunks)):
            chunks[chunk_id] = ' '.join(chunks[chunk_id])
        # Summarize each chunk, bounding the output length by the user-selected slider
        # values (cast to int, since Gradio sliders may return floats)
        res = summarizer(chunks, max_length=int(max_len), min_length=int(min_len), do_sample=False)
        # Extract the 'summary_text' value from each summary in the res list
        summary = ' '.join([summ['summary_text'] for summ in res])
        return summary
    # Handle potential errors during the web request or parsing
    except Exception as e:
        return f"Error: {str(e)}"
# Create the Gradio interface
interface = gr.Interface(
    fn=summarize_article,
    inputs=[
        gr.Textbox(label="Enter the article URL"),
        gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length"),
        gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length")
    ],
    outputs=gr.Textbox(label="Summary")
)

interface.launch()
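# Tip: launch() accepts share=True to expose a temporary public link, which can
# be handy when testing this app outside of Hugging Face Spaces.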