import gradio as gr  # Gradio library for building web-based user interfaces
from transformers import pipeline  # pipeline for loading pre-trained models
import torch  # PyTorch, the deep learning backend used by the transformers pipeline
from bs4 import BeautifulSoup  # BeautifulSoup for parsing HTML & XML documents
import requests  # requests for making HTTP requests to retrieve web content



def summarize_article(url, min_len, max_len):
    # Create the summarization pipeline
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
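    # Note (not part of the original code): constructing the pipeline here reloads the
    # model on every call. A common alternative is to build the summarizer once at
    # module level and reuse it across requests, which avoids the repeated loading cost.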

    try:
        # Send an HTTP GET request to the user-provided URL and retrieve the web page content
        r = requests.get(url)

        # Create a BeautifulSoup object to parse the HTML and extract the text content of the page
        soup = BeautifulSoup(r.text, 'html.parser')

        # Find all the <h1>/<h2> (heading) and <p> (paragraph) elements in the HTML content
        results = soup.find_all(['h1','h2','p'])

        # Extract the text content from each element and store it in a list called text
        text = [result.text for result in results]

        # Join all the extracted text into a single string representing the entire article
        ARTICLE = ' '.join(text)

        # Append a special token (<eos>) after sentence-ending punctuation; this makes it easy to split the article into sentences for chunking.
        ARTICLE = ARTICLE.replace('.', '.<eos>')
        ARTICLE = ARTICLE.replace('?', '?<eos>')
        ARTICLE = ARTICLE.replace('!', '!<eos>')

        # Splits the article into sentences based on the <eos> token and stores them in a list called sentences.
        sentences = ARTICLE.split('<eos>')

        # Sets the maximum length (in words) for each chunk of text during summarization.
        max_chunk = 500
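        # (facebook/bart-large-cnn accepts roughly 1024 input tokens, so a ~500-word chunk
        #  is assumed here to stay safely within that limit)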

        # Initializes a variable to keep track of the current chunk being processed
        current_chunk = 0

        # Creates an empty list called chunks to store the individual chunks of text
        chunks = []

        # Loop through each sentence in the sentences list and group sentences into chunks
        '''If a chunk already exists at the current_chunk index:
            - If the current chunk's word count plus the sentence's word count is at most
              max_chunk, the sentence's words are appended to the current chunk.
            - Otherwise, current_chunk is incremented and a new chunk is started from
              this sentence.
        If no chunk exists yet (the very first sentence), a new chunk is created from it.
        '''
        for sentence in sentences:
            if len(chunks) == current_chunk + 1:
                if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
                    chunks[current_chunk].extend(sentence.split(' '))
                else:
                    current_chunk += 1
                    chunks.append(sentence.split(' '))
            else:
                chunks.append(sentence.split(' '))

        ''' After processing all sentences, loop through each chunk and join its words
        back together so that each chunk is a single string (rather than a list of words).
        '''
        for chunk_id in range(len(chunks)):
            chunks[chunk_id] = ' '.join(chunks[chunk_id])

        # Apply summarization to each chunk, using the user-selected minimum and maximum lengths
        # (cast to int because Gradio sliders may return float values)
        res = summarizer(chunks, max_length=int(max_len), min_length=int(min_len), do_sample=False)
 
        # Extracting the 'summary_text' value from each summary in the res list
        summary = ' '.join([summ['summary_text'] for summ in res])
        return summary

    # Handle potential errors during web request or parsing
    except Exception as e: 
        return f"Error: {str(e)}"


# Create Gradio Interface
interface = gr.Interface(
    fn=summarize_article,
    inputs=[
        gr.Textbox(label="Enter the article URL"),
        gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length"),
        gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length")
    ],
    outputs=gr.Textbox(label="Summary")
)

interface.launch()
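
# When run as a script, launch() starts a local Gradio server and prints a local URL
# (by default something like http://127.0.0.1:7860). Passing share=True to launch()
# would additionally create a temporary public link, which can be handy for testing
# from another device.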