Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -3,26 +3,15 @@ import os
|
|
3 |
import pandas as pd
|
4 |
from PyPDF2 import PdfReader
|
5 |
import openai
|
6 |
-
from
|
7 |
-
from io import StringIO
|
8 |
-
from pdfminer.high_level import extract_text
|
9 |
import json
|
10 |
-
from openai import OpenAI
|
11 |
-
import re
|
12 |
from dotenv import load_dotenv
|
13 |
-
from pdfminer.pdfparser import PDFParser
|
14 |
-
from pdfminer.pdfdocument import PDFDocument
|
15 |
-
from pdfminer.pdfpage import PDFPage
|
16 |
-
from pdfminer.layout import LAParams
|
17 |
-
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
18 |
-
from pdfminer.converter import TextConverter
|
19 |
|
20 |
-
#
|
21 |
load_dotenv()
|
22 |
api_key = os.getenv('OPENAI_API_KEY')
|
23 |
openai.api_key = api_key
|
24 |
-
|
25 |
-
pdf_folder = "pdf"
|
26 |
|
27 |
st.title("Mahkamah Agung: NER & Summarization of Legal Documents")
|
28 |
|
@@ -41,89 +30,68 @@ def get_pdf_details(folder_path):
|
|
41 |
except Exception as e:
|
42 |
st.warning(f"Could not read {filename}: {str(e)}")
|
43 |
return pdf_details
|
|
|
44 |
pdf_list = get_pdf_details(pdf_folder)
|
45 |
pdf_df = pd.DataFrame(pdf_list)
|
|
|
46 |
if not pdf_df.empty:
|
47 |
with st.expander('PDF Overview'):
|
48 |
-
|
49 |
else:
|
50 |
st.warning("No PDFs found in the specified folder.")
|
51 |
|
52 |
-
#---------------------
|
53 |
-
st.subheader("
|
54 |
|
|
|
|
|
55 |
|
56 |
def extract_text_from_pdf(uploaded_file, start_page, end_page):
    """Return the text of pages ``start_page``..``end_page`` of a PDF.

    Page numbers are 1-based and inclusive, matching how this script calls
    the function (page 1 is the first page of the document).

    Args:
        uploaded_file: Path to a PDF, or a binary file object opened on one.
        start_page: First page to extract (1-based). Values below 1 are
            treated as 1.
        end_page: Last page to extract (1-based, inclusive).

    Returns:
        The extracted text, or an empty string when the requested range
        contains no pages.
    """
    # pdfminer's ``page_numbers`` is zero-indexed, so shift the 1-based
    # inclusive range [start_page, end_page] to [start_page - 1, end_page).
    first_index = max(start_page, 1) - 1
    if end_page <= first_index:
        # NOTE(review): pdfminer appears to treat an empty page selection as
        # "no filter" and extract the whole document, so return early here.
        return ""

    # The same handle is reused across calls (and by PdfReader), so start
    # from a known position. Plain path strings have no ``seek``.
    if hasattr(uploaded_file, "seek"):
        uploaded_file.seek(0)

    return extract_text(uploaded_file, page_numbers=range(first_index, end_page))
|
59 |
|
60 |
-
|
61 |
-
search_query = st.text_input("Search for a PDF")
|
62 |
-
filtered_pdfs = [pdf for pdf in pdf_files if search_query.lower() in pdf.lower()]
|
63 |
-
|
64 |
-
if filtered_pdfs:
|
65 |
-
selected_pdf = st.selectbox("Select a PDF to convert to text", filtered_pdfs)
|
66 |
-
else:
|
67 |
-
st.warning("No PDFs found matching your search.")
|
68 |
-
selected_pdf = None
|
69 |
|
70 |
-
|
71 |
-
if selected_pdf:
|
72 |
pdf_path = os.path.join(pdf_folder, selected_pdf)
|
73 |
uploaded_file = open(pdf_path, 'rb')
|
74 |
-
# Extract and display the first 3 pages
|
75 |
-
start_page_first = 1
|
76 |
-
end_page_first = 3
|
77 |
-
extracted_text_first = extract_text_from_pdf(uploaded_file, start_page_first, end_page_first)
|
78 |
-
# Determine total number of pages
|
79 |
pdf_reader = PdfReader(uploaded_file)
|
80 |
total_pages = len(pdf_reader.pages)
|
81 |
-
|
82 |
-
# Extract
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
messages=[
|
120 |
-
{"role": "system", "content": "You are a helpful assistant designed to output JSON."},
|
121 |
-
{"role": "user", "content": template.format(extracted_text)}
|
122 |
-
]
|
123 |
-
)
|
124 |
-
|
125 |
-
data= json.loads(response.choices[0].message.content)
|
126 |
-
df = pd.json_normalize(data)
|
127 |
-
df=df.T
|
128 |
-
df.columns = ["Kesimpulan Putusan"]
|
129 |
-
st.dataframe(df)
|
|
|
3 |
import pandas as pd
|
4 |
from PyPDF2 import PdfReader
|
5 |
import openai
|
6 |
+
from pdfminer.high_level import extract_text
|
|
|
|
|
7 |
import json
|
|
|
|
|
8 |
from dotenv import load_dotenv
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
+
# Initialize OpenAI API
|
11 |
load_dotenv()
|
12 |
api_key = os.getenv('OPENAI_API_KEY')
|
13 |
openai.api_key = api_key
|
14 |
+
pdf_folder = "pdf"
|
|
|
15 |
|
16 |
st.title("Mahkamah Agung: NER & Summarization of Legal Documents")
|
17 |
|
|
|
30 |
except Exception as e:
|
31 |
st.warning(f"Could not read {filename}: {str(e)}")
|
32 |
return pdf_details
|
33 |
+
|
34 |
pdf_list = get_pdf_details(pdf_folder)
|
35 |
pdf_df = pd.DataFrame(pdf_list)
|
36 |
+
|
37 |
if not pdf_df.empty:
|
38 |
with st.expander('PDF Overview'):
|
39 |
+
st.dataframe(pdf_df)
|
40 |
else:
|
41 |
st.warning("No PDFs found in the specified folder.")
|
42 |
|
43 |
+
#---------------------MULTISELECT AND TEXT EXTRACTION----------------------
|
44 |
+
st.subheader("Select PDFs for Extraction and Analysis")
|
45 |
|
46 |
+
pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')]
|
47 |
+
selected_pdfs = st.multiselect("Select PDFs", pdf_files)
|
48 |
|
49 |
def extract_text_from_pdf(uploaded_file, start_page, end_page):
    """Return the text of pages ``start_page``..``end_page`` of a PDF.

    Page numbers are 1-based and inclusive, matching how this script calls
    the function (page 1 is the first page of the document).

    Args:
        uploaded_file: Path to a PDF, or a binary file object opened on one.
        start_page: First page to extract (1-based). Values below 1 are
            treated as 1.
        end_page: Last page to extract (1-based, inclusive).

    Returns:
        The extracted text, or an empty string when the requested range
        contains no pages.
    """
    # pdfminer's ``page_numbers`` is zero-indexed, so shift the 1-based
    # inclusive range [start_page, end_page] to [start_page - 1, end_page).
    first_index = max(start_page, 1) - 1
    if end_page <= first_index:
        # NOTE(review): pdfminer appears to treat an empty page selection as
        # "no filter" and extract the whole document, so return early here.
        return ""

    # The same handle is reused across calls (and by PdfReader), so start
    # from a known position. Plain path strings have no ``seek``.
    if hasattr(uploaded_file, "seek"):
        uploaded_file.seek(0)

    return extract_text(uploaded_file, page_numbers=range(first_index, end_page))
|
52 |
|
53 |
+
pdf_texts = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
+
# Build ``pdf_texts``: for every selected PDF, keep the text of its first
# and last (up to) three pages, keyed by file name.
for selected_pdf in selected_pdfs:
    pdf_path = os.path.join(pdf_folder, selected_pdf)

    try:
        # The context manager closes the handle even if parsing fails.
        with open(pdf_path, 'rb') as uploaded_file:
            pdf_reader = PdfReader(uploaded_file)
            total_pages = len(pdf_reader.pages)

            # Head: pages 1..3, or fewer for short documents.
            first_end = min(3, total_pages)
            extracted_text_first = extract_text_from_pdf(uploaded_file, 1, first_end)

            # Tail: the last three pages, but never pages already covered by
            # the head, so documents under 6 pages are not sent twice.
            last_start = max(first_end + 1, total_pages - 2)
            if last_start <= total_pages:
                extracted_text_last = extract_text_from_pdf(
                    uploaded_file, last_start, total_pages
                )
            else:
                extracted_text_last = ""
    except Exception as e:
        # Same best-effort policy as get_pdf_details: report and move on
        # instead of crashing the whole app on one unreadable file.
        st.warning(f"Could not read {selected_pdf}: {str(e)}")
        continue

    extracted_text = extracted_text_first + "\n" + extracted_text_last

    pdf_texts[selected_pdf] = extracted_text
|
68 |
+
|
69 |
+
#---------------------ANALYZE AND SUMMARIZE----------------------
|
70 |
+
template = """
|
71 |
+
# Anda Adalah Seorang Hakim Agung Di Mahkamah Agung Di Indonesia. Berdasarkan Putusan Di Bawah Ini, Berikan Kesimpulannya:
|
72 |
+
{}
|
73 |
+
Variabel Yang Harus Ada Adalah Sebagai Berikut: Hakim Ketua, Hakim Anggota, Panitera, Putusan, Putusan Lainnya, Catatan Putusan, Tanggal Musyawarah, Tanggal Pembacaan, Jenis Institusi Yudisial, Tanggal Pendaftaran, Institusi Yudisial, Nomor Kasus, Pengadilan, Nama Terdakwa, Tempat Lahir Terdakwa, Tanggal Lahir Terdakwa, Usia Terdakwa, Jenis Kelamin Terdakwa, Kebangsaan Terdakwa, Agama Terdakwa, Pekerjaan Terdakwa, Pasal Dakwaan, Pelanggaran Dakwaan, Vonis Hukuman, Deskripsi Vonis Atribut Disita, Vonis Atribut Disita Berat, Denda, Dan Kesimpulan.
|
74 |
+
"""
|
75 |
+
|
76 |
+
# On click, send each extracted text to the chat model and show the JSON
# answer as a one-column table per PDF.
if st.button("Analyze Selected PDFs"):
    summaries = []

    if not pdf_texts:
        st.warning("No PDFs selected for analysis.")

    for pdf_name, text in pdf_texts.items():
        # NOTE(review): ``openai.ChatCompletion`` only exists in openai<1.0;
        # confirm the pinned library version, since newer releases need the
        # client-based API instead.
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
                {"role": "user", "content": template.format(text)}
            ]
        )

        raw_reply = response.choices[0].message.content

        # The model is only asked to produce JSON, not forced to, so a
        # malformed reply must not take the whole page down.
        try:
            data = json.loads(raw_reply)
        except (json.JSONDecodeError, TypeError) as e:
            st.warning(f"Could not parse the model response for {pdf_name}: {str(e)}")
            continue

        # Only a JSON object normalizes to a single row; anything else would
        # break the single-column rename below.
        if not isinstance(data, dict):
            st.warning(f"Unexpected response format for {pdf_name}; expected a JSON object.")
            continue

        df = pd.json_normalize(data)
        df = df.T
        df.columns = [f"Kesimpulan Putusan ({pdf_name})"]

        summaries.append(df)

    # Display the summaries for each selected PDF
    for summary in summaries:
        with st.expander(f"Summary for {summary.columns[0]}"):
            st.dataframe(summary)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|