destiratnakomala committed on
Commit
f2a3f35
·
verified ·
1 Parent(s): ee48719

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -82
app.py CHANGED
@@ -3,26 +3,15 @@ import os
3
  import pandas as pd
4
  from PyPDF2 import PdfReader
5
  import openai
6
- from collections import defaultdict
7
- from io import StringIO
8
- from pdfminer.high_level import extract_text
9
  import json
10
- from openai import OpenAI
11
- import re
12
  from dotenv import load_dotenv
13
- from pdfminer.pdfparser import PDFParser
14
- from pdfminer.pdfdocument import PDFDocument
15
- from pdfminer.pdfpage import PDFPage
16
- from pdfminer.layout import LAParams
17
- from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
18
- from pdfminer.converter import TextConverter
19
 
20
- # 1. Initialization
21
  load_dotenv()
22
  api_key = os.getenv('OPENAI_API_KEY')
23
  openai.api_key = api_key
24
- client = OpenAI(api_key=api_key)
25
- pdf_folder = "pdf"
26
 
27
  st.title("Mahkamah Agung: NER & Summarization of Legal Documents")
28
 
@@ -41,89 +30,68 @@ def get_pdf_details(folder_path):
41
  except Exception as e:
42
  st.warning(f"Could not read {filename}: {str(e)}")
43
  return pdf_details
 
44
  pdf_list = get_pdf_details(pdf_folder)
45
  pdf_df = pd.DataFrame(pdf_list)
 
46
  if not pdf_df.empty:
47
  with st.expander('PDF Overview'):
48
- st.dataframe(pdf_df)
49
  else:
50
  st.warning("No PDFs found in the specified folder.")
51
 
52
- #---------------------PDF SEARCH AND EXTRACT----------------------
53
- st.subheader("PDF to Text Conversion")
54
 
 
 
55
 
56
  def extract_text_from_pdf(uploaded_file, start_page, end_page):
57
  text = extract_text(uploaded_file, page_numbers=range(start_page, end_page+1))
58
  return text
59
 
60
- pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')]
61
- search_query = st.text_input("Search for a PDF")
62
- filtered_pdfs = [pdf for pdf in pdf_files if search_query.lower() in pdf.lower()]
63
-
64
- if filtered_pdfs:
65
- selected_pdf = st.selectbox("Select a PDF to convert to text", filtered_pdfs)
66
- else:
67
- st.warning("No PDFs found matching your search.")
68
- selected_pdf = None
69
 
70
-
71
- if selected_pdf:
72
  pdf_path = os.path.join(pdf_folder, selected_pdf)
73
  uploaded_file = open(pdf_path, 'rb')
74
- # Extract and display the first 3 pages
75
- start_page_first = 1
76
- end_page_first = 3
77
- extracted_text_first = extract_text_from_pdf(uploaded_file, start_page_first, end_page_first)
78
- # Determine total number of pages
79
  pdf_reader = PdfReader(uploaded_file)
80
  total_pages = len(pdf_reader.pages)
81
-
82
- # Extract and display the last 3 pages
83
- if total_pages > 3:
84
- start_page_last = max(1, total_pages - 2)
85
- end_page_last = total_pages
86
- extracted_text_last = extract_text_from_pdf(uploaded_file, start_page_last, end_page_last)
87
- # Join the extracted text
88
- extracted_text = extracted_text_first + "\n" + extracted_text_last if total_pages > 3 else extracted_text_first
89
-
90
- else:
91
- st.warning("Please select a PDF file.")
92
-
93
-
94
- #----------------------ANALYZE
95
-
96
- if st.button("Analyze The Document"):
97
-
98
- # Display the extracted text
99
- if extracted_text:
100
- with st.expander('Extracted Text'):
101
- st.text_area("Extracted Text", value=extracted_text, height=300)
102
- else:
103
- st.warning("No text extracted. The PDF might contain images or other non-text content.")
104
-
105
-
106
-
107
- template = """
108
- # Anda Adalah Seorang Hakim Agung Di Mahkamah Agung Di Indonesia. Berdasarkan Putusan Di Bawah Ini, Berikan Kesimpulannya:
109
- {}
110
- Variabel Yang Harus Ada Adalah Sebagai Berikut: Hakim Ketua, Hakim Anggota, Panitera, Putusan, Putusan Lainnya, Catatan Putusan, Tanggal Musyawarah, Tanggal Pembacaan, Jenis Institusi Yudisial, Tanggal Pendaftaran, Institusi Yudisial, Nomor Kasus, Pengadilan, Nama Terdakwa, Tempat Lahir Terdakwa, Tanggal Lahir Terdakwa, Usia Terdakwa, Jenis Kelamin Terdakwa, Kebangsaan Terdakwa, Agama Terdakwa, Pekerjaan Terdakwa, Pasal Dakwaan, Pelanggaran Dakwaan, Vonis Hukuman, Deskripsi Vonis Atribut Disita, Vonis Atribut Disita Berat, Denda, Dan Kesimpulan.
111
- # """
112
-
113
-
114
-
115
- #---------------------NER & SUMMARIZATION----------------------
116
- response = client.chat.completions.create(
117
- model="gpt-3.5-turbo-0125",
118
- response_format={ "type": "json_object" },
119
- messages=[
120
- {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
121
- {"role": "user", "content": template.format(extracted_text)}
122
- ]
123
- )
124
-
125
- data= json.loads(response.choices[0].message.content)
126
- df = pd.json_normalize(data)
127
- df=df.T
128
- df.columns = ["Kesimpulan Putusan"]
129
- st.dataframe(df)
 
3
  import pandas as pd
4
  from PyPDF2 import PdfReader
5
  import openai
6
+ from pdfminer.high_level import extract_text
 
 
7
  import json
 
 
8
  from dotenv import load_dotenv
 
 
 
 
 
 
9
 
10
+ # Initialize OpenAI API
11
  load_dotenv()
12
  api_key = os.getenv('OPENAI_API_KEY')
13
  openai.api_key = api_key
14
+ pdf_folder = "pdf"
 
15
 
16
  st.title("Mahkamah Agung: NER & Summarization of Legal Documents")
17
 
 
30
  except Exception as e:
31
  st.warning(f"Could not read {filename}: {str(e)}")
32
  return pdf_details
33
+
34
  pdf_list = get_pdf_details(pdf_folder)
35
  pdf_df = pd.DataFrame(pdf_list)
36
+
37
  if not pdf_df.empty:
38
  with st.expander('PDF Overview'):
39
+ st.dataframe(pdf_df)
40
  else:
41
  st.warning("No PDFs found in the specified folder.")
42
 
43
+ #---------------------MULTISELECT AND TEXT EXTRACTION----------------------
44
+ st.subheader("Select PDFs for Extraction and Analysis")
45
 
46
+ pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')]
47
+ selected_pdfs = st.multiselect("Select PDFs", pdf_files)
48
 
49
  def extract_text_from_pdf(uploaded_file, start_page, end_page):
50
  text = extract_text(uploaded_file, page_numbers=range(start_page, end_page+1))
51
  return text
52
 
53
+ pdf_texts = {}
 
 
 
 
 
 
 
 
54
 
55
+ for selected_pdf in selected_pdfs:
 
56
  pdf_path = os.path.join(pdf_folder, selected_pdf)
57
  uploaded_file = open(pdf_path, 'rb')
 
 
 
 
 
58
  pdf_reader = PdfReader(uploaded_file)
59
  total_pages = len(pdf_reader.pages)
60
+
61
+ # Extract text from the first 3 pages and the last 3 pages
62
+ extracted_text_first = extract_text_from_pdf(uploaded_file, 1, min(3, total_pages))
63
+ extracted_text_last = extract_text_from_pdf(uploaded_file, max(1, total_pages - 2), total_pages)
64
+
65
+ extracted_text = extracted_text_first + "\n" + extracted_text_last
66
+
67
+ pdf_texts[selected_pdf] = extracted_text
68
+
69
+ #---------------------ANALYZE AND SUMMARIZE----------------------
70
+ template = """
71
+ # Anda Adalah Seorang Hakim Agung Di Mahkamah Agung Di Indonesia. Berdasarkan Putusan Di Bawah Ini, Berikan Kesimpulannya:
72
+ {}
73
+ Variabel Yang Harus Ada Adalah Sebagai Berikut: Hakim Ketua, Hakim Anggota, Panitera, Putusan, Putusan Lainnya, Catatan Putusan, Tanggal Musyawarah, Tanggal Pembacaan, Jenis Institusi Yudisial, Tanggal Pendaftaran, Institusi Yudisial, Nomor Kasus, Pengadilan, Nama Terdakwa, Tempat Lahir Terdakwa, Tanggal Lahir Terdakwa, Usia Terdakwa, Jenis Kelamin Terdakwa, Kebangsaan Terdakwa, Agama Terdakwa, Pekerjaan Terdakwa, Pasal Dakwaan, Pelanggaran Dakwaan, Vonis Hukuman, Deskripsi Vonis Atribut Disita, Vonis Atribut Disita Berat, Denda, Dan Kesimpulan.
74
+ """
75
+
76
+ if st.button("Analyze Selected PDFs"):
77
+ summaries = []
78
+ for pdf_name, text in pdf_texts.items():
79
+ response = openai.ChatCompletion.create(
80
+ model="gpt-3.5-turbo",
81
+ messages=[
82
+ {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
83
+ {"role": "user", "content": template.format(text)}
84
+ ]
85
+ )
86
+
87
+ data = json.loads(response.choices[0].message.content)
88
+ df = pd.json_normalize(data)
89
+ df = df.T
90
+ df.columns = [f"Kesimpulan Putusan ({pdf_name})"]
91
+
92
+ summaries.append(df)
93
+
94
+ # Display the summaries for each selected PDF
95
+ for summary in summaries:
96
+ with st.expander(f"Summary for {summary.columns[0]}"):
97
+ st.dataframe(summary)