# Mahkamah Agung: NER & Summarization of Legal Documents

import streamlit as st
import os
import pandas as pd
from PyPDF2 import PdfReader
from PyPDF2 import PdfFileReader
from PyPDF2.utils import PdfReadError
import openai
from openai import OpenAI
from pdfminer.high_level import extract_text
import json
from dotenv import load_dotenv

# Initialize OpenAI API: load environment variables (including any .env
# file), then hand the same key to both the module-level setting and an
# explicit client instance.
load_dotenv()
api_key = os.environ.get('OPENAI_API_KEY')
openai.api_key = api_key
client = OpenAI(api_key=api_key)

# Folder name for PDFs (not referenced in the visible part of this script).
pdf_folder = "pdf"

# CSS for the centred title and the bold colored line beneath it.
# CSS only recognises /* ... */ comments. The previous "# ..." annotations
# were parsed as part of the next declaration, which caused the browser to
# drop `background-color` and `margin-top` from `.bold-colored-line`.
bold_line_css = """
<style>
    .centered-title {
        text-align: center;
        margin-bottom: 0;  /* No gap between the title and the line */
    }
    .bold-colored-line {
        border: none;
        height: 3px;  /* Thickness of the line */
        background-color: #FF6347;  /* Tomato color */
        margin-top: 0;  /* No gap at the top */
    }
</style>
"""

# Render the centred page title followed by a horizontal rule.
# NOTE: this uses an inline style on the <h1>; the `bold_line_css` stylesheet
# defined above is not injected by this call.
_title_html = """
    <h1 style='text-align: center;'>
        Mahkamah Agung: NER & Summarization of Legal Documents
    </h1>
    <hr>
    """
st.markdown(_title_html, unsafe_allow_html=True)


#---------------------PDF OVERVIEW----------------------
# Function to read PDF file
def read_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Whatever ``PdfReader`` accepts; the caller in this script
            passes a Streamlit uploaded file.

    Returns:
        The text of all pages concatenated in page order, or ``None`` when
        PyPDF2 raises ``PdfReadError`` (the error is shown with ``st.error``).
    """
    try:
        # Use PdfReader / .pages / extract_text(): the PdfFileReader,
        # numPages, getPage and extractText spellings are deprecated and
        # were removed in PyPDF2 3.0. The rest of this script already uses
        # PdfReader and .pages.
        pdf_reader = PdfReader(file)
        # `or ""` guards against a page that yields no text at all.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except PdfReadError as e:
        st.error(f"Error reading PDF: {e}")
        return None

# Sidebar
st.sidebar.title("PDF Viewer")

# File uploader for PDFs
uploaded_files = st.sidebar.file_uploader(
    "Upload PDF",
    type=["pdf"],
    accept_multiple_files=True,
)

# For every upload: show its name/size in the sidebar and, when text could
# be extracted, write that text to the main page.
for uploaded_file in uploaded_files or []:
    file_details = dict(Filename=uploaded_file.name, Filesize=uploaded_file.size)
    st.sidebar.write(file_details)
    pdf_text = read_pdf(uploaded_file)
    if not pdf_text:
        # read_pdf returned None (read error) or an empty string.
        continue
    st.write(pdf_text)


# ---------------------UPLOAD PDF AND TEXT EXTRACTION----------------------

def _summary_page_numbers(total_pages, head=3, tail=3):
    """Return zero-based indices of the first `head` and last `tail` pages.

    The result is ascending and free of duplicates, so a short document
    whose head and tail overlap is not extracted twice.
    """
    head_pages = range(min(head, total_pages))
    tail_pages = range(max(total_pages - tail, 0), total_pages)
    return sorted(set(head_pages) | set(tail_pages))


# NOTE: this section used to be wrapped in `with col2:`, but no `col2` is
# defined anywhere earlier in the script, so it raised NameError. The widgets
# are rendered in the main container instead.
with st.expander('PDF Documents'):
    # Restrict uploads to PDFs: every file is handed to PdfReader below.
    uploaded_files = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)

# Maps each uploaded file name to the text of its first and last pages.
pdf_texts = {}
for uploaded_file in uploaded_files or []:
    try:
        pdf_reader = PdfReader(uploaded_file)
        total_pages = len(pdf_reader.pages)
    except PdfReadError as e:
        # Same reporting style as read_pdf(); skip the unreadable file.
        st.error(f"Error reading PDF: {e}")
        continue

    # pdfminer's `page_numbers` is zero-indexed; the previous 1-based ranges
    # skipped the first page and pointed one past the last page.
    page_numbers = _summary_page_numbers(total_pages)
    if page_numbers:
        # PdfReader has already read from this stream; rewind it before
        # handing the same object to pdfminer.
        uploaded_file.seek(0)
        extracted_text = extract_text(uploaded_file, page_numbers=page_numbers)
    else:
        # An empty selection would make pdfminer fall back to "all pages";
        # a document with no pages simply has no text.
        extracted_text = ""
    pdf_texts[uploaded_file.name] = extracted_text


#---------------------ANALYZE AND SUMMARIZE----------------------


def get_template():
    """Return the prompt template used to summarise a court decision.

    Renders a "Use Custom Template" checkbox. When it is ticked, a text area
    is shown and its content is returned if non-empty. In every other case
    the built-in Indonesian template is returned; it contains one ``{}``
    placeholder where the document text is inserted by the caller.
    """
    # Default template
    default_template = """
    # Anda Adalah Seorang Hakim Agung Di Mahkamah Agung Di Indonesia. Berdasarkan Putusan Di Bawah Ini, Berikan Kesimpulannya:
    {}
    Variabel Yang Harus Ada Adalah Sebagai Berikut: 
    \'Hakim Ketua\', \'Hakim Anggota\', \'Panitera\', \'Putusan\', \'Putusan Lainnya\', \'Catatan Putusan\', \'Tanggal Musyawarah\', \'Tanggal Pembacaan\', \'Jenis Institusi Yudisial\', \'Tanggal Pendaftaran\', \'Institusi Yudisial\', \'Nomor Kasus\', \'Pengadilan\', \'Nama Terdakwa\', \'Tempat Lahir Terdakwa\', \'Tanggal Lahir Terdakwa\', \'Usia Terdakwa\', \'Jenis Kelamin Terdakwa\', \'Kebangsaan Terdakwa\', \'Agama Terdakwa\', \'Pekerjaan Terdakwa\', \'Pasal Dakwaan\',\' Pelanggaran Dakwaan\', \'Vonis Hukuman\', \'Deskripsi Vonis Atribut Disita\', \'Vonis Atribut Disita Berat\', \'Denda\', dan,  \'Kesimpulan\'.
    # """

    if not st.checkbox("Use Custom Template"):
        return default_template

    # Text input for custom template; an empty box falls back to the default.
    custom_template = st.text_area("Input Your Template Here:")
    return custom_template if custom_template else default_template

def _build_prompt(template, text):
    """Insert `text` into `template` at its ``{}`` placeholder.

    Falls back to a plain ``{}`` substitution when the template contains
    braces that ``str.format`` cannot interpret (for example a JSON sample
    in a custom template), and appends the text when the template has no
    placeholder so the document is never silently left out of the prompt.
    """
    try:
        prompt = template.format(text)
    except (IndexError, KeyError, ValueError):
        prompt = template.replace("{}", text)
    if text not in prompt:
        prompt = f"{prompt}\n\n{text}"
    return prompt


# Get the template
template = get_template()


if st.button("📝Process"):
    summaries = []
    if not pdf_texts:
        st.warning("No PDF text available. Upload at least one PDF first.")

    for pdf_name, text in pdf_texts.items():
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                # JSON mode: without it the reply may be prose or fenced
                # markdown, which json.loads below cannot parse.
                response_format={"type": "json_object"},
                messages=[
                    {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
                    {"role": "user", "content": _build_prompt(template, text)},
                ],
            )
        except openai.OpenAIError as e:
            # Report the failure and carry on with the remaining documents.
            st.error(f"OpenAI request failed for {pdf_name}: {e}")
            continue

        try:
            data = json.loads(response.choices[0].message.content)
        except (TypeError, json.JSONDecodeError) as e:
            # TypeError covers a reply whose content is None.
            st.error(f"Could not parse the model output for {pdf_name} as JSON: {e}")
            continue
        if not isinstance(data, dict):
            # The single-column layout below assumes one JSON object.
            st.error(f"Unexpected JSON structure returned for {pdf_name}.")
            continue

        # One row of flattened fields, transposed into a single column
        # labelled with the source file name.
        df = pd.json_normalize(data).T
        df.columns = [f"Kesimpulan Putusan ({pdf_name})"]
        summaries.append(df)

    # Flag in session state that at least one summary was produced.
    if summaries:
        st.session_state.summaries = True

    # Display the summaries for each selected PDF
    for summary in summaries:
        with st.expander(f"{summary.columns[0]}"):
            st.dataframe(summary)