import streamlit as st
import PyPDF2
import torch
import numpy as np
import pandas as pd
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    pipeline,
    AutoModelForSequenceClassification
)
from sentence_transformers import SentenceTransformer
import random
import nltk
import ssl


def download_nltk_resources():
    """Download the NLTK resources this app needs (punkt, stopwords).

    Tries to disable SSL certificate verification first, because
    ``nltk.download()`` fails behind some proxies / self-signed
    certificates.  Any failure is surfaced as a Streamlit warning
    instead of crashing the app.
    """
    try:
        # Swap in an unverified HTTPS context if the interpreter exposes one;
        # older interpreters without the attribute are left untouched.
        try:
            _create_unverified_https_context = ssl._create_unverified_context
        except AttributeError:
            pass
        else:
            ssl._create_default_https_context = _create_unverified_https_context

        nltk.download('punkt', quiet=True)
        nltk.download('stopwords', quiet=True)
    except Exception as e:
        st.warning(f"Gagal download NLTK resources: {e}")
        st.warning("Silakan download manual atau gunakan metode alternatif")


# Run the download once at import time, before nltk.corpus is touched below.
download_nltk_resources()

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize


class AdvancedPDFAnalyzer:
    """PDF analysis helper built on summarization / QA / embedding models."""

    def __init__(self):
        # Heavy transformer pipelines; loaded eagerly on construction.
        # NOTE(review): consider wrapping model loading in @st.cache_resource
        # so Streamlit reruns do not reload the models — confirm app flow.
        self.summarizer = pipeline(
            "summarization", model="facebook/bart-large-cnn"
        )
        self.qa_model = pipeline(
            "question-answering", model="deepset/roberta-base-squad2"
        )
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

        # English stopwords; fall back to a small manual list when the
        # NLTK download above did not succeed.
        try:
            self.stop_words = set(stopwords.words('english'))
        except LookupError:
            st.warning("Stopwords tidak terdownload. Menggunakan daftar manual.")
            self.stop_words = {
                'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for',
                'from', 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on',
                'that', 'the', 'to', 'was', 'were', 'will', 'with'
            }

        # ... [remainder unchanged from the original code]
        # NOTE(review): main() calls self.run_advanced_app(), which lives in
        # the elided remainder — confirm it exists in the full file.

    def preprocess_text(self, text):
        """Advanced text preprocessing.

        Normalises whitespace, splits *text* into sentences, and keeps only
        sentences of moderate length (strictly between 10 and 50 words).

        Args:
            text: Raw text extracted from a PDF.

        Returns:
            A single string of the surviving sentences joined by spaces.
        """
        # PDF extraction leaves hard line breaks and tabs that confuse the
        # sentence tokenizer; flatten them to spaces first.
        text = text.replace('\n', ' ').replace('\t', ' ')

        try:
            sentences = sent_tokenize(text)
        except LookupError:
            # Fallback when the punkt model is unavailable: naive split on
            # '. '.  (Bug fix: the original split literal was corrupted by a
            # stray newline embedded inside the string.)
            sentences = text.split('. ')

        # Drop fragments (<= 10 words) and run-on sentences (>= 50 words);
        # both add noise to summarization and QA downstream.
        filtered_sentences = [
            sent for sent in sentences
            if 10 < len(sent.split()) < 50
        ]

        return ' '.join(filtered_sentences)


def main():
    """Streamlit entry point: configure the page and launch the analyzer."""
    st.set_page_config(
        page_title="LookupAI: PDF Analyzer",
        page_icon="📄",
        initial_sidebar_state="expanded"
    )

    analyzer = AdvancedPDFAnalyzer()
    analyzer.run_advanced_app()


if __name__ == "__main__":
    main()