Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from flask import Flask, request, render_template, jsonify
|
2 |
+
import torch
|
3 |
+
from nltk.tokenize import word_tokenize
|
4 |
+
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, PegasusTokenizerFast, T5Tokenizer, T5ForConditionalGeneration, MBartForConditionalGeneration, MBart50TokenizerFast
|
5 |
+
from LDict import find_legal_terms, legal_terms_lower
|
6 |
+
import nltk
|
7 |
+
import re
|
8 |
+
import logging
|
9 |
+
# Root logger only emits ERROR and above, so logging.info(...) calls are
# silent unless this level is lowered.
logging.basicConfig(level=logging.ERROR)

# Tokenizer resources for NLTK's word_tokenize, fetched at import time.
nltk.download('punkt')
nltk.download('punkt_tab')

app = Flask(__name__)

# Run the models on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Apple-silicon alternative, kept for local development:
# device = "mps" if torch.backends.mps.is_available() else "cpu"

# Method 1 model
pegasus_ckpt = "google/pegasus-cnn_dailymail"
tokenizer_pegasus = AutoTokenizer.from_pretrained(pegasus_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(pegasus_ckpt).to(device)

# Method 2 model
# NOTE(review): the checkpoint name says "portuguese"; confirm it is the
# intended model for the language of the submitted text.
port_ckpt = "stjiris/t5-portuguese-legal-summarization"
port_tokenizer = AutoTokenizer.from_pretrained(port_ckpt)
model_port = AutoModelForSeq2SeqLM.from_pretrained(port_ckpt).to(device)

# Paraphrase model
t5_ckpt = "t5-base"
tokenizer_t5 = T5Tokenizer.from_pretrained(t5_ckpt)
model_t5 = T5ForConditionalGeneration.from_pretrained(t5_ckpt).to(device)

# Translation model (the tokenizer's source language is fixed to en_XX)
mbart_ckpt = "facebook/mbart-large-50-one-to-many-mmt"
tokenizer_mbart = MBart50TokenizerFast.from_pretrained(mbart_ckpt, src_lang="en_XX")
model_mbart = MBartForConditionalGeneration.from_pretrained(mbart_ckpt).to(device)
|
37 |
+
|
38 |
+
|
39 |
+
def simplify_text(input_text):
    """Append a parenthesised annotation to each matched term in *input_text*.

    The text is split with ``word_tokenize``. Every token whose lowercase
    form is contained in the result of ``find_legal_terms(input_text)`` is
    rewritten as ``"token (value)"``, where ``value`` is the entry stored for
    that lowercase form in ``legal_terms_lower`` (presumably a plain-language
    explanation; confirm against ``LDict``). All tokens are then joined with
    single spaces.
    """
    found_terms = find_legal_terms(input_text)
    annotated = []
    for word in word_tokenize(input_text):
        key = word.lower()
        if key in found_terms:
            annotated.append(f"{word} ({legal_terms_lower[key]})")
        else:
            annotated.append(word)
    return ' '.join(annotated)
|
44 |
+
|
45 |
+
def remove_parentheses(text):
    """Strip parenthesis characters from *text* and tidy the result.

    Three passes are applied in order:

    1. Every ``(`` and ``)`` character is deleted; the text that was inside
       the parentheses is kept.
    2. Runs of whitespace collapse to one space and both ends are trimmed.
    3. An article (``the``, ``a``, ``an``) repeated back-to-back, compared
       case-insensitively, is reduced to its first occurrence. A run of any
       length is collapsed ("the the the" -> "the"), not only a single pair.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_parens = re.sub(r"[()]", "", text)
    normalized = re.sub(r"\s+", " ", without_parens).strip()
    # "(?:\s+\1)+" consumes the whole run of repeats in one match. Matching a
    # single pair would leave "the the" behind for a triple, because re.sub
    # never rescans text it has already replaced.
    return re.sub(
        r"\b(the|a|an)(?:\s+\1)+\b", r"\1", normalized, flags=re.IGNORECASE
    )
|
50 |
+
|
51 |
+
def summarize_text(text, method):
    """Summarize *text* with the approach selected by *method*.

    Args:
        text: The text to summarize.
        method: ``"method1"`` runs a ``summarization`` pipeline built from
            ``model_pegasus`` / ``tokenizer_pegasus`` and returns its output
            after ``remove_parentheses``. ``"method2"`` generates a summary
            with ``model_port``, passes it through ``remove_parentheses``,
            feeds the result to ``model_t5`` and returns that model's output.

    Returns:
        The summary string.

    Raises:
        ValueError: If *method* is neither ``"method1"`` nor ``"method2"``.
    """
    if method == "method2":
        # Summary model 2
        encoded = port_tokenizer(text, max_length=1024, truncation=True, return_tensors="pt")
        # Tokenizers return CPU tensors; move them to the model's device so
        # generate() does not fail with a device mismatch when running on GPU.
        legal_input_ids = encoded["input_ids"].to(model_port.device)
        legal_summary_ids = model_port.generate(legal_input_ids, max_length=250, num_beams=4, early_stopping=True)
        raw_summary = port_tokenizer.decode(legal_summary_ids[0], skip_special_tokens=True)
        print("\n\n\n Summarized MEthod2",raw_summary, "\n\n\n\n")
        cleaned_summary = remove_parentheses(raw_summary)
        print("\n\n\n Cleaned Summarized MEthod2",cleaned_summary, "\n\n\n\n")

        # Paraphrase
        paraphrase_ids = tokenizer_t5.encode(cleaned_summary, return_tensors="pt", max_length=512, truncation=True)
        paraphrase_ids = paraphrase_ids.to(model_t5.device)
        generated_ids = model_t5.generate(paraphrase_ids, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
        paraphrased = tokenizer_t5.decode(generated_ids[0], skip_special_tokens=True)
        print("\n\n\n Summarized Paraphrased MEthod2",paraphrased, "\n\n\n\n")
        return paraphrased

    if method == "method1":
        summarization_pipeline = pipeline('summarization', model=model_pegasus, tokenizer=tokenizer_pegasus, device=0 if device == "cuda" else -1)
        raw_summary = summarization_pipeline(text, max_length=100, min_length=30, truncation=True)[0]['summary_text']
        print("\n\n\n Summarized MEthod1",raw_summary, "\n\n\n\n")
        cleaned_summary = remove_parentheses(raw_summary)
        print("\n\n\n Summarized Cleaned MEthod1",cleaned_summary, "\n\n\n\n")
        return cleaned_summary

    # Falling through used to return None silently, which reached the client
    # as a null summary; fail loudly so the caller can report the bad value.
    raise ValueError(f"Unknown summarization method: {method!r}")
|
74 |
+
|
75 |
+
|
76 |
+
def translate_to_hindi(text):
    """Translate *text* to Hindi with ``model_mbart``.

    Args:
        text: The text to translate. It is encoded by ``tokenizer_mbart``,
            which this module creates with ``src_lang="en_XX"``.

    Returns:
        The decoded translation of the single input sequence.
    """
    encoded = tokenizer_mbart([text], return_tensors="pt", padding=True, truncation=True)
    # The tokenizer returns CPU tensors; move the whole batch to the model's
    # device so generate() works when the model lives on the GPU.
    encoded = encoded.to(model_mbart.device)
    generated = model_mbart.generate(
        **encoded,
        # Forcing the first generated token to the Hindi language code
        # selects the target language.
        forced_bos_token_id=tokenizer_mbart.lang_code_to_id["hi_IN"],
    )

    # Only one sequence was submitted, so decode the first (and only) result.
    return tokenizer_mbart.decode(generated[0], skip_special_tokens=True)
|
83 |
+
|
84 |
+
@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the input page (GET) or summarize submitted text (POST).

    A POST must carry the form fields ``input_text`` and ``method``. The text
    is passed through ``simplify_text`` and then ``summarize_text``.

    Returns:
        GET: the rendered ``index.html`` template.
        POST: JSON ``{"summarized_text": ...}`` on success; JSON
        ``{"error": ...}`` with status 400 when a form field is missing, or
        with status 500 when processing fails.
    """
    if request.method != 'POST':
        return render_template('index.html')

    input_text = request.form.get('input_text')
    method = request.form.get('method')
    if input_text is None or method is None:
        # A missing field is a client mistake: answer 400 instead of letting
        # the lookup error fall into the generic 500 handler below.
        return jsonify({"error": "Form fields 'input_text' and 'method' are required."}), 400

    try:
        logging.info("Received text for summarization: %s", input_text)
        simplified_text = simplify_text(input_text)
        logging.info("Simplified text: %s", simplified_text)
        summarized_text = summarize_text(simplified_text, method)
        logging.info("Summarized text: %s", summarized_text)

        return jsonify({
            "summarized_text": summarized_text,
        })
    except Exception as e:
        # Top-level boundary: log the traceback and report the failure as JSON.
        logging.error(f"Error occurred: {e}", exc_info=True)
        return jsonify({"error": str(e)}), 500
|
104 |
+
|
105 |
+
@app.route('/translate', methods=['POST'])
def translate():
    """Translate the ``text`` field of a JSON request body to Hindi.

    Returns:
        JSON ``{"translated_text": ...}`` on success; JSON ``{"error": ...}``
        with status 400 when the body is not a JSON object containing
        ``text``, or with status 500 when translation fails.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so bad requests can be answered with 400 rather than 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'text' not in data:
        return jsonify({"error": "Request body must be a JSON object with a 'text' field."}), 400

    try:
        logging.info("Received data for translation: %s", data)
        translated_text = translate_to_hindi(data['text'])

        return jsonify({
            "translated_text": translated_text
        })
    except Exception as e:
        # Top-level boundary: log the traceback and report the failure as JSON.
        logging.error(f"Error occurred during translation: {e}", exc_info=True)
        return jsonify({"error": str(e)}), 500
|
119 |
+
|
120 |
+
|
121 |
+
if __name__ == '__main__':
    # Script entry point: start Flask's built-in server on port 5003.
    app.run(
        port=5003,
    )
|
123 |
+
|