OmPatel committed on
Commit
1ec29a2
·
verified ·
1 Parent(s): 98cf2fa

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -0
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, render_template, jsonify
2
+ import torch
3
+ from nltk.tokenize import word_tokenize
4
+ from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, PegasusTokenizerFast, T5Tokenizer, T5ForConditionalGeneration, MBartForConditionalGeneration, MBart50TokenizerFast
5
+ from LDict import find_legal_terms, legal_terms_lower
6
+ import nltk
7
+ import re
8
+ import logging
9
# Only ERROR and above are emitted, so the logging.info calls made by the
# route handlers stay silent unless this level is lowered.
logging.basicConfig(level=logging.ERROR)

# Punkt tokenizer data used by nltk's word_tokenize.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

app = Flask(__name__)

# Run on the GPU when CUDA is available, otherwise on the CPU.
# (An Apple "mps" backend variant was tried here and left disabled.)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Method 1 model: Pegasus summarizer.
pegasus_ckpt = "google/pegasus-cnn_dailymail"
tokenizer_pegasus = AutoTokenizer.from_pretrained(pegasus_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(pegasus_ckpt)
model_pegasus = model_pegasus.to(device)

# Method 2 model: T5 legal summarizer.
port_ckpt = "stjiris/t5-portuguese-legal-summarization"
port_tokenizer = AutoTokenizer.from_pretrained(port_ckpt)
model_port = AutoModelForSeq2SeqLM.from_pretrained(port_ckpt)
model_port = model_port.to(device)

# Paraphrase model applied to the method 2 summary.
t5_ckpt = "t5-base"
tokenizer_t5 = T5Tokenizer.from_pretrained(t5_ckpt)
model_t5 = T5ForConditionalGeneration.from_pretrained(t5_ckpt)
model_t5 = model_t5.to(device)

# Translation model; the source language is fixed to English.
mbart_ckpt = "facebook/mbart-large-50-one-to-many-mmt"
tokenizer_mbart = MBart50TokenizerFast.from_pretrained(mbart_ckpt, src_lang="en_XX")
model_mbart = MBartForConditionalGeneration.from_pretrained(mbart_ckpt)
model_mbart = model_mbart.to(device)
37
+
38
+
39
def simplify_text(input_text):
    """Annotate legal terms in *input_text* with their glossary entry.

    The text is split with ``word_tokenize``. Every token whose lower-cased
    form is among the matches returned by ``find_legal_terms`` is rewritten
    as ``"<token> (<entry>)"``, where the entry is looked up in
    ``legal_terms_lower``. The tokens are then joined with single spaces.
    """
    found_terms = find_legal_terms(input_text)
    annotated = []
    for word in word_tokenize(input_text):
        if word.lower() in found_terms:
            annotated.append(f"{word} ({legal_terms_lower[word.lower()]})")
        else:
            annotated.append(word)
    return ' '.join(annotated)
44
+
45
def remove_parentheses(text):
    """Strip parenthesis characters and tidy the text that remains.

    Three passes are applied in order:

    1. every ``(`` and ``)`` character is deleted (the text between them is
       kept);
    2. runs of whitespace are collapsed to one space and the ends trimmed;
    3. a run of the same article ("the", "a", "an"), compared
       case-insensitively, is reduced to its first occurrence.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_parens = re.sub(r"[()]", "", text)
    collapsed = re.sub(r"\s+", " ", without_parens).strip()
    # The repeated group collapses a whole run ("the the the" -> "the");
    # matching a single repetition would leave "the the" behind.
    return re.sub(
        r"\b(the|a|an)(?:\s+\1\b)+",
        r"\1",
        collapsed,
        flags=re.IGNORECASE,
    )
50
+
51
# Built on first use and reused afterwards, so requests do not pay for
# constructing a new pipeline each time.
_pegasus_pipeline = None


def _get_pegasus_pipeline():
    """Return the Pegasus summarization pipeline, creating it on first use."""
    global _pegasus_pipeline
    if _pegasus_pipeline is None:
        _pegasus_pipeline = pipeline(
            'summarization',
            model=model_pegasus,
            tokenizer=tokenizer_pegasus,
            device=0 if device == "cuda" else -1,
        )
    return _pegasus_pipeline


def summarize_text(text, method):
    """Summarize *text* with the selected method.

    Args:
        text: The text to summarize.
        method: ``"method1"`` runs the Pegasus summarization pipeline and
            cleans the result with ``remove_parentheses``. ``"method2"``
            summarizes with ``model_port``, cleans the result the same way,
            then passes it through ``model_t5``.

    Returns:
        The summary string.

    Raises:
        ValueError: If *method* is neither ``"method1"`` nor ``"method2"``.
    """
    if method == "method2":
        # Summary model 2. The encoded inputs are moved to the model's
        # device; leaving them on the CPU fails when the model is on CUDA.
        inputs_legal = port_tokenizer(
            text, max_length=1024, truncation=True, return_tensors="pt"
        ).to(device)
        summary_ids_legal = model_port.generate(
            inputs_legal["input_ids"],
            max_length=250,
            num_beams=4,
            early_stopping=True,
        )
        Summarized_method2 = port_tokenizer.decode(summary_ids_legal[0], skip_special_tokens=True)
        print("\n\n\n Summarized MEthod2",Summarized_method2, "\n\n\n\n")
        cleaned_summary2 = remove_parentheses(Summarized_method2)
        print("\n\n\n Cleaned Summarized MEthod2",cleaned_summary2, "\n\n\n\n")

        # Paraphrase step.
        # NOTE(review): t5-base receives the text without a task prefix;
        # confirm this produces the intended paraphrase.
        p_inputs = tokenizer_t5.encode(
            cleaned_summary2, return_tensors="pt", max_length=512, truncation=True
        ).to(device)
        p_summary_ids = model_t5.generate(
            p_inputs,
            max_length=150,
            min_length=50,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )
        method2 = tokenizer_t5.decode(p_summary_ids[0], skip_special_tokens=True)
        print("\n\n\n Summarized Paraphrased MEthod2",method2, "\n\n\n\n")
        return method2

    if method == "method1":
        summarization_pipeline = _get_pegasus_pipeline()
        method1 = summarization_pipeline(text, max_length=100, min_length=30, truncation=True)[0]['summary_text']
        print("\n\n\n Summarized MEthod1",method1, "\n\n\n\n")
        cleaned_summary1 = remove_parentheses(method1)
        print("\n\n\n Summarized Cleaned MEthod1",cleaned_summary1, "\n\n\n\n")
        return cleaned_summary1

    # Previously an unknown method fell through and returned None, which the
    # caller serialized as a null summary.
    raise ValueError(f"Unknown summarization method: {method!r}")
74
+
75
+
76
def translate_to_hindi(text):
    """Translate *text* to Hindi with the mBART-50 model.

    Args:
        text: The text to translate. The tokenizer is configured elsewhere
            in this file with ``src_lang="en_XX"``.

    Returns:
        The decoded translation of the single input sequence.
    """
    # Move the encoded batch to the model's device; leaving it on the CPU
    # fails when the model is on CUDA.
    inputs = tokenizer_mbart(
        [text], return_tensors="pt", padding=True, truncation=True
    ).to(device)
    translated_tokens = model_mbart.generate(
        **inputs,
        # Forces the decoder to start with the Hindi language code.
        forced_bos_token_id=tokenizer_mbart.lang_code_to_id["hi_IN"],
    )

    # Only one sequence was submitted, so take the first generated one.
    return tokenizer_mbart.decode(translated_tokens[0], skip_special_tokens=True)
83
+
84
@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the landing page (GET) or summarize submitted text (POST).

    A POST reads the ``input_text`` and ``method`` form fields, annotates the
    text with ``simplify_text`` and summarizes it with ``summarize_text``.

    Returns:
        GET: the rendered ``index.html`` template.
        POST: JSON ``{"summarized_text": ...}`` on success,
        ``{"error": ...}`` with status 400 when a form field is missing or
        invalid, or ``{"error": ...}`` with status 500 when processing fails.
    """
    if request.method != 'POST':
        return render_template('index.html')

    # Validate before the try block so bad requests get 400, not 500.
    input_text = request.form.get('input_text', '')
    method = request.form.get('method')
    if not input_text.strip():
        return jsonify({"error": "Form field 'input_text' is required"}), 400
    if method not in ("method1", "method2"):
        return jsonify({"error": f"Unknown summarization method: {method!r}"}), 400

    try:
        logging.info("Received text to summarize: %s", input_text)

        simplified_text = simplify_text(input_text)
        logging.info("Simplified text: %s", simplified_text)

        summarized_text = summarize_text(simplified_text, method)
        logging.info("Summarized text: %s", summarized_text)

        return jsonify({
            "summarized_text": summarized_text,
        })
    except Exception as e:
        # Top-level boundary: log the traceback and report the failure as JSON.
        logging.error(f"Error occurred: {e}", exc_info=True)
        return jsonify({"error": str(e)}), 500
104
+
105
@app.route('/translate', methods=['POST'])
def translate():
    """Translate the ``text`` field of a JSON request body to Hindi.

    Returns:
        JSON ``{"translated_text": ...}`` on success, ``{"error": ...}`` with
        status 400 when the body is not a JSON object with a non-empty
        string ``text``, or ``{"error": ...}`` with status 500 when
        translation fails.
    """
    # silent=True yields None for a missing or malformed JSON body, so the
    # problem can be reported as a 400 instead of a 500.
    data = request.get_json(silent=True)
    logging.info("Received data for translation: %s", data)

    text = data.get('text') if isinstance(data, dict) else None
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "JSON body must contain a non-empty 'text' string"}), 400

    try:
        translated_text = translate_to_hindi(text)

        return jsonify({
            "translated_text": translated_text
        })
    except Exception as e:
        # Top-level boundary: log the traceback and report the failure as JSON.
        logging.error(f"Error occurred during translation: {e}", exc_info=True)
        return jsonify({"error": str(e)}), 500
119
+
120
+
121
# Start Flask's built-in server on port 5003 when run as a script.
if __name__ == "__main__":
    app.run(port=5003)
123
+