Spaces:
Sleeping
Sleeping
import gradio as gr | |
import torch | |
from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline | |
from sentence_transformers import SentenceTransformer, util | |
import requests | |
import os | |
import warnings | |
from transformers import logging | |
# Suppress warnings | |
warnings.filterwarnings("ignore", category=FutureWarning) | |
warnings.filterwarnings("ignore", category=UserWarning) | |
warnings.filterwarnings("ignore") | |
logging.set_verbosity_error() | |
# Set API keys and environment variables | |
GROQ_API_KEY = os.getenv("GROQ_API_KEY") # Ensure you set this in Hugging Face Spaces | |
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' | |
# Groq API sentence segmentation | |
def segment_into_sentences_groq(passage): | |
headers = { | |
"Authorization": f"Bearer {GROQ_API_KEY}", | |
"Content-Type": "application/json" | |
} | |
payload = { | |
"model": "llama3-8b-8192", | |
"messages": [ | |
{ | |
"role": "system", | |
"content": "Segment sentences by adding '1!2@3#' at the end of each sentence." | |
}, | |
{ | |
"role": "user", | |
"content": f"Segment the passage: {passage}" | |
} | |
], | |
"temperature": 1.0, | |
"max_tokens": 8192 | |
} | |
response = requests.post("https://api.groq.com/openai/v1/chat/completions", json=payload, headers=headers) | |
if response.status_code == 200: | |
data = response.json() | |
segmented_text = data.get("choices", [{}])[0].get("message", {}).get("content", "") | |
sentences = segmented_text.split("1!2@3#") | |
return [sentence.strip() for sentence in sentences if sentence.strip()] | |
else: | |
raise ValueError(f"Groq API error: {response.text}") | |
# Text enhancement class | |
class TextEnhancer: | |
def __init__(self): | |
self.device = "cuda" if torch.cuda.is_available() else "cpu" | |
self.paraphrase_tokenizer = AutoTokenizer.from_pretrained("prithivida/parrot_paraphraser_on_T5") | |
self.paraphrase_model = T5ForConditionalGeneration.from_pretrained("prithivida/parrot_paraphraser_on_T5").to(self.device) | |
self.grammar_pipeline = pipeline( | |
"text2text-generation", | |
model="Grammarly/coedit-large", | |
device=0 if self.device == "cuda" else -1 | |
) | |
self.similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(self.device) | |
def enhance_text(self, text, min_similarity=0.8, max_variations=2): | |
sentences = segment_into_sentences_groq(text) | |
enhanced_sentences = [] | |
for sentence in sentences: | |
if not sentence.strip(): | |
continue | |
# Generate paraphrases | |
inputs = self.paraphrase_tokenizer( | |
f"paraphrase: {sentence}", | |
return_tensors="pt", | |
padding=True, | |
max_length=150, | |
truncation=True | |
).to(self.device) | |
outputs = self.paraphrase_model.generate( | |
**inputs, | |
max_length=150, | |
num_return_sequences=max_variations, | |
num_beams=max_variations | |
) | |
paraphrases = [ | |
self.paraphrase_tokenizer.decode(output, skip_special_tokens=True) | |
for output in outputs | |
] | |
# Calculate semantic similarity | |
sentence_embedding = self.similarity_model.encode(sentence) | |
paraphrase_embeddings = self.similarity_model.encode(paraphrases) | |
similarities = util.cos_sim(sentence_embedding, paraphrase_embeddings) | |
# Select the most similar paraphrase | |
valid_paraphrases = [ | |
para for para, sim in zip(paraphrases, similarities[0]) | |
if sim >= min_similarity | |
] | |
if valid_paraphrases: | |
corrected = self.grammar_pipeline( | |
valid_paraphrases[0], | |
max_length=150, | |
num_return_sequences=1 | |
)[0]["generated_text"] | |
enhanced_sentences.append(corrected) | |
else: | |
enhanced_sentences.append(sentence) | |
return ". ".join(enhanced_sentences).strip() + "." | |
# Gradio interface | |
def create_interface(): | |
enhancer = TextEnhancer() | |
def process_text(text, similarity_threshold): | |
try: | |
return enhancer.enhance_text(text, min_similarity=similarity_threshold / 100) | |
except Exception as e: | |
return f"Error: {str(e)}" | |
return gr.Interface( | |
fn=process_text, | |
inputs=[ | |
gr.Textbox(lines=10, placeholder="Enter text to enhance...", label="Input Text"), | |
gr.Slider(50, 100, 80, label="Minimum Semantic Similarity (%)") | |
], | |
outputs=gr.Textbox(lines=10, label="Enhanced Text"), | |
title="Text Enhancement System", | |
description="Enhance text quality with semantic preservation." | |
) | |
if __name__ == "__main__": | |
interface = create_interface() | |
interface.launch(server_name="0.0.0.0", server_port=7860) | |