joselobenitezg
/

mms-grn-tts

+import torch
+from typing import Dict, Any, List, Union
+from transformers import VitsModel, AutoTokenizer
+import numpy as np
class EndpointHandler:
    """Text-to-speech request handler built around a HuggingFace VITS model.

    The handler loads a tokenizer and a ``VitsModel`` once at construction
    time and is then called with text (or a list of texts) to obtain audio
    waveforms as plain Python lists together with the model sampling rate.
    """

    # Batch size used when a request does not specify one.
    DEFAULT_BATCH_SIZE = 8
    # Maximum number of characters accepted for a single input text.
    DEFAULT_MAX_INPUT_LENGTH = 200

    def __init__(self, path="joselobenitezg/mms-grn-tts", device=None):
        """Initialize the VITS TTS model and tokenizer.

        Args:
            path (str): HuggingFace model path.
            device (str, optional): Device to run the model on ('cuda', 'cpu',
                or a specific cuda device). When omitted, 'cuda' is used if
                available, otherwise 'cpu'.

        Raises:
            RuntimeError: If the tokenizer or the model cannot be loaded. The
                original exception is attached as ``__cause__``.
        """
        # Device management
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(path)
            self.model = VitsModel.from_pretrained(path).to(self.device)
            self.sampling_rate = self.model.config.sampling_rate
        except Exception as e:
            raise RuntimeError(f"Failed to load model and tokenizer: {e}") from e
        # Set maximum input length (in characters, see validate_input)
        self.max_input_length = self.DEFAULT_MAX_INPUT_LENGTH
        print(f"Model loaded on {self.device}")

    def validate_input(self, text: Union[str, List[str]]) -> List[str]:
        """Validate and preprocess input text.

        Args:
            text: Input text or list of texts.

        Returns:
            List[str]: The texts as a list (a single string is wrapped in a
            one-element list). The texts themselves are returned unmodified.

        Raises:
            ValueError: If the input is neither a string nor a list of
                strings, if any text is empty or whitespace-only, or if any
                text is longer than ``self.max_input_length`` characters.
        """
        # Convert single string to list
        if isinstance(text, str):
            text = [text]
        elif isinstance(text, list):
            if not all(isinstance(t, str) for t in text):
                raise ValueError("All elements in the input list must be strings")
        else:
            raise ValueError("Input must be a string or list of strings")
        # Validate each text
        for t in text:
            if not t.strip():
                raise ValueError("Empty text is not allowed")
            if len(t) > self.max_input_length:
                raise ValueError(f"Input text exceeds maximum length of {self.max_input_length}")
        return text

    def _tokenize(self, texts: List[str]) -> Dict[str, Any]:
        """Tokenize ``texts`` as one padded batch and move the tensors to the model device."""
        encoded = self.tokenizer(texts, padding=True, return_tensors="pt")
        return {key: tensor.to(self.device) for key, tensor in encoded.items()}

    def _generate(self, inputs: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run the model on tokenized ``inputs`` and return one result dict per batch row."""
        with torch.no_grad():
            outputs = self.model(**inputs)
        waveforms = outputs.waveform.cpu()
        # In a padded batch every waveform has the length of the longest item.
        # The VITS output reports the real per-item length in samples; use it
        # (when present) so shorter items do not carry trailing padding.
        lengths = getattr(outputs, "sequence_lengths", None)
        results = []
        for index, waveform in enumerate(waveforms):
            if lengths is not None:
                waveform = waveform[: int(lengths[index])]
            results.append({
                "waveform": waveform.numpy().tolist(),
                "sampling_rate": self.sampling_rate
            })
        return results

    def batch_process(self, texts: List[str], batch_size: int = 8) -> List[Dict[str, Any]]:
        """Process multiple texts in batches.

        Args:
            texts (List[str]): List of texts to process.
            batch_size (int): Size of each batch; must be a positive integer.

        Returns:
            List[Dict[str, Any]]: One ``{"waveform": [...], "sampling_rate": int}``
            dict per input text, in input order. Empty when ``texts`` is empty.

        Raises:
            ValueError: If ``batch_size`` is not a positive integer.
            RuntimeError: If model inference fails for any batch.
        """
        # Reject bad sizes up front: a negative step would make range() empty
        # and silently return no audio at all. bool is excluded explicitly
        # because it is a subclass of int.
        if isinstance(batch_size, bool) or not isinstance(batch_size, int) or batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
        results = []
        for start in range(0, len(texts), batch_size):
            inputs = self._tokenize(texts[start:start + batch_size])
            try:
                results.extend(self._generate(inputs))
            except Exception as e:
                raise RuntimeError(f"Error during batch processing: {e}") from e
        return results

    def __call__(self, data: Union[Dict[str, Any], str, List[str]]) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
        """Process the input text and generate audio.

        Args:
            data: Input data in one of these formats:
                - Dict[str, Any]: {"inputs": "text" or ["text1", "text2"], "batch_size": int}
                - str: Direct text input
                - List[str]: List of texts to process

        Returns:
            Union[Dict[str, Any], List[Dict[str, Any]]]: A single dictionary when
            exactly one text was supplied, otherwise a list of dictionaries, each
            containing the audio waveform and the sampling rate.

        Raises:
            RuntimeError: On any validation or inference failure. The message is
                prefixed with "Error processing input: " and the original
                exception is attached as ``__cause__``.
        """
        try:
            # Handle different input types
            if isinstance(data, dict):
                text = data.get("inputs", "")
                batch_size = data.get("batch_size", self.DEFAULT_BATCH_SIZE)
            elif isinstance(data, (str, list)):
                text = data
                batch_size = self.DEFAULT_BATCH_SIZE
            else:
                raise ValueError(f"Unsupported input type: {type(data)}")
            # Validate input
            texts = self.validate_input(text)
            # Single input case: return the bare result dict, not a list
            if len(texts) == 1:
                return self._generate(self._tokenize(texts))[0]
            # Multiple inputs case
            return self.batch_process(texts, batch_size)
        except Exception as e:
            error_msg = f"Error processing input: {e}"
            print(error_msg)  # Log the error
            raise RuntimeError(error_msg) from e

    def cleanup(self):
        """Cleanup resources when shutting down.

        Best-effort: any error is printed rather than raised.
        """
        try:
            # Clear CUDA cache if using GPU. Going through torch.device lets
            # this work whether self.device is a string or a torch.device.
            if torch.device(self.device).type == 'cuda':
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"Error during cleanup: {e}")