Text-to-Audio · Transformers · musicgen · Inference Endpoints
mmomeni committed on
Commit 93bba1b · verified · 1 Parent(s): 91330cc

Update handler.py

Files changed (1)
  1. handler.py +25 -24
handler.py CHANGED
@@ -1,14 +1,14 @@
-from typing import Dict, List, Any
-from transformers import AutoProcessor, MusicgenForConditionalGeneration
-import torch
+from typing import Dict, Any
+from audiocraft.models import AudioGen
+# from audiocraft.data.audio import audio_write
 
 class EndpointHandler:
-    def __init__(self, path=""):
-        # load model and processor from path
-        self.processor = AutoProcessor.from_pretrained(path)
-        self.model = MusicgenForConditionalGeneration.from_pretrained(path, torch_dtype=torch.float16).to("cuda")
+    def __init__(self):
+        # Load the AudioGen model
+        self.model = AudioGen.get_pretrained('facebook/audiogen-medium')
+        self.model.set_generation_params(duration=5)  # Set default duration to 5 seconds
 
-    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """
         Args:
             data (:dict:):
@@ -16,23 +16,24 @@ class EndpointHandler:
         """
         # process input
         inputs = data.pop("inputs", data)
-        parameters = data.pop("parameters", None)
+        parameters = data.pop("parameters", {})
 
-        # preprocess
-        inputs = self.processor(
-            text=[inputs],
-            padding=True,
-            return_tensors="pt",).to("cuda")
+        # Update generation parameters if provided
+        if 'duration' in parameters:
+            self.model.set_generation_params(duration=parameters['duration'])
 
-        # pass inputs with all kwargs in data
-        if parameters is not None:
-            with torch.autocast("cuda"):
-                outputs = self.model.generate(**inputs, **parameters)
-        else:
-            with torch.autocast("cuda"):
-                outputs = self.model.generate(**inputs,)
+        # Generate audio from descriptions
+        descriptions = [inputs]
+        wav = self.model.generate(descriptions)
 
-        # postprocess the prediction
-        prediction = outputs[0].cpu().numpy().tolist()
-
-        return [{"generated_audio": prediction}]
+        # Convert the generated audio to a list format for JSON serialization
+        predictions = []
+        for idx, one_wav in enumerate(wav):
+            # Save the audio to a file (optional)
+            # audio_write(f'{idx}', one_wav.cpu(), self.model.sample_rate, strategy="loudness", loudness_compressor=True)
+
+            # Convert the tensor to a list
+            prediction = one_wav.cpu().numpy().tolist()
+            predictions.append(prediction)
+
+        return {"generated_audio": predictions}
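For reference, here is a minimal sketch of how the updated handler could be smoke-tested locally before deployment. It assumes the audiocraft package is installed and the facebook/audiogen-medium weights are downloadable; the module name handler and the payload layout mirror the committed file, while everything else is illustrative.

from handler import EndpointHandler

# Illustrative smoke test, not part of the commit.
handler = EndpointHandler()

# "inputs" is the text description; "parameters" is optional and the
# handler currently honors only "duration" (in seconds).
result = handler({
    "inputs": "dog barking in the rain",
    "parameters": {"duration": 3},
})

# One waveform per description, nested as [channels][samples].
audio = result["generated_audio"][0]
print(f"{len(audio)} channel(s), {len(audio[0])} samples")

Returning raw sample lists keeps the response JSON-serializable at the cost of large payloads (roughly duration × sample rate floats per channel); writing a WAV via the commented-out audio_write helper and returning it encoded would be a more compact alternative.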