Spaces:

justus-tobias
/

Moshi

Sleeping

App Files Files Community

justus-tobias committed on Sep 30, 2024

Commit

43e6c08

1 Parent(s): 2ace8c2

audio truncation for reduced computation

Browse files

Files changed (1) hide show

app.py +24 -9

app.py CHANGED Viewed

@@ -4,9 +4,10 @@ import torch
 from huggingface_hub import hf_hub_download
 from moshi.models import loaders, LMGen
 import numpy as np
 mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
 moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
@@ -22,14 +23,13 @@ def compute_codes(wav):
         frame_size = int(mimi.sample_rate / mimi.frame_rate)
         all_codes = []
         with mimi.streaming(batch_size=1):
-            for offset in range(0, wav.shape[-1], frame_size):
                 frame = wav[:, :, offset: offset + frame_size]
                 codes = mimi.encode(frame)
                 assert codes.shape[-1] == 1, codes.shape
                 all_codes.append(codes)
     return all_codes
 @spaces.GPU
 def generate_reponse(all_codes):
     """wav = torch.randn(1, 1, 24000 * 10)  # should be [B, C=1, T]"""
@@ -50,7 +50,7 @@ def generate_reponse(all_codes):
     # Now we will stream over both Moshi I/O, and decode on the fly with Mimi.
     with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
-        for idx, code in enumerate(all_codes):
             # print("CODE: ", code.shape)
             tokens_out = lm_gen.step(code.to(device))
             # tokens_out is [B, 1 + 8, 1], with tokens_out[:, 1] representing the text token.
@@ -89,6 +89,10 @@ def convert2wav(audio):
     return wav
 ##########################################################################################################
@@ -102,27 +106,38 @@ def process_audio(audio, instream):
     print("Audio recieved")
     if audio is None:
-        return gr.update(), instream
     try:
         if instream is None:
             instream = (24000, torch.randn(1, 1, 24000 * 10).squeeze().cpu().numpy())
-        print("STREAM RECIEVED")
         stream = (audio[0], np.concatenate((instream[1], audio[1])))
         # Assuming instream[1] and audio[1] are valid inputs for convert2wav
         wav1 = convert2wav(instream)
         wav2 = convert2wav(audio)
         # Concatenate along the last dimension (time axis)
         combined_wav = torch.cat((wav1, wav2), dim=2)
-        print("WAV COMBINED")
         mimi_codes = compute_codes(combined_wav)
-        print("CODES COMPUTED")
         outwav = generate_reponse(mimi_codes)
     except Exception as e:
-        return gr.update(value=None), (24000, outwav.squeeze().cpu().numpy()), stream, gr.update(visible=True,value=f"LOG: {e}")
     return gr.update(value=None), (24000, outwav.squeeze().cpu().numpy()), stream, gr.update(visible=False)

 from huggingface_hub import hf_hub_download
 from moshi.models import loaders, LMGen
 import numpy as np
+from tqdm import tqdm
+MAX_LENGTH = 24000 * 5  # For example, 5 seconds of audio at 24kHz
 mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
 moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
         frame_size = int(mimi.sample_rate / mimi.frame_rate)
         all_codes = []
         with mimi.streaming(batch_size=1):
+            for offset in tqdm(range(0, wav.shape[-1], frame_size), desc="computing Codes"):
                 frame = wav[:, :, offset: offset + frame_size]
                 codes = mimi.encode(frame)
                 assert codes.shape[-1] == 1, codes.shape
                 all_codes.append(codes)
     return all_codes
 @spaces.GPU
 def generate_reponse(all_codes):
     """wav = torch.randn(1, 1, 24000 * 10)  # should be [B, C=1, T]"""
     # Now we will stream over both Moshi I/O, and decode on the fly with Mimi.
     with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
+        for idx, code in tqdm(enumerate(all_codes), desc="generate tokens"):
             # print("CODE: ", code.shape)
             tokens_out = lm_gen.step(code.to(device))
             # tokens_out is [B, 1 + 8, 1], with tokens_out[:, 1] representing the text token.
     return wav
def truncate_audio(wav, max_length):
    """Keep only the trailing ``max_length`` samples of ``wav``.

    Args:
        wav: Array-like audio that supports ``wav.shape[2]`` and
            three-axis slicing ``wav[:, :, start:]``; the third axis is
            treated as the sample axis.
        max_length: Maximum number of trailing samples to keep.

    Returns:
        ``wav`` itself when it already holds at most ``max_length``
        samples, otherwise a slice with its last ``max_length`` samples.

    Raises:
        ValueError: If ``max_length`` is negative.
    """
    if max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")
    num_samples = wav.shape[2]
    if num_samples <= max_length:
        return wav
    # Slice from an explicit start index: ``wav[:, :, -max_length:]`` would
    # return the whole signal for max_length == 0, because ``-0`` is ``0``.
    return wav[:, :, num_samples - max_length:]
 ##########################################################################################################
     print("Audio recieved")
     if audio is None:
+        return gr.update(), (24000, outwav.squeeze().cpu().numpy()), instream, gr.update(visible=True,value=f"Audio is None")
     try:
         if instream is None:
             instream = (24000, torch.randn(1, 1, 24000 * 10).squeeze().cpu().numpy())
+        print("1. COMBINE AUDIO WITH PREVIOUS CONVERSATION TO STORE")
         stream = (audio[0], np.concatenate((instream[1], audio[1])))
         # Assuming instream[1] and audio[1] are valid inputs for convert2wav
+        print("2. CONVERT AUDIO TO WAV")
         wav1 = convert2wav(instream)
         wav2 = convert2wav(audio)
         # Concatenate along the last dimension (time axis)
+        print("3. COMBINE AUDIOS TO A SINGLE STREAM")
         combined_wav = torch.cat((wav1, wav2), dim=2)
+        # Truncate audio to a defined length to reduce computational effort
+        print("4. TRUNCATE AUDIO LENGTH TO GIVEN DURATION")
+        combined_wav = truncate_audio(combined_wav, MAX_LENGTH)
+        # Preprocessing, convert the audio into the processable codes/tokens
+        print("5. COMPUTE CODES")
         mimi_codes = compute_codes(combined_wav)
+        # Generation of the model's response
+        print("6. GENRATE TOKENS")
         outwav = generate_reponse(mimi_codes)
     except Exception as e:
+        return gr.update(value=None), (24000, outwav.squeeze().cpu().numpy()), stream, gr.update(visible=True,value=f"LOG: \n{e}")
     return gr.update(value=None), (24000, outwav.squeeze().cpu().numpy()), stream, gr.update(visible=False)