justus-tobias committed
Commit a0de5e2 · 1 Parent(s): 8e9a234

cleaned code

Files changed (2)
  1. README.md +1 -1
  2. app.py +5 -118
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Moshi
-emoji: 📈
+emoji: 💨
 colorFrom: indigo
 colorTo: gray
 sdk: gradio
app.py CHANGED
@@ -2,79 +2,10 @@ import gradio as gr
 import torch
 from huggingface_hub import hf_hub_download
 from moshi.models import loaders, LMGen
-import tempfile
-import os
-import soundfile as sf
 import numpy as np
-import time
 
 
-def process_wav(wav):
-
-    mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
-    mimi = loaders.get_mimi(mimi_weight, device='cpu')
-    mimi.set_num_codebooks(8)  # up to 32 for mimi, but limited to 8 for moshi.
-
-
-    # wav = torch.randn(1, 1, 24000 * 10)  # should be [B, C=1, T]
-    with torch.no_grad():
-        codes = mimi.encode(wav)  # [B, K = 8, T]
-        # decoded = mimi.decode(codes)
-
-    # # Supports streaming too.
-    # frame_size = int(mimi.sample_rate / mimi.frame_rate)
-    # all_codes = []
-    # with mimi.streaming(batch_size=1):
-    #     for offset in range(0, wav.shape[-1], frame_size):
-    #         frame = wav[:, :, offset: offset + frame_size]
-    #         codes = mimi.encode(frame)
-    #         assert codes.shape[-1] == 1, codes.shape
-    #         all_codes.append(codes)
-    all_codes = codes
-
-    mimi.cuda()
-    moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
-    moshi = loaders.get_moshi_lm(moshi_weight, device='cuda')
-    lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)  # this handles sampling params etc.
-
-    out_wav_chunks = []
-    # Now we will stream over both Moshi I/O, and decode on the fly with Mimi.
-    with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
-        for idx, code in enumerate(all_codes):
-            tokens_out = lm_gen.step(code.cuda())
-            # tokens_out is [B, 1 + 8, 1], with tokens_out[:, 1] representing the text token.
-            if tokens_out is not None:
-                wav_chunk = mimi.decode(tokens_out[:, 1:])
-                out_wav_chunks.append(wav_chunk)
-            print(idx, end='\r')
-    out_wav = torch.cat(out_wav_chunks, dim=-1)
-
-    return out_wav
-
-def select_audio_frame(audio_tensor, frame_size, start_index=0):
-    # Ensure the audio tensor is in the correct shape (1, 1, samples)
-    if audio_tensor.dim() != 3 or audio_tensor.size(0) != 1 or audio_tensor.size(1) != 1:
-        raise ValueError("Audio tensor must have shape (1, 1, samples)")
-
-    # Get the total number of samples
-    total_samples = audio_tensor.size(2)
-
-    # If i is not provided, use the total number of samples
-    i = total_samples
-
-    # Calculate the start and end indices
-    start_index = max(0, i - frame_size)
-    end_index = i
-
-    # Extract the frame
-    frame = audio_tensor[0, 0, start_index:end_index]
-
-    # If the frame is smaller than the desired size, pad with zeros at the beginning
-    if frame.size(0) < frame_size:
-        frame = torch.nn.functional.pad(frame, (frame_size - frame.size(0), 0))
-
-    # Reshape to match the original tensor shape
-    return frame.unsqueeze(0).unsqueeze(0)
 
 def process_wav_new(in_wav):
     """wav = torch.randn(1, 1, 24000 * 10) # should be [B, C=1, T]"""
@@ -193,6 +124,10 @@ Monologue” method significantly improves the linguistic quality of generated s
     - **Demo:** [demo](https://moshi.chat/) """)
 
 
+    gr.Markdown("""
+    🚨
+    The model will produce a lot of silence, because it is actually meant to stream input and output.
+    I will try to create a demo that works with streaming.""")
 
     input_audio = gr.Audio(sources="microphone", label="Input Audio")
     output_audio = gr.Audio(label="Processed Audio", streaming=True, autoplay=True)
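Since `output_audio` is declared with `streaming=True`, the event handler can be a generator that yields `(sample_rate, chunk)` tuples, so playback starts as soon as the first chunk is decoded. A minimal sketch of that wiring (the `respond` handler and the `stop_recording` event are illustrative; the actual handler in app.py is not shown in this hunk):

```python
import gradio as gr
import numpy as np

def respond(audio):
    """Illustrative generator: yield (sample_rate, chunk) tuples so the
    streaming gr.Audio output plays chunks as they become available."""
    if audio is None:
        return
    sr, data = audio
    # Real code would encode with Mimi, step Moshi, and decode here;
    # this placeholder just echoes the input back in one-second chunks.
    for start in range(0, len(data), sr):
        yield sr, data[start:start + sr]

with gr.Blocks() as demo:
    input_audio = gr.Audio(sources="microphone", label="Input Audio")
    output_audio = gr.Audio(label="Processed Audio", streaming=True, autoplay=True)
    input_audio.stop_recording(respond, inputs=input_audio, outputs=output_audio)
```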
@@ -221,52 +156,4 @@ Monologue” method significantly improves the linguistic quality of generated s
         elem_id="citation-button",
         show_copy_button=True,
     )
-    demo.launch(debug=True)
-
-##########################################################################################################
-##########################################################################################################
-
-# import gradio as gr
-# import numpy as np
-# import time
-
-# def process_stream(audio, instream):
-
-#     if audio is None:
-#         return gr.update(), instream
-#     if instream is None:
-#         ret = audio
-#     else:
-#         print("STREAM RECIEVED")
-#         stream = (audio[0], np.concatenate((instream[1], audio[1])))
-
-#         # Assuming instream[1] and audio[1] are valid inputs for convert2wav
-#         wav1 = convert2wav(instream[1])
-#         wav2 = convert2wav(audio[1])
-
-#         # Concatenate along the last dimension (time axis)
-#         combined_wav = torch.cat((wav1, wav2), dim=2)
-#         print("WAV COMBINED")
-
-
-#         yield from process_wav_new(combined_wav, stream)
-
-
-
-
-# with gr.Blocks() as demo:
-#     gr.Markdown("# Moshi Demo")
-#     gr.Markdown(" ")
-#     gr.Markdown("-----------")
-
-
-#     inp = gr.Audio(sources="microphone")
-#     out = gr.Audio(autoplay=True)
-#     stream = gr.State()
-#     clear = gr.Button("Clear")
-
-#     inp.stream(process_stream, [inp, stream], [out, stream])
-#     clear.click(lambda: [None, None, None], None, [inp, out, stream])
-
-
-# demo.launch(debug=True)
+demo.launch(debug=True)
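The commented-out experiment deleted above relied on a `convert2wav` helper that never appears in this diff. A plausible reconstruction of its intent (hypothetical, not the repo's implementation; resampling from the microphone rate to Mimi's 24 kHz is omitted) would map Gradio's `(sample_rate, int16 array)` microphone output to the `[B=1, C=1, T]` float tensor the models expect:

```python
import numpy as np
import torch

def convert2wav(data: np.ndarray) -> torch.Tensor:
    """Hypothetical helper: Gradio microphone samples -> [1, 1, T] float tensor."""
    wav = torch.from_numpy(np.asarray(data))
    if wav.dtype == torch.int16:
        wav = wav.float() / 32768.0  # scale int16 PCM into [-1.0, 1.0]
    else:
        wav = wav.float()
    if wav.dim() == 2:               # (T, channels) from Gradio -> mono
        wav = wav.mean(dim=1)
    return wav.view(1, 1, -1)        # add batch and channel dims: [1, 1, T]
```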
 