import gradio as gr
import torch
import numpy as np
from huggingface_hub import hf_hub_download
from moshi.models import loaders, LMGen


def process_wav_new(in_wav):
    """Run a waveform through Mimi encode -> Moshi -> Mimi decode.

    in_wav should be [B, C=1, T] at 24 kHz, e.g. torch.randn(1, 1, 24000 * 10).
    """
    mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
    mimi = loaders.get_mimi(mimi_weight, device='cpu')
    mimi.set_num_codebooks(8)  # Mimi supports up to 32 codebooks, but Moshi is limited to 8.
    wav = in_wav

    with torch.no_grad():
        # Encode frame by frame; Mimi supports streaming too.
        frame_size = int(mimi.sample_rate / mimi.frame_rate)
        all_codes = []
        with mimi.streaming(batch_size=1):
            for offset in range(0, wav.shape[-1], frame_size):
                frame = wav[:, :, offset: offset + frame_size]
                if frame.shape[-1] < frame_size:
                    # Zero-pad a trailing partial frame so Mimi always sees full frames.
                    frame = torch.nn.functional.pad(frame, (0, frame_size - frame.shape[-1]))
                codes = mimi.encode(frame)
                assert codes.shape[-1] == 1, codes.shape
                all_codes.append(codes)

    mimi.cuda()
    moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
    moshi = loaders.get_moshi_lm(moshi_weight, device='cuda')
    lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)  # this handles sampling params etc.

    out_wav_chunks = []
    # Now we will stream over both Moshi I/O, and decode on the fly with Mimi.
    with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
        for idx, code in enumerate(all_codes):
            tokens_out = lm_gen.step(code.cuda())
            # tokens_out is [B, 1 + 8, 1]: tokens_out[:, 0] is the text token,
            # tokens_out[:, 1:] are the 8 audio tokens.
            if tokens_out is not None:
                wav_chunk = mimi.decode(tokens_out[:, 1:])
                out_wav_chunks.append(wav_chunk)
            print(idx, end='\r')
    out_wav = torch.cat(out_wav_chunks, dim=-1)
    return out_wav


def convert2wav(audio):
    """Convert a Gradio (sample_rate, ndarray) tuple into a [1, 1, T] float tensor at 24 kHz."""
    if audio is None:
        return None
    sr, data = audio
    # Normalize integer PCM (Gradio records int16 by default) to [-1, 1] floats.
    if np.issubdtype(data.dtype, np.integer):
        data = data / np.iinfo(data.dtype).max
    # Convert to mono if stereo.
    if len(data.shape) > 1:
        data = np.mean(data, axis=1)
    # Convert to a torch tensor shaped (1, 1, samples).
    wav = torch.from_numpy(data).float()
    wav = wav.unsqueeze(0).unsqueeze(0)
    # Resample to 24000 Hz if necessary (linear interpolation, preserving the duration).
    if sr != 24000:
        new_len = int(wav.shape[-1] * 24000 / sr)
        wav = torch.nn.functional.interpolate(wav, size=new_len, mode='linear', align_corners=False)
    # Cap at 10 seconds, i.e. a shape of at most (1, 1, 24000 * 10).
    wav = wav[:, :, :24000 * 10]
    return wav


##########################################################################################################
##########################################################################################################


def process_audio(audio, instream):
    print("Audio received")
    if audio is None:
        # Must return one value per declared output: input audio, output audio, stream state.
        return gr.update(), gr.update(), instream
    if instream is None:
        # Seed the stream state with ten seconds of noise at 24 kHz.
        instream = (24000, torch.randn(1, 1, 24000 * 10).squeeze().cpu().numpy())
    print("Stream received")
    stream = (audio[0], np.concatenate((instream[1], audio[1])))

    # instream and audio are both (sample_rate, ndarray) tuples, which convert2wav accepts.
    wav1 = convert2wav(instream)
    wav2 = convert2wav(audio)
    # Concatenate along the last dimension (time axis).
    combined_wav = torch.cat((wav1, wav2), dim=2)
    print("Wav combined")

    outwav = process_wav_new(combined_wav)
    return gr.update(value=None), (24000, outwav.squeeze().cpu().numpy()), stream
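
# A minimal offline sanity check, added here as a hypothetical sketch (it is not
# wired into the demo UI): it pushes ten seconds of random noise through the full
# Mimi-encode -> Moshi -> Mimi-decode pipeline, mirroring the shape documented in
# process_wav_new's docstring. It assumes a CUDA device is available, since
# process_wav_new moves both models onto the GPU.
def smoke_test_process_wav():
    wav = torch.randn(1, 1, 24000 * 10)  # [B, C=1, T] at 24 kHz
    out = process_wav_new(wav)
    print("Output waveform shape:", tuple(out.shape))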

with gr.Blocks() as demo:
    gr.Markdown("# Moshi Demo")
    gr.Markdown(" ")
    gr.Markdown("-----------")
    gr.Markdown("### Model Description")
    gr.Markdown("""Moshi is a speech-text foundation model that casts spoken dialogue as speech-to-speech generation.
Starting from a text language model backbone, Moshi generates speech as tokens from the residual quantizer of a
neural audio codec, while separately modeling its own speech and that of the user as parallel streams. This allows
for the removal of explicit speaker turns and the modeling of arbitrary conversational dynamics. Moshi also predicts
time-aligned text tokens as a prefix to its audio tokens. This “Inner Monologue” method significantly improves the
linguistic quality of generated speech and provides streaming speech recognition and text-to-speech. As a result,
Moshi is the first real-time full-duplex spoken large language model, with a theoretical latency of 160ms
(200ms in practice).""")
    gr.Markdown("""
- **Developed by:** Kyutai
- **Model type:** Multimodal speech-text foundation model
- **Language(s) (NLP):** English
- **License:** CC-BY""")
    gr.Markdown("### Model Sources")
    gr.Markdown("""
- **Repository:** [repo](https://github.com/kyutai-labs/moshi)
- **Paper:** [paper](http://kyutai.org/Moshi.pdf)
- **Demo:** [demo](https://moshi.chat/)
""")
    gr.Markdown("""🚨 The model will produce a lot of silence, because it is really meant to stream
input and output in real time. I will try to create a demo that works with streaming.""")

    input_audio = gr.Audio(sources="microphone", label="Input Audio")
    output_audio = gr.Audio(label="Processed Audio", streaming=True, autoplay=True)
    stream = gr.State()

    input_audio.stop_recording(
        fn=process_audio,
        inputs=[input_audio, stream],
        outputs=[input_audio, output_audio, stream],
    )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                # Raw string so the BibTeX accent escapes (e.g. D\'efossez) survive verbatim.
                value=r"""@techreport{kyutai2024moshi,
    author = {Alexandre D\'efossez and Laurent Mazar\'e and Manu Orsini and Am\'elie Royer and Patrick P\'erez and Herv\'e J\'egou and Edouard Grave and Neil Zeghidour},
    title = {Moshi: a speech-text foundation model for real-time dialogue},
    institution = {Kyutai},
    year = {2024},
    month = {September},
    url = {http://kyutai.org/Moshi.pdf},
}
""",
                lines=7,
                label="Copy the BibTeX snippet to cite this source",
                elem_id="citation-button",
                show_copy_button=True,
            )

demo.launch(debug=True)