add usage samples to readme
README.md (CHANGED)
@@ -1155,7 +1155,14 @@ model.tts.float()
 
 ```python
 mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
-audio_input, _ = librosa.load('
+audio_input, _ = librosa.load('./assets/input_examples/Trump_WEF_2018_10s.mp3', sr=16000, mono=True) # load the audio to be mimicked
+
+# can also try `./assets/input_examples/cxk_original.wav`,
+# `./assets/input_examples/fast-pace.wav`,
+# `./assets/input_examples/chi-english-1.wav`
+# `./assets/input_examples/exciting-emotion.wav`
+# for different aspects of speech-centric features.
+
 msgs = [{'role': 'user', 'content': [mimick_prompt, audio_input]}]
 res = model.chat(
     msgs=msgs,
@@ -1165,7 +1172,7 @@ res = model.chat(
     use_tts_template=True,
     temperature=0.3,
     generate_audio=True,
-    output_audio_path='
+    output_audio_path='output_mimick.wav', # save the tts result to output_audio_path
 )
 ```
 
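For readers who want to run the mimick sample outside the diff context, here is a minimal end-to-end sketch. The hunks above only show the changed lines, so everything about model loading is an assumption: the checkpoint id `openbmb/MiniCPM-o-2_6`, the `AutoModel`/`trust_remote_code` loading pattern, and `model.init_tts()` follow the usual Hugging Face convention rather than this commit; only the `model.chat(...)` arguments and the `model.tts.float()` context line appear in the hunks.

```python
# Sketch only. Model loading is assumed (not part of this commit's diff);
# the chat arguments mirror the hunks above.
import torch
import librosa
from transformers import AutoModel

model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6',  # assumed checkpoint id
                                  trust_remote_code=True,
                                  torch_dtype=torch.bfloat16).eval().cuda()
model.init_tts()   # assumed TTS init helper
model.tts.float()  # matches the context line of the first hunk

mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
audio_input, _ = librosa.load('./assets/input_examples/Trump_WEF_2018_10s.mp3', sr=16000, mono=True)
msgs = [{'role': 'user', 'content': [mimick_prompt, audio_input]}]
res = model.chat(
    msgs=msgs,
    use_tts_template=True,
    temperature=0.3,
    generate_audio=True,
    output_audio_path='output_mimick.wav',
)
print(res)
```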
@@ -1177,7 +1184,7 @@ A general usage scenario of `MiniCPM-o-2.6` is role-playing a specific character
 
 
 ```python
-ref_audio, _ = librosa.load('./assets/
+ref_audio, _ = librosa.load('./assets/input_examples/icl_20.wav', sr=16000, mono=True) # load the reference audio
 sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_roleplay', language='en')
 
 # round one
@@ -1191,7 +1198,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='
+    output_audio_path='result_roleplay_round_1.wav',
 )
 
 # round two
@@ -1206,7 +1213,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='
+    output_audio_path='result_roleplay_round_2.wav',
 )
 print(res)
 ```
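The role-play hunks mark "# round one" and "# round two" but elide how the dialogue history is threaded between rounds. A plausible continuation of the snippet above, following the message format visible in the hunks (the exact round-two wiring is not shown in this diff, so treat the append pattern and the `round_2.wav` filename as assumptions):

```python
# Hypothetical round-two wiring, continuing from the round-one snippet above.
# 'round_2.wav' is a placeholder filename, not a file shipped in this commit.
import librosa

msgs.append({'role': 'assistant', 'content': [res]})  # assumed: feed the reply back as an assistant turn
audio_round_2, _ = librosa.load('round_2.wav', sr=16000, mono=True)
msgs.append({'role': 'user', 'content': [audio_round_2]})
res = model.chat(
    msgs=msgs,
    use_tts_template=True,
    generate_audio=True,
    temperature=0.3,
    output_audio_path='result_roleplay_round_2.wav',
)
```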
@@ -1215,11 +1222,12 @@ print(res)
 
 #### Speech Conversation as an AI Assistant
 
-An enhanced feature of `MiniCPM-o-2.6` is to act as an AI assistant, but only with limited choice of voices. In this mode, `MiniCPM-o-2.6` is **less human-like and more like a voice assistant**.
+An enhanced feature of `MiniCPM-o-2.6` is to act as an AI assistant, but only with a limited choice of voices. In this mode, `MiniCPM-o-2.6` is **less human-like and more like a voice assistant**, and it follows instructions more closely. For the demo, we suggest `assistant_default_female_voice` or `assistant_male_voice`; other voices may work, but are not as stable as the defaults.
 
 ```python
+ref_audio, _ = librosa.load('./assets/input_examples/assistant_default_female_voice.wav', sr=16000, mono=True) # or use `./assets/input_examples/assistant_male_voice.wav`
 sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_assistant', language='en')
-user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]}
+user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # load the user's audio question
 
 # round one
 msgs = [sys_prompt, user_question]
@@ -1231,7 +1239,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='
+    output_audio_path='result_assistant_round_1.wav',
 )
 
 # round two
@@ -1246,7 +1254,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='
+    output_audio_path='result_assistant_round_2.wav',
 )
 print(res)
 ```
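Each call above writes its TTS result to `output_audio_path`. A small, self-contained way to sanity-check the generated files; `soundfile` is not referenced anywhere in this commit, so consider the snippet an illustrative assumption:

```python
# Inspect the waveforms produced by the assistant rounds above.
import soundfile as sf

for path in ['result_assistant_round_1.wav', 'result_assistant_round_2.wav']:
    audio, sr = sf.read(path)  # returns (samples, sample_rate)
    print(f'{path}: {len(audio) / sr:.1f}s at {sr} Hz')
```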
@@ -1272,7 +1280,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='
+    output_audio_path='result_voice_creation.wav',
 )
 ```
 
@@ -1284,6 +1292,7 @@ res = model.chat(
 
 
 ```python
+ref_audio, _ = librosa.load('./assets/input_examples/icl_20.wav', sr=16000, mono=True) # load the reference audio
 sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
 text_prompt = f"Please read the text below."
 user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]}
@@ -1297,7 +1306,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='
+    output_audio_path='result_voice_cloning.wav',
 )
 
 ```
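Because the cloning system prompt is built once from the reference audio, it can be reused across several read-aloud requests. A short sketch under that assumption, continuing from the snippet above; the loop and output filenames are illustrative, not part of this commit:

```python
# Reuse the cloned voice for several lines of text (illustrative loop).
lines = ["First sentence to read.", "Second sentence to read."]
for i, line in enumerate(lines):
    msgs = [sys_prompt, {'role': 'user', 'content': [text_prompt, line]}]
    model.chat(
        msgs=msgs,
        use_tts_template=True,
        generate_audio=True,
        temperature=0.3,
        output_audio_path=f'result_voice_cloning_{i}.wav',
    )
```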
@@ -1308,7 +1317,6 @@ res = model.chat(
 
 `MiniCPM-o-2.6` can also be used to address various audio understanding tasks, such as ASR, speaker analysis, general audio captioning, and sound scene tagging.
 
-
 For audio-to-text tasks, you can use the following prompts:
 
 - ASR with ZH(same as AST en2zh): `请仔细听这段音频片段,并将其内容逐字记录。`
@@ -1319,7 +1327,7 @@ For audio-to-text tasks, you can use the following prompts:
 
 ```python
 task_prompt = "Please listen to the audio snippet carefully and transcribe the content." + "\n" # can change to other prompts.
-audio_input, _ = librosa.load('
+audio_input, _ = librosa.load('./assets/input_examples/audio_understanding.mp3', sr=16000, mono=True) # load the audio to be captioned
 
 msgs = [{'role': 'user', 'content': [task_prompt, audio_input]}]
 
@@ -1331,7 +1339,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='
+    output_audio_path='result_audio_understanding.wav',
 )
 print(res)
 ```
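The audio-understanding hunks pair a task prompt with an audio clip; only two prompts are visible in this diff (the Chinese ASR/AST prompt in the list and the English transcription prompt in the code). A small sketch that keeps them in one table so other clips can be processed the same way; the dictionary, loop, and output filenames are illustrative additions:

```python
# Map the two prompts visible in this diff to task names (illustrative).
import librosa

TASK_PROMPTS = {
    'asr_en': "Please listen to the audio snippet carefully and transcribe the content.\n",
    'asr_zh': "请仔细听这段音频片段,并将其内容逐字记录。\n",  # also used for AST en2zh
}

audio_input, _ = librosa.load('./assets/input_examples/audio_understanding.mp3', sr=16000, mono=True)
for name, prompt in TASK_PROMPTS.items():
    msgs = [{'role': 'user', 'content': [prompt, audio_input]}]
    res = model.chat(
        msgs=msgs,
        use_tts_template=True,
        generate_audio=True,
        temperature=0.3,
        output_audio_path=f'result_{name}.wav',
    )
    print(name, res)
```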
assets/input_examples/assistant_default_female_voice.wav (ADDED)

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ee6f84892e693bd2bb478608db0c9a2459b936af3283697b006cfd969c75484
+size 224044
assets/input_examples/assistant_male_voice.wav (ADDED)

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6b5eff26be104bbfc039f31f8cebcd6f329c275ccafa01234856ec1a964e999
+size 144044
assets/input_examples/icl_20.wav (ADDED)

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53892ece071342958403bc5643f84169a30b89cc0fc79eb69508bfa11dd85e68
+size 618528
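The three `.wav` files above are committed as Git LFS pointers; the `version`/`oid`/`size` triples are the pointer contents, not audio data. If the repository is cloned without `git lfs pull`, `librosa.load` will fail on them. A quick, self-contained check; the helper below is an illustrative addition, not part of the commit:

```python
# Detect .wav files that are still un-pulled Git LFS pointers.
from pathlib import Path

LFS_MAGIC = b'version https://git-lfs.github.com/spec/v1'

def is_lfs_pointer(path: str) -> bool:
    """True if the file still holds LFS pointer text instead of audio data."""
    return Path(path).read_bytes().startswith(LFS_MAGIC)

for wav in ['assets/input_examples/assistant_default_female_voice.wav',
            'assets/input_examples/assistant_male_voice.wav',
            'assets/input_examples/icl_20.wav']:
    if is_lfs_pointer(wav):
        print(f'{wav} is an LFS pointer; run `git lfs pull` to fetch the audio.')
```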