add usage samples to readme
README.md (CHANGED)
@@ -1155,7 +1155,14 @@ model.tts.float()
 
 ```python
 mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
-audio_input, _ = librosa.load('
+audio_input, _ = librosa.load('./assets/input_examples/Trump_WEF_2018_10s.mp3', sr=16000, mono=True) # load the audio to be mimicked
+
+# can also try `./assets/input_examples/cxk_original.wav`,
+# `./assets/input_examples/fast-pace.wav`,
+# `./assets/input_examples/chi-english-1.wav`
+# `./assets/input_examples/exciting-emotion.wav`
+# for different aspects of speech-centric features.
+
 msgs = [{'role': 'user', 'content': [mimick_prompt, audio_input]}]
 res = model.chat(
     msgs=msgs,
@@ -1165,7 +1172,7 @@ res = model.chat(
     use_tts_template=True,
     temperature=0.3,
     generate_audio=True,
-    output_audio_path='
+    output_audio_path='output_mimick.wav', # save the tts result to output_audio_path
 )
 ```
 
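For readers who want to run the mimick sample outside the diff context, here is a minimal end-to-end sketch. The hunks above only show the changed lines, so everything about model loading is an assumption: the checkpoint id `openbmb/MiniCPM-o-2_6`, the `AutoModel`/`trust_remote_code` loading pattern, and `model.init_tts()` follow the usual Hugging Face convention rather than this commit; only the `model.chat(...)` arguments and the `model.tts.float()` context line appear in the hunks.

```python
# Sketch only. Model loading is assumed (not part of this commit's diff);
# the chat arguments mirror the hunks above.
import torch
import librosa
from transformers import AutoModel

model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6',  # assumed checkpoint id
                                  trust_remote_code=True,
                                  torch_dtype=torch.bfloat16).eval().cuda()
model.init_tts()   # assumed TTS init helper
model.tts.float()  # matches the context line of the first hunk

mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
audio_input, _ = librosa.load('./assets/input_examples/Trump_WEF_2018_10s.mp3', sr=16000, mono=True)
msgs = [{'role': 'user', 'content': [mimick_prompt, audio_input]}]
res = model.chat(
    msgs=msgs,
    use_tts_template=True,
    temperature=0.3,
    generate_audio=True,
    output_audio_path='output_mimick.wav',
)
print(res)
```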
@@ -1177,7 +1184,7 @@ A general usage scenario of `MiniCPM-o-2.6` is role-playing a specific character
 
 
 ```python
-ref_audio, _ = librosa.load('./assets/
+ref_audio, _ = librosa.load('./assets/input_examples/icl_20.wav', sr=16000, mono=True) # load the reference audio
 sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_roleplay', language='en')
 
 # round one
@@ -1191,7 +1198,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='
+    output_audio_path='result_roleplay_round_1.wav',
 )
 
 # round two
@@ -1206,7 +1213,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='
+    output_audio_path='result_roleplay_round_2.wav',
 )
 print(res)
 ```
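The role-play hunks mark "# round one" and "# round two" but elide how the dialogue history is threaded between rounds. A plausible continuation of the snippet above, following the message format visible in the hunks (the exact round-two wiring is not shown in this diff, so treat the append pattern and the `round_2.wav` filename as assumptions):

```python
# Hypothetical round-two wiring, continuing from the round-one snippet above.
# 'round_2.wav' is a placeholder filename, not a file shipped in this commit.
import librosa

msgs.append({'role': 'assistant', 'content': [res]})  # assumed: feed the reply back as an assistant turn
audio_round_2, _ = librosa.load('round_2.wav', sr=16000, mono=True)
msgs.append({'role': 'user', 'content': [audio_round_2]})
res = model.chat(
    msgs=msgs,
    use_tts_template=True,
    generate_audio=True,
    temperature=0.3,
    output_audio_path='result_roleplay_round_2.wav',
)
```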
@@ -1215,11 +1222,12 @@ print(res)
 
 #### Speech Conversation as an AI Assistant
 
-An enhanced feature of `MiniCPM-o-2.6` is to act as an AI assistant, but only with limited choice of voices. In this mode, `MiniCPM-o-2.6` is **less human-like and more like a voice assistant**.
+An enhanced feature of `MiniCPM-o-2.6` is to act as an AI assistant, but only with a limited choice of voices. In this mode, `MiniCPM-o-2.6` is **less human-like and more like a voice assistant**, and it follows instructions more closely. For the demo, we suggest `assistant_default_female_voice` or `assistant_male_voice`; other voices may work, but are not as stable as the defaults.
 
 ```python
+ref_audio, _ = librosa.load('./assets/input_examples/assistant_default_female_voice.wav', sr=16000, mono=True) # or use `./assets/input_examples/assistant_male_voice.wav`
 sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_assistant', language='en')
-user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]}
+user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # load the user's audio question
 
 # round one
 msgs = [sys_prompt, user_question]
@@ -1231,7 +1239,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='
+    output_audio_path='result_assistant_round_1.wav',
 )
 
 # round two
@@ -1246,7 +1254,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='
+    output_audio_path='result_assistant_round_2.wav',
 )
 print(res)
 ```
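Each call above writes its TTS result to `output_audio_path`. A small, self-contained way to sanity-check the generated files; `soundfile` is not referenced anywhere in this commit, so consider the snippet an illustrative assumption:

```python
# Inspect the waveforms produced by the assistant rounds above.
import soundfile as sf

for path in ['result_assistant_round_1.wav', 'result_assistant_round_2.wav']:
    audio, sr = sf.read(path)  # returns (samples, sample_rate)
    print(f'{path}: {len(audio) / sr:.1f}s at {sr} Hz')
```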
@@ -1272,7 +1280,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='
+    output_audio_path='result_voice_creation.wav',
 )
 ```
 
@@ -1284,6 +1292,7 @@ res = model.chat(
 
 
 ```python
+ref_audio, _ = librosa.load('./assets/input_examples/icl_20.wav', sr=16000, mono=True) # load the reference audio
 sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
 text_prompt = f"Please read the text below."
 user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]}
@@ -1297,7 +1306,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='
+    output_audio_path='result_voice_cloning.wav',
 )
 
 ```
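Because the cloning system prompt is built once from the reference audio, it can be reused across several read-aloud requests. A short sketch under that assumption, continuing from the snippet above; the loop and output filenames are illustrative, not part of this commit:

```python
# Reuse the cloned voice for several lines of text (illustrative loop).
lines = ["First sentence to read.", "Second sentence to read."]
for i, line in enumerate(lines):
    msgs = [sys_prompt, {'role': 'user', 'content': [text_prompt, line]}]
    model.chat(
        msgs=msgs,
        use_tts_template=True,
        generate_audio=True,
        temperature=0.3,
        output_audio_path=f'result_voice_cloning_{i}.wav',
    )
```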
@@ -1308,7 +1317,6 @@ res = model.chat(
 
 `MiniCPM-o-2.6` can also be used to address various audio understanding tasks, such as ASR, speaker analysis, general audio captioning, and sound scene tagging.
 
-
 For audio-to-text tasks, you can use the following prompts:
 
 - ASR with ZH(same as AST en2zh): `请仔细听这段音频片段,并将其内容逐字记录。`
@@ -1319,7 +1327,7 @@ For audio-to-text tasks, you can use the following prompts:
 
 ```python
 task_prompt = "Please listen to the audio snippet carefully and transcribe the content." + "\n" # can change to other prompts.
-audio_input, _ = librosa.load('
+audio_input, _ = librosa.load('./assets/input_examples/audio_understanding.mp3', sr=16000, mono=True) # load the audio to be captioned
 
 msgs = [{'role': 'user', 'content': [task_prompt, audio_input]}]
 
@@ -1331,7 +1339,7 @@ res = model.chat(
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
-    output_audio_path='
+    output_audio_path='result_audio_understanding.wav',
 )
 print(res)
 ```
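The audio-understanding hunks pair a task prompt with an audio clip; only two prompts are visible in this diff (the Chinese ASR/AST prompt in the list and the English transcription prompt in the code). A small sketch that keeps them in one table so other clips can be processed the same way; the dictionary, loop, and output filenames are illustrative additions:

```python
# Map the two prompts visible in this diff to task names (illustrative).
import librosa

TASK_PROMPTS = {
    'asr_en': "Please listen to the audio snippet carefully and transcribe the content.\n",
    'asr_zh': "请仔细听这段音频片段,并将其内容逐字记录。\n",  # also used for AST en2zh
}

audio_input, _ = librosa.load('./assets/input_examples/audio_understanding.mp3', sr=16000, mono=True)
for name, prompt in TASK_PROMPTS.items():
    msgs = [{'role': 'user', 'content': [prompt, audio_input]}]
    res = model.chat(
        msgs=msgs,
        use_tts_template=True,
        generate_audio=True,
        temperature=0.3,
        output_audio_path=f'result_{name}.wav',
    )
    print(name, res)
```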
assets/input_examples/assistant_default_female_voice.wav (ADDED)

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ee6f84892e693bd2bb478608db0c9a2459b936af3283697b006cfd969c75484
+size 224044
assets/input_examples/assistant_male_voice.wav (ADDED)

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6b5eff26be104bbfc039f31f8cebcd6f329c275ccafa01234856ec1a964e999
+size 144044
assets/input_examples/icl_20.wav (ADDED)

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53892ece071342958403bc5643f84169a30b89cc0fc79eb69508bfa11dd85e68
+size 618528
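The three `.wav` files above are committed as Git LFS pointers; the `version`/`oid`/`size` triples are the pointer contents, not audio data. If the repository is cloned without `git lfs pull`, `librosa.load` will fail on them. A quick, self-contained check; the helper below is an illustrative addition, not part of the commit:

```python
# Detect .wav files that are still un-pulled Git LFS pointers.
from pathlib import Path

LFS_MAGIC = b'version https://git-lfs.github.com/spec/v1'

def is_lfs_pointer(path: str) -> bool:
    """True if the file still holds LFS pointer text instead of audio data."""
    return Path(path).read_bytes().startswith(LFS_MAGIC)

for wav in ['assets/input_examples/assistant_default_female_voice.wav',
            'assets/input_examples/assistant_male_voice.wav',
            'assets/input_examples/icl_20.wav']:
    if is_lfs_pointer(wav):
        print(f'{wav} is an LFS pointer; run `git lfs pull` to fetch the audio.')
```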