|
--- |
|
license: mit |
|
language: |
|
- en |
|
tags: |
|
- audio |
|
- text-to-speech |
|
- matcha-tts |
|
--- |
|
# Matcha-TTS CommonVoice EN001 |
|
|
|
## Source Audio |
|
https://commonvoice.mozilla.org/en/datasets |
|
Common Voice Corpus 1 |
|
|
|
I call this set of 290 audio files EN001 (head audio ID: 42da7f26).

(There is no plan to include the audio files in this repo.)
|
## Any Good Points?
|
LJSpeech is of much higher quality, but it is a female voice.
|
|
|
The 107 VCTK voices are of similar quality, but VCTK is under the ODC-By license.
|
|
|
This audio set is simply under MIT, which makes it easier to continue training or build on it.
|
|
|
However, I recommend you use VCTK; the ODC-By license is not much of a problem. I'm going to create new voices with this.
|
## How to Train |
|
Train with IPA text (using this fork):
|
https://github.com/akjava/Matcha-TTS-Japanese |
|
|
|
Check this repo's config files.

However, there are no audio copy tools yet (TODO). A training command sketch follows below.
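If this fork keeps the upstream Matcha-TTS training entry point, a training run would look roughly like the sketch below; the experiment name is a placeholder, so substitute one of the config files from this repo.

```

python matcha/train.py experiment=<config-from-this-repo>

```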
|
|
|
## Files Info |
|
### Checkpoints
|
Matcha-TTS checkpoints - the epoch numbers look large, but training used only 290 audio files.

See the training metrics.
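To sanity-check a checkpoint, the upstream matcha-tts CLI can synthesize directly from it; a rough example (the input text is only illustrative, and the checkpoint filename follows the pattern used in the export commands below):

```

matcha-tts --text "This is a test." --checkpoint_path checkpoint_epoch=5699.ckpt

```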
|
### ONNX |
|
|
|
The simplified ONNX model now loads about 1.5 times faster.
|
``` |
|
from onnxsim import simplify |
|
import onnx |
|
|
|
model = onnx.load("en001_6399_T2.onnx") |
|
model_simp, check = simplify(model) |
|
|
|
onnx.save(model_simp, "en001_6399_T2_simplify.onnx") |
|
``` |
|
|
|
The number of timesteps is the default (5). With fewer timesteps, inference is somewhat faster, but the quality is lower.
|
|
|
If you need the original (non-simplified) ONNX, export it the official way:
|
``` |
|
python -m matcha.onnx.export checkpoint_epoch=5699.ckpt en001_5699t2.onnx --vocoder-name hifigan_T2_v1 --n-timesteps 5 --vocoder-checkpoint generator_v1 |
|
python -m matcha.onnx.export checkpoint_epoch=5699.ckpt en001_5699.onnx --vocoder-name hifigan_univ_v1 --n-timesteps 5 --vocoder-checkpoint g_02500000 |
|
``` |
|
|
|
- T2 means the vocoder is hifigan_T2_v1

- Univ means the vocoder is hifigan_univ_v1
|
|
|
You can quantize this ONNX model; it becomes about 3 times smaller but 4-5 times slower, which is why I didn't include it.
|
``` |
|
from onnxruntime.quantization import quantize_dynamic, QuantType

# src_model_path / dst_model_path are file paths; the quantized model is written to dst_model_path
quantize_dynamic(src_model_path, dst_model_path, weight_type=QuantType.QUInt8)
|
``` |
|
|
|
|
|
Using the ONNX model in the browser needs some glue code; below is an old sample. It assumes onnxruntime-web is loaded and available as the global `ort`.
|
``` |
|
const _pad = "_"; |
|
const _punctuation = ";:,.!?¡¿—…\"«»“” "; |
|
const _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; |
|
const _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"; |
|
|
|
// spread syntax expands each string into its individual characters (symbols)
|
const Symbols = [_pad, ..._punctuation, ..._letters, ..._letters_ipa]; |
|
|
|
const SpaceId = Symbols.indexOf(' '); |
|
|
|
const symbolToId = {}; |
|
const idToSymbol = {}; |
|
|
|
// initialize symbolToId and idToSymbol |
|
for (let i = 0; i < Symbols.length; i++) { |
|
symbolToId[Symbols[i]] = i; |
|
idToSymbol[i] = Symbols[i]; |
|
} |
|
|
|
class MatchaOnnx { |
|
constructor() { |
|
} |
|
async load_model(model_path,options={}){ |
|
this.session = await ort.InferenceSession.create(model_path,options); |
|
} |
|
|
|
get_output_names_html(){ |
|
if (typeof this.session=='undefined'){ |
|
return null |
|
} |
|
let outputNamesString = '[outputs]<br>'; |
|
const outputNames = this.session.outputNames; |
|
for (let outputName of outputNames) { |
|
console.log(outputName) |
|
outputNamesString+=outputName+"<br>" |
|
} |
|
return outputNamesString.trim() |
|
} |
|
|
|
get_input_names_html(){ |
|
if (typeof this.session=='undefined'){ |
|
return null |
|
} |
|
|
|
let inputNamesString = '[Inputs]<br>'; |
|
const inputNames = this.session.inputNames; |
|
|
|
for (let inputName of inputNames) { |
|
console.log(inputName) |
|
inputNamesString+=inputName+"<br>" |
|
} |
|
return inputNamesString.trim() |
|
} |
|
|
|
|
|
processText(text) { |
|
const x = this.intersperse(this.textToSequence(text)); |
|
const x_phones = this.sequenceToText(x); |
|
const textList = []; |
|
for (let i = 1; i < x_phones.length; i += 2) { |
|
textList.push(x_phones[i]); |
|
} |
|
|
|
return { |
|
x: x, |
|
x_length: x.length, |
|
x_phones: x_phones, |
|
x_phones_label: textList.join(""), |
|
}; |
|
} |
|
|
|
|
|
basicCleaners2(text, lowercase = false) { |
|
if (lowercase) { |
|
text = text.toLowerCase(); |
|
} |
|
text = text.replace(/\s+/g, " "); |
|
return text; |
|
} |
|
|
|
textToSequence(text) { |
|
const sequenceList = []; |
|
const clean_text = this.basicCleaners2(text); |
|
for (let i = 0; i < clean_text.length; i++) { |
|
const symbol = clean_text[i]; |
|
sequenceList.push(symbolToId[symbol]); |
|
} |
|
return sequenceList; |
|
} |
|
|
|
intersperse(sequence, item = 0) { |
|
const sequenceList = [item]; |
|
for (let i = 0; i < sequence.length; i++) { |
|
sequenceList.push(sequence[i]); |
|
sequenceList.push(item); |
|
} |
|
return sequenceList; |
|
} |
|
|
|
sequenceToText(sequence) { |
|
const textList = []; |
|
for (let i = 0; i < sequence.length; i++) { |
|
const symbol = idToSymbol[sequence[i]]; |
|
textList.push(symbol); |
|
} |
|
return textList.join(""); |
|
} |
|
|
|
async infer(text, temperature, speed) { |
|
console.log(this.session) |
|
const dic = this.processText(text); |
|
console.log(`x:${dic.x.join(", ")}`); |
|
console.log(`x_length:${dic.x_length}`); |
|
console.log(`x_phones_label:${dic.x_phones_label}`); |
|
|
|
// Prepare input tensors (assuming your ONNX Runtime library uses similar syntax) |
|
//const x_tensor = new this.session.Tensor('long', dic.x, [1, dic.x.length]); |
|
//const x_length_tensor = new this.session.Tensor('long', [dic.x.length], [1]); |
|
//const scales_tensor = new this.session.Tensor('float', [temperature, speed], [2]); |
|
|
|
const dataX = new BigInt64Array(dic.x.length) |
|
for (let i = 0; i < dic.x.length; i++) { |
|
//console.log(dic.x[i]) |
|
dataX[i] = BigInt(dic.x[i]); // Convert each number to a BigInt |
|
} |
|
const data_x_length = new BigInt64Array(1) |
|
data_x_length[0] = BigInt(dic.x_length) |
|
|
|
//const dataX = Int32Array.from([dic.x_length]) |
|
const tensorX = new ort.Tensor('int64', dataX, [1, dic.x.length]); |
|
// const data_x_length = Int32Array.from([dic.x_length]) |
|
const tensor_x_length = new ort.Tensor('int64', data_x_length, [1]); |
|
const data_scale = Float32Array.from( [temperature, speed]) |
|
const tensor_scale = new ort.Tensor('float32', data_scale, [2]); |
|
|
|
|
|
// Run inference |
|
const output = await this.session.run({ |
|
x: tensorX, |
|
x_lengths: tensor_x_length, |
|
scales: tensor_scale, |
|
}); |
|
console.log(output) |
|
// Extract output (assuming your ONNX Runtime library uses similar syntax) |
|
const wav_array = output.wav.data; |
|
console.log(wav_array[0]); |
|
console.log(wav_array.length); |
|
|
|
const x_lengths_array = output.wav_lengths.data; |
|
console.log(x_lengths_array.join(", ")); |
|
|
|
return wav_array; |
|
} |
|
|
|
|
|
} |
|
``` |
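A minimal usage sketch for the class above; the model filename matches the simplified ONNX from this repo, while the session options and the temperature/speed values are assumptions:

```

// assumes onnxruntime-web is already loaded and exposed as the global `ort`
const matcha = new MatchaOnnx();
await matcha.load_model("en001_6399_T2_simplify.onnx", { executionProviders: ["wasm"] });

// the text must already be converted to IPA symbols that exist in the table above
const wav = await matcha.infer("həloʊ wɜːld.", 0.667, 1.0); // temperature, speed
console.log(`generated ${wav.length} samples`);

```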
|
Convert the output to WAV and play it:
|
``` |
|
|
|
|
|
function webWavPlay(f32array){ |
|
const blob = float32ArrayToWav(f32array)

const url = createObjectUrlFromBlob(blob)
|
console.log(url) |
|
playAudioFromUrl(url) |
|
} |
|
|
|
function createObjectUrlFromBlob(blob) { |
|
const url = URL.createObjectURL(blob); |
|
return url; |
|
} |
|
|
|
function playAudioFromUrl(url) { |
|
const audio = new Audio(url); |
|
audio.play().catch(error => console.error('Failed to play audio:', error)); |
|
} |
|
|
|
|
|
//I copied |
|
//https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de/blob/main/app-tts.js |
|
// this function is copied/modified from |
|
// https://gist.github.com/meziantou/edb7217fddfbb70e899e |
|
function float32ArrayToWav(floatSamples, sampleRate=22050) { |
|
let samples = new Int16Array(floatSamples.length); |
|
for (let i = 0; i < samples.length; ++i) { |
|
let s = floatSamples[i]; |
|
if (s >= 1) |
|
s = 1; |
|
else if (s <= -1) |
|
s = -1; |
|
|
|
samples[i] = s * 32767; |
|
} |
|
|
|
let buf = new ArrayBuffer(44 + samples.length * 2); |
|
var view = new DataView(buf); |
|
|
|
// http://soundfile.sapp.org/doc/WaveFormat/ |
|
// F F I R |
|
view.setUint32(0, 0x46464952, true); // chunkID |
|
view.setUint32(4, 36 + samples.length * 2, true); // chunkSize |
|
// E V A W |
|
view.setUint32(8, 0x45564157, true); // format |
|
// |
|
// t m f |
|
view.setUint32(12, 0x20746d66, true); // subchunk1ID |
|
view.setUint32(16, 16, true); // subchunk1Size, 16 for PCM |
|
view.setUint32(20, 1, true); // audioFormat, 1 for PCM |
|
view.setUint16(22, 1, true); // numChannels: 1 channel |
|
view.setUint32(24, sampleRate, true); // sampleRate |
|
view.setUint32(28, sampleRate * 2, true); // byteRate |
|
view.setUint16(32, 2, true); // blockAlign |
|
view.setUint16(34, 16, true); // bitsPerSample |
|
view.setUint32(36, 0x61746164, true); // Subchunk2ID |
|
view.setUint32(40, samples.length * 2, true); // subchunk2Size |
|
|
|
let offset = 44; |
|
for (let i = 0; i < samples.length; ++i) { |
|
view.setInt16(offset, samples[i], true); |
|
offset += 2; |
|
} |
|
|
|
return new Blob([view], {type: 'audio/wav'}); |
|
} |
|
``` |
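Putting the two samples together, the Float32Array returned by infer can be played directly (a sketch; browsers may require a user gesture before audio playback):

```

const wav = await matcha.infer("həloʊ wɜːld.", 0.667, 1.0);
webWavPlay(wav); // wraps the samples in a 22050 Hz mono WAV and plays it

```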
|
### Audio |
|
I cut the source audio with VAD tools and denoised it with resemble-enhance.
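For reference, the pip-installed resemble-enhance package ships a CLI that processes a directory of WAV files; a denoise-only pass would look roughly like this (command form and flag taken from the resemble-enhance README, so treat it as an assumption):

```

resemble-enhance input_dir output_dir --denoise_only

```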
|
|