|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from __future__ import print_function |
|
import argparse |
|
import datetime |
|
import logging |
|
logging.getLogger('matplotlib').setLevel(logging.WARNING) |
|
from copy import deepcopy |
|
import torch |
|
import torch.distributed as dist |
|
|
|
import pdb |
|
from hyperpyyaml import load_hyperpyyaml |
|
|
|
from torch.distributed.elastic.multiprocessing.errors import record |
|
|
|
from cosyvoice.utils.executor import Executor |
|
from cosyvoice.utils.train_utils import ( |
|
init_distributed, |
|
init_dataset_and_dataloader, |
|
init_optimizer_and_scheduler, |
|
init_summarywriter, save_model, |
|
wrap_cuda_model, check_modify_and_save_config) |
|
|
|
|
|
def get_args():
    """Parse and return the command-line arguments for a training run."""
    arg_parser = argparse.ArgumentParser(description='training your network')
    # Distributed-training engine selection.
    arg_parser.add_argument('--train_engine',
                            default='torch_ddp',
                            choices=['torch_ddp', 'deepspeed'],
                            help='Engine for paralleled training')
    # Required experiment inputs / outputs.
    arg_parser.add_argument('--model', required=True, help='model which will be trained')
    arg_parser.add_argument('--config', required=True, help='config file')
    arg_parser.add_argument('--train_data', required=True, help='train data file')
    arg_parser.add_argument('--cv_data', required=True, help='cv data file')
    arg_parser.add_argument('--checkpoint', help='checkpoint model')
    arg_parser.add_argument('--model_dir', required=True, help='save model dir')
    arg_parser.add_argument('--tensorboard_dir',
                            default='tensorboard',
                            help='tensorboard log dir')
    # DDP backend; parsed value lands on args.dist_backend via dest=.
    arg_parser.add_argument('--ddp.dist_backend',
                            dest='dist_backend',
                            default='nccl',
                            choices=['nccl', 'gloo'],
                            help='distributed backend')
    # DataLoader tuning knobs.
    arg_parser.add_argument('--num_workers',
                            default=0,
                            type=int,
                            help='num of subprocess workers for reading')
    arg_parser.add_argument('--prefetch',
                            default=100,
                            type=int,
                            help='prefetch number')
    arg_parser.add_argument('--pin_memory',
                            action='store_true',
                            default=False,
                            help='Use pinned memory buffers used for reading')
    # DeepSpeed checkpointing policy; lands on args.save_states via dest=.
    arg_parser.add_argument('--deepspeed.save_states',
                            dest='save_states',
                            default='model_only',
                            choices=['model_only', 'model+optimizer'],
                            help='save model/optimizer states')
    arg_parser.add_argument('--timeout',
                            default=30,
                            type=int,
                            help='timeout (in seconds) of cosyvoice_join.')
    return arg_parser.parse_args()
|
|
|
|
|
@record
def main():
    """Entry point for training a single CosyVoice sub-model.

    Parses CLI args, loads the hyperpyyaml config (nulling out the model
    sections that are not being trained), initializes distributed training,
    data loaders, optimizer/scheduler and the summary writer, then runs the
    epoch loop up to ``train_conf['max_epoch']``.
    """
    args = get_args()
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')

    # Override the config sections for the models we are NOT training to None
    # so hyperpyyaml does not instantiate them; args.model is expected to be
    # one of 'llm'/'flow'/'hift'.
    override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model}
    with open(args.config, 'r') as f:
        configs = load_hyperpyyaml(f, overrides=override_dict)
    # Expose every CLI flag to downstream helpers through train_conf.
    configs['train_conf'].update(vars(args))

    # Init env and distributed process group.
    init_distributed(args)

    # Datasets and loaders (train + cross-validation).
    train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
        init_dataset_and_dataloader(args, configs)

    # Sanity-check config; presumably also writes it under args.model_dir —
    # see check_modify_and_save_config.
    configs = check_modify_and_save_config(args, configs)

    # Tensorboard summary writer.
    writer = init_summarywriter(args)

    # Select the model being trained and optionally restore a checkpoint.
    model = configs[args.model]
    if args.checkpoint is not None:
        # NOTE(review): torch.load without weights_only=True unpickles
        # arbitrary objects — only load trusted checkpoints.
        model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))

    # Move/wrap the model for CUDA (DDP or DeepSpeed per args.train_engine).
    model = wrap_cuda_model(args, model)

    # Optimizer & LR scheduler.
    model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model)

    # Save an 'init' checkpoint before the first epoch.
    info_dict = deepcopy(configs['train_conf'])
    save_model(model, 'init', info_dict)

    executor = Executor()

    for epoch in range(info_dict['max_epoch']):
        executor.epoch = epoch
        # Reshuffle/repartition the dataset for this epoch.
        train_dataset.set_epoch(epoch)
        dist.barrier()

        # Fresh gloo side-group per epoch, used for the uneven-workload join
        # logic; args.timeout bounds how long ranks wait on each other.
        group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
        # NOTE(review): 'train_one_epoc' (sic) is the method name exposed by
        # Executor — do not correct the spelling here without renaming it
        # in the executor module as well.
        executor.train_one_epoc(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join)
        dist.destroy_process_group(group_join)
|
|
|
# Script entry point (guarded so the module can be imported without training).
if __name__ == '__main__':
    main()
|
|