|
|
@ -39,6 +39,9 @@ data_info = {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def should_skip(fpath: Path, skip_existing: bool) -> bool:
    """Decide whether an output file should be skipped.

    Returns True only when skipping of existing outputs was requested and
    the file is already present on disk; otherwise False.
    """
    if not skip_existing:
        return False
    return fpath.exists()
|
|
|
|
|
|
|
|
|
|
|
|
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
|
|
|
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
|
|
|
skip_existing: bool, hparams, no_alignments: bool,
|
|
|
|
skip_existing: bool, hparams, no_alignments: bool,
|
|
|
|
dataset: str, emotion_extract = False, encoder_model_fpath=None):
|
|
|
|
dataset: str, emotion_extract = False, encoder_model_fpath=None):
|
|
|
@ -99,7 +102,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
|
|
|
|
print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
|
|
|
|
print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
|
|
|
|
print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
|
|
|
|
print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
|
|
|
|
|
|
|
|
|
|
|
|
def embed_utterance(fpaths, encoder_model_fpath):
|
|
|
|
def _embed_utterance(fpaths: str, encoder_model_fpath: str):
|
|
|
|
if not encoder.is_loaded():
|
|
|
|
if not encoder.is_loaded():
|
|
|
|
encoder.load_model(encoder_model_fpath)
|
|
|
|
encoder.load_model(encoder_model_fpath)
|
|
|
|
|
|
|
|
|
|
|
@ -110,15 +113,13 @@ def embed_utterance(fpaths, encoder_model_fpath):
|
|
|
|
embed = encoder.embed_utterance(wav)
|
|
|
|
embed = encoder.embed_utterance(wav)
|
|
|
|
np.save(embed_fpath, embed, allow_pickle=False)
|
|
|
|
np.save(embed_fpath, embed, allow_pickle=False)
|
|
|
|
|
|
|
|
|
|
|
|
def _emo_extract_from_utterance(fpaths, hparams, skip_existing=False):
    """Extract emotion features for one utterance and save them to disk.

    Args:
        fpaths: a ``(wav_fpath, emo_fpath)`` pair — the preprocessed-audio
            ``.npy`` input path and the target ``.npy`` output path.
        hparams: hyperparameter object; only ``sample_rate`` is read here.
        skip_existing: when True, return early if the output file already
            exists (kept with a default so both old and new callers work).
    """
    wav_fpath, emo_fpath = fpaths
    # Bug fix: the earlier version called `.exists()` on the (wav, emo)
    # tuple itself, which would raise AttributeError; test the actual
    # output path instead.
    if skip_existing and emo_fpath.exists():
        return
    wav = np.load(wav_fpath)
    # extract_emo expects a batch dimension; add it here, drop it on save.
    emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
    np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
|
|
|
|
|
|
|
|
|
|
|
|
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
|
|
|
|
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int, skip_existing: bool):
|
|
|
|
wav_dir = synthesizer_root.joinpath("audio")
|
|
|
|
wav_dir = synthesizer_root.joinpath("audio")
|
|
|
|
metadata_fpath = synthesizer_root.joinpath("train.txt")
|
|
|
|
metadata_fpath = synthesizer_root.joinpath("train.txt")
|
|
|
|
assert wav_dir.exists() and metadata_fpath.exists()
|
|
|
|
assert wav_dir.exists() and metadata_fpath.exists()
|
|
|
@ -128,11 +129,11 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
|
|
|
|
# Gather the input wave filepath and the target output embed filepath
|
|
|
|
# Gather the input wave filepath and the target output embed filepath
|
|
|
|
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
|
|
|
|
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
|
|
|
|
metadata = [line.split("|") for line in metadata_file]
|
|
|
|
metadata = [line.split("|") for line in metadata_file]
|
|
|
|
fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata]
|
|
|
|
fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata if not should_skip(embed_dir.joinpath(m[2]), skip_existing)]
|
|
|
|
|
|
|
|
|
|
|
|
# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
|
|
|
|
# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
|
|
|
|
# Embed the utterances in separate threads
|
|
|
|
# Embed the utterances in separate threads
|
|
|
|
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
|
|
|
|
func = partial(_embed_utterance, encoder_model_fpath=encoder_model_fpath)
|
|
|
|
job = Pool(n_processes).imap(func, fpaths)
|
|
|
|
job = Pool(n_processes).imap(func, fpaths)
|
|
|
|
tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
|
|
|
tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
|
|
|
|
|
|
|
|
|
|
@ -142,14 +143,14 @@ def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hp
|
|
|
|
assert wav_dir.exists() and metadata_fpath.exists()
|
|
|
|
assert wav_dir.exists() and metadata_fpath.exists()
|
|
|
|
emo_dir = synthesizer_root.joinpath("emo")
|
|
|
|
emo_dir = synthesizer_root.joinpath("emo")
|
|
|
|
emo_dir.mkdir(exist_ok=True)
|
|
|
|
emo_dir.mkdir(exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
# Gather the input wave filepath and the target output embed filepath
|
|
|
|
# Gather the input wave filepath and the target output embed filepath
|
|
|
|
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
|
|
|
|
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
|
|
|
|
metadata = [line.split("|") for line in metadata_file]
|
|
|
|
metadata = [line.split("|") for line in metadata_file]
|
|
|
|
fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata]
|
|
|
|
fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata if not should_skip(emo_dir.joinpath(m[0].replace("audio-", "emo-")), skip_existing)]
|
|
|
|
|
|
|
|
|
|
|
|
# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
|
|
|
|
# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
|
|
|
|
# Embed the utterances in separate threads
|
|
|
|
# Embed the utterances in separate threads
|
|
|
|
func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
|
|
|
|
func = partial(_emo_extract_from_utterance, hparams=hparams)
|
|
|
|
job = Pool(n_processes).imap(func, fpaths)
|
|
|
|
job = Pool(n_processes).imap(func, fpaths)
|
|
|
|
tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))
|
|
|
|
tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))
|
|
|
|