MockingBird/models/synthesizer/preprocess.py

from multiprocessing.pool import Pool 

from functools import partial
from itertools import chain
from pathlib import Path
from tqdm import tqdm
import numpy as np
from models.encoder import inference as encoder
from models.synthesizer.preprocess_audio import preprocess_general, extract_emo
from models.synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata

# Registry of supported datasets, keyed by the dataset folder name expected
# under the datasets root. Each entry provides:
#   "subfolders":      directories (relative to the dataset folder) whose
#                      children are handed to "speak_func" one at a time.
#   "trans_filepath":  transcript file, relative to the dataset folder.
#   "speak_func":      callable that preprocesses one such directory.
#   "transcript_func": optional callable(dict_info, file) that fills the
#                      transcript dictionary; when absent, preprocess_dataset
#                      falls back to its generic line parser.
data_info = {
    "aidatatang_200zh": {
        "subfolders": ["corpus/train"],
        "trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
        "speak_func": preprocess_general,
    },
    "aidatatang_200zh_s": {
        "subfolders": ["corpus/train"],
        "trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",
        "speak_func": preprocess_general,
    },
    "magicdata": {
        "subfolders": ["train"],
        "trans_filepath": "train/TRANS.txt",
        "speak_func": preprocess_general,
        "transcript_func": preprocess_transcript_magicdata,
    },
    "aishell3": {
        "subfolders": ["train/wav"],
        "trans_filepath": "train/content.txt",
        "speak_func": preprocess_general,
        "transcript_func": preprocess_transcript_aishell3,
    },
    "data_aishell": {
        "subfolders": ["wav/train"],
        "trans_filepath": "transcript/aishell_transcript_v0.8.txt",
        "speak_func": preprocess_general,
    },
}

def should_skip(fpath: Path, skip_existing: bool) -> bool:
    """Tell whether the output at *fpath* can be skipped.

    An output is skipped only when skipping was requested and the file is
    already present on disk.
    """
    if skip_existing:
        return fpath.exists()
    # Skipping was not requested: hand the (falsy) flag back unchanged.
    return skip_existing

def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
                           skip_existing: bool, hparams, no_alignments: bool, 
                           dataset: str, emotion_extract = False, encoder_model_fpath=None):
    """Preprocess one registered dataset and write its metadata to ``train.txt``.

    Args:
        datasets_root: Directory that contains the ``dataset`` folder.
        out_dir: Existing output directory; ``mels``, ``audio`` (and ``emo``
            when ``emotion_extract`` is true) are created inside it.
        n_processes: Number of worker processes for the pool.
        skip_existing: When true, ``train.txt`` is appended to instead of
            overwritten; the flag is also forwarded to the speaker function.
        hparams: Hyper-parameters object; ``hparams.sample_rate`` is read here
            and the object is forwarded to the speaker function.
        no_alignments: Forwarded to the speaker function.
        dataset: Key into ``data_info`` selecting the dataset layout.
        emotion_extract: When true, the ``emo`` output directory is created.
        encoder_model_fpath: Forwarded to the speaker function.

    Raises:
        KeyError: If ``dataset`` is not a key of ``data_info``.
        AssertionError: If an input directory or the transcript file is missing.
    """
    dataset_info = data_info[dataset]
    # Gather the input directories
    dataset_root = datasets_root.joinpath(dataset)
    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]
    print("\n    ".join(map(str, ["Using data from:"] + input_dirs)))
    assert all(input_dir.exists() for input_dir in input_dirs)

    # Create the output directories for each output file type
    out_dir.joinpath("mels").mkdir(exist_ok=True)
    out_dir.joinpath("audio").mkdir(exist_ok=True)
    if emotion_extract:
        out_dir.joinpath("emo").mkdir(exist_ok=True)

    # Load the transcripts into a {utterance id: text} dictionary. This happens
    # before the metadata file is opened so that a missing transcript cannot
    # truncate an existing train.txt.
    dict_info = {}
    transcript_fpath = dataset_root.joinpath(dataset_info["trans_filepath"])
    assert transcript_fpath.exists(), str(transcript_fpath)+" not exist."
    with open(transcript_fpath, "r", encoding="utf-8") as transcript_file:
        if "transcript_func" in dataset_info:
            # Dataset-specific transcript parser
            dataset_info["transcript_func"](dict_info, transcript_file)
        else:
            # Generic format: "<id><space or tab><text...>" on each line
            for line in transcript_file:
                # Strip first: raw lines still carry "\n", so blank lines are
                # only detectable after stripping.
                line = line.strip()
                if not line:
                    continue
                fields = line.replace("\t", " ").split(" ")
                dict_info[fields[0]] = " ".join(fields[1:])

    speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))

    func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing, 
                   hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)

    # Both the metadata file and the worker pool are context-managed so they
    # are released even if a worker raises. Leaving the Pool block terminates
    # the workers, which is safe because every result has been consumed by then.
    metadata_fpath = out_dir.joinpath("train.txt")
    with metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8") as metadata_file:
        with Pool(n_processes) as pool:
            job = pool.imap_unordered(func, speaker_dirs)
            for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
                if speaker_metadata is not None:
                    for metadatum in speaker_metadata:
                        metadata_file.write("|".join(map(str,metadatum)) + "\n")

    # Verify the contents of the metadata file
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        # Drop the line terminator so it is not counted as part of the last
        # field, and ignore blank lines.
        metadata = [line.rstrip("\n").split("|") for line in metadata_file if line.strip()]
    timestep_counts = [int(m[3]) for m in metadata]
    mel_frame_counts = [int(m[4]) for m in metadata]
    text_lengths = [len(m[5]) for m in metadata]
    mel_frames = sum(mel_frame_counts)
    timesteps = sum(timestep_counts)
    sample_rate = hparams.sample_rate
    hours = (timesteps / sample_rate) / 3600
    print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
          (len(metadata), mel_frames, timesteps, hours))
    # default=0 keeps the report from crashing when no utterance was produced
    print("Max input length (text chars): %d" % max(text_lengths, default=0))
    print("Max mel frames length: %d" % max(mel_frame_counts, default=0))
    print("Max audio timesteps length: %d" % max(timestep_counts, default=0))

def _embed_utterance(fpaths: str, encoder_model_fpath: str):
    """Compute and save the speaker embedding of a single utterance.

    Despite its annotation, ``fpaths`` is unpacked as a pair
    ``(wav_fpath, embed_fpath)``: the ``.npy`` waveform to read and the
    destination of the embedding. The encoder model is loaded from
    ``encoder_model_fpath`` only if the encoder reports it is not loaded yet.
    """
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    source_fpath, target_fpath = fpaths
    # Load -> encoder preprocessing -> embedding, then persist the result
    waveform = encoder.preprocess_wav(np.load(source_fpath))
    embedding = encoder.embed_utterance(waveform)
    np.save(target_fpath, embedding, allow_pickle=False)
    
def _emo_extract_from_utterance(fpaths, hparams):
    """Extract the emotion features of one utterance and save them.

    ``fpaths`` is a pair ``(wav_fpath, emo_fpath)``: the ``.npy`` waveform to
    read and the destination file. ``hparams.sample_rate`` is passed on to
    ``extract_emo``.
    """
    source_fpath, target_fpath = fpaths
    waveform = np.load(source_fpath)
    # extract_emo is fed the waveform with a leading axis added; that axis is
    # removed again from its output before saving.
    batched = np.expand_dims(waveform, 0)
    features = extract_emo(batched, hparams.sample_rate, True)
    np.save(target_fpath, features.squeeze(0), allow_pickle=False)
 
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int, skip_existing: bool):
    """Create a speaker embedding file for every utterance listed in ``train.txt``.

    Args:
        synthesizer_root: Directory containing ``audio`` and ``train.txt``;
            an ``embeds`` directory is created inside it.
        encoder_model_fpath: Encoder model path handed to each worker.
        n_processes: Number of worker processes for the pool.
        skip_existing: When true, utterances whose embedding file already
            exists are not recomputed.

    Raises:
        AssertionError: If the ``audio`` directory or ``train.txt`` is missing.
    """
    wav_dir = synthesizer_root.joinpath("audio")
    metadata_fpath = synthesizer_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
    embed_dir = synthesizer_root.joinpath("embeds")
    embed_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output embed filepath.
    # Metadata field 0 is the audio file name, field 2 the embed file name.
    fpaths = []
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        for line in metadata_file:
            m = line.split("|")
            embed_fpath = embed_dir.joinpath(m[2])
            if not should_skip(embed_fpath, skip_existing):
                fpaths.append((wav_dir.joinpath(m[0]), embed_fpath))

    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Embed the utterances in separate processes. The pool is context-managed
    # so its workers are always released; every result is consumed before the
    # block exits, so terminating the pool on exit loses no work.
    func = partial(_embed_utterance, encoder_model_fpath=encoder_model_fpath)
    with Pool(n_processes) as pool:
        job = pool.imap(func, fpaths)
        for _ in tqdm(job, "Embedding", len(fpaths), unit="utterances"):
            pass

def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
    """Create an emotion feature file for every utterance listed in ``train.txt``.

    Args:
        synthesizer_root: Directory containing ``audio`` and ``train.txt``;
            an ``emo`` directory is created inside it.
        n_processes: Number of worker processes for the pool.
        skip_existing: When true, utterances whose emotion file already
            exists are not recomputed.
        hparams: Hyper-parameters object handed to each worker.

    Raises:
        AssertionError: If the ``audio`` directory or ``train.txt`` is missing.
    """
    wav_dir = synthesizer_root.joinpath("audio")
    metadata_fpath = synthesizer_root.joinpath("train.txt")
    assert wav_dir.exists() and metadata_fpath.exists()
    emo_dir = synthesizer_root.joinpath("emo")
    emo_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output emo filepath.
    # The emo file name is the audio file name (metadata field 0) with
    # "audio-" replaced by "emo-".
    fpaths = []
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        for line in metadata_file:
            m = line.split("|")
            emo_fpath = emo_dir.joinpath(m[0].replace("audio-", "emo-"))
            if not should_skip(emo_fpath, skip_existing):
                fpaths.append((wav_dir.joinpath(m[0]), emo_fpath))

    # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.
    # Process the utterances in separate processes. The pool is context-managed
    # so its workers are always released; every result is consumed before the
    # block exits, so terminating the pool on exit loses no work.
    func = partial(_emo_extract_from_utterance, hparams=hparams)
    with Pool(n_processes) as pool:
        job = pool.imap(func, fpaths)
        for _ in tqdm(job, "Emo", len(fpaths), unit="utterances"):
            pass
Init to support Chinese Dataset. 4 years ago			`from multiprocessing.pool import Pool`
Refactor preprocessor of synthesizer to prepare to supprot more datasets 4 years ago
Init to support Chinese Dataset. 4 years ago			`from functools import partial`
			`from itertools import chain`
			`from pathlib import Path`
			`from tqdm import tqdm`
			`import numpy as np`
Refactor Project to 3 parts: Models, Control, Data Need readme 2 years ago			`from models.encoder import inference as encoder`
add pretrained 2 years ago			`from models.synthesizer.preprocess_audio import preprocess_general, extract_emo`
Refactor Project to 3 parts: Models, Control, Data Need readme 2 years ago			`from models.synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata`
Init to support Chinese Dataset. 4 years ago
Refactor preprocessor of synthesizer to prepare to supprot more datasets 4 years ago			`data_info = {`
			`"aidatatang_200zh": {`
			`"subfolders": ["corpus/train"],`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 4 years ago			`"trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",`
Add vits 2 years ago			`"speak_func": preprocess_general`
			`},`
			`"aidatatang_200zh_s": {`
			`"subfolders": ["corpus/train"],`
			`"trans_filepath": "transcript/aidatatang_200_zh_transcript.txt",`
			`"speak_func": preprocess_general`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 4 years ago			`},`
rename slr68 to magicdata to keep consistent naming convention (cherry picked from commit bbdad858ebc4d0ee3b720ba22ae3e0ce9732a734) 4 years ago			`"magicdata": {`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 4 years ago			`"subfolders": ["train"],`
			`"trans_filepath": "train/TRANS.txt",`
Add vits 2 years ago			`"speak_func": preprocess_general,`
Fix bug pre-processing magicdata 3 years ago			`"transcript_func": preprocess_transcript_magicdata,`
[dataset]support aishell3(tested) 4 years ago			`},`
			`"aishell3":{`
			`"subfolders": ["train/wav"],`
			`"trans_filepath": "train/content.txt",`
Add vits 2 years ago			`"speak_func": preprocess_general,`
[FIX] Fix preprocessing bug for aishell3 4 years ago			`"transcript_func": preprocess_transcript_aishell3,`
支持data_aishell（SLR33）数据集 (#141) * 支持data_aishell（SLR33）数据集 * 更新readme 3 years ago			`},`
			`"data_aishell":{`
			`"subfolders": ["wav/train"],`
			`"trans_filepath": "transcript/aishell_transcript_v0.8.txt",`
Add vits 2 years ago			`"speak_func": preprocess_general`
[FIX] Fix preprocessing bug for aishell3 4 years ago			`}`
Refactor preprocessor of synthesizer to prepare to supprot more datasets 4 years ago			`}`
Init to support Chinese Dataset. 4 years ago
Skip embedding (#950) * Skip embedding * Skip earlier * Remove unused paramater * Pass param 2 years ago			`def should_skip(fpath: Path, skip_existing: bool) -> bool:`
			`return skip_existing and fpath.exists()`

Init to support Chinese Dataset. 4 years ago			`def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,`
Add vits 2 years ago			`skip_existing: bool, hparams, no_alignments: bool,`
add pretrained 2 years ago			`dataset: str, emotion_extract = False, encoder_model_fpath=None):`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 4 years ago			`dataset_info = data_info[dataset]`
Init to support Chinese Dataset. 4 years ago			`# Gather the input directories`
Refactor preprocessor of synthesizer to prepare to supprot more datasets 4 years ago			`dataset_root = datasets_root.joinpath(dataset)`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 4 years ago			`input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in dataset_info["subfolders"]]`
Init to support Chinese Dataset. 4 years ago			`print("\n ".join(map(str, ["Using data from:"] + input_dirs)))`
			`assert all(input_dir.exists() for input_dir in input_dirs)`

			`# Create the output directories for each output file type`
			`out_dir.joinpath("mels").mkdir(exist_ok=True)`
			`out_dir.joinpath("audio").mkdir(exist_ok=True)`
Add vits 2 years ago			`if emotion_extract:`
			`out_dir.joinpath("emo").mkdir(exist_ok=True)`
Init to support Chinese Dataset. 4 years ago
			`# Create a metadata file`
			`metadata_fpath = out_dir.joinpath("train.txt")`
			`metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")`

			`# Preprocess the dataset`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 4 years ago			`dict_info = {}`
			`transcript_dirs = dataset_root.joinpath(dataset_info["trans_filepath"])`
			`assert transcript_dirs.exists(), str(transcript_dirs)+" not exist."`
			`with open(transcript_dirs, "r", encoding="utf-8") as dict_transcript:`
[FIX] Fix preprocessing bug for aishell3 4 years ago			`# process with specific function for your dataset`
			`if "transcript_func" in dataset_info:`
			`dataset_info["transcript_func"](dict_info, dict_transcript)`
			`else:`
			`for v in dict_transcript:`
			`if not v:`
			`continue`
			`v = v.strip().replace("\n","").replace("\t"," ").split(" ")`
			`dict_info[v[0]] = " ".join(v[1:])`
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 4 years ago
Init to support Chinese Dataset. 4 years ago			`speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))`
Add vits 2 years ago
Supporting new dataset SLR68 ! try python synthesizer_preprocess_audio.py ...\slr --dataset SLR68 4 years ago			`func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,`
add pretrained 2 years ago			`hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)`
Some changes to make it easier to install the dependencies 2 years ago			`job = Pool(n_processes).imap_unordered(func, speaker_dirs)`
Add vits 2 years ago
Refactor preprocessor of synthesizer to prepare to supprot more datasets 4 years ago			`for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):`
Add vits 2 years ago			`if speaker_metadata is not None:`
			`for metadatum in speaker_metadata:`
Some changes to make it easier to install the dependencies 2 years ago			`metadata_file.write("\|".join(map(str,metadatum)) + "\n")`
Init to support Chinese Dataset. 4 years ago			`metadata_file.close()`

			`# Verify the contents of the metadata file`
			`with metadata_fpath.open("r", encoding="utf-8") as metadata_file:`
			`metadata = [line.split("\|") for line in metadata_file]`
			`mel_frames = sum([int(m[4]) for m in metadata])`
			`timesteps = sum([int(m[3]) for m in metadata])`
			`sample_rate = hparams.sample_rate`
			`hours = (timesteps / sample_rate) / 3600`
			`print("The dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %`
			`(len(metadata), mel_frames, timesteps, hours))`
			`print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))`
			`print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))`
			`print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))`

Skip embedding (#950) * Skip embedding * Skip earlier * Remove unused paramater * Pass param 2 years ago			`def _embed_utterance(fpaths: str, encoder_model_fpath: str):`
Init to support Chinese Dataset. 4 years ago			`if not encoder.is_loaded():`
			`encoder.load_model(encoder_model_fpath)`

			`# Compute the speaker embedding of the utterance`
			`wav_fpath, embed_fpath = fpaths`
			`wav = np.load(wav_fpath)`
			`wav = encoder.preprocess_wav(wav)`
			`embed = encoder.embed_utterance(wav)`
			`np.save(embed_fpath, embed, allow_pickle=False)`

Skip embedding (#950) * Skip embedding * Skip earlier * Remove unused paramater * Pass param 2 years ago			`def _emo_extract_from_utterance(fpaths, hparams):`
add pretrained 2 years ago			`wav_fpath, emo_fpath = fpaths`
			`wav = np.load(wav_fpath)`
			`emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)`
			`np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)`
Init to support Chinese Dataset. 4 years ago
Skip embedding (#950) * Skip embedding * Skip earlier * Remove unused paramater * Pass param 2 years ago			`def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int, skip_existing: bool):`
Init to support Chinese Dataset. 4 years ago			`wav_dir = synthesizer_root.joinpath("audio")`
			`metadata_fpath = synthesizer_root.joinpath("train.txt")`
			`assert wav_dir.exists() and metadata_fpath.exists()`
			`embed_dir = synthesizer_root.joinpath("embeds")`
			`embed_dir.mkdir(exist_ok=True)`

			`# Gather the input wave filepath and the target output embed filepath`
			`with metadata_fpath.open("r", encoding="utf-8") as metadata_file:`
			`metadata = [line.split("\|") for line in metadata_file]`
Skip embedding (#950) * Skip embedding * Skip earlier * Remove unused paramater * Pass param 2 years ago			`fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata if not should_skip(embed_dir.joinpath(m[2]), skip_existing)]`

Init to support Chinese Dataset. 4 years ago			`# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.`
			`# Embed the utterances in separate threads`
Skip embedding (#950) * Skip embedding * Skip earlier * Remove unused paramater * Pass param 2 years ago			`func = partial(_embed_utterance, encoder_model_fpath=encoder_model_fpath)`
Init to support Chinese Dataset. 4 years ago			`job = Pool(n_processes).imap(func, fpaths)`
Some changes to make it easier to install the dependencies 2 years ago			`tuple(tqdm(job, "Embedding", len(fpaths), unit="utterances"))`
add pretrained 2 years ago
			`def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):`
			`wav_dir = synthesizer_root.joinpath("audio")`
			`metadata_fpath = synthesizer_root.joinpath("train.txt")`
			`assert wav_dir.exists() and metadata_fpath.exists()`
			`emo_dir = synthesizer_root.joinpath("emo")`
			`emo_dir.mkdir(exist_ok=True)`
Skip embedding (#950) * Skip embedding * Skip earlier * Remove unused paramater * Pass param 2 years ago
add pretrained 2 years ago			`# Gather the input wave filepath and the target output embed filepath`
			`with metadata_fpath.open("r", encoding="utf-8") as metadata_file:`
			`metadata = [line.split("\|") for line in metadata_file]`
Skip embedding (#950) * Skip embedding * Skip earlier * Remove unused paramater * Pass param 2 years ago			`fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata if not should_skip(emo_dir.joinpath(m[0].replace("audio-", "emo-")), skip_existing)]`
add pretrained 2 years ago
			`# TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here.`
			`# Embed the utterances in separate threads`
Skip embedding (#950) * Skip embedding * Skip earlier * Remove unused paramater * Pass param 2 years ago			`func = partial(_emo_extract_from_utterance, hparams=hparams)`
add pretrained 2 years ago			`job = Pool(n_processes).imap(func, fpaths)`
Some changes to make it easier to install the dependencies 2 years ago			`tuple(tqdm(job, "Emo", len(fpaths), unit="utterances"))`