Add quick path to preprocess audio, denoise audio when loading in toolbox

pull/75/head
babysor00 3 years ago
parent 2f1f4f70b4
commit 5c0cb50c3e

@ -23,6 +23,16 @@
"args": [
"dev", "..\\..\\chs1"
],
},
{
"name": "Python: demo box",
"type": "python",
"request": "launch",
"program": "demo_toolbox.py",
"console": "integratedTerminal",
"args": [
"-d", "..\\..\\chs"
],
}
]
}

@ -31,16 +31,13 @@
### 2. 使用数据集训练合成器
* 下载数据集并解压:确保您可以访问 *train* 文件夹中的所有音频文件(如 .wav)
* 使用音频和梅尔频谱图进行预处理:
`python synthesizer_preprocess_audio.py <datasets_root>`
* 进行音频和梅尔频谱图预处理:
`python pre.py <datasets_root>`
可以传入参数 --dataset `{dataset}` 支持 aidatatang_200zh, magicdata, aishell3
> 假如你下载的 `aidatatang_200zh`文件放在D盘`train`文件路径为 `D:\data\aidatatang_200zh\corpus\train` , 你的`datasets_root`就是 `D:\data\`
>假如發生 `頁面文件太小,無法完成操作`,請參考這篇[文章](https://blog.csdn.net/qq_17755303/article/details/112564030)將虛擬內存更改為100G(102400),例如:档案放置D槽就更改D槽的虚拟内存
* 预处理嵌入:
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
* 训练合成器:
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`

@ -32,13 +32,11 @@
### 2. Train synthesizer with your dataset
* Download aidatatang_200zh or other dataset and unzip: make sure you can access all .wav in *train* folder
* Preprocess the audio and generate the mel spectrograms:
`python synthesizer_preprocess_audio.py <datasets_root>`
`python pre.py <datasets_root>`
Pass the parameter `--dataset {dataset}` to select the dataset; supported values: aidatatang_200zh, magicdata, aishell3
>If you encounter the error `the page file is too small to complete the operation`, please refer to this [video](https://www.youtube.com/watch?v=Oh6dga-Oy10&ab_channel=CodeProf) and increase the virtual memory to 100 GB (102400 MB) — for example, if the dataset is stored on drive D, increase the virtual memory of drive D.
* Preprocess the embeddings:
`python synthesizer_preprocess_embeds.py <datasets_root>/SV2TTS/synthesizer`
* Train the synthesizer:
`python synthesizer_train.py mandarin <datasets_root>/SV2TTS/synthesizer`

@ -0,0 +1,72 @@
from pathlib import Path
import argparse

from synthesizer.hparams import hparams
from synthesizer.preprocess import create_embeddings, preprocess_dataset
from utils.argutils import print_args

# Datasets this script knows how to preprocess; anything else is rejected below.
recognized_datasets = [
    "aidatatang_200zh",
    "magicdata",
    "aishell3",
]

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Preprocesses audio files from datasets, encodes them as mel spectrograms "
                    "and writes them to the disk. Audio files are also saved, to be used by the "
                    "vocoder for training.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("datasets_root", type=Path, help=\
        "Path to the directory containing your datasets.")
    parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\
        "Path to the output directory that will contain the mel spectrograms, the audios and the "
        "embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/")
    parser.add_argument("-n", "--n_processes", type=int, default=1, help=\
        "Number of processes in parallel. An encoder is created for each, so you may need to "
        "lower this value on GPUs with low memory. Set it to 1 if CUDA is unhappy")
    parser.add_argument("-s", "--skip_existing", action="store_true", help=\
        "Whether to skip existing output files with the same name. Useful if the preprocessing "
        "was interrupted.")
    parser.add_argument("--hparams", type=str, default="", help=\
        "Hyperparameter overrides as a comma-separated list of name-value pairs")
    parser.add_argument("--no_trim", action="store_true", help=\
        "Preprocess audio without trimming silences (not recommended).")
    parser.add_argument("--no_alignments", action="store_true", help=\
        "Use this option when dataset does not include alignments "
        "(these are used to split long audio files into sub-utterances.)")
    parser.add_argument("--dataset", type=str, default="aidatatang_200zh", help=\
        "Name of the dataset to process, allowing values: magicdata, aidatatang_200zh, aishell3.")
    parser.add_argument("-e", "--encoder_model_fpath", type=Path,
                        default="encoder/saved_models/pretrained.pt", help=\
        "Path your trained encoder model.")
    args = parser.parse_args()

    # Default the output directory to <datasets_root>/SV2TTS/synthesizer when -o was not
    # given (argparse.SUPPRESS leaves the attribute absent rather than setting it to None).
    if not hasattr(args, "out_dir"):
        args.out_dir = args.datasets_root.joinpath("SV2TTS", "synthesizer")

    # Explicit error instead of `assert` (asserts are stripped under `python -O`), and
    # include the offending dataset name so the user can see what went wrong.
    if args.dataset not in recognized_datasets:
        raise ValueError(
            "Dataset %r is not supported, please vote for it in "
            "https://github.com/babysor/MockingBird/issues/10" % args.dataset)

    # Create directories
    if not args.datasets_root.exists():
        raise FileNotFoundError("datasets_root does not exist: %s" % args.datasets_root)
    args.out_dir.mkdir(exist_ok=True, parents=True)

    # Verify webrtcvad is available — it powers the silence trimming / noise removal path.
    if not args.no_trim:
        try:
            import webrtcvad
        except ImportError:
            raise ModuleNotFoundError(
                "Package 'webrtcvad' not found. This package enables "
                "noise removal and is recommended. Please install and try again. If installation "
                "fails, use --no_trim to disable this error message.")

    # preprocess_dataset() does not accept these two options, so pull them out of the
    # namespace before forwarding it as keyword arguments.
    encoder_model_fpath = args.encoder_model_fpath
    del args.no_trim, args.encoder_model_fpath

    args.hparams = hparams.parse(args.hparams)
    preprocess_dataset(**vars(args))

    # "Quick path": immediately compute the speaker embeddings as well, so this single
    # command replaces synthesizer_preprocess_audio.py + synthesizer_preprocess_embeds.py.
    create_embeddings(synthesizer_root=args.out_dir, n_processes=args.n_processes,
                      encoder_model_fpath=encoder_model_fpath)

@ -9,6 +9,7 @@ from pathlib import Path
from typing import Union, List
import numpy as np
import librosa
from utils import logmmse
from pypinyin import lazy_pinyin, Style
class Synthesizer:
@ -90,8 +91,10 @@ class Synthesizer:
simple_table([("Tacotron", str(tts_k) + "k"),
("r", self._model.r)])
texts = [" ".join(lazy_pinyin(v, style=Style.TONE3)) for v in texts]
print("Read " + str(texts))
texts = [" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True)) for v in texts]
print("Synthesizing " + str(texts))
# Preprocess text inputs
inputs = [text_to_sequence(text, hparams.tts_cleaner_names) for text in texts]
if not isinstance(embeddings, list):
@ -143,6 +146,12 @@ class Synthesizer:
wav = librosa.load(str(fpath), hparams.sample_rate)[0]
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
# denoise
if len(wav) > hparams.sample_rate*(0.3+0.1):
noise_wav = np.concatenate([wav[:int(hparams.sample_rate*0.15)],
wav[-int(hparams.sample_rate*0.15):]])
profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
wav = logmmse.denoise(wav, profile)
return wav
@staticmethod

Loading…
Cancel
Save