Merge pull request #822 from babysor/restruct-project

Restructure project
commit 9d67b757f0, authored by Vega and committed via GitHub

.gitignore

@@ -14,9 +14,9 @@
 *.bcf
 *.toc
 *.sh
-*/saved_models
-!vocoder/saved_models/pretrained/**
-!encoder/saved_models/pretrained.pt
+data/ckpt
+!data/ckpt/vocoder/pretrained/**
+!data/ckpt/encoder/pretrained.pt
 wavs
 log
 !/docker-entrypoint.sh

@@ -15,7 +15,8 @@
     "name": "Python: Vocoder Preprocess",
     "type": "python",
     "request": "launch",
-    "program": "vocoder_preprocess.py",
+    "program": "control\\cli\\vocoder_preprocess.py",
+    "cwd": "${workspaceFolder}",
     "console": "integratedTerminal",
     "args": ["..\\audiodata"]
 },
@@ -23,7 +24,8 @@
     "name": "Python: Vocoder Train",
     "type": "python",
     "request": "launch",
-    "program": "vocoder_train.py",
+    "program": "control\\cli\\vocoder_train.py",
+    "cwd": "${workspaceFolder}",
     "console": "integratedTerminal",
     "args": ["dev", "..\\audiodata"]
 },
@@ -32,6 +34,7 @@
     "type": "python",
     "request": "launch",
     "program": "demo_toolbox.py",
+    "cwd": "${workspaceFolder}",
     "console": "integratedTerminal",
     "args": ["-d","..\\audiodata"]
 },
@@ -40,6 +43,7 @@
     "type": "python",
     "request": "launch",
     "program": "demo_toolbox.py",
+    "cwd": "${workspaceFolder}",
     "console": "integratedTerminal",
     "args": ["-d","..\\audiodata","-vc"]
 },
@@ -47,9 +51,9 @@
     "name": "Python: Synth Train",
     "type": "python",
     "request": "launch",
-    "program": "synthesizer_train.py",
+    "program": "train.py",
     "console": "integratedTerminal",
-    "args": ["my_run", "..\\"]
+    "args": ["--type", "synth", "..\\audiodata\\SV2TTS\\synthesizer"]
 },
 {
     "name": "Python: PPG Convert",

@@ -1,9 +1,9 @@
-from encoder.params_model import model_embedding_size as speaker_embedding_size
+from models.encoder.params_model import model_embedding_size as speaker_embedding_size
 from utils.argutils import print_args
 from utils.modelutils import check_model_paths
-from synthesizer.inference import Synthesizer
-from encoder import inference as encoder
-from vocoder import inference as vocoder
+from models.synthesizer.inference import Synthesizer
+from models.encoder import inference as encoder
+from models.vocoder import inference as vocoder
 from pathlib import Path
 import numpy as np
 import soundfile as sf

@@ -1,7 +1,10 @@
-from encoder.preprocess import preprocess_librispeech, preprocess_voxceleb1, preprocess_voxceleb2, preprocess_aidatatang_200zh
-from utils.argutils import print_args
-from pathlib import Path
 import argparse
+from pathlib import Path
+from models.encoder.preprocess import (preprocess_aidatatang_200zh,
+                                       preprocess_librispeech, preprocess_voxceleb1,
+                                       preprocess_voxceleb2)
+from utils.argutils import print_args

 if __name__ == "__main__":
     class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):

@@ -1,5 +1,5 @@
 from utils.argutils import print_args
-from encoder.train import train
+from models.encoder.train import train
 from pathlib import Path
 import argparse

@@ -2,8 +2,8 @@ import sys
 import torch
 import argparse
 import numpy as np
-from utils.load_yaml import HpsYaml
-from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
+from utils.hparams import HpsYaml
+from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver

 # For reproducibility, comment these may speed up training
 torch.backends.cudnn.deterministic = True

@@ -1,7 +1,7 @@
 from pathlib import Path
 import argparse
-from ppg2mel.preprocess import preprocess_dataset
+from models.ppg2mel.preprocess import preprocess_dataset
 from pathlib import Path
 import argparse

@@ -1,10 +1,9 @@
-from synthesizer.hparams import hparams
-from synthesizer.train import train
+from models.synthesizer.hparams import hparams
+from models.synthesizer.train import train
 from utils.argutils import print_args
 import argparse

-if __name__ == "__main__":
+def new_train():
     parser = argparse.ArgumentParser()
     parser.add_argument("run_id", type=str, help= \
         "Name for this model instance. If a model state from the same run ID was previously "
@@ -13,7 +12,7 @@ if __name__ == "__main__":
     parser.add_argument("syn_dir", type=str, default=argparse.SUPPRESS, help= \
         "Path to the synthesizer directory that contains the ground truth mel spectrograms, "
         "the wavs and the embeds.")
-    parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", help=\
+    parser.add_argument("-m", "--models_dir", type=str, default=f"data/ckpt/synthesizer/", help=\
         "Path to the output directory that will contain the saved model weights and the logs.")
     parser.add_argument("-s", "--save_every", type=int, default=1000, help= \
         "Number of steps between updates of the model on the disk. Set to 0 to never save the "
@@ -28,10 +27,14 @@ if __name__ == "__main__":
     parser.add_argument("--hparams", default="",
                         help="Hyperparameter overrides as a comma-separated list of name=value "
                         "pairs")
-    args = parser.parse_args()
+    args, _ = parser.parse_known_args()
     print_args(args, parser)
     args.hparams = hparams.parse(args.hparams)

     # Run the training
     train(**vars(args))
+
+if __name__ == "__main__":
+    new_train()
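The synthesizer trainer above is now a plain function, `new_train()`, and switches from `parse_args()` to `parse_known_args()`, which lets a wrapper entry point pass extra flags (the updated launch configuration invokes `train.py --type synth ...`) without this parser rejecting them. A minimal sketch of such a dispatcher follows; the `--type` flag comes from the launch configuration, but the dispatcher body and the import path for `new_train` are assumptions, not code from this PR.

```python
# Hypothetical sketch of a root-level train.py dispatcher (not shown in this
# diff). It illustrates why new_train() tolerates unknown flags: the dispatcher
# consumes --type and both parsers use parse_known_args().
import argparse

def main():
    parser = argparse.ArgumentParser(description="Unified training entry point")
    parser.add_argument("--type", default="synth", help="Which trainer to run, e.g. synth")
    args, _ = parser.parse_known_args()   # leave the remaining argv for the sub-trainer

    if args.type == "synth":
        # Assumed import path; the PR only shows the trainer file itself.
        from control.cli.synthesizer_train import new_train
        new_train()

if __name__ == "__main__":
    main()
```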

@@ -0,0 +1,66 @@
+import sys
+import torch
+import argparse
+import numpy as np
+from utils.hparams import HpsYaml
+from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
+
+# For reproducibility, comment these may speed up training
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+
+def main():
+    # Arguments
+    parser = argparse.ArgumentParser(description=
+        'Training PPG2Mel VC model.')
+    parser.add_argument('--config', type=str,
+                        help='Path to experiment config, e.g., config/vc.yaml')
+    parser.add_argument('--name', default=None, type=str, help='Name for logging.')
+    parser.add_argument('--logdir', default='log/', type=str,
+                        help='Logging path.', required=False)
+    parser.add_argument('--ckpdir', default='ppg2mel/saved_models/', type=str,
+                        help='Checkpoint path.', required=False)
+    parser.add_argument('--outdir', default='result/', type=str,
+                        help='Decode output path.', required=False)
+    parser.add_argument('--load', default=None, type=str,
+                        help='Load pre-trained model (for training only)', required=False)
+    parser.add_argument('--warm_start', action='store_true',
+                        help='Load model weights only, ignore specified layers.')
+    parser.add_argument('--seed', default=0, type=int,
+                        help='Random seed for reproducable results.', required=False)
+    parser.add_argument('--njobs', default=8, type=int,
+                        help='Number of threads for dataloader/decoding.', required=False)
+    parser.add_argument('--cpu', action='store_true', help='Disable GPU training.')
+    parser.add_argument('--no-pin', action='store_true',
+                        help='Disable pin-memory for dataloader')
+    parser.add_argument('--test', action='store_true', help='Test the model.')
+    parser.add_argument('--no-msg', action='store_true', help='Hide all messages.')
+    parser.add_argument('--finetune', action='store_true', help='Finetune model')
+    parser.add_argument('--oneshotvc', action='store_true', help='Oneshot VC model')
+    parser.add_argument('--bilstm', action='store_true', help='BiLSTM VC model')
+    parser.add_argument('--lsa', action='store_true', help='Use location-sensitive attention (LSA)')
+
+    ###
+    paras = parser.parse_args()
+    setattr(paras, 'gpu', not paras.cpu)
+    setattr(paras, 'pin_memory', not paras.no_pin)
+    setattr(paras, 'verbose', not paras.no_msg)
+    # Make the config dict dot visitable
+    config = HpsYaml(paras.config)
+
+    np.random.seed(paras.seed)
+    torch.manual_seed(paras.seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(paras.seed)
+
+    print(">>> OneShot VC training ...")
+    mode = "train"
+    solver = Solver(config, paras, mode)
+    solver.load_data()
+    solver.set_model()
+    solver.exec()
+    print(">>> Oneshot VC train finished!")
+    sys.exit(0)
+
+if __name__ == "__main__":
+    main()
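`HpsYaml` (relocated from `utils.load_yaml` to `utils.hparams` throughout this PR) wraps the YAML experiment config so its keys can be read as attributes, per the "Make the config dict dot visitable" comment in the file above. A minimal usage sketch, assuming only that behaviour; the key name is illustrative, not taken from a real config file.

```python
# Minimal sketch: load a YAML experiment config and read keys as attributes.
# "config/vc.yaml" matches the --config example above; "data_dir" is a
# hypothetical key used only for illustration.
from utils.hparams import HpsYaml

config = HpsYaml("config/vc.yaml")
print(config.data_dir)   # attribute-style access to a config value
```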

@@ -1,5 +1,5 @@
-from synthesizer.synthesize import run_synthesis
-from synthesizer.hparams import hparams
+from models.synthesizer.synthesize import run_synthesis
+from models.synthesizer.hparams import hparams
 from utils.argutils import print_args
 import argparse
 import os

@@ -1,7 +1,7 @@
 from utils.argutils import print_args
-from vocoder.wavernn.train import train
-from vocoder.hifigan.train import train as train_hifigan
-from vocoder.fregan.train import train as train_fregan
+from models.vocoder.wavernn.train import train
+from models.vocoder.hifigan.train import train as train_hifigan
+from models.vocoder.fregan.train import train as train_fregan
 from utils.util import AttrDict
 from pathlib import Path
 import argparse

@@ -2,22 +2,22 @@ from pydantic import BaseModel, Field
 import os
 from pathlib import Path
 from enum import Enum
-from encoder import inference as encoder
+from models.encoder import inference as encoder
 import librosa
 from scipy.io.wavfile import write
 import re
 import numpy as np
-from mkgui.base.components.types import FileContent
-from vocoder.hifigan import inference as gan_vocoder
-from synthesizer.inference import Synthesizer
+from control.mkgui.base.components.types import FileContent
+from models.vocoder.hifigan import inference as gan_vocoder
+from models.synthesizer.inference import Synthesizer
 from typing import Any, Tuple
 import matplotlib.pyplot as plt

 # Constants
-AUDIO_SAMPLES_DIR = f"samples{os.sep}"
-SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models"
-ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
-VOC_MODELS_DIRT = f"vocoder{os.sep}saved_models"
+AUDIO_SAMPLES_DIR = f"data{os.sep}samples{os.sep}"
+SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
+ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
+VOC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}vocoder"
 TEMP_SOURCE_AUDIO = f"wavs{os.sep}temp_source.wav"
 TEMP_RESULT_AUDIO = f"wavs{os.sep}temp_result.wav"

 if not os.path.isdir("wavs"):
@@ -31,7 +31,7 @@ if os.path.isdir(SYN_MODELS_DIRT):
     synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
     print("Loaded synthesizer models: " + str(len(synthesizers)))
 else:
-    raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist.")
+    raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist. 请将模型文件位置移动到上述位置中进行重试!")

 if os.path.isdir(ENC_MODELS_DIRT):
     encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
@@ -46,15 +46,16 @@ else:
     raise Exception(f"Model folder {VOC_MODELS_DIRT} doesn't exist.")

 class Input(BaseModel):
     message: str = Field(
         ..., example="欢迎使用工具箱, 现已支持中文输入!", alias="文本内容"
     )
     local_audio_file: audio_input_selection = Field(
-        ..., alias="输入语音本地wav",
+        ..., alias="选择语音本地wav",
         description="选择本地语音文件."
     )
+    record_audio_file: FileContent = Field(default=None, alias="录制语音",
+        description="录音.", is_recorder=True, mime_type="audio/wav")
     upload_audio_file: FileContent = Field(default=None, alias="或上传语音",
         description="拖拽或点击上传.", mime_type="audio/wav")
     encoder: encoders = Field(
@@ -104,7 +105,12 @@ def synthesize(input: Input) -> Output:
     gan_vocoder.load_model(Path(input.vocoder.value))

     # load file
-    if input.upload_audio_file != None:
+    if input.record_audio_file != None:
+        with open(TEMP_SOURCE_AUDIO, "w+b") as f:
+            f.write(input.record_audio_file.as_bytes())
+            f.seek(0)
+        wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
+    elif input.upload_audio_file != None:
         with open(TEMP_SOURCE_AUDIO, "w+b") as f:
             f.write(input.upload_audio_file.as_bytes())
             f.seek(0)
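The new branch above persists the recorder output to `TEMP_SOURCE_AUDIO` before decoding it, mirroring the existing upload path. A small sketch of that round trip, assuming only that `FileContent.as_bytes()` returns the raw WAV bytes captured in the browser; the helper name and default path are illustrative.

```python
# Sketch of the record-then-load path used by synthesize(); not part of the PR.
import librosa

def load_recorded_audio(record_audio_file, temp_path="wavs/temp_source.wav"):
    # Persist the recorded bytes to a temp WAV, then decode it with librosa.
    with open(temp_path, "w+b") as f:
        f.write(record_audio_file.as_bytes())
    wav, sample_rate = librosa.load(temp_path)
    return wav, sample_rate
```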

@@ -1,27 +1,26 @@
-from synthesizer.inference import Synthesizer
-from pydantic import BaseModel, Field
-from encoder import inference as speacker_encoder
-import torch
 import os
-from pathlib import Path
 from enum import Enum
-import ppg_extractor as Extractor
-import ppg2mel as Convertor
-import librosa
-from scipy.io.wavfile import write
-import re
-import numpy as np
-from mkgui.base.components.types import FileContent
-from vocoder.hifigan import inference as gan_vocoder
+from pathlib import Path
 from typing import Any, Tuple
+import librosa
 import matplotlib.pyplot as plt
+import torch
+from pydantic import BaseModel, Field
+from scipy.io.wavfile import write
+import models.ppg2mel as Convertor
+import models.ppg_extractor as Extractor
+from control.mkgui.base.components.types import FileContent
+from models.encoder import inference as speacker_encoder
+from models.synthesizer.inference import Synthesizer
+from models.vocoder.hifigan import inference as gan_vocoder

 # Constants
-AUDIO_SAMPLES_DIR = f'samples{os.sep}'
-EXT_MODELS_DIRT = f'ppg_extractor{os.sep}saved_models'
-CONV_MODELS_DIRT = f'ppg2mel{os.sep}saved_models'
-VOC_MODELS_DIRT = f'vocoder{os.sep}saved_models'
+AUDIO_SAMPLES_DIR = f'data{os.sep}samples{os.sep}'
+EXT_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg_extractor'
+CONV_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg2mel'
+VOC_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}vocoder'
 TEMP_SOURCE_AUDIO = f'wavs{os.sep}temp_source.wav'
 TEMP_TARGET_AUDIO = f'wavs{os.sep}temp_target.wav'
 TEMP_RESULT_AUDIO = f'wavs{os.sep}temp_result.wav'
@@ -132,9 +131,10 @@ def convert(input: Input) -> Output:
     ppg = extractor.extract_from_wav(src_wav)
     # Import necessary dependency of Voice Conversion
-    from utils.f0_utils import compute_f0, f02lf0, compute_mean_std, get_converted_lf0uv
+    from utils.f0_utils import (compute_f0, compute_mean_std, f02lf0,
+                                get_converted_lf0uv)
     ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
-    speacker_encoder.load_model(Path(f"encoder{os.sep}saved_models{os.sep}pretrained_bak_5805000.pt"))
+    speacker_encoder.load_model(Path(f"data{os.sep}ckpt{os.sep}encoder{os.sep}pretrained_bak_5805000.pt"))
     embed = speacker_encoder.embed_utterance(ref_wav)
     lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
     min_len = min(ppg.shape[1], len(lf0_uv))

@@ -37,6 +37,12 @@ def is_single_file_property(property: Dict) -> bool:
     # TODO: binary?
     return property.get("format") == "byte"

+def is_single_autio_property(property: Dict) -> bool:
+    if property.get("type") != "string":
+        return False
+    # TODO: binary?
+    return property.get("format") == "bytes"

 def is_single_directory_property(property: Dict) -> bool:
     if property.get("type") != "string":

@@ -14,14 +14,13 @@ from fastapi.encoders import jsonable_encoder
 from loguru import logger
 from pydantic import BaseModel, ValidationError, parse_obj_as

-from mkgui.base import Opyrator
-from mkgui.base.core import name_to_title
-from mkgui.base.ui import schema_utils
-from mkgui.base.ui.streamlit_utils import CUSTOM_STREAMLIT_CSS
+from control.mkgui.base import Opyrator
+from control.mkgui.base.core import name_to_title
+from . import schema_utils
+from .streamlit_utils import CUSTOM_STREAMLIT_CSS

 STREAMLIT_RUNNER_SNIPPET = """
-from mkgui.base.ui import render_streamlit_ui
-from mkgui.base import Opyrator
+from control.mkgui.base.ui import render_streamlit_ui

 import streamlit as st
@@ -243,7 +242,14 @@ class InputUI:
         file_extension = None
         if "mime_type" in property:
             file_extension = mimetypes.guess_extension(property["mime_type"])
+        if "is_recorder" in property:
+            from audio_recorder_streamlit import audio_recorder
+            audio_bytes = audio_recorder()
+            if audio_bytes:
+                streamlit_app.audio(audio_bytes, format="audio/wav")
+            return audio_bytes
+
         uploaded_file = streamlit_app.file_uploader(
             **streamlit_kwargs, accept_multiple_files=False, type=file_extension
         )
@@ -263,6 +269,39 @@ class InputUI:
             streamlit_app.video(bytes, format=property.get("mime_type"))
         return bytes

+    def _render_single_audio_input(
+        self, streamlit_app: st, key: str, property: Dict
+    ) -> Any:
+        # streamlit_kwargs = self._get_default_streamlit_input_kwargs(key, property)
+        from audio_recorder_streamlit import audio_recorder
+        audio_bytes = audio_recorder()
+        if audio_bytes:
+            streamlit_app.audio(audio_bytes, format="audio/wav")
+        return audio_bytes
+
+        # file_extension = None
+        # if "mime_type" in property:
+        #     file_extension = mimetypes.guess_extension(property["mime_type"])
+        # uploaded_file = streamlit_app.file_uploader(
+        #     **streamlit_kwargs, accept_multiple_files=False, type=file_extension
+        # )
+        # if uploaded_file is None:
+        #     return None
+        # bytes = uploaded_file.getvalue()
+        # if property.get("mime_type"):
+        #     if is_compatible_audio(property["mime_type"]):
+        #         # Show audio
+        #         streamlit_app.audio(bytes, format=property.get("mime_type"))
+        #     if is_compatible_image(property["mime_type"]):
+        #         # Show image
+        #         streamlit_app.image(bytes)
+        #     if is_compatible_video(property["mime_type"]):
+        #         # Show video
+        #         streamlit_app.video(bytes, format=property.get("mime_type"))
+        # return bytes
+
     def _render_single_string_input(
         self, streamlit_app: st, key: str, property: Dict
     ) -> Any:
@@ -807,21 +846,20 @@ class OutputUI:

 def getOpyrator(mode: str) -> Opyrator:
     if mode == None or mode.startswith('VC'):
-        from mkgui.app_vc import convert
+        from control.mkgui.app_vc import convert
         return Opyrator(convert)
     if mode == None or mode.startswith('预处理'):
-        from mkgui.preprocess import preprocess
+        from control.mkgui.preprocess import preprocess
         return Opyrator(preprocess)
     if mode == None or mode.startswith('模型训练'):
-        from mkgui.train import train
+        from control.mkgui.train import train
         return Opyrator(train)
     if mode == None or mode.startswith('模型训练(VC)'):
-        from mkgui.train_vc import train_vc
+        from control.mkgui.train_vc import train_vc
         return Opyrator(train_vc)
-    from mkgui.app import synthesize
+    from control.mkgui.app import synthesize
     return Opyrator(synthesize)

 def render_streamlit_ui() -> None:
     # init
     session_state = st.session_state
@@ -845,7 +883,7 @@ def render_streamlit_ui() -> None:
     col2.title(title)
     col2.markdown("欢迎使用MockingBird Web 2")

-    image = Image.open(path.join('mkgui', 'static', 'mb.png'))
+    image = Image.open(path.join('control','mkgui', 'static', 'mb.png'))
     col1.image(image)

     st.markdown("---")
@@ -853,6 +891,13 @@ def render_streamlit_ui() -> None:
     with left:
         st.header("Control 控制")
+        # if session_state.mode in ["AI拟音", "VC拟音"] :
+        #     from audiorecorder import audiorecorder
+        #     audio = audiorecorder("Click to record", "Recording...")
+        #     if len(audio) > 0:
+        #         # To play audio in frontend:
+        #         st.audio(audio.tobytes())
         InputUI(session_state=session_state, input_class=opyrator.input_type).render_ui(st)
         execute_selected = st.button(opyrator.action)
         if execute_selected:
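Both recorder hooks above rely on the `audio_recorder_streamlit` package: `audio_recorder()` renders a record button and returns the captured audio as WAV bytes (or a falsy value when nothing has been recorded yet). A standalone sketch, assuming the package is installed (`pip install audio-recorder-streamlit`); the file name in the comment is illustrative.

```python
# Minimal standalone recorder demo; run with: streamlit run recorder_demo.py
import streamlit as st
from audio_recorder_streamlit import audio_recorder

audio_bytes = audio_recorder()                  # renders the record button
if audio_bytes:
    st.audio(audio_bytes, format="audio/wav")   # play back the recording
```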

@@ -6,8 +6,8 @@ from typing import Any, Tuple

 # Constants
-EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
-ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
+EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
+ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"

 if os.path.isdir(EXT_MODELS_DIRT):
@@ -83,7 +83,7 @@ def preprocess(input: Input) -> Output:
     """Preprocess(预处理)"""
     finished = 0
     if input.model == Model.VC_PPG2MEL:
-        from ppg2mel.preprocess import preprocess_dataset
+        from models.ppg2mel.preprocess import preprocess_dataset
         finished = preprocess_dataset(
             datasets_root=Path(input.datasets_root),
             dataset=input.dataset,

[Binary image file changed: 5.6 KiB before, 5.6 KiB after; preview omitted.]

@@ -3,17 +3,17 @@ import os
 from pathlib import Path
 from enum import Enum
 from typing import Any
-from synthesizer.hparams import hparams
-from synthesizer.train import train as synt_train
+from models.synthesizer.hparams import hparams
+from models.synthesizer.train import train as synt_train

 # Constants
-SYN_MODELS_DIRT = f"synthesizer{os.sep}saved_models"
-ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
-# EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
-# CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
-# ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
+SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
+ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
+# EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
+# CONV_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg2mel"
+# ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"

 # Pre-Load models
 if os.path.isdir(SYN_MODELS_DIRT):
@@ -96,7 +96,7 @@ def train(input: Input) -> Output:
     synt_train(
         input.run_id,
         input.input_root,
-        f"synthesizer{os.sep}saved_models",
+        f"data{os.sep}ckpt{os.sep}synthesizer",
         input.save_every,
         input.backup_every,
         input.log_every,

@@ -4,14 +4,14 @@ from pathlib import Path
 from enum import Enum
 from typing import Any, Tuple
 import numpy as np
-from utils.load_yaml import HpsYaml
+from utils.hparams import HpsYaml
 from utils.util import AttrDict
 import torch

 # Constants
-EXT_MODELS_DIRT = f"ppg_extractor{os.sep}saved_models"
-CONV_MODELS_DIRT = f"ppg2mel{os.sep}saved_models"
-ENC_MODELS_DIRT = f"encoder{os.sep}saved_models"
+EXT_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg_extractor"
+CONV_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}ppg2mel"
+ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"

 if os.path.isdir(EXT_MODELS_DIRT):
@@ -144,7 +144,7 @@ def train_vc(input: Input) -> Output:
     if torch.cuda.is_available():
         torch.cuda.manual_seed_all(input.seed)
     mode = "train"
-    from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
+    from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
     solver = Solver(config, params, mode)
     solver.load_data()
     solver.set_model()

@@ -1,12 +1,12 @@
-from toolbox.ui import UI
-from encoder import inference as encoder
-from synthesizer.inference import Synthesizer
-from vocoder.wavernn import inference as rnn_vocoder
-from vocoder.hifigan import inference as gan_vocoder
-from vocoder.fregan import inference as fgan_vocoder
+from control.toolbox.ui import UI
+from models.encoder import inference as encoder
+from models.synthesizer.inference import Synthesizer
+from models.vocoder.wavernn import inference as rnn_vocoder
+from models.vocoder.hifigan import inference as gan_vocoder
+from models.vocoder.fregan import inference as fgan_vocoder
 from pathlib import Path
 from time import perf_counter as timer
-from toolbox.utterance import Utterance
+from control.toolbox.utterance import Utterance
 import numpy as np
 import traceback
 import sys
@@ -38,7 +38,8 @@ recognized_datasets = [
     "VoxCeleb2/dev/aac",
     "VoxCeleb2/test/aac",
     "VCTK-Corpus/wav48",
-    "aidatatang_200zh/corpus",
+    "aidatatang_200zh/corpus/test",
+    "aidatatang_200zh/corpus/train",
     "aishell3/test/wav",
     "magicdata/train",
 ]
@@ -396,7 +397,7 @@ class Toolbox:
         self.ui.log("Loading the extractor %s... " % model_fpath)
         self.ui.set_loading(1)
         start = timer()
-        import ppg_extractor as extractor
+        import models.ppg_extractor as extractor
         self.extractor = extractor.load_model(model_fpath)
         self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
         self.ui.set_loading(0)
@@ -408,7 +409,7 @@ class Toolbox:
         self.ui.log("Loading the convertor %s... " % model_fpath)
         self.ui.set_loading(1)
         start = timer()
-        import ppg2mel as convertor
+        import models.ppg2mel as convertor
         self.convertor = convertor.load_model( model_fpath)
         self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
         self.ui.set_loading(0)

[Binary image file changed: 5.6 KiB before, 5.6 KiB after; preview omitted.]

@@ -3,9 +3,8 @@ from PyQt5 import QtGui
 from PyQt5.QtWidgets import *
 import matplotlib.pyplot as plt
 from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
-from matplotlib.figure import Figure
-from encoder.inference import plot_embedding_as_heatmap
-from toolbox.utterance import Utterance
+from models.encoder.inference import plot_embedding_as_heatmap
+from control.toolbox.utterance import Utterance
 from pathlib import Path
 from typing import List, Set
 import sounddevice as sd

@@ -1,5 +1,5 @@
 from pathlib import Path
-from toolbox import Toolbox
+from control.toolbox import Toolbox
 from utils.argutils import print_args
 from utils.modelutils import check_model_paths
 import argparse
@@ -17,15 +17,15 @@ if __name__ == '__main__':
         "supported datasets.", default=None)
     parser.add_argument("-vc", "--vc_mode", action="store_true",
                         help="Voice Conversion Mode(PPG based)")
-    parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models",
+    parser.add_argument("-e", "--enc_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}encoder",
                         help="Directory containing saved encoder models")
-    parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models",
+    parser.add_argument("-s", "--syn_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}synthesizer",
                         help="Directory containing saved synthesizer models")
-    parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models",
+    parser.add_argument("-v", "--voc_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}vocoder",
                         help="Directory containing saved vocoder models")
-    parser.add_argument("-ex", "--extractor_models_dir", type=Path, default="ppg_extractor/saved_models",
+    parser.add_argument("-ex", "--extractor_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}ppg_extractor",
                         help="Directory containing saved extrator models")
-    parser.add_argument("-cv", "--convertor_models_dir", type=Path, default="ppg2mel/saved_models",
+    parser.add_argument("-cv", "--convertor_models_dir", type=Path, default=f"data{os.sep}ckpt{os.sep}ppg2mel",
                         help="Directory containing saved convert models")
     parser.add_argument("--cpu", action="store_true", help=\
         "If True, processing is done on CPU, even when a GPU is available.")

@@ -1,2 +0,0 @@
-from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
-from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader

@@ -1,23 +1,15 @@
-from encoder.params_model import model_embedding_size as speaker_embedding_size
-from utils.argutils import print_args
-from utils.modelutils import check_model_paths
-from synthesizer.inference import Synthesizer
-from encoder import inference as encoder
-from vocoder.wavernn import inference as rnn_vocoder
-from vocoder.hifigan import inference as gan_vocoder
+from models.synthesizer.inference import Synthesizer
+from models.encoder import inference as encoder
+from models.vocoder.hifigan import inference as gan_vocoder
 from pathlib import Path
 import numpy as np
 import soundfile as sf
-import librosa
-import argparse
 import torch
 import sys
 import os
 import re
 import cn2an
-import glob
-from audioread.exceptions import NoBackendError

 vocoder = gan_vocoder

 def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):

@@ -1,5 +1,5 @@
 from scipy.ndimage.morphology import binary_dilation
-from encoder.params_data import *
+from models.encoder.params_data import *
 from pathlib import Path
 from typing import Optional, Union
 from warnings import warn

@@ -0,0 +1,2 @@
+from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
+from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader

@@ -1,5 +1,5 @@
-from encoder.data_objects.random_cycler import RandomCycler
-from encoder.data_objects.utterance import Utterance
+from models.encoder.data_objects.random_cycler import RandomCycler
+from models.encoder.data_objects.utterance import Utterance
 from pathlib import Path

 # Contains the set of utterances of a single speaker

@@ -1,6 +1,6 @@
 import numpy as np
 from typing import List
-from encoder.data_objects.speaker import Speaker
+from models.encoder.data_objects.speaker import Speaker

 class SpeakerBatch:
     def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):

@@ -1,7 +1,7 @@
-from encoder.data_objects.random_cycler import RandomCycler
-from encoder.data_objects.speaker_batch import SpeakerBatch
-from encoder.data_objects.speaker import Speaker
-from encoder.params_data import partials_n_frames
+from models.encoder.data_objects.random_cycler import RandomCycler
+from models.encoder.data_objects.speaker_batch import SpeakerBatch
+from models.encoder.data_objects.speaker import Speaker
+from models.encoder.params_data import partials_n_frames
 from torch.utils.data import Dataset, DataLoader
 from pathlib import Path

@@ -1,8 +1,8 @@
-from encoder.params_data import *
-from encoder.model import SpeakerEncoder
-from encoder.audio import preprocess_wav # We want to expose this function from here
+from models.encoder.params_data import *
+from models.encoder.model import SpeakerEncoder
+from models.encoder.audio import preprocess_wav # We want to expose this function from here
 from matplotlib import cm
-from encoder import audio
+from models.encoder import audio
 from pathlib import Path
 import matplotlib.pyplot as plt
 import numpy as np

@@ -1,5 +1,5 @@
-from encoder.params_model import *
-from encoder.params_data import *
+from models.encoder.params_model import *
+from models.encoder.params_data import *
 from scipy.interpolate import interp1d
 from sklearn.metrics import roc_curve
 from torch.nn.utils import clip_grad_norm_

@@ -1,8 +1,8 @@
 from multiprocess.pool import ThreadPool
-from encoder.params_data import *
-from encoder.config import librispeech_datasets, anglophone_nationalites
+from models.encoder.params_data import *
+from models.encoder.config import librispeech_datasets, anglophone_nationalites
 from datetime import datetime
-from encoder import audio
+from models.encoder import audio
 from pathlib import Path
 from tqdm import tqdm
 import numpy as np
@@ -22,7 +22,7 @@ class DatasetLog:
         self._log_params()

     def _log_params(self):
-        from encoder import params_data
+        from models.encoder import params_data
         self.write_line("Parameter values:")
         for param_name in (p for p in dir(params_data) if not p.startswith("__")):
             value = getattr(params_data, param_name)

@@ -1,7 +1,7 @@
-from encoder.visualizations import Visualizations
-from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
-from encoder.params_model import *
-from encoder.model import SpeakerEncoder
+from models.encoder.visualizations import Visualizations
+from models.encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
+from models.encoder.params_model import *
+from models.encoder.model import SpeakerEncoder
 from utils.profiler import Profiler
 from pathlib import Path
 import torch

@@ -1,4 +1,4 @@
-from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
+from models.encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
 from datetime import datetime
 from time import perf_counter as timer
 import matplotlib.pyplot as plt
@@ -65,8 +65,8 @@ class Visualizations:
     def log_params(self):
         if self.disabled:
             return
-        from encoder import params_data
-        from encoder import params_model
+        from models.encoder import params_data
+        from models.encoder import params_model
         param_string = "<b>Model parameters</b>:<br>"
         for param_name in (p for p in dir(params_model) if not p.startswith("__")):
             value = getattr(params_model, param_name)

@@ -15,7 +15,7 @@ from .rnn_decoder_mol import Decoder
 from .utils.cnn_postnet import Postnet
 from .utils.vc_utils import get_mask_from_lengths

-from utils.load_yaml import HpsYaml
+from utils.hparams import HpsYaml

 class MelDecoderMOLv2(AbsMelDecoder):
     """Use an encoder to preprocess ppg."""

@@ -7,10 +7,10 @@ from pathlib import Path
 import soundfile
 import resampy

-from ppg_extractor import load_model
+from models.ppg_extractor import load_model
 import encoder.inference as Encoder
-from encoder.audio import preprocess_wav
-from encoder import audio
+from models.encoder.audio import preprocess_wav
+from models.encoder import audio
 from utils.f0_utils import compute_f0
 from torch.multiprocessing import Pool, cpu_count

@@ -2,8 +2,8 @@ import sys
 import torch
 import argparse
 import numpy as np
-from utils.load_yaml import HpsYaml
-from ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
+from utils.hparams import HpsYaml
+from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver

 # For reproducibility, comment these may speed up training
 torch.backends.cudnn.deterministic = True

@@ -8,7 +8,6 @@ from torch.utils.tensorboard import SummaryWriter
 from .option import default_hparas
 from utils.util import human_format, Timer
-from utils.load_yaml import HpsYaml

 class BaseSolver():

@@ -14,7 +14,7 @@ from utils.data_load import OneshotVcDataset, MultiSpkVcCollate
 from .loss import MaskedMSELoss
 from .optim import Optimizer
 from utils.util import human_format
-from ppg2mel import MelDecoderMOLv2
+from models.ppg2mel import MelDecoderMOLv2

 class Solver(BaseSolver):

@@ -1,36 +1,4 @@
-import ast
-import pprint
-import json
-
-class HParams(object):
-    def __init__(self, **kwargs): self.__dict__.update(kwargs)
-    def __setitem__(self, key, value): setattr(self, key, value)
-    def __getitem__(self, key): return getattr(self, key)
-    def __repr__(self): return pprint.pformat(self.__dict__)
-
-    def parse(self, string):
-        # Overrides hparams from a comma-separated string of name=value pairs
-        if len(string) > 0:
-            overrides = [s.split("=") for s in string.split(",")]
-            keys, values = zip(*overrides)
-            keys = list(map(str.strip, keys))
-            values = list(map(str.strip, values))
-            for k in keys:
-                self.__dict__[k] = ast.literal_eval(values[keys.index(k)])
-        return self
-
-    def loadJson(self, dict):
-        print("\Loading the json with %s\n", dict)
-        for k in dict.keys():
-            if k not in ["tts_schedule", "tts_finetune_layers"]:
-                self.__dict__[k] = dict[k]
-        return self
-
-    def dumpJson(self, fp):
-        print("\Saving the json with %s\n", fp)
-        with fp.open("w", encoding="utf-8") as f:
-            json.dump(self.__dict__, f)
-        return self
+from utils.hparams import HParams

 hparams = HParams(
     ### Signal Processing (used in both synthesizer and vocoder)
@@ -104,7 +72,7 @@ hparams = HParams(
     ### SV2TTS
     speaker_embedding_size = 256,       # Dimension for the speaker embedding
     silence_min_duration_split = 0.4,   # Duration in seconds of a silence for an utterance to be split
-    utterance_min_duration = 1.6,       # Duration in seconds below which utterances are discarded
+    utterance_min_duration = 0.5,       # Duration in seconds below which utterances are discarded
     use_gst = True,                     # Whether to use global style token
     use_ser_for_gst = True,             # Whether to use speaker embedding referenced for global style token
 )
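The `HParams` container itself now lives in `utils/hparams` and is imported here; judging from the removed code above, its `parse()` accepts the comma-separated `name=value` overrides that `--hparams` passes in, with values interpreted by `ast.literal_eval`. A short usage sketch under the assumption that the relocated class keeps that behaviour:

```python
# Override sketch: matches the parse() implementation removed above,
# assuming the relocated class in utils/hparams behaves the same way.
from models.synthesizer.hparams import hparams

hparams.parse("utterance_min_duration=0.5,use_gst=True")
print(hparams.utterance_min_duration)   # 0.5
print(hparams.use_gst)                  # True
```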

@@ -1,16 +1,15 @@
 import torch
-from synthesizer import audio
-from synthesizer.hparams import hparams
-from synthesizer.models.tacotron import Tacotron
-from synthesizer.utils.symbols import symbols
-from synthesizer.utils.text import text_to_sequence
-from vocoder.display import simple_table
+from models.synthesizer import audio
+from models.synthesizer.hparams import hparams
+from models.synthesizer.models.tacotron import Tacotron
+from models.synthesizer.utils.symbols import symbols
+from models.synthesizer.utils.text import text_to_sequence
+from models.vocoder.display import simple_table
 from pathlib import Path
 from typing import Union, List
 import numpy as np
 import librosa
 from utils import logmmse
-import json
 from pypinyin import lazy_pinyin, Style

 class Synthesizer:
@@ -48,8 +47,7 @@ class Synthesizer:
         # Try to scan config file
         model_config_fpaths = list(self.model_fpath.parent.rglob("*.json"))
         if len(model_config_fpaths)>0 and model_config_fpaths[0].exists():
-            with model_config_fpaths[0].open("r", encoding="utf-8") as f:
-                hparams.loadJson(json.load(f))
+            hparams.loadJson(model_config_fpaths[0])
         """
         Instantiates and loads the model given the weights file that was passed in the constructor.
         """

Some files were not shown because too many files have changed in this diff.
