You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
MockingBird/toolbox/ui.py

933 lines
37 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import matplotlib.pyplot as plt
import numpy
from scipy.fftpack import dct
from PyQt5.QtGui import QPalette, QBrush, QPixmap
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
from matplotlib.figure import Figure
from PyQt5.QtCore import Qt, QStringListModel
from PyQt5 import QtGui
from PyQt5.QtWidgets import *
from encoder.inference import plot_embedding_as_heatmap
from toolbox.utterance import Utterance
from pathlib import Path
from typing import List, Set
import sounddevice as sd
import soundfile as sf
import numpy as np
# from sklearn.manifold import TSNE # You can try with TSNE if you like, I prefer UMAP
from time import sleep
import umap
import sys
from warnings import filterwarnings, warn
filterwarnings("ignore")
colormap = np.array([
[0, 127, 70],
[255, 0, 0],
[255, 217, 38],
[0, 135, 255],
[165, 0, 165],
[255, 167, 255],
[97, 142, 151],
[0, 255, 255],
[255, 96, 38],
[142, 76, 0],
[33, 0, 127],
[0, 0, 0],
[183, 183, 183],
[76, 255, 0],
], dtype=np.float) / 255
default_text = \
"请输入需要克隆的语音文本!"
class UI(QDialog):
min_umap_points = 4
max_log_lines = 5
max_saved_utterances = 20
def draw_utterance(self, utterance: Utterance, which):
self.draw_spec(utterance.spec, which)
self.draw_embed(utterance.embed, utterance.name, which)
def draw_embed(self, embed, name, which):
embed_ax, _ = self.current_ax if which == "current" else self.gen_ax
embed_ax.figure.suptitle("" if embed is None else name)
## Embedding
# Clear the plot
if len(embed_ax.images) > 0:
embed_ax.images[0].colorbar.remove()
embed_ax.clear()
# Draw the embed
if embed is not None:
plot_embedding_as_heatmap(embed, embed_ax)
embed_ax.set_title("embedding")
embed_ax.set_aspect("equal", "datalim")
embed_ax.set_xticks([])
embed_ax.set_yticks([])
embed_ax.figure.canvas.draw()
def draw_spec(self, spec, which):
_, spec_ax = self.current_ax if which == "current" else self.gen_ax
## Spectrogram
# Draw the spectrogram
spec_ax.clear()
if spec is not None:
im = spec_ax.imshow(spec, aspect="auto", interpolation="none")
# spec_ax.figure.colorbar(mappable=im, shrink=0.65, orientation="horizontal",
# spec_ax=spec_ax)
spec_ax.set_title("mel spectrogram")
spec_ax.set_xticks([])
spec_ax.set_yticks([])
spec_ax.figure.canvas.draw()
if which != "current":
self.vocode_button.setDisabled(spec is None)
def draw_umap_projections(self, utterances: Set[Utterance]):
self.umap_ax.clear()
speakers = np.unique([u.speaker_name for u in utterances])
colors = {speaker_name: colormap[i] for i, speaker_name in enumerate(speakers)}
embeds = [u.embed for u in utterances]
# Display a message if there aren't enough points
if len(utterances) < self.min_umap_points:
self.umap_ax.text(.5, .5, "umap:\nAdd %d more points to\ngenerate the projections" %
(self.min_umap_points - len(utterances)),
horizontalalignment='center', fontsize=15)
self.umap_ax.set_title("")
# Compute the projections
else:
if not self.umap_hot:
self.log(
"Drawing UMAP projections for the first time, this will take a few seconds.")
self.umap_hot = True
reducer = umap.UMAP(int(np.ceil(np.sqrt(len(embeds)))), metric="cosine")
# reducer = TSNE()
projections = reducer.fit_transform(embeds)
speakers_done = set()
for projection, utterance in zip(projections, utterances):
color = colors[utterance.speaker_name]
mark = "x" if "_gen_" in utterance.name else "o"
label = None if utterance.speaker_name in speakers_done else utterance.speaker_name
speakers_done.add(utterance.speaker_name)
self.umap_ax.scatter(projection[0], projection[1], c=[color], marker=mark,
label=label)
# self.umap_ax.set_title("UMAP projections")
self.umap_ax.legend(prop={'size': 10})
# Draw the plot
self.umap_ax.set_aspect("equal", "datalim")
self.umap_ax.set_xticks([])
self.umap_ax.set_yticks([])
self.umap_ax.figure.canvas.draw()
def save_audio_file(self, wav, sample_rate):
dialog = QFileDialog()
dialog.setDefaultSuffix(".wav")
fpath, _ = dialog.getSaveFileName(
parent=self,
caption="Select a path to save the audio file",
filter="Audio Files (*.flac *.wav)"
)
if fpath:
#Default format is wav
if Path(fpath).suffix == "":
fpath += ".wav"
sf.write(fpath, wav, sample_rate)
def setup_audio_devices(self, sample_rate):
input_devices = []
output_devices = []
for device in sd.query_devices():
# Check if valid input
try:
sd.check_input_settings(device=device["name"], samplerate=sample_rate)
input_devices.append(device["name"])
except:
pass
# Check if valid output
try:
sd.check_output_settings(device=device["name"], samplerate=sample_rate)
output_devices.append(device["name"])
except Exception as e:
# Log a warning only if the device is not an input
if not device["name"] in input_devices:
warn("Unsupported output device %s for the sample rate: %d \nError: %s" % (device["name"], sample_rate, str(e)))
if len(input_devices) == 0:
self.log("No audio input device detected. Recording may not work.")
self.audio_in_device = None
else:
self.audio_in_device = input_devices[0]
if len(output_devices) == 0:
self.log("No supported output audio devices were found! Audio output may not work.")
self.audio_out_devices_cb.addItems(["None"])
self.audio_out_devices_cb.setDisabled(True)
else:
self.audio_out_devices_cb.clear()
self.audio_out_devices_cb.addItems(output_devices)
self.audio_out_devices_cb.currentTextChanged.connect(self.set_audio_device)
self.set_audio_device()
def set_audio_device(self):
output_device = self.audio_out_devices_cb.currentText()
if output_device == "None":
output_device = None
# If None, sounddevice queries portaudio
sd.default.device = (self.audio_in_device, output_device)
def play(self, wav, sample_rate):
try:
sd.stop()
sd.play(wav, sample_rate)
except Exception as e:
print(e)
self.log("Error in audio playback. Try selecting a different audio output device.")
self.log("Your device must be connected before you start the toolbox.")
def stop(self):
sd.stop()
def record_one(self, sample_rate, duration):
self.record_button.setText("Recording...")
self.record_button.setDisabled(True)
self.log("Recording %d seconds of audio" % duration)
sd.stop()
try:
wav = sd.rec(duration * sample_rate, sample_rate, 1)
except Exception as e:
print(e)
self.log("Could not record anything. Is your recording device enabled?")
self.log("Your device must be connected before you start the toolbox.")
return None
for i in np.arange(0, duration, 0.1):
self.set_loading(i, duration)
sleep(0.1)
self.set_loading(duration, duration)
sd.wait()
self.log("Done recording.")
self.record_button.setText("Record")
self.record_button.setDisabled(False)
return wav.squeeze()
#添加source_mfcc分析函数
def plot_mfcc(self, wav, sample_rate):
signal = wav
print(sample_rate, len(signal))
# 读取前3.5s 的数据
signal = signal[0:int(3.5 * sample_rate)]
print(signal)
# 预先处理
pre_emphasis = 0.97
emphasized_signal = numpy.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
frame_size = 0.025
frame_stride = 0.1
frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate
signal_length = len(emphasized_signal)
frame_length = int(round(frame_length))
frame_step = int(round(frame_step))
num_frames = int(numpy.ceil(float(numpy.abs(signal_length - frame_length)) / frame_step))
pad_signal_length = num_frames * frame_step + frame_length
z = numpy.zeros((pad_signal_length - signal_length))
pad_signal = numpy.append(emphasized_signal, z)
indices = numpy.tile(numpy.arange(0, frame_length), (num_frames, 1)) + numpy.tile(
numpy.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
frames = pad_signal[numpy.mat(indices).astype(numpy.int32, copy=False)]
# 加上汉明窗
frames *= numpy.hamming(frame_length)
# frames *= 0.54 - 0.46 * numpy.cos((2 * numpy.pi * n) / (frame_length - 1)) # Explicit Implementation **
# 傅立叶变换和功率谱
NFFT = 512
mag_frames = numpy.absolute(numpy.fft.rfft(frames, NFFT)) # Magnitude of the FFT
# print(mag_frames.shape)
pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum
low_freq_mel = 0
# 将频率转换为Mel
nfilt = 40
high_freq_mel = (2595 * numpy.log10(1 + (sample_rate / 2) / 700))
mel_points = numpy.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale
hz_points = (700 * (10 ** (mel_points / 2595) - 1)) # Convert Mel to Hz
bin = numpy.floor((NFFT + 1) * hz_points / sample_rate)
fbank = numpy.zeros((nfilt, int(numpy.floor(NFFT / 2 + 1))))
for m in range(1, nfilt + 1):
f_m_minus = int(bin[m - 1]) # left
f_m = int(bin[m]) # center
f_m_plus = int(bin[m + 1]) # right
for k in range(f_m_minus, f_m):
fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1])
for k in range(f_m, f_m_plus):
fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m])
filter_banks = numpy.dot(pow_frames, fbank.T)
filter_banks = numpy.where(filter_banks == 0, numpy.finfo(float).eps, filter_banks) # Numerical Stability
filter_banks = 20 * numpy.log10(filter_banks) # dB
# 所得到的倒谱系数2-13被保留其余的被丢弃
num_ceps = 12
mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1: (num_ceps + 1)]
(nframes, ncoeff) = mfcc.shape
n = numpy.arange(ncoeff)
cep_lifter = 22
lift = 1 + (cep_lifter / 2) * numpy.sin(numpy.pi * n / cep_lifter)
mfcc *= lift # *
# filter_banks -= (numpy.mean(filter_banks, axis=0) + 1e-8)
mfcc -= (numpy.mean(mfcc, axis=0) + 1e-8)
print(mfcc.shape)
# 创建新的figure
fig10 = plt.figure(figsize=(16,8))
# 绘制1x2两行两列共四个图编号从1开始
ax = fig10.add_subplot(121)
plt.plot(mfcc)
ax = fig10.add_subplot(122)
# 平均归一化MFCC
mfcc -= (numpy.mean(mfcc, axis=0) + 1e-8)
plt.imshow(numpy.flipud(mfcc.T), cmap=plt.cm.jet, aspect=0.2,
extent=[0, mfcc.shape[0], 0, mfcc.shape[1]]) # 热力图
#将figure保存为png并显示在新创建的子窗口上
plt.savefig("fmcc_source.png")
dialog_fault = QDialog()
dialog_fault.setWindowTitle("源音频MFCC特征图及MFCC平均归一化热图") # 设置窗口名
pic = QPixmap("fmcc_source.png")
label_pic = QLabel("show", dialog_fault)
label_pic.setPixmap(pic)
label_pic.setGeometry(0,0,1500,800)
dialog_fault.exec_()
@property
def current_dataset_name(self):
return self.dataset_box.currentText()
@property
def current_speaker_name(self):
return self.speaker_box.currentText()
@property
def current_utterance_name(self):
return self.utterance_box.currentText()
def browse_file(self):
fpath = QFileDialog().getOpenFileName(
parent=self,
caption="Select an audio file",
filter="Audio Files (*.mp3 *.flac *.wav *.m4a)"
)
return Path(fpath[0]) if fpath[0] != "" else ""
@staticmethod
def repopulate_box(box, items, random=False):
"""
Resets a box and adds a list of items. Pass a list of (item, data) pairs instead to join
data to the items
"""
box.blockSignals(True)
box.clear()
for item in items:
item = list(item) if isinstance(item, tuple) else [item]
box.addItem(str(item[0]), *item[1:])
if len(items) > 0:
box.setCurrentIndex(np.random.randint(len(items)) if random else 0)
box.setDisabled(len(items) == 0)
box.blockSignals(False)
def populate_browser(self, datasets_root: Path, recognized_datasets: List, level: int,
random=True):
# Select a random dataset
if level <= 0:
if datasets_root is not None:
datasets = [datasets_root.joinpath(d) for d in recognized_datasets]
datasets = [d.relative_to(datasets_root) for d in datasets if d.exists()]
self.browser_load_button.setDisabled(len(datasets) == 0)
if datasets_root is None or len(datasets) == 0:
msg = "Tip: Please " + (" select the voice to be cloned" \
if datasets_root is None else "o not have any of the recognized datasets" \
" in %s" % datasets_root)
self.log(msg)
msg += ".\nThe recognized datasets are:\n\t%s\nFeel free to add your own. You " \
"can still use the toolbox by recording samples yourself." % \
("\n\t".join(recognized_datasets))
print(msg, file=sys.stderr)
self.random_utterance_button.setDisabled(True)
self.random_speaker_button.setDisabled(True)
self.random_dataset_button.setDisabled(True)
self.utterance_box.setDisabled(True)
self.speaker_box.setDisabled(True)
self.dataset_box.setDisabled(True)
self.browser_load_button.setDisabled(True)
self.auto_next_checkbox.setDisabled(True)
return
self.repopulate_box(self.dataset_box, datasets, random)
# Select a random speaker
if level <= 1:
speakers_root = datasets_root.joinpath(self.current_dataset_name)
speaker_names = [d.stem for d in speakers_root.glob("*") if d.is_dir()]
self.repopulate_box(self.speaker_box, speaker_names, random)
# Select a random utterance
if level <= 2:
utterances_root = datasets_root.joinpath(
self.current_dataset_name,
self.current_speaker_name
)
utterances = []
for extension in ['mp3', 'flac', 'wav', 'm4a']:
utterances.extend(Path(utterances_root).glob("**/*.%s" % extension))
utterances = [fpath.relative_to(utterances_root) for fpath in utterances]
self.repopulate_box(self.utterance_box, utterances, random)
def browser_select_next(self):
index = (self.utterance_box.currentIndex() + 1) % len(self.utterance_box)
self.utterance_box.setCurrentIndex(index)
@property
def current_encoder_fpath(self):
return self.encoder_box.itemData(self.encoder_box.currentIndex())
@property
def current_synthesizer_fpath(self):
return self.synthesizer_box.itemData(self.synthesizer_box.currentIndex())
@property
def current_vocoder_fpath(self):
return self.vocoder_box.itemData(self.vocoder_box.currentIndex())
def populate_models(self, encoder_models_dir: Path, synthesizer_models_dir: Path,
vocoder_models_dir: Path):
# Encoder
encoder_fpaths = list(encoder_models_dir.glob("*.pt"))
if len(encoder_fpaths) == 0:
raise Exception("No encoder models found in %s" % encoder_models_dir)
self.repopulate_box(self.encoder_box, [(f.stem, f) for f in encoder_fpaths])
# Synthesizer
synthesizer_fpaths = list(synthesizer_models_dir.glob("**/*.pt"))
if len(synthesizer_fpaths) == 0:
raise Exception("No synthesizer models found in %s" % synthesizer_models_dir)
self.repopulate_box(self.synthesizer_box, [(f.stem, f) for f in synthesizer_fpaths])
# Vocoder
vocoder_fpaths = list(vocoder_models_dir.glob("**/*.pt"))
vocoder_items = [(f.stem, f) for f in vocoder_fpaths] + [("Griffin-Lim", None)]
self.repopulate_box(self.vocoder_box, vocoder_items)
@property
def selected_utterance(self):
return self.utterance_history.itemData(self.utterance_history.currentIndex())
def register_utterance(self, utterance: Utterance):
self.utterance_history.blockSignals(True)
self.utterance_history.insertItem(0, utterance.name, utterance)
self.utterance_history.setCurrentIndex(0)
self.utterance_history.blockSignals(False)
if len(self.utterance_history) > self.max_saved_utterances:
self.utterance_history.removeItem(self.max_saved_utterances)
self.play_button.setDisabled(False)
self.generate_button.setDisabled(False)
self.synthesize_button.setDisabled(False)
def log(self, line, mode="newline"):
if mode == "newline":
self.logs.append(line)
if len(self.logs) > self.max_log_lines:
del self.logs[0]
elif mode == "append":
self.logs[-1] += line
elif mode == "overwrite":
self.logs[-1] = line
log_text = '\n'.join(self.logs)
self.log_window.setText(log_text)
self.app.processEvents()
def set_loading(self, value, maximum=1):
self.loading_bar.setValue(value * 100)
self.loading_bar.setMaximum(maximum * 100)
self.loading_bar.setTextVisible(value != 0)
self.app.processEvents()
def populate_gen_options(self, seed, trim_silences):
if seed is not None:
self.random_seed_checkbox.setChecked(True)
self.seed_textbox.setText(str(seed))
self.seed_textbox.setEnabled(True)
else:
self.random_seed_checkbox.setChecked(False)
self.seed_textbox.setText(str(0))
self.seed_textbox.setEnabled(False)
if not trim_silences:
self.trim_silences_checkbox.setChecked(False)
self.trim_silences_checkbox.setDisabled(True)
def update_seed_textbox(self):
if self.random_seed_checkbox.isChecked():
self.seed_textbox.setEnabled(True)
else:
self.seed_textbox.setEnabled(False)
def reset_interface(self):
self.draw_embed(None, None, "current")
self.draw_embed(None, None, "generated")
self.draw_spec(None, "current")
self.draw_spec(None, "generated")
self.draw_umap_projections(set())
self.set_loading(0)
self.play_button.setDisabled(True)
self.generate_button.setDisabled(True)
self.synthesize_button.setDisabled(True)
self.vocode_button.setDisabled(True)
self.replay_wav_button.setDisabled(True)
self.export_wav_button.setDisabled(True)
[self.log("") for _ in range(self.max_log_lines)]
#添加result_mfcc分析函数
def plot_mfcc1(self, wav, sample_rate):
signal = wav
print(sample_rate, len(signal))
# 读取前3.5s 的数据
signal = signal[0:int(3.5 * sample_rate)]
print(signal)
# 预先处理
pre_emphasis = 0.97
emphasized_signal = numpy.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
frame_size = 0.025
frame_stride = 0.1
frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate
signal_length = len(emphasized_signal)
frame_length = int(round(frame_length))
frame_step = int(round(frame_step))
num_frames = int(numpy.ceil(float(numpy.abs(signal_length - frame_length)) / frame_step))
pad_signal_length = num_frames * frame_step + frame_length
z = numpy.zeros((pad_signal_length - signal_length))
pad_signal = numpy.append(emphasized_signal, z)
indices = numpy.tile(numpy.arange(0, frame_length), (num_frames, 1)) + numpy.tile(
numpy.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
frames = pad_signal[numpy.mat(indices).astype(numpy.int32, copy=False)]
# 加上汉明窗
frames *= numpy.hamming(frame_length)
# frames *= 0.54 - 0.46 * numpy.cos((2 * numpy.pi * n) / (frame_length - 1)) # Explicit Implementation **
# 傅立叶变换和功率谱
NFFT = 512
mag_frames = numpy.absolute(numpy.fft.rfft(frames, NFFT)) # Magnitude of the FFT
# print(mag_frames.shape)
pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum
low_freq_mel = 0
# 将频率转换为Mel
nfilt = 40
high_freq_mel = (2595 * numpy.log10(1 + (sample_rate / 2) / 700))
mel_points = numpy.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale
hz_points = (700 * (10 ** (mel_points / 2595) - 1)) # Convert Mel to Hz
bin = numpy.floor((NFFT + 1) * hz_points / sample_rate)
fbank = numpy.zeros((nfilt, int(numpy.floor(NFFT / 2 + 1))))
for m in range(1, nfilt + 1):
f_m_minus = int(bin[m - 1]) # left
f_m = int(bin[m]) # center
f_m_plus = int(bin[m + 1]) # right
for k in range(f_m_minus, f_m):
fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1])
for k in range(f_m, f_m_plus):
fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m])
filter_banks = numpy.dot(pow_frames, fbank.T)
filter_banks = numpy.where(filter_banks == 0, numpy.finfo(float).eps, filter_banks) # Numerical Stability
filter_banks = 20 * numpy.log10(filter_banks) # dB
# 所得到的倒谱系数2-13被保留其余的被丢弃
num_ceps = 12
mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1: (num_ceps + 1)]
(nframes, ncoeff) = mfcc.shape
n = numpy.arange(ncoeff)
cep_lifter = 22
lift = 1 + (cep_lifter / 2) * numpy.sin(numpy.pi * n / cep_lifter)
mfcc *= lift # *
# filter_banks -= (numpy.mean(filter_banks, axis=0) + 1e-8)
mfcc -= (numpy.mean(mfcc, axis=0) + 1e-8)
print(mfcc.shape)
# 创建新的figure
fig11 = plt.figure(figsize=(16,8))
# 绘制1x2两行两列共四个图编号从1开始
ax = fig11.add_subplot(121)
plt.plot(mfcc)
ax = fig11.add_subplot(122)
# 平均归一化MFCC
mfcc -= (numpy.mean(mfcc, axis=0) + 1e-8)
plt.imshow(numpy.flipud(mfcc.T), cmap=plt.cm.jet, aspect=0.2,
extent=[0, mfcc.shape[0], 0, mfcc.shape[1]]) # 热力图
#将figure保存为png并显示在新创建的子窗口上
plt.savefig("fmcc_result.png")
dialog_fault1 = QDialog()
dialog_fault1.setWindowTitle("合成音频MFCC特征图及MFCC平均归一化热图") # 设置窗口名
pic = QPixmap("fmcc_result.png")
label_pic = QLabel("show", dialog_fault1)
label_pic.setPixmap(pic)
label_pic.setGeometry(0,0,1500,800)
dialog_fault1.exec_()
def __init__(self):
## Initialize the application
self.app = QApplication(sys.argv)
super().__init__(None)
self.setWindowTitle("中文语音克隆系统")
self.setWindowIcon(QtGui.QIcon('toolbox\\assets\\mb.png'))
self.setWindowFlag(Qt.WindowMinimizeButtonHint, True)
self.setWindowFlag(Qt.WindowMaximizeButtonHint, True)
## Main layouts
# Root
root_layout = QGridLayout()
self.setLayout(root_layout)
# Browser
browser_layout = QGridLayout()
root_layout.addLayout(browser_layout, 0, 0, 1, 8)
# Generation
gen_layout = QVBoxLayout()
root_layout.addLayout(gen_layout, 0, 8)
# Visualizations
vis_layout = QVBoxLayout()
root_layout.addLayout(vis_layout, 1, 0, 2, 8)
# Output
output_layout = QGridLayout()
vis_layout.addLayout(output_layout, 0)
# Projections
self.projections_layout = QVBoxLayout()
root_layout.addLayout(self.projections_layout, 1, 8, 2, 2)
## Projections
# UMap
fig, self.umap_ax = plt.subplots(figsize=(3, 3), facecolor="#F0F0F0")
fig.subplots_adjust(left=0.02, bottom=0.02, right=0.98, top=0.98)
self.projections_layout.addWidget(FigureCanvas(fig))
self.umap_hot = False
self.clear_button = QPushButton("Clear")
self.clear_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/2.png)}')
self.projections_layout.addWidget(self.clear_button)
## Browser
# Dataset, speaker and utterance selection
i = 0
source_groupbox = QGroupBox('Source(源音频)')
source_layout = QGridLayout()
source_groupbox.setLayout(source_layout)
browser_layout.addWidget(source_groupbox, i, 0, 1, 4)
self.dataset_box = QComboBox()
# source_layout.addWidget(QLabel("Dataset(数据集):"), i, 0) #隐藏标签文字
source_layout.addWidget(self.dataset_box, i, 1)
self.random_dataset_button = QPushButton("Random")
source_layout.addWidget(self.random_dataset_button, i, 2)
self.random_dataset_button.hide() #隐藏按钮
self.dataset_box.hide() #隐藏选项条
i += 1
self.speaker_box = QComboBox()
# source_layout.addWidget(QLabel("Speaker(说话者)"), i, 0)
source_layout.addWidget(self.speaker_box, i, 1)
self.random_speaker_button = QPushButton("Random")
source_layout.addWidget(self.random_speaker_button, i, 2)
self.random_speaker_button.hide()
self.speaker_box.hide()
i += 1
self.utterance_box = QComboBox()
# source_layout.addWidget(QLabel("Utterance(音频):"), i, 0)
source_layout.addWidget(self.utterance_box, i, 1)
self.random_utterance_button = QPushButton("Random")
source_layout.addWidget(self.random_utterance_button, i, 2)
self.random_utterance_button.hide()
self.utterance_box.hide()
i += 1
source_layout.addWidget(QLabel("<b>Use(使用):</b>"), i, 0)
self.browser_load_button = QPushButton("")
source_layout.addWidget(self.browser_load_button, i, 1, 1, 2)
self.auto_next_checkbox = QCheckBox("Auto select next")
self.auto_next_checkbox.setChecked(True)
source_layout.addWidget(self.auto_next_checkbox, i + 1, 1)
self.browser_browse_button = QPushButton("Browse(打开本地)")
self.browser_browse_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
source_layout.addWidget(self.browser_browse_button, i, 3)
self.record_button = QPushButton("Record(录音)")
self.record_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
source_layout.addWidget(self.record_button, i+1, 3)
i += 2
# Utterance box
browser_layout.addWidget(QLabel("<b>Current(当前):</b>"), i, 0)
self.utterance_history = QComboBox()
browser_layout.addWidget(self.utterance_history, i, 1)
self.play_button = QPushButton("Play(播放)")
self.play_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
browser_layout.addWidget(self.play_button, i, 2)
self.stop_button = QPushButton("Stop(暂停)")
self.stop_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
browser_layout.addWidget(self.stop_button, i, 3)
i += 1
model_groupbox = QGroupBox('Models(模型选择)')
model_layout = QHBoxLayout()
model_groupbox.setLayout(model_layout)
browser_layout.addWidget(model_groupbox, i, 0, 1, 4)
# Model and audio output selection
self.encoder_box = QComboBox()
model_layout.addWidget(QLabel("Encoder:"))
model_layout.addWidget(self.encoder_box)
self.synthesizer_box = QComboBox()
model_layout.addWidget(QLabel("Synthesizer:"))
model_layout.addWidget(self.synthesizer_box)
self.vocoder_box = QComboBox()
model_layout.addWidget(QLabel("Vocoder:"))
model_layout.addWidget(self.vocoder_box)
#Replay & Save Audio
i = 0
output_layout.addWidget(QLabel("<b>Toolbox Output:</b>"), i, 0)
self.waves_cb = QComboBox()
self.waves_cb_model = QStringListModel()
self.waves_cb.setModel(self.waves_cb_model)
self.waves_cb.setToolTip("Select one of the last generated waves in this section for replaying or exporting")
output_layout.addWidget(self.waves_cb, i, 1)
self.replay_wav_button = QPushButton("Replay(重播)")
self.replay_wav_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
self.replay_wav_button.setToolTip("Replay last generated vocoder")
output_layout.addWidget(self.replay_wav_button, i, 2)
self.export_wav_button = QPushButton("Export(导出)")
self.export_wav_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
self.export_wav_button.setToolTip("Save last generated vocoder audio in filesystem as a wav file")
output_layout.addWidget(self.export_wav_button, i, 3)
self.audio_out_devices_cb=QComboBox()
i += 1
output_layout.addWidget(QLabel("<b>Audio Output</b>"), i, 0)
output_layout.addWidget(self.audio_out_devices_cb, i, 1)
## Embed & spectrograms
vis_layout.addStretch()
#添加标签控件,设置标签文字格式并且居中
label1 = QLabel("source audio")
label1.setStyleSheet("QLabel{color:red;font-size:20px;font-weight:bold;font-family:Roman times;}")
label1.setAlignment(Qt.AlignCenter)
vis_layout.addWidget(label1) #addwidget:添加控件
gridspec_kw = {"width_ratios": [1, 4]}
fig, self.current_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
gridspec_kw=gridspec_kw)
#self.current_ax[1].set_title("source audio", fontsize=50, color='red', fontstyle='italic', fontweight="heavy")
fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
vis_layout.addWidget(FigureCanvas(fig))
label2 = QLabel("target audio")
label2.setStyleSheet("QLabel{color:red;font-size:20px;font-weight:bold;font-family:Roman times;}")
label2.setAlignment(Qt.AlignCenter)
vis_layout.addWidget(label2)
fig, self.gen_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0",
gridspec_kw=gridspec_kw)
#self.gen_ax[1].set_title("target audio", fontsize=50, color='red', fontstyle='italic', fontweight="heavy")
fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8)
vis_layout.addWidget(FigureCanvas(fig))
for ax in self.current_ax.tolist() + self.gen_ax.tolist():
ax.set_facecolor("#F0F0F0")
for side in ["top", "right", "bottom", "left"]:
ax.spines[side].set_visible(False)
## Generation
self.text_prompt = QPlainTextEdit(default_text)
gen_layout.addWidget(self.text_prompt, stretch=1)
self.generate_button = QPushButton("Synthesize and vocode(合成并播放)")
self.generate_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
gen_layout.addWidget(self.generate_button)
layout = QHBoxLayout()
self.synthesize_button = QPushButton("Synthesize only(仅合成)")
self.synthesize_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
layout.addWidget(self.synthesize_button)
self.vocode_button = QPushButton("Vocode only(仅播放)")
self.vocode_button.setStyleSheet('QPushButton{border-image:url(toolbox/assets/1.png)}')
layout.addWidget(self.vocode_button)
gen_layout.addLayout(layout)
layout_seed = QGridLayout()
self.random_seed_checkbox = QCheckBox("Random seed(随机数种子):")
self.random_seed_checkbox.setToolTip("When checked, makes the synthesizer and vocoder deterministic.")
layout_seed.addWidget(self.random_seed_checkbox, 0, 0)
self.seed_textbox = QLineEdit()
self.seed_textbox.setMaximumWidth(80)
layout_seed.addWidget(self.seed_textbox, 0, 1)
self.trim_silences_checkbox = QCheckBox("Enhance vocoder output语音增强")
self.trim_silences_checkbox.setToolTip("When checked, trims excess silence in vocoder output."
" This feature requires `webrtcvad` to be installed.")
layout_seed.addWidget(self.trim_silences_checkbox, 0, 2, 1, 2)
self.style_slider = QSlider(Qt.Horizontal)
self.style_slider.setTickInterval(1)
self.style_slider.setFocusPolicy(Qt.NoFocus)
self.style_slider.setSingleStep(1)
self.style_slider.setRange(-1, 9)
self.style_value_label = QLabel("-1")
self.style_slider.setValue(-1)
layout_seed.addWidget(QLabel("Style(风格):"), 1, 0)
self.style_slider.valueChanged.connect(lambda s: self.style_value_label.setNum(s))
layout_seed.addWidget(self.style_value_label, 1, 1)
layout_seed.addWidget(self.style_slider, 1, 3)
self.token_slider = QSlider(Qt.Horizontal)
self.token_slider.setTickInterval(1)
self.token_slider.setFocusPolicy(Qt.NoFocus)
self.token_slider.setSingleStep(1)
self.token_slider.setRange(3, 9)
self.token_value_label = QLabel("4")
self.token_slider.setValue(4)
layout_seed.addWidget(QLabel("Accuracy(精度):"), 2, 0)
self.token_slider.valueChanged.connect(lambda s: self.token_value_label.setNum(s))
layout_seed.addWidget(self.token_value_label, 2, 1)
layout_seed.addWidget(self.token_slider, 2, 3)
self.length_slider = QSlider(Qt.Horizontal)
self.length_slider.setTickInterval(1)
self.length_slider.setFocusPolicy(Qt.NoFocus)
self.length_slider.setSingleStep(1)
self.length_slider.setRange(1, 10)
self.length_value_label = QLabel("2")
self.length_slider.setValue(2)
layout_seed.addWidget(QLabel("MaxLength(最大句长):"), 3, 0)
self.length_slider.valueChanged.connect(lambda s: self.length_value_label.setNum(s))
layout_seed.addWidget(self.length_value_label, 3, 1)
layout_seed.addWidget(self.length_slider, 3, 3)
gen_layout.addLayout(layout_seed)
self.loading_bar = QProgressBar()
gen_layout.addWidget(self.loading_bar)
self.log_window = QLabel()
self.log_window.setAlignment(Qt.AlignBottom | Qt.AlignLeft)
gen_layout.addWidget(self.log_window)
self.logs = []
gen_layout.addStretch()
## Set the size of the window and of the elements
max_size = QDesktopWidget().availableGeometry(self).size() * 0.5
self.resize(max_size)
## Finalize the display
self.reset_interface()
self.show()
##set the picture of background
palette1 = QPalette()
# palette1.setColor(self.backgroundRole(), QColor(192,253,123)) # 设置背景颜色
palette1.setBrush(self.backgroundRole(), QBrush(QPixmap('toolbox\\assets\\picture1.jpg'))) # 设置背景图片
self.setPalette(palette1)
self.setAutoFillBackground(True)
def start(self):
self.app.exec_()