Tacotron2 + HiFi-GAN Paimon (派蒙) 600 voice synthesis model download

The model was trained on Google Colab. I couldn't afford Colab Pro, so it took a long time of reconnecting, training, reconnecting, training.
The training target was set at 600, and training is now fully complete.
Model size: 322 MB (338,426,303 bytes).
To synthesize audio, the input needs to be pinyin plus tone numbers.
Test audio: https://wwb.lanzoul.com/ia7gs0bcr6da
Because the training data is uneven, quality varies from sentence to sentence, but to me it already sounds very close, even if not as good as VITS.
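The pinyin + tone-number format is what pypinyin's TONE3 style produces, and the calling code below applies the same conversion to raw Chinese input automatically. A minimal sketch of the conversion (assuming pypinyin is installed):

from pypinyin import lazy_pinyin, Style
# Chinese text -> pinyin with tone digits, e.g. 派蒙 -> roughly "pai4 meng2"
print(" ".join(lazy_pinyin("派蒙", style=Style.TONE3)))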


Because the model is larger than 100 MB, it can't be uploaded to Lanzou for sharing.
Google Drive link: https://drive.google.com/file/d/1I9kj7187xFyv9xapvmR-oBeILKX0gx9u/view?usp=sharing
The file has also been uploaded in the group (Group 1); if you can't access Google, join the group to download it.
Model inference code:
#@markdown Config:
#@markdown Restart the code to apply any changes.
#Add new characters here.
#Universal HiFi-GAN (has some robotic noise): 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW
Tacotron2_Model = '/content/drive/MyDrive/colab/outdir/Paimon_test'#@param {type:"string"}
TACOTRON2_ID = Tacotron2_Model
HIFIGAN_ID = "1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW"
from pypinyin import lazy_pinyin, Style

# Check whether this cell has already been initialized
try:
    initilized
except NameError:
    print("Setting up, please wait.\n")
    !pip install tqdm -q
    from tqdm.notebook import tqdm
    with tqdm(total=5, leave=False) as pbar:
        %tensorflow_version 2.x
        import os
        from os.path import exists, join, basename, splitext
        !pip install gdown
        git_repo_url = 'https://github.com/NVIDIA/tacotron2.git'
        project_name = splitext(basename(git_repo_url))[0]
        if not exists(project_name):
            # clone and install
            !git clone -q --recursive {git_repo_url}
            !git clone -q --recursive https://github.com/SortAnon/hifi-gan
            !pip install -q librosa unidecode
        pbar.update(1)  # downloaded TT2 and HiFi-GAN
        import sys
        sys.path.append('hifi-gan')
        sys.path.append(project_name)
        import time
        import matplotlib
        import matplotlib.pylab as plt
        import gdown
        d = 'https://drive.google.com/uc?id='
        %matplotlib inline
        import IPython.display as ipd
        import numpy as np
        import torch
        import json
        from hparams import create_hparams
        from model import Tacotron2
        from layers import TacotronSTFT
        from audio_processing import griffin_lim
        from text import text_to_sequence
        from env import AttrDict
        from meldataset import MAX_WAV_VALUE
        from models import Generator
        pbar.update(1)  # initialized dependencies
        graph_width = 900
        graph_height = 360
        def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))):
            %matplotlib inline
            fig, axes = plt.subplots(1, len(data), figsize=figsize)
            for i in range(len(data)):
                axes[i].imshow(data[i], aspect='auto', origin='bottom',
                               interpolation='none', cmap='inferno')
            fig.canvas.draw()
            plt.show()
        # Set up the pronunciation dictionary
        !gdown --id '1E12g_sREdcH5vuZb44EZYX8JjGWQ9rRp'
        thisdict = {}
        for line in reversed((open('merged.dict.txt', "r").read()).splitlines()):
            thisdict[(line.split(" ", 1))[0]] = (line.split(" ", 1))[1].strip()
        pbar.update(1)  # downloaded and set up the pronunciation dictionary
        def ARPA(text, punctuation=r"!?,.;", EOS_Token=True):
            out = ''
            for word_ in text.split(" "):
                word = word_
                end_chars = ''
                while any(elem in word for elem in punctuation) and len(word) > 1:
                    if word[-1] in punctuation:
                        end_chars = word[-1] + end_chars
                        word = word[:-1]
                    else:
                        break
                try:
                    word_arpa = thisdict[word.upper()]
                    word = "{" + str(word_arpa) + "}"
                except KeyError:
                    pass
                out = (out + " " + word + end_chars).strip()
            if EOS_Token and out[-1] != ";":
                out += ";"
            return out
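        # Note: the ARPAbet lookup above only runs when pronounciation_dictionary is True;
        # for pinyin-with-tone-number input it is left at its default of False further below.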
        def get_hifigan(MODEL_ID):
            # Download HiFi-GAN
            hifigan_pretrained_model = 'hifimodel'
            gdown.download(d + MODEL_ID, hifigan_pretrained_model, quiet=False)
            if not exists(hifigan_pretrained_model):
                raise Exception("HiFi-GAN model failed to download!")
            # Load HiFi-GAN
            conf = os.path.join("hifi-gan", "config_v1.json")
            with open(conf) as f:
                json_config = json.loads(f.read())
            h = AttrDict(json_config)
            torch.manual_seed(h.seed)
            hifigan = Generator(h).to(torch.device("cuda"))
            state_dict_g = torch.load(hifigan_pretrained_model, map_location=torch.device("cuda"))
            hifigan.load_state_dict(state_dict_g["generator"])
            hifigan.eval()
            hifigan.remove_weight_norm()
            return hifigan, h
        hifigan, h = get_hifigan(HIFIGAN_ID)
        pbar.update(1)  # downloaded and set up HiFi-GAN
        def has_MMI(STATE_DICT):
            return any(True for x in STATE_DICT.keys() if "mi." in x)
        def get_Tactron2(MODEL_ID):
            # Locate the Tacotron2 checkpoint (loaded from the Drive path set above, not downloaded)
            tacotron2_pretrained_model = TACOTRON2_ID
            if not exists(tacotron2_pretrained_model):
                raise Exception("Tacotron2 model not found!")
            # Load Tacotron2 and its config
            hparams = create_hparams()
            hparams.sampling_rate = 22050
            hparams.max_decoder_steps = 3000  # max duration
            hparams.gate_threshold = 0.25  # model must be 25% sure the clip is over before ending generation
            model = Tacotron2(hparams)
            state_dict = torch.load(tacotron2_pretrained_model)['state_dict']
            if has_MMI(state_dict):
                raise Exception("ERROR: This notebook does not currently support MMI models.")
            model.load_state_dict(state_dict)
            _ = model.cuda().eval().half()
            return model, hparams
        model, hparams = get_Tactron2(TACOTRON2_ID)
        previous_tt2_id = TACOTRON2_ID
        pbar.update(1)  # downloaded and set up Tacotron2
    # Inference helper: text -> mel spectrogram (Tacotron2) -> waveform (HiFi-GAN)
    def end_to_end_infer(text, pronounciation_dictionary, show_graphs):
        for i in [x for x in text.split("\n") if len(x)]:
            if not pronounciation_dictionary:
                if i[-1] != ";":
                    i = i + ";"
            else:
                i = ARPA(i)
            with torch.no_grad():  # save VRAM by not tracking gradients
                sequence = np.array(text_to_sequence(i, ['english_cleaners']))[None, :]
                sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
                mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
                if show_graphs:
                    plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
                               alignments.float().data.cpu().numpy()[0].T))
                y_g_hat = hifigan(mel_outputs_postnet.float())
                audio = y_g_hat.squeeze()
                audio = audio * MAX_WAV_VALUE
                print("")
                ipd.display(ipd.Audio(audio.cpu().numpy().astype("int16"), rate=hparams.sampling_rate))
    from IPython.display import clear_output
    clear_output()
    initilized = "Ready"
if previous_tt2_id != TACOTRON2_ID:
    print("Updating Models")
    model, hparams = get_Tactron2(TACOTRON2_ID)
    hifigan, h = get_hifigan(HIFIGAN_ID)
    previous_tt2_id = TACOTRON2_ID
pronounciation_dictionary = False #@param {type:"boolean"}
# Disables automatic ARPAbet conversion; useful for entering your own ARPAbet pronunciations or just for testing
show_graphs = True #@param {type:"boolean"}
max_duration = 25  # this value is not actually used
model.decoder.max_decoder_steps = 1000 #@param {type:"integer"}
stop_threshold = 0.3 #@param {type:"number"}
model.decoder.gate_threshold = stop_threshold
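# A lower stop_threshold lets the decoder end the clip sooner (see the gate_threshold
# comment in get_Tactron2); if audio cuts off too early, raising it may help.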
#@markdown ---
print(f"Current Config:\npronounciation_dictionary: {pronounciation_dictionary}\nshow_graphs: {show_graphs}\nmax_duration (in seconds): {max_duration}\nstop_threshold: {stop_threshold}\n\n")
time.sleep(1)
print("Enter/Paste your text. Type pinyin plus a digit for the tone; plain Chinese input is also supported.")
contents = []
while True:
    try:
        print("-" * 50)
        line = input()
        if line != "":
            line = " ".join(lazy_pinyin(line, style=Style.TONE3))
            print(line)
            end_to_end_infer(line, pronounciation_dictionary, show_graphs)
    except EOFError:
        break
    except KeyboardInterrupt:
        print("Stopping...")
        break
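At the prompt you can paste plain Chinese (the loop converts it with lazy_pinyin, prints the pinyin, then synthesizes it) or type space-separated pinyin with tone digits yourself; stopping the cell (KeyboardInterrupt) ends the loop.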