Tacotron2 + HiFi-GAN Paimon (派蒙) 600 voice synthesis model download

The model was trained on Google Colab. I couldn't afford Colab Pro, so it took a long time of reconnecting, training, reconnecting, training.
The training target was set at 600, and training is now fully complete.
Model size: 322 MB (338,426,303 bytes).
To synthesize audio, the input needs to be pinyin plus tone numbers.
Test audio: https://wwb.lanzoul.com/ia7gs0bcr6da
Because the training data is uneven, quality varies from sentence to sentence, but to me it already sounds very close, even if not as good as VITS.
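The pinyin + tone-number format is what pypinyin's TONE3 style produces, and the calling code below applies the same conversion to raw Chinese input automatically. A minimal sketch of the conversion (assuming pypinyin is installed):

from pypinyin import lazy_pinyin, Style
# Chinese text -> pinyin with tone digits, e.g. 派蒙 -> roughly "pai4 meng2"
print(" ".join(lazy_pinyin("派蒙", style=Style.TONE3)))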


Because the model is larger than 100 MB, it can't be uploaded to Lanzou for sharing.
Google Drive link: https://drive.google.com/file/d/1I9kj7187xFyv9xapvmR-oBeILKX0gx9u/view?usp=sharing
The file has also been uploaded in the group (Group 1); if you can't access Google, join the group to download it.
Model inference code:
#@markdown Config:
#@markdown Restart the code to apply any changes.
#Add new characters here.
#Universal HiFi-GAN (has some robotic noise): 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW
Tacotron2_Model = '/content/drive/MyDrive/colab/outdir/Paimon_test'#@param {type:"string"}
TACOTRON2_ID = Tacotron2_Model
HIFIGAN_ID = "1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW"
from pypinyin import lazy_pinyin, Style

# Check whether this cell has already been initialized
try:
    initilized
except NameError:
    print("Setting up, please wait.\n")
    !pip install tqdm -q
    from tqdm.notebook import tqdm
    with tqdm(total=5, leave=False) as pbar:
        %tensorflow_version 2.x
        import os
        from os.path import exists, join, basename, splitext
        !pip install gdown
        git_repo_url = 'https://github.com/NVIDIA/tacotron2.git'
        project_name = splitext(basename(git_repo_url))[0]
        if not exists(project_name):
            # clone and install
            !git clone -q --recursive {git_repo_url}
            !git clone -q --recursive https://github.com/SortAnon/hifi-gan
            !pip install -q librosa unidecode
        pbar.update(1)  # downloaded TT2 and HiFi-GAN
        import sys
        sys.path.append('hifi-gan')
        sys.path.append(project_name)
        import time
        import matplotlib
        import matplotlib.pylab as plt
        import gdown
        d = 'https://drive.google.com/uc?id='
        %matplotlib inline
        import IPython.display as ipd
        import numpy as np
        import torch
        import json
        from hparams import create_hparams
        from model import Tacotron2
        from layers import TacotronSTFT
        from audio_processing import griffin_lim
        from text import text_to_sequence
        from env import AttrDict
        from meldataset import MAX_WAV_VALUE
        from models import Generator
        pbar.update(1)  # initialized dependencies
        graph_width = 900
        graph_height = 360
        def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))):
            %matplotlib inline
            fig, axes = plt.subplots(1, len(data), figsize=figsize)
            for i in range(len(data)):
                axes[i].imshow(data[i], aspect='auto', origin='bottom',
                               interpolation='none', cmap='inferno')
            fig.canvas.draw()
            plt.show()
        # Set up the pronunciation dictionary
        !gdown --id '1E12g_sREdcH5vuZb44EZYX8JjGWQ9rRp'
        thisdict = {}
        for line in reversed((open('merged.dict.txt', "r").read()).splitlines()):
            thisdict[(line.split(" ", 1))[0]] = (line.split(" ", 1))[1].strip()
        pbar.update(1)  # downloaded and set up the pronunciation dictionary
        def ARPA(text, punctuation=r"!?,.;", EOS_Token=True):
            out = ''
            for word_ in text.split(" "):
                word = word_
                end_chars = ''
                while any(elem in word for elem in punctuation) and len(word) > 1:
                    if word[-1] in punctuation:
                        end_chars = word[-1] + end_chars
                        word = word[:-1]
                    else:
                        break
                try:
                    word_arpa = thisdict[word.upper()]
                    word = "{" + str(word_arpa) + "}"
                except KeyError:
                    pass
                out = (out + " " + word + end_chars).strip()
            if EOS_Token and out[-1] != ";":
                out += ";"
            return out
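        # Note: the ARPAbet lookup above only runs when pronounciation_dictionary is True;
        # for pinyin-with-tone-number input it is left at its default of False further below.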
        def get_hifigan(MODEL_ID):
            # Download HiFi-GAN
            hifigan_pretrained_model = 'hifimodel'
            gdown.download(d + MODEL_ID, hifigan_pretrained_model, quiet=False)
            if not exists(hifigan_pretrained_model):
                raise Exception("HiFi-GAN model failed to download!")
            # Load HiFi-GAN
            conf = os.path.join("hifi-gan", "config_v1.json")
            with open(conf) as f:
                json_config = json.loads(f.read())
            h = AttrDict(json_config)
            torch.manual_seed(h.seed)
            hifigan = Generator(h).to(torch.device("cuda"))
            state_dict_g = torch.load(hifigan_pretrained_model, map_location=torch.device("cuda"))
            hifigan.load_state_dict(state_dict_g["generator"])
            hifigan.eval()
            hifigan.remove_weight_norm()
            return hifigan, h
        hifigan, h = get_hifigan(HIFIGAN_ID)
        pbar.update(1)  # downloaded and set up HiFi-GAN
        def has_MMI(STATE_DICT):
            return any(True for x in STATE_DICT.keys() if "mi." in x)
        def get_Tactron2(MODEL_ID):
            # Locate the Tacotron2 checkpoint (loaded from the Drive path set above, not downloaded)
            tacotron2_pretrained_model = TACOTRON2_ID
            if not exists(tacotron2_pretrained_model):
                raise Exception("Tacotron2 model not found!")
            # Load Tacotron2 and its config
            hparams = create_hparams()
            hparams.sampling_rate = 22050
            hparams.max_decoder_steps = 3000  # max duration
            hparams.gate_threshold = 0.25  # model must be 25% sure the clip is over before ending generation
            model = Tacotron2(hparams)
            state_dict = torch.load(tacotron2_pretrained_model)['state_dict']
            if has_MMI(state_dict):
                raise Exception("ERROR: This notebook does not currently support MMI models.")
            model.load_state_dict(state_dict)
            _ = model.cuda().eval().half()
            return model, hparams
        model, hparams = get_Tactron2(TACOTRON2_ID)
        previous_tt2_id = TACOTRON2_ID
        pbar.update(1)  # downloaded and set up Tacotron2
    # Inference helper: text -> mel spectrogram (Tacotron2) -> waveform (HiFi-GAN)
    def end_to_end_infer(text, pronounciation_dictionary, show_graphs):
        for i in [x for x in text.split("\n") if len(x)]:
            if not pronounciation_dictionary:
                if i[-1] != ";":
                    i = i + ";"
            else:
                i = ARPA(i)
            with torch.no_grad():  # save VRAM by not tracking gradients
                sequence = np.array(text_to_sequence(i, ['english_cleaners']))[None, :]
                sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
                mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
                if show_graphs:
                    plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
                               alignments.float().data.cpu().numpy()[0].T))
                y_g_hat = hifigan(mel_outputs_postnet.float())
                audio = y_g_hat.squeeze()
                audio = audio * MAX_WAV_VALUE
                print("")
                ipd.display(ipd.Audio(audio.cpu().numpy().astype("int16"), rate=hparams.sampling_rate))
    from IPython.display import clear_output
    clear_output()
    initilized = "Ready"
if previous_tt2_id != TACOTRON2_ID:
    print("Updating Models")
    model, hparams = get_Tactron2(TACOTRON2_ID)
    hifigan, h = get_hifigan(HIFIGAN_ID)
    previous_tt2_id = TACOTRON2_ID
pronounciation_dictionary = False #@param {type:"boolean"}
# Disables automatic ARPAbet conversion; useful for entering your own ARPAbet pronunciations or just for testing
show_graphs = True #@param {type:"boolean"}
max_duration = 25  # this value is not actually used
model.decoder.max_decoder_steps = 1000 #@param {type:"integer"}
stop_threshold = 0.3 #@param {type:"number"}
model.decoder.gate_threshold = stop_threshold
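# A lower stop_threshold lets the decoder end the clip sooner (see the gate_threshold
# comment in get_Tactron2); if audio cuts off too early, raising it may help.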
#@markdown ---
print(f"Current Config:\npronounciation_dictionary: {pronounciation_dictionary}\nshow_graphs: {show_graphs}\nmax_duration (in seconds): {max_duration}\nstop_threshold: {stop_threshold}\n\n")
time.sleep(1)
print("Enter/Paste your text. Type pinyin plus a digit for the tone; plain Chinese input is also supported.")
contents = []
while True:
    try:
        print("-" * 50)
        line = input()
        if line != "":
            line = " ".join(lazy_pinyin(line, style=Style.TONE3))
            print(line)
            end_to_end_infer(line, pronounciation_dictionary, show_graphs)
    except EOFError:
        break
    except KeyboardInterrupt:
        print("Stopping...")
        break
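At the prompt you can paste plain Chinese (the loop converts it with lazy_pinyin, prints the pinyin, then synthesizes it) or type space-separated pinyin with tone digits yourself; stopping the cell (KeyboardInterrupt) ends the loop.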