
Tacotron2 + HiFi-GAN Paimon 600 Voice Synthesis Model Download

2022-09-09 14:56 | Author: 雾削木FHZ


The model was trained on Google Colab. I couldn't afford Colab Pro, so a lot of time went into reconnecting, training, reconnecting, training.

The training target was set at 600, and training has now been completed.

Model size: 322 MB (338,426,303 bytes)

To synthesize audio, the input text must be pinyin plus tone numbers.
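The conversion used later in the notebook is pypinyin's Style.TONE3, which appends the tone number to each syllable. A minimal sketch of that step (assuming pypinyin is installed):

from pypinyin import lazy_pinyin, Style

# Convert plain Chinese into the space-separated, tone-numbered pinyin the model expects,
# e.g. "派蒙" -> "pai4 meng2"
text = "派蒙"
model_input = " ".join(lazy_pinyin(text, style=Style.TONE3))
print(model_input)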

Test audio: https://wwb.lanzoul.com/ia7gs0bcr6da

Because the training data is uneven, different sentences come out with different quality, but to my ear it already sounds very close, though not as good as VITS.


[Image: mel spectrogram plot, training 600]

Because the model is larger than 100 MB, it can't be uploaded to Lanzou (蓝奏) for sharing.

Google Drive share link: https://drive.google.com/file/d/1I9kj7187xFyv9xapvmR-oBeILKX0gx9u/view?usp=sharing

The file has also been uploaded in the group (group 1); those who can't access Google can join the group and download it there.

Model inference code:

#@markdown Config:


#@markdown Restart the code to apply any changes.


#Add new characters here.

#Universal HiFi-GAN (has some robotic noise): 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW

Tacotron2_Model = '/content/drive/MyDrive/colab/outdir/Paimon_test'#@param {type:"string"}

TACOTRON2_ID = Tacotron2_Model

HIFIGAN_ID = "1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW"

from pypinyin import lazy_pinyin,Style


# Check if already initialized (skip setup if this cell has run before)

try:

    initilized

except NameError:

    print("Setting up, please wait.\n")

    !pip install tqdm -q

    from tqdm.notebook import tqdm

    with tqdm(total=5, leave=False) as pbar:

        %tensorflow_version 2.x

        import os

        from os.path import exists, join, basename, splitext

        !pip install gdown

        git_repo_url = 'https://github.com/NVIDIA/tacotron2.git'

        project_name = splitext(basename(git_repo_url))[0]

        if not exists(project_name):

            # clone and install

            !git clone -q --recursive {git_repo_url}

            !git clone -q --recursive https://github.com/SortAnon/hifi-gan

            !pip install -q librosa unidecode

        pbar.update(1) # downloaded TT2 and HiFi-GAN

        import sys

        sys.path.append('hifi-gan')

        sys.path.append(project_name)

        import time

        import matplotlib

        import matplotlib.pylab as plt

        import gdown

        d = 'https://drive.google.com/uc?id='


        %matplotlib inline

        import IPython.display as ipd

        import numpy as np

        import torch

        import json

        from hparams import create_hparams

        from model import Tacotron2

        from layers import TacotronSTFT

        from audio_processing import griffin_lim

        from text import text_to_sequence

        from env import AttrDict

        from meldataset import MAX_WAV_VALUE

        from models import Generator


        pbar.update(1) # initialized dependencies


        graph_width = 900

        graph_height = 360

        def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))):

            %matplotlib inline

            fig, axes = plt.subplots(1, len(data), figsize=figsize)

            for i in range(len(data)):

                axes[i].imshow(data[i], aspect='auto', origin='lower',

                            interpolation='none', cmap='inferno')

            fig.canvas.draw()

            plt.show()


        # Set up pronunciation dictionary

        !gdown --id '1E12g_sREdcH5vuZb44EZYX8JjGWQ9rRp'

        thisdict = {}

        for line in reversed((open('merged.dict.txt', "r").read()).splitlines()):

            thisdict[(line.split(" ",1))[0]] = (line.split(" ",1))[1].strip()


        pbar.update(1) # Downloaded and set up pronunciation dictionary


        def ARPA(text, punctuation=r"!?,.;", EOS_Token=True):

            out = ''

            for word_ in text.split(" "):

                word=word_; end_chars = ''

                while any(elem in word for elem in punctuation) and len(word) > 1:

                    if word[-1] in punctuation: end_chars = word[-1] + end_chars; word = word[:-1]

                    else: break

                try:

                    word_arpa = thisdict[word.upper()]

                    word = "{" + str(word_arpa) + "}"

                except KeyError: pass

                out = (out + " " + word + end_chars).strip()

            if EOS_Token and out[-1] != ";": out += ";"

            return out


        def get_hifigan(MODEL_ID):

            # Download HiFi-GAN

            hifigan_pretrained_model = 'hifimodel'

            gdown.download(d+MODEL_ID, hifigan_pretrained_model, quiet=False)

            if not exists(hifigan_pretrained_model):

                raise Exception("HiFI-GAN model failed to download!")


            # Load HiFi-GAN

            conf = os.path.join("hifi-gan", "config_v1.json")

            with open(conf) as f:

                json_config = json.loads(f.read())

            h = AttrDict(json_config)

            torch.manual_seed(h.seed)

            hifigan = Generator(h).to(torch.device("cuda"))

            state_dict_g = torch.load(hifigan_pretrained_model, map_location=torch.device("cuda"))

            hifigan.load_state_dict(state_dict_g["generator"])

            hifigan.eval()

            hifigan.remove_weight_norm()

            return hifigan, h


        hifigan, h = get_hifigan(HIFIGAN_ID)

        pbar.update(1) # Downloaded and Set up HiFi-GAN


        def has_MMI(STATE_DICT):

            return any(True for x in STATE_DICT.keys() if "mi." in x)


        def get_Tactron2(MODEL_ID):

            # Locate the Tacotron2 checkpoint (path configured above)

            tacotron2_pretrained_model = TACOTRON2_ID

            if not exists(tacotron2_pretrained_model):

                raise Exception("Tacotron2 model failed to download!")

            # Load Tacotron2 and Config

            hparams = create_hparams()

            hparams.sampling_rate = 22050

            hparams.max_decoder_steps = 3000 # Max Duration

            hparams.gate_threshold = 0.25 # Model must be 25% sure the clip is over before ending generation

            model = Tacotron2(hparams)

            state_dict = torch.load(tacotron2_pretrained_model)['state_dict']

            if has_MMI(state_dict):

                raise Exception("ERROR: This notebook does not currently support MMI models.")

            model.load_state_dict(state_dict)

            _ = model.cuda().eval().half()

            return model, hparams


        model, hparams = get_Tactron2(TACOTRON2_ID)

        previous_tt2_id = TACOTRON2_ID


        pbar.update(1) # Downloaded and Set up Tacotron2


        # Extra Info

        def end_to_end_infer(text, pronounciation_dictionary, show_graphs):

            for i in [x for x in text.split("\n") if len(x)]:

                if not pronounciation_dictionary:

                    if i[-1] != ";": i=i+";" 

                else: i = ARPA(i)

                with torch.no_grad(): # save VRAM by not including gradients

                    sequence = np.array(text_to_sequence(i, ['english_cleaners']))[None, :]

                    sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()

                    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

                    if show_graphs:

                        plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],

                                alignments.float().data.cpu().numpy()[0].T))

                    y_g_hat = hifigan(mel_outputs_postnet.float())

                    audio = y_g_hat.squeeze()

                    audio = audio * MAX_WAV_VALUE

                    print("")

                    ipd.display(ipd.Audio(audio.cpu().numpy().astype("int16"), rate=hparams.sampling_rate))

    from IPython.display import clear_output

    clear_output()

    initilized = "Ready"


if previous_tt2_id != TACOTRON2_ID:

    print("Updating Models")

    model, hparams = get_Tactron2(TACOTRON2_ID)

    hifigan, h = get_hifigan(HIFIGAN_ID)

    previous_tt2_id = TACOTRON2_ID


pronounciation_dictionary = False #@param {type:"boolean"}

# disables automatic ARPAbet conversion, useful for inputting your own ARPAbet pronunciations or just for testing

show_graphs = True #@param {type:"boolean"}

max_duration = 25 #this does nothing

model.decoder.max_decoder_steps = 1000 #@param {type:"integer"}

stop_threshold = 0.3 #@param {type:"number"}

model.decoder.gate_threshold = stop_threshold


#@markdown ---


print(f"Current Config:\npronounciation_dictionary: {pronounciation_dictionary}\nshow_graphs: {show_graphs}\nmax_duration (in seconds): {max_duration}\nstop_threshold: {stop_threshold}\n\n")


time.sleep(1)

print("Enter/Paste your text.输入拼音+数字表示声调,支持直接中文输入")

contents = []

while True:

    try:

        print("-"*50)

        line = input()

        if line != "":

          line = " ".join(lazy_pinyin(line, style=Style.TONE3))

        print(line)

        end_to_end_infer(line, pronounciation_dictionary, show_graphs)

    except EOFError:

        break

    except KeyboardInterrupt:

        print("Stopping...")

        break
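
A hypothetical example of a session (the synthesized audio itself depends on the checkpoint): entering plain Chinese such as 你好 prints the converted form before synthesis, while tone-numbered pinyin typed directly passes through the conversion unchanged.

--------------------------------------------------
你好
ni3 hao3
(an audio player with the synthesized clip is displayed)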


