
VitsApi - adapted from MoeGoe

2023-04-15 15:37 | By: 不想起名字的雨狼

MoeGoe environment setup (see the VITS local deployment guide for the detailed steps)

As before, use Anaconda to create a virtual Python 3.9 environment.

If you don't mind touching your local environment, you can install Python 3.9 directly. Python 3.10 has not been tested, so it is not guaranteed to work.
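
A minimal sketch of the conda commands (the environment name moegoe-api is only an example; any name works):

conda create -n moegoe-api python=3.9
conda activate moegoe-api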

This is only an outline of the idea: there is no config-file upload or any other form of customization.

The model and config files have to be placed on the server beforehand.

Download the MoeGoe source code and extract it into any folder:

https://github.com/CjangCjengh/MoeGoe/
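
For example, with git (downloading and extracting the ZIP from GitHub works just as well):

git clone https://github.com/CjangCjengh/MoeGoe.git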

Edit mandarin.py in the text folder of the project directory.

Change line 10 to the following, so that jieba loads dict.txt from the jieba folder under the working directory the API process is started from:

jieba.set_dictionary(os.path.abspath('.')+'/jieba/dict.txt')

Create a new file named Api.py in the MoeGoe project folder and copy the following code into it.

The audios folder stores the generated audio files; the model folder holds G_*.pth and config.json.
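
Assuming those two folders are created next to Api.py, the project folder looks roughly like this (G_1000.pth is only an example checkpoint name; the jieba folder holds the dict.txt referenced by the modified mandarin.py):

MoeGoe/
├── Api.py              <- the new API file created above
├── audios/             <- generated .wav files are written here
├── model/
│   ├── G_1000.pth
│   └── config.json
├── jieba/
│   └── dict.txt
└── text/
    └── mandarin.py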

from flask import Flask, jsonify, request

from flask_cors import CORS, cross_origin

from scipy.io.wavfile import write

from mel_processing import spectrogram_torch

from text import text_to_sequence, _clean_text

from models import SynthesizerTrn

import utils

import commons

import sys

import re

from torch import no_grad, LongTensor

import logging


app = Flask(__name__)


api_port = 5000  # port the Flask server listens on


# Default values; api_for_main() reads the actual values for each request from the JSON payload
vits_model_path = r""

vits_config_path = r""

vits_model_speaker_id = 0

tts_choice = True

read_text = ""

audio_output_path = r""

escape = False


hubert_model_path = r""

audio_input_path = r""

# tts_choice: true = tts, false = vc

raw_text = ""

emotion_reference_path = r""

w2v2_dimensional_emotion_model_path = r""


cors = CORS(app)


logging.getLogger('numba').setLevel(logging.WARNING)

   

def ex_print(text, escape=False):

    if escape:

        print(text.encode('unicode_escape').decode())

    else:

        print(text)


def get_text(text, hps, cleaned=False):

    if cleaned:

        text_norm = text_to_sequence(text, hps.symbols, [])

    else:

        text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)

    if hps.data.add_blank:

        text_norm = commons.intersperse(text_norm, 0)

    text_norm = LongTensor(text_norm)

    return text_norm


def ask_if_continue():

    while True:

        answer = input('Continue? (y/n): ')

        if answer == 'y':

            break

        elif answer == 'n':

            sys.exit(0)


def print_speakers(speakers, escape=False):

    if len(speakers) > 100:

        return

    print('ID\tSpeaker')

    for id, name in enumerate(speakers):

        ex_print(str(id) + '\t' + name, escape)


def get_speaker_id():

    speaker_id = vits_model_speaker_id

    try:

        speaker_id = int(speaker_id)

    except:

        print(str(speaker_id) + ' is not a valid ID!')

        sys.exit(1)

    return speaker_id


def get_label_value(text, label, default, warning_name='value'):

    value = re.search(rf'\[{label}=(.+?)\]', text)

    if value:

        try:

            text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1)

            value = float(value.group(1))

        except:

            print(f'Invalid {warning_name}!')

            sys.exit(1)

    else:

        value = default

    return value, text


def get_label(text, label):

    if f'[{label}]' in text:

        return True, text.replace(f'[{label}]', '')

    else:

        return False, text


def api_for_main(json):
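    # Handle one request: read paths and options from the request JSON, load the
    # VITS model and config, then run TTS or voice conversion and write the result
    # to audio_output_path.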


    # get_speaker_id() reads the module-level vits_model_speaker_id, so expose the per-request value there
    global vits_model_speaker_id

    vits_model_path = json["vits_model_path"]

    vits_config_path = json["vits_config_path"]

    vits_model_speaker_id = json["vits_model_speaker_id"]

    tts_choice = json["tts_choice"]

    read_text = json["read_text"]

    audio_output_path = json["audio_output_path"]

    escape = json["escape"]


    hubert_model_path = json["hubert_model_path"]

    audio_input_path = json["audio_input_path"]

    # tts_choice: true = tts, false = vc

    raw_text = json["raw_text"]

    emotion_reference_path = json["emotion_reference_path"]

    w2v2_dimensional_emotion_model_path = json["w2v2_dimensional_emotion_model_path"]


    model = vits_model_path

    config = vits_config_path


    hps_ms = utils.get_hparams_from_file(config)

    n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0

    n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0

    speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0']

    use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False

    emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False


    net_g_ms = SynthesizerTrn(

        n_symbols,

        hps_ms.data.filter_length // 2 + 1,

        hps_ms.train.segment_size // hps_ms.data.hop_length,

        n_speakers=n_speakers,

        emotion_embedding=emotion_embedding,

        **hps_ms.model)

    _ = net_g_ms.eval()

    utils.load_checkpoint(model, net_g_ms)


    def voice_conversion():
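        # Convert audio_input_path to the target speaker and return (audio, out_path).
        # Note: both the source and target ids come from vits_model_speaker_id here.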

        audio_path = audio_input_path  # input audio supplied in the request JSON

        audio = utils.load_audio_to_torch(

            audio_path, hps_ms.data.sampling_rate)


        original_id = get_speaker_id()

        target_id = get_speaker_id()

        out_path = audio_output_path


        y = audio.unsqueeze(0)


        spec = spectrogram_torch(y, hps_ms.data.filter_length,

                                 hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length,

                                 center=False)

        spec_lengths = LongTensor([spec.size(-1)])

        sid_src = LongTensor([original_id])


        with no_grad():

            sid_tgt = LongTensor([target_id])

            audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[

                0][0, 0].data.cpu().float().numpy()

        return audio, out_path


    if n_symbols != 0:
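        # The config defines text symbols: this is a TTS-capable VITS model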

        if not emotion_embedding:

            while True:

                choice = tts_choice

                if choice == True:

                    text = read_text

                    if text == '[ADVANCED]':

                        text = raw_text

                        print('Cleaned text is:')

                        ex_print(_clean_text(

                            text, hps_ms.data.text_cleaners), escape)

                        break  # in API mode the text never changes, so print the cleaned text once and stop


                    length_scale, text = get_label_value(

                        text, 'LENGTH', 1, 'length scale')

                    noise_scale, text = get_label_value(

                        text, 'NOISE', 0.667, 'noise scale')

                    noise_scale_w, text = get_label_value(

                        text, 'NOISEW', 0.8, 'deviation of noise')

                    cleaned, text = get_label(text, 'CLEANED')


                    stn_tst = get_text(text, hps_ms, cleaned=cleaned)


                    speaker_id = get_speaker_id()

                    out_path = audio_output_path


                    with no_grad():

                        x_tst = stn_tst.unsqueeze(0)

                        x_tst_lengths = LongTensor([stn_tst.size(0)])

                        sid = LongTensor([speaker_id])

                        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,

                                               noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()


                elif choice == False:

                    audio, out_path = voice_conversion()


                write(out_path, hps_ms.data.sampling_rate, audio)
                break  # one request produces one output file; exit the loop

        else:
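            # Emotion-embedding models: build an emotion vector from a reference
            # audio file (or a cached .npy) using the w2v2 dimensional emotion model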

            import os

            import librosa

            import numpy as np

            from torch import FloatTensor

            import audonnx

            w2v2_folder = w2v2_dimensional_emotion_model_path

            w2v2_model = audonnx.load(os.path.dirname(w2v2_folder))

            while True:

                choice = tts_choice

                if choice == True:

                    text = read_text

                    if text == '[ADVANCED]':

                        text = raw_text

                        print('Cleaned text is:')

                        ex_print(_clean_text(

                            text, hps_ms.data.text_cleaners), escape)

                        break  # in API mode the text never changes, so print the cleaned text once and stop


                    length_scale, text = get_label_value(

                        text, 'LENGTH', 1, 'length scale')

                    noise_scale, text = get_label_value(

                        text, 'NOISE', 0.667, 'noise scale')

                    noise_scale_w, text = get_label_value(

                        text, 'NOISEW', 0.8, 'deviation of noise')

                    cleaned, text = get_label(text, 'CLEANED')


                    stn_tst = get_text(text, hps_ms, cleaned=cleaned)


                    speaker_id = get_speaker_id()


                    emotion_reference = emotion_reference_path

                    if emotion_reference.endswith('.npy'):

                        emotion = np.load(emotion_reference)

                        emotion = FloatTensor(emotion).unsqueeze(0)

                    else:

                        audio16000, sampling_rate = librosa.load(

                            emotion_reference, sr=16000, mono=True)

                        emotion = w2v2_model(audio16000, sampling_rate)[

                            'hidden_states']

                        emotion_reference = re.sub(

                            r'\..*$', '', emotion_reference)

                        np.save(emotion_reference, emotion.squeeze(0))

                        emotion = FloatTensor(emotion)


                    out_path = audio_output_path


                    with no_grad():

                        x_tst = stn_tst.unsqueeze(0)

                        x_tst_lengths = LongTensor([stn_tst.size(0)])

                        sid = LongTensor([speaker_id])

                        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,

                                               length_scale=length_scale, emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy()


                elif choice == False:

                    audio, out_path = voice_conversion()


                write(out_path, hps_ms.data.sampling_rate, audio)
                break  # one request produces one output file; exit the loop


    else:
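        # No text symbols in the config: hubert-based voice conversion model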

        model = hubert_model_path

        from hubert_model import hubert_soft

        hubert = hubert_soft(model)


        while True:

            audio_path = audio_input_path


            if audio_path != '[VC]':

                import librosa

                if use_f0:

                    audio, sampling_rate = librosa.load(

                        audio_path, sr=hps_ms.data.sampling_rate, mono=True)

                    audio16000 = librosa.resample(

                        audio, orig_sr=sampling_rate, target_sr=16000)

                else:

                    audio16000, sampling_rate = librosa.load(

                        audio_path, sr=16000, mono=True)


                target_id = get_speaker_id()

                out_path = audio_output_path

                length_scale, out_path = get_label_value(

                    out_path, 'LENGTH', 1, 'length scale')

                noise_scale, out_path = get_label_value(

                    out_path, 'NOISE', 0.1, 'noise scale')

                noise_scale_w, out_path = get_label_value(

                    out_path, 'NOISEW', 0.1, 'deviation of noise')


                from torch import inference_mode, FloatTensor

                import numpy as np

                with inference_mode():

                    units = hubert.units(FloatTensor(audio16000).unsqueeze(

                        0).unsqueeze(0)).squeeze(0).numpy()

                    if use_f0:

                        f0_scale, out_path = get_label_value(

                            out_path, 'F0', 1, 'f0 scale')

                        f0 = librosa.pyin(audio, sr=sampling_rate,

                                          fmin=librosa.note_to_hz('C0'),

                                          fmax=librosa.note_to_hz('C7'),

                                          frame_length=1780)[0]

                        target_length = len(units[:, 0])

                        f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length,

                                                     np.arange(0, len(f0)), f0)) * f0_scale

                        units[:, 0] = f0 / 10


                stn_tst = FloatTensor(units)

                with no_grad():

                    x_tst = stn_tst.unsqueeze(0)

                    x_tst_lengths = LongTensor([stn_tst.size(0)])

                    sid = LongTensor([target_id])

                    audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,

                                           noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy()


            else:

                audio, out_path = voice_conversion()


            write(out_path, hps_ms.data.sampling_rate, audio)
            break  # one request produces one output file; exit the loop


@app.route("/tts",methods=["GET","POST"])

@cross_origin()

def tts():

    json = request.json.get('data', None)

    api_for_main(json)
    return jsonify({"status": "ok"})  # Flask requires the view to return a response


@app.route("/",methods=["GET"])

@cross_origin()

def helloWorld():

  return "connect success"


if __name__ == '__main__':

    app.run(port=api_port, debug=True)  # add host="0.0.0.0" to accept requests from other machines

Replace the contents of requirements.txt with the following (simply delete everything in the file, paste the list below, and save):

numba

librosa

numpy==1.23

flask_cors

flask

jsonify

scipy

torch

unidecode

openjtalk>=0.3.0.dev2

jamo

pypinyin

jieba

protobuf

cn2an

inflect

eng_to_ipa

ko_pron

indic_transliteration

num_thai

opencc

audonnx

python-dotenv

Save and exit.
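
With the environment activated, installing the dependencies and starting the server might look like this (run from the MoeGoe project folder so the relative jieba/dict.txt path resolves):

pip install -r requirements.txt
python Api.py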


Call the REST API with any HTTP client. The request path is http://<server or local IP>:<port>/tts. The route accepts both GET and POST, but since the parameters are sent as a JSON body, POST is the practical choice.

Example request URL:

http://127.0.0.1:5000/tts



Parameters

"vits_model_path" [string]: path to the model file (on the server)

"vits_config_path" [string]: path to the TTS config file (on the server)

"vits_model_speaker_id" [int]: speaker ID

"tts_choice" [boolean]: whether to run TTS (true = TTS, false = voice conversion)

"read_text" [string]: text to convert to speech

"audio_output_path" [string]: path of the generated audio file

The remaining fields map directly to MoeGoe's own options; see the official MoeGoe documentation for their exact meaning:

"escape" [boolean]

"hubert_model_path" [string]

"audio_input_path" [string]

"raw_text" [string]

"emotion_reference_path" [string]

"w2v2_dimensional_emotion_model_path" [string]


JSON structure passed to the backend (the paths can also be absolute):

{

    "data": {

        "vits_model_path": "./model/G_1000.pth",

        "vits_config_path": "./model/config.json",

        "vits_model_speaker_id": 0,

        "tts_choice": true,

        "read_text":"测试",

        "audio_output_path":"./audios/test.wav",

        "escape":false,

        "hubert_model_path":"",

        "audio_input_path":"",

        "raw_text":"",

        "emotion_reference_path":"",

        "w2v2_dimensional_emotion_model_path":""

    }

}
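
A minimal Python sketch that sends the JSON above to the default port, using the requests package (assumed to be installed on the calling side):

import requests

payload = {
    "data": {
        "vits_model_path": "./model/G_1000.pth",
        "vits_config_path": "./model/config.json",
        "vits_model_speaker_id": 0,
        "tts_choice": True,
        "read_text": "测试",
        "audio_output_path": "./audios/test.wav",
        "escape": False,
        "hubert_model_path": "",
        "audio_input_path": "",
        "raw_text": "",
        "emotion_reference_path": "",
        "w2v2_dimensional_emotion_model_path": ""
    }
}

# POST the payload as JSON; the server reads request.json.get('data')
resp = requests.post("http://127.0.0.1:5000/tts", json=payload)
print(resp.status_code, resp.text)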


You can also test the API with an interface-debugging tool such as Apifox or Postman.


Changing the backend API server port

Edit the api_port variable in Api.py; the default is 5000:

api_port = 5000

