
VitsApi - adapted from MoeGoe

2023-04-15 15:37 | By: 不想起名字的雨狼

MoeGoe environment setup (see the VITS local deployment guide for the detailed steps)

As before, use Anaconda to create a virtual Python 3.9 environment.

If you don't mind touching your local environment, you can install Python 3.9 directly. Python 3.10 has not been tested, so it is not guaranteed to work.
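
A minimal sketch of the conda commands (the environment name moegoe-api is only an example; any name works):

conda create -n moegoe-api python=3.9
conda activate moegoe-api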

This is only an outline of the idea: there is no config-file upload or any other form of customization.

The model and config files have to be placed on the server beforehand.

Download the MoeGoe source code and extract it into any folder:

https://github.com/CjangCjengh/MoeGoe/
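
For example, with git (downloading and extracting the ZIP from GitHub works just as well):

git clone https://github.com/CjangCjengh/MoeGoe.git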

Edit mandarin.py in the text folder of the project directory.

Change line 10 to the following, so that jieba loads dict.txt from the jieba folder under the working directory the API process is started from:

jieba.set_dictionary(os.path.abspath('.')+'/jieba/dict.txt')

Create a new file named Api.py in the MoeGoe project folder and copy the following code into it.

The audios folder stores the generated audio files; the model folder holds G_*.pth and config.json.
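
Assuming those two folders are created next to Api.py, the project folder looks roughly like this (G_1000.pth is only an example checkpoint name; the jieba folder holds the dict.txt referenced by the modified mandarin.py):

MoeGoe/
├── Api.py              <- the new API file created above
├── audios/             <- generated .wav files are written here
├── model/
│   ├── G_1000.pth
│   └── config.json
├── jieba/
│   └── dict.txt
└── text/
    └── mandarin.py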

from flask import Flask, jsonify, request

from flask_cors import CORS, cross_origin

from scipy.io.wavfile import write

from mel_processing import spectrogram_torch

from text import text_to_sequence, _clean_text

from models import SynthesizerTrn

import utils

import commons

import sys

import re

from torch import no_grad, LongTensor

import logging


app = Flask(__name__)


api_port = 5000  # port the Flask server listens on


# Default values; api_for_main() reads the actual values for each request from the JSON payload
vits_model_path = r""

vits_config_path = r""

vits_model_speaker_id = 0

tts_choice = True

read_text = ""

audio_output_path = r""

escape = False


hubert_model_path = r""

audio_input_path = r""

# tts_choice: true = tts, false = vc

raw_text = ""

emotion_reference_path = r""

w2v2_dimensional_emotion_model_path = r""


cors = CORS(app)


logging.getLogger('numba').setLevel(logging.WARNING)

   

def ex_print(text, escape=False):

    if escape:

        print(text.encode('unicode_escape').decode())

    else:

        print(text)


def get_text(text, hps, cleaned=False):

    if cleaned:

        text_norm = text_to_sequence(text, hps.symbols, [])

    else:

        text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)

    if hps.data.add_blank:

        text_norm = commons.intersperse(text_norm, 0)

    text_norm = LongTensor(text_norm)

    return text_norm


def ask_if_continue():

    while True:

        answer = input('Continue? (y/n): ')

        if answer == 'y':

            break

        elif answer == 'n':

            sys.exit(0)


def print_speakers(speakers, escape=False):

    if len(speakers) > 100:

        return

    print('ID\tSpeaker')

    for id, name in enumerate(speakers):

        ex_print(str(id) + '\t' + name, escape)


def get_speaker_id():

    speaker_id = vits_model_speaker_id

    try:

        speaker_id = int(speaker_id)

    except:

        print(str(speaker_id) + ' is not a valid ID!')

        sys.exit(1)

    return speaker_id


def get_label_value(text, label, default, warning_name='value'):

    value = re.search(rf'\[{label}=(.+?)\]', text)

    if value:

        try:

            text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1)

            value = float(value.group(1))

        except:

            print(f'Invalid {warning_name}!')

            sys.exit(1)

    else:

        value = default

    return value, text


def get_label(text, label):

    if f'[{label}]' in text:

        return True, text.replace(f'[{label}]', '')

    else:

        return False, text


def api_for_main(json):
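    # Handle one request: read paths and options from the request JSON, load the
    # VITS model and config, then run TTS or voice conversion and write the result
    # to audio_output_path.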


    # get_speaker_id() reads the module-level vits_model_speaker_id, so expose the per-request value there
    global vits_model_speaker_id

    vits_model_path = json["vits_model_path"]

    vits_config_path = json["vits_config_path"]

    vits_model_speaker_id = json["vits_model_speaker_id"]

    tts_choice = json["tts_choice"]

    read_text = json["read_text"]

    audio_output_path = json["audio_output_path"]

    escape = json["escape"]


    hubert_model_path = json["hubert_model_path"]

    audio_input_path = json["audio_input_path"]

    # tts_choice: true = tts, false = vc

    raw_text = json["raw_text"]

    emotion_reference_path = json["emotion_reference_path"]

    w2v2_dimensional_emotion_model_path = json["w2v2_dimensional_emotion_model_path"]


    model = vits_model_path

    config = vits_config_path


    hps_ms = utils.get_hparams_from_file(config)

    n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0

    n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0

    speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0']

    use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False

    emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False


    net_g_ms = SynthesizerTrn(

        n_symbols,

        hps_ms.data.filter_length // 2 + 1,

        hps_ms.train.segment_size // hps_ms.data.hop_length,

        n_speakers=n_speakers,

        emotion_embedding=emotion_embedding,

        **hps_ms.model)

    _ = net_g_ms.eval()

    utils.load_checkpoint(model, net_g_ms)


    def voice_conversion():
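        # Convert audio_input_path to the target speaker and return (audio, out_path).
        # Note: both the source and target ids come from vits_model_speaker_id here.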

        audio_path = audio_input_path  # input audio supplied in the request JSON

        audio = utils.load_audio_to_torch(

            audio_path, hps_ms.data.sampling_rate)


        original_id = get_speaker_id()

        target_id = get_speaker_id()

        out_path = audio_output_path


        y = audio.unsqueeze(0)


        spec = spectrogram_torch(y, hps_ms.data.filter_length,

                                 hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length,

                                 center=False)

        spec_lengths = LongTensor([spec.size(-1)])

        sid_src = LongTensor([original_id])


        with no_grad():

            sid_tgt = LongTensor([target_id])

            audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[

                0][0, 0].data.cpu().float().numpy()

        return audio, out_path


    if n_symbols != 0:
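        # The config defines text symbols: this is a TTS-capable VITS model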

        if not emotion_embedding:

            while True:

                choice = tts_choice

                if choice == True:

                    text = read_text

                    if text == '[ADVANCED]':

                        text = raw_text

                        print('Cleaned text is:')

                        ex_print(_clean_text(

                            text, hps_ms.data.text_cleaners), escape)

                        break  # in API mode the text never changes, so print the cleaned text once and stop


                    length_scale, text = get_label_value(

                        text, 'LENGTH', 1, 'length scale')

                    noise_scale, text = get_label_value(

                        text, 'NOISE', 0.667, 'noise scale')

                    noise_scale_w, text = get_label_value(

                        text, 'NOISEW', 0.8, 'deviation of noise')

                    cleaned, text = get_label(text, 'CLEANED')


                    stn_tst = get_text(text, hps_ms, cleaned=cleaned)


                    speaker_id = get_speaker_id()

                    out_path = audio_output_path


                    with no_grad():

                        x_tst = stn_tst.unsqueeze(0)

                        x_tst_lengths = LongTensor([stn_tst.size(0)])

                        sid = LongTensor([speaker_id])

                        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,

                                               noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()


                elif choice == False:

                    audio, out_path = voice_conversion()


                write(out_path, hps_ms.data.sampling_rate, audio)
                break  # one request produces one output file; exit the loop

        else:
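            # Emotion-embedding models: build an emotion vector from a reference
            # audio file (or a cached .npy) using the w2v2 dimensional emotion model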

            import os

            import librosa

            import numpy as np

            from torch import FloatTensor

            import audonnx

            w2v2_folder = w2v2_dimensional_emotion_model_path

            w2v2_model = audonnx.load(os.path.dirname(w2v2_folder))

            while True:

                choice = tts_choice

                if choice == True:

                    text = read_text

                    if text == '[ADVANCED]':

                        text = raw_text

                        print('Cleaned text is:')

                        ex_print(_clean_text(

                            text, hps_ms.data.text_cleaners), escape)

                        break  # in API mode the text never changes, so print the cleaned text once and stop


                    length_scale, text = get_label_value(

                        text, 'LENGTH', 1, 'length scale')

                    noise_scale, text = get_label_value(

                        text, 'NOISE', 0.667, 'noise scale')

                    noise_scale_w, text = get_label_value(

                        text, 'NOISEW', 0.8, 'deviation of noise')

                    cleaned, text = get_label(text, 'CLEANED')


                    stn_tst = get_text(text, hps_ms, cleaned=cleaned)


                    speaker_id = get_speaker_id()


                    emotion_reference = emotion_reference_path

                    if emotion_reference.endswith('.npy'):

                        emotion = np.load(emotion_reference)

                        emotion = FloatTensor(emotion).unsqueeze(0)

                    else:

                        audio16000, sampling_rate = librosa.load(

                            emotion_reference, sr=16000, mono=True)

                        emotion = w2v2_model(audio16000, sampling_rate)[

                            'hidden_states']

                        emotion_reference = re.sub(

                            r'\..*$', '', emotion_reference)

                        np.save(emotion_reference, emotion.squeeze(0))

                        emotion = FloatTensor(emotion)


                    out_path = audio_output_path


                    with no_grad():

                        x_tst = stn_tst.unsqueeze(0)

                        x_tst_lengths = LongTensor([stn_tst.size(0)])

                        sid = LongTensor([speaker_id])

                        audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,

                                               length_scale=length_scale, emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy()


                elif choice == False:

                    audio, out_path = voice_conversion()


                write(out_path, hps_ms.data.sampling_rate, audio)
                break  # one request produces one output file; exit the loop


    else:
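        # No text symbols in the config: hubert-based voice conversion model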

        model = hubert_model_path

        from hubert_model import hubert_soft

        hubert = hubert_soft(model)


        while True:

            audio_path = audio_input_path


            if audio_path != '[VC]':

                import librosa

                if use_f0:

                    audio, sampling_rate = librosa.load(

                        audio_path, sr=hps_ms.data.sampling_rate, mono=True)

                    audio16000 = librosa.resample(

                        audio, orig_sr=sampling_rate, target_sr=16000)

                else:

                    audio16000, sampling_rate = librosa.load(

                        audio_path, sr=16000, mono=True)


                target_id = get_speaker_id()

                out_path = audio_output_path

                length_scale, out_path = get_label_value(

                    out_path, 'LENGTH', 1, 'length scale')

                noise_scale, out_path = get_label_value(

                    out_path, 'NOISE', 0.1, 'noise scale')

                noise_scale_w, out_path = get_label_value(

                    out_path, 'NOISEW', 0.1, 'deviation of noise')


                from torch import inference_mode, FloatTensor

                import numpy as np

                with inference_mode():

                    units = hubert.units(FloatTensor(audio16000).unsqueeze(

                        0).unsqueeze(0)).squeeze(0).numpy()

                    if use_f0:

                        f0_scale, out_path = get_label_value(

                            out_path, 'F0', 1, 'f0 scale')

                        f0 = librosa.pyin(audio, sr=sampling_rate,

                                          fmin=librosa.note_to_hz('C0'),

                                          fmax=librosa.note_to_hz('C7'),

                                          frame_length=1780)[0]

                        target_length = len(units[:, 0])

                        f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length,

                                                     np.arange(0, len(f0)), f0)) * f0_scale

                        units[:, 0] = f0 / 10


                stn_tst = FloatTensor(units)

                with no_grad():

                    x_tst = stn_tst.unsqueeze(0)

                    x_tst_lengths = LongTensor([stn_tst.size(0)])

                    sid = LongTensor([target_id])

                    audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,

                                           noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy()


            else:

                audio, out_path = voice_conversion()


            write(out_path, hps_ms.data.sampling_rate, audio)
            break  # one request produces one output file; exit the loop


@app.route("/tts",methods=["GET","POST"])

@cross_origin()

def tts():

    json = request.json.get('data', None)

    api_for_main(json)
    return jsonify({"status": "ok"})  # Flask requires the view to return a response


@app.route("/",methods=["GET"])

@cross_origin()

def helloWorld():

  return "connect success"


if __name__ == '__main__':

    app.run(port=api_port, debug=True)  # add host="0.0.0.0" to accept requests from other machines

Replace the contents of requirements.txt with the following (simply delete everything in the file, paste the list below, and save):

numba

librosa

numpy==1.23

flask_cors

flask

jsonify

scipy

torch

unidecode

openjtalk>=0.3.0.dev2

jamo

pypinyin

jieba

protobuf

cn2an

inflect

eng_to_ipa

ko_pron

indic_transliteration

num_thai

opencc

audonnx

python-dotenv

Save and exit.
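
With the environment activated, installing the dependencies and starting the server might look like this (run from the MoeGoe project folder so the relative jieba/dict.txt path resolves):

pip install -r requirements.txt
python Api.py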


Call the REST API with any HTTP client. The request path is http://<server or local IP>:<port>/tts. The route accepts both GET and POST, but since the parameters are sent as a JSON body, POST is the practical choice.

Example request URL:

http://127.0.0.1:5000/tts



Parameters

"vits_model_path" [string]: path to the model file (on the server)

"vits_config_path" [string]: path to the TTS config file (on the server)

"vits_model_speaker_id" [int]: speaker ID

"tts_choice" [boolean]: whether to run TTS (true = TTS, false = voice conversion)

"read_text" [string]: text to convert to speech

"audio_output_path" [string]: path of the generated audio file

The remaining fields map directly to MoeGoe's own options; see the official MoeGoe documentation for their exact meaning:

"escape" [boolean]

"hubert_model_path" [string]

"audio_input_path" [string]

"raw_text" [string]

"emotion_reference_path" [string]

"w2v2_dimensional_emotion_model_path" [string]


JSON structure passed to the backend (the paths can also be absolute):

{

    "data": {

        "vits_model_path": "./model/G_1000.pth",

        "vits_config_path": "./model/config.json",

        "vits_model_speaker_id": 0,

        "tts_choice": true,

        "read_text":"测试",

        "audio_output_path":"./audios/test.wav",

        "escape":false,

        "hubert_model_path":"",

        "audio_input_path":"",

        "raw_text":"",

        "emotion_reference_path":"",

        "w2v2_dimensional_emotion_model_path":""

    }

}
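
A minimal Python sketch that sends the JSON above to the default port, using the requests package (assumed to be installed on the calling side):

import requests

payload = {
    "data": {
        "vits_model_path": "./model/G_1000.pth",
        "vits_config_path": "./model/config.json",
        "vits_model_speaker_id": 0,
        "tts_choice": True,
        "read_text": "测试",
        "audio_output_path": "./audios/test.wav",
        "escape": False,
        "hubert_model_path": "",
        "audio_input_path": "",
        "raw_text": "",
        "emotion_reference_path": "",
        "w2v2_dimensional_emotion_model_path": ""
    }
}

# POST the payload as JSON; the server reads request.json.get('data')
resp = requests.post("http://127.0.0.1:5000/tts", json=payload)
print(resp.status_code, resp.text)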


You can also test the API with an interface-debugging tool such as Apifox or Postman.


Changing the backend API server port

Edit the api_port variable in Api.py; the default is 5000:

api_port = 5000

