VitsApi – adapted from MoeGoe
MoeGoe environment setup (see the VITS local deployment guide for details)
As before, use Anaconda to create a virtual Python 3.9 environment.
If you do not mind touching your local environment, you can also install Python 3.9 directly. Python 3.10 has not been tested, so it is not guaranteed to work.
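For reference, a typical pair of commands (the environment name moegoe here is just an example):
conda create -n moegoe python=3.9
conda activate moegoe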
This is only a proof of concept; there is no support for uploading config files or other customisation.
The model and config files must already be present on the server.
Download the MoeGoe source code and extract it to any folder:
https://github.com/CjangCjengh/MoeGoe/
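For example, clone it with git (or download the ZIP from the repository page and extract it):
git clone https://github.com/CjangCjengh/MoeGoe.git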
Edit mandarin.py in the text folder of the project directory.
Change line 10 to
jieba.set_dictionary(os.path.abspath('.')+'/jieba/dict.txt')
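For reference, the line being replaced reads roughly as follows in the upstream code (the exact wording may differ):
# before: jieba.set_dictionary(os.path.dirname(sys.argv[0])+'/jieba/dict.txt')
# after:  jieba.set_dictionary(os.path.abspath('.')+'/jieba/dict.txt')
With this change jieba/dict.txt is resolved relative to the current working directory, so start the API from inside the MoeGoe project folder; otherwise the dictionary will not be found.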
Create a new Api.py file in the MoeGoe project folder and copy the following code into it.

from flask import Flask, jsonify, request
from flask_cors import CORS, cross_origin
from scipy.io.wavfile import write
from mel_processing import spectrogram_torch
from text import text_to_sequence, _clean_text
from models import SynthesizerTrn
import utils
import commons
import sys
import re
from torch import no_grad, LongTensor
import logging
app = Flask(__name__)
api_port = 5000

# Default values; each request overrides them with the fields of its JSON payload.
vits_model_path = r""
vits_config_path = r""
vits_model_speaker_id = 0
tts_choice = True  # true: tts, false: vc
read_text = ""
audio_output_path = r""
escape = False
hubert_model_path = r""
audio_input_path = r""
raw_text = ""
emotion_reference_path = r""
w2v2_dimensional_emotion_model_path = r""

cors = CORS(app)
logging.getLogger('numba').setLevel(logging.WARNING)
def ex_print(text, escape=False):
    if escape:
        print(text.encode('unicode_escape').decode())
    else:
        print(text)

def get_text(text, hps, cleaned=False):
    if cleaned:
        text_norm = text_to_sequence(text, hps.symbols, [])
    else:
        text_norm = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = LongTensor(text_norm)
    return text_norm

def ask_if_continue():
    while True:
        answer = input('Continue? (y/n): ')
        if answer == 'y':
            break
        elif answer == 'n':
            sys.exit(0)

def print_speakers(speakers, escape=False):
    if len(speakers) > 100:
        return
    print('ID\tSpeaker')
    for id, name in enumerate(speakers):
        ex_print(str(id) + '\t' + name, escape)
def get_speaker_id(speaker_id):
    # Takes the speaker id as an argument so the value from the request payload
    # is actually used (the module-level default of 0 would otherwise always win).
    try:
        speaker_id = int(speaker_id)
    except:
        print(str(speaker_id) + ' is not a valid ID!')
        sys.exit(1)
    return speaker_id
def get_label_value(text, label, default, warning_name='value'):
    value = re.search(rf'\[{label}=(.+?)\]', text)
    if value:
        try:
            text = re.sub(rf'\[{label}=(.+?)\]', '', text, 1)
            value = float(value.group(1))
        except:
            print(f'Invalid {warning_name}!')
            sys.exit(1)
    else:
        value = default
    return value, text

def get_label(text, label):
    if f'[{label}]' in text:
        return True, text.replace(f'[{label}]', '')
    else:
        return False, text
def api_for_main(json):
    vits_model_path = json["vits_model_path"]
    vits_config_path = json["vits_config_path"]
    vits_model_speaker_id = json["vits_model_speaker_id"]
    tts_choice = json["tts_choice"]  # true: tts, false: vc
    read_text = json["read_text"]
    audio_output_path = json["audio_output_path"]
    escape = json["escape"]
    hubert_model_path = json["hubert_model_path"]
    audio_input_path = json["audio_input_path"]
    raw_text = json["raw_text"]
    emotion_reference_path = json["emotion_reference_path"]
    w2v2_dimensional_emotion_model_path = json["w2v2_dimensional_emotion_model_path"]

    model = vits_model_path
    config = vits_config_path
    hps_ms = utils.get_hparams_from_file(config)
    n_speakers = hps_ms.data.n_speakers if 'n_speakers' in hps_ms.data.keys() else 0
    n_symbols = len(hps_ms.symbols) if 'symbols' in hps_ms.keys() else 0
    speakers = hps_ms.speakers if 'speakers' in hps_ms.keys() else ['0']
    use_f0 = hps_ms.data.use_f0 if 'use_f0' in hps_ms.data.keys() else False
    emotion_embedding = hps_ms.data.emotion_embedding if 'emotion_embedding' in hps_ms.data.keys() else False

    net_g_ms = SynthesizerTrn(
        n_symbols,
        hps_ms.data.filter_length // 2 + 1,
        hps_ms.train.segment_size // hps_ms.data.hop_length,
        n_speakers=n_speakers,
        emotion_embedding=emotion_embedding,
        **hps_ms.model)
    _ = net_g_ms.eval()
    utils.load_checkpoint(model, net_g_ms)
    def voice_conversion():
        audio_path = audio_input_path  # fixed: `audio_path = audio_path` would raise UnboundLocalError
        audio = utils.load_audio_to_torch(
            audio_path, hps_ms.data.sampling_rate)
        # The API only exposes one speaker id, so source and target use the same value.
        original_id = get_speaker_id(vits_model_speaker_id)
        target_id = get_speaker_id(vits_model_speaker_id)
        out_path = audio_output_path

        y = audio.unsqueeze(0)
        spec = spectrogram_torch(y, hps_ms.data.filter_length,
                                 hps_ms.data.sampling_rate, hps_ms.data.hop_length, hps_ms.data.win_length,
                                 center=False)
        spec_lengths = LongTensor([spec.size(-1)])
        sid_src = LongTensor([original_id])

        with no_grad():
            sid_tgt = LongTensor([target_id])
            audio = net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[
                0][0, 0].data.cpu().float().numpy()
        return audio, out_path
    if n_symbols != 0:
        if not emotion_embedding:
            # MoeGoe's interactive loop is removed: each API request is handled once.
            if tts_choice:
                text = read_text
                if text == '[ADVANCED]':
                    text = raw_text
                    print('Cleaned text is:')
                    ex_print(_clean_text(
                        text, hps_ms.data.text_cleaners), escape)
                    return
                length_scale, text = get_label_value(
                    text, 'LENGTH', 1, 'length scale')
                noise_scale, text = get_label_value(
                    text, 'NOISE', 0.667, 'noise scale')
                noise_scale_w, text = get_label_value(
                    text, 'NOISEW', 0.8, 'deviation of noise')
                cleaned, text = get_label(text, 'CLEANED')
                stn_tst = get_text(text, hps_ms, cleaned=cleaned)
                speaker_id = get_speaker_id(vits_model_speaker_id)
                out_path = audio_output_path
                with no_grad():
                    x_tst = stn_tst.unsqueeze(0)
                    x_tst_lengths = LongTensor([stn_tst.size(0)])
                    sid = LongTensor([speaker_id])
                    audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
                                           noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
            else:
                audio, out_path = voice_conversion()
            write(out_path, hps_ms.data.sampling_rate, audio)
        else:
            import os
            import librosa
            import numpy as np
            from torch import FloatTensor
            import audonnx

            w2v2_folder = w2v2_dimensional_emotion_model_path
            w2v2_model = audonnx.load(os.path.dirname(w2v2_folder))

            if tts_choice:
                text = read_text
                if text == '[ADVANCED]':
                    text = raw_text
                    print('Cleaned text is:')
                    ex_print(_clean_text(
                        text, hps_ms.data.text_cleaners), escape)
                    return
                length_scale, text = get_label_value(
                    text, 'LENGTH', 1, 'length scale')
                noise_scale, text = get_label_value(
                    text, 'NOISE', 0.667, 'noise scale')
                noise_scale_w, text = get_label_value(
                    text, 'NOISEW', 0.8, 'deviation of noise')
                cleaned, text = get_label(text, 'CLEANED')
                stn_tst = get_text(text, hps_ms, cleaned=cleaned)
                speaker_id = get_speaker_id(vits_model_speaker_id)

                emotion_reference = emotion_reference_path
                if emotion_reference.endswith('.npy'):
                    emotion = np.load(emotion_reference)
                    emotion = FloatTensor(emotion).unsqueeze(0)
                else:
                    audio16000, sampling_rate = librosa.load(
                        emotion_reference, sr=16000, mono=True)
                    emotion = w2v2_model(audio16000, sampling_rate)[
                        'hidden_states']
                    emotion_reference = re.sub(
                        r'\..*$', '', emotion_reference)
                    np.save(emotion_reference, emotion.squeeze(0))
                    emotion = FloatTensor(emotion)

                out_path = audio_output_path
                with no_grad():
                    x_tst = stn_tst.unsqueeze(0)
                    x_tst_lengths = LongTensor([stn_tst.size(0)])
                    sid = LongTensor([speaker_id])
                    audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
                                           length_scale=length_scale, emotion_embedding=emotion)[0][0, 0].data.cpu().float().numpy()
            else:
                audio, out_path = voice_conversion()
            write(out_path, hps_ms.data.sampling_rate, audio)
    else:
        model = hubert_model_path
        from hubert_model import hubert_soft
        hubert = hubert_soft(model)

        audio_path = audio_input_path
        if audio_path != '[VC]':
            import librosa
            if use_f0:
                audio, sampling_rate = librosa.load(
                    audio_path, sr=hps_ms.data.sampling_rate, mono=True)
                audio16000 = librosa.resample(
                    audio, orig_sr=sampling_rate, target_sr=16000)
            else:
                audio16000, sampling_rate = librosa.load(
                    audio_path, sr=16000, mono=True)

            target_id = get_speaker_id(vits_model_speaker_id)
            out_path = audio_output_path
            length_scale, out_path = get_label_value(
                out_path, 'LENGTH', 1, 'length scale')
            noise_scale, out_path = get_label_value(
                out_path, 'NOISE', 0.1, 'noise scale')
            noise_scale_w, out_path = get_label_value(
                out_path, 'NOISEW', 0.1, 'deviation of noise')

            from torch import inference_mode, FloatTensor
            import numpy as np
            with inference_mode():
                units = hubert.units(FloatTensor(audio16000).unsqueeze(
                    0).unsqueeze(0)).squeeze(0).numpy()
                if use_f0:
                    f0_scale, out_path = get_label_value(
                        out_path, 'F0', 1, 'f0 scale')
                    f0 = librosa.pyin(audio, sr=sampling_rate,
                                      fmin=librosa.note_to_hz('C0'),
                                      fmax=librosa.note_to_hz('C7'),
                                      frame_length=1780)[0]
                    target_length = len(units[:, 0])
                    f0 = np.nan_to_num(np.interp(np.arange(0, len(f0)*target_length, len(f0))/target_length,
                                                 np.arange(0, len(f0)), f0)) * f0_scale
                    units[:, 0] = f0 / 10

            stn_tst = FloatTensor(units)
            with no_grad():
                x_tst = stn_tst.unsqueeze(0)
                x_tst_lengths = LongTensor([stn_tst.size(0)])
                sid = LongTensor([target_id])
                audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
                                       noise_scale_w=noise_scale_w, length_scale=length_scale)[0][0, 0].data.float().numpy()
        else:
            audio, out_path = voice_conversion()
        write(out_path, hps_ms.data.sampling_rate, audio)
@app.route("/tts",methods=["GET","POST"])
@cross_origin()
def tts():
json = request.json.get('data', None)
api_for_main(json)
@app.route("/",methods=["GET"])
@cross_origin()
def helloWorld():
return "connect success"
if __name__ == '__main__':
app.run(port=api_port, debug=True)
Replace the contents of requirements.txt with the following (delete everything in the file, paste this in, and save):
numba
librosa
numpy==1.23
flask_cors
flask
jsonify
scipy
torch
unidecode
openjtalk>=0.3.0.dev2
jamo
pypinyin
jieba
protobuf
cn2an
inflect
eng_to_ipa
ko_pron
indic_transliteration
num_thai
opencc
audonnx
python-dotenv
Save and exit.
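Then install the dependencies into the activated environment and start the API from the MoeGoe project folder, for example:
pip install -r requirements.txt
python Api.py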
Call the REST API at http://<server or local IP>:<port>/tts with any HTTP client. The route accepts GET and POST, but the parameters are passed as a JSON body, so POST is the natural choice.
Example request URL:
http://127.0.0.1:5000/tts
Parameters:
"vits_model_path": [string] path to the model file (on the server),
"vits_config_path": [string] path to the TTS config file (on the server),
"vits_model_speaker_id": [int] speaker ID,
"tts_choice": [boolean] whether to run TTS (true: TTS, false: voice conversion),
"read_text": [string] text to convert to speech,
"audio_output_path": [string] path of the output file,
The remaining parameters mirror MoeGoe's own options; see the official documentation for their exact meaning:
"escape": [boolean],
"hubert_model_path": [string],
"audio_input_path": [string],
"raw_text": [string],
"emotion_reference_path": [string],
"w2v2_dimensional_emotion_model_path": [string]
JSON structure sent to the backend (the paths may also be absolute):
{
"data": {
"vits_model_path": "./model/G_1000.pth",
"vits_config_path": "./model/config.json",
"vits_model_speaker_id": 0,
"tts_choice": true,
"read_text":"测试",
"audio_output_path":"./audios/test.wav",
"escape":false,
"hubert_model_path":"",
"audio_input_path":"",
"raw_text":"",
"emotion_reference_path":"",
"w2v2_dimensional_emotion_model_path":""
}
}
For testing you can use an API client such as Apifox or Postman.
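The same request can also be sent from a short Python script. A minimal sketch (it assumes the requests package is installed, e.g. via pip install requests, and that the paths in the payload actually exist on the server):

import requests

payload = {
    "data": {
        "vits_model_path": "./model/G_1000.pth",
        "vits_config_path": "./model/config.json",
        "vits_model_speaker_id": 0,
        "tts_choice": True,
        "read_text": "测试",
        "audio_output_path": "./audios/test.wav",
        "escape": False,
        "hubert_model_path": "",
        "audio_input_path": "",
        "raw_text": "",
        "emotion_reference_path": "",
        "w2v2_dimensional_emotion_model_path": ""
    }
}

# /tts reads request.json["data"], so the payload is sent as a JSON body.
resp = requests.post("http://127.0.0.1:5000/tts", json=payload)
print(resp.status_code, resp.text)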
Changing the backend API port
Edit the api_port variable in Api.py; the default is 5000.
api_port = 5000
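Note that app.run(port=api_port, debug=True) listens on 127.0.0.1 only. If the API has to be reachable from other machines, one possible adjustment (debug mode should be turned off outside of local testing) is:
app.run(host="0.0.0.0", port=api_port, debug=False)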