Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# XTTS-RVC-UI

This is a Fork of XTTS-RVC-UI that adds realtime typing, updates voice playback to happen right away if any changes are made in the interface, and adds additional temperature and repetition penality sliders to adjust your voice. Made it autoplay only the RVC output.

Note: You can also separately adjust the xtts model's config.json top_k and top_p settings for further tweaking before starting the start.bat. Here is what I am using for that (experimental):

"top_k": 70,
"top_p": 0.95,

Note2: When you finish typing if it didn't read the entire thing, you can simply press . or spacebar or backspace and it will read the entire sentence during a refresh (usually within 1 or 2 seconds). I have found the best results by using Dragon Naturally Speaking and my microphone. Having it type in the box for me and using a "custom dragon command" word "erase" to erase the box. My dragon step-by-step command is like this, Steps: Control + A, Backspace" when myCommand "erase" is spoken.

Original Repo Info:

This is a simple UI that utilize's [Coqui's XTTSv2](https://github.com/coqui-ai/TTS) paired with RVC functionality to improve output quality.

# Prerequisites
Expand Down
254 changes: 141 additions & 113 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,47 +2,46 @@
from TTS.api import TTS
import gradio as gr
from rvc import Config, load_hubert, get_vc, rvc_infer
import gc , os, sys, argparse, requests
import gc, os, sys, argparse, requests
from pathlib import Path

parser = argparse.ArgumentParser(
prog='XTTS-RVC-UI',
description='Gradio UI for XTTSv2 and RVC'
prog='XTTS-RVC-UI',
description='Gradio UI for XTTSv2 and RVC'
)

parser.add_argument('-s', '--silent', action=argparse.BooleanOptionalAction, default=False)
args = parser.parse_args()

if args.silent:
print('Enabling silent mode.')
sys.stdout = open(os.devnull, 'w')
if args.silent:
print('Enabling silent mode.')
sys.stdout = open(os.devnull, 'w')

def download_models():
rvc_files = ['hubert_base.pt', 'rmvpe.pt']
rvc_files = ['hubert_base.pt', 'rmvpe.pt']

for file in rvc_files:
if(not os.path.isfile(f'./models/{file}')):
print(f'Downloading{file}')
r = requests.get(f'https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/{file}')
with open(f'./models/{file}', 'wb') as f:
f.write(r.content)
for file in rvc_files:
if not os.path.isfile(f'./models/{file}'):
print(f'Downloading {file}')
r = requests.get(f'https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/{file}')
with open(f'./models/{file}', 'wb') as f:
f.write(r.content)

xtts_files = ['vocab.json', 'config.json', 'dvae.path', 'mel_stats.pth', 'model.pth']
xtts_files = ['vocab.json', 'config.json', 'dvae.path', 'mel_stats.pth', 'model.pth']

for file in xtts_files:
if(not os.path.isfile(f'./models/xtts/{file}')):
print(f'Downloading {file}')
r = requests.get(f'https://huggingface.co/coqui/XTTS-v2/resolve/v2.0.2/{file}')
with open(f'./models/xtts/{file}', 'wb') as f:
f.write(r.content)

for file in xtts_files:
if not os.path.isfile(f'./models/xtts/{file}'):
print(f'Downloading {file}')
r = requests.get(f'https://huggingface.co/coqui/XTTS-v2/resolve/v2.0.2/{file}')
with open(f'./models/xtts/{file}', 'wb') as f:
f.write(r.content)

[Path(_dir).mkdir(parents=True, exist_ok=True) for _dir in ['./models/xtts', './voices', './rvcs']]

download_models()

device = "cuda:0" if torch.cuda.is_available() else "cpu"
print("Device: " + device)
print("Device: " + device)

config = Config(device, device != 'cpu')
hubert_model = load_hubert(device, config.is_half, "./models/hubert_base.pt")
Expand All @@ -52,103 +51,132 @@ def download_models():
langs = ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"]

def get_rvc_voices():
global voices
voices = os.listdir("./voices")
global rvcs
rvcs = list(filter(lambda x:x.endswith(".pth"), os.listdir("./rvcs")))
return [rvcs, voices]

def runtts(rvc, voice, text, pitch_change, index_rate, language):
audio = tts.tts_to_file(text=text, speaker_wav="./voices/" + voice, language=language, file_path="./output.wav")
voice_change(rvc, pitch_change, index_rate)
return ["./output.wav" , "./outputrvc.wav"]
global voices
voices = os.listdir("./voices")
global rvcs
rvcs = list(filter(lambda x: x.endswith(".pth"), os.listdir("./rvcs")))
return [rvcs, voices]

def runtts(rvc, voice, text, pitch_change, index_rate, temperature, repetition_penalty, language):
try:
if not text.strip():
raise ValueError("Text input is required for synthesis.")

# Ensure the TTS function uses the temperature and repetition penalty parameters
audio = tts.tts_to_file(
text=text,
speaker_wav="./voices/" + voice,
language=language,
file_path="./output.wav",
temperature=temperature, # Add temperature here
repetition_penalty=repetition_penalty # Add repetition penalty here
)

voice_change(rvc, pitch_change, index_rate)
return ["./output.wav", "./outputrvc.wav"]
except Exception as e:
print(f"Error in runtts: {e}")
return [None, None]

def main():
get_rvc_voices()
print(rvcs)
print(voices)
with gr.Blocks(title='TTS RVC UI') as interface:
with gr.Row():
gr.Markdown("""
#XTTS RVC UI
""")
with gr.Row():
with gr.Column():
lang_dropdown = gr.Dropdown(choices=langs, value=langs[0], label='Language')
rvc_dropdown = gr.Dropdown(choices=rvcs, value=rvcs[0] if len(rvcs) > 0 else '', label='RVC model')
voice_dropdown = gr.Dropdown(choices=voices, value=voices[0] if len(voices) > 0 else '', label='Voice sample')
refresh_button = gr.Button(value='Refresh')
text_input = gr.Textbox(placeholder="Write here...")
submit_button = gr.Button(value='Submit')
with gr.Row():
pitch_slider = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch")
index_rate_slider = gr.Slider(minimum=0, maximum=1, value=0.75, step=0.05, label="Index Rate")
with gr.Column():
audio_output = gr.Audio(label="TTS result", type="filepath", interactive=False)
rvc_audio_output = gr.Audio(label="RVC result", type="filepath", interactive=False)

submit_button.click(inputs=[rvc_dropdown, voice_dropdown, text_input, pitch_slider, index_rate_slider, lang_dropdown], outputs=[audio_output, rvc_audio_output], fn=runtts)
def refresh_dropdowns():
get_rvc_voices()
print('Refreshed voice and RVC list!')
return [gr.update(choices=rvcs, value=rvcs[0] if len(rvcs) > 0 else ''), gr.update(choices=voices, value=voices[0] if len(voices) > 0 else '')]

refresh_button.click(fn=refresh_dropdowns, outputs=[rvc_dropdown, voice_dropdown])

interface.launch(server_name="0.0.0.0", server_port=5000, quiet=True)

# delete later
get_rvc_voices()
print(rvcs)
print(voices)
interface = gr.Interface(
fn=runtts,
inputs=[
gr.Dropdown(choices=rvcs, value=rvcs[0] if len(rvcs) > 0 else '', label='RVC model'),
gr.Dropdown(choices=voices, value=voices[0] if len(voices) > 0 else '', label='Voice sample'),
gr.Textbox(placeholder="Write here...", label='Text', elem_id="text_input"),
gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch"),
gr.Slider(minimum=0, maximum=1, value=0.75, step=0.05, label="Index Rate"),
gr.Slider(minimum=0.0, maximum=2.0, value=1.0, step=0.001, label="Temperature"),
gr.Slider(minimum=0.0, maximum=2.0, value=1.0, step=0.001, label="Repetition Penalty"),
gr.Dropdown(choices=langs, value=langs[0], label='Language')
],
outputs=[
gr.Audio(label="TTS result", type="filepath", interactive=False),
gr.Audio(label="RVC result", type="filepath", interactive=False, autoplay=True)
],
live=True,
title="XTTS RVC UI",
description="XTTS and RVC integration"
)

js_code = """
<script>
let timeout = null;
const inputElem = document.querySelector("#text_input input");
inputElem.addEventListener('input', function() {
clearTimeout(timeout);
timeout = setTimeout(() => {
document.querySelector('button:contains("Submit")').click();
}, 2000);
});
</script>
"""
interface.launch(server_name="127.0.0.1", server_port=5000, quiet=True, share=False)

class RVC_Data:
def __init__(self):
self.current_model = {}
self.cpt = {}
self.version = {}
self.net_g = {}
self.tgt_sr = {}
self.vc = {}

def load_cpt(self, modelname, rvc_model_path):
if self.current_model != modelname:
print("Loading new model")
del self.cpt, self.version, self.net_g, self.tgt_sr, self.vc
self.cpt, self.version, self.net_g, self.tgt_sr, self.vc = get_vc(device, config.is_half, config, rvc_model_path)
self.current_model = modelname
def __init__(self):
self.current_model = {}
self.cpt = {}
self.version = {}
self.net_g = {}
self.tgt_sr = {}
self.vc = {}

def load_cpt(self, modelname, rvc_model_path):
try:
if self.current_model != modelname:
print("Loading new model")
del self.cpt, self.version, self.net_g, self.tgt_sr, self.vc
self.cpt, self.version, self.net_g, self.tgt_sr, self.vc = get_vc(device, config.is_half, config, rvc_model_path)
self.current_model = modelname
except Exception as e:
print(f"Error in load_cpt: {e}")
input("Press Enter to continue...")

rvc_data = RVC_Data()

def voice_change(rvc, pitch_change, index_rate):
modelname = os.path.splitext(rvc)[0]
print("Using RVC model: "+ modelname)
rvc_model_path = "./rvcs/" + rvc
rvc_index_path = "./rvcs/" + modelname + ".index" if os.path.isfile("./rvcs/" + modelname + ".index") and index_rate != 0 else ""

if rvc_index_path != "" :
print("Index file found!")

#load_cpt(modelname, rvc_model_path)
#cpt, version, net_g, tgt_sr, vc = get_vc(device, config.is_half, config, rvc_model_path)
rvc_data.load_cpt(modelname, rvc_model_path)

rvc_infer(
index_path=rvc_index_path,
index_rate=index_rate,
input_path="./output.wav",
output_path="./outputrvc.wav",
pitch_change=pitch_change,
f0_method="rmvpe",
cpt=rvc_data.cpt,
version=rvc_data.version,
net_g=rvc_data.net_g,
filter_radius=3,
tgt_sr=rvc_data.tgt_sr,
rms_mix_rate=0.25,
protect=0,
crepe_hop_length=0,
vc=rvc_data.vc,
hubert_model=hubert_model
)
gc.collect()

try:
modelname = os.path.splitext(rvc)[0]
print("Using RVC model: " + modelname)
rvc_model_path = "./rvcs/" + rvc
rvc_index_path = "./rvcs/" + modelname + ".index" if os.path.isfile("./rvcs/" + modelname + ".index") and index_rate != 0 else ""

if rvc_index_path != "":
print("Index file found!")

rvc_data.load_cpt(modelname, rvc_model_path)

rvc_infer(
index_path=rvc_index_path,
index_rate=index_rate,
input_path="./output.wav",
output_path="./outputrvc.wav",
pitch_change=pitch_change,
f0_method="rmvpe",
cpt=rvc_data.cpt,
version=rvc_data.version,
net_g=rvc_data.net_g,
filter_radius=3,
tgt_sr=rvc_data.tgt_sr,
rms_mix_rate=0.25,
protect=0,
crepe_hop_length=0,
vc=rvc_data.vc,
hubert_model=hubert_model
)
gc.collect()
except Exception as e:
print(f"Error in voice_change: {e}")
input("Press Enter to continue...")

if __name__ == "__main__":
main()
try:
main()
except Exception as e:
print(f"An error occurred: {e}")
input("Press Enter to exit...")