Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions rvc/lib/uvr5_pack/ffio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# -*- coding: utf-8 -*-
import numpy as np, librosa as rosa, ffmpeg
##
def wavread(
        filepath:str, fs:int=0, ch:int=0,
        dtype=None, read_async=False, res_type:str="soxr_hq",
        **kwargs) -> tuple[np.ndarray, int]:
    """
    Decode an audio file with `ffmpeg` and return the samples together
    with the sample rate.

    The file is decoded to native-endian 32-bit float PCM by `ffmpeg`
    (which also performs any channel up/down-mix), and is resampled
    afterwards with `librosa.resample` when `fs` differs from the
    sample rate reported by `ffmpeg.probe`.

    Args:
        filepath (str): Path to the audio file.
        fs (int, optional): Desired sample rate. If 0,
            the original sample rate is used. Defaults to 0.
        ch (int, optional): Desired number of channels. If 0,
            the original number of channels is used. Defaults to 0.
        dtype (data-type, optional): Desired data type for the output
            array. Signed-integer types are produced by clipping to
            [-1, 1] and scaling by `iinfo(dtype).max - 1`; floating-point
            types are produced by a plain cast. Any other kind is left
            as 32-bit float. If None, the data is returned as
            32-bit float. Defaults to None.
        read_async (bool, optional): If True, `ffmpeg` is started with
            `run_async` and its stdout pipe is drained; otherwise the
            blocking `run` call is used. Defaults to False.
        res_type (str, optional): Resampling type
            defined in `librosa`, defaulting to "soxr_hq".
        **kwargs: Accepted for call compatibility; currently unused.

    Returns:
        tuple[np.ndarray, int]: The audio data shaped as
        (channels, samples) and the resulting sample rate.

    Raises:
        ValueError: If the probed file contains no audio stream.
        ffmpeg.Error: If `ffmpeg` fails to probe or decode the file.
    """
    ## Probing the file with FFprobe to get the audio stream information
    d_probe = ffmpeg.probe(filepath)
    st_audio = next(
        (s for s in d_probe["streams"] if s["codec_type"] == "audio"),
        None)
    ## A bare `StopIteration` would be an obscure error for the caller
    if st_audio is None:
        raise ValueError(f"No audio stream found in {filepath!r}")
    ch_origin = int(st_audio["channels"])
    fs_origin = int(st_audio["sample_rate"])
    ## Using the original sample rate and channels if not specified
    ch = ch or ch_origin
    fs = fs or fs_origin
    ## Determining the float32 format based on system endianness
    if np.little_endian:
        fp32, ffmpeg_format = "<f4", "f32le"
    else:
        fp32, ffmpeg_format = ">f4", "f32be"
    ## Setting keyword-based (non-positional) args of `ffmpeg.output`;
    ## "ar" is deliberately omitted: resampling is done by `librosa` below
    kwgs_output = {
        "format": ffmpeg_format,
        "acodec": f"pcm_{ffmpeg_format}",
        "ac": ch,
        "loglevel": "error"}
    stream = ffmpeg.input(filepath).output("pipe:", **kwgs_output)
    if read_async:
        ## Reading the audio through an asynchronously started process
        proc = stream.run_async(pipe_stdout=True)
        pcm_raw, _ = proc.communicate()
        ## Mirroring `run()`, which raises on a non-zero exit status,
        ## instead of silently returning truncated or empty audio
        if proc.returncode:
            raise ffmpeg.Error("ffmpeg", pcm_raw, None)
    else:
        ## Reading the audio synchronously
        pcm_raw, _ = stream.run(capture_stdout=True, capture_stderr=True)
    ## Converting the raw interleaved PCM into a (channels, samples) array
    x_raw = np.frombuffer(pcm_raw, dtype=fp32).reshape(-1, ch).T
    ## Resampling using `librosa`, if necessary
    if fs != fs_origin:
        x_res = rosa.resample(
            x_raw, orig_sr=fs_origin, target_sr=fs,
            res_type=res_type, axis=-1)
    else:
        x_res = x_raw
    ## Converting to the target data type if specified.
    ## `is not None` matters: `np.dtype` instances are falsy (their
    ## `len()` is 0), so a plain truthiness test would ignore them.
    if dtype is not None:
        dt = np.dtype(dtype)
        if dt.kind == "i":
            x_res = np.clip(x_res, -1., +1.)
            x_res = (x_res * (np.iinfo(dt).max - 1)).astype(dt)
        elif dt.kind == "f":
            x_res = x_res.astype(dt, copy=False)
    ## `np.frombuffer` over `bytes` yields a read-only view; hand the
    ## caller an array that can be modified in place
    if not x_res.flags.writeable:
        x_res = x_res.copy()
    ## Returning the data shaped as (channels, samples)
    return x_res, fs
##
def wavread_rosa(filepath:str,
        fs:int=22050, mono:bool=True,
        dtype=np.float32, res_type:str="soxr_hq",
        **kwargs) -> tuple[np.ndarray, int]:
    """
    Load an audio file through `wavread` with a `librosa.load`-like
    call signature.

    Args:
        filepath (str): Path to the audio file.
        fs (int, optional): Desired sample rate. Defaults to 22050.
        mono (bool, optional): If True, one channel is requested from
            `wavread`; otherwise two channels are requested.
            Defaults to True.
        dtype (data-type, optional): Desired data type for the output
            array, forwarded to `wavread`. Defaults to `np.float32`.
        res_type (str, optional): Resampling type
            defined in `librosa`, defaulting to "soxr_hq".
        **kwargs: Additional arguments forwarded to `wavread`.

    Returns:
        tuple[np.ndarray, int]: The audio data and the sample rate.
        When the decoded array has fewer than two rows along its first
        axis, that axis is squeezed away.
    """
    n_channels = 1 if mono else 2
    samples, rate = wavread(
        filepath,
        fs=fs,
        ch=n_channels,
        dtype=dtype,
        res_type=res_type,
        **kwargs)
    ## Two or more channel rows: hand the array back untouched
    if samples.shape[0] >= 2:
        return samples, rate
    ## Otherwise drop the leading channel axis
    return samples.squeeze(0), rate
##
30 changes: 22 additions & 8 deletions rvc/modules/uvr5/vr.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from rvc.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters
from rvc.lib.uvr5_pack.lib_v5.nets_new import CascadedNet
from rvc.lib.uvr5_pack.utils import inference
from rvc.lib.uvr5_pack.ffio import wavread_rosa as ffread_rosa

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -49,21 +50,34 @@ def __init__(self, model_path, agg, tta=False):
def process(
self,
music_file,
):
## Param for trying to read audio using `ffmpeg`,
## but still resampling using `librosa.resample`,
## implemented in the file ".../uvr5_pack/ffio.py"
load_using_ffmpeg:bool = False,
**kwargs):
x_wave, y_wave, x_spec_s, y_spec_s = {}, {}, {}, {}
bands_n = len(self.mp.param["band"])

for d in range(bands_n, 0, -1):
bp = self.mp.param["band"][d]
if d == bands_n: # high-end band
# librosa loading may be buggy for some audio. ffmpeg will solve this, but it's a pain
x_wave[d] = librosa.core.load(
music_file,
sr=bp["sr"],
mono=False,
dtype=np.float32,
res_type=bp["res_type"],
)[0]
if load_using_ffmpeg: # [TODO] Serious Unit Tests may be Required
x_wave[d] = ffread_rosa(
music_file,
fs = bp["sr"],
mono = False,
dtype = np.float32,
res_type = bp["res_type"],
)[0]
else:
x_wave[d] = librosa.core.load(
music_file,
sr=bp["sr"],
mono=False,
dtype=np.float32,
res_type=bp["res_type"],
)[0]
if x_wave[d].ndim == 1:
x_wave[d] = np.asfortranarray([x_wave[d], x_wave[d]])
else: # lower bands
Expand Down