40 changes: 40 additions & 0 deletions README.md
@@ -278,6 +278,46 @@ For programmatic use, you can output the model list in JSON format:
audio-separator -l --list_format=json
```

### Processing Large Files

For very long audio files (>1 hour), you may encounter out-of-memory errors. The `--chunk_duration` option automatically splits large files into smaller chunks, processes them separately, and merges the results:

```sh
# Process an 8-hour podcast in 10-minute chunks
audio-separator long_podcast.wav --chunk_duration 600

# Adjust chunk size based on available memory
audio-separator very_long_audio.wav --chunk_duration 300 # 5-minute chunks
```
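
The same option is available from the Python API. A minimal sketch, assuming the library's usual Python entry points (`Separator`, `load_model`, `separate`); only the `chunk_duration` parameter is new in this PR:

```python
from audio_separator.separator import Separator

# Keep peak memory bounded by processing the file in 10-minute chunks.
separator = Separator(output_dir="separated", chunk_duration=600)
separator.load_model()
output_files = separator.separate("long_podcast.wav")
print(output_files)
```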

#### How It Works

1. **Split**: The input file is split into fixed-duration chunks (e.g., 10 minutes)
2. **Process**: Each chunk is processed separately, reducing peak memory usage
3. **Merge**: The results are merged back together with simple concatenation
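
These steps are implemented by the `AudioChunker` class added in this PR (see `audio_separator/separator/audio_chunking.py` below). A rough sketch of the flow, with `run_model_on` standing in as a hypothetical placeholder for whatever runs the separation model on a single chunk:

```python
from audio_separator.separator.audio_chunking import AudioChunker

chunker = AudioChunker(chunk_duration_seconds=600)

# 1. Split: write fixed-duration chunk files into a working directory.
chunk_paths = chunker.split_audio("long_audio.wav", "/tmp/chunks")

# 2. Process: run each chunk through the model separately (run_model_on is hypothetical).
processed = [run_model_on(path) for path in chunk_paths]

# 3. Merge: concatenate the processed chunks back into a single file.
chunker.merge_chunks(processed, "long_audio_(Vocals).wav")
```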

The chunking feature supports all model types:
- **2-stem models** (e.g., MDX): Vocals + Instrumental
- **4-stem models** (e.g., Demucs): Drums, Bass, Other, Vocals
- **6-stem models** (e.g., Demucs 6s): Bass, Drums, Other, Vocals, Guitar, Piano

#### Benefits

- **Prevents OOM errors**: Process files of any length without running out of memory
- **Predictable memory usage**: Memory usage stays bounded regardless of file length
- **No per-chunk quality loss**: Each chunk is fully processed with the selected model (see the note on chunk boundaries below)
- **Multi-stem support**: Works seamlessly with 2, 4, and 6-stem models

#### Recommendations

- **Files > 1 hour**: Use `--chunk_duration 600` (10 minutes)
- **Limited memory systems**: Use smaller chunks (300-600 seconds)
- **Ample memory**: You may not need chunking at all

#### Note on Audio Quality

Chunks are concatenated without crossfading, which can occasionally produce minor artifacts at chunk boundaries; for most material these are not noticeable. The simple concatenation approach keeps processing time minimal while still solving the out-of-memory problem.
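
If boundary artifacts ever matter for your material, a short crossfade is one way to smooth them out. This is *not* what `--chunk_duration` does (it uses plain concatenation); the sketch below only illustrates the alternative using pydub's `append(..., crossfade=...)`:

```python
from pydub import AudioSegment

first = AudioSegment.from_file("chunk_0000_(Vocals).wav")
second = AudioSegment.from_file("chunk_0001_(Vocals).wav")

# Plain concatenation, as used by --chunk_duration: fast, no overlap.
merged = first + second

# Alternative: blend 50 ms across the boundary to mask any discontinuity.
smoothed = first.append(second, crossfade=50)
smoothed.export("merged_vocals.wav", format="wav")
```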

### Full command-line interface options

```sh
141 changes: 141 additions & 0 deletions audio_separator/separator/audio_chunking.py
@@ -0,0 +1,141 @@
"""Audio chunking utilities for processing large audio files to prevent OOM errors."""

import os
import logging
from typing import List, Optional
from pydub import AudioSegment


class AudioChunker:
"""
Handles splitting and merging of large audio files.

This class provides utilities to:
- Split large audio files into fixed-duration chunks
- Merge processed chunks back together with simple concatenation
- Determine if a file should be chunked based on its duration

Example:
>>> chunker = AudioChunker(chunk_duration_seconds=600) # 10-minute chunks
>>> chunk_paths = chunker.split_audio("long_audio.wav", "/tmp/chunks")
>>> # Process each chunk...
>>> output_path = chunker.merge_chunks(processed_chunks, "output.wav")
"""

def __init__(self, chunk_duration_seconds: float, logger: Optional[logging.Logger] = None):
"""
Initialize the AudioChunker.

Args:
chunk_duration_seconds: Duration of each chunk in seconds
logger: Optional logger instance for logging operations
"""
self.chunk_duration_ms = int(chunk_duration_seconds * 1000)
self.logger = logger or logging.getLogger(__name__)

def split_audio(self, input_path: str, output_dir: str) -> List[str]:
"""
Split audio file into fixed-size chunks.

Args:
input_path: Path to the input audio file
output_dir: Directory where chunk files will be saved

Returns:
List of paths to the created chunk files

Raises:
FileNotFoundError: If input file doesn't exist
IOError: If there's an error reading or writing audio files
"""
if not os.path.exists(input_path):
raise FileNotFoundError(f"Input file not found: {input_path}")

if not os.path.exists(output_dir):
os.makedirs(output_dir)

self.logger.debug(f"Loading audio file: {input_path}")
audio = AudioSegment.from_file(input_path)

total_duration_ms = len(audio)
chunk_paths = []

# Calculate number of chunks
num_chunks = (total_duration_ms + self.chunk_duration_ms - 1) // self.chunk_duration_ms
self.logger.info(f"Splitting {total_duration_ms / 1000:.1f}s audio into {num_chunks} chunks of {self.chunk_duration_ms / 1000:.1f}s each")

# Get file extension from input
_, ext = os.path.splitext(input_path)
if not ext:
ext = ".wav" # Default to WAV if no extension

# Split into chunks
for i in range(num_chunks):
start_ms = i * self.chunk_duration_ms
end_ms = min(start_ms + self.chunk_duration_ms, total_duration_ms)

chunk = audio[start_ms:end_ms]
chunk_filename = f"chunk_{i:04d}{ext}"
chunk_path = os.path.join(output_dir, chunk_filename)

self.logger.debug(f"Exporting chunk {i + 1}/{num_chunks}: {start_ms / 1000:.1f}s - {end_ms / 1000:.1f}s to {chunk_path}")
chunk.export(chunk_path, format=ext.lstrip('.'))
chunk_paths.append(chunk_path)

return chunk_paths

def merge_chunks(self, chunk_paths: List[str], output_path: str) -> str:
"""
Merge processed chunks with simple concatenation.

Args:
chunk_paths: List of paths to chunk files to merge
output_path: Path where the merged output will be saved

Returns:
Path to the merged output file

Raises:
ValueError: If chunk_paths is empty
FileNotFoundError: If any chunk file doesn't exist
IOError: If there's an error reading or writing audio files
"""
if not chunk_paths:
raise ValueError("Cannot merge empty list of chunks")

# Verify all chunks exist
for chunk_path in chunk_paths:
if not os.path.exists(chunk_path):
raise FileNotFoundError(f"Chunk file not found: {chunk_path}")

self.logger.info(f"Merging {len(chunk_paths)} chunks into {output_path}")

# Start with empty audio segment
combined = AudioSegment.empty()

# Concatenate all chunks
for i, chunk_path in enumerate(chunk_paths):
self.logger.debug(f"Loading chunk {i + 1}/{len(chunk_paths)}: {chunk_path}")
chunk = AudioSegment.from_file(chunk_path)
combined += chunk # Simple concatenation

# Get output format from file extension
_, ext = os.path.splitext(output_path)
output_format = ext.lstrip('.') if ext else 'wav'

self.logger.info(f"Exporting merged audio ({len(combined) / 1000:.1f}s) to {output_path}")
combined.export(output_path, format=output_format)

return output_path

def should_chunk(self, audio_duration_seconds: float) -> bool:
"""
Determine if file is large enough to benefit from chunking.

Args:
audio_duration_seconds: Duration of the audio file in seconds

Returns:
True if the file should be chunked, False otherwise
"""
return audio_duration_seconds > (self.chunk_duration_ms / 1000)
130 changes: 130 additions & 0 deletions audio_separator/separator/separator.py
@@ -10,6 +10,7 @@
import warnings
import importlib
import io
import re
from typing import Optional

import hashlib
@@ -94,6 +95,7 @@ def __init__(
use_soundfile=False,
use_autocast=False,
use_directml=False,
chunk_duration=None,
mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": False},
vr_params={"batch_size": 1, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False},
demucs_params={"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True},
@@ -182,6 +184,11 @@ def __init__(
self.use_autocast = use_autocast
self.use_directml = use_directml

self.chunk_duration = chunk_duration
if chunk_duration is not None:
if chunk_duration <= 0:
raise ValueError("chunk_duration must be greater than 0")

# These are parameters which users may want to configure so we expose them to the top-level Separator class,
# even though they are specific to a single model architecture
self.arch_specific_params = {"MDX": mdx_params, "VR": vr_params, "Demucs": demucs_params, "MDXC": mdxc_params}
@@ -866,6 +873,18 @@ def _separate_file(self, audio_file_path, custom_output_names=None):
Returns:
- output_files (list of str): A list containing the paths to the separated audio stem files.
"""
# Check if chunking is enabled and file is large enough
if self.chunk_duration is not None:
import librosa
duration = librosa.get_duration(path=audio_file_path)

from audio_separator.separator.audio_chunking import AudioChunker
chunker = AudioChunker(self.chunk_duration, self.logger)

if chunker.should_chunk(duration):
self.logger.info(f"File duration {duration:.1f}s exceeds chunk size {self.chunk_duration}s, using chunked processing")
return self._process_with_chunking(audio_file_path, custom_output_names)

# Log the start of the separation process
self.logger.info(f"Starting separation process for audio_file_path: {audio_file_path}")
separate_start_time = time.perf_counter()
@@ -899,6 +918,117 @@ def _separate_file(self, audio_file_path, custom_output_names=None):

return output_files

def _process_with_chunking(self, audio_file_path, custom_output_names=None):
"""
Process large file by splitting into chunks.

This method splits a large audio file into smaller chunks, processes each chunk
separately, and merges the results back together. This helps prevent out-of-memory
errors when processing very long audio files.

Parameters:
- audio_file_path (str): The path to the audio file.
- custom_output_names (dict, optional): Custom names for the output files. Defaults to None.

Returns:
- output_files (list of str): A list containing the paths to the separated audio stem files.
"""
import tempfile
import shutil
from audio_separator.separator.audio_chunking import AudioChunker

# Create temporary directory for chunks
temp_dir = tempfile.mkdtemp(prefix="audio-separator-chunks-")
self.logger.debug(f"Created temporary directory for chunks: {temp_dir}")

try:
# Split audio into chunks
chunker = AudioChunker(self.chunk_duration, self.logger)
chunk_paths = chunker.split_audio(audio_file_path, temp_dir)

# Process each chunk
processed_chunks_by_stem = {}

for i, chunk_path in enumerate(chunk_paths):
self.logger.info(f"Processing chunk {i+1}/{len(chunk_paths)}: {chunk_path}")

original_chunk_duration = self.chunk_duration
original_output_dir = self.output_dir
self.chunk_duration = None
self.output_dir = temp_dir

if self.model_instance:
original_model_output_dir = self.model_instance.output_dir
self.model_instance.output_dir = temp_dir

try:
output_files = self._separate_file(chunk_path, custom_output_names)

# Dynamically group chunks by stem name
for stem_path in output_files:
# Extract stem name from filename: "chunk_0000_(Vocals).wav" → "Vocals"
filename = os.path.basename(stem_path)
match = re.search(r'_\(([^)]+)\)', filename)
if match:
stem_name = match.group(1)
else:
# Fallback: use index-based name if pattern not found
stem_index = len([k for k in processed_chunks_by_stem.keys() if k.startswith('stem_')])
stem_name = f"stem_{stem_index}"
self.logger.warning(f"Could not extract stem name from {filename}, using {stem_name}")

if stem_name not in processed_chunks_by_stem:
processed_chunks_by_stem[stem_name] = []

# Ensure absolute path
abs_path = stem_path if os.path.isabs(stem_path) else os.path.join(temp_dir, stem_path)
processed_chunks_by_stem[stem_name].append(abs_path)

if not output_files:
self.logger.warning(f"Chunk {i+1} produced no output files")

finally:
self.chunk_duration = original_chunk_duration
self.output_dir = original_output_dir
if self.model_instance:
self.model_instance.output_dir = original_model_output_dir

# Clear GPU cache between chunks
if self.model_instance:
self.model_instance.clear_gpu_cache()

# Merge chunks for each stem dynamically
base_name = os.path.splitext(os.path.basename(audio_file_path))[0]
output_files = []

for stem_name in sorted(processed_chunks_by_stem.keys()):
chunk_paths_for_stem = processed_chunks_by_stem[stem_name]

if not chunk_paths_for_stem:
self.logger.warning(f"No chunks found for stem: {stem_name}")
continue

# Determine output filename
if custom_output_names and stem_name in custom_output_names:
output_filename = custom_output_names[stem_name]
else:
output_filename = f"{base_name}_({stem_name})"

output_path = os.path.join(self.output_dir, f"{output_filename}.{self.output_format.lower()}")

self.logger.info(f"Merging {len(chunk_paths_for_stem)} chunks for stem: {stem_name}")
chunker.merge_chunks(chunk_paths_for_stem, output_path)
output_files.append(output_path)

self.logger.info(f"Chunked processing completed. Output files: {output_files}")
return output_files

finally:
# Clean up temporary directory
if os.path.exists(temp_dir):
self.logger.debug(f"Cleaning up temporary directory: {temp_dir}")
shutil.rmtree(temp_dir, ignore_errors=True)

def download_model_and_data(self, model_filename):
"""
Downloads the model file without loading it into memory.
3 changes: 3 additions & 0 deletions audio_separator/utils/cli.py
@@ -59,6 +59,7 @@ def main():
sample_rate_help = "Modify the sample rate of the output audio (default: %(default)s). Example: --sample_rate=44100"
use_soundfile_help = "Use soundfile to write audio output (default: %(default)s). Example: --use_soundfile"
use_autocast_help = "Use PyTorch autocast for faster inference (default: %(default)s). Do not use for CPU inference. Example: --use_autocast"
chunk_duration_help = "Split audio into chunks of this duration in seconds (default: %(default)s = no chunking). Useful for processing very long audio files on systems with limited memory. Recommended: 600 (10 minutes) for files >1 hour. Chunks are concatenated without overlap/crossfade. Example: --chunk_duration=600"
custom_output_names_help = 'Custom names for all output files in JSON format (default: %(default)s). Example: --custom_output_names=\'{"Vocals": "vocals_output", "Drums": "drums_output"}\''

common_params = parser.add_argument_group("Common Separation Parameters")
@@ -69,6 +70,7 @@
common_params.add_argument("--sample_rate", type=int, default=44100, help=sample_rate_help)
common_params.add_argument("--use_soundfile", action="store_true", help=use_soundfile_help)
common_params.add_argument("--use_autocast", action="store_true", help=use_autocast_help)
common_params.add_argument("--chunk_duration", type=float, default=None, help=chunk_duration_help)
common_params.add_argument("--custom_output_names", type=json.loads, default=None, help=custom_output_names_help)

mdx_segment_size_help = "Larger consumes more resources, but may give better results (default: %(default)s). Example: --mdx_segment_size=256"
@@ -200,6 +202,7 @@ def main():
sample_rate=args.sample_rate,
use_soundfile=args.use_soundfile,
use_autocast=args.use_autocast,
chunk_duration=args.chunk_duration,
mdx_params={
"hop_length": args.mdx_hop_length,
"segment_size": args.mdx_segment_size,