diff --git a/.gitignore b/.gitignore
index 3d83f225..c17906d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 # Andrew env
 .DS_Store
 .vscode
+debug_results.txt
 
 # Andrew functional adds
 /tracks/
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
new file mode 100644
index 00000000..348051f9
--- /dev/null
+++ b/Dockerfile.rocm
@@ -0,0 +1,34 @@
+# Use ROCm base image with Python
+FROM rocm/dev-ubuntu-22.04:7.2-complete
+
+# Set the working directory in the container
+WORKDIR /workdir
+
+# Install necessary packages
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ffmpeg \
+    python3 \
+    python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN python3 -m pip install --upgrade pip
+
+# Install PyTorch with ROCm support
+RUN --mount=type=cache,target=/root/.cache \
+    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm7.2
+
+# Install audio-separator with GPU support
+RUN --mount=type=cache,target=/root/.cache \
+    pip3 install "audio-separator[gpu]" onnxruntime-rocm
+
+# Default environment variables for AMD RX 6600 series (gfx1032)
+# Override these for other GPUs:
+#   RX 7900: HSA_OVERRIDE_GFX_VERSION=11.0.0, PYTORCH_ROCM_ARCH=gfx1100
+#   MI250X:  HSA_OVERRIDE_GFX_VERSION=9.4.2, PYTORCH_ROCM_ARCH=gfx90a
+ARG HSA_OVERRIDE_GFX_VERSION=10.3.2
+ARG PYTORCH_ROCM_ARCH=gfx1030
+ENV HSA_OVERRIDE_GFX_VERSION=${HSA_OVERRIDE_GFX_VERSION}
+ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+
+# Run audio-separator when the container launches
+ENTRYPOINT ["audio-separator"]
\ No newline at end of file
diff --git a/README.md b/README.md
index e51e32d7..b56fe41f 100644
--- a/README.md
+++ b/README.md
@@ -25,11 +25,13 @@ The simplest (and probably most used) use case for this package is to separate a
   - [Installation 🛠️](#installation-%EF%B8%8F)
     - [🐳 Docker](#-docker)
     - [🎮 Nvidia GPU with CUDA or 🧪 Google Colab](#-nvidia-gpu-with-cuda-or--google-colab)
+    - [🖥️ AMD GPU with ROCm (Linux)](#-amd-gpu-with-rocm-linux)
     - [ Apple Silicon, macOS Sonoma+ with M1 or newer CPU (CoreML acceleration)](#-apple-silicon-macos-sonoma-with-m1-or-newer-cpu-coreml-acceleration)
     - [🐢 No hardware acceleration, CPU only](#-no-hardware-acceleration-cpu-only)
     - [🎥 FFmpeg dependency](#-ffmpeg-dependency)
   - [GPU / CUDA specific installation steps with Pip](#gpu--cuda-specific-installation-steps-with-pip)
     - [Multiple CUDA library versions may be needed](#multiple-cuda-library-versions-may-be-needed)
+    - [ROCm specific troubleshooting](#rocm-specific-troubleshooting)
   - [Usage 🚀](#usage-)
     - [Command Line Interface (CLI)](#command-line-interface-cli)
     - [Listing and Filtering Available Models](#listing-and-filtering-available-models)
@@ -67,6 +69,7 @@ The simplest (and probably most used) use case for this package is to separate a
 - Ability to inference using a pre-trained model in PTH or ONNX format.
 - CLI support for easy use in scripts and batch processing.
 - Python API for integration into other projects.
+- **Multi-platform GPU acceleration**: NVIDIA CUDA, AMD ROCm, Apple Silicon MPS/CoreML, DirectML, and CPU fallback.
 
 ## Installation 🛠️
 
@@ -112,6 +115,50 @@ Docker:
 beveradb/audio-separator:gpu
 ```
 
+### 🖥️ AMD GPU with ROCm (Linux)
+
+**Supported ROCm Versions:** 5.7+
+
+💬 If successfully configured, you should see this log message when running `audio-separator --env_info`:
+ `ONNXruntime has ROCMExecutionProvider available, enabling acceleration`
+
+Pip (complete installation):
+```sh
+# First install PyTorch with ROCm support (Change ROCm version as needed.)
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm7.2
+
+# Then install audio-separator with ROCm support
+pip install "audio-separator[rocm]"
+```
+
+**Important:** You must install PyTorch with ROCm support BEFORE installing audio-separator. If you already have PyTorch with CUDA support installed, uninstall it first:
+```sh
+pip uninstall torch torchvision torchaudio
+pip cache purge
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7
+pip install "audio-separator[rocm]"
+```
+
+**Required ROCm Packages:**
+- PyTorch ROCm: `torch`, `torchvision`, `torchaudio` with ROCm support
+- ONNX Runtime: `onnxruntime`, `onnxruntime-rocm`
+
+**Basic ROCm Setup:**
+- For AMD Radeon RX 6600 series (gfx1032), set environment variables:
+```sh
+export HSA_OVERRIDE_GFX_VERSION=10.3.2
+export PYTORCH_ROCM_ARCH=gfx1030
+```
+- ROCm acceleration uses the CUDAExecutionProvider (ONNX Runtime maps ROCm to CUDA for compatibility)
+- The system detects ROCm packages and PyTorch ROCm support automatically
+- ROCm libraries must be properly installed on your system for acceleration to work
+
+Docker (build from source):
+```sh
+docker build -f Dockerfile.rocm -t audio-separator:rocm .
+docker run -it --device=/dev/kfd --device=/dev/dri --group-add=video -v `pwd`:/workdir audio-separator:rocm input.wav
+```
+
 ###  Apple Silicon, macOS Sonoma+ with M1 or newer CPU (CoreML acceleration)
 
 💬 If successfully configured, you should see this log message when running `audio-separator --env_info`:
@@ -157,19 +204,26 @@ apt-get update; apt-get install -y ffmpeg
 brew update; brew install ffmpeg
 ```
 
-## GPU / CUDA specific installation steps with Pip
+## GPU / CUDA specific installation steps with Pip (CUDA and ROCm)
 
-In theory, all you should need to do to get `audio-separator` working with a GPU is install it with the `[gpu]` extra as above.
+In theory, all you should need to do to get `audio-separator` working with a GPU is install it with the appropriate extra (`[gpu]` for CUDA/NVIDIA or `[rocm]` for ROCm/AMD) as above.
 
-However, sometimes getting both PyTorch and ONNX Runtime working with CUDA support can be a bit tricky so it may not work that easily.
+However, sometimes getting both PyTorch and ONNX Runtime working with GPU support can be a bit tricky so it may not work that easily.
 
 You may need to reinstall both packages directly, allowing pip to calculate the right versions for your platform, for example:
 
+**For CUDA/NVIDIA (`[gpu]`):**
 - `pip uninstall torch onnxruntime`
 - `pip cache purge`
 - `pip install --force-reinstall torch torchvision torchaudio`
 - `pip install --force-reinstall onnxruntime-gpu`
 
+**For ROCm/AMD (`[rocm]`):**
+- `pip uninstall torch onnxruntime onnxruntime-rocm`
+- `pip cache purge`
+- `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm7.2`
+- `pip install --force-reinstall onnxruntime-rocm`
+
 I generally recommend installing the latest version of PyTorch for your environment using the command recommended by the wizard here:
 <https://pytorch.org/get-started/locally/>
 
@@ -197,6 +251,34 @@ You can resolve this by running the following command:
 python -m pip install ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/
 ```
 
+### ROCm specific troubleshooting
+
+For ROCm (AMD GPU) support, make sure you have:
+1. ROCm installed on your system (typically version 5.7+)
+2. PyTorch with ROCm support installed (check PyTorch website for ROCm installation)
+3. `onnxruntime-rocm` package installed
+
+If you encounter issues with ROCm detection, try reinstalling the packages:
+```sh
+pip uninstall torch onnxruntime
+pip cache purge
+# Install PyTorch with ROCm support (check https://pytorch.org for the correct command)
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7
+pip install onnxruntime-rocm
+```
+
+**ROCm Performance Optimization:**
+- The ROCm execution provider includes performance optimizations for AMD GPUs:
+  - Parallel execution mode for better multi-core utilization
+  - Kernel tuning enabled for optimal performance
+  - Memory pattern optimization for better cache usage
+  - Smart memory allocation strategy
+
+**Common ROCm Issues:**
+- If you see ROCm package installed but no acceleration: Make sure `onnxruntime-rocm` is installed and ROCm libraries are in your PATH
+- If PyTorch shows CUDA but not ROCm: Reinstall PyTorch with ROCm support using the PyTorch ROCm index URL
+- Docker issues: Use the provided `Dockerfile.rocm` and ensure proper device mounting
+
 > Note: if anyone knows how to make this cleaner so we can support both different platform-specific dependencies for hardware acceleration without a separate installation process for each, please let me know or raise a PR!
 
 ## Usage 🚀
diff --git a/audio_separator/separator/architectures/demucs_separator.py b/audio_separator/separator/architectures/demucs_separator.py
index d1d62dc3..a653f853 100644
--- a/audio_separator/separator/architectures/demucs_separator.py
+++ b/audio_separator/separator/architectures/demucs_separator.py
@@ -4,15 +4,25 @@
 import torch
 import numpy as np
 from audio_separator.separator.common_separator import CommonSeparator
-from audio_separator.separator.uvr_lib_v5.demucs.apply import apply_model, demucs_segments
+from audio_separator.separator.uvr_lib_v5.demucs.apply import (
+    apply_model,
+    demucs_segments,
+)
 from audio_separator.separator.uvr_lib_v5.demucs.hdemucs import HDemucs
-from audio_separator.separator.uvr_lib_v5.demucs.pretrained import get_model as get_demucs_model
+from audio_separator.separator.uvr_lib_v5.demucs.pretrained import (
+    get_model as get_demucs_model,
+)
 from audio_separator.separator.uvr_lib_v5 import spec_utils
 
 DEMUCS_4_SOURCE = ["drums", "bass", "other", "vocals"]
 
 DEMUCS_2_SOURCE_MAPPER = {CommonSeparator.INST_STEM: 0, CommonSeparator.VOCAL_STEM: 1}
-DEMUCS_4_SOURCE_MAPPER = {CommonSeparator.BASS_STEM: 0, CommonSeparator.DRUM_STEM: 1, CommonSeparator.OTHER_STEM: 2, CommonSeparator.VOCAL_STEM: 3}
+DEMUCS_4_SOURCE_MAPPER = {
+    CommonSeparator.BASS_STEM: 0,
+    CommonSeparator.DRUM_STEM: 1,
+    CommonSeparator.OTHER_STEM: 2,
+    CommonSeparator.VOCAL_STEM: 3,
+}
 DEMUCS_6_SOURCE_MAPPER = {
     CommonSeparator.BASS_STEM: 0,
     CommonSeparator.DRUM_STEM: 1,
@@ -64,8 +74,12 @@ def __init__(self, common_config, arch_config):
         # Enables "Segments". Deselecting this option is only recommended for those with powerful PCs.
         self.segments_enabled = arch_config.get("segments_enabled", True)
 
-        self.logger.debug(f"Demucs arch params: segment_size={self.segment_size}, segments_enabled={self.segments_enabled}")
-        self.logger.debug(f"Demucs arch params: shifts={self.shifts}, overlap={self.overlap}")
+        self.logger.debug(
+            f"Demucs arch params: segment_size={self.segment_size}, segments_enabled={self.segments_enabled}"
+        )
+        self.logger.debug(
+            f"Demucs arch params: shifts={self.shifts}, overlap={self.overlap}"
+        )
 
         self.demucs_source_map = DEMUCS_4_SOURCE_MAPPER
 
@@ -107,15 +121,23 @@ def separate(self, audio_file_path, custom_output_names=None):
 
         self.logger.debug("Loading model for demixing...")
 
-        self.demucs_model_instance = HDemucs(sources=DEMUCS_4_SOURCE)
-        self.demucs_model_instance = get_demucs_model(name=os.path.splitext(os.path.basename(self.model_path))[0], repo=Path(os.path.dirname(self.model_path)))
-        self.demucs_model_instance = demucs_segments(self.segment_size, self.demucs_model_instance)
-        self.demucs_model_instance.to(self.torch_device)
+        # Use GPU device for Demucs if available and not explicitly disabled
+        inference_device = self.torch_device
+
+        # Load the ROCm-compatible Demucs model
+        self.demucs_model_instance = get_demucs_model(
+            name=os.path.splitext(os.path.basename(self.model_path))[0],
+            repo=Path(os.path.dirname(self.model_path)),
+        )
+        self.demucs_model_instance = demucs_segments(
+            self.segment_size, self.demucs_model_instance
+        )
+        self.demucs_model_instance.to(inference_device)
         self.demucs_model_instance.eval()
 
         self.logger.debug("Model loaded and set to evaluation mode.")
 
-        source = self.demix_demucs(mix)
+        source = self.demix_demucs(mix, inference_device)
 
         del self.demucs_model_instance
         self.clear_gpu_cache()
@@ -126,13 +148,20 @@ def separate(self, audio_file_path, custom_output_names=None):
 
         if isinstance(inst_source, np.ndarray):
             self.logger.debug("Processing instance source...")
-            source_reshape = spec_utils.reshape_sources(inst_source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]], source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]])
-            inst_source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]] = source_reshape
+            source_reshape = spec_utils.reshape_sources(
+                inst_source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]],
+                source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]],
+            )
+            inst_source[self.demucs_source_map[CommonSeparator.VOCAL_STEM]] = (
+                source_reshape
+            )
             source = inst_source
 
         if isinstance(source, np.ndarray):
             source_length = len(source)
-            self.logger.debug(f"Processing source array, source length is {source_length}")
+            self.logger.debug(
+                f"Processing source array, source length is {source_length}"
+            )
             match source_length:
                 case 2:
                     self.logger.debug("Setting source map to 2-stem...")
@@ -148,7 +177,9 @@ def separate(self, audio_file_path, custom_output_names=None):
         for stem_name, stem_value in self.demucs_source_map.items():
             if self.output_single_stem is not None:
                 if stem_name.lower() != self.output_single_stem.lower():
-                    self.logger.debug(f"Skipping writing stem {stem_name} as output_single_stem is set to {self.output_single_stem}...")
+                    self.logger.debug(
+                        f"Skipping writing stem {stem_name} as output_single_stem is set to {self.output_single_stem}..."
+                    )
                     continue
 
             stem_path = self.get_stem_output_path(stem_name, custom_output_names)
@@ -159,7 +190,7 @@ def separate(self, audio_file_path, custom_output_names=None):
 
         return output_files
 
-    def demix_demucs(self, mix):
+    def demix_demucs(self, mix, inference_device):
         """
         Demixes the input mix using the demucs model.
         """
@@ -181,7 +212,7 @@ def demix_demucs(self, mix):
                 overlap=self.overlap,
                 static_shifts=1 if self.shifts == 0 else self.shifts,
                 set_progress_bar=None,
-                device=self.torch_device,
+                device=inference_device,
                 progress=True,
             )[0]
 
diff --git a/audio_separator/separator/architectures/mdxc_separator.py b/audio_separator/separator/architectures/mdxc_separator.py
index 1ddb4999..b9b22199 100644
--- a/audio_separator/separator/architectures/mdxc_separator.py
+++ b/audio_separator/separator/architectures/mdxc_separator.py
@@ -37,7 +37,9 @@ def __init__(self, common_config, arch_config):
 
         # Whether or not to use the segment size from model config, or the default
         # The segment size is set based on the value provided in a chosen model's associated config file (yaml).
-        self.override_model_segment_size = arch_config.get("override_model_segment_size", False)
+        self.override_model_segment_size = arch_config.get(
+            "override_model_segment_size", False
+        )
 
         self.overlap = arch_config.get("overlap", 8)
         self.batch_size = arch_config.get("batch_size", 1)
@@ -51,9 +53,15 @@ def __init__(self, common_config, arch_config):
 
         self.process_all_stems = arch_config.get("process_all_stems", True)
 
-        self.logger.debug(f"MDXC arch params: batch_size={self.batch_size}, segment_size={self.segment_size}, overlap={self.overlap}")
-        self.logger.debug(f"MDXC arch params: override_model_segment_size={self.override_model_segment_size}, pitch_shift={self.pitch_shift}")
-        self.logger.debug(f"MDXC multi-stem params: process_all_stems={self.process_all_stems}")
+        self.logger.debug(
+            f"MDXC arch params: batch_size={self.batch_size}, segment_size={self.segment_size}, overlap={self.overlap}"
+        )
+        self.logger.debug(
+            f"MDXC arch params: override_model_segment_size={self.override_model_segment_size}, pitch_shift={self.pitch_shift}"
+        )
+        self.logger.debug(
+            f"MDXC multi-stem params: process_all_stems={self.process_all_stems}"
+        )
 
         # Align Roformer detection flag with CommonSeparator to ensure consistent stats/logging
         self.is_roformer = getattr(self, "is_roformer_model", False)
@@ -67,21 +75,27 @@ def __init__(self, common_config, arch_config):
 
         # Only mark primary stem as main target for single-target models.
         # Multi-stem models should not trigger residual subtraction logic.
-        self.is_primary_stem_main_target = bool(self.model_data_cfgdict.training.target_instrument)
+        self.is_primary_stem_main_target = bool(
+            self.model_data_cfgdict.training.target_instrument
+        )
 
-        self.logger.debug(f"is_primary_stem_main_target: {self.is_primary_stem_main_target}")
+        self.logger.debug(
+            f"is_primary_stem_main_target: {self.is_primary_stem_main_target}"
+        )
 
         self.logger.info("MDXC Separator initialisation complete")
 
     def load_model(self):
         """
-        Load the model into memory from file on disk, initialize it with config from the model data,
+        Load the model into memory from file on disk, initialize it with config from the model_data,
         and prepare for inferencing using hardware accelerated Torch device.
         """
         self.logger.debug("Loading checkpoint model for inference...")
 
         self.model_data_cfgdict = ConfigDict(self.model_data)
 
+        inference_device = self.torch_device
+
         try:
             if self.is_roformer:
                 # Use the RoformerLoader exclusively; no legacy fallback
@@ -89,30 +103,49 @@ def load_model(self):
                 result = self.roformer_loader.load_model(
                     model_path=self.model_path,
                     config=self.model_data,
-                    device=str(self.torch_device),
+                    device=str(inference_device),
                 )
 
-                if getattr(result, "success", False) and getattr(result, "model", None) is not None:
+                if (
+                    getattr(result, "success", False)
+                    and getattr(result, "model", None) is not None
+                ):
                     self.model_run = result.model
-                    self.model_run.to(self.torch_device).eval()
+                    self.logger.debug(
+                        f"Roformer model device before .to(): {next(self.model_run.parameters()).device}"
+                    )
+                    self.model_run.to(inference_device).eval()
+                    self.logger.debug(
+                        f"Roformer model device after .to(): {next(self.model_run.parameters()).device}"
+                    )
                 else:
-                    error_msg = getattr(result, "error_message", "RoformerLoader unsuccessful")
+                    error_msg = getattr(
+                        result, "error_message", "RoformerLoader unsuccessful"
+                    )
                     self.logger.error(f"Failed to load Roformer model: {error_msg}")
                     raise RuntimeError(error_msg)
 
             else:
                 self.logger.debug("Loading TFC_TDF_net model...")
-                self.model_run = TFC_TDF_net(self.model_data_cfgdict, device=self.torch_device)
+                self.model_run = TFC_TDF_net(
+                    self.model_data_cfgdict, device=inference_device
+                )
                 self.logger.debug("Loading model onto cpu")
-                # For some reason loading the state onto a hardware accelerated devices causes issues, 
+                # For some reason loading the state onto a hardware accelerated devices causes issues,
                 # so we load it onto CPU first then move it to the device
-                self.model_run.load_state_dict(torch.load(self.model_path, map_location="cpu"))
-                self.model_run.to(self.torch_device).eval()
+                self.model_run.load_state_dict(
+                    torch.load(self.model_path, map_location="cpu")
+                )
+                self.model_run.to(inference_device).eval()
 
         except RuntimeError as e:
             self.logger.error(f"Error: {e}")
-            self.logger.error("An error occurred while loading the model file. This often occurs when the model file is corrupt or incomplete.")
-            self.logger.error(f"Please try deleting the model file from {self.model_path} and run audio-separator again to re-download it.")
+            self.logger.error(
+                "An error occurred while loading the model file. This often occurs when the model file is corrupt or incomplete."
+            )
+            self.logger.error(
+                f"Please try deleting the model file from {self.model_path} and run audio-separator again to re-download it."
+            )
             sys.exit(1)
 
     def separate(self, audio_file_path, custom_output_names=None):
@@ -133,7 +166,9 @@ def separate(self, audio_file_path, custom_output_names=None):
         self.audio_file_path = audio_file_path
         self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0]
 
-        self.logger.debug(f"Preparing mix for input audio file {self.audio_file_path}...")
+        self.logger.debug(
+            f"Preparing mix for input audio file {self.audio_file_path}..."
+        )
         mix = self.prepare_mix(self.audio_file_path)
 
         # Check if audio is shorter than threshold
@@ -142,11 +177,19 @@ def separate(self, audio_file_path, custom_output_names=None):
             # Only change and warn if it wasn't already set by the user
             if not self.override_model_segment_size:
                 self.override_model_segment_size = True
-                self.logger.warning(f"Audio duration ({audio_duration_seconds:.2f}s) is less than 10 seconds.")
-                self.logger.warning("Automatically enabling override_model_segment_size for better processing of short audio.")
+                self.logger.warning(
+                    f"Audio duration ({audio_duration_seconds:.2f}s) is less than 10 seconds."
+                )
+                self.logger.warning(
+                    "Automatically enabling override_model_segment_size for better processing of short audio."
+                )
 
         self.logger.debug("Normalizing mix before demixing...")
-        mix = spec_utils.normalize(wave=mix, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold)
+        mix = spec_utils.normalize(
+            wave=mix,
+            max_peak=self.normalization_threshold,
+            min_peak=self.amplification_threshold,
+        )
 
         source = self.demix(mix=mix)
         self.logger.debug("Demixing completed.")
@@ -156,73 +199,115 @@ def separate(self, audio_file_path, custom_output_names=None):
 
         if isinstance(source, dict):
             self.logger.debug("Source is a dict, processing each stem...")
-            
+
             stem_list = []
             if self.model_data_cfgdict.training.target_instrument:
                 stem_list = [self.model_data_cfgdict.training.target_instrument]
             else:
                 stem_list = self.model_data_cfgdict.training.instruments
-            
+
             self.logger.debug(f"Available stems: {stem_list}")
 
             is_multi_stem_model = len(stem_list) > 2
             should_process_all_stems = self.process_all_stems and is_multi_stem_model
-            
+
             if should_process_all_stems:
                 self.logger.debug("Processing all stems from multi-stem model...")
                 for stem_name in stem_list:
-                    stem_output_path = self.get_stem_output_path(stem_name, custom_output_names)
+                    stem_output_path = self.get_stem_output_path(
+                        stem_name, custom_output_names
+                    )
                     stem_source = spec_utils.normalize(
-                        wave=source[stem_name], 
-                        max_peak=self.normalization_threshold, 
-                        min_peak=self.amplification_threshold
+                        wave=source[stem_name],
+                        max_peak=self.normalization_threshold,
+                        min_peak=self.amplification_threshold,
                     ).T
-                    
-                    self.logger.info(f"Saving {stem_name} stem to {stem_output_path}...")
+
+                    self.logger.info(
+                        f"Saving {stem_name} stem to {stem_output_path}..."
+                    )
                     self.final_process(stem_output_path, stem_source, stem_name)
                     output_files.append(stem_output_path)
             else:
                 # Standard processing for primary and secondary stems
                 if not isinstance(self.primary_source, np.ndarray):
-                    self.logger.debug(f"Normalizing primary source for primary stem {self.primary_stem_name}...")
+                    self.logger.debug(
+                        f"Normalizing primary source for primary stem {self.primary_stem_name}..."
+                    )
                     self.primary_source = spec_utils.normalize(
-                        wave=source[self.primary_stem_name], 
-                        max_peak=self.normalization_threshold, 
-                        min_peak=self.amplification_threshold
+                        wave=source[self.primary_stem_name],
+                        max_peak=self.normalization_threshold,
+                        min_peak=self.amplification_threshold,
                     ).T
 
                 if not isinstance(self.secondary_source, np.ndarray):
-                    self.logger.debug(f"Normalizing secondary source for secondary stem {self.secondary_stem_name}...")
+                    self.logger.debug(
+                        f"Normalizing secondary source for secondary stem {self.secondary_stem_name}..."
+                    )
                     self.secondary_source = spec_utils.normalize(
-                        wave=source[self.secondary_stem_name], 
-                        max_peak=self.normalization_threshold, 
-                        min_peak=self.amplification_threshold
+                        wave=source[self.secondary_stem_name],
+                        max_peak=self.normalization_threshold,
+                        min_peak=self.amplification_threshold,
                     ).T
 
-                if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower():
-                    self.secondary_stem_output_path = self.get_stem_output_path(self.secondary_stem_name, custom_output_names)
-
-                    self.logger.info(f"Saving {self.secondary_stem_name} stem to {self.secondary_stem_output_path}...")
-                    self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name)
+                if (
+                    not self.output_single_stem
+                    or self.output_single_stem.lower()
+                    == self.secondary_stem_name.lower()
+                ):
+                    self.secondary_stem_output_path = self.get_stem_output_path(
+                        self.secondary_stem_name, custom_output_names
+                    )
+
+                    self.logger.info(
+                        f"Saving {self.secondary_stem_name} stem to {self.secondary_stem_output_path}..."
+                    )
+                    self.final_process(
+                        self.secondary_stem_output_path,
+                        self.secondary_source,
+                        self.secondary_stem_name,
+                    )
                     output_files.append(self.secondary_stem_output_path)
-                
-                if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower():
-                    self.primary_stem_output_path = self.get_stem_output_path(self.primary_stem_name, custom_output_names)
-                    
-                    self.logger.info(f"Saving {self.primary_stem_name} stem to {self.primary_stem_output_path}...")
-                    self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name)
+
+                if (
+                    not self.output_single_stem
+                    or self.output_single_stem.lower() == self.primary_stem_name.lower()
+                ):
+                    self.primary_stem_output_path = self.get_stem_output_path(
+                        self.primary_stem_name, custom_output_names
+                    )
+
+                    self.logger.info(
+                        f"Saving {self.primary_stem_name} stem to {self.primary_stem_output_path}..."
+                    )
+                    self.final_process(
+                        self.primary_stem_output_path,
+                        self.primary_source,
+                        self.primary_stem_name,
+                    )
                     output_files.append(self.primary_stem_output_path)
 
         else:
             # Handle case when source is not a dictionary (single source model)
-            if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower():
-                self.primary_stem_output_path = self.get_stem_output_path(self.primary_stem_name, custom_output_names)
+            if (
+                not self.output_single_stem
+                or self.output_single_stem.lower() == self.primary_stem_name.lower()
+            ):
+                self.primary_stem_output_path = self.get_stem_output_path(
+                    self.primary_stem_name, custom_output_names
+                )
 
                 if not isinstance(self.primary_source, np.ndarray):
                     self.primary_source = source.T
 
-                self.logger.info(f"Saving {self.primary_stem_name} stem to {self.primary_stem_output_path}...")
-                self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name)
+                self.logger.info(
+                    f"Saving {self.primary_stem_name} stem to {self.primary_stem_output_path}..."
+                )
+                self.final_process(
+                    self.primary_stem_output_path,
+                    self.primary_source,
+                    self.primary_stem_name,
+                )
                 output_files.append(self.primary_stem_output_path)
 
         return output_files
@@ -239,7 +324,9 @@ def pitch_fix(self, source, sr_pitched, orig_mix):
         Returns:
             np.ndarray: The pitch-shifted source audio.
         """
-        source = spec_utils.change_pitch_semitones(source, sr_pitched, semitone_shift=self.pitch_shift)[0]
+        source = spec_utils.change_pitch_semitones(
+            source, sr_pitched, semitone_shift=self.pitch_shift
+        )[0]
         source = spec_utils.match_array_shapes(source, orig_mix)
         return source
 
@@ -251,7 +338,9 @@ def overlap_add(self, result, x, weights, start, length):
         # Use the minimum of provided lengths to avoid broadcasting errors
         safe_len = min(length, x.shape[-1], weights.shape[0])
         if safe_len > 0:
-            result[..., start : start + safe_len] += x[..., :safe_len] * weights[:safe_len]
+            result[..., start : start + safe_len] += (
+                x[..., :safe_len] * weights[:safe_len]
+            )
         return result
 
     def demix(self, mix: np.ndarray) -> dict:
@@ -267,7 +356,9 @@ def demix(self, mix: np.ndarray) -> dict:
 
         if self.pitch_shift != 0:
             self.logger.debug(f"Shifting pitch by -{self.pitch_shift} semitones...")
-            mix, sample_rate = spec_utils.change_pitch_semitones(mix, self.sample_rate, semitone_shift=-self.pitch_shift)
+            mix, sample_rate = spec_utils.change_pitch_semitones(
+                mix, self.sample_rate, semitone_shift=-self.pitch_shift
+            )
 
         if self.is_roformer:
             # Note: Currently, for Roformer models, `batch_size` is not utilized due to negligible performance improvements.
@@ -279,15 +370,23 @@ def demix(self, mix: np.ndarray) -> dict:
                 self.logger.debug(f"Using configured segment size: {mdx_segment_size}")
             else:
                 mdx_segment_size = self.model_data_cfgdict.inference.dim_t
-                self.logger.debug(f"Using model default segment size: {mdx_segment_size}")
+                self.logger.debug(
+                    f"Using model default segment size: {mdx_segment_size}"
+                )
 
             # num_stems aka "S" in UVR
-            num_stems = 1 if self.model_data_cfgdict.training.target_instrument else len(self.model_data_cfgdict.training.instruments)
+            num_stems = (
+                1
+                if self.model_data_cfgdict.training.target_instrument
+                else len(self.model_data_cfgdict.training.instruments)
+            )
             self.logger.debug(f"Number of stems: {num_stems}")
 
             # chunk_size aka "C" in UVR
             # IMPORTANT: For Roformer models, use the model's STFT hop length to derive the temporal chunk size
-            stft_hop_len = getattr(self.model_data_cfgdict.model, "stft_hop_length", None)
+            stft_hop_len = getattr(
+                self.model_data_cfgdict.model, "stft_hop_length", None
+            )
             if stft_hop_len is None:
                 # Fallback to audio.hop_length if not present, but log for visibility
                 stft_hop_len = self.model_data_cfgdict.audio.hop_length
@@ -307,13 +406,16 @@ def demix(self, mix: np.ndarray) -> dict:
             self.logger.debug(f"Step: {step} (desired={desired_step})")
 
             # Create a weighting table and convert it to a PyTorch tensor
-            window = torch.tensor(signal.windows.hamming(chunk_size), dtype=torch.float32)
+            window = torch.tensor(
+                signal.windows.hamming(chunk_size), dtype=torch.float32
+            )
 
             device = next(self.model_run.parameters()).device
 
-
             with torch.no_grad():
-                req_shape = (len(self.model_data_cfgdict.training.instruments),) + tuple(mix.shape)
+                req_shape = (
+                    len(self.model_data_cfgdict.training.instruments),
+                ) + tuple(mix.shape)
                 result = torch.zeros(req_shape, dtype=torch.float32)
                 counter = torch.zeros(req_shape, dtype=torch.float32)
 
@@ -333,7 +435,9 @@ def demix(self, mix: np.ndarray) -> dict:
                         result = self.overlap_add(result, x, window, start_idx, length)
                         safe_len = min(length, x.shape[-1], window.shape[0])
                         if safe_len > 0:
-                            counter[..., start_idx : start_idx + safe_len] += window[:safe_len]
+                            counter[..., start_idx : start_idx + safe_len] += window[
+                                :safe_len
+                            ]
                     else:
                         result = self.overlap_add(result, x, window, i, length)
                         safe_len = min(length, x.shape[-1], window.shape[0])
@@ -356,9 +460,13 @@ def demix(self, mix: np.ndarray) -> dict:
                 self.logger.debug(f"Using configured segment size: {mdx_segment_size}")
             else:
                 mdx_segment_size = self.model_data_cfgdict.inference.dim_t
-                self.logger.debug(f"Using model default segment size: {mdx_segment_size}")
+                self.logger.debug(
+                    f"Using model default segment size: {mdx_segment_size}"
+                )
 
-            chunk_size = self.model_data_cfgdict.audio.hop_length * (mdx_segment_size - 1)
+            chunk_size = self.model_data_cfgdict.audio.hop_length * (
+                mdx_segment_size - 1
+            )
             self.logger.debug(f"Chunk size: {chunk_size}")
 
             hop_size = chunk_size // self.overlap
@@ -368,19 +476,35 @@ def demix(self, mix: np.ndarray) -> dict:
             pad_size = hop_size - (mix_shape - chunk_size) % hop_size
             self.logger.debug(f"Pad size: {pad_size}")
 
-            mix = torch.cat([torch.zeros(2, chunk_size - hop_size), mix, torch.zeros(2, pad_size + chunk_size - hop_size)], 1)
+            mix = torch.cat(
+                [
+                    torch.zeros(2, chunk_size - hop_size),
+                    mix,
+                    torch.zeros(2, pad_size + chunk_size - hop_size),
+                ],
+                1,
+            )
             self.logger.debug(f"Mix shape: {mix.shape}")
 
             chunks = mix.unfold(1, chunk_size, hop_size).transpose(0, 1)
             self.logger.debug(f"Chunks length: {len(chunks)} and shape: {chunks.shape}")
 
-            batches = [chunks[i : i + self.batch_size] for i in range(0, len(chunks), self.batch_size)]
-            self.logger.debug(f"Batch size: {self.batch_size}, number of batches: {len(batches)}")
+            batches = [
+                chunks[i : i + self.batch_size]
+                for i in range(0, len(chunks), self.batch_size)
+            ]
+            self.logger.debug(
+                f"Batch size: {self.batch_size}, number of batches: {len(batches)}"
+            )
 
             # accumulated_outputs is used to accumulate the output from processing each batch of chunks through the model.
             # It starts as a tensor of zeros and is updated in-place as the model processes each batch.
             # The variable holds the combined result of all processed batches, which, after post-processing, represents the separated audio sources.
-            accumulated_outputs = torch.zeros(num_stems, *mix.shape) if num_stems > 1 else torch.zeros_like(mix)
+            accumulated_outputs = (
+                torch.zeros(num_stems, *mix.shape)
+                if num_stems > 1
+                else torch.zeros_like(mix)
+            )
 
             with torch.no_grad():
                 count = 0
@@ -395,16 +519,27 @@ def demix(self, mix: np.ndarray) -> dict:
                     for individual_output in single_batch_result:
                         individual_output_cpu = individual_output.cpu()
                         # Accumulate outputs on CPU
-                        accumulated_outputs[..., count * hop_size : count * hop_size + chunk_size] += individual_output_cpu
+                        accumulated_outputs[
+                            ..., count * hop_size : count * hop_size + chunk_size
+                        ] += individual_output_cpu
                         count += 1
 
-            self.logger.debug("Calculating inferenced outputs based on accumulated outputs and overlap")
-            inferenced_outputs = accumulated_outputs[..., chunk_size - hop_size : -(pad_size + chunk_size - hop_size)] / self.overlap
+            self.logger.debug(
+                "Calculating inferenced outputs based on accumulated outputs and overlap"
+            )
+            inferenced_outputs = (
+                accumulated_outputs[
+                    ..., chunk_size - hop_size : -(pad_size + chunk_size - hop_size)
+                ]
+                / self.overlap
+            )
             self.logger.debug("Deleting accumulated outputs to free up memory")
             del accumulated_outputs
 
         if num_stems > 1:
-            self.logger.debug("Number of stems is greater than 1, detaching individual sources and correcting pitch if necessary...")
+            self.logger.debug(
+                "Number of stems is greater than 1, detaching individual sources and correcting pitch if necessary..."
+            )
 
             sources = {}
 
@@ -412,7 +547,10 @@ def demix(self, mix: np.ndarray) -> dict:
             # self.model_data_cfgdict.training.instruments provides the list of stems.
             # estimated_sources.cpu().detach().numpy() converts the separated sources tensor to a NumPy array for processing.
             # Each iteration provides an instrument name ('key') and its separated audio ('value') for further processing.
-            for key, value in zip(self.model_data_cfgdict.training.instruments, inferenced_outputs.cpu().detach().numpy()):
+            for key, value in zip(
+                self.model_data_cfgdict.training.instruments,
+                inferenced_outputs.cpu().detach().numpy(),
+            ):
                 self.logger.debug(f"Processing instrument: {key}")
                 if self.pitch_shift != 0:
                     self.logger.debug(f"Applying pitch correction for {key}")
@@ -422,10 +560,16 @@ def demix(self, mix: np.ndarray) -> dict:
 
             # Residual subtraction is only applicable for single-target models (not multi-stem)
             if self.is_primary_stem_main_target and num_stems == 1:
-                self.logger.debug(f"Primary stem: {self.primary_stem_name} is main target, detaching and matching array shapes if necessary...")
+                self.logger.debug(
+                    f"Primary stem: {self.primary_stem_name} is main target, detaching and matching array shapes if necessary..."
+                )
                 if sources[self.primary_stem_name].shape[1] != orig_mix.shape[1]:
-                    sources[self.primary_stem_name] = spec_utils.match_array_shapes(sources[self.primary_stem_name], orig_mix)
-                sources[self.secondary_stem_name] = orig_mix - sources[self.primary_stem_name]
+                    sources[self.primary_stem_name] = spec_utils.match_array_shapes(
+                        sources[self.primary_stem_name], orig_mix
+                    )
+                sources[self.secondary_stem_name] = (
+                    orig_mix - sources[self.primary_stem_name]
+                )
 
             self.logger.debug("Deleting inferenced outputs to free up memory")
             del inferenced_outputs
@@ -436,8 +580,16 @@ def demix(self, mix: np.ndarray) -> dict:
             self.logger.debug("Processing single source...")
 
             if self.is_roformer:
-                sources = {k: v.cpu().detach().numpy() for k, v in zip([self.model_data_cfgdict.training.target_instrument], inferenced_outputs)}
-                inferenced_output = sources[self.model_data_cfgdict.training.target_instrument]
+                sources = {
+                    k: v.cpu().detach().numpy()
+                    for k, v in zip(
+                        [self.model_data_cfgdict.training.target_instrument],
+                        inferenced_outputs,
+                    )
+                }
+                inferenced_output = sources[
+                    self.model_data_cfgdict.training.target_instrument
+                ]
             else:
                 inferenced_output = inferenced_outputs.cpu().detach().numpy()
 
@@ -454,7 +606,9 @@ def demix(self, mix: np.ndarray) -> dict:
                 primary = inferenced_output
 
             if self.is_primary_stem_main_target:
-                self.logger.debug("Single-target model detected; computing residual secondary stem from original mix")
+                self.logger.debug(
+                    "Single-target model detected; computing residual secondary stem from original mix"
+                )
                 # Ensure shapes match before residual subtraction
                 if primary.shape[1] != orig_mix.shape[1]:
                     primary = spec_utils.match_array_shapes(primary, orig_mix)
diff --git a/audio_separator/separator/architectures/vr_separator.py b/audio_separator/separator/architectures/vr_separator.py
index d00887cc..18062612 100644
--- a/audio_separator/separator/architectures/vr_separator.py
+++ b/audio_separator/separator/architectures/vr_separator.py
@@ -15,7 +15,9 @@
 from audio_separator.separator.uvr_lib_v5 import spec_utils
 from audio_separator.separator.uvr_lib_v5.vr_network import nets
 from audio_separator.separator.uvr_lib_v5.vr_network import nets_new
-from audio_separator.separator.uvr_lib_v5.vr_network.model_param_init import ModelParameters
+from audio_separator.separator.uvr_lib_v5.vr_network.model_param_init import (
+    ModelParameters,
+)
 
 
 class VRSeparator(CommonSeparator):
@@ -44,10 +46,16 @@ def __init__(self, common_config, arch_config: dict):
 
         # Model params are additional technical parameter values from JSON files in separator/uvr_lib_v5/vr_network/modelparams/*.json,
         # with filenames referenced by the model_data["vr_model_param"] value
-        package_root_filepath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-        vr_params_json_dir = os.path.join(package_root_filepath, "uvr_lib_v5", "vr_network", "modelparams")
+        package_root_filepath = os.path.dirname(
+            os.path.dirname(os.path.abspath(__file__))
+        )
+        vr_params_json_dir = os.path.join(
+            package_root_filepath, "uvr_lib_v5", "vr_network", "modelparams"
+        )
         vr_params_json_filename = f"{self.model_data['vr_model_param']}.json"
-        vr_params_json_filepath = os.path.join(vr_params_json_dir, vr_params_json_filename)
+        vr_params_json_filepath = os.path.join(
+            vr_params_json_dir, vr_params_json_filename
+        )
         self.model_params = ModelParameters(vr_params_json_filepath)
 
         self.logger.debug(f"Model params: {self.model_params.param}")
@@ -94,16 +102,30 @@ def __init__(self, common_config, arch_config: dict):
         # - Values beyond 5 might muddy the sound for non-vocal models.
         self.aggression = float(int(arch_config.get("aggression", 5)) / 100)
 
-        self.aggressiveness = {"value": self.aggression, "split_bin": self.model_params.param["band"][1]["crop_stop"], "aggr_correction": self.model_params.param.get("aggr_correction")}
+        self.aggressiveness = {
+            "value": self.aggression,
+            "split_bin": self.model_params.param["band"][1]["crop_stop"],
+            "aggr_correction": self.model_params.param.get("aggr_correction"),
+        }
 
         self.model_samplerate = self.model_params.param["sr"]
 
-        self.logger.debug(f"VR arch params: enable_tta={self.enable_tta}, enable_post_process={self.enable_post_process}, post_process_threshold={self.post_process_threshold}")
-        self.logger.debug(f"VR arch params: batch_size={self.batch_size}, window_size={self.window_size}")
-        self.logger.debug(f"VR arch params: high_end_process={self.high_end_process}, aggression={self.aggression}")
-        self.logger.debug(f"VR arch params: is_vr_51_model={self.is_vr_51_model}, model_samplerate={self.model_samplerate}, model_capacity={self.model_capacity}")
-
-        self.model_run = lambda *args, **kwargs: self.logger.error("Model run method is not initialised yet.")
+        self.logger.debug(
+            f"VR arch params: enable_tta={self.enable_tta}, enable_post_process={self.enable_post_process}, post_process_threshold={self.post_process_threshold}"
+        )
+        self.logger.debug(
+            f"VR arch params: batch_size={self.batch_size}, window_size={self.window_size}"
+        )
+        self.logger.debug(
+            f"VR arch params: high_end_process={self.high_end_process}, aggression={self.aggression}"
+        )
+        self.logger.debug(
+            f"VR arch params: is_vr_51_model={self.is_vr_51_model}, model_samplerate={self.model_samplerate}, model_capacity={self.model_capacity}"
+        )
+
+        self.model_run = lambda *args, **kwargs: self.logger.error(
+            "Model run method is not initialised yet."
+        )
 
         # wav_subtype will be set based on input audio bit depth in prepare_mix()
         # Removed hardcoded "PCM_16" to allow bit depth preservation
@@ -126,15 +148,16 @@ def separate(self, audio_file_path, custom_output_names=None):
         self.secondary_source = None
 
         self.audio_file_path = audio_file_path
-        self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[ 0]
+        self.audio_file_base = os.path.splitext(os.path.basename(audio_file_path))[0]
 
         # Detect input audio bit depth for output preservation
         try:
             import soundfile as sf
+
             info = sf.info(audio_file_path)
             self.input_audio_subtype = info.subtype
             self.logger.info(f"Input audio subtype: {self.input_audio_subtype}")
-            
+
             # Map subtype to wav_subtype for soundfile and set input_bit_depth for pydub
             if "24" in self.input_audio_subtype:
                 self.wav_subtype = "PCM_24"
@@ -149,44 +172,80 @@ def separate(self, audio_file_path, custom_output_names=None):
                 self.input_bit_depth = 16
                 self.logger.info("Detected 16-bit input audio")
         except Exception as e:
-            self.logger.warning(f"Could not detect input audio bit depth: {e}. Defaulting to PCM_16")
+            self.logger.warning(
+                f"Could not detect input audio bit depth: {e}. Defaulting to PCM_16"
+            )
             self.wav_subtype = "PCM_16"
             self.input_audio_subtype = None
             self.input_bit_depth = 16
 
-        self.logger.debug(f"Starting separation for input audio file {self.audio_file_path}...")
-
-        nn_arch_sizes = [31191, 33966, 56817, 123821, 123812, 129605, 218409, 537238, 537227]  # default
+        self.logger.debug(
+            f"Starting separation for input audio file {self.audio_file_path}..."
+        )
+
+        nn_arch_sizes = [
+            31191,
+            33966,
+            56817,
+            123821,
+            123812,
+            129605,
+            218409,
+            537238,
+            537227,
+        ]  # default
         vr_5_1_models = [56817, 218409]
         model_size = math.ceil(os.stat(self.model_path).st_size / 1024)
         nn_arch_size = min(nn_arch_sizes, key=lambda x: abs(x - model_size))
-        self.logger.debug(f"Model size determined: {model_size}, NN architecture size: {nn_arch_size}")
+        self.logger.debug(
+            f"Model size determined: {model_size}, NN architecture size: {nn_arch_size}"
+        )
 
         if nn_arch_size in vr_5_1_models or self.is_vr_51_model:
             self.logger.debug("Using CascadedNet for VR 5.1 model...")
-            self.model_run = nets_new.CascadedNet(self.model_params.param["bins"] * 2, nn_arch_size, nout=self.model_capacity[0], nout_lstm=self.model_capacity[1])
+            self.model_run = nets_new.CascadedNet(
+                self.model_params.param["bins"] * 2,
+                nn_arch_size,
+                nout=self.model_capacity[0],
+                nout_lstm=self.model_capacity[1],
+            )
             self.is_vr_51_model = True
         else:
             self.logger.debug("Determining model capacity...")
-            self.model_run = nets.determine_model_capacity(self.model_params.param["bins"] * 2, nn_arch_size)
+            self.model_run = nets.determine_model_capacity(
+                self.model_params.param["bins"] * 2, nn_arch_size
+            )
+
+        # VR models use PyTorch directly - STFT now handles ROCm via CPU offload
+        inference_device = self.torch_device
 
         self.model_run.load_state_dict(torch.load(self.model_path, map_location="cpu"))
-        self.model_run.to(self.torch_device)
+        self.model_run.to(inference_device)
         self.logger.debug("Model loaded and moved to device.")
 
-        y_spec, v_spec = self.inference_vr(self.loading_mix(), self.torch_device, self.aggressiveness)
+        y_spec, v_spec = self.inference_vr(
+            self.loading_mix(), inference_device, self.aggressiveness
+        )
         self.logger.debug("Inference completed.")
 
         # Sanitize y_spec and v_spec to replace NaN and infinite values
         y_spec = np.nan_to_num(y_spec, nan=0.0, posinf=0.0, neginf=0.0)
         v_spec = np.nan_to_num(v_spec, nan=0.0, posinf=0.0, neginf=0.0)
 
-        self.logger.debug("Sanitization completed. Replaced NaN and infinite values in y_spec and v_spec.")
+        self.logger.debug(
+            "Sanitization completed. Replaced NaN and infinite values in y_spec and v_spec."
+        )
 
         # After inference_vr call
-        self.logger.debug(f"Inference VR completed. y_spec shape: {y_spec.shape}, v_spec shape: {v_spec.shape}")
-        self.logger.debug(f"y_spec stats - min: {np.min(y_spec)}, max: {np.max(y_spec)}, isnan: {np.isnan(y_spec).any()}, isinf: {np.isinf(y_spec).any()}")
-        self.logger.debug(f"v_spec stats - min: {np.min(v_spec)}, max: {np.max(v_spec)}, isnan: {np.isnan(v_spec).any()}, isinf: {np.isinf(v_spec).any()}")
+        self.logger.debug(
+            f"Inference VR completed. y_spec shape: {y_spec.shape}, v_spec shape: {v_spec.shape}"
+        )
+        self.logger.debug(
+            f"y_spec stats - min: {np.min(y_spec)}, max: {np.max(y_spec)}, isnan: {np.isnan(y_spec).any()}, isinf: {np.isinf(y_spec).any()}"
+        )
+        self.logger.debug(
+            f"v_spec stats - min: {np.min(v_spec)}, max: {np.max(v_spec)}, isnan: {np.isnan(v_spec).any()}, isinf: {np.isinf(v_spec).any()}"
+        )
 
         # Not yet implemented from UVR features:
         #
@@ -205,45 +264,86 @@ def separate(self, audio_file_path, custom_output_names=None):
 
         # Note: logic similar to the following should probably be added to the other architectures
         # Check if output_single_stem is set to a value that would result in no output files
-        if self.output_single_stem and (self.output_single_stem.lower() != self.primary_stem_name.lower() and self.output_single_stem.lower() != self.secondary_stem_name.lower()):
+        if self.output_single_stem and (
+            self.output_single_stem.lower() != self.primary_stem_name.lower()
+            and self.output_single_stem.lower() != self.secondary_stem_name.lower()
+        ):
             # If so, reset output_single_stem to None to save both stems
             self.output_single_stem = None
-            self.logger.warning(f"The output_single_stem setting '{self.output_single_stem}' does not match any of the output files: '{self.primary_stem_name}' and '{self.secondary_stem_name}'. For this model '{self.model_name}', the output_single_stem setting will be ignored and all output files will be saved.")
+            self.logger.warning(
+                f"The output_single_stem setting '{self.output_single_stem}' does not match any of the output files: '{self.primary_stem_name}' and '{self.secondary_stem_name}'. For this model '{self.model_name}', the output_single_stem setting will be ignored and all output files will be saved."
+            )
 
         # Save and process the primary stem if needed
-        if not self.output_single_stem or self.output_single_stem.lower() == self.primary_stem_name.lower():
+        if (
+            not self.output_single_stem
+            or self.output_single_stem.lower() == self.primary_stem_name.lower()
+        ):
             self.logger.debug(f"Processing primary stem: {self.primary_stem_name}")
             if not isinstance(self.primary_source, np.ndarray):
-                self.logger.debug(f"Preparing to convert spectrogram to waveform. Spec shape: {y_spec.shape}")
+                self.logger.debug(
+                    f"Preparing to convert spectrogram to waveform. Spec shape: {y_spec.shape}"
+                )
 
                 self.primary_source = self.spec_to_wav(y_spec).T
                 self.logger.debug("Converting primary source spectrogram to waveform.")
                 if not self.model_samplerate == 44100:
-                    self.primary_source = librosa.resample(self.primary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T
+                    self.primary_source = librosa.resample(
+                        self.primary_source.T,
+                        orig_sr=self.model_samplerate,
+                        target_sr=44100,
+                    ).T
                     self.logger.debug("Resampling primary source to 44100Hz.")
 
-            self.primary_stem_output_path = self.get_stem_output_path(self.primary_stem_name, custom_output_names)
-
-            self.logger.info(f"Saving {self.primary_stem_name} stem to {self.primary_stem_output_path}...")
-            self.final_process(self.primary_stem_output_path, self.primary_source, self.primary_stem_name)
+            self.primary_stem_output_path = self.get_stem_output_path(
+                self.primary_stem_name, custom_output_names
+            )
+
+            self.logger.info(
+                f"Saving {self.primary_stem_name} stem to {self.primary_stem_output_path}..."
+            )
+            self.final_process(
+                self.primary_stem_output_path,
+                self.primary_source,
+                self.primary_stem_name,
+            )
             output_files.append(self.primary_stem_output_path)
 
         # Save and process the secondary stem if needed
-        if not self.output_single_stem or self.output_single_stem.lower() == self.secondary_stem_name.lower():
+        if (
+            not self.output_single_stem
+            or self.output_single_stem.lower() == self.secondary_stem_name.lower()
+        ):
             self.logger.debug(f"Processing secondary stem: {self.secondary_stem_name}")
             if not isinstance(self.secondary_source, np.ndarray):
-                self.logger.debug(f"Preparing to convert spectrogram to waveform. Spec shape: {v_spec.shape}")
+                self.logger.debug(
+                    f"Preparing to convert spectrogram to waveform. Spec shape: {v_spec.shape}"
+                )
 
                 self.secondary_source = self.spec_to_wav(v_spec).T
-                self.logger.debug("Converting secondary source spectrogram to waveform.")
+                self.logger.debug(
+                    "Converting secondary source spectrogram to waveform."
+                )
                 if not self.model_samplerate == 44100:
-                    self.secondary_source = librosa.resample(self.secondary_source.T, orig_sr=self.model_samplerate, target_sr=44100).T
+                    self.secondary_source = librosa.resample(
+                        self.secondary_source.T,
+                        orig_sr=self.model_samplerate,
+                        target_sr=44100,
+                    ).T
                     self.logger.debug("Resampling secondary source to 44100Hz.")
 
-            self.secondary_stem_output_path = self.get_stem_output_path(self.secondary_stem_name, custom_output_names)
-
-            self.logger.info(f"Saving {self.secondary_stem_name} stem to {self.secondary_stem_output_path}...")
-            self.final_process(self.secondary_stem_output_path, self.secondary_source, self.secondary_stem_name)
+            self.secondary_stem_output_path = self.get_stem_output_path(
+                self.secondary_stem_name, custom_output_names
+            )
+
+            self.logger.info(
+                f"Saving {self.secondary_stem_name} stem to {self.secondary_stem_output_path}..."
+            )
+            self.final_process(
+                self.secondary_stem_output_path,
+                self.secondary_source,
+                self.secondary_stem_name,
+            )
             output_files.append(self.secondary_stem_output_path)
 
         # Not yet implemented from UVR features:
@@ -257,7 +357,9 @@ def loading_mix(self):
 
         bands_n = len(self.model_params.param["band"])
 
-        audio_file = spec_utils.write_array_to_mem(self.audio_file_path, subtype=self.wav_subtype)
+        audio_file = spec_utils.write_array_to_mem(
+            self.audio_file_path, subtype=self.wav_subtype
+        )
         is_mp3 = audio_file.endswith(".mp3") if isinstance(audio_file, str) else False
 
         self.logger.debug(f"loading_mix iteraring through {bands_n} bands")
@@ -270,8 +372,21 @@ def loading_mix(self):
                 wav_resolution = "polyphase"
 
             if d == bands_n:  # high-end band
-                X_wave[d], _ = librosa.load(audio_file, sr=bp["sr"], mono=False, dtype=np.float32, res_type=wav_resolution)
-                X_spec_s[d] = spec_utils.wave_to_spectrogram(X_wave[d], bp["hl"], bp["n_fft"], self.model_params, band=d, is_v51_model=self.is_vr_51_model)
+                X_wave[d], _ = librosa.load(
+                    audio_file,
+                    sr=bp["sr"],
+                    mono=False,
+                    dtype=np.float32,
+                    res_type=wav_resolution,
+                )
+                X_spec_s[d] = spec_utils.wave_to_spectrogram(
+                    X_wave[d],
+                    bp["hl"],
+                    bp["n_fft"],
+                    self.model_params,
+                    band=d,
+                    is_v51_model=self.is_vr_51_model,
+                )
 
                 if not np.any(X_wave[d]) and is_mp3:
                     X_wave[d] = rerun_mp3(audio_file, bp["sr"])
@@ -279,14 +394,33 @@ def loading_mix(self):
                 if X_wave[d].ndim == 1:
                     X_wave[d] = np.asarray([X_wave[d], X_wave[d]])
             else:  # lower bands
-                X_wave[d] = librosa.resample(X_wave[d + 1], orig_sr=self.model_params.param["band"][d + 1]["sr"], target_sr=bp["sr"], res_type=wav_resolution)
-                X_spec_s[d] = spec_utils.wave_to_spectrogram(X_wave[d], bp["hl"], bp["n_fft"], self.model_params, band=d, is_v51_model=self.is_vr_51_model)
+                X_wave[d] = librosa.resample(
+                    X_wave[d + 1],
+                    orig_sr=self.model_params.param["band"][d + 1]["sr"],
+                    target_sr=bp["sr"],
+                    res_type=wav_resolution,
+                )
+                X_spec_s[d] = spec_utils.wave_to_spectrogram(
+                    X_wave[d],
+                    bp["hl"],
+                    bp["n_fft"],
+                    self.model_params,
+                    band=d,
+                    is_v51_model=self.is_vr_51_model,
+                )
 
             if d == bands_n and self.high_end_process:
-                self.input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (self.model_params.param["pre_filter_stop"] - self.model_params.param["pre_filter_start"])
-                self.input_high_end = X_spec_s[d][:, bp["n_fft"] // 2 - self.input_high_end_h : bp["n_fft"] // 2, :]
-
-        X_spec = spec_utils.combine_spectrograms(X_spec_s, self.model_params, is_v51_model=self.is_vr_51_model)
+                self.input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
+                    self.model_params.param["pre_filter_stop"]
+                    - self.model_params.param["pre_filter_start"]
+                )
+                self.input_high_end = X_spec_s[d][
+                    :, bp["n_fft"] // 2 - self.input_high_end_h : bp["n_fft"] // 2, :
+                ]
+
+        X_spec = spec_utils.combine_spectrograms(
+            X_spec_s, self.model_params, is_v51_model=self.is_vr_51_model
+        )
 
         del X_wave, X_spec_s, audio_file
 
@@ -297,14 +431,22 @@ def _execute(X_mag_pad, roi_size):
             X_dataset = []
             patches = (X_mag_pad.shape[2] - 2 * self.model_run.offset) // roi_size
 
-            self.logger.debug(f"inference_vr appending to X_dataset for each of {patches} patches")
+            self.logger.debug(
+                f"inference_vr appending to X_dataset for each of {patches} patches"
+            )
             for i in tqdm(range(patches)):
                 start = i * roi_size
                 X_mag_window = X_mag_pad[:, :, start : start + self.window_size]
                 X_dataset.append(X_mag_window)
 
-            total_iterations = patches // self.batch_size if not self.enable_tta else (patches // self.batch_size) * 2
-            self.logger.debug(f"inference_vr iterating through {total_iterations} batches, batch_size = {self.batch_size}")
+            total_iterations = (
+                patches // self.batch_size
+                if not self.enable_tta
+                else (patches // self.batch_size) * 2
+            )
+            self.logger.debug(
+                f"inference_vr iterating through {total_iterations} batches, batch_size = {self.batch_size}"
+            )
 
             X_dataset = np.asarray(X_dataset)
             self.model_run.eval()
@@ -312,17 +454,20 @@ def _execute(X_mag_pad, roi_size):
                 mask = []
 
                 for i in tqdm(range(0, patches, self.batch_size)):
-
                     X_batch = X_dataset[i : i + self.batch_size]
                     X_batch = torch.from_numpy(X_batch).to(device)
                     pred = self.model_run.predict_mask(X_batch)
                     if not pred.size()[3] > 0:
-                        raise ValueError(f"Window size error: h1_shape[3] must be greater than h2_shape[3]")
+                        raise ValueError(
+                            f"Window size error: h1_shape[3] must be greater than h2_shape[3]"
+                        )
                     pred = pred.detach().cpu().numpy()
                     pred = np.concatenate(pred, axis=2)
                     mask.append(pred)
                 if len(mask) == 0:
-                    raise ValueError(f"Window size error: h1_shape[3] must be greater than h2_shape[3]")
+                    raise ValueError(
+                        f"Window size error: h1_shape[3] must be greater than h2_shape[3]"
+                    )
 
                 mask = np.concatenate(mask, axis=2)
             return mask
@@ -336,7 +481,9 @@ def postprocess(mask, X_mag, X_phase):
             mask = spec_utils.adjust_aggr(mask, is_non_accom_stem, aggressiveness)
 
             if self.enable_post_process:
-                mask = spec_utils.merge_artifacts(mask, thres=self.post_process_threshold)
+                mask = spec_utils.merge_artifacts(
+                    mask, thres=self.post_process_threshold
+                )
 
             y_spec = mask * X_mag * np.exp(1.0j * X_phase)
             v_spec = (1 - mask) * X_mag * np.exp(1.0j * X_phase)
@@ -345,7 +492,9 @@ def postprocess(mask, X_mag, X_phase):
 
         X_mag, X_phase = spec_utils.preprocess(X_spec)
         n_frame = X_mag.shape[2]
-        pad_l, pad_r, roi_size = spec_utils.make_padding(n_frame, self.window_size, self.model_run.offset)
+        pad_l, pad_r, roi_size = spec_utils.make_padding(
+            n_frame, self.window_size, self.model_run.offset
+        )
         X_mag_pad = np.pad(X_mag, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
         X_mag_pad /= X_mag_pad.max()
         mask = _execute(X_mag_pad, roi_size)
@@ -366,11 +515,25 @@ def postprocess(mask, X_mag, X_phase):
         return y_spec, v_spec
 
     def spec_to_wav(self, spec):
-        if self.high_end_process and isinstance(self.input_high_end, np.ndarray) and self.input_high_end_h:
-            input_high_end_ = spec_utils.mirroring("mirroring", spec, self.input_high_end, self.model_params)
-            wav = spec_utils.cmb_spectrogram_to_wave(spec, self.model_params, self.input_high_end_h, input_high_end_, is_v51_model=self.is_vr_51_model)
+        if (
+            self.high_end_process
+            and isinstance(self.input_high_end, np.ndarray)
+            and self.input_high_end_h
+        ):
+            input_high_end_ = spec_utils.mirroring(
+                "mirroring", spec, self.input_high_end, self.model_params
+            )
+            wav = spec_utils.cmb_spectrogram_to_wave(
+                spec,
+                self.model_params,
+                self.input_high_end_h,
+                input_high_end_,
+                is_v51_model=self.is_vr_51_model,
+            )
         else:
-            wav = spec_utils.cmb_spectrogram_to_wave(spec, self.model_params, is_v51_model=self.is_vr_51_model)
+            wav = spec_utils.cmb_spectrogram_to_wave(
+                spec, self.model_params, is_v51_model=self.is_vr_51_model
+            )
 
         return wav
 
@@ -380,4 +543,6 @@ def rerun_mp3(audio_file, sample_rate=44100):
     with audioread.audio_open(audio_file) as f:
         track_length = int(f.duration)
 
-    return librosa.load(audio_file, duration=track_length, mono=False, sr=sample_rate)[0]
+    return librosa.load(audio_file, duration=track_length, mono=False, sr=sample_rate)[
+        0
+    ]
diff --git a/audio_separator/separator/common_separator.py b/audio_separator/separator/common_separator.py
index 34435ea7..66807c5b 100644
--- a/audio_separator/separator/common_separator.py
+++ b/audio_separator/separator/common_separator.py
@@ -1,4 +1,4 @@
-""" This file contains the CommonSeparator class, common to all architecture-specific Separator classes. """
+"""This file contains the CommonSeparator class, common to all architecture-specific Separator classes."""
 
 from logging import Logger
 import os
@@ -50,9 +50,27 @@ class CommonSeparator:
     BV_VOCAL_STEM_LABEL = "Backing Vocals"
     NO_STEM = "No "
 
-    STEM_PAIR_MAPPER = {VOCAL_STEM: INST_STEM, INST_STEM: VOCAL_STEM, LEAD_VOCAL_STEM: BV_VOCAL_STEM, BV_VOCAL_STEM: LEAD_VOCAL_STEM, PRIMARY_STEM: SECONDARY_STEM}
-
-    NON_ACCOM_STEMS = (VOCAL_STEM, OTHER_STEM, BASS_STEM, DRUM_STEM, GUITAR_STEM, PIANO_STEM, SYNTH_STEM, STRINGS_STEM, WOODWINDS_STEM, BRASS_STEM, WIND_INST_STEM)
+    STEM_PAIR_MAPPER = {
+        VOCAL_STEM: INST_STEM,
+        INST_STEM: VOCAL_STEM,
+        LEAD_VOCAL_STEM: BV_VOCAL_STEM,
+        BV_VOCAL_STEM: LEAD_VOCAL_STEM,
+        PRIMARY_STEM: SECONDARY_STEM,
+    }
+
+    NON_ACCOM_STEMS = (
+        VOCAL_STEM,
+        OTHER_STEM,
+        BASS_STEM,
+        DRUM_STEM,
+        GUITAR_STEM,
+        PIANO_STEM,
+        SYNTH_STEM,
+        STRINGS_STEM,
+        WOODWINDS_STEM,
+        BRASS_STEM,
+        WIND_INST_STEM,
+    )
 
     def __init__(self, config):
 
@@ -63,6 +81,7 @@ def __init__(self, config):
         self.torch_device = config.get("torch_device")
         self.torch_device_cpu = config.get("torch_device_cpu")
         self.torch_device_mps = config.get("torch_device_mps")
+        self.is_rocm = config.get("is_rocm", False)
         self.onnx_execution_provider = config.get("onnx_execution_provider")
 
         # Model data
@@ -83,7 +102,7 @@ def __init__(self, config):
         self.invert_using_spec = config.get("invert_using_spec")
         self.sample_rate = config.get("sample_rate")
         self.use_soundfile = config.get("use_soundfile")
-        
+
         # Roformer-specific loading support
         self.roformer_loader = None
         self.is_roformer_model = self._detect_roformer_model()
@@ -95,12 +114,15 @@ def __init__(self, config):
         # Check if model_data has a "training" key with "instruments" list
         self.primary_stem_name = None
         self.secondary_stem_name = None
-        
+
         # Audio bit depth tracking for preserving input quality
         self.input_bit_depth = None
         self.input_subtype = None
 
-        if "training" in self.model_data and "instruments" in self.model_data["training"]:
+        if (
+            "training" in self.model_data
+            and "instruments" in self.model_data["training"]
+        ):
             instruments = self.model_data["training"]["instruments"]
             if instruments:
                 target_instrument = self.model_data["training"].get("target_instrument")
@@ -124,14 +146,28 @@ def __init__(self, config):
         self.is_bv_model = self.model_data.get("is_bv_model", False)
         self.bv_model_rebalance = self.model_data.get("is_bv_model_rebalanced", 0)
 
-        self.logger.debug(f"Common params: model_name={self.model_name}, model_path={self.model_path}")
-        self.logger.debug(f"Common params: output_dir={self.output_dir}, output_format={self.output_format}")
-        self.logger.debug(f"Common params: normalization_threshold={self.normalization_threshold}, amplification_threshold={self.amplification_threshold}")
-        self.logger.debug(f"Common params: enable_denoise={self.enable_denoise}, output_single_stem={self.output_single_stem}")
-        self.logger.debug(f"Common params: invert_using_spec={self.invert_using_spec}, sample_rate={self.sample_rate}")
-
-        self.logger.debug(f"Common params: primary_stem_name={self.primary_stem_name}, secondary_stem_name={self.secondary_stem_name}")
-        self.logger.debug(f"Common params: is_karaoke={self.is_karaoke}, is_bv_model={self.is_bv_model}, bv_model_rebalance={self.bv_model_rebalance}")
+        self.logger.debug(
+            f"Common params: model_name={self.model_name}, model_path={self.model_path}"
+        )
+        self.logger.debug(
+            f"Common params: output_dir={self.output_dir}, output_format={self.output_format}"
+        )
+        self.logger.debug(
+            f"Common params: normalization_threshold={self.normalization_threshold}, amplification_threshold={self.amplification_threshold}"
+        )
+        self.logger.debug(
+            f"Common params: enable_denoise={self.enable_denoise}, output_single_stem={self.output_single_stem}"
+        )
+        self.logger.debug(
+            f"Common params: invert_using_spec={self.invert_using_spec}, sample_rate={self.sample_rate}"
+        )
+
+        self.logger.debug(
+            f"Common params: primary_stem_name={self.primary_stem_name}, secondary_stem_name={self.secondary_stem_name}"
+        )
+        self.logger.debug(
+            f"Common params: is_karaoke={self.is_karaoke}, is_bv_model={self.is_bv_model}, bv_model_rebalance={self.bv_model_rebalance}"
+        )
 
         # File-specific variables which need to be cleared between processing different audio inputs
         self.audio_file_path = None
@@ -152,7 +188,11 @@ def secondary_stem(self, primary_stem: str):
         if primary_stem in self.STEM_PAIR_MAPPER:
             secondary_stem = self.STEM_PAIR_MAPPER[primary_stem]
         else:
-            secondary_stem = primary_stem.replace(self.NO_STEM, "") if self.NO_STEM in primary_stem else f"{self.NO_STEM}{primary_stem}"
+            secondary_stem = (
+                primary_stem.replace(self.NO_STEM, "")
+                if self.NO_STEM in primary_stem
+                else f"{self.NO_STEM}{primary_stem}"
+            )
 
         return secondary_stem
 
@@ -166,7 +206,9 @@ def final_process(self, stem_path, source, stem_name):
         """
         Finalizes the processing of a stem by writing the audio to a file and returning the processed source.
         """
-        self.logger.debug(f"Finalizing {stem_name} stem processing and writing audio...")
+        self.logger.debug(
+            f"Finalizing {stem_name} stem processing and writing audio..."
+        )
         self.write_audio(stem_path, source)
 
         return {stem_name: source}
@@ -212,7 +254,10 @@ def cached_model_source_holder(self, model_architecture, sources, model_name=Non
         Update the dictionary for the given model_architecture with the new model name and its sources.
         Use the model_architecture as a key to access the corresponding cache source mapper dictionary.
         """
-        self.cached_sources_map[model_architecture] = {**self.cached_sources_map.get(model_architecture, {}), **{model_name: sources}}
+        self.cached_sources_map[model_architecture] = {
+            **self.cached_sources_map.get(model_architecture, {}),
+            **{model_name: sources},
+        }
 
     def prepare_mix(self, mix):
         """
@@ -225,40 +270,52 @@ def prepare_mix(self, mix):
         # Check if the input is a file path (string) and needs to be loaded
         if not isinstance(mix, np.ndarray):
             self.logger.debug(f"Loading audio from file: {mix}")
-            
+
             # Get audio file info to capture bit depth before loading
             try:
                 audio_info = sf.info(mix)
                 self.input_subtype = audio_info.subtype
                 self.logger.info(f"Input audio subtype: {self.input_subtype}")
-                
+
                 # Map subtype to bit depth
-                if 'PCM_16' in self.input_subtype or self.input_subtype == 'PCM_S8':
+                if "PCM_16" in self.input_subtype or self.input_subtype == "PCM_S8":
                     self.input_bit_depth = 16
-                elif 'PCM_24' in self.input_subtype:
+                elif "PCM_24" in self.input_subtype:
                     self.input_bit_depth = 24
-                elif 'PCM_32' in self.input_subtype or 'FLOAT' in self.input_subtype or 'DOUBLE' in self.input_subtype:
+                elif (
+                    "PCM_32" in self.input_subtype
+                    or "FLOAT" in self.input_subtype
+                    or "DOUBLE" in self.input_subtype
+                ):
                     self.input_bit_depth = 32
                 else:
                     # Default to 16-bit for unknown formats
                     self.input_bit_depth = 16
-                    self.logger.warning(f"Unknown audio subtype {self.input_subtype}, defaulting to 16-bit output")
-                
-                self.logger.info(f"Detected input bit depth: {self.input_bit_depth}-bit")
+                    self.logger.warning(
+                        f"Unknown audio subtype {self.input_subtype}, defaulting to 16-bit output"
+                    )
+
+                self.logger.info(
+                    f"Detected input bit depth: {self.input_bit_depth}-bit"
+                )
             except Exception as e:
-                self.logger.warning(f"Could not read audio file info, defaulting to 16-bit output: {e}")
+                self.logger.warning(
+                    f"Could not read audio file info, defaulting to 16-bit output: {e}"
+                )
                 self.input_bit_depth = 16
-                self.input_subtype = 'PCM_16'
-            
+                self.input_subtype = "PCM_16"
+
             mix, sr = librosa.load(mix, mono=False, sr=self.sample_rate)
-            self.logger.debug(f"Audio loaded. Sample rate: {sr}, Audio shape: {mix.shape}")
+            self.logger.debug(
+                f"Audio loaded. Sample rate: {sr}, Audio shape: {mix.shape}"
+            )
         else:
             # Transpose the mix if it's already an ndarray (expected shape: [channels, samples])
             self.logger.debug("Transposing the provided mix array.")
             # Default to 16-bit if numpy array provided directly
             if self.input_bit_depth is None:
                 self.input_bit_depth = 16
-                self.input_subtype = 'PCM_16'
+                self.input_subtype = "PCM_16"
             mix = mix.T
             self.logger.debug(f"Transposed mix shape: {mix.shape}")
 
@@ -291,7 +348,9 @@ def write_audio(self, stem_path: str, stem_source):
         # Get the duration of the input audio file
         duration_seconds = librosa.get_duration(filename=self.audio_file_path)
         duration_hours = duration_seconds / 3600
-        self.logger.info(f"Audio duration is {duration_hours:.2f} hours ({duration_seconds:.2f} seconds).")
+        self.logger.info(
+            f"Audio duration is {duration_hours:.2f} hours ({duration_seconds:.2f} seconds)."
+        )
 
         if self.use_soundfile:
             self.logger.warning(f"Using soundfile for writing.")
@@ -306,7 +365,11 @@ def write_audio_pydub(self, stem_path: str, stem_source):
         """
         self.logger.debug(f"Entering write_audio_pydub with stem_path: {stem_path}")
 
-        stem_source = spec_utils.normalize(wave=stem_source, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold)
+        stem_source = spec_utils.normalize(
+            wave=stem_source,
+            max_peak=self.normalization_threshold,
+            min_peak=self.amplification_threshold,
+        )
 
         # Check if the numpy array is empty or contains very low values
         if np.max(np.abs(stem_source)) < 1e-6:
@@ -322,7 +385,9 @@ def write_audio_pydub(self, stem_path: str, stem_source):
         self.logger.debug(f"Data type before conversion: {stem_source.dtype}")
 
         # Determine bit depth for output (use input bit depth if available, otherwise default to 16)
-        output_bit_depth = self.input_bit_depth if self.input_bit_depth is not None else 16
+        output_bit_depth = (
+            self.input_bit_depth if self.input_bit_depth is not None else 16
+        )
         self.logger.info(f"Writing output with {output_bit_depth}-bit depth")
 
         # For pydub, we always convert to int16 for the AudioSegment creation
@@ -336,11 +401,18 @@ def write_audio_pydub(self, stem_path: str, stem_source):
         stem_source_interleaved[0::2] = stem_source[:, 0]  # Left channel
         stem_source_interleaved[1::2] = stem_source[:, 1]  # Right channel
 
-        self.logger.debug(f"Interleaved audio data shape: {stem_source_interleaved.shape}")
+        self.logger.debug(
+            f"Interleaved audio data shape: {stem_source_interleaved.shape}"
+        )
 
         # Create a pydub AudioSegment (always from 16-bit data)
         try:
-            audio_segment = AudioSegment(stem_source_interleaved.tobytes(), frame_rate=self.sample_rate, sample_width=2, channels=2)
+            audio_segment = AudioSegment(
+                stem_source_interleaved.tobytes(),
+                frame_rate=self.sample_rate,
+                sample_width=2,
+                channels=2,
+            )
             self.logger.debug("Created AudioSegment successfully.")
         except (IOError, ValueError) as e:
             self.logger.error(f"Specific error creating AudioSegment: {e}")
@@ -356,16 +428,20 @@ def write_audio_pydub(self, stem_path: str, stem_source):
             file_format = "matroska"
 
         # Set the bitrate to 320k for mp3 files if output_bitrate is not specified
-        bitrate = "320k" if file_format == "mp3" and self.output_bitrate is None else self.output_bitrate
+        bitrate = (
+            "320k"
+            if file_format == "mp3" and self.output_bitrate is None
+            else self.output_bitrate
+        )
 
         # Export using the determined format
         try:
             # Pass codec parameters to ffmpeg to enforce bit depth for lossless formats
             export_params = {"format": file_format}
-            
+
             if bitrate:
                 export_params["bitrate"] = bitrate
-            
+
             # For lossless formats (WAV/FLAC), specify the codec parameters to enforce bit depth
             if file_format in ["wav", "flac"]:
                 if output_bit_depth == 16:
@@ -382,9 +458,11 @@ def write_audio_pydub(self, stem_path: str, stem_source):
                     export_params["parameters"] = ["-sample_fmt", "s32"]
                     if file_format == "wav":
                         export_params["codec"] = "pcm_s32le"
-            
+
             audio_segment.export(stem_path, **export_params)
-            self.logger.debug(f"Exported audio file successfully to {stem_path} with {output_bit_depth}-bit depth")
+            self.logger.debug(
+                f"Exported audio file successfully to {stem_path} with {output_bit_depth}-bit depth"
+            )
         except (IOError, ValueError) as e:
             self.logger.error(f"Error exporting audio file: {e}")
 
@@ -394,7 +472,11 @@ def write_audio_soundfile(self, stem_path: str, stem_source):
         """
         self.logger.debug(f"Entering write_audio_soundfile with stem_path: {stem_path}")
 
-        stem_source = spec_utils.normalize(wave=stem_source, max_peak=self.normalization_threshold, min_peak=self.amplification_threshold)
+        stem_source = spec_utils.normalize(
+            wave=stem_source,
+            max_peak=self.normalization_threshold,
+            min_peak=self.amplification_threshold,
+        )
 
         # Check if the numpy array is empty or contains very low values
         if np.max(np.abs(stem_source)) < 1e-6:
@@ -414,17 +496,19 @@ def write_audio_soundfile(self, stem_path: str, stem_source):
         elif self.input_bit_depth:
             # Map bit depth to subtype
             if self.input_bit_depth == 16:
-                output_subtype = 'PCM_16'
+                output_subtype = "PCM_16"
             elif self.input_bit_depth == 24:
-                output_subtype = 'PCM_24'
+                output_subtype = "PCM_24"
             elif self.input_bit_depth == 32:
-                output_subtype = 'PCM_32'
+                output_subtype = "PCM_32"
             else:
-                output_subtype = 'PCM_16'  # Default fallback
-            self.logger.info(f"Using output subtype based on bit depth: {output_subtype}")
+                output_subtype = "PCM_16"  # Default fallback
+            self.logger.info(
+                f"Using output subtype based on bit depth: {output_subtype}"
+            )
         else:
             # Default to PCM_16 if no bit depth info available
-            output_subtype = 'PCM_16'
+            output_subtype = "PCM_16"
             self.logger.warning("No bit depth info available, defaulting to PCM_16")
 
         # Correctly interleave stereo channels if needed
@@ -446,7 +530,9 @@ def write_audio_soundfile(self, stem_path: str, stem_source):
         try:
             # Specify the subtype to match input bit depth
             sf.write(stem_path, stem_source, self.sample_rate, subtype=output_subtype)
-            self.logger.debug(f"Exported audio file successfully to {stem_path} with subtype {output_subtype}")
+            self.logger.debug(
+                f"Exported audio file successfully to {stem_path} with subtype {output_subtype}"
+            )
         except Exception as e:
             self.logger.error(f"Error exporting audio file: {e}")
 
@@ -482,9 +568,9 @@ def sanitize_filename(self, filename):
         """
         Cleans the filename by replacing invalid characters with underscores.
         """
-        sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename)
-        sanitized = re.sub(r'_+', '_', sanitized)
-        sanitized = sanitized.strip('_. ')
+        sanitized = re.sub(r'[<>:"/\\|?*]', "_", filename)
+        sanitized = re.sub(r"_+", "_", sanitized)
+        sanitized = sanitized.strip("_. ")
         return sanitized
 
     def get_stem_output_path(self, stem_name, custom_output_names):
@@ -493,74 +579,81 @@ def get_stem_output_path(self, stem_name, custom_output_names):
         """
         # Convert custom_output_names keys to lowercase for case-insensitive comparison
         if custom_output_names:
-            custom_output_names_lower = {k.lower(): v for k, v in custom_output_names.items()}
+            custom_output_names_lower = {
+                k.lower(): v for k, v in custom_output_names.items()
+            }
             stem_name_lower = stem_name.lower()
             if stem_name_lower in custom_output_names_lower:
-                sanitized_custom_name = self.sanitize_filename(custom_output_names_lower[stem_name_lower])
-                return os.path.join(f"{sanitized_custom_name}.{self.output_format.lower()}")
+                sanitized_custom_name = self.sanitize_filename(
+                    custom_output_names_lower[stem_name_lower]
+                )
+                return os.path.join(
+                    f"{sanitized_custom_name}.{self.output_format.lower()}"
+                )
 
         sanitized_audio_base = self.sanitize_filename(self.audio_file_base)
         sanitized_stem_name = self.sanitize_filename(stem_name)
-        sanitized_model_name = self.sanitize_filename(self.model_name) 
+        sanitized_model_name = self.sanitize_filename(self.model_name)
 
         filename = f"{sanitized_audio_base}_({sanitized_stem_name})_{sanitized_model_name}.{self.output_format.lower()}"
         return os.path.join(filename)
-    
+
     def _detect_roformer_model(self):
         """
         Detect if the current model is a Roformer model.
-        
+
         Returns:
             bool: True if this is a Roformer model, False otherwise
         """
         if not self.model_data:
             return False
-            
+
         # Check for explicit Roformer flag
         if self.model_data.get("is_roformer", False):
             return True
-            
+
         # Check model path for Roformer indicators
         if self.model_path and "roformer" in self.model_path.lower():
             return True
-            
+
         # Check model name for Roformer indicators
         if self.model_name and "roformer" in self.model_name.lower():
             return True
-            
+
         return False
-    
+
     def _initialize_roformer_loader(self):
         """
         Initialize the Roformer loader for this model.
         """
         try:
             from .roformer.roformer_loader import RoformerLoader
+
             self.roformer_loader = RoformerLoader()
             self.logger.debug("Initialized Roformer loader for CommonSeparator")
         except ImportError as e:
             self.logger.warning(f"Could not import RoformerLoader: {e}")
             self.roformer_loader = None
-    
+
     def get_roformer_loading_stats(self):
         """
         Get Roformer loading statistics if available.
-        
+
         Returns:
             dict: Loading statistics or empty dict if not available
         """
         if self.roformer_loader:
             return self.roformer_loader.get_loading_stats()
         return {}
-    
+
     def validate_roformer_config(self, config, model_type):
         """
         Validate Roformer configuration if loader is available.
-        
+
         Args:
             config: Configuration dictionary to validate
             model_type: Type of model to validate for
-            
+
         Returns:
             bool: True if valid or validation not available, False if invalid
         """
diff --git a/audio_separator/separator/roformer/roformer_loader.py b/audio_separator/separator/roformer/roformer_loader.py
index 1df1c7b8..6d861e60 100644
--- a/audio_separator/separator/roformer/roformer_loader.py
+++ b/audio_separator/separator/roformer/roformer_loader.py
@@ -1,4 +1,5 @@
 """Roformer model loader with simplified new-implementation only path."""
+
 from typing import Dict, Any
 import logging
 import os
@@ -15,15 +16,11 @@ class RoformerLoader:
 
     def __init__(self):
         self.config_normalizer = ConfigurationNormalizer()
-        self._loading_stats = {
-            'new_implementation_success': 0,
-            'total_failures': 0
-        }
+        self._loading_stats = {"new_implementation_success": 0, "total_failures": 0}
 
-    def load_model(self,
-                   model_path: str,
-                   config: Dict[str, Any],
-                   device: str = 'cpu') -> ModelLoadingResult:
+    def load_model(
+        self, model_path: str, config: Dict[str, Any], device: str = "cpu"
+    ) -> ModelLoadingResult:
         logger.info(f"Loading Roformer model from {model_path}")
         try:
             normalized_config = self.config_normalizer.normalize_from_file_path(
@@ -42,8 +39,10 @@ def load_model(self,
             result = self._load_with_new_implementation(
                 model_path, normalized_config, model_type, device
             )
-            self._loading_stats['new_implementation_success'] += 1
-            logger.info(f"Successfully loaded {model_type} model with new implementation")
+            self._loading_stats["new_implementation_success"] += 1
+            logger.info(
+                f"Successfully loaded {model_type} model with new implementation"
+            )
             return result
         except (RuntimeError, ValueError, TypeError) as e:
             logger.error(f"New implementation failed: {e}")
@@ -53,13 +52,15 @@ def load_model(self,
                     model_path=model_path,
                     original_config=config,
                     device=device,
-                    original_error=str(e)
+                    original_error=str(e),
+                )
+                logger.warning(
+                    "Fell back to legacy Roformer implementation successfully"
                 )
-                logger.warning("Fell back to legacy Roformer implementation successfully")
                 return fallback_result
             except (RuntimeError, ValueError, TypeError) as fallback_error:
                 logger.error(f"Legacy implementation also failed: {fallback_error}")
-                self._loading_stats['total_failures'] += 1
+                self._loading_stats["total_failures"] += 1
                 return ModelLoadingResult.failure_result(
                     error_message=f"New implementation failed: {e}; Legacy fallback failed: {fallback_error}",
                     implementation=ImplementationVersion.NEW,
@@ -79,11 +80,9 @@ def validate_configuration(self, config: Dict[str, Any], model_type: str) -> boo
             logger.error(f"Unexpected error during validation: {e}")
             return False
 
-    def _load_with_new_implementation(self,
-                                      model_path: str,
-                                      config: Dict[str, Any],
-                                      model_type: str,
-                                      device: str) -> ModelLoadingResult:
+    def _load_with_new_implementation(
+        self, model_path: str, config: Dict[str, Any], model_type: str, device: str
+    ) -> ModelLoadingResult:
         import torch
 
         try:
@@ -96,10 +95,10 @@ def _load_with_new_implementation(self,
 
             if os.path.exists(model_path):
                 state_dict = torch.load(model_path, map_location=device)
-                if isinstance(state_dict, dict) and 'state_dict' in state_dict:
-                    model.load_state_dict(state_dict['state_dict'])
-                elif isinstance(state_dict, dict) and 'model' in state_dict:
-                    model.load_state_dict(state_dict['model'])
+                if isinstance(state_dict, dict) and "state_dict" in state_dict:
+                    model.load_state_dict(state_dict["state_dict"])
+                elif isinstance(state_dict, dict) and "model" in state_dict:
+                    model.load_state_dict(state_dict["model"])
                 else:
                     model.load_state_dict(state_dict)
                 logger.debug(f"Loaded state dict from {model_path}")
@@ -112,9 +111,9 @@ def _load_with_new_implementation(self,
                 implementation=ImplementationVersion.NEW,
                 config=config,
             )
-            result.add_model_info('model_type', model_type)
-            result.add_model_info('loading_method', 'direct')
-            result.add_model_info('device', device)
+            result.add_model_info("model_type", model_type)
+            result.add_model_info("loading_method", "direct")
+            result.add_model_info("device", device)
             return result
         except (RuntimeError, ValueError) as e:
             logger.error(f"Failed to create {model_type} model: {e}")
@@ -122,71 +121,73 @@ def _load_with_new_implementation(self,
 
     def _create_bs_roformer(self, config: Dict[str, Any]):
         from ..uvr_lib_v5.roformer.bs_roformer import BSRoformer
+
         model_args = {
-            'dim': config['dim'],
-            'depth': config['depth'],
-            'stereo': config.get('stereo', False),
-            'num_stems': config.get('num_stems', 2),
-            'time_transformer_depth': config.get('time_transformer_depth', 2),
-            'freq_transformer_depth': config.get('freq_transformer_depth', 2),
-            'freqs_per_bands': config['freqs_per_bands'],
-            'dim_head': config.get('dim_head', 64),
-            'heads': config.get('heads', 8),
-            'attn_dropout': config.get('attn_dropout', 0.0),
-            'ff_dropout': config.get('ff_dropout', 0.0),
-            'flash_attn': config.get('flash_attn', True),
-            'mlp_expansion_factor': config.get('mlp_expansion_factor', 4),
-            'sage_attention': config.get('sage_attention', False),
-            'zero_dc': config.get('zero_dc', True),
-            'use_torch_checkpoint': config.get('use_torch_checkpoint', False),
-            'skip_connection': config.get('skip_connection', False),
+            "dim": config["dim"],
+            "depth": config["depth"],
+            "stereo": config.get("stereo", False),
+            "num_stems": config.get("num_stems", 2),
+            "time_transformer_depth": config.get("time_transformer_depth", 2),
+            "freq_transformer_depth": config.get("freq_transformer_depth", 2),
+            "freqs_per_bands": config["freqs_per_bands"],
+            "dim_head": config.get("dim_head", 64),
+            "heads": config.get("heads", 8),
+            "attn_dropout": config.get("attn_dropout", 0.0),
+            "ff_dropout": config.get("ff_dropout", 0.0),
+            "flash_attn": config.get("flash_attn", True),
+            "mlp_expansion_factor": config.get("mlp_expansion_factor", 4),
+            "sage_attention": config.get("sage_attention", False),
+            "zero_dc": config.get("zero_dc", True),
+            "use_torch_checkpoint": config.get("use_torch_checkpoint", False),
+            "skip_connection": config.get("skip_connection", False),
         }
-        if 'stft_n_fft' in config:
-            model_args['stft_n_fft'] = config['stft_n_fft']
-        if 'stft_hop_length' in config:
-            model_args['stft_hop_length'] = config['stft_hop_length']
-        if 'stft_win_length' in config:
-            model_args['stft_win_length'] = config['stft_win_length']
+        if "stft_n_fft" in config:
+            model_args["stft_n_fft"] = config["stft_n_fft"]
+        if "stft_hop_length" in config:
+            model_args["stft_hop_length"] = config["stft_hop_length"]
+        if "stft_win_length" in config:
+            model_args["stft_win_length"] = config["stft_win_length"]
         logger.debug(f"Creating BSRoformer with args: {list(model_args.keys())}")
         return BSRoformer(**model_args)
 
     def _create_mel_band_roformer(self, config: Dict[str, Any]):
         from ..uvr_lib_v5.roformer.mel_band_roformer import MelBandRoformer
+
         model_args = {
-            'dim': config['dim'],
-            'depth': config['depth'],
-            'stereo': config.get('stereo', False),
-            'num_stems': config.get('num_stems', 2),
-            'time_transformer_depth': config.get('time_transformer_depth', 2),
-            'freq_transformer_depth': config.get('freq_transformer_depth', 2),
-            'num_bands': config['num_bands'],
-            'dim_head': config.get('dim_head', 64),
-            'heads': config.get('heads', 8),
-            'attn_dropout': config.get('attn_dropout', 0.0),
-            'ff_dropout': config.get('ff_dropout', 0.0),
-            'flash_attn': config.get('flash_attn', True),
-            'mlp_expansion_factor': config.get('mlp_expansion_factor', 4),
-            'sage_attention': config.get('sage_attention', False),
-            'zero_dc': config.get('zero_dc', True),
-            'use_torch_checkpoint': config.get('use_torch_checkpoint', False),
-            'skip_connection': config.get('skip_connection', False),
+            "dim": config["dim"],
+            "depth": config["depth"],
+            "stereo": config.get("stereo", False),
+            "num_stems": config.get("num_stems", 2),
+            "time_transformer_depth": config.get("time_transformer_depth", 2),
+            "freq_transformer_depth": config.get("freq_transformer_depth", 2),
+            "num_bands": config["num_bands"],
+            "dim_head": config.get("dim_head", 64),
+            "heads": config.get("heads", 8),
+            "attn_dropout": config.get("attn_dropout", 0.0),
+            "ff_dropout": config.get("ff_dropout", 0.0),
+            "flash_attn": config.get("flash_attn", True),
+            "mlp_expansion_factor": config.get("mlp_expansion_factor", 4),
+            "sage_attention": config.get("sage_attention", False),
+            "zero_dc": config.get("zero_dc", True),
+            "use_torch_checkpoint": config.get("use_torch_checkpoint", False),
+            "skip_connection": config.get("skip_connection", False),
         }
-        if 'sample_rate' in config:
-            model_args['sample_rate'] = config['sample_rate']
+        if "sample_rate" in config:
+            model_args["sample_rate"] = config["sample_rate"]
         # Optional parameters commonly present in legacy configs
         for optional_key in [
-            'mask_estimator_depth',
-            'stft_n_fft',
-            'stft_hop_length',
-            'stft_win_length',
-            'stft_normalized',
-            'stft_window_fn',
-            'multi_stft_resolution_loss_weight',
-            'multi_stft_resolutions_window_sizes',
-            'multi_stft_hop_size',
-            'multi_stft_normalized',
-            'multi_stft_window_fn',
-            'match_input_audio_length',
+            "mask_estimator_depth",
+            "stft_n_fft",
+            "stft_hop_length",
+            "stft_win_length",
+            "stft_normalized",
+            "stft_window_fn",
+            "multi_stft_resolution_loss_weight",
+            "multi_stft_resolutions_window_sizes",
+            "multi_stft_hop_size",
+            "multi_stft_normalized",
+            "multi_stft_window_fn",
+            "match_input_audio_length",
         ]:
             if optional_key in config:
                 model_args[optional_key] = config[optional_key]
@@ -194,11 +195,13 @@ def _create_mel_band_roformer(self, config: Dict[str, Any]):
         logger.debug(f"Creating MelBandRoformer with args: {list(model_args.keys())}")
         return MelBandRoformer(**model_args)
 
-    def _load_with_legacy_implementation(self,
-                                          model_path: str,
-                                          original_config: Dict[str, Any],
-                                          device: str,
-                                          original_error: str) -> ModelLoadingResult:
+    def _load_with_legacy_implementation(
+        self,
+        model_path: str,
+        original_config: Dict[str, Any],
+        device: str,
+        original_error: str,
+    ) -> ModelLoadingResult:
         """
         Attempt to load the model using the legacy direct-constructor path
         for maximum backward compatibility with existing checkpoints.
@@ -206,24 +209,27 @@ def _load_with_legacy_implementation(self,
         import torch
 
         # Use nested 'model' section if present; otherwise assume flat
-        model_cfg = original_config.get('model', original_config)
+        model_cfg = original_config.get("model", original_config)
 
         # Determine model type from config
-        if 'num_bands' in model_cfg:
+        if "num_bands" in model_cfg:
             from ..uvr_lib_v5.roformer.mel_band_roformer import MelBandRoformer
+
             model = MelBandRoformer(**model_cfg)
-        elif 'freqs_per_bands' in model_cfg:
+        elif "freqs_per_bands" in model_cfg:
             from ..uvr_lib_v5.roformer.bs_roformer import BSRoformer
+
             model = BSRoformer(**model_cfg)
         else:
             raise ValueError("Unknown Roformer model type in legacy configuration")
 
         # Load checkpoint as raw state dict (legacy behavior)
+        # Use the target device for loading to avoid device mismatches
         try:
-            checkpoint = torch.load(model_path, map_location='cpu', weights_only=True)
+            checkpoint = torch.load(model_path, map_location=device, weights_only=True)
         except TypeError:
             # For older torch versions without weights_only
-            checkpoint = torch.load(model_path, map_location='cpu')
+            checkpoint = torch.load(model_path, map_location=device)
 
         model.load_state_dict(checkpoint)
         model.to(device).eval()
@@ -238,67 +244,74 @@ def get_loading_stats(self) -> Dict[str, int]:
         return self._loading_stats.copy()
 
     def reset_loading_stats(self) -> None:
-        self._loading_stats = {
-            'new_implementation_success': 0,
-            'total_failures': 0
-        }
+        self._loading_stats = {"new_implementation_success": 0, "total_failures": 0}
 
     def detect_model_type(self, model_path: str) -> str:
         model_path_lower = model_path.lower()
-        if any(indicator in model_path_lower for indicator in ['bs_roformer', 'bs-roformer', 'bsroformer']):
+        if any(
+            indicator in model_path_lower
+            for indicator in ["bs_roformer", "bs-roformer", "bsroformer"]
+        ):
             return "bs_roformer"
-        if any(indicator in model_path_lower for indicator in ['mel_band_roformer', 'mel-band-roformer', 'melband']):
+        if any(
+            indicator in model_path_lower
+            for indicator in ["mel_band_roformer", "mel-band-roformer", "melband"]
+        ):
             return "mel_band_roformer"
-        if 'roformer' in model_path_lower:
-            logger.warning(f"Generic 'roformer' detected in {model_path}, defaulting to bs_roformer")
+        if "roformer" in model_path_lower:
+            logger.warning(
+                f"Generic 'roformer' detected in {model_path}, defaulting to bs_roformer"
+            )
             return "bs_roformer"
-        raise ValueError(f"Cannot determine Roformer model type from path: {model_path}")
+        raise ValueError(
+            f"Cannot determine Roformer model type from path: {model_path}"
+        )
 
     def get_default_configuration(self, model_type: str) -> Dict[str, Any]:
         if model_type == "bs_roformer":
             return {
-                'dim': 512,
-                'depth': 12,
-                'stereo': False,
-                'num_stems': 2,
-                'time_transformer_depth': 2,
-                'freq_transformer_depth': 2,
-                'freqs_per_bands': (2, 4, 8, 16, 32, 64),
-                'dim_head': 64,
-                'heads': 8,
-                'attn_dropout': 0.0,
-                'ff_dropout': 0.0,
-                'flash_attn': True,
-                'mlp_expansion_factor': 4,
-                'sage_attention': False,
-                'zero_dc': True,
-                'use_torch_checkpoint': False,
-                'skip_connection': False,
-                'mask_estimator_depth': 2,
-                'stft_n_fft': 2048,
-                'stft_hop_length': 512,
-                'stft_win_length': 2048,
+                "dim": 512,
+                "depth": 12,
+                "stereo": False,
+                "num_stems": 2,
+                "time_transformer_depth": 2,
+                "freq_transformer_depth": 2,
+                "freqs_per_bands": (2, 4, 8, 16, 32, 64),
+                "dim_head": 64,
+                "heads": 8,
+                "attn_dropout": 0.0,
+                "ff_dropout": 0.0,
+                "flash_attn": True,
+                "mlp_expansion_factor": 4,
+                "sage_attention": False,
+                "zero_dc": True,
+                "use_torch_checkpoint": False,
+                "skip_connection": False,
+                "mask_estimator_depth": 2,
+                "stft_n_fft": 2048,
+                "stft_hop_length": 512,
+                "stft_win_length": 2048,
             }
         elif model_type == "mel_band_roformer":
             return {
-                'dim': 512,
-                'depth': 12,
-                'stereo': False,
-                'num_stems': 2,
-                'time_transformer_depth': 2,
-                'freq_transformer_depth': 2,
-                'num_bands': 64,
-                'dim_head': 64,
-                'heads': 8,
-                'attn_dropout': 0.0,
-                'ff_dropout': 0.0,
-                'flash_attn': True,
-                'mlp_expansion_factor': 4,
-                'sage_attention': False,
-                'zero_dc': True,
-                'use_torch_checkpoint': False,
-                'skip_connection': False,
-                'sample_rate': 44100,
+                "dim": 512,
+                "depth": 12,
+                "stereo": False,
+                "num_stems": 2,
+                "time_transformer_depth": 2,
+                "freq_transformer_depth": 2,
+                "num_bands": 64,
+                "dim_head": 64,
+                "heads": 8,
+                "attn_dropout": 0.0,
+                "ff_dropout": 0.0,
+                "flash_attn": True,
+                "mlp_expansion_factor": 4,
+                "sage_attention": False,
+                "zero_dc": True,
+                "use_torch_checkpoint": False,
+                "skip_connection": False,
+                "sample_rate": 44100,
                 # Note: fmin and fmax are not implemented in MelBandRoformer constructor
             }
         else:
diff --git a/audio_separator/separator/separator.py b/audio_separator/separator/separator.py
index b2c40022..0fcfa2f7 100644
--- a/audio_separator/separator/separator.py
+++ b/audio_separator/separator/separator.py
@@ -1,4 +1,4 @@
-""" This file contains the Separator class, to facilitate the separation of stems from audio. """
+"""This file contains the Separator class, to facilitate the separation of stems from audio."""
 
 from importlib import metadata, resources
 import os
@@ -122,13 +122,13 @@ def __init__(
         use_autocast=False,
         use_directml=False,
         chunk_duration=None,
-        mdx_params={"hop_length": 1024, "segment_size": 256, "overlap": 0.25, "batch_size": 1, "enable_denoise": False},
-        vr_params={"batch_size": 1, "window_size": 512, "aggression": 5, "enable_tta": False, "enable_post_process": False, "post_process_threshold": 0.2, "high_end_process": False},
-        demucs_params={"segment_size": "Default", "shifts": 2, "overlap": 0.25, "segments_enabled": True},
-        mdxc_params={"segment_size": 256, "override_model_segment_size": False, "batch_size": 1, "overlap": 8, "pitch_shift": 0},
         ensemble_algorithm=None,
         ensemble_weights=None,
         ensemble_preset=None,
+        mdx_params=None,
+        vr_params=None,
+        demucs_params=None,
+        mdxc_params=None,
         info_only=False,
     ):
         """Initialize the separator."""
@@ -140,7 +140,9 @@ def __init__(
         self.log_handler = logging.StreamHandler()
 
         if self.log_formatter is None:
-            self.log_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(module)s - %(message)s")
+            self.log_formatter = logging.Formatter(
+                "%(asctime)s - %(levelname)s - %(module)s - %(message)s"
+            )
 
         self.log_handler.setFormatter(self.log_formatter)
 
@@ -154,12 +156,16 @@ def __init__(
         # Skip initialization logs if info_only is True
         if not info_only:
             package_version = self.get_package_distribution("audio-separator").version
-            self.logger.info(f"Separator version {package_version} instantiating with output_dir: {output_dir}, output_format: {output_format}")
+            self.logger.info(
+                f"Separator version {package_version} instantiating with output_dir: {output_dir}, output_format: {output_format}"
+            )
 
         if output_dir is None:
             output_dir = os.getcwd()
             if not info_only:
-                self.logger.info("Output directory not specified. Using current working directory.")
+                self.logger.info(
+                    "Output directory not specified. Using current working directory."
+                )
 
         self.output_dir = output_dir
 
@@ -167,11 +173,17 @@ def __init__(
         env_model_dir = os.environ.get("AUDIO_SEPARATOR_MODEL_DIR")
         if env_model_dir:
             self.model_file_dir = env_model_dir
-            self.logger.info(f"Using model directory from AUDIO_SEPARATOR_MODEL_DIR env var: {self.model_file_dir}")
+            self.logger.info(
+                f"Using model directory from AUDIO_SEPARATOR_MODEL_DIR env var: {self.model_file_dir}"
+            )
             if not os.path.exists(self.model_file_dir):
-                raise FileNotFoundError(f"The specified model directory does not exist: {self.model_file_dir}")
+                raise FileNotFoundError(
+                    f"The specified model directory does not exist: {self.model_file_dir}"
+                )
         else:
-            self.logger.info(f"Using model directory from model_file_dir parameter: {model_file_dir}")
+            self.logger.info(
+                f"Using model directory from model_file_dir parameter: {model_file_dir}"
+            )
             self.model_file_dir = model_file_dir
 
         # Create the model directory if it does not exist
@@ -186,28 +198,42 @@ def __init__(
 
         self.normalization_threshold = normalization_threshold
         if normalization_threshold <= 0 or normalization_threshold > 1:
-            raise ValueError("The normalization_threshold must be greater than 0 and less than or equal to 1.")
+            raise ValueError(
+                "The normalization_threshold must be greater than 0 and less than or equal to 1."
+            )
 
         self.amplification_threshold = amplification_threshold
         if amplification_threshold < 0 or amplification_threshold > 1:
-            raise ValueError("The amplification_threshold must be greater than or equal to 0 and less than or equal to 1.")
+            raise ValueError(
+                "The amplification_threshold must be greater than or equal to 0 and less than or equal to 1."
+            )
 
         self.output_single_stem = output_single_stem
         if output_single_stem is not None:
-            self.logger.debug(f"Single stem output requested, so only one output file ({output_single_stem}) will be written")
+            self.logger.debug(
+                f"Single stem output requested, so only one output file ({output_single_stem}) will be written"
+            )
 
         self.invert_using_spec = invert_using_spec
         if self.invert_using_spec:
-            self.logger.debug(f"Secondary step will be inverted using spectogram rather than waveform. This may improve quality but is slightly slower.")
+            self.logger.debug(
+                f"Secondary step will be inverted using spectogram rather than waveform. This may improve quality but is slightly slower."
+            )
 
         try:
             self.sample_rate = int(sample_rate)
             if self.sample_rate <= 0:
-                raise ValueError(f"The sample rate setting is {self.sample_rate} but it must be a non-zero whole number.")
+                raise ValueError(
+                    f"The sample rate setting is {self.sample_rate} but it must be a non-zero whole number."
+                )
             if self.sample_rate > 12800000:
-                raise ValueError(f"The sample rate setting is {self.sample_rate}. Enter something less ambitious.")
+                raise ValueError(
+                    f"The sample rate setting is {self.sample_rate}. Enter something less ambitious."
+                )
         except ValueError:
-            raise ValueError("The sample rate must be a non-zero whole number. Please provide a valid integer.")
+            raise ValueError(
+                "The sample rate must be a non-zero whole number. Please provide a valid integer."
+            )
 
         self.use_soundfile = use_soundfile
         self.use_autocast = use_autocast
@@ -237,13 +263,54 @@ def __init__(
         if self.ensemble_algorithm is None:
             self.ensemble_algorithm = "avg_wave"
 
+        # Set defaults for arch_specific_params if not provided
+        if mdx_params is None:
+            mdx_params = {
+                "hop_length": 1024,
+                "segment_size": 256,
+                "overlap": 0.25,
+                "batch_size": 1,
+                "enable_denoise": False,
+            }
+        if vr_params is None:
+            vr_params = {
+                "batch_size": 1,
+                "window_size": 512,
+                "aggression": 5,
+                "enable_tta": False,
+                "enable_post_process": False,
+                "post_process_threshold": 0.2,
+                "high_end_process": False,
+            }
+        if demucs_params is None:
+            demucs_params = {
+                "segment_size": "Default",
+                "shifts": 2,
+                "overlap": 0.25,
+                "segments_enabled": True,
+            }
+        if mdxc_params is None:
+            mdxc_params = {
+                "segment_size": 256,
+                "override_model_segment_size": False,
+                "batch_size": 1,
+                "overlap": 8,
+                "pitch_shift": 0,
+            }
+
         # These are parameters which users may want to configure so we expose them to the top-level Separator class,
         # even though they are specific to a single model architecture
-        self.arch_specific_params = {"MDX": mdx_params, "VR": vr_params, "Demucs": demucs_params, "MDXC": mdxc_params}
+        self.arch_specific_params = {
+            "MDX": mdx_params,
+            "VR": vr_params,
+            "Demucs": demucs_params,
+            "MDXC": mdxc_params,
+        }
 
         self.torch_device = None
         self.torch_device_cpu = None
         self.torch_device_mps = None
+        self.is_rocm = False
 
         self.onnx_execution_provider = None
         self.model_instance = None
@@ -257,9 +324,17 @@ def __init__(
             self.setup_accelerated_inferencing_device()
 
     VALID_ENSEMBLE_ALGORITHMS = [
-        "avg_wave", "median_wave", "min_wave", "max_wave",
-        "avg_fft", "median_fft", "min_fft", "max_fft",
-        "uvr_max_spec", "uvr_min_spec", "ensemble_wav",
+        "avg_wave",
+        "median_wave",
+        "min_wave",
+        "max_wave",
+        "avg_fft",
+        "median_fft",
+        "min_fft",
+        "max_fft",
+        "uvr_max_spec",
+        "uvr_min_spec",
+        "ensemble_wav",
     ]
 
     def _load_ensemble_preset(self, preset_name):
@@ -273,32 +348,44 @@ def _load_ensemble_preset(self, preset_name):
             with resources.open_text("audio_separator", "ensemble_presets.json") as f:
                 presets_data = json.load(f)
         except FileNotFoundError:
-            raise ValueError("Ensemble presets file not found. The package may be corrupted or improperly installed.")
+            raise ValueError(
+                "Ensemble presets file not found. The package may be corrupted or improperly installed."
+            )
 
         presets = presets_data.get("presets", {})
         if preset_name not in presets:
             available = ", ".join(sorted(presets.keys()))
-            raise ValueError(f"Unknown ensemble preset: '{preset_name}'. Available presets: {available}")
+            raise ValueError(
+                f"Unknown ensemble preset: '{preset_name}'. Available presets: {available}"
+            )
 
         preset = presets[preset_name]
 
         # Validate models
         models = preset.get("models", [])
         if not isinstance(models, list) or len(models) < 2:
-            raise ValueError(f"Ensemble preset '{preset_name}' must specify at least 2 models, got {len(models) if isinstance(models, list) else 0}")
+            raise ValueError(
+                f"Ensemble preset '{preset_name}' must specify at least 2 models, got {len(models) if isinstance(models, list) else 0}"
+            )
 
         # Validate algorithm
         algorithm = preset.get("algorithm", "avg_wave")
         if algorithm not in self.VALID_ENSEMBLE_ALGORITHMS:
-            raise ValueError(f"Ensemble preset '{preset_name}' has unknown algorithm: '{algorithm}'")
+            raise ValueError(
+                f"Ensemble preset '{preset_name}' has unknown algorithm: '{algorithm}'"
+            )
 
         # Validate weights
         weights = preset.get("weights")
         if weights is not None:
             if not isinstance(weights, list) or len(weights) != len(models):
-                raise ValueError(f"Ensemble preset '{preset_name}' weights length ({len(weights) if isinstance(weights, list) else 'N/A'}) must match models count ({len(models)})")
+                raise ValueError(
+                    f"Ensemble preset '{preset_name}' weights length ({len(weights) if isinstance(weights, list) else 'N/A'}) must match models count ({len(models)})"
+                )
 
-        self.logger.info(f"Loaded ensemble preset '{preset_name}': {preset.get('name', preset_name)} — {preset.get('description', '')}")
+        self.logger.info(
+            f"Loaded ensemble preset '{preset_name}': {preset.get('name', preset_name)} — {preset.get('description', '')}"
+        )
         return preset
 
     def list_ensemble_presets(self):
@@ -332,7 +419,9 @@ def get_system_info(self):
         self.logger.info(f"Operating System: {os_name} {os_version}")
 
         system_info = platform.uname()
-        self.logger.info(f"System: {system_info.system} Node: {system_info.node} Release: {system_info.release} Machine: {system_info.machine} Proc: {system_info.processor}")
+        self.logger.info(
+            f"System: {system_info.system} Node: {system_info.node} Release: {system_info.release} Machine: {system_info.machine} Proc: {system_info.processor}"
+        )
 
         python_version = platform.python_version()
         self.logger.info(f"Python Version: {python_version}")
@@ -346,11 +435,15 @@ def check_ffmpeg_installed(self):
         This method checks if ffmpeg is installed and logs its version.
         """
         try:
-            ffmpeg_version_output = subprocess.check_output(["ffmpeg", "-version"], text=True)
+            ffmpeg_version_output = subprocess.check_output(
+                ["ffmpeg", "-version"], text=True
+            )
             first_line = ffmpeg_version_output.splitlines()[0]
             self.logger.info(f"FFmpeg installed: {first_line}")
         except FileNotFoundError:
-            self.logger.error("FFmpeg is not installed. Please install FFmpeg to use this package.")
+            self.logger.error(
+                "FFmpeg is not installed. Please install FFmpeg to use this package."
+            )
             # Raise an exception if this is being run by a user, as ffmpeg is required for pydub to write audio
             # but if we're just running unit tests in CI, no reason to throw
             if "PYTEST_CURRENT_TEST" not in os.environ:
@@ -361,18 +454,28 @@ def log_onnxruntime_packages(self):
         This method logs the ONNX Runtime package versions, including the GPU and Silicon packages if available.
         """
         onnxruntime_gpu_package = self.get_package_distribution("onnxruntime-gpu")
-        onnxruntime_silicon_package = self.get_package_distribution("onnxruntime-silicon")
+        onnxruntime_silicon_package = self.get_package_distribution(
+            "onnxruntime-silicon"
+        )
         onnxruntime_cpu_package = self.get_package_distribution("onnxruntime")
         onnxruntime_dml_package = self.get_package_distribution("onnxruntime-directml")
 
         if onnxruntime_gpu_package is not None:
-            self.logger.info(f"ONNX Runtime GPU package installed with version: {onnxruntime_gpu_package.version}")
+            self.logger.info(
+                f"ONNX Runtime GPU package installed with version: {onnxruntime_gpu_package.version}"
+            )
         if onnxruntime_silicon_package is not None:
-            self.logger.info(f"ONNX Runtime Silicon package installed with version: {onnxruntime_silicon_package.version}")
+            self.logger.info(
+                f"ONNX Runtime Silicon package installed with version: {onnxruntime_silicon_package.version}"
+            )
         if onnxruntime_cpu_package is not None:
-            self.logger.info(f"ONNX Runtime CPU package installed with version: {onnxruntime_cpu_package.version}")
+            self.logger.info(
+                f"ONNX Runtime CPU package installed with version: {onnxruntime_cpu_package.version}"
+            )
         if onnxruntime_dml_package is not None:
-            self.logger.info(f"ONNX Runtime DirectML package installed with version: {onnxruntime_dml_package.version}")
+            self.logger.info(
+                f"ONNX Runtime DirectML package installed with version: {onnxruntime_dml_package.version}"
+            )
 
     def setup_torch_device(self, system_info):
         """
@@ -385,19 +488,56 @@ def setup_torch_device(self, system_info):
         self.torch_device_cpu = torch.device("cpu")
 
         if torch.cuda.is_available():
-            self.configure_cuda(ort_providers)
-            hardware_acceleration_enabled = True
-        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available() and system_info.processor == "arm":
+            # Check if ROCm packages are installed and PyTorch shows ROCm support
+            onnxruntime_rocm_package = self.get_package_distribution("onnxruntime-rocm")
+            torch_version = torch.__version__
+
+            # Prioritize ROCm if ROCm packages are installed and PyTorch shows ROCm support
+            if onnxruntime_rocm_package is not None and ("+rocm" in torch_version):
+                self.logger.info(
+                    "ROCm packages detected and PyTorch shows ROCm support"
+                )
+                if "ROCMExecutionProvider" in ort_providers:
+                    self.configure_rocm(ort_providers)
+                    hardware_acceleration_enabled = True
+                elif "CUDAExecutionProvider" in ort_providers:
+                    self.logger.info(
+                        "ROCm detected with PyTorch, using CUDAExecutionProvider for AMD GPU acceleration"
+                    )
+                    self.configure_rocm(ort_providers)
+                    hardware_acceleration_enabled = True
+                else:
+                    self.logger.warning(
+                        "ROCm packages installed but no GPU execution provider available"
+                    )
+                    self.logger.warning("Falling back to CPU mode")
+                    hardware_acceleration_enabled = False
+            elif "ROCMExecutionProvider" in ort_providers:
+                # Fallback: check ROCMExecutionProvider directly
+                self.configure_rocm(ort_providers)
+                hardware_acceleration_enabled = True
+            else:
+                # Standard CUDA configuration
+                self.configure_cuda(ort_providers)
+                hardware_acceleration_enabled = True
+        elif (
+            hasattr(torch.backends, "mps")
+            and torch.backends.mps.is_available()
+            and system_info.processor == "arm"
+        ):
             self.configure_mps(ort_providers)
             hardware_acceleration_enabled = True
         elif self.use_directml and has_torch_dml_installed:
             import torch_directml
+
             if torch_directml.is_available():
                 self.configure_dml(ort_providers)
                 hardware_acceleration_enabled = True
 
         if not hardware_acceleration_enabled:
-            self.logger.info("No hardware acceleration could be configured, running in CPU mode")
+            self.logger.info(
+                "No hardware acceleration could be configured, running in CPU mode"
+            )
             self.torch_device = self.torch_device_cpu
             self.onnx_execution_provider = ["CPUExecutionProvider"]
 
@@ -408,40 +548,108 @@ def configure_cuda(self, ort_providers):
         self.logger.info("CUDA is available in Torch, setting Torch device to CUDA")
         self.torch_device = torch.device("cuda")
         if "CUDAExecutionProvider" in ort_providers:
-            self.logger.info("ONNXruntime has CUDAExecutionProvider available, enabling acceleration")
+            self.logger.info(
+                "ONNXruntime has CUDAExecutionProvider available, enabling acceleration"
+            )
+            self.onnx_execution_provider = ["CUDAExecutionProvider"]
+        else:
+            self.logger.warning(
+                "CUDAExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled"
+            )
+
+    def configure_rocm(self, ort_providers):
+        """
+        This method configures the ROCm device for PyTorch and ONNX Runtime, if available.
+        """
+        self.is_rocm = True
+        torch_version = torch.__version__
+        if "+cu" in torch_version:
+            self.logger.warning(
+                "ROCm ExecutionProvider detected, but PyTorch appears to have CUDA support instead of ROCm support."
+            )
+            self.logger.warning(
+                "For optimal AMD GPU performance, consider reinstalling PyTorch with ROCm support:"
+            )
+            self.logger.warning("pip uninstall torch torchvision torchaudio")
+            self.logger.warning("pip cache purge")
+            self.logger.warning(
+                "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7"
+            )
+        elif "+rocm" in torch_version:
+            self.logger.info(
+                "PyTorch with ROCm support detected (version includes +rocm)"
+            )
+
+        self.logger.info(
+            "ROCm (AMD GPU) detected, setting Torch device to CUDA (ROCm presents as CUDA)"
+        )
+        self.torch_device = torch.device("cuda")
+
+        # Try to configure ROCm execution provider if available
+        if "ROCMExecutionProvider" in ort_providers:
+            self.logger.info(
+                "ONNXruntime has ROCMExecutionProvider available, enabling acceleration"
+            )
+            self.onnx_execution_provider = ["ROCMExecutionProvider"]
+            self.logger.info(
+                "✓ ROCm (AMD GPU) acceleration enabled via ROCMExecutionProvider"
+            )
+        elif "CUDAExecutionProvider" in ort_providers:
+            self.logger.info(
+                "Using CUDAExecutionProvider as fallback for ROCm acceleration"
+            )
             self.onnx_execution_provider = ["CUDAExecutionProvider"]
+            self.logger.info(
+                "✓ ROCm (AMD GPU) acceleration enabled via CUDAExecutionProvider"
+            )
         else:
-            self.logger.warning("CUDAExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled")
+            self.logger.warning(
+                "No GPU execution provider available for ROCm, falling back to CPU"
+            )
+            self.onnx_execution_provider = ["CPUExecutionProvider"]
 
     def configure_mps(self, ort_providers):
         """
         This method configures the Apple Silicon MPS/CoreML device for PyTorch and ONNX Runtime, if available.
         """
-        self.logger.info("Apple Silicon MPS/CoreML is available in Torch and processor is ARM, setting Torch device to MPS")
+        self.logger.info(
+            "Apple Silicon MPS/CoreML is available in Torch and processor is ARM, setting Torch device to MPS"
+        )
         self.torch_device_mps = torch.device("mps")
 
         self.torch_device = self.torch_device_mps
 
         if "CoreMLExecutionProvider" in ort_providers:
-            self.logger.info("ONNXruntime has CoreMLExecutionProvider available, enabling acceleration")
+            self.logger.info(
+                "ONNXruntime has CoreMLExecutionProvider available, enabling acceleration"
+            )
             self.onnx_execution_provider = ["CoreMLExecutionProvider"]
         else:
-            self.logger.warning("CoreMLExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled")
+            self.logger.warning(
+                "CoreMLExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled"
+            )
 
     def configure_dml(self, ort_providers):
         """
         This method configures the DirectML device for PyTorch and ONNX Runtime, if available.
         """
         import torch_directml
-        self.logger.info("DirectML is available in Torch, setting Torch device to DirectML")
-        self.torch_device_dml = torch_directml.device() 
+
+        self.logger.info(
+            "DirectML is available in Torch, setting Torch device to DirectML"
+        )
+        self.torch_device_dml = torch_directml.device()
         self.torch_device = self.torch_device_dml
 
         if "DmlExecutionProvider" in ort_providers:
-            self.logger.info("ONNXruntime has DmlExecutionProvider available, enabling acceleration")
+            self.logger.info(
+                "ONNXruntime has DmlExecutionProvider available, enabling acceleration"
+            )
             self.onnx_execution_provider = ["DmlExecutionProvider"]
         else:
-            self.logger.warning("DmlExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled")
+            self.logger.warning(
+                "DmlExecutionProvider not available in ONNXruntime, so acceleration will NOT be enabled"
+            )
 
     def get_package_distribution(self, package_name):
         """
@@ -467,12 +675,16 @@ def get_model_hash(self, model_path):
             with open(model_path, "rb") as f:
                 if file_size < BYTES_TO_HASH:
                     # Hash the entire file if smaller than the target byte count
-                    self.logger.debug(f"File size {file_size} < {BYTES_TO_HASH}, hashing entire file.")
+                    self.logger.debug(
+                        f"File size {file_size} < {BYTES_TO_HASH}, hashing entire file."
+                    )
                     hash_value = hashlib.md5(f.read()).hexdigest()
                 else:
                     # Seek to the specific position before the end (from the beginning) and hash
                     seek_pos = file_size - BYTES_TO_HASH
-                    self.logger.debug(f"File size {file_size} >= {BYTES_TO_HASH}, seeking to {seek_pos} and hashing remaining bytes.")
+                    self.logger.debug(
+                        f"File size {file_size} >= {BYTES_TO_HASH}, seeking to {seek_pos} and hashing remaining bytes."
+                    )
                     f.seek(seek_pos, io.SEEK_SET)
                     hash_value = hashlib.md5(f.read()).hexdigest()
 
@@ -482,11 +694,11 @@ def get_model_hash(self, model_path):
 
         except FileNotFoundError:
             self.logger.error(f"Model file not found at {model_path}")
-            raise # Re-raise the specific error
+            raise  # Re-raise the specific error
         except Exception as e:
             # Catch other potential errors (e.g., permissions, other IOErrors)
             self.logger.error(f"Error calculating hash for {model_path}: {e}")
-            raise # Re-raise other errors
+            raise  # Re-raise other errors
 
     def download_file_if_not_exists(self, url, output_path):
         """
@@ -494,10 +706,14 @@ def download_file_if_not_exists(self, url, output_path):
         """
 
         if os.path.isfile(output_path):
-            self.logger.debug(f"File already exists at {output_path}, skipping download")
+            self.logger.debug(
+                f"File already exists at {output_path}, skipping download"
+            )
             return
 
-        self.logger.debug(f"Downloading file from {url} to {output_path} with timeout 300s")
+        self.logger.debug(
+            f"Downloading file from {url} to {output_path} with timeout 300s"
+        )
         response = requests.get(url, stream=True, timeout=300)
 
         if response.status_code == 200:
@@ -510,7 +726,9 @@ def download_file_if_not_exists(self, url, output_path):
                     f.write(chunk)
             progress_bar.close()
         else:
-            raise RuntimeError(f"Failed to download file from {url}, response code: {response.status_code}")
+            raise RuntimeError(
+                f"Failed to download file from {url}, response code: {response.status_code}"
+            )
 
     def list_supported_model_files(self):
         """
@@ -601,7 +819,10 @@ def list_supported_model_files(self):
         """
         download_checks_path = os.path.join(self.model_file_dir, "download_checks.json")
 
-        self.download_file_if_not_exists("https://raw.githubusercontent.com/TRvlvr/application_data/main/filelists/download_checks.json", download_checks_path)
+        self.download_file_if_not_exists(
+            "https://raw.githubusercontent.com/TRvlvr/application_data/main/filelists/download_checks.json",
+            download_checks_path,
+        )
 
         model_downloads_list = json.load(open(download_checks_path, encoding="utf-8"))
         self.logger.debug(f"UVR model download list loaded")
@@ -617,13 +838,20 @@ def list_supported_model_files(self):
             self.logger.warning("Continuing without model scores")
 
         # Only show Demucs v4 models as we've only implemented support for v4
-        filtered_demucs_v4 = {key: value for key, value in model_downloads_list["demucs_download_list"].items() if key.startswith("Demucs v4")}
+        filtered_demucs_v4 = {
+            key: value
+            for key, value in model_downloads_list["demucs_download_list"].items()
+            if key.startswith("Demucs v4")
+        }
 
         # Modified Demucs handling to use YAML files as identifiers and include download files
         demucs_models = {}
         for name, files in filtered_demucs_v4.items():
             # Find the YAML file in the model files
-            yaml_file = next((filename for filename in files.keys() if filename.endswith(".yaml")), None)
+            yaml_file = next(
+                (filename for filename in files.keys() if filename.endswith(".yaml")),
+                None,
+            )
             if yaml_file:
                 model_score_data = model_scores.get(yaml_file, {})
                 demucs_models[name] = {
@@ -631,7 +859,9 @@ def list_supported_model_files(self):
                     "scores": model_score_data.get("median_scores", {}),
                     "stems": model_score_data.get("stems", []),
                     "target_stem": model_score_data.get("target_stem"),
-                    "download_files": list(files.values()),  # List of all download URLs/filenames
+                    "download_files": list(
+                        files.values()
+                    ),  # List of all download URLs/filenames
                 }
 
         # Load the JSON file using importlib.resources
@@ -649,7 +879,10 @@ def list_supported_model_files(self):
                     "target_stem": model_scores.get(filename, {}).get("target_stem"),
                     "download_files": [filename],
                 }  # Just the filename for VR models
-                for name, filename in {**model_downloads_list["vr_download_list"], **audio_separator_models_list["vr_download_list"]}.items()
+                for name, filename in {
+                    **model_downloads_list["vr_download_list"],
+                    **audio_separator_models_list["vr_download_list"],
+                }.items()
             },
             "MDX": {
                 name: {
@@ -659,16 +892,29 @@ def list_supported_model_files(self):
                     "target_stem": model_scores.get(filename, {}).get("target_stem"),
                     "download_files": [filename],
                 }  # Just the filename for MDX models
-                for name, filename in {**model_downloads_list["mdx_download_list"], **model_downloads_list["mdx_download_vip_list"], **audio_separator_models_list["mdx_download_list"]}.items()
+                for name, filename in {
+                    **model_downloads_list["mdx_download_list"],
+                    **model_downloads_list["mdx_download_vip_list"],
+                    **audio_separator_models_list["mdx_download_list"],
+                }.items()
             },
             "Demucs": demucs_models,
             "MDXC": {
                 name: {
                     "filename": next(iter(files.keys())),
-                    "scores": model_scores.get(next(iter(files.keys())), {}).get("median_scores", {}),
-                    "stems": model_scores.get(next(iter(files.keys())), {}).get("stems", []),
-                    "target_stem": model_scores.get(next(iter(files.keys())), {}).get("target_stem"),
-                    "download_files": list(files.keys()) + list(files.values()),  # List of both model filenames and config filenames
+                    "scores": model_scores.get(next(iter(files.keys())), {}).get(
+                        "median_scores", {}
+                    ),
+                    "stems": model_scores.get(next(iter(files.keys())), {}).get(
+                        "stems", []
+                    ),
+                    "target_stem": model_scores.get(next(iter(files.keys())), {}).get(
+                        "target_stem"
+                    ),
+                    "download_files": list(files.keys())
+                    + list(
+                        files.values()
+                    ),  # List of both model filenames and config filenames
                 }
                 for name, files in {
                     **model_downloads_list["mdx23c_download_list"],
@@ -687,8 +933,12 @@ def print_uvr_vip_message(self):
         This method prints a message to the user if they have downloaded a VIP model, reminding them to support Anjok07 on Patreon.
         """
         if self.model_is_uvr_vip:
-            self.logger.warning(f"The model: '{self.model_friendly_name}' is a VIP model, intended by Anjok07 for access by paying subscribers only.")
-            self.logger.warning("If you are not already subscribed, please consider supporting the developer of UVR, Anjok07 by subscribing here: https://patreon.com/uvr")
+            self.logger.warning(
+                f"The model: '{self.model_friendly_name}' is a VIP model, intended by Anjok07 for access by paying subscribers only."
+            )
+            self.logger.warning(
+                "If you are not already subscribed, please consider supporting the developer of UVR, Anjok07 by subscribing here: https://patreon.com/uvr"
+            )
 
     def download_model_files(self, model_filename):
         """
@@ -699,22 +949,33 @@ def download_model_files(self, model_filename):
 
         supported_model_files_grouped = self.list_supported_model_files()
         public_model_repo_url_prefix = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models"
-        vip_model_repo_url_prefix = "https://github.com/Anjok0109/ai_magic/releases/download/v5"
+        vip_model_repo_url_prefix = (
+            "https://github.com/Anjok0109/ai_magic/releases/download/v5"
+        )
         audio_separator_models_repo_url_prefix = "https://github.com/nomadkaraoke/python-audio-separator/releases/download/model-configs"
 
         yaml_config_filename = None
 
-        self.logger.debug(f"Searching for model_filename {model_filename} in supported_model_files_grouped")
+        self.logger.debug(
+            f"Searching for model_filename {model_filename} in supported_model_files_grouped"
+        )
 
         # Iterate through model types (MDX, Demucs, MDXC)
         for model_type, models in supported_model_files_grouped.items():
             # Iterate through each model in this type
             for model_friendly_name, model_info in models.items():
                 self.model_is_uvr_vip = "VIP" in model_friendly_name
-                model_repo_url_prefix = vip_model_repo_url_prefix if self.model_is_uvr_vip else public_model_repo_url_prefix
+                model_repo_url_prefix = (
+                    vip_model_repo_url_prefix
+                    if self.model_is_uvr_vip
+                    else public_model_repo_url_prefix
+                )
 
                 # Check if this model matches our target filename
-                if model_info["filename"] == model_filename or model_filename in model_info["download_files"]:
+                if (
+                    model_info["filename"] == model_filename
+                    or model_filename in model_info["download_files"]
+                ):
                     self.logger.debug(f"Found matching model: {model_friendly_name}")
                     self.model_friendly_name = model_friendly_name
                     self.print_uvr_vip_message()
@@ -725,35 +986,59 @@ def download_model_files(self, model_filename):
                         if file_to_download.startswith("http"):
                             filename = file_to_download.split("/")[-1]
                             download_path = os.path.join(self.model_file_dir, filename)
-                            self.download_file_if_not_exists(file_to_download, download_path)
+                            self.download_file_if_not_exists(
+                                file_to_download, download_path
+                            )
                             continue
 
-                        download_path = os.path.join(self.model_file_dir, file_to_download)
+                        download_path = os.path.join(
+                            self.model_file_dir, file_to_download
+                        )
 
                         # For MDXC models, handle YAML config files specially
                         if model_type == "MDXC" and file_to_download.endswith(".yaml"):
                             yaml_config_filename = file_to_download
                             try:
                                 yaml_url = f"{model_repo_url_prefix}/mdx_model_data/mdx_c_configs/{file_to_download}"
-                                self.download_file_if_not_exists(yaml_url, download_path)
+                                self.download_file_if_not_exists(
+                                    yaml_url, download_path
+                                )
                             except RuntimeError:
-                                self.logger.debug("YAML config not found in UVR repo, trying audio-separator models repo...")
+                                self.logger.debug(
+                                    "YAML config not found in UVR repo, trying audio-separator models repo..."
+                                )
                                 yaml_url = f"{audio_separator_models_repo_url_prefix}/{file_to_download}"
-                                self.download_file_if_not_exists(yaml_url, download_path)
+                                self.download_file_if_not_exists(
+                                    yaml_url, download_path
+                                )
                             continue
 
                         # For regular model files, try UVR repo first, then audio-separator repo
                         try:
                             download_url = f"{model_repo_url_prefix}/{file_to_download}"
-                            self.download_file_if_not_exists(download_url, download_path)
+                            self.download_file_if_not_exists(
+                                download_url, download_path
+                            )
                         except RuntimeError:
-                            self.logger.debug("Model not found in UVR repo, trying audio-separator models repo...")
+                            self.logger.debug(
+                                "Model not found in UVR repo, trying audio-separator models repo..."
+                            )
                             download_url = f"{audio_separator_models_repo_url_prefix}/{file_to_download}"
-                            self.download_file_if_not_exists(download_url, download_path)
-
-                    return model_filename, model_type, model_friendly_name, model_path, yaml_config_filename
-
-        raise ValueError(f"Model file {model_filename} not found in supported model files")
+                            self.download_file_if_not_exists(
+                                download_url, download_path
+                            )
+
+                    return (
+                        model_filename,
+                        model_type,
+                        model_friendly_name,
+                        model_path,
+                        yaml_config_filename,
+                    )
+
+        raise ValueError(
+            f"Model file {model_filename} not found in supported model files"
+        )
 
     def load_model_data_from_yaml(self, yaml_config_filename):
         """
@@ -762,13 +1047,18 @@ def load_model_data_from_yaml(self, yaml_config_filename):
         """
         # Verify if the YAML filename includes a full path or just the filename
         if not os.path.exists(yaml_config_filename):
-            model_data_yaml_filepath = os.path.join(self.model_file_dir, yaml_config_filename)
+            model_data_yaml_filepath = os.path.join(
+                self.model_file_dir, yaml_config_filename
+            )
         else:
             model_data_yaml_filepath = yaml_config_filename
 
-        self.logger.debug(f"Loading model data from YAML at path {model_data_yaml_filepath}")
+        self.logger.debug(
+            f"Loading model data from YAML at path {model_data_yaml_filepath}"
+        )
 
-        model_data = yaml.load(open(model_data_yaml_filepath, encoding="utf-8"), Loader=yaml.FullLoader)
+        with open(model_data_yaml_filepath, encoding="utf-8") as f:
+            model_data = yaml.load(f, Loader=yaml.FullLoader)
         self.logger.debug(f"Model data loaded from YAML file: {model_data}")
 
         if "roformer" in model_data_yaml_filepath.lower():
@@ -783,13 +1073,19 @@ def load_model_data_using_hash(self, model_path):
         The correct parameters are identified by calculating the hash of the model file and looking up the hash in the UVR data files.
         """
         # Model data and configuration sources from UVR
-        model_data_url_prefix = "https://raw.githubusercontent.com/TRvlvr/application_data/main"
+        model_data_url_prefix = (
+            "https://raw.githubusercontent.com/TRvlvr/application_data/main"
+        )
 
         vr_model_data_url = f"{model_data_url_prefix}/vr_model_data/model_data_new.json"
-        mdx_model_data_url = f"{model_data_url_prefix}/mdx_model_data/model_data_new.json"
+        mdx_model_data_url = (
+            f"{model_data_url_prefix}/mdx_model_data/model_data_new.json"
+        )
 
         # Calculate hash for the downloaded model
-        self.logger.debug("Calculating MD5 hash for model file to identify model parameters from UVR data...")
+        self.logger.debug(
+            "Calculating MD5 hash for model file to identify model parameters from UVR data..."
+        )
         model_hash = self.get_model_hash(model_path)
         self.logger.debug(f"Model {model_path} has hash {model_hash}")
 
@@ -803,25 +1099,37 @@ def load_model_data_using_hash(self, model_path):
         self.download_file_if_not_exists(mdx_model_data_url, mdx_model_data_path)
 
         # Loading model data from UVR
-        self.logger.debug("Loading MDX and VR model parameters from UVR model data files...")
+        self.logger.debug(
+            "Loading MDX and VR model parameters from UVR model data files..."
+        )
         vr_model_data_object = json.load(open(vr_model_data_path, encoding="utf-8"))
         mdx_model_data_object = json.load(open(mdx_model_data_path, encoding="utf-8"))
 
         # Load additional model data from audio-separator
-        self.logger.debug("Loading additional model parameters from audio-separator model data file...")
+        self.logger.debug(
+            "Loading additional model parameters from audio-separator model data file..."
+        )
         with resources.open_text("audio_separator", "model-data.json") as f:
             audio_separator_model_data = json.load(f)
 
         # Merge the model data objects, with audio-separator data taking precedence
-        vr_model_data_object = {**vr_model_data_object, **audio_separator_model_data.get("vr_model_data", {})}
-        mdx_model_data_object = {**mdx_model_data_object, **audio_separator_model_data.get("mdx_model_data", {})}
+        vr_model_data_object = {
+            **vr_model_data_object,
+            **audio_separator_model_data.get("vr_model_data", {}),
+        }
+        mdx_model_data_object = {
+            **mdx_model_data_object,
+            **audio_separator_model_data.get("mdx_model_data", {}),
+        }
 
         if model_hash in mdx_model_data_object:
             model_data = mdx_model_data_object[model_hash]
         elif model_hash in vr_model_data_object:
             model_data = vr_model_data_object[model_hash]
         else:
-            raise ValueError(f"Unsupported Model File: parameters for MD5 hash {model_hash} could not be found in UVR model data file for MDX or VR arch.")
+            raise ValueError(
+                f"Unsupported Model File: parameters for MD5 hash {model_hash} could not be found in UVR model data file for MDX or VR arch."
+            )
 
         self.logger.debug(f"Model data loaded using hash {model_hash}: {model_data}")
 
@@ -833,14 +1141,19 @@ def load_model(self, model_filename="model_bs_roformer_ep_317_sdr_12.9755.ckpt")
         loading the separation model into memory, downloading it first if necessary.
         """
         # If an ensemble preset was loaded and no explicit model list was provided, use preset models
-        if self._ensemble_preset_models is not None and model_filename == "model_bs_roformer_ep_317_sdr_12.9755.ckpt":
+        if (
+            self._ensemble_preset_models is not None
+            and model_filename == "model_bs_roformer_ep_317_sdr_12.9755.ckpt"
+        ):
             model_filename = self._ensemble_preset_models
 
         if isinstance(model_filename, list):
             if len(model_filename) > 1:
                 self.model_filename = list(model_filename)
                 self.model_filenames = list(model_filename)
-                self.logger.info(f"Multiple models specified for ensembling: {self.model_filenames}")
+                self.logger.info(
+                    f"Multiple models specified for ensembling: {self.model_filenames}"
+                )
                 return
             model_filename = model_filename[0]
 
@@ -852,9 +1165,17 @@ def load_model(self, model_filename="model_bs_roformer_ep_317_sdr_12.9755.ckpt")
         load_model_start_time = time.perf_counter()
 
         # Setting up the model path
-        model_filename, model_type, model_friendly_name, model_path, yaml_config_filename = self.download_model_files(model_filename)
+        (
+            model_filename,
+            model_type,
+            model_friendly_name,
+            model_path,
+            yaml_config_filename,
+        ) = self.download_model_files(model_filename)
         model_name = model_filename.split(".")[0]
-        self.logger.debug(f"Model downloaded, friendly name: {model_friendly_name}, model_path: {model_path}")
+        self.logger.debug(
+            f"Model downloaded, friendly name: {model_friendly_name}, model_path: {model_path}"
+        )
 
         if model_path.lower().endswith(".yaml"):
             yaml_config_filename = model_path
@@ -870,6 +1191,7 @@ def load_model(self, model_filename="model_bs_roformer_ep_317_sdr_12.9755.ckpt")
             "torch_device": self.torch_device,
             "torch_device_cpu": self.torch_device_cpu,
             "torch_device_mps": self.torch_device_mps,
+            "is_rocm": self.is_rocm,
             "onnx_execution_provider": self.onnx_execution_provider,
             "model_name": model_name,
             "model_path": model_path,
@@ -886,14 +1208,26 @@ def load_model(self, model_filename="model_bs_roformer_ep_317_sdr_12.9755.ckpt")
         }
 
         # Instantiate the appropriate separator class depending on the model type
-        separator_classes = {"MDX": "mdx_separator.MDXSeparator", "VR": "vr_separator.VRSeparator", "Demucs": "demucs_separator.DemucsSeparator", "MDXC": "mdxc_separator.MDXCSeparator"}
+        separator_classes = {
+            "MDX": "mdx_separator.MDXSeparator",
+            "VR": "vr_separator.VRSeparator",
+            "Demucs": "demucs_separator.DemucsSeparator",
+            "MDXC": "mdxc_separator.MDXCSeparator",
+        }
 
-        if model_type not in self.arch_specific_params or model_type not in separator_classes:
+        if (
+            model_type not in self.arch_specific_params
+            or model_type not in separator_classes
+        ):
             # Enhanced error message for Roformer models
-            if "roformer" in model_filename.lower() or (model_data and model_data.get("is_roformer", False)):
-                error_msg = (f"Roformer model type not properly configured: {model_type}. "
-                           f"This may indicate a configuration validation failure. "
-                           f"Please check the model file and YAML configuration.")
+            if "roformer" in model_filename.lower() or (
+                model_data and model_data.get("is_roformer", False)
+            ):
+                error_msg = (
+                    f"Roformer model type not properly configured: {model_type}. "
+                    f"This may indicate a configuration validation failure. "
+                    f"Please check the model file and YAML configuration."
+                )
                 self.logger.error(error_msg)
                 raise ValueError(error_msg)
             else:
@@ -902,35 +1236,53 @@ def load_model(self, model_filename="model_bs_roformer_ep_317_sdr_12.9755.ckpt")
         if model_type == "Demucs" and sys.version_info < (3, 10):
             raise Exception("Demucs models require Python version 3.10 or newer.")
 
-        self.logger.debug(f"Importing module for model type {model_type}: {separator_classes[model_type]}")
+        self.logger.debug(
+            f"Importing module for model type {model_type}: {separator_classes[model_type]}"
+        )
 
         module_name, class_name = separator_classes[model_type].split(".")
-        module = importlib.import_module(f"audio_separator.separator.architectures.{module_name}")
+        module = importlib.import_module(
+            f"audio_separator.separator.architectures.{module_name}"
+        )
         separator_class = getattr(module, class_name)
 
-        self.logger.debug(f"Instantiating separator class for model type {model_type}: {separator_class}")
+        self.logger.debug(
+            f"Instantiating separator class for model type {model_type}: {separator_class}"
+        )
 
         try:
-            self.model_instance = separator_class(common_config=common_params, arch_config=self.arch_specific_params[model_type])
+            self.model_instance = separator_class(
+                common_config=common_params,
+                arch_config=self.arch_specific_params[model_type],
+            )
         except Exception as e:
             # Enhanced error handling for Roformer models
-            if "roformer" in model_filename.lower() or (model_data and model_data.get("is_roformer", False)):
-                error_msg = (f"Failed to instantiate Roformer model: {e}. "
-                           f"This may be due to missing parameters or configuration validation failures.")
+            if "roformer" in model_filename.lower() or (
+                model_data and model_data.get("is_roformer", False)
+            ):
+                error_msg = (
+                    f"Failed to instantiate Roformer model: {e}. "
+                    f"This may be due to missing parameters or configuration validation failures."
+                )
                 self.logger.error(error_msg)
                 raise RuntimeError(error_msg) from e
             else:
                 raise
 
         # Log Roformer implementation version if applicable
-        if hasattr(self.model_instance, 'is_roformer_model') and self.model_instance.is_roformer_model:
+        if (
+            hasattr(self.model_instance, "is_roformer_model")
+            and self.model_instance.is_roformer_model
+        ):
             roformer_stats = self.model_instance.get_roformer_loading_stats()
             if roformer_stats:
                 self.logger.info(f"Roformer loading stats: {roformer_stats}")
 
         # Log the completion of the model load process
         self.logger.debug("Loading model completed.")
-        self.logger.info(f'Load model duration: {time.strftime("%H:%M:%S", time.gmtime(int(time.perf_counter() - load_model_start_time)))}')
+        self.logger.info(
+            f"Load model duration: {time.strftime('%H:%M:%S', time.gmtime(int(time.perf_counter() - load_model_start_time)))}"
+        )
 
     def separate(self, audio_file_path, custom_output_names=None):
         """
@@ -947,13 +1299,16 @@ def separate(self, audio_file_path, custom_output_names=None):
         Returns:
         - output_files (list of str): A list containing the paths to the separated audio stem files.
         """
-        # Check if the model and device are properly initialized
-        if not (self.torch_device and (self.model_instance or (isinstance(self.model_filename, list) and len(self.model_filename) > 0))):
-            raise ValueError("Initialization failed or model not loaded. Please load a model before attempting to separate.")
-
+        # Check for ensemble (multi-model) separation first - model_instance not needed
         if isinstance(self.model_filename, list) and len(self.model_filename) > 1:
             return self._separate_ensemble(audio_file_path, custom_output_names)
 
+        # Check if the model and device are properly initialized
+        if not (self.torch_device and self.model_instance):
+            raise ValueError(
+                "Initialization failed or model not loaded. Please load a model before attempting to separate."
+            )
+
         # If audio_file_path is a string, convert it to a list for uniform processing
         if isinstance(audio_file_path, str):
             audio_file_path = [audio_file_path]
@@ -968,15 +1323,30 @@ def separate(self, audio_file_path, custom_output_names=None):
                 for root, dirs, files in os.walk(path):
                     for file in files:
                         # Check the file extension to ensure it's an audio file
-                        if file.endswith((".wav", ".flac", ".mp3", ".ogg", ".opus", ".m4a", ".aiff", ".ac3")):  # Add other formats if needed
+                        if file.endswith(
+                            (
+                                ".wav",
+                                ".flac",
+                                ".mp3",
+                                ".ogg",
+                                ".opus",
+                                ".m4a",
+                                ".aiff",
+                                ".ac3",
+                            )
+                        ):  # Add other formats if needed
                             full_path = os.path.join(root, file)
                             self.logger.info(f"Processing file: {full_path}")
                             try:
                                 # Perform separation for each file
-                                files_output = self._separate_file(full_path, custom_output_names)
+                                files_output = self._separate_file(
+                                    full_path, custom_output_names
+                                )
                                 output_files.extend(files_output)
                             except Exception as e:
-                                self.logger.error(f"Failed to process file {full_path}: {e}")
+                                self.logger.error(
+                                    f"Failed to process file {full_path}: {e}"
+                                )
             else:
                 # If the path is a file, process it directly
                 self.logger.info(f"Processing file: {path}")
@@ -1002,32 +1372,48 @@ def _separate_file(self, audio_file_path, custom_output_names=None):
         # Check if chunking is enabled and file is large enough
         if self.chunk_duration is not None:
             import librosa
+
             duration = librosa.get_duration(path=audio_file_path)
 
             from audio_separator.separator.audio_chunking import AudioChunker
+
             chunker = AudioChunker(self.chunk_duration, self.logger)
 
             if chunker.should_chunk(duration):
-                self.logger.info(f"File duration {duration:.1f}s exceeds chunk size {self.chunk_duration}s, using chunked processing")
+                self.logger.info(
+                    f"File duration {duration:.1f}s exceeds chunk size {self.chunk_duration}s, using chunked processing"
+                )
                 return self._process_with_chunking(audio_file_path, custom_output_names)
 
         # Log the start of the separation process
-        self.logger.info(f"Starting separation process for audio_file_path: {audio_file_path}")
+        self.logger.info(
+            f"Starting separation process for audio_file_path: {audio_file_path}"
+        )
         separate_start_time = time.perf_counter()
 
         # Log normalization and amplification thresholds
-        self.logger.debug(f"Normalization threshold set to {self.normalization_threshold}, waveform will be lowered to this max amplitude to avoid clipping.")
-        self.logger.debug(f"Amplification threshold set to {self.amplification_threshold}, waveform will be scaled up to this max amplitude if below it.")
+        self.logger.debug(
+            f"Normalization threshold set to {self.normalization_threshold}, waveform will be lowered to this max amplitude to avoid clipping."
+        )
+        self.logger.debug(
+            f"Amplification threshold set to {self.amplification_threshold}, waveform will be scaled up to this max amplitude if below it."
+        )
 
         # Run separation method for the loaded model with autocast enabled if supported by the device
         output_files = None
-        if self.use_autocast and autocast_mode.is_autocast_available(self.torch_device.type):
+        if self.use_autocast and autocast_mode.is_autocast_available(
+            self.torch_device.type
+        ):
             self.logger.debug("Autocast available.")
             with autocast_mode.autocast(self.torch_device.type):
-                output_files = self.model_instance.separate(audio_file_path, custom_output_names)
+                output_files = self.model_instance.separate(
+                    audio_file_path, custom_output_names
+                )
         else:
             self.logger.debug("Autocast unavailable.")
-            output_files = self.model_instance.separate(audio_file_path, custom_output_names)
+            output_files = self.model_instance.separate(
+                audio_file_path, custom_output_names
+            )
 
         # Clear GPU cache to free up memory
         self.model_instance.clear_gpu_cache()
@@ -1040,7 +1426,9 @@ def _separate_file(self, audio_file_path, custom_output_names=None):
 
         # Log the completion of the separation process
         self.logger.debug("Separation process completed.")
-        self.logger.info(f'Separation duration: {time.strftime("%H:%M:%S", time.gmtime(int(time.perf_counter() - separate_start_time)))}')
+        self.logger.info(
+            f"Separation duration: {time.strftime('%H:%M:%S', time.gmtime(int(time.perf_counter() - separate_start_time)))}"
+        )
 
         return output_files
 
@@ -1076,7 +1464,9 @@ def _process_with_chunking(self, audio_file_path, custom_output_names=None):
             processed_chunks_by_stem = {}
 
             for i, chunk_path in enumerate(chunk_paths):
-                self.logger.info(f"Processing chunk {i+1}/{len(chunk_paths)}: {chunk_path}")
+                self.logger.info(
+                    f"Processing chunk {i + 1}/{len(chunk_paths)}: {chunk_path}"
+                )
 
                 original_chunk_duration = self.chunk_duration
                 original_output_dir = self.output_dir
@@ -1094,24 +1484,36 @@ def _process_with_chunking(self, audio_file_path, custom_output_names=None):
                     for stem_path in output_files:
                         # Extract stem name from filename: "chunk_0000_(Vocals).wav" → "Vocals"
                         filename = os.path.basename(stem_path)
-                        match = re.search(r'_\(([^)]+)\)', filename)
+                        match = re.search(r"_\(([^)]+)\)", filename)
                         if match:
                             stem_name = match.group(1)
                         else:
                             # Fallback: use index-based name if pattern not found
-                            stem_index = len([k for k in processed_chunks_by_stem.keys() if k.startswith('stem_')])
+                            stem_index = len(
+                                [
+                                    k
+                                    for k in processed_chunks_by_stem.keys()
+                                    if k.startswith("stem_")
+                                ]
+                            )
                             stem_name = f"stem_{stem_index}"
-                            self.logger.warning(f"Could not extract stem name from {filename}, using {stem_name}")
+                            self.logger.warning(
+                                f"Could not extract stem name from {filename}, using {stem_name}"
+                            )
 
                         if stem_name not in processed_chunks_by_stem:
                             processed_chunks_by_stem[stem_name] = []
 
                         # Ensure absolute path
-                        abs_path = stem_path if os.path.isabs(stem_path) else os.path.join(temp_dir, stem_path)
+                        abs_path = (
+                            stem_path
+                            if os.path.isabs(stem_path)
+                            else os.path.join(temp_dir, stem_path)
+                        )
                         processed_chunks_by_stem[stem_name].append(abs_path)
 
                     if not output_files:
-                        self.logger.warning(f"Chunk {i+1} produced no output files")
+                        self.logger.warning(f"Chunk {i + 1} produced no output files")
 
                 finally:
                     self.chunk_duration = original_chunk_duration
@@ -1140,13 +1542,19 @@ def _process_with_chunking(self, audio_file_path, custom_output_names=None):
                 else:
                     output_filename = f"{base_name}_({stem_name})"
 
-                output_path = os.path.join(self.output_dir, f"{output_filename}.{self.output_format.lower()}")
+                output_path = os.path.join(
+                    self.output_dir, f"{output_filename}.{self.output_format.lower()}"
+                )
 
-                self.logger.info(f"Merging {len(chunk_paths_for_stem)} chunks for stem: {stem_name}")
+                self.logger.info(
+                    f"Merging {len(chunk_paths_for_stem)} chunks for stem: {stem_name}"
+                )
                 chunker.merge_chunks(chunk_paths_for_stem, output_path)
                 output_files.append(output_path)
 
-            self.logger.info(f"Chunked processing completed. Output files: {output_files}")
+            self.logger.info(
+                f"Chunked processing completed. Output files: {output_files}"
+            )
             return output_files
 
         finally:
@@ -1161,7 +1569,13 @@ def download_model_and_data(self, model_filename):
         """
         self.logger.info(f"Downloading model {model_filename}...")
 
-        model_filename, model_type, model_friendly_name, model_path, yaml_config_filename = self.download_model_files(model_filename)
+        (
+            model_filename,
+            model_type,
+            model_friendly_name,
+            model_path,
+            yaml_config_filename,
+        ) = self.download_model_files(model_filename)
 
         if model_path.lower().endswith(".yaml"):
             yaml_config_filename = model_path
@@ -1173,7 +1587,9 @@ def download_model_and_data(self, model_filename):
 
         model_data_dict_size = len(model_data)
 
-        self.logger.info(f"Model downloaded, type: {model_type}, friendly name: {model_friendly_name}, model_path: {model_path}, model_data: {model_data_dict_size} items")
+        self.logger.info(
+            f"Model downloaded, type: {model_type}, friendly name: {model_friendly_name}, model_path: {model_path}, model_data: {model_data_dict_size} items"
+        )
 
     def get_simplified_model_list(self, filter_sort_by: Optional[str] = None):
         """
@@ -1216,7 +1632,12 @@ def get_simplified_model_list(self, filter_sort_by: Optional[str] = None):
                     stems_with_scores = ["Unknown"]
                     stem_sdr_dict["unknown"] = None
 
-                simplified_list[filename] = {"Name": name, "Type": model_type, "Stems": stems_with_scores, "SDR": stem_sdr_dict}
+                simplified_list[filename] = {
+                    "Name": name,
+                    "Type": model_type,
+                    "Stems": stems_with_scores,
+                    "SDR": stem_sdr_dict,
+                }
 
         # Sort and filter the list if a sort_by parameter is provided
         if filter_sort_by:
@@ -1228,12 +1649,19 @@ def get_simplified_model_list(self, filter_sort_by: Optional[str] = None):
                 # Convert sort_by to lowercase for case-insensitive comparison
                 sort_by_lower = filter_sort_by.lower()
                 # Filter out models that don't have the specified stem
-                filtered_list = {k: v for k, v in simplified_list.items() if sort_by_lower in v["SDR"]}
+                filtered_list = {
+                    k: v
+                    for k, v in simplified_list.items()
+                    if sort_by_lower in v["SDR"]
+                }
 
                 # Sort by SDR score if available, putting None values last
                 def sort_key(item):
                     sdr = item[1]["SDR"][sort_by_lower]
-                    return (0 if sdr is None else 1, sdr if sdr is not None else float("-inf"))
+                    return (
+                        0 if sdr is None else 1,
+                        sdr if sdr is not None else float("-inf"),
+                    )
 
                 return dict(sorted(filtered_list.items(), key=sort_key, reverse=True))
 
@@ -1286,7 +1714,7 @@ def _separate_ensemble(self, audio_file_path, custom_output_names=None):
                         model_stem_names = []
                         for stem_path in model_stems:
                             filename = os.path.basename(stem_path)
-                            match = re.search(r'_\(([^)]+)\)', filename)
+                            match = re.search(r"_\(([^)]+)\)", filename)
                             stem_name = match.group(1) if match else "Unknown"
                             model_stem_names.append(stem_name)
 
@@ -1297,15 +1725,27 @@ def _separate_ensemble(self, audio_file_path, custom_output_names=None):
                             for s in model_stem_names
                         )
 
-                        for stem_path, raw_stem_name in zip(model_stems, model_stem_names):
+                        for stem_path, raw_stem_name in zip(
+                            model_stems, model_stem_names
+                        ):
                             lower_name = raw_stem_name.lower()
 
-                            if "vocal" in lower_name and "lead" not in lower_name and "backing" not in lower_name:
+                            if (
+                                "vocal" in lower_name
+                                and "lead" not in lower_name
+                                and "backing" not in lower_name
+                            ):
                                 stem_name = "Vocals"
-                            elif lower_name == "other" and num_model_stems == 2 and has_vocal_stem:
+                            elif (
+                                lower_name == "other"
+                                and num_model_stems == 2
+                                and has_vocal_stem
+                            ):
                                 # For 2-stem models where one stem is vocals, "other" is the instrumental
                                 stem_name = "Instrumental"
-                                self.logger.debug(f"Mapped 'other' → 'Instrumental' for 2-stem model (model produced: {model_stem_names})")
+                                self.logger.debug(
+                                    f"Mapped 'other' → 'Instrumental' for 2-stem model (model produced: {model_stem_names})"
+                                )
                             elif lower_name in STEM_NAME_MAP:
                                 stem_name = STEM_NAME_MAP[lower_name]
                             else:
@@ -1314,17 +1754,25 @@ def _separate_ensemble(self, audio_file_path, custom_output_names=None):
                             if stem_name not in stems_by_type:
                                 stems_by_type[stem_name] = []
 
-                            abs_path = stem_path if os.path.isabs(stem_path) else os.path.join(temp_dir, stem_path)
+                            abs_path = (
+                                stem_path
+                                if os.path.isabs(stem_path)
+                                else os.path.join(temp_dir, stem_path)
+                            )
                             stems_by_type[stem_name].append(abs_path)
                     finally:
                         self.output_dir = original_output_dir
 
                 # Perform ensembling for each stem type
-                ensembler = Ensembler(self.logger, self.ensemble_algorithm, self.ensemble_weights)
+                ensembler = Ensembler(
+                    self.logger, self.ensemble_algorithm, self.ensemble_weights
+                )
                 base_name = os.path.splitext(os.path.basename(path))[0]
 
                 for stem_name, stem_paths in stems_by_type.items():
-                    self.logger.info(f"Ensembling {len(stem_paths)} stems for type: {stem_name}")
+                    self.logger.info(
+                        f"Ensembling {len(stem_paths)} stems for type: {stem_name}"
+                    )
 
                     waveforms = []
                     original_channels = None
@@ -1348,7 +1796,9 @@ def _separate_ensemble(self, audio_file_path, custom_output_names=None):
                     if custom_output_names and stem_name in custom_output_names:
                         output_filename = custom_output_names[stem_name]
                     elif self.ensemble_preset:
-                        output_filename = f"{base_name}_({stem_name})_preset_{self.ensemble_preset}"
+                        output_filename = (
+                            f"{base_name}_({stem_name})_preset_{self.ensemble_preset}"
+                        )
                     else:
                         # Build descriptive name from model filenames
                         model_slugs = []
@@ -1356,13 +1806,22 @@ def _separate_ensemble(self, audio_file_path, custom_output_names=None):
                             # Remove extension, then truncate to keep filenames reasonable
                             name = os.path.splitext(mf)[0]
                             # Remove common verbose prefixes
-                            for prefix in ["mel_band_roformer_", "melband_roformer_", "bs_roformer_", "model_bs_roformer_", "UVR-MDX-NET-", "UVR_MDXNET_"]:
+                            for prefix in [
+                                "mel_band_roformer_",
+                                "melband_roformer_",
+                                "bs_roformer_",
+                                "model_bs_roformer_",
+                                "UVR-MDX-NET-",
+                                "UVR_MDXNET_",
+                            ]:
                                 if name.startswith(prefix):
-                                    name = name[len(prefix):]
+                                    name = name[len(prefix) :]
                                     break
                             model_slugs.append(name[:12])
                         slugs_str = "_".join(model_slugs)
-                        output_filename = f"{base_name}_({stem_name})_custom_ensemble_{slugs_str}"
+                        output_filename = (
+                            f"{base_name}_({stem_name})_custom_ensemble_{slugs_str}"
+                        )
 
                     output_path = f"{output_filename}.{self.output_format.lower()}"
 
@@ -1378,18 +1837,30 @@ def _separate_ensemble(self, audio_file_path, custom_output_names=None):
                         output_files.append(final_output_path)
                     else:
                         # Fallback writer if no model instance is available
-                        self.logger.warning(f"No model instance available to write ensembled audio. Using fallback writer for {output_path}")
+                        self.logger.warning(
+                            f"No model instance available to write ensembled audio. Using fallback writer for {output_path}"
+                        )
                         final_output_path = os.path.join(self.output_dir, output_path)
 
                         import soundfile as sf
 
                         try:
-                            self.logger.debug(f"Attempting to write ensembled audio to {final_output_path}...")
-                            sf.write(final_output_path, ensembled_wav.T, self.sample_rate)
+                            self.logger.debug(
+                                f"Attempting to write ensembled audio to {final_output_path}..."
+                            )
+                            sf.write(
+                                final_output_path, ensembled_wav.T, self.sample_rate
+                            )
                         except Exception as e:
-                            self.logger.error(f"Error writing {self.output_format} format: {e}. Falling back to WAV.")
-                            final_output_path = final_output_path.rsplit(".", 1)[0] + ".wav"
-                            sf.write(final_output_path, ensembled_wav.T, self.sample_rate)
+                            self.logger.error(
+                                f"Error writing {self.output_format} format: {e}. Falling back to WAV."
+                            )
+                            final_output_path = (
+                                final_output_path.rsplit(".", 1)[0] + ".wav"
+                            )
+                            sf.write(
+                                final_output_path, ensembled_wav.T, self.sample_rate
+                            )
 
                         output_files.append(final_output_path)
 
diff --git a/audio_separator/separator/uvr_lib_v5/demucs/hdemucs_rocm.py b/audio_separator/separator/uvr_lib_v5/demucs/hdemucs_rocm.py
new file mode 100644
index 00000000..1c95d545
--- /dev/null
+++ b/audio_separator/separator/uvr_lib_v5/demucs/hdemucs_rocm.py
@@ -0,0 +1,820 @@
+from copy import deepcopy
+import math
+import typing as tp
+import torch
+from torch import nn
+from torch.nn import functional as F
+from .filtering import wiener
+from .demucs import DConv, rescale_module
+from .states import capture_init
+from .spec import spectro, ispectro
+
+
+def pad1d(
+    x: torch.Tensor,
+    paddings: tp.Tuple[int, int],
+    mode: str = "constant",
+    value: float = 0.0,
+):
+    """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
+    If this is the case, we insert extra 0 padding to the right before the reflection happen."""
+    x0 = x
+    length = x.shape[-1]
+    padding_left, padding_right = paddings
+    if mode == "reflect":
+        max_pad = max(padding_left, padding_right)
+        if length <= max_pad:
+            extra_pad = max_pad - length + 1
+            extra_pad_right = min(padding_right, extra_pad)
+            extra_pad_left = extra_pad - extra_pad_right
+            paddings = (padding_left - extra_pad_left, padding_right - extra_pad_right)
+            x = F.pad(x, (extra_pad_left, extra_pad_right))
+    out = F.pad(x, paddings, mode, value)
+    assert out.shape[-1] == length + padding_left + padding_right
+    assert (out[..., padding_left : padding_left + length] == x0).all()
+    return out
+
+
+class ScaledEmbedding(nn.Module):
+    """
+    Boost learning rate for embeddings (with `scale`).
+    Also, can make embeddings continuous with `smooth`.
+    """
+
+    def __init__(
+        self, num_embeddings: int, embedding_dim: int, scale: float = 10.0, smooth=False
+    ):
+        super().__init__()
+        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
+        if smooth:
+            weight = torch.cumsum(self.embedding.weight.data, dim=0)
+            # when summing gaussian, overscale raises as sqrt(n), so we normalize by that.
+            weight = (
+                weight / torch.arange(1, num_embeddings + 1).to(weight).sqrt()[:, None]
+            )
+            self.embedding.weight.data[:] = weight
+        self.embedding.weight.data /= scale
+        self.scale = scale
+
+    @property
+    def weight(self):
+        return self.embedding.weight * self.scale
+
+    def forward(self, x):
+        out = self.embedding(x) * self.scale
+        return out
+
+
+class HEncLayer(nn.Module):
+    def __init__(
+        self,
+        chin,
+        chout,
+        kernel_size=8,
+        stride=4,
+        norm_groups=1,
+        empty=False,
+        freq=True,
+        dconv=True,
+        norm=True,
+        context=0,
+        dconv_kw={},
+        pad=True,
+        rewrite=True,
+    ):
+        """Encoder layer. This used both by the time and the frequency branch.
+
+        Args:
+            chin: number of input channels.
+            chout: number of output channels.
+            norm_groups: number of groups for group norm.
+            empty: used to make a layer with just the first conv. this is used
+                before merging the time and freq. branches.
+            freq: this is acting on frequencies.
+            dconv: insert DConv residual branches.
+            norm: use GroupNorm.
+            context: context size for the 1x1 conv.
+            dconv_kw: list of kwargs for the DConv class.
+            pad: pad the input. Padding is done so that the output size is
+                always the input size / stride.
+            rewrite: add 1x1 conv at the end of the layer.
+        """
+        super().__init__()
+        norm_fn = lambda d: nn.Identity()  # noqa
+        if norm:
+            norm_fn = lambda d: nn.GroupNorm(norm_groups, d)  # noqa
+        if pad:
+            pad = kernel_size // 4
+        else:
+            pad = 0
+        klass = nn.Conv1d
+        self.freq = freq
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.empty = empty
+        self.norm = norm
+        self.pad = pad
+        if freq:
+            kernel_size = [kernel_size, 1]
+            stride = [stride, 1]
+            pad = [pad, 0]
+            klass = nn.Conv2d
+        self.conv = klass(chin, chout, kernel_size, stride, pad)
+        if self.empty:
+            return
+        self.norm1 = norm_fn(chout)
+        self.rewrite = None
+        if rewrite:
+            self.rewrite = klass(chout, 2 * chout, 1 + 2 * context, 1, context)
+            self.norm2 = norm_fn(2 * chout)
+
+        self.dconv = None
+        if dconv:
+            self.dconv = DConv(chout, **dconv_kw)
+
+    def forward(self, x, inject=None):
+        """
+        `inject` is used to inject the result from the time branch into the frequency branch,
+        when both have the same stride.
+        """
+        if not self.freq and x.dim() == 4:
+            B, C, Fr, T = x.shape
+            x = x.view(B, -1, T)
+
+        if not self.freq:
+            le = x.shape[-1]
+            if not le % self.stride == 0:
+                x = F.pad(x, (0, self.stride - (le % self.stride)))
+        y = self.conv(x)
+        if self.empty:
+            return y
+        if inject is not None:
+            assert inject.shape[-1] == y.shape[-1], (inject.shape, y.shape)
+            if inject.dim() == 3 and y.dim() == 4:
+                inject = inject[:, :, None]
+            y = y + inject
+        y = F.gelu(self.norm1(y))
+        if self.dconv:
+            if self.freq:
+                B, C, Fr, T = y.shape
+                y = y.permute(0, 2, 1, 3).reshape(-1, C, T)
+            y = self.dconv(y)
+            if self.freq:
+                y = y.view(B, Fr, C, T).permute(0, 2, 1, 3)
+        if self.rewrite:
+            z = self.norm2(self.rewrite(y))
+            z = F.glu(z, dim=1)
+        else:
+            z = y
+        return z
+
+
+class MultiWrap(nn.Module):
+    """
+    Takes one layer and replicate it N times. each replica will act
+    on a frequency band. All is done so that if the N replica have the same weights,
+    then this is exactly equivalent to applying the original module on all frequencies.
+
+    This is a bit over-engineered to avoid edge artifacts when splitting
+    the frequency bands, but it is possible the naive implementation would work as well...
+    """
+
+    def __init__(self, layer, split_ratios):
+        """
+        Args:
+            layer: module to clone, must be either HEncLayer or HDecLayer.
+            split_ratios: list of float indicating which ratio to keep for each band.
+        """
+        super().__init__()
+        self.split_ratios = split_ratios
+        self.layers = nn.ModuleList()
+        self.conv = isinstance(layer, HEncLayer)
+        assert not layer.norm
+        assert layer.freq
+        assert layer.pad
+        if not self.conv:
+            assert not layer.context_freq
+        for _ in range(len(split_ratios) + 1):
+            lay = deepcopy(layer)
+            if self.conv:
+                lay.conv.padding = (0, 0)
+            else:
+                lay.pad = False
+            for m in lay.modules():
+                if hasattr(m, "reset_parameters"):
+                    m.reset_parameters()
+            self.layers.append(lay)
+
+    def forward(self, x, skip=None, length=None):
+        B, C, Fr, T = x.shape
+
+        ratios = list(self.split_ratios) + [1]
+        start = 0
+        outs = []
+        for ratio, layer in zip(ratios, self.layers, strict=True):
+            if self.conv:
+                pad = layer.kernel_size // 4
+                if ratio == 1:
+                    limit = Fr
+                    frames = -1
+                else:
+                    limit = int(round(Fr * ratio))
+                    le = limit - start
+                    if start == 0:
+                        le += pad
+                    frames = round((le - layer.kernel_size) / layer.stride + 1)
+                    limit = start + (frames - 1) * layer.stride + layer.kernel_size
+                    if start == 0:
+                        limit -= pad
+                assert limit - start > 0, (limit, start)
+                assert limit <= Fr, (limit, Fr)
+                y = x[:, :, start:limit, :]
+                if start == 0:
+                    y = F.pad(y, (0, 0, pad, 0))
+                if ratio == 1:
+                    y = F.pad(y, (0, 0, 0, pad))
+                outs.append(layer(y))
+                start = limit - layer.kernel_size + layer.stride
+            else:
+                if ratio == 1:
+                    limit = Fr
+                else:
+                    limit = int(round(Fr * ratio))
+                last = layer.last
+                layer.last = True
+
+                y = x[:, :, start:limit]
+                s = skip[:, :, start:limit]
+                out, _ = layer(y, s, None)
+                if outs:
+                    outs[-1][:, :, -layer.stride :] += out[
+                        :, :, : layer.stride
+                    ] - layer.conv_tr.bias.view(1, -1, 1, 1)
+                    out = out[:, :, layer.stride :]
+                if ratio == 1:
+                    out = out[:, :, : -layer.stride // 2, :]
+                if start == 0:
+                    out = out[:, :, layer.stride // 2 :, :]
+                outs.append(out)
+                layer.last = last
+                start = limit
+        out = torch.cat(outs, dim=2)
+        if not self.conv and not last:
+            out = F.gelu(out)
+        if self.conv:
+            return out
+        else:
+            return out, None
+
+
+class HDecLayer(nn.Module):
+    def __init__(
+        self,
+        chin,
+        chout,
+        last=False,
+        kernel_size=8,
+        stride=4,
+        norm_groups=1,
+        empty=False,
+        freq=True,
+        dconv=True,
+        norm=True,
+        context=1,
+        dconv_kw={},
+        pad=True,
+        context_freq=True,
+        rewrite=True,
+    ):
+        """
+        Same as HEncLayer but for decoder. See `HEncLayer` for documentation.
+        """
+        super().__init__()
+        norm_fn = lambda d: nn.Identity()  # noqa
+        if norm:
+            norm_fn = lambda d: nn.GroupNorm(norm_groups, d)  # noqa
+        if pad:
+            pad = kernel_size // 4
+        else:
+            pad = 0
+        self.pad = pad
+        self.last = last
+        self.freq = freq
+        self.chin = chin
+        self.empty = empty
+        self.stride = stride
+        self.kernel_size = kernel_size
+        self.norm = norm
+        self.context_freq = context_freq
+        klass = nn.Conv1d
+        klass_tr = nn.ConvTranspose1d
+        if freq:
+            kernel_size = [kernel_size, 1]
+            stride = [stride, 1]
+            klass = nn.Conv2d
+            klass_tr = nn.ConvTranspose2d
+        self.conv_tr = klass_tr(chin, chout, kernel_size, stride)
+        self.norm2 = norm_fn(chout)
+        if self.empty:
+            return
+        self.rewrite = None
+        if rewrite:
+            if context_freq:
+                self.rewrite = klass(chin, 2 * chin, 1 + 2 * context, 1, context)
+            else:
+                self.rewrite = klass(
+                    chin, 2 * chin, [1, 1 + 2 * context], 1, [0, context]
+                )
+            self.norm1 = norm_fn(2 * chin)
+
+        self.dconv = None
+        if dconv:
+            self.dconv = DConv(chin, **dconv_kw)
+
+    def forward(self, x, skip, length):
+        if self.freq and x.dim() == 3:
+            B, C, T = x.shape
+            x = x.view(B, self.chin, -1, T)
+
+        if not self.empty:
+            x = x + skip
+
+            if self.rewrite:
+                y = F.glu(self.norm1(self.rewrite(x)), dim=1)
+            else:
+                y = x
+            if self.dconv:
+                if self.freq:
+                    B, C, Fr, T = y.shape
+                    y = y.permute(0, 2, 1, 3).reshape(-1, C, T)
+                y = self.dconv(y)
+                if self.freq:
+                    y = y.view(B, Fr, C, T).permute(0, 2, 1, 3)
+        else:
+            y = x
+            assert skip is None
+        z = self.norm2(self.conv_tr(y))
+        if self.freq:
+            if self.pad:
+                z = z[..., self.pad : -self.pad, :]
+        else:
+            z = z[..., self.pad : self.pad + length]
+            assert z.shape[-1] == length, (z.shape[-1], length)
+        if not self.last:
+            z = F.gelu(z)
+        return z, y
+
+
+class HDemucsROCm(nn.Module):
+    """
+    Spectrogram and hybrid Demucs model with ROCm support.
+    Modified to remove CPU forcing for complex number operations.
+    """
+
+    @capture_init
+    def __init__(
+        self,
+        sources,
+        # Channels
+        audio_channels=2,
+        channels=48,
+        channels_time=None,
+        growth=2,
+        # STFT
+        nfft=4096,
+        wiener_iters=0,
+        end_iters=0,
+        wiener_residual=False,
+        cac=True,
+        # Main structure
+        depth=6,
+        rewrite=True,
+        hybrid=True,
+        hybrid_old=False,
+        # Frequency branch
+        multi_freqs=None,
+        multi_freqs_depth=2,
+        freq_emb=0.2,
+        emb_scale=10,
+        emb_smooth=True,
+        # Convolutions
+        kernel_size=8,
+        time_stride=2,
+        stride=4,
+        context=1,
+        context_enc=0,
+        # Normalization
+        norm_starts=4,
+        norm_groups=4,
+        # DConv residual branch
+        dconv_mode=1,
+        dconv_depth=2,
+        dconv_comp=4,
+        dconv_attn=4,
+        dconv_lstm=4,
+        dconv_init=1e-4,
+        # Weight init
+        rescale=0.1,
+        # Metadata
+        samplerate=44100,
+        segment=4 * 10,
+    ):
+        """
+        Args:
+            sources (list[str]): list of source names.
+            audio_channels (int): input/output audio channels.
+            channels (int): initial number of hidden channels.
+            channels_time: if not None, use a different `channels` value for the time branch.
+            growth: increase the number of hidden channels by this factor at each layer.
+            nfft: number of fft bins. Note that changing this require careful computation of
+                various shape parameters and will not work out of the box for hybrid models.
+            wiener_iters: when using Wiener filtering, number of iterations at test time.
+            end_iters: same but at train time. For a hybrid model, must be equal to `wiener_iters`.
+            wiener_residual: add residual source before wiener filtering.
+            cac: uses complex as channels, i.e. complex numbers are 2 channels each
+                in input and output. no further processing is done before ISTFT.
+            depth (int): number of layers in the encoder and in the decoder.
+            rewrite (bool): add 1x1 convolution to each layer.
+            hybrid (bool): make a hybrid time/frequency domain, otherwise frequency only.
+            hybrid_old: some models trained for MDX had a padding bug. This replicates
+                this bug to avoid retraining them.
+            multi_freqs: list of frequency ratios for splitting frequency bands with `MultiWrap`.
+            multi_freqs_depth: how many layers to wrap with `MultiWrap`. Only the outermost
+                layers will be wrapped.
+            freq_emb: add frequency embedding after the first frequency layer if > 0,
+                the actual value controls the weight of the embedding.
+            emb_scale: equivalent to scaling the embedding learning rate
+            emb_smooth: initialize the embedding with a smooth one (with respect to frequencies).
+            kernel_size: kernel_size for encoder and decoder layers.
+            stride: stride for encoder and decoder layers.
+            time_stride: stride for the final time layer, after the merge.
+            context: context for 1x1 conv in the decoder.
+            context_enc: context for 1x1 conv in the encoder.
+            norm_starts: layer at which group norm starts being used.
+                decoder layers are numbered in reverse order.
+            norm_groups: number of groups for group norm.
+            dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both.
+            dconv_depth: depth of residual DConv branch.
+            dconv_comp: compression of DConv branch.
+            dconv_attn: adds attention layers in DConv branch starting at this layer.
+            dconv_lstm: adds a LSTM layer in DConv branch starting at this layer.
+            dconv_init: initial scale for the DConv branch LayerScale.
+            rescale: weight recaling trick
+
+        """
+        super().__init__()
+
+        self.cac = cac
+        self.wiener_residual = wiener_residual
+        self.audio_channels = audio_channels
+        self.sources = sources
+        self.kernel_size = kernel_size
+        self.context = context
+        self.stride = stride
+        self.depth = depth
+        self.channels = channels
+        self.samplerate = samplerate
+        self.segment = segment
+
+        self.nfft = nfft
+        self.hop_length = nfft // 4
+        self.wiener_iters = wiener_iters
+        self.end_iters = end_iters
+        self.freq_emb = None
+        self.hybrid = hybrid
+        self.hybrid_old = hybrid_old
+        if hybrid_old:
+            assert hybrid, "hybrid_old must come with hybrid=True"
+        if hybrid:
+            assert wiener_iters == end_iters
+
+        self.encoder = nn.ModuleList()
+        self.decoder = nn.ModuleList()
+
+        if hybrid:
+            self.tencoder = nn.ModuleList()
+            self.tdecoder = nn.ModuleList()
+
+        chin = audio_channels
+        chin_z = chin  # number of channels for the freq branch
+        if self.cac:
+            chin_z *= 2
+        chout = channels_time or channels
+        chout_z = channels
+        freqs = nfft // 2
+
+        for index in range(depth):
+            lstm = index >= dconv_lstm
+            attn = index >= dconv_attn
+            norm = index >= norm_starts
+            freq = freqs > 1
+            stri = stride
+            ker = kernel_size
+            if not freq:
+                assert freqs == 1
+                ker = time_stride * 2
+                stri = time_stride
+
+            pad = True
+            last_freq = False
+            if freq and freqs <= kernel_size:
+                ker = freqs
+                pad = False
+                last_freq = True
+
+            kw = {
+                "kernel_size": ker,
+                "stride": stri,
+                "freq": freq,
+                "pad": pad,
+                "norm": norm,
+                "rewrite": rewrite,
+                "norm_groups": norm_groups,
+                "dconv_kw": {
+                    "lstm": lstm,
+                    "attn": attn,
+                    "depth": dconv_depth,
+                    "compress": dconv_comp,
+                    "init": dconv_init,
+                    "gelu": True,
+                },
+            }
+            kwt = dict(kw)
+            kwt["freq"] = 0
+            kwt["kernel_size"] = kernel_size
+            kwt["stride"] = stride
+            kwt["pad"] = True
+            kw_dec = dict(kw)
+            multi = False
+            if multi_freqs and index < multi_freqs_depth:
+                multi = True
+                kw_dec["context_freq"] = False
+
+            if last_freq:
+                chout_z = max(chout, chout_z)
+                chout = chout_z
+
+            enc = HEncLayer(
+                chin_z, chout_z, dconv=dconv_mode & 1, context=context_enc, **kw
+            )
+            if hybrid and freq:
+                tenc = HEncLayer(
+                    chin,
+                    chout,
+                    dconv=dconv_mode & 1,
+                    context=context_enc,
+                    empty=last_freq,
+                    **kwt,
+                )
+                self.tencoder.append(tenc)
+
+            if multi:
+                enc = MultiWrap(enc, multi_freqs)
+            self.encoder.append(enc)
+            if index == 0:
+                chin = self.audio_channels * len(self.sources)
+                chin_z = chin
+                if self.cac:
+                    chin_z *= 2
+            dec = HDecLayer(
+                chout_z,
+                chin_z,
+                dconv=dconv_mode & 2,
+                last=index == 0,
+                context=context,
+                **kw_dec,
+            )
+            if multi:
+                dec = MultiWrap(dec, multi_freqs)
+            if hybrid and freq:
+                tdec = HDecLayer(
+                    chout,
+                    chin,
+                    dconv=dconv_mode & 2,
+                    empty=last_freq,
+                    last=index == 0,
+                    context=context,
+                    **kwt,
+                )
+                self.tdecoder.insert(0, tdec)
+            self.decoder.insert(0, dec)
+
+            chin = chout
+            chin_z = chout_z
+            chout = int(growth * chout)
+            chout_z = int(growth * chout_z)
+            if freq:
+                if freqs <= kernel_size:
+                    freqs = 1
+                else:
+                    freqs //= stride
+            if index == 0 and freq_emb:
+                self.freq_emb = ScaledEmbedding(
+                    freqs, chin_z, smooth=emb_smooth, scale=emb_scale
+                )
+                self.freq_emb_scale = freq_emb
+
+        if rescale:
+            rescale_module(self, reference=rescale)
+
+    def _spec(self, x):
+        hl = self.hop_length
+        nfft = self.nfft
+        x0 = x  # noqa
+
+        if self.hybrid:
+            # We re-pad the signal in order to keep the property
+            # that the size of the output is exactly the size of the input
+            # divided by the stride (here hop_length), when divisible.
+            # This is achieved by padding by 1/4th of the kernel size (here nfft).
+            # which is not supported by torch.stft.
+            # Having all convolution operations follow this convention allow to easily
+            # align the time and frequency branches later on.
+            assert hl == nfft // 4
+            le = math.ceil(x.shape[-1] / hl)
+            pad = hl // 2 * 3
+            if not self.hybrid_old:
+                x = pad1d(x, (pad, pad + le * hl - x.shape[-1]), mode="reflect")
+            else:
+                x = pad1d(x, (pad, pad + le * hl - x.shape[-1]))
+
+        z = spectro(x, nfft, hl)[..., :-1, :]
+        if self.hybrid:
+            assert z.shape[-1] == le + 4, (z.shape, x.shape, le)
+            z = z[..., 2 : 2 + le]
+        return z
+
+    def _ispec(self, z, length=None, scale=0):
+        hl = self.hop_length // (4**scale)
+        z = F.pad(z, (0, 0, 0, 1))
+        if self.hybrid:
+            z = F.pad(z, (2, 2))
+            pad = hl // 2 * 3
+            if not self.hybrid_old:
+                le = hl * int(math.ceil(length / hl)) + 2 * pad
+            else:
+                le = hl * int(math.ceil(length / hl))
+            x = ispectro(z, hl, length=le)
+            if not self.hybrid_old:
+                x = x[..., pad : pad + length]
+            else:
+                x = x[..., :length]
+        else:
+            x = ispectro(z, hl, length)
+        return x
+
+    def _magnitude(self, z):
+        # return the magnitude of the spectrogram, except when cac is True,
+        # in which case we just move the complex dimension to the channel one.
+        if self.cac:
+            B, C, Fr, T = z.shape
+            m = torch.view_as_real(z).permute(0, 1, 4, 2, 3)
+            m = m.reshape(B, C * 2, Fr, T)
+        else:
+            m = z.abs()
+        return m
+
+    def _mask(self, z, m):
+        # Apply masking given the mixture spectrogram `z` and the estimated mask `m`.
+        # If `cac` is True, `m` is actually a full spectrogram and `z` is ignored.
+        niters = self.wiener_iters
+        if self.cac:
+            B, S, C, Fr, T = m.shape
+            out = m.view(B, S, -1, 2, Fr, T).permute(0, 1, 2, 4, 5, 3)
+            out = torch.view_as_complex(out.contiguous())
+            return out
+        if self.training:
+            niters = self.end_iters
+        if niters < 0:
+            z = z[:, None]
+            return z / (1e-8 + z.abs()) * m
+        else:
+            return self._wiener(m, z, niters)
+
+    def _wiener(self, mag_out, mix_stft, niters):
+        # apply wiener filtering from OpenUnmix.
+        init = mix_stft.dtype
+        wiener_win_len = 300
+        residual = self.wiener_residual
+
+        B, S, C, Fq, T = mag_out.shape
+        mag_out = mag_out.permute(0, 4, 3, 2, 1)
+        mix_stft = torch.view_as_real(mix_stft.permute(0, 3, 2, 1))
+
+        outs = []
+        for sample in range(B):
+            pos = 0
+            out = []
+            for pos in range(0, T, wiener_win_len):
+                frame = slice(pos, pos + wiener_win_len)
+                z_out = wiener(
+                    mag_out[sample, frame],
+                    mix_stft[sample, frame],
+                    niters,
+                    residual=residual,
+                )
+                out.append(z_out.transpose(-1, -2))
+            outs.append(torch.cat(out, dim=0))
+        out = torch.view_as_complex(torch.stack(outs, 0))
+        out = out.permute(0, 4, 3, 2, 1).contiguous()
+        if residual:
+            out = out[:, :-1]
+        assert list(out.shape) == [B, S, C, Fq, T]
+        return out.to(init)
+
+    def forward(self, mix):
+        x = mix
+        length = x.shape[-1]
+
+        z = self._spec(mix)
+        mag = self._magnitude(z).to(mix.device)
+        x = mag
+
+        B, C, Fq, T = x.shape
+
+        # unlike previous Demucs, we always normalize because it is easier.
+        mean = x.mean(dim=(1, 2, 3), keepdim=True)
+        std = x.std(dim=(1, 2, 3), keepdim=True)
+        x = (x - mean) / (1e-5 + std)
+        # x will be the freq. branch input.
+
+        if self.hybrid:
+            # Prepare the time branch input.
+            xt = mix
+            meant = xt.mean(dim=(1, 2), keepdim=True)
+            stdt = xt.std(dim=(1, 2), keepdim=True)
+            xt = (xt - meant) / (1e-5 + stdt)
+
+        # okay, this is a giant mess I know...
+        saved = []  # skip connections, freq.
+        saved_t = []  # skip connections, time.
+        lengths = []  # saved lengths to properly remove padding, freq branch.
+        lengths_t = []  # saved lengths for time branch.
+        for idx, encode in enumerate(self.encoder):
+            lengths.append(x.shape[-1])
+            inject = None
+            if self.hybrid and idx < len(self.tencoder):
+                # we have not yet merged branches.
+                lengths_t.append(xt.shape[-1])
+                tenc = self.tencoder[idx]
+                xt = tenc(xt)
+                if not tenc.empty:
+                    # save for skip connection
+                    saved_t.append(xt)
+                else:
+                    # tenc contains just the first conv., so that now time and freq.
+                    # branches have the same shape and can be merged.
+                    inject = xt
+            x = encode(x, inject)
+            if idx == 0 and self.freq_emb is not None:
+                # add frequency embedding to allow for non equivariant convolutions
+                # over the frequency axis.
+                frs = torch.arange(x.shape[-2], device=x.device)
+                emb = self.freq_emb(frs).t()[None, :, :, None].expand_as(x)
+                x = x + self.freq_emb_scale * emb
+
+            saved.append(x)
+
+        x = torch.zeros_like(x)
+        if self.hybrid:
+            xt = torch.zeros_like(x)
+        # initialize everything to zero (signal will go through u-net skips).
+
+        for idx, decode in enumerate(self.decoder):
+            skip = saved.pop(-1)
+            x, pre = decode(x, skip, lengths.pop(-1))
+            # `pre` contains the output just before final transposed convolution,
+            # which is used when the freq. and time branch separate.
+
+            if self.hybrid:
+                offset = self.depth - len(self.tdecoder)
+            if self.hybrid and idx >= offset:
+                tdec = self.tdecoder[idx - offset]
+                length_t = lengths_t.pop(-1)
+                if tdec.empty:
+                    assert pre.shape[2] == 1, pre.shape
+                    pre = pre[:, :, 0]
+                    xt, _ = tdec(pre, None, length_t)
+                else:
+                    skip = saved_t.pop(-1)
+                    xt, _ = tdec(xt, skip, length_t)
+
+        # Let's make sure we used all stored skip connections.
+        assert len(saved) == 0
+        assert len(lengths_t) == 0
+        assert len(saved_t) == 0
+
+        S = len(self.sources)
+        x = x.view(B, S, -1, Fq, T)
+        x = x * std[:, None] + mean[:, None]
+
+        # Modified: Remove CPU forcing for ROCm devices
+        # Keep everything on the original device
+        zout = self._mask(z, x)
+        x = self._ispec(zout, length)
+
+        if self.hybrid:
+            xt = xt.view(B, S, -1, length)
+            xt = xt * stdt[:, None] + meant[:, None]
+            x = xt + x
+        return x
diff --git a/audio_separator/separator/uvr_lib_v5/roformer/bs_roformer.py b/audio_separator/separator/uvr_lib_v5/roformer/bs_roformer.py
index 8d966710..630de338 100644
--- a/audio_separator/separator/uvr_lib_v5/roformer/bs_roformer.py
+++ b/audio_separator/separator/uvr_lib_v5/roformer/bs_roformer.py
@@ -59,14 +59,23 @@ class FeedForward(Module):
     def __init__(self, dim, mult=4, dropout=0.0):
         super().__init__()
         dim_inner = int(dim * mult)
-        self.net = nn.Sequential(RMSNorm(dim), nn.Linear(dim, dim_inner), nn.GELU(), nn.Dropout(dropout), nn.Linear(dim_inner, dim), nn.Dropout(dropout))
+        self.net = nn.Sequential(
+            RMSNorm(dim),
+            nn.Linear(dim, dim_inner),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(dim_inner, dim),
+            nn.Dropout(dropout),
+        )
 
     def forward(self, x):
         return self.net(x)
 
 
 class Attention(Module):
-    def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embed=None, flash=True):
+    def __init__(
+        self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embed=None, flash=True
+    ):
         super().__init__()
         self.heads = heads
         self.scale = dim_head**-0.5
@@ -81,12 +90,16 @@ def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embed=None, fl
 
         self.to_gates = nn.Linear(dim, heads)
 
-        self.to_out = nn.Sequential(nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout))
+        self.to_out = nn.Sequential(
+            nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout)
+        )
 
     def forward(self, x):
         x = self.norm(x)
 
-        q, k, v = rearrange(self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads)
+        q, k, v = rearrange(
+            self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads
+        )
 
         if exists(self.rotary_embed):
             q = self.rotary_embed.rotate_queries_or_keys(q)
@@ -112,13 +125,18 @@ def __init__(self, *, dim, dim_head=32, heads=8, scale=8, flash=False, dropout=0
         dim_inner = dim_head * heads
         self.norm = RMSNorm(dim)
 
-        self.to_qkv = nn.Sequential(nn.Linear(dim, dim_inner * 3, bias=False), Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads))
+        self.to_qkv = nn.Sequential(
+            nn.Linear(dim, dim_inner * 3, bias=False),
+            Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads),
+        )
 
         self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
 
         self.attend = Attend(scale=scale, dropout=dropout, flash=flash)
 
-        self.to_out = nn.Sequential(Rearrange("b h d n -> b n (h d)"), nn.Linear(dim_inner, dim, bias=False))
+        self.to_out = nn.Sequential(
+            Rearrange("b h d n -> b n (h d)"), nn.Linear(dim_inner, dim, bias=False)
+        )
 
     def forward(self, x):
         x = self.norm(x)
@@ -134,17 +152,48 @@ def forward(self, x):
 
 
 class Transformer(Module):
-    def __init__(self, *, dim, depth, dim_head=64, heads=8, attn_dropout=0.0, ff_dropout=0.0, ff_mult=4, norm_output=True, rotary_embed=None, flash_attn=True, linear_attn=False):
+    def __init__(
+        self,
+        *,
+        dim,
+        depth,
+        dim_head=64,
+        heads=8,
+        attn_dropout=0.0,
+        ff_dropout=0.0,
+        ff_mult=4,
+        norm_output=True,
+        rotary_embed=None,
+        flash_attn=True,
+        linear_attn=False,
+    ):
         super().__init__()
         self.layers = ModuleList([])
 
         for _ in range(depth):
             if linear_attn:
-                attn = LinearAttention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, flash=flash_attn)
+                attn = LinearAttention(
+                    dim=dim,
+                    dim_head=dim_head,
+                    heads=heads,
+                    dropout=attn_dropout,
+                    flash=flash_attn,
+                )
             else:
-                attn = Attention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, rotary_embed=rotary_embed, flash=flash_attn)
-
-            self.layers.append(ModuleList([attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)]))
+                attn = Attention(
+                    dim=dim,
+                    dim_head=dim_head,
+                    heads=heads,
+                    dropout=attn_dropout,
+                    rotary_embed=rotary_embed,
+                    flash=flash_attn,
+                )
+
+            self.layers.append(
+                ModuleList(
+                    [attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)]
+                )
+            )
 
         self.norm = RMSNorm(dim) if norm_output else nn.Identity()
 
@@ -213,7 +262,9 @@ def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor
         for dim_in in dim_inputs:
             net = []
 
-            mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1))
+            mlp = nn.Sequential(
+                MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1)
+            )
 
             self.to_freqs.append(mlp)
 
@@ -256,6 +307,7 @@ def forward(self, x):
     2,
     2,
     2,
+    2,
     4,
     4,
     4,
@@ -293,12 +345,11 @@ def forward(self, x):
     48,
     48,
     128,
-    129,
+    127,
 )
 
 
 class BSRoformer(Module):
-
     @beartype
     def __init__(
         self,
@@ -333,7 +384,13 @@ def __init__(
         stft_window_fn: Optional[Callable] = None,
         mask_estimator_depth=2,
         multi_stft_resolution_loss_weight=1.0,
-        multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
+        multi_stft_resolutions_window_sizes: Tuple[int, ...] = (
+            4096,
+            2048,
+            1024,
+            512,
+            256,
+        ),
         multi_stft_hop_size=147,
         multi_stft_normalized=False,
         multi_stft_window_fn: Callable = torch.hann_window,
@@ -343,7 +400,7 @@ def __init__(
         self.stereo = stereo
         self.audio_channels = 2 if stereo else 1
         self.num_stems = num_stems
-        
+
         # Store new parameters as instance variables
         self.mlp_expansion_factor = mlp_expansion_factor
         self.sage_attention = sage_attention
@@ -355,15 +412,15 @@ def __init__(
 
         # Add parameters to transformer kwargs (excluding sage_attention for now)
         transformer_kwargs = dict(
-            dim=dim, 
-            heads=heads, 
-            dim_head=dim_head, 
-            attn_dropout=attn_dropout, 
-            ff_dropout=ff_dropout, 
-            flash_attn=flash_attn, 
-            norm_output=False
+            dim=dim,
+            heads=heads,
+            dim_head=dim_head,
+            attn_dropout=attn_dropout,
+            ff_dropout=ff_dropout,
+            flash_attn=flash_attn,
+            norm_output=False,
         )
-        
+
         # Print sage attention status if enabled (as per research findings)
         if sage_attention:
             print("Use Sage Attention")
@@ -374,23 +431,54 @@ def __init__(
         for _ in range(depth):
             tran_modules = []
             if linear_transformer_depth > 0:
-                tran_modules.append(Transformer(depth=linear_transformer_depth, linear_attn=True, **transformer_kwargs))
-            tran_modules.append(Transformer(depth=time_transformer_depth, rotary_embed=time_rotary_embed, **transformer_kwargs))
-            tran_modules.append(Transformer(depth=freq_transformer_depth, rotary_embed=freq_rotary_embed, **transformer_kwargs))
+                tran_modules.append(
+                    Transformer(
+                        depth=linear_transformer_depth,
+                        linear_attn=True,
+                        **transformer_kwargs,
+                    )
+                )
+            tran_modules.append(
+                Transformer(
+                    depth=time_transformer_depth,
+                    rotary_embed=time_rotary_embed,
+                    **transformer_kwargs,
+                )
+            )
+            tran_modules.append(
+                Transformer(
+                    depth=freq_transformer_depth,
+                    rotary_embed=freq_rotary_embed,
+                    **transformer_kwargs,
+                )
+            )
             self.layers.append(nn.ModuleList(tran_modules))
 
         self.final_norm = RMSNorm(dim)
 
-        self.stft_kwargs = dict(n_fft=stft_n_fft, hop_length=stft_hop_length, win_length=stft_win_length, normalized=stft_normalized)
+        self.stft_kwargs = dict(
+            n_fft=stft_n_fft,
+            hop_length=stft_hop_length,
+            win_length=stft_win_length,
+            normalized=stft_normalized,
+        )
 
-        self.stft_window_fn = partial(default(stft_window_fn, torch.hann_window), stft_win_length)
+        self.stft_window_fn = partial(
+            default(stft_window_fn, torch.hann_window), stft_win_length
+        )
 
-        freqs = torch.stft(torch.randn(1, 4096), **self.stft_kwargs, return_complex=True).shape[1]
+        freqs = torch.stft(
+            torch.randn(1, 4096), **self.stft_kwargs, return_complex=True
+        ).shape[1]
 
         assert len(freqs_per_bands) > 1
-        assert sum(freqs_per_bands) == freqs, f"the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}"
+        assert sum(freqs_per_bands) == freqs, (
+            f"the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}"
+        )
 
-        freqs_per_bands_with_complex = tuple(2 * f * self.audio_channels for f in freqs_per_bands)
+        freqs_per_bands_with_complex = tuple(
+            2 * f * self.audio_channels for f in freqs_per_bands
+        )
 
         self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex)
 
@@ -398,10 +486,10 @@ def __init__(
 
         for _ in range(num_stems):
             mask_estimator = MaskEstimator(
-                dim=dim, 
-                dim_inputs=freqs_per_bands_with_complex, 
+                dim=dim,
+                dim_inputs=freqs_per_bands_with_complex,
                 depth=mask_estimator_depth,
-                mlp_expansion_factor=mlp_expansion_factor  # Use the new parameter
+                mlp_expansion_factor=mlp_expansion_factor,  # Use the new parameter
             )
 
             self.mask_estimators.append(mask_estimator)
@@ -413,7 +501,9 @@ def __init__(
         self.multi_stft_n_fft = stft_n_fft
         self.multi_stft_window_fn = multi_stft_window_fn
 
-        self.multi_stft_kwargs = dict(hop_length=multi_stft_hop_size, normalized=multi_stft_normalized)
+        self.multi_stft_kwargs = dict(
+            hop_length=multi_stft_hop_size, normalized=multi_stft_normalized
+        )
 
     def forward(self, raw_audio, target=None, return_loss_breakdown=False):
         """
@@ -430,19 +520,18 @@ def forward(self, raw_audio, target=None, return_loss_breakdown=False):
 
         original_device = raw_audio.device
         x_is_mps = True if original_device.type == "mps" else False
+        x_is_rocm = True if "rocm" in str(original_device).lower() else False
 
-        # if x_is_mps:
-        #     raw_audio = raw_audio.cpu()
-
+        # Only move to CPU for MPS, not for ROCm
         device = raw_audio.device
 
         if raw_audio.ndim == 2:
             raw_audio = rearrange(raw_audio, "b t -> b 1 t")
 
         channels = raw_audio.shape[1]
-        assert (not self.stereo and channels == 1) or (
-            self.stereo and channels == 2
-        ), "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)"
+        assert (not self.stereo and channels == 1) or (self.stereo and channels == 2), (
+            "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)"
+        )
 
         # to stft
 
@@ -450,11 +539,15 @@ def forward(self, raw_audio, target=None, return_loss_breakdown=False):
 
         stft_window = self.stft_window_fn().to(device)
 
-        stft_repr = torch.stft(raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True)
+        stft_repr = torch.stft(
+            raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True
+        )
         stft_repr = torch.view_as_real(stft_repr)
 
         stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c")
-        stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")  # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting
+        stft_repr = rearrange(
+            stft_repr, "b s f t c -> b (f s) t c"
+        )  # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting
 
         x = rearrange(stft_repr, "b f t c -> b t (f c)")
 
@@ -463,9 +556,10 @@ def forward(self, raw_audio, target=None, return_loss_breakdown=False):
         # axial / hierarchical attention
 
         for transformer_block in self.layers:
-
             if len(transformer_block) == 3:
-                linear_transformer, time_transformer, freq_transformer = transformer_block
+                linear_transformer, time_transformer, freq_transformer = (
+                    transformer_block
+                )
 
                 x, ft_ps = pack([x], "b * d")
                 x = linear_transformer(x)
@@ -507,11 +601,25 @@ def forward(self, raw_audio, target=None, return_loss_breakdown=False):
 
         # istft
 
-        stft_repr = rearrange(stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels)
+        stft_repr = rearrange(
+            stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels
+        )
 
-        recon_audio = torch.istft(stft_repr.cpu() if x_is_mps else stft_repr, **self.stft_kwargs, window=stft_window.cpu() if x_is_mps else stft_window, return_complex=False).to(device)
+        # Only move to CPU for MPS, not for ROCm
+        # Only move to CPU for MPS, not for ROCm
+        recon_audio = torch.istft(
+            stft_repr.cpu() if x_is_mps else stft_repr,
+            **self.stft_kwargs,
+            window=stft_window.cpu() if x_is_mps else stft_window,
+            return_complex=False,
+        )
 
-        recon_audio = rearrange(recon_audio, "(b n s) t -> b n s t", s=self.audio_channels, n=self.num_stems)
+        # Move result back to original device
+        recon_audio = recon_audio.to(device)
+
+        recon_audio = rearrange(
+            recon_audio, "(b n s) t -> b n s t", s=self.audio_channels, n=self.num_stems
+        )
 
         if self.num_stems == 1:
             recon_audio = rearrange(recon_audio, "b 1 s t -> b s t")
@@ -535,15 +643,27 @@ def forward(self, raw_audio, target=None, return_loss_breakdown=False):
 
         for window_size in self.multi_stft_resolutions_window_sizes:
             res_stft_kwargs = dict(
-                n_fft=max(window_size, self.multi_stft_n_fft), win_length=window_size, return_complex=True, window=self.multi_stft_window_fn(window_size, device=device), **self.multi_stft_kwargs
+                n_fft=max(window_size, self.multi_stft_n_fft),
+                win_length=window_size,
+                return_complex=True,
+                window=self.multi_stft_window_fn(window_size, device=device),
+                **self.multi_stft_kwargs,
             )
 
-            recon_Y = torch.stft(rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs)
-            target_Y = torch.stft(rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs)
+            recon_Y = torch.stft(
+                rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs
+            )
+            target_Y = torch.stft(
+                rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs
+            )
 
-            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(recon_Y, target_Y)
+            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(
+                recon_Y, target_Y
+            )
 
-        weighted_multi_resolution_loss = multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
+        weighted_multi_resolution_loss = (
+            multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
+        )
 
         total_loss = loss + weighted_multi_resolution_loss
 
diff --git a/audio_separator/separator/uvr_lib_v5/roformer/mel_band_roformer.py b/audio_separator/separator/uvr_lib_v5/roformer/mel_band_roformer.py
index 61660c58..f3ff0694 100644
--- a/audio_separator/separator/uvr_lib_v5/roformer/mel_band_roformer.py
+++ b/audio_separator/separator/uvr_lib_v5/roformer/mel_band_roformer.py
@@ -54,14 +54,23 @@ class FeedForward(Module):
     def __init__(self, dim, mult=4, dropout=0.0):
         super().__init__()
         dim_inner = int(dim * mult)
-        self.net = nn.Sequential(RMSNorm(dim), nn.Linear(dim, dim_inner), nn.GELU(), nn.Dropout(dropout), nn.Linear(dim_inner, dim), nn.Dropout(dropout))
+        self.net = nn.Sequential(
+            RMSNorm(dim),
+            nn.Linear(dim, dim_inner),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(dim_inner, dim),
+            nn.Dropout(dropout),
+        )
 
     def forward(self, x):
         return self.net(x)
 
 
 class Attention(Module):
-    def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embed=None, flash=True):
+    def __init__(
+        self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embed=None, flash=True
+    ):
         super().__init__()
         self.heads = heads
         self.scale = dim_head**-0.5
@@ -76,12 +85,16 @@ def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embed=None, fl
 
         self.to_gates = nn.Linear(dim, heads)
 
-        self.to_out = nn.Sequential(nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout))
+        self.to_out = nn.Sequential(
+            nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout)
+        )
 
     def forward(self, x):
         x = self.norm(x)
 
-        q, k, v = rearrange(self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads)
+        q, k, v = rearrange(
+            self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads
+        )
 
         if exists(self.rotary_embed):
             q = self.rotary_embed.rotate_queries_or_keys(q)
@@ -97,14 +110,37 @@ def forward(self, x):
 
 
 class Transformer(Module):
-    def __init__(self, *, dim, depth, dim_head=64, heads=8, attn_dropout=0.0, ff_dropout=0.0, ff_mult=4, norm_output=True, rotary_embed=None, flash_attn=True):
+    def __init__(
+        self,
+        *,
+        dim,
+        depth,
+        dim_head=64,
+        heads=8,
+        attn_dropout=0.0,
+        ff_dropout=0.0,
+        ff_mult=4,
+        norm_output=True,
+        rotary_embed=None,
+        flash_attn=True,
+    ):
         super().__init__()
         self.layers = ModuleList([])
 
         for _ in range(depth):
             self.layers.append(
                 ModuleList(
-                    [Attention(dim=dim, dim_head=dim_head, heads=heads, dropout=attn_dropout, rotary_embed=rotary_embed, flash=flash_attn), FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)]
+                    [
+                        Attention(
+                            dim=dim,
+                            dim_head=dim_head,
+                            heads=heads,
+                            dropout=attn_dropout,
+                            rotary_embed=rotary_embed,
+                            flash=flash_attn,
+                        ),
+                        FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout),
+                    ]
                 )
             )
 
@@ -172,7 +208,9 @@ def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor
         for dim_in in dim_inputs:
             net = []
 
-            mlp = nn.Sequential(MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1))
+            mlp = nn.Sequential(
+                MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1)
+            )
 
             self.to_freqs.append(mlp)
 
@@ -189,7 +227,6 @@ def forward(self, x):
 
 
 class MelBandRoformer(Module):
-
     @beartype
     def __init__(
         self,
@@ -222,7 +259,13 @@ def __init__(
         stft_window_fn: Optional[Callable] = None,
         mask_estimator_depth=1,
         multi_stft_resolution_loss_weight=1.0,
-        multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
+        multi_stft_resolutions_window_sizes: Tuple[int, ...] = (
+            4096,
+            2048,
+            1024,
+            512,
+            256,
+        ),
         multi_stft_hop_size=147,
         multi_stft_normalized=False,
         multi_stft_window_fn: Callable = torch.hann_window,
@@ -233,7 +276,7 @@ def __init__(
         self.stereo = stereo
         self.audio_channels = 2 if stereo else 1
         self.num_stems = num_stems
-        
+
         # Store new parameters as instance variables
         self.mlp_expansion_factor = mlp_expansion_factor
         self.sage_attention = sage_attention
@@ -245,14 +288,14 @@ def __init__(
 
         # Add parameters to transformer kwargs (excluding sage_attention for now)
         transformer_kwargs = dict(
-            dim=dim, 
-            heads=heads, 
-            dim_head=dim_head, 
-            attn_dropout=attn_dropout, 
-            ff_dropout=ff_dropout, 
-            flash_attn=flash_attn
+            dim=dim,
+            heads=heads,
+            dim_head=dim_head,
+            attn_dropout=attn_dropout,
+            ff_dropout=ff_dropout,
+            flash_attn=flash_attn,
         )
-        
+
         # Print sage attention status if enabled (as per research findings)
         if sage_attention:
             print("Use Sage Attention")
@@ -264,19 +307,38 @@ def __init__(
             self.layers.append(
                 nn.ModuleList(
                     [
-                        Transformer(depth=time_transformer_depth, rotary_embed=time_rotary_embed, **transformer_kwargs),
-                        Transformer(depth=freq_transformer_depth, rotary_embed=freq_rotary_embed, **transformer_kwargs),
+                        Transformer(
+                            depth=time_transformer_depth,
+                            rotary_embed=time_rotary_embed,
+                            **transformer_kwargs,
+                        ),
+                        Transformer(
+                            depth=freq_transformer_depth,
+                            rotary_embed=freq_rotary_embed,
+                            **transformer_kwargs,
+                        ),
                     ]
                 )
             )
 
-        self.stft_window_fn = partial(default(stft_window_fn, torch.hann_window), stft_win_length)
+        self.stft_window_fn = partial(
+            default(stft_window_fn, torch.hann_window), stft_win_length
+        )
 
-        self.stft_kwargs = dict(n_fft=stft_n_fft, hop_length=stft_hop_length, win_length=stft_win_length, normalized=stft_normalized)
+        self.stft_kwargs = dict(
+            n_fft=stft_n_fft,
+            hop_length=stft_hop_length,
+            win_length=stft_win_length,
+            normalized=stft_normalized,
+        )
 
-        freqs = torch.stft(torch.randn(1, 4096), **self.stft_kwargs, return_complex=True).shape[1]
+        freqs = torch.stft(
+            torch.randn(1, 4096), **self.stft_kwargs, return_complex=True
+        ).shape[1]
 
-        mel_filter_bank_numpy = filters.mel(sr=sample_rate, n_fft=stft_n_fft, n_mels=num_bands)
+        mel_filter_bank_numpy = filters.mel(
+            sr=sample_rate, n_fft=stft_n_fft, n_mels=num_bands
+        )
 
         mel_filter_bank = torch.from_numpy(mel_filter_bank_numpy)
 
@@ -285,7 +347,9 @@ def __init__(
         mel_filter_bank[-1, -1] = 1.0
 
         freqs_per_band = mel_filter_bank > 0
-        assert freqs_per_band.any(dim=0).all(), "all frequencies need to be covered by all bands for now"
+        assert freqs_per_band.any(dim=0).all(), (
+            "all frequencies need to be covered by all bands for now"
+        )
 
         repeated_freq_indices = repeat(torch.arange(freqs), "f -> b f", b=num_bands)
         freq_indices = repeated_freq_indices[freqs_per_band]
@@ -295,8 +359,9 @@ def __init__(
             freq_indices = freq_indices * 2 + torch.arange(2)
             freq_indices = rearrange(freq_indices, "f s -> (f s)")
 
-        self.register_buffer("freq_indices", freq_indices, persistent=False)
-        self.register_buffer("freqs_per_band", freqs_per_band, persistent=False)
+        # Register buffers on the same device as the model
+        self.register_buffer("freq_indices", freq_indices)
+        self.register_buffer("freqs_per_band", freqs_per_band)
 
         num_freqs_per_band = reduce(freqs_per_band, "b f -> b", "sum")
         num_bands_per_freq = reduce(freqs_per_band, "b f -> f", "sum")
@@ -304,14 +369,20 @@ def __init__(
         self.register_buffer("num_freqs_per_band", num_freqs_per_band, persistent=False)
         self.register_buffer("num_bands_per_freq", num_bands_per_freq, persistent=False)
 
-        freqs_per_bands_with_complex = tuple(2 * f * self.audio_channels for f in num_freqs_per_band.tolist())
+        freqs_per_bands_with_complex = tuple(
+            2 * f * self.audio_channels for f in num_freqs_per_band.tolist()
+        )
 
         self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex)
 
         self.mask_estimators = nn.ModuleList([])
 
         for _ in range(num_stems):
-            mask_estimator = MaskEstimator(dim=dim, dim_inputs=freqs_per_bands_with_complex, depth=mask_estimator_depth)
+            mask_estimator = MaskEstimator(
+                dim=dim,
+                dim_inputs=freqs_per_bands_with_complex,
+                depth=mask_estimator_depth,
+            )
 
             self.mask_estimators.append(mask_estimator)
 
@@ -320,7 +391,9 @@ def __init__(
         self.multi_stft_n_fft = stft_n_fft
         self.multi_stft_window_fn = multi_stft_window_fn
 
-        self.multi_stft_kwargs = dict(hop_length=multi_stft_hop_size, normalized=multi_stft_normalized)
+        self.multi_stft_kwargs = dict(
+            hop_length=multi_stft_hop_size, normalized=multi_stft_normalized
+        )
 
         self.match_input_audio_length = match_input_audio_length
 
@@ -339,7 +412,9 @@ def forward(self, raw_audio, target=None, return_loss_breakdown=False):
 
         original_device = raw_audio.device
         x_is_mps = True if original_device.type == "mps" else False
+        x_is_rocm = True if "rocm" in str(original_device).lower() else False
 
+        # Only move to CPU for MPS, not for ROCm
         if x_is_mps:
             raw_audio = raw_audio.cpu()
 
@@ -352,23 +427,31 @@ def forward(self, raw_audio, target=None, return_loss_breakdown=False):
 
         istft_length = raw_audio_length if self.match_input_audio_length else None
 
-        assert (not self.stereo and channels == 1) or (
-            self.stereo and channels == 2
-        ), "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)"
+        assert (not self.stereo and channels == 1) or (self.stereo and channels == 2), (
+            "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)"
+        )
 
         raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t")
 
         stft_window = self.stft_window_fn().to(device)
 
-        stft_repr = torch.stft(raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True)
+        stft_repr = torch.stft(
+            raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True
+        )
         stft_repr = torch.view_as_real(stft_repr)
 
         stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c")
-        stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")  # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting
+        stft_repr = rearrange(
+            stft_repr, "b s f t c -> b (f s) t c"
+        )  # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting
 
         batch_arange = torch.arange(batch, device=device)[..., None]
 
-        x = stft_repr[batch_arange, self.freq_indices.cpu()] if x_is_mps else stft_repr[batch_arange, self.freq_indices]
+        x = (
+            stft_repr[batch_arange, self.freq_indices.cpu()]
+            if x_is_mps
+            else stft_repr[batch_arange, self.freq_indices]
+        )
 
         x = rearrange(x, "b f t c -> b t (f c)")
 
@@ -402,14 +485,36 @@ def forward(self, raw_audio, target=None, return_loss_breakdown=False):
         masks = masks.type(stft_repr.dtype)
 
         if x_is_mps:
-            scatter_indices = repeat(self.freq_indices.cpu(), "f -> b n f t", b=batch, n=self.num_stems, t=stft_repr.shape[-1])
+            scatter_indices = repeat(
+                self.freq_indices.cpu(),
+                "f -> b n f t",
+                b=batch,
+                n=self.num_stems,
+                t=stft_repr.shape[-1],
+            )
         else:
-            scatter_indices = repeat(self.freq_indices, "f -> b n f t", b=batch, n=self.num_stems, t=stft_repr.shape[-1])
+            scatter_indices = repeat(
+                self.freq_indices,
+                "f -> b n f t",
+                b=batch,
+                n=self.num_stems,
+                t=stft_repr.shape[-1],
+            )
+
+        stft_repr_expanded_stems = repeat(
+            stft_repr, "b 1 ... -> b n ...", n=self.num_stems
+        )
+
+        # Only move to CPU for MPS, not for ROCm
+        masks_cpu = masks.cpu() if x_is_mps else masks
+        scatter_indices_cpu = scatter_indices.cpu() if x_is_mps else scatter_indices
+        stft_repr_cpu = (
+            stft_repr_expanded_stems.cpu() if x_is_mps else stft_repr_expanded_stems
+        )
 
-        stft_repr_expanded_stems = repeat(stft_repr, "b 1 ... -> b n ...", n=self.num_stems)
         masks_summed = (
-            torch.zeros_like(stft_repr_expanded_stems.cpu() if x_is_mps else stft_repr_expanded_stems)
-            .scatter_add_(2, scatter_indices.cpu() if x_is_mps else scatter_indices, masks.cpu() if x_is_mps else masks)
+            torch.zeros_like(stft_repr_cpu)
+            .scatter_add_(2, scatter_indices_cpu, masks_cpu)
             .to(device)
         )
 
@@ -422,11 +527,29 @@ def forward(self, raw_audio, target=None, return_loss_breakdown=False):
 
         stft_repr = stft_repr * masks_averaged
 
-        stft_repr = rearrange(stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels)
+        stft_repr = rearrange(
+            stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels
+        )
+
+        # Only move to CPU for MPS, not for ROCm
+        recon_audio = torch.istft(
+            stft_repr.cpu() if x_is_mps else stft_repr,
+            **self.stft_kwargs,
+            window=stft_window.cpu() if x_is_mps else stft_window,
+            return_complex=False,
+            length=istft_length,
+        )
 
-        recon_audio = torch.istft(stft_repr.cpu() if x_is_mps else stft_repr, **self.stft_kwargs, window=stft_window.cpu() if x_is_mps else stft_window, return_complex=False, length=istft_length)
+        # Move result back to original device
+        recon_audio = recon_audio.to(device)
 
-        recon_audio = rearrange(recon_audio, "(b n s) t -> b n s t", b=batch, s=self.audio_channels, n=self.num_stems)
+        recon_audio = rearrange(
+            recon_audio,
+            "(b n s) t -> b n s t",
+            b=batch,
+            s=self.audio_channels,
+            n=self.num_stems,
+        )
 
         if self.num_stems == 1:
             recon_audio = rearrange(recon_audio, "b 1 s t -> b s t")
@@ -448,15 +571,27 @@ def forward(self, raw_audio, target=None, return_loss_breakdown=False):
 
         for window_size in self.multi_stft_resolutions_window_sizes:
             res_stft_kwargs = dict(
-                n_fft=max(window_size, self.multi_stft_n_fft), win_length=window_size, return_complex=True, window=self.multi_stft_window_fn(window_size, device=device), **self.multi_stft_kwargs
+                n_fft=max(window_size, self.multi_stft_n_fft),
+                win_length=window_size,
+                return_complex=True,
+                window=self.multi_stft_window_fn(window_size, device=device),
+                **self.multi_stft_kwargs,
             )
 
-            recon_Y = torch.stft(rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs)
-            target_Y = torch.stft(rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs)
+            recon_Y = torch.stft(
+                rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs
+            )
+            target_Y = torch.stft(
+                rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs
+            )
 
-            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(recon_Y, target_Y)
+            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(
+                recon_Y, target_Y
+            )
 
-        weighted_multi_resolution_loss = multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
+        weighted_multi_resolution_loss = (
+            multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
+        )
 
         total_loss = loss + weighted_multi_resolution_loss
 
@@ -468,4 +603,9 @@ def forward(self, raw_audio, target=None, return_loss_breakdown=False):
             return total_loss
 
         # If detailed loss breakdown is requested, ensure all components are on the original device
-        return total_loss, (loss.to(original_device) if x_is_mps else loss, multi_stft_resolution_loss.to(original_device) if x_is_mps else multi_stft_resolution_loss)
+        return total_loss, (
+            loss.to(original_device) if x_is_mps else loss,
+            multi_stft_resolution_loss.to(original_device)
+            if x_is_mps
+            else multi_stft_resolution_loss,
+        )
diff --git a/audio_separator/separator/uvr_lib_v5/stft.py b/audio_separator/separator/uvr_lib_v5/stft.py
index f4403958..780da5c5 100644
--- a/audio_separator/separator/uvr_lib_v5/stft.py
+++ b/audio_separator/separator/uvr_lib_v5/stft.py
@@ -1,5 +1,7 @@
 import torch
 
+from audio_separator.separator.uvr_lib_v5.utils import is_rocm
+
 
 class STFT:
     """
@@ -20,13 +22,22 @@ def __init__(self, logger, n_fft, hop_length, dim_f, device):
     def __call__(self, input_tensor):
         # Determine if the input tensor's device is not a standard computing device (i.e., not CPU or CUDA).
         is_non_standard_device = not input_tensor.device.type in ["cuda", "cpu"]
+        run_on_cpu = is_non_standard_device or (
+            input_tensor.device.type == "cuda" and is_rocm()
+        )
 
-        # If on a non-standard device, temporarily move the tensor to CPU for processing.
-        if is_non_standard_device:
+        # If on a non-standard device or ROCm, temporarily move the tensor to CPU for processing.
+        if run_on_cpu:
             input_tensor = input_tensor.cpu()
 
+        # Ensure FP32 for stability on ROCm and non-standard devices
+        if input_tensor.dtype in (torch.float16, torch.bfloat16):
+            input_tensor = input_tensor.float()
+
         # Transfer the pre-defined window tensor to the same device as the input tensor.
         stft_window = self.hann_window.to(input_tensor.device)
+        if stft_window.dtype in (torch.float16, torch.bfloat16):
+            stft_window = stft_window.float()
 
         # Extract batch dimensions (all dimensions except the last two which are channel and time).
         batch_dimensions = input_tensor.shape[:-2]
@@ -38,29 +49,60 @@ def __call__(self, input_tensor):
         reshaped_tensor = input_tensor.reshape([-1, time_dim])
 
         # Perform the Short-Time Fourier Transform (STFT) on the reshaped tensor.
-        stft_output = torch.stft(reshaped_tensor, n_fft=self.n_fft, hop_length=self.hop_length, window=stft_window, center=True, return_complex=False)
+        try:
+            stft_output = torch.stft(
+                reshaped_tensor,
+                n_fft=self.n_fft,
+                hop_length=self.hop_length,
+                window=stft_window,
+                center=True,
+                return_complex=False,
+            )
+        except Exception as e:
+            # Fallback: try with return_complex=True
+            stft_complex = torch.stft(
+                reshaped_tensor,
+                n_fft=self.n_fft,
+                hop_length=self.hop_length,
+                window=stft_window,
+                center=True,
+                return_complex=True,
+            )
+            stft_output = torch.stack([stft_complex.real, stft_complex.imag], dim=-1)
 
         # Rearrange the dimensions of the STFT output to bring the frequency dimension forward.
         permuted_stft_output = stft_output.permute([0, 3, 1, 2])
 
         # Reshape the output to restore the original batch and channel dimensions, while keeping the newly formed frequency and time dimensions.
-        final_output = permuted_stft_output.reshape([*batch_dimensions, channel_dim, 2, -1, permuted_stft_output.shape[-1]]).reshape(
+        final_output = permuted_stft_output.reshape(
+            [*batch_dimensions, channel_dim, 2, -1, permuted_stft_output.shape[-1]]
+        ).reshape(
             [*batch_dimensions, channel_dim * 2, -1, permuted_stft_output.shape[-1]]
         )
 
-        # If the original tensor was on a non-standard device, move the processed tensor back to that device.
-        if is_non_standard_device:
+        # If the original tensor was on a non-standard device or ROCm, move the processed tensor back to that device.
+        if run_on_cpu:
             final_output = final_output.to(self.device)
 
         # Return the transformed tensor, sliced to retain only the required frequency dimension (`dim_f`).
         return final_output[..., : self.dim_f, :]
 
-    def pad_frequency_dimension(self, input_tensor, batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins):
+    def pad_frequency_dimension(
+        self,
+        input_tensor,
+        batch_dimensions,
+        channel_dim,
+        freq_dim,
+        time_dim,
+        num_freq_bins,
+    ):
         """
         Adds zero padding to the frequency dimension of the input tensor.
         """
         # Create a padding tensor for the frequency dimension
-        freq_padding = torch.zeros([*batch_dimensions, channel_dim, num_freq_bins - freq_dim, time_dim]).to(input_tensor.device)
+        freq_padding = torch.zeros(
+            [*batch_dimensions, channel_dim, num_freq_bins - freq_dim, time_dim]
+        ).to(input_tensor.device)
 
         # Concatenate the padding to the input tensor along the frequency dimension.
         padded_tensor = torch.cat([input_tensor, freq_padding], -2)
@@ -77,13 +119,17 @@ def calculate_inverse_dimensions(self, input_tensor):
 
         return batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins
 
-    def prepare_for_istft(self, padded_tensor, batch_dimensions, channel_dim, num_freq_bins, time_dim):
+    def prepare_for_istft(
+        self, padded_tensor, batch_dimensions, channel_dim, num_freq_bins, time_dim
+    ):
         """
         Prepares the tensor for Inverse Short-Time Fourier Transform (ISTFT) by reshaping
         and creating a complex tensor from the real and imaginary parts.
         """
         # Reshape the tensor to separate real and imaginary parts and prepare for ISTFT.
-        reshaped_tensor = padded_tensor.reshape([*batch_dimensions, channel_dim // 2, 2, num_freq_bins, time_dim])
+        reshaped_tensor = padded_tensor.reshape(
+            [*batch_dimensions, channel_dim // 2, 2, num_freq_bins, time_dim]
+        )
 
         # Flatten batch dimensions and rearrange for ISTFT.
         flattened_tensor = reshaped_tensor.reshape([-1, 2, num_freq_bins, time_dim])
@@ -99,28 +145,54 @@ def prepare_for_istft(self, padded_tensor, batch_dimensions, channel_dim, num_fr
     def inverse(self, input_tensor):
         # Determine if the input tensor's device is not a standard computing device (i.e., not CPU or CUDA).
         is_non_standard_device = not input_tensor.device.type in ["cuda", "cpu"]
+        run_on_cpu = is_non_standard_device or (
+            input_tensor.device.type == "cuda" and is_rocm()
+        )
 
-        # If on a non-standard device, temporarily move the tensor to CPU for processing.
-        if is_non_standard_device:
+        # If on a non-standard device or ROCm, temporarily move the tensor to CPU for processing.
+        if run_on_cpu:
             input_tensor = input_tensor.cpu()
 
+        # Ensure FP32 for stability on ROCm and non-standard devices
+        if input_tensor.dtype in (torch.float16, torch.bfloat16):
+            input_tensor = input_tensor.float()
+
         # Transfer the pre-defined Hann window tensor to the same device as the input tensor.
         stft_window = self.hann_window.to(input_tensor.device)
+        if stft_window.dtype in (torch.float16, torch.bfloat16):
+            stft_window = stft_window.float()
 
-        batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins = self.calculate_inverse_dimensions(input_tensor)
+        batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins = (
+            self.calculate_inverse_dimensions(input_tensor)
+        )
 
-        padded_tensor = self.pad_frequency_dimension(input_tensor, batch_dimensions, channel_dim, freq_dim, time_dim, num_freq_bins)
+        padded_tensor = self.pad_frequency_dimension(
+            input_tensor,
+            batch_dimensions,
+            channel_dim,
+            freq_dim,
+            time_dim,
+            num_freq_bins,
+        )
 
-        complex_tensor = self.prepare_for_istft(padded_tensor, batch_dimensions, channel_dim, num_freq_bins, time_dim)
+        complex_tensor = self.prepare_for_istft(
+            padded_tensor, batch_dimensions, channel_dim, num_freq_bins, time_dim
+        )
 
         # Perform the Inverse Short-Time Fourier Transform (ISTFT).
-        istft_result = torch.istft(complex_tensor, n_fft=self.n_fft, hop_length=self.hop_length, window=stft_window, center=True)
+        istft_result = torch.istft(
+            complex_tensor,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            window=stft_window,
+            center=True,
+        )
 
         # Reshape ISTFT result to restore original batch and channel dimensions.
         final_output = istft_result.reshape([*batch_dimensions, 2, -1])
 
-        # If the original tensor was on a non-standard device, move the processed tensor back to that device.
-        if is_non_standard_device:
+        # If the original tensor was on a non-standard device or ROCm, move the processed tensor back to that device.
+        if run_on_cpu:
             final_output = final_output.to(self.device)
 
         return final_output
diff --git a/audio_separator/separator/uvr_lib_v5/tfc_tdf_v3.py b/audio_separator/separator/uvr_lib_v5/tfc_tdf_v3.py
index 4d3356f1..5b8377ea 100644
--- a/audio_separator/separator/uvr_lib_v5/tfc_tdf_v3.py
+++ b/audio_separator/separator/uvr_lib_v5/tfc_tdf_v3.py
@@ -2,6 +2,9 @@
 import torch.nn as nn
 from functools import partial
 
+from audio_separator.separator.uvr_lib_v5.utils import is_rocm
+
+
 class STFT:
     def __init__(self, n_fft, hop_length, dim_f, device):
         self.n_fft = n_fft
@@ -11,57 +14,93 @@ def __init__(self, n_fft, hop_length, dim_f, device):
         self.device = device
 
     def __call__(self, x):
-        
-        x_is_mps = not x.device.type in ["cuda", "cpu"]
-        if x_is_mps:
+
+        x_is_non_cuda_device = x.device.type not in ["cuda", "cpu"]
+        run_on_cpu = x_is_non_cuda_device
+
+        if run_on_cpu:
             x = x.cpu()
 
+        # Ensure FP32 for stability on ROCm and non-standard devices
+        original_dtype = x.dtype
+        if x.dtype in (torch.float16, torch.bfloat16):
+            x = x.float()
+
         window = self.window.to(x.device)
+        if window.dtype in (torch.float16, torch.bfloat16):
+            window = window.float()
+
         batch_dims = x.shape[:-2]
         c, t = x.shape[-2:]
         x = x.reshape([-1, t])
-        x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True,return_complex=False)
+
+        # Use return_complex=True for ROCm compatibility
+        x_complex = torch.stft(
+            x,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            window=window,
+            center=True,
+            return_complex=True,
+        )
+        x = torch.stack([x_complex.real, x_complex.imag], dim=-1)
+
         x = x.permute([0, 3, 1, 2])
-        x = x.reshape([*batch_dims, c, 2, -1, x.shape[-1]]).reshape([*batch_dims, c * 2, -1, x.shape[-1]])
+        x = x.reshape([*batch_dims, c, 2, -1, x.shape[-1]]).reshape(
+            [*batch_dims, c * 2, -1, x.shape[-1]]
+        )
 
-        if x_is_mps:
+        if run_on_cpu:
             x = x.to(self.device)
 
-        return x[..., :self.dim_f, :]
+        return x[..., : self.dim_f, :]
 
     def inverse(self, x):
-        
-        x_is_mps = not x.device.type in ["cuda", "cpu"]
-        if x_is_mps:
+
+        x_is_non_cuda_device = x.device.type not in ["cuda", "cpu"]
+        run_on_cpu = x_is_non_cuda_device
+
+        if run_on_cpu:
             x = x.cpu()
 
+        # Ensure FP32 for stability on ROCm and non-standard devices
+        original_dtype = x.dtype
+        if x.dtype in (torch.float16, torch.bfloat16):
+            x = x.float()
+
         window = self.window.to(x.device)
+        if window.dtype in (torch.float16, torch.bfloat16):
+            window = window.float()
+
         batch_dims = x.shape[:-3]
         c, f, t = x.shape[-3:]
         n = self.n_fft // 2 + 1
-        f_pad = torch.zeros([*batch_dims, c, n - f, t]).to(x.device)
+        f_pad = torch.zeros([*batch_dims, c, n - f, t], dtype=x.dtype).to(x.device)
         x = torch.cat([x, f_pad], -2)
         x = x.reshape([*batch_dims, c // 2, 2, n, t]).reshape([-1, 2, n, t])
         x = x.permute([0, 2, 3, 1])
-        x = x[..., 0] + x[..., 1] * 1.j
-        x = torch.istft(x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True)
+        x = x[..., 0] + x[..., 1] * 1.0j
+        x = torch.istft(
+            x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True
+        )
         x = x.reshape([*batch_dims, 2, -1])
 
-        if x_is_mps:
+        if run_on_cpu:
             x = x.to(self.device)
 
         return x
 
+
 def get_norm(norm_type):
     def norm(c, norm_type):
         if norm_type is None:
             return nn.Identity()
-        elif norm_type == 'BatchNorm':
+        elif norm_type == "BatchNorm":
             return nn.BatchNorm2d(c)
-        elif norm_type == 'InstanceNorm':
+        elif norm_type == "InstanceNorm":
             return nn.InstanceNorm2d(c, affine=True)
-        elif 'GroupNorm' in norm_type:
-            g = int(norm_type.replace('GroupNorm', ''))
+        elif "GroupNorm" in norm_type:
+            g = int(norm_type.replace("GroupNorm", ""))
             return nn.GroupNorm(num_groups=g, num_channels=c)
         else:
             return nn.Identity()
@@ -70,12 +109,12 @@ def norm(c, norm_type):
 
 
 def get_act(act_type):
-    if act_type == 'gelu':
+    if act_type == "gelu":
         return nn.GELU()
-    elif act_type == 'relu':
+    elif act_type == "relu":
         return nn.ReLU()
-    elif act_type[:3] == 'elu':
-        alpha = float(act_type.replace('elu', ''))
+    elif act_type[:3] == "elu":
+        alpha = float(act_type.replace("elu", ""))
         return nn.ELU(alpha)
     else:
         raise Exception
@@ -87,7 +126,13 @@ def __init__(self, in_c, out_c, scale, norm, act):
         self.conv = nn.Sequential(
             norm(in_c),
             act,
-            nn.ConvTranspose2d(in_channels=in_c, out_channels=out_c, kernel_size=scale, stride=scale, bias=False)
+            nn.ConvTranspose2d(
+                in_channels=in_c,
+                out_channels=out_c,
+                kernel_size=scale,
+                stride=scale,
+                bias=False,
+            ),
         )
 
     def forward(self, x):
@@ -100,7 +145,13 @@ def __init__(self, in_c, out_c, scale, norm, act):
         self.conv = nn.Sequential(
             norm(in_c),
             act,
-            nn.Conv2d(in_channels=in_c, out_channels=out_c, kernel_size=scale, stride=scale, bias=False)
+            nn.Conv2d(
+                in_channels=in_c,
+                out_channels=out_c,
+                kernel_size=scale,
+                stride=scale,
+                bias=False,
+            ),
         )
 
     def forward(self, x):
@@ -159,19 +210,25 @@ def __init__(self, config, device):
             norm_type = config.model.norm
         except (AttributeError, KeyError):
             norm_type = None
-            print("Warning: Model configuration missing 'norm' attribute, using Identity normalization")
-        
+            print(
+                "Warning: Model configuration missing 'norm' attribute, using Identity normalization"
+            )
+
         norm = get_norm(norm_type=norm_type)
-        
+
         try:
             act_type = config.model.act
         except (AttributeError, KeyError):
-            act_type = 'gelu'
-            print("Warning: Model configuration missing 'act' attribute, using GELU activation")
-            
+            act_type = "gelu"
+            print(
+                "Warning: Model configuration missing 'act' attribute, using GELU activation"
+            )
+
         act = get_act(act_type=act_type)
 
-        self.num_target_instruments = 1 if config.training.target_instrument else len(config.training.instruments)
+        self.num_target_instruments = (
+            1 if config.training.target_instrument else len(config.training.instruments)
+        )
         self.num_subbands = config.model.num_subbands
 
         dim_c = self.num_subbands * config.audio.num_channels * 2
@@ -208,10 +265,12 @@ def __init__(self, config, device):
         self.final_conv = nn.Sequential(
             nn.Conv2d(c + dim_c, c, 1, 1, 0, bias=False),
             act,
-            nn.Conv2d(c, self.num_target_instruments * dim_c, 1, 1, 0, bias=False)
+            nn.Conv2d(c, self.num_target_instruments * dim_c, 1, 1, 0, bias=False),
         )
 
-        self.stft = STFT(config.audio.n_fft, config.audio.hop_length, config.audio.dim_f, self.device)
+        self.stft = STFT(
+            config.audio.n_fft, config.audio.hop_length, config.audio.dim_f, self.device
+        )
 
     def cac2cws(self, x):
         k = self.num_subbands
@@ -265,5 +324,3 @@ def forward(self, x):
         x = self.stft.inverse(x)
 
         return x
-
-
diff --git a/audio_separator/separator/uvr_lib_v5/utils.py b/audio_separator/separator/uvr_lib_v5/utils.py
new file mode 100644
index 00000000..70d75c6f
--- /dev/null
+++ b/audio_separator/separator/uvr_lib_v5/utils.py
@@ -0,0 +1,6 @@
+import torch
+
+
+def is_rocm():
+    """Check if PyTorch is built with ROCm support."""
+    return getattr(torch.version, "hip", None) is not None
diff --git a/debug_minimal.py b/debug_minimal.py
new file mode 100644
index 00000000..49e3cb23
--- /dev/null
+++ b/debug_minimal.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+"""
+Minimal debug script to diagnose ROCm implementation.
+Saves results to debug_results.txt
+"""
+
+import os
+import sys
+import traceback
+
+
+def main():
+    output = []
+
+    output.append("=== ROCm Debug Script ===")
+    output.append(f"Python: {sys.version}")
+    output.append(f"Arguments: {sys.argv}")
+
+    # Check environment
+    output.append("\n=== Environment Variables ===")
+    for var in ["HSA_OVERRIDE_GFX_VERSION", "PYTORCH_ROCM_ARCH", "ROCM_PATH"]:
+        val = os.environ.get(var, "NOT SET")
+        output.append(f"{var}: {val}")
+
+    # Check PyTorch
+    output.append("\n=== PyTorch Info ===")
+    try:
+        import torch
+
+        output.append(f"Version: {torch.__version__}")
+        output.append(f"CUDA available: {torch.cuda.is_available()}")
+        if torch.cuda.is_available():
+            output.append(f"Device: {torch.cuda.get_device_name(0)}")
+    except Exception as e:
+        output.append(f"Error: {e}")
+        output.append(f"Traceback:\n{traceback.format_exc()}")
+
+    # Check ONNX Runtime
+    output.append("\n=== ONNX Runtime Info ===")
+    try:
+        import onnxruntime as ort
+
+        output.append(f"Version: {ort.__version__}")
+        output.append(f"Available providers: {ort.get_available_providers()}")
+    except Exception as e:
+        output.append(f"Error: {e}")
+        output.append(f"Traceback:\n{traceback.format_exc()}")
+
+    output.append("\n=== Done ===")
+
+    return "\n".join(output)
+
+
+if __name__ == "__main__":
+    output = main()
+    print(output)
+    with open("debug_results.txt", "w") as f:
+        f.write(output)
+    print("\nResults saved to debug_results.txt")
diff --git a/debug_rocm_issues.py b/debug_rocm_issues.py
new file mode 100644
index 00000000..e4ca15aa
--- /dev/null
+++ b/debug_rocm_issues.py
@@ -0,0 +1,1311 @@
+#!/usr/bin/env python3
+"""
+Debug script to diagnose ROCm implementation crashes in audio-separator.
+This script tests key operations that commonly cause issues with ROCm.
+"""
+
+import os
+import random
+import re
+import subprocess
+import sys
+import time
+import traceback
+
+import torch
+import torch.amp
+
+
+def get_rocm_info():
+    """Get ROCm system information."""
+    rocm_info = {}
+
+    # Try to get ROCm version
+    try:
+        result = subprocess.run(
+            ["rocminfo"], capture_output=True, text=True, timeout=10, check=True
+        )
+        rocm_info["rocminfo"] = result.stdout
+
+        # Extract version from rocminfo output
+        version_match = re.search(
+            r"ROCm Stack Version: (.+)$", result.stdout, re.MULTILINE
+        )
+        if version_match:
+            rocm_info["version"] = version_match.group(1)
+
+    except (
+        subprocess.TimeoutExpired,
+        subprocess.CalledProcessError,
+        FileNotFoundError,
+    ):
+        rocm_info["rocminfo"] = "rocminfo command not found or failed"
+
+    # Try to get HIP version
+    try:
+        result = subprocess.run(
+            ["hipconfig"], capture_output=True, text=True, timeout=10, check=True
+        )
+        rocm_info["hipconfig"] = result.stdout
+
+        # Extract HIP version
+        version_match = re.search(r"HIP version\s+:\s+(.+)", result.stdout)
+        if version_match:
+            rocm_info["hip_version"] = version_match.group(1)
+
+    except (
+        subprocess.TimeoutExpired,
+        subprocess.CalledProcessError,
+        FileNotFoundError,
+    ):
+        rocm_info["hipconfig"] = "hipconfig command not found or failed"
+
+    return rocm_info
+
+
+def get_system_info():
+    """Get system information."""
+    system_info = {}
+
+    # Get CPU information
+    try:
+        with open("/proc/cpuinfo", "r") as f:
+            cpuinfo = f.read()
+            cpu_model = re.search(r"model name\s+: (.+)$", cpuinfo, re.MULTILINE)
+            if cpu_model:
+                system_info["cpu_model"] = cpu_model.group(1)
+
+            cpu_cores = len(re.findall(r"^processor", cpuinfo, re.MULTILINE))
+            system_info["cpu_cores"] = cpu_cores
+
+    except Exception as e:
+        system_info["cpu_info"] = f"Failed to read CPU info: {e}"
+
+    # Get memory information
+    try:
+        with open("/proc/meminfo", "r") as f:
+            meminfo = f.read()
+            mem_total = re.search(r"MemTotal:\s+(\d+) kB", meminfo)
+            if mem_total:
+                system_info["memory_total"] = (
+                    int(mem_total.group(1)) / 1024
+                )  # Convert to MB
+
+    except Exception as e:
+        system_info["memory_info"] = f"Failed to read memory info: {e}"
+
+    # Get kernel version
+    try:
+        with open("/proc/version", "r") as f:
+            system_info["kernel_version"] = f.read().strip()
+    except Exception as e:
+        system_info["kernel_version"] = f"Failed to read kernel version: {e}"
+
+    return system_info
+
+
+def configure_rocm_env():
+    """Configure ROCm environment variables if not already set."""
+    if "HSA_OVERRIDE_GFX_VERSION" not in os.environ:
+        os.environ["HSA_OVERRIDE_GFX_VERSION"] = "10.3.2"
+
+    if "PYTORCH_ROCM_ARCH" not in os.environ:
+        # Try to detect GPU version from rocminfo
+        try:
+            result = subprocess.run(
+                ["rocminfo"],
+                capture_output=True,
+                text=True,
+                timeout=10,
+            )
+            # Look for gfx version in output
+            import re
+
+            match = re.search(r"gfx(\d+)", result.stdout)
+            if match:
+                gfx_version = match.group(1)
+                if gfx_version == "1032":
+                    os.environ["PYTORCH_ROCM_ARCH"] = "gfx1030"
+                elif gfx_version.startswith("11"):
+                    os.environ["PYTORCH_ROCM_ARCH"] = "gfx1100"
+                elif gfx_version.startswith("9"):
+                    os.environ["PYTORCH_ROCM_ARCH"] = "gfx90a"
+            else:
+                os.environ["PYTORCH_ROCM_ARCH"] = "gfx1030"
+        except Exception:
+            os.environ["PYTORCH_ROCM_ARCH"] = "gfx1030"
+
+
+# Configure ROCm environment at import time
+configure_rocm_env()
+
+
+def print_section(title):
+    print(f"\n{'=' * 50}")
+    print(f"{title}")
+    print(f"{'=' * 50}")
+
+
+def test_pytorch_rocm_setup():
+    """Test basic PyTorch and ROCm setup."""
+    print_section("1. PyTorch and ROCm Setup")
+
+    print(f"PyTorch version: {torch.__version__}")
+    print(f"ROCm detected in version: {'+rocm' in torch.__version__}")
+    print(f"CUDA available: {torch.cuda.is_available()}")
+
+    if torch.cuda.is_available():
+        print(f"CUDA device count: {torch.cuda.device_count()}")
+        print(f"Current CUDA device: {torch.cuda.current_device()}")
+        print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
+
+        # Test ROCm-specific features
+        device = torch.device("cuda")
+
+        # Test ROCm version and capabilities
+        try:
+            # Check for ROCm-specific attributes
+            if hasattr(torch.cuda, "get_device_properties"):
+                props = torch.cuda.get_device_properties(0)
+                print(f"GPU Architecture: {props.major}.{props.minor}")
+                print(f"GPU Memory: {props.total_memory / 1024**3:.2f} GB")
+
+            # Test ROCm version
+            if hasattr(torch.version, "rocm"):
+                print(f"PyTorch ROCm version: {torch.version.rocm}")
+
+        except Exception as e:
+            print(f"✗ ROCm property test failed - {type(e).__name__}: {e}")
+
+        # Test basic tensor operations
+        try:
+            x = torch.randn(1000, 1000).to("cuda")
+            y = torch.randn(1000, 1000).to("cuda")
+            z = torch.mm(x, y)
+            print("✓ Basic tensor multiplication on GPU: PASSED")
+        except Exception as e:
+            print(
+                f"✗ Basic tensor multiplication on GPU: FAILED - {type(e).__name__}: {e}"
+            )
+
+        # Test ROCm memory operations
+        try:
+            # Test memory pinning
+            cpu_tensor = torch.randn(1000, 1000)
+            pinned_tensor = cpu_tensor.pin_memory()
+            gpu_tensor = pinned_tensor.to(device, non_blocking=True)
+            print("✓ Memory pinning and non-blocking transfer successful")
+
+            # Test stream operations
+            stream = torch.cuda.Stream()
+            with torch.cuda.stream(stream):
+                a = torch.randn(1000, 1000, device=device)
+                b = torch.randn(1000, 1000, device=device)
+                c = torch.mm(a, b)
+            stream.synchronize()
+            print("✓ CUDA stream operations successful")
+
+            # Test event operations
+            start_event = torch.cuda.Event(enable_timing=True)
+            end_event = torch.cuda.Event(enable_timing=True)
+            start_event.record()
+            d = torch.mm(c, c)
+            end_event.record()
+            end_event.synchronize()
+            elapsed_time = start_event.elapsed_time(end_event)
+            print(f"✓ CUDA event timing successful: {elapsed_time:.2f}ms")
+
+        except Exception as e:
+            print(f"✗ ROCm memory/stream test failed - {type(e).__name__}: {e}")
+
+        # Test ROCm-specific math operations
+        try:
+            # Test half precision
+            if (
+                torch.cuda.is_available()
+                and torch.cuda.get_device_properties(0).major >= 5
+            ):
+                x_half = torch.randn(100, 100, device=device, dtype=torch.float16)
+                y_half = torch.randn(100, 100, device=device, dtype=torch.float16)
+                z_half = torch.mm(x_half, y_half)
+                print("✓ Half precision (float16) operations successful")
+
+            # Test complex number operations
+            x_complex = torch.randn(100, 100, dtype=torch.complex64, device=device)
+            y_complex = torch.randn(100, 100, dtype=torch.complex64, device=device)
+            z_complex = x_complex * y_complex
+            print("✓ Complex number operations successful")
+
+        except Exception as e:
+            print(f"✗ ROCm math operation test failed - {type(e).__name__}: {e}")
+
+    else:
+        print("✗ CUDA not available - ROCm setup may be incomplete")
+
+
+def test_onnxruntime_setup():
+    """Test ONNX Runtime setup with ROCm."""
+    print_section("2. ONNX Runtime Setup")
+
+    try:
+        import onnxruntime as ort
+
+        print(f"ONNX Runtime version: {ort.__version__}")
+
+        # Check available providers
+        providers = ort.get_available_providers()
+        print(f"Available execution providers: {providers}")
+
+        # Test ROCm provider
+        if "ROCMExecutionProvider" in providers:
+            print("✓ ROCMExecutionProvider is available")
+
+            # Create a simple model for testing (before any session creation)
+            try:
+                import numpy as np
+                from onnx import TensorProto, helper
+
+                # Define a simple model that does matrix multiplication
+                node1 = helper.make_node(
+                    "MatMul",
+                    inputs=["input1", "input2"],
+                    outputs=["output"],
+                )
+
+                # Create the graph
+                graph = helper.make_graph(
+                    [node1],
+                    "test_graph",
+                    [
+                        helper.make_tensor_value_info(
+                            "input1", TensorProto.FLOAT, [2, 2]
+                        ),
+                        helper.make_tensor_value_info(
+                            "input2", TensorProto.FLOAT, [2, 2]
+                        ),
+                    ],
+                    [
+                        helper.make_tensor_value_info(
+                            "output", TensorProto.FLOAT, [2, 2]
+                        )
+                    ],
+                )
+
+                # Create the model
+                model = helper.make_model(graph)
+                model_serialized = model.SerializeToString()
+                print("✓ In-memory test model created")
+
+                # Test creating a session with ROCm provider
+                session_options = ort.SessionOptions()
+                session_options.log_severity_level = 2  # Warning level
+                session = ort.InferenceSession(
+                    model_serialized,
+                    sess_options=session_options,
+                    providers=["ROCMExecutionProvider", "CPUExecutionProvider"],
+                )
+                print("✓ Successfully created session with ROCMExecutionProvider")
+
+                # Test ROCm provider capabilities
+                try:
+                    # Run inference
+                    input1 = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+                    input2 = np.array([[5.0, 6.0], [7.0, 8.0]], dtype=np.float32)
+                    result = session.run(None, {"input1": input1, "input2": input2})
+
+                    expected = np.array([[19.0, 22.0], [43.0, 50.0]], dtype=np.float32)
+                    if np.allclose(result[0], expected):
+                        print("✓ ROCm provider inference test passed")
+                    else:
+                        print("✗ ROCm provider inference test failed")
+                        print(f"  Expected: {expected}")
+                        print(f"  Got: {result[0]}")
+
+                except Exception as e:
+                    print(f"✗ ROCm inference test failed - {type(e).__name__}: {e}")
+
+            except Exception as e:
+                print(
+                    f"✗ Failed to create session with ROCMExecutionProvider: {type(e).__name__}: {e}"
+                )
+        else:
+            print("✗ ROCMExecutionProvider not available")
+
+        # Test CUDA provider as fallback
+        if "CUDAExecutionProvider" in providers:
+            print(
+                "✓ CUDAExecutionProvider is available (can be used as fallback for AMD)"
+            )
+
+            # Test creating a session with CUDA provider
+            try:
+                session_options = ort.SessionOptions()
+                # Reuse the model_serialized if available, otherwise create it
+                try:
+                    _ = model_serialized
+                except NameError:
+                    import numpy as np
+                    from onnx import TensorProto, helper
+
+                    node1 = helper.make_node(
+                        "MatMul",
+                        inputs=["input1", "input2"],
+                        outputs=["output"],
+                    )
+                    graph = helper.make_graph(
+                        [node1],
+                        "test_graph",
+                        [
+                            helper.make_tensor_value_info(
+                                "input1", TensorProto.FLOAT, [2, 2]
+                            ),
+                            helper.make_tensor_value_info(
+                                "input2", TensorProto.FLOAT, [2, 2]
+                            ),
+                        ],
+                        [
+                            helper.make_tensor_value_info(
+                                "output", TensorProto.FLOAT, [2, 2]
+                            )
+                        ],
+                    )
+                    model = helper.make_model(graph)
+                    model_serialized = model.SerializeToString()
+
+                session = ort.InferenceSession(
+                    model_serialized,
+                    sess_options=session_options,
+                    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
+                )
+                print("✓ Successfully created session with CUDAExecutionProvider")
+
+                # Test CUDA provider capabilities
+                try:
+                        model.SerializeToString(),
+                        providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
+                    )
+
+                    # Run inference
+                    input1 = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+                    input2 = np.array([[5.0, 6.0], [7.0, 8.0]], dtype=np.float32)
+                    result = session.run(None, {"input1": input1, "input2": input2})
+
+                    expected = np.array([[19.0, 22.0], [43.0, 50.0]], dtype=np.float32)
+                    if np.allclose(result[0], expected):
+                        print("✓ CUDA provider inference test passed")
+                    else:
+                        print("✗ CUDA provider inference test failed")
+                        print(f"  Expected: {expected}")
+                        print(f"  Got: {result[0]}")
+
+                except Exception as e:
+                    print(f"✗ CUDA inference test failed - {type(e).__name__}: {e}")
+
+            except Exception as e:
+                print(
+                    f"✗ Failed to create session with CUDAExecutionProvider: {type(e).__name__}: {e}"
+                )
+        else:
+            print("✗ CUDAExecutionProvider not available")
+
+        # Additional ROCm-specific checks
+        try:
+            # Check for ROCm-specific environment variables
+            rocm_vars = [v for v in os.environ.keys() if "rocm" in v.lower()]
+            if rocm_vars:
+                print("ROCm environment variables detected:")
+                for var in rocm_vars:
+                    print(f"  {var}: {os.environ[var]}")
+
+            # Check for ROCm installation paths
+            rocm_paths = ["/opt/rocm", "/usr/lib/rocm"]
+            for path in rocm_paths:
+                if os.path.exists(path):
+                    print(f"✓ ROCm installation found at {path}")
+                    # List contents to verify
+                    try:
+                        contents = os.listdir(path)
+                        if "lib" in contents or "bin" in contents:
+                            print(f"  ROCm components detected in {path}")
+                    except OSError as e:
+                        print(f"  Unable to read {path}: {e}")
+
+        except Exception as e:
+            print(f"✗ ROCm environment check failed - {type(e).__name__}: {e}")
+
+    except ImportError:
+        print("✗ ONNX Runtime not installed")
+    except Exception as e:
+        print(f"✗ Error testing ONNX Runtime: {type(e).__name__}: {e}")
+
+
+def test_stft_operations():
+    """Test STFT operations which are critical for audio processing."""
+    print_section("3. STFT Operations")
+
+    if not torch.cuda.is_available():
+        print("✗ CUDA not available - skipping STFT tests")
+        return
+
+    try:
+        # Test window creation
+        print("Testing Hann window creation...")
+        window = torch.hann_window(2048, periodic=True)
+        window_gpu = window.to("cuda")
+        print(
+            f"✓ Hann window created and moved to GPU: shape={window_gpu.shape}, device={window_gpu.device}"
+        )
+
+        # Test different window types
+        try:
+            # Test Hamming window
+            hamming_window = torch.hamming_window(2048, periodic=True).to("cuda")
+            print("✓ Hamming window created on GPU")
+
+            # Test Blackman window
+            blackman_window = torch.blackman_window(2048, periodic=True).to("cuda")
+            print("✓ Blackman window created on GPU")
+
+            # Test custom window
+            custom_window = torch.ones(2048, device="cuda")
+            print("✓ Custom window created on GPU")
+
+        except Exception as e:
+            print(f"✗ Window type test failed - {type(e).__name__}: {e}")
+
+        # Test STFT on GPU
+        x_gpu = torch.randn(2, 44100).to("cuda")
+
+        # Initialize sentinel for contiguity check
+        result_gpu = None
+
+        # Try different n_fft values
+        for n_fft in [1024, 2048]:
+            hop_length = n_fft // 4
+            try:
+                result_gpu = torch.stft(
+                    x_gpu,
+                    n_fft=n_fft,
+                    hop_length=hop_length,
+                    window=window_gpu[:n_fft].to("cuda"),
+                    center=True,
+                    return_complex=True,
+                )
+                print(
+                    f"✓ STFT with n_fft={n_fft}: shape={result_gpu.shape}, device={result_gpu.device}"
+                )
+            except Exception as e:
+                print(f"✗ STFT with n_fft={n_fft}: FAILED - {type(e).__name__}: {e}")
+
+        # Test memory layout of result
+        if result_gpu is None:
+            print("  Skipping contiguity check because all STFT attempts failed")
+        elif result_gpu.is_contiguous():
+            print("  Result is contiguous")
+        else:
+            print("  Result is not contiguous")
+
+        # Test complex tensor operations
+        print("\nTesting complex tensor operations...")
+        try:
+            x_complex = torch.randn(2, 1025, 100, 2, device="cuda")
+            x_complex = torch.view_as_complex(x_complex)
+            print(f"✓ Complex tensor created: {x_complex.shape}, {x_complex.dtype}")
+
+            # Test different complex operations
+            try:
+                # Test complex multiplication
+                y_complex = torch.randn(
+                    2, 1025, 100, dtype=torch.complex64, device="cuda"
+                )
+                z_complex = x_complex * y_complex
+                print("✓ Complex multiplication successful")
+
+                # Test complex addition
+                w_complex = x_complex + y_complex
+                print("✓ Complex addition successful")
+
+                # Test complex magnitude
+                mag = torch.abs(x_complex)
+                print("✓ Complex magnitude calculation successful")
+
+                # Test complex angle
+                angle = torch.angle(x_complex)
+                print("✓ Complex angle calculation successful")
+
+            except Exception as e:
+                print(f"✗ Complex operation test failed - {type(e).__name__}: {e}")
+
+            # Test istft
+            result_istft = torch.istft(
+                x_complex,
+                n_fft=2048,
+                hop_length=512,
+                window=window_gpu,
+                center=True,
+                length=44100,
+            )
+            print(f"✓ ISTFT completed: {result_istft.shape}, {result_istft.device}")
+
+            # Test different istft parameters
+            try:
+                # Test with different length
+                result_istft_long = torch.istft(
+                    x_complex,
+                    n_fft=2048,
+                    hop_length=512,
+                    window=window_gpu,
+                    center=True,
+                    length=48000,
+                )
+                print(
+                    f"✓ ISTFT with longer length completed: {result_istft_long.shape}"
+                )
+
+            except Exception as e:
+                print(f"✗ ISTFT parameter test failed - {type(e).__name__}: {e}")
+
+        except Exception as e:
+            print(f"✗ Complex tensor operations: FAILED - {type(e).__name__}: {e}")
+
+        # Test ROCm-specific STFT optimizations
+        try:
+            print("\nTesting ROCm-specific STFT optimizations...")
+
+            # Test with different data types
+            x_float64 = torch.randn(2, 44100, dtype=torch.float64, device="cuda")
+            result_float64 = torch.stft(
+                x_float64,
+                n_fft=2048,
+                hop_length=512,
+                window=window_gpu[:2048].to(torch.float64),
+                center=True,
+                return_complex=True,
+            )
+            print("✓ STFT with float64 precision successful")
+
+            # Test with half precision
+            if (
+                torch.cuda.is_available()
+                and torch.cuda.get_device_properties(0).major >= 5
+            ):
+                x_float16 = torch.randn(2, 44100, dtype=torch.float16, device="cuda")
+                window_float16 = window_gpu[:2048].to(torch.float16)
+                result_float16 = torch.stft(
+                    x_float16,
+                    n_fft=2048,
+                    hop_length=512,
+                    window=window_float16,
+                    center=True,
+                    return_complex=True,
+                )
+                print("✓ STFT with float16 precision successful")
+
+        except Exception as e:
+            print(f"✗ ROCm STFT optimization test failed - {type(e).__name__}: {e}")
+
+    except Exception as e:
+        print(f"✗ STFT operations: FAILED - {type(e).__name__}: {e}")
+
+
+def test_memory_allocation():
+    """Test GPU memory allocation patterns and identify potential memory issues."""
+    print_section("4. GPU Memory Allocation")
+
+    if not torch.cuda.is_available():
+        print("✗ CUDA not available - skipping memory tests")
+        return
+
+    try:
+        # Get detailed GPU memory information
+        print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
+        print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
+        print(
+            f"GPU memory max allocated: {torch.cuda.max_memory_allocated() / 1024**2:.2f} MB"
+        )
+        print(
+            f"GPU memory max reserved: {torch.cuda.max_memory_reserved() / 1024**2:.2f} MB"
+        )
+
+        # Get GPU properties
+        device = torch.cuda.current_device()
+        props = torch.cuda.get_device_properties(device)
+        print(f"GPU Name: {props.name}")
+        print(f"GPU Compute Capability: {props.major}.{props.minor}")
+        print(f"GPU Total Memory: {props.total_memory / 1024**2:.2f} MB")
+        print(f"GPU Multi processor count: {props.multi_processor_count}")
+
+        # Test different tensor sizes with memory monitoring
+        sizes = [1024, 2048, 4096, 8192, 16384]
+        for size in sizes:
+            try:
+                # Report memory before allocation
+                mem_before = torch.cuda.memory_allocated() / 1024**2
+
+                # Allocate a large tensor
+                tensor = torch.randn(size, size, device="cuda")
+
+                # Report memory after allocation
+                mem_after = torch.cuda.memory_allocated() / 1024**2
+
+                print(
+                    f"✓ Successfully allocated {size}x{size} tensor ({tensor.element_size() * tensor.nelement() / 1024**2:.2f} MB)"
+                    f", memory change: +{mem_after - mem_before:.2f} MB"
+                )
+
+                # Perform an operation
+                result = torch.mm(tensor, tensor)
+                print(f"✓ Matrix multiplication completed")
+
+                # Clean up
+                del tensor, result
+                torch.cuda.empty_cache()
+
+                # Report memory after cleanup
+                mem_cleanup = torch.cuda.memory_allocated() / 1024**2
+                print(f"  Memory after cleanup: {mem_cleanup:.2f} MB")
+
+            except RuntimeError as e:
+                if "out of memory" in str(e).lower():
+                    print(f"✗ Allocation of {size}x{size} tensor: OOM - {e}")
+                    break
+                else:
+                    print(f"✗ Allocation of {size}x{size} tensor: FAILED - {e}")
+                    break
+            except Exception as e:
+                print(
+                    f"✗ Allocation of {size}x{size} tensor: FAILED - {type(e).__name__}: {e}"
+                )
+                break
+
+        # Test memory fragmentation by allocating and deallocating in random order
+        try:
+            print("\nTesting memory fragmentation...")
+            tensors = []
+            sizes_mb = [16, 32, 64, 128, 256, 512]
+
+            for i in range(20):  # Create 20 random allocations
+                size_mb = random.choice(sizes_mb)
+                size = int(
+                    (size_mb * 1024**2) / 4
+                )  # Convert MB to number of floats (4 bytes each)
+                size = int(size**0.5)  # Make it a square tensor
+
+                try:
+                    tensor = torch.randn(size, size, device="cuda")
+                    tensors.append(tensor)
+
+                    if len(tensors) % 5 == 0:  # Deallocate every 5th allocation
+                        del tensors[:]
+                        import gc
+
+                        gc.collect()
+                        torch.cuda.empty_cache()
+
+                except RuntimeError as e:
+                    if "out of memory" in str(e).lower():
+                        print(
+                            f"✗ Memory fragmentation test failed at iteration {i}: OOM"
+                        )
+                        break
+
+            else:
+                print("✓ Memory fragmentation test completed successfully")
+
+            # Clean up remaining tensors
+            tensors.clear()
+            torch.cuda.empty_cache()
+
+        except Exception as e:
+            print(f"✗ Memory fragmentation test failed - {type(e).__name__}: {e}")
+
+        # Test large contiguous allocation
+        try:
+            print("\nTesting large contiguous allocation...")
+            # Try to allocate 80% of free memory as a single tensor
+            free_memory = props.total_memory - torch.cuda.memory_reserved()
+            target_memory = int(free_memory * 0.8)
+            elements = target_memory // 4  # 4 bytes per float
+            side_length = int(elements**0.5)
+
+            print(
+                f"Attempting to allocate {target_memory / 1024**2:.2f} MB ({side_length}x{side_length} tensor)"
+            )
+
+            tensor = torch.randn(side_length, side_length, device="cuda")
+            print(f"✓ Successfully allocated large contiguous tensor")
+
+            # Test operation on large tensor
+            result = torch.mm(tensor, tensor)
+            print(f"✓ Matrix multiplication completed on large tensor")
+
+            del tensor, result
+            torch.cuda.empty_cache()
+
+        except RuntimeError as e:
+            if "out of memory" in str(e).lower():
+                print(f"✗ Large contiguous allocation failed: OOM")
+            else:
+                print(f"✗ Large contiguous allocation failed - {type(e).__name__}: {e}")
+        except Exception as e:
+            print(
+                f"✗ Large contiguous allocation test failed - {type(e).__name__}: {e}"
+            )
+
+        # Test ROCm-specific memory operations
+        print("\nTesting ROCm-specific memory operations...")
+        try:
+            # Test pinned memory (important for ROCm)
+            pinned_tensor = torch.randn(1024, 1024, pin_memory=True)
+            print("✓ Pinned memory allocation successful")
+            del pinned_tensor
+
+            # Test non-blocking operations
+            src_tensor = torch.randn(1024, 1024, device="cuda")
+            dst_tensor = torch.randn(1024, 1024, device="cuda")
+            dst_tensor.copy_(src_tensor, non_blocking=True)
+            print("✓ Non-blocking copy successful")
+
+            # Test memory pinning with CPU tensor
+            cpu_tensor = torch.randn(1024, 1024)
+            cpu_tensor = cpu_tensor.pin_memory()
+            gpu_tensor = cpu_tensor.to("cuda", non_blocking=True)
+            print("✓ Pinned memory transfer successful")
+
+        except Exception as e:
+            print(f"✗ ROCm memory operation failed - {type(e).__name__}: {e}")
+
+    except Exception as e:
+        print(f"✗ Memory allocation tests: FAILED - {type(e).__name__}: {e}")
+
+
+def test_model_types():
+    """Test loading and running all model types on ROCm."""
+    print_section("5. Model Type Testing")
+
+    if not torch.cuda.is_available():
+        print("✗ CUDA not available - skipping model tests")
+        return
+
+    try:
+        # Test importing model modules one at a time to isolate issues
+        print("Importing Roformer modules...")
+        from audio_separator.separator.roformer.roformer_loader import RoformerLoader
+
+        print("✓ RoformerLoader imported")
+
+        print("Importing Demucs modules...")
+        from audio_separator.separator.uvr_lib_v5.demucs.hdemucs import HDemucs
+
+        print("✓ HDemucs imported")
+
+        print("Importing BSRoformer...")
+        from audio_separator.separator.uvr_lib_v5.roformer.bs_roformer import BSRoformer
+
+        print("✓ BSRoformer imported")
+
+        print("Importing MelBandRoformer...")
+        from audio_separator.separator.uvr_lib_v5.roformer.mel_band_roformer import (
+            MelBandRoformer,
+        )
+
+        print("✓ MelBandRoformer imported")
+
+        print("Importing TFC_TDF_net (MDX)...")
+        from audio_separator.separator.uvr_lib_v5.tfc_tdf_v3 import TFC_TDF_net
+
+        print("✓ TFC_TDF_net imported")
+
+        # Test Roformer models
+        print_section("5.1 Roformer Models")
+
+        # Test BSRoformer
+        try:
+            # Use proper freqs_per_bands that sums to 1025 for n_fft=2048
+            model = BSRoformer(
+                dim=512,
+                depth=2,
+                freqs_per_bands=(
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    12,
+                    12,
+                    12,
+                    12,
+                    12,
+                    12,
+                    12,
+                    12,
+                    24,
+                    24,
+                    24,
+                    24,
+                    24,
+                    24,
+                    24,
+                    24,
+                    48,
+                    48,
+                    48,
+                    48,
+                    48,
+                    48,
+                    48,
+                    48,
+                    128,
+                    127,
+                ),
+                stft_n_fft=2048,
+                stft_hop_length=512,
+                stft_win_length=2048,
+            )
+            model.to("cuda")
+            model.eval()
+
+            print("✓ Successfully created and moved BSRoformer model to GPU")
+
+            # Test with a small input
+            test_input = torch.randn(1, 44100, device="cuda", requires_grad=True)
+            output = model(test_input)
+
+            print(f"✓ Model forward pass completed: output shape {output.shape}")
+
+            # Test gradient computation
+            try:
+                model.train()
+                loss = torch.nn.functional.l1_loss(output, torch.zeros_like(output))
+                loss.backward()
+                print("✓ Gradient computation successful")
+                model.eval()
+            except Exception as e:
+                print(f"✗ Gradient computation failed - {type(e).__name__}: {e}")
+
+        except Exception as e:
+            print(f"✗ BSRoformer model operations: FAILED - {type(e).__name__}: {e}")
+
+        # Test MelBandRoformer
+        try:
+            model = MelBandRoformer(
+                dim=512,
+                depth=2,
+                num_bands=60,
+                stft_n_fft=2048,
+                stft_hop_length=512,
+                stft_win_length=2048,
+            )
+            model.to("cuda")
+            model.eval()
+
+            print("✓ Successfully created and moved MelBandRoformer model to GPU")
+
+            # Test with a small input
+            test_input = torch.randn(1, 44100, device="cuda", requires_grad=True)
+            output = model(test_input)
+
+            print(f"✓ Model forward pass completed: output shape {output.shape}")
+
+            # Test gradient computation
+            try:
+                model.train()
+                loss = torch.nn.functional.l1_loss(output, torch.zeros_like(output))
+                loss.backward()
+                print("✓ Gradient computation successful")
+                model.eval()
+            except Exception as e:
+                print(f"✗ Gradient computation failed - {type(e).__name__}: {e}")
+
+        except Exception as e:
+            print(
+                f"✗ MelBandRoformer model operations: FAILED - {type(e).__name__}: {e}"
+            )
+
+        # Test RoformerLoader
+        try:
+            loader = RoformerLoader()
+            print("✓ Successfully created RoformerLoader")
+
+            # Test configuration validation
+            config = {
+                "dim": 512,
+                "depth": 2,
+                "freqs_per_bands": (
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    2,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    4,
+                    12,
+                    12,
+                    12,
+                    12,
+                    12,
+                    12,
+                    12,
+                    12,
+                    24,
+                    24,
+                    24,
+                    24,
+                    24,
+                    24,
+                    24,
+                    24,
+                    48,
+                    48,
+                    48,
+                    48,
+                    48,
+                    48,
+                    48,
+                    48,
+                    128,
+                    127,
+                ),
+                "stft_n_fft": 2048,
+                "stft_hop_length": 512,
+                "stft_win_length": 2048,
+            }
+
+            is_valid = loader.validate_configuration(config, "bs_roformer")
+            if is_valid:
+                print("✓ Configuration validation passed")
+            else:
+                print("✗ Configuration validation failed")
+
+        except Exception as e:
+            print(f"✗ RoformerLoader test failed - {type(e).__name__}: {e}")
+
+        # Test Demucs models
+        print_section("5.2 Demucs Models")
+        try:
+            # Test HDemucs model architecture
+            demucs_model = HDemucs(sources=["drums", "bass", "other", "vocals"])
+            demucs_model.to("cuda")
+            demucs_model.train()  # Keep in training mode for backward pass
+
+            print("✓ Successfully created and moved Demucs model to GPU")
+
+            # Test with a small input
+            test_input = torch.randn(1, 2, 44100, device="cuda", requires_grad=True)
+            output = demucs_model(test_input)
+
+            print(f"✓ Demucs model forward pass completed: output shape {output.shape}")
+
+            # Test gradient computation
+            try:
+                loss = torch.nn.functional.l1_loss(output, torch.zeros_like(output))
+                loss.backward()
+                print("✓ Demucs gradient computation successful")
+                demucs_model.eval()
+            except Exception as e:
+                print(f"✗ Demucs gradient computation failed - {type(e).__name__}: {e}")
+
+        except Exception as e:
+            print(f"✗ Demucs model operations: FAILED - {type(e).__name__}: {e}")
+
+        # Test MDX models
+        print_section("5.3 MDX Models")
+        try:
+            # Test TFC_TDF_net model
+            from ml_collections import ConfigDict
+
+            # Create a complete config for testing
+            test_config = ConfigDict(
+                {
+                    "audio": {
+                        "n_fft": 2048,
+                        "hop_length": 512,
+                        "dim_f": 1025,
+                        "num_channels": 2,
+                    },
+                    "inference": {
+                        "dim_t": 256,
+                    },
+                    "training": {
+                        "instruments": ["vocals", "instrumental"],
+                        "target_instrument": "vocals",
+                    },
+                    "model": {
+                        "norm": "Identity",
+                        "act": "GELU",
+                        "num_subbands": 1,
+                        "num_scales": 2,
+                        "scale": [2, 2],
+                        "num_blocks_per_scale": 2,
+                        "num_channels": 16,
+                        "growth": 8,
+                        "bottleneck_factor": 4,
+                    },
+                }
+            )
+
+            print("Creating TFC_TDF_net model...")
+            try:
+                model = TFC_TDF_net(test_config, device="cuda")
+                print(f"Model created. Model device: {next(model.parameters()).device}")
+
+                # Print first few parameters to verify model creation
+                for i, (name, param) in enumerate(model.named_parameters()):
+                    if i < 3:  # Just show first few
+                        print(f"  {name}: {param.shape} {param.device}")
+                    else:
+                        print("  ...")
+                        break
+
+            except Exception as e:
+                print(f"✗ Failed to create model - {type(e).__name__}: {e}")
+                return
+
+            print("Moving model to CUDA...")
+            try:
+                model.to("cuda")
+                print(
+                    f"Model moved to CUDA. Current device: {next(model.parameters()).device}"
+                )
+
+                # Verify parameters are on CUDA
+                for i, (name, param) in enumerate(model.named_parameters()):
+                    if i < 3:  # Just show first few
+                        print(f"  {name}: {param.shape} {param.device}")
+                    else:
+                        print("  ...")
+                        break
+
+            except Exception as e:
+                print(f"✗ Failed to move model to CUDA - {type(e).__name__}: {e}")
+                return
+
+            model.eval()
+            print("✓ Successfully created and moved TFC_TDF_net model to GPU")
+
+            # Test with a small input
+            test_input = torch.randn(1, 2, 44100, device="cuda", requires_grad=True)
+            print(f"Test input created on device: {test_input.device}")
+
+            try:
+                print("Running forward pass...")
+                print(f"Input shape: {test_input.shape}")
+                output = model(test_input)
+                print(
+                    f"✓ MDX model forward pass completed: output shape {output.shape}"
+                )
+
+                # Test gradient computation
+                try:
+                    model.train()
+                    print("Computing loss...")
+                    loss = torch.nn.functional.l1_loss(output, torch.zeros_like(output))
+                    print(f"Loss computed: {loss.item()}")
+                    print("Running backward pass...")
+                    loss.backward()
+                    print("✓ MDX gradient computation successful")
+                    model.eval()
+                except Exception as e:
+                    print(
+                        f"✗ MDX gradient computation failed - {type(e).__name__}: {e}"
+                    )
+            except Exception as e:
+                print(f"✗ MDX forward pass failed - {type(e).__name__}: {e}")
+
+        except Exception as e:
+            print(f"✗ MDX model operations: FAILED - {type(e).__name__}: {e} {str(e)}")
+
+    except ImportError as e:
+        print(f"✗ Failed to import model modules: {e}")
+    except Exception as e:
+        print(f"✗ Model loading: FAILED - {type(e).__name__}: {e}")
+
+
+def main():
+    """Main function to run all tests."""
+    print("ROCm Debug Script for audio-separator")
+    print(f"Python version: {sys.version}")
+    print(f"Working directory: {os.getcwd()}")
+
+    # Print system information
+    print_section("0. System Information")
+    system_info = get_system_info()
+    if "cpu_model" in system_info:
+        print(f"CPU: {system_info['cpu_model']}")
+    if "cpu_cores" in system_info:
+        print(f"CPU Cores: {system_info['cpu_cores']}")
+    if "memory_total" in system_info:
+        print(f"Memory: {system_info['memory_total']:.0f} MB")
+    if "kernel_version" in system_info:
+        print(f"Kernel: {system_info['kernel_version']}")
+
+    # Print ROCm information
+    rocm_info = get_rocm_info()
+    if "version" in rocm_info:
+        print(f"ROCm Version: {rocm_info['version']}")
+    if "hip_version" in rocm_info:
+        print(f"HIP Version: {rocm_info['hip_version']}")
+
+    # Print environment variables
+    print(
+        f"HSA_OVERRIDE_GFX_VERSION: {os.environ.get('HSA_OVERRIDE_GFX_VERSION', 'Not set')}"
+    )
+    print(f"PYTORCH_ROCM_ARCH: {os.environ.get('PYTORCH_ROCM_ARCH', 'Not set')}")
+
+    # Check for relevant environment variables
+    relevant_vars = [
+        v
+        for v in os.environ.keys()
+        if "rocm" in v.lower() or "gpu" in v.lower() or "hip" in v.lower()
+    ]
+    if relevant_vars:
+        print("Relevant environment variables:")
+        for var in relevant_vars:
+            print(f"  {var}: {os.environ[var]}")
+
+    # Check PyTorch ROCm build
+    print_section("0.1 PyTorch ROCm Build Check")
+    if "+rocm" in torch.__version__:
+        print("✓ PyTorch built with ROCm support")
+    else:
+        print("✗ PyTorch not built with ROCm support - consider reinstalling with ROCm")
+
+    # Check for ROCm-specific environment variables
+    print_section("0.2 ROCm Environment Variables")
+    rocm_vars = [v for v in os.environ.keys() if "rocm" in v.lower()]
+    if rocm_vars:
+        for var in rocm_vars:
+            print(f"  {var}: {os.environ[var]}")
+    else:
+        print("  No ROCm-specific environment variables set")
+
+    # Check for known ROCm issues
+    print_section("0.3 Known ROCm Issues Check")
+    if torch.cuda.is_available():
+        # Check for gfx1032 issue (wrapped in try/except for stability)
+        try:
+            device_props = torch.cuda.get_device_properties(0)
+            if "gfx1032" in str(device_props):
+                if "HSA_OVERRIDE_GFX_VERSION" not in os.environ:
+                    print("✗ gfx1032 GPU detected but HSA_OVERRIDE_GFX_VERSION not set")
+                    print("  Recommendation: export HSA_OVERRIDE_GFX_VERSION=10.3.2")
+                else:
+                    print("✓ gfx1032 GPU workaround enabled")
+        except Exception as e:
+            print(f"  Could not check device properties: {e}")
+
+        # Check for autocast issues
+        if hasattr(torch.cuda, "amp"):
+            print("✓ CUDA AMP (autocast) available")
+        else:
+            print("✗ CUDA AMP (autocast) not available - may affect performance")
+
+    # Check memory layout (disabled due to ROCm stability issues)
+    print_section("0.4 Memory Layout Check")
+    if torch.cuda.is_available():
+        print("Memory layout check skipped for ROCm stability")
+
+    # Run only MDX model test for focused debugging
+    # Note: STFT operations may crash on certain ROCm configurations
+    # This is a known ROCm limitation, not a code bug
+    print("\nNOTE: If STFT tests crash, this indicates a ROCm system-level issue")
+    print("      not a code bug. ONNX-based models (MDX-Net) will still work.\n")
+
+    try:
+        test_model_types()
+    except Exception as e:
+        print(f"Model test crashed (expected on some ROCm configurations): {e}")
+        print("This is a ROCm/PyTorch system issue, not a code bug.")
+
+    print("\n" + "=" * 50)
+    print("Debugging complete. Review the results above to identify issues.")
+    print("=" * 50)
+
+
+if __name__ == "__main__":
+    # Capture output to file
+    import io
+    from contextlib import redirect_stdout
+
+    output_buffer = io.StringIO()
+
+    try:
+        with redirect_stdout(output_buffer):
+            main()
+        output = output_buffer.getvalue()
+        print(output)
+
+        # Save to file
+        with open("debug_results.txt", "w") as f:
+            f.write(output)
+        print("\nResults saved to debug_results.txt")
+
+    except Exception as e:
+        print(f"\nUnexpected error: {type(e).__name__}: {e}")
+        traceback.print_exc()
+        # Save error to file
+        with open("debug_results.txt", "w") as f:
+            f.write(f"Error: {type(e).__name__}: {e}\n")
+            traceback.print_exc(file=f)
diff --git a/poetry.lock b/poetry.lock
index f498a90b..7fa7eee3 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
 
 [[package]]
 name = "absl-py"
@@ -387,7 +387,7 @@ description = "Colored terminal output for Python's logging module"
 optional = true
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 groups = ["main"]
-markers = "extra == \"cpu\" or extra == \"gpu\" or extra == \"dml\""
+markers = "extra == \"cpu\" or extra == \"gpu\" or extra == \"dml\" or extra == \"rocm\""
 files = [
     {file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"},
     {file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"},
@@ -935,7 +935,7 @@ description = "The FlatBuffers serialization format for Python"
 optional = true
 python-versions = "*"
 groups = ["main"]
-markers = "extra == \"cpu\" or extra == \"gpu\" or extra == \"dml\""
+markers = "extra == \"cpu\" or extra == \"gpu\" or extra == \"dml\" or extra == \"rocm\""
 files = [
     {file = "flatbuffers-25.9.23-py2.py3-none-any.whl", hash = "sha256:255538574d6cb6d0a79a17ec8bc0d30985913b87513a01cce8bcdb6b4c44d0e2"},
     {file = "flatbuffers-25.9.23.tar.gz", hash = "sha256:676f9fa62750bb50cf531b42a0a2a118ad8f7f797a511eda12881c016f093b12"},
@@ -1069,7 +1069,7 @@ description = "Human friendly output for text interfaces using Python"
 optional = true
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 groups = ["main"]
-markers = "extra == \"cpu\" or extra == \"gpu\" or extra == \"dml\""
+markers = "extra == \"cpu\" or extra == \"gpu\" or extra == \"dml\" or extra == \"rocm\""
 files = [
     {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"},
     {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"},
@@ -2562,6 +2562,29 @@ sympy = "*"
 cuda = ["nvidia-cuda-nvrtc-cu12 (>=12.0,<13.0)", "nvidia-cuda-runtime-cu12 (>=12.0,<13.0)", "nvidia-cufft-cu12 (>=11.0,<12.0)", "nvidia-curand-cu12 (>=10.0,<11.0)"]
 cudnn = ["nvidia-cudnn-cu12 (>=9.0,<10.0)"]
 
+[[package]]
+name = "onnxruntime-rocm"
+version = "1.22.2.post1"
+description = "ONNX Runtime is a runtime accelerator for Machine Learning models"
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"rocm\""
+files = [
+    {file = "onnxruntime_rocm-1.22.2.post1-cp310-cp310-manylinux_2_35_x86_64.whl", hash = "sha256:5d7fe86d657ae9808db49fca050c0215e24cd5e1bec1f80a70b6315b2062eabc"},
+    {file = "onnxruntime_rocm-1.22.2.post1-cp311-cp311-manylinux_2_35_x86_64.whl", hash = "sha256:e0940061b679224a7492125884e6988470aee2ef02c81b09eb46f02e0ddb0135"},
+    {file = "onnxruntime_rocm-1.22.2.post1-cp312-cp312-manylinux_2_35_x86_64.whl", hash = "sha256:ed3add7c2f0f1c164656e5c0f7a01f130cb4317f7954391da025c2d955704709"},
+    {file = "onnxruntime_rocm-1.22.2.post1-cp313-cp313-manylinux_2_35_x86_64.whl", hash = "sha256:19b56e9e41da3c7042dc97223ef46976ed8bbdf0ffe43646129f773647794b44"},
+]
+
+[package.dependencies]
+coloredlogs = "*"
+flatbuffers = "*"
+numpy = ">=1.21.6"
+packaging = "*"
+protobuf = "*"
+sympy = "*"
+
 [[package]]
 name = "packaging"
 version = "25.0"
@@ -2596,6 +2619,8 @@ groups = ["main", "dev"]
 files = [
     {file = "pillow-11.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1b9c17fd4ace828b3003dfd1e30bff24863e0eb59b535e8f80194d9cc7ecf860"},
     {file = "pillow-11.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:65dc69160114cdd0ca0f35cb434633c75e8e7fad4cf855177a05bf38678f73ad"},
+    {file = "pillow-11.3.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7107195ddc914f656c7fc8e4a5e1c25f32e9236ea3ea860f257b0436011fddd0"},
+    {file = "pillow-11.3.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc3e831b563b3114baac7ec2ee86819eb03caa1a2cef0b481a5675b59c4fe23b"},
     {file = "pillow-11.3.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f182ebd2303acf8c380a54f615ec883322593320a9b00438eb842c1f37ae50"},
     {file = "pillow-11.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4445fa62e15936a028672fd48c4c11a66d641d2c05726c7ec1f8ba6a572036ae"},
     {file = "pillow-11.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:71f511f6b3b91dd543282477be45a033e4845a40278fa8dcdbfdb07109bf18f9"},
@@ -2605,6 +2630,8 @@ files = [
     {file = "pillow-11.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:819931d25e57b513242859ce1876c58c59dc31587847bf74cfe06b2e0cb22d2f"},
     {file = "pillow-11.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1cd110edf822773368b396281a2293aeb91c90a2db00d78ea43e7e861631b722"},
     {file = "pillow-11.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9c412fddd1b77a75aa904615ebaa6001f169b26fd467b4be93aded278266b288"},
+    {file = "pillow-11.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1aa4de119a0ecac0a34a9c8bde33f34022e2e8f99104e47a3ca392fd60e37d"},
+    {file = "pillow-11.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:91da1d88226663594e3f6b4b8c3c8d85bd504117d043740a8e0ec449087cc494"},
     {file = "pillow-11.3.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:643f189248837533073c405ec2f0bb250ba54598cf80e8c1e043381a60632f58"},
     {file = "pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:106064daa23a745510dabce1d84f29137a37224831d88eb4ce94bb187b1d7e5f"},
     {file = "pillow-11.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd8ff254faf15591e724dc7c4ddb6bf4793efcbe13802a4ae3e863cd300b493e"},
@@ -2614,6 +2641,8 @@ files = [
     {file = "pillow-11.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:30807c931ff7c095620fe04448e2c2fc673fcbb1ffe2a7da3fb39613489b1ddd"},
     {file = "pillow-11.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdae223722da47b024b867c1ea0be64e0df702c5e0a60e27daad39bf960dd1e4"},
     {file = "pillow-11.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:921bd305b10e82b4d1f5e802b6850677f965d8394203d182f078873851dada69"},
+    {file = "pillow-11.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb76541cba2f958032d79d143b98a3a6b3ea87f0959bbe256c0b5e416599fd5d"},
+    {file = "pillow-11.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:67172f2944ebba3d4a7b54f2e95c786a3a50c21b88456329314caaa28cda70f6"},
     {file = "pillow-11.3.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f07ed9f56a3b9b5f49d3661dc9607484e85c67e27f3e8be2c7d28ca032fec7"},
     {file = "pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:676b2815362456b5b3216b4fd5bd89d362100dc6f4945154ff172e206a22c024"},
     {file = "pillow-11.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3e184b2f26ff146363dd07bde8b711833d7b0202e27d13540bfe2e35a323a809"},
@@ -2626,6 +2655,8 @@ files = [
     {file = "pillow-11.3.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:7859a4cc7c9295f5838015d8cc0a9c215b77e43d07a25e460f35cf516df8626f"},
     {file = "pillow-11.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ec1ee50470b0d050984394423d96325b744d55c701a439d2bd66089bff963d3c"},
     {file = "pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7db51d222548ccfd274e4572fdbf3e810a5e66b00608862f947b163e613b67dd"},
+    {file = "pillow-11.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2d6fcc902a24ac74495df63faad1884282239265c6839a0a6416d33faedfae7e"},
+    {file = "pillow-11.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f0f5d8f4a08090c6d6d578351a2b91acf519a54986c055af27e7a93feae6d3f1"},
     {file = "pillow-11.3.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c37d8ba9411d6003bba9e518db0db0c58a680ab9fe5179f040b0463644bc9805"},
     {file = "pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13f87d581e71d9189ab21fe0efb5a23e9f28552d5be6979e84001d3b8505abe8"},
     {file = "pillow-11.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:023f6d2d11784a465f09fd09a34b150ea4672e85fb3d05931d89f373ab14abb2"},
@@ -2635,6 +2666,8 @@ files = [
     {file = "pillow-11.3.0-cp313-cp313-win_arm64.whl", hash = "sha256:1904e1264881f682f02b7f8167935cce37bc97db457f8e7849dc3a6a52b99580"},
     {file = "pillow-11.3.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4c834a3921375c48ee6b9624061076bc0a32a60b5532b322cc0ea64e639dd50e"},
     {file = "pillow-11.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5e05688ccef30ea69b9317a9ead994b93975104a677a36a8ed8106be9260aa6d"},
+    {file = "pillow-11.3.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1019b04af07fc0163e2810167918cb5add8d74674b6267616021ab558dc98ced"},
+    {file = "pillow-11.3.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f944255db153ebb2b19c51fe85dd99ef0ce494123f21b9db4877ffdfc5590c7c"},
     {file = "pillow-11.3.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1f85acb69adf2aaee8b7da124efebbdb959a104db34d3a2cb0f3793dbae422a8"},
     {file = "pillow-11.3.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05f6ecbeff5005399bb48d198f098a9b4b6bdf27b8487c7f38ca16eeb070cd59"},
     {file = "pillow-11.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a7bc6e6fd0395bc052f16b1a8670859964dbd7003bd0af2ff08342eb6e442cfe"},
@@ -2644,6 +2677,8 @@ files = [
     {file = "pillow-11.3.0-cp313-cp313t-win_arm64.whl", hash = "sha256:8797edc41f3e8536ae4b10897ee2f637235c94f27404cac7297f7b607dd0716e"},
     {file = "pillow-11.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d9da3df5f9ea2a89b81bb6087177fb1f4d1c7146d583a3fe5c672c0d94e55e12"},
     {file = "pillow-11.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0b275ff9b04df7b640c59ec5a3cb113eefd3795a8df80bac69646ef699c6981a"},
+    {file = "pillow-11.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0743841cabd3dba6a83f38a92672cccbd69af56e3e91777b0ee7f4dba4385632"},
+    {file = "pillow-11.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2465a69cf967b8b49ee1b96d76718cd98c4e925414ead59fdf75cf0fd07df673"},
     {file = "pillow-11.3.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:41742638139424703b4d01665b807c6468e23e699e8e90cffefe291c5832b027"},
     {file = "pillow-11.3.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:93efb0b4de7e340d99057415c749175e24c8864302369e05914682ba642e5d77"},
     {file = "pillow-11.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7966e38dcd0fa11ca390aed7c6f20454443581d758242023cf36fcb319b1a874"},
@@ -2653,6 +2688,8 @@ files = [
     {file = "pillow-11.3.0-cp314-cp314-win_arm64.whl", hash = "sha256:155658efb5e044669c08896c0c44231c5e9abcaadbc5cd3648df2f7c0b96b9a6"},
     {file = "pillow-11.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:59a03cdf019efbfeeed910bf79c7c93255c3d54bc45898ac2a4140071b02b4ae"},
     {file = "pillow-11.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f8a5827f84d973d8636e9dc5764af4f0cf2318d26744b3d902931701b0d46653"},
+    {file = "pillow-11.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ee92f2fd10f4adc4b43d07ec5e779932b4eb3dbfbc34790ada5a6669bc095aa6"},
+    {file = "pillow-11.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c96d333dcf42d01f47b37e0979b6bd73ec91eae18614864622d9b87bbd5bbf36"},
     {file = "pillow-11.3.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c96f993ab8c98460cd0c001447bff6194403e8b1d7e149ade5f00594918128b"},
     {file = "pillow-11.3.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:41342b64afeba938edb034d122b2dda5db2139b9a4af999729ba8818e0056477"},
     {file = "pillow-11.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:068d9c39a2d1b358eb9f245ce7ab1b5c3246c7c8c7d9ba58cfa5b43146c06e50"},
@@ -2662,6 +2699,8 @@ files = [
     {file = "pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa"},
     {file = "pillow-11.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:48d254f8a4c776de343051023eb61ffe818299eeac478da55227d96e241de53f"},
     {file = "pillow-11.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7aee118e30a4cf54fdd873bd3a29de51e29105ab11f9aad8c32123f58c8f8081"},
+    {file = "pillow-11.3.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:23cff760a9049c502721bdb743a7cb3e03365fafcdfc2ef9784610714166e5a4"},
+    {file = "pillow-11.3.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6359a3bc43f57d5b375d1ad54a0074318a0844d11b76abccf478c37c986d3cfc"},
     {file = "pillow-11.3.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:092c80c76635f5ecb10f3f83d76716165c96f5229addbd1ec2bdbbda7d496e06"},
     {file = "pillow-11.3.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cadc9e0ea0a2431124cde7e1697106471fc4c1da01530e679b2391c37d3fbb3a"},
     {file = "pillow-11.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6a418691000f2a418c9135a7cf0d797c1bb7d9a485e61fe8e7722845b95ef978"},
@@ -2671,11 +2710,15 @@ files = [
     {file = "pillow-11.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:6abdbfd3aea42be05702a8dd98832329c167ee84400a1d1f61ab11437f1717eb"},
     {file = "pillow-11.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3cee80663f29e3843b68199b9d6f4f54bd1d4a6b59bdd91bceefc51238bcb967"},
     {file = "pillow-11.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b5f56c3f344f2ccaf0dd875d3e180f631dc60a51b314295a3e681fe8cf851fbe"},
+    {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e67d793d180c9df62f1f40aee3accca4829d3794c95098887edc18af4b8b780c"},
+    {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d000f46e2917c705e9fb93a3606ee4a819d1e3aa7a9b442f6444f07e77cf5e25"},
     {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:527b37216b6ac3a12d7838dc3bd75208ec57c1c6d11ef01902266a5a0c14fc27"},
     {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:be5463ac478b623b9dd3937afd7fb7ab3d79dd290a28e2b6df292dc75063eb8a"},
     {file = "pillow-11.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:8dc70ca24c110503e16918a658b869019126ecfe03109b754c402daff12b3d9f"},
     {file = "pillow-11.3.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7c8ec7a017ad1bd562f93dbd8505763e688d388cde6e4a010ae1486916e713e6"},
     {file = "pillow-11.3.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9ab6ae226de48019caa8074894544af5b53a117ccb9d3b3dcb2871464c829438"},
+    {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fe27fb049cdcca11f11a7bfda64043c37b30e6b91f10cb5bab275806c32f6ab3"},
+    {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:465b9e8844e3c3519a983d58b80be3f668e2a7a5db97f2784e7079fbc9f9822c"},
     {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5418b53c0d59b3824d05e029669efa023bbef0f3e92e75ec8428f3799487f361"},
     {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:504b6f59505f08ae014f724b6207ff6222662aab5cc9542577fb084ed0676ac7"},
     {file = "pillow-11.3.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c84d689db21a1c397d001aa08241044aa2069e7587b398c8cc63020390b1c1b8"},
@@ -2827,7 +2870,7 @@ description = "A python implementation of GNU readline."
 optional = true
 python-versions = ">=3.8"
 groups = ["main"]
-markers = "sys_platform == \"win32\" and (extra == \"cpu\" or extra == \"gpu\" or extra == \"dml\")"
+markers = "sys_platform == \"win32\" and (extra == \"cpu\" or extra == \"gpu\" or extra == \"dml\" or extra == \"rocm\")"
 files = [
     {file = "pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6"},
     {file = "pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7"},
@@ -2918,6 +2961,13 @@ optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
+    {file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"},
+    {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"},
+    {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efd7b85f94a6f21e4932043973a7ba2613b059c4a000551892ac9f1d11f5baf3"},
+    {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22ba7cfcad58ef3ecddc7ed1db3409af68d023b7f940da23c6c2a1890976eda6"},
+    {file = "PyYAML-6.0.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6344df0d5755a2c9a276d4473ae6b90647e216ab4757f8426893b5dd2ac3f369"},
+    {file = "PyYAML-6.0.3-cp38-cp38-win32.whl", hash = "sha256:3ff07ec89bae51176c0549bc4c63aa6202991da2d9a6129d7aef7f1407d3f295"},
+    {file = "PyYAML-6.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:5cf4e27da7e3fbed4d6c3d8e797387aaad68102272f8f9752883bc32d61cb87b"},
     {file = "pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b"},
     {file = "pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956"},
     {file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8"},
@@ -3918,8 +3968,9 @@ zstd = ["zstandard (>=0.18.0)"]
 cpu = ["onnxruntime"]
 dml = ["onnxruntime-directml", "torch_directml"]
 gpu = ["onnxruntime-gpu"]
+rocm = ["onnxruntime-rocm"]
 
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.10"
-content-hash = "399816772283c841da09cb934329f3b5e5b5909ca210b0d79ad5689bb05b6d34"
+content-hash = "891ca1c4e59efd0b6ffe379fa4f2bd5a00047683c23d83915198e21b60f3c952"
diff --git a/pyproject.toml b/pyproject.toml
index 06c551a8..2eaa7cff 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,7 @@ onnx2torch-py313 = ">=1.6"
 onnxruntime = { version = ">=1.17", optional = true }
 onnxruntime-gpu = { version = ">=1.17", optional = true }
 onnxruntime-directml = { version = ">=1.17", optional = true }
+onnxruntime-rocm = { version = ">=1.17", optional = true }
 julius = ">=0.2"
 diffq-fixed = { version = ">=0.2", platform = "win32" }
 diffq = { version = ">=0.2", platform = "!=win32" }
@@ -62,6 +63,7 @@ soundfile = ">=0.12"
 cpu = ["onnxruntime"]
 gpu = ["onnxruntime-gpu"]
 dml = ["onnxruntime-directml", "torch_directml"]
+rocm = ["onnxruntime-rocm"]
 
 [tool.poetry.scripts]
 audio-separator = 'audio_separator.utils.cli:main'