Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,12 @@
*.pyo
*.pyd

# Build artifacts
*.egg-info/
dist/
build/

# Test outputs
*.trk
*.trx
*.nii.gz
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ SHELL ["/bin/bash", "-c"]

ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install --assume-yes curl
RUN apt-get update && apt-get install --assume-yes curl git

RUN curl -L "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" \
-o "/tmp/Miniconda3.sh"
Expand Down
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@
## Installation
To install from pypi, simply run `pip install "cuslines[cu13]"` or `pip install "cuslines[cu12]"` depending on your CUDA version.

To install from dev, simply run `pip install ".[cu13]"` or `pip install ".[cu12]"` in the top-level repository directory.
For Apple Silicon (M1/M2/M3/M4), install the Metal backend: `pip install "cuslines[metal]"`

To install from dev, simply run `pip install ".[cu13]"` or `pip install ".[cu12]"` (or `pip install ".[metal]"` on macOS) in the top-level repository directory.

The GPU backend is auto-detected at import time. On macOS with Apple Silicon, Metal is used; on Linux/Windows with an NVIDIA GPU, CUDA is used.

## Running the examples
This repository contains several example usage scripts.

The script `run_gpu_streamlines.py` demonstrates how to run any diffusion MRI dataset on the GPU. It can also run on the CPU for reference, if the argument `--device=cpu` is used. If not data is passed, it will donaload and use the HARDI dataset.
The script `run_gpu_streamlines.py` demonstrates how to run any diffusion MRI dataset on the GPU. It can also run on the CPU for reference, if the argument `--device=cpu` is used. Use `--device=metal` to explicitly select the Metal backend on macOS. If no data is passed, it will download and use the HARDI dataset.

To run the baseline CPU example on a random set of 1000 seeds, this is the command and example output:
```
Expand Down
56 changes: 49 additions & 7 deletions cuslines/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,55 @@
from .cuda_python import (
GPUTracker,
ProbDirectionGetter,
PttDirectionGetter,
BootDirectionGetter
)
import platform as _platform


def _detect_backend():
"""Auto-detect the best available GPU backend."""
system = _platform.system()
if system == "Darwin":
try:
import Metal

if Metal.MTLCreateSystemDefaultDevice() is not None:
return "metal"
except ImportError:
pass
try:
from cuda.bindings import runtime

count = runtime.cudaGetDeviceCount()
if count[1] > 0:
return "cuda"
except (ImportError, Exception):
pass
return None


BACKEND = _detect_backend()

if BACKEND == "metal":
from cuslines.metal import (
MetalGPUTracker as GPUTracker,
MetalProbDirectionGetter as ProbDirectionGetter,
MetalPttDirectionGetter as PttDirectionGetter,
MetalBootDirectionGetter as BootDirectionGetter,
)
elif BACKEND == "cuda":
from cuslines.cuda_python import (
GPUTracker,
ProbDirectionGetter,
PttDirectionGetter,
BootDirectionGetter,
)
else:
raise ImportError(
"No GPU backend available. Install either:\n"
" - CUDA: pip install 'cuslines[cu13]' (NVIDIA GPU)\n"
" - Metal: pip install 'cuslines[metal]' (Apple Silicon)"
)

__all__ = [
"GPUTracker",
"ProbDirectionGetter",
"PttDirectionGetter",
"BootDirectionGetter"
"BootDirectionGetter",
"BACKEND",
]
72 changes: 72 additions & 0 deletions cuslines/boot_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""Shared utilities for bootstrap direction getters (CUDA and Metal).

Extracts DIPY model matrices (H, R, delta_b, delta_q, sampling_matrix)
for OPDT and CSA models. Both backends need the same matrices — only
the GPU dispatch differs.
"""

from dipy.reconst import shm


def prepare_opdt(gtab, sphere, sh_order_max=6, full_basis=False,
sh_lambda=0.006, min_signal=1):
"""Build bootstrap matrices for the OPDT model.

Returns dict with keys: model_type, min_signal, H, R, delta_b,
delta_q, sampling_matrix, b0s_mask.
"""
sampling_matrix, _, _ = shm.real_sh_descoteaux(
sh_order_max, sphere.theta, sphere.phi,
full_basis=full_basis, legacy=True,
)
model = shm.OpdtModel(
gtab, sh_order_max=sh_order_max, smooth=sh_lambda,
min_signal=min_signal,
)
delta_b, delta_q = model._fit_matrix

H, R = _hat_and_lcr(gtab, model, sh_order_max)

return dict(
model_type="OPDT", min_signal=min_signal,
H=H, R=R, delta_b=delta_b, delta_q=delta_q,
sampling_matrix=sampling_matrix, b0s_mask=gtab.b0s_mask,
)


def prepare_csa(gtab, sphere, sh_order_max=6, full_basis=False,
sh_lambda=0.006, min_signal=1):
"""Build bootstrap matrices for the CSA model.

Returns dict with keys: model_type, min_signal, H, R, delta_b,
delta_q, sampling_matrix, b0s_mask.
"""
sampling_matrix, _, _ = shm.real_sh_descoteaux(
sh_order_max, sphere.theta, sphere.phi,
full_basis=full_basis, legacy=True,
)
model = shm.CsaOdfModel(
gtab, sh_order_max=sh_order_max, smooth=sh_lambda,
min_signal=min_signal,
)
delta_b = model._fit_matrix
delta_q = model._fit_matrix

H, R = _hat_and_lcr(gtab, model, sh_order_max)

return dict(
model_type="CSA", min_signal=min_signal,
H=H, R=R, delta_b=delta_b, delta_q=delta_q,
sampling_matrix=sampling_matrix, b0s_mask=gtab.b0s_mask,
)


def _hat_and_lcr(gtab, model, sh_order_max):
"""Compute hat matrix H and leveraged centered residuals matrix R."""
dwi_mask = ~gtab.b0s_mask
x, y, z = model.gtab.gradients[dwi_mask].T
_, theta, phi = shm.cart2sphere(x, y, z)
B, _, _ = shm.real_sh_descoteaux(sh_order_max, theta, phi, legacy=True)
H = shm.hat(B)
R = shm.lcr_matrix(H)
return H, R
85 changes: 9 additions & 76 deletions cuslines/cuda_python/cu_direction_getters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from importlib.resources import files
from time import time

from dipy.reconst import shm
from cuslines.boot_utils import prepare_opdt, prepare_csa

from cuda.core import Device, LaunchConfig, Program, launch, ProgramOptions
from cuda.pathfinder import find_nvidia_header_directory
Expand Down Expand Up @@ -135,83 +135,16 @@ def __init__(
self.compile_program()

@classmethod
def from_dipy_opdt(
cls,
gtab,
sphere,
sh_order_max=6,
full_basis=False,
sh_lambda=0.006,
min_signal=1,
):
sampling_matrix, _, _ = shm.real_sh_descoteaux(
sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=False
)

model = shm.OpdtModel(
gtab, sh_order_max=sh_order_max, smooth=sh_lambda, min_signal=min_signal
)
fit_matrix = model._fit_matrix
delta_b, delta_q = fit_matrix

b0s_mask = gtab.b0s_mask
dwi_mask = ~b0s_mask
x, y, z = model.gtab.gradients[dwi_mask].T
_, theta, phi = shm.cart2sphere(x, y, z)
B, _, _ = shm.real_sym_sh_basis(sh_order_max, theta, phi)
H = shm.hat(B)
R = shm.lcr_matrix(H)

return cls(
model_type="OPDT",
min_signal=min_signal,
H=H,
R=R,
delta_b=delta_b,
delta_q=delta_q,
sampling_matrix=sampling_matrix,
b0s_mask=gtab.b0s_mask,
)
def from_dipy_opdt(cls, gtab, sphere, sh_order_max=6, full_basis=False,
sh_lambda=0.006, min_signal=1):
return cls(**prepare_opdt(gtab, sphere, sh_order_max, full_basis,
sh_lambda, min_signal))

@classmethod
def from_dipy_csa(
cls,
gtab,
sphere,
sh_order_max=6,
full_basis=False,
sh_lambda=0.006,
min_signal=1,
):
sampling_matrix, _, _ = shm.real_sh_descoteaux(
sh_order_max, sphere.theta, sphere.phi, full_basis=full_basis, legacy=False
)

model = shm.CsaOdfModel(
gtab, sh_order_max=sh_order_max, smooth=sh_lambda, min_signal=min_signal
)
fit_matrix = model._fit_matrix
delta_b = fit_matrix
delta_q = fit_matrix

b0s_mask = gtab.b0s_mask
dwi_mask = ~b0s_mask
x, y, z = model.gtab.gradients[dwi_mask].T
_, theta, phi = shm.cart2sphere(x, y, z)
B, _, _ = shm.real_sym_sh_basis(sh_order_max, theta, phi)
H = shm.hat(B)
R = shm.lcr_matrix(H)

return cls(
model_type="CSA",
min_signal=min_signal,
H=H,
R=R,
delta_b=delta_b,
delta_q=delta_q,
sampling_matrix=sampling_matrix,
b0s_mask=gtab.b0s_mask,
)
def from_dipy_csa(cls, gtab, sphere, sh_order_max=6, full_basis=False,
sh_lambda=0.006, min_signal=1):
return cls(**prepare_csa(gtab, sphere, sh_order_max, full_basis,
sh_lambda, min_signal))

def allocate_on_gpu(self, n):
self.H_d.append(checkCudaErrors(runtime.cudaMalloc(REAL_SIZE * self.H.size)))
Expand Down
Loading
Loading