diff --git a/p_kit/solver/annealing.py b/p_kit/solver/annealing.py
index 27f9ef3..b2218ae 100644
--- a/p_kit/solver/annealing.py
+++ b/p_kit/solver/annealing.py
@@ -12,6 +12,10 @@ def job():
         _, time_points, _ = solver.copy().solve(circuit.copy(), annealing_func=annealing)
         return time_points[-n_last_samples-1:-1, :]
 
-    samples = Parallel(n_jobs=n_jobs)(delayed(job)() for _ in range(n_shots))
+    if getattr(solver, 'device', 'cpu') == 'cuda':
+        all_m = solver.solve(circuit, annealing_func=annealing, n_shots=n_shots)
+        samples = all_m[-n_last_samples-1:-1]  # (n_last_samples, n_shots, n_pbits)
+        return samples.transpose(1, 0, 2).reshape(n_shots * n_last_samples, circuit.n_pbits)
 
+    samples = Parallel(n_jobs=n_jobs)(delayed(job)() for _ in range(n_shots))
     return np.array(samples).reshape((n_shots * n_last_samples, circuit.n_pbits))
diff --git a/p_kit/solver/base_solver.py b/p_kit/solver/base_solver.py
index 20528f3..d6ada51 100644
--- a/p_kit/solver/base_solver.py
+++ b/p_kit/solver/base_solver.py
@@ -1,21 +1,32 @@
-import random
-import time
 import numpy as np
 
+try:
+    import cupy as cp
+except ImportError:
+    cp = None
+
+
 class Solver:
-    def __init__(self, Nt, dt, i0, expected_mean=0, seed=None) -> None:
+    def __init__(self, Nt, dt, i0, expected_mean=0, seed=None, device='cpu') -> None:
         self.Nt = Nt
         self.dt = dt
         self.i0 = i0
         self.expected_mean = expected_mean
         self.seed = seed
-        self._random_gen = np.random.default_rng(self.seed)
-    
+        self.device = device
+        if device == 'cuda':
+            if cp is None:
+                raise ImportError("cupy is required for device='cuda'. Install with: pip install cupy-cuda13x")
+            self.xp = cp
+        else:
+            self.xp = np
+        self._random_gen = self.xp.random.default_rng(self.seed)
+
     def random(self, n_pbits):
         return self._random_gen.random(n_pbits)
 
-    def solve(self, annealing_func):
+    def solve(self, c, annealing_func=None, n_shots=1):
         raise NotImplementedError()
-    
+
     def copy(self):
         raise NotImplementedError()
diff --git a/p_kit/solver/csd_solver.py b/p_kit/solver/csd_solver.py
index 2ce8adf..5caa8cf 100644
--- a/p_kit/solver/csd_solver.py
+++ b/p_kit/solver/csd_solver.py
@@ -5,38 +5,38 @@
 
 
 class CaSuDaSolver(Solver):
-    # K. Y. Camsari, B. M. Sutton, and S. Datta, ‘p-bits for probabilistic spin logic’, Applied Physics Reviews, vol. 6, no. 1, p. 011305, Mar. 2019, doi: 10.1063/1.5055860.
+    # K. Y. Camsari, B. M. Sutton, and S. Datta, 'p-bits for probabilistic spin logic', Applied Physics Reviews, vol. 6, no. 1, p. 011305, Mar. 2019, doi: 10.1063/1.5055860.
+
+    def solve(self, c: PCircuit, annealing_func=constant, n_shots=1):
 
-    def solve(self, c: PCircuit, annealing_func=constant):
-    
         # credit: https://www.purdue.edu/p-bit/blog.html
+        xp = self.xp
         n_pbits = c.n_pbits
 
-        all_I = np.zeros((self.Nt, n_pbits))
-        all_m = np.zeros((self.Nt, n_pbits))
-        E = np.zeros(self.Nt)
-
-        m = np.sign(0.5 - self.random(n_pbits))
+        J = xp.asarray(c.J)
+        h = xp.asarray(c.h)
+        threshold = float(np.arctanh(self.expected_mean))
 
-        threshold = np.arctanh(self.expected_mean)
+        # m is (n_shots, n_pbits) — works for n_shots=1 too
+        all_m = xp.zeros((self.Nt, n_shots, n_pbits))
+        all_I = xp.zeros((self.Nt, n_pbits))
+        E = xp.zeros(self.Nt)
+        m = xp.sign(0.5 - self.random((n_shots, n_pbits)))
 
         for run in range(self.Nt):
-
-            # compute input biases
-            I = annealing_func(self, run) * (np.dot(m, c.J) + c.h)
-
-            # apply S(input)
-            s = np.exp(-self.dt * np.exp(-m * (I + threshold)))
-
-            # compute new output
-            m = m * np.sign(s - self.random(n_pbits))
-            
-            all_I[run] = I
+            I = annealing_func(self, run) * (m @ J + h)
+            s = xp.exp(-self.dt * xp.exp(-m * (I + threshold)))
+            m = m * xp.sign(s - self.random((n_shots, n_pbits)))
             all_m[run] = m
+            all_I[run] = I[0]
+            E[run] = self.i0 * (xp.dot(m[0], h) + 0.5 * xp.dot(xp.dot(m[0], J), m[0]))
 
-            E[run] = self.i0 * (np.dot(m, c.h) + 0.5 * np.dot(np.dot(m, c.J), m))
+        if self.device == 'cuda':
+            all_I, all_m, E = all_I.get(), all_m.get(), E.get()
 
-        return all_I, all_m, E
+        if n_shots == 1:
+            return all_I, all_m[:, 0, :], E
+        return all_m
 
     def copy(self):
-        return CaSuDaSolver(self.Nt, self.dt, self.i0, self.expected_mean, self.seed)
+        return CaSuDaSolver(self.Nt, self.dt, self.i0, self.expected_mean, self.seed, self.device)
diff --git a/setup.py b/setup.py
index eecb691..2d0cd22 100644
--- a/setup.py
+++ b/setup.py
@@ -44,6 +44,7 @@
                         ],
       extras_require={
                       'tests': ['pytest', 'seaborn', 'flake8'],
+                      'gpu': ['cupy-cuda13x'],
                       },
       zip_safe=False,
 )