From 535ae058ef060577e8ca3777613deb8d7aed44c5 Mon Sep 17 00:00:00 2001
From: Marius Lange <marius.lange@bsse.ethz.ch>
Date: Tue, 17 Feb 2026 18:30:26 -0500
Subject: [PATCH] Add gradient-based optimization and multi-start support

Add analytical Jacobian for the GPCCA rotation matrix objective,
enabling gradient-based optimizers (CG, L-BFGS-B, BFGS) as alternatives
to Nelder-Mead. Add multi-start optimization via SO(k) rotation
perturbation to escape local optima.

Key changes:
- _jacobian(): backpropagates through _fill_matrix (row-sum constraint,
  max condition, rescaling) for the correct gradient
- _perturb_rotation(): perturbs rotation matrix on the SO(k) manifold
  via expm(epsilon * S) with random skew-symmetric S
- _opt_soft(): dispatches to scipy.optimize.minimize for gradient-based
  methods, keeps fmin for Nelder-Mead
- _gpcca_core(): multi-start loop with degeneracy filtering
- _fill_matrix(): optional _return_intermediates for Jacobian backward
  pass, avoiding forward-pass duplication
- GPCCA.optimize(): new parameters method, n_starts, perturbation_scale,
  seed -- all backward compatible (defaults reproduce old behavior)

Tests:
- Jacobian vs finite differences (m=3, m=5)
- CG vs Nelder-Mead crispness comparison
- All 4 methods produce valid memberships
- Full GPCCA pipeline with CG
- _perturb_rotation preserves orthogonality
- Multi-start >= single-start crispness
- Seed determinism
---
 pygpcca/_gpcca.py   | 322 +++++++++++++++++++++++++++++++++++++++++---
 tests/test_gpcca.py | 174 ++++++++++++++++++++++++
 2 files changed, 479 insertions(+), 17 deletions(-)

diff --git a/pygpcca/_gpcca.py b/pygpcca/_gpcca.py
index fc58141..d6b9362 100644
--- a/pygpcca/_gpcca.py
+++ b/pygpcca/_gpcca.py
@@ -45,7 +45,7 @@
 ]
 
 
-from typing import Dict, List, Tuple, Union, Literal, Callable, Optional, TYPE_CHECKING
+from typing import Dict, List, Tuple, Union, Literal, Callable, Optional, NamedTuple, TYPE_CHECKING
 
 try:
     from functools import cached_property  # type: ignore[attr-defined]
@@ -68,9 +68,9 @@ def cached_property(fn: Callable) -> property:  # type: ignore[no-redef,type-arg
     warnings.simplefilter("always", category=UserWarning)  # Change the filter in this process
     os.environ["PYTHONWARNINGS"] = "always::UserWarning"  # Also affect subprocesses
 
-from scipy.linalg import subspace_angles
+from scipy.linalg import expm, subspace_angles
 from scipy.sparse import issparse, spmatrix
-from scipy.optimize import fmin
+from scipy.optimize import fmin, minimize
 import numpy as np
 import scipy.sparse as sp
 
@@ -361,6 +361,51 @@ def _initialize_rot_matrix(X: ArrayLike) -> ArrayLike:
     return np.linalg.pinv(X[index, :])
 
 
+def _perturb_rotation(
+    rot_matrix: ArrayLike,
+    epsilon: float,
+    rng: np.random.Generator,
+) -> ArrayLike:
+    """Perturb a rotation matrix on the SO(k) manifold.
+
+    Generates a random skew-symmetric matrix S of shape ``(k, k)``
+    (where ``k = m - 1``) and applies ``expm(epsilon * S)`` to the
+    cropped rotation matrix ``rot_matrix[1:, 1:]``. This produces
+    a perturbation that lives on the natural manifold of the
+    optimization variable.
+
+    Parameters
+    ----------
+    rot_matrix
+        Initial rotation matrix of shape ``(m, m)``.
+    epsilon
+        Angular scale of the perturbation. Small values (0.1-0.5)
+        stay near the original; large values (1.0-2.0) explore
+        more broadly.
+    rng
+        NumPy random generator for reproducibility.
+
+    Returns
+    -------
+    rot_perturbed
+        Perturbed rotation matrix of shape ``(m, m)``.
+    """
+    m = rot_matrix.shape[0]
+    k = m - 1
+
+    # Random skew-symmetric matrix -> element of so(k)
+    S = rng.standard_normal((k, k))
+    S = (S - S.T) / 2.0
+
+    # Rotation near identity on SO(k)
+    R = expm(epsilon * S)
+
+    rot_perturbed = rot_matrix.copy()
+    rot_perturbed[1:, 1:] = R @ rot_matrix[1:, 1:]
+
+    return rot_perturbed
+
+
 @d.dedent
 def _indexsearch(X: ArrayLike) -> ArrayLike:
     """
@@ -477,7 +522,98 @@ def _objective(alpha: ArrayLike, X: ArrayLike) -> float:
 
 
 @d.dedent
-def _opt_soft(X: ArrayLike, rot_matrix: ArrayLike) -> Tuple[ArrayLike, ArrayLike, float]:
+def _jacobian(alpha: ArrayLike, X: ArrayLike) -> ArrayLike:
+    r"""
+    Compute the Jacobian of the objective function via backpropagation through :func:`_fill_matrix`.
+
+    The objective is :math:`f(\alpha) = m - \mathrm{trace}(S(A(\alpha)))` where
+    :math:`A(\alpha)` is the full rotation matrix after :func:`_fill_matrix`.
+    The free parameters :math:`\alpha` are the flattened entries of
+    ``rot_matrix[1:, 1:]``.
+
+    :func:`_fill_matrix` introduces dependencies between matrix entries:
+
+    1. Row-sum constraint: ``A[r, 0] = -sum(A[r, 1:])`` for ``r >= 1``
+    2. Max condition: ``A[0, j] = max_i(-X[:, 1:] @ A[1:, :])[:, j]``
+    3. Rescaling: ``A = A / sum(A[0, :])``
+
+    The published gradient (below Eq. 16 in [Roeblitz13]_) assumes
+    independent entries and omits the chain rule through these
+    transformations. This implementation backpropagates correctly
+    through all three steps.
+
+    Parameters
+    ----------
+    alpha
+        Vector of shape `((m - 1) ^ 2,)` containing the flattened and
+        cropped rotation matrix ``rot_matrix[1:, 1:]``.
+    X
+        %(Q_sort)s
+
+    Returns
+    -------
+    Jacobian vector of shape `((m - 1) ^ 2,)`, the gradient of
+    :math:`f = m - \mathrm{trace}(S)` with respect to ``alpha``.
+    """  # noqa: D205, D400
+    n, m = X.shape
+    k = m - 1
+
+    if alpha.shape[0] != k**2:
+        raise ValueError(
+            "The shape of alpha doesn't match with the shape of X: "
+            f"It is not a ({k}^2,)-vector, but of dimension {alpha.shape}. X is of shape `{X.shape}`."
+        )
+
+    # --- Forward pass via _fill_matrix ---
+    B = np.reshape(alpha, (k, k))
+
+    rot_mat = np.zeros((m, m), dtype=np.float64)
+    rot_mat[1:, 1:] = B
+    result = _fill_matrix(rot_mat, X, _return_intermediates=True)
+    A = result.rot_matrix
+    A_pre = result.A_pre
+    argmax_indices = result.argmax_indices
+    s = result.scale
+
+    # --- Backward pass ---
+    # Step 1 (backward): df/dA where f = m - trace(diag(1/A[0,:]) @ A^T @ A).
+    # For i > 0:  df/dA[i,j] = -2 * A[i,j] / A[0,j]
+    # For i == 0: df/dA[0,j] = ||A[:,j]||^2 / A[0,j]^2 - 2
+    dfdA = np.zeros((m, m), dtype=np.float64)
+    for j in range(m):
+        col_norm_sq = np.sum(A[:, j] ** 2)
+        dfdA[0, j] = col_norm_sq / A[0, j] ** 2 - 2.0
+        dfdA[1:, j] = -2.0 * A[1:, j] / A[0, j]
+
+    # Step 2 (backward): backprop through rescaling A = A_pre / s.
+    # Direct part: dfdA_pre[i,j] = dfdA[i,j] / s
+    # Through s = sum(A_pre[0,:]): dfdA_pre[0,c] += (-1/s) * sum_{i,j} dfdA[i,j] * A[i,j]
+    dfdA_pre = dfdA / s
+    dfdA_pre[0, :] += -np.sum(dfdA * A) / s
+
+    # Step 3 (backward): backprop through max condition.
+    # A_pre[0,j] = max_i(dummy[i,j]) = dummy[argmax_j, j]
+    # dummy[i,j] = -sum_{r} X[i, r+1] * A_pre[r+1, j]
+    # So d(A_pre[0,j])/d(A_pre[r+1, c]) = -X[argmax_j, r+1] if c == j, else 0
+    dfdA_pre_lower = dfdA_pre[1:, :].copy()
+    for j in range(m):
+        i_max = argmax_indices[j]
+        dfdA_pre_lower[:, j] += dfdA_pre[0, j] * (-X[i_max, 1:])
+
+    # Step 4 (backward): backprop through row-sum constraint.
+    # A_pre[r, 0] = -sum_c B[r-1, c], so d(A_pre[r,0])/d(B[r-1,c]) = -1
+    dfdB = dfdA_pre_lower[:, 1:].copy()
+    dfdB += -dfdA_pre_lower[:, 0:1]
+
+    return dfdB.ravel()
+
+
+@d.dedent
+def _opt_soft(
+    X: ArrayLike,
+    rot_matrix: ArrayLike,
+    method: str = "Nelder-Mead",
+) -> Tuple[ArrayLike, ArrayLike, float]:
     r"""
     Optimize the G-PCCA rotation matrix such that the memberships are
     exclusively non-negative and compute the membership matrix.
@@ -517,8 +653,23 @@ def _opt_soft(X: ArrayLike, rot_matrix: ArrayLike) -> Tuple[ArrayLike, ArrayLike
     # Now reshape rot_crop_matrix into a linear vector alpha.
     k = m - 1
     alpha = np.reshape(rot_crop_matrix, k**2)
-    # TODO: Implement Gauss Newton Optimization to speed things up esp. for m > 10
-    alpha, fopt, _, _, _ = fmin(_objective, alpha, args=(X,), full_output=True, disp=False)
+
+    if method == "Nelder-Mead":
+        alpha, fopt, _, _, _ = fmin(_objective, alpha, args=(X,), full_output=True, disp=False)
+    elif method in ("L-BFGS-B", "BFGS", "CG"):
+        kwargs: Dict = {"method": method, "jac": _jacobian}
+        if method == "L-BFGS-B":
+            kwargs["bounds"] = [(-1, 1)] * k**2
+        else:
+            kwargs["options"] = {"disp": False}
+        result = minimize(_objective, alpha, args=(X,), **kwargs)
+        alpha = result.x
+        fopt = result.fun
+    else:
+        raise ValueError(
+            f"Invalid optimization method `{method!r}`. "
+            f"Valid options are: 'Nelder-Mead', 'L-BFGS-B', 'BFGS', 'CG'."
+        )
 
     # Now reshape alpha into a (k,k)-matrix.
     rot_crop_matrix = np.reshape(alpha, (k, k))
@@ -548,7 +699,25 @@ def _opt_soft(X: ArrayLike, rot_matrix: ArrayLike) -> Tuple[ArrayLike, ArrayLike
 
 
 @d.dedent
-def _fill_matrix(rot_matrix: ArrayLike, X: ArrayLike) -> ArrayLike:
+class _FillMatrixResult(NamedTuple):
+    """Intermediates from :func:`_fill_matrix` needed by the Jacobian."""
+
+    rot_matrix: ArrayLike
+    """Feasible rotation matrix of shape ``(m, m)``."""
+    A_pre: ArrayLike
+    """Pre-scaling rotation matrix of shape ``(m, m)``."""
+    argmax_indices: ArrayLike
+    """Row indices of the maximum in each column, shape ``(m,)``."""
+    scale: float
+    """Scaling factor ``sum(A_pre[0, :])``."""
+
+
+@d.dedent
+def _fill_matrix(
+    rot_matrix: ArrayLike,
+    X: ArrayLike,
+    _return_intermediates: bool = False,
+) -> Union[ArrayLike, _FillMatrixResult]:
     """
     Make the rotation matrix feasible.
 
@@ -558,10 +727,14 @@ def _fill_matrix(rot_matrix: ArrayLike, X: ArrayLike) -> ArrayLike:
         (Infeasible) rotation matrix of shape `(m, m)`.
     X
         %(Q_sort)s
+    _return_intermediates
+        If ``True``, return a :class:`_FillMatrixResult` containing
+        intermediates needed by the Jacobian backward pass.
 
     Returns
     -------
-    Feasible rotation matrix of shape `(m, m)`.
+    Feasible rotation matrix of shape `(m, m)`, or a
+    :class:`_FillMatrixResult` if ``_return_intermediates`` is ``True``.
     """
     n, m = X.shape
 
@@ -576,10 +749,13 @@ def _fill_matrix(rot_matrix: ArrayLike, X: ArrayLike) -> ArrayLike:
 
     # Compute first row of A by maximum condition.
     dummy = -np.dot(X[:, 1:], rot_matrix[1:, :])
+    argmax_indices = np.argmax(dummy, axis=0)
     rot_matrix[0, :] = np.max(dummy, axis=0)
 
     # Reskale rot_mat to be in the feasible set.
-    rot_matrix = rot_matrix / np.sum(rot_matrix[0, :])
+    scale = np.sum(rot_matrix[0, :])
+    A_pre = rot_matrix.copy() if _return_intermediates else None
+    rot_matrix = rot_matrix / scale
 
     # Make sure, that there are no zero or negative elements in the first row of A.
     if np.any(rot_matrix[0, :] == 0):
@@ -587,6 +763,13 @@ def _fill_matrix(rot_matrix: ArrayLike, X: ArrayLike) -> ArrayLike:
     if np.min(rot_matrix[0, :]) < 0:
         raise ValueError("First row of rotation matrix has elements < 0.")
 
+    if _return_intermediates:
+        return _FillMatrixResult(
+            rot_matrix=rot_matrix,
+            A_pre=A_pre,
+            argmax_indices=argmax_indices,
+            scale=scale,
+        )
     return rot_matrix
 
 
@@ -629,7 +812,13 @@ def _cluster_by_isa(X: ArrayLike) -> Tuple[ArrayLike, float]:
 
 
 @d.dedent
-def _gpcca_core(X: ArrayLike) -> Tuple[ArrayLike, ArrayLike, float]:
+def _gpcca_core(
+    X: ArrayLike,
+    method: str = "Nelder-Mead",
+    n_starts: int = 1,
+    perturbation_scale: float = 0.1,
+    seed: Optional[int] = None,
+) -> Tuple[ArrayLike, ArrayLike, float]:
     r"""
     Core of the G-PCCA spectral clustering method with optimized memberships [Reuter18]_, [Reuter19]_.
 
@@ -641,6 +830,26 @@ def _gpcca_core(X: ArrayLike) -> Tuple[ArrayLike, ArrayLike, float]:
     ----------
     X
         %(Q_sort)s
+    method
+        Optimization method for the rotation matrix. Valid options are:
+
+        - ``'Nelder-Mead'`` - derivative-free simplex method (default).
+        - ``'L-BFGS-B'`` - gradient-based with bounds, recommended for ``m > 10``.
+        - ``'BFGS'`` - gradient-based without bounds.
+        - ``'CG'`` - conjugate gradient.
+    n_starts
+        Number of optimization runs. The first run always uses the
+        deterministic ISA initialization. Subsequent runs perturb the
+        initial rotation matrix on the SO(k) manifold. The result
+        with the best crispness is returned. Set to ``1`` to disable
+        perturbation (fully deterministic, backward compatible).
+    perturbation_scale
+        Angular scale for the rotation perturbation (only used when
+        ``n_starts > 1``). Recommended range is 0.05-0.2; larger
+        values risk producing degenerate solutions.
+    seed
+        Random seed for reproducibility of the perturbations
+        (only used when ``n_starts > 1``).
 
     Returns
     -------
@@ -655,14 +864,57 @@ def _gpcca_core(X: ArrayLike) -> Tuple[ArrayLike, ArrayLike, float]:
     """
     m = np.shape(X)[1]
 
-    rot_matrix = _initialize_rot_matrix(X)
+    rot_matrix_init = _initialize_rot_matrix(X)
+
+    if n_starts <= 1:
+        # Single run: fully backward compatible, no randomness.
+        rot_matrix, chi, fopt = _opt_soft(
+            X, rot_matrix_init, method=method
+        )
+        crispness = (m - fopt) / m
+        return chi, rot_matrix, crispness
+
+    # Multi-start: run from deterministic init + perturbed inits,
+    # keep the result with the best crispness.
+    rng = np.random.default_rng(seed)
+    best_chi, best_rot, best_fopt = None, None, np.inf
 
-    rot_matrix, chi, fopt = _opt_soft(X, rot_matrix)
+    for i in range(n_starts):
+        if i == 0:
+            rot_start = rot_matrix_init
+        else:
+            rot_start = _perturb_rotation(
+                rot_matrix_init, perturbation_scale, rng
+            )
+        try:
+            rot_i, chi_i, fopt_i = _opt_soft(
+                X, rot_start, method=method
+            )
+        except ValueError:
+            # Perturbation led to infeasible solution; skip.
+            continue
+
+        # For perturbed starts, check that every cluster is the argmax
+        # for at least one row. A degenerate chi where one column is
+        # never dominant leads to empty clusters on discretization.
+        # The deterministic start (i=0) is always accepted to guarantee
+        # we return a result even if perturbed starts all fail.
+        if i > 0:
+            assignments = np.argmax(chi_i, axis=1)
+            if len(set(assignments)) < m:
+                continue
 
-    # calculate crispness of the decomposition of the state space into m clusters
-    crispness = (m - fopt) / m
+        if fopt_i < best_fopt:
+            best_chi, best_rot, best_fopt = chi_i, rot_i, fopt_i
 
-    return chi, rot_matrix, crispness
+    if best_chi is None:
+        raise ValueError(
+            "All optimization starts failed. "
+            "Try reducing `perturbation_scale`."
+        )
+
+    crispness = (m - best_fopt) / m
+    return best_chi, best_rot, crispness
 
 
 @d.dedent
@@ -706,6 +958,7 @@ def gpcca_coarsegrain(
     eta: Optional[ArrayLike] = None,
     z: Literal["LM", "LR"] = "LM",
     method: str = DEFAULT_SCHUR_METHOD,
+    optimization_method: str = "Nelder-Mead",
 ) -> ArrayLike:
     r"""
     Coarse-grain the transition matrix `P` into `m` sets using G-PCCA [Reuter18]_, [Reuter19]_.
@@ -727,6 +980,9 @@ def gpcca_coarsegrain(
     %(method)s
         See the `installation <https://pygpcca.readthedocs.io/en/latest/installation.html>`_ instructions
         for more information.
+    optimization_method
+        Optimization method for the rotation matrix. Valid options are
+        ``'Nelder-Mead'`` (default), ``'L-BFGS-B'``, ``'BFGS'``, ``'CG'``.
 
     Returns
     -------
@@ -737,7 +993,7 @@ def gpcca_coarsegrain(
     If you use this code or parts of it, please cite [Reuter19]_.
     """
     # Matlab: Pc = pinv(chi'*diag(eta)*chi)*(chi'*diag(eta)*P*chi)
-    chi = GPCCA(P, eta=eta, z=z, method=method).optimize(m).memberships
+    chi = GPCCA(P, eta=eta, z=z, method=method).optimize(m, method=optimization_method).memberships
 
     return _coarsegrain(P, eta=eta, chi=chi)
 
@@ -904,6 +1160,10 @@ def minChi(self, m_min: int, m_max: int) -> List[float]:
     def optimize(
         self,
         m: Union[int, Tuple[int, int], List[int], Dict[str, int]],
+        method: str = "Nelder-Mead",
+        n_starts: int = 1,
+        perturbation_scale: float = 0.1,
+        seed: Optional[int] = None,
     ) -> "GPCCA":
         r"""
         Full G-PCCA spectral clustering method with optimized memberships [Reuter18]_, [Reuter19]_.
@@ -930,6 +1190,28 @@ def optimize(
 
             See :meth:`minChi` for selection of good (potentially optimal)
             number of clusters.
+        method
+            Optimization method for the rotation matrix. Valid options are:
+
+            - ``'Nelder-Mead'`` - derivative-free simplex method (default).
+            - ``'L-BFGS-B'`` - gradient-based with bounds, recommended for
+              ``m > 10``.
+            - ``'BFGS'`` - gradient-based without bounds.
+            - ``'CG'`` - conjugate gradient.
+        n_starts
+            Number of optimization runs. The first run uses the
+            deterministic ISA initialization; subsequent runs perturb
+            the initial rotation matrix on the SO(k) manifold. The
+            result with the best crispness is returned. Set to ``1``
+            to disable perturbation (fully deterministic, backward
+            compatible).
+        perturbation_scale
+            Angular scale for the rotation perturbation (only used
+            when ``n_starts > 1``). Recommended range is 0.05-0.2;
+            larger values risk producing degenerate solutions.
+        seed
+            Random seed for reproducibility of the perturbations
+            (only used when ``n_starts > 1``).
 
         Returns
         -------
@@ -1035,7 +1317,13 @@ def optimize(
 
             # Reduce X according to m and make a work copy.
             # Xm = np.copy(X[:, :m])
-            chi, rot_matrix, crispness = _gpcca_core(self._p_X[:, :m])
+            chi, rot_matrix, crispness = _gpcca_core(
+                self._p_X[:, :m],
+                method=method,
+                n_starts=n_starts,
+                perturbation_scale=perturbation_scale,
+                seed=seed,
+            )
             # check if we have at least m dominant sets. If less than m, we warn.
             nmeta = np.count_nonzero(chi.sum(axis=0))
             if m > nmeta:
diff --git a/tests/test_gpcca.py b/tests/test_gpcca.py
index 20dc80b..59094b5 100644
--- a/tests/test_gpcca.py
+++ b/tests/test_gpcca.py
@@ -32,12 +32,14 @@
 import pytest
 
 from scipy.linalg import lu, pinv, hilbert, subspace_angles
+from scipy.optimize import approx_fprime
 from scipy.sparse import issparse, csr_matrix
 import numpy as np
 
 from pygpcca._gpcca import (
     GPCCA,
     _do_schur,
+    _jacobian,
     _opt_soft,
     _objective,
     _gpcca_core,
@@ -45,6 +47,7 @@
     _indexsearch,
     _cluster_by_isa,
     _gram_schmidt_mod,
+    _perturb_rotation,
     gpcca_coarsegrain,
     _initialize_rot_matrix,
 )
@@ -989,6 +992,177 @@ def test_optimize_range_all_invalid(self, P_2: np.ndarray, mocker):
             g.optimize([3, P_2.shape[0]])
 
 
+class TestGradientOptimization:
+    """Tests for gradient-based optimization and multi-start."""
+
+    @pytest.mark.parametrize("m", [3, 5])
+    def test_jacobian_vs_finite_differences(self, m: int):
+        """Verify _jacobian matches scipy.optimize.approx_fprime.
+
+        The ISA initialization (``_initialize_rot_matrix``) places alpha at
+        a degenerate point where multiple rows of ``dummy`` achieve the same
+        maximum in ``_fill_matrix``, making ``argmax`` — and therefore the
+        gradient — ill-defined. This is analogous to ``d|x|/dx`` at ``x=0``.
+        The test evaluates the Jacobian at a randomly perturbed alpha away
+        from this singularity, where the ``argmax`` gaps are stable under
+        finite-difference perturbations.
+        """
+        rng = np.random.default_rng(42)
+        block_size = 300 // m
+        n = block_size * m
+        P = np.zeros((n, n))
+        coupling = 0.01
+        for b in range(m):
+            s, e = b * block_size, (b + 1) * block_size
+            block = rng.dirichlet(np.ones(block_size), size=block_size)
+            P[s:e, s:e] = block * (1 - coupling)
+            for b2 in range(m):
+                if b2 != b:
+                    s2, e2 = b2 * block_size, (b2 + 1) * block_size
+                    P[s:e, s2:e2] = coupling / (m - 1) / block_size
+        P = P / P.sum(axis=1, keepdims=True)
+
+        g = GPCCA(P, eta=None, z="LM", method="brandts")
+        g._do_schur_helper(m)
+        svecs = g._p_X[:, :m]
+
+        # Perturb away from the ISA initialization to avoid the argmax
+        # degeneracy (where the Jacobian is not well-defined).
+        rot_matrix = _initialize_rot_matrix(svecs)
+        alpha = rot_matrix[1:, 1:].ravel().copy()
+        alpha += rng.normal(0, 0.1, size=alpha.shape)
+
+        jac_analytical = _jacobian(alpha, svecs)
+        jac_fd = approx_fprime(alpha, _objective, 1e-7, svecs)
+
+        rel_err = np.abs(jac_analytical - jac_fd) / np.maximum(
+            np.abs(jac_fd), 1e-10
+        )
+        assert np.max(rel_err) < 1e-3, (
+            f"m={m}: Jacobian max relative error "
+            f"{np.max(rel_err):.2e} exceeds 1e-3"
+        )
+
+    @pytest.mark.parametrize("mu_val", [0, 100, 1000])
+    def test_cg_vs_nelder_mead(self, mu_val: int):
+        """CG produces crispness close to Nelder-Mead on well-separated spectra."""
+        m = 3
+        P, sd = get_known_input(mu(mu_val))
+        X, _, _ = _do_schur(P, eta=sd, m=m)
+        svecs = X[:, :m]
+
+        A_nm = _initialize_rot_matrix(svecs)
+        _, chi_nm, fopt_nm = _opt_soft(svecs, A_nm, method="Nelder-Mead")
+
+        A_cg = _initialize_rot_matrix(svecs)
+        _, chi_cg, fopt_cg = _opt_soft(svecs, A_cg, method="CG")
+
+        crisp_nm = (m - fopt_nm) / m
+        crisp_cg = (m - fopt_cg) / m
+
+        assert crisp_cg >= crisp_nm - 0.05, (
+            f"mu={mu_val}: CG crispness ({crisp_cg:.4f}) "
+            f"much worse than NM ({crisp_nm:.4f})"
+        )
+
+        assert_allclose(chi_cg.sum(axis=1), 1.0, atol=1e-10)
+        assert np.all(chi_cg >= -1e-8)
+
+    @pytest.mark.parametrize(
+        "method", ["Nelder-Mead", "L-BFGS-B", "BFGS", "CG"]
+    )
+    def test_all_methods_produce_valid_memberships(self, method: str):
+        """All optimization methods produce valid membership matrices."""
+        m = 3
+        P, sd = get_known_input(mu(0))
+        X, _, _ = _do_schur(P, eta=sd, m=m)
+        svecs = X[:, :m]
+
+        A = _initialize_rot_matrix(svecs)
+        rot_matrix, chi, fopt = _opt_soft(svecs, A, method=method)
+
+        assert_allclose(chi.sum(axis=1), 1.0, atol=1e-10)
+        assert np.all(chi >= -1e-8), f"{method}: min(chi)={chi.min():.2e}"
+        crispness = (m - fopt) / m
+        assert crispness > 0, f"{method}: non-positive crispness {crispness}"
+
+    def test_gpcca_optimize_with_cg(self):
+        """GPCCA.optimize() works with CG through the full pipeline."""
+        P, sd = get_known_input(mu(0))
+        g = GPCCA(P, eta=sd, z="LM", method="brandts")
+        g.optimize(3, method="CG")
+
+        chi = g.memberships
+        assert chi is not None
+        assert chi.shape[1] == 3
+        assert_allclose(chi.sum(axis=1), 1.0, atol=1e-10)
+
+    def test_invalid_method_raises(self):
+        """Invalid optimization method raises ValueError."""
+        P, sd = get_known_input(mu(0))
+        X, _, _ = _do_schur(P, eta=sd, m=3)
+        svecs = X[:, :3]
+        A = _initialize_rot_matrix(svecs)
+
+        with pytest.raises(
+            ValueError, match="Invalid optimization method"
+        ):
+            _opt_soft(svecs, A, method="invalid")
+
+    def test_perturb_rotation_stays_on_manifold(self):
+        """Perturbed rotation matrices preserve orthogonality."""
+        rng = np.random.default_rng(0)
+        m = 5
+        P, sd = get_known_input(mu(0))
+        X, _, _ = _do_schur(P, eta=sd, m=m)
+        svecs = X[:, :m]
+        rot_matrix = _initialize_rot_matrix(svecs)
+
+        for eps in [0.05, 0.1, 0.5]:
+            rot_p = _perturb_rotation(rot_matrix, eps, rng)
+            # Shape preserved
+            assert rot_p.shape == rot_matrix.shape
+            # First row unchanged (only [1:, 1:] block is perturbed)
+            assert_allclose(rot_p[0, :], rot_matrix[0, :])
+            assert_allclose(rot_p[1:, 0], rot_matrix[1:, 0])
+            # The [1:, 1:] block should still be related by an
+            # orthogonal transformation, so det should be ±1
+            R_block = rot_p[1:, 1:] @ np.linalg.pinv(rot_matrix[1:, 1:])
+            assert abs(abs(np.linalg.det(R_block)) - 1.0) < 1e-10
+
+    def test_multi_start_improves_or_matches_single(self):
+        """Multi-start with CG achieves crispness >= single start."""
+        m = 5
+        P, sd = get_known_input(mu(0))
+
+        g1 = GPCCA(P, eta=sd, z="LM", method="brandts")
+        g1.optimize(m, method="CG", n_starts=1)
+
+        g10 = GPCCA(P, eta=sd, z="LM", method="brandts")
+        g10.optimize(m, method="CG", n_starts=10, seed=42)
+
+        # Multi-start picks the best, so crispness >= single start
+        assert g10._crispness >= g1._crispness - 1e-10
+
+        # Both produce valid memberships
+        assert_allclose(g10.memberships.sum(axis=1), 1.0, atol=1e-10)
+        assert np.all(g10.memberships >= -1e-8)
+
+    def test_multi_start_deterministic_with_seed(self):
+        """Same seed produces identical results."""
+        m = 5
+        P, sd = get_known_input(mu(0))
+
+        g1 = GPCCA(P, eta=sd, z="LM", method="brandts")
+        g1.optimize(m, method="CG", n_starts=5, seed=123)
+
+        g2 = GPCCA(P, eta=sd, z="LM", method="brandts")
+        g2.optimize(m, method="CG", n_starts=5, seed=123)
+
+        assert_allclose(g1.memberships, g2.memberships)
+        assert_allclose(g1._crispness, g2._crispness)
+
+
 class TestUtils:
     def test_transition_matrix_dtype(self, P_2: np.ndarray):
         g = GPCCA(P_2, eta=None, z="LR")