From 535ae058ef060577e8ca3777613deb8d7aed44c5 Mon Sep 17 00:00:00 2001 From: Marius Lange Date: Tue, 17 Feb 2026 18:30:26 -0500 Subject: [PATCH] Add gradient-based optimization and multi-start support Add analytical Jacobian for the GPCCA rotation matrix objective, enabling gradient-based optimizers (CG, L-BFGS-B, BFGS) as alternatives to Nelder-Mead. Add multi-start optimization via SO(k) rotation perturbation to escape local optima. Key changes: - _jacobian(): backpropagates through _fill_matrix (row-sum constraint, max condition, rescaling) for the correct gradient - _perturb_rotation(): perturbs rotation matrix on the SO(k) manifold via expm(epsilon * S) with random skew-symmetric S - _opt_soft(): dispatches to scipy.optimize.minimize for gradient-based methods, keeps fmin for Nelder-Mead - _gpcca_core(): multi-start loop with degeneracy filtering - _fill_matrix(): optional _return_intermediates for Jacobian backward pass, avoiding forward-pass duplication - GPCCA.optimize(): new parameters method, n_starts, perturbation_scale, seed -- all backward compatible (defaults reproduce old behavior) Tests: - Jacobian vs finite differences (m=3, m=5) - CG vs Nelder-Mead crispness comparison - All 4 methods produce valid memberships - Full GPCCA pipeline with CG - _perturb_rotation preserves orthogonality - Multi-start >= single-start crispness - Seed determinism --- pygpcca/_gpcca.py | 322 +++++++++++++++++++++++++++++++++++++++++--- tests/test_gpcca.py | 174 ++++++++++++++++++++++++ 2 files changed, 479 insertions(+), 17 deletions(-) diff --git a/pygpcca/_gpcca.py b/pygpcca/_gpcca.py index fc58141..d6b9362 100644 --- a/pygpcca/_gpcca.py +++ b/pygpcca/_gpcca.py @@ -45,7 +45,7 @@ ] -from typing import Dict, List, Tuple, Union, Literal, Callable, Optional, TYPE_CHECKING +from typing import Dict, List, Tuple, Union, Literal, Callable, Optional, NamedTuple, TYPE_CHECKING try: from functools import cached_property # type: ignore[attr-defined] @@ -68,9 +68,9 @@ def cached_property(fn: Callable) -> property: # type: ignore[no-redef,type-arg warnings.simplefilter("always", category=UserWarning) # Change the filter in this process os.environ["PYTHONWARNINGS"] = "always::UserWarning" # Also affect subprocesses -from scipy.linalg import subspace_angles +from scipy.linalg import expm, subspace_angles from scipy.sparse import issparse, spmatrix -from scipy.optimize import fmin +from scipy.optimize import fmin, minimize import numpy as np import scipy.sparse as sp @@ -361,6 +361,51 @@ def _initialize_rot_matrix(X: ArrayLike) -> ArrayLike: return np.linalg.pinv(X[index, :]) +def _perturb_rotation( + rot_matrix: ArrayLike, + epsilon: float, + rng: np.random.Generator, +) -> ArrayLike: + """Perturb a rotation matrix on the SO(k) manifold. + + Generates a random skew-symmetric matrix S of shape ``(k, k)`` + (where ``k = m - 1``) and applies ``expm(epsilon * S)`` to the + cropped rotation matrix ``rot_matrix[1:, 1:]``. This produces + a perturbation that lives on the natural manifold of the + optimization variable. + + Parameters + ---------- + rot_matrix + Initial rotation matrix of shape ``(m, m)``. + epsilon + Angular scale of the perturbation. Small values (0.1-0.5) + stay near the original; large values (1.0-2.0) explore + more broadly. + rng + NumPy random generator for reproducibility. + + Returns + ------- + rot_perturbed + Perturbed rotation matrix of shape ``(m, m)``. + """ + m = rot_matrix.shape[0] + k = m - 1 + + # Random skew-symmetric matrix -> element of so(k) + S = rng.standard_normal((k, k)) + S = (S - S.T) / 2.0 + + # Rotation near identity on SO(k) + R = expm(epsilon * S) + + rot_perturbed = rot_matrix.copy() + rot_perturbed[1:, 1:] = R @ rot_matrix[1:, 1:] + + return rot_perturbed + + @d.dedent def _indexsearch(X: ArrayLike) -> ArrayLike: """ @@ -477,7 +522,98 @@ def _objective(alpha: ArrayLike, X: ArrayLike) -> float: @d.dedent -def _opt_soft(X: ArrayLike, rot_matrix: ArrayLike) -> Tuple[ArrayLike, ArrayLike, float]: +def _jacobian(alpha: ArrayLike, X: ArrayLike) -> ArrayLike: + r""" + Compute the Jacobian of the objective function via backpropagation through :func:`_fill_matrix`. + + The objective is :math:`f(\alpha) = m - \mathrm{trace}(S(A(\alpha)))` where + :math:`A(\alpha)` is the full rotation matrix after :func:`_fill_matrix`. + The free parameters :math:`\alpha` are the flattened entries of + ``rot_matrix[1:, 1:]``. + + :func:`_fill_matrix` introduces dependencies between matrix entries: + + 1. Row-sum constraint: ``A[r, 0] = -sum(A[r, 1:])`` for ``r >= 1`` + 2. Max condition: ``A[0, j] = max_i(-X[:, 1:] @ A[1:, :])[:, j]`` + 3. Rescaling: ``A = A / sum(A[0, :])`` + + The published gradient (below Eq. 16 in [Roeblitz13]_) assumes + independent entries and omits the chain rule through these + transformations. This implementation backpropagates correctly + through all three steps. + + Parameters + ---------- + alpha + Vector of shape `((m - 1) ^ 2,)` containing the flattened and + cropped rotation matrix ``rot_matrix[1:, 1:]``. + X + %(Q_sort)s + + Returns + ------- + Jacobian vector of shape `((m - 1) ^ 2,)`, the gradient of + :math:`f = m - \mathrm{trace}(S)` with respect to ``alpha``. + """ # noqa: D205, D400 + n, m = X.shape + k = m - 1 + + if alpha.shape[0] != k**2: + raise ValueError( + "The shape of alpha doesn't match with the shape of X: " + f"It is not a ({k}^2,)-vector, but of dimension {alpha.shape}. X is of shape `{X.shape}`." + ) + + # --- Forward pass via _fill_matrix --- + B = np.reshape(alpha, (k, k)) + + rot_mat = np.zeros((m, m), dtype=np.float64) + rot_mat[1:, 1:] = B + result = _fill_matrix(rot_mat, X, _return_intermediates=True) + A = result.rot_matrix + A_pre = result.A_pre + argmax_indices = result.argmax_indices + s = result.scale + + # --- Backward pass --- + # Step 1 (backward): df/dA where f = m - trace(diag(1/A[0,:]) @ A^T @ A). + # For i > 0: df/dA[i,j] = -2 * A[i,j] / A[0,j] + # For i == 0: df/dA[0,j] = ||A[:,j]||^2 / A[0,j]^2 - 2 + dfdA = np.zeros((m, m), dtype=np.float64) + for j in range(m): + col_norm_sq = np.sum(A[:, j] ** 2) + dfdA[0, j] = col_norm_sq / A[0, j] ** 2 - 2.0 + dfdA[1:, j] = -2.0 * A[1:, j] / A[0, j] + + # Step 2 (backward): backprop through rescaling A = A_pre / s. + # Direct part: dfdA_pre[i,j] = dfdA[i,j] / s + # Through s = sum(A_pre[0,:]): dfdA_pre[0,c] += (-1/s) * sum_{i,j} dfdA[i,j] * A[i,j] + dfdA_pre = dfdA / s + dfdA_pre[0, :] += -np.sum(dfdA * A) / s + + # Step 3 (backward): backprop through max condition. + # A_pre[0,j] = max_i(dummy[i,j]) = dummy[argmax_j, j] + # dummy[i,j] = -sum_{r} X[i, r+1] * A_pre[r+1, j] + # So d(A_pre[0,j])/d(A_pre[r+1, c]) = -X[argmax_j, r+1] if c == j, else 0 + dfdA_pre_lower = dfdA_pre[1:, :].copy() + for j in range(m): + i_max = argmax_indices[j] + dfdA_pre_lower[:, j] += dfdA_pre[0, j] * (-X[i_max, 1:]) + + # Step 4 (backward): backprop through row-sum constraint. + # A_pre[r, 0] = -sum_c B[r-1, c], so d(A_pre[r,0])/d(B[r-1,c]) = -1 + dfdB = dfdA_pre_lower[:, 1:].copy() + dfdB += -dfdA_pre_lower[:, 0:1] + + return dfdB.ravel() + + +@d.dedent +def _opt_soft( + X: ArrayLike, + rot_matrix: ArrayLike, + method: str = "Nelder-Mead", +) -> Tuple[ArrayLike, ArrayLike, float]: r""" Optimize the G-PCCA rotation matrix such that the memberships are exclusively non-negative and compute the membership matrix. @@ -517,8 +653,23 @@ def _opt_soft(X: ArrayLike, rot_matrix: ArrayLike) -> Tuple[ArrayLike, ArrayLike # Now reshape rot_crop_matrix into a linear vector alpha. k = m - 1 alpha = np.reshape(rot_crop_matrix, k**2) - # TODO: Implement Gauss Newton Optimization to speed things up esp. for m > 10 - alpha, fopt, _, _, _ = fmin(_objective, alpha, args=(X,), full_output=True, disp=False) + + if method == "Nelder-Mead": + alpha, fopt, _, _, _ = fmin(_objective, alpha, args=(X,), full_output=True, disp=False) + elif method in ("L-BFGS-B", "BFGS", "CG"): + kwargs: Dict = {"method": method, "jac": _jacobian} + if method == "L-BFGS-B": + kwargs["bounds"] = [(-1, 1)] * k**2 + else: + kwargs["options"] = {"disp": False} + result = minimize(_objective, alpha, args=(X,), **kwargs) + alpha = result.x + fopt = result.fun + else: + raise ValueError( + f"Invalid optimization method `{method!r}`. " + f"Valid options are: 'Nelder-Mead', 'L-BFGS-B', 'BFGS', 'CG'." + ) # Now reshape alpha into a (k,k)-matrix. rot_crop_matrix = np.reshape(alpha, (k, k)) @@ -548,7 +699,25 @@ def _opt_soft(X: ArrayLike, rot_matrix: ArrayLike) -> Tuple[ArrayLike, ArrayLike @d.dedent -def _fill_matrix(rot_matrix: ArrayLike, X: ArrayLike) -> ArrayLike: +class _FillMatrixResult(NamedTuple): + """Intermediates from :func:`_fill_matrix` needed by the Jacobian.""" + + rot_matrix: ArrayLike + """Feasible rotation matrix of shape ``(m, m)``.""" + A_pre: ArrayLike + """Pre-scaling rotation matrix of shape ``(m, m)``.""" + argmax_indices: ArrayLike + """Row indices of the maximum in each column, shape ``(m,)``.""" + scale: float + """Scaling factor ``sum(A_pre[0, :])``.""" + + +@d.dedent +def _fill_matrix( + rot_matrix: ArrayLike, + X: ArrayLike, + _return_intermediates: bool = False, +) -> Union[ArrayLike, _FillMatrixResult]: """ Make the rotation matrix feasible. @@ -558,10 +727,14 @@ def _fill_matrix(rot_matrix: ArrayLike, X: ArrayLike) -> ArrayLike: (Infeasible) rotation matrix of shape `(m, m)`. X %(Q_sort)s + _return_intermediates + If ``True``, return a :class:`_FillMatrixResult` containing + intermediates needed by the Jacobian backward pass. Returns ------- - Feasible rotation matrix of shape `(m, m)`. + Feasible rotation matrix of shape `(m, m)`, or a + :class:`_FillMatrixResult` if ``_return_intermediates`` is ``True``. """ n, m = X.shape @@ -576,10 +749,13 @@ def _fill_matrix(rot_matrix: ArrayLike, X: ArrayLike) -> ArrayLike: # Compute first row of A by maximum condition. dummy = -np.dot(X[:, 1:], rot_matrix[1:, :]) + argmax_indices = np.argmax(dummy, axis=0) rot_matrix[0, :] = np.max(dummy, axis=0) # Reskale rot_mat to be in the feasible set. - rot_matrix = rot_matrix / np.sum(rot_matrix[0, :]) + scale = np.sum(rot_matrix[0, :]) + A_pre = rot_matrix.copy() if _return_intermediates else None + rot_matrix = rot_matrix / scale # Make sure, that there are no zero or negative elements in the first row of A. if np.any(rot_matrix[0, :] == 0): @@ -587,6 +763,13 @@ def _fill_matrix(rot_matrix: ArrayLike, X: ArrayLike) -> ArrayLike: if np.min(rot_matrix[0, :]) < 0: raise ValueError("First row of rotation matrix has elements < 0.") + if _return_intermediates: + return _FillMatrixResult( + rot_matrix=rot_matrix, + A_pre=A_pre, + argmax_indices=argmax_indices, + scale=scale, + ) return rot_matrix @@ -629,7 +812,13 @@ def _cluster_by_isa(X: ArrayLike) -> Tuple[ArrayLike, float]: @d.dedent -def _gpcca_core(X: ArrayLike) -> Tuple[ArrayLike, ArrayLike, float]: +def _gpcca_core( + X: ArrayLike, + method: str = "Nelder-Mead", + n_starts: int = 1, + perturbation_scale: float = 0.1, + seed: Optional[int] = None, +) -> Tuple[ArrayLike, ArrayLike, float]: r""" Core of the G-PCCA spectral clustering method with optimized memberships [Reuter18]_, [Reuter19]_. @@ -641,6 +830,26 @@ def _gpcca_core(X: ArrayLike) -> Tuple[ArrayLike, ArrayLike, float]: ---------- X %(Q_sort)s + method + Optimization method for the rotation matrix. Valid options are: + + - ``'Nelder-Mead'`` - derivative-free simplex method (default). + - ``'L-BFGS-B'`` - gradient-based with bounds, recommended for ``m > 10``. + - ``'BFGS'`` - gradient-based without bounds. + - ``'CG'`` - conjugate gradient. + n_starts + Number of optimization runs. The first run always uses the + deterministic ISA initialization. Subsequent runs perturb the + initial rotation matrix on the SO(k) manifold. The result + with the best crispness is returned. Set to ``1`` to disable + perturbation (fully deterministic, backward compatible). + perturbation_scale + Angular scale for the rotation perturbation (only used when + ``n_starts > 1``). Recommended range is 0.05-0.2; larger + values risk producing degenerate solutions. + seed + Random seed for reproducibility of the perturbations + (only used when ``n_starts > 1``). Returns ------- @@ -655,14 +864,57 @@ def _gpcca_core(X: ArrayLike) -> Tuple[ArrayLike, ArrayLike, float]: """ m = np.shape(X)[1] - rot_matrix = _initialize_rot_matrix(X) + rot_matrix_init = _initialize_rot_matrix(X) + + if n_starts <= 1: + # Single run: fully backward compatible, no randomness. + rot_matrix, chi, fopt = _opt_soft( + X, rot_matrix_init, method=method + ) + crispness = (m - fopt) / m + return chi, rot_matrix, crispness + + # Multi-start: run from deterministic init + perturbed inits, + # keep the result with the best crispness. + rng = np.random.default_rng(seed) + best_chi, best_rot, best_fopt = None, None, np.inf - rot_matrix, chi, fopt = _opt_soft(X, rot_matrix) + for i in range(n_starts): + if i == 0: + rot_start = rot_matrix_init + else: + rot_start = _perturb_rotation( + rot_matrix_init, perturbation_scale, rng + ) + try: + rot_i, chi_i, fopt_i = _opt_soft( + X, rot_start, method=method + ) + except ValueError: + # Perturbation led to infeasible solution; skip. + continue + + # For perturbed starts, check that every cluster is the argmax + # for at least one row. A degenerate chi where one column is + # never dominant leads to empty clusters on discretization. + # The deterministic start (i=0) is always accepted to guarantee + # we return a result even if perturbed starts all fail. + if i > 0: + assignments = np.argmax(chi_i, axis=1) + if len(set(assignments)) < m: + continue - # calculate crispness of the decomposition of the state space into m clusters - crispness = (m - fopt) / m + if fopt_i < best_fopt: + best_chi, best_rot, best_fopt = chi_i, rot_i, fopt_i - return chi, rot_matrix, crispness + if best_chi is None: + raise ValueError( + "All optimization starts failed. " + "Try reducing `perturbation_scale`." + ) + + crispness = (m - best_fopt) / m + return best_chi, best_rot, crispness @d.dedent @@ -706,6 +958,7 @@ def gpcca_coarsegrain( eta: Optional[ArrayLike] = None, z: Literal["LM", "LR"] = "LM", method: str = DEFAULT_SCHUR_METHOD, + optimization_method: str = "Nelder-Mead", ) -> ArrayLike: r""" Coarse-grain the transition matrix `P` into `m` sets using G-PCCA [Reuter18]_, [Reuter19]_. @@ -727,6 +980,9 @@ def gpcca_coarsegrain( %(method)s See the `installation `_ instructions for more information. + optimization_method + Optimization method for the rotation matrix. Valid options are + ``'Nelder-Mead'`` (default), ``'L-BFGS-B'``, ``'BFGS'``, ``'CG'``. Returns ------- @@ -737,7 +993,7 @@ def gpcca_coarsegrain( If you use this code or parts of it, please cite [Reuter19]_. """ # Matlab: Pc = pinv(chi'*diag(eta)*chi)*(chi'*diag(eta)*P*chi) - chi = GPCCA(P, eta=eta, z=z, method=method).optimize(m).memberships + chi = GPCCA(P, eta=eta, z=z, method=method).optimize(m, method=optimization_method).memberships return _coarsegrain(P, eta=eta, chi=chi) @@ -904,6 +1160,10 @@ def minChi(self, m_min: int, m_max: int) -> List[float]: def optimize( self, m: Union[int, Tuple[int, int], List[int], Dict[str, int]], + method: str = "Nelder-Mead", + n_starts: int = 1, + perturbation_scale: float = 0.1, + seed: Optional[int] = None, ) -> "GPCCA": r""" Full G-PCCA spectral clustering method with optimized memberships [Reuter18]_, [Reuter19]_. @@ -930,6 +1190,28 @@ def optimize( See :meth:`minChi` for selection of good (potentially optimal) number of clusters. + method + Optimization method for the rotation matrix. Valid options are: + + - ``'Nelder-Mead'`` - derivative-free simplex method (default). + - ``'L-BFGS-B'`` - gradient-based with bounds, recommended for + ``m > 10``. + - ``'BFGS'`` - gradient-based without bounds. + - ``'CG'`` - conjugate gradient. + n_starts + Number of optimization runs. The first run uses the + deterministic ISA initialization; subsequent runs perturb + the initial rotation matrix on the SO(k) manifold. The + result with the best crispness is returned. Set to ``1`` + to disable perturbation (fully deterministic, backward + compatible). + perturbation_scale + Angular scale for the rotation perturbation (only used + when ``n_starts > 1``). Recommended range is 0.05-0.2; + larger values risk producing degenerate solutions. + seed + Random seed for reproducibility of the perturbations + (only used when ``n_starts > 1``). Returns ------- @@ -1035,7 +1317,13 @@ def optimize( # Reduce X according to m and make a work copy. # Xm = np.copy(X[:, :m]) - chi, rot_matrix, crispness = _gpcca_core(self._p_X[:, :m]) + chi, rot_matrix, crispness = _gpcca_core( + self._p_X[:, :m], + method=method, + n_starts=n_starts, + perturbation_scale=perturbation_scale, + seed=seed, + ) # check if we have at least m dominant sets. If less than m, we warn. nmeta = np.count_nonzero(chi.sum(axis=0)) if m > nmeta: diff --git a/tests/test_gpcca.py b/tests/test_gpcca.py index 20dc80b..59094b5 100644 --- a/tests/test_gpcca.py +++ b/tests/test_gpcca.py @@ -32,12 +32,14 @@ import pytest from scipy.linalg import lu, pinv, hilbert, subspace_angles +from scipy.optimize import approx_fprime from scipy.sparse import issparse, csr_matrix import numpy as np from pygpcca._gpcca import ( GPCCA, _do_schur, + _jacobian, _opt_soft, _objective, _gpcca_core, @@ -45,6 +47,7 @@ _indexsearch, _cluster_by_isa, _gram_schmidt_mod, + _perturb_rotation, gpcca_coarsegrain, _initialize_rot_matrix, ) @@ -989,6 +992,177 @@ def test_optimize_range_all_invalid(self, P_2: np.ndarray, mocker): g.optimize([3, P_2.shape[0]]) +class TestGradientOptimization: + """Tests for gradient-based optimization and multi-start.""" + + @pytest.mark.parametrize("m", [3, 5]) + def test_jacobian_vs_finite_differences(self, m: int): + """Verify _jacobian matches scipy.optimize.approx_fprime. + + The ISA initialization (``_initialize_rot_matrix``) places alpha at + a degenerate point where multiple rows of ``dummy`` achieve the same + maximum in ``_fill_matrix``, making ``argmax`` — and therefore the + gradient — ill-defined. This is analogous to ``d|x|/dx`` at ``x=0``. + The test evaluates the Jacobian at a randomly perturbed alpha away + from this singularity, where the ``argmax`` gaps are stable under + finite-difference perturbations. + """ + rng = np.random.default_rng(42) + block_size = 300 // m + n = block_size * m + P = np.zeros((n, n)) + coupling = 0.01 + for b in range(m): + s, e = b * block_size, (b + 1) * block_size + block = rng.dirichlet(np.ones(block_size), size=block_size) + P[s:e, s:e] = block * (1 - coupling) + for b2 in range(m): + if b2 != b: + s2, e2 = b2 * block_size, (b2 + 1) * block_size + P[s:e, s2:e2] = coupling / (m - 1) / block_size + P = P / P.sum(axis=1, keepdims=True) + + g = GPCCA(P, eta=None, z="LM", method="brandts") + g._do_schur_helper(m) + svecs = g._p_X[:, :m] + + # Perturb away from the ISA initialization to avoid the argmax + # degeneracy (where the Jacobian is not well-defined). + rot_matrix = _initialize_rot_matrix(svecs) + alpha = rot_matrix[1:, 1:].ravel().copy() + alpha += rng.normal(0, 0.1, size=alpha.shape) + + jac_analytical = _jacobian(alpha, svecs) + jac_fd = approx_fprime(alpha, _objective, 1e-7, svecs) + + rel_err = np.abs(jac_analytical - jac_fd) / np.maximum( + np.abs(jac_fd), 1e-10 + ) + assert np.max(rel_err) < 1e-3, ( + f"m={m}: Jacobian max relative error " + f"{np.max(rel_err):.2e} exceeds 1e-3" + ) + + @pytest.mark.parametrize("mu_val", [0, 100, 1000]) + def test_cg_vs_nelder_mead(self, mu_val: int): + """CG produces crispness close to Nelder-Mead on well-separated spectra.""" + m = 3 + P, sd = get_known_input(mu(mu_val)) + X, _, _ = _do_schur(P, eta=sd, m=m) + svecs = X[:, :m] + + A_nm = _initialize_rot_matrix(svecs) + _, chi_nm, fopt_nm = _opt_soft(svecs, A_nm, method="Nelder-Mead") + + A_cg = _initialize_rot_matrix(svecs) + _, chi_cg, fopt_cg = _opt_soft(svecs, A_cg, method="CG") + + crisp_nm = (m - fopt_nm) / m + crisp_cg = (m - fopt_cg) / m + + assert crisp_cg >= crisp_nm - 0.05, ( + f"mu={mu_val}: CG crispness ({crisp_cg:.4f}) " + f"much worse than NM ({crisp_nm:.4f})" + ) + + assert_allclose(chi_cg.sum(axis=1), 1.0, atol=1e-10) + assert np.all(chi_cg >= -1e-8) + + @pytest.mark.parametrize( + "method", ["Nelder-Mead", "L-BFGS-B", "BFGS", "CG"] + ) + def test_all_methods_produce_valid_memberships(self, method: str): + """All optimization methods produce valid membership matrices.""" + m = 3 + P, sd = get_known_input(mu(0)) + X, _, _ = _do_schur(P, eta=sd, m=m) + svecs = X[:, :m] + + A = _initialize_rot_matrix(svecs) + rot_matrix, chi, fopt = _opt_soft(svecs, A, method=method) + + assert_allclose(chi.sum(axis=1), 1.0, atol=1e-10) + assert np.all(chi >= -1e-8), f"{method}: min(chi)={chi.min():.2e}" + crispness = (m - fopt) / m + assert crispness > 0, f"{method}: non-positive crispness {crispness}" + + def test_gpcca_optimize_with_cg(self): + """GPCCA.optimize() works with CG through the full pipeline.""" + P, sd = get_known_input(mu(0)) + g = GPCCA(P, eta=sd, z="LM", method="brandts") + g.optimize(3, method="CG") + + chi = g.memberships + assert chi is not None + assert chi.shape[1] == 3 + assert_allclose(chi.sum(axis=1), 1.0, atol=1e-10) + + def test_invalid_method_raises(self): + """Invalid optimization method raises ValueError.""" + P, sd = get_known_input(mu(0)) + X, _, _ = _do_schur(P, eta=sd, m=3) + svecs = X[:, :3] + A = _initialize_rot_matrix(svecs) + + with pytest.raises( + ValueError, match="Invalid optimization method" + ): + _opt_soft(svecs, A, method="invalid") + + def test_perturb_rotation_stays_on_manifold(self): + """Perturbed rotation matrices preserve orthogonality.""" + rng = np.random.default_rng(0) + m = 5 + P, sd = get_known_input(mu(0)) + X, _, _ = _do_schur(P, eta=sd, m=m) + svecs = X[:, :m] + rot_matrix = _initialize_rot_matrix(svecs) + + for eps in [0.05, 0.1, 0.5]: + rot_p = _perturb_rotation(rot_matrix, eps, rng) + # Shape preserved + assert rot_p.shape == rot_matrix.shape + # First row unchanged (only [1:, 1:] block is perturbed) + assert_allclose(rot_p[0, :], rot_matrix[0, :]) + assert_allclose(rot_p[1:, 0], rot_matrix[1:, 0]) + # The [1:, 1:] block should still be related by an + # orthogonal transformation, so det should be ±1 + R_block = rot_p[1:, 1:] @ np.linalg.pinv(rot_matrix[1:, 1:]) + assert abs(abs(np.linalg.det(R_block)) - 1.0) < 1e-10 + + def test_multi_start_improves_or_matches_single(self): + """Multi-start with CG achieves crispness >= single start.""" + m = 5 + P, sd = get_known_input(mu(0)) + + g1 = GPCCA(P, eta=sd, z="LM", method="brandts") + g1.optimize(m, method="CG", n_starts=1) + + g10 = GPCCA(P, eta=sd, z="LM", method="brandts") + g10.optimize(m, method="CG", n_starts=10, seed=42) + + # Multi-start picks the best, so crispness >= single start + assert g10._crispness >= g1._crispness - 1e-10 + + # Both produce valid memberships + assert_allclose(g10.memberships.sum(axis=1), 1.0, atol=1e-10) + assert np.all(g10.memberships >= -1e-8) + + def test_multi_start_deterministic_with_seed(self): + """Same seed produces identical results.""" + m = 5 + P, sd = get_known_input(mu(0)) + + g1 = GPCCA(P, eta=sd, z="LM", method="brandts") + g1.optimize(m, method="CG", n_starts=5, seed=123) + + g2 = GPCCA(P, eta=sd, z="LM", method="brandts") + g2.optimize(m, method="CG", n_starts=5, seed=123) + + assert_allclose(g1.memberships, g2.memberships) + assert_allclose(g1._crispness, g2._crispness) + + class TestUtils: def test_transition_matrix_dtype(self, P_2: np.ndarray): g = GPCCA(P_2, eta=None, z="LR")