matchms · lfischerr · Mar 25, 2026 · Mar 25, 2026
diff --git a/graphconstructor/operators/doubly_stochastic.py b/graphconstructor/operators/doubly_stochastic.py
@@ -1,9 +1,12 @@
 from dataclasses import dataclass
+import networkx as nx
 import numpy as np
 from ..graph import Graph
 from .base import GraphOperator
 
 
+# Z. 48-57:
+# https://gitlab.liris.cnrs.fr/coregraphie/netbone/-/blob/main/netbone/structural/doubly_stochastic.py?ref_type=heads
 @dataclass(slots=True)
 class DoublyStochastic(GraphOperator):
     """
@@ -30,6 +33,7 @@ class DoublyStochastic(GraphOperator):
         Copy metadata frame if present. Default True.
     """
 
+    backbone_method: bool = False
     tolerance: float = 1e-5
     max_iter: int = 10_000
     copy_meta: bool = True
@@ -84,14 +88,12 @@ def apply(self, G: Graph) -> Graph:
 
             # Only check rows/cols that have edges (others stay 0 and are irrelevant)
             if row_has_edges.any():
-                rows_ok = np.all((row_sums[row_has_edges] >= min_thres) &
-                                 (row_sums[row_has_edges] <= max_thres))
+                rows_ok = np.all((row_sums[row_has_edges] >= min_thres) & (row_sums[row_has_edges] <= max_thres))
             else:
                 rows_ok = True
 
             if col_has_edges.any():
-                cols_ok = np.all((col_sums[col_has_edges] >= min_thres) &
-                                 (col_sums[col_has_edges] <= max_thres))
+                cols_ok = np.all((col_sums[col_has_edges] >= min_thres) & (col_sums[col_has_edges] <= max_thres))
             else:
                 cols_ok = True
 
@@ -105,6 +107,41 @@ def apply(self, G: Graph) -> Graph:
         # col scaling
         A_scaled.data *= c[A_scaled.indices]
 
+        # step 2
+        if self.backbone_method:
+            i = 0
+
+            rows, cols = A_scaled.nonzero()
+            vals = A_scaled.data
+
+            order = np.argsort(vals)[::-1]
+            rows = rows[order]
+            cols = cols[order]
+            vals = vals[order]
+            print(rows, cols, order)
+
+            if not G.directed:
+                G_filtered = nx.Graph()
+                while (
+                    nx.number_connected_components(G_filtered) != 1
+                    or len(G_filtered) < A_scaled.shape[0]
+                    or not nx.is_connected(G_filtered)
+                ):
+                    if i == A_scaled.shape[0]:
+                        break
+                    G_filtered.add_edge(rows[i], cols[i], weight=vals[i])
+                    i += 1
+            G_csr = nx.to_scipy_sparse_array(G_filtered)
+
+            return Graph.from_csr(
+                G_csr,
+                directed=G.directed,
+                weighted=True,
+                mode=G.mode,
+                # meta=(G.meta.copy() if (self.copy_meta and G.meta is not None) else G.meta),
+                sym_op="max",
+            )
+
         return Graph.from_csr(
             A_scaled,
             directed=G.directed,

diff --git a/tests/test_doubly_stochastic.py b/tests/test_doubly_stochastic.py
@@ -16,12 +16,15 @@ def _csr(data, rows, cols, n):
 # ----------------- Positive dense matrix: converges to ~doubly stochastic -----------------
 def test_doubly_stochastic_converges_on_positive_dense():
     # Strictly positive, symmetric 4x4 (undirected)
-    M = np.array([
-        [0.2, 0.8, 0.5, 0.3],
-        [0.7, 0.1, 0.4, 0.6],
-        [0.3, 0.9, 0.2, 0.5],
-        [0.5, 0.2, 0.7, 0.4],
-    ], dtype=float)
+    M = np.array(
+        [
+            [0.2, 0.8, 0.5, 0.3],
+            [0.7, 0.1, 0.4, 0.6],
+            [0.3, 0.9, 0.2, 0.5],
+            [0.5, 0.2, 0.7, 0.4],
+        ],
+        dtype=float,
+    )
     # Zero the diagonal (typical adjacency semantics)
     np.fill_diagonal(M, 0.0)
 
@@ -43,6 +46,49 @@ def test_doubly_stochastic_converges_on_positive_dense():
     assert not G.directed and G.weighted
 
 
+def test_doubly_stochastic_with_backbone_method():
+    # Strictly positive, asymmetric matrix (to make backbone relevant)
+    M = np.array(
+        [
+            [0.2, 0.8, 0.5, 0.3],
+            [0.7, 0.1, 0.4, 0.6],
+            [0.3, 0.9, 0.2, 0.5],
+            [0.5, 0.2, 0.7, 0.4],
+        ],
+        dtype=float,
+    )
+
+    # Zero diagonal
+    np.fill_diagonal(M, 0.0)
+
+    G0 = Graph.from_dense(M, directed=False, weighted=True, mode="similarity", sym_op="max")
+
+    op = DoublyStochastic(tolerance=1e-6, max_iter=10_000, backbone_method=True)
+
+    G = op.apply(G0)
+    A = G.adj
+
+    # No NaNs or infs
+    assert np.isfinite(A.data).all()
+
+    # Check that only backbone edges remain (sparser than original)
+    assert A.nnz <= G0.adj.nnz
+
+    # Rows/cols should still approximately sum to 1 (on non-isolated nodes)
+    row_sums = np.asarray(A.sum(axis=1)).ravel()
+    col_sums = np.asarray(A.sum(axis=0)).ravel()
+
+    # Only check nodes that still have edges
+    nonzero_rows = row_sums > 0
+    nonzero_cols = col_sums > 0
+
+    assert np.allclose(row_sums[nonzero_rows], row_sums[nonzero_rows][0], atol=1e-6)
+    assert np.allclose(col_sums[nonzero_cols], col_sums[nonzero_cols][0], atol=1e-6)
+
+    # Graph properties preserved
+    assert not G.directed and G.weighted
+
+
 # ----------------- Sparse graph with isolates: zero rows/cols remain zero, others ~1 -----------------
 def test_doubly_stochastic_sparse_with_isolates():
     # 5 nodes, node 4 is isolated
@@ -65,7 +111,7 @@ def test_doubly_stochastic_sparse_with_isolates():
     col_sums = np.asarray(A2.sum(axis=0)).ravel()
 
     # Indices with edges
-    rows_with = (np.diff(A2.indptr) > 0)
+    rows_with = np.diff(A2.indptr) > 0
     cols_with = (sp.csc_matrix(A2).indptr[1:] - sp.csc_matrix(A2).indptr[:-1]) > 0
 
     # Non-isolated rows/cols sum ~1
@@ -81,6 +127,35 @@ def test_doubly_stochastic_sparse_with_isolates():
     assert not G.directed and G.weighted
 
 
+def test_doubly_stochastic_sparse_with_isolates_backbone():
+    A = _csr(
+        data=[0.4, 0.6, 0.3, 0.7, 0.2, 0.5],
+        rows=[0, 0, 1, 1, 2, 3],
+        cols=[1, 2, 2, 3, 3, 2],
+        n=5,
+    )
+    G0 = Graph.from_csr(A, directed=False, weighted=True, mode="similarity", sym_op="max")
+
+    op = DoublyStochastic(tolerance=1e-6, max_iter=10_000, backbone_method=True)
+    G = op.apply(G0)
+    A2 = G.adj
+
+    assert np.isfinite(A2.data).all()
+
+    row_sums = np.asarray(A2.sum(axis=1)).ravel()
+    col_sums = np.asarray(A2.sum(axis=0)).ravel()
+    rows_with = np.diff(A2.indptr) > 0
+    cols_with = (sp.csc_matrix(A2).indptr[1:] - sp.csc_matrix(A2).indptr[:-1]) > 0
+
+    if rows_with.any():
+        assert np.all(row_sums[rows_with] > 0)
+    if cols_with.any():
+        assert np.all(col_sums[cols_with] > 0)
+
+    # Isolated node (4) stays isolated (not in the graph)
+    assert len(row_sums) == 4
+
+
 # ----------------- Directed case: rows and cols ~1 for nonzero rows/cols -----------------
 def test_doubly_stochastic_directed_graph_unsolvable():
     # Directed 4x4 with zeros on diagonal, not symmetric
@@ -100,12 +175,7 @@ def test_doubly_stochastic_directed_graph_unsolvable():
     # No NaNs or infs
     assert np.isfinite(A2.data).all()
 
-    expected_result = np.array([
-        [0. , 0.5, 0.5, 0. ],
-        [0. , 0. , 1. , 0. ],
-        [0.5, 0. , 0. , 0.5],
-        [0. , 1. , 0. , 0. ]
-        ])
+    expected_result = np.array([[0.0, 0.5, 0.5, 0.0], [0.0, 0.0, 1.0, 0.0], [0.5, 0.0, 0.0, 0.5], [0.0, 1.0, 0.0, 0.0]])
     assert np.allclose(A2.toarray(), expected_result, atol=1e-4)
 
     # Directed flag preserved