Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,11 @@ D = (D + D.T) / 2
np.fill_diagonal(D, 0.0)

# Import (from dense array)
G0 = from_dense(D, directed=False)
G0 = from_dense(D, directed=False, mode="distance")
```

### 2) Refine a graph (operators)
Some operators can work on both distance and similarity based graphs, such as kNN and a simple weight thresholding method:

```python
from graphconstructor.operators import KNNSelector, WeightThreshold
Expand All @@ -98,6 +99,19 @@ G_refined = KNNSelector(k=5, mutual=True, mutual_k=20, mode="distance").apply(G0
G_pruned = WeightThreshold(threshold=0.3, mode="distance").apply(G_refined)
```

Other operators require particular either similarity or distance values as weights. In such cases it can be necessary to switch from distance to similarity measures (or vice versa).

```python
from graphconstructor.operators import NoiseCorrected

# There are various method for distance/similarity conversions (including using a custom function)
G0_sim = G0.convert_mode("similarity", method="exp")

# Once converted to similarities, other operators become available
G_refined = NoiseCorrected().apply(G0_sim)
```


### 3) Export when needed

```python
Expand Down
173 changes: 152 additions & 21 deletions graphconstructor/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pandas as pd
import scipy.sparse as sp
from scipy.sparse.csgraph import connected_components
from .utils import ConversionMethod, Mode, _drop_diagonal, convert_adjacency_mode


SymOp = Literal["max", "min", "average"]
Expand All @@ -22,18 +23,29 @@ class Graph:
- `adj`: CSR adjacency of shape (n, n)
- `directed`: True if directed, else undirected (stored symmetric)
- `weighted`: True if edge weights are meaningful; if False, all edges are 1.0
- `mode`: "distance" or "similarity" (for interpretation of weights)
- `meta`: pandas DataFrame with n rows (optional). May have a 'name' column.
- `ignore_selfloops`: If True, self-loops are ignored/removed (default for undirected graphs)
- `keep_explicit_zeros`: If True, explicit zeros in adjacency are kept (default for distance graphs)
"""
adj: sp.csr_matrix
directed: bool
weighted: bool
mode: str
meta: pd.DataFrame | None = None
ignore_selfloops: bool = None
keep_explicit_zeros: bool = None

def __post_init__(self):
# Default: ignore self-loops for undirected graphs
if self.ignore_selfloops is None:
self.ignore_selfloops = not self.directed
# Default: keep explicit zeros for distance graphs
if self.keep_explicit_zeros is None:
self.keep_explicit_zeros = self.mode == "distance"
# Check mode
if self.mode not in {"distance", "similarity"}:
raise ValueError("mode must be 'distance' or 'similarity'.")

# -------- Construction helpers --------
@staticmethod
Expand All @@ -45,73 +57,122 @@ def _ensure_csr(M: sp.spmatrix | np.ndarray) -> sp.csr_matrix:
raise TypeError("Adjacency must be square (n x n).")
return sp.csr_matrix(arr)


@staticmethod
def _preserve_explicit_zeros(original: sp.csr_matrix, result: sp.csr_matrix) -> sp.csr_matrix:
"""Reinsert explicit zeros that were present in `original` into `result` (CSR).
This avoids CSR ops (like max/min/avg) pruning stored zeros."""
coo = original.tocoo()
zmask = (coo.data == 0)
if not np.any(zmask):
return result
zr = coo.row[zmask]
zc = coo.col[zmask]
# Merge by concatenating coordinates with zero data; CSR will coalesce duplicates.
res_coo = result.tocoo()
rows = np.concatenate([res_coo.row, zr])
cols = np.concatenate([res_coo.col, zc])
data = np.concatenate([res_coo.data, np.zeros(zr.size, dtype=float)])
return sp.csr_matrix((data, (rows, cols)), shape=result.shape)

@staticmethod
def _symmetrize(A: sp.csr_matrix, how: SymOp = "max") -> sp.csr_matrix:
def _symmetrize(A: sp.csr_matrix, how: SymOp = "max",
*, preserve_zeros_from: sp.csr_matrix | None = None) -> sp.csr_matrix:
if how == "max":
return A.maximum(A.T)
if how == "min":
return A.minimum(A.T)
if how == "average":
return (A + A.T) * 0.5
raise ValueError("Unsupported symmetrization op. Use 'max', 'min', or 'average'.")
B = A.maximum(A.T)
elif how == "min":
B = A.minimum(A.T)
elif how == "average":
B = (A + A.T) * 0.5
else:
raise ValueError("Unsupported symmetrization op. Use 'max', 'min', or 'average'.")
# If asked, reinsert explicit zeros that existed before symmetrization.
if preserve_zeros_from is not None:
B = Graph._preserve_explicit_zeros(preserve_zeros_from, B)
return B

@classmethod
def from_csr(
cls,
adj: sp.spmatrix | np.ndarray,
mode: str,
*,
directed: bool = False,
weighted: bool = True,
meta: pd.DataFrame | None = None,
ignore_selfloops: bool = None,
keep_explicit_zeros: bool = None,
sym_op: SymOp = "max",
copy: bool = False,
) -> "Graph":

# Ignore self-loops (unless directed or specified otherwise)
if ignore_selfloops is None:
ignore_selfloops = not directed
# Keep explicit zeros (unless similarity or specified otherwise)
if keep_explicit_zeros is None:
keep_explicit_zeros = mode == "distance"

A = cls._ensure_csr(adj)
if not weighted:
if not copy and sp.issparse(adj):
A = A.copy()
A.data[:] = 1.0
if not directed:
A = cls._symmetrize(A, how=sym_op)
# Ignore self-loops (unless directed or specified otherwise)
if ignore_selfloops is None:
ignore_selfloops = not directed
if ignore_selfloops and A.diagonal().any():
A = A.tolil(copy=False)
A.setdiag(0)
A = A.tocsr(copy=False)
A.eliminate_zeros()
preserve_src = A if keep_explicit_zeros else None
A = cls._symmetrize(A, how=sym_op, preserve_zeros_from=preserve_src)

if mode == "similarity" and ignore_selfloops and A.diagonal().any():
A = _drop_diagonal(A)
if mode == "distance" and ignore_selfloops and (A.diagonal() == 0).any():
A = _drop_diagonal(A)

n = A.shape[0]
if meta is not None:
if len(meta) != n:
raise ValueError(f"meta has {len(meta)} rows but adjacency is {n}x{n}.")
meta = meta.reset_index(drop=True)
return cls(adj=A.astype(float, copy=False), directed=directed, weighted=weighted, meta=meta)
return cls(
adj=A.astype(float, copy=False),
directed=directed, weighted=weighted,
mode=mode, ignore_selfloops=ignore_selfloops,
meta=meta,
keep_explicit_zeros=keep_explicit_zeros,
)

@classmethod
def from_dense(
cls,
adj: np.ndarray,
mode: str,
**kwargs,
) -> "Graph":
return cls.from_csr(adj, **kwargs)
return cls.from_csr(adj, mode=mode, **kwargs)

@classmethod
def from_edges(
cls,
n: int,
edges: Sequence[tuple[int, int]] | np.ndarray,
mode: str,
weights: Sequence[float] | np.ndarray | None = None,
*,
directed: bool = False,
weighted: bool = True,
meta: pd.DataFrame | None = None,
ignore_selfloops: bool = None,
keep_explicit_zeros: bool = None,
sym_op: SymOp = "max",
) -> "Graph":
"""Build from an edge list. For undirected=True, we symmetrize later."""

# Ignore self-loops (unless directed or specified otherwise)
if ignore_selfloops is None:
ignore_selfloops = not directed
# Keep explicit zeros (unless similarity or specified otherwise)
if keep_explicit_zeros is None:
keep_explicit_zeros = mode == "distance"

if isinstance(edges, np.ndarray):
if edges.ndim != 2 or edges.shape[1] != 2:
raise TypeError("edges ndarray must be shape (m, 2).")
Expand All @@ -138,8 +199,11 @@ def from_edges(
return cls.from_csr(
A, directed=directed,
weighted=weighted_eff,
mode=mode,
ignore_selfloops=ignore_selfloops,
meta=meta, sym_op=sym_op)
meta=meta, sym_op=sym_op,
keep_explicit_zeros=keep_explicit_zeros,
)

# -------- Core properties --------
@property
Expand Down Expand Up @@ -197,7 +261,73 @@ def drop(self, nodes: Iterable[int | str]) -> "Graph":

A2 = self.adj[keep_mask][:, keep_mask].tocsr(copy=False)
meta2 = self.meta.loc[keep_mask].reset_index(drop=True) if self.meta is not None else None
return Graph(adj=A2, directed=self.directed, weighted=self.weighted, meta=meta2)
return Graph(adj=A2, directed=self.directed, weighted=self.weighted, mode=self.mode, meta=meta2)

# ----- Convert distance/similarity -----
def convert_mode(
self,
target_mode: Mode,
method: ConversionMethod = "reciprocal",
inplace: bool = False,
**kwargs
) -> "Graph":
"""
Convert graph weights between distance and similarity representations.

Parameters
----------
target_mode : {"distance", "similarity"}
Desired mode for edge weights.
method : str or callable, default="reciprocal"
Conversion method to use. Options:

- "reciprocal": similarity = 1/distance (default, bidirectional)
- "negative": similarity = -distance (bidirectional, for optimization)
- "exp": similarity = exp(-distance) (distance -> similarity only)
- "gaussian": similarity = exp(-distance^2/(2*sigma^2)) (distance -> similarity only)
- Custom callable: func(weights) -> converted_weights
inplace : bool, default=False
If True, modify this graph in place and return self.
If False (default), create and return a new Graph instance.
**kwargs
Additional parameters for conversion:

- epsilon (float): Small value to avoid division by zero (default: 1e-10)
- sigma (float): Bandwidth for gaussian method (default: 1.0)
"""
if target_mode not in ("distance", "similarity"):
raise ValueError(
f"target_mode must be 'distance' or 'similarity', got '{target_mode}'"
)

if self.mode == target_mode:
raise ValueError(
f"Graph is already in mode '{target_mode}'. No conversion needed."
)

# Convert the adjacency matrix
new_adj = convert_adjacency_mode(
self.adj,
source_mode=self.mode,
target_mode=target_mode,
method=method,
inplace=inplace,
**kwargs
)

if inplace:
self.adj = new_adj
self.mode = target_mode
return self
else:
# Create a new Graph instance
return Graph(
adj=new_adj,
mode=target_mode,
directed=self.directed,
weighted=self.weighted,
meta=None if self.meta is None else self.meta.copy(),
)

# -------- Exporters --------
def to_networkx(self):
Expand Down Expand Up @@ -250,6 +380,7 @@ def copy(self) -> "Graph":
adj=self.adj.copy(),
directed=self.directed,
weighted=self.weighted,
mode=self.mode,
meta=None if self.meta is None else self.meta.copy(),
)

Expand All @@ -260,7 +391,7 @@ def sorted_by(self, col: str) -> "Graph":
order = np.argsort(self.meta[col].to_numpy())
A2 = self.adj[order][:, order]
meta2 = self.meta.iloc[order].reset_index(drop=True)
return Graph(adj=A2, directed=self.directed, weighted=self.weighted, meta=meta2)
return Graph(adj=A2, directed=self.directed, weighted=self.weighted, mode=self.mode, meta=meta2)

def degree(self, ignore_weights: bool = False) -> np.ndarray | tuple[np.ndarray, np.ndarray]:
"""Return node degree(s).
Expand Down
14 changes: 8 additions & 6 deletions graphconstructor/importers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@

Mode = Literal["distance", "similarity"]

def from_dense(arr, *, directed=False, weighted=True, meta=None, sym_op="max") -> Graph:
return Graph.from_dense(arr, directed=directed, weighted=weighted, meta=meta, sym_op=sym_op)
def from_dense(arr, mode, *, directed=False, weighted=True, meta=None, sym_op="max") -> Graph:
return Graph.from_dense(arr, directed=directed, weighted=weighted, mode=mode, meta=meta, sym_op=sym_op)


def from_csr(adj, *, directed=False, weighted=True, meta=None, sym_op="max") -> Graph:
return Graph.from_csr(adj, directed=directed, weighted=weighted, meta=meta, sym_op=sym_op)
def from_csr(adj, mode, *, directed=False, weighted=True, meta=None, sym_op="max") -> Graph:
return Graph.from_csr(adj, directed=directed, weighted=weighted, mode=mode, meta=meta, sym_op=sym_op)


def from_knn(indices, distances, *, store_weights=True, directed=False, meta=None, sym_op="max") -> Graph:
Expand All @@ -33,7 +33,8 @@ def from_knn(indices, distances, *, store_weights=True, directed=False, meta=Non
# Infer full graph size from neighbor ids
n_full = _infer_n_from_indices(ind)
A = sp.csr_matrix((weights, (rows, cols)), shape=(n_full, n_full))
return Graph.from_csr(A, directed=directed, weighted=store_weights, meta=meta, sym_op=sym_op)
return Graph.from_csr(A, directed=directed, weighted=store_weights, mode="distance",
meta=meta, sym_op=sym_op)


def from_ann(ann, query_data, k: int, *, store_weights=True, directed=False, meta=None, sym_op="max") -> Graph:
Expand All @@ -45,7 +46,8 @@ def from_ann(ann, query_data, k: int, *, store_weights=True, directed=False, met
if query_data is None:
raise TypeError("from_ann requires query_data when index has no cached neighbors.")
ind, dist = idx.query(query_data, k=k)
return from_knn(ind, dist, store_weights=store_weights, directed=directed, meta=meta, sym_op=sym_op)
return from_knn(ind, dist, store_weights=store_weights, directed=directed,
meta=meta, sym_op=sym_op)


# helper functions ---------------------------------------------
Expand Down
8 changes: 8 additions & 0 deletions graphconstructor/operators/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@


class GraphOperator(ABC):
"""Base class for graph operators."""
supported_modes = [] # Specify supported modes from ["distance", "similarity"]

"""Pure transform: Graph -> Graph."""
@abstractmethod
def apply(self, G: Graph) -> Graph: ...

def _check_mode_supported(self, G: Graph):
if G.mode not in self.supported_modes:
raise ValueError(f"{self.__class__.__name__} only supports modes: {self.supported_modes}, got {G.mode}")

Loading