Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,21 @@ If you are new to **pydfc**, we recommend starting with:

This optional AI-assisted workflow is designed to complement — not replace —
the documentation and example scripts.

Generating New dFC Methods with AI
-----------------------------------

You can ask an AI coding assistant (Claude, Copilot, Codex, etc.) to implement
brand-new dFC methods and add them directly to ``pydfc``. Just describe what
you want at whatever level of specificity feels right:

- *"Generate 5 new creative dFC methods."*
- *"Generate 3 new state-based methods."*
- *"Implement a dFC method based on Granger causality."*
- *"Add a method that uses Riemannian geometry on covariance matrices."*
- *"Here is a paper — implement the method it describes."* (paste the PDF or text)

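The AI will read the existing codebase, follow the conventions in
``docs/ADDING_DFC_METHODS.md``, write the new method file, and register it in
``pydfc/dfc_methods/__init__.py`` so it works immediately alongside all other
methods. State-based methods must additionally be registered in
``process_SB_features`` in ``pydfc/ml_utils.py``; otherwise the ML pipeline
fails for that method.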
27 changes: 20 additions & 7 deletions algorithm_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import re
import sys
from pathlib import Path

import numpy as np

# Only calls whose resolved root module starts with one of these are kept as
Expand Down Expand Up @@ -129,7 +130,7 @@ def _make_unique_labels(filepaths):

def _hierarchical_cluster_order(matrix, cluster_method="average"):
"""Return indices that order similar methods next to each other."""
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.cluster.hierarchy import leaves_list, linkage
from scipy.spatial.distance import squareform

if matrix.shape[0] < 2:
Expand Down Expand Up @@ -159,7 +160,9 @@ def plot_similarity_heatmap(
matrix = matrix[np.ix_(order, order)]
labels = [labels[i] for i in order]

fig, ax = plt.subplots(figsize=(max(8, 0.45 * len(labels)), max(6, 0.45 * len(labels))))
fig, ax = plt.subplots(
figsize=(max(8, 0.45 * len(labels)), max(6, 0.45 * len(labels)))
)
image = ax.imshow(matrix, vmin=0.0, vmax=1.0, cmap="viridis", aspect="equal")
fig.colorbar(image, ax=ax, label="AS")

Expand Down Expand Up @@ -187,7 +190,9 @@ def save_similarity_outputs(output_dir, labels, source_paths, matrix, table):
with open(output_dir / "AS_jaccard_source_paths.json", "w", encoding="utf-8") as f:
json.dump(source_paths, f, indent=2)

with open(output_dir / "AS_jaccard_pairs.csv", "w", newline="", encoding="utf-8") as f:
with open(
output_dir / "AS_jaccard_pairs.csv", "w", newline="", encoding="utf-8"
) as f:
fieldnames = [
"method_a",
"method_b",
Expand All @@ -206,9 +211,13 @@ def save_similarity_outputs(output_dir, labels, source_paths, matrix, table):
try:
fig, _ = plot_similarity_heatmap(matrix, labels, cluster=True)
except ImportError:
print("Skipping heatmap export because matplotlib is not available in this environment.")
print(
"Skipping heatmap export because matplotlib is not available in this environment."
)
else:
fig.savefig(str(output_dir / "AS_jaccard_heatmap.png"), dpi=200, bbox_inches="tight")
fig.savefig(
str(output_dir / "AS_jaccard_heatmap.png"), dpi=200, bbox_inches="tight"
)
import matplotlib.pyplot as plt

plt.close(fig)
Expand All @@ -221,7 +230,9 @@ def load_similarity_outputs(output_dir):
labels = np.load(output_dir / "AS_jaccard_names.npy", allow_pickle=True).tolist()
with open(output_dir / "AS_jaccard_source_paths.json", "r", encoding="utf-8") as f:
source_paths = json.load(f)
with open(output_dir / "AS_jaccard_pairs.csv", "r", newline="", encoding="utf-8") as f:
with open(
output_dir / "AS_jaccard_pairs.csv", "r", newline="", encoding="utf-8"
) as f:
table = list(csv.DictReader(f))
return labels, source_paths, matrix, table

Expand Down Expand Up @@ -264,7 +275,9 @@ def main(filepaths):

print(f"{method_a:35s} vs {method_b:35s} AS = {sim:.3f} shared = {shared}")

save_similarity_outputs("algorithm_similarity_results", names, source_paths, alg_sim, pairwise_rows)
save_similarity_outputs(
"algorithm_similarity_results", names, source_paths, alg_sim, pairwise_rows
)
print("Saved outputs to algorithm_similarity_results/")


Expand Down
98 changes: 49 additions & 49 deletions compute_feature_similarity.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import pickle
import sys
import numpy as np
from pathlib import Path
from collections import defaultdict
from pathlib import Path

from pydfc.comparison import SimilarityAssessment # pip install pydfc
import numpy as np

import pickle
from pydfc.comparison import SimilarityAssessment # pip install pydfc

# FULL PATH usually looks like:
# "{path_to_datasets}/{dataset_id}/derivatives/dFC_assessed/{subject_id}/{session_id}/*.npy"
Expand All @@ -17,7 +17,7 @@
print("Missing a path to the datasets directory")
print("Usage: sbatch run_dfc.sh <path_to_datasets>")
sys.exit(1)

path_to_datasets = sys.argv[1]

root = Path(path_to_datasets)
Expand All @@ -28,25 +28,21 @@
# where matrix.shape = (1, num_methods, num_methods) and contains the similarity values between methods

similarity = defaultdict(
lambda: defaultdict(
lambda: defaultdict(
lambda: defaultdict(dict)
)
)
lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
)

for dataset_dir in root.iterdir():

if not dataset_dir.is_dir():
continue

if not dataset_dir.name.startswith("ds"):
continue

dataset_id = dataset_dir.name

dfc_dir = dataset_dir / "derivatives" / "dFC_assessed"

if not dfc_dir.is_dir():
print(f"Skipping {dataset_id} since /derivatives/dFC_assessed not found")
continue
Expand All @@ -61,104 +57,108 @@
# If no session folders, treat the subject directory as the session directory
# to avoid file path issues. In this case, session_id will be set to None later.
session_dirs = [
p for p in subject_dir.iterdir()
if p.is_dir() and p.name.startswith("ses-")
p for p in subject_dir.iterdir() if p.is_dir() and p.name.startswith("ses-")
]

if not session_dirs:
session_dirs = [subject_dir]


for session_dir in session_dirs:

# Group files by identifier
files_by_identifier = defaultdict(list)

for npy_file in session_dir.glob("dFC_*.npy"):

filename = npy_file.stem # removed .npy

_, rest = filename.split("_", 1) # e.g., "dFC", "ses-wave1bas_task-Stroop_run-2_24"
identifier, method_number = rest.rsplit("_", 1) # e.g., "ses-wave1bas_task-Stroop_run-2", "24"
filename = npy_file.stem # removed .npy

files_by_identifier[identifier].append(
(int(method_number), npy_file)
)
_, rest = filename.split(
"_", 1
) # e.g., "dFC", "ses-wave1bas_task-Stroop_run-2_24"
identifier, method_number = rest.rsplit(
"_", 1
) # e.g., "ses-wave1bas_task-Stroop_run-2", "24"

files_by_identifier[identifier].append((int(method_number), npy_file))

# Process one identifier at a time (similarity across methods)
for identifier, file_info in files_by_identifier.items():

# Initialize session_id and run_id as None in case they don't exist
session_id = None
run_id = None
task_id = None # must exist, see check later to catch error.

# Get session, task, and run from identifier (if they exist)
for part in identifier.split("_"):
if part.startswith("ses-"): # e.g., "ses-wave1bas"
if part.startswith("ses-"): # e.g., "ses-wave1bas"
session_id = part

elif part.startswith("run-"): # e.g., "run-2"
elif part.startswith("run-"): # e.g., "run-2"
run_id = part

elif part.startswith("task-"): # e.g., "task-Stroop"
task_id = part

else:
print(f"Warning: Unrecognized part '{part}' in identifier '{identifier}' \
of subject '{subject_id}' in dataset '{dataset_id}'. Ignoring this part.")

print(
f"Warning: Unrecognized part '{part}' in identifier '{identifier}' \
of subject '{subject_id}' in dataset '{dataset_id}'. Ignoring this part."
)

if task_id is None:
print(f"Error: task_id not found in identifier '{identifier}' of subject '{subject_id}' \
in dataset '{dataset_id}'. Skipping this file.")
print(
f"Error: task_id not found in identifier '{identifier}' of subject '{subject_id}' \
in dataset '{dataset_id}'. Skipping this file."
)
continue

# Sort methods numerically
file_info.sort(key=lambda x: x[0])

method_numbers = []
# This is a list of the dFC objects from various methods
# that share the same identifier i.e., they came from the same

# This is a list of the dFC objects from various methods
# that share the same identifier i.e., they came from the same
# BOLD time series, but they were computed using different methods
# Each dFC in the list is recognized as a dFC object by pydfc
dFC_lst = []

for method_num, path in file_info:
method_numbers.append(method_num)
dFC_lst.append(
np.load(path, allow_pickle=True).item()
)

# Note: type(output) = dict with
dFC_lst.append(np.load(path, allow_pickle=True).item())

# Note: type(output) = dict with
# dict_keys(['measure_lst', 'TS_info_lst', 'common_TRs', 'time_record_dict', 'all'])
similarity_assessment = SimilarityAssessment(dFC_lst=dFC_lst)
output = similarity_assessment.assess_similarity_fast(dFC_lst=dFC_lst)



similarity[dataset_id][subject_id][session_id][run_id][task_id] = {
"matrix": output,
"methods": method_numbers,
}

print(f"Finished processing subject {subject_id} in dataset {dataset_id}")


output_dir = Path("/home/kinichen/scratch/data/pydfc_validator/similarity_assessments_complete")

output_dir = Path(
"/home/kinichen/scratch/data/pydfc_validator/similarity_assessments_complete"
)
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "similarity.pkl"


# Convert to normal dict for pickling. Need to do recursively because of the nested defaultdicts.
def to_dict(d):
    """Recursively convert nested ``defaultdict`` objects into plain dicts.

    The nested ``defaultdict`` structure built above uses lambda factories,
    which ``pickle`` cannot serialize, so every ``defaultdict`` level has to
    be replaced by a regular ``dict`` before dumping.

    Parameters
    ----------
    d : object
        Any object. ``defaultdict`` and plain ``dict`` instances are rebuilt
        as plain dicts with their values converted recursively; every other
        object (including other ``dict`` subclasses) is returned unchanged.

    Returns
    -------
    object
        A plain ``dict`` copy when ``d`` is a ``defaultdict`` or a plain
        ``dict``; otherwise ``d`` itself.
    """
    # Also descend into plain dicts: a defaultdict stored as a value inside a
    # regular dict would otherwise survive the conversion and break pickling.
    # The exact-type check leaves other dict subclasses (e.g. OrderedDict)
    # untouched so their type is not silently changed.
    if isinstance(d, defaultdict) or type(d) is dict:
        return {key: to_dict(value) for key, value in d.items()}
    return d


similarity = to_dict(similarity)

with open(output_file, "wb") as f:
pickle.dump(similarity, f)
print(f"Saved results to: {output_file}")


print(f"Saved results to: {output_file}")
40 changes: 40 additions & 0 deletions docs/ADDING_DFC_METHODS.md
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,42 @@ time_series = self.manipulate_time_series4FCS(time_series)

and include any FCS-only parameters such as `num_subj` if the method uses them.

## ML Pipeline Registration (State-Based Methods Only)

If the new method is state-based (`is_state_based = True`), you **must** also
register it in `pydfc/ml_utils.py` inside `process_SB_features`. This function
applies the correct feature transformation before classification. Omitting this
step causes the function to return `None`, which crashes the ML pipeline with a
`TypeError` at `subject_center`.

Determine which branch your method belongs to:

- **Softmax → ILR** (`if` branch, methods like `CAP`, `Clustering`): use this
when `FCS_proba` stores raw distances or dissimilarity scores that must first
be converted to a probability simplex via softmax.
- **ILR only** (`elif` branch, methods like `GaussianMixtureStates`,
`ContinuousHMM`, `NMFStates`): use this when `FCS_proba` already contains
proper probabilities (non-negative, rows summing to 1).

Add the method name to the correct branch:

```python
# pydfc/ml_utils.py — process_SB_features
elif measure_name in [
"ContinuousHMM",
...
"NMFStates", # ← add your method here if FCS_proba rows sum to 1
...
]:
X_transformed = ilr_transform(X)
```

A quick check: inspect `estimate_dFC` in the method file and look at how
`FCS_proba` is set. If it is produced by a row-wise normalization
(`/ row_sums`) or a soft-assignment model (GMM, HMM posterior), it belongs in
the ILR-only branch. If it stores distances or un-normalized scores, it belongs
in the softmax + ILR branch.

## Package Export

After adding a method file, update:
Expand Down Expand Up @@ -376,3 +412,7 @@ against established methods rather than interpreted in isolation.
subclass.
- Importing optional dependencies at package level in a way that breaks unrelated
methods.
- For state-based methods: forgetting to add the method name to `process_SB_features`
in `pydfc/ml_utils.py`. The function silently returns `None` if the method is
missing from both branches, crashing the ML pipeline. See the
"ML Pipeline Registration" section above.
Loading
Loading