neurodatascience · mtorabi59 · Jun 27, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/README.rst b/README.rst
@@ -165,3 +165,21 @@ If you are new to **pydfc**, we recommend starting with:
 
 This optional AI-assisted workflow is designed to complement — not replace —
 the documentation and example scripts.
+
+Generating New dFC Methods with AI
+-----------------------------------
+
+You can ask an AI coding assistant (Claude, Copilot, Codex, etc.) to implement
+brand-new dFC methods and add them directly to ``pydfc``.  Just describe what
+you want at whatever level of specificity feels right:
+
+- *"Generate 5 new creative dFC methods."*
+- *"Generate 3 new state-based methods."*
+- *"Implement a dFC method based on Granger causality."*
+- *"Add a method that uses Riemannian geometry on covariance matrices."*
+- *"Here is a paper — implement the method it describes."*  (paste the PDF or text)
+
+The AI will read the existing codebase, follow the conventions in
+``docs/ADDING_DFC_METHODS.md``, write the new method file, and register it in
+``pydfc/dfc_methods/__init__.py`` (plus ``pydfc/ml_utils.py`` for state-based
+methods) so it works immediately alongside all other methods.
diff --git a/algorithm_similarity.py b/algorithm_similarity.py
@@ -26,6 +26,7 @@
 import re
 import sys
 from pathlib import Path
+
 import numpy as np
 
 # Only calls whose resolved root module starts with one of these are kept as
@@ -129,7 +130,7 @@ def _make_unique_labels(filepaths):
 
 def _hierarchical_cluster_order(matrix, cluster_method="average"):
     """Return indices that order similar methods next to each other."""
-    from scipy.cluster.hierarchy import linkage, leaves_list
+    from scipy.cluster.hierarchy import leaves_list, linkage
     from scipy.spatial.distance import squareform
 
     if matrix.shape[0] < 2:
@@ -159,7 +160,9 @@ def plot_similarity_heatmap(
         matrix = matrix[np.ix_(order, order)]
         labels = [labels[i] for i in order]
 
-    fig, ax = plt.subplots(figsize=(max(8, 0.45 * len(labels)), max(6, 0.45 * len(labels))))
+    fig, ax = plt.subplots(
+        figsize=(max(8, 0.45 * len(labels)), max(6, 0.45 * len(labels)))
+    )
     image = ax.imshow(matrix, vmin=0.0, vmax=1.0, cmap="viridis", aspect="equal")
     fig.colorbar(image, ax=ax, label="AS")
 
@@ -187,7 +190,9 @@ def save_similarity_outputs(output_dir, labels, source_paths, matrix, table):
     with open(output_dir / "AS_jaccard_source_paths.json", "w", encoding="utf-8") as f:
         json.dump(source_paths, f, indent=2)
 
-    with open(output_dir / "AS_jaccard_pairs.csv", "w", newline="", encoding="utf-8") as f:
+    with open(
+        output_dir / "AS_jaccard_pairs.csv", "w", newline="", encoding="utf-8"
+    ) as f:
         fieldnames = [
             "method_a",
             "method_b",
@@ -206,9 +211,13 @@ def save_similarity_outputs(output_dir, labels, source_paths, matrix, table):
     try:
         fig, _ = plot_similarity_heatmap(matrix, labels, cluster=True)
     except ImportError:
-        print("Skipping heatmap export because matplotlib is not available in this environment.")
+        print(
+            "Skipping heatmap export because matplotlib is not available in this environment."
+        )
     else:
-        fig.savefig(str(output_dir / "AS_jaccard_heatmap.png"), dpi=200, bbox_inches="tight")
+        fig.savefig(
+            str(output_dir / "AS_jaccard_heatmap.png"), dpi=200, bbox_inches="tight"
+        )
         import matplotlib.pyplot as plt
 
         plt.close(fig)
@@ -221,7 +230,9 @@ def load_similarity_outputs(output_dir):
     labels = np.load(output_dir / "AS_jaccard_names.npy", allow_pickle=True).tolist()
     with open(output_dir / "AS_jaccard_source_paths.json", "r", encoding="utf-8") as f:
         source_paths = json.load(f)
-    with open(output_dir / "AS_jaccard_pairs.csv", "r", newline="", encoding="utf-8") as f:
+    with open(
+        output_dir / "AS_jaccard_pairs.csv", "r", newline="", encoding="utf-8"
+    ) as f:
         table = list(csv.DictReader(f))
     return labels, source_paths, matrix, table
 
@@ -264,7 +275,9 @@ def main(filepaths):
 
         print(f"{method_a:35s} vs {method_b:35s}  AS = {sim:.3f}   shared = {shared}")
 
-    save_similarity_outputs("algorithm_similarity_results", names, source_paths, alg_sim, pairwise_rows)
+    save_similarity_outputs(
+        "algorithm_similarity_results", names, source_paths, alg_sim, pairwise_rows
+    )
     print("Saved outputs to algorithm_similarity_results/")
 
 

diff --git a/compute_feature_similarity.py b/compute_feature_similarity.py
@@ -1,11 +1,11 @@
+import pickle
 import sys
-import numpy as np
-from pathlib import Path
 from collections import defaultdict
+from pathlib import Path
 
-from pydfc.comparison import SimilarityAssessment   # pip install pydfc
+import numpy as np
 
-import pickle
+from pydfc.comparison import SimilarityAssessment  # pip install pydfc
 
 # FULL PATH usually looks like:
 # "{path_to_datasets}/{dataset_id}/derivatives/dFC_assessed/{subject_id}/{session_id}/*.npy"
@@ -17,7 +17,7 @@
     print("Missing a path to the datasets directory")
     print("Usage: sbatch run_dfc.sh <path_to_datasets>")
     sys.exit(1)
-    
+
 path_to_datasets = sys.argv[1]
 
 root = Path(path_to_datasets)
@@ -28,25 +28,21 @@
 # where matrix.shape = (1, num_methods, num_methods) and contains the similarity values between methods
 
 similarity = defaultdict(
-    lambda: defaultdict(
-        lambda: defaultdict(
-            lambda: defaultdict(dict)
-        )
-    )
+    lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
 )
 
 for dataset_dir in root.iterdir():
 
     if not dataset_dir.is_dir():
         continue
-    
+
     if not dataset_dir.name.startswith("ds"):
         continue
 
     dataset_id = dataset_dir.name
 
     dfc_dir = dataset_dir / "derivatives" / "dFC_assessed"
-    
+
     if not dfc_dir.is_dir():
         print(f"Skipping {dataset_id} since /derivatives/dFC_assessed not found")
         continue
@@ -61,104 +57,108 @@
         # If no session folders, treat the subject directory as the session directory
         # to avoid file path issues. If this case, session_id will be set to None later.
         session_dirs = [
-            p for p in subject_dir.iterdir()
-            if p.is_dir() and p.name.startswith("ses-")
+            p for p in subject_dir.iterdir() if p.is_dir() and p.name.startswith("ses-")
         ]
 
         if not session_dirs:
             session_dirs = [subject_dir]
-
 
         for session_dir in session_dirs:
-            
+
             # Group files by identifier
             files_by_identifier = defaultdict(list)
 
             for npy_file in session_dir.glob("dFC_*.npy"):
 
-                filename = npy_file.stem    # removed .npy
-
-                _, rest = filename.split("_", 1)  # e.g., "dFC", "ses-wave1bas_task-Stroop_run-2_24"
-                identifier, method_number = rest.rsplit("_", 1)  # e.g., "ses-wave1bas_task-Stroop_run-2", "24"
+                filename = npy_file.stem  # removed .npy
 
-                files_by_identifier[identifier].append(
-                    (int(method_number), npy_file)
-                )
+                _, rest = filename.split(
+                    "_", 1
+                )  # e.g., "dFC", "ses-wave1bas_task-Stroop_run-2_24"
+                identifier, method_number = rest.rsplit(
+                    "_", 1
+                )  # e.g., "ses-wave1bas_task-Stroop_run-2", "24"
 
+                files_by_identifier[identifier].append((int(method_number), npy_file))
 
             # Process one identifier at a time (similarity across methods)
             for identifier, file_info in files_by_identifier.items():
-                
+
                 # Initialize session_id and run_id as None in case they don't exist
                 session_id = None
                 run_id = None
                 task_id = None  # must exist, see check later to catch error.
-                
+
                 # Get session, task, and run from identifier (if they exist)
                 for part in identifier.split("_"):
-                    if part.startswith("ses-"):     # e.g., "ses-wave1bas"
+                    if part.startswith("ses-"):  # e.g., "ses-wave1bas"
                         session_id = part
 
-                    elif part.startswith("run-"):   # e.g., "run-2"
+                    elif part.startswith("run-"):  # e.g., "run-2"
                         run_id = part
 
                     elif part.startswith("task-"):  # e.g., "task-Stroop"
                         task_id = part
-                        
+
                     else:
-                        print(f"Warning: Unrecognized part '{part}' in identifier '{identifier}' \
-                            of subject '{subject_id}' in dataset '{dataset_id}'. Ignoring this part.")
-
+                        print(
+                            f"Warning: Unrecognized part '{part}' in identifier '{identifier}' \
+                            of subject '{subject_id}' in dataset '{dataset_id}'. Ignoring this part."
+                        )
+
                 if task_id is None:
-                    print(f"Error: task_id not found in identifier '{identifier}' of subject '{subject_id}' \
-                        in dataset '{dataset_id}'. Skipping this file.")
+                    print(
+                        f"Error: task_id not found in identifier '{identifier}' of subject '{subject_id}' \
+                        in dataset '{dataset_id}'. Skipping this file."
+                    )
                     continue
 
                 # Sort methods numerically
                 file_info.sort(key=lambda x: x[0])
 
                 method_numbers = []
-                
-                # This is a list of the dFC objects from various methods 
-                # that share the same identifier i.e., they came from the same 
+
+                # This is a list of the dFC objects from various methods
+                # that share the same identifier i.e., they came from the same
                 # BOLD time series, but they were computed using different methods
                 # Each dFC in the list is recognized as a dFC object by pydfc
                 dFC_lst = []
 
                 for method_num, path in file_info:
                     method_numbers.append(method_num)
-                    dFC_lst.append(
-                        np.load(path, allow_pickle=True).item()
-                    )
-
-                # Note: type(output) = dict with 
+                    dFC_lst.append(np.load(path, allow_pickle=True).item())
+
+                # Note: type(output) = dict with
                 # dict_keys(['measure_lst', 'TS_info_lst', 'common_TRs', 'time_record_dict', 'all'])
                 similarity_assessment = SimilarityAssessment(dFC_lst=dFC_lst)
                 output = similarity_assessment.assess_similarity_fast(dFC_lst=dFC_lst)
-
-
+
                 similarity[dataset_id][subject_id][session_id][run_id][task_id] = {
                     "matrix": output,
                     "methods": method_numbers,
                 }
-                
+
         print(f"Finished processing subject {subject_id} in dataset {dataset_id}")
-
 
-output_dir = Path("/home/kinichen/scratch/data/pydfc_validator/similarity_assessments_complete")
+
+output_dir = Path(
+    "/home/kinichen/scratch/data/pydfc_validator/similarity_assessments_complete"
+)
 output_dir.mkdir(parents=True, exist_ok=True)
 output_file = output_dir / "similarity.pkl"
 
+
 # Convert to normal dict for pickling. Need to do recursively because of the nested defaultdicts.
 def to_dict(d):
     if isinstance(d, defaultdict):
         return {k: to_dict(v) for k, v in d.items()}
     return d
 
+
 similarity = to_dict(similarity)
 
 with open(output_file, "wb") as f:
     pickle.dump(similarity, f)
-    
-    
-print(f"Saved results to: {output_file}")
+
+
+print(f"Saved results to: {output_file}")
diff --git a/docs/ADDING_DFC_METHODS.md b/docs/ADDING_DFC_METHODS.md
@@ -259,6 +259,42 @@ time_series = self.manipulate_time_series4FCS(time_series)
 
 and include any FCS-only parameters such as `num_subj` if the method uses them.
 
+## ML Pipeline Registration (State-Based Methods Only)
+
+If the new method is state-based (`is_state_based = True`), you **must** also
+register it in `pydfc/ml_utils.py` inside `process_SB_features`. This function
+applies the correct feature transformation before classification. Omitting this
+step causes the function to return `None`, which crashes the ML pipeline with a
+`TypeError` at `subject_center`.
+
+Determine which branch your method belongs to:
+
+- **Softmax → ILR** (`if` branch, methods like `CAP`, `Clustering`): use this
+  when `FCS_proba` stores raw distances or dissimilarity scores that must first
+  be converted to a probability simplex via softmax.
+- **ILR only** (`elif` branch, methods like `GaussianMixtureStates`,
+  `ContinuousHMM`, `NMFStates`): use this when `FCS_proba` already contains
+  proper probabilities (non-negative, rows summing to 1).
+
+Add the method name to the correct branch:
+
+```python
+# pydfc/ml_utils.py — process_SB_features
+elif measure_name in [
+    "ContinuousHMM",
+    ...
+    "NMFStates",        # ← add your method here if FCS_proba rows sum to 1
+    ...
+]:
+    X_transformed = ilr_transform(X)
+```
+
+A quick check: inspect `estimate_dFC` in the method file and look at how
+`FCS_proba` is set. If it is produced by a row-wise normalization
+(`/ row_sums`) or a soft-assignment model (GMM, HMM posterior), it belongs in
+the ILR-only branch. If it stores distances or un-normalized scores, it belongs
+in the Softmax → ILR branch.
+
 ## Package Export
 
 After adding a method file, update:
@@ -376,3 +412,7 @@ against established methods rather than interpreted in isolation.
   subclass.
 - Importing optional dependencies at package level in a way that breaks unrelated
   methods.
+- For state-based methods: forgetting to add the method name to `process_SB_features`
+  in `pydfc/ml_utils.py`. The function silently returns `None` if the method is
+  missing from both branches, and the ML pipeline then crashes with a `TypeError`.
+  See the "ML Pipeline Registration (State-Based Methods Only)" section above.